HW3: [WIP] V0 version added
This commit is contained in:
parent
146e975ac1
commit
1fe5ab4da7
@ -25,12 +25,12 @@ PROJECT := PDS_homework_3
|
||||
TARGET := bitonic
|
||||
|
||||
# Source directories list (space separated). Makefile-relative path, UNDER current directory.
|
||||
SRC_DIR_LIST := src test test/gtest
|
||||
SRC_DIR_LIST := src #test test/gtest
|
||||
|
||||
# Include directories list (space separated). Makefile-relative path.
|
||||
INC_DIR_LIST := src \
|
||||
test \
|
||||
test/gtest/ \
|
||||
INC_DIR_LIST := src
|
||||
# test \
|
||||
# test/gtest/ \
|
||||
|
||||
|
||||
# Exclude files list (space separated). Filenames only.
|
||||
@ -45,10 +45,10 @@ OUTPUT_DIR := out
|
||||
|
||||
# ========== Compiler settings ==========
|
||||
# Compiler flags for debug and release
|
||||
DEB_CFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c11 -fopenmp
|
||||
REL_CFLAGS := -Wall -Wextra -O3 -std=c11 -fopenmp
|
||||
DEB_CXXFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c++17 -fopenmp
|
||||
REL_CXXFLAGS := -Wall -Wextra -O3 -std=c++17 -fopenmp
|
||||
DEB_CFLAGS := -DDEBUG -g3 -std=c11 -Xcompiler "-Wall -Wextra"
|
||||
REL_CFLAGS := -O3 -std=c11 -Xcompiler "-Wall -Wextra"
|
||||
DEB_CXXFLAGS := -DDEBUG -g3 -std=c++17 -Xcompiler "-Wall -Wextra"
|
||||
REL_CXXFLAGS := -O3 -std=c++17 -Xcompiler "-Wall -Wextra"
|
||||
|
||||
# Pre-defines
|
||||
# PRE_DEFS := MYCAB=1729 SUPER_MODE
|
||||
@ -56,15 +56,15 @@ PRE_DEFS :=
|
||||
|
||||
# ============== Linker settings ==============
|
||||
# Linker flags (example: -pthread -lm)
|
||||
LDFLAGS := -pthread
|
||||
LDFLAGS :=
|
||||
|
||||
# Map output file
|
||||
MAP_FILE := output.map
|
||||
MAP_FLAG := -Xlinker -Map=$(BUILD_DIR)/$(MAP_FILE)
|
||||
MAP_FILE := # output.map
|
||||
MAP_FLAG := # -Xlinker -Map=$(BUILD_DIR)/$(MAP_FILE)
|
||||
|
||||
# ============== Docker settings ==============
|
||||
# We need:
|
||||
# - Bind the entire project directory(the dir that icludes all the code) as volume.
|
||||
# - Bind the entire project directory(the dir that includes all the code) as volume.
|
||||
# - In docker instance, change to working directory(where the makefile is).
|
||||
DOCKER_VOL_DIR := $(shell pwd)
|
||||
DOCKER_WRK_DIR :=
|
||||
@ -85,6 +85,7 @@ CFLAGS := $(DEB_CFLAGS)
|
||||
CXXFLAGS := $(DEB_CXXFLAGS)
|
||||
CXX := g++ #mpic++
|
||||
CC := gcc #mpicc
|
||||
LINKER := g++
|
||||
|
||||
#
|
||||
# =========== Main body and Patterns ===========
|
||||
@ -117,37 +118,37 @@ DEP := $(foreach file,$(SRC:%.cpp=%.d),$(DEP_DIR)/$(file))
|
||||
# It is based on Tom Tromey's method.
|
||||
#
|
||||
# Invoke cpp to create makefile rules with dependencies for each source file
|
||||
$(DEP_DIR)/%.d: %.c
|
||||
@mkdir -p $(@D)
|
||||
@$(DOCKER) $(CC) -E $(CFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.c=.o) -MF $@ $<
|
||||
#$(DEP_DIR)/%.d: %.c
|
||||
# @mkdir -p $(@D)
|
||||
# @$(DOCKER) $(CC) -E $(CFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.c=.o) -MF $@ $<
|
||||
|
||||
# c file objects depend on .c AND dependency files, which have an empty recipe
|
||||
$(OBJ_DIR)/%.o: %.c $(DEP_DIR)/%.d
|
||||
$(OBJ_DIR)/%.o: %.c
|
||||
@mkdir -p $(@D)
|
||||
@$(DOCKER) $(CC) -c $(CFLAGS) $(INC) $(DEF) -o $@ $<
|
||||
$(DOCKER) $(CC) -c $(CFLAGS) $(INC) $(DEF) -o $@ $<
|
||||
|
||||
$(DEP_DIR)/%.d: %.cpp
|
||||
@mkdir -p $(@D)
|
||||
@$(DOCKER) $(CXX) -E $(CXXFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.cpp=.o) -MF $@ $<
|
||||
#$(DEP_DIR)/%.d: %.cpp
|
||||
# @mkdir -p $(@D)
|
||||
# @$(DOCKER) $(CXX) -E $(CXXFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.cpp=.o) -MF $@ $<
|
||||
|
||||
# cpp file objects depent on .cpp AND dependency files, which have an empty recipe
|
||||
$(OBJ_DIR)/%.o: %.cpp $(DEP_DIR)/%.d
|
||||
# cpp file objects depend on .cpp AND dependency files, which have an empty recipe
|
||||
$(OBJ_DIR)/%.o: %.cpp
|
||||
@mkdir -p $(@D)
|
||||
@$(DOCKER) $(CXX) -c $(CXXFLAGS) $(INC) $(DEF) -o $@ $<
|
||||
$(DOCKER) $(CXX) -c $(CXXFLAGS) $(INC) $(DEF) -o $@ $<
|
||||
|
||||
# empty recipe for dependency files. This prevents make errors
|
||||
$(DEP):
|
||||
#$(DEP):
|
||||
|
||||
# now include all dependencies
|
||||
# After all they are makefile dependency rules ;)
|
||||
include $(wildcard $(DEP))
|
||||
#include $(wildcard $(DEP))
|
||||
|
||||
# main target rule
|
||||
$(BUILD_DIR)/$(TARGET): $(OBJ)
|
||||
@mkdir -p $(@D)
|
||||
@echo Linking to target: $(TARGET)
|
||||
@echo $(DOCKER) $(CXX) '$$(OBJ)' $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET)
|
||||
@$(DOCKER) $(CXX) $(OBJ) $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET)
|
||||
@echo $(DOCKER) $(LINKER) '$$(OBJ)' $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET)
|
||||
@$(DOCKER) $(LINKER) $(OBJ) $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET)
|
||||
@echo
|
||||
@echo Print size information
|
||||
@$(CSIZE) $(@D)/$(TARGET)
|
||||
@ -179,10 +180,12 @@ release: $(BUILD_DIR)/$(TARGET)
|
||||
#
|
||||
|
||||
|
||||
bitonic_v0: CC := nvcc
|
||||
bitonic_v0: CXX := nvcc
|
||||
bitonic_v0: CC := nvcc -x cu
|
||||
bitonic_v0: CXX := nvcc -x cu
|
||||
bitonic_v0: LINKER := nvcc
|
||||
bitonic_v0: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V0
|
||||
bitonic_v0: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V0
|
||||
bitonic_v0: OUTPUT_DIR := $(OUTPUT_DIR)/v0
|
||||
bitonic_v0: TARGET := bitonic_v0
|
||||
bitonic_v0: $(BUILD_DIR)/$(TARGET)
|
||||
@mkdir -p $(OUTPUT_DIR)
|
||||
@ -191,11 +194,7 @@ bitonic_v0: $(BUILD_DIR)/$(TARGET)
|
||||
|
||||
hpc-build:
|
||||
make clean
|
||||
make distbubbletonic
|
||||
make clean
|
||||
make distbitonic
|
||||
make clean
|
||||
make tests
|
||||
make bitonic_v0
|
||||
|
||||
|
||||
all: debug bitonic_v0
|
||||
|
33
homework_3/exersize.md
Normal file
33
homework_3/exersize.md
Normal file
@ -0,0 +1,33 @@
|
||||
Parallel & Distributed Computer Systems HW3
|
||||
|
||||
January, 2025
|
||||
|
||||
Write a program that sorts $N$ integers in ascending order, using CUDA.
|
||||
|
||||
The program must perform the following tasks:
|
||||
|
||||
- The user specifies a positive integer $q$.
|
||||
|
||||
- Start a process that holds an array of $N = 2^q$ random integers.
|
||||
|
||||
- Sort all $N$ elements in ascending order.
|
||||
|
||||
- Check the correctness of the final result.
|
||||
|
||||
Your implementation should be based on the following steps:
|
||||
|
||||
V0. A kernel where each thread only compares and exchanges. This "eliminates" the 1:n innermost loop. Easy to write, but too many function calls and global synchronizations.
|
||||
|
||||
V1. Include the k inner loop in the kernel function. How do we handle the synchronization? Fewer calls, fewer global synchronizations. Faster than V0!
|
||||
|
||||
V2. Modify the kernel of V1 to work with local memory instead of global.
|
||||
|
||||
You must deliver:
|
||||
- A report (about $3-4$ pages) that describes your parallel algorithm and implementation.
|
||||
|
||||
- Your comments on the speed of your parallel program compared to the serial sort, after trying your program on aristotelis for $q = [20:27]$.
|
||||
|
||||
- The source code of your program uploaded online.
|
||||
|
||||
Ethics: If you use code found on the web or by an LLM, you should mention your source and the changes you made. You may work in pairs; both partners must submit a single report with both names.
|
||||
Deadline: 2 February, $2025$.
|
145
homework_3/src/bitonicsort.hpp
Normal file
145
homework_3/src/bitonicsort.hpp
Normal file
@ -0,0 +1,145 @@
|
||||
/*!
|
||||
* \file
|
||||
* \brief Bitonic sort CUDA implementation header
|
||||
*
|
||||
* \author
|
||||
* Christos Choutouridis AEM:8997
|
||||
* <cchoutou@ece.auth.gr>
|
||||
*/
|
||||
|
||||
#ifndef BITONICSORTCUDA_H_
|
||||
#define BITONICSORTCUDA_H_
|
||||
|
||||
#include <cuda_runtime.h>

#include <cmath>
#include <cstdint>
#include <stdexcept>
#include <utility>
#include <vector>
|
||||
|
||||
#include "utils.hpp"
|
||||
|
||||
/*
|
||||
* Exported timers
|
||||
*/
|
||||
extern Timing Timer_total;
|
||||
|
||||
using threadId_t = size_t;
|
||||
|
||||
|
||||
/*
|
||||
* ============================== Sort utilities ==============================
|
||||
*/
|
||||
|
||||
/*!
 * Returns the ascending or descending configuration (up/down phase) of the thread id
 * depending on the current stage. The decision is taken from bit `stage` of the id.
 *
 * @param tid     [threadId_t] The current thread
 * @param stage   [size_t]     The current stage of the sorting network (same for each step)
 * @return        [bool]       True if we need ascending configuration, false otherwise
 */
__device__ inline bool ascending(threadId_t tid, size_t stage) noexcept {
    // Build the mask in the width of threadId_t. A plain `1 << stage` is evaluated
    // as int, which can not represent the upper bits of a size_t thread id.
    return (tid & (threadId_t{1} << stage)) == 0;
}
|
||||
|
||||
/*!
 * Returns the thread's partner for data exchange during the sorting network iterations
 * of Bitonic. The partner is the id that differs from \p tid only in bit `step`.
 *
 * @param tid   [threadId_t] The current thread
 * @param step  [size_t]     The step of the sorting network
 * @return      [threadId_t] The thread id of the partner for data exchange
 */
__device__ inline threadId_t partner(threadId_t tid, size_t step) noexcept {
    // Build the mask in the width of threadId_t. A plain `1 << step` is evaluated
    // as int, which can not represent the upper bits of a size_t thread id.
    return tid ^ (threadId_t{1} << step);
}
|
||||
|
||||
|
||||
/*!
 * Predicate to check if a thread keeps the small numbers during the bitonic sort network exchange.
 *
 * The thread keeps the small value when it is the lower id of the pair in an ascending
 * configuration, or the higher id of the pair in a descending one.
 *
 * @param tid       [threadId_t] The thread for which we check
 * @param partner   [threadId_t] The partner of the data exchange
 * @param stage     [size_t]     The current stage of the sorting network (same for each step)
 * @return          [bool]       True if the thread should keep the small values, false otherwise
 */
__device__ inline bool keepSmall(threadId_t tid, threadId_t partner, size_t stage) noexcept {
    const bool lowerOfPair = (tid < partner);
    return ascending(tid, stage) == lowerOfPair;
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* ============================== Sort algorithms ==============================
|
||||
*/
|
||||
|
||||
|
||||
/*!
 * Compare-and-exchange of two items of an array.
 *
 * The items at \p tid and \p partner are swapped when they are in the wrong order
 * with respect to \p keepSmall, otherwise the array is left untouched.
 *
 * @tparam ValueT   The underlying type of the array
 *
 * @param data      [ValueT*] Pointer to the array
 * @param tid       [size_t]  The index of the first item
 * @param partner   [size_t]  The index of the second item
 * @param keepSmall [bool]    True if data[tid] must end up with the smaller of the two values,
 *                            false if it must end up with the larger one
 */
template <typename ValueT>
__device__ void cudaExchange(ValueT* data, size_t tid, size_t partner, bool keepSmall) {
    // Indices are size_t so the ids passed by the callers are not narrowed to int.
    const bool wrongOrder = keepSmall ? (data[tid] > data[partner])
                                      : (data[tid] < data[partner]);
    if (wrongOrder) {
        ValueT temp   = data[tid];
        data[tid]     = data[partner];
        data[partner] = temp;
    }
}
|
||||
|
||||
|
||||
/*!
 * Kernel for a single (stage, step) layer of the bitonic sorting network.
 *
 * Each thread is mapped to one item of the array. The partner of a thread is the
 * item whose index differs only in bit `step`. The compare-exchange of a pair is
 * performed by the thread with the lower id only.
 *
 * @tparam ValueT   The underlying type of the array
 *
 * @param data   [ValueT*] Pointer to the array to sort
 * @param n      [size_t]  The number of items in the array
 * @param step   [size_t]  The current step of the stage
 * @param stage  [size_t]  The current stage of the sorting network
 */
template <typename ValueT>
__global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) {
    // Compute global thread ID. Widen before multiplying so the product is not
    // evaluated in the 32-bit width of the built-in index variables.
    threadId_t tid = static_cast<threadId_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (tid >= n)
        return;

    threadId_t pid = partner(tid, step);
    // Both members of a pair reach this point and would decide the very same swap.
    // Letting both of them write data[tid] and data[pid] is a data race, so only
    // the lower id of the pair does the exchange.
    if (tid < pid && pid < n) {
        bool keep = keepSmall(tid, pid, stage);
        cudaExchange(data, tid, pid, keep);
    }
}
|
||||
|
||||
|
||||
/*!
 * Sorts a buffer on the GPU using the bitonic sorting network (V0).
 *
 * The data are copied to the device, one bitonicStep kernel is launched for every
 * (stage, step) pair of the network, and the result is copied back over \p data.
 *
 * @note
 *  The depth of the network is floor(log2(size)). Presumably the size is expected
 *  to be a power of 2 - confirm against the caller.
 *
 * @tparam DataT    A buffer type that provides value_type, size() and data()
 *
 * @param data  [DataT] The data to sort. Overwritten with the result.
 *
 * @throws std::runtime_error   If any of the CUDA runtime calls fails
 */
template <typename DataT>
void bitonicSort(DataT& data) {
    using value_t = typename DataT::value_type;

    const size_t size = data.size();
    if (size < 2)
        return;     // Nothing to sort. Also avoids a kernel launch with 0 blocks

    const size_t bytes = size * sizeof(value_t);
    value_t* dev_data  = nullptr;

    // Turn a CUDA failure into an exception, releasing the device buffer first.
    // cudaFree() on a null pointer performs no operation.
    auto check = [&dev_data](cudaError_t err) {
        if (err != cudaSuccess) {
            cudaFree(dev_data);
            throw std::runtime_error(cudaGetErrorString(err));
        }
    };

    check(cudaMalloc(&dev_data, bytes));
    check(cudaMemcpy(dev_data, data.data(), bytes, cudaMemcpyHostToDevice));

    const unsigned int Nthreads = 1024;
    const auto Nblocks = static_cast<unsigned int>((size + Nthreads - 1) / Nthreads);

    // floor(log2(size)) with integer arithmetic
    size_t max_depth = 0;
    for (size_t s = size; s > 1; s >>= 1)
        ++max_depth;

    for (size_t stage = 1; stage <= max_depth; ++stage) {
        for (size_t step = stage; step > 0; ) {
            --step;
            bitonicStep<<<Nblocks, Nthreads>>>(dev_data, size, step, stage);
            check(cudaGetLastError());          // launch errors
            check(cudaDeviceSynchronize());     // execution errors, global barrier between steps
        }
    }

    check(cudaMemcpy(data.data(), dev_data, bytes, cudaMemcpyDeviceToHost));
    cudaFree(dev_data);
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#endif //BITONICSORTCUDA_H_
|
@ -35,7 +35,7 @@ static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16;
|
||||
|
||||
|
||||
/*!
|
||||
* Value type selection
|
||||
* Value and Buffer type selection
|
||||
*
|
||||
* We support the following compiler types or the <cstdint> that translate to them:
|
||||
* char - unsigned char
|
||||
@ -46,7 +46,8 @@ static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16;
|
||||
* float
|
||||
* double
|
||||
*/
|
||||
using distValue_t = uint32_t;
|
||||
using Value_t = uint32_t;
|
||||
using Data_t = std::vector<Value_t>;
|
||||
|
||||
/*!
|
||||
* Session option for each invocation of the executable.
|
||||
|
@ -1,51 +0,0 @@
|
||||
/*!
|
||||
* \file
|
||||
* \brief Distributed sort implementation
|
||||
*
|
||||
* \author
|
||||
* Christos Choutouridis AEM:8997
|
||||
* <cchoutou@ece.auth.gr>
|
||||
*/
|
||||
#include "utils.hpp"
|
||||
#include "distsort.hpp"
|
||||
|
||||
|
||||
/*!
|
||||
* Returns the ascending or descending configuration of the node's sequence based on
|
||||
* the current node (MPI process) and the depth of the sorting network
|
||||
*
|
||||
* @param node [mpi_id_t] The current node (MPI process)
|
||||
* @param depth [size_t] The total depth of the sorting network (same for each step for a given network)
|
||||
* @return [bool] True if we need ascending configuration, false otherwise
|
||||
*/
|
||||
bool ascending(mpi_id_t node, size_t depth) noexcept {
    // Isolate the bit of the node id that is selected by depth
    const auto depthBit = node & (1 << depth);
    return depthBit == 0;
}
|
||||
|
||||
/*!
|
||||
* Returns the node's partner for data exchange during the sorting network iterations
|
||||
* of Bitonic
|
||||
*
|
||||
* @param node [mpi_id_t] The current node
|
||||
* @param step [size_t] The step of the sorting network
|
||||
* @return [mpi_id_t] The node id of the partner for data exchange
|
||||
*/
|
||||
mpi_id_t partner(mpi_id_t node, size_t step) noexcept {
    // The partner differs from node only in the bit selected by step
    const auto stepMask = 1 << step;
    return node ^ stepMask;
}
|
||||
|
||||
|
||||
/*!
|
||||
* Predicate to check if a node keeps the small numbers during the bitonic sort network exchange.
|
||||
*
|
||||
* @param node [mpi_id_t] The node for which we check
|
||||
* @param partner [mpi_id_t] The partner of the data exchange
|
||||
* @param depth [size_t] The total depth of the sorting network (same for each step for a given network)
|
||||
* @return [bool] True if the node should keep the small values, false otherwise
|
||||
*/
|
||||
|
||||
bool keepSmall(mpi_id_t node, mpi_id_t partner, size_t depth) {
    if (node != partner) {
        // Lower id keeps the small values on ascending, higher id on descending
        const bool lowerOfPair = node < partner;
        return ascending(node, depth) == lowerOfPair;
    }
    throw std::runtime_error("(keepSmall) Node and Partner can not be the same\n");
}
|
@ -1,223 +0,0 @@
|
||||
/*!
|
||||
* \file
|
||||
* \brief Distributed sort implementation header
|
||||
*
|
||||
* \author
|
||||
* Christos Choutouridis AEM:8997
|
||||
* <cchoutou@ece.auth.gr>
|
||||
*/
|
||||
|
||||
#ifndef DISTBITONIC_H_
|
||||
#define DISTBITONIC_H_
|
||||
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <parallel/algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#if !defined DEBUG
|
||||
#define NDEBUG
|
||||
#endif
|
||||
#include <cassert>
|
||||
|
||||
#include "utils.hpp"
|
||||
|
||||
/*
|
||||
* Exported timers
|
||||
*/
|
||||
extern Timing Timer_total;
|
||||
extern Timing Timer_fullSort;
|
||||
extern Timing Timer_exchange;
|
||||
extern Timing Timer_minmax;
|
||||
extern Timing Timer_elbowSort;
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* ============================== Sort utilities ==============================
|
||||
*/
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
* Returns the ascending or descending configuration of the node's sequence based on
|
||||
* the current node (MPI process) and the depth of the sorting network
|
||||
*
|
||||
* @param node [mpi_id_t] The current node (MPI process)
|
||||
* @param depth [size_t] The total depth of the sorting network (same for each step for a given network)
|
||||
* @return [bool] True if we need ascending configuration, false otherwise
|
||||
*/
|
||||
bool ascending(mpi_id_t node, size_t depth);
|
||||
|
||||
/*!
|
||||
* Returns the node's partner for data exchange during the sorting network iterations
|
||||
* of Bitonic
|
||||
*
|
||||
* @param node [mpi_id_t] The current node
|
||||
* @param step [size_t] The step of the sorting network
|
||||
* @return [mpi_id_t] The node id of the partner for data exchange
|
||||
*/
|
||||
mpi_id_t partner(mpi_id_t node, size_t step);
|
||||
|
||||
|
||||
/*!
|
||||
* Predicate to check if a node keeps the small numbers during the bitonic sort network exchange.
|
||||
*
|
||||
* @param node [mpi_id_t] The node for which we check
|
||||
* @param partner [mpi_id_t] The partner of the data exchange
|
||||
* @param depth [size_t] The total depth of the sorting network (same for each step for a given network)
|
||||
* @return [bool] True if the node should keep the small values, false otherwise
|
||||
*/
|
||||
bool keepSmall(mpi_id_t node, mpi_id_t partner, size_t depth);
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* ============================== Data utilities ==============================
|
||||
*/
|
||||
|
||||
/*!
|
||||
* Sort a range using the build-in O(Nlog(N)) algorithm
|
||||
*
|
||||
* @tparam RangeT A range type with random access iterator
|
||||
*
|
||||
* @param data [RangeT] The data to be sorted
|
||||
* @param ascending [bool] Flag to indicate the sorting order
|
||||
*/
|
||||
template<typename RangeT>
|
||||
void fullSort(RangeT& data, bool ascending) noexcept {
|
||||
// Use introsort from stdlib++ here, unless ... __gnu_parallel
|
||||
if (ascending) {
|
||||
__gnu_parallel::sort(data.begin(), data.end(), std::less<>());
|
||||
}
|
||||
else {
|
||||
__gnu_parallel::sort(data.begin(), data.end(), std::greater<>());
|
||||
}
|
||||
}
|
||||
|
||||
/*!
|
||||
* Core functionality of sort for shadowed buffer types using
|
||||
* the "elbow sort" algorithm.
|
||||
*
|
||||
* @note:
|
||||
* This algorithm can not work "in place".
|
||||
* We use the active buffer as source and the shadow as target.
|
||||
* At the end we switch which buffer is active and which is the shadow.
|
||||
* @note
|
||||
* This is the core functionality. Use the elbowSort() function instead
|
||||
*
|
||||
* @tparam ShadowedDataT A Shadowed buffer type with random access iterator.
|
||||
* @tparam CompT A Comparison type for binary operation comparisons
|
||||
*
|
||||
* @param data [ShadowedDataT] The data to sort
|
||||
* @param ascending [bool] Flag to indicate the sorting order
|
||||
* @param comp [CompT] The binary operator object
|
||||
*/
|
||||
template<typename ShadowedDataT, typename CompT>
void elbowSortCore(ShadowedDataT& data, bool ascending, CompT comp) noexcept {
    auto& src = data.getActive();       // The source vector (the data to sort)
    auto& dst = data.getShadow();       // The target vector (the sorted data)
    const size_t count = data.size();   // Read once, used for both vectors

    // Locate the elbow: the minimum for ascending order, the maximum otherwise
    auto elbow = ascending ? std::min_element(src.begin(), src.end())
                           : std::max_element(src.begin(), src.end());
    size_t down = std::distance(src.begin(), elbow);
    size_t up   = (down == count - 1) ? 0 : down + 1;

    // Walk away from the elbow in both directions, emitting the item that comp selects
    for (size_t out = 0; out != count; ++out) {
        if (comp(src[down], src[up])) {
            dst[out] = src[down];
            down = (down == 0) ? count - 1 : down - 1;  // cyclic decrease
        }
        else {
            dst[out] = src[up];
            up = (up + 1) % count;                      // cyclic increase
        }
    }
    data.switch_active();   // The target becomes the active buffer
}
|
||||
|
||||
/*!
|
||||
* Sort a shadowed buffer using the "elbow sort" algorithm.
|
||||
*
|
||||
* @tparam ShadowedDataT A Shadowed buffer type with random access iterator.
|
||||
*
|
||||
* @param data [ShadowedDataT] The data to sort
|
||||
* @param ascending [bool] Flag to indicate the sorting order
|
||||
*/
|
||||
template<typename ShadowedDataT>
void elbowSort(ShadowedDataT& data, bool ascending) noexcept {
    // Select the comparator that matches the requested order and run the core
    if (!ascending) {
        elbowSortCore(data, ascending, std::greater<>());
        return;
    }
    elbowSortCore(data, ascending, std::less<>());
}
|
||||
|
||||
|
||||
/*!
|
||||
* Takes two sequences and selects either the larger or the smaller items
|
||||
* in one-to-one comparison between them. If the initial sequences are bitonic, then
|
||||
* the result is a bitonic sequence too!
|
||||
*
|
||||
* @tparam ValueT The underlying type of the sequences
|
||||
*
|
||||
* @param local [ValueT*] Pointer to the local sequence
|
||||
* @param remote [const ValueT*] Pointer to the remote sequence (copied locally by MPI)
|
||||
* @param count [size_t] The number of items to process
|
||||
* @param keepSmall [bool] Flag to indicate if we keep the small items in local sequence
|
||||
*/
|
||||
template<typename ValueT>
void keepMinOrMax(ValueT* local, const ValueT* remote, size_t count, bool keepSmall) noexcept {
    // One-to-one selection: each local item is replaced by the min or the max of the pair
    for (size_t i = 0; i < count; ++i) {
        const ValueT& mine   = local[i];
        const ValueT& theirs = remote[i];
        ValueT selected = keepSmall ? std::min(mine, theirs) : std::max(mine, theirs);
        local[i] = selected;
    }
}
|
||||
|
||||
/*
|
||||
* ============================== Sort algorithms ==============================
|
||||
*/
|
||||
|
||||
|
||||
/*!
|
||||
* A distributed version of the Bitonic sort algorithm.
|
||||
*
|
||||
* @note
|
||||
* Each MPI process should run an instance of this function.
|
||||
*
|
||||
* @tparam ShadowedDataT A Shadowed buffer type with random access iterator.
|
||||
*
|
||||
* @param data [ShadowedDataT] The local to MPI process data to sort
|
||||
* @param Processes [mpi_id_t] The total number of MPI processes
|
||||
* @param rank [mpi_id_t] The current process id
|
||||
*/
|
||||
template<typename ShadowedDataT>
|
||||
void distBitonic(ShadowedDataT& data) {
|
||||
// Initially sort to create a half part of a bitonic sequence
|
||||
timeCall(Timer_fullSort, fullSort, data, ascending(rank, 0));
|
||||
|
||||
// Run through sort network using elbow-sort ( O(LogN * LogN) iterations )
|
||||
auto p = static_cast<uint32_t>(std::log2(Processes));
|
||||
for (size_t depth = 1; depth <= p; ++depth) {
|
||||
for (size_t step = depth; step > 0;) {
|
||||
--step;
|
||||
// Find out exchange configuration
|
||||
auto part = partner(rank, step);
|
||||
auto ks = keepSmall(rank, part, depth);
|
||||
// Exchange with partner, keep nim-or-max
|
||||
exchange(data, part, ks, tag);
|
||||
|
||||
}
|
||||
// sort - O(N)
|
||||
timeCall(Timer_elbowSort, elbowSort, data, ascending(rank, depth));
|
||||
}
|
||||
}
|
||||
|
||||
#endif //DISTBITONIC_H_
|
@ -14,34 +14,26 @@
|
||||
|
||||
#include "utils.hpp"
|
||||
#include "config.h"
|
||||
#include "distsort.hpp"
|
||||
#include "bitonicsort.hpp"
|
||||
|
||||
|
||||
// Global session data
|
||||
Data_t Data;
|
||||
config_t config;
|
||||
distBuffer_t Data;
|
||||
Log logger;
|
||||
|
||||
|
||||
// Mersenne seeded from hw if possible. range: [type_min, type_max]
|
||||
std::random_device rd;
|
||||
std::mt19937 gen(rd());
|
||||
|
||||
//! Performance timers for each one of the "costly" functions
|
||||
Timing Timer_total;
|
||||
Timing Timer_fullSort;
|
||||
Timing Timer_exchange;
|
||||
Timing Timer_minmax;
|
||||
Timing Timer_elbowSort;
|
||||
|
||||
|
||||
//! Init timing objects for extra rounds
|
||||
void measurements_init() {
|
||||
if (config.perf > 1) {
|
||||
Timer_total.init(config.perf);
|
||||
Timer_fullSort.init(config.perf);
|
||||
Timer_exchange.init(config.perf);
|
||||
Timer_minmax.init(config.perf);
|
||||
Timer_elbowSort.init(config.perf);
|
||||
}
|
||||
}
|
||||
|
||||
@ -49,10 +41,6 @@ void measurements_init() {
|
||||
void measurements_next() {
|
||||
if (config.perf > 1) {
|
||||
Timer_total.next();
|
||||
Timer_fullSort.next();
|
||||
Timer_exchange.next();
|
||||
Timer_minmax.next();
|
||||
Timer_elbowSort.next();
|
||||
}
|
||||
}
|
||||
|
||||
@ -136,20 +124,14 @@ bool get_options(int argc, char* argv[]){
|
||||
/*!
|
||||
* A simple validator for the entire distributed process
|
||||
*
|
||||
* @tparam ShadowedDataT A Shadowed buffer type with random access iterator.
|
||||
* @tparam DataT A buffer type with random access iterator.
|
||||
*
|
||||
* @param data [ShadowedDataT] The local to MPI process
|
||||
* @param Processes [mpi_id_t] The total number of MPI processes
|
||||
* @param rank [mpi_id_t] The current process id
|
||||
*
|
||||
* @return [bool] True if all are sorted and in total ascending order
|
||||
* @param data [DataT] The data
|
||||
* @return [bool] True if sorted in ascending order
|
||||
*/
|
||||
template<typename ShadowedDataT>
|
||||
bool validator(ShadowedDataT& data) {
|
||||
using value_t = typename ShadowedDataT::value_type;
|
||||
bool ret = true; // Have faith!
|
||||
|
||||
return ret;
|
||||
// Checks that the buffer is sorted in ascending order.
// The buffer is only read, so it is taken by const reference. This also allows
// const buffers and temporaries to be validated.
template<typename DataT>
bool validator(const DataT& data) {
    return std::is_sorted(data.begin(), data.end());
}
|
||||
|
||||
/*!
|
||||
@ -180,15 +162,15 @@ int main(int argc, char* argv[]) try {
|
||||
for (size_t it = 0 ; it < config.perf ; ++it) {
|
||||
// Initialize local data
|
||||
logger << "Initialize local array of " << config.arraySize << " elements" << logger.endl;
|
||||
std::uniform_int_distribution<distValue_t > dis(
|
||||
std::numeric_limits<distValue_t>::min(),
|
||||
std::numeric_limits<distValue_t>::max()
|
||||
std::uniform_int_distribution<Value_t > dis(
|
||||
std::numeric_limits<Value_t>::min(),
|
||||
std::numeric_limits<Value_t>::max()
|
||||
);
|
||||
std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); });
|
||||
// Run distributed sort
|
||||
logger << "Starting distributed sorting ... ";
|
||||
Timer_total.start();
|
||||
distBitonic(Data);
|
||||
bitonicSort(Data);
|
||||
Timer_total.stop();
|
||||
measurements_next();
|
||||
logger << " Done." << logger.endl;
|
||||
@ -196,11 +178,7 @@ int main(int argc, char* argv[]) try {
|
||||
|
||||
// Print-outs and validation
|
||||
if (config.perf > 1) {
|
||||
Timing::print_duration(Timer_total.median(), "Total ", 0);
|
||||
Timing::print_duration(Timer_fullSort.median(), "Full-Sort ", 0);
|
||||
Timing::print_duration(Timer_exchange.median(), "Exchange ", 0);
|
||||
Timing::print_duration(Timer_minmax.median(), "Min-Max ", 0);
|
||||
Timing::print_duration(Timer_elbowSort.median(),"Elbow-Sort", 0);
|
||||
Timing::print_duration(Timer_total.median(), "Total");
|
||||
}
|
||||
if (config.validation) {
|
||||
// If requested, we have the chance to fail!
|
||||
|
@ -18,124 +18,6 @@
|
||||
#include "config.h"
|
||||
|
||||
|
||||
/*!
 * @brief A std::vector wrapper with 2 vectors, an active and a shadow.
 *
 * This type exposes the standard vector functionality of the active vector.
 * The shadow can be used when we need to use the vector as mutable
 * data in algorithms that can not support "in-place" editing (like elbow-sort for example)
 *
 * @note
 *  resize() and reserve() act on both buffers. Every other dispatched operation
 *  (push_back, pop_back, clear, swap, ...) acts on the active buffer only.
 *
 * @tparam Value_t the underlying data type of the vectors
 */
template <typename Value_t>
struct ShadowedVec_t {
    // STL requirements
    using value_type     = Value_t;
    using iterator       = typename std::vector<Value_t>::iterator;
    using const_iterator = typename std::vector<Value_t>::const_iterator;
    using size_type      = typename std::vector<Value_t>::size_type;

    // Default constructor
    ShadowedVec_t() = default;

    // Constructors from an std::vector. The shadow is sized to match the active buffer
    explicit ShadowedVec_t(const std::vector<Value_t>& vec)
        : North(vec), South(), active(north) {
        South.resize(North.size());
    }

    explicit ShadowedVec_t(std::vector<Value_t>&& vec)
        : North(std::move(vec)), South(), active(north) {
        South.resize(North.size());
    }

    // All members manage themselves, so copy and move are the member-wise defaults.
    // Declaring them together keeps the move constructor available.
    ShadowedVec_t(const ShadowedVec_t&)            = default;
    ShadowedVec_t(ShadowedVec_t&&)                 = default;
    ShadowedVec_t& operator=(const ShadowedVec_t&) = default;
    ShadowedVec_t& operator=(ShadowedVec_t&&)      = default;

    // Type accessors
    std::vector<Value_t>& getActive() { return (active == north) ? North : South; }
    std::vector<Value_t>& getShadow() { return (active == north) ? South : North; }
    const std::vector<Value_t>& getActive() const { return (active == north) ? North : South; }
    const std::vector<Value_t>& getShadow() const { return (active == north) ? South : North; }

    // Swap the roles of the two vectors. No data are moved
    void switch_active() { active = (active == north) ? south : north; }

    // Dispatch vector functionality to active vector
    Value_t& operator[](size_type index) { return getActive()[index]; }
    const Value_t& operator[](size_type index) const { return getActive()[index]; }

    Value_t& at(size_type index) { return getActive().at(index); }
    const Value_t& at(size_type index) const { return getActive().at(index); }

    void push_back(const Value_t& value) { getActive().push_back(value); }
    void push_back(Value_t&& value) { getActive().push_back(std::move(value)); }
    void pop_back() { getActive().pop_back(); }
    Value_t& front() { return getActive().front(); }
    Value_t& back() { return getActive().back(); }
    const Value_t& front() const { return getActive().front(); }
    const Value_t& back() const { return getActive().back(); }

    iterator begin() { return getActive().begin(); }
    const_iterator begin() const { return getActive().begin(); }
    iterator end() { return getActive().end(); }
    const_iterator end() const { return getActive().end(); }

    size_type size() const { return getActive().size(); }
    void resize(size_t new_size) {
        North.resize(new_size);
        South.resize(new_size);
    }

    void reserve(size_t new_capacity) {
        North.reserve(new_capacity);
        South.reserve(new_capacity);
    }
    [[nodiscard]] size_t capacity() const { return getActive().capacity(); }
    [[nodiscard]] bool empty() const { return getActive().empty(); }

    void clear() { getActive().clear(); }
    void swap(std::vector<Value_t>& other) { getActive().swap(other); }

    // Comparisons of the active buffers. They only read, so they are const members
    bool operator== (const ShadowedVec_t& other) const { return getActive() == other.getActive(); }
    bool operator!= (const ShadowedVec_t& other) const { return getActive() != other.getActive(); }
    bool operator== (const std::vector<value_type>& other) const { return getActive() == other; }
    bool operator!= (const std::vector<value_type>& other) const { return getActive() != other; }

private:
    std::vector<Value_t> North{};   //!< Actual buffer to be used either as active or shadow
    std::vector<Value_t> South{};   //!< Actual buffer to be used either as active or shadow
    enum {
        north, south
    } active{north};                //!< Flag to select between North and South buffer
};
|
||||
|
||||
/*
|
||||
* Exported data types
|
||||
*/
|
||||
using distBuffer_t = ShadowedVec_t<distValue_t>;
|
||||
extern distBuffer_t Data;
|
||||
|
||||
/*!
|
||||
* A Logger for entire program.
|
||||
*/
|
||||
|
@ -25,8 +25,7 @@ protected:
|
||||
|
||||
|
||||
/*
|
||||
* MPI: SysTest (acceptance)
|
||||
* Each process executes distBubbletonic for uin8_t [16]
|
||||
*
|
||||
*/
|
||||
TEST_F(TCUDAbitonic, test1) {
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user