@@ -25,12 +25,12 @@ PROJECT := PDS_homework_3 | |||||
TARGET := bitonic | TARGET := bitonic | ||||
# Source directories list(space separated). Makefile-relative path, UNDER current directory. | # Source directories list(space separated). Makefile-relative path, UNDER current directory. | ||||
SRC_DIR_LIST := src test test/gtest | |||||
SRC_DIR_LIST := src #test test/gtest | |||||
# Include directories list(space separated). Makefile-relative path. | # Include directories list(space separated). Makefile-relative path. | ||||
INC_DIR_LIST := src \ | |||||
test \ | |||||
test/gtest/ \ | |||||
INC_DIR_LIST := src | |||||
# test \ | |||||
# test/gtest/ \ | |||||
# Exclude files list(space separated). Filenames only. | # Exclude files list(space separated). Filenames only. | ||||
@@ -45,10 +45,10 @@ OUTPUT_DIR := out | |||||
# ========== Compiler settings ========== | # ========== Compiler settings ========== | ||||
# Compiler flags for debug and release | # Compiler flags for debug and release | ||||
DEB_CFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c11 -fopenmp | |||||
REL_CFLAGS := -Wall -Wextra -O3 -std=c11 -fopenmp | |||||
DEB_CXXFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c++17 -fopenmp | |||||
REL_CXXFLAGS := -Wall -Wextra -O3 -std=c++17 -fopenmp | |||||
DEB_CFLAGS := -DDEBUG -g3 -std=c11 -Xcompiler "-Wall -Wextra" | |||||
REL_CFLAGS := -O3 -std=c11 -Xcompiler "-Wall -Wextra" | |||||
DEB_CXXFLAGS := -DDEBUG -g3 -std=c++17 -Xcompiler "-Wall -Wextra" | |||||
REL_CXXFLAGS := -O3 -std=c++17 -Xcompiler "-Wall -Wextra" | |||||
# Pre-defines | # Pre-defines | ||||
# PRE_DEFS := MYCAB=1729 SUPER_MODE | # PRE_DEFS := MYCAB=1729 SUPER_MODE | ||||
@@ -56,15 +56,15 @@ PRE_DEFS := | |||||
# ============== Linker settings ============== | # ============== Linker settings ============== | ||||
# Linker flags (example: -pthread -lm) | # Linker flags (example: -pthread -lm) | ||||
LDFLAGS := -pthread | |||||
LDFLAGS := | |||||
# Map output file | # Map output file | ||||
MAP_FILE := output.map | |||||
MAP_FLAG := -Xlinker -Map=$(BUILD_DIR)/$(MAP_FILE) | |||||
MAP_FILE := # output.map | |||||
MAP_FLAG := # -Xlinker -Map=$(BUILD_DIR)/$(MAP_FILE) | |||||
# ============== Docker settings ============== | # ============== Docker settings ============== | ||||
# We need: | # We need: | ||||
# - Bind the entire project directory(the dir that icludes all the code) as volume. | |||||
# - Bind the entire project directory(the dir that includes all the code) as volume. | |||||
# - In docker instance, change to working directory(where the makefile is). | # - In docker instance, change to working directory(where the makefile is). | ||||
DOCKER_VOL_DIR := $(shell pwd) | DOCKER_VOL_DIR := $(shell pwd) | ||||
DOCKER_WRK_DIR := | DOCKER_WRK_DIR := | ||||
@@ -85,6 +85,7 @@ CFLAGS := $(DEB_CFLAGS) | |||||
CXXFLAGS := $(DEB_CXXFLAGS) | CXXFLAGS := $(DEB_CXXFLAGS) | ||||
CXX := g++ #mpic++ | CXX := g++ #mpic++ | ||||
CC := gcc #mpicc | CC := gcc #mpicc | ||||
LINKER := g++ | |||||
# | # | ||||
# =========== Main body and Patterns =========== | # =========== Main body and Patterns =========== | ||||
@@ -117,37 +118,37 @@ DEP := $(foreach file,$(SRC:%.cpp=%.d),$(DEP_DIR)/$(file)) | |||||
# It is based on Tom Tromey's method. | # It is based on Tom Tromey's method. | ||||
# | # | ||||
# Invoke cpp to create makefile rules with dependencies for each source file | # Invoke cpp to create makefile rules with dependencies for each source file | ||||
$(DEP_DIR)/%.d: %.c | |||||
@mkdir -p $(@D) | |||||
@$(DOCKER) $(CC) -E $(CFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.c=.o) -MF $@ $< | |||||
#$(DEP_DIR)/%.d: %.c | |||||
# @mkdir -p $(@D) | |||||
# @$(DOCKER) $(CC) -E $(CFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.c=.o) -MF $@ $< | |||||
# c file objects depend on .c AND dependency files, which have an empty recipe | # c file objects depend on .c AND dependency files, which have an empty recipe | ||||
$(OBJ_DIR)/%.o: %.c $(DEP_DIR)/%.d | |||||
$(OBJ_DIR)/%.o: %.c | |||||
@mkdir -p $(@D) | @mkdir -p $(@D) | ||||
@$(DOCKER) $(CC) -c $(CFLAGS) $(INC) $(DEF) -o $@ $< | |||||
$(DOCKER) $(CC) -c $(CFLAGS) $(INC) $(DEF) -o $@ $< | |||||
$(DEP_DIR)/%.d: %.cpp | |||||
@mkdir -p $(@D) | |||||
@$(DOCKER) $(CXX) -E $(CXXFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.cpp=.o) -MF $@ $< | |||||
#$(DEP_DIR)/%.d: %.cpp | |||||
# @mkdir -p $(@D) | |||||
# @$(DOCKER) $(CXX) -E $(CXXFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.cpp=.o) -MF $@ $< | |||||
# cpp file objects depent on .cpp AND dependency files, which have an empty recipe | |||||
$(OBJ_DIR)/%.o: %.cpp $(DEP_DIR)/%.d | |||||
# cpp file objects depend on .cpp AND dependency files, which have an empty recipe | |||||
$(OBJ_DIR)/%.o: %.cpp | |||||
@mkdir -p $(@D) | @mkdir -p $(@D) | ||||
@$(DOCKER) $(CXX) -c $(CXXFLAGS) $(INC) $(DEF) -o $@ $< | |||||
$(DOCKER) $(CXX) -c $(CXXFLAGS) $(INC) $(DEF) -o $@ $< | |||||
# empty recipe for dependency files. This prevents make errors | # empty recipe for dependency files. This prevents make errors | ||||
$(DEP): | |||||
#$(DEP): | |||||
# now include all dependencies | # now include all dependencies | ||||
# After all they are makefile dependency rules ;) | # After all they are makefile dependency rules ;) | ||||
include $(wildcard $(DEP)) | |||||
#include $(wildcard $(DEP)) | |||||
# main target rule | # main target rule | ||||
$(BUILD_DIR)/$(TARGET): $(OBJ) | $(BUILD_DIR)/$(TARGET): $(OBJ) | ||||
@mkdir -p $(@D) | @mkdir -p $(@D) | ||||
@echo Linking to target: $(TARGET) | @echo Linking to target: $(TARGET) | ||||
@echo $(DOCKER) $(CXX) '$$(OBJ)' $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET) | |||||
@$(DOCKER) $(CXX) $(OBJ) $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET) | |||||
@echo $(DOCKER) $(LINKER) '$$(OBJ)' $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET) | |||||
@$(DOCKER) $(LINKER) $(OBJ) $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET) | |||||
@echo | @echo | ||||
@echo Print size information | @echo Print size information | ||||
@$(CSIZE) $(@D)/$(TARGET) | @$(CSIZE) $(@D)/$(TARGET) | ||||
@@ -179,10 +180,12 @@ release: $(BUILD_DIR)/$(TARGET) | |||||
# | # | ||||
bitonic_v0: CC := nvcc | |||||
bitonic_v0: CXX := nvcc | |||||
bitonic_v0: CC := nvcc -x cu | |||||
bitonic_v0: CXX := nvcc -x cu | |||||
bitonic_v0: LINKER := nvcc | |||||
bitonic_v0: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V0 | bitonic_v0: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V0 | ||||
bitonic_v0: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V0 | bitonic_v0: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V0 | ||||
bitonic_v0: OUTPUT_DIR := $(OUTPUT_DIR)/v0 | |||||
bitonic_v0: TARGET := bitonic_v0 | bitonic_v0: TARGET := bitonic_v0 | ||||
bitonic_v0: $(BUILD_DIR)/$(TARGET) | bitonic_v0: $(BUILD_DIR)/$(TARGET) | ||||
@mkdir -p $(OUTPUT_DIR) | @mkdir -p $(OUTPUT_DIR) | ||||
@@ -191,11 +194,7 @@ bitonic_v0: $(BUILD_DIR)/$(TARGET) | |||||
hpc-build: | hpc-build: | ||||
make clean | make clean | ||||
make distbubbletonic | |||||
make clean | |||||
make distbitonic | |||||
make clean | |||||
make tests | |||||
make bitonic_v0 | |||||
all: debug bitonic_v0 | all: debug bitonic_v0 | ||||
@@ -0,0 +1,33 @@ | |||||
Parallel & Distributed Computer Systems HW3 | |||||
January, 2025 | |||||
Write a program that sorts $N$ integers in ascending order, using CUDA. | |||||
The program must perform the following tasks: | |||||
- The user specifies a positive integer $q$. | |||||
- Start a process with an array of $N = 2^q$ random integers. | |||||
- Sort all $N$ elements in ascending order. | |||||
- Check the correctness of the final result. | |||||
Your implementation should be based on the following steps: | |||||
V0. A kernel where each thread only compares and exchanges. This "eliminates" the 1:n innermost loop. Easy to write, but too many function calls and global synchronizations. | |||||
V1. Include the k inner loop in the kernel function. How do we handle the synchronization? Fewer calls, fewer global synchronizations. Faster than V0! | |||||
V2. Modify the kernel of V1 to work with local memory instead of global. | |||||
You must deliver: | |||||
- A report (about $3-4$ pages) that describes your parallel algorithm and implementation. | |||||
- Your comments on the speed of your parallel program compared to the serial sort, after trying your program on aristotelis for $q = [20:27]$. | |||||
- The source code of your program uploaded online. | |||||
Ethics: If you use code found on the web or by an LLM, you should mention your source and the changes you made. You may work in pairs; both partners must submit a single report with both names. | |||||
Deadline: 2 February, $2025$. |
@@ -0,0 +1,145 @@ | |||||
/*! | |||||
* \file | |||||
* \brief Bitonic sort CUDA implementation header | |||||
* | |||||
* \author | |||||
* Christos Choutouridis AEM:8997 | |||||
* <cchoutou@ece.auth.gr> | |||||
*/ | |||||
#ifndef BITONICSORTCUDA_H_ | |||||
#define BITONICSORTCUDA_H_ | |||||
#include <cuda_runtime.h>
#include <cmath>
#include <cstdint>
#include <stdexcept>
#include <utility>
#include <vector>
#include "utils.hpp"
/* | |||||
* Exported timers | |||||
*/ | |||||
extern Timing Timer_total; | |||||
using threadId_t = size_t; | |||||
/* | |||||
* ============================== Sort utilities ============================== | |||||
*/ | |||||
/*!
 * Returns the ascending or descending configuration (up/down phase) of the thread id
 * depending on the current stage of the sorting network.
 *
 * The answer is taken from bit \p stage of \p tid: a cleared bit selects the
 * ascending configuration, a set bit the descending one.
 *
 * @param tid     [threadId_t]  The current thread
 * @param stage   [size_t]      The current stage of the sorting network (same for each step)
 * @return        [bool]        True if we need ascending configuration, false otherwise
 */
__device__ inline bool ascending(threadId_t tid, size_t stage) noexcept {
    // Build the mask in threadId_t width. A plain `1 << stage` is evaluated as int,
    // so it cannot represent the bit once stage reaches the width of int.
    return (tid & (threadId_t{1} << stage)) == 0;
}
/*!
 * Returns the thread's partner for data exchange during the sorting network iterations
 * of Bitonic.
 *
 * The partner is the id that differs from \p tid only in bit \p step.
 *
 * @param tid     [threadId_t]  The current thread
 * @param step    [size_t]      The step of the sorting network
 * @return        [threadId_t]  The thread id of the partner for data exchange
 */
__device__ inline threadId_t partner(threadId_t tid, size_t step) noexcept {
    // Build the mask in threadId_t width. A plain `1 << step` is evaluated as int,
    // so it cannot represent the bit once step reaches the width of int.
    return tid ^ (threadId_t{1} << step);
}
/*!
 * Predicate telling whether a thread keeps the small value of a compare-exchange
 * in the bitonic sorting network.
 *
 * The thread keeps the small value when "it is the lower id of the pair" and
 * "its sequence is in ascending configuration" are either both true or both false.
 *
 * @param tid      [threadId_t]  The thread for which we check
 * @param partner  [threadId_t]  The partner of the data exchange
 * @param stage    [size_t]      The current stage of the sorting network (same for each step)
 * @return         [bool]        True if the thread should keep the small value, false otherwise
 */
__device__ inline bool keepSmall(threadId_t tid, threadId_t partner, size_t stage) {
    const bool isLowerId = (tid < partner);
    const bool goingUp   = ascending(tid, stage);
    return goingUp == isLowerId;
}
/* | |||||
* ============================== Sort algorithms ============================== | |||||
*/ | |||||
/*!
 * Compare-and-exchange of two elements of the buffer.
 *
 * Swaps data[tid] and data[partner] when they are out of order with respect to
 * the requested configuration, otherwise leaves them untouched.
 *
 * @tparam ValueT    The underlying type of the buffer (must be comparable with < and >)
 *
 * @param data       [ValueT*]     The buffer to operate on
 * @param tid        [threadId_t]  Index of the calling thread's element
 * @param partner    [threadId_t]  Index of the partner's element
 * @param keepSmall  [bool]        True if data[tid] must end up with the smaller value,
 *                                 false if it must end up with the larger one
 */
template <typename ValueT>
__device__ void cudaExchange(ValueT* data, threadId_t tid, threadId_t partner, bool keepSmall) {
    // Indices are taken as threadId_t (not int) so that the ids computed by the
    // caller are not narrowed on the way in.
    const ValueT mine   = data[tid];
    const ValueT theirs = data[partner];

    const bool outOfOrder = keepSmall ? (mine > theirs) : (mine < theirs);
    if (outOfOrder) {
        data[tid]     = theirs;
        data[partner] = mine;
    }
}
/*!
 * Kernel executing a single step of the bitonic sorting network.
 *
 * Every thread computes its global id and its partner for this step. Only the
 * lower-indexed thread of each pair performs the compare-and-exchange; otherwise both
 * threads of a pair would run the very same swap on the same two elements at the
 * same time, which is redundant work and an unsynchronised concurrent access.
 *
 * @tparam ValueT   The underlying type of the buffer
 *
 * @param data   [ValueT*]  The buffer to sort (the caller passes a device allocation)
 * @param n      [size_t]   The number of elements in the buffer
 * @param step   [size_t]   The current step of the sorting network
 * @param stage  [size_t]   The current stage of the sorting network
 */
template <typename ValueT>
__global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) {
    // Widen before multiplying: the built-in index variables are 32-bit unsigned.
    threadId_t tid = static_cast<threadId_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    threadId_t pid = partner(tid, step);

    // tid < pid < n also guarantees tid < n.
    if (tid < pid && pid < n) {
        bool keep = keepSmall(tid, pid, stage);
        cudaExchange(data, tid, pid, keep);
    }
}
/*!
 * Bitonic sort of a host buffer using CUDA, with one kernel launch per network step.
 *
 * The buffer is copied to a device allocation, every (stage, step) pair of the sorting
 * network is executed by a separate bitonicStep launch followed by a device
 * synchronization, and the result is copied back over the input.
 *
 * @note
 *    The network is built for floor(log2(size)) stages, so the size of the buffer is
 *    expected to be a power of two. Buffers with fewer than two elements are returned
 *    untouched.
 *
 * @tparam DataT   A contiguous buffer type providing value_type, size() and data()
 *
 * @param data  [DataT]  The host data to sort in place
 *
 * @throws std::runtime_error if a CUDA runtime call or a kernel launch fails
 */
template <typename DataT>
void bitonicSort(DataT& data) {
    using value_t = typename DataT::value_type;

    const size_t size = data.size();
    if (size < 2)
        return;     // nothing to sort; also avoids log2(0) and a zero-block launch

    const size_t bytes = size * sizeof(value_t);

    // Turn any CUDA error code into an exception instead of silently continuing
    auto check = [](cudaError_t err) {
        if (err != cudaSuccess)
            throw std::runtime_error(cudaGetErrorString(err));
    };

    value_t* dev_data = nullptr;
    check(cudaMalloc(&dev_data, bytes));

    try {
        check(cudaMemcpy(dev_data, data.data(), bytes, cudaMemcpyHostToDevice));

        const int Nthreads = 1024;
        const int Nblocks  = static_cast<int>((size + Nthreads - 1) / Nthreads);

        const size_t max_depth = static_cast<size_t>(log2(size));
        for (size_t stage = 1; stage <= max_depth; ++stage) {
            for (size_t step = stage; step > 0; ) {
                --step;
                bitonicStep<<<Nblocks, Nthreads>>>(dev_data, size, step, stage);
                check(cudaGetLastError());          // launch configuration errors
                check(cudaDeviceSynchronize());     // errors raised while the kernel ran
            }
        }

        check(cudaMemcpy(data.data(), dev_data, bytes, cudaMemcpyDeviceToHost));
    }
    catch (...) {
        cudaFree(dev_data);     // do not leak the device buffer on failure
        throw;
    }
    cudaFree(dev_data);
}
#endif //BITONICSORTCUDA_H_ |
@@ -35,7 +35,7 @@ static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16; | |||||
/*! | /*! | ||||
* Value type selection | |||||
* Value and Buffer type selection | |||||
* | * | ||||
* We support the following compiler types or the <cstdint> that translate to them: | * We support the following compiler types or the <cstdint> that translate to them: | ||||
* char - unsigned char | * char - unsigned char | ||||
@@ -46,7 +46,8 @@ static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16; | |||||
* float | * float | ||||
* double | * double | ||||
*/ | */ | ||||
using distValue_t = uint32_t; | |||||
using Value_t = uint32_t; | |||||
using Data_t = std::vector<Value_t>; | |||||
/*! | /*! | ||||
* Session option for each invocation of the executable. | * Session option for each invocation of the executable. | ||||
@@ -1,51 +0,0 @@ | |||||
/*! | |||||
* \file | |||||
* \brief Distributed sort implementation | |||||
* | |||||
* \author | |||||
* Christos Choutouridis AEM:8997 | |||||
* <cchoutou@ece.auth.gr> | |||||
*/ | |||||
#include "utils.hpp" | |||||
#include "distsort.hpp" | |||||
/*! | |||||
* Returns the ascending or descending configuration of the node's sequence based on | |||||
* the current node (MPI process) and the depth of the sorting network | |||||
* | |||||
* @param node [mpi_id_t] The current node (MPI process) | |||||
* @param depth [size_t] The total depth of the sorting network (same for each step for a given network) | |||||
* @return [bool] True if we need ascending configuration, false otherwise | |||||
*/ | |||||
bool ascending(mpi_id_t node, size_t depth) noexcept { | |||||
return !(node & (1 << depth)); | |||||
} | |||||
/*! | |||||
* Returns the node's partner for data exchange during the sorting network iterations | |||||
* of Bitonic | |||||
* | |||||
* @param node [mpi_id_t] The current node | |||||
* @param step [size_t] The step of the sorting network | |||||
* @return [mpi_id_t] The node id of the partner for data exchange | |||||
*/ | |||||
mpi_id_t partner(mpi_id_t node, size_t step) noexcept { | |||||
return (node ^ (1 << step)); | |||||
} | |||||
/*! | |||||
* Predicate to check if a node keeps the small numbers during the bitonic sort network exchange. | |||||
* | |||||
* @param node [mpi_id_t] The node for which we check | |||||
* @param partner [mpi_id_t] The partner of the data exchange | |||||
* @param depth [size_t] The total depth of the sorting network (same for each step for a given network) | |||||
* @return [bool] True if the node should keep the small values, false otherwise | |||||
*/ | |||||
bool keepSmall(mpi_id_t node, mpi_id_t partner, size_t depth) { | |||||
if (node == partner) | |||||
throw std::runtime_error("(keepSmall) Node and Partner can not be the same\n"); | |||||
return ascending(node, depth) == (node < partner); | |||||
} |
@@ -1,223 +0,0 @@ | |||||
/*! | |||||
* \file | |||||
* \brief Distributed sort implementation header | |||||
* | |||||
* \author | |||||
* Christos Choutouridis AEM:8997 | |||||
* <cchoutou@ece.auth.gr> | |||||
*/ | |||||
#ifndef DISTBITONIC_H_ | |||||
#define DISTBITONIC_H_ | |||||
#include <vector> | |||||
#include <algorithm> | |||||
#include <parallel/algorithm> | |||||
#include <cmath> | |||||
#include <cstdint> | |||||
#if !defined DEBUG | |||||
#define NDEBUG | |||||
#endif | |||||
#include <cassert> | |||||
#include "utils.hpp" | |||||
/* | |||||
* Exported timers | |||||
*/ | |||||
extern Timing Timer_total; | |||||
extern Timing Timer_fullSort; | |||||
extern Timing Timer_exchange; | |||||
extern Timing Timer_minmax; | |||||
extern Timing Timer_elbowSort; | |||||
/* | |||||
* ============================== Sort utilities ============================== | |||||
*/ | |||||
/*! | |||||
* Returns the ascending or descending configuration of the node's sequence based on | |||||
* the current node (MPI process) and the depth of the sorting network | |||||
* | |||||
* @param node [mpi_id_t] The current node (MPI process) | |||||
* @param depth [size_t] The total depth of the sorting network (same for each step for a given network) | |||||
* @return [bool] True if we need ascending configuration, false otherwise | |||||
*/ | |||||
bool ascending(mpi_id_t node, size_t depth); | |||||
/*! | |||||
* Returns the node's partner for data exchange during the sorting network iterations | |||||
* of Bitonic | |||||
* | |||||
* @param node [mpi_id_t] The current node | |||||
* @param step [size_t] The step of the sorting network | |||||
* @return [mpi_id_t] The node id of the partner for data exchange | |||||
*/ | |||||
mpi_id_t partner(mpi_id_t node, size_t step); | |||||
/*! | |||||
* Predicate to check if a node keeps the small numbers during the bitonic sort network exchange. | |||||
* | |||||
* @param node [mpi_id_t] The node for which we check | |||||
* @param partner [mpi_id_t] The partner of the data exchange | |||||
* @param depth [size_t] The total depth of the sorting network (same for each step for a given network) | |||||
* @return [bool] True if the node should keep the small values, false otherwise | |||||
*/ | |||||
bool keepSmall(mpi_id_t node, mpi_id_t partner, size_t depth); | |||||
/* | |||||
* ============================== Data utilities ============================== | |||||
*/ | |||||
/*! | |||||
* Sort a range using the build-in O(Nlog(N)) algorithm | |||||
* | |||||
* @tparam RangeT A range type with random access iterator | |||||
* | |||||
* @param data [RangeT] The data to be sorted | |||||
* @param ascending [bool] Flag to indicate the sorting order | |||||
*/ | |||||
template<typename RangeT> | |||||
void fullSort(RangeT& data, bool ascending) noexcept { | |||||
// Use introsort from stdlib++ here, unless ... __gnu_parallel | |||||
if (ascending) { | |||||
__gnu_parallel::sort(data.begin(), data.end(), std::less<>()); | |||||
} | |||||
else { | |||||
__gnu_parallel::sort(data.begin(), data.end(), std::greater<>()); | |||||
} | |||||
} | |||||
/*!
 * Core of the "elbow sort" for shadowed buffer types.
 *
 * Starting from the elbow of the sequence (its minimum for ascending order, its maximum
 * for descending order) two cursors walk away from each other, cyclically, and at every
 * iteration the element selected by the comparison is appended to the target.
 *
 * @note
 *    This algorithm can not work "in place".
 *    The active buffer is the source and the shadow is the target.
 *    At the end the roles of the two buffers are switched.
 * @note
 *    This is the core functionality. Use the elbowSort() function instead
 *
 * @tparam ShadowedDataT  A Shadowed buffer type with random access iterator.
 * @tparam CompT          A Comparison type for binary operation comparisons
 *
 * @param data       [ShadowedDataT]  The data to sort
 * @param ascending  [bool]           Flag to indicate the sorting order
 * @param comp       [CompT]          The binary operator object
 */
template<typename ShadowedDataT, typename CompT>
void elbowSortCore(ShadowedDataT& data, bool ascending, CompT comp) noexcept {
    auto& source = data.getActive();        // the data to sort
    auto& target = data.getShadow();        // receives the sorted data
    const size_t count = data.size();       // both buffers have the same size

    // Locate the elbow of the bitonic sequence
    auto elbow = (ascending)
        ? std::min_element(source.begin(), source.end())
        : std::max_element(source.begin(), source.end());

    size_t backward = std::distance(source.begin(), elbow);         // walks towards index 0
    size_t forward  = (backward == count-1) ? 0 : backward + 1;     // walks towards the end

    // Walk in opposite directions from the elbow and fill the target in order
    for (size_t out = 0 ; out < count ; ++out) {
        const bool takeBackward = comp(source[backward], source[forward]);
        if (takeBackward) {
            target[out] = source[backward];
            backward = (backward == 0) ? count-1 : backward - 1;    // cyclic decrease
        }
        else {
            target[out] = source[forward];
            forward = (forward + 1) % count;                        // cyclic increase
        }
    }
    data.switch_active();                   // the target becomes the active buffer
}
/*!
 * Sorts a shadowed buffer with the "elbow sort" algorithm, selecting the comparison
 * object that matches the requested order.
 *
 * @tparam ShadowedDataT  A Shadowed buffer type with random access iterator.
 *
 * @param data       [ShadowedDataT]  The data to sort
 * @param ascending  [bool]           Flag to indicate the sorting order
 */
template<typename ShadowedDataT>
void elbowSort(ShadowedDataT& data, bool ascending) noexcept {
    if (!ascending) {
        elbowSortCore(data, ascending, std::greater<>());
        return;
    }
    elbowSortCore(data, ascending, std::less<>());
}
/*!
 * Element-wise selection between two sequences: every item of the local sequence is
 * replaced by either the smaller or the larger of itself and the item at the same
 * position of the remote sequence.
 *
 * @tparam ValueT     The underlying type of the sequences
 *
 * @param local      [ValueT*]        Pointer to the local sequence (updated in place)
 * @param remote     [const ValueT*]  Pointer to the remote sequence
 * @param count      [size_t]         The number of items to process
 * @param keepSmall  [bool]           Flag to indicate if we keep the small items in local sequence
 */
template<typename ValueT>
void keepMinOrMax(ValueT* local, const ValueT* remote, size_t count, bool keepSmall) noexcept {
    for (size_t i = 0; i < count; ++i) {
        const ValueT chosen = (keepSmall)
            ? std::min(local[i], remote[i])
            : std::max(local[i], remote[i]);
        local[i] = chosen;
    }
}
/* | |||||
* ============================== Sort algorithms ============================== | |||||
*/ | |||||
/*! | |||||
* A distributed version of the Bitonic sort algorithm. | |||||
* | |||||
* @note | |||||
* Each MPI process should run an instance of this function. | |||||
* | |||||
* @tparam ShadowedDataT A Shadowed buffer type with random access iterator. | |||||
* | |||||
* @param data [ShadowedDataT] The local to MPI process data to sort | |||||
* @param Processes [mpi_id_t] The total number of MPI processes | |||||
* @param rank [mpi_id_t] The current process id | |||||
*/ | |||||
template<typename ShadowedDataT> | |||||
void distBitonic(ShadowedDataT& data) { | |||||
// Initially sort to create a half part of a bitonic sequence | |||||
timeCall(Timer_fullSort, fullSort, data, ascending(rank, 0)); | |||||
// Run through sort network using elbow-sort ( O(LogN * LogN) iterations ) | |||||
auto p = static_cast<uint32_t>(std::log2(Processes)); | |||||
for (size_t depth = 1; depth <= p; ++depth) { | |||||
for (size_t step = depth; step > 0;) { | |||||
--step; | |||||
// Find out exchange configuration | |||||
auto part = partner(rank, step); | |||||
auto ks = keepSmall(rank, part, depth); | |||||
// Exchange with partner, keep nim-or-max | |||||
exchange(data, part, ks, tag); | |||||
} | |||||
// sort - O(N) | |||||
timeCall(Timer_elbowSort, elbowSort, data, ascending(rank, depth)); | |||||
} | |||||
} | |||||
#endif //DISTBITONIC_H_ |
@@ -14,34 +14,26 @@ | |||||
#include "utils.hpp" | #include "utils.hpp" | ||||
#include "config.h" | #include "config.h" | ||||
#include "distsort.hpp" | |||||
#include "bitonicsort.hpp" | |||||
// Global session data | // Global session data | ||||
Data_t Data; | |||||
config_t config; | config_t config; | ||||
distBuffer_t Data; | |||||
Log logger; | Log logger; | ||||
// Mersenne seeded from hw if possible. range: [type_min, type_max] | // Mersenne seeded from hw if possible. range: [type_min, type_max] | ||||
std::random_device rd; | std::random_device rd; | ||||
std::mt19937 gen(rd()); | std::mt19937 gen(rd()); | ||||
//! Performance timers for each one of the "costly" functions | //! Performance timers for each one of the "costly" functions | ||||
Timing Timer_total; | Timing Timer_total; | ||||
Timing Timer_fullSort; | |||||
Timing Timer_exchange; | |||||
Timing Timer_minmax; | |||||
Timing Timer_elbowSort; | |||||
//! Init timing objects for extra rounds | //! Init timing objects for extra rounds | ||||
void measurements_init() { | void measurements_init() { | ||||
if (config.perf > 1) { | if (config.perf > 1) { | ||||
Timer_total.init(config.perf); | Timer_total.init(config.perf); | ||||
Timer_fullSort.init(config.perf); | |||||
Timer_exchange.init(config.perf); | |||||
Timer_minmax.init(config.perf); | |||||
Timer_elbowSort.init(config.perf); | |||||
} | } | ||||
} | } | ||||
@@ -49,10 +41,6 @@ void measurements_init() { | |||||
void measurements_next() { | void measurements_next() { | ||||
if (config.perf > 1) { | if (config.perf > 1) { | ||||
Timer_total.next(); | Timer_total.next(); | ||||
Timer_fullSort.next(); | |||||
Timer_exchange.next(); | |||||
Timer_minmax.next(); | |||||
Timer_elbowSort.next(); | |||||
} | } | ||||
} | } | ||||
@@ -136,20 +124,14 @@ bool get_options(int argc, char* argv[]){ | |||||
/*! | /*! | ||||
* A simple validator for the entire distributed process | * A simple validator for the entire distributed process | ||||
* | * | ||||
* @tparam ShadowedDataT A Shadowed buffer type with random access iterator. | |||||
* @tparam DataT A buffer type with random access iterator. | |||||
* | * | ||||
* @param data [ShadowedDataT] The local to MPI process | |||||
* @param Processes [mpi_id_t] The total number of MPI processes | |||||
* @param rank [mpi_id_t] The current process id | |||||
* | |||||
* @return [bool] True if all are sorted and in total ascending order | |||||
* @param data [DataT] The data | |||||
* @return [bool] True if sorted in ascending order | |||||
*/ | */ | ||||
template<typename ShadowedDataT> | |||||
bool validator(ShadowedDataT& data) { | |||||
using value_t = typename ShadowedDataT::value_type; | |||||
bool ret = true; // Have faith! | |||||
return ret; | |||||
template<typename DataT> | |||||
bool validator(DataT& data) { | |||||
return std::is_sorted(data.begin(), data.end()); | |||||
} | } | ||||
/*! | /*! | ||||
@@ -180,15 +162,15 @@ int main(int argc, char* argv[]) try { | |||||
for (size_t it = 0 ; it < config.perf ; ++it) { | for (size_t it = 0 ; it < config.perf ; ++it) { | ||||
// Initialize local data | // Initialize local data | ||||
logger << "Initialize local array of " << config.arraySize << " elements" << logger.endl; | logger << "Initialize local array of " << config.arraySize << " elements" << logger.endl; | ||||
std::uniform_int_distribution<distValue_t > dis( | |||||
std::numeric_limits<distValue_t>::min(), | |||||
std::numeric_limits<distValue_t>::max() | |||||
std::uniform_int_distribution<Value_t > dis( | |||||
std::numeric_limits<Value_t>::min(), | |||||
std::numeric_limits<Value_t>::max() | |||||
); | ); | ||||
std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); }); | std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); }); | ||||
// Run distributed sort | // Run distributed sort | ||||
logger << "Starting distributed sorting ... "; | logger << "Starting distributed sorting ... "; | ||||
Timer_total.start(); | Timer_total.start(); | ||||
distBitonic(Data); | |||||
bitonicSort(Data); | |||||
Timer_total.stop(); | Timer_total.stop(); | ||||
measurements_next(); | measurements_next(); | ||||
logger << " Done." << logger.endl; | logger << " Done." << logger.endl; | ||||
@@ -196,11 +178,7 @@ int main(int argc, char* argv[]) try { | |||||
// Print-outs and validation | // Print-outs and validation | ||||
if (config.perf > 1) { | if (config.perf > 1) { | ||||
Timing::print_duration(Timer_total.median(), "Total ", 0); | |||||
Timing::print_duration(Timer_fullSort.median(), "Full-Sort ", 0); | |||||
Timing::print_duration(Timer_exchange.median(), "Exchange ", 0); | |||||
Timing::print_duration(Timer_minmax.median(), "Min-Max ", 0); | |||||
Timing::print_duration(Timer_elbowSort.median(),"Elbow-Sort", 0); | |||||
Timing::print_duration(Timer_total.median(), "Total"); | |||||
} | } | ||||
if (config.validation) { | if (config.validation) { | ||||
// If requested, we have the chance to fail! | // If requested, we have the chance to fail! | ||||
@@ -18,124 +18,6 @@ | |||||
#include "config.h" | #include "config.h" | ||||
/*!
 * @brief A std::vector wrapper with 2 vectors, an active and a shadow.
 *
 * This type exposes the standard vector functionality of the active vector.
 * The shadow can be used when we need to use the vector as mutable
 * data in algorithms that can not support "in-place" editing (like elbow-sort for example)
 *
 * @note
 *    resize() and reserve() act on both buffers. The rest of the modifiers
 *    (push_back, pop_back, clear, swap) act on the active buffer only.
 *
 * @tparam Value_t the underlying data type of the vectors
 */
template <typename Value_t>
struct ShadowedVec_t {
    // STL requirements
    using value_type     = Value_t;
    using iterator       = typename std::vector<Value_t>::iterator;
    using const_iterator = typename std::vector<Value_t>::const_iterator;
    using size_type      = typename std::vector<Value_t>::size_type;

    // Default constructor
    ShadowedVec_t() = default;

    // Constructor from an std::vector. The shadow gets the same size as the given vector
    explicit ShadowedVec_t(const std::vector<Value_t>& vec)
        : North(vec), South(), active(north) {
        South.resize(North.size());
    }
    explicit ShadowedVec_t(std::vector<Value_t>&& vec)
        : North(std::move(vec)), South(), active(north) {
        South.resize(North.size());
    }

    // Copy and move constructors.
    // They have to be spelled out: the user-declared move assignment below would otherwise
    // leave the copy constructor deleted and the move constructor not declared at all.
    ShadowedVec_t(const ShadowedVec_t&) = default;
    ShadowedVec_t(ShadowedVec_t&&) noexcept = default;

    // Copy assignment operator
    ShadowedVec_t& operator=(const ShadowedVec_t& other) {
        if (this != &other) { // Avoid self-assignment
            North  = other.North;
            South  = other.South;
            active = other.active;
        }
        return *this;
    }

    // Move assignment operator
    ShadowedVec_t& operator=(ShadowedVec_t&& other) noexcept {
        if (this != &other) { // Avoid self-assignment
            North  = std::move(other.North);
            South  = std::move(other.South);
            active = other.active;
            // There is no need to zero out other since it is valid but in a non-defined state
        }
        return *this;
    }

    // Type accessors
    std::vector<Value_t>& getActive() { return (active == north) ? North : South; }
    std::vector<Value_t>& getShadow() { return (active == north) ? South : North; }
    const std::vector<Value_t>& getActive() const { return (active == north) ? North : South; }
    const std::vector<Value_t>& getShadow() const { return (active == north) ? South : North; }

    // Swap vectors (only the selection flag changes, no data are moved)
    void switch_active() { active = (active == north) ? south : north; }

    // Dispatch vector functionality to active vector
    Value_t& operator[](size_type index) { return getActive()[index]; }
    const Value_t& operator[](size_type index) const { return getActive()[index]; }

    Value_t& at(size_type index) { return getActive().at(index); }
    const Value_t& at(size_type index) const { return getActive().at(index); }

    void push_back(const Value_t& value) { getActive().push_back(value); }
    void push_back(Value_t&& value) { getActive().push_back(std::move(value)); }
    void pop_back() { getActive().pop_back(); }

    Value_t& front() { return getActive().front(); }
    Value_t& back() { return getActive().back(); }
    const Value_t& front() const { return getActive().front(); }
    const Value_t& back() const { return getActive().back(); }

    iterator begin() { return getActive().begin(); }
    const_iterator begin() const { return getActive().begin(); }
    iterator end() { return getActive().end(); }
    const_iterator end() const { return getActive().end(); }

    size_type size() const { return getActive().size(); }

    // Both buffers are kept at the same size/capacity
    void resize(size_t new_size) {
        North.resize(new_size);
        South.resize(new_size);
    }
    void reserve(size_t new_capacity) {
        North.reserve(new_capacity);
        South.reserve(new_capacity);
    }

    [[nodiscard]] size_t capacity() const { return getActive().capacity(); }
    [[nodiscard]] bool empty() const { return getActive().empty(); }

    void clear() { getActive().clear(); }
    void swap(std::vector<Value_t>& other) { getActive().swap(other); }

    // Comparisons (active buffers only). They do not modify the object, so they are const
    bool operator== (const ShadowedVec_t& other) const { return getActive() == other.getActive(); }
    bool operator!= (const ShadowedVec_t& other) const { return getActive() != other.getActive(); }
    bool operator== (const std::vector<value_type>& other) const { return getActive() == other; }
    bool operator!= (const std::vector<value_type>& other) const { return getActive() != other; }

private:
    std::vector<Value_t> North{};   //!< Actual buffer to be used either as active or shadow
    std::vector<Value_t> South{};   //!< Actual buffer to be used either as active or shadow
    enum {
        north, south
    } active{north};                //!< Flag to select between North and South buffer
};
/* | |||||
* Exported data types | |||||
*/ | |||||
using distBuffer_t = ShadowedVec_t<distValue_t>; | |||||
extern distBuffer_t Data; | |||||
/*! | /*! | ||||
* A Logger for entire program. | * A Logger for entire program. | ||||
*/ | */ | ||||
@@ -25,8 +25,7 @@ protected: | |||||
/* | /* | ||||
* MPI: SysTest (acceptance) | |||||
* Each process executes distBubbletonic for uin8_t [16] | |||||
* | |||||
*/ | */ | ||||
TEST_F(TCUDAbitonic, test1) { | TEST_F(TCUDAbitonic, test1) { | ||||