@@ -25,12 +25,12 @@ PROJECT := PDS_homework_3 | |||
TARGET := bitonic | |||
# Source directories list (space separated). Makefile-relative path, UNDER current directory.
SRC_DIR_LIST := src test test/gtest | |||
SRC_DIR_LIST := src #test test/gtest | |||
# Include directories list (space separated). Makefile-relative path.
INC_DIR_LIST := src \ | |||
test \ | |||
test/gtest/ \ | |||
INC_DIR_LIST := src | |||
# test \ | |||
# test/gtest/ \ | |||
# Exclude files list (space separated). Filenames only.
@@ -45,10 +45,10 @@ OUTPUT_DIR := out | |||
# ========== Compiler settings ========== | |||
# Compiler flags for debug and release | |||
DEB_CFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c11 -fopenmp | |||
REL_CFLAGS := -Wall -Wextra -O3 -std=c11 -fopenmp | |||
DEB_CXXFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c++17 -fopenmp | |||
REL_CXXFLAGS := -Wall -Wextra -O3 -std=c++17 -fopenmp | |||
DEB_CFLAGS := -DDEBUG -g3 -std=c11 -Xcompiler "-Wall -Wextra" | |||
REL_CFLAGS := -O3 -std=c11 -Xcompiler "-Wall -Wextra" | |||
DEB_CXXFLAGS := -DDEBUG -g3 -std=c++17 -Xcompiler "-Wall -Wextra" | |||
REL_CXXFLAGS := -O3 -std=c++17 -Xcompiler "-Wall -Wextra" | |||
# Pre-defines | |||
# PRE_DEFS := MYCAB=1729 SUPER_MODE | |||
@@ -56,15 +56,15 @@ PRE_DEFS := | |||
# ============== Linker settings ============== | |||
# Linker flags (example: -pthread -lm) | |||
LDFLAGS := -pthread | |||
LDFLAGS := | |||
# Map output file | |||
MAP_FILE := output.map | |||
MAP_FLAG := -Xlinker -Map=$(BUILD_DIR)/$(MAP_FILE) | |||
MAP_FILE := # output.map | |||
MAP_FLAG := # -Xlinker -Map=$(BUILD_DIR)/$(MAP_FILE) | |||
# ============== Docker settings ============== | |||
# We need: | |||
# - Bind the entire project directory(the dir that icludes all the code) as volume. | |||
# - Bind the entire project directory(the dir that includes all the code) as volume. | |||
# - In docker instance, change to working directory(where the makefile is). | |||
DOCKER_VOL_DIR := $(shell pwd) | |||
DOCKER_WRK_DIR := | |||
@@ -85,6 +85,7 @@ CFLAGS := $(DEB_CFLAGS) | |||
CXXFLAGS := $(DEB_CXXFLAGS) | |||
CXX := g++ #mpic++ | |||
CC := gcc #mpicc | |||
LINKER := g++ | |||
# | |||
# =========== Main body and Patterns =========== | |||
@@ -117,37 +118,37 @@ DEP := $(foreach file,$(SRC:%.cpp=%.d),$(DEP_DIR)/$(file)) | |||
# It is based on Tom Tromey's method. | |||
# | |||
# Invoke cpp to create makefile rules with dependencies for each source file | |||
$(DEP_DIR)/%.d: %.c | |||
@mkdir -p $(@D) | |||
@$(DOCKER) $(CC) -E $(CFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.c=.o) -MF $@ $< | |||
#$(DEP_DIR)/%.d: %.c | |||
# @mkdir -p $(@D) | |||
# @$(DOCKER) $(CC) -E $(CFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.c=.o) -MF $@ $< | |||
# c file objects depend on .c AND dependency files, which have an empty recipe
$(OBJ_DIR)/%.o: %.c $(DEP_DIR)/%.d | |||
$(OBJ_DIR)/%.o: %.c | |||
@mkdir -p $(@D) | |||
@$(DOCKER) $(CC) -c $(CFLAGS) $(INC) $(DEF) -o $@ $< | |||
$(DOCKER) $(CC) -c $(CFLAGS) $(INC) $(DEF) -o $@ $< | |||
$(DEP_DIR)/%.d: %.cpp | |||
@mkdir -p $(@D) | |||
@$(DOCKER) $(CXX) -E $(CXXFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.cpp=.o) -MF $@ $< | |||
#$(DEP_DIR)/%.d: %.cpp | |||
# @mkdir -p $(@D) | |||
# @$(DOCKER) $(CXX) -E $(CXXFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.cpp=.o) -MF $@ $< | |||
# cpp file objects depent on .cpp AND dependency files, which have an empty recipe | |||
$(OBJ_DIR)/%.o: %.cpp $(DEP_DIR)/%.d | |||
# cpp file objects depend on .cpp AND dependency files, which have an empty recipe | |||
$(OBJ_DIR)/%.o: %.cpp | |||
@mkdir -p $(@D) | |||
@$(DOCKER) $(CXX) -c $(CXXFLAGS) $(INC) $(DEF) -o $@ $< | |||
$(DOCKER) $(CXX) -c $(CXXFLAGS) $(INC) $(DEF) -o $@ $< | |||
# empty recipe for dependency files. This prevents make errors | |||
$(DEP): | |||
#$(DEP): | |||
# now include all dependencies | |||
# After all they are makefile dependency rules ;) | |||
include $(wildcard $(DEP)) | |||
#include $(wildcard $(DEP)) | |||
# main target rule | |||
$(BUILD_DIR)/$(TARGET): $(OBJ) | |||
@mkdir -p $(@D) | |||
@echo Linking to target: $(TARGET) | |||
@echo $(DOCKER) $(CXX) '$$(OBJ)' $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET) | |||
@$(DOCKER) $(CXX) $(OBJ) $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET) | |||
@echo $(DOCKER) $(LINKER) '$$(OBJ)' $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET) | |||
@$(DOCKER) $(LINKER) $(OBJ) $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET) | |||
@echo | |||
@echo Print size information | |||
@$(CSIZE) $(@D)/$(TARGET) | |||
@@ -179,10 +180,12 @@ release: $(BUILD_DIR)/$(TARGET) | |||
# | |||
bitonic_v0: CC := nvcc | |||
bitonic_v0: CXX := nvcc | |||
bitonic_v0: CC := nvcc -x cu | |||
bitonic_v0: CXX := nvcc -x cu | |||
bitonic_v0: LINKER := nvcc | |||
bitonic_v0: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V0 | |||
bitonic_v0: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V0 | |||
bitonic_v0: OUTPUT_DIR := $(OUTPUT_DIR)/v0 | |||
bitonic_v0: TARGET := bitonic_v0 | |||
bitonic_v0: $(BUILD_DIR)/$(TARGET) | |||
@mkdir -p $(OUTPUT_DIR) | |||
@@ -191,11 +194,7 @@ bitonic_v0: $(BUILD_DIR)/$(TARGET) | |||
hpc-build: | |||
make clean | |||
make distbubbletonic | |||
make clean | |||
make distbitonic | |||
make clean | |||
make tests | |||
make bitonic_v0 | |||
all: debug bitonic_v0 | |||
@@ -0,0 +1,33 @@ | |||
Parallel & Distributed Computer Systems HW3 | |||
January, 2025 | |||
Write a program that sorts $N$ integers in ascending order, using CUDA. | |||
The program must perform the following tasks: | |||
- The user specifies a positive integer $q$.
- Start a process with an array of $N = 2^q$ random integers.
- Sort all $N$ elements in ascending order.
- Check the correctness of the final result. | |||
Your implementation should be based on the following steps: | |||
V0. A kernel where each thread only compares and exchanges. This "eliminates" the 1:n innermost loop. Easy to write, but too many function calls and global synchronizations. | |||
V1. Include the k inner loop in the kernel function. How do we handle the synchronization? Fewer calls, fewer global synchronizations. Faster than V0! | |||
V2. Modify the kernel of V1 to work with local memory instead of global. | |||
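A minimal sketch of the V1 idea, with illustrative names and assumptions (not a reference solution): the inner step loop moves into the kernel for the steps whose exchange distance fits inside one thread block, where __syncthreads() is a valid barrier; larger distances still take one kernel launch per step.
template <typename ValueT>
__global__ void bitonicStepsInBlock(ValueT* data, size_t n, size_t stage, size_t first_step) {
    // Valid only while (1u << first_step) < blockDim.x, so every partner lies in this block.
    size_t tid = threadIdx.x + blockIdx.x * static_cast<size_t>(blockDim.x);
    for (size_t step = first_step + 1; step-- > 0; ) {          // step = first_step, ..., 0
        if (tid < n) {
            size_t pid = tid ^ (size_t{1} << step);             // partner index
            if (pid < n && tid < pid) {                         // lower thread of the pair swaps
                bool keep = !(tid & (size_t{1} << stage));      // ascending region keeps small values low
                if ((keep && data[tid] > data[pid]) || (!keep && data[tid] < data[pid])) {
                    ValueT tmp = data[tid]; data[tid] = data[pid]; data[pid] = tmp;
                }
            }
        }
        __syncthreads();                                        // block-wide barrier between steps
    }
}
The host loop would launch this once per stage for the low steps and keep one launch per step, as in V0, for steps whose distance spans more than a block.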
You must deliver: | |||
- A report (about $3-4$ pages) that describes your parallel algorithm and implementation. | |||
- Your comments on the speed of your parallel program compared to the serial sort, after trying your program on aristotelis for $q = [20:27]$.
- The source code of your program uploaded online. | |||
Ethics: If you use code found on the web or generated by an LLM, you should mention your source and the changes you made. You may work in pairs; partners submit a single joint report with both names.
Deadline: 2 February, $2025$. |
@@ -0,0 +1,145 @@ | |||
/*! | |||
* \file | |||
* \brief Bitonic sort CUDA implementation header | |||
* | |||
* \author | |||
* Christos Choutouridis AEM:8997 | |||
* <cchoutou@ece.auth.gr> | |||
*/ | |||
#ifndef BITONICSORTCUDA_H_ | |||
#define BITONICSORTCUDA_H_ | |||
#include <cuda_runtime.h> | |||
#include <vector> | |||
#include <cmath> | |||
#include <cstdint> | |||
#include <utility> | |||
#include "utils.hpp" | |||
/* | |||
* Exported timers | |||
*/ | |||
extern Timing Timer_total; | |||
using threadId_t = size_t; | |||
/* | |||
* ============================== Sort utilities ============================== | |||
*/ | |||
/*! | |||
* Returns the ascending or descending configuration (up/down phase) of the thread id | |||
* depending on the current stage
* | |||
* @param tid [threadId_t] The current thread | |||
* @param stage [size_t] The current stage of the sorting network (same for each step) | |||
* @return [bool] True if we need ascending configuration, false otherwise | |||
*/ | |||
__device__ inline bool ascending(threadId_t tid, size_t stage) noexcept { | |||
return !(tid & (1 << stage)); | |||
} | |||
/*! | |||
* Returns the thread's partner for data exchange during the sorting network iterations | |||
* of Bitonic | |||
* | |||
* @param tid [threadId_t] The current thread
* @param step [size_t] The step of the sorting network
* @return [threadId_t] The thread id of the partner for data exchange
*/ | |||
__device__ inline threadId_t partner(threadId_t tid, size_t step) noexcept { | |||
return (tid ^ (1 << step)); | |||
} | |||
/*! | |||
* Predicate to check if a thread keeps the small numbers during the bitonic sort network exchange.
*
* @param tid [threadId_t] The thread for which we check
* @param partner [threadId_t] The partner of the data exchange
* @param stage [size_t] The current stage of the sorting network (same for each step)
* @return [bool] True if the thread should keep the small values, false otherwise
*/ | |||
__device__ inline bool keepSmall(threadId_t tid, threadId_t partner, size_t stage) { | |||
return ascending(tid, stage) == (tid < partner); | |||
} | |||
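/*
 * Worked example for the three helpers above (stage = 2, step = 1, tid = 5 = 0b101):
 *   ascending(5, 2)    = !(0b101 & 0b100)   = false  -> descending region
 *   partner(5, 1)      =   0b101 ^ 0b010    = 7
 *   keepSmall(5, 7, 2) = (false == (5 < 7)) = false  -> thread 5 keeps the large value
 */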
/* | |||
* ============================== Sort algorithms ============================== | |||
*/ | |||
template <typename ValueT> | |||
__device__ void cudaExchange(ValueT* data, int tid, int partner, bool keepSmall) { | |||
if (( keepSmall && (data[tid] > data[partner])) || | |||
(!keepSmall && (data[tid] < data[partner])) ) { | |||
ValueT temp = data[tid]; | |||
data[tid] = data[partner]; | |||
data[partner] = temp; | |||
} | |||
} | |||
template <typename ValueT> | |||
__global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) { | |||
threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x; // Compute global thread ID | |||
if (tid < n) { | |||
threadId_t pid = partner(tid, step); | |||
// Only the lower-id thread of each pair performs the exchange; otherwise both
// threads of a pair write the same two elements concurrently (a data race).
if (pid < n && tid < pid) {
bool keep = keepSmall(tid, pid, stage);
cudaExchange(data, tid, pid, keep);
}
} | |||
} | |||
/*!
* A CUDA version of the Bitonic sort algorithm (here V0: one kernel launch per step).
*
* @note
* The input size must be a power of two.
*
* @tparam DataT A contiguous host buffer type with random access iterators (e.g. std::vector)
*
* @param data [DataT] The host data to sort
*/
template <typename DataT> | |||
void bitonicSort(DataT& data) { | |||
using value_t = typename DataT::value_type; | |||
value_t* dev_data; | |||
auto size = data.size(); | |||
cudaMalloc(&dev_data, size * sizeof(value_t)); | |||
cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice); | |||
int Nthreads = 1024; | |||
int Nblocks = (size + Nthreads - 1) / Nthreads; | |||
size_t max_depth = static_cast<size_t>(log2(size)); | |||
for (size_t stage = 1; stage <= max_depth; ++stage) { | |||
for (size_t step = stage; step > 0; ) { | |||
--step; | |||
bitonicStep<<<Nblocks, Nthreads>>>(dev_data, size, step, stage); | |||
cudaDeviceSynchronize(); | |||
} | |||
} | |||
cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost); | |||
cudaFree(dev_data); | |||
} | |||
#endif //BITONICSORTCUDA_H_ |
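For reference, a minimal host-side driver sketch for the header above, assuming it is saved as bitonicsort.hpp, that the project's utils.hpp is on the include path, and that it is built with nvcc as CUDA (e.g. nvcc -x cu -std=c++17); the file name and the timing approach are illustrative, not taken from the patch.
// Hypothetical driver: fill a vector with random integers, sort a copy on the GPU with
// bitonicSort() from the header above, time it against std::sort, and verify the result.
#include <vector>
#include <random>
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <iostream>
#include "bitonicsort.hpp"      // assumed file name of the header shown above

int main() {
    const size_t q = 20;                          // N = 2^q, power of two as bitonic sort requires
    std::vector<uint32_t> data(size_t{1} << q);

    std::mt19937 gen(std::random_device{}());
    std::uniform_int_distribution<uint32_t> dis;
    std::generate(data.begin(), data.end(), [&] { return dis(gen); });
    std::vector<uint32_t> reference = data;       // copy for the serial baseline

    auto t0 = std::chrono::high_resolution_clock::now();
    bitonicSort(data);                            // copies to device, runs the network, copies back
    auto t1 = std::chrono::high_resolution_clock::now();
    std::sort(reference.begin(), reference.end());
    auto t2 = std::chrono::high_resolution_clock::now();

    using ms = std::chrono::duration<double, std::milli>;
    std::cout << "GPU bitonic: " << ms(t1 - t0).count() << " ms, "
              << "std::sort: "  << ms(t2 - t1).count() << " ms, "
              << (data == reference ? "match" : "MISMATCH") << std::endl;
    return 0;
}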
@@ -35,7 +35,7 @@ static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16; | |||
/*! | |||
* Value type selection | |||
* Value and Buffer type selection | |||
* | |||
* We support the following built-in types, or the <cstdint> aliases that translate to them:
* char - unsigned char | |||
@@ -46,7 +46,8 @@ static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16; | |||
* float | |||
* double | |||
*/ | |||
using distValue_t = uint32_t; | |||
using Value_t = uint32_t; | |||
using Data_t = std::vector<Value_t>; | |||
/*! | |||
* Session option for each invocation of the executable. | |||
@@ -1,51 +0,0 @@ | |||
/*! | |||
* \file | |||
* \brief Distributed sort implementation | |||
* | |||
* \author | |||
* Christos Choutouridis AEM:8997 | |||
* <cchoutou@ece.auth.gr> | |||
*/ | |||
#include "utils.hpp" | |||
#include "distsort.hpp" | |||
/*! | |||
* Returns the ascending or descending configuration of the node's sequence based on | |||
* the current node (MPI process) and the depth of the sorting network | |||
* | |||
* @param node [mpi_id_t] The current node (MPI process) | |||
* @param depth [size_t] The total depth of the sorting network (same for each step for a given network) | |||
* @return [bool] True if we need ascending configuration, false otherwise | |||
*/ | |||
bool ascending(mpi_id_t node, size_t depth) noexcept { | |||
return !(node & (1 << depth)); | |||
} | |||
/*! | |||
* Returns the node's partner for data exchange during the sorting network iterations | |||
* of Bitonic | |||
* | |||
* @param node [mpi_id_t] The current node | |||
* @param step [size_t] The step of the sorting network | |||
* @return [mpi_id_t] The node id of the partner for data exchange | |||
*/ | |||
mpi_id_t partner(mpi_id_t node, size_t step) noexcept { | |||
return (node ^ (1 << step)); | |||
} | |||
/*! | |||
* Predicate to check if a node keeps the small numbers during the bitonic sort network exchange. | |||
* | |||
* @param node [mpi_id_t] The node for which we check | |||
* @param partner [mpi_id_t] The partner of the data exchange | |||
* @param depth [size_t] The total depth of the sorting network (same for each step for a given network) | |||
* @return [bool] True if the node should keep the small values, false otherwise | |||
*/ | |||
bool keepSmall(mpi_id_t node, mpi_id_t partner, size_t depth) { | |||
if (node == partner) | |||
throw std::runtime_error("(keepSmall) Node and Partner can not be the same\n"); | |||
return ascending(node, depth) == (node < partner); | |||
} |
@@ -1,223 +0,0 @@ | |||
/*! | |||
* \file | |||
* \brief Distributed sort implementation header | |||
* | |||
* \author | |||
* Christos Choutouridis AEM:8997 | |||
* <cchoutou@ece.auth.gr> | |||
*/ | |||
#ifndef DISTBITONIC_H_ | |||
#define DISTBITONIC_H_ | |||
#include <vector> | |||
#include <algorithm> | |||
#include <parallel/algorithm> | |||
#include <cmath> | |||
#include <cstdint> | |||
#if !defined DEBUG | |||
#define NDEBUG | |||
#endif | |||
#include <cassert> | |||
#include "utils.hpp" | |||
/* | |||
* Exported timers | |||
*/ | |||
extern Timing Timer_total; | |||
extern Timing Timer_fullSort; | |||
extern Timing Timer_exchange; | |||
extern Timing Timer_minmax; | |||
extern Timing Timer_elbowSort; | |||
/* | |||
* ============================== Sort utilities ============================== | |||
*/ | |||
/*! | |||
* Returns the ascending or descending configuration of the node's sequence based on | |||
* the current node (MPI process) and the depth of the sorting network | |||
* | |||
* @param node [mpi_id_t] The current node (MPI process) | |||
* @param depth [size_t] The total depth of the sorting network (same for each step for a given network) | |||
* @return [bool] True if we need ascending configuration, false otherwise | |||
*/ | |||
bool ascending(mpi_id_t node, size_t depth); | |||
/*! | |||
* Returns the node's partner for data exchange during the sorting network iterations | |||
* of Bitonic | |||
* | |||
* @param node [mpi_id_t] The current node | |||
* @param step [size_t] The step of the sorting network | |||
* @return [mpi_id_t] The node id of the partner for data exchange | |||
*/ | |||
mpi_id_t partner(mpi_id_t node, size_t step); | |||
/*! | |||
* Predicate to check if a node keeps the small numbers during the bitonic sort network exchange. | |||
* | |||
* @param node [mpi_id_t] The node for which we check | |||
* @param partner [mpi_id_t] The partner of the data exchange | |||
* @param depth [size_t] The total depth of the sorting network (same for each step for a given network) | |||
* @return [bool] True if the node should keep the small values, false otherwise | |||
*/ | |||
bool keepSmall(mpi_id_t node, mpi_id_t partner, size_t depth); | |||
/* | |||
* ============================== Data utilities ============================== | |||
*/ | |||
/*! | |||
* Sort a range using the build-in O(Nlog(N)) algorithm | |||
* | |||
* @tparam RangeT A range type with random access iterator | |||
* | |||
* @param data [RangeT] The data to be sorted | |||
* @param ascending [bool] Flag to indicate the sorting order | |||
*/ | |||
template<typename RangeT> | |||
void fullSort(RangeT& data, bool ascending) noexcept { | |||
// Use introsort from stdlib++ here, unless ... __gnu_parallel | |||
if (ascending) { | |||
__gnu_parallel::sort(data.begin(), data.end(), std::less<>()); | |||
} | |||
else { | |||
__gnu_parallel::sort(data.begin(), data.end(), std::greater<>()); | |||
} | |||
} | |||
/*! | |||
* Core functionality of sort for shadowed buffer types using | |||
* the "elbow sort" algorithm. | |||
* | |||
* @note: | |||
* This algorithm can not work "in place". | |||
* We use the active buffer as source and the shadow as target. | |||
* At the end we switch which buffer is active and which is the shadow. | |||
* @note | |||
* This is the core functionality. Use the elbowSort() function instead | |||
* | |||
* @tparam ShadowedDataT A Shadowed buffer type with random access iterator. | |||
* @tparam CompT A Comparison type for binary operation comparisons | |||
* | |||
* @param data [ShadowedDataT] The data to sort | |||
* @param ascending [bool] Flag to indicate the sorting order | |||
* @param comp [CompT] The binary operator object | |||
*/ | |||
template<typename ShadowedDataT, typename CompT> | |||
void elbowSortCore(ShadowedDataT& data, bool ascending, CompT comp) noexcept { | |||
auto& active = data.getActive(); // Get the source vector (the data to sort) | |||
auto& shadow = data.getShadow(); // Get the target vector (the sorted data) | |||
size_t N = data.size(); // The total size is the same or both vectors | |||
size_t left = std::distance( | |||
active.begin(), | |||
(ascending) ? | |||
std::min_element(active.begin(), active.end()) : | |||
std::max_element(active.begin(), active.end()) | |||
); // start 'left' from elbow of the bitonic | |||
size_t right = (left == N-1) ? 0 : left + 1; | |||
// Walk in opposite directions from elbow and insert-sort to target vector | |||
for (size_t i = 0 ; i<N ; ++i) { | |||
if (comp(active[left], active[right])) { | |||
shadow[i] = active[left]; | |||
left = (left == 0) ? N-1 : left -1; // cycle decrease | |||
} | |||
else { | |||
shadow[i] = active[right]; | |||
right = (right + 1) % N; // cycle increase | |||
} | |||
} | |||
data.switch_active(); // Switch active-shadow buffers | |||
} | |||
/*! | |||
* Sort a shadowed buffer using the "elbow sort" algorithm. | |||
* | |||
* @tparam ShadowedDataT A Shadowed buffer type with random access iterator. | |||
* | |||
* @param data [ShadowedDataT] The data to sort | |||
* @param ascending [bool] Flag to indicate the sorting order | |||
*/ | |||
template<typename ShadowedDataT> | |||
void elbowSort(ShadowedDataT& data, bool ascending) noexcept { | |||
if (ascending) | |||
elbowSortCore(data, ascending, std::less<>()); | |||
else | |||
elbowSortCore(data, ascending, std::greater<>()); | |||
} | |||
/*! | |||
* Takes two sequences and selects either the larger or the smaller items | |||
* in one-to-one comparison between them. If the initial sequences are bitonic, then | |||
* the result is a bitonic sequence too! | |||
* | |||
* @tparam ValueT The underlying type of the sequences | |||
* | |||
* @param local [ValueT*] Pointer to the local sequence | |||
* @param remote [const ValueT*] Pointer to the remote sequence (copied locally by MPI) | |||
* @param count [size_t] The number of items to process | |||
* @param keepSmall [bool] Flag to indicate if we keep the small items in local sequence | |||
*/ | |||
template<typename ValueT> | |||
void keepMinOrMax(ValueT* local, const ValueT* remote, size_t count, bool keepSmall) noexcept { | |||
std::transform( | |||
local, local + count, | |||
remote, | |||
local, | |||
[&keepSmall](const ValueT& a, const ValueT& b){ | |||
return (keepSmall) ? std::min(a, b) : std::max(a, b); | |||
}); | |||
} | |||
/* | |||
* ============================== Sort algorithms ============================== | |||
*/ | |||
/*! | |||
* A distributed version of the Bitonic sort algorithm. | |||
* | |||
* @note | |||
* Each MPI process should run an instance of this function. | |||
* | |||
* @tparam ShadowedDataT A Shadowed buffer type with random access iterator. | |||
* | |||
* @param data [ShadowedDataT] The local to MPI process data to sort | |||
* @param Processes [mpi_id_t] The total number of MPI processes | |||
* @param rank [mpi_id_t] The current process id | |||
*/ | |||
template<typename ShadowedDataT> | |||
void distBitonic(ShadowedDataT& data) { | |||
// Initially sort to create a half part of a bitonic sequence | |||
timeCall(Timer_fullSort, fullSort, data, ascending(rank, 0)); | |||
// Run through sort network using elbow-sort ( O(LogN * LogN) iterations ) | |||
auto p = static_cast<uint32_t>(std::log2(Processes)); | |||
for (size_t depth = 1; depth <= p; ++depth) { | |||
for (size_t step = depth; step > 0;) { | |||
--step; | |||
// Find out exchange configuration | |||
auto part = partner(rank, step); | |||
auto ks = keepSmall(rank, part, depth); | |||
// Exchange with partner, keep nim-or-max | |||
exchange(data, part, ks, tag); | |||
} | |||
// sort - O(N) | |||
timeCall(Timer_elbowSort, elbowSort, data, ascending(rank, depth)); | |||
} | |||
} | |||
#endif //DISTBITONIC_H_ |
@@ -14,34 +14,26 @@ | |||
#include "utils.hpp" | |||
#include "config.h" | |||
#include "distsort.hpp" | |||
#include "bitonicsort.hpp" | |||
// Global session data | |||
Data_t Data; | |||
config_t config; | |||
distBuffer_t Data; | |||
Log logger; | |||
// Mersenne seeded from hw if possible. range: [type_min, type_max] | |||
std::random_device rd; | |||
std::mt19937 gen(rd()); | |||
//! Performance timers for each one of the "costly" functions | |||
Timing Timer_total; | |||
Timing Timer_fullSort; | |||
Timing Timer_exchange; | |||
Timing Timer_minmax; | |||
Timing Timer_elbowSort; | |||
//! Init timing objects for extra rounds | |||
void measurements_init() { | |||
if (config.perf > 1) { | |||
Timer_total.init(config.perf); | |||
Timer_fullSort.init(config.perf); | |||
Timer_exchange.init(config.perf); | |||
Timer_minmax.init(config.perf); | |||
Timer_elbowSort.init(config.perf); | |||
} | |||
} | |||
@@ -49,10 +41,6 @@ void measurements_init() { | |||
void measurements_next() { | |||
if (config.perf > 1) { | |||
Timer_total.next(); | |||
Timer_fullSort.next(); | |||
Timer_exchange.next(); | |||
Timer_minmax.next(); | |||
Timer_elbowSort.next(); | |||
} | |||
} | |||
@@ -136,20 +124,14 @@ bool get_options(int argc, char* argv[]){ | |||
/*! | |||
* A simple validator for the entire distributed process | |||
* | |||
* @tparam ShadowedDataT A Shadowed buffer type with random access iterator. | |||
* @tparam DataT A buffer type with random access iterator. | |||
* | |||
* @param data [ShadowedDataT] The local to MPI process | |||
* @param Processes [mpi_id_t] The total number of MPI processes | |||
* @param rank [mpi_id_t] The current process id | |||
* | |||
* @return [bool] True if all are sorted and in total ascending order | |||
* @param data [DataT] The data | |||
* @return [bool] True if sorted in ascending order | |||
*/ | |||
template<typename ShadowedDataT> | |||
bool validator(ShadowedDataT& data) { | |||
using value_t = typename ShadowedDataT::value_type; | |||
bool ret = true; // Have faith! | |||
return ret; | |||
template<typename DataT> | |||
bool validator(DataT& data) { | |||
return std::is_sorted(data.begin(), data.end()); | |||
} | |||
/*! | |||
@@ -180,15 +162,15 @@ int main(int argc, char* argv[]) try { | |||
for (size_t it = 0 ; it < config.perf ; ++it) { | |||
// Initialize local data | |||
logger << "Initialize local array of " << config.arraySize << " elements" << logger.endl; | |||
std::uniform_int_distribution<distValue_t > dis( | |||
std::numeric_limits<distValue_t>::min(), | |||
std::numeric_limits<distValue_t>::max() | |||
std::uniform_int_distribution<Value_t > dis( | |||
std::numeric_limits<Value_t>::min(), | |||
std::numeric_limits<Value_t>::max() | |||
); | |||
std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); }); | |||
// Run distributed sort | |||
logger << "Starting distributed sorting ... "; | |||
Timer_total.start(); | |||
distBitonic(Data); | |||
bitonicSort(Data); | |||
Timer_total.stop(); | |||
measurements_next(); | |||
logger << " Done." << logger.endl; | |||
@@ -196,11 +178,7 @@ int main(int argc, char* argv[]) try { | |||
// Print-outs and validation | |||
if (config.perf > 1) { | |||
Timing::print_duration(Timer_total.median(), "Total ", 0); | |||
Timing::print_duration(Timer_fullSort.median(), "Full-Sort ", 0); | |||
Timing::print_duration(Timer_exchange.median(), "Exchange ", 0); | |||
Timing::print_duration(Timer_minmax.median(), "Min-Max ", 0); | |||
Timing::print_duration(Timer_elbowSort.median(),"Elbow-Sort", 0); | |||
Timing::print_duration(Timer_total.median(), "Total"); | |||
} | |||
if (config.validation) { | |||
// If requested, we have the chance to fail! | |||
@@ -18,124 +18,6 @@ | |||
#include "config.h" | |||
/*! | |||
* @brief A std::vector wrapper with 2 vectors, an active and a shadow. | |||
* | |||
* This type exposes the standard vector functionality of the active vector. | |||
* The shadow can be used when we need to use the vector as mutable | |||
* data in algorithms that can not support "in-place" editing (like elbow-sort for example) | |||
* | |||
* @tparam Value_t the underlying data type of the vectors | |||
*/ | |||
template <typename Value_t> | |||
struct ShadowedVec_t { | |||
// STL requirements | |||
using value_type = Value_t; | |||
using iterator = typename std::vector<Value_t>::iterator; | |||
using const_iterator = typename std::vector<Value_t>::const_iterator; | |||
using size_type = typename std::vector<Value_t>::size_type; | |||
// Default constructor | |||
ShadowedVec_t() = default; | |||
// Constructor from an std::vector | |||
explicit ShadowedVec_t(const std::vector<Value_t>& vec) | |||
: North(vec), South(), active(north) { | |||
South.resize(North.size()); | |||
} | |||
explicit ShadowedVec_t(std::vector<Value_t>&& vec) | |||
: North(std::move(vec)), South(), active(north) { | |||
South.resize(North.size()); | |||
} | |||
// Copy assignment operator | |||
ShadowedVec_t& operator=(const ShadowedVec_t& other) { | |||
if (this != &other) { // Avoid self-assignment | |||
North = other.North; | |||
South = other.South; | |||
active = other.active; | |||
} | |||
return *this; | |||
} | |||
// Move assignment operator | |||
ShadowedVec_t& operator=(ShadowedVec_t&& other) noexcept { | |||
if (this != &other) { // Avoid self-assignment | |||
North = std::move(other.North); | |||
South = std::move(other.South); | |||
active = other.active; | |||
// There is no need to zero out other since it is valid but in a non-defined state | |||
} | |||
return *this; | |||
} | |||
// Type accessors | |||
std::vector<Value_t>& getActive() { return (active == north) ? North : South; } | |||
std::vector<Value_t>& getShadow() { return (active == north) ? South : North; } | |||
const std::vector<Value_t>& getActive() const { return (active == north) ? North : South; } | |||
const std::vector<Value_t>& getShadow() const { return (active == north) ? South : North; } | |||
// Swap vectors | |||
void switch_active() { active = (active == north) ? south : north; } | |||
// Dispatch vector functionality to active vector | |||
Value_t& operator[](size_type index) { return getActive()[index]; } | |||
const Value_t& operator[](size_type index) const { return getActive()[index]; } | |||
Value_t& at(size_type index) { return getActive().at(index); } | |||
const Value_t& at(size_type index) const { return getActive().at(index); } | |||
void push_back(const Value_t& value) { getActive().push_back(value); } | |||
void push_back(Value_t&& value) { getActive().push_back(std::move(value)); } | |||
void pop_back() { getActive().pop_back(); } | |||
Value_t& front() { return getActive().front(); } | |||
Value_t& back() { return getActive().back(); } | |||
const Value_t& front() const { return getActive().front(); } | |||
const Value_t& back() const { return getActive().back(); } | |||
iterator begin() { return getActive().begin(); } | |||
const_iterator begin() const { return getActive().begin(); } | |||
iterator end() { return getActive().end(); } | |||
const_iterator end() const { return getActive().end(); } | |||
size_type size() const { return getActive().size(); } | |||
void resize(size_t new_size) { | |||
North.resize(new_size); | |||
South.resize(new_size); | |||
} | |||
void reserve(size_t new_capacity) { | |||
North.reserve(new_capacity); | |||
South.reserve(new_capacity); | |||
} | |||
[[nodiscard]] size_t capacity() const { return getActive().capacity(); } | |||
[[nodiscard]] bool empty() const { return getActive().empty(); } | |||
void clear() { getActive().clear(); } | |||
void swap(std::vector<Value_t>& other) { getActive().swap(other); } | |||
// Comparisons | |||
bool operator== (const ShadowedVec_t& other) { return getActive() == other.getActive(); } | |||
bool operator!= (const ShadowedVec_t& other) { return getActive() != other.getActive(); } | |||
bool operator== (const std::vector<value_type>& other) { return getActive() == other; } | |||
bool operator!= (const std::vector<value_type>& other) { return getActive() != other; } | |||
private: | |||
std::vector<Value_t> North{}; //!< Actual buffer to be used either as active or shadow | |||
std::vector<Value_t> South{}; //!< Actual buffer to be used either as active or shadow | |||
enum { | |||
north, south | |||
} active{north}; //!< Flag to select between North and South buffer | |||
}; | |||
/* | |||
* Exported data types | |||
*/ | |||
using distBuffer_t = ShadowedVec_t<distValue_t>; | |||
extern distBuffer_t Data; | |||
/*! | |||
* A Logger for the entire program.
*/ | |||
@@ -25,8 +25,7 @@ protected: | |||
/* | |||
* MPI: SysTest (acceptance) | |||
* Each process executes distBubbletonic for uint8_t [16]
* | |||
*/ | |||
TEST_F(TCUDAbitonic, test1) { | |||