diff --git a/homework_3/Makefile b/homework_3/Makefile
index 4856d50..3773f8f 100644
--- a/homework_3/Makefile
+++ b/homework_3/Makefile
@@ -45,9 +45,9 @@ OUTPUT_DIR := out
 
 # ========== Compiler settings ==========
 # Compiler flags for debug and release
-DEB_CFLAGS := -DDEBUG -g3 -std=c11 -Xcompiler "-Wall -Wextra"
+DEB_CFLAGS := -DDEBUG -std=c11 -Xcompiler "-Wall -Wextra -g3 -DDEBUG"
 REL_CFLAGS := -O3 -std=c11 -Xcompiler "-Wall -Wextra"
-DEB_CXXFLAGS := -DDEBUG -g3 -std=c++17 -Xcompiler "-Wall -Wextra"
+DEB_CXXFLAGS := -DDEBUG -std=c++17 -Xcompiler "-Wall -Wextra -g3 -DDEBUG"
 REL_CXXFLAGS := -O3 -std=c++17 -Xcompiler "-Wall -Wextra"
 
 # Pre-defines
@@ -186,15 +186,40 @@ bitonic_v0: LINKER := nvcc
 bitonic_v0: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V0
 bitonic_v0: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V0
 bitonic_v0: OUTPUT_DIR := $(OUTPUT_DIR)/v0
-bitonic_v0: TARGET := bitonic_v0
 bitonic_v0: $(BUILD_DIR)/$(TARGET)
 	@mkdir -p $(OUTPUT_DIR)
 	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
 
+bitonic_v1: CC := nvcc -x cu
+bitonic_v1: CXX := nvcc -x cu
+bitonic_v1: LINKER := nvcc
+bitonic_v1: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V1
+bitonic_v1: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V1
+bitonic_v1: OUTPUT_DIR := $(OUTPUT_DIR)/v1
+bitonic_v1: $(BUILD_DIR)/$(TARGET)
+	@mkdir -p $(OUTPUT_DIR)
+	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
+
+
+bitonic_v2: CC := nvcc -G -g -x cu
+bitonic_v2: CXX := nvcc -G -g -x cu
+bitonic_v2: LINKER := nvcc
+bitonic_v2: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V2
+bitonic_v2: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V2
+bitonic_v2: OUTPUT_DIR := $(OUTPUT_DIR)/v2
+bitonic_v2: $(BUILD_DIR)/$(TARGET)
+	@mkdir -p $(OUTPUT_DIR)
+	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
+
+
 hpc-build:
 	make clean
 	make bitonic_v0
+	make clean
+	make bitonic_v1
+	make clean
+	make bitonic_v2
 
 all: debug bitonic_v0
 
diff --git a/homework_3/src/bitonicsort.hpp b/homework_3/src/bitonicsort.hpp
index 100cb2d..447e82c 100644
--- a/homework_3/src/bitonicsort.hpp
+++ b/homework_3/src/bitonicsort.hpp
@@ -76,7 +76,7 @@ __device__ inline bool keepSmall(threadId_t tid, threadId_t partner, size_t stag
 
 
 template <typename ValueT>
-__device__ void cudaExchange(ValueT* data, int tid, int partner, bool keepSmall) {
+__device__ void exchange(ValueT* data, int tid, int partner, bool keepSmall) {
   if (( keepSmall && (data[tid] > data[partner])) ||
       (!keepSmall && (data[tid] < data[partner])) ) {
     ValueT temp = data[tid];
@@ -85,16 +85,18 @@ __device__ void cudaExchange(ValueT* data, int tid, int partner, bool keepSmall)
   }
 }
 
-
+#if CODE_VERSION == V0
 template <typename ValueT>
 __global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) {
   threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;  // Compute global thread ID
-  if (tid < n) {
-    threadId_t pid = partner(tid, step);
-    if (pid < n) {
-      bool keep = keepSmall(tid, pid, stage);
-      cudaExchange(data, tid, pid, keep);
-    }
+  threadId_t pid = partner(tid, step);
+  if (tid > pid) {
+    tid += n >> 1;
+    pid += n >> 1;
+  }
+  if ((tid < n) && (pid < n)) {   // Boundary check
+    bool keep = keepSmall(tid, pid, stage);
+    exchange(data, tid, pid, keep);
   }
 }
 
@@ -122,14 +124,14 @@ void bitonicSort(DataT& data) {
   cudaMalloc(&dev_data, size * sizeof(value_t));
   cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);
 
-  int Nthreads = 1024;
-  int Nblocks = (size + Nthreads - 1) / Nthreads;
+  int Nthreads = THREADS_PER_BLOCK;
+  int HalfNblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
 
-  size_t max_depth = static_cast<size_t>(log2(size));
-  for (size_t stage = 1; stage <= max_depth; ++stage) {
+  size_t Stages = static_cast<size_t>(log2(size));
+  for (size_t stage = 1; stage <= Stages; ++stage) {
     for (size_t step = stage; step > 0; ) {
       --step;
-      bitonicStep<<<Nblocks, Nthreads>>>(dev_data, size, step, stage);
+      bitonicStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
       cudaDeviceSynchronize();
     }
   }
@@ -138,8 +140,182 @@ void bitonicSort(DataT& data) {
   cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost);
   cudaFree(dev_data);
 }
+#elif CODE_VERSION == V1
+
+template <typename ValueT>
+__device__ void interBlockStep_(ValueT* data, size_t n, size_t step, size_t stage) {
+  threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;  // Compute global thread ID
+  threadId_t pid = partner(tid, step);
+  if (tid > pid) {
+    tid += n >> 1;
+    pid += n >> 1;
+  }
+  if ((tid < n) && (pid < n)) {   // Boundary check
+    bool keep = keepSmall(tid, pid, stage);
+    exchange(data, tid, pid, keep);
+  }
+}
+
+template <typename ValueT>
+__global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
+  interBlockStep_(data, n, step, stage);
+}
+
+
+template <typename ValueT>
+__global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
+  for (size_t step = innerSteps + 1; step > 0; ) {
+    --step;
+    interBlockStep_(data, n, step, stage);
+    __syncthreads();
+  }
+}
+
+/*!
+ * A CUDA version of the Bitonic sort algorithm (V1).
+ *
+ * @note
+ * Each stage is split into inter-block steps, launched as one kernel per
+ * step, followed by a single in-block kernel that loops over the remaining
+ * steps and synchronizes with __syncthreads() between them.
+ *
+ * @tparam DataT A buffer type with a random access iterator.
+ *
+ * @param data [DataT] The data to sort in place
+ */
+
+template <typename DataT>
+void bitonicSort(DataT& data) {
+  using value_t = typename DataT::value_type;
+
+  value_t* dev_data;
+  auto size = data.size();
+
+  cudaMalloc(&dev_data, size * sizeof(value_t));
+  cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);
+
+  int Nthreads = THREADS_PER_BLOCK;
+  int HalfNblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
+
+  auto Stages = static_cast<size_t>(log2(size));
+  auto InnerBlockSteps = static_cast<size_t>(log2(IN_BLOCK_THRESHOLD));
+  for (size_t stage = 1; stage <= Stages; ++stage) {
+    size_t step = stage - 1;
+    for ( ; step > InnerBlockSteps; --step) {
+      interBlockStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
+      cudaDeviceSynchronize();
+    }
+    inBlockStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
+    cudaDeviceSynchronize();
+  }
+
+  cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost);
+  cudaFree(dev_data);
+}
+#elif CODE_VERSION == V2
+
+template <typename ValueT>
+__global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
+  threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;  // Compute global thread ID
+  threadId_t pid = partner(tid, step);
+  if (tid > pid) {
+    tid += n >> 1;
+    pid += n >> 1;
+  }
+  if ((tid < n) && (pid < n)) {   // Boundary check
+    bool keep = keepSmall(tid, pid, stage);
+    exchange(data, tid, pid, keep);
+  }
+}
+
+
+template <typename ValueT>
+__global__ void inBlockStep(ValueT* data, size_t n, size_t nthreads, size_t innerSteps, size_t stage) {
+  extern __shared__ ValueT shared_data[];
+  ValueT* lowerHalf = (ValueT*) shared_data;
+  ValueT* upperHalf = (ValueT*) &shared_data[nthreads];
+
+  // Global memory thread and partner ids
+  threadId_t thread_id = threadIdx.x + blockIdx.x * blockDim.x;
+
+  // Shared memory thread and partner ids
+  threadId_t local_tid = threadIdx.x;
+
+  if ( thread_id < (n >> 1) ) {
+    // Fetch both halves into shared memory
+    lowerHalf[local_tid] = data[thread_id];
+    upperHalf[local_tid] = data[thread_id + (n >> 1)];
+    __syncthreads();
+
+    for (size_t step = innerSteps + 1; step > 0; ) {
+      --step;
+      // Find partner and localize it
+      threadId_t partner_id = partner(thread_id, step);
+      threadId_t local_pid = partner_id % nthreads;
+
+      if (local_tid < local_pid) {
+        // exchange on the low-side buffer (half of the threads)
+        bool keep = keepSmall(thread_id, partner_id, stage);
+        exchange(lowerHalf, local_tid, local_pid, keep);
+      }
+      else {
+        // exchange on the high-side buffer (other half of the threads)
+        bool keep = keepSmall(thread_id, partner_id, stage);
+        exchange(upperHalf, local_tid, local_pid, keep);
+      }
+      __syncthreads();
+    }
+    // Write back to global memory
+    data[thread_id] = lowerHalf[local_tid];
+    data[thread_id + (n >> 1)] = upperHalf[local_tid];
+    __syncthreads();
+  }
+}
+
+/*!
+ * A CUDA version of the Bitonic sort algorithm (V2).
+ *
+ * @note
+ * Like V1, but the in-block steps work in shared memory: each block stages
+ * the lower and upper half of its slice in shared buffers, exchanges there,
+ * and writes the result back to global memory.
+ *
+ * @tparam DataT A buffer type with a random access iterator.
+ *
+ * @param data [DataT] The data to sort in place
+ */
+
+template <typename DataT>
+void bitonicSort(DataT& data) {
+  using value_t = typename DataT::value_type;
+
+  value_t* dev_data;
+  auto size = data.size();
+
+  cudaMalloc(&dev_data, size * sizeof(value_t));
+  cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);
+
+  int Nthreads = THREADS_PER_BLOCK;
+  int Nblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
+
+  auto Stages = static_cast<size_t>(log2(size));
+  auto InnerBlockSteps = static_cast<size_t>(log2(IN_BLOCK_THRESHOLD));
+  for (size_t stage = 1; stage <= Stages; ++stage) {
+    size_t step = stage - 1;
+    for ( ; step > InnerBlockSteps; --step) {
+      interBlockStep<<<Nblocks, Nthreads>>>(dev_data, size, step, stage);
+      cudaDeviceSynchronize();
+    }
+    inBlockStep<<<Nblocks, Nthreads, 2 * Nthreads * sizeof(value_t)>>>(dev_data, size, Nthreads, step, stage);
+    cudaDeviceSynchronize();
+  }
+
+  cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost);
+  cudaFree(dev_data);
+}
 
 
 
+#endif
 #endif //BITONICSORTCUDA_H_
 
diff --git a/homework_3/src/config.h b/homework_3/src/config.h
index 3ff1235..2cd4b49 100644
--- a/homework_3/src/config.h
+++ b/homework_3/src/config.h
@@ -27,12 +27,14 @@ static constexpr char version[] = "0.0";
 
 // Fail-safe version selection
 #if !defined CODE_VERSION
-#define CODE_VERSION V0
+#define CODE_VERSION V2
 #endif
 
 // Default Data size (in case -q is not present)
 static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16;
 
+static constexpr size_t THREADS_PER_BLOCK = 8;
+static constexpr size_t IN_BLOCK_THRESHOLD = 4;
 
 /*!
  * Value and Buffer type selection
  *
@@ -46,7 +48,7 @@ static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16;
  * float
  * double
  */
-using Value_t = uint32_t;
+using Value_t = uint8_t;
 using Data_t = std::vector<Value_t>;
 
 /*!
diff --git a/homework_3/src/main.cpp b/homework_3/src/main.cpp
index 64cba4e..bd533c5 100644
--- a/homework_3/src/main.cpp
+++ b/homework_3/src/main.cpp
@@ -185,6 +185,13 @@ int main(int argc, char* argv[]) try {
     std::cout << "[Validation] Results validation ...";
     bool val = validator(Data);
     std::cout << ((val) ? "\x1B[32m [PASSED] \x1B[0m\n" : " \x1B[32m [FAILED] \x1B[0m\n");
+    if (Data.size() < 128) {
+      std::cout << "Data: ";
+      for (auto& v : Data) {
+        std::cout << (int)v << ", ";
+      }
+      std::cout << '\n';
+    }
   }
   return 0;
 }
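
For reference, a minimal host-side sketch (not part of the patch) of the launch geometry the V1/V2 drivers above compute, assuming the values this diff sets in config.h (THREADS_PER_BLOCK = 8, IN_BLOCK_THRESHOLD = 4, Value_t = uint8_t, DEFAULT_DATA_SIZE = 1 << 16). Half of the usual ceil(size / Nthreads) blocks are launched because each thread services a pair of elements, and the shared-memory variant needs room for both the lowerHalf and upperHalf staging buffers:

// Illustration only: launch geometry for the V1/V2 drivers in this diff.
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  using Value_t = std::uint8_t;                                        // as set in config.h by this patch
  const std::size_t size = 1 << 16;                                    // DEFAULT_DATA_SIZE
  const std::size_t Nthreads = 8;                                      // THREADS_PER_BLOCK
  const std::size_t HalfNblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
  const auto Stages = static_cast<std::size_t>(std::log2(size));       // 16 stages for 2^16 elements
  const auto InnerBlockSteps = static_cast<std::size_t>(std::log2(4)); // IN_BLOCK_THRESHOLD
  const std::size_t sharedBytes = 2 * Nthreads * sizeof(Value_t);      // lowerHalf + upperHalf (V2 only)

  std::printf("grid=%zu block=%zu stages=%zu in-block steps<=%zu shared=%zu bytes\n",
              HalfNblocks, Nthreads, Stages, InnerBlockSteps, sharedBytes);

  // The kernels from bitonicsort.hpp would then be launched along the lines of:
  //   interBlockStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
  //   inBlockStep<<<HalfNblocks, Nthreads, sharedBytes>>>(dev_data, size, Nthreads, step, stage);
  return 0;
}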