From b31ca23757f0603fca4677aaa42e42077356efa1 Mon Sep 17 00:00:00 2001 From: Christos Choutouridis Date: Fri, 14 Feb 2025 17:56:56 +0200 Subject: [PATCH] HW3: [WIP] Test HPC build --- homework_3/Makefile | 12 +++---- homework_3/src/bitonicsort.hpp | 64 ++++++++++++++++------------------ homework_3/src/config.h | 4 +-- homework_3/src/main.cpp | 2 +- 4 files changed, 40 insertions(+), 42 deletions(-) diff --git a/homework_3/Makefile b/homework_3/Makefile index 3773f8f..dae2b51 100644 --- a/homework_3/Makefile +++ b/homework_3/Makefile @@ -45,9 +45,9 @@ OUTPUT_DIR := out # ========== Compiler settings ========== # Compiler flags for debug and release -DEB_CFLAGS := -DDEBUG -std=c11 -Xcompiler "-Wall -Wextra -g3 -DDEBUG" +DEB_CFLAGS := -DDEBUG -std=c11 -Xcompiler "-Wall -Wextra -g -DDEBUG" REL_CFLAGS := -O3 -std=c11 -Xcompiler "-Wall -Wextra" -DEB_CXXFLAGS := -DDEBUG -std=c++17 -Xcompiler "-Wall -Wextra -g3 -DDEBUG" +DEB_CXXFLAGS := -DDEBUG -std=c++17 -Xcompiler "-Wall -Wextra -g -DDEBUG" REL_CXXFLAGS := -O3 -std=c++17 -Xcompiler "-Wall -Wextra" # Pre-defines @@ -202,11 +202,11 @@ bitonic_v1: $(BUILD_DIR)/$(TARGET) cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET) -bitonic_v2: CC := nvcc -G -g -x cu -bitonic_v2: CXX := nvcc -G -g -x cu +bitonic_v2: CC := nvcc -x cu +bitonic_v2: CXX := nvcc -x cu bitonic_v2: LINKER := nvcc -bitonic_v2: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V2 -bitonic_v2: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V2 +bitonic_v2: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V2 +bitonic_v2: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V2 bitonic_v2: OUTPUT_DIR := $(OUTPUT_DIR)/v2 bitonic_v2: $(BUILD_DIR)/$(TARGET) @mkdir -p $(OUTPUT_DIR) diff --git a/homework_3/src/bitonicsort.hpp b/homework_3/src/bitonicsort.hpp index 447e82c..e20031e 100644 --- a/homework_3/src/bitonicsort.hpp +++ b/homework_3/src/bitonicsort.hpp @@ -232,46 +232,40 @@ __global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage template -__global__ void inBlockStep(ValueT* data, size_t n, size_t nthreads, size_t innerSteps, size_t stage) { +__global__ void inBlockStep(ValueT* data, size_t n, size_t nthreads, size_t innerSteps, size_t stage, int *mutex) { extern __shared__ ValueT shared_data[]; - ValueT* lowerHalf = (ValueT*) shared_data; - ValueT* upperHalf = (ValueT*) &shared_data[nthreads]; - // Global memory thread and partner ids - threadId_t thread_id = threadIdx.x + blockIdx.x * blockDim.x; + for (size_t step = innerSteps + 1; step > 0; ) { + --step; - // Shared memory thread and partner ids - threadId_t local_tid = threadIdx.x; + // Global memory thread and partner ids + threadId_t Tid = threadIdx.x + blockIdx.x * blockDim.x; + threadId_t Pid = partner(Tid, step); + if (Tid > Pid) { + Tid += n >> 1; + Pid += n >> 1; + } - if ( thread_id < (n >> 1) ) { - // Fetch to local memory for both half - lowerHalf[local_tid] = data[thread_id]; - upperHalf[local_tid] = data[thread_id + (n >> 1)]; - __syncthreads(); + if ((Tid < n) && (Pid < n)) { // Boundary check + // Global to local index resolution + threadId_t tid = (Tid> 1))*nthreads)%(2*nthreads)); + threadId_t pid = tid + 1; + // Fetch to local memory + shared_data[tid] = data[Tid]; + shared_data[pid] = data[Pid]; + __syncthreads(); + + bool keep = keepSmall(Tid, Pid, stage); + exchange(shared_data, tid, pid, keep); + __syncthreads(); - for (size_t step = innerSteps + 1; step > 0; ) { - --step; - // Find partner and localize it - threadId_t partner_id = partner(thread_id, step); - threadId_t local_pid = partner_id % nthreads; - - if (local_tid < local_pid) { - // exchange on low site buffer (half of the threads) - bool keep = keepSmall(thread_id, partner_id, stage); - exchange(lowerHalf, local_tid, local_pid, keep); - } - else { - // exchange on high site buffer (other half of the threads) - bool keep = keepSmall(thread_id, partner_id, stage); - exchange(upperHalf, local_tid, local_pid, keep); - } + // Write back to global memory + data[Tid] = shared_data[tid]; + data[Pid] = shared_data[pid]; __syncthreads(); } - // Write back to global memory - data[thread_id] = lowerHalf[local_tid]; - data[thread_id + (n >> 1)] = upperHalf[local_tid]; - __syncthreads(); } + } /*! @@ -297,6 +291,10 @@ void bitonicSort(DataT& data) { cudaMalloc(&dev_data, size * sizeof(value_t)); cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice); + int* d_mutex; + cudaMalloc(&d_mutex, sizeof(int)); + cudaMemset(d_mutex, 0, sizeof(int)); // init mutex + int Nthreads = THREADS_PER_BLOCK; int Nblocks = ((size + Nthreads - 1) / Nthreads) >> 1; @@ -308,7 +306,7 @@ void bitonicSort(DataT& data) { interBlockStep<<>>(dev_data, size, step, stage); cudaDeviceSynchronize(); } - inBlockStep<<>>(dev_data, size, Nthreads, step, stage); + inBlockStep<<>>(dev_data, size, Nthreads, step, stage, d_mutex); cudaDeviceSynchronize(); } diff --git a/homework_3/src/config.h b/homework_3/src/config.h index 2cd4b49..b230526 100644 --- a/homework_3/src/config.h +++ b/homework_3/src/config.h @@ -33,8 +33,8 @@ static constexpr char version[] = "0.0"; // Default Data size (in case -q is not present) static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16; -static constexpr size_t THREADS_PER_BLOCK = 8; -static constexpr size_t IN_BLOCK_THRESHOLD = 4; +static constexpr size_t THREADS_PER_BLOCK = 1024; +static constexpr size_t IN_BLOCK_THRESHOLD = 512; /*! * Value and Buffer type selection diff --git a/homework_3/src/main.cpp b/homework_3/src/main.cpp index bd533c5..961d275 100644 --- a/homework_3/src/main.cpp +++ b/homework_3/src/main.cpp @@ -18,7 +18,7 @@ // Global session data -Data_t Data; +Data_t Data = {3, 5, 1, 2, 4, 7, 8, 6}; config_t config; Log logger;