HW3: [WIP] Test HPC build

2025-02-14 17:56:56 +02:00 · 2025-02-14 17:56:56 +02:00 · b31ca23757
commit b31ca23757
parent 2a2c7fec38
4 changed files with 39 additions and 41 deletions
--- a/homework_3/Makefile
+++ b/homework_3/Makefile
@ -45,9 +45,9 @@ OUTPUT_DIR      := out

 # ========== Compiler settings ==========
 # Compiler flags for debug and release
-DEB_CFLAGS      := -DDEBUG -std=c11 -Xcompiler "-Wall -Wextra -g3 -DDEBUG"
+DEB_CFLAGS      := -DDEBUG -std=c11 -Xcompiler "-Wall -Wextra -g -DDEBUG"
 REL_CFLAGS      := -O3 -std=c11 -Xcompiler "-Wall -Wextra"
-DEB_CXXFLAGS    := -DDEBUG -std=c++17 -Xcompiler "-Wall -Wextra -g3 -DDEBUG" 
+DEB_CXXFLAGS    := -DDEBUG -std=c++17 -Xcompiler "-Wall -Wextra -g -DDEBUG" 
 REL_CXXFLAGS    := -O3 -std=c++17 -Xcompiler "-Wall -Wextra"

 # Pre-defines
@ -202,11 +202,11 @@ bitonic_v1: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)


-bitonic_v2: CC := nvcc -G -g -x cu
-bitonic_v2: CXX := nvcc -G -g -x cu
+bitonic_v2: CC := nvcc -x cu
+bitonic_v2: CXX := nvcc -x cu
 bitonic_v2: LINKER := nvcc
-bitonic_v2: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V2
-bitonic_v2: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V2
+bitonic_v2: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V2
+bitonic_v2: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V2
 bitonic_v2: OUTPUT_DIR := $(OUTPUT_DIR)/v2
 bitonic_v2: $(BUILD_DIR)/$(TARGET)
 	@mkdir -p $(OUTPUT_DIR)
--- a/homework_3/src/bitonicsort.hpp
+++ b/homework_3/src/bitonicsort.hpp
@ -232,48 +232,42 @@ __global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage


 template <typename ValueT>
-__global__ void inBlockStep(ValueT* data, size_t n, size_t nthreads, size_t innerSteps, size_t stage) {
+__global__ void inBlockStep(ValueT* data, size_t n, size_t nthreads, size_t innerSteps, size_t stage, int *mutex) {
    extern __shared__ ValueT shared_data[];
-    ValueT* lowerHalf = (ValueT*) shared_data;
-    ValueT* upperHalf = (ValueT*) &shared_data[nthreads];
-
-    // Global memory thread and partner ids
-    threadId_t thread_id  = threadIdx.x + blockIdx.x * blockDim.x;
-
-    // Shared memory thread and partner ids
-    threadId_t local_tid = threadIdx.x;
-
-    if ( thread_id  < (n >> 1) ) {
-        // Fetch to local memory for both half
-        lowerHalf[local_tid] = data[thread_id];
-        upperHalf[local_tid] = data[thread_id + (n >> 1)];
-        __syncthreads();

    for (size_t step = innerSteps + 1; step > 0; ) {
        --step;
-            // Find partner and localize it
-            threadId_t partner_id = partner(thread_id, step);
-            threadId_t local_pid = partner_id % nthreads;

-            if (local_tid < local_pid) {
-                // exchange on low site buffer (half of the threads)
-                bool keep = keepSmall(thread_id, partner_id, stage);
-                exchange(lowerHalf, local_tid, local_pid, keep);
-            }
-            else {
-                // exchange on high site buffer (other half of the threads)
-                bool keep = keepSmall(thread_id, partner_id, stage);
-                exchange(upperHalf, local_tid, local_pid, keep);
+        // Global memory thread and partner ids
+        threadId_t Tid = threadIdx.x + blockIdx.x * blockDim.x;
+        threadId_t Pid = partner(Tid, step);
+        if (Tid > Pid) {
+            Tid += n >> 1;
+            Pid += n >> 1;
        }
+
+        if ((Tid < n) && (Pid < n)) {       // Boundary check
+        	// Global to local index resolution
+        	threadId_t tid = (Tid<Pid) ? ((Tid*nthreads)%(2*nthreads)) : (((Tid - (n >> 1))*nthreads)%(2*nthreads));
+        	threadId_t pid = tid + 1;
+        	// Fetch to local memory
+        	shared_data[tid] = data[Tid];
+        	shared_data[pid] = data[Pid];
        	__syncthreads();
-        }
+
+            bool keep = keepSmall(Tid, Pid, stage);
+            exchange(shared_data, tid, pid, keep);
+            __syncthreads();
+
            // Write back to global memory
-        data[thread_id] = lowerHalf[local_tid];
-        data[thread_id + (n >> 1)] = upperHalf[local_tid];
+            data[Tid] = shared_data[tid];
+            data[Pid] = shared_data[pid];
            __syncthreads();
        }
    }

+}
+
 /*!
 * A distributed version of the Bitonic sort algorithm.
 *
@ -297,6 +291,10 @@ void bitonicSort(DataT& data) {
    cudaMalloc(&dev_data, size * sizeof(value_t));
    cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);

+    int* d_mutex;
+    cudaMalloc(&d_mutex, sizeof(int));
+    cudaMemset(d_mutex, 0, sizeof(int));  // init mutex
+
    int Nthreads = THREADS_PER_BLOCK;
    int Nblocks = ((size + Nthreads - 1) / Nthreads) >> 1;

@ -308,7 +306,7 @@ void bitonicSort(DataT& data) {
            interBlockStep<<<Nblocks, Nthreads>>>(dev_data, size, step, stage);
            cudaDeviceSynchronize();
        }
-        inBlockStep<<<Nblocks, Nthreads, 2*Nthreads*sizeof(value_t)>>>(dev_data, size, Nthreads, step, stage);
+        inBlockStep<<<Nblocks, Nthreads, 2*Nthreads*sizeof(value_t)>>>(dev_data, size, Nthreads, step, stage, d_mutex);
        cudaDeviceSynchronize();
    }

--- a/homework_3/src/config.h
+++ b/homework_3/src/config.h
@ -33,8 +33,8 @@ static constexpr char version[] = "0.0";
 // Default Data size (in case -q <N> is not present)
 static constexpr size_t DEFAULT_DATA_SIZE   = 1 << 16;

-static constexpr size_t THREADS_PER_BLOCK   = 8;
-static constexpr size_t IN_BLOCK_THRESHOLD  = 4;
+static constexpr size_t THREADS_PER_BLOCK   = 1024;
+static constexpr size_t IN_BLOCK_THRESHOLD  = 512;

 /*!
 * Value and Buffer type selection
--- a/homework_3/src/main.cpp
+++ b/homework_3/src/main.cpp
@ -18,7 +18,7 @@


 // Global session data
-Data_t          Data;
+Data_t          Data = {3, 5, 1, 2, 4, 7, 8, 6};
 config_t        config;
 Log             logger;