HW3: [WIP] Test HPC build
This commit is contained in:
parent
2a2c7fec38
commit
b31ca23757
@ -45,9 +45,9 @@ OUTPUT_DIR := out
|
||||
|
||||
# ========== Compiler settings ==========
|
||||
# Compiler flags for debug and release
|
||||
DEB_CFLAGS := -DDEBUG -std=c11 -Xcompiler "-Wall -Wextra -g3 -DDEBUG"
|
||||
DEB_CFLAGS := -DDEBUG -std=c11 -Xcompiler "-Wall -Wextra -g -DDEBUG"
|
||||
REL_CFLAGS := -O3 -std=c11 -Xcompiler "-Wall -Wextra"
|
||||
DEB_CXXFLAGS := -DDEBUG -std=c++17 -Xcompiler "-Wall -Wextra -g3 -DDEBUG"
|
||||
DEB_CXXFLAGS := -DDEBUG -std=c++17 -Xcompiler "-Wall -Wextra -g -DDEBUG"
|
||||
REL_CXXFLAGS := -O3 -std=c++17 -Xcompiler "-Wall -Wextra"
|
||||
|
||||
# Pre-defines
|
||||
@ -202,11 +202,11 @@ bitonic_v1: $(BUILD_DIR)/$(TARGET)
|
||||
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
|
||||
|
||||
|
||||
bitonic_v2: CC := nvcc -G -g -x cu
|
||||
bitonic_v2: CXX := nvcc -G -g -x cu
|
||||
bitonic_v2: CC := nvcc -x cu
|
||||
bitonic_v2: CXX := nvcc -x cu
|
||||
bitonic_v2: LINKER := nvcc
|
||||
bitonic_v2: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V2
|
||||
bitonic_v2: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V2
|
||||
bitonic_v2: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V2
|
||||
bitonic_v2: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V2
|
||||
bitonic_v2: OUTPUT_DIR := $(OUTPUT_DIR)/v2
|
||||
bitonic_v2: $(BUILD_DIR)/$(TARGET)
|
||||
@mkdir -p $(OUTPUT_DIR)
|
||||
|
@ -232,48 +232,42 @@ __global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage
|
||||
|
||||
|
||||
template <typename ValueT>
|
||||
__global__ void inBlockStep(ValueT* data, size_t n, size_t nthreads, size_t innerSteps, size_t stage) {
|
||||
__global__ void inBlockStep(ValueT* data, size_t n, size_t nthreads, size_t innerSteps, size_t stage, int *mutex) {
|
||||
extern __shared__ ValueT shared_data[];
|
||||
ValueT* lowerHalf = (ValueT*) shared_data;
|
||||
ValueT* upperHalf = (ValueT*) &shared_data[nthreads];
|
||||
|
||||
// Global memory thread and partner ids
|
||||
threadId_t thread_id = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
// Shared memory thread and partner ids
|
||||
threadId_t local_tid = threadIdx.x;
|
||||
|
||||
if ( thread_id < (n >> 1) ) {
|
||||
// Fetch to local memory for both half
|
||||
lowerHalf[local_tid] = data[thread_id];
|
||||
upperHalf[local_tid] = data[thread_id + (n >> 1)];
|
||||
__syncthreads();
|
||||
|
||||
for (size_t step = innerSteps + 1; step > 0; ) {
|
||||
--step;
|
||||
// Find partner and localize it
|
||||
threadId_t partner_id = partner(thread_id, step);
|
||||
threadId_t local_pid = partner_id % nthreads;
|
||||
|
||||
if (local_tid < local_pid) {
|
||||
// exchange on low site buffer (half of the threads)
|
||||
bool keep = keepSmall(thread_id, partner_id, stage);
|
||||
exchange(lowerHalf, local_tid, local_pid, keep);
|
||||
}
|
||||
else {
|
||||
// exchange on high site buffer (other half of the threads)
|
||||
bool keep = keepSmall(thread_id, partner_id, stage);
|
||||
exchange(upperHalf, local_tid, local_pid, keep);
|
||||
// Global memory thread and partner ids
|
||||
threadId_t Tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
threadId_t Pid = partner(Tid, step);
|
||||
if (Tid > Pid) {
|
||||
Tid += n >> 1;
|
||||
Pid += n >> 1;
|
||||
}
|
||||
|
||||
if ((Tid < n) && (Pid < n)) { // Boundary check
|
||||
// Global to local index resolution
|
||||
threadId_t tid = (Tid<Pid) ? ((Tid*nthreads)%(2*nthreads)) : (((Tid - (n >> 1))*nthreads)%(2*nthreads));
|
||||
threadId_t pid = tid + 1;
|
||||
// Fetch to local memory
|
||||
shared_data[tid] = data[Tid];
|
||||
shared_data[pid] = data[Pid];
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
bool keep = keepSmall(Tid, Pid, stage);
|
||||
exchange(shared_data, tid, pid, keep);
|
||||
__syncthreads();
|
||||
|
||||
// Write back to global memory
|
||||
data[thread_id] = lowerHalf[local_tid];
|
||||
data[thread_id + (n >> 1)] = upperHalf[local_tid];
|
||||
data[Tid] = shared_data[tid];
|
||||
data[Pid] = shared_data[pid];
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/*!
|
||||
* A distributed version of the Bitonic sort algorithm.
|
||||
*
|
||||
@ -297,6 +291,10 @@ void bitonicSort(DataT& data) {
|
||||
cudaMalloc(&dev_data, size * sizeof(value_t));
|
||||
cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);
|
||||
|
||||
int* d_mutex;
|
||||
cudaMalloc(&d_mutex, sizeof(int));
|
||||
cudaMemset(d_mutex, 0, sizeof(int)); // init mutex
|
||||
|
||||
int Nthreads = THREADS_PER_BLOCK;
|
||||
int Nblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
|
||||
|
||||
@ -308,7 +306,7 @@ void bitonicSort(DataT& data) {
|
||||
interBlockStep<<<Nblocks, Nthreads>>>(dev_data, size, step, stage);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
inBlockStep<<<Nblocks, Nthreads, 2*Nthreads*sizeof(value_t)>>>(dev_data, size, Nthreads, step, stage);
|
||||
inBlockStep<<<Nblocks, Nthreads, 2*Nthreads*sizeof(value_t)>>>(dev_data, size, Nthreads, step, stage, d_mutex);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
|
@ -33,8 +33,8 @@ static constexpr char version[] = "0.0";
|
||||
// Default Data size (in case -q <N> is not present)
|
||||
static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16;
|
||||
|
||||
static constexpr size_t THREADS_PER_BLOCK = 8;
|
||||
static constexpr size_t IN_BLOCK_THRESHOLD = 4;
|
||||
static constexpr size_t THREADS_PER_BLOCK = 1024;
|
||||
static constexpr size_t IN_BLOCK_THRESHOLD = 512;
|
||||
|
||||
/*!
|
||||
* Value and Buffer type selection
|
||||
|
@ -18,7 +18,7 @@
|
||||
|
||||
|
||||
// Global session data
|
||||
Data_t Data;
|
||||
Data_t Data = {3, 5, 1, 2, 4, 7, 8, 6};
|
||||
config_t config;
|
||||
Log logger;
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user