HW3: [WIP] V2 added (needs debug)
This commit is contained in:
parent
1fe5ab4da7
commit
2a2c7fec38
@ -45,9 +45,9 @@ OUTPUT_DIR := out
|
|||||||
|
|
||||||
# ========== Compiler settings ==========
|
# ========== Compiler settings ==========
|
||||||
# Compiler flags for debug and release
|
# Compiler flags for debug and release
|
||||||
DEB_CFLAGS := -DDEBUG -g3 -std=c11 -Xcompiler "-Wall -Wextra"
|
DEB_CFLAGS := -DDEBUG -std=c11 -Xcompiler "-Wall -Wextra -g3 -DDEBUG"
|
||||||
REL_CFLAGS := -O3 -std=c11 -Xcompiler "-Wall -Wextra"
|
REL_CFLAGS := -O3 -std=c11 -Xcompiler "-Wall -Wextra"
|
||||||
DEB_CXXFLAGS := -DDEBUG -g3 -std=c++17 -Xcompiler "-Wall -Wextra"
|
DEB_CXXFLAGS := -DDEBUG -std=c++17 -Xcompiler "-Wall -Wextra -g3 -DDEBUG"
|
||||||
REL_CXXFLAGS := -O3 -std=c++17 -Xcompiler "-Wall -Wextra"
|
REL_CXXFLAGS := -O3 -std=c++17 -Xcompiler "-Wall -Wextra"
|
||||||
|
|
||||||
# Pre-defines
|
# Pre-defines
|
||||||
@ -186,15 +186,40 @@ bitonic_v0: LINKER := nvcc
|
|||||||
bitonic_v0: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V0
|
bitonic_v0: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V0
|
||||||
bitonic_v0: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V0
|
bitonic_v0: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V0
|
||||||
bitonic_v0: OUTPUT_DIR := $(OUTPUT_DIR)/v0
|
bitonic_v0: OUTPUT_DIR := $(OUTPUT_DIR)/v0
|
||||||
bitonic_v0: TARGET := bitonic_v0
|
|
||||||
bitonic_v0: $(BUILD_DIR)/$(TARGET)
|
bitonic_v0: $(BUILD_DIR)/$(TARGET)
|
||||||
@mkdir -p $(OUTPUT_DIR)
|
@mkdir -p $(OUTPUT_DIR)
|
||||||
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
|
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
|
||||||
|
|
||||||
|
|
||||||
|
bitonic_v1: CC := nvcc -x cu
|
||||||
|
bitonic_v1: CXX := nvcc -x cu
|
||||||
|
bitonic_v1: LINKER := nvcc
|
||||||
|
bitonic_v1: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V1
|
||||||
|
bitonic_v1: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V1
|
||||||
|
bitonic_v1: OUTPUT_DIR := $(OUTPUT_DIR)/v1
|
||||||
|
bitonic_v1: $(BUILD_DIR)/$(TARGET)
|
||||||
|
@mkdir -p $(OUTPUT_DIR)
|
||||||
|
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
|
||||||
|
|
||||||
|
|
||||||
|
bitonic_v2: CC := nvcc -G -g -x cu
|
||||||
|
bitonic_v2: CXX := nvcc -G -g -x cu
|
||||||
|
bitonic_v2: LINKER := nvcc
|
||||||
|
bitonic_v2: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V2
|
||||||
|
bitonic_v2: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V2
|
||||||
|
bitonic_v2: OUTPUT_DIR := $(OUTPUT_DIR)/v2
|
||||||
|
bitonic_v2: $(BUILD_DIR)/$(TARGET)
|
||||||
|
@mkdir -p $(OUTPUT_DIR)
|
||||||
|
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
|
||||||
|
|
||||||
|
|
||||||
hpc-build:
|
hpc-build:
|
||||||
make clean
|
make clean
|
||||||
make bitonic_v0
|
make bitonic_v0
|
||||||
|
make clean
|
||||||
|
make bitonic_v1
|
||||||
|
make clean
|
||||||
|
make bitonic_v2
|
||||||
|
|
||||||
|
|
||||||
all: debug bitonic_v0
|
all: debug bitonic_v0
|
||||||
|
@ -76,7 +76,7 @@ __device__ inline bool keepSmall(threadId_t tid, threadId_t partner, size_t stag
|
|||||||
|
|
||||||
|
|
||||||
template <typename ValueT>
|
template <typename ValueT>
|
||||||
__device__ void cudaExchange(ValueT* data, int tid, int partner, bool keepSmall) {
|
__device__ void exchange(ValueT* data, int tid, int partner, bool keepSmall) {
|
||||||
if (( keepSmall && (data[tid] > data[partner])) ||
|
if (( keepSmall && (data[tid] > data[partner])) ||
|
||||||
(!keepSmall && (data[tid] < data[partner])) ) {
|
(!keepSmall && (data[tid] < data[partner])) ) {
|
||||||
ValueT temp = data[tid];
|
ValueT temp = data[tid];
|
||||||
@ -85,16 +85,18 @@ __device__ void cudaExchange(ValueT* data, int tid, int partner, bool keepSmall)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if CODE_VERSION == V0
|
||||||
template <typename ValueT>
|
template <typename ValueT>
|
||||||
__global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) {
|
__global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) {
|
||||||
threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x; // Compute global thread ID
|
threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x; // Compute global thread ID
|
||||||
if (tid < n) {
|
|
||||||
threadId_t pid = partner(tid, step);
|
threadId_t pid = partner(tid, step);
|
||||||
if (pid < n) {
|
if (tid > pid) {
|
||||||
bool keep = keepSmall(tid, pid, stage);
|
tid += n >> 1;
|
||||||
cudaExchange(data, tid, pid, keep);
|
pid += n >> 1;
|
||||||
}
|
}
|
||||||
|
if ((tid < n) && (pid < n)) { // Boundary check
|
||||||
|
bool keep = keepSmall(tid, pid, stage);
|
||||||
|
exchange(data, tid, pid, keep);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -122,14 +124,14 @@ void bitonicSort(DataT& data) {
|
|||||||
cudaMalloc(&dev_data, size * sizeof(value_t));
|
cudaMalloc(&dev_data, size * sizeof(value_t));
|
||||||
cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);
|
cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
int Nthreads = 1024;
|
int Nthreads = THREADS_PER_BLOCK;
|
||||||
int Nblocks = (size + Nthreads - 1) / Nthreads;
|
int HalfNblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
|
||||||
|
|
||||||
size_t max_depth = static_cast<size_t>(log2(size));
|
size_t Stages = static_cast<size_t>(log2(size));
|
||||||
for (size_t stage = 1; stage <= max_depth; ++stage) {
|
for (size_t stage = 1; stage <= Stages; ++stage) {
|
||||||
for (size_t step = stage; step > 0; ) {
|
for (size_t step = stage; step > 0; ) {
|
||||||
--step;
|
--step;
|
||||||
bitonicStep<<<Nblocks, Nthreads>>>(dev_data, size, step, stage);
|
bitonicStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
|
||||||
cudaDeviceSynchronize();
|
cudaDeviceSynchronize();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -138,8 +140,182 @@ void bitonicSort(DataT& data) {
|
|||||||
cudaFree(dev_data);
|
cudaFree(dev_data);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#elif CODE_VERSION == V1
|
||||||
|
|
||||||
|
template <typename ValueT>
|
||||||
|
__device__ void interBlockStep_(ValueT* data, size_t n, size_t step, size_t stage) {
|
||||||
|
threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x; // Compute global thread ID
|
||||||
|
threadId_t pid = partner(tid, step);
|
||||||
|
if (tid > pid) {
|
||||||
|
tid += n >> 1;
|
||||||
|
pid += n >> 1;
|
||||||
|
}
|
||||||
|
if ((tid < n) && (pid < n)) { // Boundary check
|
||||||
|
bool keep = keepSmall(tid, pid, stage);
|
||||||
|
exchange(data, tid, pid, keep);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename ValueT>
|
||||||
|
__global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
|
||||||
|
interBlockStep_(data, n, step, stage);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <typename ValueT>
|
||||||
|
__global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
|
||||||
|
for (size_t step = innerSteps + 1; step > 0; ) {
|
||||||
|
--step;
|
||||||
|
interBlockStep_(data, n, step, stage);
|
||||||
|
__syncthreads();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* A distributed version of the Bitonic sort algorithm.
|
||||||
|
*
|
||||||
|
* @note
|
||||||
|
* Each MPI process should run an instance of this function.
|
||||||
|
*
|
||||||
|
* @tparam ShadowedDataT A Shadowed buffer type with random access iterator.
|
||||||
|
*
|
||||||
|
* @param data [ShadowedDataT] The local to MPI process data to sort
|
||||||
|
* @param Processes [mpi_id_t] The total number of MPI processes
|
||||||
|
* @param rank [mpi_id_t] The current process id
|
||||||
|
*/
|
||||||
|
|
||||||
|
template <typename DataT>
|
||||||
|
void bitonicSort(DataT& data) {
|
||||||
|
using value_t = typename DataT::value_type;
|
||||||
|
|
||||||
|
value_t* dev_data;
|
||||||
|
auto size = data.size();
|
||||||
|
|
||||||
|
cudaMalloc(&dev_data, size * sizeof(value_t));
|
||||||
|
cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
|
int Nthreads = THREADS_PER_BLOCK;
|
||||||
|
int HalfNblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
|
||||||
|
|
||||||
|
auto Stages = static_cast<size_t>(log2(size));
|
||||||
|
auto InnerBlockSteps = static_cast<size_t>(log2(IN_BLOCK_THRESHOLD));
|
||||||
|
for (size_t stage = 1; stage <= Stages; ++stage) {
|
||||||
|
size_t step = stage - 1;
|
||||||
|
for ( ; step > InnerBlockSteps; --step) {
|
||||||
|
interBlockStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
|
||||||
|
cudaDeviceSynchronize();
|
||||||
|
}
|
||||||
|
inBlockStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
|
||||||
|
cudaDeviceSynchronize();
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost);
|
||||||
|
cudaFree(dev_data);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#elif CODE_VERSION == V2
|
||||||
|
|
||||||
|
template <typename ValueT>
|
||||||
|
__global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
|
||||||
|
threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x; // Compute global thread ID
|
||||||
|
threadId_t pid = partner(tid, step);
|
||||||
|
if (tid > pid) {
|
||||||
|
tid += n >> 1;
|
||||||
|
pid += n >> 1;
|
||||||
|
}
|
||||||
|
if ((tid < n) && (pid < n)) { // Boundary check
|
||||||
|
bool keep = keepSmall(tid, pid, stage);
|
||||||
|
exchange(data, tid, pid, keep);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <typename ValueT>
|
||||||
|
__global__ void inBlockStep(ValueT* data, size_t n, size_t nthreads, size_t innerSteps, size_t stage) {
|
||||||
|
extern __shared__ ValueT shared_data[];
|
||||||
|
ValueT* lowerHalf = (ValueT*) shared_data;
|
||||||
|
ValueT* upperHalf = (ValueT*) &shared_data[nthreads];
|
||||||
|
|
||||||
|
// Global memory thread and partner ids
|
||||||
|
threadId_t thread_id = threadIdx.x + blockIdx.x * blockDim.x;
|
||||||
|
|
||||||
|
// Shared memory thread and partner ids
|
||||||
|
threadId_t local_tid = threadIdx.x;
|
||||||
|
|
||||||
|
if ( thread_id < (n >> 1) ) {
|
||||||
|
// Fetch to local memory for both half
|
||||||
|
lowerHalf[local_tid] = data[thread_id];
|
||||||
|
upperHalf[local_tid] = data[thread_id + (n >> 1)];
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
for (size_t step = innerSteps + 1; step > 0; ) {
|
||||||
|
--step;
|
||||||
|
// Find partner and localize it
|
||||||
|
threadId_t partner_id = partner(thread_id, step);
|
||||||
|
threadId_t local_pid = partner_id % nthreads;
|
||||||
|
|
||||||
|
if (local_tid < local_pid) {
|
||||||
|
// exchange on low site buffer (half of the threads)
|
||||||
|
bool keep = keepSmall(thread_id, partner_id, stage);
|
||||||
|
exchange(lowerHalf, local_tid, local_pid, keep);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// exchange on high site buffer (other half of the threads)
|
||||||
|
bool keep = keepSmall(thread_id, partner_id, stage);
|
||||||
|
exchange(upperHalf, local_tid, local_pid, keep);
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
}
|
||||||
|
// Write back to global memory
|
||||||
|
data[thread_id] = lowerHalf[local_tid];
|
||||||
|
data[thread_id + (n >> 1)] = upperHalf[local_tid];
|
||||||
|
__syncthreads();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* A distributed version of the Bitonic sort algorithm.
|
||||||
|
*
|
||||||
|
* @note
|
||||||
|
* Each MPI process should run an instance of this function.
|
||||||
|
*
|
||||||
|
* @tparam dDataT A Shadowed buffer type with random access iterator.
|
||||||
|
*
|
||||||
|
* @param data [ShadowedDataT] The local to MPI process data to sort
|
||||||
|
* @param Processes [mpi_id_t] The total number of MPI processes
|
||||||
|
* @param rank [mpi_id_t] The current process id
|
||||||
|
*/
|
||||||
|
|
||||||
|
template <typename DataT>
|
||||||
|
void bitonicSort(DataT& data) {
|
||||||
|
using value_t = typename DataT::value_type;
|
||||||
|
|
||||||
|
value_t* dev_data;
|
||||||
|
auto size = data.size();
|
||||||
|
|
||||||
|
cudaMalloc(&dev_data, size * sizeof(value_t));
|
||||||
|
cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
|
int Nthreads = THREADS_PER_BLOCK;
|
||||||
|
int Nblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
|
||||||
|
|
||||||
|
auto Stages = static_cast<size_t>(log2(size));
|
||||||
|
auto InnerBlockSteps = static_cast<size_t>(log2(IN_BLOCK_THRESHOLD));
|
||||||
|
for (size_t stage = 1; stage <= Stages; ++stage) {
|
||||||
|
size_t step = stage - 1;
|
||||||
|
for ( ; step > InnerBlockSteps; --step) {
|
||||||
|
interBlockStep<<<Nblocks, Nthreads>>>(dev_data, size, step, stage);
|
||||||
|
cudaDeviceSynchronize();
|
||||||
|
}
|
||||||
|
inBlockStep<<<Nblocks, Nthreads, 2*Nthreads*sizeof(value_t)>>>(dev_data, size, Nthreads, step, stage);
|
||||||
|
cudaDeviceSynchronize();
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost);
|
||||||
|
cudaFree(dev_data);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif //BITONICSORTCUDA_H_
|
#endif //BITONICSORTCUDA_H_
|
||||||
|
@ -27,12 +27,14 @@ static constexpr char version[] = "0.0";
|
|||||||
|
|
||||||
// Fail-safe version selection
|
// Fail-safe version selection
|
||||||
#if !defined CODE_VERSION
|
#if !defined CODE_VERSION
|
||||||
#define CODE_VERSION V0
|
#define CODE_VERSION V2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Default Data size (in case -q <N> is not present)
|
// Default Data size (in case -q <N> is not present)
|
||||||
static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16;
|
static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16;
|
||||||
|
|
||||||
|
static constexpr size_t THREADS_PER_BLOCK = 8;
|
||||||
|
static constexpr size_t IN_BLOCK_THRESHOLD = 4;
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* Value and Buffer type selection
|
* Value and Buffer type selection
|
||||||
@ -46,7 +48,7 @@ static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16;
|
|||||||
* float
|
* float
|
||||||
* double
|
* double
|
||||||
*/
|
*/
|
||||||
using Value_t = uint32_t;
|
using Value_t = uint8_t;
|
||||||
using Data_t = std::vector<Value_t>;
|
using Data_t = std::vector<Value_t>;
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
|
@ -185,6 +185,13 @@ int main(int argc, char* argv[]) try {
|
|||||||
std::cout << "[Validation] Results validation ...";
|
std::cout << "[Validation] Results validation ...";
|
||||||
bool val = validator(Data);
|
bool val = validator(Data);
|
||||||
std::cout << ((val) ? "\x1B[32m [PASSED] \x1B[0m\n" : " \x1B[32m [FAILED] \x1B[0m\n");
|
std::cout << ((val) ? "\x1B[32m [PASSED] \x1B[0m\n" : " \x1B[32m [FAILED] \x1B[0m\n");
|
||||||
|
if (Data.size() < 128) {
|
||||||
|
std::cout << "Data: ";
|
||||||
|
for (auto& v : Data) {
|
||||||
|
std::cout << (int)v << ", ";
|
||||||
|
}
|
||||||
|
std::cout << '\n';
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user