|
|
template <typename ValueT>
__device__ void exchange(ValueT* data, int tid, int partner, bool keepSmall) {
  if (( keepSmall && (data[tid] > data[partner])) ||
      (!keepSmall && (data[tid] < data[partner])) ) {
    ValueT temp   = data[tid];
    data[tid]     = data[partner];
    data[partner] = temp;
  }
}
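// The kernels below also rely on two helpers defined earlier in this header:
// partner(tid, step) and keepSmall(tid, partner, stage). As a rough sketch of
// their assumed semantics (not verbatim from this file): partner() is expected
// to return the index that differs from tid only in bit `step`, i.e.
// tid ^ (1 << step), and keepSmall() decides the sorting direction of the
// pair, which in the classic bitonic network depends on bit `stage` of the id.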
|
|
|
|
|
|
|
|
|
|
|
#if CODE_VERSION == V0 |
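/*
 * V0: the simplest mapping. Every step of every stage is a separate kernel
 * launch; returning to the host (cudaDeviceSynchronize) between launches
 * provides the grid-wide barrier each step requires. One thread serves one
 * compare-exchange pair, so roughly half as many threads as elements are
 * launched (see HalfNblocks in the host code below).
 */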
|
|
|
template <typename ValueT>
__global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) {
  threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;  // Compute global thread ID
  threadId_t pid = partner(tid, step);
  if (tid > pid) {
    // This pair is already served by the thread with id == pid, so remap this
    // thread to the mirrored pair in the upper half of the array. This is how
    // n/2 threads cover all n elements.
    tid += n >> 1;
    pid += n >> 1;
  }
  if ((tid < n) && (pid < n)) {                        // Boundary check
    bool keep = keepSmall(tid, pid, stage);
    exchange(data, tid, pid, keep);
  }
}
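// Illustration of the remapping for n = 8, step = 0, with threads 0..3 serving
// all 8 elements: tid 0 gets pid 1 and handles the pair (0,1); tid 1 gets
// pid 0 < tid, so it is shifted by n/2 and handles (5,4); tid 2 handles (2,3);
// tid 3 is shifted and handles (7,6). Four threads cover all four pairs once.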
|
|
|
|
|
|
template <typename DataT>
void bitonicSort(DataT& data) {
  using value_t = typename DataT::value_type;

  value_t* dev_data;
  auto size = data.size();

  cudaMalloc(&dev_data, size * sizeof(value_t));
  cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);

  int Nthreads    = THREADS_PER_BLOCK;
  int HalfNblocks = ((size + Nthreads - 1) / Nthreads) >> 1;  // one thread per pair

  size_t Stages = static_cast<size_t>(log2(size));
  for (size_t stage = 1; stage <= Stages; ++stage) {
    for (size_t step = stage; step > 0; ) {
      --step;
      bitonicStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
      cudaDeviceSynchronize();
    }
  }

  cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost);
  cudaFree(dev_data);
}
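// The CUDA runtime calls above are unchecked for brevity. A minimal
// error-checking helper is sketched below; it is illustrative only (the name
// CUDA_CHECK is not used elsewhere in this header) and simply reports the
// error string of a failed runtime call.
#include <cstdio>
#define CUDA_CHECK(call)                                              \
  do {                                                                \
    cudaError_t err_ = (call);                                        \
    if (err_ != cudaSuccess) {                                        \
      std::fprintf(stderr, "CUDA error '%s' at %s:%d\n",              \
                   cudaGetErrorString(err_), __FILE__, __LINE__);     \
    }                                                                 \
  } while (0)
// Usage sketch: CUDA_CHECK(cudaMalloc(&dev_data, size * sizeof(value_t)));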
|
|
|
|
|
|
|
#elif CODE_VERSION == V1 |
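/*
 * V1: large steps, whose partners live in different thread blocks, still get
 * one kernel launch each (interBlockStep). The remaining small steps of every
 * stage are fused into a single inBlockStep launch that iterates over them and
 * synchronizes with __syncthreads() instead of returning to the host.
 */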
|
|
|
|
|
|
|
template <typename ValueT>
__device__ void interBlockStep_(ValueT* data, size_t n, size_t step, size_t stage) {
  threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;  // Compute global thread ID
  threadId_t pid = partner(tid, step);
  if (tid > pid) {
    tid += n >> 1;
    pid += n >> 1;
  }
  if ((tid < n) && (pid < n)) {                        // Boundary check
    bool keep = keepSmall(tid, pid, stage);
    exchange(data, tid, pid, keep);
  }
}
|
|
|
|
|
|
|
template <typename ValueT>
__global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
  interBlockStep_(data, n, step, stage);
}
|
|
|
|
|
|
|
|
|
|
|
template <typename ValueT>
__global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
  for (size_t step = innerSteps + 1; step > 0; ) {
    --step;
    interBlockStep_(data, n, step, stage);
    __syncthreads();
  }
}
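// Note: __syncthreads() only synchronizes the threads of one block, so fusing
// several steps into a single launch is only safe once every pair touched by a
// step lies within the portion of the data handled by a single block. That is
// what the InnerBlockSteps threshold in the host loop below guarantees before
// inBlockStep is launched.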
|
|
|
|
|
|
|
/*!
 * A CUDA version of the Bitonic sort algorithm.
 *
 * @note
 * The data is copied to the device, sorted there in place, and copied back.
 *
 * @tparam DataT  A contiguous random-access container type (e.g. std::vector);
 *                its size is expected to be a power of two.
 *
 * @param data [DataT] The host data to sort
 */
|
|
|
|
|
|
|
template <typename DataT>
void bitonicSort(DataT& data) {
  using value_t = typename DataT::value_type;

  value_t* dev_data;
  auto size = data.size();

  cudaMalloc(&dev_data, size * sizeof(value_t));
  cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);

  int Nthreads    = THREADS_PER_BLOCK;
  int HalfNblocks = ((size + Nthreads - 1) / Nthreads) >> 1;

  auto Stages          = static_cast<size_t>(log2(size));
  auto InnerBlockSteps = static_cast<size_t>(log2(IN_BLOCK_THRESHOLD));
  for (size_t stage = 1; stage <= Stages; ++stage) {
    size_t step = stage - 1;
    for ( ; step > InnerBlockSteps; --step) {
      interBlockStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
      cudaDeviceSynchronize();
    }
    inBlockStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
    cudaDeviceSynchronize();
  }

  cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost);
  cudaFree(dev_data);
}
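// Worked example of the launch schedule, assuming (for illustration only) that
// IN_BLOCK_THRESHOLD == 4, i.e. InnerBlockSteps == 2, and size == 16
// (Stages == 4):
//   stages 1..3: step starts at stage-1 <= 2, so only inBlockStep is launched,
//                covering steps stage-1 .. 0 in one kernel;
//   stage 4:     interBlockStep for step 3, then inBlockStep for steps 2..0.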
|
|
|
|
|
|
|
|
|
|
|
#elif CODE_VERSION == V2 |
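/*
 * V2: same two-level scheme as V1, but the fused in-block steps first stage
 * the block's elements into shared memory (one tile for the lower half of the
 * array and one for the mirrored upper half), run the small steps there, and
 * only then write the result back to global memory.
 */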
|
|
|
|
|
|
|
template <typename ValueT>
__global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
  threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;  // Compute global thread ID
  threadId_t pid = partner(tid, step);
  if (tid > pid) {
    tid += n >> 1;
    pid += n >> 1;
  }
  if ((tid < n) && (pid < n)) {                        // Boundary check
    bool keep = keepSmall(tid, pid, stage);
    exchange(data, tid, pid, keep);
  }
}
|
|
|
|
|
|
|
|
|
|
|
template <typename ValueT>
__global__ void inBlockStep(ValueT* data, size_t n, size_t nthreads, size_t innerSteps, size_t stage) {
  extern __shared__ ValueT shared_data[];
  ValueT* lowerHalf = (ValueT*) shared_data;
  ValueT* upperHalf = (ValueT*) &shared_data[nthreads];

  // Global memory thread id
  threadId_t thread_id = threadIdx.x + blockIdx.x * blockDim.x;

  // Shared memory (block-local) thread id
  threadId_t local_tid = threadIdx.x;

  if (thread_id < (n >> 1)) {
    // Fetch both halves into shared memory
    lowerHalf[local_tid] = data[thread_id];
    upperHalf[local_tid] = data[thread_id + (n >> 1)];
    __syncthreads();

    for (size_t step = innerSteps + 1; step > 0; ) {
      --step;
      // Find the partner and map it into the block-local tile
      threadId_t partner_id = partner(thread_id, step);
      threadId_t local_pid  = partner_id % nthreads;

      if (local_tid < local_pid) {
        // exchange on the lower-half buffer (half of the threads)
        bool keep = keepSmall(thread_id, partner_id, stage);
        exchange(lowerHalf, local_tid, local_pid, keep);
      }
      else {
        // exchange on the upper-half buffer (the other half of the threads)
        bool keep = keepSmall(thread_id, partner_id, stage);
        exchange(upperHalf, local_tid, local_pid, keep);
      }
      __syncthreads();
    }

    // Write back to global memory
    data[thread_id]            = lowerHalf[local_tid];
    data[thread_id + (n >> 1)] = upperHalf[local_tid];
    __syncthreads();
  }
}
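// Note: the kernel stages 2*nthreads elements into dynamic shared memory, so
// the launch must pass a shared-memory size of 2 * Nthreads * sizeof(ValueT),
// as the host code below does.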
|
|
|
|
|
|
|
/*!
 * A CUDA version of the Bitonic sort algorithm that uses shared memory for the
 * in-block steps.
 *
 * @note
 * The data is copied to the device, sorted there in place, and copied back.
 *
 * @tparam DataT  A contiguous random-access container type (e.g. std::vector);
 *                its size is expected to be a power of two.
 *
 * @param data [DataT] The host data to sort
 */
|
|
|
|
|
|
|
template <typename DataT>
void bitonicSort(DataT& data) {
  using value_t = typename DataT::value_type;

  value_t* dev_data;
  auto size = data.size();

  cudaMalloc(&dev_data, size * sizeof(value_t));
  cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);

  int Nthreads = THREADS_PER_BLOCK;
  int Nblocks  = ((size + Nthreads - 1) / Nthreads) >> 1;

  auto Stages          = static_cast<size_t>(log2(size));
  auto InnerBlockSteps = static_cast<size_t>(log2(IN_BLOCK_THRESHOLD));
  for (size_t stage = 1; stage <= Stages; ++stage) {
    size_t step = stage - 1;
    for ( ; step > InnerBlockSteps; --step) {
      interBlockStep<<<Nblocks, Nthreads>>>(dev_data, size, step, stage);
      cudaDeviceSynchronize();
    }
    inBlockStep<<<Nblocks, Nthreads, 2 * Nthreads * sizeof(value_t)>>>(dev_data, size, Nthreads, step, stage);
    cudaDeviceSynchronize();
  }

  cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost);
  cudaFree(dev_data);
}
|
|
|
|
|
|
|
#endif |
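/*
 * Usage sketch (illustrative; assumes the buffer length is a power of two and
 * that THREADS_PER_BLOCK, IN_BLOCK_THRESHOLD and CODE_VERSION are provided by
 * the build):
 *
 *   std::vector<float> v = ...;   // host data, power-of-two length
 *   bitonicSort(v);               // sorted in place on the GPU
 */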
|
|
|
|
|
|
|
#endif //BITONICSORTCUDA_H_ |