|
|
@@ -21,7 +21,7 @@ |
|
|
|
/* |
|
|
|
* Exported timers |
|
|
|
*/ |
|
|
|
extern Timing Timer_total; |
|
|
|
extern Timing Timer_total, Timer_memory, Timer_sorting; |
|
|
|
|
|
|
|
using threadId_t = size_t; |
|
|
|
|
|
|
@@ -74,9 +74,41 @@ __device__ inline bool keepSmall(threadId_t tid, threadId_t partner, size_t stag |
|
|
|
* ============================== Sort algorithms ============================== |
|
|
|
*/ |
|
|
|
|
|
|
|
/*! |
|
|
|
* Each thread can handle 2 points in the array. For each of these 2 points it may |
|
|
|
* - compare and exchange if needed |
|
|
|
* - copy data to local and back if needed |
|
|
|
*/ |
|
|
|
static constexpr size_t SizeToThreadsRatio = 2; |
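
/*
 * Illustration (hypothetical numbers): with SizeToThreadsRatio = 2, an array
 * of 1024 points is served by 512 threads, each covering 2 array positions.
 */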
|
|
|
|
|
|
|
/*! |
|
|
|
* Calculates the number of blocks needed for the entire sorting process
|
|
|
* |
|
|
|
* @note |
|
|
|
* This "redundant" little trick makes sure blocks are allocated for arraySizes that are not exact |
|
|
|
* multiples of config.blockSize.
|
|
|
* Even if we don't need it, we keep it in case we experiment with weird sizes in the future! |
|
|
|
* |
|
|
|
* @param arraySize [ArraySize_t] The size of the entire array (in points) |
|
|
|
* @return [size_t] The number of blocks |
|
|
|
*/ |
|
|
|
inline size_t NBlocks(ArraySize_t arraySize) { |
|
|
|
return (((arraySize + config.blockSize - 1) / config.blockSize) / SizeToThreadsRatio); |
|
|
|
} |
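
/*
 * Worked example (illustrative numbers): for arraySize = 1 << 20 and
 * config.blockSize = 256, NBlocks() yields ((1048576 + 255) / 256) / 2 = 2048
 * blocks, i.e. one thread for every SizeToThreadsRatio = 2 data points.
 */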
|
|
|
|
|
|
|
|
|
|
|
/*! |
|
|
|
* Exchange utility |
|
|
|
* |
|
|
|
* @tparam ValueT The underlying data type of the array items |
|
|
|
* |
|
|
|
* @param data [ValueT*] Pointer to data array |
|
|
|
* @param tid [threadId_t] Current thread's index to data |
|
|
|
* @param partner   [threadId_t] Partner's index to data
|
|
|
* @param keepSmall [bool] Flag indicating whether the current thread keeps the smaller value
|
|
|
*/ |
|
|
|
template <typename ValueT> |
|
|
|
__device__ void exchange(ValueT* data, int tid, int partner, bool keepSmall) { |
|
|
|
__device__ void exchange(ValueT* data, threadId_t tid, threadId_t partner, bool keepSmall) { |
|
|
|
if (( keepSmall && (data[tid] > data[partner])) || |
|
|
|
(!keepSmall && (data[tid] < data[partner])) ) { |
|
|
|
ValueT temp = data[tid]; |
|
|
@@ -86,13 +118,24 @@ __device__ void exchange(ValueT* data, int tid, int partner, bool keepSmall) { |
|
|
|
} |
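
/*
 * Worked example (hypothetical values): for data = {5, 3}, tid = 0,
 * partner = 1 and keepSmall = true, the condition data[tid] > data[partner]
 * holds, so the values are swapped and data = {3, 5}; the thread keeping the
 * small end now holds the smaller value, as a compare-and-exchange network
 * step requires.
 */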
|
|
|
|
|
|
|
#if CODE_VERSION == V0 |
|
|
|
|
|
|
|
/*! |
|
|
|
* This is the body of each thread. This function compares and exchanges data
|
|
|
* |
|
|
|
* @tparam ValueT The underlying data type of the array items |
|
|
|
* @param data [ValueT*] Pointer to data array |
|
|
|
* @param n [size_t] The total size of the array |
|
|
|
* @param step [size_t] The current step of the current stage of bitonic sort |
|
|
|
* @param stage [size_t] The current stage of bitonic sort |
|
|
|
*/ |
|
|
|
template <typename ValueT> |
|
|
|
__global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) { |
|
|
|
threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x; // Compute global thread ID |
|
|
|
threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x; // Keep contiguous addressing to the first half of the array |
|
|
|
threadId_t pid = partner(tid, step); |
|
|
|
if (tid > pid) { |
|
|
|
tid += n >> 1; |
|
|
|
pid += n >> 1; |
|
|
|
// Shift to the other half of the array for global data |
|
|
|
tid += n / SizeToThreadsRatio; |
|
|
|
pid += n / SizeToThreadsRatio; |
|
|
|
} |
|
|
|
if ((tid < n) && (pid < n)) { // Boundary check |
|
|
|
bool keep = keepSmall(tid, pid, stage); |
|
|
@@ -102,18 +145,11 @@ __global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) { |
|
|
|
|
|
|
|
|
|
|
|
/*! |
|
|
|
* A distributed version of the Bitonic sort algorithm. |
|
|
|
* A CUDA version of the Bitonic sort algorithm. |
|
|
|
* |
|
|
|
* @note |
|
|
|
* Each MPI process should run an instance of this function. |
|
|
|
* |
|
|
|
* @tparam ShadowedDataT A Shadowed buffer type with random access iterator. |
|
|
|
* |
|
|
|
* @param data [ShadowedDataT] The local to MPI process data to sort |
|
|
|
* @param Processes [mpi_id_t] The total number of MPI processes |
|
|
|
* @param rank [mpi_id_t] The current process id |
|
|
|
* @tparam DataT A container type to hold the data array. Should have .data() and .size() methods
|
|
|
* @param data [DataT&] Reference to the container to sort |
|
|
|
*/ |
|
|
|
|
|
|
|
template <typename DataT> |
|
|
|
void bitonicSort(DataT& data) { |
|
|
|
using value_t = typename DataT::value_type; |
|
|
@@ -121,34 +157,57 @@ void bitonicSort(DataT& data) { |
|
|
|
value_t* dev_data; |
|
|
|
auto size = data.size(); |
|
|
|
|
|
|
|
cudaMalloc(&dev_data, size * sizeof(value_t)); |
|
|
|
cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice); |
|
|
|
Timer_memory.start(); |
|
|
|
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess) |
|
|
|
throw std::runtime_error("[CUDA] - Cannot allocate memory\n");
|
|
|
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess) |
|
|
|
throw std::runtime_error("[CUDA] - Cannot copy memory to device\n");
|
|
|
Timer_memory.stop(); |
|
|
|
|
|
|
|
int Nthreads = THREADS_PER_BLOCK; |
|
|
|
int HalfNblocks = ((size + Nthreads - 1) / Nthreads) >> 1; |
|
|
|
size_t Nth = config.blockSize; |
|
|
|
size_t Nbl = NBlocks(size); |
|
|
|
|
|
|
|
size_t Stages = static_cast<size_t>(log2(size)); |
|
|
|
Timer_sorting.start(); |
|
|
|
for (size_t stage = 1; stage <= Stages; ++stage) { |
|
|
|
for (size_t step = stage; step > 0; ) { |
|
|
|
--step; |
|
|
|
bitonicStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage); |
|
|
|
bitonicStep<<<Nbl, Nth>>>(dev_data, size, step, stage); |
|
|
|
cudaDeviceSynchronize(); |
|
|
|
} |
|
|
|
} |
|
|
|
Timer_sorting.stop(); |
|
|
|
|
|
|
|
cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost); |
|
|
|
Timer_memory.start(); |
|
|
|
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess) |
|
|
|
throw std::runtime_error("[CUDA] - Cannot copy memory from device\n");
|
|
|
cudaFree(dev_data); |
|
|
|
Timer_memory.stop(); |
|
|
|
} |
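
/*
 * Host-side usage sketch (illustrative, not part of the original API; assumes
 * <vector>, <algorithm>, <cstdlib> and <cassert> are available and a
 * power-of-two element count, as bitonic sort requires):
 */
inline void bitonicSortUsageExample() {
    std::vector<float> v(1u << 20);                  // power-of-two size
    std::generate(v.begin(), v.end(), std::rand);    // arbitrary values
    bitonicSort(v);                                  // sorts in place on the device
    assert(std::is_sorted(v.begin(), v.end()));      // verify the result
}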
|
|
|
|
|
|
|
#elif CODE_VERSION == V1 |
|
|
|
|
|
|
|
/*! |
|
|
|
* This is the body of each thread. This function compares and exchanges data
|
|
|
* |
|
|
|
* @tparam ValueT The underlying data type of the array items |
|
|
|
* @param data [ValueT*] Pointer to data array |
|
|
|
* @param n [size_t] The total size of the array |
|
|
|
* @param step [size_t] The current step of the current stage of bitonic sort |
|
|
|
* @param stage [size_t] The current stage of bitonic sort |
|
|
|
*/ |
|
|
|
template <typename ValueT> |
|
|
|
__device__ void interBlockStep_(ValueT* data, size_t n, size_t step, size_t stage) { |
|
|
|
threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x; // Compute global thread ID |
|
|
|
/* |
|
|
|
* Here each block's base address skips ahead by one extra block (one for SizeToThreadsRatio = 2),



* and the skipped neighbor block's indices serve each thread's second data point
|
|
|
*/ |
|
|
|
threadId_t tid = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x; |
|
|
|
threadId_t pid = partner(tid, step); |
|
|
|
if (tid > pid) { |
|
|
|
tid += n >> 1; |
|
|
|
pid += n >> 1; |
|
|
|
// Shift to the neighboring (skipped) block's indices for the second data point
|
|
|
tid += blockDim.x; |
|
|
|
pid += blockDim.x; |
|
|
|
} |
|
|
|
if ((tid < n) && (pid < n)) { // Boundary check |
|
|
|
bool keep = keepSmall(tid, pid, stage); |
|
|
@@ -156,12 +215,29 @@ __device__ void interBlockStep_(ValueT* data, size_t n, size_t step, size_t stag |
|
|
|
} |
|
|
|
} |
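
/*
 * Worked example (illustrative numbers): with blockDim.x = 256 and
 * blockIdx.x = 3, the base index is tid = threadIdx.x + 2 * 3 * 256 =
 * threadIdx.x + 1536; when tid > pid, both indices shift by blockDim.x into
 * the skipped neighbor block.
 */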
|
|
|
|
|
|
|
/*! |
|
|
|
* This is the kernel version of the step body, invoked for steps outside of the unrolled loop
|
|
|
* |
|
|
|
* @tparam ValueT The underlying data type of the array items |
|
|
|
* @param data [ValueT*] Pointer to data array |
|
|
|
* @param n [size_t] The total size of the array |
|
|
|
* @param step [size_t] The current step of the current stage of bitonic sort |
|
|
|
* @param stage [size_t] The current stage of bitonic sort |
|
|
|
*/ |
|
|
|
template <typename ValueT> |
|
|
|
__global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) { |
|
|
|
interBlockStep_(data, n, step, stage); |
|
|
|
} |
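
// Note: the __global__ wrapper above exists because a __device__ function such
// as interBlockStep_() cannot be launched from the host directly; only
// __global__ kernels can.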
|
|
|
|
|
|
|
|
|
|
|
/*! |
|
|
|
* This is the unrolled part of the bitonic double loop.
|
|
|
* |
|
|
|
* @tparam ValueT The underlying data type of the array items |
|
|
|
* @param data [ValueT*] Pointer to data array |
|
|
|
* @param n [size_t] The total size of the array |
|
|
|
* @param step [size_t] The current step of the current stage of bitonic sort |
|
|
|
* @param stage [size_t] The current stage of bitonic sort |
|
|
|
*/ |
|
|
|
template <typename ValueT> |
|
|
|
__global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) { |
|
|
|
for (size_t step = innerSteps + 1; step > 0; ) { |
|
|
@@ -172,18 +248,11 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t st |
|
|
|
} |
|
|
|
|
|
|
|
/*! |
|
|
|
* A distributed version of the Bitonic sort algorithm. |
|
|
|
* |
|
|
|
* @note |
|
|
|
* Each MPI process should run an instance of this function. |
|
|
|
* |
|
|
|
* @tparam ShadowedDataT A Shadowed buffer type with random access iterator. |
|
|
|
* A CUDA version of the Bitonic sort algorithm. |
|
|
|
* |
|
|
|
* @param data [ShadowedDataT] The local to MPI process data to sort |
|
|
|
* @param Processes [mpi_id_t] The total number of MPI processes |
|
|
|
* @param rank [mpi_id_t] The current process id |
|
|
|
* @tparam DataT A container type to hold the data array. Should have .data() and .size() methods
|
|
|
* @param data [DataT&] Reference to the container to sort |
|
|
|
*/ |
|
|
|
|
|
|
|
template <typename DataT> |
|
|
|
void bitonicSort(DataT& data) { |
|
|
|
using value_t = typename DataT::value_type; |
|
|
@@ -191,38 +260,85 @@ void bitonicSort(DataT& data) { |
|
|
|
value_t* dev_data; |
|
|
|
auto size = data.size(); |
|
|
|
|
|
|
|
cudaMalloc(&dev_data, size * sizeof(value_t)); |
|
|
|
cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice); |
|
|
|
Timer_memory.start(); |
|
|
|
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess) |
|
|
|
throw std::runtime_error("[CUDA] - Cannot allocate memory\n");
|
|
|
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess) |
|
|
|
throw std::runtime_error("[CUDA] - Cannot copy memory to device\n");
|
|
|
Timer_memory.stop(); |
|
|
|
|
|
|
|
int Nthreads = THREADS_PER_BLOCK; |
|
|
|
int HalfNblocks = ((size + Nthreads - 1) / Nthreads) >> 1; |
|
|
|
size_t Nth = config.blockSize; |
|
|
|
size_t Nbl = NBlocks(size); |
|
|
|
|
|
|
|
auto Stages = static_cast<size_t>(log2(size)); |
|
|
|
auto InnerBlockSteps = static_cast<size_t>(log2(IN_BLOCK_THRESHOLD)); |
|
|
|
auto Stages = static_cast<size_t>(log2(size)); |
|
|
|
auto InnerBlockSteps = static_cast<size_t>(log2(Nth)); // steps that execute entirely within one block
|
|
|
Timer_sorting.start(); |
|
|
|
for (size_t stage = 1; stage <= Stages; ++stage) { |
|
|
|
size_t step = stage - 1; |
|
|
|
for ( ; step > InnerBlockSteps; --step) { |
|
|
|
interBlockStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage); |
|
|
|
interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage); |
|
|
|
cudaDeviceSynchronize(); |
|
|
|
} |
|
|
|
inBlockStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage); |
|
|
|
inBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage); |
|
|
|
cudaDeviceSynchronize(); |
|
|
|
} |
|
|
|
Timer_sorting.stop(); |
|
|
|
|
|
|
|
cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost); |
|
|
|
Timer_memory.start(); |
|
|
|
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess) |
|
|
|
throw std::runtime_error("[CUDA] - Cannot copy memory from device\n");
|
|
|
cudaFree(dev_data); |
|
|
|
Timer_memory.stop(); |
|
|
|
} |
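
/*
 * Worked example (illustrative numbers): size = 1 << 10 and Nth = 256 give
 * Stages = 10 and InnerBlockSteps = 8. Only stage 10 starts at step 9 > 8, so
 * it launches one interBlockStep kernel; every stage then finishes with a
 * single inBlockStep launch that iterates the remaining steps (step .. 0)
 * inside one kernel.
 */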
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#elif CODE_VERSION == V2 |
|
|
|
|
|
|
|
/*! |
|
|
|
* @return The number of data points that one block's threads can affect.
|
|
|
* |
|
|
|
* @note |
|
|
|
* Each block's threads can exchange twice as many data points as there are threads.
|
|
|
*/ |
|
|
|
inline size_t effectiveBlockSize() { return SizeToThreadsRatio * config.blockSize; } |
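
/*
 * Worked example (illustrative): config.blockSize = 256 threads yields an
 * effective block of 2 * 256 = 512 data points, hence a shared-memory cache of
 * 512 * sizeof(value_t) bytes per block (see kernelMemSize below).
 */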
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*! |
|
|
|
* Converts a global data address to an index into the local shared-memory array, which is used



* as a cache by the unrolled part of the bitonic sort loop.
|
|
|
* |
|
|
|
* @note |
|
|
|
* Each block's threads can exchange twice as many data points as there are threads.



* These points are copied (cached) into shared memory. We use contiguous blocks



* both in global data memory and in the shared-memory buffer.
|
|
|
* |
|
|
|
* @param gIndex The global array index |
|
|
|
* @param blockDim The block size (threads per block) |
|
|
|
* @return The equivalent local address of the shared memory |
|
|
|
*/ |
|
|
|
__device__ inline size_t toLocal(size_t gIndex, size_t blockDim) { |
|
|
|
return gIndex % (SizeToThreadsRatio * blockDim); |
|
|
|
} |
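
/*
 * Worked example (illustrative): for blockDim = 256 the effective block spans
 * 512 points, so gIndex = 1500 (inside effective block 2, which covers global
 * indices 1024..1535) maps to 1500 % 512 = 476 in shared memory.
 */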
|
|
|
|
|
|
|
/*! |
|
|
|
* This is the kernel version of the step body, invoked for steps outside of the unrolled loop
|
|
|
* |
|
|
|
* @tparam ValueT The underlying data type of the array items |
|
|
|
* @param data [ValueT*] Pointer to data array |
|
|
|
* @param n [size_t] The total size of the array |
|
|
|
* @param step [size_t] The current step of the current stage of bitonic sort |
|
|
|
* @param stage [size_t] The current stage of bitonic sort |
|
|
|
*/ |
|
|
|
template <typename ValueT> |
|
|
|
__global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) { |
|
|
|
threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x; // Compute global thread ID |
|
|
|
threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x; // Keep contiguous addressing to the first half of the array |
|
|
|
threadId_t pid = partner(tid, step); |
|
|
|
if (tid > pid) { |
|
|
|
tid += n >> 1; |
|
|
|
pid += n >> 1; |
|
|
|
// Shift to the other half of the array for global data |
|
|
|
tid += n / SizeToThreadsRatio; |
|
|
|
pid += n / SizeToThreadsRatio; |
|
|
|
} |
|
|
|
if ((tid < n) && (pid < n)) { // Boundary check |
|
|
|
bool keep = keepSmall(tid, pid, stage); |
|
|
@@ -230,57 +346,72 @@ __global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/*! |
|
|
|
* This is the unrolled part of the bitonic double loop.
|
|
|
* |
|
|
|
* First, each thread caches its corresponding data points from the current and the following data block.



* After that we run the unrolled loop on the local data and then write the results back to global memory.
|
|
|
* |
|
|
|
* @tparam ValueT The underlying data type of the array items |
|
|
|
* @param data [ValueT*] Pointer to data array |
|
|
|
* @param n [size_t] The total size of the array |
|
|
|
* @param step [size_t] The current step of the current stage of bitonic sort |
|
|
|
* @param stage [size_t] The current stage of bitonic sort |
|
|
|
*/ |
|
|
|
template <typename ValueT> |
|
|
|
__global__ void inBlockStep(ValueT* data, size_t n, size_t nthreads, size_t innerSteps, size_t stage, int *mutex) { |
|
|
|
__global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) { |
|
|
|
extern __shared__ ValueT shared_data[]; |
|
|
|
|
|
|
|
/* |
|
|
|
* Global and local (shared) memory indices (calculated once)



* Here each block's base address skips ahead by one extra block (one for SizeToThreadsRatio = 2),



* and the skipped neighbor block's indices are cached in local (shared) memory as well
|
|
|
*/ |
|
|
|
threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x; |
|
|
|
threadId_t lIdx0 = toLocal(gIdx0, blockDim.x); |
|
|
|
|
|
|
|
if (gIdx0 + blockDim.x >= n) // Boundary check |
|
|
|
return; |
|
|
|
|
|
|
|
// Fetch to local memory the entire effective block size (2 positions for each thread) |
|
|
|
shared_data[lIdx0] = data[gIdx0]; |
|
|
|
shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x]; |
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
for (size_t step = innerSteps + 1; step > 0; ) { |
|
|
|
--step; |
|
|
|
|
|
|
|
// Global memory thread and partner ids |
|
|
|
threadId_t Tid = threadIdx.x + blockIdx.x * blockDim.x; |
|
|
|
threadId_t Pid = partner(Tid, step); |
|
|
|
if (Tid > Pid) { |
|
|
|
Tid += n >> 1; |
|
|
|
Pid += n >> 1; |
|
|
|
// Init thread global and local indices |
|
|
|
threadId_t gIdx = gIdx0; |
|
|
|
threadId_t lIdx = lIdx0; |
|
|
|
// Find partner and keep-small configuration based on the global data positions |
|
|
|
threadId_t pIdx = partner(gIdx, step); |
|
|
|
if (gIdx > pIdx) { |
|
|
|
// Shift inside effective block |
|
|
|
gIdx += blockDim.x; // global |
|
|
|
pIdx += blockDim.x; |
|
|
|
lIdx += blockDim.x; // local |
|
|
|
} |
|
|
|
bool keep = keepSmall(gIdx, pIdx, stage); |
|
|
|
|
|
|
|
if ((Tid < n) && (Pid < n)) { // Boundary check |
|
|
|
// Global to local index resolution |
|
|
|
threadId_t tid = (Tid<Pid) ? ((Tid*nthreads)%(2*nthreads)) : (((Tid - (n >> 1))*nthreads)%(2*nthreads)); |
|
|
|
threadId_t pid = tid + 1; |
|
|
|
// Fetch to local memory |
|
|
|
shared_data[tid] = data[Tid]; |
|
|
|
shared_data[pid] = data[Pid]; |
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
bool keep = keepSmall(Tid, Pid, stage); |
|
|
|
exchange(shared_data, tid, pid, keep); |
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
// Write back to global memory |
|
|
|
data[Tid] = shared_data[tid]; |
|
|
|
data[Pid] = shared_data[pid]; |
|
|
|
__syncthreads(); |
|
|
|
} |
|
|
|
// Exchange data on the local (shared) copy
|
|
|
threadId_t lpIdx = toLocal(pIdx, blockDim.x); |
|
|
|
exchange(shared_data, lIdx, lpIdx, keep); |
|
|
|
__syncthreads(); |
|
|
|
} |
|
|
|
|
|
|
|
// Write back to global memory |
|
|
|
data[gIdx0] = shared_data[lIdx0]; |
|
|
|
data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x]; |
|
|
|
__syncthreads(); |
|
|
|
} |
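
// Note (an assumption about intended use): the early boundary return above is
// safe with respect to the later __syncthreads() calls only when every thread
// of a block takes the same branch, i.e. when n is a multiple of the effective
// block size, which holds for power-of-two sizes of at least one effective block.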
|
|
|
|
|
|
|
/*! |
|
|
|
* A distributed version of the Bitonic sort algorithm. |
|
|
|
* |
|
|
|
* @note |
|
|
|
* Each MPI process should run an instance of this function. |
|
|
|
* |
|
|
|
* @tparam ShadowedDataT A Shadowed buffer type with random access iterator.
|
|
|
* A CUDA version of the Bitonic sort algorithm. |
|
|
|
* |
|
|
|
* @param data [ShadowedDataT] The local to MPI process data to sort |
|
|
|
* @param Processes [mpi_id_t] The total number of MPI processes |
|
|
|
* @param rank [mpi_id_t] The current process id |
|
|
|
* @tparam DataT A container type to hold the data array. Should have .data() and .size() methods
|
|
|
* @param data [DataT&] Reference to the container to sort |
|
|
|
*/ |
|
|
|
|
|
|
|
template <typename DataT> |
|
|
|
void bitonicSort(DataT& data) { |
|
|
|
using value_t = typename DataT::value_type; |
|
|
@@ -288,30 +419,36 @@ void bitonicSort(DataT& data) { |
|
|
|
value_t* dev_data; |
|
|
|
auto size = data.size(); |
|
|
|
|
|
|
|
cudaMalloc(&dev_data, size * sizeof(value_t)); |
|
|
|
cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice); |
|
|
|
|
|
|
|
int* d_mutex; |
|
|
|
cudaMalloc(&d_mutex, sizeof(int)); |
|
|
|
cudaMemset(d_mutex, 0, sizeof(int)); // init mutex |
|
|
|
Timer_memory.start(); |
|
|
|
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess) |
|
|
|
throw std::runtime_error("[CUDA] - Cannot allocate memory\n");
|
|
|
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess) |
|
|
|
throw std::runtime_error("[CUDA] - Cannot copy memory to device\n");
|
|
|
Timer_memory.stop(); |
|
|
|
|
|
|
|
int Nthreads = THREADS_PER_BLOCK; |
|
|
|
int Nblocks = ((size + Nthreads - 1) / Nthreads) >> 1; |
|
|
|
size_t Nth = config.blockSize; |
|
|
|
size_t Nbl = NBlocks(size); |
|
|
|
size_t kernelMemSize = effectiveBlockSize() * sizeof(value_t); |
|
|
|
|
|
|
|
auto Stages = static_cast<size_t>(log2(size)); |
|
|
|
auto InnerBlockSteps = static_cast<size_t>(log2(IN_BLOCK_THRESHOLD)); |
|
|
|
auto Stages = static_cast<size_t>(log2(size)); |
|
|
|
auto InnerBlockSteps = static_cast<size_t>(log2(Nth)); |
|
|
|
Timer_sorting.start(); |
|
|
|
for (size_t stage = 1; stage <= Stages; ++stage) { |
|
|
|
size_t step = stage - 1; |
|
|
|
for ( ; step > InnerBlockSteps; --step) { |
|
|
|
interBlockStep<<<Nblocks, Nthreads>>>(dev_data, size, step, stage); |
|
|
|
interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage); |
|
|
|
cudaDeviceSynchronize(); |
|
|
|
} |
|
|
|
inBlockStep<<<Nblocks, Nthreads, 2*Nthreads*sizeof(value_t)>>>(dev_data, size, Nthreads, step, stage, d_mutex); |
|
|
|
inBlockStep<<<Nbl, Nth, kernelMemSize>>>(dev_data, size, step, stage); |
|
|
|
cudaDeviceSynchronize(); |
|
|
|
} |
|
|
|
Timer_sorting.stop(); |
|
|
|
|
|
|
|
cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost); |
|
|
|
Timer_memory.start(); |
|
|
|
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess) |
|
|
|
throw std::runtime_error("[CUDA] - Cannot copy memory from device\n");
|
|
|
cudaFree(dev_data); |
|
|
|
Timer_memory.stop(); |
|
|
|
} |
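
/*
 * A helper one could factor out for the repeated error checks above (a sketch,
 * not part of the original API; assumes <stdexcept> and <string> are included):
 */
static inline void cudaCheck(cudaError_t err, const char* what) {
    if (err != cudaSuccess)
        throw std::runtime_error(std::string("[CUDA] - ") + what + ": "
                                 + cudaGetErrorString(err));
}
// Usage: cudaCheck(cudaMalloc(&dev_data, size * sizeof(value_t)), "Cannot allocate memory");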
|
|
|
|
|
|
|
#endif |
|
|
|