@@ -15,6 +15,7 @@
#include <cmath>
#include <cstdint>
#include <utility>
#include <stdexcept>

#include "utils.hpp"
@@ -159,9 +160,9 @@ void bitonicSort(DataT& data) {

    Timer_memory.start();
    if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
-       throw std::runtime_error("[CUDA] - Can not allocate memory\n");
+       throw std::runtime_error("[CUDA] - Can not allocate memory");
    if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
-       throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
+       throw std::runtime_error("[CUDA] - Can not copy memory to device");
    Timer_memory.stop();

    size_t Nth = config.blockSize;
@@ -180,7 +181,7 @@ void bitonicSort(DataT& data) {

    Timer_memory.start();
    if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
-       throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
+       throw std::runtime_error("[CUDA] - Can not copy memory from device");
    cudaFree(dev_data);
    Timer_memory.stop();
}
@@ -247,6 +248,31 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
    }
}

/*!
 * This is the unrolled part of the bitonic double loop for the init phase, where the
 * entire double loop fits within a single block of threads.
 *
 * Each thread repeatedly applies the inter-block compare-exchange step directly on
 * global memory, synchronizing with the rest of its block after every step.
 *
 * @tparam ValueT The underlying data type of the array items
 * @param data [ValueT*] Pointer to data array
 * @param n [size_t] The total size of the array
 * @param stages [size_t] The number of stages to pre-execute
 * @param maxStages [size_t] The maximum number of stages for the entire sort
 */
template <typename ValueT>
__global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages) {
    for (size_t stage = 1; (stage <= stages) && (stage <= maxStages); ++stage) {
        for (size_t step = stage; step > 0; ) {
            --step;
            interBlockStep_(data, n, step, stage);
            __syncthreads();
        }
    }
}
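
// For illustration: with stages = 3 the kernel above walks the step sequences {1}, {2,1}
// and {3,2,1} internally, so a single launch along the lines of
//     prephase<<<Nbl, Nth>>>(dev_data, size, 3, Stages);
// performs six compare-exchange passes with only block-level synchronization between them.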

/*!
 * A CUDA version of the Bitonic sort algorithm.
 *
@@ -262,18 +288,22 @@ void bitonicSort(DataT& data) {

    Timer_memory.start();
    if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
-       throw std::runtime_error("[CUDA] - Can not allocate memory\n");
+       throw std::runtime_error("[CUDA] - Can not allocate memory");
    if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
-       throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
+       throw std::runtime_error("[CUDA] - Can not copy memory to device");
    Timer_memory.stop();

    size_t Nth = config.blockSize;
    size_t Nbl = NBlocks(size);

    auto Stages = static_cast<size_t>(log2(size));
-   auto InnerBlockSteps = static_cast<size_t>(log2(Nth)); //
+   auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
    size_t PrephaseStages = InnerBlockSteps + 1;
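    // PrephaseStages is InnerBlockSteps + 1 because each block's threads own
    // SizeToThreadsRatio * Nth = 2 * Nth consecutive elements, so every stage up to
    // log2(2 * Nth) = log2(Nth) + 1 keeps all compare-exchange partners inside one block.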

    Timer_sorting.start();
-   for (size_t stage = 1; stage <= Stages; ++stage) {
+   prephase<<<Nbl, Nth>>>(dev_data, size, PrephaseStages, Stages);
+   cudaDeviceSynchronize();
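    // Example figures: size = 1 << 20 and Nth = 256 give Stages = 20, InnerBlockSteps = 8,
    // PrephaseStages = 9. The pre-phase finishes stages 1..9 in one launch; each remaining
    // stage then issues (stage - 1 - 8) interBlockStep launches below, with the last 8 steps
    // presumably handled by an inBlockStep launch in the code that follows this hunk.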
+   for (size_t stage = PrephaseStages + 1; stage <= Stages; ++stage) {
        size_t step = stage - 1;
        for ( ; step > InnerBlockSteps; --step) {
            interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
@@ -286,7 +316,7 @@ void bitonicSort(DataT& data) {

    Timer_memory.start();
    if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
-       throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
+       throw std::runtime_error("[CUDA] - Can not copy memory from device");
    cudaFree(dev_data);
    Timer_memory.stop();
}
@@ -301,7 +331,9 @@ void bitonicSort(DataT& data) {
 * @note
 * Each block's thread collection can exchange twice its own number of data points.
 */
-inline size_t effectiveBlockSize() { return SizeToThreadsRatio * config.blockSize; }
+inline constexpr size_t effectiveBlockSize(size_t blockSize) {
+    return SizeToThreadsRatio * blockSize;
+}
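
// For illustration (block size assumed): with SizeToThreadsRatio == 2 and a block size of
// 256, effectiveBlockSize(256) == 512, i.e. every block stages 512 elements, two per thread.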
@@ -400,10 +432,70 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
        __syncthreads();
    }

-   // Write back to global memory
+   // Write back to global memory (no sync here, there will be sync from host)
    data[gIdx0] = shared_data[lIdx0];
    data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
}

/*!
 * This is the unrolled part of the bitonic double loop for the init phase, where the
 * entire double loop fits in one block using shared-memory access.
 *
 * First, each thread caches its two corresponding data points, one from the current and
 * one from the following data block. We then execute the pre-phase on the local copy and
 * write the results back to global memory.
 *
 * @tparam ValueT The underlying data type of the array items
 * @param data [ValueT*] Pointer to data array
 * @param n [size_t] The total size of the array
 * @param stages [size_t] The number of stages to pre-execute
 * @param maxStages [size_t] The maximum number of stages for the entire sort
 */
template <typename ValueT>
__global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages) {
    extern __shared__ ValueT shared_data[];

    /*
     * Global and local (shared) memory indices, calculated once.
     * Each block strides over SizeToThreadsRatio data blocks (so it skips one block when
     * SizeToThreadsRatio = 2) and caches the neighboring block's elements in local
     * (shared) memory as well.
     */
    threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
    threadId_t lIdx0 = toLocal(gIdx0, blockDim.x);

    if (gIdx0 + blockDim.x >= n) // Boundary check
        return;
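
    // The sort assumes a power-of-two size (Stages = log2(size)), so n is a multiple of the
    // effective block size and this check is uniform per block: no thread exits early while
    // the rest of its block continues into the __syncthreads() calls below.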

    // Fetch to local memory the entire effective block size (2 positions for each thread)
    shared_data[lIdx0] = data[gIdx0];
    shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x];
    __syncthreads();
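    // From this point on every thread can safely read its partner's cached element.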
    for (size_t stage = 1; (stage <= stages) && (stage <= maxStages); ++stage) {
        for (size_t step = stage; step > 0; ) {
            --step;

            // Init thread global and local indices
            threadId_t gIdx = gIdx0;
            threadId_t lIdx = lIdx0;
            // Find partner and keep-small configuration based on the global data positions
            threadId_t pIdx = partner(gIdx, step);
            if (gIdx > pIdx) {
                // Shift inside effective block
                gIdx += blockDim.x; // global
                pIdx += blockDim.x;
                lIdx += blockDim.x; // local
            }
            bool keep = keepSmall(gIdx, pIdx, stage);

            // Exchange data on local (shared) copy
            threadId_t lpIdx = toLocal(pIdx, blockDim.x);
            exchange(shared_data, lIdx, lpIdx, keep);
            __syncthreads();
        }
    }

    // Write back to global memory (no sync here, there will be sync from host)
    data[gIdx0] = shared_data[lIdx0];
    data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
}
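
// For illustration (assuming toLocal(g, blockDim.x) maps a global index into the
// [0, SizeToThreadsRatio * blockDim.x) window of the block's shared buffer): with
// blockDim.x = 4 and blockIdx.x = 1, threads 0..3 take gIdx0 = 8..11, cache data[8..15]
// into shared_data[0..7], run the pre-phase on that slice and write the same eight
// elements back.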

/*!
@@ -421,19 +513,23 @@ void bitonicSort(DataT& data) {

    Timer_memory.start();
    if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
-       throw std::runtime_error("[CUDA] - Can not allocate memory\n");
+       throw std::runtime_error("[CUDA] - Can not allocate memory");
    if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
-       throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
+       throw std::runtime_error("[CUDA] - Can not copy memory to device");
    Timer_memory.stop();

    size_t Nth = config.blockSize;
    size_t Nbl = NBlocks(size);
-   size_t kernelMemSize = effectiveBlockSize() * sizeof(value_t);
+   size_t kernelMemSize = effectiveBlockSize(config.blockSize) * sizeof(value_t);
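    // kernelMemSize is the dynamic shared-memory size per block, passed as the third
    // launch-configuration parameter below so that shared_data[] can hold the whole
    // effective block (two elements per thread).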

    auto Stages = static_cast<size_t>(log2(size));
    auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
    size_t PrephaseStages = InnerBlockSteps + 1;

    Timer_sorting.start();
-   for (size_t stage = 1; stage <= Stages; ++stage) {
+   prephase<<<Nbl, Nth, kernelMemSize>>>(dev_data, size, PrephaseStages, Stages);
+   cudaDeviceSynchronize();
+   for (size_t stage = PrephaseStages + 1; stage <= Stages; ++stage) {
        size_t step = stage - 1;
        for ( ; step > InnerBlockSteps; --step) {
            interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
@@ -446,7 +542,7 @@ void bitonicSort(DataT& data) {

    Timer_memory.start();
    if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
-       throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
+       throw std::runtime_error("[CUDA] - Can not copy memory from device");
    cudaFree(dev_data);
    Timer_memory.stop();
}