|
|
@@ -232,46 +232,40 @@ __global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage |
|
|
|
|
|
|
|
|
|
|
|
template <typename ValueT> |
|
|
|
__global__ void inBlockStep(ValueT* data, size_t n, size_t nthreads, size_t innerSteps, size_t stage) { |
|
|
|
__global__ void inBlockStep(ValueT* data, size_t n, size_t nthreads, size_t innerSteps, size_t stage, int *mutex) { |
|
|
|
extern __shared__ ValueT shared_data[]; |
|
|
|
ValueT* lowerHalf = (ValueT*) shared_data; |
|
|
|
ValueT* upperHalf = (ValueT*) &shared_data[nthreads]; |
|
|
|
|
|
|
|
// Global memory thread and partner ids |
|
|
|
threadId_t thread_id = threadIdx.x + blockIdx.x * blockDim.x; |
|
|
|
for (size_t step = innerSteps + 1; step > 0; ) { |
|
|
|
--step; |
|
|
|
|
|
|
|
// Shared memory thread and partner ids |
|
|
|
threadId_t local_tid = threadIdx.x; |
|
|
|
// Global memory thread and partner ids |
|
|
|
threadId_t Tid = threadIdx.x + blockIdx.x * blockDim.x; |
|
|
|
threadId_t Pid = partner(Tid, step); |
|
|
|
if (Tid > Pid) { |
|
|
|
Tid += n >> 1; |
|
|
|
Pid += n >> 1; |
|
|
|
} |
|
|
|
|
|
|
|
if ( thread_id < (n >> 1) ) { |
|
|
|
// Fetch to local memory for both half |
|
|
|
lowerHalf[local_tid] = data[thread_id]; |
|
|
|
upperHalf[local_tid] = data[thread_id + (n >> 1)]; |
|
|
|
__syncthreads(); |
|
|
|
if ((Tid < n) && (Pid < n)) { // Boundary check |
|
|
|
// Global to local index resolution |
|
|
|
threadId_t tid = (Tid<Pid) ? ((Tid*nthreads)%(2*nthreads)) : (((Tid - (n >> 1))*nthreads)%(2*nthreads)); |
|
|
|
threadId_t pid = tid + 1; |
|
|
|
// Fetch to local memory |
|
|
|
shared_data[tid] = data[Tid]; |
|
|
|
shared_data[pid] = data[Pid]; |
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
bool keep = keepSmall(Tid, Pid, stage); |
|
|
|
exchange(shared_data, tid, pid, keep); |
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
for (size_t step = innerSteps + 1; step > 0; ) { |
|
|
|
--step; |
|
|
|
// Find partner and localize it |
|
|
|
threadId_t partner_id = partner(thread_id, step); |
|
|
|
threadId_t local_pid = partner_id % nthreads; |
|
|
|
|
|
|
|
if (local_tid < local_pid) { |
|
|
|
// exchange on low site buffer (half of the threads) |
|
|
|
bool keep = keepSmall(thread_id, partner_id, stage); |
|
|
|
exchange(lowerHalf, local_tid, local_pid, keep); |
|
|
|
} |
|
|
|
else { |
|
|
|
// exchange on high site buffer (other half of the threads) |
|
|
|
bool keep = keepSmall(thread_id, partner_id, stage); |
|
|
|
exchange(upperHalf, local_tid, local_pid, keep); |
|
|
|
} |
|
|
|
// Write back to global memory |
|
|
|
data[Tid] = shared_data[tid]; |
|
|
|
data[Pid] = shared_data[pid]; |
|
|
|
__syncthreads(); |
|
|
|
} |
|
|
|
// Write back to global memory |
|
|
|
data[thread_id] = lowerHalf[local_tid]; |
|
|
|
data[thread_id + (n >> 1)] = upperHalf[local_tid]; |
|
|
|
__syncthreads(); |
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
/*! |
|
|
@@ -297,6 +291,10 @@ void bitonicSort(DataT& data) { |
|
|
|
cudaMalloc(&dev_data, size * sizeof(value_t)); |
|
|
|
cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice); |
|
|
|
|
|
|
|
int* d_mutex; |
|
|
|
cudaMalloc(&d_mutex, sizeof(int)); |
|
|
|
cudaMemset(d_mutex, 0, sizeof(int)); // init mutex |
|
|
|
|
|
|
|
int Nthreads = THREADS_PER_BLOCK; |
|
|
|
int Nblocks = ((size + Nthreads - 1) / Nthreads) >> 1; |
|
|
|
|
|
|
@@ -308,7 +306,7 @@ void bitonicSort(DataT& data) { |
|
|
|
interBlockStep<<<Nblocks, Nthreads>>>(dev_data, size, step, stage); |
|
|
|
cudaDeviceSynchronize(); |
|
|
|
} |
|
|
|
inBlockStep<<<Nblocks, Nthreads, 2*Nthreads*sizeof(value_t)>>>(dev_data, size, Nthreads, step, stage); |
|
|
|
inBlockStep<<<Nblocks, Nthreads, 2*Nthreads*sizeof(value_t)>>>(dev_data, size, Nthreads, step, stage, d_mutex); |
|
|
|
cudaDeviceSynchronize(); |
|
|
|
} |
|
|
|
|
|
|
|