|
|
@@ -112,8 +112,8 @@ template <typename ValueT>
 __device__ void exchange(ValueT* data, threadId_t tid, threadId_t partner, bool keepSmall) {
     if (( keepSmall && (data[tid] > data[partner])) ||
         (!keepSmall && (data[tid] < data[partner])) ) {
-        ValueT temp = data[tid];
-        data[tid] = data[partner];
+        ValueT temp = data[tid];
+        data[tid] = data[partner];
         data[partner] = temp;
     }
 }
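The exchange() helper above relies on partner() and keepSmall(), which this diff does not show. For orientation, a minimal sketch of the standard bitonic-sort index math such helpers usually implement (an illustrative assumption, not necessarily this project's exact definitions; threadId_t is assumed to be an unsigned integer type):

    // Hypothetical sketch (not from this patch): classic bitonic-sort helpers.
    // partner(): the comparison partner differs from tid only in bit 'step'.
    __device__ inline threadId_t partner(threadId_t tid, size_t step) {
        return tid ^ ((threadId_t)1 << step);
    }

    // keepSmall(): the sort direction flips with bit 'stage' of the index; the partner
    // index is unused in this simplified form but kept to match the call sites below.
    __device__ inline bool keepSmall(threadId_t tid, threadId_t /*pIdx*/, size_t stage) {
        return ((tid >> stage) & 1) == 0;   // ascending half keeps the smaller value
    }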
|
|
@@ -378,6 +378,7 @@ __global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage
     }
 }

+
 /*!
  * This is the unrolled part of the bitonic double loop.
  *
|
|
@@ -399,42 +400,44 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t st
      * Here we skip blocks every time (one block for SizeToThreadsRatio = 2)
      * And we cache the neighbor block address indexes in local (shared) memory
      */
-    threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
-    threadId_t lIdx0 = toLocal(gIdx0, blockDim.x);
+    threadId_t gIdx = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
+    threadId_t lIdx = toLocal(gIdx, blockDim.x);

-    if (gIdx0 + blockDim.x >= n)    // Boundary check
+    if (gIdx + blockDim.x >= n)     // Boundary check
         return;

     // Fetch to local memory the entire effective block size (2 positions for each thread)
-    shared_data[lIdx0] = data[gIdx0];
-    shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x];
+    shared_data[lIdx] = data[gIdx];
+    shared_data[lIdx + blockDim.x] = data[gIdx + blockDim.x];
     __syncthreads();

     for (size_t step = innerSteps + 1; step > 0; ) {
         --step;

-        // Init thread global and local indices
-        threadId_t gIdx = gIdx0;
-        threadId_t lIdx = lIdx0;
         // Find partner and keep-small configuration based on the global data positions
         threadId_t pIdx = partner(gIdx, step);
         if (gIdx > pIdx) {
-            // Shift inside effective block
-            gIdx += blockDim.x;     // global
-            pIdx += blockDim.x;
-            lIdx += blockDim.x;     // local
+            // Work on the right side
+            bool keep = keepSmall(gIdx + blockDim.x, pIdx + blockDim.x, stage);
+
+            // Exchange data on local(shared) copy
+            threadId_t lpIdx = toLocal(pIdx + blockDim.x, blockDim.x);
+            exchange(shared_data, lIdx + blockDim.x, lpIdx, keep);
         }
-        bool keep = keepSmall(gIdx, pIdx, stage);
+        else {
+            // Work on the left side
+            bool keep = keepSmall(gIdx, pIdx, stage);

-        // Exchange data on local(shared) copy
-        threadId_t lpIdx = toLocal(pIdx, blockDim.x);
-        exchange(shared_data, lIdx, lpIdx, keep);
+            // Exchange data on local(shared) copy
+            threadId_t lpIdx = toLocal(pIdx, blockDim.x);
+            exchange(shared_data, lIdx, lpIdx, keep);
+        }
         __syncthreads();
     }

     // Write back to global memory (no sync here, there will be sync from host)
-    data[gIdx0] = shared_data[lIdx0];
-    data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
+    data[gIdx] = shared_data[lIdx];
+    data[gIdx + blockDim.x] = shared_data[lIdx + blockDim.x];
 }
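inBlockStep() keeps a per-block window of SizeToThreadsRatio * blockDim.x elements in shared memory and maps global indexes into it with toLocal(), which is also outside this diff. A plausible sketch consistent with the call sites above (an assumption, not the project's actual definition):

    // Hypothetical sketch (not from this patch): map a global element index into the
    // block's shared-memory window of SizeToThreadsRatio * blockSize slots, so that an
    // element and its in-block partner land in the same window.
    __device__ inline threadId_t toLocal(threadId_t gIdx, size_t blockSize) {
        return gIdx % (SizeToThreadsRatio * blockSize);
    }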
|
|
|
|
|
|
|
/*! |
|
|
@@ -459,6 +462,59 @@ __global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages
      * Here we skip blocks every time (one block for SizeToThreadsRatio = 2)
      * And we cache the neighbor block address indexes in local (shared) memory
      */
+    threadId_t gIdx = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
+    threadId_t lIdx = toLocal(gIdx, blockDim.x);
+
+    if (gIdx + blockDim.x >= n)     // Boundary check
+        return;
+
+    // Fetch to local memory the entire effective block size (2 positions for each thread)
+    shared_data[lIdx] = data[gIdx];
+    shared_data[lIdx + blockDim.x] = data[gIdx + blockDim.x];
+    __syncthreads();
+
+    for (size_t stage = 1; (stage <= stages) && (stage <= maxStages); ++stage) {
+        for (size_t step = stage; step > 0; ) {
+            --step;
+
+            // Find partner and keep-small configuration based on the global data positions
+            threadId_t pIdx = partner(gIdx, step);
+            if (gIdx > pIdx) {
+                // Work on the right side
+                bool keep = keepSmall(gIdx + blockDim.x, pIdx + blockDim.x, stage);
+
+                // Exchange data on local(shared) copy
+                threadId_t lpIdx = toLocal(pIdx + blockDim.x, blockDim.x);
+                exchange(shared_data, lIdx + blockDim.x, lpIdx, keep);
+            }
+            else {
+                // Work on the left side
+                bool keep = keepSmall(gIdx, pIdx, stage);
+
+                // Exchange data on local(shared) copy
+                threadId_t lpIdx = toLocal(pIdx, blockDim.x);
+                exchange(shared_data, lIdx, lpIdx, keep);
+            }
+            __syncthreads();
+        }
+    }
+
+    // Write back to global memory (no sync here, there will be sync from host)
+    data[gIdx] = shared_data[lIdx];
+    data[gIdx + blockDim.x] = shared_data[lIdx + blockDim.x];
+
+#if 0
+    /*
+     * Idea:
+     * - Keep a register copy of data[gIdx0] and data[gIdx0 + blockDim.x]
+     * - Instead of exchanging in shared_data, read the partner into a register and exchange there.
+     * - Write back only if there was an exchange.
+     *
+     * ^^
+     * Unfortunately this breaks sequential consistency and the register value (lValue) does not match shared_data,
+     * or even lValueR0 and lValueL0. Maybe it has something to do with register spilling (lValue keeps spilling
+     * to local memory).
+     */
     threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
     threadId_t lIdx0 = toLocal(gIdx0, blockDim.x);
|
|
|
|
|
|
@@ -466,29 +522,40 @@ __global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages
         return;

     // Fetch to local memory the entire effective block size (2 positions for each thread)
-    shared_data[lIdx0] = data[gIdx0];
-    shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x];
+    // also keep the thread's init values (L and R) in registers
+    ValueT lValueL0 = data[gIdx0];
+    ValueT lValueR0 = data[gIdx0 + blockDim.x];
+    shared_data[lIdx0] = lValueL0;
+    shared_data[lIdx0 + blockDim.x] = lValueR0;
     __syncthreads();

     for (size_t stage = 1; (stage <= stages) && (stage <= maxStages); ++stage) {
         for (size_t step = stage; step > 0; ) {
             --step;

-            // Init thread global and local indices
+            // Init thread global, local indices and the active local register value
             threadId_t gIdx = gIdx0;
             threadId_t lIdx = lIdx0;
+            ValueT lValue = lValueL0;   // "Me" on the left side of the effective block

             // Find partner and keep-small configuration based on the global data positions
             threadId_t pIdx = partner(gIdx, step);
             if (gIdx > pIdx) {
                 // Shift inside effective block
-                gIdx += blockDim.x; // global
+                gIdx += blockDim.x;     // global
                 pIdx += blockDim.x;
-                lIdx += blockDim.x; // local
+                lIdx += blockDim.x;     // local
+                lValue = lValueR0;      // The other me (the right side)
             }
             bool keep = keepSmall(gIdx, pIdx, stage);

             // Exchange data on local(shared) copy
             threadId_t lpIdx = toLocal(pIdx, blockDim.x);
-            exchange(shared_data, lIdx, lpIdx, keep);
+            ValueT pValue = shared_data[lpIdx];
+            if (exchangeVals(&lValue, &pValue, keep)) {
+                shared_data[lIdx] = lValue;
+                shared_data[lpIdx] = pValue;
+            }
             __syncthreads();
         }
     }
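The experimental register-based variant above calls exchangeVals(), which this diff does not show either. A minimal sketch of what such a helper would look like (hypothetical), mirroring exchange() but operating on register copies and reporting whether a swap actually happened so the caller can write back:

    // Hypothetical sketch (not from this patch): compare-and-swap on two register
    // values; returns true only if the values were actually swapped.
    template <typename ValueT>
    __device__ inline bool exchangeVals(ValueT* mine, ValueT* other, bool keepSmall) {
        if (( keepSmall && (*mine > *other)) ||
            (!keepSmall && (*mine < *other)) ) {
            ValueT temp = *mine;
            *mine  = *other;
            *other = temp;
            return true;
        }
        return false;
    }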
|
|
@@ -496,8 +563,10 @@ __global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages
     // Write back to global memory (no sync here, there will be sync from host)
     data[gIdx0] = shared_data[lIdx0];
     data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
+
+#endif
 }
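The host-side sort (whose Doxygen header follows) is not part of this diff. A rough sketch of how the three kernels above are typically combined, with the write-backs synchronized from the host as the comments note; the launch geometry, dynamic shared-memory size, and the assumption that n is a power of two are all illustrative guesses, not the project's actual driver:

    // Hypothetical host-side driver (not from this patch).
    template <typename ValueT>
    void bitonicSortSketch(ValueT* dev_data, size_t n, size_t blockSize) {
        const size_t window = SizeToThreadsRatio * blockSize;   // elements handled per block
        size_t totalStages = 0, maxStages = 0;
        for (size_t len = 1; len < n; len <<= 1)      ++totalStages;
        for (size_t len = 1; len < window; len <<= 1) ++maxStages;

        const size_t blocks = n / window;
        const size_t shMem  = window * sizeof(ValueT);

        // Stages whose partners all stay inside one block's window: fully in shared memory
        prephase<<<blocks, blockSize, shMem>>>(dev_data, n, totalStages, maxStages);
        cudaDeviceSynchronize();

        for (size_t stage = maxStages + 1; stage <= totalStages; ++stage) {
            // Outer steps: partners live in different blocks, so work through global memory
            for (size_t step = stage; step > maxStages; ) {
                --step;
                interBlockStep<<<blocks, blockSize>>>(dev_data, n, step, stage);
                cudaDeviceSynchronize();
            }
            // The remaining (inner) steps of this stage fit in shared memory again
            inBlockStep<<<blocks, blockSize, shMem>>>(dev_data, n, maxStages - 1, stage);
            cudaDeviceSynchronize();
        }
    }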
|
|
|
|
|
|
|
|
|
|
|
 /*!
  * A CUDA version of the Bitonic sort algorithm.
  *
|
|
|