HE3: RC3 - V2 code refactor version measurements
This commit is contained in:
parent
f687f30d2f
commit
a5ae5c6bdb
2049
homework_3/analyse/RC3/profReportv2_A-regVals.txt
Normal file
2049
homework_3/analyse/RC3/profReportv2_A-regVals.txt
Normal file
File diff suppressed because it is too large
Load Diff
2049
homework_3/analyse/RC3/profReportv2_B-CodeRefactor.txt
Normal file
2049
homework_3/analyse/RC3/profReportv2_B-CodeRefactor.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -112,8 +112,8 @@ template <typename ValueT>
|
||||
__device__ void exchange(ValueT* data, threadId_t tid, threadId_t partner, bool keepSmall) {
|
||||
if (( keepSmall && (data[tid] > data[partner])) ||
|
||||
(!keepSmall && (data[tid] < data[partner])) ) {
|
||||
ValueT temp = data[tid];
|
||||
data[tid] = data[partner];
|
||||
ValueT temp = data[tid];
|
||||
data[tid] = data[partner];
|
||||
data[partner] = temp;
|
||||
}
|
||||
}
|
||||
@ -378,6 +378,7 @@ __global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*!
|
||||
* This is unrolled part of the bitonic double loop.
|
||||
*
|
||||
@ -399,42 +400,44 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t st
|
||||
* Here we skip blocks every time (one for SizeToThreadsRatio = 2)
|
||||
* And we cache the neighbor block address indexes in local (shared) memory
|
||||
*/
|
||||
threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
|
||||
threadId_t lIdx0 = toLocal(gIdx0, blockDim.x);
|
||||
threadId_t gIdx = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
|
||||
threadId_t lIdx = toLocal(gIdx, blockDim.x);
|
||||
|
||||
if (gIdx0 + blockDim.x >= n) // Boundary check
|
||||
if (gIdx + blockDim.x >= n) // Boundary check
|
||||
return;
|
||||
|
||||
// Fetch to local memory the entire effective block size (2 positions for each thread)
|
||||
shared_data[lIdx0] = data[gIdx0];
|
||||
shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x];
|
||||
shared_data[lIdx] = data[gIdx];
|
||||
shared_data[lIdx + blockDim.x] = data[gIdx + blockDim.x];
|
||||
__syncthreads();
|
||||
|
||||
for (size_t step = innerSteps + 1; step > 0; ) {
|
||||
--step;
|
||||
|
||||
// Init thread global and local indices
|
||||
threadId_t gIdx = gIdx0;
|
||||
threadId_t lIdx = lIdx0;
|
||||
// Find partner and keep-small configuration based on the global data positions
|
||||
threadId_t pIdx = partner(gIdx, step);
|
||||
if (gIdx > pIdx) {
|
||||
// Shift inside effective block
|
||||
gIdx += blockDim.x; // global
|
||||
pIdx += blockDim.x;
|
||||
lIdx += blockDim.x; // local
|
||||
}
|
||||
bool keep = keepSmall(gIdx, pIdx, stage);
|
||||
// Work on the right site
|
||||
bool keep = keepSmall(gIdx + blockDim.x, pIdx + blockDim.x, stage);
|
||||
|
||||
// Exchange data on local(shared) copy
|
||||
threadId_t lpIdx = toLocal(pIdx, blockDim.x);
|
||||
exchange(shared_data, lIdx, lpIdx, keep);
|
||||
// Exchange data on local(shared) copy
|
||||
threadId_t lpIdx = toLocal(pIdx + blockDim.x, blockDim.x);
|
||||
exchange(shared_data, lIdx + blockDim.x, lpIdx, keep);
|
||||
}
|
||||
else {
|
||||
// Work on the left site
|
||||
bool keep = keepSmall(gIdx, pIdx, stage);
|
||||
|
||||
// Exchange data on local(shared) copy
|
||||
threadId_t lpIdx = toLocal(pIdx, blockDim.x);
|
||||
exchange(shared_data, lIdx, lpIdx, keep);
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// Write back to global memory (no sync here, there will be sync from host)
|
||||
data[gIdx0] = shared_data[lIdx0];
|
||||
data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
|
||||
data[gIdx] = shared_data[lIdx];
|
||||
data[gIdx + blockDim.x] = shared_data[lIdx + blockDim.x];
|
||||
}
|
||||
|
||||
/*!
|
||||
@ -459,6 +462,59 @@ __global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages
|
||||
* Here we skip blocks every time (one for SizeToThreadsRatio = 2)
|
||||
* And we cache the neighbor block address indexes in local (shared) memory
|
||||
*/
|
||||
threadId_t gIdx = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
|
||||
threadId_t lIdx = toLocal(gIdx, blockDim.x);
|
||||
|
||||
if (gIdx + blockDim.x >= n) // Boundary check
|
||||
return;
|
||||
|
||||
// Fetch to local memory the entire effective block size (2 positions for each thread)
|
||||
shared_data[lIdx] = data[gIdx];
|
||||
shared_data[lIdx + blockDim.x] = data[gIdx + blockDim.x];
|
||||
__syncthreads();
|
||||
|
||||
for (size_t stage = 1; (stage <= stages) && (stage <= maxStages); ++stage) {
|
||||
for (size_t step = stage; step > 0; ) {
|
||||
--step;
|
||||
|
||||
// Find partner and keep-small configuration based on the global data positions
|
||||
threadId_t pIdx = partner(gIdx, step);
|
||||
if (gIdx > pIdx) {
|
||||
// Work on the right site
|
||||
bool keep = keepSmall(gIdx + blockDim.x, pIdx + blockDim.x, stage);
|
||||
|
||||
// Exchange data on local(shared) copy
|
||||
threadId_t lpIdx = toLocal(pIdx + blockDim.x, blockDim.x);
|
||||
exchange(shared_data, lIdx + blockDim.x, lpIdx, keep);
|
||||
}
|
||||
else {
|
||||
// Work on the left site
|
||||
bool keep = keepSmall(gIdx, pIdx, stage);
|
||||
|
||||
// Exchange data on local(shared) copy
|
||||
threadId_t lpIdx = toLocal(pIdx, blockDim.x);
|
||||
exchange(shared_data, lIdx, lpIdx, keep);
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
// Write back to global memory (no sync here, there will be sync from host)
|
||||
data[gIdx] = shared_data[lIdx];
|
||||
data[gIdx + blockDim.x] = shared_data[lIdx + blockDim.x];
|
||||
|
||||
#if 0
|
||||
/*
|
||||
* Idea:
|
||||
* - Keep a register copy of data[gIdx0], and data[gIdx0 + blockDim.x]
|
||||
* - Instead of exchange in shared_data, read in register the partner and exchange there.
|
||||
* - Write back only if there was an exchange
|
||||
*
|
||||
* ^^
|
||||
* Unfortunately this breaks sequential consistency and register values (lValve) does not match with share_data
|
||||
* or even lValueR0 and lValueL0. Maybe there is something to do with register spilling (lValue keeps spill
|
||||
* on local mem).
|
||||
*/
|
||||
threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
|
||||
threadId_t lIdx0 = toLocal(gIdx0, blockDim.x);
|
||||
|
||||
@ -466,29 +522,40 @@ __global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages
|
||||
return;
|
||||
|
||||
// Fetch to local memory the entire effective block size (2 positions for each thread)
|
||||
shared_data[lIdx0] = data[gIdx0];
|
||||
shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x];
|
||||
// also keep thread's init values (L and R) on register locations
|
||||
ValueT lValueL0 = data[gIdx0];
|
||||
ValueT lValueR0 = data[gIdx0 + blockDim.x];
|
||||
shared_data[lIdx0] = lValueL0;
|
||||
shared_data[lIdx0 + blockDim.x] = lValueR0;
|
||||
__syncthreads();
|
||||
|
||||
for (size_t stage = 1; (stage <= stages) && (stage <= maxStages); ++stage) {
|
||||
for (size_t step = stage; step > 0; ) {
|
||||
--step;
|
||||
|
||||
// Init thread global and local indices
|
||||
// Init thread global, local indices and active local register value
|
||||
threadId_t gIdx = gIdx0;
|
||||
threadId_t lIdx = lIdx0;
|
||||
ValueT lValue = lValueL0; // "Me" on the left side of effective block
|
||||
|
||||
// Find partner and keep-small configuration based on the global data positions
|
||||
threadId_t pIdx = partner(gIdx, step);
|
||||
if (gIdx > pIdx) {
|
||||
// Shift inside effective block
|
||||
gIdx += blockDim.x; // global
|
||||
gIdx += blockDim.x; // global
|
||||
pIdx += blockDim.x;
|
||||
lIdx += blockDim.x; // local
|
||||
lIdx += blockDim.x; // local
|
||||
lValue = lValueR0; // The other me (the right side)
|
||||
}
|
||||
bool keep = keepSmall(gIdx, pIdx, stage);
|
||||
|
||||
// Exchange data on local(shared) copy
|
||||
threadId_t lpIdx = toLocal(pIdx, blockDim.x);
|
||||
exchange(shared_data, lIdx, lpIdx, keep);
|
||||
ValueT pValue = shared_data[lpIdx];
|
||||
if (exchangeVals(&lValue, &pValue, keep)) {
|
||||
shared_data[lIdx] = lValue;
|
||||
shared_data[lpIdx] = pValue;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
@ -496,8 +563,10 @@ __global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages
|
||||
// Write back to global memory (no sync here, there will be sync from host)
|
||||
data[gIdx0] = shared_data[lIdx0];
|
||||
data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/*!
|
||||
* A CUDA version of the Bitonic sort algorithm.
|
||||
*
|
||||
|
@ -16,8 +16,10 @@
|
||||
/*
|
||||
* Versioning:
|
||||
* - RC1: First version to test on HPC
|
||||
* - RC2: A pre-phase added for v1 and v2
|
||||
* - RC3:
|
||||
*/
|
||||
static constexpr char version[] = "0.1";
|
||||
static constexpr char version[] = "0.2";
|
||||
|
||||
/*
|
||||
* Defines for different version of the exercise
|
||||
|
Loading…
x
Reference in New Issue
Block a user