HE3: RC3 - V2 code refactor and version measurements

This commit is contained in:
Christos Choutouridis 2025-02-16 23:16:38 +02:00
Parent f687f30d2f
Commit a5ae5c6bdb
4 changed files with 4197 additions and 28 deletions

The diff is not shown because the file is too large.

The diff is not shown because the file is too large.

View file

@@ -112,8 +112,8 @@ template <typename ValueT>
 __device__ void exchange(ValueT* data, threadId_t tid, threadId_t partner, bool keepSmall) {
     if (( keepSmall && (data[tid] > data[partner])) ||
         (!keepSmall && (data[tid] < data[partner])) ) {
-        ValueT temp = data[tid];
-        data[tid] = data[partner];
+        ValueT temp   = data[tid];
+        data[tid]     = data[partner];
         data[partner] = temp;
     }
 }
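For comparison, the same compare-exchange can be written without the divergent swap branch. This is only a sketch, not part of the commit; it assumes ValueT is comparable with operator< and keeps the original semantics (the thread at tid ends up with the small element when keepSmall is set):

    // Hypothetical branch-reduced variant of exchange() (not from this commit):
    // both outputs are computed with ternaries, so there is no divergent swap
    // branch inside the warp.
    template <typename ValueT>
    __device__ void exchangeSelect(ValueT* data, threadId_t tid, threadId_t partner, bool keepSmall) {
        ValueT a  = data[tid];
        ValueT b  = data[partner];
        ValueT lo = (a < b) ? a : b;    // smaller of the pair
        ValueT hi = (a < b) ? b : a;    // larger of the pair
        data[tid]     = keepSmall ? lo : hi;
        data[partner] = keepSmall ? hi : lo;
    }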
@@ -378,6 +378,7 @@ __global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage
     }
 }
+
 /*!
  * This is the unrolled part of the bitonic double loop.
  *
@@ -399,42 +400,44 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage
      * Here we skip blocks every time (one for SizeToThreadsRatio = 2)
      * And we cache the neighbor block address indexes in local (shared) memory
      */
-    threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
-    threadId_t lIdx0 = toLocal(gIdx0, blockDim.x);
+    threadId_t gIdx = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
+    threadId_t lIdx = toLocal(gIdx, blockDim.x);
-    if (gIdx0 + blockDim.x >= n) // Boundary check
+    if (gIdx + blockDim.x >= n) // Boundary check
         return;
     // Fetch to local memory the entire effective block size (2 positions for each thread)
-    shared_data[lIdx0] = data[gIdx0];
-    shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x];
+    shared_data[lIdx] = data[gIdx];
+    shared_data[lIdx + blockDim.x] = data[gIdx + blockDim.x];
     __syncthreads();
     for (size_t step = innerSteps + 1; step > 0; ) {
         --step;
-        // Init thread global and local indices
-        threadId_t gIdx = gIdx0;
-        threadId_t lIdx = lIdx0;
         // Find partner and keep-small configuration based on the global data positions
         threadId_t pIdx = partner(gIdx, step);
         if (gIdx > pIdx) {
-            // Shift inside effective block
-            gIdx += blockDim.x;     // global
-            pIdx += blockDim.x;
-            lIdx += blockDim.x;     // local
-        }
-        bool keep = keepSmall(gIdx, pIdx, stage);
+            // Work on the right side
+            bool keep = keepSmall(gIdx + blockDim.x, pIdx + blockDim.x, stage);
-        // Exchange data on local(shared) copy
-        threadId_t lpIdx = toLocal(pIdx, blockDim.x);
-        exchange(shared_data, lIdx, lpIdx, keep);
+            // Exchange data on local(shared) copy
+            threadId_t lpIdx = toLocal(pIdx + blockDim.x, blockDim.x);
+            exchange(shared_data, lIdx + blockDim.x, lpIdx, keep);
+        }
+        else {
+            // Work on the left side
+            bool keep = keepSmall(gIdx, pIdx, stage);
+            // Exchange data on local(shared) copy
+            threadId_t lpIdx = toLocal(pIdx, blockDim.x);
+            exchange(shared_data, lIdx, lpIdx, keep);
+        }
         __syncthreads();
     }
     // Write back to global memory (no sync here, there will be sync from host)
-    data[gIdx0] = shared_data[lIdx0];
-    data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
+    data[gIdx] = shared_data[lIdx];
+    data[gIdx + blockDim.x] = shared_data[lIdx + blockDim.x];
 }
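To see what the stage/step double loop computes, here is a host-side reference of the same bitonic network, offered only as a sketch: it is not from this repository, and it assumes the usual XOR-based partner() and a keep-small rule taken from bit `stage` of the index, which may differ in detail from the kernel's helpers. It also assumes n is a power of two.

    #include <cstddef>
    #include <utility>

    // Reference bitonic sort (ascending) mirroring the kernel's double loop.
    template <typename ValueT>
    void bitonicSortHost(ValueT* data, size_t n) {      // n must be a power of two
        size_t stages = 0;
        for (size_t m = n; m > 1; m >>= 1) ++stages;    // stages = log2(n)
        for (size_t stage = 1; stage <= stages; ++stage) {
            for (size_t step = stage; step-- > 0; ) {   // step = stage-1 ... 0
                for (size_t i = 0; i < n; ++i) {
                    size_t p = i ^ ((size_t)1 << step); // assumed partner(): flip bit `step`
                    if (i < p) {
                        // Assumed keepSmall(): ascending run iff bit `stage` of i is 0
                        bool keep = (((i >> stage) & 1) == 0);
                        if (( keep && data[i] > data[p]) ||
                            (!keep && data[i] < data[p]))
                            std::swap(data[i], data[p]);
                    }
                }
            }
        }
    }

The kernel runs the same network, but each thread owns two elements (lIdx and lIdx + blockDim.x of the shared copy) and the if/else on gIdx > pIdx selects which half of the effective block the thread compares.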
 /*!
@@ -459,6 +462,59 @@ __global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages
      * Here we skip blocks every time (one for SizeToThreadsRatio = 2)
      * And we cache the neighbor block address indexes in local (shared) memory
      */
+    threadId_t gIdx = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
+    threadId_t lIdx = toLocal(gIdx, blockDim.x);
+    if (gIdx + blockDim.x >= n) // Boundary check
+        return;
+    // Fetch to local memory the entire effective block size (2 positions for each thread)
+    shared_data[lIdx] = data[gIdx];
+    shared_data[lIdx + blockDim.x] = data[gIdx + blockDim.x];
+    __syncthreads();
+    for (size_t stage = 1; (stage <= stages) && (stage <= maxStages); ++stage) {
+        for (size_t step = stage; step > 0; ) {
+            --step;
+            // Find partner and keep-small configuration based on the global data positions
+            threadId_t pIdx = partner(gIdx, step);
+            if (gIdx > pIdx) {
+                // Work on the right side
+                bool keep = keepSmall(gIdx + blockDim.x, pIdx + blockDim.x, stage);
+                // Exchange data on local(shared) copy
+                threadId_t lpIdx = toLocal(pIdx + blockDim.x, blockDim.x);
+                exchange(shared_data, lIdx + blockDim.x, lpIdx, keep);
+            }
+            else {
+                // Work on the left side
+                bool keep = keepSmall(gIdx, pIdx, stage);
+                // Exchange data on local(shared) copy
+                threadId_t lpIdx = toLocal(pIdx, blockDim.x);
+                exchange(shared_data, lIdx, lpIdx, keep);
+            }
+            __syncthreads();
+        }
+    }
+    // Write back to global memory (no sync here, there will be sync from host)
+    data[gIdx] = shared_data[lIdx];
+    data[gIdx + blockDim.x] = shared_data[lIdx + blockDim.x];
+#if 0
+    /*
+     * Idea:
+     * - Keep a register copy of data[gIdx0] and data[gIdx0 + blockDim.x]
+     * - Instead of exchanging in shared_data, read the partner into a register and exchange there.
+     * - Write back only if there was an exchange
+     *
+     * ^^
+     * Unfortunately this breaks sequential consistency: the register value (lValue) does not match
+     * shared_data, or even lValueR0 and lValueL0. Maybe it has something to do with register
+     * spilling (lValue keeps spilling to local memory).
+     */
     threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
     threadId_t lIdx0 = toLocal(gIdx0, blockDim.x);
@@ -466,29 +522,40 @@ __global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages
         return;
     // Fetch to local memory the entire effective block size (2 positions for each thread)
-    shared_data[lIdx0] = data[gIdx0];
-    shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x];
+    // also keep the thread's initial values (L and R) in registers
+    ValueT lValueL0 = data[gIdx0];
+    ValueT lValueR0 = data[gIdx0 + blockDim.x];
+    shared_data[lIdx0] = lValueL0;
+    shared_data[lIdx0 + blockDim.x] = lValueR0;
     __syncthreads();
     for (size_t stage = 1; (stage <= stages) && (stage <= maxStages); ++stage) {
         for (size_t step = stage; step > 0; ) {
             --step;
-            // Init thread global and local indices
+            // Init thread global, local indices and active local register value
             threadId_t gIdx = gIdx0;
             threadId_t lIdx = lIdx0;
+            ValueT lValue = lValueL0;   // "Me" on the left side of the effective block
             // Find partner and keep-small configuration based on the global data positions
             threadId_t pIdx = partner(gIdx, step);
             if (gIdx > pIdx) {
                 // Shift inside effective block
-                gIdx += blockDim.x; // global
+                gIdx += blockDim.x;     // global
                 pIdx += blockDim.x;
-                lIdx += blockDim.x; // local
+                lIdx += blockDim.x;     // local
+                lValue = lValueR0;      // The other me (the right side)
             }
             bool keep = keepSmall(gIdx, pIdx, stage);
             // Exchange data on local(shared) copy
             threadId_t lpIdx = toLocal(pIdx, blockDim.x);
-            exchange(shared_data, lIdx, lpIdx, keep);
+            ValueT pValue = shared_data[lpIdx];
+            if (exchangeVals(&lValue, &pValue, keep)) {
+                shared_data[lIdx] = lValue;
+                shared_data[lpIdx] = pValue;
+            }
             __syncthreads();
         }
     }
@@ -496,8 +563,10 @@ __global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages
     // Write back to global memory (no sync here, there will be sync from host)
     data[gIdx0] = shared_data[lIdx0];
     data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
+#endif
 }
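The disabled experiment above calls an exchangeVals() helper whose definition is not part of this diff. A plausible shape, offered purely as a hypothetical reconstruction, swaps the two register values and reports whether a swap happened so the caller can skip the shared-memory write-back when nothing changed:

    // Hypothetical reconstruction of exchangeVals() (the real helper is not
    // shown in this diff): mirrors exchange(), but operates on two register
    // values and returns whether an exchange took place.
    template <typename ValueT>
    __device__ bool exchangeVals(ValueT* mine, ValueT* other, bool keepSmall) {
        if (( keepSmall && (*mine > *other)) ||
            (!keepSmall && (*mine < *other)) ) {
            ValueT temp = *mine;
            *mine  = *other;
            *other = temp;
            return true;    // an exchange took place
        }
        return false;       // pair was already in the desired order
    }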
 /*!
  * A CUDA version of the Bitonic sort algorithm.
  *
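Both kernels end with "no sync here, there will be sync from host". As a sketch only, with an assumed block size of 1024 and an illustrative wrapper name not taken from the commit, a matching host-side launch could look like:

    // Illustrative host-side launcher (not from the commit). Each block covers
    // SizeToThreadsRatio * threads elements, so the grid shrinks by that ratio.
    // The cudaDeviceSynchronize() is the "sync from host" the kernel comments
    // refer to.
    template <typename ValueT>
    void launchPrephase(ValueT* data, size_t n, size_t stages, size_t maxStages) {
        size_t threads = 1024;                               // assumed threads per block
        size_t blocks  = n / (SizeToThreadsRatio * threads); // two elements per thread
        prephase<<<blocks, threads>>>(data, n, stages, maxStages);
        cudaDeviceSynchronize();                             // host-side sync between phases
    }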

View file

@@ -16,8 +16,10 @@
 /*
  * Versioning:
  * - RC1: First version to test on HPC
  * - RC2: A pre-phase added for v1 and v2
+ * - RC3:
 */
-static constexpr char version[] = "0.1";
+static constexpr char version[] = "0.2";
 /*
  * Defines for different versions of the exercise