HW3: RC1 First version to test on HPC
parent 6db2a814d2
commit e165b75f92
@@ -22,7 +22,7 @@
 PROJECT := PDS_homework_3
 
 # Excecutable's name
-TARGET := bitonic
+TARGET := bitonicCUDA
 
 # Source directories list(space seperated). Makefile-relative path, UNDER current directory.
 SRC_DIR_LIST := src #test test/gtest
@@ -52,7 +52,7 @@ REL_CXXFLAGS := -O3 -std=c++17 -Xcompiler "-Wall -Wextra"
 
 # Pre-defines
 # PRE_DEFS := MYCAB=1729 SUPER_MODE
-PRE_DEFS :=
+PRE_DEFS := TARGET=$(TARGET)
 
 # ============== Linker settings ==============
 # Linker flags (example: -pthread -lm)
@@ -83,17 +83,14 @@ DOCKER :=
 CSIZE := size
 CFLAGS := $(DEB_CFLAGS)
 CXXFLAGS := $(DEB_CXXFLAGS)
-CXX := g++ #mpic++
-CC := gcc #mpicc
+CXX := g++
+CC := gcc
 LINKER := g++
 
 #
 # =========== Main body and Patterns ===========
 #
 
-#ifeq ($(OS), Windows_NT)
-# TARGET := $(TARGET).exe
-#endif
 INC := $(foreach dir,$(INC_DIR_LIST),-I$(dir))
 DEF := $(foreach def,$(PRE_DEFS),-D$(def))
 EXC := $(foreach fil,$(EXC_FILE_LIST), \
@@ -111,38 +108,16 @@ OBJ := $(foreach file,$(SRC:%.cpp=%.o),$(OBJ_DIR)/$(file))
 DEP := $(foreach file,$(SRC:%.cpp=%.d),$(DEP_DIR)/$(file))
 
 
-# Make Dependencies pattern.
-# This little trick enables recompilation only when dependencies change
-# and it does so for changes both in source AND header files ;)
-#
-# It is based on Tom Tromey's method.
-#
-# Invoke cpp to create makefile rules with dependencies for each source file
-#$(DEP_DIR)/%.d: %.c
-#    @mkdir -p $(@D)
-#    @$(DOCKER) $(CC) -E $(CFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.c=.o) -MF $@ $<
-
 # c file objects depent on .c AND dependency files, which have an empty recipe
 $(OBJ_DIR)/%.o: %.c
     @mkdir -p $(@D)
     $(DOCKER) $(CC) -c $(CFLAGS) $(INC) $(DEF) -o $@ $<
 
-#$(DEP_DIR)/%.d: %.cpp
-#    @mkdir -p $(@D)
-#    @$(DOCKER) $(CXX) -E $(CXXFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.cpp=.o) -MF $@ $<
-
-# empty recipe for dependency files. This prevents make errors
-#$(DEP):
-
-# now include all dependencies
-# After all they are makefile dependency rules ;)
-#include $(wildcard $(DEP))
-
 # main target rule
 $(BUILD_DIR)/$(TARGET): $(OBJ)
     @mkdir -p $(@D)
@@ -179,6 +154,39 @@ release: $(BUILD_DIR)/$(TARGET)
 # ================ Build rules =================
 #
 
+bitonic_v0deb: CC := nvcc -G -g -x cu
+bitonic_v0deb: CXX := nvcc -G -g -x cu
+bitonic_v0deb: LINKER := nvcc
+bitonic_v0deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V0
+bitonic_v0deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V0
+bitonic_v0deb: OUTPUT_DIR := $(OUTPUT_DIR)/v0
+bitonic_v0deb: $(BUILD_DIR)/$(TARGET)
+    @mkdir -p $(OUTPUT_DIR)
+    cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
+
+
+bitonic_v1deb: CC := nvcc -G -g -x cu
+bitonic_v1deb: CXX := nvcc -G -g -x cu
+bitonic_v1deb: LINKER := nvcc
+bitonic_v1deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V1
+bitonic_v1deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V1
+bitonic_v1deb: OUTPUT_DIR := $(OUTPUT_DIR)/v1
+bitonic_v1deb: $(BUILD_DIR)/$(TARGET)
+    @mkdir -p $(OUTPUT_DIR)
+    cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
+
+
+bitonic_v2deb: CC := nvcc -G -g -x cu
+bitonic_v2deb: CXX := nvcc -G -g -x cu
+bitonic_v2deb: LINKER := nvcc
+bitonic_v2deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V2
+bitonic_v2deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V2
+bitonic_v2deb: OUTPUT_DIR := $(OUTPUT_DIR)/v2
+bitonic_v2deb: $(BUILD_DIR)/$(TARGET)
+    @mkdir -p $(OUTPUT_DIR)
+    cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
+
+
 bitonic_v0: CC := nvcc -x cu
 bitonic_v0: CXX := nvcc -x cu
@@ -201,7 +209,6 @@ bitonic_v1: $(BUILD_DIR)/$(TARGET)
     @mkdir -p $(OUTPUT_DIR)
     cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
 
-
 bitonic_v2: CC := nvcc -x cu
 bitonic_v2: CXX := nvcc -x cu
 bitonic_v2: LINKER := nvcc
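The new bitonic_v{0,1,2} and bitonic_v{0,1,2}deb targets drive one source tree into three implementations by passing -DCODE_VERSION=V0|V1|V2 through the target-specific CFLAGS/CXXFLAGS (and PRE_DEFS := TARGET=$(TARGET) also exports the executable name to the code). A minimal sketch of the preprocessor dispatch this enables — illustrative only; the V0/V1/V2 values mirror config.h, but this is not a file from the repository:

// Sketch: how -DCODE_VERSION=V1 (from the Makefile) selects an implementation.
#define V0 0
#define V1 1
#define V2 2

#if !defined CODE_VERSION
    #define CODE_VERSION V0   // assumed fail-safe default, echoing config.h's "Fail-safe version selection" comment
#endif

#if CODE_VERSION == V0
static const char* codeVersion = "V0: one kernel launch per bitonic step";
#elif CODE_VERSION == V1
static const char* codeVersion = "V1: inter-block kernel + unrolled in-block kernel";
#elif CODE_VERSION == V2
static const char* codeVersion = "V2: in-block kernel cached in shared memory";
#endif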
@@ -21,7 +21,7 @@
 /*
  * Exported timers
  */
-extern Timing Timer_total;
+extern Timing Timer_total, Timer_memory, Timer_sorting;
 
 using threadId_t = size_t;
 
@@ -74,9 +74,41 @@ __device__ inline bool keepSmall(threadId_t tid, threadId_t partner, size_t stag
  * ============================== Sort algorithms ==============================
  */
 
+/*!
+ * Each thread can handle 2 points in the array. For each of these 2 points it may
+ * - compare and exchange if needed
+ * - copy data to local and back if needed
+ */
+static constexpr size_t SizeToThreadsRatio = 2;
+
+/*!
+ * Calculates the blocks needed for the entire sorting process
+ *
+ * @note
+ * This "redundant" little trick makes sure blocks are allocated for arraySizes that are not exact
+ * multipliers of config.blockSize.
+ * Even if we don't need it, we keep it in case we experiment with weird sizes in the future!
+ *
+ * @param arraySize [ArraySize_t] The size of the entire array (in points)
+ * @return [size_t] The number of blocks
+ */
+inline size_t NBlocks(ArraySize_t arraySize) {
+    return (((arraySize + config.blockSize - 1) / config.blockSize) / SizeToThreadsRatio);
+}
+
+
+/*!
+ * Exchange utility
+ *
+ * @tparam ValueT The underlying data type of the array items
+ *
+ * @param data [ValueT*] Pointer to data array
+ * @param tid [threadId_t] Current thread's index to data
+ * @param pid [threadId_t] Parents's index to data
+ * @param keepSmall [bool] Flag to indicate if current threads is keeping the small
+ */
 template <typename ValueT>
-__device__ void exchange(ValueT* data, int tid, int partner, bool keepSmall) {
+__device__ void exchange(ValueT* data, threadId_t tid, threadId_t partner, bool keepSmall) {
     if (( keepSmall && (data[tid] > data[partner])) ||
         (!keepSmall && (data[tid] < data[partner])) ) {
         ValueT temp = data[tid];
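NBlocks() sizes the grid for the whole sort: round the array size up to whole blocks, then halve, because every thread owns SizeToThreadsRatio = 2 points. A standalone sketch of the same arithmetic with concrete numbers (not project code):

#include <cstdint>
#include <cstdio>

// Same ceil-divide-then-halve arithmetic as NBlocks(), reproduced standalone.
static uint64_t nblocks(uint64_t arraySize, uint64_t blockSize) {
    const uint64_t sizeToThreadsRatio = 2;    // each thread covers 2 points
    return ((arraySize + blockSize - 1) / blockSize) / sizeToThreadsRatio;
}

int main() {
    std::printf("%llu\n", (unsigned long long)nblocks(1ull << 16, 1024)); // 65536/1024 = 64, /2 -> 32 blocks
    std::printf("%llu\n", (unsigned long long)nblocks(3000, 1024));       // ceil(3000/1024) = 3, /2 -> 1 block
    return 0;
}

The rounding step only matters for array sizes that are not multiples of config.blockSize, exactly as the @note says.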
@@ -86,13 +118,24 @@ __device__ void exchange(ValueT* data, int tid, int partner, bool keepSmall) {
 }
 
 #if CODE_VERSION == V0
+
+/*!
+ * This is the body of each thread. This function compare and exchange data
+ *
+ * @tparam ValueT The underlying data type of the array items
+ * @param data [ValueT*] Pointer to data array
+ * @param n [size_t] The total size of the array
+ * @param step [size_t] The current step of the current stage of bitonic sort
+ * @param stage [size_t] The current stage of bitonic sort
+ */
 template <typename ValueT>
 __global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) {
-    threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;  // Compute global thread ID
+    threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;  // Keep contiguous addressing to the first half of the array
     threadId_t pid = partner(tid, step);
     if (tid > pid) {
-        tid += n >> 1;
-        pid += n >> 1;
+        // Shift to the other half of the array for global data
+        tid += n / SizeToThreadsRatio;
+        pid += n / SizeToThreadsRatio;
     }
     if ((tid < n) && (pid < n)) {    // Boundary check
         bool keep = keepSmall(tid, pid, stage);
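The V0 kernel launches one thread per pair: thread ids address the first half of the array contiguously, and whenever the computed partner lies below the thread id both indices are shifted into the second half (the tid += n / SizeToThreadsRatio branch). The diff never shows partner() and keepSmall() themselves; the sketch below is the common textbook XOR formulation of the bitonic network, stated as an assumption — the repository's definitions may differ:

// ASSUMPTION: plausible partner()/keepSmall() in the usual XOR form; not taken from this diff.
__device__ inline size_t partner_sketch(size_t tid, size_t step) {
    return tid ^ ((size_t)1 << step);            // flip bit <step>: the compare-exchange peer
}

__device__ inline bool keepSmall_sketch(size_t tid, size_t partner, size_t stage) {
    bool ascending = ((tid >> stage) & 1) == 0;  // direction of the bitonic run containing tid
    return ascending == (tid < partner);         // lower index keeps the minimum in ascending runs
}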
@@ -102,18 +145,11 @@ __global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) {
 
 
 /*!
- * A distributed version of the Bitonic sort algorithm.
+ * A CUDA version of the Bitonic sort algorithm.
  *
- * @note
- * Each MPI process should run an instance of this function.
- *
- * @tparam ShadowedDataT A Shadowed buffer type with random access iterator.
- *
- * @param data [ShadowedDataT] The local to MPI process data to sort
- * @param Processes [mpi_id_t] The total number of MPI processes
- * @param rank [mpi_id_t] The current process id
+ * @tparam DataT A container type to hold data array. Should have .data() and .size() methods
+ * @param data [DataT&] Reference to the container to sort
  */
-
 template <typename DataT>
 void bitonicSort(DataT& data) {
     using value_t = typename DataT::value_type;
@@ -121,34 +157,57 @@ void bitonicSort(DataT& data) {
     value_t* dev_data;
     auto size = data.size();
 
-    cudaMalloc(&dev_data, size * sizeof(value_t));
-    cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);
+    Timer_memory.start();
+    if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Can not allocate memory\n");
+    if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
+    Timer_memory.stop();
 
-    int Nthreads = THREADS_PER_BLOCK;
-    int HalfNblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
+    size_t Nth = config.blockSize;
+    size_t Nbl = NBlocks(size);
 
     size_t Stages = static_cast<size_t>(log2(size));
+    Timer_sorting.start();
     for (size_t stage = 1; stage <= Stages; ++stage) {
         for (size_t step = stage; step > 0; ) {
             --step;
-            bitonicStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
+            bitonicStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
             cudaDeviceSynchronize();
         }
     }
+    Timer_sorting.stop();
 
-    cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost);
+    Timer_memory.start();
+    if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
     cudaFree(dev_data);
+    Timer_memory.stop();
 }
 
 #elif CODE_VERSION == V1
 
+/*!
+ * This is the body of each thread. This function compare and exchange data
+ *
+ * @tparam ValueT The underlying data type of the array items
+ * @param data [ValueT*] Pointer to data array
+ * @param n [size_t] The total size of the array
+ * @param step [size_t] The current step of the current stage of bitonic sort
+ * @param stage [size_t] The current stage of bitonic sort
+ */
 template <typename ValueT>
 __device__ void interBlockStep_(ValueT* data, size_t n, size_t step, size_t stage) {
-    threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;  // Compute global thread ID
+    /*
+     * Here we skip blocks every time (one for SizeToThreadsRatio = 2)
+     * And we use the neighbor block address indices for the other half of the threads
+     */
+    threadId_t tid = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
     threadId_t pid = partner(tid, step);
     if (tid > pid) {
-        tid += n >> 1;
-        pid += n >> 1;
+        // Shift to the other half of the array for global data
+        tid += blockDim.x;
+        pid += blockDim.x;
     }
     if ((tid < n) && (pid < n)) {    // Boundary check
         bool keep = keepSmall(tid, pid, stage);
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* This is the version of the body that is called outside of the loop unrolling
|
||||||
|
*
|
||||||
|
* @tparam ValueT The underlying data type of the array items
|
||||||
|
* @param data [ValueT*] Pointer to data array
|
||||||
|
* @param n [size_t] The total size of the array
|
||||||
|
* @param step [size_t] The current step of the current stage of bitonic sort
|
||||||
|
* @param stage [size_t] The current stage of bitonic sort
|
||||||
|
*/
|
||||||
template <typename ValueT>
|
template <typename ValueT>
|
||||||
__global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
|
__global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
|
||||||
interBlockStep_(data, n, step, stage);
|
interBlockStep_(data, n, step, stage);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* This is unrolled part of the bitonic double loop.
|
||||||
|
*
|
||||||
|
* @tparam ValueT The underlying data type of the array items
|
||||||
|
* @param data [ValueT*] Pointer to data array
|
||||||
|
* @param n [size_t] The total size of the array
|
||||||
|
* @param step [size_t] The current step of the current stage of bitonic sort
|
||||||
|
* @param stage [size_t] The current stage of bitonic sort
|
||||||
|
*/
|
||||||
template <typename ValueT>
|
template <typename ValueT>
|
||||||
__global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
|
__global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
|
||||||
for (size_t step = innerSteps + 1; step > 0; ) {
|
for (size_t step = innerSteps + 1; step > 0; ) {
|
||||||
@@ -172,18 +248,11 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t st
     }
 }
 
 /*!
- * A distributed version of the Bitonic sort algorithm.
+ * A CUDA version of the Bitonic sort algorithm.
  *
- * @note
- * Each MPI process should run an instance of this function.
- *
- * @tparam ShadowedDataT A Shadowed buffer type with random access iterator.
- *
- * @param data [ShadowedDataT] The local to MPI process data to sort
- * @param Processes [mpi_id_t] The total number of MPI processes
- * @param rank [mpi_id_t] The current process id
+ * @tparam DataT A container type to hold data array. Should have .data() and .size() methods
+ * @param data [DataT&] Reference to the container to sort
  */
-
 template <typename DataT>
 void bitonicSort(DataT& data) {
     using value_t = typename DataT::value_type;
@@ -191,38 +260,85 @@ void bitonicSort(DataT& data) {
     value_t* dev_data;
     auto size = data.size();
 
-    cudaMalloc(&dev_data, size * sizeof(value_t));
-    cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);
+    Timer_memory.start();
+    if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Can not allocate memory\n");
+    if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
+    Timer_memory.stop();
 
-    int Nthreads = THREADS_PER_BLOCK;
-    int HalfNblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
+    size_t Nth = config.blockSize;
+    size_t Nbl = NBlocks(size);
 
     auto Stages = static_cast<size_t>(log2(size));
-    auto InnerBlockSteps = static_cast<size_t>(log2(IN_BLOCK_THRESHOLD));
+    auto InnerBlockSteps = static_cast<size_t>(log2(Nth)); //
+    Timer_sorting.start();
     for (size_t stage = 1; stage <= Stages; ++stage) {
         size_t step = stage - 1;
         for ( ; step > InnerBlockSteps; --step) {
-            interBlockStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
+            interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
             cudaDeviceSynchronize();
         }
-        inBlockStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
+        inBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
         cudaDeviceSynchronize();
     }
+    Timer_sorting.stop();
 
-    cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost);
+    Timer_memory.start();
+    if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
     cudaFree(dev_data);
+    Timer_memory.stop();
 }
 
 
 
 #elif CODE_VERSION == V2
 
+/*!
+ * @return The memory that each block local threads can affect.
+ *
+ * @note
+ * Each block thread collection can exchange twice the size of data points.
+ */
+inline size_t effectiveBlockSize() { return SizeToThreadsRatio * config.blockSize; }
+
+
+/*!
+ * Converts the global address of the data to the local shared memory array which is used
+ * as cached memory to the unrolled part of the bitonic sort loop.
+ *
+ * @note
+ * Each block's thread collection can exchange twice the size of data points.
+ * These points get copied (cached) in the shared memory location. We use contiguous blocks
+ * both in global data memory and the shared memory buffer.
+ *
+ * @param gIndex The global array index
+ * @param blockDim The block size (threads per block)
+ * @return The equivalent local address of the shared memory
+ */
+__device__ inline size_t toLocal(size_t gIndex, size_t blockDim) {
+    return gIndex % (SizeToThreadsRatio * blockDim);
+}
+
+/*!
+ * This is the version of the body that is called outside of the loop unrolling
+ *
+ * @tparam ValueT The underlying data type of the array items
+ * @param data [ValueT*] Pointer to data array
+ * @param n [size_t] The total size of the array
+ * @param step [size_t] The current step of the current stage of bitonic sort
+ * @param stage [size_t] The current stage of bitonic sort
+ */
 template <typename ValueT>
 __global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
-    threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;  // Compute global thread ID
+    threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;  // Keep contiguous addressing to the first half of the array
     threadId_t pid = partner(tid, step);
     if (tid > pid) {
-        tid += n >> 1;
-        pid += n >> 1;
+        // Shift to the other half of the array for global data
+        tid += n / SizeToThreadsRatio;
+        pid += n / SizeToThreadsRatio;
    }
     if ((tid < n) && (pid < n)) {    // Boundary check
         bool keep = keepSmall(tid, pid, stage);
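In V2 each block caches an "effective block" of SizeToThreadsRatio * blockDim contiguous points into shared memory, and toLocal() strips the per-block base from a global index. Worked numbers for a toy blockDim = 4 (standalone sketch, not project code):

// blockDim = 4 -> each block caches 2*4 = 8 contiguous points.
// Block 0 threads: gIdx0 = threadIdx.x + 2*0*4 = 0..3, caching data[0..7];
// block 1 threads: gIdx0 = threadIdx.x + 2*1*4 = 8..11, caching data[8..15].
size_t toLocal_sketch(size_t gIndex, size_t blockDim) {
    return gIndex % (2 * blockDim);     // 2 == SizeToThreadsRatio
}
// toLocal_sketch(10, 4) == 2 -> data[10] is cached at shared_data[2] (inside block 1)
// toLocal_sketch(14, 4) == 6 -> the "upper" point gIdx0 + blockDim lands at lIdx0 + blockDim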
@@ -230,57 +346,72 @@ __global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage
     }
 }
 
+/*!
+ * This is unrolled part of the bitonic double loop.
+ *
+ * First each thread caches its corresponding data point from the current and the following data block.
+ * After that we execute the loop unrolling on the local data and then we write back to global memory.
+ *
+ * @tparam ValueT The underlying data type of the array items
+ * @param data [ValueT*] Pointer to data array
+ * @param n [size_t] The total size of the array
+ * @param step [size_t] The current step of the current stage of bitonic sort
+ * @param stage [size_t] The current stage of bitonic sort
+ */
 template <typename ValueT>
-__global__ void inBlockStep(ValueT* data, size_t n, size_t nthreads, size_t innerSteps, size_t stage, int *mutex) {
+__global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
     extern __shared__ ValueT shared_data[];
 
+    /*
+     * Global and local(shared) memory indices (calculated once)
+     * Here we skip blocks every time (one for SizeToThreadsRatio = 2)
+     * And we cache the neighbor block address indexes in local (shared) memory
+     */
+    threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
+    threadId_t lIdx0 = toLocal(gIdx0, blockDim.x);
+
+    if (gIdx0 + blockDim.x >= n)    // Boundary check
+        return;
+
+    // Fetch to local memory the entire effective block size (2 positions for each thread)
+    shared_data[lIdx0] = data[gIdx0];
+    shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x];
+    __syncthreads();
+
     for (size_t step = innerSteps + 1; step > 0; ) {
         --step;
 
-        // Global memory thread and partner ids
-        threadId_t Tid = threadIdx.x + blockIdx.x * blockDim.x;
-        threadId_t Pid = partner(Tid, step);
-        if (Tid > Pid) {
-            Tid += n >> 1;
-            Pid += n >> 1;
+        // Init thread global and local indices
+        threadId_t gIdx = gIdx0;
+        threadId_t lIdx = lIdx0;
+        // Find partner and keep-small configuration based on the global data positions
+        threadId_t pIdx = partner(gIdx, step);
+        if (gIdx > pIdx) {
+            // Shift inside effective block
+            gIdx += blockDim.x;    // global
+            pIdx += blockDim.x;
+            lIdx += blockDim.x;    // local
         }
+        bool keep = keepSmall(gIdx, pIdx, stage);
 
-        if ((Tid < n) && (Pid < n)) { // Boundary check
-            // Global to local index resolution
-            threadId_t tid = (Tid<Pid) ? ((Tid*nthreads)%(2*nthreads)) : (((Tid - (n >> 1))*nthreads)%(2*nthreads));
-            threadId_t pid = tid + 1;
-            // Fetch to local memory
-            shared_data[tid] = data[Tid];
-            shared_data[pid] = data[Pid];
-            __syncthreads();
-
-            bool keep = keepSmall(Tid, Pid, stage);
-            exchange(shared_data, tid, pid, keep);
-            __syncthreads();
-
-            // Write back to global memory
-            data[Tid] = shared_data[tid];
-            data[Pid] = shared_data[pid];
-            __syncthreads();
-        }
+        // Exchange data on local(shared) copy
+        threadId_t lpIdx = toLocal(pIdx, blockDim.x);
+        exchange(shared_data, lIdx, lpIdx, keep);
+        __syncthreads();
     }
 
+    // Write back to global memory
+    data[gIdx0] = shared_data[lIdx0];
+    data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
+    __syncthreads();
 }
 
 /*!
- * A distributed version of the Bitonic sort algorithm.
+ * A CUDA version of the Bitonic sort algorithm.
  *
- * @note
- * Each MPI process should run an instance of this function.
- *
- * @tparam dDataT A Shadowed buffer type with random access iterator.
- *
- * @param data [ShadowedDataT] The local to MPI process data to sort
- * @param Processes [mpi_id_t] The total number of MPI processes
- * @param rank [mpi_id_t] The current process id
+ * @tparam DataT A container type to hold data array. Should have .data() and .size() methods
+ * @param data [DataT&] Reference to the container to sort
  */
-
 template <typename DataT>
 void bitonicSort(DataT& data) {
     using value_t = typename DataT::value_type;
@@ -288,30 +419,36 @@ void bitonicSort(DataT& data) {
     value_t* dev_data;
     auto size = data.size();
 
-    cudaMalloc(&dev_data, size * sizeof(value_t));
-    cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);
+    Timer_memory.start();
+    if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Can not allocate memory\n");
+    if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
+    Timer_memory.stop();
 
-    int* d_mutex;
-    cudaMalloc(&d_mutex, sizeof(int));
-    cudaMemset(d_mutex, 0, sizeof(int)); // init mutex
+    size_t Nth = config.blockSize;
+    size_t Nbl = NBlocks(size);
+    size_t kernelMemSize = effectiveBlockSize() * sizeof(value_t);
 
-    int Nthreads = THREADS_PER_BLOCK;
-    int Nblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
-
-    auto Stages = static_cast<size_t>(log2(size));
-    auto InnerBlockSteps = static_cast<size_t>(log2(IN_BLOCK_THRESHOLD));
+    auto Stages = static_cast<size_t>(log2(size));
+    auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
+    Timer_sorting.start();
     for (size_t stage = 1; stage <= Stages; ++stage) {
         size_t step = stage - 1;
         for ( ; step > InnerBlockSteps; --step) {
-            interBlockStep<<<Nblocks, Nthreads>>>(dev_data, size, step, stage);
+            interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
             cudaDeviceSynchronize();
         }
-        inBlockStep<<<Nblocks, Nthreads, 2*Nthreads*sizeof(value_t)>>>(dev_data, size, Nthreads, step, stage, d_mutex);
+        inBlockStep<<<Nbl, Nth, kernelMemSize>>>(dev_data, size, step, stage);
         cudaDeviceSynchronize();
     }
+    Timer_sorting.stop();
 
-    cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost);
+    Timer_memory.start();
+    if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
    cudaFree(dev_data);
+    Timer_memory.stop();
 }
 
 #endif
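The V2 launch passes kernelMemSize = effectiveBlockSize() * sizeof(value_t) as the third <<<>>> argument; that byte count is what backs the kernel's extern __shared__ array. A minimal self-contained sketch of this dynamic shared memory pattern (illustrative kernel, unrelated to the sort itself; error checks omitted for brevity):

#include <cuda_runtime.h>

// Each block stages its chunk in dynamically sized shared memory, then writes it back reversed.
__global__ void reverseInBlock(int* data) {
    extern __shared__ int cache[];                    // size supplied at launch time
    const unsigned base = blockIdx.x * blockDim.x;
    cache[threadIdx.x] = data[base + threadIdx.x];    // stage into shared memory
    __syncthreads();
    data[base + threadIdx.x] = cache[blockDim.x - 1 - threadIdx.x];
}

void launch(int* dev_data, unsigned nBlocks, unsigned nThreads) {
    size_t shmemBytes = nThreads * sizeof(int);       // plays the role of kernelMemSize
    reverseInBlock<<<nBlocks, nThreads, shmemBytes>>>(dev_data);
    cudaDeviceSynchronize();
}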
@@ -1,6 +1,6 @@
 /*!
  * \file
- * \brief Build configuration file.
+ * \brief Build and runtime configuration file.
  *
  * \author
  *   Christos Choutouridis AEM:8997
@@ -11,19 +11,20 @@
 #define CONFIG_H_
 
 #include <cstdint>
+#include <cuda_runtime.h>
 
 /*
  * Versioning:
- *  - RC1:
+ *  - RC1: First version to test on HPC
  */
-static constexpr char version[] = "0.0";
+static constexpr char version[] = "0.1";
 
 /*
  * Defines for different version of the exercise
 */
-#define V0 (0)
-#define V1 (1)
-#define V2 (2)
+#define V0 0
+#define V1 1
+#define V2 2
 
 // Fail-safe version selection
 #if !defined CODE_VERSION
@@ -33,8 +34,9 @@ static constexpr char version[] = "0.0";
 // Default Data size (in case -q <N> is not present)
 static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16;
 
+// Placeholder default (actual default comes from device properties read at initialization)
 static constexpr size_t THREADS_PER_BLOCK = 1024;
-static constexpr size_t IN_BLOCK_THRESHOLD = 512;
 
 /*!
  * Value and Buffer type selection
@@ -51,6 +53,11 @@ static constexpr size_t IN_BLOCK_THRESHOLD = 512;
 using Value_t = uint32_t;
 using Data_t = std::vector<Value_t>;
 
+/*!
+ * In theory we can support large arrays ;)
+ */
+using ArraySize_t = uint64_t;
+
 /*!
  * Session option for each invocation of the executable.
  *
@@ -58,20 +65,18 @@ using Data_t = std::vector<Value_t>;
 * The values of the members are set from the command line.
 */
 struct config_t {
-    size_t arraySize{DEFAULT_DATA_SIZE};  //!< The array size of the local data to sort.
-    bool exchangeOpt{false};              //!< Flag to request the exchange optimization
-    size_t pipeline{1UL};                 //!< Pipeline stages (1 to disable)
-    bool validation{false};               //!< Request a full validation at the end, performed by process rank 0.
-    bool ndebug{false};                   //!< Skips debug trap on DEBUG builds.
-    size_t perf{1};                       //!< Enable performance timing measurements and prints and repeat
-                                          //!< the sorting <perf> times.
-    bool verbose{false};                  //!< Flag to enable verbose output to stdout.
+    ArraySize_t arraySize{DEFAULT_DATA_SIZE};  //!< The array size of the local data to sort.
+    size_t blockSize{THREADS_PER_BLOCK};       //!< The block size (threads per block) for the session.
+    bool validation{false};                    //!< Request a full validation at the end, performed by process rank 0.
+    size_t perf{1};                            //!< Enable performance timing measurements and prints. Repeat
+                                               //!< the sorting <perf> times to do so.
+    bool verbose{false};                       //!< Flag to enable verbose output to stdout.
 };
 
 /*
  * Exported data types
 */
 extern config_t config;
+extern cudaDeviceProp device;
 
 #endif /* CONFIG_H_ */
@@ -11,6 +11,7 @@
 #include <iostream>
 #include <algorithm>
 #include <random>
+#include <cuda_runtime.h>
 
 #include "utils.hpp"
 #include "config.h"
@@ -18,22 +19,25 @@
 
 
 // Global session data
-Data_t Data = {3, 5, 1, 2, 4, 7, 8, 6};
+Data_t Data;
 config_t config;
 Log logger;
+cudaDeviceProp device;
 
 // Mersenne seeded from hw if possible. range: [type_min, type_max]
 std::random_device rd;
 std::mt19937 gen(rd());
 
 //! Performance timers for each one of the "costly" functions
-Timing Timer_total;
+Timing Timer_total, Timer_memory, Timer_sorting;
 
 
 //! Init timing objects for extra rounds
 void measurements_init() {
     if (config.perf > 1) {
         Timer_total.init(config.perf);
+        Timer_memory.init(config.perf);
+        Timer_sorting.init(config.perf);
     }
 }
 
@@ -41,6 +45,8 @@ void measurements_init() {
 void measurements_next() {
     if (config.perf > 1) {
         Timer_total.next();
+        Timer_memory.next();
+        Timer_sorting.next();
     }
 }
 
@@ -57,7 +63,15 @@ bool get_options(int argc, char* argv[]){
 
         if (arg == "-q" || arg == "--array-size") {
             if (i+1 < argc) {
-                config.arraySize = 1 << atoi(argv[++i]);
+                config.arraySize = (ArraySize_t)1 << atoi(argv[++i]);
+            }
+            else {
+                status = false;
+            }
+        }
+        else if (arg == "-b" || arg == "--block-size") {
+            if (i+1 < argc) {
+                config.blockSize = atoi(argv[++i]);
             }
             else {
                 status = false;
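The -q handler now casts before shifting: 1 << atoi(...) is an int shift, which overflows a 32-bit int for N >= 31, while (ArraySize_t)1 << N widens to 64 bits first, so sizes like 2^33 survive. A small standalone illustration:

#include <cstdint>
#include <cstdio>

int main() {
    int n = 33;
    // uint64_t bad = 1 << n;           // shift happens in 32-bit int: undefined behavior
    uint64_t good = (uint64_t)1 << n;   // same idea as (ArraySize_t)1 << atoi(argv[++i])
    std::printf("%llu\n", (unsigned long long)good);   // prints 8589934592
    return 0;
}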
@@ -74,32 +88,34 @@ bool get_options(int argc, char* argv[]){
                 status = false;
             }
         }
-        else if (arg == "--ndebug") {
-            config.ndebug = true;
-        }
         else if (arg == "-v" || arg == "--verbose") {
             config.verbose = true;
         }
         else if (arg == "--version") {
-            std::cout << "bitonic - A GPU accelerated sort utility\n";
+            std::cout << STR(TARGET) << " - A GPU accelerated bitonic sort utility (V" << STR(CODE_VERSION)<< ") \n";
             std::cout << "version: " << version << "\n\n";
             exit(0);
         }
         else if (arg == "-h" || arg == "--help") {
-            std::cout << "distbitonic - A distributed sort utility\n\n";
-            std::cout << "  distbitonic -q <N> [--validation] [--perf <N>] [--ndebug] [-v]\n";
-            std::cout << "  distbitonic -h\n";
+            std::cout << STR(TARGET) << " - A GPU accelerated bitonic sort utility (V" << STR(CODE_VERSION)<< ") \n\n";
+            std::cout << "  " << STR(TARGET) << " -q <N> -b <N> [--validation] [--perf <N>] [-v]\n";
+            std::cout << "  " << STR(TARGET) << " -h\n";
+            std::cout << "  " << STR(TARGET) << " --version\n";
             std::cout << '\n';
             std::cout << "Options:\n\n";
             std::cout << "  -q | --array-size <N>\n";
-            std::cout << "    Selects the array size according to size = 2^N\n\n";
+            std::cout << "    Selects the array size according to size = 2^N\n";
+            std::cout << "    [Size must be larger than 2 * blockSize]\n";
+            std::cout << "    [Default is 2^16]\n\n";
+            std::cout << "  -b | --block-size <N>\n";
+            std::cout << "    Selects the number of CUDA threads per block\n";
+            std::cout << "    [Size has to be multiple of device's warp size (usually 32)\n";
+            std::cout << "    [Default is the maximum device supported number. For ex: (GTX 1650) block-size=1024]\n\n";
             std::cout << "  --validation\n";
-            std::cout << "    Request a full validation at the end, performed by process rank 0\n\n";
+            std::cout << "    Request a full validation at the end\n\n";
             std::cout << "  --perf <N> \n";
             std::cout << "    Enable performance timing measurements and prints, and repeat\n";
             std::cout << "    the sorting <N> times.\n\n";
-            std::cout << "  --ndebug\n";
-            std::cout << "    Skip debug breakpoint when on debug build.\n\n";
             std::cout << "  -v | --verbose\n";
             std::cout << "    Request a more verbose output to stdout.\n\n";
             std::cout << "  -h | --help\n";
@@ -107,8 +123,12 @@ bool get_options(int argc, char* argv[]){
             std::cout << "  --version\n";
             std::cout << "    Prints version and exit.\n\n";
             std::cout << "Examples:\n\n";
-            std::cout << "  bitonic -q 24\n";
-            std::cout << "    Runs bitonic with GPU acceleration with 2^24 array points\n\n";
+            std::cout << "  " << STR(TARGET) << " -q 24\n";
+            std::cout << "    Runs bitonic sort on an 2^24 points array, using GPU acceleration\n\n";
+            std::cout << "  " << STR(TARGET) << " --validation --perf 5 -b 512 -q 26\n";
+            std::cout << "    Runs bitonic sort on an 2^26 points array 5 times, using GPU acceleration with\n";
+            std::cout << "    512 threads per block, performs a validation check at the end and prints the time\n";
+            std::cout << "    of the median.\n\n";
 
             exit(0);
         }
@@ -118,6 +138,17 @@ bool get_options(int argc, char* argv[]){
         }
     }
 
+    // Check configuration requirements
+    if (config.blockSize % device.warpSize)
+        throw std::runtime_error("[Config] - Number of threads per block is not an exact multiple of warp size\n");
+    if (config.arraySize < 2*config.blockSize)
+        throw std::runtime_error("[Config] - Unsupported array size (smaller than "
+                                 + std::to_string(SizeToThreadsRatio*config.blockSize) + ")\n");
+    if (device.totalGlobalMem < config.arraySize * sizeof(Value_t))
+        throw std::runtime_error("[CUDA] - Unsupported array size: "
+                                 + std::to_string(config.arraySize * sizeof(Value_t))
+                                 + " (larger than GPU's: " + std::to_string(device.totalGlobalMem) + ")\n");
+
     return status;
 }
 
@@ -141,6 +172,13 @@ bool validator(DataT& data) {
  * @param argv [char***] POINTER to main's argv argument
  */
 void init(int* argc, char*** argv) {
 
+    // Get device configuration
+    if (cudaGetDeviceProperties(&device, 0) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Can not read GPU");
+
+    config.blockSize = static_cast<size_t>(device.maxThreadsPerBlock);
+
     // try to read command line
     if (!get_options(*argc, *argv))
         exit(1);
@@ -159,16 +197,23 @@ int main(int argc, char* argv[]) try {
     // Init everything
     init(&argc, &argv);
 
+    logger << "Array size: " << config.arraySize << " (Q=" << static_cast<size_t>(log2(config.arraySize))<< ")" << logger.endl;
+    logger << "Repeated sorts: " << config.perf << logger.endl;
+    logger << "GPU: " << device.name << logger.endl;
+    logger << "Block size: " << config.blockSize << logger.endl;
+
     for (size_t it = 0 ; it < config.perf ; ++it) {
         // Initialize local data
-        logger << "Initialize local array of " << config.arraySize << " elements" << logger.endl;
+        logger << "Initialize array ... ";
         std::uniform_int_distribution<Value_t > dis(
             std::numeric_limits<Value_t>::min(),
             std::numeric_limits<Value_t>::max()
         );
         std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); });
+        logger << " Done." << logger.endl;
 
         // Run distributed sort
-        logger << "Starting distributed sorting ... ";
+        logger << "Start sorting ... ";
         Timer_total.start();
         bitonicSort(Data);
         Timer_total.stop();
@@ -178,20 +223,15 @@ int main(int argc, char* argv[]) try {
 
     // Print-outs and validation
     if (config.perf > 1) {
-        Timing::print_duration(Timer_total.median(), "Total");
+        Timing::print_duration(Timer_total.median(),  "Total   ");
+        Timing::print_duration(Timer_memory.median(), "Mem-xch ");
+        Timing::print_duration(Timer_sorting.median(),"Sorting ");
     }
     if (config.validation) {
         // If requested, we have the chance to fail!
         std::cout << "[Validation] Results validation ...";
         bool val = validator(Data);
-        std::cout << ((val) ? "\x1B[32m [PASSED] \x1B[0m\n" : " \x1B[32m [FAILED] \x1B[0m\n");
-        if (Data.size() < 128) {
-            std::cout << "Data: ";
-            for (auto& v : Data) {
-                std::cout << (int)v << ", ";
-            }
-            std::cout << '\n';
-        }
+        std::cout << ((val) ? "\x1B[32m [PASSED] \x1B[0m\n" : " \x1B[31m [FAILED] \x1B[0m\n");
     }
     return 0;
 }
@@ -17,6 +17,11 @@
 
 #include "config.h"
 
+/*!
+ * Stringify preprocessor util
+ */
+#define STR(s) STR_(s)
+#define STR_(s) #s
 
 /*!
  * A Logger for entire program.
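The new STR/STR_ pair is the standard two-level stringification trick: STR expands its argument first, so with the Makefile's PRE_DEFS := TARGET=$(TARGET) (i.e. -DTARGET=bitonicCUDA on the compile line) STR(TARGET) becomes "bitonicCUDA" in the help and version prints. A standalone illustration:

#include <cstdio>

#define TARGET bitonicCUDA   // normally injected by the Makefile via PRE_DEFS / -D
#define STR(s)  STR_(s)      // expands s first...
#define STR_(s) #s           // ...then stringifies the expansion
#define STR1(s) #s           // one-level variant, for contrast

int main() {
    std::printf("%s\n", STR(TARGET));    // prints: bitonicCUDA
    std::printf("%s\n", STR1(TARGET));   // prints: TARGET (argument not expanded)
    return 0;
}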