Compare commits


8 Commits

12 changed files with 30108 additions and 0 deletions

homework_3/.gitignore vendored Normal file (+23)

@@ -0,0 +1,23 @@
# project
bin/
out/
mat/
mtx/
.unused/
various/
# hpc
# IDEs
.idea/
.clangd
# eclipse
.project
.cproject
.settings/
.vs/
.vscode/

homework_3/Makefile Normal file (+236)

@@ -0,0 +1,236 @@
#
# PDS HW3 Makefile
#
# Copyright (C) 2025 Christos Choutouridis <christos@choutouridis.net>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# ============== Project settings ==============
# Project's name
PROJECT := PDS_homework_3
# Executable's name
TARGET := bitonicCUDA
# Source directories list (space separated). Makefile-relative path, UNDER current directory.
SRC_DIR_LIST := src #test test/gtest
# Include directories list (space separated). Makefile-relative path.
INC_DIR_LIST := src
# test \
# test/gtest/ \
# Exclude files list (space separated). Filenames only.
# EXC_FILE_LIST := bad.cpp old.cpp
# Build directories
BUILD_DIR := bin
OBJ_DIR := $(BUILD_DIR)/obj
DEP_DIR := $(BUILD_DIR)/.dep
OUTPUT_DIR := out
# ========== Compiler settings ==========
# Compiler flags for debug and release
DEB_CFLAGS := -DDEBUG -std=c11 -Xcompiler "-Wall -Wextra -g -DDEBUG"
REL_CFLAGS := -O3 -std=c11 -Xcompiler "-Wall -Wextra"
DEB_CXXFLAGS := -DDEBUG -std=c++17 -Xcompiler "-Wall -Wextra -g -DDEBUG"
REL_CXXFLAGS := -O3 -std=c++17 -Xcompiler "-Wall -Wextra"
# Pre-defines
# PRE_DEFS := MYCAB=1729 SUPER_MODE
PRE_DEFS := TARGET=$(TARGET)
# ============== Linker settings ==============
# Linker flags (example: -pthread -lm)
LDFLAGS :=
# Map output file
MAP_FILE := # output.map
MAP_FLAG := # -Xlinker -Map=$(BUILD_DIR)/$(MAP_FILE)
# ============== Docker settings ==============
# We need:
# - Bind the entire project directory (the dir that includes all the code) as volume.
# - In docker instance, change to working directory (where the makefile is).
DOCKER_VOL_DIR := $(shell pwd)
DOCKER_WRK_DIR :=
DOCKER_RUN := docker run --rm
DOCKER_FLAGS := -v $(DOCKER_VOL_DIR):/usr/src/$(PROJECT) -w /usr/src/$(PROJECT)/$(DOCKER_WRK_DIR)
# docker invoke mechanism (edit with care)
# note:
# Here, the `DOCKER` variable is empty. Rules can assign `DOCKER := $(DOCKER_CMD)` when docker
# functionality is needed.
DOCKER_CMD = $(DOCKER_RUN) $(DOCKER_FLAGS) $(IMAGE)
DOCKER :=
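# Example (illustrative): a rule can opt in to the docker toolchain like this:
#   cuda-docker-build: IMAGE := nvidia/cuda:11.1.1-devel-ubuntu20.04  # hypothetical image tag
#   cuda-docker-build: DOCKER := $(DOCKER_CMD)
#   cuda-docker-build: bitonic_v2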
# ============== Tool selection ==============
# compiler and compiler flags.
CSIZE := size
CFLAGS := $(DEB_CFLAGS)
CXXFLAGS := $(DEB_CXXFLAGS)
CXX := g++
CC := gcc
LINKER := g++
#
# =========== Main body and Patterns ===========
#
INC := $(foreach dir,$(INC_DIR_LIST),-I$(dir))
DEF := $(foreach def,$(PRE_DEFS),-D$(def))
EXC := $(foreach fil,$(EXC_FILE_LIST), \
$(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/$(fil))) \
)
# source files. object and dependencies list
# recursive search into current and source directories
SRC := $(wildcard *.cpp)
SRC += $(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/*.cpp))
SRC += $(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/**/*.cpp))
SRC := $(filter-out $(EXC),$(SRC))
#SRC := $(abspath $(SRC))
OBJ := $(foreach file,$(SRC:%.cpp=%.o),$(OBJ_DIR)/$(file))
DEP := $(foreach file,$(SRC:%.cpp=%.d),$(DEP_DIR)/$(file))
# c file objects depend on .c AND dependency files, which have an empty recipe
$(OBJ_DIR)/%.o: %.c
@mkdir -p $(@D)
$(DOCKER) $(CC) -c $(CFLAGS) $(INC) $(DEF) -o $@ $<
# cpp file objects depend on .cpp AND dependency files, which have an empty recipe
$(OBJ_DIR)/%.o: %.cpp
@mkdir -p $(@D)
$(DOCKER) $(CXX) -c $(CXXFLAGS) $(INC) $(DEF) -o $@ $<
# main target rule
$(BUILD_DIR)/$(TARGET): $(OBJ)
@mkdir -p $(@D)
@echo Linking to target: $(TARGET)
@echo $(DOCKER) $(LINKER) '$$(OBJ)' $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET)
@$(DOCKER) $(LINKER) $(OBJ) $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET)
@echo
@echo Print size information
@$(CSIZE) $(@D)/$(TARGET)
@echo Done
#
# ================ Default local build rules =================
# example:
# make debug
.DEFAULT_GOAL := all
.PHONY: clean
clean:
@echo Cleaning build directories
@rm -rf $(OBJ_DIR)
@rm -rf $(DEP_DIR)
@rm -rf $(BUILD_DIR)
debug: CFLAGS := $(DEB_CFLAGS)
debug: CXXFLAGS := $(DEB_CXXFLAGS)
debug: $(BUILD_DIR)/$(TARGET)
release: CFLAGS := $(REL_CFLAGS)
release: CXXFLAGS := $(REL_CXXFLAGS)
release: $(BUILD_DIR)/$(TARGET)
#
# ================ Build rules =================
#
bitonic_v0deb: CC := nvcc -G -g -x cu
bitonic_v0deb: CXX := nvcc -G -g -x cu
bitonic_v0deb: LINKER := nvcc
bitonic_v0deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V0
bitonic_v0deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V0
bitonic_v0deb: OUTPUT_DIR := $(OUTPUT_DIR)/v0
bitonic_v0deb: $(BUILD_DIR)/$(TARGET)
@mkdir -p $(OUTPUT_DIR)
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
bitonic_v1deb: CC := nvcc -G -g -x cu
bitonic_v1deb: CXX := nvcc -G -g -x cu
bitonic_v1deb: LINKER := nvcc
bitonic_v1deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V1
bitonic_v1deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V1
bitonic_v1deb: OUTPUT_DIR := $(OUTPUT_DIR)/v1
bitonic_v1deb: $(BUILD_DIR)/$(TARGET)
@mkdir -p $(OUTPUT_DIR)
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
bitonic_v2deb: CC := nvcc -G -g -x cu
bitonic_v2deb: CXX := nvcc -G -g -x cu
bitonic_v2deb: LINKER := nvcc
bitonic_v2deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V2
bitonic_v2deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V2
bitonic_v2deb: OUTPUT_DIR := $(OUTPUT_DIR)/v2
bitonic_v2deb: $(BUILD_DIR)/$(TARGET)
@mkdir -p $(OUTPUT_DIR)
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
bitonic_v0: CC := nvcc -x cu
bitonic_v0: CXX := nvcc -x cu
bitonic_v0: LINKER := nvcc
bitonic_v0: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V0
bitonic_v0: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V0
bitonic_v0: OUTPUT_DIR := $(OUTPUT_DIR)/v0
bitonic_v0: $(BUILD_DIR)/$(TARGET)
@mkdir -p $(OUTPUT_DIR)
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
bitonic_v1: CC := nvcc -x cu
bitonic_v1: CXX := nvcc -x cu
bitonic_v1: LINKER := nvcc
bitonic_v1: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V1
bitonic_v1: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V1
bitonic_v1: OUTPUT_DIR := $(OUTPUT_DIR)/v1
bitonic_v1: $(BUILD_DIR)/$(TARGET)
@mkdir -p $(OUTPUT_DIR)
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
bitonic_v2: CC := nvcc -x cu
bitonic_v2: CXX := nvcc -x cu
bitonic_v2: LINKER := nvcc
bitonic_v2: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V2
bitonic_v2: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V2
bitonic_v2: OUTPUT_DIR := $(OUTPUT_DIR)/v2
bitonic_v2: $(BUILD_DIR)/$(TARGET)
@mkdir -p $(OUTPUT_DIR)
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
hpc-build:
make clean
make bitonic_v0
make clean
make bitonic_v1
make clean
make bitonic_v2
all: debug bitonic_v0
# Note:
# Add a gcc based make rule here in order for clangd to successfully scan the project files.
# Otherwise we do not need the gcc build.

homework_3/exersize.md Normal file (+33)

@@ -0,0 +1,33 @@
Parallel & Distributed Computer Systems HW3
January, 2025
Write a program that sorts $N$ integers in ascending order, using CUDA.
The program must perform the following tasks:
- The user specifies a positive integer $q$.
- Start a process with an array of $N = 2^q$ random integers.
- Sort all $N$ elements in ascending order.
- Check the correctness of the final result.
Your implementation should be based on the following steps (a serial reference sketch follows the list):
V0. A kernel where each thread only compares and exchanges. This "eliminates" the 1:n innermost loop. Easy to write, but too many function calls and global synchronizations.
V1. Include the k inner loop in the kernel function. How do we handle the synchronization? Fewer calls, fewer global synchronizations. Faster than V0!
V2. Modify the kernel of V1 to work with local memory instead of global.
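For reference, a minimal serial sketch of the stage/step double loop that V0-V2 progressively move onto the GPU (illustrative; the names are ours, not part of the assignment, and `n` is assumed to be a power of two):

```cpp
#include <cstddef>
#include <utility>

// Serial bitonic sort over n = 2^q elements (reference sketch).
// The two outer loops are the "stages" and "steps" of the network;
// the innermost 0..n loop is what V0 replaces with one GPU thread
// per compare-and-exchange.
void bitonicSortSerial(int* a, std::size_t n) {
    for (std::size_t k = 2; k <= n; k <<= 1)            // stage: size of bitonic runs
        for (std::size_t j = k >> 1; j > 0; j >>= 1)    // step: compare distance
            for (std::size_t i = 0; i < n; ++i) {       // innermost loop over elements
                std::size_t p = i ^ j;                  // partner index
                if (p > i) {
                    bool asc = ((i & k) == 0);          // direction of this run
                    if ((asc && a[i] > a[p]) || (!asc && a[i] < a[p]))
                        std::swap(a[i], a[p]);
                }
            }
}
```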
You must deliver:
- A report (about $3-4$ pages) that describes your parallel algorithm and implementation.
- Your comments on the speed of your parallel program compared to the serial sort, after trying your program on aristotelis for $q = [20:27]$.
- The source code of your program uploaded online.
Ethics: If you use code found on the web or by an LLM, you should mention your source and the changes you made. You may work in pairs; both partners must submit a single report with both names.
Deadline: 2 February 2025.


@@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Parameters
versions=("v0" "v1" "v2")
q_values=(20 21 22 23 24 25 26 27 28 29 30)
# Make scripts
for version in "${versions[@]}"; do
for q in "${q_values[@]}"; do
filename="Bitnc${version^^}Q${q}.sh" # Convert v0 -> V0 etc...
cat > "$filename" <<EOL
#! /usr/bin/env bash
#SBATCH --job-name=Bitnc${version^^}Q${q}
#SBATCH --nodes=1
#SBATCH --gres=gpu:1
#SBATCH --time=10:00
module load gcc/9.2.0 cuda/11.1.0
./out/${version}/bitonicCUDA -v --validation --perf 7 -b 512 -q ${q}
EOL
echo "Create: $filename"
done
done
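# Example (illustrative): version=v1, q=24 writes BitncV1Q24.sh, which requests
# one GPU node and runs ./out/v1/bitonicCUDA -v --validation --perf 7 -b 512 -q 24.
# submitJobs.sh submits the generated scripts with sbatch afterwards.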

homework_3/hpc/submitJobs.sh Executable file (+26)

@@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Submission parameters
QOS="small"
PARTITION="ampere"
SCRIPT_DIR="hpc" # Directory containing the job scripts
# Range of values for the -q parameter
VERSIONS=("V0" "V1" "V2")
Q_START=20
Q_END=30
# Submitting the jobs
for version in "${VERSIONS[@]}"; do
for ((q = Q_START; q <= Q_END; q++)); do
script_name="Bitnc${version}Q${q}.sh"
script_path="${SCRIPT_DIR}/${script_name}"
if [[ -f "$script_path" ]]; then
sbatch --qos="$QOS" -p "$PARTITION" "$script_path"
echo "Submitted: $script_path"
else
echo "Warning: File not found - $script_path"
fi
done
done
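# Usage sketch (assuming the generated Bitnc*.sh scripts already live in ./hpc):
#   ./hpc/submitJobs.sh        # run from the project root so SCRIPT_DIR resolves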

homework_3/src/bitonicsort.hpp Normal file (+456)

@@ -0,0 +1,456 @@
/*!
* \file
* \brief Bitonic sort CUDA implementation header
*
* \author
* Christos Choutouridis AEM:8997
* <cchoutou@ece.auth.gr>
*/
#ifndef BITONICSORTCUDA_H_
#define BITONICSORTCUDA_H_
#include <cuda_runtime.h>
#include <vector>
#include <cmath>
#include <cstdint>
#include <utility>
#include "utils.hpp"
/*
* Exported timers
*/
extern Timing Timer_total, Timer_memory, Timer_sorting;
using threadId_t = size_t;
/*
* ============================== Sort utilities ==============================
*/
/*!
* Returns the ascending or descending configuration (up/down phase) of the thread id
* depending on the current depth
*
* @param tid [threadId_t] The current thread
* @param stage [size_t] The current stage of the sorting network (same for each step)
* @return [bool] True if we need ascending configuration, false otherwise
*/
__device__ inline bool ascending(threadId_t tid, size_t stage) noexcept {
return !(tid & (1 << stage));
}
/*!
* Returns the thread's partner for data exchange during the sorting network iterations
* of Bitonic
*
* @param tid [threadId_t] The current node
* @param step [size_t] The step of the sorting network
* @return [threadId_t] The node id of the partner for data exchange
*/
__device__ inline threadId_t partner(threadId_t tid, size_t step) noexcept {
return (tid ^ (1 << step));
}
/*!
* Predicate to check if a node keeps the small numbers during the bitonic sort network exchange.
*
* @param tid [threadId_t] The node for which we check
* @param partner [threadId_t] The partner of the data exchange
* @param stage [size_t] The current stage of the sorting network (same for each step)
* @return [bool] True if the node should keep the small values, false otherwise
*/
__device__ inline bool keepSmall(threadId_t tid, threadId_t partner, size_t stage) {
return ascending(tid, stage) == (tid < partner);
}
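/*
 * Worked example (illustrative): for stage = 2, step = 1 and tid = 5 (0b101):
 *   ascending(5, 2) = !(0b101 & 0b100) = false   -> descending run
 *   partner(5, 1)   =  0b101 ^ 0b010   = 7
 *   keepSmall(5, 7, 2) = (false == (5 < 7)) = false
 * so thread 5 keeps the larger of the two values it compares with thread 7.
 */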
/*
* ============================== Sort algorithms ==============================
*/
/*!
* Each thread can handle 2 points in the array. For each of these 2 points it may
* - compare and exchange if needed
* - copy data to local and back if needed
*/
static constexpr size_t SizeToThreadsRatio = 2;
/*!
* Calculates the blocks needed for the entire sorting process
*
* @note
* This "redundant" little trick makes sure blocks are allocated for array sizes that are not exact
* multiples of config.blockSize.
* Even if we don't need it, we keep it in case we experiment with weird sizes in the future!
*
* @param arraySize [ArraySize_t] The size of the entire array (in points)
* @return [size_t] The number of blocks
*/
inline size_t NBlocks(ArraySize_t arraySize) {
return (((arraySize + config.blockSize - 1) / config.blockSize) / SizeToThreadsRatio);
}
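/*
 * Example (illustrative): for arraySize = 2^20 and config.blockSize = 512:
 *   ceil(2^20 / 512) / 2 = 2048 / 2 = 1024 blocks,
 * i.e. 1024 * 512 = 2^19 threads, one for every 2 of the 2^20 points.
 */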
/*!
* Exchange utility
*
* @tparam ValueT The underlying data type of the array items
*
* @param data [ValueT*] Pointer to data array
* @param tid [threadId_t] Current thread's index to data
* @param partner [threadId_t] Partner's index to data
* @param keepSmall [bool] Flag to indicate if the current thread keeps the small values
*/
template <typename ValueT>
__device__ void exchange(ValueT* data, threadId_t tid, threadId_t partner, bool keepSmall) {
if (( keepSmall && (data[tid] > data[partner])) ||
(!keepSmall && (data[tid] < data[partner])) ) {
ValueT temp = data[tid];
data[tid] = data[partner];
data[partner] = temp;
}
}
#if CODE_VERSION == V0
/*!
* This is the body of each thread. This function compares and exchanges data
*
* @tparam ValueT The underlying data type of the array items
* @param data [ValueT*] Pointer to data array
* @param n [size_t] The total size of the array
* @param step [size_t] The current step of the current stage of bitonic sort
* @param stage [size_t] The current stage of bitonic sort
*/
template <typename ValueT>
__global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) {
threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x; // Keep contiguous addressing to the first half of the array
threadId_t pid = partner(tid, step);
if (tid > pid) {
// Shift to the other half of the array for global data
tid += n / SizeToThreadsRatio;
pid += n / SizeToThreadsRatio;
}
if ((tid < n) && (pid < n)) { // Boundary check
bool keep = keepSmall(tid, pid, stage);
exchange(data, tid, pid, keep);
}
}
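/*
 * Index-shift example (illustrative): n = 8, step = 0, so 4 threads cover 8 points:
 *   tid 0 -> pid 1                        handles pair (0, 1)
 *   tid 1 -> pid 0 (tid > pid, shift +4)  handles pair (5, 4)
 *   tid 2 -> pid 3                        handles pair (2, 3)
 *   tid 3 -> pid 2 (tid > pid, shift +4)  handles pair (7, 6)
 * Each pair is compared exactly once using only n / SizeToThreadsRatio threads.
 */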
/*!
* A CUDA version of the Bitonic sort algorithm.
*
* @tparam DataT A container type to hold data array. Should have .data() and .size() methods
* @param data [DataT&] Reference to the container to sort
*/
template <typename DataT>
void bitonicSort(DataT& data) {
using value_t = typename DataT::value_type;
value_t* dev_data;
auto size = data.size();
Timer_memory.start();
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not allocate memory\n");
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
Timer_memory.stop();
size_t Nth = config.blockSize;
size_t Nbl = NBlocks(size);
size_t Stages = static_cast<size_t>(log2(size));
Timer_sorting.start();
for (size_t stage = 1; stage <= Stages; ++stage) {
for (size_t step = stage; step > 0; ) {
--step;
bitonicStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
cudaDeviceSynchronize();
}
}
Timer_sorting.stop();
Timer_memory.start();
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
cudaFree(dev_data);
Timer_memory.stop();
}
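/*
 * Host-side usage sketch (illustrative, assuming config and the timers are set up):
 *   Data_t data(1 << 20);   // size must be a power of two
 *   // ... fill data ...
 *   bitonicSort(data);      // sorts in place, launching one kernel per step
 */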
#elif CODE_VERSION == V1
/*!
* This is the body of each thread. This function compares and exchanges data
*
* @tparam ValueT The underlying data type of the array items
* @param data [ValueT*] Pointer to data array
* @param n [size_t] The total size of the array
* @param step [size_t] The current step of the current stage of bitonic sort
* @param stage [size_t] The current stage of bitonic sort
*/
template <typename ValueT>
__device__ void interBlockStep_(ValueT* data, size_t n, size_t step, size_t stage) {
/*
* Here we skip blocks every time (one for SizeToThreadsRatio = 2)
* And we use the neighbor block address indices for the other half of the threads
*/
threadId_t tid = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
threadId_t pid = partner(tid, step);
if (tid > pid) {
// Shift to the other half of the array for global data
tid += blockDim.x;
pid += blockDim.x;
}
if ((tid < n) && (pid < n)) { // Boundary check
bool keep = keepSmall(tid, pid, stage);
exchange(data, tid, pid, keep);
}
}
/*!
* This is the version of the body that is called outside of the loop unrolling
*
* @tparam ValueT The underlying data type of the array items
* @param data [ValueT*] Pointer to data array
* @param n [size_t] The total size of the array
* @param step [size_t] The current step of the current stage of bitonic sort
* @param stage [size_t] The current stage of bitonic sort
*/
template <typename ValueT>
__global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
interBlockStep_(data, n, step, stage);
}
/*!
* This is the unrolled part of the bitonic double loop.
*
* @tparam ValueT The underlying data type of the array items
* @param data [ValueT*] Pointer to data array
* @param n [size_t] The total size of the array
* @param innerSteps [size_t] The number of inner steps to execute inside the kernel
* @param stage [size_t] The current stage of bitonic sort
*/
template <typename ValueT>
__global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
for (size_t step = innerSteps + 1; step > 0; ) {
--step;
interBlockStep_(data, n, step, stage);
__syncthreads();
}
}
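/*
 * Note (illustrative reasoning): __syncthreads() is sufficient here because for
 * step <= log2(blockDim.x) the partner distance (1 << step) never exceeds
 * blockDim.x, so both sides of every exchange fall inside this block's
 * 2 * blockDim.x slice of the array. E.g. with blockDim.x = 512, steps 0..9
 * reach at most 512 positions away. Larger steps cross block boundaries and
 * go through interBlockStep with a device-wide synchronization instead.
 */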
/*!
* A CUDA version of the Bitonic sort algorithm.
*
* @tparam DataT A container type to hold data array. Should have .data() and .size() methods
* @param data [DataT&] Reference to the container to sort
*/
template <typename DataT>
void bitonicSort(DataT& data) {
using value_t = typename DataT::value_type;
value_t* dev_data;
auto size = data.size();
Timer_memory.start();
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not allocate memory\n");
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
Timer_memory.stop();
size_t Nth = config.blockSize;
size_t Nbl = NBlocks(size);
auto Stages = static_cast<size_t>(log2(size));
auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
Timer_sorting.start();
for (size_t stage = 1; stage <= Stages; ++stage) {
size_t step = stage - 1;
for ( ; step > InnerBlockSteps; --step) {
interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
cudaDeviceSynchronize();
}
inBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
cudaDeviceSynchronize();
}
Timer_sorting.stop();
Timer_memory.start();
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
cudaFree(dev_data);
Timer_memory.stop();
}
#elif CODE_VERSION == V2
/*!
* @return The memory that each block local threads can affect.
*
* @note
* Each block thread collection can exchange twice the size of data points.
*/
inline size_t effectiveBlockSize() { return SizeToThreadsRatio * config.blockSize; }
/*!
* Converts the global address of the data to the local shared memory array which is used
* as cached memory to the unrolled part of the bitonic sort loop.
*
* @note
* Each block's thread collection can exchange twice the size of data points.
* These points get copied (cached) in the shared memory location. We use contiguous blocks
* both in global data memory and the shared memory buffer.
*
* @param gIndex The global array index
* @param blockDim The block size (threads per block)
* @return The equivalent local address of the shared memory
*/
__device__ inline size_t toLocal(size_t gIndex, size_t blockDim) {
return gIndex % (SizeToThreadsRatio * blockDim);
}
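/*
 * Example (illustrative): with blockDim = 512 each block caches 1024 points, so
 *   toLocal(2050, 512) = 2050 % 1024 = 2,
 * i.e. global index 2050 (the third point of block 2) maps to shared_data[2].
 */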
/*!
* This is the version of the body that is called outside of the loop unrolling
*
* @tparam ValueT The underlying data type of the array items
* @param data [ValueT*] Pointer to data array
* @param n [size_t] The total size of the array
* @param step [size_t] The current step of the current stage of bitonic sort
* @param stage [size_t] The current stage of bitonic sort
*/
template <typename ValueT>
__global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x; // Keep contiguous addressing to the first half of the array
threadId_t pid = partner(tid, step);
if (tid > pid) {
// Shift to the other half of the array for global data
tid += n / SizeToThreadsRatio;
pid += n / SizeToThreadsRatio;
}
if ((tid < n) && (pid < n)) { // Boundary check
bool keep = keepSmall(tid, pid, stage);
exchange(data, tid, pid, keep);
}
}
/*!
* This is the unrolled part of the bitonic double loop.
*
* First each thread caches its corresponding data point from the current and the following data block.
* After that we execute the loop unrolling on the local data and then we write back to global memory.
*
* @tparam ValueT The underlying data type of the array items
* @param data [ValueT*] Pointer to data array
* @param n [size_t] The total size of the array
* @param innerSteps [size_t] The number of inner steps to execute inside the kernel
* @param stage [size_t] The current stage of bitonic sort
*/
template <typename ValueT>
__global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
extern __shared__ ValueT shared_data[];
/*
* Global and local(shared) memory indices (calculated once)
* Here we skip blocks every time (one for SizeToThreadsRatio = 2)
* And we cache the neighbor block address indexes in local (shared) memory
*/
threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
threadId_t lIdx0 = toLocal(gIdx0, blockDim.x);
if (gIdx0 + blockDim.x >= n) // Boundary check
return;
// Fetch to local memory the entire effective block size (2 positions for each thread)
shared_data[lIdx0] = data[gIdx0];
shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x];
__syncthreads();
for (size_t step = innerSteps + 1; step > 0; ) {
--step;
// Init thread global and local indices
threadId_t gIdx = gIdx0;
threadId_t lIdx = lIdx0;
// Find partner and keep-small configuration based on the global data positions
threadId_t pIdx = partner(gIdx, step);
if (gIdx > pIdx) {
// Shift inside effective block
gIdx += blockDim.x; // global
pIdx += blockDim.x;
lIdx += blockDim.x; // local
}
bool keep = keepSmall(gIdx, pIdx, stage);
// Exchange data on local(shared) copy
threadId_t lpIdx = toLocal(pIdx, blockDim.x);
exchange(shared_data, lIdx, lpIdx, keep);
__syncthreads();
}
// Write back to global memory
data[gIdx0] = shared_data[lIdx0];
data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
__syncthreads();
}
/*!
* A CUDA version of the Bitonic sort algorithm.
*
* @tparam DataT A container type to hold data array. Should have .data() and .size() methods
* @param data [DataT&] Reference to the container to sort
*/
template <typename DataT>
void bitonicSort(DataT& data) {
using value_t = typename DataT::value_type;
value_t* dev_data;
auto size = data.size();
Timer_memory.start();
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not allocate memory\n");
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
Timer_memory.stop();
size_t Nth = config.blockSize;
size_t Nbl = NBlocks(size);
size_t kernelMemSize = effectiveBlockSize() * sizeof(value_t);
auto Stages = static_cast<size_t>(log2(size));
auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
Timer_sorting.start();
for (size_t stage = 1; stage <= Stages; ++stage) {
size_t step = stage - 1;
for ( ; step > InnerBlockSteps; --step) {
interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
cudaDeviceSynchronize();
}
inBlockStep<<<Nbl, Nth, kernelMemSize>>>(dev_data, size, step, stage);
cudaDeviceSynchronize();
}
Timer_sorting.stop();
Timer_memory.start();
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
cudaFree(dev_data);
Timer_memory.stop();
}
#endif
#endif //BITONICSORTCUDA_H_

homework_3/src/config.h Normal file (+82)

@@ -0,0 +1,82 @@
/*!
* \file
* \brief Build and runtime configuration file.
*
* \author
* Christos Choutouridis AEM:8997
* <cchoutou@ece.auth.gr>
*/
#ifndef CONFIG_H_
#define CONFIG_H_
#include <cstdint>
#include <vector>
#include <cuda_runtime.h>
/*
* Versioning:
* - RC1: First version to test on HPC
*/
static constexpr char version[] = "0.1";
/*
* Defines for different version of the exercise
*/
#define V0 0
#define V1 1
#define V2 2
// Fail-safe version selection
#if !defined CODE_VERSION
#define CODE_VERSION V2
#endif
// Default Data size (in case -q <N> is not present)
static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16;
// Placeholder default (actual default comes from device properties read at initialization)
static constexpr size_t THREADS_PER_BLOCK = 1024;
/*!
* Value and Buffer type selection
*
* We support the following built-in types, or the <cstdint> aliases that translate to them:
* char - unsigned char
* short - unsigned short
* int - unsigned int
* long - unsigned long
* long long - unsigned long long
* float
* double
*/
using Value_t = uint32_t;
using Data_t = std::vector<Value_t>;
/*!
* In theory we can support large arrays ;)
*/
using ArraySize_t = uint64_t;
/*!
* Session option for each invocation of the executable.
*
* @note
* The values of the members are set from the command line.
*/
struct config_t {
ArraySize_t arraySize{DEFAULT_DATA_SIZE}; //!< The array size of the local data to sort.
size_t blockSize{THREADS_PER_BLOCK}; //!< The block size (threads per block) for the session.
bool validation{false}; //!< Request a full validation at the end, performed by process rank 0.
size_t perf{1}; //!< Enable performance timing measurements and prints. Repeat
//!< the sorting <perf> times to do so.
bool verbose{false}; //!< Flag to enable verbose output to stdout.
};
/*
* Exported data types
*/
extern config_t config;
extern cudaDeviceProp device;
#endif /* CONFIG_H_ */

homework_3/src/main.cpp Normal file (+260)

@@ -0,0 +1,260 @@
/*!
* \file
* \brief Main application file for PDS HW3 (CUDA)
*
* \author
* Christos Choutouridis AEM:8997
* <cchoutou@ece.auth.gr>
*/
#include <exception>
#include <iostream>
#include <algorithm>
#include <random>
#include <cuda_runtime.h>
#include "utils.hpp"
#include "config.h"
#include "bitonicsort.hpp"
// Global session data
Data_t Data;
config_t config;
Log logger;
cudaDeviceProp device;
// Mersenne seeded from hw if possible. range: [type_min, type_max]
std::random_device rd;
std::mt19937 gen(rd());
//! Performance timers for each one of the "costly" functions
Timing Timer_total, Timer_memory, Timer_sorting;
//! Init timing objects for extra rounds
void measurements_init() {
if (config.perf > 1) {
Timer_total.init(config.perf);
Timer_memory.init(config.perf);
Timer_sorting.init(config.perf);
}
}
//! Iterate to the next round of measurements for all measurement objects
void measurements_next() {
if (config.perf > 1) {
Timer_total.next();
Timer_memory.next();
Timer_sorting.next();
}
}
/*!
* A small command line argument parser
* \return The status of the operation
*/
bool get_options(int argc, char* argv[]){
bool status = true;
// iterate over the passed arguments
for (int i=1 ; i<argc ; ++i) {
std::string arg(argv[i]); // get current argument
if (arg == "-q" || arg == "--array-size") {
if (i+1 < argc) {
config.arraySize = (ArraySize_t)1 << atoi(argv[++i]);
}
else {
status = false;
}
}
else if (arg == "-b" || arg == "--block-size") {
if (i+1 < argc) {
config.blockSize = atoi(argv[++i]);
}
else {
status = false;
}
}
else if (arg == "--validation") {
config.validation = true;
}
else if (arg == "--perf") {
if (i+1 < argc) {
config.perf = atoi(argv[++i]);
}
else {
status = false;
}
}
else if (arg == "-v" || arg == "--verbose") {
config.verbose = true;
}
else if (arg == "--version") {
std::cout << STR(TARGET) << " - A GPU accelerated bitonic sort utility (V" << STR(CODE_VERSION)<< ") \n";
std::cout << "version: " << version << "\n\n";
exit(0);
}
else if (arg == "-h" || arg == "--help") {
std::cout << STR(TARGET) << " - A GPU accelerated bitonic sort utility (V" << STR(CODE_VERSION)<< ") \n\n";
std::cout << " " << STR(TARGET) << " -q <N> -b <N> [--validation] [--perf <N>] [-v]\n";
std::cout << " " << STR(TARGET) << " -h\n";
std::cout << " " << STR(TARGET) << " --version\n";
std::cout << '\n';
std::cout << "Options:\n\n";
std::cout << " -q | --array-size <N>\n";
std::cout << " Selects the array size according to size = 2^N\n";
std::cout << " [Size must be larger than 2 * blockSize]\n";
std::cout << " [Default is 2^16]\n\n";
std::cout << " -b | --block-size <N>\n";
std::cout << " Selects the number of CUDA threads per block\n";
std::cout << " [Size has to be multiple of device's warp size (usually 32)\n";
std::cout << " [Default is the maximum device supported number. For ex: (GTX 1650) block-size=1024]\n\n";
std::cout << " --validation\n";
std::cout << " Request a full validation at the end\n\n";
std::cout << " --perf <N> \n";
std::cout << " Enable performance timing measurements and prints, and repeat\n";
std::cout << " the sorting <N> times.\n\n";
std::cout << " -v | --verbose\n";
std::cout << " Request a more verbose output to stdout.\n\n";
std::cout << " -h | --help\n";
std::cout << " Prints this and exit.\n\n";
std::cout << " --version\n";
std::cout << " Prints version and exit.\n\n";
std::cout << "Examples:\n\n";
std::cout << " " << STR(TARGET) << " -q 24\n";
std::cout << " Runs bitonic sort on an 2^24 points array, using GPU acceleration\n\n";
std::cout << " " << STR(TARGET) << " --validation --perf 5 -b 512 -q 26\n";
std::cout << " Runs bitonic sort on an 2^26 points array 5 times, using GPU acceleration with\n";
std::cout << " 512 threads per block, performs a validation check at the end and prints the time\n";
std::cout << " of the median.\n\n";
exit(0);
}
else { // parse error
std::cout << "Invocation error. Try -h for details.\n";
status = false;
}
}
// Check configuration requirements
if (config.blockSize % device.warpSize)
throw std::runtime_error("[Config] - Number of threads per block is not an exact multiple of warp size\n");
if (config.arraySize < 2*config.blockSize)
throw std::runtime_error("[Config] - Unsupported array size (smaller than "
+ std::to_string(SizeToThreadsRatio*config.blockSize) + ")\n");
if (device.totalGlobalMem < config.arraySize * sizeof(Value_t))
throw std::runtime_error("[CUDA] - Unsupported array size: "
+ std::to_string(config.arraySize * sizeof(Value_t))
+ " (larger than GPU's: " + std::to_string(device.totalGlobalMem) + ")\n");
return status;
}
/*!
* A simple validator for the entire distributed process
*
* @tparam DataT A buffer type with random access iterator.
*
* @param data [DataT] The data
* @return [bool] True if sorted in ascending order
*/
template<typename DataT>
bool validator(DataT& data) {
return std::is_sorted(data.begin(), data.end());
}
/*!
* Initializes the environment; must be called once at startup
*
* @param argc [int*] POINTER to main's argc argument
* @param argv [char***] POINTER to main's argv argument
*/
void init(int* argc, char*** argv) {
// Get device configuration
if (cudaGetDeviceProperties(&device, 0) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not read GPU");
config.blockSize = static_cast<size_t>(device.maxThreadsPerBlock);
// try to read command line
if (!get_options(*argc, *argv))
exit(1);
// Prepare vector and timing data
Data.resize(config.arraySize);
measurements_init();
}
#if !defined TESTING
/*!
* @return Returns 0, but note that we may also throw or call exit(0) / exit(1)
*/
int main(int argc, char* argv[]) try {
// Init everything
init(&argc, &argv);
logger << "Array size: " << config.arraySize << " (Q=" << static_cast<size_t>(log2(config.arraySize))<< ")" << logger.endl;
logger << "Repeated sorts: " << config.perf << logger.endl;
logger << "GPU: " << device.name << logger.endl;
logger << "Block size: " << config.blockSize << logger.endl;
for (size_t it = 0 ; it < config.perf ; ++it) {
// Initialize local data
logger << "Initialize array ... ";
std::uniform_int_distribution<Value_t > dis(
std::numeric_limits<Value_t>::min(),
std::numeric_limits<Value_t>::max()
);
std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); });
logger << " Done." << logger.endl;
// Run distributed sort
logger << "Start sorting ... ";
Timer_total.start();
bitonicSort(Data);
Timer_total.stop();
measurements_next();
logger << " Done." << logger.endl;
}
// Print-outs and validation
if (config.perf > 1) {
Timing::print_duration(Timer_total.median(), "Total ");
Timing::print_duration(Timer_memory.median(), "Mem-xch ");
Timing::print_duration(Timer_sorting.median(),"Sorting ");
}
if (config.validation) {
// If requested, we have the chance to fail!
std::cout << "[Validation] Results validation ...";
bool val = validator(Data);
std::cout << ((val) ? "\x1B[32m [PASSED] \x1B[0m\n" : " \x1B[31m [FAILED] \x1B[0m\n");
}
return 0;
}
catch (std::exception& e) {
// We may pollute the user's screen here. Comment out the `cerr << ...` line if you don't like it.
std::cerr << "Error: " << e.what() << '\n';
exit(1);
}
#else
#include <gtest/gtest.h>
#include <exception>
/*!
* The testing version of our program
*/
GTEST_API_ int main(int argc, char **argv) try {
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
catch (std::exception& e) {
std::cout << "Exception: " << e.what() << '\n';
}
#endif

homework_3/src/utils.hpp Normal file (+157)

@@ -0,0 +1,157 @@
/**
* \file
* \brief Utilities header
*
* \author
* Christos Choutouridis AEM:8997
* <cchoutou@ece.auth.gr>
*/
#ifndef UTILS_HPP_
#define UTILS_HPP_
#include <vector>
#include <iostream>
#include <chrono>
#include <unistd.h>
#include <algorithm>
#include "config.h"
/*!
* Stringify preprocessor util
*/
#define STR(s) STR_(s)
#define STR_(s) #s
/*!
* A Logger for entire program.
*/
struct Log {
struct Endl {} endl; //!< a tag object used to request a new line.
//! We provide logging via << operator
template<typename T>
Log &operator<<(T &&t) {
if (config.verbose) {
if (line_) {
std::cout << "[Log]: " << t;
line_ = false;
} else
std::cout << t;
}
return *this;
}
// overload for special end line handling
Log &operator<<(Endl e) {
(void) e;
if (config.verbose) {
std::cout << '\n';
line_ = true;
}
return *this;
}
private:
bool line_{true};
};
extern Log logger;
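/*
 * Example usage (illustrative): output appears only when config.verbose is set.
 *   logger << "Block size: " << config.blockSize << logger.endl;
 */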
/*!
* A small timing utility based on chrono that supports timing rounds
* and returning the median of them. Time can accumulate to the measurement
* for each round.
*/
struct Timing {
using Tpoint = std::chrono::steady_clock::time_point;
using Tduration = std::chrono::microseconds;
using microseconds = std::chrono::microseconds;
using milliseconds = std::chrono::milliseconds;
using seconds = std::chrono::seconds;
//! Setup measurement rounds
void init(size_t rounds) {
duration_.resize(rounds);
for (auto& d : duration_)
d = Tduration::zero();
}
//! tool to mark the starting point
Tpoint start() noexcept { return mark_ = std::chrono::steady_clock::now(); }
//! tool to mark the ending point
Tpoint stop() noexcept {
Tpoint now = std::chrono::steady_clock::now();
duration_[current_] += dt(now, mark_);
return now;
}
//! Switch timing slot
void next() noexcept {
++current_;
current_ %= duration_.size();
}
Tduration& median() noexcept {
std::sort(duration_.begin(), duration_.end());
return duration_[duration_.size()/2];
}
//! A duration calculation utility
static Tduration dt(Tpoint t2, Tpoint t1) noexcept {
return std::chrono::duration_cast<Tduration>(t2 - t1);
}
//! Tool to print the time interval
static void print_duration(const Tduration& duration, const char *what) noexcept {
if (std::chrono::duration_cast<microseconds>(duration).count() < 10000)
std::cout << "[Timing] " << what << ": "
<< std::to_string(std::chrono::duration_cast<microseconds>(duration).count()) << " [usec]\n";
else if (std::chrono::duration_cast<milliseconds>(duration).count() < 10000)
std::cout << "[Timing] " << what << ": "
<< std::to_string(std::chrono::duration_cast<milliseconds>(duration).count()) << " [msec]\n";
else {
char stime[26]; // fit ulong
auto sec = std::chrono::duration_cast<seconds>(duration).count();
auto msec = (std::chrono::duration_cast<milliseconds>(duration).count() % 1000) / 10; // keep 2 digits
std::sprintf(stime, "%ld.%02ld", sec, msec);
std::cout << "[Timing] " << what << ": " << stime << " [sec]\n";
}
}
private:
size_t current_{0};
Tpoint mark_{};
std::vector<Tduration> duration_{1};
};
/*!
* A "high level function"-like utility macro to forward a function call
* and accumulate the execution time to the corresponding timing object.
*
* @param Tim The Timing object [Needs to have methods start() and stop()]
* @param Func The function name
* @param ... The arguments to pass to function (the preprocessor way)
*/
#define timeCall(Tim, Func, ...) \
Tim.start(); \
Func(__VA_ARGS__); \
Tim.stop();
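// Example usage (illustrative): accumulate a sort's runtime into Timer_sorting:
//   timeCall(Timer_sorting, bitonicSort, Data);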
/*!
* A utility to check if a number is power of two
*
* @tparam Integral The integral type of the number to check
* @param x The number to check
* @return True if it is power of 2, false otherwise
*/
template <typename Integral>
constexpr inline bool isPowerOfTwo(Integral x) noexcept {
return (!(x & (x - 1)) && x);
}
#endif /* UTILS_HPP_ */

File diff suppressed because it is too large

File diff suppressed because it is too large

homework_3/test/tests.cpp Normal file (+33)

@@ -0,0 +1,33 @@
/**
* \file
* \brief PDS HW3 tests
*
* To run these tests execute:
* ...
*
* \author
* Christos Choutouridis AEM:8997
* <cchoutou@ece.auth.gr>
*/
#include <gtest/gtest.h>
/*
* Global fixtures
*/
class TCUDAbitonic : public ::testing::Test {
protected:
static void SetUpTestSuite() { }
static void TearDownTestSuite() { }
};
/*
*
*/
TEST_F(TCUDAbitonic, test1) {
EXPECT_EQ(true, true);
}