Compare commits


8 Commits

12 changed files with 30108 additions and 0 deletions

homework_3/.gitignore vendored Normal file (+23)

@@ -0,0 +1,23 @@
# project
bin/
out/
mat/
mtx/
.unused/
various/
# hpc
# IDEs
.idea/
.clangd
# eclipse
.project
.cproject
.settings/
.vs/
.vscode/

homework_3/Makefile Normal file (+236)

@@ -0,0 +1,236 @@
#
# PDS HW3 Makefile
#
# Copyright (C) 2025 Christos Choutouridis <christos@choutouridis.net>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# ============== Project settings ==============
# Project's name
PROJECT := PDS_homework_3
# Executable's name
TARGET := bitonicCUDA
# Source directories list (space separated). Makefile-relative path, UNDER current directory.
SRC_DIR_LIST := src #test test/gtest
# Include directories list (space separated). Makefile-relative path.
INC_DIR_LIST := src
# test \
# test/gtest/ \
# Exclude files list (space separated). Filenames only.
# EXC_FILE_LIST := bad.cpp old.cpp
# Build directories
BUILD_DIR := bin
OBJ_DIR := $(BUILD_DIR)/obj
DEP_DIR := $(BUILD_DIR)/.dep
OUTPUT_DIR := out
# ========== Compiler settings ==========
# Compiler flags for debug and release
DEB_CFLAGS := -DDEBUG -std=c11 -Xcompiler "-Wall -Wextra -g -DDEBUG"
REL_CFLAGS := -O3 -std=c11 -Xcompiler "-Wall -Wextra"
DEB_CXXFLAGS := -DDEBUG -std=c++17 -Xcompiler "-Wall -Wextra -g -DDEBUG"
REL_CXXFLAGS := -O3 -std=c++17 -Xcompiler "-Wall -Wextra"
# Pre-defines
# PRE_DEFS := MYCAB=1729 SUPER_MODE
PRE_DEFS := TARGET=$(TARGET)
# ============== Linker settings ==============
# Linker flags (example: -pthread -lm)
LDFLAGS :=
# Map output file
MAP_FILE := # output.map
MAP_FLAG := # -Xlinker -Map=$(BUILD_DIR)/$(MAP_FILE)
# ============== Docker settings ==============
# We need:
# - Bind the entire project directory (the dir that includes all the code) as volume.
# - In docker instance, change to working directory (where the makefile is).
DOCKER_VOL_DIR := $(shell pwd)
DOCKER_WRK_DIR :=
DOCKER_RUN := docker run --rm
DOCKER_FLAGS := -v $(DOCKER_VOL_DIR):/usr/src/$(PROJECT) -w /usr/src/$(PROJECT)/$(DOCKER_WRK_DIR)
# docker invoke mechanism (edit with care)
# note:
# Here, the `DOCKER` variable is empty. Rules can assign `DOCKER := $(DOCKER_CMD)` when docker
# functionality is needed.
DOCKER_CMD = $(DOCKER_RUN) $(DOCKER_FLAGS) $(IMAGE)
DOCKER :=
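# Example (illustrative): a rule can opt in to the docker toolchain like this:
#   cuda-docker-build: IMAGE := nvidia/cuda:11.1.1-devel-ubuntu20.04  # hypothetical image tag
#   cuda-docker-build: DOCKER := $(DOCKER_CMD)
#   cuda-docker-build: bitonic_v2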
# ============== Tool selection ==============
# compiler and compiler flags.
CSIZE := size
CFLAGS := $(DEB_CFLAGS)
CXXFLAGS := $(DEB_CXXFLAGS)
CXX := g++
CC := gcc
LINKER := g++
#
# =========== Main body and Patterns ===========
#
INC := $(foreach dir,$(INC_DIR_LIST),-I$(dir))
DEF := $(foreach def,$(PRE_DEFS),-D$(def))
EXC := $(foreach fil,$(EXC_FILE_LIST), \
$(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/$(fil))) \
)
# source files. object and dependencies list
# recursive search into current and source directories
SRC := $(wildcard *.cpp)
SRC += $(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/*.cpp))
SRC += $(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/**/*.cpp))
SRC := $(filter-out $(EXC),$(SRC))
#SRC := $(abspath $(SRC))
OBJ := $(foreach file,$(SRC:%.cpp=%.o),$(OBJ_DIR)/$(file))
DEP := $(foreach file,$(SRC:%.cpp=%.d),$(DEP_DIR)/$(file))
# c file objects depend on .c AND dependency files, which have an empty recipe
$(OBJ_DIR)/%.o: %.c
@mkdir -p $(@D)
$(DOCKER) $(CC) -c $(CFLAGS) $(INC) $(DEF) -o $@ $<
# cpp file objects depend on .cpp AND dependency files, which have an empty recipe
$(OBJ_DIR)/%.o: %.cpp
@mkdir -p $(@D)
$(DOCKER) $(CXX) -c $(CXXFLAGS) $(INC) $(DEF) -o $@ $<
# main target rule
$(BUILD_DIR)/$(TARGET): $(OBJ)
@mkdir -p $(@D)
@echo Linking to target: $(TARGET)
@echo $(DOCKER) $(LINKER) '$$(OBJ)' $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET)
@$(DOCKER) $(LINKER) $(OBJ) $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET)
@echo
@echo Print size information
@$(CSIZE) $(@D)/$(TARGET)
@echo Done
#
# ================ Default local build rules =================
# example:
# make debug
.DEFAULT_GOAL := all
.PHONY: clean
clean:
@echo Cleaning build directories
@rm -rf $(OBJ_DIR)
@rm -rf $(DEP_DIR)
@rm -rf $(BUILD_DIR)
debug: CFLAGS := $(DEB_CFLAGS)
debug: CXXFLAGS := $(DEB_CXXFLAGS)
debug: $(BUILD_DIR)/$(TARGET)
release: CFLAGS := $(REL_CFLAGS)
release: CXXFLAGS := $(REL_CXXFLAGS)
release: $(BUILD_DIR)/$(TARGET)
#
# ================ Build rules =================
#
bitonic_v0deb: CC := nvcc -G -g -x cu
bitonic_v0deb: CXX := nvcc -G -g -x cu
bitonic_v0deb: LINKER := nvcc
bitonic_v0deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V0
bitonic_v0deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V0
bitonic_v0deb: OUTPUT_DIR := $(OUTPUT_DIR)/v0
bitonic_v0deb: $(BUILD_DIR)/$(TARGET)
@mkdir -p $(OUTPUT_DIR)
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
bitonic_v1deb: CC := nvcc -G -g -x cu
bitonic_v1deb: CXX := nvcc -G -g -x cu
bitonic_v1deb: LINKER := nvcc
bitonic_v1deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V1
bitonic_v1deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V1
bitonic_v1deb: OUTPUT_DIR := $(OUTPUT_DIR)/v1
bitonic_v1deb: $(BUILD_DIR)/$(TARGET)
@mkdir -p $(OUTPUT_DIR)
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
bitonic_v2deb: CC := nvcc -G -g -x cu
bitonic_v2deb: CXX := nvcc -G -g -x cu
bitonic_v2deb: LINKER := nvcc
bitonic_v2deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V2
bitonic_v2deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V2
bitonic_v2deb: OUTPUT_DIR := $(OUTPUT_DIR)/v2
bitonic_v2deb: $(BUILD_DIR)/$(TARGET)
@mkdir -p $(OUTPUT_DIR)
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
bitonic_v0: CC := nvcc -x cu
bitonic_v0: CXX := nvcc -x cu
bitonic_v0: LINKER := nvcc
bitonic_v0: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V0
bitonic_v0: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V0
bitonic_v0: OUTPUT_DIR := $(OUTPUT_DIR)/v0
bitonic_v0: $(BUILD_DIR)/$(TARGET)
@mkdir -p $(OUTPUT_DIR)
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
bitonic_v1: CC := nvcc -x cu
bitonic_v1: CXX := nvcc -x cu
bitonic_v1: LINKER := nvcc
bitonic_v1: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V1
bitonic_v1: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V1
bitonic_v1: OUTPUT_DIR := $(OUTPUT_DIR)/v1
bitonic_v1: $(BUILD_DIR)/$(TARGET)
@mkdir -p $(OUTPUT_DIR)
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
bitonic_v2: CC := nvcc -x cu
bitonic_v2: CXX := nvcc -x cu
bitonic_v2: LINKER := nvcc
bitonic_v2: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V2
bitonic_v2: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V2
bitonic_v2: OUTPUT_DIR := $(OUTPUT_DIR)/v2
bitonic_v2: $(BUILD_DIR)/$(TARGET)
@mkdir -p $(OUTPUT_DIR)
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
hpc-build:
make clean
make bitonic_v0
make clean
make bitonic_v1
make clean
make bitonic_v2
all: debug bitonic_v0
# Note:
# Add a gcc based make rule here in order for clangd to successfully scan the project files.
# Otherwise we do not need the gcc build.

homework_3/exersize.md Normal file (+33)

@@ -0,0 +1,33 @@
Parallel & Distributed Computer Systems HW3
January, 2025
Write a program that sorts $N$ integers in ascending order, using CUDA.
The program must perform the following tasks:
- The user specifies a positive integer $q$.
- Start a process with an array of $N = 2^q$ random integers.
- Sort all $N$ elements in ascending order.
- Check the correctness of the final result.
Your implementation should be based on the following steps (a serial reference sketch follows the list):
V0. A kernel where each thread only compares and exchanges. This "eliminates" the 1:n innermost loop. Easy to write, but too many function calls and global synchronizations.
V1. Include the k inner loop in the kernel function. How do we handle the synchronization? Fewer calls, fewer global synchronizations. Faster than V0!
V2. Modify the kernel of V1 to work with local memory instead of global.
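For reference, a minimal serial sketch of the stage/step double loop that V0-V2 progressively move onto the GPU (illustrative; the names are ours, not part of the assignment, and `n` is assumed to be a power of two):

```cpp
#include <cstddef>
#include <utility>

// Serial bitonic sort over n = 2^q elements (reference sketch).
// The two outer loops are the "stages" and "steps" of the network;
// the innermost 0..n loop is what V0 replaces with one GPU thread
// per compare-and-exchange.
void bitonicSortSerial(int* a, std::size_t n) {
    for (std::size_t k = 2; k <= n; k <<= 1)            // stage: size of bitonic runs
        for (std::size_t j = k >> 1; j > 0; j >>= 1)    // step: compare distance
            for (std::size_t i = 0; i < n; ++i) {       // innermost loop over elements
                std::size_t p = i ^ j;                  // partner index
                if (p > i) {
                    bool asc = ((i & k) == 0);          // direction of this run
                    if ((asc && a[i] > a[p]) || (!asc && a[i] < a[p]))
                        std::swap(a[i], a[p]);
                }
            }
}
```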
You must deliver:
- A report (about $3-4$ pages) that describes your parallel algorithm and implementation.
- Your comments on the speed of your parallel program compared to the serial sort, after trying your program on aristotelis for $q = [20:27]$.
- The source code of your program uploaded online.
Ethics: If you use code found on the web or by an LLM, you should mention your source and the changes you made. You may work in pairs; both partners must submit a single report with both names.
Deadline: 2 February 2025.


@@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Parameters
versions=("v0" "v1" "v2")
q_values=(20 21 22 23 24 25 26 27 28 29 30)
# Make scripts
for version in "${versions[@]}"; do
for q in "${q_values[@]}"; do
filename="Bitnc${version^^}Q${q}.sh" # Convert v0 -> V0 etc...
cat > "$filename" <<EOL
#! /usr/bin/env bash
#SBATCH --job-name=Bitnc${version^^}Q${q}
#SBATCH --nodes=1
#SBATCH --gres=gpu:1
#SBATCH --time=10:00
module load gcc/9.2.0 cuda/11.1.0
./out/${version}/bitonicCUDA -v --validation --perf 7 -b 512 -q ${q}
EOL
echo "Create: $filename"
done
done
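# Example (illustrative): version=v1, q=24 writes BitncV1Q24.sh, which requests
# one GPU node and runs ./out/v1/bitonicCUDA -v --validation --perf 7 -b 512 -q 24.
# submitJobs.sh submits the generated scripts with sbatch afterwards.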

homework_3/hpc/submitJobs.sh Executable file (+26)

@@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Submission parameters
QOS="small"
PARTITION="ampere"
SCRIPT_DIR="hpc" # Directory containing the job scripts
# Range of values for the -q parameter
VERSIONS=("V0" "V1" "V2")
Q_START=20
Q_END=30
# Submitting the jobs
for version in "${VERSIONS[@]}"; do
for ((q = Q_START; q <= Q_END; q++)); do
script_name="Bitnc${version}Q${q}.sh"
script_path="${SCRIPT_DIR}/${script_name}"
if [[ -f "$script_path" ]]; then
sbatch --qos="$QOS" -p "$PARTITION" "$script_path"
echo "Submitted: $script_path"
else
echo "Warning: File not found - $script_path"
fi
done
done
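# Usage sketch (assuming the generated Bitnc*.sh scripts already live in ./hpc):
#   ./hpc/submitJobs.sh        # run from the project root so SCRIPT_DIR resolves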

homework_3/src/bitonicsort.hpp Normal file (+456)

@@ -0,0 +1,456 @@
/*!
* \file
* \brief Bitonic sort CUDA implementation header
*
* \author
* Christos Choutouridis AEM:8997
* <cchoutou@ece.auth.gr>
*/
#ifndef BITONICSORTCUDA_H_
#define BITONICSORTCUDA_H_
#include <cuda_runtime.h>
#include <vector>
#include <cmath>
#include <cstdint>
#include <utility>
#include "utils.hpp"
/*
* Exported timers
*/
extern Timing Timer_total, Timer_memory, Timer_sorting;
using threadId_t = size_t;
/*
* ============================== Sort utilities ==============================
*/
/*!
* Returns the ascending or descending configuration (up/down phase) of the thread id
* depending on the current depth
*
* @param tid [threadId_t] The current thread
* @param stage [size_t] The current stage of the sorting network (same for each step)
* @return [bool] True if we need ascending configuration, false otherwise
*/
__device__ inline bool ascending(threadId_t tid, size_t stage) noexcept {
return !(tid & (1 << stage));
}
/*!
* Returns the thread's partner for data exchange during the sorting network iterations
* of Bitonic
*
* @param tid [threadId_t] The current node
* @param step [size_t] The step of the sorting network
* @return [threadId_t] The node id of the partner for data exchange
*/
__device__ inline threadId_t partner(threadId_t tid, size_t step) noexcept {
return (tid ^ (1 << step));
}
/*!
* Predicate to check if a node keeps the small numbers during the bitonic sort network exchange.
*
* @param tid [threadId_t] The node for which we check
* @param partner [threadId_t] The partner of the data exchange
* @param stage [size_t] The current stage of the sorting network (same for each step)
* @return [bool] True if the node should keep the small values, false otherwise
*/
__device__ inline bool keepSmall(threadId_t tid, threadId_t partner, size_t stage) {
return ascending(tid, stage) == (tid < partner);
}
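/*
 * Worked example (illustrative): for stage = 2, step = 1 and tid = 5 (0b101):
 *   ascending(5, 2) = !(0b101 & 0b100) = false   -> descending run
 *   partner(5, 1)   =  0b101 ^ 0b010   = 7
 *   keepSmall(5, 7, 2) = (false == (5 < 7)) = false
 * so thread 5 keeps the larger of the two values it compares with thread 7.
 */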
/*
* ============================== Sort algorithms ==============================
*/
/*!
* Each thread can handle 2 points in the array. For each of these 2 points it may
* - compare and exchange if needed
* - copy data to local and back if needed
*/
static constexpr size_t SizeToThreadsRatio = 2;
/*!
* Calculates the blocks needed for the entire sorting process
*
* @note
* This "redundant" little trick makes sure blocks are allocated for array sizes that are not exact
* multiples of config.blockSize.
* Even if we don't need it, we keep it in case we experiment with weird sizes in the future!
*
* @param arraySize [ArraySize_t] The size of the entire array (in points)
* @return [size_t] The number of blocks
*/
inline size_t NBlocks(ArraySize_t arraySize) {
return (((arraySize + config.blockSize - 1) / config.blockSize) / SizeToThreadsRatio);
}
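/*
 * Example (illustrative): for arraySize = 2^20 and config.blockSize = 512:
 *   ceil(2^20 / 512) / 2 = 2048 / 2 = 1024 blocks,
 * i.e. 1024 * 512 = 2^19 threads, one for every 2 of the 2^20 points.
 */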
/*!
* Exchange utility
*
* @tparam ValueT The underlying data type of the array items
*
* @param data [ValueT*] Pointer to data array
* @param tid [threadId_t] Current thread's index to data
* @param partner [threadId_t] Partner's index to data
* @param keepSmall [bool] Flag to indicate if the current thread keeps the small values
*/
template <typename ValueT>
__device__ void exchange(ValueT* data, threadId_t tid, threadId_t partner, bool keepSmall) {
if (( keepSmall && (data[tid] > data[partner])) ||
(!keepSmall && (data[tid] < data[partner])) ) {
ValueT temp = data[tid];
data[tid] = data[partner];
data[partner] = temp;
}
}
#if CODE_VERSION == V0
/*!
* This is the body of each thread. This function compares and exchanges data
*
* @tparam ValueT The underlying data type of the array items
* @param data [ValueT*] Pointer to data array
* @param n [size_t] The total size of the array
* @param step [size_t] The current step of the current stage of bitonic sort
* @param stage [size_t] The current stage of bitonic sort
*/
template <typename ValueT>
__global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) {
threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x; // Keep contiguous addressing to the first half of the array
threadId_t pid = partner(tid, step);
if (tid > pid) {
// Shift to the other half of the array for global data
tid += n / SizeToThreadsRatio;
pid += n / SizeToThreadsRatio;
}
if ((tid < n) && (pid < n)) { // Boundary check
bool keep = keepSmall(tid, pid, stage);
exchange(data, tid, pid, keep);
}
}
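/*
 * Index-shift example (illustrative): n = 8, step = 0, so 4 threads cover 8 points:
 *   tid 0 -> pid 1                        handles pair (0, 1)
 *   tid 1 -> pid 0 (tid > pid, shift +4)  handles pair (5, 4)
 *   tid 2 -> pid 3                        handles pair (2, 3)
 *   tid 3 -> pid 2 (tid > pid, shift +4)  handles pair (7, 6)
 * Each pair is compared exactly once using only n / SizeToThreadsRatio threads.
 */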
/*!
* A CUDA version of the Bitonic sort algorithm.
*
* @tparam DataT A container type to hold data array. Should have .data() and .size() methods
* @param data [DataT&] Reference to the container to sort
*/
template <typename DataT>
void bitonicSort(DataT& data) {
using value_t = typename DataT::value_type;
value_t* dev_data;
auto size = data.size();
Timer_memory.start();
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not allocate memory\n");
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
Timer_memory.stop();
size_t Nth = config.blockSize;
size_t Nbl = NBlocks(size);
size_t Stages = static_cast<size_t>(log2(size));
Timer_sorting.start();
for (size_t stage = 1; stage <= Stages; ++stage) {
for (size_t step = stage; step > 0; ) {
--step;
bitonicStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
cudaDeviceSynchronize();
}
}
Timer_sorting.stop();
Timer_memory.start();
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
cudaFree(dev_data);
Timer_memory.stop();
}
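/*
 * Host-side usage sketch (illustrative, assuming config and the timers are set up):
 *   Data_t data(1 << 20);   // size must be a power of two
 *   // ... fill data ...
 *   bitonicSort(data);      // sorts in place, launching one kernel per step
 */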
#elif CODE_VERSION == V1
/*!
* This is the body of each thread. This function compares and exchanges data
*
* @tparam ValueT The underlying data type of the array items
* @param data [ValueT*] Pointer to data array
* @param n [size_t] The total size of the array
* @param step [size_t] The current step of the current stage of bitonic sort
* @param stage [size_t] The current stage of bitonic sort
*/
template <typename ValueT>
__device__ void interBlockStep_(ValueT* data, size_t n, size_t step, size_t stage) {
/*
* Here we skip blocks every time (one for SizeToThreadsRatio = 2)
* And we use the neighbor block address indices for the other half of the threads
*/
threadId_t tid = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
threadId_t pid = partner(tid, step);
if (tid > pid) {
// Shift to the other half of the array for global data
tid += blockDim.x;
pid += blockDim.x;
}
if ((tid < n) && (pid < n)) { // Boundary check
bool keep = keepSmall(tid, pid, stage);
exchange(data, tid, pid, keep);
}
}
/*!
* This is the version of the body that is called outside of the loop unrolling
*
* @tparam ValueT The underlying data type of the array items
* @param data [ValueT*] Pointer to data array
* @param n [size_t] The total size of the array
* @param step [size_t] The current step of the current stage of bitonic sort
* @param stage [size_t] The current stage of bitonic sort
*/
template <typename ValueT>
__global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
interBlockStep_(data, n, step, stage);
}
/*!
* This is the unrolled part of the bitonic double loop.
*
* @tparam ValueT The underlying data type of the array items
* @param data [ValueT*] Pointer to data array
* @param n [size_t] The total size of the array
* @param innerSteps [size_t] The number of inner steps to execute inside the kernel
* @param stage [size_t] The current stage of bitonic sort
*/
template <typename ValueT>
__global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
for (size_t step = innerSteps + 1; step > 0; ) {
--step;
interBlockStep_(data, n, step, stage);
__syncthreads();
}
}
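/*
 * Note (illustrative reasoning): __syncthreads() is sufficient here because for
 * step <= log2(blockDim.x) the partner distance (1 << step) never exceeds
 * blockDim.x, so both sides of every exchange fall inside this block's
 * 2 * blockDim.x slice of the array. E.g. with blockDim.x = 512, steps 0..9
 * reach at most 512 positions away. Larger steps cross block boundaries and
 * go through interBlockStep with a device-wide synchronization instead.
 */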
/*!
* A CUDA version of the Bitonic sort algorithm.
*
* @tparam DataT A container type to hold data array. Should have .data() and .size() methods
* @param data [DataT&] Reference to the container to sort
*/
template <typename DataT>
void bitonicSort(DataT& data) {
using value_t = typename DataT::value_type;
value_t* dev_data;
auto size = data.size();
Timer_memory.start();
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not allocate memory\n");
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
Timer_memory.stop();
size_t Nth = config.blockSize;
size_t Nbl = NBlocks(size);
auto Stages = static_cast<size_t>(log2(size));
auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
Timer_sorting.start();
for (size_t stage = 1; stage <= Stages; ++stage) {
size_t step = stage - 1;
for ( ; step > InnerBlockSteps; --step) {
interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
cudaDeviceSynchronize();
}
inBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
cudaDeviceSynchronize();
}
Timer_sorting.stop();
Timer_memory.start();
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
cudaFree(dev_data);
Timer_memory.stop();
}
#elif CODE_VERSION == V2
/*!
* @return The memory that each block local threads can affect.
*
* @note
* Each block thread collection can exchange twice the size of data points.
*/
inline size_t effectiveBlockSize() { return SizeToThreadsRatio * config.blockSize; }
/*!
* Converts the global address of the data to the local shared memory array which is used
* as cached memory to the unrolled part of the bitonic sort loop.
*
* @note
* Each block's thread collection can exchange twice the size of data points.
* These points get copied (cached) in the shared memory location. We use contiguous blocks
* both in global data memory and the shared memory buffer.
*
* @param gIndex The global array index
* @param blockDim The block size (threads per block)
* @return The equivalent local address of the shared memory
*/
__device__ inline size_t toLocal(size_t gIndex, size_t blockDim) {
return gIndex % (SizeToThreadsRatio * blockDim);
}
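/*
 * Example (illustrative): with blockDim = 512 each block caches 1024 points, so
 *   toLocal(2050, 512) = 2050 % 1024 = 2,
 * i.e. global index 2050 (the third point of block 2) maps to shared_data[2].
 */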
/*!
* This is the version of the body that is called outside of the loop unrolling
*
* @tparam ValueT The underlying data type of the array items
* @param data [ValueT*] Pointer to data array
* @param n [size_t] The total size of the array
* @param step [size_t] The current step of the current stage of bitonic sort
* @param stage [size_t] The current stage of bitonic sort
*/
template <typename ValueT>
__global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x; // Keep contiguous addressing to the first half of the array
threadId_t pid = partner(tid, step);
if (tid > pid) {
// Shift to the other half of the array for global data
tid += n / SizeToThreadsRatio;
pid += n / SizeToThreadsRatio;
}
if ((tid < n) && (pid < n)) { // Boundary check
bool keep = keepSmall(tid, pid, stage);
exchange(data, tid, pid, keep);
}
}
/*!
* This is the unrolled part of the bitonic double loop.
*
* First each thread caches its corresponding data point from the current and the following data block.
* After that we execute the loop unrolling on the local data and then we write back to global memory.
*
* @tparam ValueT The underlying data type of the array items
* @param data [ValueT*] Pointer to data array
* @param n [size_t] The total size of the array
* @param innerSteps [size_t] The number of inner steps to execute inside the kernel
* @param stage [size_t] The current stage of bitonic sort
*/
template <typename ValueT>
__global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
extern __shared__ ValueT shared_data[];
/*
* Global and local(shared) memory indices (calculated once)
* Here we skip blocks every time (one for SizeToThreadsRatio = 2)
* And we cache the neighbor block address indexes in local (shared) memory
*/
threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
threadId_t lIdx0 = toLocal(gIdx0, blockDim.x);
if (gIdx0 + blockDim.x >= n) // Boundary check
return;
// Fetch to local memory the entire effective block size (2 positions for each thread)
shared_data[lIdx0] = data[gIdx0];
shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x];
__syncthreads();
for (size_t step = innerSteps + 1; step > 0; ) {
--step;
// Init thread global and local indices
threadId_t gIdx = gIdx0;
threadId_t lIdx = lIdx0;
// Find partner and keep-small configuration based on the global data positions
threadId_t pIdx = partner(gIdx, step);
if (gIdx > pIdx) {
// Shift inside effective block
gIdx += blockDim.x; // global
pIdx += blockDim.x;
lIdx += blockDim.x; // local
}
bool keep = keepSmall(gIdx, pIdx, stage);
// Exchange data on local(shared) copy
threadId_t lpIdx = toLocal(pIdx, blockDim.x);
exchange(shared_data, lIdx, lpIdx, keep);
__syncthreads();
}
// Write back to global memory
data[gIdx0] = shared_data[lIdx0];
data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
__syncthreads();
}
/*!
* A CUDA version of the Bitonic sort algorithm.
*
* @tparam DataT A container type to hold data array. Should have .data() and .size() methods
* @param data [DataT&] Reference to the container to sort
*/
template <typename DataT>
void bitonicSort(DataT& data) {
using value_t = typename DataT::value_type;
value_t* dev_data;
auto size = data.size();
Timer_memory.start();
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not allocate memory\n");
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
Timer_memory.stop();
size_t Nth = config.blockSize;
size_t Nbl = NBlocks(size);
size_t kernelMemSize = effectiveBlockSize() * sizeof(value_t);
auto Stages = static_cast<size_t>(log2(size));
auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
Timer_sorting.start();
for (size_t stage = 1; stage <= Stages; ++stage) {
size_t step = stage - 1;
for ( ; step > InnerBlockSteps; --step) {
interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
cudaDeviceSynchronize();
}
inBlockStep<<<Nbl, Nth, kernelMemSize>>>(dev_data, size, step, stage);
cudaDeviceSynchronize();
}
Timer_sorting.stop();
Timer_memory.start();
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
cudaFree(dev_data);
Timer_memory.stop();
}
#endif
#endif //BITONICSORTCUDA_H_

homework_3/src/config.h Normal file (+82)

@@ -0,0 +1,82 @@
/*!
* \file
* \brief Build and runtime configuration file.
*
* \author
* Christos Choutouridis AEM:8997
* <cchoutou@ece.auth.gr>
*/
#ifndef CONFIG_H_
#define CONFIG_H_
#include <cstdint>
#include <vector>
#include <cuda_runtime.h>
/*
* Versioning:
* - RC1: First version to test on HPC
*/
static constexpr char version[] = "0.1";
/*
* Defines for different version of the exercise
*/
#define V0 0
#define V1 1
#define V2 2
// Fail-safe version selection
#if !defined CODE_VERSION
#define CODE_VERSION V2
#endif
// Default Data size (in case -q <N> is not present)
static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16;
// Placeholder default (actual default comes from device properties read at initialization)
static constexpr size_t THREADS_PER_BLOCK = 1024;
/*!
* Value and Buffer type selection
*
* We support the following built-in types, or the <cstdint> aliases that translate to them:
* char - unsigned char
* short - unsigned short
* int - unsigned int
* long - unsigned long
* long long - unsigned long long
* float
* double
*/
using Value_t = uint32_t;
using Data_t = std::vector<Value_t>;
/*!
* In theory we can support large arrays ;)
*/
using ArraySize_t = uint64_t;
/*!
* Session option for each invocation of the executable.
*
* @note
* The values of the members are set from the command line.
*/
struct config_t {
ArraySize_t arraySize{DEFAULT_DATA_SIZE}; //!< The array size of the local data to sort.
size_t blockSize{THREADS_PER_BLOCK}; //!< The block size (threads per block) for the session.
bool validation{false}; //!< Request a full validation at the end, performed by process rank 0.
size_t perf{1}; //!< Enable performance timing measurements and prints. Repeat
//!< the sorting <perf> times to do so.
bool verbose{false}; //!< Flag to enable verbose output to stdout.
};
/*
* Exported data types
*/
extern config_t config;
extern cudaDeviceProp device;
#endif /* CONFIG_H_ */

homework_3/src/main.cpp Normal file (+260)

@@ -0,0 +1,260 @@
/*!
* \file
* \brief Main application file for PDS HW3 (CUDA)
*
* \author
* Christos Choutouridis AEM:8997
* <cchoutou@ece.auth.gr>
*/
#include <exception>
#include <iostream>
#include <algorithm>
#include <random>
#include <cuda_runtime.h>
#include "utils.hpp"
#include "config.h"
#include "bitonicsort.hpp"
// Global session data
Data_t Data;
config_t config;
Log logger;
cudaDeviceProp device;
// Mersenne seeded from hw if possible. range: [type_min, type_max]
std::random_device rd;
std::mt19937 gen(rd());
//! Performance timers for each one of the "costly" functions
Timing Timer_total, Timer_memory, Timer_sorting;
//! Init timing objects for extra rounds
void measurements_init() {
if (config.perf > 1) {
Timer_total.init(config.perf);
Timer_memory.init(config.perf);
Timer_sorting.init(config.perf);
}
}
//! Iterate to the next round of measurements for all measurement objects
void measurements_next() {
if (config.perf > 1) {
Timer_total.next();
Timer_memory.next();
Timer_sorting.next();
}
}
/*!
* A small command line argument parser
* \return The status of the operation
*/
bool get_options(int argc, char* argv[]){
bool status = true;
// iterate over the passed arguments
for (int i=1 ; i<argc ; ++i) {
std::string arg(argv[i]); // get current argument
if (arg == "-q" || arg == "--array-size") {
if (i+1 < argc) {
config.arraySize = (ArraySize_t)1 << atoi(argv[++i]);
}
else {
status = false;
}
}
else if (arg == "-b" || arg == "--block-size") {
if (i+1 < argc) {
config.blockSize = atoi(argv[++i]);
}
else {
status = false;
}
}
else if (arg == "--validation") {
config.validation = true;
}
else if (arg == "--perf") {
if (i+1 < argc) {
config.perf = atoi(argv[++i]);
}
else {
status = false;
}
}
else if (arg == "-v" || arg == "--verbose") {
config.verbose = true;
}
else if (arg == "--version") {
std::cout << STR(TARGET) << " - A GPU accelerated bitonic sort utility (V" << STR(CODE_VERSION)<< ") \n";
std::cout << "version: " << version << "\n\n";
exit(0);
}
else if (arg == "-h" || arg == "--help") {
std::cout << STR(TARGET) << " - A GPU accelerated bitonic sort utility (V" << STR(CODE_VERSION)<< ") \n\n";
std::cout << " " << STR(TARGET) << " -q <N> -b <N> [--validation] [--perf <N>] [-v]\n";
std::cout << " " << STR(TARGET) << " -h\n";
std::cout << " " << STR(TARGET) << " --version\n";
std::cout << '\n';
std::cout << "Options:\n\n";
std::cout << " -q | --array-size <N>\n";
std::cout << " Selects the array size according to size = 2^N\n";
std::cout << " [Size must be larger than 2 * blockSize]\n";
std::cout << " [Default is 2^16]\n\n";
std::cout << " -b | --block-size <N>\n";
std::cout << " Selects the number of CUDA threads per block\n";
std::cout << " [Size has to be multiple of device's warp size (usually 32)\n";
std::cout << " [Default is the maximum device supported number. For ex: (GTX 1650) block-size=1024]\n\n";
std::cout << " --validation\n";
std::cout << " Request a full validation at the end\n\n";
std::cout << " --perf <N> \n";
std::cout << " Enable performance timing measurements and prints, and repeat\n";
std::cout << " the sorting <N> times.\n\n";
std::cout << " -v | --verbose\n";
std::cout << " Request a more verbose output to stdout.\n\n";
std::cout << " -h | --help\n";
std::cout << " Prints this and exit.\n\n";
std::cout << " --version\n";
std::cout << " Prints version and exit.\n\n";
std::cout << "Examples:\n\n";
std::cout << " " << STR(TARGET) << " -q 24\n";
std::cout << " Runs bitonic sort on an 2^24 points array, using GPU acceleration\n\n";
std::cout << " " << STR(TARGET) << " --validation --perf 5 -b 512 -q 26\n";
std::cout << " Runs bitonic sort on an 2^26 points array 5 times, using GPU acceleration with\n";
std::cout << " 512 threads per block, performs a validation check at the end and prints the time\n";
std::cout << " of the median.\n\n";
exit(0);
}
else { // parse error
std::cout << "Invocation error. Try -h for details.\n";
status = false;
}
}
// Check configuration requirements
if (config.blockSize % device.warpSize)
throw std::runtime_error("[Config] - Number of threads per block is not an exact multiple of warp size\n");
if (config.arraySize < 2*config.blockSize)
throw std::runtime_error("[Config] - Unsupported array size (smaller than "
+ std::to_string(SizeToThreadsRatio*config.blockSize) + ")\n");
if (device.totalGlobalMem < config.arraySize * sizeof(Value_t))
throw std::runtime_error("[CUDA] - Unsupported array size: "
+ std::to_string(config.arraySize * sizeof(Value_t))
+ " (larger than GPU's: " + std::to_string(device.totalGlobalMem) + ")\n");
return status;
}
/*!
* A simple validator for the entire distributed process
*
* @tparam DataT A buffer type with random access iterator.
*
* @param data [DataT] The data
* @return [bool] True if sorted in ascending order
*/
template<typename DataT>
bool validator(DataT& data) {
return std::is_sorted(data.begin(), data.end());
}
/*!
* Initializes the environment; must be called once at startup
*
* @param argc [int*] POINTER to main's argc argument
* @param argv [char***] POINTER to main's argv argument
*/
void init(int* argc, char*** argv) {
// Get device configuration
if (cudaGetDeviceProperties(&device, 0) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not read GPU");
config.blockSize = static_cast<size_t>(device.maxThreadsPerBlock);
// try to read command line
if (!get_options(*argc, *argv))
exit(1);
// Prepare vector and timing data
Data.resize(config.arraySize);
measurements_init();
}
#if !defined TESTING
/*!
* @return Returns 0, but note that we may also throw or call exit(0) / exit(1)
*/
int main(int argc, char* argv[]) try {
// Init everything
init(&argc, &argv);
logger << "Array size: " << config.arraySize << " (Q=" << static_cast<size_t>(log2(config.arraySize))<< ")" << logger.endl;
logger << "Repeated sorts: " << config.perf << logger.endl;
logger << "GPU: " << device.name << logger.endl;
logger << "Block size: " << config.blockSize << logger.endl;
for (size_t it = 0 ; it < config.perf ; ++it) {
// Initialize local data
logger << "Initialize array ... ";
std::uniform_int_distribution<Value_t > dis(
std::numeric_limits<Value_t>::min(),
std::numeric_limits<Value_t>::max()
);
std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); });
logger << " Done." << logger.endl;
// Run distributed sort
logger << "Start sorting ... ";
Timer_total.start();
bitonicSort(Data);
Timer_total.stop();
measurements_next();
logger << " Done." << logger.endl;
}
// Print-outs and validation
if (config.perf > 1) {
Timing::print_duration(Timer_total.median(), "Total ");
Timing::print_duration(Timer_memory.median(), "Mem-xch ");
Timing::print_duration(Timer_sorting.median(),"Sorting ");
}
if (config.validation) {
// If requested, we have the chance to fail!
std::cout << "[Validation] Results validation ...";
bool val = validator(Data);
std::cout << ((val) ? "\x1B[32m [PASSED] \x1B[0m\n" : " \x1B[31m [FAILED] \x1B[0m\n");
}
return 0;
}
catch (std::exception& e) {
// We may pollute the user's screen here. Comment out the `cerr << ...` line if you don't like it.
std::cerr << "Error: " << e.what() << '\n';
exit(1);
}
#else
#include <gtest/gtest.h>
#include <exception>
/*!
* The testing version of our program
*/
GTEST_API_ int main(int argc, char **argv) try {
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
catch (std::exception& e) {
std::cout << "Exception: " << e.what() << '\n';
}
#endif

homework_3/src/utils.hpp Normal file (+157)

@@ -0,0 +1,157 @@
/**
* \file
* \brief Utilities header
*
* \author
* Christos Choutouridis AEM:8997
* <cchoutou@ece.auth.gr>
*/
#ifndef UTILS_HPP_
#define UTILS_HPP_
#include <vector>
#include <iostream>
#include <chrono>
#include <unistd.h>
#include <algorithm>
#include "config.h"
/*!
* Stringify preprocessor util
*/
#define STR(s) STR_(s)
#define STR_(s) #s
/*!
* A Logger for entire program.
*/
struct Log {
struct Endl {} endl; //!< a tag object used to request a new line.
//! We provide logging via << operator
template<typename T>
Log &operator<<(T &&t) {
if (config.verbose) {
if (line_) {
std::cout << "[Log]: " << t;
line_ = false;
} else
std::cout << t;
}
return *this;
}
// overload for special end line handling
Log &operator<<(Endl e) {
(void) e;
if (config.verbose) {
std::cout << '\n';
line_ = true;
}
return *this;
}
private:
bool line_{true};
};
extern Log logger;
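/*
 * Example usage (illustrative): output appears only when config.verbose is set.
 *   logger << "Block size: " << config.blockSize << logger.endl;
 */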
/*!
* A small timing utility based on chrono that supports timing rounds
* and returning the median of them. Time can accumulate to the measurement
* for each round.
*/
struct Timing {
using Tpoint = std::chrono::steady_clock::time_point;
using Tduration = std::chrono::microseconds;
using microseconds = std::chrono::microseconds;
using milliseconds = std::chrono::milliseconds;
using seconds = std::chrono::seconds;
//! Setup measurement rounds
void init(size_t rounds) {
duration_.resize(rounds);
for (auto& d : duration_)
d = Tduration::zero();
}
//! tool to mark the starting point
Tpoint start() noexcept { return mark_ = std::chrono::steady_clock::now(); }
//! tool to mark the ending point
Tpoint stop() noexcept {
Tpoint now = std::chrono::steady_clock::now();
duration_[current_] += dt(now, mark_);
return now;
}
//! Switch timing slot
void next() noexcept {
++current_;
current_ %= duration_.size();
}
Tduration& median() noexcept {
std::sort(duration_.begin(), duration_.end());
return duration_[duration_.size()/2];
}
//! A duration calculation utility
static Tduration dt(Tpoint t2, Tpoint t1) noexcept {
return std::chrono::duration_cast<Tduration>(t2 - t1);
}
//! Tool to print the time interval
static void print_duration(const Tduration& duration, const char *what) noexcept {
if (std::chrono::duration_cast<microseconds>(duration).count() < 10000)
std::cout << "[Timing] " << what << ": "
<< std::to_string(std::chrono::duration_cast<microseconds>(duration).count()) << " [usec]\n";
else if (std::chrono::duration_cast<milliseconds>(duration).count() < 10000)
std::cout << "[Timing] " << what << ": "
<< std::to_string(std::chrono::duration_cast<milliseconds>(duration).count()) << " [msec]\n";
else {
char stime[26]; // fit ulong
auto sec = std::chrono::duration_cast<seconds>(duration).count();
auto msec = (std::chrono::duration_cast<milliseconds>(duration).count() % 1000) / 10; // keep 2 digits
std::sprintf(stime, "%ld.%02ld", sec, msec);
std::cout << "[Timing] " << what << ": " << stime << " [sec]\n";
}
}
private:
size_t current_{0};
Tpoint mark_{};
std::vector<Tduration> duration_{1};
};
/*!
* A "high level function"-like utility macro to forward a function call
* and accumulate the execution time to the corresponding timing object.
*
* @param Tim The Timing object [Needs to have methods start() and stop()]
* @param Func The function name
* @param ... The arguments to pass to function (the preprocessor way)
*/
#define timeCall(Tim, Func, ...) \
Tim.start(); \
Func(__VA_ARGS__); \
Tim.stop();
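// Example usage (illustrative): accumulate a sort's runtime into Timer_sorting:
//   timeCall(Timer_sorting, bitonicSort, Data);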
/*!
* A utility to check if a number is power of two
*
* @tparam Integral The integral type of the number to check
* @param x The number to check
* @return True if it is power of 2, false otherwise
*/
template <typename Integral>
constexpr inline bool isPowerOfTwo(Integral x) noexcept {
return (!(x & (x - 1)) && x);
}
#endif /* UTILS_HPP_ */

File diff suppressed because it is too large

File diff suppressed because it is too large

homework_3/test/tests.cpp Normal file (+33)

@@ -0,0 +1,33 @@
/**
* \file
* \brief PDS HW3 tests
*
* To run these tests execute:
* ...
*
* \author
* Christos Choutouridis AEM:8997
* <cchoutou@ece.auth.gr>
*/
#include <gtest/gtest.h>
/*
* Global fixtures
*/
class TCUDAbitonic : public ::testing::Test {
protected:
static void SetUpTestSuite() { }
static void TearDownTestSuite() { }
};
/*
*
*/
TEST_F(TCUDAbitonic, test1) {
EXPECT_EQ(true, true);
}