Compare commits
8 commits: e7be3a10dd ... 7a6f7f53b5

Author | SHA1 | Date
---|---|---
 | 7a6f7f53b5 |
 | e165b75f92 |
 | 6db2a814d2 |
 | b31ca23757 |
 | 2a2c7fec38 |
 | 1fe5ab4da7 |
 | 146e975ac1 |
 | 2ff6ae171a |
23 homework_3/.gitignore vendored Normal file
@@ -0,0 +1,23 @@
# project
bin/
out/
mat/
mtx/
.unused/
various/

# hpc

# IDEs
.idea/
.clangd

# eclipse
.project
.cproject
.settings/

.vs/
.vscode/
236 homework_3/Makefile Normal file
@@ -0,0 +1,236 @@
#
# PDS HW3 Makefile
#
# Copyright (C) 2025 Christos Choutouridis <christos@choutouridis.net>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

# ============== Project settings ==============
# Project's name
PROJECT := PDS_homework_3

# Executable's name
TARGET := bitonicCUDA

# Source directories list (space separated). Makefile-relative paths, UNDER the current directory.
SRC_DIR_LIST := src #test test/gtest

# Include directories list (space separated). Makefile-relative paths.
INC_DIR_LIST := src
#	test \
#	test/gtest/ \

# Exclude files list (space separated). Filenames only.
# EXC_FILE_LIST := bad.cpp old.cpp

# Build directories
BUILD_DIR := bin
OBJ_DIR   := $(BUILD_DIR)/obj
DEP_DIR   := $(BUILD_DIR)/.dep

OUTPUT_DIR := out

# ========== Compiler settings ==========
# Compiler flags for debug and release
DEB_CFLAGS   := -DDEBUG -std=c11 -Xcompiler "-Wall -Wextra -g -DDEBUG"
REL_CFLAGS   := -O3 -std=c11 -Xcompiler "-Wall -Wextra"
DEB_CXXFLAGS := -DDEBUG -std=c++17 -Xcompiler "-Wall -Wextra -g -DDEBUG"
REL_CXXFLAGS := -O3 -std=c++17 -Xcompiler "-Wall -Wextra"

# Pre-defines
# PRE_DEFS := MYCAB=1729 SUPER_MODE
PRE_DEFS := TARGET=$(TARGET)

# ============== Linker settings ==============
# Linker flags (example: -pthread -lm)
LDFLAGS :=

# Map output file
MAP_FILE := # output.map
MAP_FLAG := # -Xlinker -Map=$(BUILD_DIR)/$(MAP_FILE)

# ============== Docker settings ==============
# We need to:
# - Bind the entire project directory (the dir that includes all the code) as a volume.
# - In the docker instance, change to the working directory (where the makefile is).
DOCKER_VOL_DIR := $(shell pwd)
DOCKER_WRK_DIR :=
DOCKER_RUN     := docker run --rm
DOCKER_FLAGS   := -v $(DOCKER_VOL_DIR):/usr/src/$(PROJECT) -w /usr/src/$(PROJECT)/$(DOCKER_WRK_DIR)

# docker invoke mechanism (edit with care)
# note:
#   Here, the `DOCKER` variable is empty. Rules can assign `DOCKER := $(DOCKER_CMD)` when docker
#   functionality is needed.
DOCKER_CMD = $(DOCKER_RUN) $(DOCKER_FLAGS) $(IMAGE)
DOCKER :=

# ============== Tool selection ==============
# compiler and compiler flags.
CSIZE    := size
CFLAGS   := $(DEB_CFLAGS)
CXXFLAGS := $(DEB_CXXFLAGS)
CXX      := g++
CC       := gcc
LINKER   := g++

#
# =========== Main body and Patterns ===========
#

INC := $(foreach dir,$(INC_DIR_LIST),-I$(dir))
DEF := $(foreach def,$(PRE_DEFS),-D$(def))
EXC := $(foreach fil,$(EXC_FILE_LIST), \
         $(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/$(fil))) \
       )
# source files, object and dependency lists
# recursive search into current and source directories
SRC := $(wildcard *.cpp)
SRC += $(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/*.cpp))
SRC += $(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/**/*.cpp))
SRC := $(filter-out $(EXC),$(SRC))
#SRC := $(abspath $(SRC))

OBJ := $(foreach file,$(SRC:%.cpp=%.o),$(OBJ_DIR)/$(file))
DEP := $(foreach file,$(SRC:%.cpp=%.d),$(DEP_DIR)/$(file))

# c file objects depend on .c AND dependency files, which have an empty recipe
$(OBJ_DIR)/%.o: %.c
	@mkdir -p $(@D)
	$(DOCKER) $(CC) -c $(CFLAGS) $(INC) $(DEF) -o $@ $<

# cpp file objects depend on .cpp AND dependency files, which have an empty recipe
$(OBJ_DIR)/%.o: %.cpp
	@mkdir -p $(@D)
	$(DOCKER) $(CXX) -c $(CXXFLAGS) $(INC) $(DEF) -o $@ $<

# main target rule
$(BUILD_DIR)/$(TARGET): $(OBJ)
	@mkdir -p $(@D)
	@echo Linking to target: $(TARGET)
	@echo $(DOCKER) $(LINKER) '$$(OBJ)' $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET)
	@$(DOCKER) $(LINKER) $(OBJ) $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET)
	@echo
	@echo Print size information
	@$(CSIZE) $(@D)/$(TARGET)
	@echo Done

#
# ================ Default local build rules =================
# example:
#   make debug

.DEFAULT_GOAL := all

.PHONY: clean
clean:
	@echo Cleaning build directories
	@rm -rf $(OBJ_DIR)
	@rm -rf $(DEP_DIR)
	@rm -rf $(BUILD_DIR)

debug: CFLAGS := $(DEB_CFLAGS)
debug: $(BUILD_DIR)/$(TARGET)

release: CFLAGS := $(REL_CFLAGS)
release: $(BUILD_DIR)/$(TARGET)

#
# ================ Build rules =================
#

bitonic_v0deb: CC       := nvcc -G -g -x cu
bitonic_v0deb: CXX      := nvcc -G -g -x cu
bitonic_v0deb: LINKER   := nvcc
bitonic_v0deb: CFLAGS   := $(DEB_CFLAGS) -DCODE_VERSION=V0
bitonic_v0deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V0
bitonic_v0deb: OUTPUT_DIR := $(OUTPUT_DIR)/v0
bitonic_v0deb: $(BUILD_DIR)/$(TARGET)
	@mkdir -p $(OUTPUT_DIR)
	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)

bitonic_v1deb: CC       := nvcc -G -g -x cu
bitonic_v1deb: CXX      := nvcc -G -g -x cu
bitonic_v1deb: LINKER   := nvcc
bitonic_v1deb: CFLAGS   := $(DEB_CFLAGS) -DCODE_VERSION=V1
bitonic_v1deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V1
bitonic_v1deb: OUTPUT_DIR := $(OUTPUT_DIR)/v1
bitonic_v1deb: $(BUILD_DIR)/$(TARGET)
	@mkdir -p $(OUTPUT_DIR)
	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)

bitonic_v2deb: CC       := nvcc -G -g -x cu
bitonic_v2deb: CXX      := nvcc -G -g -x cu
bitonic_v2deb: LINKER   := nvcc
bitonic_v2deb: CFLAGS   := $(DEB_CFLAGS) -DCODE_VERSION=V2
bitonic_v2deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V2
bitonic_v2deb: OUTPUT_DIR := $(OUTPUT_DIR)/v2
bitonic_v2deb: $(BUILD_DIR)/$(TARGET)
	@mkdir -p $(OUTPUT_DIR)
	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)

bitonic_v0: CC       := nvcc -x cu
bitonic_v0: CXX      := nvcc -x cu
bitonic_v0: LINKER   := nvcc
bitonic_v0: CFLAGS   := $(REL_CFLAGS) -DCODE_VERSION=V0
bitonic_v0: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V0
bitonic_v0: OUTPUT_DIR := $(OUTPUT_DIR)/v0
bitonic_v0: $(BUILD_DIR)/$(TARGET)
	@mkdir -p $(OUTPUT_DIR)
	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)

bitonic_v1: CC       := nvcc -x cu
bitonic_v1: CXX      := nvcc -x cu
bitonic_v1: LINKER   := nvcc
bitonic_v1: CFLAGS   := $(REL_CFLAGS) -DCODE_VERSION=V1
bitonic_v1: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V1
bitonic_v1: OUTPUT_DIR := $(OUTPUT_DIR)/v1
bitonic_v1: $(BUILD_DIR)/$(TARGET)
	@mkdir -p $(OUTPUT_DIR)
	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)

bitonic_v2: CC       := nvcc -x cu
bitonic_v2: CXX      := nvcc -x cu
bitonic_v2: LINKER   := nvcc
bitonic_v2: CFLAGS   := $(REL_CFLAGS) -DCODE_VERSION=V2
bitonic_v2: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V2
bitonic_v2: OUTPUT_DIR := $(OUTPUT_DIR)/v2
bitonic_v2: $(BUILD_DIR)/$(TARGET)
	@mkdir -p $(OUTPUT_DIR)
	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)

hpc-build:
	make clean
	make bitonic_v0
	make clean
	make bitonic_v1
	make clean
	make bitonic_v2

all: debug bitonic_v0
# Note:
#   Add a gcc based make rule here in order for clangd to successfully scan the project files.
#   Otherwise we do not need the gcc build.
33 homework_3/exersize.md Normal file
@@ -0,0 +1,33 @@
Parallel & Distributed Computer Systems HW3

January, 2025

Write a program that sorts $N$ integers in ascending order, using CUDA.

The program must perform the following tasks:

- The user specifies a positive integer $q$.

- Start a process with an array of $N = 2^q$ random integers.

- Sort all $N$ elements in ascending order.

- Check the correctness of the final result.

Your implementation should be based on the following steps:

V0. A kernel where each thread only compares and exchanges. This "eliminates" the 1:n innermost loop. Easy to write, but too many function calls and global synchronizations.

V1. Include the k inner loop in the kernel function. How do we handle the synchronization? Fewer calls, fewer global synchronizations. Faster than V0!

V2. Modify the kernel of V1 to work with local memory instead of global.

You must deliver:

- A report (about $3-4$ pages) that describes your parallel algorithm and implementation.

- Your comments on the speed of your parallel program compared to the serial sort, after trying your program on aristotelis for $q = [20:27]$.

- The source code of your program uploaded online.

Ethics: If you use code found on the web or by an LLM, you should mention your source and the changes you made. You may work in pairs; both partners must submit a single report with both names.

Deadline: 2 February, 2025.
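For reference, all three versions implement the same sorting network; a minimal serial sketch of it (illustration only, assuming n is a power of two; this is not part of the submission) is:

#include <cmath>
#include <cstddef>
#include <utility>

// Classic serial form of the bitonic network: for each stage, sweep the
// steps top-down and compare-exchange every pair (i, i ^ 2^step).
template <typename T>
void bitonicSortSerial(T* data, std::size_t n) {
    const auto stages = static_cast<std::size_t>(std::log2(n));
    for (std::size_t stage = 1; stage <= stages; ++stage) {
        for (std::size_t step = stage; step-- > 0; ) {
            for (std::size_t i = 0; i < n; ++i) {
                const std::size_t j = i ^ (std::size_t{1} << step);  // partner index
                const bool asc = !(i & (std::size_t{1} << stage));   // direction of this pair
                if (i < j && ((asc && data[i] > data[j]) || (!asc && data[i] < data[j])))
                    std::swap(data[i], data[j]);
            }
        }
    }
}

V0 maps the innermost i-loop to one thread per pair; V1 and V2 additionally pull the low steps of each stage inside a single kernel launch.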
26 homework_3/hpc/makeSlurmScripts.sh Executable file
@@ -0,0 +1,26 @@
#!/usr/bin/env bash

# Parameters
versions=("v0" "v1" "v2")
q_values=(20 21 22 23 24 25 26 27 28 29 30)

# Make scripts
for version in "${versions[@]}"; do
    for q in "${q_values[@]}"; do
        filename="Bitnc${version^^}Q${q}.sh"   # Convert v0 -> V0 etc...
        cat > "$filename" <<EOL
#! /usr/bin/env bash

#SBATCH --job-name=Bitnc${version^^}Q${q}
#SBATCH --nodes=1
#SBATCH --gres=gpu:1
#SBATCH --time=10:00

module load gcc/9.2.0 cuda/11.1.0

./out/${version}/bitonicCUDA -v --validation --perf 7 -b 512 -q ${q}

EOL
        echo "Create: $filename"
    done
done
26 homework_3/hpc/submitJobs.sh Executable file
@@ -0,0 +1,26 @@
#!/usr/bin/env bash

# Submission parameters
QOS="small"
PARTITION="ampere"
SCRIPT_DIR="hpc"    # Directory containing the job scripts

# Range of values for the -q parameter
VERSIONS=("V0" "V1" "V2")
Q_START=20
Q_END=30

# Submitting the jobs
for version in "${VERSIONS[@]}"; do
    for ((q = Q_START; q <= Q_END; q++)); do
        script_name="Bitnc${version}Q${q}.sh"
        script_path="${SCRIPT_DIR}/${script_name}"

        if [[ -f "$script_path" ]]; then
            sbatch --qos="$QOS" -p "$PARTITION" "$script_path"
            echo "Submitted: $script_path"
        else
            echo "Warning: File not found - $script_path"
        fi
    done
done
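Taken together, a plausible cluster workflow for these two scripts (a sketch; it assumes the project root is the working directory, since the generated job scripts invoke ./out/<version>/bitonicCUDA relative to where sbatch runs):

# Build all three release binaries into out/v0, out/v1, out/v2
make hpc-build
# Generate the Bitnc*.sh job scripts inside hpc/, where submitJobs.sh expects them
( cd hpc && ./makeSlurmScripts.sh )
# Submit every generated script with the QOS/partition set in submitJobs.sh
./hpc/submitJobs.sh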
456 homework_3/src/bitonicsort.hpp Normal file
@@ -0,0 +1,456 @@
/*!
 * \file
 * \brief Bitonic sort CUDA implementation header
 *
 * \author
 *    Christos Choutouridis AEM:8997
 *    <cchoutou@ece.auth.gr>
 */

#ifndef BITONICSORTCUDA_H_
#define BITONICSORTCUDA_H_

#include <cuda_runtime.h>
#include <vector>
#include <cmath>
#include <cstdint>
#include <utility>

#include "utils.hpp"

/*
 * Exported timers
 */
extern Timing Timer_total, Timer_memory, Timer_sorting;

using threadId_t = size_t;


/*
 * ============================== Sort utilities ==============================
 */

/*!
 * Returns the ascending or descending configuration (up/down phase) of the thread id,
 * depending on the current stage.
 *
 * @param tid   [threadId_t] The current thread
 * @param stage [size_t] The current stage of the sorting network (same for each step)
 * @return      [bool] True if we need the ascending configuration, false otherwise
 */
__device__ inline bool ascending(threadId_t tid, size_t stage) noexcept {
    return !(tid & (1 << stage));
}

/*!
 * Returns the thread's partner for data exchange during the sorting network iterations
 * of Bitonic.
 *
 * @param tid  [threadId_t] The current node
 * @param step [size_t] The step of the sorting network
 * @return     [threadId_t] The node id of the partner for data exchange
 */
__device__ inline threadId_t partner(threadId_t tid, size_t step) noexcept {
    return (tid ^ (1 << step));
}

/*!
 * Predicate to check if a node keeps the small numbers during the bitonic sort network exchange.
 *
 * @param tid     [threadId_t] The node for which we check
 * @param partner [threadId_t] The partner of the data exchange
 * @param stage   [size_t] The current stage of the sorting network (same for each step)
 * @return        [bool] True if the node should keep the small values, false otherwise
 */
__device__ inline bool keepSmall(threadId_t tid, threadId_t partner, size_t stage) {
    return ascending(tid, stage) == (tid < partner);
}
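// Worked example (illustration): for stage = 1, step = 0:
//   tid = 0: partner(0, 0) = 1, ascending(0, 1) = true,  keepSmall(0, 1, 1) = true
//            -> thread 0 keeps the smaller of data[0], data[1] (ascending pair)
//   tid = 2: partner(2, 0) = 3, ascending(2, 1) = false, keepSmall(2, 3, 1) = false
//            -> thread 2 keeps the larger of data[2], data[3] (descending pair)
// This alternation of pair directions is what builds the bitonic sequences.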

/*
 * ============================== Sort algorithms ==============================
 */

/*!
 * Each thread can handle 2 points in the array. For each of these 2 points it may:
 *  - compare and exchange if needed
 *  - copy data to local memory and back if needed
 */
static constexpr size_t SizeToThreadsRatio = 2;

/*!
 * Calculates the blocks needed for the entire sorting process.
 *
 * @note
 * This "redundant" little trick makes sure blocks are allocated for array sizes that are not
 * exact multiples of config.blockSize.
 * Even if we don't need it, we keep it in case we experiment with weird sizes in the future!
 *
 * @param arraySize [ArraySize_t] The size of the entire array (in points)
 * @return          [size_t] The number of blocks
 */
inline size_t NBlocks(ArraySize_t arraySize) {
    return (((arraySize + config.blockSize - 1) / config.blockSize) / SizeToThreadsRatio);
}
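// Quick sanity check (illustration): arraySize = 2^20, config.blockSize = 512:
//   (2^20 + 511) / 512 = 2048, divided by SizeToThreadsRatio = 2 -> 1024 blocks.
//   1024 blocks x 512 threads = 2^19 threads, i.e. one thread per pair of points.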

/*!
 * Exchange utility.
 *
 * @tparam ValueT The underlying data type of the array items
 *
 * @param data      [ValueT*] Pointer to the data array
 * @param tid       [threadId_t] Current thread's index into data
 * @param partner   [threadId_t] Partner's index into data
 * @param keepSmall [bool] Flag to indicate if the current thread keeps the small value
 */
template <typename ValueT>
__device__ void exchange(ValueT* data, threadId_t tid, threadId_t partner, bool keepSmall) {
    if (( keepSmall && (data[tid] > data[partner])) ||
        (!keepSmall && (data[tid] < data[partner])) ) {
        ValueT temp   = data[tid];
        data[tid]     = data[partner];
        data[partner] = temp;
    }
}

#if CODE_VERSION == V0

/*!
 * This is the body of each thread. This function compares and exchanges data.
 *
 * @tparam ValueT The underlying data type of the array items
 * @param data  [ValueT*] Pointer to the data array
 * @param n     [size_t] The total size of the array
 * @param step  [size_t] The current step of the current stage of bitonic sort
 * @param stage [size_t] The current stage of bitonic sort
 */
template <typename ValueT>
__global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) {
    threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;  // Keep contiguous addressing to the first half of the array
    threadId_t pid = partner(tid, step);
    if (tid > pid) {
        // Shift to the other half of the array for global data
        tid += n / SizeToThreadsRatio;
        pid += n / SizeToThreadsRatio;
    }
    if ((tid < n) && (pid < n)) {  // Boundary check
        bool keep = keepSmall(tid, pid, stage);
        exchange(data, tid, pid, keep);
    }
}

/*!
 * A CUDA version of the Bitonic sort algorithm.
 *
 * @tparam DataT A container type to hold the data array. Should have .data() and .size() methods
 * @param data [DataT&] Reference to the container to sort
 */
template <typename DataT>
void bitonicSort(DataT& data) {
    using value_t = typename DataT::value_type;

    value_t* dev_data;
    auto size = data.size();

    Timer_memory.start();
    if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
        throw std::runtime_error("[CUDA] - Can not allocate memory\n");
    if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
        throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
    Timer_memory.stop();

    size_t Nth = config.blockSize;
    size_t Nbl = NBlocks(size);

    size_t Stages = static_cast<size_t>(log2(size));
    Timer_sorting.start();
    for (size_t stage = 1; stage <= Stages; ++stage) {
        for (size_t step = stage; step > 0; ) {
            --step;
            bitonicStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
            cudaDeviceSynchronize();
        }
    }
    Timer_sorting.stop();

    Timer_memory.start();
    if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
        throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
    cudaFree(dev_data);
    Timer_memory.stop();
}
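// Launch-count note (illustration of the V0 cost the exercise mentions): every
// (stage, step) pair is a separate kernel launch plus a device-wide sync, so a
// full sort needs Stages*(Stages+1)/2 launches. For q = 27 (Stages = 27) that is
// 27*28/2 = 378 launches; V1/V2 below fold the low steps of each stage into one kernel.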

#elif CODE_VERSION == V1

/*!
 * This is the body of each thread. This function compares and exchanges data.
 *
 * @tparam ValueT The underlying data type of the array items
 * @param data  [ValueT*] Pointer to the data array
 * @param n     [size_t] The total size of the array
 * @param step  [size_t] The current step of the current stage of bitonic sort
 * @param stage [size_t] The current stage of bitonic sort
 */
template <typename ValueT>
__device__ void interBlockStep_(ValueT* data, size_t n, size_t step, size_t stage) {
    /*
     * Here we skip blocks every time (one for SizeToThreadsRatio = 2)
     * and we use the neighbor block address indices for the other half of the threads.
     */
    threadId_t tid = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
    threadId_t pid = partner(tid, step);
    if (tid > pid) {
        // Shift to the other half of the array for global data
        tid += blockDim.x;
        pid += blockDim.x;
    }
    if ((tid < n) && (pid < n)) {  // Boundary check
        bool keep = keepSmall(tid, pid, stage);
        exchange(data, tid, pid, keep);
    }
}

/*!
 * This is the version of the body that is called outside of the loop unrolling.
 *
 * @tparam ValueT The underlying data type of the array items
 * @param data  [ValueT*] Pointer to the data array
 * @param n     [size_t] The total size of the array
 * @param step  [size_t] The current step of the current stage of bitonic sort
 * @param stage [size_t] The current stage of bitonic sort
 */
template <typename ValueT>
__global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
    interBlockStep_(data, n, step, stage);
}

/*!
 * This is the unrolled part of the bitonic double loop.
 *
 * @tparam ValueT The underlying data type of the array items
 * @param data       [ValueT*] Pointer to the data array
 * @param n          [size_t] The total size of the array
 * @param innerSteps [size_t] The lowest steps of the current stage to run inside this kernel
 * @param stage      [size_t] The current stage of bitonic sort
 */
template <typename ValueT>
__global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
    for (size_t step = innerSteps + 1; step > 0; ) {
        --step;
        interBlockStep_(data, n, step, stage);
        __syncthreads();
    }
}
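// Why a block-local __syncthreads() is enough here (illustration): for
// step <= log2(blockDim.x) the partner index differs from tid only in bits
// 0..log2(blockDim.x), so both sides of every exchange fall inside the same
// SizeToThreadsRatio * blockDim.x "effective block" that this thread block owns;
// no other block touches these elements during the unrolled steps.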

/*!
 * A CUDA version of the Bitonic sort algorithm.
 *
 * @tparam DataT A container type to hold the data array. Should have .data() and .size() methods
 * @param data [DataT&] Reference to the container to sort
 */
template <typename DataT>
void bitonicSort(DataT& data) {
    using value_t = typename DataT::value_type;

    value_t* dev_data;
    auto size = data.size();

    Timer_memory.start();
    if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
        throw std::runtime_error("[CUDA] - Can not allocate memory\n");
    if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
        throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
    Timer_memory.stop();

    size_t Nth = config.blockSize;
    size_t Nbl = NBlocks(size);

    auto Stages          = static_cast<size_t>(log2(size));
    auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
    Timer_sorting.start();
    for (size_t stage = 1; stage <= Stages; ++stage) {
        size_t step = stage - 1;
        for ( ; step > InnerBlockSteps; --step) {
            interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
            cudaDeviceSynchronize();
        }
        inBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
        cudaDeviceSynchronize();
    }
    Timer_sorting.stop();

    Timer_memory.start();
    if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
        throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
    cudaFree(dev_data);
    Timer_memory.stop();
}
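// Launch-count note (illustration): with blockSize = 512 (InnerBlockSteps = 9),
// stage 20 needs 10 interBlockStep launches (steps 19..10) plus a single
// inBlockStep launch covering steps 9..0 -- 11 launches instead of V0's 20.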

#elif CODE_VERSION == V2

/*!
 * @return The amount of data (in points) that one block's threads can affect.
 *
 * @note
 * Each block's thread collection can exchange twice as many data points as it has threads.
 */
inline size_t effectiveBlockSize() { return SizeToThreadsRatio * config.blockSize; }


/*!
 * Converts the global address of the data to the local shared memory array, which is used
 * as a cache for the unrolled part of the bitonic sort loop.
 *
 * @note
 * Each block's thread collection can exchange twice the size of data points.
 * These points get copied (cached) into the shared memory location. We use contiguous blocks
 * both in global data memory and in the shared memory buffer.
 *
 * @param gIndex   The global array index
 * @param blockDim The block size (threads per block)
 * @return         The equivalent local address in shared memory
 */
__device__ inline size_t toLocal(size_t gIndex, size_t blockDim) {
    return gIndex % (SizeToThreadsRatio * blockDim);
}
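// Worked example (illustration): blockDim = 512 -> an effective block of 1024 points.
// Thread block 1 caches global indices 1024..2047; toLocal(1536, 512) = 1536 % 1024 = 512,
// so global element 1536 lands at shared_data[512].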

/*!
 * This is the version of the body that is called outside of the loop unrolling.
 *
 * @tparam ValueT The underlying data type of the array items
 * @param data  [ValueT*] Pointer to the data array
 * @param n     [size_t] The total size of the array
 * @param step  [size_t] The current step of the current stage of bitonic sort
 * @param stage [size_t] The current stage of bitonic sort
 */
template <typename ValueT>
__global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
    threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;  // Keep contiguous addressing to the first half of the array
    threadId_t pid = partner(tid, step);
    if (tid > pid) {
        // Shift to the other half of the array for global data
        tid += n / SizeToThreadsRatio;
        pid += n / SizeToThreadsRatio;
    }
    if ((tid < n) && (pid < n)) {  // Boundary check
        bool keep = keepSmall(tid, pid, stage);
        exchange(data, tid, pid, keep);
    }
}

/*!
 * This is the unrolled part of the bitonic double loop.
 *
 * First, each thread caches its corresponding data points from the current and the following data block.
 * After that we execute the loop unrolling on the local data and then write back to global memory.
 *
 * @tparam ValueT The underlying data type of the array items
 * @param data       [ValueT*] Pointer to the data array
 * @param n          [size_t] The total size of the array
 * @param innerSteps [size_t] The lowest steps of the current stage to run inside this kernel
 * @param stage      [size_t] The current stage of bitonic sort
 */
template <typename ValueT>
__global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
    extern __shared__ ValueT shared_data[];

    /*
     * Global and local (shared) memory indices (calculated once).
     * Here we skip blocks every time (one for SizeToThreadsRatio = 2)
     * and we cache the neighbor block address indices in local (shared) memory.
     */
    threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
    threadId_t lIdx0 = toLocal(gIdx0, blockDim.x);

    if (gIdx0 + blockDim.x >= n)  // Boundary check
        return;

    // Fetch to local memory the entire effective block size (2 positions for each thread)
    shared_data[lIdx0] = data[gIdx0];
    shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x];
    __syncthreads();

    for (size_t step = innerSteps + 1; step > 0; ) {
        --step;

        // Init thread global and local indices
        threadId_t gIdx = gIdx0;
        threadId_t lIdx = lIdx0;
        // Find partner and keep-small configuration based on the global data positions
        threadId_t pIdx = partner(gIdx, step);
        if (gIdx > pIdx) {
            // Shift inside the effective block
            gIdx += blockDim.x;  // global
            pIdx += blockDim.x;
            lIdx += blockDim.x;  // local
        }
        bool keep = keepSmall(gIdx, pIdx, stage);

        // Exchange data on the local (shared) copy
        threadId_t lpIdx = toLocal(pIdx, blockDim.x);
        exchange(shared_data, lIdx, lpIdx, keep);
        __syncthreads();
    }

    // Write back to global memory
    data[gIdx0] = shared_data[lIdx0];
    data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
    __syncthreads();
}

/*!
 * A CUDA version of the Bitonic sort algorithm.
 *
 * @tparam DataT A container type to hold the data array. Should have .data() and .size() methods
 * @param data [DataT&] Reference to the container to sort
 */
template <typename DataT>
void bitonicSort(DataT& data) {
    using value_t = typename DataT::value_type;

    value_t* dev_data;
    auto size = data.size();

    Timer_memory.start();
    if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
        throw std::runtime_error("[CUDA] - Can not allocate memory\n");
    if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
        throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
    Timer_memory.stop();

    size_t Nth = config.blockSize;
    size_t Nbl = NBlocks(size);
    size_t kernelMemSize = effectiveBlockSize() * sizeof(value_t);

    auto Stages          = static_cast<size_t>(log2(size));
    auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
    Timer_sorting.start();
    for (size_t stage = 1; stage <= Stages; ++stage) {
        size_t step = stage - 1;
        for ( ; step > InnerBlockSteps; --step) {
            interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
            cudaDeviceSynchronize();
        }
        inBlockStep<<<Nbl, Nth, kernelMemSize>>>(dev_data, size, step, stage);
        cudaDeviceSynchronize();
    }
    Timer_sorting.stop();

    Timer_memory.start();
    if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
        throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
    cudaFree(dev_data);
    Timer_memory.stop();
}
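// Shared-memory footprint note (illustration): kernelMemSize = 2 * blockSize * sizeof(Value_t);
// for blockSize = 1024 and the 4-byte Value_t from config.h that is 8 KiB per block,
// comfortably inside the 48 KiB of shared memory CUDA devices offer per block by default.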

#endif

#endif //BITONICSORTCUDA_H_
82 homework_3/src/config.h Normal file
@@ -0,0 +1,82 @@
/*!
 * \file
 * \brief Build and runtime configuration file.
 *
 * \author
 *    Christos Choutouridis AEM:8997
 *    <cchoutou@ece.auth.gr>
 */

#ifndef CONFIG_H_
#define CONFIG_H_

#include <cstdint>
#include <vector>        // required: Data_t below is a std::vector
#include <cuda_runtime.h>

/*
 * Versioning:
 *  - RC1: First version to test on HPC
 */
static constexpr char version[] = "0.1";

/*
 * Defines for the different versions of the exercise
 */
#define V0 0
#define V1 1
#define V2 2

// Fail-safe version selection
#if !defined CODE_VERSION
#define CODE_VERSION V2
#endif
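// Build-time selection (as the Makefile does it, e.g. `nvcc ... -DCODE_VERSION=V1`):
// exactly one of the three implementations in bitonicsort.hpp is compiled in.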

// Default data size (in case -q <N> is not present)
static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16;

// Placeholder default (the actual default comes from the device properties read at initialization)
static constexpr size_t THREADS_PER_BLOCK = 1024;


/*!
 * Value and Buffer type selection
 *
 * We support the following compiler types, or the <cstdint> types that translate to them:
 *   char - unsigned char
 *   short - unsigned short
 *   int - unsigned int
 *   long - unsigned long
 *   long long - unsigned long long
 *   float
 *   double
 */
using Value_t = uint32_t;
using Data_t  = std::vector<Value_t>;

/*!
 * In theory we can support large arrays ;)
 */
using ArraySize_t = uint64_t;

/*!
 * Session options for each invocation of the executable.
 *
 * @note
 * The values of the members are set from the command line.
 */
struct config_t {
    ArraySize_t arraySize{DEFAULT_DATA_SIZE};  //!< The array size of the local data to sort.
    size_t blockSize{THREADS_PER_BLOCK};       //!< The block size (threads per block) for the session.
    bool validation{false};                    //!< Request a full validation at the end.
    size_t perf{1};                            //!< Enable performance timing measurements and prints.
                                               //!< Repeat the sorting <perf> times to do so.
    bool verbose{false};                       //!< Flag to enable verbose output to stdout.
};

/*
 * Exported data types
 */
extern config_t config;
extern cudaDeviceProp device;

#endif /* CONFIG_H_ */
260 homework_3/src/main.cpp Normal file
@@ -0,0 +1,260 @@
/*!
 * \file
 * \brief Main application file for PDS HW3 (CUDA)
 *
 * \author
 *    Christos Choutouridis AEM:8997
 *    <cchoutou@ece.auth.gr>
 */

#include <exception>
#include <iostream>
#include <algorithm>
#include <random>
#include <cuda_runtime.h>

#include "utils.hpp"
#include "config.h"
#include "bitonicsort.hpp"


// Global session data
Data_t Data;
config_t config;
Log logger;
cudaDeviceProp device;

// Mersenne twister seeded from hardware if possible. Range: [type_min, type_max]
std::random_device rd;
std::mt19937 gen(rd());

//! Performance timers for each one of the "costly" functions
Timing Timer_total, Timer_memory, Timer_sorting;


//! Init timing objects for extra rounds
void measurements_init() {
    if (config.perf > 1) {
        Timer_total.init(config.perf);
        Timer_memory.init(config.perf);
        Timer_sorting.init(config.perf);
    }
}

//! Iterate to the next round of measurements for all measurement objects
void measurements_next() {
    if (config.perf > 1) {
        Timer_total.next();
        Timer_memory.next();
        Timer_sorting.next();
    }
}

/*!
 * A small command line argument parser
 * \return The status of the operation
 */
bool get_options(int argc, char* argv[]) {
    bool status = true;

    // iterate over the passed arguments
    for (int i = 1; i < argc; ++i) {
        std::string arg(argv[i]);  // get current argument

        if (arg == "-q" || arg == "--array-size") {
            if (i+1 < argc) {
                config.arraySize = (ArraySize_t)1 << atoi(argv[++i]);
            }
            else {
                status = false;
            }
        }
        else if (arg == "-b" || arg == "--block-size") {
            if (i+1 < argc) {
                config.blockSize = atoi(argv[++i]);
            }
            else {
                status = false;
            }
        }
        else if (arg == "--validation") {
            config.validation = true;
        }
        else if (arg == "--perf") {
            if (i+1 < argc) {
                config.perf = atoi(argv[++i]);
            }
            else {
                status = false;
            }
        }
        else if (arg == "-v" || arg == "--verbose") {
            config.verbose = true;
        }
        else if (arg == "--version") {
            std::cout << STR(TARGET) << " - A GPU accelerated bitonic sort utility (V" << STR(CODE_VERSION) << ") \n";
            std::cout << "version: " << version << "\n\n";
            exit(0);
        }
        else if (arg == "-h" || arg == "--help") {
            std::cout << STR(TARGET) << " - A GPU accelerated bitonic sort utility (V" << STR(CODE_VERSION) << ") \n\n";
            std::cout << "  " << STR(TARGET) << " -q <N> -b <N> [--validation] [--perf <N>] [-v]\n";
            std::cout << "  " << STR(TARGET) << " -h\n";
            std::cout << "  " << STR(TARGET) << " --version\n";
            std::cout << '\n';
            std::cout << "Options:\n\n";
            std::cout << "  -q | --array-size <N>\n";
            std::cout << "    Selects the array size according to size = 2^N\n";
            std::cout << "    [Size must be larger than 2 * blockSize]\n";
            std::cout << "    [Default is 2^16]\n\n";
            std::cout << "  -b | --block-size <N>\n";
            std::cout << "    Selects the number of CUDA threads per block\n";
            std::cout << "    [Size has to be a multiple of the device's warp size (usually 32)]\n";
            std::cout << "    [Default is the maximum device supported number. For ex: (GTX 1650) block-size=1024]\n\n";
            std::cout << "  --validation\n";
            std::cout << "    Request a full validation at the end\n\n";
            std::cout << "  --perf <N>\n";
            std::cout << "    Enable performance timing measurements and prints, and repeat\n";
            std::cout << "    the sorting <N> times.\n\n";
            std::cout << "  -v | --verbose\n";
            std::cout << "    Request a more verbose output to stdout.\n\n";
            std::cout << "  -h | --help\n";
            std::cout << "    Prints this and exits.\n\n";
            std::cout << "  --version\n";
            std::cout << "    Prints the version and exits.\n\n";
            std::cout << "Examples:\n\n";
            std::cout << "  " << STR(TARGET) << " -q 24\n";
            std::cout << "    Runs bitonic sort on a 2^24-point array, using GPU acceleration\n\n";
            std::cout << "  " << STR(TARGET) << " --validation --perf 5 -b 512 -q 26\n";
            std::cout << "    Runs bitonic sort on a 2^26-point array 5 times, using GPU acceleration with\n";
            std::cout << "    512 threads per block, performs a validation check at the end and prints the\n";
            std::cout << "    median times.\n\n";
            exit(0);
        }
        else {  // parse error
            std::cout << "Invocation error. Try -h for details.\n";
            status = false;
        }
    }

    // Check configuration requirements
    if (config.blockSize % device.warpSize)
        throw std::runtime_error("[Config] - Number of threads per block is not an exact multiple of the warp size\n");
    if (config.arraySize < 2*config.blockSize)
        throw std::runtime_error("[Config] - Unsupported array size (smaller than "
            + std::to_string(SizeToThreadsRatio*config.blockSize) + ")\n");
    if (device.totalGlobalMem < config.arraySize * sizeof(Value_t))
        throw std::runtime_error("[CUDA] - Unsupported array size: "
            + std::to_string(config.arraySize * sizeof(Value_t))
            + " (larger than GPU's: " + std::to_string(device.totalGlobalMem) + ")\n");

    return status;
}

/*!
 * A simple validator for the sorted result.
 *
 * @tparam DataT A buffer type with random access iterators.
 *
 * @param data [DataT] The data
 * @return     [bool] True if sorted in ascending order
 */
template<typename DataT>
bool validator(DataT& data) {
    return std::is_sorted(data.begin(), data.end());
}

/*!
 * Initializes the environment; must be called once at startup.
 *
 * @param argc [int*] POINTER to main's argc argument
 * @param argv [char***] POINTER to main's argv argument
 */
void init(int* argc, char*** argv) {

    // Get device configuration
    if (cudaGetDeviceProperties(&device, 0) != cudaSuccess)
        throw std::runtime_error("[CUDA] - Can not read GPU");

    config.blockSize = static_cast<size_t>(device.maxThreadsPerBlock);

    // try to read the command line
    if (!get_options(*argc, *argv))
        exit(1);

    // Prepare vector and timing data
    Data.resize(config.arraySize);
    measurements_init();
}

#if !defined TESTING
/*!
 * @return Returns 0, but.... we may throw or exit(0) / exit(1)
 */
int main(int argc, char* argv[]) try {

    // Init everything
    init(&argc, &argv);

    logger << "Array size: " << config.arraySize << " (Q=" << static_cast<size_t>(log2(config.arraySize)) << ")" << logger.endl;
    logger << "Repeated sorts: " << config.perf << logger.endl;
    logger << "GPU: " << device.name << logger.endl;
    logger << "Block size: " << config.blockSize << logger.endl;

    for (size_t it = 0; it < config.perf; ++it) {
        // Initialize local data
        logger << "Initialize array ... ";
        std::uniform_int_distribution<Value_t> dis(
            std::numeric_limits<Value_t>::min(),
            std::numeric_limits<Value_t>::max()
        );
        std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); });
        logger << " Done." << logger.endl;

        // Run the sort
        logger << "Start sorting ... ";
        Timer_total.start();
        bitonicSort(Data);
        Timer_total.stop();
        measurements_next();
        logger << " Done." << logger.endl;
    }

    // Print-outs and validation
    if (config.perf > 1) {
        Timing::print_duration(Timer_total.median(),  "Total   ");
        Timing::print_duration(Timer_memory.median(), "Mem-xch ");
        Timing::print_duration(Timer_sorting.median(),"Sorting ");
    }
    if (config.validation) {
        // If requested, we have the chance to fail!
        std::cout << "[Validation] Results validation ...";
        bool val = validator(Data);
        std::cout << ((val) ? "\x1B[32m [PASSED] \x1B[0m\n" : " \x1B[31m [FAILED] \x1B[0m\n");
    }
    return 0;
}
catch (std::exception& e) {
    // We probably pollute the user's screen. Comment the `cerr << ...` line if you don't like it.
    std::cerr << "Error: " << e.what() << '\n';
    exit(1);
}

#else

#include <gtest/gtest.h>
#include <exception>

/*!
 * The testing version of our program
 */
GTEST_API_ int main(int argc, char **argv) try {
    testing::InitGoogleTest(&argc, argv);
    return RUN_ALL_TESTS();
}
catch (std::exception& e) {
    std::cout << "Exception: " << e.what() << '\n';
}

#endif
157 homework_3/src/utils.hpp Normal file
@@ -0,0 +1,157 @@
/**
 * \file
 * \brief Utilities header
 *
 * \author
 *    Christos Choutouridis AEM:8997
 *    <cchoutou@ece.auth.gr>
 */
#ifndef UTILS_HPP_
#define UTILS_HPP_

#include <vector>
#include <iostream>
#include <chrono>
#include <cstdio>     // required: std::sprintf below
#include <unistd.h>
#include <algorithm>

#include "config.h"

/*!
 * Stringify preprocessor util
 */
#define STR(s) STR_(s)
#define STR_(s) #s

/*!
 * A Logger for the entire program.
 */
struct Log {
    struct Endl {} endl;  //!< A tag object to use as a new-line request.

    //! We provide logging via the << operator
    template<typename T>
    Log& operator<<(T&& t) {
        if (config.verbose) {
            if (line_) {
                std::cout << "[Log]: " << t;
                line_ = false;
            } else
                std::cout << t;
        }
        return *this;
    }

    // Overload for special end-line handling
    Log& operator<<(Endl e) {
        (void) e;
        if (config.verbose) {
            std::cout << '\n';
            line_ = true;
        }
        return *this;
    }

private:
    bool line_{true};
};

extern Log logger;

/*!
 * A small timing utility based on chrono that supports timing rounds
 * and returning the median of them. Time can accumulate to the measurement
 * of each round.
 */
struct Timing {
    using Tpoint       = std::chrono::steady_clock::time_point;
    using Tduration    = std::chrono::microseconds;
    using microseconds = std::chrono::microseconds;
    using milliseconds = std::chrono::milliseconds;
    using seconds      = std::chrono::seconds;

    //! Set up measurement rounds
    void init(size_t rounds) {
        duration_.resize(rounds);
        for (auto& d : duration_)
            d = Tduration::zero();
    }

    //! Tool to mark the starting point
    Tpoint start() noexcept { return mark_ = std::chrono::steady_clock::now(); }

    //! Tool to mark the ending point
    Tpoint stop() noexcept {
        Tpoint now = std::chrono::steady_clock::now();
        duration_[current_] += dt(now, mark_);
        return now;
    }

    //! Switch to the next timing slot
    void next() noexcept {
        ++current_;
        current_ %= duration_.size();
    }

    Tduration& median() noexcept {
        std::sort(duration_.begin(), duration_.end());
        return duration_[duration_.size()/2];
    }

    //! A duration calculation utility
    static Tduration dt(Tpoint t2, Tpoint t1) noexcept {
        return std::chrono::duration_cast<Tduration>(t2 - t1);
    }

    //! Tool to print the time interval
    static void print_duration(const Tduration& duration, const char* what) noexcept {
        if (std::chrono::duration_cast<microseconds>(duration).count() < 10000)
            std::cout << "[Timing] " << what << ": "
                << std::to_string(std::chrono::duration_cast<microseconds>(duration).count()) << " [usec]\n";
        else if (std::chrono::duration_cast<milliseconds>(duration).count() < 10000)
            std::cout << "[Timing] " << what << ": "
                << std::to_string(std::chrono::duration_cast<milliseconds>(duration).count()) << " [msec]\n";
        else {
            char stime[26];  // fits an unsigned long
            auto sec  = std::chrono::duration_cast<seconds>(duration).count();
            auto msec = (std::chrono::duration_cast<milliseconds>(duration).count() % 1000) / 10;  // keep 2 digits
            std::sprintf(stime, "%ld.%1ld", sec, msec);
            std::cout << "[Timing] " << what << ": " << stime << " [sec]\n";
        }
    }

private:
    size_t current_{0};
    Tpoint mark_{};
    std::vector<Tduration> duration_{1};
};

/*!
 * A "high level function"-like utility macro to forward a function call
 * and accumulate the execution time in the corresponding timing object.
 *
 * @param Tim  The Timing object [needs to have start() and stop() methods]
 * @param Func The function name
 * @param ...  The arguments to pass to the function (the preprocessor way)
 */
#define timeCall(Tim, Func, ...) \
    Tim.start();                 \
    Func(__VA_ARGS__);           \
    Tim.stop();
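// Example use (illustration; main.cpp drives the timers with start()/stop() directly):
//   timeCall(Timer_total, bitonicSort, Data);  // accumulates into the current round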

/*!
 * A utility to check if a number is a power of two.
 *
 * @tparam Integral The integral type of the number to check
 * @param x The number to check
 * @return True if it is a power of 2, false otherwise
 */
template <typename Integral>
constexpr inline bool isPowerOfTwo(Integral x) noexcept {
    return (!(x & (x - 1)) && x);
}


#endif /* UTILS_HPP_ */
11673 homework_3/test/gtest/gtest/gtest-all.cpp Normal file
(File diff suppressed because it is too large)

17103 homework_3/test/gtest/gtest/gtest.h Normal file
(File diff suppressed because it is too large)
33 homework_3/test/tests.cpp Normal file
@@ -0,0 +1,33 @@
/**
 * \file
 * \brief PDS HW3 tests
 *
 * To run these tests execute:
 *    ...
 *
 * \author
 *    Christos Choutouridis AEM:8997
 *    <cchoutou@ece.auth.gr>
 */

#include <gtest/gtest.h>

/*
 * Global fixtures
 */

class TCUDAbitonic : public ::testing::Test {
protected:
    static void SetUpTestSuite() { }

    static void TearDownTestSuite() { }
};


/*
 *
 */
TEST_F(TCUDAbitonic, test1) {
    EXPECT_EQ(true, true);
}