@@ -25,12 +25,12 @@ PROJECT := PDS_homework_3 | |||
TARGET := bitonic | |||
# Source directories list (space separated). Makefile-relative path, UNDER current directory.
SRC_DIR_LIST := src test test/gtest | |||
SRC_DIR_LIST := src #test test/gtest | |||
# Include directories list (space separated). Makefile-relative path.
INC_DIR_LIST := src \ | |||
test \ | |||
test/gtest/ \ | |||
INC_DIR_LIST := src | |||
# test \ | |||
# test/gtest/ \ | |||
# Exclude files list (space separated). Filenames only.
@@ -45,10 +45,10 @@ OUTPUT_DIR := out | |||
# ========== Compiler settings ========== | |||
# Compiler flags for debug and release | |||
DEB_CFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c11 -fopenmp | |||
REL_CFLAGS := -Wall -Wextra -O3 -std=c11 -fopenmp | |||
DEB_CXXFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c++17 -fopenmp | |||
REL_CXXFLAGS := -Wall -Wextra -O3 -std=c++17 -fopenmp | |||
DEB_CFLAGS := -DDEBUG -g3 -std=c11 -Xcompiler "-Wall -Wextra" | |||
REL_CFLAGS := -O3 -std=c11 -Xcompiler "-Wall -Wextra" | |||
DEB_CXXFLAGS := -DDEBUG -g3 -std=c++17 -Xcompiler "-Wall -Wextra" | |||
REL_CXXFLAGS := -O3 -std=c++17 -Xcompiler "-Wall -Wextra" | |||
# Pre-defines | |||
# PRE_DEFS := MYCAB=1729 SUPER_MODE | |||
@@ -56,15 +56,15 @@ PRE_DEFS := | |||
# ============== Linker settings ============== | |||
# Linker flags (example: -pthread -lm) | |||
LDFLAGS := -pthread | |||
LDFLAGS := | |||
# Map output file | |||
MAP_FILE := output.map | |||
MAP_FLAG := -Xlinker -Map=$(BUILD_DIR)/$(MAP_FILE) | |||
MAP_FILE := # output.map | |||
MAP_FLAG := # -Xlinker -Map=$(BUILD_DIR)/$(MAP_FILE) | |||
# ============== Docker settings ============== | |||
# We need: | |||
# - Bind the entire project directory(the dir that icludes all the code) as volume. | |||
# - Bind the entire project directory(the dir that includes all the code) as volume. | |||
# - In docker instance, change to working directory(where the makefile is). | |||
DOCKER_VOL_DIR := $(shell pwd) | |||
DOCKER_WRK_DIR := | |||
@@ -85,6 +85,7 @@ CFLAGS := $(DEB_CFLAGS) | |||
CXXFLAGS := $(DEB_CXXFLAGS) | |||
CXX := g++ #mpic++ | |||
CC := gcc #mpicc | |||
LINKER := g++ | |||
# | |||
# =========== Main body and Patterns =========== | |||
@@ -117,37 +118,37 @@ DEP := $(foreach file,$(SRC:%.cpp=%.d),$(DEP_DIR)/$(file)) | |||
# It is based on Tom Tromey's method. | |||
# | |||
# Invoke cpp to create makefile rules with dependencies for each source file | |||
$(DEP_DIR)/%.d: %.c | |||
@mkdir -p $(@D) | |||
@$(DOCKER) $(CC) -E $(CFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.c=.o) -MF $@ $< | |||
#$(DEP_DIR)/%.d: %.c | |||
# @mkdir -p $(@D) | |||
# @$(DOCKER) $(CC) -E $(CFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.c=.o) -MF $@ $< | |||
# c file objects depend on .c AND dependency files, which have an empty recipe
$(OBJ_DIR)/%.o: %.c $(DEP_DIR)/%.d | |||
$(OBJ_DIR)/%.o: %.c | |||
@mkdir -p $(@D) | |||
@$(DOCKER) $(CC) -c $(CFLAGS) $(INC) $(DEF) -o $@ $< | |||
$(DOCKER) $(CC) -c $(CFLAGS) $(INC) $(DEF) -o $@ $< | |||
$(DEP_DIR)/%.d: %.cpp | |||
@mkdir -p $(@D) | |||
@$(DOCKER) $(CXX) -E $(CXXFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.cpp=.o) -MF $@ $< | |||
#$(DEP_DIR)/%.d: %.cpp | |||
# @mkdir -p $(@D) | |||
# @$(DOCKER) $(CXX) -E $(CXXFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.cpp=.o) -MF $@ $< | |||
# cpp file objects depent on .cpp AND dependency files, which have an empty recipe | |||
$(OBJ_DIR)/%.o: %.cpp $(DEP_DIR)/%.d | |||
# cpp file objects depend on .cpp AND dependency files, which have an empty recipe | |||
$(OBJ_DIR)/%.o: %.cpp | |||
@mkdir -p $(@D) | |||
@$(DOCKER) $(CXX) -c $(CXXFLAGS) $(INC) $(DEF) -o $@ $< | |||
$(DOCKER) $(CXX) -c $(CXXFLAGS) $(INC) $(DEF) -o $@ $< | |||
# empty recipe for dependency files. This prevents make errors | |||
$(DEP): | |||
#$(DEP): | |||
# now include all dependencies | |||
# After all they are makefile dependency rules ;) | |||
include $(wildcard $(DEP)) | |||
#include $(wildcard $(DEP)) | |||
# main target rule | |||
$(BUILD_DIR)/$(TARGET): $(OBJ) | |||
@mkdir -p $(@D) | |||
@echo Linking to target: $(TARGET) | |||
@echo $(DOCKER) $(CXX) '$$(OBJ)' $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET) | |||
@$(DOCKER) $(CXX) $(OBJ) $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET) | |||
@echo $(DOCKER) $(LINKER) '$$(OBJ)' $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET) | |||
@$(DOCKER) $(LINKER) $(OBJ) $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET) | |||
@echo | |||
@echo Print size information | |||
@$(CSIZE) $(@D)/$(TARGET) | |||
@@ -179,10 +180,12 @@ release: $(BUILD_DIR)/$(TARGET) | |||
# | |||
bitonic_v0: CC := nvcc | |||
bitonic_v0: CXX := nvcc | |||
bitonic_v0: CC := nvcc -x cu | |||
bitonic_v0: CXX := nvcc -x cu | |||
bitonic_v0: LINKER := nvcc | |||
bitonic_v0: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V0 | |||
bitonic_v0: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V0 | |||
bitonic_v0: OUTPUT_DIR := $(OUTPUT_DIR)/v0 | |||
bitonic_v0: TARGET := bitonic_v0 | |||
bitonic_v0: $(BUILD_DIR)/$(TARGET) | |||
@mkdir -p $(OUTPUT_DIR) | |||
@@ -191,11 +194,7 @@ bitonic_v0: $(BUILD_DIR)/$(TARGET) | |||
hpc-build: | |||
make clean | |||
make distbubbletonic | |||
make clean | |||
make distbitonic | |||
make clean | |||
make tests | |||
make bitonic_v0 | |||
all: debug bitonic_v0 | |||
@@ -0,0 +1,33 @@ | |||
Parallel & Distributed Computer Systems HW3 | |||
January, 2025 | |||
Write a program that sorts $N$ integers in ascending order, using CUDA. | |||
The program must perform the following tasks: | |||
- The user specifies a positive integer $q$.
- Start a process with an array of $N = 2^q$ random integers.
- Sort all $N$ elements in ascending order.
- Check the correctness of the final result. | |||
Your implementation should be based on the following steps: | |||
V0. A kernel where each thread only compares and exchanges. This "eliminates" the 1:n innermost loop. Easy to write, but too many function calls and global synchronizations. | |||
V1. Include the k inner loop in the kernel function. How do we handle the synchronization? Fewer calls, fewer global synchronizations. Faster than V0! | |||
V2. Modify the kernel of V1 to work with local memory instead of global. | |||
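A minimal sketch of the V1 idea, with illustrative names and assumptions (not a reference solution): the inner step loop moves into the kernel for the steps whose exchange distance fits inside one thread block, where __syncthreads() is a valid barrier; larger distances still take one kernel launch per step.
template <typename ValueT>
__global__ void bitonicStepsInBlock(ValueT* data, size_t n, size_t stage, size_t first_step) {
    // Valid only while (1u << first_step) < blockDim.x, so every partner lies in this block.
    size_t tid = threadIdx.x + blockIdx.x * static_cast<size_t>(blockDim.x);
    for (size_t step = first_step + 1; step-- > 0; ) {          // step = first_step, ..., 0
        if (tid < n) {
            size_t pid = tid ^ (size_t{1} << step);             // partner index
            if (pid < n && tid < pid) {                         // lower thread of the pair swaps
                bool keep = !(tid & (size_t{1} << stage));      // ascending region keeps small values low
                if ((keep && data[tid] > data[pid]) || (!keep && data[tid] < data[pid])) {
                    ValueT tmp = data[tid]; data[tid] = data[pid]; data[pid] = tmp;
                }
            }
        }
        __syncthreads();                                        // block-wide barrier between steps
    }
}
The host loop would launch this once per stage for the low steps and keep one launch per step, as in V0, for steps whose distance spans more than a block.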
You must deliver: | |||
- A report (about $3-4$ pages) that describes your parallel algorithm and implementation. | |||
- Your comments on the speed of your parallel program compared to the serial sort, after trying your program on aristotelis for $q = [20:27]$.
- The source code of your program uploaded online. | |||
Ethics: If you use code found on the web or generated by an LLM, you should mention your source and the changes you made. You may work in pairs; partners submit a single joint report with both names.
Deadline: 2 February, $2025$. |
@@ -0,0 +1,145 @@ | |||
/*! | |||
* \file | |||
* \brief Bitonic sort CUDA implementation header | |||
* | |||
* \author | |||
* Christos Choutouridis AEM:8997 | |||
* <cchoutou@ece.auth.gr> | |||
*/ | |||
#ifndef BITONICSORTCUDA_H_ | |||
#define BITONICSORTCUDA_H_ | |||
#include <cuda_runtime.h> | |||
#include <vector> | |||
#include <cmath> | |||
#include <cstdint> | |||
#include <utility> | |||
#include "utils.hpp" | |||
/* | |||
* Exported timers | |||
*/ | |||
extern Timing Timer_total; | |||
using threadId_t = size_t; | |||
/* | |||
* ============================== Sort utilities ============================== | |||
*/ | |||
/*! | |||
* Returns the ascending or descending configuration (up/down phase) of the thread id | |||
* depending on the current stage
* | |||
* @param tid [threadId_t] The current thread | |||
* @param stage [size_t] The current stage of the sorting network (same for each step) | |||
* @return [bool] True if we need ascending configuration, false otherwise | |||
*/ | |||
__device__ inline bool ascending(threadId_t tid, size_t stage) noexcept { | |||
return !(tid & (1 << stage)); | |||
} | |||
/*! | |||
* Returns the thread's partner for data exchange during the sorting network iterations | |||
* of Bitonic | |||
* | |||
* @param tid [threadId_t] The current thread
* @param step [size_t] The step of the sorting network
* @return [threadId_t] The thread id of the partner for data exchange
*/ | |||
__device__ inline threadId_t partner(threadId_t tid, size_t step) noexcept { | |||
return (tid ^ (1 << step)); | |||
} | |||
/*! | |||
* Predicate to check if a thread keeps the small numbers during the bitonic sort network exchange.
*
* @param tid [threadId_t] The thread for which we check
* @param partner [threadId_t] The partner of the data exchange
* @param stage [size_t] The current stage of the sorting network (same for each step)
* @return [bool] True if the thread should keep the small values, false otherwise
*/ | |||
__device__ inline bool keepSmall(threadId_t tid, threadId_t partner, size_t stage) { | |||
return ascending(tid, stage) == (tid < partner); | |||
} | |||
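/*
 * Worked example for the three helpers above (stage = 2, step = 1, tid = 5 = 0b101):
 *   ascending(5, 2)    = !(0b101 & 0b100)   = false  -> descending region
 *   partner(5, 1)      =   0b101 ^ 0b010    = 7
 *   keepSmall(5, 7, 2) = (false == (5 < 7)) = false  -> thread 5 keeps the large value
 */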
/* | |||
* ============================== Sort algorithms ============================== | |||
*/ | |||
template <typename ValueT> | |||
__device__ void cudaExchange(ValueT* data, int tid, int partner, bool keepSmall) { | |||
if (( keepSmall && (data[tid] > data[partner])) || | |||
(!keepSmall && (data[tid] < data[partner])) ) { | |||
ValueT temp = data[tid]; | |||
data[tid] = data[partner]; | |||
data[partner] = temp; | |||
} | |||
} | |||
template <typename ValueT> | |||
__global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) { | |||
threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x; // Compute global thread ID | |||
if (tid < n) { | |||
threadId_t pid = partner(tid, step); | |||
// Only the lower-id thread of each pair performs the exchange; otherwise both
// threads of a pair write the same two elements concurrently (a data race).
if (pid < n && tid < pid) {
bool keep = keepSmall(tid, pid, stage);
cudaExchange(data, tid, pid, keep);
}
} | |||
} | |||
/*!
* A CUDA version of the Bitonic sort algorithm (here V0: one kernel launch per step).
*
* @note
* The input size must be a power of two.
*
* @tparam DataT A contiguous host buffer type with random access iterators (e.g. std::vector)
*
* @param data [DataT] The host data to sort
*/
template <typename DataT> | |||
void bitonicSort(DataT& data) { | |||
using value_t = typename DataT::value_type; | |||
value_t* dev_data; | |||
auto size = data.size(); | |||
cudaMalloc(&dev_data, size * sizeof(value_t)); | |||
cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice); | |||
int Nthreads = 1024; | |||
int Nblocks = (size + Nthreads - 1) / Nthreads; | |||
size_t max_depth = static_cast<size_t>(log2(size)); | |||
for (size_t stage = 1; stage <= max_depth; ++stage) { | |||
for (size_t step = stage; step > 0; ) { | |||
--step; | |||
bitonicStep<<<Nblocks, Nthreads>>>(dev_data, size, step, stage); | |||
cudaDeviceSynchronize(); | |||
} | |||
} | |||
cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost); | |||
cudaFree(dev_data); | |||
} | |||
#endif //BITONICSORTCUDA_H_ |
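For reference, a minimal host-side driver sketch for the header above, assuming it is saved as bitonicsort.hpp, that the project's utils.hpp is on the include path, and that it is built with nvcc as CUDA (e.g. nvcc -x cu -std=c++17); the file name and the timing approach are illustrative, not taken from the patch.
// Hypothetical driver: fill a vector with random integers, sort a copy on the GPU with
// bitonicSort() from the header above, time it against std::sort, and verify the result.
#include <vector>
#include <random>
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <iostream>
#include "bitonicsort.hpp"      // assumed file name of the header shown above

int main() {
    const size_t q = 20;                          // N = 2^q, power of two as bitonic sort requires
    std::vector<uint32_t> data(size_t{1} << q);

    std::mt19937 gen(std::random_device{}());
    std::uniform_int_distribution<uint32_t> dis;
    std::generate(data.begin(), data.end(), [&] { return dis(gen); });
    std::vector<uint32_t> reference = data;       // copy for the serial baseline

    auto t0 = std::chrono::high_resolution_clock::now();
    bitonicSort(data);                            // copies to device, runs the network, copies back
    auto t1 = std::chrono::high_resolution_clock::now();
    std::sort(reference.begin(), reference.end());
    auto t2 = std::chrono::high_resolution_clock::now();

    using ms = std::chrono::duration<double, std::milli>;
    std::cout << "GPU bitonic: " << ms(t1 - t0).count() << " ms, "
              << "std::sort: "  << ms(t2 - t1).count() << " ms, "
              << (data == reference ? "match" : "MISMATCH") << std::endl;
    return 0;
}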
@@ -35,7 +35,7 @@ static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16; | |||
/*! | |||
* Value type selection | |||
* Value and Buffer type selection | |||
* | |||
* We support the following built-in types, or the <cstdint> aliases that translate to them:
* char - unsigned char | |||
@@ -46,7 +46,8 @@ static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16; | |||
* float | |||
* double | |||
*/ | |||
using distValue_t = uint32_t; | |||
using Value_t = uint32_t; | |||
using Data_t = std::vector<Value_t>; | |||
/*! | |||
* Session option for each invocation of the executable. | |||
@@ -1,51 +0,0 @@ | |||
/*! | |||
* \file | |||
* \brief Distributed sort implementation | |||
* | |||
* \author | |||
* Christos Choutouridis AEM:8997 | |||
* <cchoutou@ece.auth.gr> | |||
*/ | |||
#include "utils.hpp" | |||
#include "distsort.hpp" | |||
/*! | |||
* Returns the ascending or descending configuration of the node's sequence based on | |||
* the current node (MPI process) and the depth of the sorting network | |||
* | |||
* @param node [mpi_id_t] The current node (MPI process) | |||
* @param depth [size_t] The total depth of the sorting network (same for each step for a given network) | |||
* @return [bool] True if we need ascending configuration, false otherwise | |||
*/ | |||
bool ascending(mpi_id_t node, size_t depth) noexcept { | |||
return !(node & (1 << depth)); | |||
} | |||
/*! | |||
* Returns the node's partner for data exchange during the sorting network iterations | |||
* of Bitonic | |||
* | |||
* @param node [mpi_id_t] The current node | |||
* @param step [size_t] The step of the sorting network | |||
* @return [mpi_id_t] The node id of the partner for data exchange | |||
*/ | |||
mpi_id_t partner(mpi_id_t node, size_t step) noexcept { | |||
return (node ^ (1 << step)); | |||
} | |||
/*! | |||
* Predicate to check if a node keeps the small numbers during the bitonic sort network exchange. | |||
* | |||
* @param node [mpi_id_t] The node for which we check | |||
* @param partner [mpi_id_t] The partner of the data exchange | |||
* @param depth [size_t] The total depth of the sorting network (same for each step for a given network) | |||
* @return [bool] True if the node should keep the small values, false otherwise | |||
*/ | |||
bool keepSmall(mpi_id_t node, mpi_id_t partner, size_t depth) { | |||
if (node == partner) | |||
throw std::runtime_error("(keepSmall) Node and Partner can not be the same\n"); | |||
return ascending(node, depth) == (node < partner); | |||
} |
@@ -1,223 +0,0 @@ | |||
/*! | |||
* \file | |||
* \brief Distributed sort implementation header | |||
* | |||
* \author | |||
* Christos Choutouridis AEM:8997 | |||
* <cchoutou@ece.auth.gr> | |||
*/ | |||
#ifndef DISTBITONIC_H_ | |||
#define DISTBITONIC_H_ | |||
#include <vector> | |||
#include <algorithm> | |||
#include <parallel/algorithm> | |||
#include <cmath> | |||
#include <cstdint> | |||
#if !defined DEBUG | |||
#define NDEBUG | |||
#endif | |||
#include <cassert> | |||
#include "utils.hpp" | |||
/* | |||
* Exported timers | |||
*/ | |||
extern Timing Timer_total; | |||
extern Timing Timer_fullSort; | |||
extern Timing Timer_exchange; | |||
extern Timing Timer_minmax; | |||
extern Timing Timer_elbowSort; | |||
/* | |||
* ============================== Sort utilities ============================== | |||
*/ | |||
/*! | |||
* Returns the ascending or descending configuration of the node's sequence based on | |||
* the current node (MPI process) and the depth of the sorting network | |||
* | |||
* @param node [mpi_id_t] The current node (MPI process) | |||
* @param depth [size_t] The total depth of the sorting network (same for each step for a given network) | |||
* @return [bool] True if we need ascending configuration, false otherwise | |||
*/ | |||
bool ascending(mpi_id_t node, size_t depth); | |||
/*! | |||
* Returns the node's partner for data exchange during the sorting network iterations | |||
* of Bitonic | |||
* | |||
* @param node [mpi_id_t] The current node | |||
* @param step [size_t] The step of the sorting network | |||
* @return [mpi_id_t] The node id of the partner for data exchange | |||
*/ | |||
mpi_id_t partner(mpi_id_t node, size_t step); | |||
/*! | |||
* Predicate to check if a node keeps the small numbers during the bitonic sort network exchange. | |||
* | |||
* @param node [mpi_id_t] The node for which we check | |||
* @param partner [mpi_id_t] The partner of the data exchange | |||
* @param depth [size_t] The total depth of the sorting network (same for each step for a given network) | |||
* @return [bool] True if the node should keep the small values, false otherwise | |||
*/ | |||
bool keepSmall(mpi_id_t node, mpi_id_t partner, size_t depth); | |||
/* | |||
* ============================== Data utilities ============================== | |||
*/ | |||
/*! | |||
* Sort a range using the build-in O(Nlog(N)) algorithm | |||
* | |||
* @tparam RangeT A range type with random access iterator | |||
* | |||
* @param data [RangeT] The data to be sorted | |||
* @param ascending [bool] Flag to indicate the sorting order | |||
*/ | |||
template<typename RangeT> | |||
void fullSort(RangeT& data, bool ascending) noexcept { | |||
// Use introsort from stdlib++ here, unless ... __gnu_parallel | |||
if (ascending) { | |||
__gnu_parallel::sort(data.begin(), data.end(), std::less<>()); | |||
} | |||
else { | |||
__gnu_parallel::sort(data.begin(), data.end(), std::greater<>()); | |||
} | |||
} | |||
/*! | |||
* Core functionality of sort for shadowed buffer types using | |||
* the "elbow sort" algorithm. | |||
* | |||
* @note: | |||
* This algorithm can not work "in place". | |||
* We use the active buffer as source and the shadow as target. | |||
* At the end we switch which buffer is active and which is the shadow. | |||
* @note | |||
* This is the core functionality. Use the elbowSort() function instead | |||
* | |||
* @tparam ShadowedDataT A Shadowed buffer type with random access iterator. | |||
* @tparam CompT A Comparison type for binary operation comparisons | |||
* | |||
* @param data [ShadowedDataT] The data to sort | |||
* @param ascending [bool] Flag to indicate the sorting order | |||
* @param comp [CompT] The binary operator object | |||
*/ | |||
template<typename ShadowedDataT, typename CompT> | |||
void elbowSortCore(ShadowedDataT& data, bool ascending, CompT comp) noexcept { | |||
auto& active = data.getActive(); // Get the source vector (the data to sort) | |||
auto& shadow = data.getShadow(); // Get the target vector (the sorted data) | |||
size_t N = data.size(); // The total size is the same or both vectors | |||
size_t left = std::distance( | |||
active.begin(), | |||
(ascending) ? | |||
std::min_element(active.begin(), active.end()) : | |||
std::max_element(active.begin(), active.end()) | |||
); // start 'left' from elbow of the bitonic | |||
size_t right = (left == N-1) ? 0 : left + 1; | |||
// Walk in opposite directions from elbow and insert-sort to target vector | |||
for (size_t i = 0 ; i<N ; ++i) { | |||
if (comp(active[left], active[right])) { | |||
shadow[i] = active[left]; | |||
left = (left == 0) ? N-1 : left -1; // cycle decrease | |||
} | |||
else { | |||
shadow[i] = active[right]; | |||
right = (right + 1) % N; // cycle increase | |||
} | |||
} | |||
data.switch_active(); // Switch active-shadow buffers | |||
} | |||
/*! | |||
* Sort a shadowed buffer using the "elbow sort" algorithm. | |||
* | |||
* @tparam ShadowedDataT A Shadowed buffer type with random access iterator. | |||
* | |||
* @param data [ShadowedDataT] The data to sort | |||
* @param ascending [bool] Flag to indicate the sorting order | |||
*/ | |||
template<typename ShadowedDataT> | |||
void elbowSort(ShadowedDataT& data, bool ascending) noexcept { | |||
if (ascending) | |||
elbowSortCore(data, ascending, std::less<>()); | |||
else | |||
elbowSortCore(data, ascending, std::greater<>()); | |||
} | |||
/*! | |||
* Takes two sequences and selects either the larger or the smaller items | |||
* in one-to-one comparison between them. If the initial sequences are bitonic, then | |||
* the result is a bitonic sequence too! | |||
* | |||
* @tparam ValueT The underlying type of the sequences | |||
* | |||
* @param local [ValueT*] Pointer to the local sequence | |||
* @param remote [const ValueT*] Pointer to the remote sequence (copied locally by MPI) | |||
* @param count [size_t] The number of items to process | |||
* @param keepSmall [bool] Flag to indicate if we keep the small items in local sequence | |||
*/ | |||
template<typename ValueT> | |||
void keepMinOrMax(ValueT* local, const ValueT* remote, size_t count, bool keepSmall) noexcept { | |||
std::transform( | |||
local, local + count, | |||
remote, | |||
local, | |||
[&keepSmall](const ValueT& a, const ValueT& b){ | |||
return (keepSmall) ? std::min(a, b) : std::max(a, b); | |||
}); | |||
} | |||
/* | |||
* ============================== Sort algorithms ============================== | |||
*/ | |||
/*! | |||
* A distributed version of the Bitonic sort algorithm. | |||
* | |||
* @note | |||
* Each MPI process should run an instance of this function. | |||
* | |||
* @tparam ShadowedDataT A Shadowed buffer type with random access iterator. | |||
* | |||
* @param data [ShadowedDataT] The local to MPI process data to sort | |||
* @param Processes [mpi_id_t] The total number of MPI processes | |||
* @param rank [mpi_id_t] The current process id | |||
*/ | |||
template<typename ShadowedDataT> | |||
void distBitonic(ShadowedDataT& data) { | |||
// Initially sort to create a half part of a bitonic sequence | |||
timeCall(Timer_fullSort, fullSort, data, ascending(rank, 0)); | |||
// Run through sort network using elbow-sort ( O(LogN * LogN) iterations ) | |||
auto p = static_cast<uint32_t>(std::log2(Processes)); | |||
for (size_t depth = 1; depth <= p; ++depth) { | |||
for (size_t step = depth; step > 0;) { | |||
--step; | |||
// Find out exchange configuration | |||
auto part = partner(rank, step); | |||
auto ks = keepSmall(rank, part, depth); | |||
// Exchange with partner, keep nim-or-max | |||
exchange(data, part, ks, tag); | |||
} | |||
// sort - O(N) | |||
timeCall(Timer_elbowSort, elbowSort, data, ascending(rank, depth)); | |||
} | |||
} | |||
#endif //DISTBITONIC_H_ |
@@ -14,34 +14,26 @@ | |||
#include "utils.hpp" | |||
#include "config.h" | |||
#include "distsort.hpp" | |||
#include "bitonicsort.hpp" | |||
// Global session data | |||
Data_t Data; | |||
config_t config; | |||
distBuffer_t Data; | |||
Log logger; | |||
// Mersenne seeded from hw if possible. range: [type_min, type_max] | |||
std::random_device rd; | |||
std::mt19937 gen(rd()); | |||
//! Performance timers for each one of the "costly" functions | |||
Timing Timer_total; | |||
Timing Timer_fullSort; | |||
Timing Timer_exchange; | |||
Timing Timer_minmax; | |||
Timing Timer_elbowSort; | |||
//! Init timing objects for extra rounds | |||
void measurements_init() { | |||
if (config.perf > 1) { | |||
Timer_total.init(config.perf); | |||
Timer_fullSort.init(config.perf); | |||
Timer_exchange.init(config.perf); | |||
Timer_minmax.init(config.perf); | |||
Timer_elbowSort.init(config.perf); | |||
} | |||
} | |||
@@ -49,10 +41,6 @@ void measurements_init() { | |||
void measurements_next() { | |||
if (config.perf > 1) { | |||
Timer_total.next(); | |||
Timer_fullSort.next(); | |||
Timer_exchange.next(); | |||
Timer_minmax.next(); | |||
Timer_elbowSort.next(); | |||
} | |||
} | |||
@@ -136,20 +124,14 @@ bool get_options(int argc, char* argv[]){ | |||
/*! | |||
* A simple validator for the entire distributed process | |||
* | |||
* @tparam ShadowedDataT A Shadowed buffer type with random access iterator. | |||
* @tparam DataT A buffer type with random access iterator. | |||
* | |||
* @param data [ShadowedDataT] The local to MPI process | |||
* @param Processes [mpi_id_t] The total number of MPI processes | |||
* @param rank [mpi_id_t] The current process id | |||
* | |||
* @return [bool] True if all are sorted and in total ascending order | |||
* @param data [DataT] The data | |||
* @return [bool] True if sorted in ascending order | |||
*/ | |||
template<typename ShadowedDataT> | |||
bool validator(ShadowedDataT& data) { | |||
using value_t = typename ShadowedDataT::value_type; | |||
bool ret = true; // Have faith! | |||
return ret; | |||
template<typename DataT> | |||
bool validator(DataT& data) { | |||
return std::is_sorted(data.begin(), data.end()); | |||
} | |||
/*! | |||
@@ -180,15 +162,15 @@ int main(int argc, char* argv[]) try { | |||
for (size_t it = 0 ; it < config.perf ; ++it) { | |||
// Initialize local data | |||
logger << "Initialize local array of " << config.arraySize << " elements" << logger.endl; | |||
std::uniform_int_distribution<distValue_t > dis( | |||
std::numeric_limits<distValue_t>::min(), | |||
std::numeric_limits<distValue_t>::max() | |||
std::uniform_int_distribution<Value_t > dis( | |||
std::numeric_limits<Value_t>::min(), | |||
std::numeric_limits<Value_t>::max() | |||
); | |||
std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); }); | |||
// Run distributed sort | |||
logger << "Starting distributed sorting ... "; | |||
Timer_total.start(); | |||
distBitonic(Data); | |||
bitonicSort(Data); | |||
Timer_total.stop(); | |||
measurements_next(); | |||
logger << " Done." << logger.endl; | |||
@@ -196,11 +178,7 @@ int main(int argc, char* argv[]) try { | |||
// Print-outs and validation | |||
if (config.perf > 1) { | |||
Timing::print_duration(Timer_total.median(), "Total ", 0); | |||
Timing::print_duration(Timer_fullSort.median(), "Full-Sort ", 0); | |||
Timing::print_duration(Timer_exchange.median(), "Exchange ", 0); | |||
Timing::print_duration(Timer_minmax.median(), "Min-Max ", 0); | |||
Timing::print_duration(Timer_elbowSort.median(),"Elbow-Sort", 0); | |||
Timing::print_duration(Timer_total.median(), "Total"); | |||
} | |||
if (config.validation) { | |||
// If requested, we have the chance to fail! | |||
@@ -18,124 +18,6 @@ | |||
#include "config.h" | |||
/*! | |||
* @brief A std::vector wrapper with 2 vectors, an active and a shadow. | |||
* | |||
* This type exposes the standard vector functionality of the active vector. | |||
* The shadow can be used when we need to use the vector as mutable | |||
* data in algorithms that can not support "in-place" editing (like elbow-sort for example) | |||
* | |||
* @tparam Value_t the underlying data type of the vectors | |||
*/ | |||
template <typename Value_t> | |||
struct ShadowedVec_t { | |||
// STL requirements | |||
using value_type = Value_t; | |||
using iterator = typename std::vector<Value_t>::iterator; | |||
using const_iterator = typename std::vector<Value_t>::const_iterator; | |||
using size_type = typename std::vector<Value_t>::size_type; | |||
// Default constructor | |||
ShadowedVec_t() = default; | |||
// Constructor from an std::vector | |||
explicit ShadowedVec_t(const std::vector<Value_t>& vec) | |||
: North(vec), South(), active(north) { | |||
South.resize(North.size()); | |||
} | |||
explicit ShadowedVec_t(std::vector<Value_t>&& vec) | |||
: North(std::move(vec)), South(), active(north) { | |||
South.resize(North.size()); | |||
} | |||
// Copy assignment operator | |||
ShadowedVec_t& operator=(const ShadowedVec_t& other) { | |||
if (this != &other) { // Avoid self-assignment | |||
North = other.North; | |||
South = other.South; | |||
active = other.active; | |||
} | |||
return *this; | |||
} | |||
// Move assignment operator | |||
ShadowedVec_t& operator=(ShadowedVec_t&& other) noexcept { | |||
if (this != &other) { // Avoid self-assignment | |||
North = std::move(other.North); | |||
South = std::move(other.South); | |||
active = other.active; | |||
// There is no need to zero out other since it is valid but in a non-defined state | |||
} | |||
return *this; | |||
} | |||
// Type accessors | |||
std::vector<Value_t>& getActive() { return (active == north) ? North : South; } | |||
std::vector<Value_t>& getShadow() { return (active == north) ? South : North; } | |||
const std::vector<Value_t>& getActive() const { return (active == north) ? North : South; } | |||
const std::vector<Value_t>& getShadow() const { return (active == north) ? South : North; } | |||
// Swap vectors | |||
void switch_active() { active = (active == north) ? south : north; } | |||
// Dispatch vector functionality to active vector | |||
Value_t& operator[](size_type index) { return getActive()[index]; } | |||
const Value_t& operator[](size_type index) const { return getActive()[index]; } | |||
Value_t& at(size_type index) { return getActive().at(index); } | |||
const Value_t& at(size_type index) const { return getActive().at(index); } | |||
void push_back(const Value_t& value) { getActive().push_back(value); } | |||
void push_back(Value_t&& value) { getActive().push_back(std::move(value)); } | |||
void pop_back() { getActive().pop_back(); } | |||
Value_t& front() { return getActive().front(); } | |||
Value_t& back() { return getActive().back(); } | |||
const Value_t& front() const { return getActive().front(); } | |||
const Value_t& back() const { return getActive().back(); } | |||
iterator begin() { return getActive().begin(); } | |||
const_iterator begin() const { return getActive().begin(); } | |||
iterator end() { return getActive().end(); } | |||
const_iterator end() const { return getActive().end(); } | |||
size_type size() const { return getActive().size(); } | |||
void resize(size_t new_size) { | |||
North.resize(new_size); | |||
South.resize(new_size); | |||
} | |||
void reserve(size_t new_capacity) { | |||
North.reserve(new_capacity); | |||
South.reserve(new_capacity); | |||
} | |||
[[nodiscard]] size_t capacity() const { return getActive().capacity(); } | |||
[[nodiscard]] bool empty() const { return getActive().empty(); } | |||
void clear() { getActive().clear(); } | |||
void swap(std::vector<Value_t>& other) { getActive().swap(other); } | |||
// Comparisons | |||
bool operator== (const ShadowedVec_t& other) { return getActive() == other.getActive(); } | |||
bool operator!= (const ShadowedVec_t& other) { return getActive() != other.getActive(); } | |||
bool operator== (const std::vector<value_type>& other) { return getActive() == other; } | |||
bool operator!= (const std::vector<value_type>& other) { return getActive() != other; } | |||
private: | |||
std::vector<Value_t> North{}; //!< Actual buffer to be used either as active or shadow | |||
std::vector<Value_t> South{}; //!< Actual buffer to be used either as active or shadow | |||
enum { | |||
north, south | |||
} active{north}; //!< Flag to select between North and South buffer | |||
}; | |||
/* | |||
* Exported data types | |||
*/ | |||
using distBuffer_t = ShadowedVec_t<distValue_t>; | |||
extern distBuffer_t Data; | |||
/*! | |||
* A Logger for the entire program.
*/ | |||
@@ -25,8 +25,7 @@ protected: | |||
/* | |||
* MPI: SysTest (acceptance) | |||
* Each process executes distBubbletonic for uint8_t [16]
* | |||
*/ | |||
TEST_F(TCUDAbitonic, test1) { | |||