HW3: [WIP] V0 version added

2025-01-29 00:48:45 +02:00 · 2025-01-29 00:48:45 +02:00 · 1fe5ab4da7
commit 1fe5ab4da7
parent 146e975ac1
9 changed files with 229 additions and 466 deletions
--- a/homework_3/Makefile
+++ b/homework_3/Makefile
@ -25,12 +25,12 @@ PROJECT         := PDS_homework_3
 TARGET          := bitonic

 # Source directories list(space seperated). Makefile-relative path, UNDER current directory.
-SRC_DIR_LIST    := src test test/gtest
+SRC_DIR_LIST    := src #test test/gtest

 # Include directories list(space seperated). Makefile-relative path.
-INC_DIR_LIST    := src \
-                   test \
-                   test/gtest/ \
+INC_DIR_LIST    := src
+#                   test \
+#                   test/gtest/ \


 # Exclude files list(space seperated). Filenames only.
@ -45,10 +45,10 @@ OUTPUT_DIR      := out

 # ========== Compiler settings ==========
 # Compiler flags for debug and release
-DEB_CFLAGS      := -DDEBUG -g3 -Wall -Wextra -std=c11 -fopenmp
-REL_CFLAGS      := -Wall -Wextra -O3 -std=c11 -fopenmp
-DEB_CXXFLAGS    := -DDEBUG -g3 -Wall -Wextra -std=c++17 -fopenmp
-REL_CXXFLAGS    := -Wall -Wextra -O3 -std=c++17 -fopenmp
+DEB_CFLAGS      := -DDEBUG -g3 -std=c11 -Xcompiler "-Wall -Wextra"
+REL_CFLAGS      := -O3 -std=c11 -Xcompiler "-Wall -Wextra"
+DEB_CXXFLAGS    := -DDEBUG -g3 -std=c++17 -Xcompiler "-Wall -Wextra"
+REL_CXXFLAGS    := -O3 -std=c++17 -Xcompiler "-Wall -Wextra"

 # Pre-defines
 # PRE_DEFS := MYCAB=1729 SUPER_MODE
@ -56,15 +56,15 @@ PRE_DEFS        :=

 # ============== Linker settings ==============
 # Linker flags (example: -pthread -lm)
-LDFLAGS         := -pthread
+LDFLAGS         :=

 # Map output file
-MAP_FILE        := output.map
-MAP_FLAG        := -Xlinker -Map=$(BUILD_DIR)/$(MAP_FILE)
+MAP_FILE        := # output.map
+MAP_FLAG        := # -Xlinker -Map=$(BUILD_DIR)/$(MAP_FILE)

 # ============== Docker settings ==============
 # We need:
-#  - Bind the entire project directory(the dir that icludes all the code) as volume.
+#  - Bind the entire project directory(the dir that includes all the code) as volume.
 #  - In docker instance, change to working directory(where the makefile is).
 DOCKER_VOL_DIR  := $(shell pwd)
 DOCKER_WRK_DIR  :=
@ -85,6 +85,7 @@ CFLAGS          := $(DEB_CFLAGS)
 CXXFLAGS        := $(DEB_CXXFLAGS)
 CXX             := g++ #mpic++
 CC              := gcc #mpicc
+LINKER          := g++

 #
 # =========== Main body and Patterns ===========
@ -117,37 +118,37 @@ DEP     := $(foreach file,$(SRC:%.cpp=%.d),$(DEP_DIR)/$(file))
 # It is based on Tom Tromey's method.
 # 
 # Invoke cpp to create makefile rules with dependencies for each source file
-$(DEP_DIR)/%.d: %.c
-	@mkdir -p $(@D)
-	@$(DOCKER) $(CC) -E $(CFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.c=.o) -MF $@ $<
+#$(DEP_DIR)/%.d: %.c
+#	@mkdir -p $(@D)
+#	@$(DOCKER) $(CC) -E $(CFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.c=.o) -MF $@ $<

 # c file objects depent on .c AND dependency files, which have an empty recipe 
-$(OBJ_DIR)/%.o: %.c $(DEP_DIR)/%.d
+$(OBJ_DIR)/%.o: %.c
 	@mkdir -p $(@D)
-	@$(DOCKER) $(CC) -c $(CFLAGS) $(INC) $(DEF) -o $@ $<
+	$(DOCKER) $(CC) -c $(CFLAGS) $(INC) $(DEF) -o $@ $<

-$(DEP_DIR)/%.d: %.cpp
-	@mkdir -p $(@D)
-	@$(DOCKER) $(CXX) -E $(CXXFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.cpp=.o) -MF $@ $<
+#$(DEP_DIR)/%.d: %.cpp
+#	@mkdir -p $(@D)
+#	@$(DOCKER) $(CXX) -E $(CXXFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.cpp=.o) -MF $@ $<

-# cpp file objects depent on .cpp AND dependency files, which have an empty recipe 
-$(OBJ_DIR)/%.o: %.cpp $(DEP_DIR)/%.d
+# cpp file objects depend on the .cpp source only (dependency-file tracking is currently disabled)
+$(OBJ_DIR)/%.o: %.cpp
 	@mkdir -p $(@D)
-	@$(DOCKER) $(CXX) -c $(CXXFLAGS) $(INC) $(DEF) -o $@ $<
+	$(DOCKER) $(CXX) -c $(CXXFLAGS) $(INC) $(DEF) -o $@ $<

 # empty recipe for dependency files. This prevents make errors
-$(DEP):
+#$(DEP):

 # now include all dependencies
 # After all they are makefile dependency rules ;)
-include $(wildcard $(DEP))
+#include $(wildcard $(DEP))

 # main target rule
 $(BUILD_DIR)/$(TARGET): $(OBJ)
 	@mkdir -p $(@D)
 	@echo Linking to target: $(TARGET)
-	@echo $(DOCKER) $(CXX) '$$(OBJ)' $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET)
-	@$(DOCKER) $(CXX) $(OBJ) $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET)
+	@echo $(DOCKER) $(LINKER) '$$(OBJ)' $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET)
+	@$(DOCKER) $(LINKER) $(OBJ) $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET)
 	@echo
 	@echo Print size information
 	@$(CSIZE) $(@D)/$(TARGET)
@ -179,10 +180,12 @@ release: $(BUILD_DIR)/$(TARGET)
 #


-bitonic_v0: CC := nvcc
-bitonic_v0: CXX := nvcc
+bitonic_v0: CC := nvcc -x cu
+bitonic_v0: CXX := nvcc -x cu
+bitonic_v0: LINKER := nvcc
 bitonic_v0: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V0
 bitonic_v0: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V0
+bitonic_v0: OUTPUT_DIR := $(OUTPUT_DIR)/v0
 bitonic_v0: TARGET := bitonic_v0
 bitonic_v0: $(BUILD_DIR)/$(TARGET)
 	@mkdir -p $(OUTPUT_DIR)
@ -191,11 +194,7 @@ bitonic_v0: $(BUILD_DIR)/$(TARGET)

 hpc-build:
 	make clean
-	make distbubbletonic
-	make clean
-	make distbitonic
-	make clean
-	make tests
+	make bitonic_v0


 all: debug bitonic_v0
--- a/homework_3/exersize.md
+++ b/homework_3/exersize.md
@ -0,0 +1,33 @@
+Parallel & Distributed Computer Systems HW3
+
+January, 2025
+
+Write a program that sorts $N$ integers in ascending order, using CUDA.
+
+The program must perform the following tasks:
+
+- The user specifies a positive integer $q$.
+
+- Start a process that holds an array of $N = 2^q$ random integers.
+
+- Sort all $N$ elements in ascending order.
+
+- Check the correctness of the final result.
+
+Your implementation should be based on the following steps:
+
+V0. A kernel where each thread only compares and exchanges. This "eliminates" the 1:n innermost loop. Easy to write, but too many function calls and global synchronizations.
+
+V1. Include the k inner loop in the kernel function. How do we handle the synchronization? Fewer calls, fewer global synchronizations. Faster than V0!
+
+V2. Modify the kernel of V1 to work with local memory instead of global.
+
+You must deliver:
+- A report (about $3-4$ pages) that describes your parallel algorithm and implementation.
+
+- Your comments on the speed of your parallel program compared to the serial sort, after trying your program on aristotelis for $q = [20:27]$.
+
+- The source code of your program uploaded online.
+
+Ethics: If you use code found on the web or by an LLM, you should mention your source and the changes you made. You may work in pairs; both partners must submit a single report with both names.
+Deadline: 2 February, $2025$.
--- a/homework_3/src/bitonicsort.hpp
+++ b/homework_3/src/bitonicsort.hpp
@ -0,0 +1,145 @@
+/*!
+ * \file
+ * \brief   Bitonic sort CUDA implementation header
+ *
+ * \author
+ *    Christos Choutouridis AEM:8997
+ *    <cchoutou@ece.auth.gr>
+ */
+
+#ifndef BITONICSORTCUDA_H_
+#define BITONICSORTCUDA_H_
+
+#include <cuda_runtime.h>
+#include <cmath>
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "utils.hpp"
+
+/*
+ * Exported timers
+ */
+extern Timing Timer_total;
+
+using threadId_t = size_t;
+
+
+/*
+ * ============================== Sort utilities ==============================
+ */
+
+/*!
+ * Returns the ascending or descending configuration (up/down phase) of the thread id
+ * depending on the current stage.
+ *
+ * The result is true when bit number \p stage of \p tid is clear.
+ *
+ * @param tid   [threadId_t] The current thread
+ * @param stage [size_t]     The current stage of the sorting network (same for each step)
+ * @return      [bool]       True if we need ascending configuration, false otherwise
+ */
+__device__ inline bool ascending(threadId_t tid, size_t stage) noexcept {
+    // Shift a threadId_t-wide one: a plain `1 << stage` is an int shift and
+    // breaks (overflow / sign extension) as soon as stage reaches 31.
+    return !(tid & (threadId_t{1} << stage));
+}
+
+/*!
+ * Returns the thread's partner for data exchange during the sorting network iterations
+ * of Bitonic.
+ *
+ * The partner id is \p tid with bit number \p step flipped.
+ *
+ * @param tid   [threadId_t] The current thread
+ * @param step  [size_t]     The step of the sorting network
+ * @return      [threadId_t] The thread id of the partner for data exchange
+ */
+__device__ inline threadId_t partner(threadId_t tid, size_t step) noexcept {
+    // Shift a threadId_t-wide one: a plain `1 << step` is an int shift and
+    // breaks (overflow / sign extension) as soon as step reaches 31.
+    return (tid ^ (threadId_t{1} << step));
+}
+
+
+/*!
+ * Predicate to check if a thread keeps the small numbers during the bitonic sort network exchange.
+ *
+ * A thread keeps the small value when it is in an ascending region and has the lower id
+ * of the pair, or when it is in a descending region and has the higher id of the pair.
+ *
+ * @param tid       [threadId_t] The thread for which we check
+ * @param partner   [threadId_t] The partner of the data exchange
+ * @param stage     [size_t]     The current stage of the sorting network (same for each step)
+ * @return          [bool]       True if the thread should keep the small values, false otherwise
+ */
+__device__ inline bool keepSmall(threadId_t tid, threadId_t partner, size_t stage) noexcept {
+    return ascending(tid, stage) == (tid < partner);
+}
+
+
+
+/*
+ * ============================== Sort algorithms ==============================
+ */
+
+
+/*!
+ * Compare-and-exchange of two items of the same buffer.
+ *
+ * After the call, data[tid] holds the smaller of the two items when \p keepSmall is true,
+ * or the larger one when it is false. Equal items are left untouched.
+ *
+ * @note
+ *  No synchronisation is performed here. The caller must make sure that no other thread
+ *  touches the same pair of items at the same time.
+ *
+ * @tparam ValueT   The underlying type of the buffer (must support operator< and operator>)
+ *
+ * @param data      [ValueT*]    Pointer to the buffer
+ * @param tid       [threadId_t] Index of the "local" item
+ * @param partner   [threadId_t] Index of the item to compare with
+ * @param keepSmall [bool]       Flag to indicate if data[tid] should end up with the small item
+ */
+template <typename ValueT>
+__device__ void cudaExchange(ValueT* data, threadId_t tid, threadId_t partner, bool keepSmall) {
+    // Indices are threadId_t (not int) so that callers' ids are never narrowed
+    const bool outOfOrder = keepSmall ? (data[tid] > data[partner])
+                                      : (data[tid] < data[partner]);
+    if (outOfOrder) {
+        ValueT temp = data[tid];
+        data[tid] = data[partner];
+        data[partner] = temp;
+    }
+}
+
+
+/*!
+ * Kernel for a single (stage, step) iteration of the bitonic sorting network.
+ *
+ * Every thread computes its global id and its partner for the given step. Each pair of
+ * items is compared and exchanged exactly once, by the thread with the lower id.
+ *
+ * @tparam ValueT   The underlying type of the buffer
+ *
+ * @param data  [ValueT*] Pointer to the buffer to sort (must be accessible from the device)
+ * @param n     [size_t]  The number of items in the buffer
+ * @param step  [size_t]  The current step of the sorting network
+ * @param stage [size_t]  The current stage of the sorting network
+ */
+template <typename ValueT>
+__global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) {
+    // Compute the global thread ID in threadId_t width, so the product can not wrap in 32 bits
+    threadId_t tid = static_cast<threadId_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    if (tid >= n)
+        return;
+
+    threadId_t pid = partner(tid, step);
+    // Both tid and pid map to the same pair of items. If both threads were to
+    // compare-and-swap it, they would race on data[tid] and data[pid].
+    // Let only the lower id of each pair do the work.
+    if (tid < pid && pid < n) {
+        bool keep = keepSmall(tid, pid, stage);
+        cudaExchange(data, tid, pid, keep);
+    }
+}
+
+
+/*!
+ * A CUDA version of the Bitonic sort algorithm (V0).
+ *
+ * Copies the data to the device, runs the sorting network by launching one
+ * bitonicStep kernel per (stage, step) pair, waiting for the device after each launch,
+ * and copies the result back to the host buffer.
+ *
+ * @note
+ *  The sorting network is built for floor(log2(size)) stages, so the size of the data
+ *  is expected to be a power of two (N = 2^q).
+ *
+ * @tparam DataT    A contiguous buffer type that provides value_type, size() and data()
+ *
+ * @param data      [DataT] The data to sort. Sorted in place.
+ *
+ * @throws std::runtime_error   If any of the CUDA runtime calls fails
+ */
+template <typename DataT>
+void bitonicSort(DataT& data) {
+    using value_t = typename DataT::value_type;
+
+    const size_t size = data.size();
+    if (size < 2)
+        return;     // Already sorted. Also keeps log2() and cudaMalloc() away from zero
+
+    const size_t bytes = size * sizeof(value_t);
+    value_t* dev_data  = nullptr;
+
+    cudaError_t err = cudaMalloc(&dev_data, bytes);
+    if (err != cudaSuccess)
+        throw std::runtime_error(std::string("(bitonicSort) cudaMalloc: ") + cudaGetErrorString(err));
+
+    // From now on, any failure has to release the device buffer before reporting
+    auto check = [&dev_data](cudaError_t status, const char* what) {
+        if (status != cudaSuccess) {
+            cudaFree(dev_data);
+            throw std::runtime_error(std::string("(bitonicSort) ") + what + ": " + cudaGetErrorString(status));
+        }
+    };
+
+    check(cudaMemcpy(dev_data, data.data(), bytes, cudaMemcpyHostToDevice), "cudaMemcpy to device");
+
+    const int Nthreads = 1024;
+    const int Nblocks  = static_cast<int>((size + Nthreads - 1) / Nthreads);   // ceil(size / Nthreads)
+
+    size_t max_depth = static_cast<size_t>(log2(size));
+    for (size_t stage = 1; stage <= max_depth; ++stage) {
+        // step runs from stage-1 down to 0. Decrement first, to avoid unsigned wrap in the loop condition
+        for (size_t step = stage; step > 0; ) {
+            --step;
+            bitonicStep<<<Nblocks, Nthreads>>>(dev_data, size, step, stage);
+            check(cudaGetLastError(), "bitonicStep launch");
+            check(cudaDeviceSynchronize(), "bitonicStep execution");
+        }
+    }
+
+    check(cudaMemcpy(data.data(), dev_data, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy to host");
+    cudaFree(dev_data);
+}
+
+
+
+
+
+#endif //BITONICSORTCUDA_H_
--- a/homework_3/src/config.h
+++ b/homework_3/src/config.h
@ -35,7 +35,7 @@ static constexpr size_t DEFAULT_DATA_SIZE   = 1 << 16;


 /*!
- * Value type selection
+ * Value and Buffer type selection
 *
 * We support the following compiler types or the <cstdint> that translate to them:
 *  char       -  unsigned char
@ -46,7 +46,8 @@ static constexpr size_t DEFAULT_DATA_SIZE   = 1 << 16;
 *  float
 *  double
 */
-using   distValue_t = uint32_t;
+using Value_t = uint32_t;
+using Data_t  = std::vector<Value_t>;

 /*!
 * Session option for each invocation of the executable.
--- a/homework_3/src/distsort.cpp
+++ b/homework_3/src/distsort.cpp
@ -1,51 +0,0 @@
-/*!
- * \file
- * \brief   Distributed sort implementation
- *
- * \author
- *    Christos Choutouridis AEM:8997
- *    <cchoutou@ece.auth.gr>
- */
-#include "utils.hpp"
-#include "distsort.hpp"
-
-
-/*!
- * Returns the ascending or descending configuration of the node's sequence based on
- * the current node (MPI process) and the depth of the sorting network
- *
- * @param node      [mpi_id_t] The current node (MPI process)
- * @param depth     [size_t]   The total depth of the sorting network (same for each step for a given network)
- * @return          [bool]     True if we need ascending configuration, false otherwise
- */
-bool ascending(mpi_id_t node, size_t depth) noexcept {
-    return !(node & (1 << depth));
-}
-
-/*!
- * Returns the node's partner for data exchange during the sorting network iterations
- * of Bitonic
- *
- * @param node      [mpi_id_t] The current node
- * @param step      [size_t]   The step of the sorting network
- * @return          [mpi_id_t] The node id of the partner for data exchange
- */
-mpi_id_t partner(mpi_id_t node, size_t step) noexcept {
-    return (node ^ (1 << step));
-}
-
-
-/*!
- * Predicate to check if a node keeps the small numbers during the bitonic sort network exchange.
- *
- * @param node      [mpi_id_t] The node for which we check
- * @param partner   [mpi_id_t] The partner of the data exchange
- * @param depth     [size_t]   The total depth of the sorting network (same for each step for a given network)
- * @return          [bool]     True if the node should keep the small values, false otherwise
- */
-
-bool keepSmall(mpi_id_t node, mpi_id_t partner, size_t depth) {
-    if (node == partner)
-        throw std::runtime_error("(keepSmall) Node and Partner can not be the same\n");
-    return ascending(node, depth) == (node < partner);
-}
--- a/homework_3/src/distsort.hpp
+++ b/homework_3/src/distsort.hpp
@ -1,223 +0,0 @@
-/*!
- * \file
- * \brief   Distributed sort implementation header
- *
- * \author
- *    Christos Choutouridis AEM:8997
- *    <cchoutou@ece.auth.gr>
- */
-
-#ifndef DISTBITONIC_H_
-#define DISTBITONIC_H_
-
-#include <vector>
-#include <algorithm>
-#include <parallel/algorithm>
-#include <cmath>
-#include <cstdint>
-#if !defined DEBUG
-#define NDEBUG
-#endif
-#include <cassert>
-
-#include "utils.hpp"
-
-/*
- * Exported timers
- */
-extern Timing Timer_total;
-extern Timing Timer_fullSort;
-extern Timing Timer_exchange;
-extern Timing Timer_minmax;
-extern Timing Timer_elbowSort;
-
-
-
-/*
- * ============================== Sort utilities ==============================
- */
-
-
-
-/*!
- * Returns the ascending or descending configuration of the node's sequence based on
- * the current node (MPI process) and the depth of the sorting network
- *
- * @param node      [mpi_id_t] The current node (MPI process)
- * @param depth     [size_t]   The total depth of the sorting network (same for each step for a given network)
- * @return          [bool]     True if we need ascending configuration, false otherwise
- */
-bool ascending(mpi_id_t node, size_t depth);
-
-/*!
- * Returns the node's partner for data exchange during the sorting network iterations
- * of Bitonic
- *
- * @param node      [mpi_id_t] The current node
- * @param step      [size_t]   The step of the sorting network
- * @return          [mpi_id_t] The node id of the partner for data exchange
- */
-mpi_id_t partner(mpi_id_t node, size_t step);
-
-
-/*!
- * Predicate to check if a node keeps the small numbers during the bitonic sort network exchange.
- *
- * @param node      [mpi_id_t] The node for which we check
- * @param partner   [mpi_id_t] The partner of the data exchange
- * @param depth     [size_t]   The total depth of the sorting network (same for each step for a given network)
- * @return          [bool]     True if the node should keep the small values, false otherwise
- */
-bool keepSmall(mpi_id_t node, mpi_id_t partner, size_t depth);
-
-
-
-/*
- * ============================== Data utilities ==============================
- */
-
-/*!
- * Sort a range using the build-in O(Nlog(N)) algorithm
- *
- * @tparam RangeT   A range type with random access iterator
- *
- * @param data      [RangeT] The data to be sorted
- * @param ascending [bool]   Flag to indicate the sorting order
- */
-template<typename RangeT>
-void fullSort(RangeT& data, bool ascending) noexcept {
-    // Use introsort from stdlib++ here, unless ... __gnu_parallel
-    if (ascending) {
-        __gnu_parallel::sort(data.begin(), data.end(), std::less<>());
-    }
-    else {
-        __gnu_parallel::sort(data.begin(), data.end(), std::greater<>());
-    }
-}
-
-/*!
- * Core functionality of sort for shadowed buffer types using
- * the "elbow sort" algorithm.
- *
- * @note:
- *  This algorithm can not work "in place".
- *  We use the active buffer as source and the shadow as target.
- *  At the end we switch which buffer is active and which is the shadow.
- * @note
- *  This is the core functionality. Use the elbowSort() function instead
- *
- * @tparam ShadowedDataT    A Shadowed buffer type with random access iterator.
- * @tparam CompT            A Comparison type for binary operation comparisons
- *
- * @param data          [ShadowedDataT] The data to sort
- * @param ascending     [bool]          Flag to indicate the sorting order
- * @param comp          [CompT]         The binary operator object
- */
-template<typename ShadowedDataT, typename CompT>
-void elbowSortCore(ShadowedDataT& data, bool ascending, CompT comp) noexcept {
-    auto& active = data.getActive(); // Get the source vector (the data to sort)
-    auto& shadow = data.getShadow(); // Get the target vector (the sorted data)
-
-    size_t N = data.size();         // The total size is the same or both vectors
-    size_t left = std::distance(
-            active.begin(),
-            (ascending) ?
-                std::min_element(active.begin(), active.end()) :
-                std::max_element(active.begin(), active.end())
-    );                              // start 'left' from elbow of the bitonic
-    size_t right = (left == N-1) ? 0 : left + 1;
-
-    // Walk in opposite directions from elbow and insert-sort to target vector
-    for (size_t i = 0 ; i<N ; ++i) {
-        if (comp(active[left], active[right])) {
-            shadow[i] = active[left];
-            left = (left == 0) ? N-1 : left -1; // cycle decrease
-        }
-        else {
-            shadow[i] = active[right];
-            right = (right + 1) % N;            // cycle increase
-        }
-    }
-    data.switch_active();           // Switch active-shadow buffers
-}
-
-/*!
- * Sort a shadowed buffer using the "elbow sort" algorithm.
- *
- * @tparam ShadowedDataT    A Shadowed buffer type with random access iterator.
- *
- * @param data          [ShadowedDataT] The data to sort
- * @param ascending     [bool]          Flag to indicate the sorting order
- */
-template<typename ShadowedDataT>
-void elbowSort(ShadowedDataT& data, bool ascending) noexcept {
-    if (ascending)
-        elbowSortCore(data, ascending, std::less<>());
-    else
-        elbowSortCore(data, ascending, std::greater<>());
-}
-
-
-/*!
- * Takes two sequences and selects either the larger or the smaller items
- * in one-to-one comparison between them. If the initial sequences are bitonic, then
- * the result is a bitonic sequence too!
- *
- * @tparam ValueT   The underlying type of the sequences
- *
- * @param local     [ValueT*]       Pointer to the local sequence
- * @param remote    [const ValueT*] Pointer to the remote sequence (copied locally by MPI)
- * @param count     [size_t]        The number of items to process
- * @param keepSmall [bool]          Flag to indicate if we keep the small items in local sequence
- */
-template<typename ValueT>
-void keepMinOrMax(ValueT* local, const ValueT* remote, size_t count, bool keepSmall) noexcept {
-    std::transform(
-            local, local + count,
-            remote,
-            local,
-            [&keepSmall](const ValueT& a, const ValueT& b){
-                return (keepSmall) ? std::min(a, b) : std::max(a, b);
-            });
-}
-
-/*
- * ============================== Sort algorithms ==============================
- */
-
-
-/*!
- * A distributed version of the Bitonic sort algorithm.
- *
- * @note
- *  Each MPI process should run an instance of this function.
- *
- * @tparam ShadowedDataT    A Shadowed buffer type with random access iterator.
- *
- * @param data          [ShadowedDataT] The local to MPI process data to sort
- * @param Processes     [mpi_id_t]      The total number of MPI processes
- * @param rank          [mpi_id_t]      The current process id
- */
-template<typename ShadowedDataT>
-void distBitonic(ShadowedDataT& data) {
-    // Initially sort to create a half part of a bitonic sequence
-    timeCall(Timer_fullSort, fullSort, data, ascending(rank, 0));
-
-    // Run through sort network using elbow-sort ( O(LogN * LogN) iterations )
-    auto p = static_cast<uint32_t>(std::log2(Processes));
-    for (size_t depth = 1; depth <= p; ++depth) {
-        for (size_t step = depth; step > 0;) {
-            --step;
-            // Find out exchange configuration
-            auto part = partner(rank, step);
-            auto ks = keepSmall(rank, part, depth);
-            // Exchange with partner, keep nim-or-max
-            exchange(data, part, ks, tag);
-
-        }
-        // sort - O(N)
-        timeCall(Timer_elbowSort, elbowSort, data, ascending(rank, depth));
-    }
-}
-
-#endif //DISTBITONIC_H_
--- a/homework_3/src/main.cpp
+++ b/homework_3/src/main.cpp
@ -14,34 +14,26 @@

 #include "utils.hpp"
 #include "config.h"
-#include "distsort.hpp"
+#include "bitonicsort.hpp"


 // Global session data
+Data_t          Data;
 config_t        config;
-distBuffer_t    Data;
 Log             logger;

-
 // Mersenne seeded from hw if possible. range: [type_min, type_max]
 std::random_device  rd;
 std::mt19937        gen(rd());

 //! Performance timers for each one of the "costly" functions
 Timing Timer_total;
-Timing Timer_fullSort;
-Timing Timer_exchange;
-Timing Timer_minmax;
-Timing Timer_elbowSort;
+

 //! Init timing objects for extra rounds
 void measurements_init() {
    if (config.perf > 1) {
        Timer_total.init(config.perf);
-        Timer_fullSort.init(config.perf);
-        Timer_exchange.init(config.perf);
-        Timer_minmax.init(config.perf);
-        Timer_elbowSort.init(config.perf);
    }
 }

@ -49,10 +41,6 @@ void measurements_init() {
 void measurements_next() {
    if (config.perf > 1) {
        Timer_total.next();
-        Timer_fullSort.next();
-        Timer_exchange.next();
-        Timer_minmax.next();
-        Timer_elbowSort.next();
    }
 }

@ -136,20 +124,14 @@ bool get_options(int argc, char* argv[]){
 /*!
 * A simple validator for the entire distributed process
 *
- * @tparam ShadowedDataT    A Shadowed buffer type with random access iterator.
+ * @tparam DataT    A buffer type with random access iterator.
 *
- * @param data          [ShadowedDataT] The local to MPI process
- * @param Processes     [mpi_id_t]      The total number of MPI processes
- * @param rank          [mpi_id_t]      The current process id
- *
- * @return              [bool]          True if all are sorted and in total ascending order
+ * @param data      [DataT] The data
+ * @return          [bool]  True if sorted in ascending order
 */
-template<typename ShadowedDataT>
-bool validator(ShadowedDataT& data) {
-    using value_t = typename ShadowedDataT::value_type;
-    bool ret = true;    // Have faith!
-
-    return ret;
+template<typename DataT>
+bool validator(DataT& data) {
+    return std::is_sorted(data.begin(), data.end());
 }

 /*!
@ -180,15 +162,15 @@ int main(int argc, char* argv[]) try {
    for (size_t it = 0 ; it < config.perf ; ++it) {
        // Initialize local data
        logger << "Initialize local array of " << config.arraySize << " elements" << logger.endl;
-        std::uniform_int_distribution<distValue_t > dis(
-                std::numeric_limits<distValue_t>::min(),
-                std::numeric_limits<distValue_t>::max()
+        std::uniform_int_distribution<Value_t > dis(
+                std::numeric_limits<Value_t>::min(),
+                std::numeric_limits<Value_t>::max()
        );
        std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); });
        // Run distributed sort
        logger << "Starting distributed sorting ... ";
        Timer_total.start();
-        distBitonic(Data);
+        bitonicSort(Data);
        Timer_total.stop();
        measurements_next();
        logger << " Done." << logger.endl;
@ -196,11 +178,7 @@ int main(int argc, char* argv[]) try {

    // Print-outs and validation
    if (config.perf > 1) {
-        Timing::print_duration(Timer_total.median(),    "Total     ", 0);
-        Timing::print_duration(Timer_fullSort.median(), "Full-Sort ", 0);
-        Timing::print_duration(Timer_exchange.median(), "Exchange  ", 0);
-        Timing::print_duration(Timer_minmax.median(),   "Min-Max   ", 0);
-        Timing::print_duration(Timer_elbowSort.median(),"Elbow-Sort", 0);
+        Timing::print_duration(Timer_total.median(), "Total");
    }
    if (config.validation) {
        // If requested, we have the chance to fail!
--- a/homework_3/src/utils.hpp
+++ b/homework_3/src/utils.hpp
@ -18,124 +18,6 @@
 #include "config.h"


-/*!
- * @brief A std::vector wrapper with 2 vectors, an active and a shadow.
- *
- * This type exposes the standard vector functionality of the active vector.
- * The shadow can be used when we need to use the vector as mutable
- * data in algorithms that can not support "in-place" editing (like elbow-sort for example)
- *
- * @tparam Value_t  the underlying data type of the vectors
- */
-template <typename Value_t>
-struct ShadowedVec_t {
-    // STL requirements
-    using value_type     = Value_t;
-    using iterator       = typename std::vector<Value_t>::iterator;
-    using const_iterator = typename std::vector<Value_t>::const_iterator;
-    using size_type      = typename std::vector<Value_t>::size_type;
-
-    // Default constructor
-    ShadowedVec_t() = default;
-
-    // Constructor from an std::vector
-    explicit ShadowedVec_t(const std::vector<Value_t>& vec)
-            : North(vec), South(), active(north) {
-        South.resize(North.size());
-    }
-
-    explicit ShadowedVec_t(std::vector<Value_t>&& vec)
-            : North(std::move(vec)), South(), active(north) {
-        South.resize(North.size());
-    }
-
-    // Copy assignment operator
-    ShadowedVec_t& operator=(const ShadowedVec_t& other) {
-        if (this != &other) { // Avoid self-assignment
-            North = other.North;
-            South = other.South;
-            active = other.active;
-        }
-        return *this;
-    }
-
-    // Move assignment operator
-    ShadowedVec_t& operator=(ShadowedVec_t&& other) noexcept {
-        if (this != &other) { // Avoid self-assignment
-            North = std::move(other.North);
-            South = std::move(other.South);
-            active = other.active;
-
-            // There is no need to zero out other since it is valid but in a non-defined state
-        }
-        return *this;
-    }
-
-    // Type accessors
-    std::vector<Value_t>& getActive() { return (active == north) ? North : South; }
-    std::vector<Value_t>& getShadow() { return (active == north) ? South : North; }
-    const std::vector<Value_t>& getActive() const { return (active == north) ? North : South; }
-    const std::vector<Value_t>& getShadow() const { return (active == north) ? South : North; }
-
-    // Swap vectors
-    void switch_active() { active = (active == north) ? south : north; }
-
-    // Dispatch vector functionality to active vector
-    Value_t& operator[](size_type index) { return getActive()[index]; }
-    const Value_t& operator[](size_type index) const { return getActive()[index]; }
-
-    Value_t& at(size_type index) { return getActive().at(index); }
-    const Value_t& at(size_type index) const { return getActive().at(index); }
-
-    void push_back(const Value_t& value) { getActive().push_back(value); }
-    void push_back(Value_t&& value)      { getActive().push_back(std::move(value)); }
-    void pop_back()                      { getActive().pop_back(); }
-    Value_t& front() { return getActive().front(); }
-    Value_t& back()  { return getActive().back(); }
-    const Value_t& front() const { return getActive().front(); }
-    const Value_t& back()  const { return getActive().back(); }
-
-    iterator begin() { return getActive().begin(); }
-    const_iterator begin() const { return getActive().begin(); }
-    iterator end() { return getActive().end(); }
-    const_iterator end() const { return getActive().end(); }
-
-    size_type size() const { return getActive().size(); }
-    void resize(size_t new_size) {
-        North.resize(new_size);
-        South.resize(new_size);
-    }
-
-    void reserve(size_t new_capacity) {
-        North.reserve(new_capacity);
-        South.reserve(new_capacity);
-    }
-    [[nodiscard]] size_t capacity() const { return getActive().capacity(); }
-    [[nodiscard]] bool empty() const { return getActive().empty(); }
-
-    void clear() { getActive().clear(); }
-    void swap(std::vector<Value_t>& other) { getActive().swap(other); }
-
-    // Comparisons
-    bool operator== (const ShadowedVec_t& other) { return getActive() == other.getActive(); }
-    bool operator!= (const ShadowedVec_t& other) { return getActive() != other.getActive(); }
-    bool operator== (const std::vector<value_type>& other) { return getActive() == other; }
-    bool operator!= (const std::vector<value_type>& other) { return getActive() != other; }
-
-private:
-    std::vector<Value_t> North{};       //!< Actual buffer to be used either as active or shadow
-    std::vector<Value_t> South{};       //!< Actual buffer to be used either as active or shadow
-    enum {
-        north, south
-    } active{north};                    //!< Flag to select between North and South buffer
-};
-
-/*
- * Exported data types
- */
-using distBuffer_t = ShadowedVec_t<distValue_t>;
-extern distBuffer_t Data;
-
 /*!
 * A Logger for entire program.
 */
--- a/homework_3/test/tests.cpp
+++ b/homework_3/test/tests.cpp
@ -25,8 +25,7 @@ protected:


 /*
- * MPI: SysTest (acceptance)
- * Each process executes distBubbletonic for uin8_t [16]
+ *
 */
 TEST_F(TCUDAbitonic, test1) {