HW3: [No Compile] Init HW3 with HW2 files

1 päivä sitten · 2ff6ae171a
--- a/homework_3/.gitignore
+++ b/homework_3/.gitignore
@@ -0,0 +1,23 @@
 # project
 bin/
 out/
 mat/
 mtx/
 .unused/
 various/

 # hpc

 # IDEs
 .idea/
 .clangd

 # eclipse
 .project
 .cproject
 .settings/

 .vs/
 .vscode/


--- a/homework_3/Makefile
+++ b/homework_3/Makefile
@@ -0,0 +1,205 @@
 #
 # PDS HW3 Makefile
 #
 # Copyright (C) 2025 Christos Choutouridis <christos@choutouridis.net>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as
 # published by the Free Software Foundation, either version 3
 # of the License, or (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #

 # ============== Project settings ==============
 # Project's name
 PROJECT         := PDS_homework_3

 # Excecutable's name
 TARGET          := bitonic

 # Source directories list(space seperated). Makefile-relative path, UNDER current directory.
 SRC_DIR_LIST    := src test test/gtest

 # Include directories list(space seperated). Makefile-relative path.
 INC_DIR_LIST    := src \
                   test \
                   test/gtest/ \


 # Exclude files list(space seperated). Filenames only.
 # EXC_FILE_LIST := bad.cpp old.cpp

 # Build directories
 BUILD_DIR       := bin
 OBJ_DIR         := $(BUILD_DIR)/obj
 DEP_DIR         := $(BUILD_DIR)/.dep

 OUTPUT_DIR      := out

 # ========== Compiler settings ==========
 # Compiler flags for debug and release
 DEB_CFLAGS      := -DDEBUG -g3 -Wall -Wextra -std=c11 -fopenmp
 REL_CFLAGS      := -Wall -Wextra -O3 -std=c11 -fopenmp
 DEB_CXXFLAGS    := -DDEBUG -g3 -Wall -Wextra -std=c++17 -fopenmp
 REL_CXXFLAGS    := -Wall -Wextra -O3 -std=c++17 -fopenmp

 # Pre-defines
 # PRE_DEFS := MYCAB=1729 SUPER_MODE
 PRE_DEFS        :=

 # ============== Linker settings ==============
 # Linker flags (example: -pthread -lm)
 LDFLAGS         := -pthread

 # Map output file
 MAP_FILE        := output.map
 MAP_FLAG        := -Xlinker -Map=$(BUILD_DIR)/$(MAP_FILE)

 # ============== Docker settings ==============
 # We need:
 #  - Bind the entire project directory(the dir that icludes all the code) as volume.
 #  - In docker instance, change to working directory(where the makefile is).
 DOCKER_VOL_DIR  := $(shell pwd)
 DOCKER_WRK_DIR  :=
 DOCKER_RUN      := docker run --rm
 DOCKER_FLAGS    := -v $(DOCKER_VOL_DIR):/usr/src/$(PROJECT) -w /usr/src/$(PROJECT)/$(DOCKER_WRK_DIR)

 # docker invoke mechanism (edit with care)
 #   note:
 #   Here, `DOCKER` variable is empty. Rules can assign `DOCKER := DOCKER_CMD` when docker
 #   functionality is needed.
 DOCKER_CMD      = $(DOCKER_RUN) $(DOCKER_FLAGS) $(IMAGE)
 DOCKER          :=

 # ============== Tool selection ==============
 # compiler and compiler flags.
 CSIZE           := size
 CFLAGS          := $(DEB_CFLAGS)
 CXXFLAGS        := $(DEB_CXXFLAGS)
 CXX             := g++ #mpic++
 CC              := gcc #mpicc

 #
 # =========== Main body and Patterns ===========
 #

 #ifeq ($(OS), Windows_NT)
 #	TARGET := $(TARGET).exe
 #endif
 INC     := $(foreach dir,$(INC_DIR_LIST),-I$(dir))
 DEF     := $(foreach def,$(PRE_DEFS),-D$(def))
 EXC     := $(foreach fil,$(EXC_FILE_LIST),                              \
               $(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/$(fil))) \
           )
 # source files. object and dependencies list
 # recursive search into current and source directories
 SRC     := $(wildcard *.cpp)
 SRC     += $(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/*.cpp))
 SRC     += $(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/**/*.cpp))
 SRC     := $(filter-out $(EXC),$(SRC))
 #SRC     := $(abspath $(SRC))

 OBJ     := $(foreach file,$(SRC:%.cpp=%.o),$(OBJ_DIR)/$(file))
 DEP     := $(foreach file,$(SRC:%.cpp=%.d),$(DEP_DIR)/$(file))


 # Make Dependencies pattern.
 # This little trick enables recompilation only when dependencies change
 # and it does so for changes both in source AND header files ;)
 # 
 # It is based on Tom Tromey's method.
 # 
 # Invoke cpp to create makefile rules with dependencies for each source file
 $(DEP_DIR)/%.d: %.c
 	@mkdir -p $(@D)
 	@$(DOCKER) $(CC) -E $(CFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.c=.o) -MF $@ $<

 # c file objects depent on .c AND dependency files, which have an empty recipe 
 $(OBJ_DIR)/%.o: %.c $(DEP_DIR)/%.d
 	@mkdir -p $(@D)
 	@$(DOCKER) $(CC) -c $(CFLAGS) $(INC) $(DEF) -o $@ $<

 $(DEP_DIR)/%.d: %.cpp
 	@mkdir -p $(@D)
 	@$(DOCKER) $(CXX) -E $(CXXFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.cpp=.o) -MF $@ $<

 # cpp file objects depent on .cpp AND dependency files, which have an empty recipe 
 $(OBJ_DIR)/%.o: %.cpp $(DEP_DIR)/%.d
 	@mkdir -p $(@D)
 	@$(DOCKER) $(CXX) -c $(CXXFLAGS) $(INC) $(DEF) -o $@ $<

 # empty recipe for dependency files. This prevents make errors
 $(DEP):

 # now include all dependencies
 # After all they are makefile dependency rules ;)
 include $(wildcard $(DEP))

 # main target rule
 $(BUILD_DIR)/$(TARGET): $(OBJ)
 	@mkdir -p $(@D)
 	@echo Linking to target: $(TARGET)
 	@echo $(DOCKER) $(CXX) '$$(OBJ)' $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET)
 	@$(DOCKER) $(CXX) $(OBJ) $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET)
 	@echo
 	@echo Print size information
 	@$(CSIZE) $(@D)/$(TARGET)
 	@echo Done


 #
 # ================ Default local build rules =================
 # example:
 # make debug

 .DEFAULT_GOAL := all

 .PHONY: clean
 clean:
 	@echo Cleaning build directories
 	@rm -rf $(OBJ_DIR)
 	@rm -rf $(DEP_DIR)
 	@rm -rf $(BUILD_DIR)

 debug: CFLAGS := $(DEB_CFLAGS)
 debug: $(BUILD_DIR)/$(TARGET)

 release: CFLAGS := $(REL_CFLAGS)
 release: $(BUILD_DIR)/$(TARGET)

 #
 # ================ Build rules =================
 #


 bitonic_v0: CC := nvcc
 bitonic_v0: CXX := nvcc
 bitonic_v0: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V0
 bitonic_v0: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V0
 bitonic_v0: TARGET := bitonic_v0
 bitonic_v0: $(BUILD_DIR)/$(TARGET)
 	@mkdir -p $(OUTPUT_DIR)
 	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)


 hpc-build:
 	make clean
 	make distbubbletonic
 	make clean
 	make distbitonic
 	make clean
 	make tests


 all: debug bitonic_v0
 # Note:
 #	Add a gcc based make rule here in order for clangd to successfully scan the project files.
 #	Otherwise we do not need the gcc build.

--- a/homework_3/src/config.h
+++ b/homework_3/src/config.h
@@ -0,0 +1,74 @@
 /*!
 * \file
 * \brief   Build configuration file.
 *
 * \author
 *    Christos Choutouridis AEM:8997
 *    <cchoutou@ece.auth.gr>
 */

 #ifndef CONFIG_H_
 #define CONFIG_H_

 #include <cstdint>

 /*
 * Versioning:
 * - RC1:
 */
 static constexpr char version[] = "0.0";

 /*
 * Defines for different version of the exercise
 */
 #define  V0        (0)
 #define  V1        (1)
 #define  V2        (2)

 // Fail-safe version selection
 #if !defined CODE_VERSION
 #define CODE_VERSION   V0
 #endif

 // Default Data size (in case -q <N> is not present)
 static constexpr size_t DEFAULT_DATA_SIZE   = 1 << 16;


 /*!
 * Value type selection
 *
 * We support the following compiler types or the <cstdint> that translate to them:
 *  char       -  unsigned char
 *  short      -  unsigned short
 *  int        -  unsigned int
 *  long       -  unsigned long
 *  long long  -  unsigned long long
 *  float
 *  double
 */
 using   distValue_t = uint32_t;

 /*!
 * Session option for each invocation of the executable.
 *
 * @note
 *  The values of the members are set from the command line.
 */
 struct config_t {
    size_t  arraySize{DEFAULT_DATA_SIZE};   //!< The array size of the local data to sort.
    bool    exchangeOpt{false};             //!< Flag to request the exchange optimization
    size_t  pipeline{1UL};                  //!< Pipeline stages (1 to disable)
    bool    validation{false};              //!< Request a full validation at the end, performed by process rank 0.
    bool    ndebug{false};                  //!< Skips debug trap on DEBUG builds.
    size_t  perf{1};                        //!< Enable performance timing measurements and prints and repeat
                                            //!< the sorting <perf> times.
    bool    verbose{false};                 //!< Flag to enable verbose output to stdout.
 };

 /*
 * Exported data types
 */
 extern config_t config;


 #endif /* CONFIG_H_ */
--- a/homework_3/src/distsort.cpp
+++ b/homework_3/src/distsort.cpp
@@ -0,0 +1,33 @@
 /*!
 * \file
 * \brief   Distributed sort implementation
 *
 * \author
 *    Christos Choutouridis AEM:8997
 *    <cchoutou@ece.auth.gr>
 */
 #include "utils.hpp"
 #include "distsort.hpp"


 bool isActive(mpi_id_t node, size_t nodes) {
    if (!((nodes > 0) &&
          (nodes <= static_cast<size_t>(std::numeric_limits<mpi_id_t>::max())) ))
        throw std::runtime_error("(isActive) Non-acceptable value of MPI Nodes\n");
    // ^ Assert that mpi_id_t can hold nodes, and thus we can cast without data loss!

    return (node >= 0) && (node < static_cast<mpi_id_t>(nodes));
 }

 size_t tagGenerator(size_t depth, size_t step, size_t stage) {
    auto stage_bits   = static_cast<uint32_t>(std::log2(MAX_PIPELINE_SIZE));
    auto step_bits    = static_cast<uint32_t>(std::log2(MAX_MPI_SIZE));
    uint32_t stat_bit = 1UL;
    // ^ We use MPI_SIZE room for steps to fit the bubbletonic version

    // [  depth  | step | stage+stats ]
    size_t tag = stage
                 | (step << (stage_bits + stat_bit))
                 | (depth << (stage_bits + step_bits + stat_bit));
    return tag;
 }
--- a/homework_3/src/distsort.hpp
+++ b/homework_3/src/distsort.hpp
@@ -0,0 +1,454 @@
 /*!
 * \file
 * \brief   Distributed sort implementation header
 *
 * \author
 *    Christos Choutouridis AEM:8997
 *    <cchoutou@ece.auth.gr>
 */

 #ifndef DISTBITONIC_H_
 #define DISTBITONIC_H_

 #include <vector>
 #include <algorithm>
 #include <parallel/algorithm>
 #include <cmath>
 #include <cstdint>
 #if !defined DEBUG
 #define NDEBUG
 #endif
 #include <cassert>

 #include "utils.hpp"

 /*
 * Exported timers
 */
 extern Timing Timer_total;
 extern Timing Timer_fullSort;
 extern Timing Timer_exchange;
 extern Timing Timer_minmax;
 extern Timing Timer_elbowSort;


 /*!
 * Enumerator for the different versions of the sorting method
 */
 enum class SortMode {
    Bubbletonic,    //!< The v0.5 of the algorithm where we use a bubble-sort like approach
    Bitonic         //!< The v1.0 of the algorithm where we use the bitonic data-exchange approach
 };

 /*
 * ============================== Sort utilities ==============================
 */

 /*!
 * The primary function template of ascending(). It is DISABLED since , it is explicitly specialized
 * for each of the \c SortMode
 */
 template <SortMode Mode> inline bool ascending(mpi_id_t, [[maybe_unused]] size_t) noexcept = delete;

 /*!
 * Returns the ascending or descending configuration of the node's sequence based on
 * the current node (MPI process) and the depth of the sorting network
 *
 * @param node      [mpi_id_t] The current node (MPI process)
 * @return          [bool]     True if we need ascending configuration, false otherwise
 */
 template <> inline
 bool ascending<SortMode::Bubbletonic>(mpi_id_t node, [[maybe_unused]] size_t depth) noexcept {
    return (node % 2) == 0;
 }

 /*!
 * Returns the ascending or descending configuration of the node's sequence based on
 * the current node (MPI process) and the depth of the sorting network
 *
 * @param node      [mpi_id_t] The current node (MPI process)
 * @param depth     [size_t]   The total depth of the sorting network (same for each step for a given network)
 * @return          [bool]     True if we need ascending configuration, false otherwise
 */
 template <> inline
 bool ascending<SortMode::Bitonic>(mpi_id_t node, size_t depth) noexcept {
    return !(node & (1 << depth));
 }

 /*!
 * The primary function template of partner(). It is DISABLED since , it is explicitly specialized
 * for each of the \c SortMode
 */
 template <SortMode Mode> inline mpi_id_t partner(mpi_id_t, size_t) noexcept = delete;

 /*!
 * Returns the node's partner for data exchange during the sorting network iterations
 * of Bubbletonic
 *
 * @param node      [mpi_id_t] The current node
 * @param step      [size_t]   The step of the sorting network
 * @return          [mpi_id_t] The node id of the partner for data exchange
 */
 template <> inline
 mpi_id_t partner<SortMode::Bubbletonic>(mpi_id_t node, size_t step) noexcept {
    //return (node % 2 == step % 2) ? node + 1 : node - 1;
    return (((node+step) % 2) == 0) ? node + 1 : node - 1;
 }

 /*!
 * Returns the node's partner for data exchange during the sorting network iterations
 * of Bitonic
 *
 * @param node      [mpi_id_t] The current node
 * @param step      [size_t]   The step of the sorting network
 * @return          [mpi_id_t] The node id of the partner for data exchange
 */
 template <> inline
 mpi_id_t partner<SortMode::Bitonic>(mpi_id_t node, size_t step) noexcept {
    return (node ^ (1 << step));
 }


 /*!
 * The primary function template of keepSmall(). It is DISABLED since , it is explicitly specialized
 * for each of the \c SortMode
 */
 template<SortMode Mode> inline bool keepSmall(mpi_id_t, mpi_id_t, [[maybe_unused]] size_t) = delete;

 /*!
 * Predicate to check if a node keeps the small numbers during the bubbletonic sort network exchange.
 *
 * @param node      [mpi_id_t] The node for which we check
 * @param partner   [mpi_id_t] The partner of the data exchange
 * @return          [bool]     True if the node should keep the small values, false otherwise
 */
 template <> inline
 bool keepSmall<SortMode::Bubbletonic>(mpi_id_t node, mpi_id_t partner, [[maybe_unused]] size_t depth)  {
    if (node == partner)
        throw std::runtime_error("(keepSmall) Node and Partner can not be the same\n");
    return (node < partner);
 }

 /*!
 * Predicate to check if a node keeps the small numbers during the bitonic sort network exchange.
 *
 * @param node      [mpi_id_t] The node for which we check
 * @param partner   [mpi_id_t] The partner of the data exchange
 * @param depth     [size_t]   The total depth of the sorting network (same for each step for a given network)
 * @return          [bool]     True if the node should keep the small values, false otherwise
 */
 template <> inline
 bool keepSmall<SortMode::Bitonic>(mpi_id_t node, mpi_id_t partner, size_t depth) {
    if (node == partner)
        throw std::runtime_error("(keepSmall) Node and Partner can not be the same\n");
    return ascending<SortMode::Bitonic>(node, depth) == (node < partner);
 }

 /*!
 * Predicate to check if the node is active in the current iteration of the bubbletonic
 * sort exchange.
 *
 * @param node      [mpi_id_t] The node to check
 * @param nodes     [size_t]   The total number of nodes
 * @return          [bool]     True if the node is active, false otherwise
 */
 bool isActive(mpi_id_t node, size_t nodes);

 /*
 * ============================== Data utilities ==============================
 */

 /*!
 * Sort a range using the build-in O(Nlog(N)) algorithm
 *
 * @tparam RangeT   A range type with random access iterator
 *
 * @param data      [RangeT] The data to be sorted
 * @param ascending [bool]   Flag to indicate the sorting order
 */
 template<typename RangeT>
 void fullSort(RangeT& data, bool ascending) noexcept {
    // Use introsort from stdlib++ here, unless ... __gnu_parallel
    if (ascending) {
        __gnu_parallel::sort(data.begin(), data.end(), std::less<>());
    }
    else {
        __gnu_parallel::sort(data.begin(), data.end(), std::greater<>());
    }

    if (config.exchangeOpt)
        updateMinMax(localStat, data);
 }

 /*!
 * Core functionality of sort for shadowed buffer types using
 * the "elbow sort" algorithm.
 *
 * @note:
 *  This algorithm can not work "in place".
 *  We use the active buffer as source and the shadow as target.
 *  At the end we switch which buffer is active and which is the shadow.
 * @note
 *  This is the core functionality. Use the elbowSort() function instead
 *
 * @tparam ShadowedDataT    A Shadowed buffer type with random access iterator.
 * @tparam CompT            A Comparison type for binary operation comparisons
 *
 * @param data          [ShadowedDataT] The data to sort
 * @param ascending     [bool]          Flag to indicate the sorting order
 * @param comp          [CompT]         The binary operator object
 */
 template<typename ShadowedDataT, typename CompT>
 void elbowSortCore(ShadowedDataT& data, bool ascending, CompT comp) noexcept {
    auto& active = data.getActive(); // Get the source vector (the data to sort)
    auto& shadow = data.getShadow(); // Get the target vector (the sorted data)

    size_t N = data.size();         // The total size is the same or both vectors
    size_t left = std::distance(
            active.begin(),
            (ascending) ?
                std::min_element(active.begin(), active.end()) :
                std::max_element(active.begin(), active.end())
    );                              // start 'left' from elbow of the bitonic
    size_t right = (left == N-1) ? 0 : left + 1;

    // Walk in opposite directions from elbow and insert-sort to target vector
    for (size_t i = 0 ; i<N ; ++i) {
        if (comp(active[left], active[right])) {
            shadow[i] = active[left];
            left = (left == 0) ? N-1 : left -1; // cycle decrease
        }
        else {
            shadow[i] = active[right];
            right = (right + 1) % N;            // cycle increase
        }
    }
    data.switch_active();           // Switch active-shadow buffers
 }

 /*!
 * Sort a shadowed buffer using the "elbow sort" algorithm.
 *
 * @tparam ShadowedDataT    A Shadowed buffer type with random access iterator.
 *
 * @param data          [ShadowedDataT] The data to sort
 * @param ascending     [bool]          Flag to indicate the sorting order
 */
 template<typename ShadowedDataT>
 void elbowSort(ShadowedDataT& data, bool ascending) noexcept {
    if (ascending)
        elbowSortCore(data, ascending, std::less<>());
    else
        elbowSortCore(data, ascending, std::greater<>());
 }

 /*!
 * Predicate for exchange optimization. Returns true only if an exchange between partners is needed.
 * In order to do that we exchange min and max statistics of the partner's data.
 *
 * @tparam StatT    Statistics data type (for min-max)
 *
 * @param lstat     [const StatT]   Reference to the local statistic data
 * @param rstat     [StatT]         Reference to the remote statistic data to fill
 * @param part      [mpi_id_t]      The partner for the exchange
 * @param tag       [int]           The tag to use for the exchange of stats
 * @param keepSmall [bool]          Flag to indicate if the local thread keeps the small ro the large values
 * @return          True if we need data exchange, false otherwise
 */
 template<typename StatT>
 bool needsExchange(const StatT& lstat, StatT& rstat, mpi_id_t part, int tag, bool keepSmall) {
    timeCall(Timer_exchange, mpi.exchange_it, lstat, rstat, part, tag);
    return (keepSmall) ?
           rstat.min < lstat.max   // Lmin: rstat.min - Smax: lstat.max
         : lstat.min < rstat.max;  // Lmin: lstat.min - Smax: rstat.max
 }


 /*!
 * Update stats utility
 *
 * @tparam RangeT   A range type with random access iterator
 * @tparam StatT    Statistics data type (for min-max)
 *
 * @param stat      [StatT]         Reference to the statistic data to update
 * @param data      [const RangeT]  Reference to the sequence to extract stats from
 */
 template<typename RangeT, typename StatT>
 void updateMinMax(StatT& stat, const RangeT& data) noexcept {
    auto [min, max] = std::minmax_element(data.begin(), data.end());
    stat.min = *min;
    stat.max = *max;
 }

 /*!
 * Takes two sequences and selects either the larger or the smaller items
 * in one-to-one comparison between them. If the initial sequences are bitonic, then
 * the result is a bitonic sequence too!
 *
 * @tparam ValueT   The underlying type of the sequences
 *
 * @param local     [ValueT*]       Pointer to the local sequence
 * @param remote    [const ValueT*] Pointer to the remote sequence (copied locally by MPI)
 * @param count     [size_t]        The number of items to process
 * @param keepSmall [bool]          Flag to indicate if we keep the small items in local sequence
 */
 template<typename ValueT>
 void keepMinOrMax(ValueT* local, const ValueT* remote, size_t count, bool keepSmall) noexcept {
    std::transform(
            local, local + count,
            remote,
            local,
            [&keepSmall](const ValueT& a, const ValueT& b){
                return (keepSmall) ? std::min(a, b) : std::max(a, b);
            });
 }

 /*
 * ============================== Sort algorithms ==============================
 */

 /*!
 * A small tag generator tool to provide consistent encoding to tag communication
 *
 * @param depth     The current algorithmic depth[bitonic] of the communication, if any
 * @param step      The current step on the current depth
 * @param stage     The stage of the pipeline.
 * @return          The tag to use.
 *
 * @note
 *      In case we call this function outside of the pipeline loop, we can ommit
 *      @c stage argument and use the return value as starting tag for every communication
 *      of the pipeline loop. We need to increase the tags for each communication of
 *      the pipeline loop though!
 */
 size_t tagGenerator(size_t depth, size_t step, size_t stage = 0);

 /*!
 * An exchange functionality to support both Bubbletonic and Bitonic sort algorithms.
 *
 * @note
 *  In case of pipeline request it switches to non-blocking MPI communication for
 *  pipelining min-max process with mpi data exchange
 *
 * @tparam ShadowedDataT    A Shadowed buffer type with random access iterator.
 *
 * @param data      [ShadowedDataT&]    Reference to the data to exchange
 * @param partner   [mpi_id_t]          The partner for the exchange
 * @param keepSmall [bool]              Flag to indicate if we keep the small values
 * @param tag       [int]               The init tag to use for the loop.
 *
 * @note
 *      The @c tag is increased inside the pipeline loop for each different data exchange
 */
 template<typename ShadowedDataT>
 void exchange(ShadowedDataT& data, mpi_id_t partner, bool keepSmall, int tag) {
    using Value_t = typename ShadowedDataT::value_type;

    // Init counters and pointers
    Value_t* active = data.getActive().data();
    Value_t* shadow = data.getShadow().data();
    size_t    count = data.size() / config.pipeline;

    if (config.pipeline > 1) {
        // Pipeline case - use async MPI
        Timer_exchange.start();
        mpi.exchange_start(active, shadow, count, partner, tag);
        for (size_t stage = 0; stage < config.pipeline; active += count, shadow += count) {
            // Wait previous chunk
            mpi.exchange_wait();
            Timer_exchange.stop();
            if (++stage < config.pipeline) {
                // Start next chunk if there is a next one
                Timer_exchange.start();
                mpi.exchange_start(active + count, shadow + count, count, partner, ++tag);
            }
            // process the arrived data
            timeCall(Timer_minmax, keepMinOrMax, active, shadow, count, keepSmall);
        }
    }
    else {
        // No pipeline - use blocking MPI
        timeCall(Timer_exchange, mpi.exchange, active, shadow, count, partner, tag);
        timeCall(Timer_minmax, keepMinOrMax, active, shadow, count, keepSmall);
    }
    if (config.exchangeOpt)
        updateMinMax(localStat, data);
 }

 /*!
 * A distributed version of the Bubbletonic sort algorithm.
 *
 * @note
 *  Each MPI process should run an instance of this function.
 *
 * @tparam ShadowedDataT    A Shadowed buffer type with random access iterator.
 *
 * @param data          [ShadowedDataT] The local to MPI process data to sort
 * @param Processes     [mpi_id_t]      The total number of MPI processes
 * @param rank          [mpi_id_t]      The current process id
 */
 template<typename ShadowedDataT>
 void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) {
    // Initially sort to create a half part of a bitonic sequence
    timeCall(Timer_fullSort, fullSort, data, ascending<SortMode::Bubbletonic>(rank, 0));

    // Sort network (O(N) iterations)
    for (size_t step = 0; step < static_cast<size_t>(Processes); ++step) {
        // Find out exchange configuration
        auto part = partner<SortMode::Bubbletonic>(rank, step);
        auto ks = keepSmall<SortMode::Bubbletonic>(rank, part, Processes);
        if ( isActive(rank, Processes) &&
             isActive(part, Processes) ) {
            // Exchange with partner, keep nim-or-max and sort - O(N)
            int tag = static_cast<int>(tagGenerator(0, step));
            if (!config.exchangeOpt || needsExchange(localStat, remoteStat, part, tag++, ks)) {
                exchange(data, part, ks, tag);
                timeCall(Timer_elbowSort, elbowSort, data, ascending<SortMode::Bubbletonic>(rank, Processes));
            }
        }
    }

    // Invert if the node was descending.
    if (!ascending<SortMode::Bubbletonic>(rank, 0)) {
        elbowSort(data, true);
    }
 }


 /*!
 * A distributed version of the Bitonic sort algorithm.
 *
 * @note
 *  Each MPI process should run an instance of this function.
 *
 * @tparam ShadowedDataT    A Shadowed buffer type with random access iterator.
 *
 * @param data          [ShadowedDataT] The local to MPI process data to sort
 * @param Processes     [mpi_id_t]      The total number of MPI processes
 * @param rank          [mpi_id_t]      The current process id
 */
 template<typename ShadowedDataT>
 void distBitonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) {
    // Initially sort to create a half part of a bitonic sequence
    timeCall(Timer_fullSort, fullSort, data, ascending<SortMode::Bitonic>(rank, 0));

    // Run through sort network using elbow-sort ( O(LogN * LogN) iterations )
    auto p = static_cast<uint32_t>(std::log2(Processes));
    for (size_t depth = 1; depth <= p; ++depth) {
        for (size_t step = depth; step > 0;) {
            --step;
            // Find out exchange configuration
            auto part = partner<SortMode::Bitonic>(rank, step);
            auto ks = keepSmall<SortMode::Bitonic>(rank, part, depth);
            // Exchange with partner, keep nim-or-max
            int tag = static_cast<int>(tagGenerator(depth, step));
            if (!config.exchangeOpt || needsExchange(localStat, remoteStat, part, tag++, ks)) {
                exchange(data, part, ks, tag);
            }
        }
        // sort - O(N)
        timeCall(Timer_elbowSort, elbowSort, data, ascending<SortMode::Bitonic>(rank, depth));
    }
 }

 #endif //DISTBITONIC_H_
--- a/homework_3/src/main.cpp
+++ b/homework_3/src/main.cpp
@@ -0,0 +1,318 @@
 /*!
 * \file
 * \brief   Main application file for PDS HW3 (CUDA)
 *
 * \author
 *    Christos Choutouridis AEM:8997
 *    <cchoutou@ece.auth.gr>
 */

 #include <exception>
 #include <iostream>
 #include <algorithm>
 #include <random>

 #include "utils.hpp"
 #include "config.h"
 #include "distsort.hpp"


 // Global session data
 config_t        config;
 MPI_t<>         mpi;
 distBuffer_t    Data;
 Log             logger;
 distStat_t      localStat, remoteStat;

 // Mersenne seeded from hw if possible. range: [type_min, type_max]
 std::random_device  rd;
 std::mt19937        gen(rd());

 //! Performance timers for each one of the "costly" functions
 Timing Timer_total;
 Timing Timer_fullSort;
 Timing Timer_exchange;
 Timing Timer_minmax;
 Timing Timer_elbowSort;

 //! Init timing objects for extra rounds
 void measurements_init() {
    if (config.perf > 1) {
        Timer_total.init(config.perf);
        Timer_fullSort.init(config.perf);
        Timer_exchange.init(config.perf);
        Timer_minmax.init(config.perf);
        Timer_elbowSort.init(config.perf);
    }
 }

 //! iterate ot the next round of measurements for all measurement objects
 void measurements_next() {
    if (config.perf > 1) {
        Timer_total.next();
        Timer_fullSort.next();
        Timer_exchange.next();
        Timer_minmax.next();
        Timer_elbowSort.next();
    }
 }

 /*!
 * A small command line argument parser
 * \return  The status of the operation
 */
 bool get_options(int argc, char* argv[]){
    bool status =true;

    // iterate over the passed arguments
    for (int i=1 ; i<argc ; ++i) {
        std::string arg(argv[i]);     // get current argument

        if (arg == "-q" || arg == "--array-size") {
            if (i+1 < argc) {
                config.arraySize = 1 << atoi(argv[++i]);
            }
            else {
                status = false;
            }
        }
        else if (arg == "-e" || arg == "--exchange-opt") {
            config.exchangeOpt = true;
        }
        else if (arg == "--pipeline") {
            if (i+1 < argc) {
                auto stages = atoi(argv[++i]);
                if (isPowerOfTwo(stages) && stages <= static_cast<int>(MAX_PIPELINE_SIZE))
                    config.pipeline = stages;
                else
                    status = false;
            }
            else {
                status = false;
            }
        }
        else if (arg == "--validation") {
            config.validation = true;
        }
        else if (arg == "--perf") {
            if (i+1 < argc) {
                config.perf = atoi(argv[++i]);
            }
            else {
                status = false;
            }
        }
        else if (arg == "--ndebug") {
            config.ndebug = true;
        }
        else if (arg == "-v" || arg == "--verbose") {
            config.verbose = true;
        }
        else if (arg == "--version") {
            std::cout << "distbitonic/distbubbletonic - A distributed sort utility\n";
            std::cout << "version: " << version << "\n\n";
            exit(0);
        }
        else if (arg == "-h" || arg == "--help") {
            std::cout << "distbitonic/distbubbletonic - A distributed sort utility\n\n";
            std::cout << "  distbitonic -q <N> [-e] [-p | --pipeline <N>] [--validation] [--perf <N>] [--ndebug] [-v]\n";
            std::cout << "  distbitonic -h\n";
            std::cout << "  distbubbletonic -q <N> [-e] [-p | --pipeline <N>] [--validation] [--perf <N> ] [--ndebug] [-v]\n";
            std::cout << "  distbubbletonic -h\n";
            std::cout << '\n';
            std::cout << "Options:\n\n";
            std::cout << "   -q | --array-size <N>\n";
            std::cout << "      Selects the array size according to size = 2^N\n\n";
            std::cout << "   -e | --exchange-opt\n";
            std::cout << "      Request an MPI data exchange optimization \n\n";
            std::cout << "   -p <N> | --pipeline <N>\n";
            std::cout << "      Request a pipeline of <N> stages for exchange-minmax\n";
            std::cout << "      N must be power of 2 up to " << MAX_PIPELINE_SIZE << "\n\n";
            std::cout << "   --validation\n";
            std::cout << "      Request a full validation at the end, performed by process rank 0\n\n";
            std::cout << "   --perf <N> \n";
            std::cout << "      Enable performance timing measurements and prints, and repeat\n";
            std::cout << "      the sorting <N> times.\n\n";
            std::cout << "   --ndebug\n";
            std::cout << "      Skip debug breakpoint when on debug build.\n\n";
            std::cout << "   -v | --verbose\n";
            std::cout << "      Request a more verbose output to stdout.\n\n";
            std::cout << "   -h | --help\n";
            std::cout << "      Prints this and exit.\n\n";
            std::cout << "   --version\n";
            std::cout << "      Prints version and exit.\n\n";
            std::cout << "Examples:\n\n";
            std::cout << "   mpirun -np 4 distbitonic -q 24\n";
            std::cout << "      Runs distbitonic in 4 MPI processes with 2^24 array points each\n\n";
            std::cout << "   mpirun -np 16 distbubbletonic -q 20\n";
            std::cout << "      Runs distbubbletonic in 16 MPI processes with 2^20 array points each\n\n";

            exit(0);
        }
        else {   // parse error
            std::cout << "Invocation error. Try -h for details.\n";
            status = false;
        }
    }

    return status;
 }

 /*!
 * A simple validator for the entire distributed process
 *
 * @tparam ShadowedDataT    A Shadowed buffer type with random access iterator.
 *
 * @param data          [ShadowedDataT] The local to MPI process
 * @param Processes     [mpi_id_t]      The total number of MPI processes
 * @param rank          [mpi_id_t]      The current process id
 *
 * @return              [bool]          True if all are sorted and in total ascending order
 */
 template<typename ShadowedDataT>
 bool validator(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) {
    using value_t = typename ShadowedDataT::value_type;
    bool ret = true;    // Have faith!

    // Local results
    value_t lmin   = data.front();
    value_t lmax   = data.back();
    value_t lsort  = static_cast<value_t>(std::is_sorted(data.begin(), data.end()));

    // Gather min/max/sort to rank 0
    std::vector<value_t> mins(Processes);
    std::vector<value_t> maxes(Processes);
    std::vector<value_t> sorts(Processes);

    MPI_Datatype datatype = MPI_TypeMapper<value_t>::getType();
    MPI_Gather(&lmin,  1, datatype, mins.data(),  1, datatype, 0, MPI_COMM_WORLD);
    MPI_Gather(&lmax,  1, datatype, maxes.data(), 1, datatype, 0, MPI_COMM_WORLD);
    MPI_Gather(&lsort, 1, datatype, sorts.data(), 1, datatype, 0, MPI_COMM_WORLD);

    // Check all results
    if (rank == 0) {
        for (mpi_id_t r = 1; r < Processes; ++r) {
            if (sorts[r] == 0)
                ret = false;
            if (maxes[r - 1] > mins[r])
                ret = false;
        }
    }
    return ret;
 }

 /*!
 * Initializes the environment, must called from each process
 *
 * @param argc  [int*]      POINTER to main's argc argument
 * @param argv  [char***]   POINTER to main's argv argument
 */
 void init(int* argc, char*** argv) {
    // try to read command line
    if (!get_options(*argc, *argv))
        exit(1);

    // Initialize MPI environment
    mpi.init(argc, argv);

    logger << "MPI environment initialized." << " Rank: " << mpi.rank() << " Size: " << mpi.size()
           << logger.endl;

    #if defined DEBUG
    #if defined TESTING
        /*
         * In case of a debug build we will wait here until sleep_wait
         * will reset via debugger. In order to do that the user must attach
         * debugger to all processes. For example:
         *  $> mpirun -np 2 ./<program path>
         *  $> ps aux | grep <program>
         *  $> gdb <program> <PID1>
         *  $> gdb <program> <PID2>
         */
         volatile bool sleep_wait = false;
    #else
        volatile bool sleep_wait = true;
    #endif
        while (sleep_wait && !config.ndebug)
            sleep(1);
    #endif

    // Prepare vector and timing data
    Data.resize(config.arraySize);
    measurements_init();
 }

 #if !defined TESTING
 /*!
 * @return Returns 0, but.... we may throw or exit(0) / exit(1)
 */
 int main(int argc, char* argv[]) try {

    // Init everything
    init(&argc, &argv);

    for (size_t it = 0 ; it < config.perf ; ++it) {
        // Initialize local data
        logger << "Initialize local array of " << config.arraySize << " elements" << logger.endl;
        std::uniform_int_distribution<distValue_t > dis(
                std::numeric_limits<distValue_t>::min(),
                std::numeric_limits<distValue_t>::max()
        );
        std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); });
        // Run distributed sort
        if (mpi.rank() == 0)
            logger << "Starting distributed sorting ... ";
        Timer_total.start();
    #if CODE_VERSION == BUBBLETONIC
        distBubbletonic(Data, mpi.size(), mpi.rank());
    #else
        distBitonic(Data, mpi.size(), mpi.rank());
    #endif
        Timer_total.stop();
        measurements_next();
        if (mpi.rank() == 0)
            logger << " Done." << logger.endl;
    }

    // Print-outs and validation
    if (config.perf > 1) {
        Timing::print_duration(Timer_total.median(),    "Total     ", mpi.rank());
        Timing::print_duration(Timer_fullSort.median(), "Full-Sort ", mpi.rank());
        Timing::print_duration(Timer_exchange.median(), "Exchange  ", mpi.rank());
        Timing::print_duration(Timer_minmax.median(),   "Min-Max   ", mpi.rank());
        Timing::print_duration(Timer_elbowSort.median(),"Elbow-Sort", mpi.rank());
    }
    if (config.validation) {
        // If requested, we have the chance to fail!
        if (mpi.rank() == 0)
            std::cout << "[Validation] Results validation ...";
        bool val = validator(Data, mpi.size(), mpi.rank());
        if (mpi.rank() == 0)
            std::cout << ((val) ? "\x1B[32m [PASSED] \x1B[0m\n" : " \x1B[32m [FAILED] \x1B[0m\n");
    }
    mpi.finalize();
    return 0;
 }
 catch (std::exception& e) {
    //we probably pollute the user's screen. Comment `cerr << ...` if you don't like it.
    std::cerr << "Error: " << e.what() << '\n';
    exit(1);
 }

 #else

 #include <gtest/gtest.h>
 #include <exception>

 /*!
 * The testing version of our program
 */
 GTEST_API_ int main(int argc, char **argv) try {
   testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
 catch (std::exception& e) {
    std::cout << "Exception: " << e.what() << '\n';
 }

 #endif
--- a/homework_3/src/utils.hpp
+++ b/homework_3/src/utils.hpp
@@ -0,0 +1,516 @@
 /**
 * \file
 * \brief   Utilities header
 *
 * \author
 *    Christos Choutouridis AEM:8997
 *    <cchoutou@ece.auth.gr>
 */
 #ifndef UTILS_HPP_
 #define UTILS_HPP_

 #include <vector>
 #include <iostream>
 #include <chrono>
 #include <unistd.h>
 #include <mpi.h>

 #include "config.h"

 /*!
 * Min-Max statistics data for exchange optimization
 * @tparam Value_t  The underlying data type of the sequence data
 */
 template <typename Value_t>
 struct Stat_t {
    using value_type = Value_t;     //!< meta-export the type

    Value_t min{};                  //!< The minimum value of the sequence
    Value_t max{};                  //!< The maximum value of the sequence
 };

 //! Application data selection alias
 using distStat_t = Stat_t<distValue_t>;
 extern distStat_t localStat, remoteStat;    // Make stats public

 /*
 * MPI_<type> dispatcher mechanism
 */
 template <typename T> struct MPI_TypeMapper { };

 template <> struct MPI_TypeMapper<char>          { static MPI_Datatype getType() { return MPI_CHAR; } };
 template <> struct MPI_TypeMapper<short>         { static MPI_Datatype getType() { return MPI_SHORT; } };
 template <> struct MPI_TypeMapper<int>           { static MPI_Datatype getType() { return MPI_INT; } };
 template <> struct MPI_TypeMapper<long>          { static MPI_Datatype getType() { return MPI_LONG; } };
 template <> struct MPI_TypeMapper<long long>     { static MPI_Datatype getType() { return MPI_LONG_LONG; } };
 template <> struct MPI_TypeMapper<unsigned char> { static MPI_Datatype getType() { return MPI_UNSIGNED_CHAR; } };
 template <> struct MPI_TypeMapper<unsigned short>{ static MPI_Datatype getType() { return MPI_UNSIGNED_SHORT; } };
 template <> struct MPI_TypeMapper<unsigned int>  { static MPI_Datatype getType() { return MPI_UNSIGNED; } };
 template <> struct MPI_TypeMapper<unsigned long> { static MPI_Datatype getType() { return MPI_UNSIGNED_LONG; } };
 template <> struct MPI_TypeMapper<unsigned long long> { static MPI_Datatype getType() { return MPI_UNSIGNED_LONG_LONG; } };
 template <> struct MPI_TypeMapper<float>         { static MPI_Datatype getType() { return MPI_FLOAT; } };
 template <> struct MPI_TypeMapper<double>        { static MPI_Datatype getType() { return MPI_DOUBLE; } };

 /*!
 * MPI wrapper type to provide MPI functionality and RAII to MPI as a resource
 *
 * @tparam TID  The MPI type for process id [default: int]
 */
 template<typename TID = int>
 struct MPI_t {
    using ID_t = TID; // Export TID type (currently int defined by the standard)

    /*!
     * Initializes the MPI environment, must called from each process
     *
     * @param argc  [int*]      POINTER to main's argc argument
     * @param argv  [char***]   POINTER to main's argv argument
     */
    void init(int* argc, char*** argv) {
        // Initialize the MPI environment
        int err;
        if ((err = MPI_Init(argc, argv)) != MPI_SUCCESS)
            mpi_throw(err, "(MPI) MPI_Init() - ");
        initialized_ = true;

        // Get the number of processes
        int size_value, rank_value;
        if ((err = MPI_Comm_size(MPI_COMM_WORLD, &size_value)) != MPI_SUCCESS)
            mpi_throw(err, "(MPI) MPI_Comm_size() - ");
        if ((err = MPI_Comm_rank(MPI_COMM_WORLD, &rank_value)) != MPI_SUCCESS)
            mpi_throw(err, "(MPI) MPI_Comm_rank() - ");
        size_ = static_cast<ID_t>(size_value);
        rank_ = static_cast<ID_t>(rank_value);
        if (size_ > static_cast<ID_t>(MAX_MPI_SIZE))
            throw std::runtime_error(
                    "(MPI) size - Not supported number of nodes [over " + std::to_string(MAX_MPI_SIZE) + "]\n"
            );

        // Get the name of the processor
        char processor_name[MPI_MAX_PROCESSOR_NAME];
        int name_len;
        if ((err = MPI_Get_processor_name(processor_name, &name_len)) != MPI_SUCCESS)
            mpi_throw(err, "(MPI) MPI_Get_processor_name() - ");
        name_ = std::string (processor_name, name_len);
    }

    /*!
     * Exchange one data object of type @c T with partner as part of the sorting network of both
     * bubbletonic or bitonic sorting algorithms.
     *
     * This function matches a transmit and a receive in order for fully exchanged the data object
     * between current node and partner.
     *
     * @tparam T        The object type
     *
     * @param local     [const T&]  Reference to the local object to send
     * @param remote    [T&]        Reference to the object to receive data from partner
     * @param partner   [mpi_id_t]  The partner for the exchange
     * @param tag       [int]       The tag to use for the MPI communication
     */
    template<typename T>
    void exchange_it(const T& local, T& remote, ID_t partner, int tag) {
        if (tag < 0)
            throw std::runtime_error("(MPI) exchange_it() [tag] - Out of bound");
        MPI_Status status;
        int err;
        if ((err = MPI_Sendrecv(
                &local, sizeof(T), MPI_BYTE, partner, tag,
                &remote, sizeof(T), MPI_BYTE, partner, tag,
                MPI_COMM_WORLD, &status
        )) != MPI_SUCCESS)
            mpi_throw(err, "(MPI) MPI_Sendrecv() [item] - ");
    }

    /*!
     * Exchange data with partner as part of the sorting network of both bubbletonic or bitonic
     * sorting algorithms.
     *
     * This function matches a transmit and a receive in order for fully exchanged data between
     * current node and partner.
     *
     * @tparam ValueT   The value type used in buffer
     *
     * @param ldata     [const ValueT*] Pointer to local data to send
     * @param rdata     [ValueT*]       Pointer to buffer to receive data from partner
     * @param count     [size_t]        The number of data to exchange
     * @param partner   [mpi_id_t]      The partner for the exchange
     * @param tag       [int]           The tag to use for the MPI communication
     */
    template<typename ValueT>
    void exchange(const ValueT* ldata, ValueT* rdata, size_t count, ID_t partner, int tag) {
        if (tag < 0)
            throw std::runtime_error("(MPI) exchange_data() [tag] - Out of bound");

        MPI_Datatype datatype = MPI_TypeMapper<ValueT>::getType();
        MPI_Status status;
        int err;
        if ((err = MPI_Sendrecv(
                ldata, count, datatype, partner, tag,
                rdata, count, datatype, partner, tag,
                MPI_COMM_WORLD, &status
        )) != MPI_SUCCESS)
            mpi_throw(err, "(MPI) MPI_Sendrecv() [data] - ");
    }

    /*!
     * Initiate a data exchange data with partner using non-blocking Isend-Irecv, as part of the
     * sorting network of both bubbletonic or bitonic sorting algorithms.
     *
     * This function matches a transmit and a receive in order for fully exchanged data between
     * current node and partner.
     * @note
     *      This call MUST paired with exchange_wait() for each MPI_t object.
     *      Calling 2 consecutive exchange_start() for the same MPI_t object is undefined.
     *
     * @tparam ValueT   The value type used in buffers
     *
     * @param ldata     [const ValueT*] Pointer to local data to send
     * @param rdata     [ValueT*]       Pointer to buffer to receive data from partner
     * @param count     [size_t]        The number of data to exchange
     * @param partner   [mpi_id_t]      The partner for the exchange
     * @param tag       [int]           The tag to use for the MPI communication
     */
    template<typename ValueT>
    void exchange_start(const ValueT* ldata, ValueT* rdata, size_t count, ID_t partner, int tag) {
        if (tag < 0)
            throw std::runtime_error("(MPI) exchange_data() [tag] - Out of bound");

        MPI_Datatype datatype = MPI_TypeMapper<ValueT>::getType();
        int err;
        err = MPI_Isend(ldata, count, datatype, partner, tag, MPI_COMM_WORLD, &handle_tx);
        if (err != MPI_SUCCESS)
            mpi_throw(err, "(MPI) MPI_Isend() - ");
        err = MPI_Irecv(rdata, count, datatype, partner, tag, MPI_COMM_WORLD, &handle_rx);
        if (err != MPI_SUCCESS)
            mpi_throw(err, "(MPI) MPI_Irecv() - ");
    }

    /*!
     * Block wait for the completion of the previously called exchange_start()
     *
     * @note
     *      This call MUST paired with exchange_start() for each MPI_t object.
     *      Calling 2 consecutive exchange_wait() for the same MPI_t object is undefined.
     */
    void exchange_wait() {
        MPI_Status status;

        int err;
        if ((err = MPI_Wait(&handle_tx, &status)) != MPI_SUCCESS)
            mpi_throw(err, "(MPI) MPI_Wait() [send] - ");

        if ((err = MPI_Wait(&handle_rx, &status)) != MPI_SUCCESS)
            mpi_throw(err, "(MPI) MPI_Wait() [recv] - ");
    }

    // Accessors
    [[nodiscard]] ID_t rank() const noexcept { return rank_; }
    [[nodiscard]] ID_t size() const noexcept { return size_; }
    [[nodiscard]] const std::string& name() const noexcept { return name_; }

    // Mutators
    ID_t rank(ID_t rank) noexcept { return rank_ = rank; }
    ID_t size(ID_t size) noexcept { return size_ = size; }
    std::string& name(const std::string& name) noexcept { return name_ = name; }

    /*!
     * Finalized the MPI
     */
    void finalize() {
        // Finalize the MPI environment
        initialized_ = false;
        MPI_Finalize();
    }

    //! RAII MPI finalization
    ~MPI_t() {
        // Finalize the MPI environment even on unexpected errors
        if (initialized_)
            MPI_Finalize();
    }


    // Local functionality
 private:
    /*!
     * Throw exception helper. It bundles the prefix msg with the MPI error string retrieved by
     * MPI API.
     *
     * @param err           The MPI error code
     * @param prefixMsg     The prefix text for the exception error message
     */
    void mpi_throw(int err, const char* prefixMsg) {
        char err_msg[MPI_MAX_ERROR_STRING];
        int msg_len;
        MPI_Error_string(err, err_msg, &msg_len);
        throw std::runtime_error(prefixMsg + std::string (err_msg) + '\n');
    }

 private:
    ID_t rank_{};           //!< MPI rank of the process
    ID_t size_{};           //!< MPI total size of the execution
    std::string name_{};    //!< The name of the local machine
    bool initialized_{};    //!< RAII helper flag
    MPI_Request handle_tx{};    //!< MPI async exchange handler for Transmission
    MPI_Request handle_rx{};    //!< MPI async exchange handler for Receptions
 };

 /*
 * Exported data types
 */
 extern MPI_t<>  mpi;
 using mpi_id_t = MPI_t<>::ID_t;



 /*!
 * @brief A std::vector wrapper with 2 vectors, an active and a shadow.
 *
 * This type exposes the standard vector functionality of the active vector.
 * The shadow can be used when we need to use the vector as mutable
 * data in algorithms that can not support "in-place" editing (like elbow-sort for example)
 *
 * @tparam Value_t  the underlying data type of the vectors
 */
 template <typename Value_t>
 struct ShadowedVec_t {
    // STL requirements
    using value_type     = Value_t;
    using iterator       = typename std::vector<Value_t>::iterator;
    using const_iterator = typename std::vector<Value_t>::const_iterator;
    using size_type      = typename std::vector<Value_t>::size_type;

    // Default constructor
    ShadowedVec_t() = default;

    // Constructor from an std::vector
    explicit ShadowedVec_t(const std::vector<Value_t>& vec)
            : North(vec), South(), active(north) {
        South.resize(North.size());
    }

    explicit ShadowedVec_t(std::vector<Value_t>&& vec)
            : North(std::move(vec)), South(), active(north) {
        South.resize(North.size());
    }

    // Copy assignment operator
    ShadowedVec_t& operator=(const ShadowedVec_t& other) {
        if (this != &other) { // Avoid self-assignment
            North = other.North;
            South = other.South;
            active = other.active;
        }
        return *this;
    }

    // Move assignment operator
    ShadowedVec_t& operator=(ShadowedVec_t&& other) noexcept {
        if (this != &other) { // Avoid self-assignment
            North = std::move(other.North);
            South = std::move(other.South);
            active = other.active;

            // There is no need to zero out other since it is valid but in a non-defined state
        }
        return *this;
    }

    // Type accessors
    std::vector<Value_t>& getActive() { return (active == north) ? North : South; }
    std::vector<Value_t>& getShadow() { return (active == north) ? South : North; }
    const std::vector<Value_t>& getActive() const { return (active == north) ? North : South; }
    const std::vector<Value_t>& getShadow() const { return (active == north) ? South : North; }

    // Swap vectors
    void switch_active() { active = (active == north) ? south : north; }

    // Dispatch vector functionality to active vector
    Value_t& operator[](size_type index) { return getActive()[index]; }
    const Value_t& operator[](size_type index) const { return getActive()[index]; }

    Value_t& at(size_type index) { return getActive().at(index); }
    const Value_t& at(size_type index) const { return getActive().at(index); }

    void push_back(const Value_t& value) { getActive().push_back(value); }
    void push_back(Value_t&& value)      { getActive().push_back(std::move(value)); }
    void pop_back()                      { getActive().pop_back(); }
    Value_t& front() { return getActive().front(); }
    Value_t& back()  { return getActive().back(); }
    const Value_t& front() const { return getActive().front(); }
    const Value_t& back()  const { return getActive().back(); }

    iterator begin() { return getActive().begin(); }
    const_iterator begin() const { return getActive().begin(); }
    iterator end() { return getActive().end(); }
    const_iterator end() const { return getActive().end(); }

    size_type size() const { return getActive().size(); }
    void resize(size_t new_size) {
        North.resize(new_size);
        South.resize(new_size);
    }

    void reserve(size_t new_capacity) {
        North.reserve(new_capacity);
        South.reserve(new_capacity);
    }
    [[nodiscard]] size_t capacity() const { return getActive().capacity(); }
    [[nodiscard]] bool empty() const { return getActive().empty(); }

    void clear() { getActive().clear(); }
    void swap(std::vector<Value_t>& other) { getActive().swap(other); }

    // Comparisons
    bool operator== (const ShadowedVec_t& other) { return getActive() == other.getActive(); }
    bool operator!= (const ShadowedVec_t& other) { return getActive() != other.getActive(); }
    bool operator== (const std::vector<value_type>& other) { return getActive() == other; }
    bool operator!= (const std::vector<value_type>& other) { return getActive() != other; }

 private:
    std::vector<Value_t> North{};       //!< Actual buffer to be used either as active or shadow
    std::vector<Value_t> South{};       //!< Actual buffer to be used either as active or shadow
    enum {
        north, south
    } active{north};                    //!< Flag to select between North and South buffer
 };

 /*
 * Exported data types
 */
 using distBuffer_t = ShadowedVec_t<distValue_t>;
 extern distBuffer_t Data;

 /*!
 * A Logger for entire program.
 */
 struct Log {
    struct Endl {} endl;    //!< a tag object to to use it as a new line request.

    //! We provide logging via << operator
    template<typename T>
    Log &operator<<(T &&t) {
        if (config.verbose) {
            if (line_) {
                std::cout << "[Log]: " << t;
                line_ = false;
            } else
                std::cout << t;
        }
        return *this;
    }

    // overload for special end line handling
    Log &operator<<(Endl e) {
        (void) e;
        if (config.verbose) {
            std::cout << '\n';
            line_ = true;
        }
        return *this;
    }

 private:
    bool line_{true};
 };

 extern Log logger;

 /*!
 * A small timing utility based on chrono that supports timing rounds
 * and returning the median of them. Time can accumulate to the measurement
 * for each round.
 */
 struct Timing {
    using Tpoint = std::chrono::steady_clock::time_point;
    using Tduration = std::chrono::microseconds;
    using microseconds = std::chrono::microseconds;
    using milliseconds = std::chrono::milliseconds;
    using seconds = std::chrono::seconds;

    //! Setup measurement rounds
    void init(size_t rounds) {
        duration_.resize(rounds);
        for (auto& d : duration_)
            d = Tduration::zero();
    }

    //! tool to mark the starting point
    Tpoint start() noexcept { return mark_ = std::chrono::steady_clock::now(); }

    //! tool to mark the ending point
    Tpoint stop() noexcept {
        Tpoint now = std::chrono::steady_clock::now();
        duration_[current_] += dt(now, mark_);
        return now;
    }

    //! Switch timing slot
    void next() noexcept {
        ++current_;
        current_ %= duration_.size();
    }

    Tduration& median() noexcept {
        std::sort(duration_.begin(), duration_.end());
        return duration_[duration_.size()/2];
    }

    //! A duration calculation utility
    static Tduration dt(Tpoint t2, Tpoint t1) noexcept {
        return std::chrono::duration_cast<Tduration>(t2 - t1);
    }

    //! Tool to print the time interval
    static void print_duration(const Tduration& duration, const char *what, mpi_id_t rank) noexcept {
        if (std::chrono::duration_cast<microseconds>(duration).count() < 10000)
            std::cout << "[Timing] (Rank " << rank << ") " << what << ": "
                      << std::to_string(std::chrono::duration_cast<microseconds>(duration).count()) << " [usec]\n";
        else if (std::chrono::duration_cast<milliseconds>(duration).count() < 10000)
            std::cout << "[Timing] (Rank " << rank << ") " << what << ": "
                      << std::to_string(std::chrono::duration_cast<milliseconds>(duration).count()) << " [msec]\n";
        else {
            char stime[26]; // fit ulong
            auto sec  = std::chrono::duration_cast<seconds>(duration).count();
            auto msec = (std::chrono::duration_cast<milliseconds>(duration).count() % 1000) / 10;  // keep 2 digit
            std::sprintf(stime, "%ld.%1ld", sec, msec);
            std::cout << "[Timing] (Rank " << rank << ") " << what << ": " << stime << " [sec]\n";
        }

    }

 private:
    size_t current_{0};
    Tpoint mark_{};
    std::vector<Tduration> duration_{1};
 };

 /*!
 * A "high level function"-like utility macro to forward a function call
 * and accumulate the execution time to the corresponding timing object.
 *
 * @param   Tim     The Timing object [Needs to have methods start() and stop()]
 * @param   Func    The function name
 * @param   ...     The arguments to pass to function (the preprocessor way)
 */
 #define timeCall(Tim, Func, ...)    \
    Tim.start();                    \
    Func(__VA_ARGS__);              \
    Tim.stop();                     \


 /*!
 * A utility to check if a number is power of two
 *
 * @tparam Integral     The integral type of the number to check
 * @param x             The number to check
 * @return              True if it is power of 2, false otherwise
 */
 template <typename Integral>
 constexpr inline bool isPowerOfTwo(Integral x) noexcept {
    return (!(x & (x - 1)) && x);
 }


 #endif /* UTILS_HPP_ */
--- a/homework_3/test/gtest/gtest/gtest-all.cpp
+++ b/homework_3/test/gtest/gtest/gtest-all.cpp
--- a/homework_3/test/gtest/gtest/gtest.h
+++ b/homework_3/test/gtest/gtest/gtest.h
--- a/homework_3/test/tests.cpp
+++ b/homework_3/test/tests.cpp
@@ -0,0 +1,34 @@
 /**
 * \file
 * \brief   PDS HW3 tests
 *
 * To run these test execute:
 *  ...
 *
 * \author
 *    Christos Choutouridis AEM:8997
 *    <cchoutou@ece.auth.gr>
 */

 #include <gtest/gtest.h>

 /*
 * Global fixtures
 */

 class TCUDAbitonic : public ::testing::Test {
 protected:
    static void SetUpTestSuite() { }

    static void TearDownTestSuite() { }
 };


 /*
 * MPI: SysTest (acceptance)
 * Each process executes distBubbletonic for uin8_t [16]
 */
 TEST_F(TCUDAbitonic, test1) {

    EXPECT_EQ(true, true);
 }