HW2: RC3b - A MinMax-MPIexchange pipeline and small changes

2 週之前 · 3bf4522448
--- a/homework_2/Makefile
+++ b/homework_2/Makefile
@@ -42,7 +42,7 @@ BUILD_DIR       := bin
 OBJ_DIR         := $(BUILD_DIR)/obj
 DEP_DIR         := $(BUILD_DIR)/.dep

 OUTPUT_DIR      := out-rc3a
 OUTPUT_DIR      := out-rc3b

 # ========== Compiler settings ==========
 # Compiler flags for debug and release
--- a/homework_2/hpc/N1P2T4Q20.sh
+++ b/homework_2/hpc/N1P2T4Q20.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 20 --perf --validation
 srun ./out-rc3b/distbitonic -q 20 --perf --validation
--- a/homework_2/hpc/N1P2T4Q23.sh
+++ b/homework_2/hpc/N1P2T4Q23.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 23 --perf --validation
 srun ./out-rc3b/distbitonic -q 23 --perf --validation
--- a/homework_2/hpc/N1P2T4Q25.sh
+++ b/homework_2/hpc/N1P2T4Q25.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 25 --perf --validation
 srun ./out-rc3b/distbitonic -q 25 --perf --validation
--- a/homework_2/hpc/N1P2T4Q27.sh
+++ b/homework_2/hpc/N1P2T4Q27.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 27 --perf --validation
 srun ./out-rc3b/distbitonic -q 27 --perf --validation
--- a/homework_2/hpc/N1P4T4Q20.sh
+++ b/homework_2/hpc/N1P4T4Q20.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 20 --perf --validation
 srun ./out-rc3b/distbitonic -q 20 --perf --validation
--- a/homework_2/hpc/N1P4T4Q23.sh
+++ b/homework_2/hpc/N1P4T4Q23.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 23 --perf --validation
 srun ./out-rc3b/distbitonic -q 23 --perf --validation
--- a/homework_2/hpc/N1P4T4Q25.sh
+++ b/homework_2/hpc/N1P4T4Q25.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 25 --perf --validation
 srun ./out-rc3b/distbitonic -q 25 --perf --validation
--- a/homework_2/hpc/N1P4T4Q27.sh
+++ b/homework_2/hpc/N1P4T4Q27.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 27 --perf --validation
 srun ./out-rc3b/distbitonic -q 27 --perf --validation
--- a/homework_2/hpc/N2P4T4Q20.sh
+++ b/homework_2/hpc/N2P4T4Q20.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 20 --perf --validation
 srun ./out-rc3b/distbitonic -q 20 --perf --validation
--- a/homework_2/hpc/N2P4T4Q23.sh
+++ b/homework_2/hpc/N2P4T4Q23.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 23 --perf --validation
 srun ./out-rc3b/distbitonic -q 23 --perf --validation
--- a/homework_2/hpc/N2P4T4Q25.sh
+++ b/homework_2/hpc/N2P4T4Q25.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 25 --perf --validation
 srun ./out-rc3b/distbitonic -q 25 --perf --validation
--- a/homework_2/hpc/N2P4T4Q27.sh
+++ b/homework_2/hpc/N2P4T4Q27.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 27 --perf --validation
 srun ./out-rc3b/distbitonic -q 27 --perf --validation
--- a/homework_2/hpc/N4P16T4Q20.sh
+++ b/homework_2/hpc/N4P16T4Q20.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 20 --perf --validation
 srun ./out-rc3b/distbitonic -q 20 --perf --validation
--- a/homework_2/hpc/N4P16T4Q23.sh
+++ b/homework_2/hpc/N4P16T4Q23.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 23 --perf --validation
 srun ./out-rc3b/distbitonic -q 23 --perf --validation
--- a/homework_2/hpc/N4P16T4Q25.sh
+++ b/homework_2/hpc/N4P16T4Q25.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 25 --perf --validation
 srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8
--- a/homework_2/hpc/N4P16T4Q27.sh
+++ b/homework_2/hpc/N4P16T4Q27.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 27 --perf --validation
 srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8
--- a/homework_2/hpc/N4P32T4Q20.sh
+++ b/homework_2/hpc/N4P32T4Q20.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 20 --perf --validation
 srun ./out-rc3b/distbitonic -q 20 --perf --validation
--- a/homework_2/hpc/N4P32T4Q23.sh
+++ b/homework_2/hpc/N4P32T4Q23.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 23 --perf --validation
 srun ./out-rc3b/distbitonic -q 23 --perf --validation
--- a/homework_2/hpc/N4P32T4Q25.sh
+++ b/homework_2/hpc/N4P32T4Q25.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 25 --perf --validation
 srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8
--- a/homework_2/hpc/N4P32T4Q27.sh
+++ b/homework_2/hpc/N4P32T4Q27.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 27 --perf --validation
 srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8
--- a/homework_2/hpc/N4P4T4Q20.sh
+++ b/homework_2/hpc/N4P4T4Q20.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 20 --perf --validation
 srun ./out-rc3b/distbitonic -q 20 --perf --validation
--- a/homework_2/hpc/N4P4T4Q23.sh
+++ b/homework_2/hpc/N4P4T4Q23.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 23 --perf --validation
 srun ./out-rc3b/distbitonic -q 23 --perf --validation
--- a/homework_2/hpc/N4P4T4Q25.sh
+++ b/homework_2/hpc/N4P4T4Q25.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 25 --perf --validation
 srun ./out-rc3b/distbitonic -q 25 --perf --validation
--- a/homework_2/hpc/N4P4T4Q27.sh
+++ b/homework_2/hpc/N4P4T4Q27.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 27 --perf --validation
 srun ./out-rc3b/distbitonic -q 27 --perf --validation
--- a/homework_2/hpc/N4P8T4Q20.sh
+++ b/homework_2/hpc/N4P8T4Q20.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 20 --perf --validation
 srun ./out-rc3b/distbitonic -q 20 --perf --validation
--- a/homework_2/hpc/N4P8T4Q23.sh
+++ b/homework_2/hpc/N4P8T4Q23.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 23 --perf --validation
 srun ./out-rc3b/distbitonic -q 23 --perf --validation
--- a/homework_2/hpc/N4P8T4Q25.sh
+++ b/homework_2/hpc/N4P8T4Q25.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 25 --perf --validation
 srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8
--- a/homework_2/hpc/N4P8T4Q27.sh
+++ b/homework_2/hpc/N4P8T4Q27.sh
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

 srun ./out-rc3a/distbitonic -q 27 --perf --validation
 srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8
--- a/homework_2/include/config.h
+++ b/homework_2/include/config.h
@@ -25,7 +25,14 @@
 #endif

 // Default Data size (in case -q <N> is not present)
 #define DEFAULT_DATA_SIZE (1 << 16)
 static constexpr size_t DEFAULT_DATA_SIZE   = 1 << 16;

 // The maximum MPI size we support
 static constexpr size_t MAX_MPI_SIZE        = 1024UL;

 // The maximum pipeline size we support
 static constexpr size_t MAX_PIPELINE_SIZE   = 64UL;


 /*!
 * Value type selection
@@ -46,6 +53,7 @@ using   distValue_t = uint32_t;
 */
 struct config_t {
    size_t  arraySize{DEFAULT_DATA_SIZE};   //!< The array size of the local data to sort.
    size_t  pipeline{1UL};                  //!< Pipeline stages
    bool    validation{false};              //!< Request a full validation at the end, performed by process rank 0.
    bool    ndebug{false};                  //!< Skips debug trap on DEBUG builds.
    bool    perf{false};                    //!< Enable performance timing measurements and prints.
--- a/homework_2/include/distsort.hpp
+++ b/homework_2/include/distsort.hpp
@@ -233,24 +233,24 @@ void elbowSort(ShadowedDataT& data, bool ascending) noexcept {


 /*!
 * Takes two sorted sequences where one is in increasing and the other is in decreasing order
 * and selects either the larger or the smaller items in one-to-one comparison between them.
 * The result is a bitonic sequence.
 * Takes two sequences and selects either the larger or the smaller items
 * in one-to-one comparison between them. If the initial sequences are bitonic, then
 * the result is a bitonic sequence too!
 *
 * @tparam RangeT   A range type with random access iterator
 * @tparam ValueT   The underlying type of the sequences
 *
 * @param local     [RangeT]        Reference to the local sequence
 * @param remote    [const RangeT]  Reference to the remote sequence (copied locally by MPI)
 * @param local     [ValueT*]       Pointer to the local sequence
 * @param remote    [const ValueT*] Pointer to the remote sequence (copied locally by MPI)
 * @param count     [size_t]        The number of items to process
 * @param keepSmall [bool]          Flag to indicate if we keep the small items in local sequence
 */
 template<typename RangeT>
 void keepMinOrMax(RangeT& local, const RangeT& remote, bool keepSmall) noexcept {
    using value_t = typename RangeT::value_type;
 template<typename ValueT>
 void keepMinOrMax(ValueT* local, const ValueT* remote, size_t count, bool keepSmall) noexcept {
    std::transform(
            local.begin(), local.end(),
            remote.begin(),
            local.begin(),
            [&keepSmall](const value_t& a, const value_t& b){
            local, local + count,
            remote,
            local,
            [&keepSmall](const ValueT& a, const ValueT& b){
                return (keepSmall) ? std::min(a, b) : std::max(a, b);
            });
 }
@@ -259,6 +259,60 @@ void keepMinOrMax(RangeT& local, const RangeT& remote, bool keepSmall) noexcept
 * ============================== Sort algorithms ==============================
 */

 /*!
 * A small tag generator tool that provides a consistent encoding for the tags
 * used in MPI communication.
 *
 * The three arguments are packed in separate bit-fields of the returned value,
 * with @c stage in the least significant bits, @c step above it and @c depth on top.
 * The width of the fields derives from @c MAX_PIPELINE_SIZE and @c MAX_MPI_SIZE.
 *
 * @param depth     The current algorithmic depth[bitonic] of the communication, if any
 * @param step      The current step on the current depth
 * @param stage     The stage of the pipeline (defaults to 0).
 * @return          The tag to use.
 *
 * @note
 *      In case we call this function outside of the pipeline loop, we can omit the
 *      @c stage argument and use the return value as the starting tag for every
 *      communication of the pipeline loop. We still need to increase the tag for
 *      each communication of the pipeline loop though!
 */
 size_t tagGenerator(size_t depth, size_t step, size_t stage = 0);

 /*!
 * A pipeline loop for mixing the min-max process with the MPI data exchange.
 *
 * The buffer is split in @c config.pipeline chunks. While the min-max of the chunk that
 * has just arrived is processed, the exchange of the next chunk is already in flight.
 *
 * @tparam ShadowedDataT    A Shadowed buffer type with random access iterator.
 *
 * @param data      [ShadowedDataT&]    Reference to the data to exchange
 * @param partner   [mpi_id_t]          The partner for the exchange
 * @param keepSmall [bool]              Flag to indicate if we keep the small values
 * @param tag       [int]               The init tag to use for the loop.
 *
 * @note
 *      The @c tag is increased inside the pipeline loop for each different data exchange
 * @note
 *      When the data size is not a multiple of the pipeline size, the last stage also
 *      carries the remaining items, so the entire buffer is always exchanged.
 *      Both partners are assumed to have the same data size and pipeline configuration.
 */
 template<typename ShadowedDataT>
 void exchangePipeline(ShadowedDataT& data, mpi_id_t partner, bool keepSmall, int tag) {
    using Value_t = typename ShadowedDataT::value_type;

    // Init counters. A zero pipeline is treated as a single stage to avoid division by zero
    const size_t stages = (config.pipeline != 0) ? config.pipeline : 1;
    const size_t total  = data.size();
    const size_t chunk  = total / stages;

    // Items of a stage. The last stage gets the chunk plus any remainder of the division
    auto stageCount = [&](size_t stage) -> size_t {
        return (stage + 1 == stages) ? (total - chunk * stage) : chunk;
    };

    // Init pointers
    Value_t* active = data.getActive().data();
    Value_t* shadow = data.getShadow().data();

    // Pipeline
    Texchange.start();
    mpi.exchange_start(active, shadow, stageCount(0), partner, tag);
    for (size_t stage = 0 ; stage < stages ; ++stage) {
        const size_t count = stageCount(stage);
        // Wait previous chunk
        mpi.exchange_wait();  Texchange.stop();
        if (stage + 1 < stages) {
            // Start next chunk if there is a next one
            Texchange.start();
            mpi.exchange_start(active + count, shadow + count, stageCount(stage + 1), partner, ++tag);
        }
        // process the arrived data
        timeCall(Tminmax, keepMinOrMax, active, shadow, count, keepSmall);
        // Advance to the chunk that is now in flight
        active += count;
        shadow += count;
    }
 }

 /*!
 * A distributed version of the Bubbletonic sort algorithm.
 *
@@ -284,9 +338,8 @@ void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) {
        if ( isActive(rank, Processes) &&
             isActive(part, Processes) ) {
            // Exchange with partner, keep min-or-max and sort - O(N)
            int tag = static_cast<int>(step);
            timeCall(Texchange, mpi.exchange_data, data.getActive(), data.getShadow(), part, tag);
            timeCall(Tminmax, keepMinOrMax, data.getActive(), data.getShadow(), ks);
            int tag = static_cast<int>(tagGenerator(0, step));
            exchangePipeline(data, part, ks, tag);
            timeCall(TelbowSort, elbowSort, data, ascending<SortMode::Bubbletonic>(rank, Processes));
        }
    }
@@ -324,9 +377,8 @@ void distBitonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) {
            auto part = partner<SortMode::Bitonic>(rank, step);
            auto ks = keepSmall<SortMode::Bitonic>(rank, part, depth);
            // Exchange with partner, keep min-or-max
            int tag = static_cast<int>( (2*p*depth) + step );
            timeCall(Texchange, mpi.exchange_data, data.getActive(), data.getShadow(), part, tag);
            timeCall(Tminmax, keepMinOrMax, data.getActive(), data.getShadow(), ks);
            int tag = static_cast<int>(tagGenerator(depth, step));
            exchangePipeline(data, part, ks, tag);
        }
        // sort - O(N)
        timeCall(TelbowSort, elbowSort, data, ascending<SortMode::Bitonic>(rank, depth));
--- a/homework_2/include/utils.hpp
+++ b/homework_2/include/utils.hpp
@@ -65,6 +65,10 @@ struct MPI_t {
            mpi_throw(err, "(MPI) MPI_Comm_rank() - ");
        size_ = static_cast<ID_t>(size_value);
        rank_ = static_cast<ID_t>(rank_value);
        if (size_ > static_cast<ID_t>(MAX_MPI_SIZE))
            throw std::runtime_error(
                    "(MPI) size - Not supported number of nodes [over " + std::to_string(MAX_MPI_SIZE) + "]\n"
            );

        // Get the name of the processor
        char processor_name[MPI_MAX_PROCESSOR_NAME];
@@ -74,63 +78,56 @@ struct MPI_t {
        name_ = std::string (processor_name, name_len);
    }


    /*!
     * Exchange data with partner as part of the sorting network of both bubbletonic or bitonic
     * sorting algorithms.
     * Initiate a data exchange with partner using non-blocking Isend-Irecv, as part of the
     * sorting network of both bubbletonic or bitonic sorting algorithms.
     *
     * This function matches a transmit and a receive in order for fully exchanged data between
     * current node and partner.
     * @note
     *      This call MUST be paired with exchange_wait() for each MPI_t object.
     *      Calling 2 consecutive exchange_start() for the same MPI_t object is undefined.
     *
     * @tparam T        The inner valur type used in buffer
     * @tparam ValueT   The underlying value type used in buffers
     *
     * @param ldata     [std::vector<T>]    Reference to local data to send
     * @param rdata     [std::vector<T>]    Reference to buffer to receive data from partner
     * @param partner   [mpi_id_t]          The partner for the exchange
     * @param tag       [int]               The tag to use for the MPI communication
     * @param ldata     [const ValueT*] Pointer to local data to send
     * @param rdata     [ValueT*]       Pointer to buffer to receive data from partner
     * @param count     [size_t]        The number of data to exchange
     * @param partner   [mpi_id_t]      The partner for the exchange
     * @param tag       [int]           The tag to use for the MPI communication
     */
    template<typename T>
    void exchange_data(const std::vector<T>& ldata, std::vector<T>& rdata, ID_t partner, int tag) {
    template<typename ValueT>
    void exchange_start(const ValueT* ldata, ValueT* rdata, size_t count, ID_t partner, int tag) {
        if (tag < 0)
            throw std::runtime_error("(MPI) exchange_data() [tag] - Out of bound");

        MPI_Datatype datatype = MPI_TypeMapper<T>::getType();
        int count = static_cast<int>(ldata.size());
        MPI_Status status;
        MPI_Datatype datatype = MPI_TypeMapper<ValueT>::getType();
        int err;
        if ((err = MPI_Sendrecv(
                ldata.data(), count, datatype, partner, tag,
                rdata.data(), count, datatype, partner, tag,
                 MPI_COMM_WORLD, &status
        )) != MPI_SUCCESS)
            mpi_throw(err, "(MPI) MPI_Sendrecv() [data] - ");
        err = MPI_Isend(ldata, count, datatype, partner, tag, MPI_COMM_WORLD, &handle_tx);
        if (err != MPI_SUCCESS)
            mpi_throw(err, "(MPI) MPI_Isend() - ");
        err = MPI_Irecv(rdata, count, datatype, partner, tag, MPI_COMM_WORLD, &handle_rx);
        if (err != MPI_SUCCESS)
            mpi_throw(err, "(MPI) MPI_Irecv() - ");
    }

    /*!
     * Exchange a data object with partner as part of the sorting network of both bubbletonic
     * or bitonic sorting algorithms.
     *
     * This function matches a transmit and a receive in order for fully exchanged the data object
     * between current node and partner.
     * Block wait for the completion of the previously called exchange_start()
     *
     * @tparam T        The object type
     *
     * @param local     [const T&]  Reference to the local object to send
     * @param remote    [T&]        Reference to the object to receive data from partner
     * @param partner   [mpi_id_t]  The partner for the exchange
     * @param tag       [int]       The tag to use for the MPI communication
     * @note
     *      This call MUST be paired with exchange_start() for each MPI_t object.
     *      Calling 2 consecutive exchange_wait() for the same MPI_t object is undefined.
     */
    template<typename T>
    void exchange_it(const T& local, T& remote, ID_t partner, int tag) {
        if (tag < 0)
            throw std::runtime_error("(MPI) exchange_it() [tag] - Out of bound");
    void exchange_wait() {
        MPI_Status status;

        int err;
        if ((err = MPI_Sendrecv(
                &local, sizeof(T), MPI_BYTE, partner, tag,
                &remote, sizeof(T), MPI_BYTE, partner, tag,
                MPI_COMM_WORLD, &status
        )) != MPI_SUCCESS)
            mpi_throw(err, "(MPI) MPI_Sendrecv() [item] - ");
        if ((err = MPI_Wait(&handle_tx, &status)) != MPI_SUCCESS)
            mpi_throw(err, "(MPI) MPI_Wait() [send] - ");

        if ((err = MPI_Wait(&handle_rx, &status)) != MPI_SUCCESS)
            mpi_throw(err, "(MPI) MPI_Wait() [recv] - ");
    }

    // Accessors
@@ -181,6 +178,8 @@ private:
    ID_t size_{};           //!< MPI total size of the execution
    std::string name_{};    //!< The name of the local machine
    bool initialized_{};    //!< RAII helper flag
    MPI_Request handle_tx{};    //!< MPI async exchange handler for Transmission
    MPI_Request handle_rx{};    //!< MPI async exchange handler for Receptions
 };

 /*
@@ -377,9 +376,13 @@ struct Timing {
        else if (std::chrono::duration_cast<milliseconds>(duration_).count() < 10000)
            std::cout << "[Timing] (Rank " << rank << ") " << what << ": "
                      << std::to_string(std::chrono::duration_cast<milliseconds>(duration_).count()) << " [msec]\n";
        else
            std::cout << "[Timing] (Rank " << rank << ") " << what << ": "
                      << std::to_string(std::chrono::duration_cast<seconds>(duration_).count()) << " [sec]\n";
        else {
            char stime[26]; // fit ulong
            auto sec  = std::chrono::duration_cast<seconds>(duration_).count();
            auto msec = (std::chrono::duration_cast<milliseconds>(duration_).count() % 1000) / 10;  // keep 2 digit
            std::sprintf(stime, "%ld.%1ld", sec, msec);
            std::cout << "[Timing] (Rank " << rank << ") " << what << ": " << stime << " [sec]\n";
        }

    }

@@ -402,4 +405,17 @@ private:
    Tim.stop();                     \


 /*!
 * A utility to check if a number is power of two
 *
 * Zero and negative numbers are never a power of two. The sign check is made first,
 * so that @c x-1 is never evaluated for the most negative value of a signed type,
 * where it would overflow.
 *
 * @tparam Integral     The integral type of the number to check
 * @param x             The number to check
 * @return              True if it is power of 2, false otherwise
 */
 template <typename Integral>
 constexpr inline bool isPowerOfTwo(Integral x) noexcept {
    // A power of two has exactly one bit set, thus x & (x-1) clears it and leaves zero
    return (x > 0) && !(x & (x - 1));
 }


 #endif /* UTILS_HPP_ */
--- a/homework_2/src/distsort.cpp
+++ b/homework_2/src/distsort.cpp
@@ -23,3 +23,13 @@ bool isActive(mpi_id_t node, size_t nodes) {
    return (node >= 0) && (node < static_cast<mpi_id_t>(nodes));
 }

 namespace {
 /*!
 * Number of bits needed to encode @c values distinct values, i.e. ceil(log2(values)).
 * Integer-only and usable at compile time.
 */
 constexpr uint32_t fieldBits(size_t values) noexcept {
    return (values <= 1) ? 0U : 1U + fieldBits((values + 1) / 2);
 }
 }  // namespace

 /*!
 * Packs depth, step and pipeline stage in separate bit-fields of a single tag.
 *
 * Layout (least significant first): [stage | step | depth]
 *
 * @param depth     The current algorithmic depth of the communication, if any
 * @param step      The current step on the current depth
 * @param stage     The stage of the pipeline
 * @return          The tag to use
 *
 * @note
 *      No range check is made here. A @c stage or @c step that does not fit in
 *      its field overlaps with the next one.
 *      NOTE(review): the caller casts the result to int for MPI; confirm that the largest
 *      expected tag does not exceed the MPI_TAG_UB of the MPI implementation in use.
 */
 size_t tagGenerator(size_t depth, size_t step, size_t stage) {
    // Field widths are fixed by the supported limits, so compute them at compile time
    constexpr uint32_t stage_bits = fieldBits(MAX_PIPELINE_SIZE);
    constexpr uint32_t step_bits  = fieldBits(MAX_MPI_SIZE);
    // ^ We use MPI_SIZE room for steps to fit the bubbletonic version

    return stage
           | (step << stage_bits)
           | (depth << (stage_bits + step_bits));
 }
--- a/homework_2/src/main.cpp
+++ b/homework_2/src/main.cpp
@@ -17,7 +17,7 @@
 #include "distsort.hpp"


 // Global config data
 // Global session data
 config_t        config;
 MPI_t<>         mpi;
 distBuffer_t    Data;
@@ -43,36 +43,49 @@ bool get_options(int argc, char* argv[]){
                status = false;
            }
        }
        else if (arg == "--pipeline") {
            if (i+1 < argc) {
                auto stages = atoi(argv[++i]);
                if (isPowerOfTwo(stages) && stages <= static_cast<int>(MAX_PIPELINE_SIZE))
                    config.pipeline = stages;
                else
                    status = false;
            }
            else {
                status = false;
            }
        }
        else if (arg == "--validation") {
            config.validation = true;
        }
        else if (arg == "--ndebug") {
            config.ndebug = true;
        }
        else if (arg == "--perf") {
            config.perf = true;
        }
        else if (arg == "--ndebug") {
            config.ndebug = true;
        }
        else if (arg == "-v" || arg == "--verbose") {
            config.verbose = true;
        }
        else if (arg == "-h" || arg == "--help") {
            std::cout << "distbitonic/distbubbletonic - A distributed bitonic sort\n\n";
            std::cout << "distbitonic -q <N> [--validation] [--ndebug] [-v]\n";
            std::cout << "distbitonic -q <N> [--pipeline N] [--validation] [--ndebug] [-v]\n";
            std::cout << "distbitonic -h\n";
            std::cout << "distbubbletonic -q <N> [--validation] [--ndebug] [-v]\n";
            std::cout << "distbubbletonic -q <N> [--pipeline N] [--validation] [--ndebug] [-v]\n";
            std::cout << "distbubbletonic -h\n";
            std::cout << '\n';
            std::cout << "Options:\n\n";
            std::cout << "   -q | --array-size <N>\n";
            std::cout << "      Selects the array size according to size = 2^N\n\n";
            std::cout << "   --par-sort\n";
            std::cout << "      Request a parallel full sorting algorithm\n\n";
            std::cout << "   --pipeline <N>\n";
            std::cout << "      Request a pipeline of <N> stages for exchange-minmax\n";
            std::cout << "      N must be power of 2 up to " << MAX_PIPELINE_SIZE << "\n\n";
            std::cout << "   --validation\n";
            std::cout << "      Request a full validation at the end, performed by process rank 0\n\n";
            std::cout << "   --perf\n";
            std::cout << "      Request performance timing measurements to stdout.\n\n";
            std::cout << "   --ndebug\n";
            std::cout << "      Skip debug breakpoint when on debug build.\n\n";
            std::cout << "   -t | --timing\n";
            std::cout << "      Request timing measurements output to stdout.\n\n";
            std::cout << "   -v | --verbose\n";
            std::cout << "      Request a more verbose output to stdout.\n\n";
            std::cout << "   -h | --help\n";
--- a/homework_2/test/tests_MPI.cpp
+++ b/homework_2/test/tests_MPI.cpp
@@ -126,6 +126,49 @@ TEST_F(TMPIdistSort, distBubbletonic_test2) {
    }
 }

 /*
 * MPI: SysTest (acceptance)
 * Each process executes distBubbletonic for uint32_t [1 << 16] with pipeline
 */
 TEST_F(TMPIdistSort, distBubbletonic_test3) {
    // Create and fill vector
    using tsValue_t = uint32_t;      // Test parameters
    size_t ts_buffer_size = 1 << 16;

    ShadowedVec_t<tsValue_t> ts_Data;
    std::uniform_int_distribution<tsValue_t > dis(
            std::numeric_limits<tsValue_t>::min(),
            std::numeric_limits<tsValue_t>::max()
    );
    ts_Data.resize(ts_buffer_size);
    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); });

    // Set pipeline for this test only. config is shared by all the tests, so
    // we keep the previous value in order to restore it after the call.
    const auto saved_pipeline = config.pipeline;
    config.pipeline = 8;

    // Execute function under test in all processes
    distBubbletonic(ts_Data, ts_mpi.size(), ts_mpi.rank());

    // Restore pipeline, so the following tests are not affected
    config.pipeline = saved_pipeline;

    // Local min and max
    auto local_min = *std::min_element(ts_Data.begin(), ts_Data.end());
    auto local_max = *std::max_element(ts_Data.begin(), ts_Data.end());

    // Gather min/max to rank 0
    std::vector<tsValue_t> global_mins(ts_mpi.size());
    std::vector<tsValue_t> global_maxes(ts_mpi.size());
    MPI_Datatype datatype = MPI_TypeMapper<tsValue_t>::getType();

    MPI_Gather(&local_min, 1, datatype, global_mins.data(), 1, datatype, 0, MPI_COMM_WORLD);
    MPI_Gather(&local_max, 1, datatype, global_maxes.data(), 1, datatype, 0, MPI_COMM_WORLD);

    // Check results: local data are sorted and, on rank 0, that the ranges
    // of consecutive processes do not overlap
    EXPECT_EQ(std::is_sorted(ts_Data.begin(), ts_Data.end()), true);
    if (ts_mpi.rank() == 0) {
        for (size_t i = 1; i < global_mins.size(); ++i) {
            EXPECT_LE(global_maxes[i - 1], global_mins[i]);
        }
    }
 }

 /*
 * MPI: SysTest (acceptance)
@@ -209,3 +252,46 @@ TEST_F(TMPIdistSort, distBitonic_test2) {
    }
 }

 /*
 * MPI: SysTest (acceptance)
 * Each process executes distBitonic for uint32_t [1 << 16] with pipeline
 */
 TEST_F(TMPIdistSort, distBitonic_test3) {
    // Create and fill vector
    using tsValue_t = uint32_t;      // Test parameters
    size_t ts_buffer_size = 1 << 16;

    ShadowedVec_t<tsValue_t> ts_Data;
    std::uniform_int_distribution<tsValue_t > dis(
            std::numeric_limits<tsValue_t>::min(),
            std::numeric_limits<tsValue_t>::max()
    );
    ts_Data.resize(ts_buffer_size);
    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); });

    // Set pipeline for this test only. config is shared by all the tests, so
    // we keep the previous value in order to restore it after the call.
    const auto saved_pipeline = config.pipeline;
    config.pipeline = 8;

    // Execute function under test in all processes
    distBitonic(ts_Data, ts_mpi.size(), ts_mpi.rank());

    // Restore pipeline, so the following tests are not affected
    config.pipeline = saved_pipeline;

    // Local min and max
    auto local_min = *std::min_element(ts_Data.begin(), ts_Data.end());
    auto local_max = *std::max_element(ts_Data.begin(), ts_Data.end());

    // Gather min/max to rank 0
    std::vector<tsValue_t> global_mins(ts_mpi.size());
    std::vector<tsValue_t> global_maxes(ts_mpi.size());
    MPI_Datatype datatype = MPI_TypeMapper<tsValue_t>::getType();

    MPI_Gather(&local_min, 1, datatype, global_mins.data(), 1, datatype, 0, MPI_COMM_WORLD);
    MPI_Gather(&local_max, 1, datatype, global_maxes.data(), 1, datatype, 0, MPI_COMM_WORLD);

    // Check results: local data are sorted and, on rank 0, that the ranges
    // of consecutive processes do not overlap
    EXPECT_EQ(std::is_sorted(ts_Data.begin(), ts_Data.end()), true);
    if (ts_mpi.rank() == 0) {
        for (size_t i = 1; i < global_mins.size(); ++i) {
            EXPECT_LE(global_maxes[i - 1], global_mins[i]);
        }
    }
 }