HW2: RC3b - A MinMax-MPIexchange pipeline and small changes

2025-01-05 21:26:56 +02:00 · 2025-01-05 21:26:56 +02:00 · 3bf4522448
commit 3bf4522448
parent c485d0db2d
35 changed files with 286 additions and 101 deletions
--- a/homework_2/Makefile
+++ b/homework_2/Makefile
@ -42,7 +42,7 @@ BUILD_DIR       := bin
 OBJ_DIR         := $(BUILD_DIR)/obj
 DEP_DIR         := $(BUILD_DIR)/.dep

-OUTPUT_DIR      := out-rc3a
+OUTPUT_DIR      := out-rc3b

 # ========== Compiler settings ==========
 # Compiler flags for debug and release
--- a/homework_2/hpc/N1P2T4Q20.sh
+++ b/homework_2/hpc/N1P2T4Q20.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 20 --perf --validation
+srun ./out-rc3b/distbitonic -q 20 --perf --validation
--- a/homework_2/hpc/N1P2T4Q23.sh
+++ b/homework_2/hpc/N1P2T4Q23.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 23 --perf --validation
+srun ./out-rc3b/distbitonic -q 23 --perf --validation
--- a/homework_2/hpc/N1P2T4Q25.sh
+++ b/homework_2/hpc/N1P2T4Q25.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 25 --perf --validation
+srun ./out-rc3b/distbitonic -q 25 --perf --validation
--- a/homework_2/hpc/N1P2T4Q27.sh
+++ b/homework_2/hpc/N1P2T4Q27.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 27 --perf --validation
+srun ./out-rc3b/distbitonic -q 27 --perf --validation
--- a/homework_2/hpc/N1P4T4Q20.sh
+++ b/homework_2/hpc/N1P4T4Q20.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 20 --perf --validation
+srun ./out-rc3b/distbitonic -q 20 --perf --validation
--- a/homework_2/hpc/N1P4T4Q23.sh
+++ b/homework_2/hpc/N1P4T4Q23.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 23 --perf --validation
+srun ./out-rc3b/distbitonic -q 23 --perf --validation
--- a/homework_2/hpc/N1P4T4Q25.sh
+++ b/homework_2/hpc/N1P4T4Q25.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 25 --perf --validation
+srun ./out-rc3b/distbitonic -q 25 --perf --validation
--- a/homework_2/hpc/N1P4T4Q27.sh
+++ b/homework_2/hpc/N1P4T4Q27.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 27 --perf --validation
+srun ./out-rc3b/distbitonic -q 27 --perf --validation
--- a/homework_2/hpc/N2P4T4Q20.sh
+++ b/homework_2/hpc/N2P4T4Q20.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 20 --perf --validation
+srun ./out-rc3b/distbitonic -q 20 --perf --validation
--- a/homework_2/hpc/N2P4T4Q23.sh
+++ b/homework_2/hpc/N2P4T4Q23.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 23 --perf --validation
+srun ./out-rc3b/distbitonic -q 23 --perf --validation
--- a/homework_2/hpc/N2P4T4Q25.sh
+++ b/homework_2/hpc/N2P4T4Q25.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 25 --perf --validation
+srun ./out-rc3b/distbitonic -q 25 --perf --validation
--- a/homework_2/hpc/N2P4T4Q27.sh
+++ b/homework_2/hpc/N2P4T4Q27.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 27 --perf --validation
+srun ./out-rc3b/distbitonic -q 27 --perf --validation
--- a/homework_2/hpc/N4P16T4Q20.sh
+++ b/homework_2/hpc/N4P16T4Q20.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 20 --perf --validation
+srun ./out-rc3b/distbitonic -q 20 --perf --validation
--- a/homework_2/hpc/N4P16T4Q23.sh
+++ b/homework_2/hpc/N4P16T4Q23.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 23 --perf --validation
+srun ./out-rc3b/distbitonic -q 23 --perf --validation
--- a/homework_2/hpc/N4P16T4Q25.sh
+++ b/homework_2/hpc/N4P16T4Q25.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 25 --perf --validation
+srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8
--- a/homework_2/hpc/N4P16T4Q27.sh
+++ b/homework_2/hpc/N4P16T4Q27.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 27 --perf --validation
+srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8
--- a/homework_2/hpc/N4P32T4Q20.sh
+++ b/homework_2/hpc/N4P32T4Q20.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 20 --perf --validation
+srun ./out-rc3b/distbitonic -q 20 --perf --validation
--- a/homework_2/hpc/N4P32T4Q23.sh
+++ b/homework_2/hpc/N4P32T4Q23.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 23 --perf --validation
+srun ./out-rc3b/distbitonic -q 23 --perf --validation
--- a/homework_2/hpc/N4P32T4Q25.sh
+++ b/homework_2/hpc/N4P32T4Q25.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 25 --perf --validation
+srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8
--- a/homework_2/hpc/N4P32T4Q27.sh
+++ b/homework_2/hpc/N4P32T4Q27.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 27 --perf --validation
+srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8
--- a/homework_2/hpc/N4P4T4Q20.sh
+++ b/homework_2/hpc/N4P4T4Q20.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 20 --perf --validation
+srun ./out-rc3b/distbitonic -q 20 --perf --validation
--- a/homework_2/hpc/N4P4T4Q23.sh
+++ b/homework_2/hpc/N4P4T4Q23.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 23 --perf --validation
+srun ./out-rc3b/distbitonic -q 23 --perf --validation
--- a/homework_2/hpc/N4P4T4Q25.sh
+++ b/homework_2/hpc/N4P4T4Q25.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 25 --perf --validation
+srun ./out-rc3b/distbitonic -q 25 --perf --validation
--- a/homework_2/hpc/N4P4T4Q27.sh
+++ b/homework_2/hpc/N4P4T4Q27.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 27 --perf --validation
+srun ./out-rc3b/distbitonic -q 27 --perf --validation
--- a/homework_2/hpc/N4P8T4Q20.sh
+++ b/homework_2/hpc/N4P8T4Q20.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 20 --perf --validation
+srun ./out-rc3b/distbitonic -q 20 --perf --validation
--- a/homework_2/hpc/N4P8T4Q23.sh
+++ b/homework_2/hpc/N4P8T4Q23.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 23 --perf --validation
+srun ./out-rc3b/distbitonic -q 23 --perf --validation
--- a/homework_2/hpc/N4P8T4Q25.sh
+++ b/homework_2/hpc/N4P8T4Q25.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 25 --perf --validation
+srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8
--- a/homework_2/hpc/N4P8T4Q27.sh
+++ b/homework_2/hpc/N4P8T4Q27.sh
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0

-srun ./out-rc3a/distbitonic -q 27 --perf --validation
+srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8
--- a/homework_2/include/config.h
+++ b/homework_2/include/config.h
@ -25,7 +25,14 @@
 #endif

 // Default Data size (in case -q <N> is not present)
-#define DEFAULT_DATA_SIZE (1 << 16)
+static constexpr size_t DEFAULT_DATA_SIZE   = 1 << 16;
+
+// The maximum MPI size we support
+static constexpr size_t MAX_MPI_SIZE        = 1024UL;
+
+// The maximum pipeline size we support
+static constexpr size_t MAX_PIPELINE_SIZE   = 64UL;
+

 /*!
 * Value type selection
@ -46,6 +53,7 @@ using   distValue_t = uint32_t;
 */
 struct config_t {
    size_t  arraySize{DEFAULT_DATA_SIZE};   //!< The array size of the local data to sort.
+    size_t  pipeline{1UL};                  //!< Pipeline stages
    bool    validation{false};              //!< Request a full validation at the end, performed by process rank 0.
    bool    ndebug{false};                  //!< Skips debug trap on DEBUG builds.
    bool    perf{false};                    //!< Enable performance timing measurements and prints.
--- a/homework_2/include/distsort.hpp
+++ b/homework_2/include/distsort.hpp
@ -233,24 +233,24 @@ void elbowSort(ShadowedDataT& data, bool ascending) noexcept {


 /*!
- * Takes two sorted sequences where one is in increasing and the other is in decreasing order
- * and selects either the larger or the smaller items in one-to-one comparison between them.
- * The result is a bitonic sequence.
+ * Takes two sequences and selects either the larger or the smaller items
+ * in one-to-one comparison between them. If the initial sequences are bitonic, then
+ * the result is a bitonic sequence too!
 *
- * @tparam RangeT   A range type with random access iterator
+ * @tparam ValueT   The underlying type of the sequences
 *
- * @param local     [RangeT]        Reference to the local sequence
- * @param remote    [const RangeT]  Reference to the remote sequence (copied locally by MPI)
+ * @param local     [ValueT*]       Pointer to the local sequence
+ * @param remote    [const ValueT*] Pointer to the remote sequence (copied locally by MPI)
+ * @param count     [size_t]        The number of items to process
 * @param keepSmall [bool]          Flag to indicate if we keep the small items in local sequence
 */
-template<typename RangeT>
-void keepMinOrMax(RangeT& local, const RangeT& remote, bool keepSmall) noexcept {
-    using value_t = typename RangeT::value_type;
+template<typename ValueT>
+void keepMinOrMax(ValueT* local, const ValueT* remote, size_t count, bool keepSmall) noexcept {
    std::transform(
-            local.begin(), local.end(),
-            remote.begin(),
-            local.begin(),
-            [&keepSmall](const value_t& a, const value_t& b){
+            local, local + count,
+            remote,
+            local,
+            [&keepSmall](const ValueT& a, const ValueT& b){
                return (keepSmall) ? std::min(a, b) : std::max(a, b);
            });
 }
@ -259,6 +259,60 @@ void keepMinOrMax(RangeT& local, const RangeT& remote, bool keepSmall) noexcept
 * ============================== Sort algorithms ==============================
 */

+/*!
+ * A small tag generator tool to provide consistent encoding to tag communication
+ *
+ * @param depth     The current algorithmic depth[bitonic] of the communication, if any
+ * @param step      The current step on the current depth
+ * @param stage     The stage of the pipeline.
+ * @return          The tag to use.
+ *
+ * @note
+ *      In case we call this function outside of the pipeline loop, we can omit
+ *      @c stage argument and use the return value as starting tag for every communication
+ *      of the pipeline loop. We need to increase the tags for each communication of
+ *      the pipeline loop though!
+ */
+size_t tagGenerator(size_t depth, size_t step, size_t stage = 0);
+
+/*!
+ * A pipeline loop for mixing min-max process with mpi data exchange
+ *
+ * The active and shadow buffers are walked in config.pipeline chunks of
+ * data.size() / config.pipeline items each. The exchange of the next chunk is
+ * started before keepMinOrMax() is called on the chunk that has just arrived.
+ * Because the chunk size comes from an integer division, items past
+ * count * config.pipeline are neither exchanged nor processed by this loop.
+ *
+ * @tparam ShadowedDataT    A Shadowed buffer type with random access iterator.
+ *
+ * @param data      [ShadowedDataT&]    Reference to the data to exchange
+ * @param partner   [mpi_id_t]          The partner for the exchange
+ * @param keepSmall [bool]              Flag to indicate if we keep the small values
+ * @param tag       [int]               The init tag to use for the loop.
+ *
+ * @note
+ *      The @c tag is increased inside the pipeline loop for each different data exchange
+ * @note
+ *      Texchange is started before each exchange_start() and stopped right after the
+ *      matching exchange_wait(), so it is still running while the previous chunk is
+ *      processed.
+ */
+template<typename ShadowedDataT>
+void exchangePipeline(ShadowedDataT& data, mpi_id_t partner, bool keepSmall, int tag) {
+    using Value_t = typename ShadowedDataT::value_type;
+
+    // Init counters and pointers (count is the number of items of one pipeline chunk)
+    size_t    count = data.size() / config.pipeline;
+    Value_t* active = data.getActive().data();
+    Value_t* shadow = data.getShadow().data();
+
+    // Pipeline: the first chunk is started here, every other chunk inside the loop
+    Texchange.start();
+    mpi.exchange_start(active, shadow, count, partner, tag);
+    for (size_t stage = 0 ; stage < config.pipeline ; active += count, shadow += count) {
+        // Wait previous chunk (the one started before the loop or in the previous iteration)
+        mpi.exchange_wait();  Texchange.stop();
+        if (++stage < config.pipeline) {
+            // Start next chunk if there is a next one. The pointers still address the chunk
+            // that has just arrived, hence the "+ count"; each chunk uses its own tag.
+            Texchange.start();
+            mpi.exchange_start(active + count, shadow + count, count, partner, ++tag);
+        }
+        // process the arrived data, the min-or-max result is written to the active buffer
+        timeCall(Tminmax, keepMinOrMax, active, shadow, count, keepSmall);
+    }
+}
+
 /*!
 * A distributed version of the Bubbletonic sort algorithm.
 *
@ -284,9 +338,8 @@ void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) {
        if ( isActive(rank, Processes) &&
             isActive(part, Processes) ) {
            // Exchange with partner, keep nim-or-max and sort - O(N)
-            int tag = static_cast<int>(step);
-            timeCall(Texchange, mpi.exchange_data, data.getActive(), data.getShadow(), part, tag);
-            timeCall(Tminmax, keepMinOrMax, data.getActive(), data.getShadow(), ks);
+            int tag = static_cast<int>(tagGenerator(0, step));
+            exchangePipeline(data, part, ks, tag);
            timeCall(TelbowSort, elbowSort, data, ascending<SortMode::Bubbletonic>(rank, Processes));
        }
    }
@ -324,9 +377,8 @@ void distBitonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) {
            auto part = partner<SortMode::Bitonic>(rank, step);
            auto ks = keepSmall<SortMode::Bitonic>(rank, part, depth);
            // Exchange with partner, keep nim-or-max
-            int tag = static_cast<int>( (2*p*depth) + step );
-            timeCall(Texchange, mpi.exchange_data, data.getActive(), data.getShadow(), part, tag);
-            timeCall(Tminmax, keepMinOrMax, data.getActive(), data.getShadow(), ks);
+            int tag = static_cast<int>(tagGenerator(depth, step));
+            exchangePipeline(data, part, ks, tag);
        }
        // sort - O(N)
        timeCall(TelbowSort, elbowSort, data, ascending<SortMode::Bitonic>(rank, depth));
--- a/homework_2/include/utils.hpp
+++ b/homework_2/include/utils.hpp
@ -65,6 +65,10 @@ struct MPI_t {
            mpi_throw(err, "(MPI) MPI_Comm_rank() - ");
        size_ = static_cast<ID_t>(size_value);
        rank_ = static_cast<ID_t>(rank_value);
+        if (size_ > static_cast<ID_t>(MAX_MPI_SIZE))
+            throw std::runtime_error(
+                    "(MPI) size - Not supported number of nodes [over " + std::to_string(MAX_MPI_SIZE) + "]\n"
+            );

        // Get the name of the processor
        char processor_name[MPI_MAX_PROCESSOR_NAME];
@ -74,63 +78,56 @@ struct MPI_t {
        name_ = std::string (processor_name, name_len);
    }

+
    /*!
-     * Exchange data with partner as part of the sorting network of both bubbletonic or bitonic
-     * sorting algorithms.
+     * Initiate a data exchange with partner using non-blocking Isend-Irecv, as part of the
+     * sorting network of both bubbletonic and bitonic sorting algorithms.
     *
     * This function matches a transmit and a receive in order for fully exchanged data between
     * current node and partner.
+     * @note
+     *      This call MUST be paired with exchange_wait() for each MPI_t object.
+     *      Calling 2 consecutive exchange_start() for the same MPI_t object is undefined.
     *
-     * @tparam T        The inner valur type used in buffer
+     * @tparam ValueT   The underlying value type used in buffers
     *
-     * @param ldata     [std::vector<T>]    Reference to local data to send
-     * @param rdata     [std::vector<T>]    Reference to buffer to receive data from partner
-     * @param partner   [mpi_id_t]          The partner for the exchange
-     * @param tag       [int]               The tag to use for the MPI communication
+     * @param ldata     [const ValueT*] Pointer to local data to send
+     * @param rdata     [ValueT*]       Pointer to buffer to receive data from partner
+     * @param count     [size_t]        The number of data to exchange
+     * @param partner   [mpi_id_t]      The partner for the exchange
+     * @param tag       [int]           The tag to use for the MPI communication
     */
-    template<typename T>
-    void exchange_data(const std::vector<T>& ldata, std::vector<T>& rdata, ID_t partner, int tag) {
+    template<typename ValueT>
+    void exchange_start(const ValueT* ldata, ValueT* rdata, size_t count, ID_t partner, int tag) {
        if (tag < 0)
            throw std::runtime_error("(MPI) exchange_data() [tag] - Out of bound");

-        MPI_Datatype datatype = MPI_TypeMapper<T>::getType();
-        int count = static_cast<int>(ldata.size());
-        MPI_Status status;
+        MPI_Datatype datatype = MPI_TypeMapper<ValueT>::getType();
        int err;
-        if ((err = MPI_Sendrecv(
-                ldata.data(), count, datatype, partner, tag,
-                rdata.data(), count, datatype, partner, tag,
-                 MPI_COMM_WORLD, &status
-        )) != MPI_SUCCESS)
-            mpi_throw(err, "(MPI) MPI_Sendrecv() [data] - ");
+        err = MPI_Isend(ldata, count, datatype, partner, tag, MPI_COMM_WORLD, &handle_tx);
+        if (err != MPI_SUCCESS)
+            mpi_throw(err, "(MPI) MPI_Isend() - ");
+        err = MPI_Irecv(rdata, count, datatype, partner, tag, MPI_COMM_WORLD, &handle_rx);
+        if (err != MPI_SUCCESS)
+            mpi_throw(err, "(MPI) MPI_Irecv() - ");
    }

    /*!
-     * Exchange a data object with partner as part of the sorting network of both bubbletonic
-     * or bitonic sorting algorithms.
+     * Block wait for the completion of the previously called exchange_start()
     *
-     * This function matches a transmit and a receive in order for fully exchanged the data object
-     * between current node and partner.
-     *
-     * @tparam T        The object type
-     *
-     * @param local     [const T&]  Reference to the local object to send
-     * @param remote    [T&]        Reference to the object to receive data from partner
-     * @param partner   [mpi_id_t]  The partner for the exchange
-     * @param tag       [int]       The tag to use for the MPI communication
+     * @note
+     *      This call MUST be paired with exchange_start() for each MPI_t object.
+     *      Calling 2 consecutive exchange_wait() for the same MPI_t object is undefined.
     */
-    template<typename T>
-    void exchange_it(const T& local, T& remote, ID_t partner, int tag) {
-        if (tag < 0)
-            throw std::runtime_error("(MPI) exchange_it() [tag] - Out of bound");
+    void exchange_wait() {
        MPI_Status status;
+
        int err;
-        if ((err = MPI_Sendrecv(
-                &local, sizeof(T), MPI_BYTE, partner, tag,
-                &remote, sizeof(T), MPI_BYTE, partner, tag,
-                MPI_COMM_WORLD, &status
-        )) != MPI_SUCCESS)
-            mpi_throw(err, "(MPI) MPI_Sendrecv() [item] - ");
+        if ((err = MPI_Wait(&handle_tx, &status)) != MPI_SUCCESS)
+            mpi_throw(err, "(MPI) MPI_Wait() [send] - ");
+
+        if ((err = MPI_Wait(&handle_rx, &status)) != MPI_SUCCESS)
+            mpi_throw(err, "(MPI) MPI_Wait() [recv] - ");
    }

    // Accessors
@ -181,6 +178,8 @@ private:
    ID_t size_{};           //!< MPI total size of the execution
    std::string name_{};    //!< The name of the local machine
    bool initialized_{};    //!< RAII helper flag
+    MPI_Request handle_tx{};    //!< MPI async exchange handler for Transmission
+    MPI_Request handle_rx{};    //!< MPI async exchange handler for Receptions
 };

 /*
@ -377,9 +376,13 @@ struct Timing {
        else if (std::chrono::duration_cast<milliseconds>(duration_).count() < 10000)
            std::cout << "[Timing] (Rank " << rank << ") " << what << ": "
                      << std::to_string(std::chrono::duration_cast<milliseconds>(duration_).count()) << " [msec]\n";
-        else
-            std::cout << "[Timing] (Rank " << rank << ") " << what << ": "
-                      << std::to_string(std::chrono::duration_cast<seconds>(duration_).count()) << " [sec]\n";
+        else {
+            // Durations of 10 sec and above are printed as seconds with two decimals.
+            // The fraction is kept in hundredths of a second and MUST be zero padded,
+            // otherwise 5.05 sec would be printed as "5.5".
+            char stime[32]; // fits any 64bit value, the dot, two decimals and the terminator
+            const auto sec  = static_cast<long long>(
+                    std::chrono::duration_cast<seconds>(duration_).count());
+            const auto csec = static_cast<long long>(
+                    (std::chrono::duration_cast<milliseconds>(duration_).count() % 1000) / 10);
+            // Cast to long long so the format matches whatever the chrono rep type is
+            std::snprintf(stime, sizeof(stime), "%lld.%02lld", sec, csec);
+            std::cout << "[Timing] (Rank " << rank << ") " << what << ": " << stime << " [sec]\n";
+        }

    }

@ -402,4 +405,17 @@ private:
    Tim.stop();                     \


+/*!
+ * A utility to check if a number is power of two
+ *
+ * The sign/zero check is evaluated first, so @c x - 1 is never computed for zero
+ * or negative values (for the most negative signed value that subtraction would
+ * overflow). A power of two has a single bit set, which @c x & (x - 1) clears.
+ *
+ * @tparam Integral     The integral type of the number to check
+ * @param x             The number to check
+ * @return              True if it is a positive power of 2, false otherwise
+ */
+template <typename Integral>
+constexpr inline bool isPowerOfTwo(Integral x) noexcept {
+    return (x > 0) && ((x & (x - 1)) == 0);
+}
+
+
 #endif /* UTILS_HPP_ */
--- a/homework_2/src/distsort.cpp
+++ b/homework_2/src/distsort.cpp
@ -23,3 +23,13 @@ bool isActive(mpi_id_t node, size_t nodes) {
    return (node >= 0) && (node < static_cast<mpi_id_t>(nodes));
 }

+/*!
+ * Builds the communication tag by packing the three counters in separate bit fields.
+ *
+ * Layout, starting from the least significant bit:
+ *      stage : log2(MAX_PIPELINE_SIZE) bits
+ *      step  : log2(MAX_MPI_SIZE) bits, room enough to fit the bubbletonic version
+ *      depth : the bits above them
+ *
+ * @param depth     The current algorithmic depth[bitonic] of the communication, if any
+ * @param step      The current step on the current depth
+ * @param stage     The stage of the pipeline
+ * @return          The tag to use
+ */
+size_t tagGenerator(size_t depth, size_t step, size_t stage) {
+    // Whole-number log2: how many times the value can be halved before it reaches 1
+    auto log2Floor = [](size_t value) {
+        size_t bits = 0;
+        while (value >>= 1)
+            ++bits;
+        return bits;
+    };
+    const size_t stageWidth = log2Floor(MAX_PIPELINE_SIZE);
+    const size_t stepWidth  = log2Floor(MAX_MPI_SIZE);
+
+    const size_t stepField  = step  << stageWidth;
+    const size_t depthField = depth << (stageWidth + stepWidth);
+    return depthField | stepField | stage;
+}
--- a/homework_2/src/main.cpp
+++ b/homework_2/src/main.cpp
@ -17,7 +17,7 @@
 #include "distsort.hpp"


-// Global config data
+// Global session data
 config_t        config;
 MPI_t<>         mpi;
 distBuffer_t    Data;
@ -43,36 +43,49 @@ bool get_options(int argc, char* argv[]){
                status = false;
            }
        }
+        else if (arg == "--pipeline") {
+            if (i+1 < argc) {
+                auto stages = atoi(argv[++i]);
+                if (isPowerOfTwo(stages) && stages <= static_cast<int>(MAX_PIPELINE_SIZE))
+                    config.pipeline = stages;
+                else
+                    status = false;
+            }
+            else {
+                status = false;
+            }
+        }
        else if (arg == "--validation") {
            config.validation = true;
        }
-        else if (arg == "--ndebug") {
-            config.ndebug = true;
-        }
        else if (arg == "--perf") {
            config.perf = true;
        }
+        else if (arg == "--ndebug") {
+            config.ndebug = true;
+        }
        else if (arg == "-v" || arg == "--verbose") {
            config.verbose = true;
        }
        else if (arg == "-h" || arg == "--help") {
            std::cout << "distbitonic/distbubbletonic - A distributed bitonic sort\n\n";
-            std::cout << "distbitonic -q <N> [--validation] [--ndebug] [-v]\n";
+            std::cout << "distbitonic -q <N> [--pipeline N] [--validation] [--ndebug] [-v]\n";
            std::cout << "distbitonic -h\n";
-            std::cout << "distbubbletonic -q <N> [--validation] [--ndebug] [-v]\n";
+            std::cout << "distbubbletonic -q <N> [--pipeline N] [--validation] [--ndebug] [-v]\n";
            std::cout << "distbubbletonic -h\n";
            std::cout << '\n';
            std::cout << "Options:\n\n";
            std::cout << "   -q | --array-size <N>\n";
            std::cout << "      Selects the array size according to size = 2^N\n\n";
-            std::cout << "   --par-sort\n";
-            std::cout << "      Request a parallel full sorting algorithm\n\n";
+            std::cout << "   --pipeline <N>\n";
+            std::cout << "      Request a pipeline of <N> stages for exchange-minmax\n";
+            std::cout << "      N must be power of 2 up to " << MAX_PIPELINE_SIZE << "\n\n";
            std::cout << "   --validation\n";
            std::cout << "      Request a full validation at the end, performed by process rank 0\n\n";
+            std::cout << "   --perf\n";
+            std::cout << "      Request performance timing measurements to stdout.\n\n";
            std::cout << "   --ndebug\n";
            std::cout << "      Skip debug breakpoint when on debug build.\n\n";
-            std::cout << "   -t | --timing\n";
-            std::cout << "      Request timing measurements output to stdout.\n\n";
            std::cout << "   -v | --verbose\n";
            std::cout << "      Request a more verbose output to stdout.\n\n";
            std::cout << "   -h | --help\n";
--- a/homework_2/test/tests_MPI.cpp
+++ b/homework_2/test/tests_MPI.cpp
@ -126,6 +126,49 @@ TEST_F(TMPIdistSort, distBubbletonic_test2) {
    }
 }

+/*
+ * MPI: SysTest (acceptance)
+ * Each process executes distBubbletonic for uint32_t [1 << 16] with pipeline
+ */
+TEST_F(TMPIdistSort, distBubbletonic_test3) {
+    // Create and fill vector
+    using tsValue_t = uint32_t;      // Test parameters
+    size_t ts_buffer_size = 1 << 16;
+
+    ShadowedVec_t<tsValue_t> ts_Data;
+    std::uniform_int_distribution<tsValue_t > dis(
+            std::numeric_limits<tsValue_t>::min(),
+            std::numeric_limits<tsValue_t>::max()
+    );
+    ts_Data.resize(ts_buffer_size);
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); });
+
+    // Set pipeline for this test only. config is shared by every test, so keep the
+    // previous value and put it back right after the call under test. Otherwise
+    // the tests that run after this one would execute with 8 stages too.
+    const auto saved_pipeline = config.pipeline;
+    config.pipeline = 8;
+
+    // Execute function under test in all processes
+    distBubbletonic(ts_Data, ts_mpi.size(), ts_mpi.rank());
+    config.pipeline = saved_pipeline;
+
+    // Local min and max
+    auto local_min = *std::min_element(ts_Data.begin(), ts_Data.end());
+    auto local_max = *std::max_element(ts_Data.begin(), ts_Data.end());
+
+    // Gather min/max to rank 0
+    std::vector<tsValue_t> global_mins(ts_mpi.size());
+    std::vector<tsValue_t> global_maxes(ts_mpi.size());
+    MPI_Datatype datatype = MPI_TypeMapper<tsValue_t>::getType();
+
+    MPI_Gather(&local_min, 1, datatype, global_mins.data(), 1, datatype, 0, MPI_COMM_WORLD);
+    MPI_Gather(&local_max, 1, datatype, global_maxes.data(), 1, datatype, 0, MPI_COMM_WORLD);
+
+    // Check results: every process holds a sorted part and the parts are ordered by rank
+    EXPECT_EQ(std::is_sorted(ts_Data.begin(), ts_Data.end()), true);
+    if (ts_mpi.rank() == 0) {
+        for (size_t i = 1; i < global_mins.size(); ++i) {
+            EXPECT_LE(global_maxes[i - 1], global_mins[i]);
+        }
+    }
+}

 /*
 * MPI: SysTest (acceptance)
@ -209,3 +252,46 @@ TEST_F(TMPIdistSort, distBitonic_test2) {
    }
 }

+/*
+ * MPI: SysTest (acceptance)
+ * Each process executes distBitonic for uint32_t [1 << 16] with pipeline
+ */
+TEST_F(TMPIdistSort, distBitonic_test3) {
+    // Create and fill vector
+    using tsValue_t = uint32_t;      // Test parameters
+    size_t ts_buffer_size = 1 << 16;
+
+    ShadowedVec_t<tsValue_t> ts_Data;
+    std::uniform_int_distribution<tsValue_t > dis(
+            std::numeric_limits<tsValue_t>::min(),
+            std::numeric_limits<tsValue_t>::max()
+    );
+    ts_Data.resize(ts_buffer_size);
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); });
+
+    // Set pipeline for this test only. config is shared by every test, so keep the
+    // previous value and put it back right after the call under test. Otherwise
+    // the tests that run after this one would execute with 8 stages too.
+    const auto saved_pipeline = config.pipeline;
+    config.pipeline = 8;
+
+    // Execute function under test in all processes
+    distBitonic(ts_Data, ts_mpi.size(), ts_mpi.rank());
+    config.pipeline = saved_pipeline;
+
+    // Local min and max
+    auto local_min = *std::min_element(ts_Data.begin(), ts_Data.end());
+    auto local_max = *std::max_element(ts_Data.begin(), ts_Data.end());
+
+    // Gather min/max to rank 0
+    std::vector<tsValue_t> global_mins(ts_mpi.size());
+    std::vector<tsValue_t> global_maxes(ts_mpi.size());
+    MPI_Datatype datatype = MPI_TypeMapper<tsValue_t>::getType();
+
+    MPI_Gather(&local_min, 1, datatype, global_mins.data(), 1, datatype, 0, MPI_COMM_WORLD);
+    MPI_Gather(&local_max, 1, datatype, global_maxes.data(), 1, datatype, 0, MPI_COMM_WORLD);
+
+    // Check results: every process holds a sorted part and the parts are ordered by rank
+    EXPECT_EQ(std::is_sorted(ts_Data.begin(), ts_Data.end()), true);
+    if (ts_mpi.rank() == 0) {
+        for (size_t i = 1; i < global_mins.size(); ++i) {
+            EXPECT_LE(global_maxes[i - 1], global_mins[i]);
+        }
+    }
+}