diff --git a/homework_2/Makefile b/homework_2/Makefile index 1e5a38a..1905b6e 100644 --- a/homework_2/Makefile +++ b/homework_2/Makefile @@ -42,7 +42,7 @@ BUILD_DIR := bin OBJ_DIR := $(BUILD_DIR)/obj DEP_DIR := $(BUILD_DIR)/.dep -OUTPUT_DIR := out-rc3a +OUTPUT_DIR := out-rc3b # ========== Compiler settings ========== # Compiler flags for debug and release diff --git a/homework_2/hpc/N1P2T4Q20.sh b/homework_2/hpc/N1P2T4Q20.sh index a486988..c6f1dbf 100644 --- a/homework_2/hpc/N1P2T4Q20.sh +++ b/homework_2/hpc/N1P2T4Q20.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 20 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 20 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N1P2T4Q23.sh b/homework_2/hpc/N1P2T4Q23.sh index b2e21a9..8a82c70 100644 --- a/homework_2/hpc/N1P2T4Q23.sh +++ b/homework_2/hpc/N1P2T4Q23.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 23 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 23 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N1P2T4Q25.sh b/homework_2/hpc/N1P2T4Q25.sh index 806ea38..95858c3 100644 --- a/homework_2/hpc/N1P2T4Q25.sh +++ b/homework_2/hpc/N1P2T4Q25.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 25 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 25 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N1P2T4Q27.sh b/homework_2/hpc/N1P2T4Q27.sh index 234d10e..beb0faf 100644 --- a/homework_2/hpc/N1P2T4Q27.sh +++ b/homework_2/hpc/N1P2T4Q27.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 27 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 27 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N1P4T4Q20.sh b/homework_2/hpc/N1P4T4Q20.sh index 9c74d42..d5dd27a 100644 --- a/homework_2/hpc/N1P4T4Q20.sh +++ b/homework_2/hpc/N1P4T4Q20.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 20 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 20 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N1P4T4Q23.sh b/homework_2/hpc/N1P4T4Q23.sh index 6454df5..dcd0e7a 100644 --- a/homework_2/hpc/N1P4T4Q23.sh +++ b/homework_2/hpc/N1P4T4Q23.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 23 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 23 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N1P4T4Q25.sh b/homework_2/hpc/N1P4T4Q25.sh index f1390db..a9619ff 100644 --- a/homework_2/hpc/N1P4T4Q25.sh +++ b/homework_2/hpc/N1P4T4Q25.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 25 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 25 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N1P4T4Q27.sh b/homework_2/hpc/N1P4T4Q27.sh index 0ff5352..e8742a4 100644 --- a/homework_2/hpc/N1P4T4Q27.sh +++ b/homework_2/hpc/N1P4T4Q27.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 27 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 27 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N2P4T4Q20.sh b/homework_2/hpc/N2P4T4Q20.sh index 7b44c26..93a5944 100644 --- a/homework_2/hpc/N2P4T4Q20.sh +++ b/homework_2/hpc/N2P4T4Q20.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 20 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 20 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N2P4T4Q23.sh b/homework_2/hpc/N2P4T4Q23.sh index 18a9b37..e9a0851 100644 --- a/homework_2/hpc/N2P4T4Q23.sh +++ b/homework_2/hpc/N2P4T4Q23.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 23 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 23 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N2P4T4Q25.sh b/homework_2/hpc/N2P4T4Q25.sh index f58d11c..f03f569 100644 --- a/homework_2/hpc/N2P4T4Q25.sh +++ b/homework_2/hpc/N2P4T4Q25.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 25 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 25 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N2P4T4Q27.sh b/homework_2/hpc/N2P4T4Q27.sh index 20da8ea..bf9f8dd 100644 --- a/homework_2/hpc/N2P4T4Q27.sh +++ b/homework_2/hpc/N2P4T4Q27.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 27 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 27 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N4P16T4Q20.sh b/homework_2/hpc/N4P16T4Q20.sh index cd13c58..4ee3cd1 100644 --- a/homework_2/hpc/N4P16T4Q20.sh +++ b/homework_2/hpc/N4P16T4Q20.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 20 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 20 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N4P16T4Q23.sh b/homework_2/hpc/N4P16T4Q23.sh index 2bfead7..a44b4fc 100644 --- a/homework_2/hpc/N4P16T4Q23.sh +++ b/homework_2/hpc/N4P16T4Q23.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 23 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 23 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N4P16T4Q25.sh b/homework_2/hpc/N4P16T4Q25.sh index b60a9c4..2be0a7d 100644 --- a/homework_2/hpc/N4P16T4Q25.sh +++ b/homework_2/hpc/N4P16T4Q25.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 25 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8 \ No newline at end of file diff --git a/homework_2/hpc/N4P16T4Q27.sh b/homework_2/hpc/N4P16T4Q27.sh index 0e09c21..e7b2ba4 100644 --- a/homework_2/hpc/N4P16T4Q27.sh +++ b/homework_2/hpc/N4P16T4Q27.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 27 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8 \ No newline at end of file diff --git a/homework_2/hpc/N4P32T4Q20.sh b/homework_2/hpc/N4P32T4Q20.sh index 5ccd5db..2b7b4bb 100644 --- a/homework_2/hpc/N4P32T4Q20.sh +++ b/homework_2/hpc/N4P32T4Q20.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 20 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 20 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N4P32T4Q23.sh b/homework_2/hpc/N4P32T4Q23.sh index 26cef39..7db03b4 100644 --- a/homework_2/hpc/N4P32T4Q23.sh +++ b/homework_2/hpc/N4P32T4Q23.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 23 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 23 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N4P32T4Q25.sh b/homework_2/hpc/N4P32T4Q25.sh index 45558d1..06da205 100644 --- a/homework_2/hpc/N4P32T4Q25.sh +++ b/homework_2/hpc/N4P32T4Q25.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 25 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8 \ No newline at end of file diff --git a/homework_2/hpc/N4P32T4Q27.sh b/homework_2/hpc/N4P32T4Q27.sh index c82e1db..f42d08c 100644 --- a/homework_2/hpc/N4P32T4Q27.sh +++ b/homework_2/hpc/N4P32T4Q27.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 27 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8 \ No newline at end of file diff --git a/homework_2/hpc/N4P4T4Q20.sh b/homework_2/hpc/N4P4T4Q20.sh index 3d1fc5c..176ff23 100644 --- a/homework_2/hpc/N4P4T4Q20.sh +++ b/homework_2/hpc/N4P4T4Q20.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 20 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 20 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N4P4T4Q23.sh b/homework_2/hpc/N4P4T4Q23.sh index 51fbc1e..58d3c99 100644 --- a/homework_2/hpc/N4P4T4Q23.sh +++ b/homework_2/hpc/N4P4T4Q23.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 23 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 23 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N4P4T4Q25.sh b/homework_2/hpc/N4P4T4Q25.sh index a2cf887..ded9350 100644 --- a/homework_2/hpc/N4P4T4Q25.sh +++ b/homework_2/hpc/N4P4T4Q25.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 25 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 25 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N4P4T4Q27.sh b/homework_2/hpc/N4P4T4Q27.sh index 32073e4..d1d03cb 100644 --- a/homework_2/hpc/N4P4T4Q27.sh +++ b/homework_2/hpc/N4P4T4Q27.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 27 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 27 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N4P8T4Q20.sh b/homework_2/hpc/N4P8T4Q20.sh index 20602b5..ac2af75 100644 --- a/homework_2/hpc/N4P8T4Q20.sh +++ b/homework_2/hpc/N4P8T4Q20.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 20 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 20 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N4P8T4Q23.sh b/homework_2/hpc/N4P8T4Q23.sh index faf8a4d..23a1cc5 100644 --- a/homework_2/hpc/N4P8T4Q23.sh +++ b/homework_2/hpc/N4P8T4Q23.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 23 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 23 --perf --validation \ No newline at end of file diff --git a/homework_2/hpc/N4P8T4Q25.sh b/homework_2/hpc/N4P8T4Q25.sh index 3d8aec6..ec4a8be 100644 --- a/homework_2/hpc/N4P8T4Q25.sh +++ b/homework_2/hpc/N4P8T4Q25.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 25 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8 \ No newline at end of file diff --git a/homework_2/hpc/N4P8T4Q27.sh b/homework_2/hpc/N4P8T4Q27.sh index 975c0bc..18553df 100644 --- a/homework_2/hpc/N4P8T4Q27.sh +++ b/homework_2/hpc/N4P8T4Q27.sh @@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./out-rc3a/distbitonic -q 27 --perf --validation \ No newline at end of file +srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8 \ No newline at end of file diff --git a/homework_2/include/config.h b/homework_2/include/config.h index 99835b9..e3bfd42 100644 --- a/homework_2/include/config.h +++ b/homework_2/include/config.h @@ -25,7 +25,14 @@ #endif // Default Data size (in case -q is not present) -#define DEFAULT_DATA_SIZE (1 << 16) +static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16; + +// The maximum MPI size we support +static constexpr size_t MAX_MPI_SIZE = 1024UL; + +// The maximum pipeline size we support +static constexpr size_t MAX_PIPELINE_SIZE = 64UL; + /*! * Value type selection @@ -46,6 +53,7 @@ using distValue_t = uint32_t; */ struct config_t { size_t arraySize{DEFAULT_DATA_SIZE}; //!< The array size of the local data to sort. + size_t pipeline{1UL}; //!< Pipeline stages bool validation{false}; //!< Request a full validation at the end, performed by process rank 0. bool ndebug{false}; //!< Skips debug trap on DEBUG builds. bool perf{false}; //!< Enable performance timing measurements and prints. diff --git a/homework_2/include/distsort.hpp b/homework_2/include/distsort.hpp index a7d6ef8..3eb3575 100644 --- a/homework_2/include/distsort.hpp +++ b/homework_2/include/distsort.hpp @@ -233,24 +233,24 @@ void elbowSort(ShadowedDataT& data, bool ascending) noexcept { /*! - * Takes two sorted sequences where one is in increasing and the other is in decreasing order - * and selects either the larger or the smaller items in one-to-one comparison between them. - * The result is a bitonic sequence. + * Takes two sequences and selects either the larger or the smaller items + * in one-to-one comparison between them. If the initial sequences are bitonic, then + * the result is a bitonic sequence too! * - * @tparam RangeT A range type with random access iterator + * @tparam ValueT The underlying type of the sequences * - * @param local [RangeT] Reference to the local sequence - * @param remote [const RangeT] Reference to the remote sequence (copied locally by MPI) + * @param local [ValueT*] Pointer to the local sequence + * @param remote [const ValueT*] Pointer to the remote sequence (copied locally by MPI) + * @param count [size_t] The number of items to process * @param keepSmall [bool] Flag to indicate if we keep the small items in local sequence */ -template -void keepMinOrMax(RangeT& local, const RangeT& remote, bool keepSmall) noexcept { - using value_t = typename RangeT::value_type; +template +void keepMinOrMax(ValueT* local, const ValueT* remote, size_t count, bool keepSmall) noexcept { std::transform( - local.begin(), local.end(), - remote.begin(), - local.begin(), - [&keepSmall](const value_t& a, const value_t& b){ + local, local + count, + remote, + local, + [&keepSmall](const ValueT& a, const ValueT& b){ return (keepSmall) ? std::min(a, b) : std::max(a, b); }); } @@ -259,6 +259,60 @@ void keepMinOrMax(RangeT& local, const RangeT& remote, bool keepSmall) noexcept * ============================== Sort algorithms ============================== */ +/*! + * A small tag generator tool to provide consistent encoding to tag communication + * + * @param depth The current algorithmic depth[bitonic] of the communication, if any + * @param step The current step on the current depth + * @param stage The stage of the pipeline. + * @return The tag to use. + * + * @note + * In case we call this function outside of the pipeline loop, we can ommit + * @c stage argument and use the return value as starting tag for every communication + * of the pipeline loop. We need to increase the tags for each communication of + * the pipeline loop though! + */ +size_t tagGenerator(size_t depth, size_t step, size_t stage = 0); + +/*! + * A pipeline loop for mixing min-max process with mpi data exchange + * + * @tparam ShadowedDataT A Shadowed buffer type with random access iterator. + * + * @param data [ShadowedDataT&] Reference to the data to exchange + * @param partner [mpi_id_t] The partner for the exchange + * @param keepSmall [bool] Flag to indicate if we keep the small values + * @param tag [int] The init tag to use for the loop. + * + * @note + * The @c tag is increased inside the pipeline loop for each different data exchange + */ +template +void exchangePipeline(ShadowedDataT& data, mpi_id_t partner, bool keepSmall, int tag) { + using Value_t = typename ShadowedDataT::value_type; + + // Init counters and pointers + size_t count = data.size() / config.pipeline; + Value_t* active = data.getActive().data(); + Value_t* shadow = data.getShadow().data(); + + // Pipeline + Texchange.start(); + mpi.exchange_start(active, shadow, count, partner, tag); + for (size_t stage = 0 ; stage < config.pipeline ; active += count, shadow += count) { + // Wait previous chunk + mpi.exchange_wait(); Texchange.stop(); + if (++stage < config.pipeline) { + // Start next chunk if there is a next one + Texchange.start(); + mpi.exchange_start(active + count, shadow + count, count, partner, ++tag); + } + // process the arrived data + timeCall(Tminmax, keepMinOrMax, active, shadow, count, keepSmall); + } +} + /*! * A distributed version of the Bubbletonic sort algorithm. * @@ -284,9 +338,8 @@ void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { if ( isActive(rank, Processes) && isActive(part, Processes) ) { // Exchange with partner, keep nim-or-max and sort - O(N) - int tag = static_cast(step); - timeCall(Texchange, mpi.exchange_data, data.getActive(), data.getShadow(), part, tag); - timeCall(Tminmax, keepMinOrMax, data.getActive(), data.getShadow(), ks); + int tag = static_cast(tagGenerator(0, step)); + exchangePipeline(data, part, ks, tag); timeCall(TelbowSort, elbowSort, data, ascending(rank, Processes)); } } @@ -324,9 +377,8 @@ void distBitonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { auto part = partner(rank, step); auto ks = keepSmall(rank, part, depth); // Exchange with partner, keep nim-or-max - int tag = static_cast( (2*p*depth) + step ); - timeCall(Texchange, mpi.exchange_data, data.getActive(), data.getShadow(), part, tag); - timeCall(Tminmax, keepMinOrMax, data.getActive(), data.getShadow(), ks); + int tag = static_cast(tagGenerator(depth, step)); + exchangePipeline(data, part, ks, tag); } // sort - O(N) timeCall(TelbowSort, elbowSort, data, ascending(rank, depth)); diff --git a/homework_2/include/utils.hpp b/homework_2/include/utils.hpp index 986dc44..f1c355e 100644 --- a/homework_2/include/utils.hpp +++ b/homework_2/include/utils.hpp @@ -65,6 +65,10 @@ struct MPI_t { mpi_throw(err, "(MPI) MPI_Comm_rank() - "); size_ = static_cast(size_value); rank_ = static_cast(rank_value); + if (size_ > static_cast(MAX_MPI_SIZE)) + throw std::runtime_error( + "(MPI) size - Not supported number of nodes [over " + std::to_string(MAX_MPI_SIZE) + "]\n" + ); // Get the name of the processor char processor_name[MPI_MAX_PROCESSOR_NAME]; @@ -74,63 +78,56 @@ struct MPI_t { name_ = std::string (processor_name, name_len); } + /*! - * Exchange data with partner as part of the sorting network of both bubbletonic or bitonic - * sorting algorithms. + * Initiate a data exchange data with partner using non-blocking Isend-Irecv, as part of the + * sorting network of both bubbletonic or bitonic sorting algorithms. * * This function matches a transmit and a receive in order for fully exchanged data between * current node and partner. + * @note + * This call MUST paired with exchange_wait() for each MPI_t object. + * Calling 2 consecutive exchange_start() for the same MPI_t object is undefined. * - * @tparam T The inner valur type used in buffer + * @tparam ValueT The underlying value type used in buffers * - * @param ldata [std::vector] Reference to local data to send - * @param rdata [std::vector] Reference to buffer to receive data from partner - * @param partner [mpi_id_t] The partner for the exchange - * @param tag [int] The tag to use for the MPI communication + * @param ldata [const ValueT*] Pointer to local data to send + * @param rdata [ValueT*] Pointer to buffer to receive data from partner + * @param count [size_t] The number of data to exchange + * @param partner [mpi_id_t] The partner for the exchange + * @param tag [int] The tag to use for the MPI communication */ - template - void exchange_data(const std::vector& ldata, std::vector& rdata, ID_t partner, int tag) { + template + void exchange_start(const ValueT* ldata, ValueT* rdata, size_t count, ID_t partner, int tag) { if (tag < 0) throw std::runtime_error("(MPI) exchange_data() [tag] - Out of bound"); - MPI_Datatype datatype = MPI_TypeMapper::getType(); - int count = static_cast(ldata.size()); - MPI_Status status; + MPI_Datatype datatype = MPI_TypeMapper::getType(); int err; - if ((err = MPI_Sendrecv( - ldata.data(), count, datatype, partner, tag, - rdata.data(), count, datatype, partner, tag, - MPI_COMM_WORLD, &status - )) != MPI_SUCCESS) - mpi_throw(err, "(MPI) MPI_Sendrecv() [data] - "); + err = MPI_Isend(ldata, count, datatype, partner, tag, MPI_COMM_WORLD, &handle_tx); + if (err != MPI_SUCCESS) + mpi_throw(err, "(MPI) MPI_Isend() - "); + err = MPI_Irecv(rdata, count, datatype, partner, tag, MPI_COMM_WORLD, &handle_rx); + if (err != MPI_SUCCESS) + mpi_throw(err, "(MPI) MPI_Irecv() - "); } /*! - * Exchange a data object with partner as part of the sorting network of both bubbletonic - * or bitonic sorting algorithms. - * - * This function matches a transmit and a receive in order for fully exchanged the data object - * between current node and partner. + * Block wait for the completion of the previously called exchange_start() * - * @tparam T The object type - * - * @param local [const T&] Reference to the local object to send - * @param remote [T&] Reference to the object to receive data from partner - * @param partner [mpi_id_t] The partner for the exchange - * @param tag [int] The tag to use for the MPI communication + * @note + * This call MUST paired with exchange_start() for each MPI_t object. + * Calling 2 consecutive exchange_wait() for the same MPI_t object is undefined. */ - template - void exchange_it(const T& local, T& remote, ID_t partner, int tag) { - if (tag < 0) - throw std::runtime_error("(MPI) exchange_it() [tag] - Out of bound"); + void exchange_wait() { MPI_Status status; + int err; - if ((err = MPI_Sendrecv( - &local, sizeof(T), MPI_BYTE, partner, tag, - &remote, sizeof(T), MPI_BYTE, partner, tag, - MPI_COMM_WORLD, &status - )) != MPI_SUCCESS) - mpi_throw(err, "(MPI) MPI_Sendrecv() [item] - "); + if ((err = MPI_Wait(&handle_tx, &status)) != MPI_SUCCESS) + mpi_throw(err, "(MPI) MPI_Wait() [send] - "); + + if ((err = MPI_Wait(&handle_rx, &status)) != MPI_SUCCESS) + mpi_throw(err, "(MPI) MPI_Wait() [recv] - "); } // Accessors @@ -181,6 +178,8 @@ private: ID_t size_{}; //!< MPI total size of the execution std::string name_{}; //!< The name of the local machine bool initialized_{}; //!< RAII helper flag + MPI_Request handle_tx{}; //!< MPI async exchange handler for Transmission + MPI_Request handle_rx{}; //!< MPI async exchange handler for Receptions }; /* @@ -377,9 +376,13 @@ struct Timing { else if (std::chrono::duration_cast(duration_).count() < 10000) std::cout << "[Timing] (Rank " << rank << ") " << what << ": " << std::to_string(std::chrono::duration_cast(duration_).count()) << " [msec]\n"; - else - std::cout << "[Timing] (Rank " << rank << ") " << what << ": " - << std::to_string(std::chrono::duration_cast(duration_).count()) << " [sec]\n"; + else { + char stime[26]; // fit ulong + auto sec = std::chrono::duration_cast(duration_).count(); + auto msec = (std::chrono::duration_cast(duration_).count() % 1000) / 10; // keep 2 digit + std::sprintf(stime, "%ld.%1ld", sec, msec); + std::cout << "[Timing] (Rank " << rank << ") " << what << ": " << stime << " [sec]\n"; + } } @@ -402,4 +405,17 @@ private: Tim.stop(); \ +/*! + * A utility to check if a number is power of two + * + * @tparam Integral The integral type of the number to check + * @param x The number to check + * @return True if it is power of 2, false otherwise + */ +template +constexpr inline bool isPowerOfTwo(Integral x) noexcept { + return (!(x & (x - 1)) && x); +} + + #endif /* UTILS_HPP_ */ diff --git a/homework_2/src/distsort.cpp b/homework_2/src/distsort.cpp index 19e77e9..6a614ea 100644 --- a/homework_2/src/distsort.cpp +++ b/homework_2/src/distsort.cpp @@ -23,3 +23,13 @@ bool isActive(mpi_id_t node, size_t nodes) { return (node >= 0) && (node < static_cast(nodes)); } +size_t tagGenerator(size_t depth, size_t step, size_t stage) { + auto stage_bits = static_cast(std::log2(MAX_PIPELINE_SIZE)); + auto step_bits = static_cast(std::log2(MAX_MPI_SIZE)); + // ^ We use MPI_SIZE room for steps to fit the bubbletonic version + + size_t tag = stage + | (step << stage_bits) + | (depth << (stage_bits + step_bits)); + return tag; +} diff --git a/homework_2/src/main.cpp b/homework_2/src/main.cpp index fc34a4b..dafa1e0 100644 --- a/homework_2/src/main.cpp +++ b/homework_2/src/main.cpp @@ -17,7 +17,7 @@ #include "distsort.hpp" -// Global config data +// Global session data config_t config; MPI_t<> mpi; distBuffer_t Data; @@ -43,36 +43,49 @@ bool get_options(int argc, char* argv[]){ status = false; } } + else if (arg == "--pipeline") { + if (i+1 < argc) { + auto stages = atoi(argv[++i]); + if (isPowerOfTwo(stages) && stages <= static_cast(MAX_PIPELINE_SIZE)) + config.pipeline = stages; + else + status = false; + } + else { + status = false; + } + } else if (arg == "--validation") { config.validation = true; } - else if (arg == "--ndebug") { - config.ndebug = true; - } else if (arg == "--perf") { config.perf = true; } + else if (arg == "--ndebug") { + config.ndebug = true; + } else if (arg == "-v" || arg == "--verbose") { config.verbose = true; } else if (arg == "-h" || arg == "--help") { std::cout << "distbitonic/distbubbletonic - A distributed bitonic sort\n\n"; - std::cout << "distbitonic -q [--validation] [--ndebug] [-v]\n"; + std::cout << "distbitonic -q [--pipeline N] [--validation] [--ndebug] [-v]\n"; std::cout << "distbitonic -h\n"; - std::cout << "distbubbletonic -q [--validation] [--ndebug] [-v]\n"; + std::cout << "distbubbletonic -q [--pipeline N] [--validation] [--ndebug] [-v]\n"; std::cout << "distbubbletonic -h\n"; std::cout << '\n'; std::cout << "Options:\n\n"; std::cout << " -q | --array-size \n"; std::cout << " Selects the array size according to size = 2^N\n\n"; - std::cout << " --par-sort\n"; - std::cout << " Request a parallel full sorting algorithm\n\n"; + std::cout << " --pipeline \n"; + std::cout << " Request a pipeline of stages for exchange-minmax\n"; + std::cout << " N must be power of 2 up to " << MAX_PIPELINE_SIZE << "\n\n"; std::cout << " --validation\n"; std::cout << " Request a full validation at the end, performed by process rank 0\n\n"; + std::cout << " --perf\n"; + std::cout << " Request performance timing measurements to stdout.\n\n"; std::cout << " --ndebug\n"; std::cout << " Skip debug breakpoint when on debug build.\n\n"; - std::cout << " -t | --timing\n"; - std::cout << " Request timing measurements output to stdout.\n\n"; std::cout << " -v | --verbose\n"; std::cout << " Request a more verbose output to stdout.\n\n"; std::cout << " -h | --help\n"; diff --git a/homework_2/test/tests_MPI.cpp b/homework_2/test/tests_MPI.cpp index 416ebe8..2c79bee 100644 --- a/homework_2/test/tests_MPI.cpp +++ b/homework_2/test/tests_MPI.cpp @@ -126,6 +126,49 @@ TEST_F(TMPIdistSort, distBubbletonic_test2) { } } +/* + * MPI: SysTest (acceptance) + * Each process executes distBubbletonic for uin32_t [1 << 16] with pipeline + */ +TEST_F(TMPIdistSort, distBubbletonic_test3) { + // Create and fill vector + using tsValue_t = uint32_t; // Test parameters + size_t ts_buffer_size = 1 << 16; + + ShadowedVec_t ts_Data; + std::uniform_int_distribution dis( + std::numeric_limits::min(), + std::numeric_limits::max() + ); + ts_Data.resize(ts_buffer_size); + std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); }); + + // Set pipeline + config.pipeline = 8; + + // Execute function under test in all processes + distBubbletonic(ts_Data, ts_mpi.size(), ts_mpi.rank()); + + // Local min and max + auto local_min = *std::min_element(ts_Data.begin(), ts_Data.end()); + auto local_max = *std::max_element(ts_Data.begin(), ts_Data.end()); + + // Gather min/max to rank 0 + std::vector global_mins(ts_mpi.size()); + std::vector global_maxes(ts_mpi.size()); + MPI_Datatype datatype = MPI_TypeMapper::getType(); + + MPI_Gather(&local_min, 1, datatype, global_mins.data(), 1, datatype, 0, MPI_COMM_WORLD); + MPI_Gather(&local_max, 1, datatype, global_maxes.data(), 1, datatype, 0, MPI_COMM_WORLD); + + // Check results + EXPECT_EQ(std::is_sorted(ts_Data.begin(), ts_Data.end()), true); + if (ts_mpi.rank() == 0) { + for (size_t i = 1; i < global_mins.size(); ++i) { + EXPECT_LE(global_maxes[i - 1], global_mins[i]); + } + } +} /* * MPI: SysTest (acceptance) @@ -209,3 +252,46 @@ TEST_F(TMPIdistSort, distBitonic_test2) { } } +/* + * MPI: SysTest (acceptance) + * Each process executes distBitonic for uin32_t [1 << 16] with pipeline + */ +TEST_F(TMPIdistSort, distBitonic_test3) { + // Create and fill vector + using tsValue_t = uint32_t; // Test parameters + size_t ts_buffer_size = 1 << 16; + + ShadowedVec_t ts_Data; + std::uniform_int_distribution dis( + std::numeric_limits::min(), + std::numeric_limits::max() + ); + ts_Data.resize(ts_buffer_size); + std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); }); + + // Set pipeline + config.pipeline = 8; + + // Execute function under test in all processes + distBitonic(ts_Data, ts_mpi.size(), ts_mpi.rank()); + + // Local min and max + auto local_min = *std::min_element(ts_Data.begin(), ts_Data.end()); + auto local_max = *std::max_element(ts_Data.begin(), ts_Data.end()); + + // Gather min/max to rank 0 + std::vector global_mins(ts_mpi.size()); + std::vector global_maxes(ts_mpi.size()); + MPI_Datatype datatype = MPI_TypeMapper::getType(); + + MPI_Gather(&local_min, 1, datatype, global_mins.data(), 1, datatype, 0, MPI_COMM_WORLD); + MPI_Gather(&local_max, 1, datatype, global_maxes.data(), 1, datatype, 0, MPI_COMM_WORLD); + + // Check results + EXPECT_EQ(std::is_sorted(ts_Data.begin(), ts_Data.end()), true); + if (ts_mpi.rank() == 0) { + for (size_t i = 1; i < global_mins.size(); ++i) { + EXPECT_LE(global_maxes[i - 1], global_mins[i]); + } + } +}