From 4e7237bd4d29c7e8f9fd19389f9d403481477f58 Mon Sep 17 00:00:00 2001 From: Christos Choutouridis Date: Sat, 4 Jan 2025 16:56:34 +0200 Subject: [PATCH] HW2: RC3 - A small exchange optimization --- homework_2/Makefile | 2 +- homework_2/hpc/btN1P4T4Q24.sh | 2 +- homework_2/include/config.h | 15 +-- homework_2/include/distsort.hpp | 59 ++++++++++-- homework_2/include/utils.hpp | 90 ++++++++++++++---- homework_2/julia/bitonic_v1.jl | 93 ++++++++++++++++++- homework_2/src/distsort.cpp | 7 +- homework_2/src/main.cpp | 10 +- ...sts_Bitonic.cpp => tests_BitonicUtils.cpp} | 0 ...letonic.cpp => tests_BubbletonicUtils.cpp} | 0 ...tests_Common.cpp => tests_CommonUtils.cpp} | 0 homework_2/test/tests_MPI.cpp | 17 ++-- 12 files changed, 249 insertions(+), 46 deletions(-) rename homework_2/test/{tests_Bitonic.cpp => tests_BitonicUtils.cpp} (100%) rename homework_2/test/{tests_Bubbletonic.cpp => tests_BubbletonicUtils.cpp} (100%) rename homework_2/test/{tests_Common.cpp => tests_CommonUtils.cpp} (100%) diff --git a/homework_2/Makefile b/homework_2/Makefile index 6d51bfb..f64a3c6 100644 --- a/homework_2/Makefile +++ b/homework_2/Makefile @@ -1,5 +1,5 @@ # -# PDS homework_1 Makefile +# PDS HW2 Makefile # # Copyright (C) 2024 Christos Choutouridis # diff --git a/homework_2/hpc/btN1P4T4Q24.sh b/homework_2/hpc/btN1P4T4Q24.sh index cc26e1e..1cdbc86 100644 --- a/homework_2/hpc/btN1P4T4Q24.sh +++ b/homework_2/hpc/btN1P4T4Q24.sh @@ -10,7 +10,7 @@ # $> sbatch # # NOTE: -# First compile in aristotel with +# First compile in aristotle with # $> module load gcc/9.2.0 openmpi/4.0.3 # $> make -j hpc-build # diff --git a/homework_2/include/config.h b/homework_2/include/config.h index db9313f..99835b9 100644 --- a/homework_2/include/config.h +++ b/homework_2/include/config.h @@ -1,5 +1,5 @@ /*! - * \file config.h + * \file * \brief Build configuration file. * * \author @@ -45,13 +45,16 @@ using distValue_t = uint32_t; * Session option for each invocation of the executable */ struct config_t { - size_t arraySize{DEFAULT_DATA_SIZE}; //!< - bool validation{false}; //!< Request a full validation at the end, performed by process rank 0 - bool ndebug{false}; //!< Skips debug trap on DEBUG builds - bool perf{false}; //!< Enable performance timing measurements and prints - bool verbose{false}; //!< Flag to enable verbose output to stdout + size_t arraySize{DEFAULT_DATA_SIZE}; //!< The array size of the local data to sort. + bool validation{false}; //!< Request a full validation at the end, performed by process rank 0. + bool ndebug{false}; //!< Skips debug trap on DEBUG builds. + bool perf{false}; //!< Enable performance timing measurements and prints. + bool verbose{false}; //!< Flag to enable verbose output to stdout. }; +/* + * Exported data types + */ extern config_t config; diff --git a/homework_2/include/distsort.hpp b/homework_2/include/distsort.hpp index d2ce369..ea0cfa9 100644 --- a/homework_2/include/distsort.hpp +++ b/homework_2/include/distsort.hpp @@ -22,7 +22,7 @@ #include "utils.hpp" -extern Timing TfullSort, Texchange, Tminmax, TelbowSort; +extern Timing TfullSort, Texchange, Tminmax, TelbowSort; // make timers public /*! * Enumerator for the different versions of the sorting method @@ -231,6 +231,43 @@ void elbowSort(ShadowedDataT& data, bool ascending) noexcept { elbowSortCore(data, ascending, std::greater<>()); } +/*! + * Predicate for exchange optimization. Returns true only if an exchange between partners is needed. + * In order to do that we exchange min and max statistics of the partner's data. + * + * @tparam StatT Statistics data type (for min-max) + * + * @param lstat [const StatT] Reference to the local statistic data + * @param rstat [StatT] Reference to the remote statistic data to fill + * @param part [mpi_id_t] The partner for the exchange + * @param tag [int] The tag to use for the exchange of stats + * @param keepSmall [bool] Flag to indicate if the local thread keeps the small ro the large values + * @return True if we need data exchange, false otherwise + */ +template +bool needsExchange(const StatT& lstat, StatT& rstat, mpi_id_t part, int tag, bool keepSmall) { + timeCall(Texchange, mpi.exchange_it, lstat, rstat, part, tag); + return (keepSmall) ? + rstat.min < lstat.max // Lmin: rstat.min - Smax: lstat.max + : lstat.min < rstat.max; // Lmin: lstat.min - Smax: rstat.max +} + +/*! + * Update stats utility + * + * @tparam RangeT A range type with random access iterator + * @tparam StatT Statistics data type (for min-max) + * + * @param stat [StatT] Reference to the statistic data to update + * @param data [const RangeT] Reference to the sequence to extract stats from + */ +template +void updateMinMax(StatT& stat, const RangeT& data) noexcept { + auto [min, max] = std::minmax_element(data.begin(), data.end()); + stat.min = *min; + stat.max = *max; +} + /*! * Takes two sorted sequences where one is in increasing and the other is in decreasing order * and selects either the larger or the smaller items in one-to-one comparison between them. @@ -243,7 +280,7 @@ void elbowSort(ShadowedDataT& data, bool ascending) noexcept { * @param keepSmall [bool] Flag to indicate if we keep the small items in local sequence */ template -void minmax(RangeT& local, const RangeT& remote, bool keepSmall) noexcept { +void keepMinOrMax(RangeT& local, const RangeT& remote, bool keepSmall) noexcept { using value_t = typename RangeT::value_type; std::transform( local.begin(), local.end(), @@ -274,6 +311,7 @@ template void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { // Initially sort to create a half part of a bitonic sequence timeCall(TfullSort, fullSort, data, ascending(rank, 0)); + updateMinMax(localStat, data); // Sort network (O(N) iterations) for (size_t step = 0; step < static_cast(Processes); ++step) { @@ -283,8 +321,12 @@ void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { if ( isActive(rank, Processes) && isActive(part, Processes) ) { // Exchange with partner, keep nim-or-max and sort - O(N) - timeCall(Texchange, mpi.exchange, data.getActive(), data.getShadow(), part, step); - timeCall(Tminmax, minmax, data.getActive(), data.getShadow(), ks); + int tag = static_cast(2 * step); + if (needsExchange(localStat, remoteStat, part, tag, ks)) { + timeCall(Texchange, mpi.exchange_data, data.getActive(), data.getShadow(), part, ++tag); + timeCall(Tminmax, keepMinOrMax, data.getActive(), data.getShadow(), ks); + updateMinMax(localStat, data); + } timeCall(TelbowSort, elbowSort, data, ascending(rank, Processes)); } } @@ -312,6 +354,7 @@ template void distBitonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { // Initially sort to create a half part of a bitonic sequence timeCall(TfullSort, fullSort, data, ascending(rank, 0)); + updateMinMax(localStat, data); // Run through sort network using elbow-sort ( O(LogN * LogN) iterations ) auto p = static_cast(std::log2(Processes)); @@ -322,8 +365,12 @@ void distBitonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { auto part = partner(rank, step); auto ks = keepSmall(rank, part, depth); // Exchange with partner, keep nim-or-max - timeCall(Texchange, mpi.exchange, data.getActive(), data.getShadow(), part, (depth << 8) | step); - timeCall(Tminmax, minmax, data.getActive(), data.getShadow(), ks); + int tag = static_cast( (2*p*depth) + (2*step) ); + if (needsExchange(localStat, remoteStat, part, tag, ks)) { + timeCall(Texchange, mpi.exchange_data, data.getActive(), data.getShadow(), part, tag); + timeCall(Tminmax, keepMinOrMax, data.getActive(), data.getShadow(), ks); + updateMinMax(localStat, data); + } } // sort - O(N) timeCall(TelbowSort, elbowSort, data, ascending(rank, depth)); diff --git a/homework_2/include/utils.hpp b/homework_2/include/utils.hpp index 657f276..94bc0c1 100644 --- a/homework_2/include/utils.hpp +++ b/homework_2/include/utils.hpp @@ -14,10 +14,24 @@ #include #include #include -//#include #include "config.h" +/*! + * Min-Max statistics data for exchange optimization + * @tparam Value_t The underlying data type of the sequence data + */ +template +struct Stat_t { + using value_type = Value_t; //!< meta-export the type + + Value_t min{}; //!< The minimum value of the sequence + Value_t max{}; //!< The maximum value of the sequence +}; + +//! Application data selection alias +using distStat_t = Stat_t; +extern distStat_t localStat, remoteStat; // Make stats public /* * MPI_ dispatcher mechanism @@ -85,25 +99,54 @@ struct MPI_t { * * @tparam T The inner valur type used in buffer * - * @param send_data [std::vector] Reference to local data to send - * @param recv_data [std::vector] Reference to buffer to receive data from partner + * @param ldata [std::vector] Reference to local data to send + * @param rdata [std::vector] Reference to buffer to receive data from partner * @param partner [mpi_id_t] The partner for the exchange * @param tag [int] The tag to use for the MPI communication */ template - void exchange(const std::vector& send_data, std::vector& recv_data, ID_t partner, int tag) { - using namespace std::string_literals; + void exchange_data(const std::vector& ldata, std::vector& rdata, ID_t partner, int tag) { + if (tag < 0) + throw std::runtime_error("(MPI) exchange_data() [tag] - Out of bound"); MPI_Datatype datatype = MPI_TypeMapper::getType(); - int send_count = static_cast(send_data.size()); + int count = static_cast(ldata.size()); + MPI_Status status; + int err; + if ((err = MPI_Sendrecv( + ldata.data(), count, datatype, partner, tag, + rdata.data(), count, datatype, partner, tag, + MPI_COMM_WORLD, &status + )) != MPI_SUCCESS) + mpi_throw(err, "(MPI) MPI_Sendrecv() [data] - "); + } + + /*! + * Exchange a data object with partner as part of the sorting network of both bubbletonic + * or bitonic sorting algorithms. + * + * This function matches a transmit and a receive in order for fully exchanged the data object + * between current node and partner. + * + * @tparam T The object type + * + * @param local [const T&] Reference to the local object to send + * @param remote [T&] Reference to the object to receive data from partner + * @param partner [mpi_id_t] The partner for the exchange + * @param tag [int] The tag to use for the MPI communication + */ + template + void exchange_it(const T& local, T& remote, ID_t partner, int tag) { + if (tag < 0) + throw std::runtime_error("(MPI) exchange_it() [tag] - Out of bound"); MPI_Status status; int err; if ((err = MPI_Sendrecv( - send_data.data(), send_count, datatype, partner, tag, - recv_data.data(), send_count, datatype, partner, tag, + &local, sizeof(T), MPI_BYTE, partner, tag, + &remote, sizeof(T), MPI_BYTE, partner, tag, MPI_COMM_WORLD, &status )) != MPI_SUCCESS) - mpi_throw(err, "(MPI) MPI_Sendrecv() - "); + mpi_throw(err, "(MPI) MPI_Sendrecv() [item] - "); } // Accessors @@ -165,11 +208,13 @@ using mpi_id_t = MPI_t<>::ID_t; /*! - * A std::vector wrapper with 2 vectors, an active and a shadow. This type exposes the standard vector + * @brief A std::vector wrapper with 2 vectors, an active and a shadow. + * + * This type exposes the standard vector * functionality of the active vector. The shadow can be used when we need to use the vector as mutable * data in algorithms that can not support "in-place" editing (like elbow-sort for example) * - * @tparam Value_t the inner data type of the vectors + * @tparam Value_t the underlying data type of the vectors */ template struct ShadowedVec_t { @@ -274,8 +319,10 @@ private: } active{north}; //!< Flag to select between North and South buffer }; +/* + * Exported data types + */ using distBuffer_t = ShadowedVec_t; - extern distBuffer_t Data; /*! @@ -333,11 +380,12 @@ struct Timing { return now; } - Tduration dt(Tpoint t2, Tpoint t1) noexcept { + //! A duration calculation utility + static Tduration dt(Tpoint t2, Tpoint t1) noexcept { return std::chrono::duration_cast(t2 - t1); } - //! tool to print the time interval + //! Tool to print the time interval void print_duration(const char *what, mpi_id_t rank) noexcept { if (std::chrono::duration_cast(duration_).count() < 10000) std::cout << "[Timing] (Rank " << rank << ") " << what << ": " @@ -357,13 +405,17 @@ private: }; /*! - * Utility high level function like to forward a function call and measure - * the excecution time + * Utility "high level function"-like macro to forward a function call + * and accumulate the execution time to the corresponding timing object. + * + * @param Tim The Timing object [Needs to have methods start() and stop()] + * @param Func The function name + * @param ... The arguments to pass to function (the preprocessor way) */ #define timeCall(Tim, Func, ...) \ - Tim.start(); \ - Func(__VA_ARGS__); \ - Tim.stop(); \ + Tim.start(); \ + Func(__VA_ARGS__); \ + Tim.stop(); \ #endif /* UTILS_HPP_ */ diff --git a/homework_2/julia/bitonic_v1.jl b/homework_2/julia/bitonic_v1.jl index f17c393..7c46782 100644 --- a/homework_2/julia/bitonic_v1.jl +++ b/homework_2/julia/bitonic_v1.jl @@ -4,9 +4,9 @@ # function exchange(localid, remoteid) - if verbose - println("Exchange local data from $localid with partner $remoteid") - end + # if verbose + # println("Exchange local data from $localid with partner $remoteid") + # end nothing # We have all data here ;) end @@ -23,9 +23,89 @@ function minmax(data, localid, remoteid, keepsmall) end +function is_bitonic(arr) + n = length(arr) + if n <= 2 + return true # Any sequence of length <= 2 is bitonic + end + + # State for state machine. 1: inc, -1: dec, 0: z-state + state = 0 + inc_count = 0 + dec_count = 0 + ret = false + + for i in 1:n-1 + # Find the first order + if state == 0 + if arr[i] > arr[i+1] + state = -1 + dec_count += 1 + elseif arr[i] < arr[i+1] + state = 1 + inc_count += 1 + end + elseif state == -1 # decreasing + if arr[i] < arr[i + 1] + state = 1 + inc_count += 1 + end + elseif state == 1 # increasing + if arr[i] > arr[i+1] + state = -1 + dec_count += 1 + end + end + end + + if inc_count <= 1 && dec_count <= 1 + ret = true # Sequence is bitonic + elseif inc_count == 2 && dec_count == 1 + ret = (arr[1] >= arr[n]) + elseif inc_count == 1 && dec_count == 2 + ret = (arr[1] <= arr[n]) + end + + ret +end + +function is_sort(arr) + # State for state machine. 1: inc, -1: dec, 0: z-state + state = 0 + inc_count = 0 + dec_count = 0 + + for i in 1:length(arr)-1 + # Find the first order + if state == 0 + if arr[i] > arr[i+1] + state = -1 + dec_count += 1 + elseif arr[i] < arr[i+1] + state = 1 + inc_count += 1 + end + elseif state == -1 # decreasing + if arr[i] < arr[i + 1] + state = 1 + inc_count += 1 + end + elseif state == 1 # increasing + if arr[i] > arr[i+1] + state = -1 + dec_count += 1 + end + end + end + + ret = ((inc_count + dec_count) == 1) ? state : 0 + ret +end function sort_network!(data, n, depth) nodes = 0:n-1 + bitonicFlag = zeros(Int8, size(data, 1)) + sortFlag = zeros(Int8, size(data, 1)) for step = depth-1:-1:0 partnerid = nodes .⊻ (1 << step) direction = (nodes .& (1 << depth)) .== 0 .& (nodes .< partnerid) @@ -40,6 +120,13 @@ function sort_network!(data, n, depth) minmax(data, i, partnerid[i+1], keepsmall[i+1]) end end + if verbose + for i in 1:size(data, 1) + bitonicFlag[i] = is_bitonic(data[i, :]) + sortFlag[i] = is_sort(data[i, :]) + end + println("depth: $depth | step: $step | bitonicFlag: $bitonicFlag | sorfFlag: $sortFlag") + end end end diff --git a/homework_2/src/distsort.cpp b/homework_2/src/distsort.cpp index fe3df1a..f353f31 100644 --- a/homework_2/src/distsort.cpp +++ b/homework_2/src/distsort.cpp @@ -1,6 +1,6 @@ /*! * \file - * \brief Distributed sort implementation. + * \brief Distributed sort implementation * * \author * Christos Choutouridis AEM:8997 @@ -9,8 +9,13 @@ #include "utils.hpp" #include "distsort.hpp" +//! Statistic variables for exchange optimization +distStat_t localStat, remoteStat; + +//! Performance timers for each one of the "costly" functions Timing TfullSort, Texchange, Tminmax, TelbowSort; + bool isActive(mpi_id_t node, size_t nodes) { if (!((nodes > 0) && (nodes <= std::numeric_limits::max()) )) diff --git a/homework_2/src/main.cpp b/homework_2/src/main.cpp index 8f763ac..fc34a4b 100644 --- a/homework_2/src/main.cpp +++ b/homework_2/src/main.cpp @@ -138,6 +138,9 @@ bool validator(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { } #if !defined TESTING +/*! + * @return Returns 0, but.... we may throw or exit(1) + */ int main(int argc, char* argv[]) try { // Initialize MPI environment mpi.init(&argc, &argv); @@ -170,6 +173,7 @@ int main(int argc, char* argv[]) try { sleep(1); #endif + // Initialize local data logger << "Initialize local array of " << config.arraySize << " elements" << logger.endl; std::random_device rd; // Mersenne seeded from hw if possible. range: [type_min, type_max] std::mt19937 gen(rd()); @@ -181,6 +185,7 @@ int main(int argc, char* argv[]) try { Data.resize(config.arraySize); std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); }); + // Run distributed sort if (mpi.rank() == 0) logger << "Starting distributed sorting ... "; Ttotal.start(); @@ -192,9 +197,9 @@ int main(int argc, char* argv[]) try { Ttotal.stop(); if (mpi.rank() == 0) logger << " Done." << logger.endl; - std::string timeMsg = "rank " + std::to_string(mpi.rank()); + // Print-outs and validation if (config.perf) { Ttotal.print_duration("Total ", mpi.rank()); TfullSort.print_duration("Full-Sort ", mpi.rank()); @@ -224,6 +229,9 @@ catch (std::exception& e) { #include #include +/*! + * The testing version of our program + */ GTEST_API_ int main(int argc, char **argv) try { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/homework_2/test/tests_Bitonic.cpp b/homework_2/test/tests_BitonicUtils.cpp similarity index 100% rename from homework_2/test/tests_Bitonic.cpp rename to homework_2/test/tests_BitonicUtils.cpp diff --git a/homework_2/test/tests_Bubbletonic.cpp b/homework_2/test/tests_BubbletonicUtils.cpp similarity index 100% rename from homework_2/test/tests_Bubbletonic.cpp rename to homework_2/test/tests_BubbletonicUtils.cpp diff --git a/homework_2/test/tests_Common.cpp b/homework_2/test/tests_CommonUtils.cpp similarity index 100% rename from homework_2/test/tests_Common.cpp rename to homework_2/test/tests_CommonUtils.cpp diff --git a/homework_2/test/tests_MPI.cpp b/homework_2/test/tests_MPI.cpp index 73e3593..f3d045a 100644 --- a/homework_2/test/tests_MPI.cpp +++ b/homework_2/test/tests_MPI.cpp @@ -6,6 +6,9 @@ * make tests * mpirun -np ./out/tests * + * Note: + * Yes each process runs the entire test suite!! + * * \author * Christos Choutouridis AEM:8997 * @@ -15,7 +18,11 @@ #include #include #include "distsort.hpp" +/* + * Global fixtures + */ +// MPI handler for the test session MPI_t<> ts_mpi; // Mersenne seeded from hw if possible. range: [type_min, type_max] @@ -27,17 +34,11 @@ protected: static void SetUpTestSuite() { int argc = 0; char** argv = nullptr; - MPI_Init(&argc, &argv); - - int rank, size; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &size); - ts_mpi.rank(rank); - ts_mpi.size(size); + ts_mpi.init(&argc, &argv); } static void TearDownTestSuite() { - MPI_Finalize(); + ts_mpi.finalize(); } };