diff --git a/homework_2/Makefile b/homework_2/Makefile index 6bb7fa0..7bae436 100644 --- a/homework_2/Makefile +++ b/homework_2/Makefile @@ -45,18 +45,18 @@ DEP_DIR := $(BUILD_DIR)/.dep # ========== Compiler settings ========== # Compiler flags for debug and release -DEB_CFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c11 -REL_CFLAGS := -Wall -Wextra -O3 -std=c11 -DEB_CXXFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c++17 -REL_CXXFLAGS := -Wall -Wextra -O3 -std=c++17 +DEB_CFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c11 #-fopenmp +REL_CFLAGS := -Wall -Wextra -O3 -std=c11 #-fopenmp +DEB_CXXFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c++17 #-fopenmp +REL_CXXFLAGS := -Wall -Wextra -O3 -std=c++17 #-fopenmp # Pre-defines # PRE_DEFS := MYCAB=1729 SUPER_MODE -PRE_DEFS := +PRE_DEFS := #_GLIBCXX_PARALLEL # ============== Linker settings ============== # Linker flags (example: -pthread -lm) -LDFLAGS := -pthread +LDFLAGS := -pthread # -fopenmp # Map output file MAP_FILE := output.map @@ -228,7 +228,7 @@ perfbitonic: CC := mpicc perfbitonic: CXX := mpic++ perfbitonic: CFLAGS := $(REL_CFLAGS) -g -DCODE_VERSION=BITONIC perfbitonic: CXXFLAGS := $(REL_CXXFLAGS) -g -DCODE_VERSION=BITONIC -perfbitonic: TARGET := distbitonic +perfbitonic: TARGET := perfbitonic perfbitonic: $(BUILD_DIR)/$(TARGET) @mkdir -p out cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) diff --git a/homework_2/hpc/ntasks16.sh b/homework_2/hpc/btN1P4T4Q24.sh similarity index 54% rename from homework_2/hpc/ntasks16.sh rename to homework_2/hpc/btN1P4T4Q24.sh index defffee..823193e 100644 --- a/homework_2/hpc/ntasks16.sh +++ b/homework_2/hpc/btN1P4T4Q24.sh @@ -1,28 +1,28 @@ #! /usr/bin/env bash #SBATCH --partition=batch +#SBATCH --nodes=1 #SBATCH --ntasks-per-node=4 -#SBATCH --nodes=4 +#SBATCH --cpus-per-task=4 #SBATCH --time=1:00 +# Use this as following +# $> sbatch +# +# NOTE: +# First compile with +# $> make -j hpc-build +# + module load gcc/9.2.0 openmpi/4.0.3 # Note: # The above versions are matching w/ my system's # versions, thus making compiling/debugging easier. -# Uncomment the following to compile the project -# Note: -# In order for the MPI to run properly (or to run entirely), we need -# to compile it in hpc using the loaded modules above. -# Note: -# Consider moving this to a separate stage before sbatch the tasks. -# make distbitonic -# make distbubbletonic - # Suppress unused UCX_ROOT warning export UCX_WARN_UNUSED_ENV_VARS=n # Suppress CUDA-aware support is disabled warning export OMPI_MCA_opal_warn_on_missing_libcuda=0 -srun ./bin/dist_v05 \ No newline at end of file +srun ./out/distbitonic -q 24 --perf --validation \ No newline at end of file diff --git a/homework_2/include/config.h b/homework_2/include/config.h index c57366a..db9313f 100644 --- a/homework_2/include/config.h +++ b/homework_2/include/config.h @@ -44,15 +44,15 @@ using distValue_t = uint32_t; /*! * Session option for each invocation of the executable */ -struct session_t { +struct config_t { size_t arraySize{DEFAULT_DATA_SIZE}; //!< bool validation{false}; //!< Request a full validation at the end, performed by process rank 0 bool ndebug{false}; //!< Skips debug trap on DEBUG builds - bool timing{false}; //!< Enable timing measurements and prints + bool perf{false}; //!< Enable performance timing measurements and prints bool verbose{false}; //!< Flag to enable verbose output to stdout }; -extern session_t session; +extern config_t config; #endif /* CONFIG_H_ */ diff --git a/homework_2/include/distsort.hpp b/homework_2/include/distsort.hpp index a75527a..680c515 100644 --- a/homework_2/include/distsort.hpp +++ b/homework_2/include/distsort.hpp @@ -12,6 +12,7 @@ #include #include +//#include #include #include #if !defined DEBUG @@ -20,8 +21,8 @@ #include #include "utils.hpp" -#include "config.h" +extern Timing TfullSort, Texchange, Tminmax, TelbowSort; /*! * Enumerator for the different versions of the sorting method @@ -159,11 +160,13 @@ bool isActive(mpi_id_t node, size_t nodes); */ template void fullSort(RangeT& data, bool ascending) noexcept { - // Use introsort from stdlib++ here, unless ... - if (ascending) + // Use introsort from stdlib++ here, unless ... __gnu_parallel + if (ascending) { std::sort(data.begin(), data.end(), std::less<>()); - else + } + else { std::sort(data.begin(), data.end(), std::greater<>()); + } } /*! @@ -270,7 +273,7 @@ void minmax(RangeT& local, const RangeT& remote, bool keepSmall) noexcept { template void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { // Initially sort to create a half part of a bitonic sequence - fullSort(data, ascending(rank, 0)); + timeCall(TfullSort, fullSort, data, ascending(rank, 0)); // Sort network (O(N) iterations) for (size_t step = 0; step < static_cast(Processes); ++step) { @@ -280,9 +283,9 @@ void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { if ( isActive(rank, Processes) && isActive(part, Processes) ) { // Exchange with partner, keep nim-or-max and sort - O(N) - mpi.exchange(data.getActive(), data.getShadow(), part, step); - minmax(data.getActive(), data.getShadow(), ks); - elbowSort(data, ascending(rank, Processes)); + timeCall(Texchange, mpi.exchange, data.getActive(), data.getShadow(), part, step); + timeCall(Tminmax, minmax, data.getActive(), data.getShadow(), ks); + timeCall(TelbowSort, elbowSort, data, ascending(rank, Processes)); } } @@ -308,7 +311,7 @@ void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { template void distBitonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { // Initially sort to create a half part of a bitonic sequence - fullSort(data, ascending(rank, 0)); + timeCall(TfullSort, fullSort, data, ascending(rank, 0)); // Run through sort network using elbow-sort ( O(LogN * LogN) iterations ) auto p = static_cast(std::log2(Processes)); @@ -319,11 +322,11 @@ void distBitonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { auto part = partner(rank, step); auto ks = keepSmall(rank, part, depth); // Exchange with partner, keep nim-or-max - mpi.exchange(data.getActive(), data.getShadow(), part, (depth << 8) | step); - minmax(data.getActive(), data.getShadow(), ks); + timeCall(Texchange, mpi.exchange, data.getActive(), data.getShadow(), part, (depth << 8) | step); + timeCall(Tminmax, minmax, data.getActive(), data.getShadow(), ks); } // sort - O(N) - elbowSort (data, ascending(rank, depth)); + timeCall(TelbowSort, elbowSort, data, ascending(rank, depth)); } } diff --git a/homework_2/include/utils.hpp b/homework_2/include/utils.hpp index 5096731..19e2566 100644 --- a/homework_2/include/utils.hpp +++ b/homework_2/include/utils.hpp @@ -14,6 +14,7 @@ #include #include #include +//#include #include "config.h" @@ -286,7 +287,7 @@ struct Log { //! We provide logging via << operator template Log &operator<<(T &&t) { - if (session.verbose) { + if (config.verbose) { if (line_) { std::cout << "[Log]: " << t; line_ = false; @@ -299,7 +300,7 @@ struct Log { // overload for special end line handling Log &operator<<(Endl e) { (void) e; - if (session.verbose) { + if (config.verbose) { std::cout << '\n'; line_ = true; } @@ -317,39 +318,71 @@ extern Log logger; */ struct Timing { using Tpoint = std::chrono::steady_clock::time_point; + using Tduration = std::chrono::microseconds; using microseconds = std::chrono::microseconds; using milliseconds = std::chrono::milliseconds; using seconds = std::chrono::seconds; //! tool to mark the starting point - Tpoint start() noexcept { return start_ = std::chrono::steady_clock::now(); } + Tpoint start() noexcept { return mark_ = std::chrono::steady_clock::now(); } //! tool to mark the ending point - Tpoint stop() noexcept { return stop_ = std::chrono::steady_clock::now(); } + Tpoint stop() noexcept { + Tpoint now = std::chrono::steady_clock::now(); + duration_ += dt(now, mark_); + return now; + } - auto dt() noexcept { - return std::chrono::duration_cast(stop_ - start_).count(); + Tduration dt(Tpoint t2, Tpoint t1) noexcept { + return std::chrono::duration_cast(t2 - t1); } //! tool to print the time interval - void print_dt(const char *what) noexcept { - if (session.timing) { - auto t = stop_ - start_; - if (std::chrono::duration_cast(t).count() < 10000) - std::cout << "[Timing]: " << what << ": " - << std::to_string(std::chrono::duration_cast(t).count()) << " [usec]\n"; - else if (std::chrono::duration_cast(t).count() < 10000) - std::cout << "[Timing]: " << what << ": " - << std::to_string(std::chrono::duration_cast(t).count()) << " [msec]\n"; - else - std::cout << "[Timing]: " << what << ": " - << std::to_string(std::chrono::duration_cast(t).count()) << " [sec]\n"; - } + void print_duration(const char *what, mpi_id_t rank) noexcept { + if (std::chrono::duration_cast(duration_).count() < 10000) + std::cout << "[Timing] (Rank " << rank << ") " << what << ": " + << std::to_string(std::chrono::duration_cast(duration_).count()) << " [usec]\n"; + else if (std::chrono::duration_cast(duration_).count() < 10000) + std::cout << "[Timing] (Rank " << rank << ") " << what << ": " + << std::to_string(std::chrono::duration_cast(duration_).count()) << " [msec]\n"; + else + std::cout << "[Timing] (Rank " << rank << ") " << what << ": " + << std::to_string(std::chrono::duration_cast(duration_).count()) << " [sec]\n"; + } private: - Tpoint start_; - Tpoint stop_; + Tpoint mark_{}; + Tduration duration_{}; }; +/*! + * Utility high level function to forward a function call to std::invoke and measure + * the excecution time + * + * @tparam Func The function type + * @tparam Args The argument + * @param func + * @param args + * @return + */ + + +#define timeCall(Tim, Func, ...) \ +Tim.start(); \ +Func(__VA_ARGS__); \ +Tim.stop(); \ + + +//template +//auto timeCall_r(Ret& ret, Func&& func, Args&&... args) { +// Timing timer; +// +// timer.start(); +// ret = std::invoke(std::forward(func), std::forward(args)...); +// timer.stop(); +// +// return timer.dt(); +//} + #endif /* UTILS_HPP_ */ diff --git a/homework_2/src/distsort.cpp b/homework_2/src/distsort.cpp index de735b5..fe3df1a 100644 --- a/homework_2/src/distsort.cpp +++ b/homework_2/src/distsort.cpp @@ -9,6 +9,7 @@ #include "utils.hpp" #include "distsort.hpp" +Timing TfullSort, Texchange, Tminmax, TelbowSort; bool isActive(mpi_id_t node, size_t nodes) { if (!((nodes > 0) && diff --git a/homework_2/src/main.cpp b/homework_2/src/main.cpp index 738364c..8f763ac 100644 --- a/homework_2/src/main.cpp +++ b/homework_2/src/main.cpp @@ -17,12 +17,12 @@ #include "distsort.hpp" -// Global session data -session_t session; +// Global config data +config_t config; MPI_t<> mpi; distBuffer_t Data; Log logger; -Timing timer; +Timing Ttotal; /*! * A small command line argument parser @@ -37,23 +37,23 @@ bool get_options(int argc, char* argv[]){ if (arg == "-q" || arg == "--array-size") { if (i+1 < argc) { - session.arraySize = 1 << atoi(argv[++i]); + config.arraySize = 1 << atoi(argv[++i]); } else { status = false; } } else if (arg == "--validation") { - session.validation = true; + config.validation = true; } else if (arg == "--ndebug") { - session.ndebug = true; + config.ndebug = true; } - else if (arg == "-t" || arg == "--timing") { - session.timing = true; + else if (arg == "--perf") { + config.perf = true; } else if (arg == "-v" || arg == "--verbose") { - session.verbose = true; + config.verbose = true; } else if (arg == "-h" || arg == "--help") { std::cout << "distbitonic/distbubbletonic - A distributed bitonic sort\n\n"; @@ -65,6 +65,8 @@ bool get_options(int argc, char* argv[]){ std::cout << "Options:\n\n"; std::cout << " -q | --array-size \n"; std::cout << " Selects the array size according to size = 2^N\n\n"; + std::cout << " --par-sort\n"; + std::cout << " Request a parallel full sorting algorithm\n\n"; std::cout << " --validation\n"; std::cout << " Request a full validation at the end, performed by process rank 0\n\n"; std::cout << " --ndebug\n"; @@ -164,11 +166,11 @@ int main(int argc, char* argv[]) try { #else volatile bool sleep_wait = true; #endif - while (sleep_wait && !session.ndebug) + while (sleep_wait && !config.ndebug) sleep(1); #endif - logger << "Initialize local array of " << session.arraySize << " elements" << logger.endl; + logger << "Initialize local array of " << config.arraySize << " elements" << logger.endl; std::random_device rd; // Mersenne seeded from hw if possible. range: [type_min, type_max] std::mt19937 gen(rd()); std::uniform_int_distribution dis( @@ -176,24 +178,31 @@ int main(int argc, char* argv[]) try { std::numeric_limits::max() ); // Fill vector - Data.resize(session.arraySize); + Data.resize(config.arraySize); std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); }); if (mpi.rank() == 0) logger << "Starting distributed sorting ... "; - timer.start(); + Ttotal.start(); #if CODE_VERSION == BUBBLETONIC distBubbletonic(Data, mpi.size(), mpi.rank()); #else distBitonic (Data, mpi.size(), mpi.rank()); #endif - timer.stop(); + Ttotal.stop(); if (mpi.rank() == 0) logger << " Done." << logger.endl; std::string timeMsg = "rank " + std::to_string(mpi.rank()); - timer.print_dt(timeMsg.c_str()); - if (session.validation) { + + if (config.perf) { + Ttotal.print_duration("Total ", mpi.rank()); + TfullSort.print_duration("Full-Sort ", mpi.rank()); + Texchange.print_duration("Exchange ", mpi.rank()); + Tminmax.print_duration("Min-Max ", mpi.rank()); + TelbowSort.print_duration("Elbow-Sort", mpi.rank()); + } + if (config.validation) { // If requested, we have the chance to fail! if (mpi.rank() == 0) std::cout << "Results validation ...";