@@ -45,18 +45,18 @@ DEP_DIR := $(BUILD_DIR)/.dep | |||||
# ========== Compiler settings ========== | # ========== Compiler settings ========== | ||||
# Compiler flags for debug and release | # Compiler flags for debug and release | ||||
DEB_CFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c11 | |||||
REL_CFLAGS := -Wall -Wextra -O3 -std=c11 | |||||
DEB_CXXFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c++17 | |||||
REL_CXXFLAGS := -Wall -Wextra -O3 -std=c++17 | |||||
DEB_CFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c11 #-fopenmp | |||||
REL_CFLAGS := -Wall -Wextra -O3 -std=c11 #-fopenmp | |||||
DEB_CXXFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c++17 #-fopenmp | |||||
REL_CXXFLAGS := -Wall -Wextra -O3 -std=c++17 #-fopenmp | |||||
# Pre-defines | # Pre-defines | ||||
# PRE_DEFS := MYCAB=1729 SUPER_MODE | # PRE_DEFS := MYCAB=1729 SUPER_MODE | ||||
PRE_DEFS := | |||||
PRE_DEFS := #_GLIBCXX_PARALLEL | |||||
# ============== Linker settings ============== | # ============== Linker settings ============== | ||||
# Linker flags (example: -pthread -lm) | # Linker flags (example: -pthread -lm) | ||||
LDFLAGS := -pthread | |||||
LDFLAGS := -pthread # -fopenmp | |||||
# Map output file | # Map output file | ||||
MAP_FILE := output.map | MAP_FILE := output.map | ||||
@@ -228,7 +228,7 @@ perfbitonic: CC := mpicc | |||||
perfbitonic: CXX := mpic++ | perfbitonic: CXX := mpic++ | ||||
perfbitonic: CFLAGS := $(REL_CFLAGS) -g -DCODE_VERSION=BITONIC | perfbitonic: CFLAGS := $(REL_CFLAGS) -g -DCODE_VERSION=BITONIC | ||||
perfbitonic: CXXFLAGS := $(REL_CXXFLAGS) -g -DCODE_VERSION=BITONIC | perfbitonic: CXXFLAGS := $(REL_CXXFLAGS) -g -DCODE_VERSION=BITONIC | ||||
perfbitonic: TARGET := distbitonic | |||||
perfbitonic: TARGET := perfbitonic | |||||
perfbitonic: $(BUILD_DIR)/$(TARGET) | perfbitonic: $(BUILD_DIR)/$(TARGET) | ||||
@mkdir -p out | @mkdir -p out | ||||
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | ||||
@@ -1,28 +1,28 @@ | |||||
#! /usr/bin/env bash | #! /usr/bin/env bash | ||||
#SBATCH --partition=batch | #SBATCH --partition=batch | ||||
#SBATCH --nodes=1 | |||||
#SBATCH --ntasks-per-node=4 | #SBATCH --ntasks-per-node=4 | ||||
#SBATCH --nodes=4 | |||||
#SBATCH --cpus-per-task=4 | |||||
#SBATCH --time=1:00 | #SBATCH --time=1:00 | ||||
# Use this as follows | |||||
# $> sbatch <this file> | |||||
# | |||||
# NOTE: | |||||
# First compile with | |||||
# $> make -j hpc-build | |||||
# | |||||
module load gcc/9.2.0 openmpi/4.0.3 | module load gcc/9.2.0 openmpi/4.0.3 | ||||
# Note: | # Note: | ||||
# The above versions are matching w/ my system's | # The above versions are matching w/ my system's | ||||
# versions, thus making compiling/debugging easier. | # versions, thus making compiling/debugging easier. | ||||
# Uncomment the following to compile the project | |||||
# Note: | |||||
# In order for the MPI to run properly (or to run entirely), we need | |||||
# to compile it in hpc using the loaded modules above. | |||||
# Note: | |||||
# Consider moving this to a separate stage before sbatch the tasks. | |||||
# make distbitonic | |||||
# make distbubbletonic | |||||
# Suppress unused UCX_ROOT warning | # Suppress unused UCX_ROOT warning | ||||
export UCX_WARN_UNUSED_ENV_VARS=n | export UCX_WARN_UNUSED_ENV_VARS=n | ||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./bin/dist_v05 | |||||
srun ./out/distbitonic -q 24 --perf --validation |
@@ -44,15 +44,15 @@ using distValue_t = uint32_t; | |||||
/*! | /*! | ||||
* Configuration options for each invocation of the executable | * Configuration options for each invocation of the executable | ||||
*/ | */ | ||||
struct session_t { | |||||
struct config_t { | |||||
size_t arraySize{DEFAULT_DATA_SIZE}; //!< | size_t arraySize{DEFAULT_DATA_SIZE}; //!< | ||||
bool validation{false}; //!< Request a full validation at the end, performed by process rank 0 | bool validation{false}; //!< Request a full validation at the end, performed by process rank 0 | ||||
bool ndebug{false}; //!< Skips debug trap on DEBUG builds | bool ndebug{false}; //!< Skips debug trap on DEBUG builds | ||||
bool timing{false}; //!< Enable timing measurements and prints | |||||
bool perf{false}; //!< Enable performance timing measurements and prints | |||||
bool verbose{false}; //!< Flag to enable verbose output to stdout | bool verbose{false}; //!< Flag to enable verbose output to stdout | ||||
}; | }; | ||||
extern session_t session; | |||||
extern config_t config; | |||||
#endif /* CONFIG_H_ */ | #endif /* CONFIG_H_ */ |
@@ -12,6 +12,7 @@ | |||||
#include <vector> | #include <vector> | ||||
#include <algorithm> | #include <algorithm> | ||||
//#include <parallel/algorithm> | |||||
#include <cmath> | #include <cmath> | ||||
#include <cstdint> | #include <cstdint> | ||||
#if !defined DEBUG | #if !defined DEBUG | ||||
@@ -20,8 +21,8 @@ | |||||
#include <cassert> | #include <cassert> | ||||
#include "utils.hpp" | #include "utils.hpp" | ||||
#include "config.h" | |||||
extern Timing TfullSort, Texchange, Tminmax, TelbowSort; | |||||
/*! | /*! | ||||
* Enumerator for the different versions of the sorting method | * Enumerator for the different versions of the sorting method | ||||
@@ -159,11 +160,13 @@ bool isActive(mpi_id_t node, size_t nodes); | |||||
*/ | */ | ||||
template<typename RangeT> | template<typename RangeT> | ||||
void fullSort(RangeT& data, bool ascending) noexcept { | void fullSort(RangeT& data, bool ascending) noexcept { | ||||
// Use introsort from stdlib++ here, unless ... | |||||
if (ascending) | |||||
// Use introsort from stdlib++ here, unless ... __gnu_parallel | |||||
if (ascending) { | |||||
std::sort(data.begin(), data.end(), std::less<>()); | std::sort(data.begin(), data.end(), std::less<>()); | ||||
else | |||||
} | |||||
else { | |||||
std::sort(data.begin(), data.end(), std::greater<>()); | std::sort(data.begin(), data.end(), std::greater<>()); | ||||
} | |||||
} | } | ||||
/*! | /*! | ||||
@@ -270,7 +273,7 @@ void minmax(RangeT& local, const RangeT& remote, bool keepSmall) noexcept { | |||||
template<typename ShadowedDataT> | template<typename ShadowedDataT> | ||||
void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { | void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { | ||||
// Initially sort to create a half part of a bitonic sequence | // Initially sort to create a half part of a bitonic sequence | ||||
fullSort(data, ascending<SortMode::Bubbletonic>(rank, 0)); | |||||
timeCall(TfullSort, fullSort, data, ascending<SortMode::Bubbletonic>(rank, 0)); | |||||
// Sort network (O(N) iterations) | // Sort network (O(N) iterations) | ||||
for (size_t step = 0; step < static_cast<size_t>(Processes); ++step) { | for (size_t step = 0; step < static_cast<size_t>(Processes); ++step) { | ||||
@@ -280,9 +283,9 @@ void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { | |||||
if ( isActive(rank, Processes) && | if ( isActive(rank, Processes) && | ||||
isActive(part, Processes) ) { | isActive(part, Processes) ) { | ||||
// Exchange with partner, keep min-or-max and sort - O(N) | // Exchange with partner, keep min-or-max and sort - O(N) | ||||
mpi.exchange(data.getActive(), data.getShadow(), part, step); | |||||
minmax(data.getActive(), data.getShadow(), ks); | |||||
elbowSort(data, ascending<SortMode::Bubbletonic>(rank, Processes)); | |||||
timeCall(Texchange, mpi.exchange, data.getActive(), data.getShadow(), part, step); | |||||
timeCall(Tminmax, minmax, data.getActive(), data.getShadow(), ks); | |||||
timeCall(TelbowSort, elbowSort, data, ascending<SortMode::Bubbletonic>(rank, Processes)); | |||||
} | } | ||||
} | } | ||||
@@ -308,7 +311,7 @@ void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { | |||||
template<typename ShadowedDataT> | template<typename ShadowedDataT> | ||||
void distBitonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { | void distBitonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { | ||||
// Initially sort to create a half part of a bitonic sequence | // Initially sort to create a half part of a bitonic sequence | ||||
fullSort(data, ascending<SortMode::Bitonic>(rank, 0)); | |||||
timeCall(TfullSort, fullSort, data, ascending<SortMode::Bitonic>(rank, 0)); | |||||
// Run through sort network using elbow-sort ( O(LogN * LogN) iterations ) | // Run through sort network using elbow-sort ( O(LogN * LogN) iterations ) | ||||
auto p = static_cast<uint32_t>(std::log2(Processes)); | auto p = static_cast<uint32_t>(std::log2(Processes)); | ||||
@@ -319,11 +322,11 @@ void distBitonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { | |||||
auto part = partner<SortMode::Bitonic>(rank, step); | auto part = partner<SortMode::Bitonic>(rank, step); | ||||
auto ks = keepSmall<SortMode::Bitonic>(rank, part, depth); | auto ks = keepSmall<SortMode::Bitonic>(rank, part, depth); | ||||
// Exchange with partner, keep min-or-max | // Exchange with partner, keep min-or-max | ||||
mpi.exchange(data.getActive(), data.getShadow(), part, (depth << 8) | step); | |||||
minmax(data.getActive(), data.getShadow(), ks); | |||||
timeCall(Texchange, mpi.exchange, data.getActive(), data.getShadow(), part, (depth << 8) | step); | |||||
timeCall(Tminmax, minmax, data.getActive(), data.getShadow(), ks); | |||||
} | } | ||||
// sort - O(N) | // sort - O(N) | ||||
elbowSort (data, ascending<SortMode::Bitonic>(rank, depth)); | |||||
timeCall(TelbowSort, elbowSort, data, ascending<SortMode::Bitonic>(rank, depth)); | |||||
} | } | ||||
} | } | ||||
@@ -14,6 +14,7 @@ | |||||
#include <chrono> | #include <chrono> | ||||
#include <unistd.h> | #include <unistd.h> | ||||
#include <mpi.h> | #include <mpi.h> | ||||
//#include <functional> | |||||
#include "config.h" | #include "config.h" | ||||
@@ -286,7 +287,7 @@ struct Log { | |||||
//! We provide logging via << operator | //! We provide logging via << operator | ||||
template<typename T> | template<typename T> | ||||
Log &operator<<(T &&t) { | Log &operator<<(T &&t) { | ||||
if (session.verbose) { | |||||
if (config.verbose) { | |||||
if (line_) { | if (line_) { | ||||
std::cout << "[Log]: " << t; | std::cout << "[Log]: " << t; | ||||
line_ = false; | line_ = false; | ||||
@@ -299,7 +300,7 @@ struct Log { | |||||
// overload for special end line handling | // overload for special end line handling | ||||
Log &operator<<(Endl e) { | Log &operator<<(Endl e) { | ||||
(void) e; | (void) e; | ||||
if (session.verbose) { | |||||
if (config.verbose) { | |||||
std::cout << '\n'; | std::cout << '\n'; | ||||
line_ = true; | line_ = true; | ||||
} | } | ||||
@@ -317,39 +318,71 @@ extern Log logger; | |||||
*/ | */ | ||||
struct Timing { | struct Timing { | ||||
using Tpoint = std::chrono::steady_clock::time_point; | using Tpoint = std::chrono::steady_clock::time_point; | ||||
using Tduration = std::chrono::microseconds; | |||||
using microseconds = std::chrono::microseconds; | using microseconds = std::chrono::microseconds; | ||||
using milliseconds = std::chrono::milliseconds; | using milliseconds = std::chrono::milliseconds; | ||||
using seconds = std::chrono::seconds; | using seconds = std::chrono::seconds; | ||||
//! tool to mark the starting point | //! tool to mark the starting point | ||||
Tpoint start() noexcept { return start_ = std::chrono::steady_clock::now(); } | |||||
Tpoint start() noexcept { return mark_ = std::chrono::steady_clock::now(); } | |||||
//! tool to mark the ending point | //! tool to mark the ending point | ||||
Tpoint stop() noexcept { return stop_ = std::chrono::steady_clock::now(); } | |||||
Tpoint stop() noexcept { | |||||
Tpoint now = std::chrono::steady_clock::now(); | |||||
duration_ += dt(now, mark_); | |||||
return now; | |||||
} | |||||
auto dt() noexcept { | |||||
return std::chrono::duration_cast<std::chrono::microseconds>(stop_ - start_).count(); | |||||
Tduration dt(Tpoint t2, Tpoint t1) noexcept { | |||||
return std::chrono::duration_cast<Tduration>(t2 - t1); | |||||
} | } | ||||
//! tool to print the time interval | //! tool to print the time interval | ||||
void print_dt(const char *what) noexcept { | |||||
if (session.timing) { | |||||
auto t = stop_ - start_; | |||||
if (std::chrono::duration_cast<microseconds>(t).count() < 10000) | |||||
std::cout << "[Timing]: " << what << ": " | |||||
<< std::to_string(std::chrono::duration_cast<microseconds>(t).count()) << " [usec]\n"; | |||||
else if (std::chrono::duration_cast<milliseconds>(t).count() < 10000) | |||||
std::cout << "[Timing]: " << what << ": " | |||||
<< std::to_string(std::chrono::duration_cast<milliseconds>(t).count()) << " [msec]\n"; | |||||
else | |||||
std::cout << "[Timing]: " << what << ": " | |||||
<< std::to_string(std::chrono::duration_cast<seconds>(t).count()) << " [sec]\n"; | |||||
} | |||||
void print_duration(const char *what, mpi_id_t rank) noexcept { | |||||
if (std::chrono::duration_cast<microseconds>(duration_).count() < 10000) | |||||
std::cout << "[Timing] (Rank " << rank << ") " << what << ": " | |||||
<< std::to_string(std::chrono::duration_cast<microseconds>(duration_).count()) << " [usec]\n"; | |||||
else if (std::chrono::duration_cast<milliseconds>(duration_).count() < 10000) | |||||
std::cout << "[Timing] (Rank " << rank << ") " << what << ": " | |||||
<< std::to_string(std::chrono::duration_cast<milliseconds>(duration_).count()) << " [msec]\n"; | |||||
else | |||||
std::cout << "[Timing] (Rank " << rank << ") " << what << ": " | |||||
<< std::to_string(std::chrono::duration_cast<seconds>(duration_).count()) << " [sec]\n"; | |||||
} | } | ||||
private: | private: | ||||
Tpoint start_; | |||||
Tpoint stop_; | |||||
Tpoint mark_{}; | |||||
Tduration duration_{}; | |||||
}; | }; | ||||
/*!
 * Utility macro to make a function call and measure its execution time.
 *
 * The expansion calls \p Tim.start(), then \p Func with the given arguments and
 * finally \p Tim.stop(). Everything is wrapped in a `do { ... } while (0)` so that
 * the whole expansion behaves as ONE statement. This makes it safe to use as the
 * body of an un-braced `if`/`else`/loop and requires the usual trailing `;`
 * at the call site.
 *
 * @note  This is a macro and not a function template, so \p Func is pasted as-is
 *        in front of the argument list. Anything that can be called by name at the
 *        point of use is accepted (ex. a function template or `object.method`).
 * @note  The value returned by \p Func (if any) is discarded.
 * @note  \p Tim is expanded twice. Pass a plain object name, not an expression
 *        with side effects.
 *
 * @param Tim   The timer object to use. It must provide start() and stop()
 * @param Func  The function (or callable expression) to call
 * @param ...   The arguments to pass to \p Func
 */
#define timeCall(Tim, Func, ...)    \
   do {                             \
      (Tim).start();                \
      Func(__VA_ARGS__);            \
      (Tim).stop();                 \
   } while (0)
//template <typename Ret, typename Func, typename... Args> | |||||
//auto timeCall_r(Ret& ret, Func&& func, Args&&... args) { | |||||
// Timing timer; | |||||
// | |||||
// timer.start(); | |||||
// ret = std::invoke(std::forward<Func>(func), std::forward<Args>(args)...); | |||||
// timer.stop(); | |||||
// | |||||
// return timer.dt(); | |||||
//} | |||||
#endif /* UTILS_HPP_ */ | #endif /* UTILS_HPP_ */ |
@@ -9,6 +9,7 @@ | |||||
#include "utils.hpp" | #include "utils.hpp" | ||||
#include "distsort.hpp" | #include "distsort.hpp" | ||||
Timing TfullSort, Texchange, Tminmax, TelbowSort; | |||||
bool isActive(mpi_id_t node, size_t nodes) { | bool isActive(mpi_id_t node, size_t nodes) { | ||||
if (!((nodes > 0) && | if (!((nodes > 0) && | ||||
@@ -17,12 +17,12 @@ | |||||
#include "distsort.hpp" | #include "distsort.hpp" | ||||
// Global session data | |||||
session_t session; | |||||
// Global config data | |||||
config_t config; | |||||
MPI_t<> mpi; | MPI_t<> mpi; | ||||
distBuffer_t Data; | distBuffer_t Data; | ||||
Log logger; | Log logger; | ||||
Timing timer; | |||||
Timing Ttotal; | |||||
/*! | /*! | ||||
* A small command line argument parser | * A small command line argument parser | ||||
@@ -37,23 +37,23 @@ bool get_options(int argc, char* argv[]){ | |||||
if (arg == "-q" || arg == "--array-size") { | if (arg == "-q" || arg == "--array-size") { | ||||
if (i+1 < argc) { | if (i+1 < argc) { | ||||
session.arraySize = 1 << atoi(argv[++i]); | |||||
config.arraySize = 1 << atoi(argv[++i]); | |||||
} | } | ||||
else { | else { | ||||
status = false; | status = false; | ||||
} | } | ||||
} | } | ||||
else if (arg == "--validation") { | else if (arg == "--validation") { | ||||
session.validation = true; | |||||
config.validation = true; | |||||
} | } | ||||
else if (arg == "--ndebug") { | else if (arg == "--ndebug") { | ||||
session.ndebug = true; | |||||
config.ndebug = true; | |||||
} | } | ||||
else if (arg == "-t" || arg == "--timing") { | |||||
session.timing = true; | |||||
else if (arg == "--perf") { | |||||
config.perf = true; | |||||
} | } | ||||
else if (arg == "-v" || arg == "--verbose") { | else if (arg == "-v" || arg == "--verbose") { | ||||
session.verbose = true; | |||||
config.verbose = true; | |||||
} | } | ||||
else if (arg == "-h" || arg == "--help") { | else if (arg == "-h" || arg == "--help") { | ||||
std::cout << "distbitonic/distbubbletonic - A distributed bitonic sort\n\n"; | std::cout << "distbitonic/distbubbletonic - A distributed bitonic sort\n\n"; | ||||
@@ -65,6 +65,8 @@ bool get_options(int argc, char* argv[]){ | |||||
std::cout << "Options:\n\n"; | std::cout << "Options:\n\n"; | ||||
std::cout << " -q | --array-size <N>\n"; | std::cout << " -q | --array-size <N>\n"; | ||||
std::cout << " Selects the array size according to size = 2^N\n\n"; | std::cout << " Selects the array size according to size = 2^N\n\n"; | ||||
std::cout << " --par-sort\n"; | |||||
std::cout << " Request a parallel full sorting algorithm\n\n"; | |||||
std::cout << " --validation\n"; | std::cout << " --validation\n"; | ||||
std::cout << " Request a full validation at the end, performed by process rank 0\n\n"; | std::cout << " Request a full validation at the end, performed by process rank 0\n\n"; | ||||
std::cout << " --ndebug\n"; | std::cout << " --ndebug\n"; | ||||
@@ -164,11 +166,11 @@ int main(int argc, char* argv[]) try { | |||||
#else | #else | ||||
volatile bool sleep_wait = true; | volatile bool sleep_wait = true; | ||||
#endif | #endif | ||||
while (sleep_wait && !session.ndebug) | |||||
while (sleep_wait && !config.ndebug) | |||||
sleep(1); | sleep(1); | ||||
#endif | #endif | ||||
logger << "Initialize local array of " << session.arraySize << " elements" << logger.endl; | |||||
logger << "Initialize local array of " << config.arraySize << " elements" << logger.endl; | |||||
std::random_device rd; // Mersenne seeded from hw if possible. range: [type_min, type_max] | std::random_device rd; // Mersenne seeded from hw if possible. range: [type_min, type_max] | ||||
std::mt19937 gen(rd()); | std::mt19937 gen(rd()); | ||||
std::uniform_int_distribution<distValue_t > dis( | std::uniform_int_distribution<distValue_t > dis( | ||||
@@ -176,24 +178,31 @@ int main(int argc, char* argv[]) try { | |||||
std::numeric_limits<distValue_t>::max() | std::numeric_limits<distValue_t>::max() | ||||
); | ); | ||||
// Fill vector | // Fill vector | ||||
Data.resize(session.arraySize); | |||||
Data.resize(config.arraySize); | |||||
std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); }); | std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); }); | ||||
if (mpi.rank() == 0) | if (mpi.rank() == 0) | ||||
logger << "Starting distributed sorting ... "; | logger << "Starting distributed sorting ... "; | ||||
timer.start(); | |||||
Ttotal.start(); | |||||
#if CODE_VERSION == BUBBLETONIC | #if CODE_VERSION == BUBBLETONIC | ||||
distBubbletonic(Data, mpi.size(), mpi.rank()); | distBubbletonic(Data, mpi.size(), mpi.rank()); | ||||
#else | #else | ||||
distBitonic (Data, mpi.size(), mpi.rank()); | distBitonic (Data, mpi.size(), mpi.rank()); | ||||
#endif | #endif | ||||
timer.stop(); | |||||
Ttotal.stop(); | |||||
if (mpi.rank() == 0) | if (mpi.rank() == 0) | ||||
logger << " Done." << logger.endl; | logger << " Done." << logger.endl; | ||||
std::string timeMsg = "rank " + std::to_string(mpi.rank()); | std::string timeMsg = "rank " + std::to_string(mpi.rank()); | ||||
timer.print_dt(timeMsg.c_str()); | |||||
if (session.validation) { | |||||
if (config.perf) { | |||||
Ttotal.print_duration("Total ", mpi.rank()); | |||||
TfullSort.print_duration("Full-Sort ", mpi.rank()); | |||||
Texchange.print_duration("Exchange ", mpi.rank()); | |||||
Tminmax.print_duration("Min-Max ", mpi.rank()); | |||||
TelbowSort.print_duration("Elbow-Sort", mpi.rank()); | |||||
} | |||||
if (config.validation) { | |||||
// If requested, we have the chance to fail! | // If requested, we have the chance to fail! | ||||
if (mpi.rank() == 0) | if (mpi.rank() == 0) | ||||
std::cout << "Results validation ..."; | std::cout << "Results validation ..."; | ||||