@@ -45,18 +45,18 @@ DEP_DIR := $(BUILD_DIR)/.dep | |||
# ========== Compiler settings ========== | |||
# Compiler flags for debug and release | |||
DEB_CFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c11 | |||
REL_CFLAGS := -Wall -Wextra -O3 -std=c11 | |||
DEB_CXXFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c++17 | |||
REL_CXXFLAGS := -Wall -Wextra -O3 -std=c++17 | |||
DEB_CFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c11 #-fopenmp | |||
REL_CFLAGS := -Wall -Wextra -O3 -std=c11 #-fopenmp | |||
DEB_CXXFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c++17 #-fopenmp | |||
REL_CXXFLAGS := -Wall -Wextra -O3 -std=c++17 #-fopenmp | |||
# Pre-defines | |||
# PRE_DEFS := MYCAB=1729 SUPER_MODE | |||
PRE_DEFS := | |||
PRE_DEFS := #_GLIBCXX_PARALLEL | |||
# ============== Linker settings ============== | |||
# Linker flags (example: -pthread -lm) | |||
LDFLAGS := -pthread | |||
LDFLAGS := -pthread # -fopenmp | |||
# Map output file | |||
MAP_FILE := output.map | |||
@@ -228,7 +228,7 @@ perfbitonic: CC := mpicc | |||
perfbitonic: CXX := mpic++ | |||
perfbitonic: CFLAGS := $(REL_CFLAGS) -g -DCODE_VERSION=BITONIC | |||
perfbitonic: CXXFLAGS := $(REL_CXXFLAGS) -g -DCODE_VERSION=BITONIC | |||
perfbitonic: TARGET := distbitonic | |||
perfbitonic: TARGET := perfbitonic | |||
perfbitonic: $(BUILD_DIR)/$(TARGET) | |||
@mkdir -p out | |||
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | |||
@@ -1,28 +1,28 @@ | |||
#! /usr/bin/env bash | |||
#SBATCH --partition=batch | |||
#SBATCH --nodes=1 | |||
#SBATCH --ntasks-per-node=4 | |||
#SBATCH --nodes=4 | |||
#SBATCH --cpus-per-task=4 | |||
#SBATCH --time=1:00 | |||
# Use this as follows: | |||
# $> sbatch <this file> | |||
# | |||
# NOTE: | |||
# First compile with | |||
# $> make -j hpc-build | |||
# | |||
module load gcc/9.2.0 openmpi/4.0.3 | |||
# Note: | |||
# The above versions are matching w/ my system's | |||
# versions, thus making compiling/debugging easier. | |||
# Uncomment the following to compile the project | |||
# Note: | |||
# For the MPI executable to run properly (or to run at all), we need | |||
# to compile it on the HPC cluster using the modules loaded above. | |||
# Note: | |||
# Consider moving this to a separate stage before sbatch the tasks. | |||
# make distbitonic | |||
# make distbubbletonic | |||
# Suppress unused UCX_ROOT warning | |||
export UCX_WARN_UNUSED_ENV_VARS=n | |||
# Suppress CUDA-aware support is disabled warning | |||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | |||
srun ./bin/dist_v05 | |||
srun ./out/distbitonic -q 24 --perf --validation |
@@ -44,15 +44,15 @@ using distValue_t = uint32_t; | |||
/*! | |||
* Configuration options for each invocation of the executable. | |||
* Every field has a default and is overridden by the command line parser. | |||
*/ | |||
struct session_t { | |||
struct config_t { | |||
size_t arraySize{DEFAULT_DATA_SIZE}; //!< Number of elements in the local array (set by -q/--array-size as 2^N) | |||
bool validation{false}; //!< Request a full validation at the end, performed by process rank 0 | |||
bool ndebug{false}; //!< Skips debug trap on DEBUG builds | |||
bool timing{false}; //!< Enable timing measurements and prints | |||
bool perf{false}; //!< Enable performance timing measurements and prints | |||
bool verbose{false}; //!< Flag to enable verbose output to stdout | |||
}; | |||
extern session_t session; | |||
extern config_t config; | |||
#endif /* CONFIG_H_ */ |
@@ -12,6 +12,7 @@ | |||
#include <vector> | |||
#include <algorithm> | |||
//#include <parallel/algorithm> | |||
#include <cmath> | |||
#include <cstdint> | |||
#if !defined DEBUG | |||
@@ -20,8 +21,8 @@ | |||
#include <cassert> | |||
#include "utils.hpp" | |||
#include "config.h" | |||
extern Timing TfullSort, Texchange, Tminmax, TelbowSort; | |||
/*! | |||
* Enumerator for the different versions of the sorting method | |||
@@ -159,11 +160,13 @@ bool isActive(mpi_id_t node, size_t nodes); | |||
*/ | |||
template<typename RangeT> | |||
void fullSort(RangeT& data, bool ascending) noexcept { | |||
// Sorts the whole range [data.begin(), data.end()) in place with std::sort;
// the `ascending` flag only selects which comparator is passed.
// Use introsort from stdlib++ here, unless ... | |||
if (ascending) | |||
// Use introsort from stdlib++ here, unless ... __gnu_parallel | |||
if (ascending) { | |||
// std::less<>: smallest element ends up first
std::sort(data.begin(), data.end(), std::less<>()); | |||
else | |||
} | |||
else { | |||
// std::greater<>: largest element ends up first
std::sort(data.begin(), data.end(), std::greater<>()); | |||
} | |||
} | |||
/*! | |||
@@ -270,7 +273,7 @@ void minmax(RangeT& local, const RangeT& remote, bool keepSmall) noexcept { | |||
template<typename ShadowedDataT> | |||
void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { | |||
// Initially sort to create a half part of a bitonic sequence | |||
fullSort(data, ascending<SortMode::Bubbletonic>(rank, 0)); | |||
timeCall(TfullSort, fullSort, data, ascending<SortMode::Bubbletonic>(rank, 0)); | |||
// Sort network (O(N) iterations) | |||
for (size_t step = 0; step < static_cast<size_t>(Processes); ++step) { | |||
@@ -280,9 +283,9 @@ void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { | |||
if ( isActive(rank, Processes) && | |||
isActive(part, Processes) ) { | |||
// Exchange with partner, keep min-or-max and sort - O(N) | |||
mpi.exchange(data.getActive(), data.getShadow(), part, step); | |||
minmax(data.getActive(), data.getShadow(), ks); | |||
elbowSort(data, ascending<SortMode::Bubbletonic>(rank, Processes)); | |||
timeCall(Texchange, mpi.exchange, data.getActive(), data.getShadow(), part, step); | |||
timeCall(Tminmax, minmax, data.getActive(), data.getShadow(), ks); | |||
timeCall(TelbowSort, elbowSort, data, ascending<SortMode::Bubbletonic>(rank, Processes)); | |||
} | |||
} | |||
@@ -308,7 +311,7 @@ void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { | |||
template<typename ShadowedDataT> | |||
void distBitonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { | |||
// Initially sort to create a half part of a bitonic sequence | |||
fullSort(data, ascending<SortMode::Bitonic>(rank, 0)); | |||
timeCall(TfullSort, fullSort, data, ascending<SortMode::Bitonic>(rank, 0)); | |||
// Run through sort network using elbow-sort ( O(LogN * LogN) iterations ) | |||
auto p = static_cast<uint32_t>(std::log2(Processes)); | |||
@@ -319,11 +322,11 @@ void distBitonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { | |||
auto part = partner<SortMode::Bitonic>(rank, step); | |||
auto ks = keepSmall<SortMode::Bitonic>(rank, part, depth); | |||
// Exchange with partner, keep min-or-max | |||
mpi.exchange(data.getActive(), data.getShadow(), part, (depth << 8) | step); | |||
minmax(data.getActive(), data.getShadow(), ks); | |||
timeCall(Texchange, mpi.exchange, data.getActive(), data.getShadow(), part, (depth << 8) | step); | |||
timeCall(Tminmax, minmax, data.getActive(), data.getShadow(), ks); | |||
} | |||
// sort - O(N) | |||
elbowSort (data, ascending<SortMode::Bitonic>(rank, depth)); | |||
timeCall(TelbowSort, elbowSort, data, ascending<SortMode::Bitonic>(rank, depth)); | |||
} | |||
} | |||
@@ -14,6 +14,7 @@ | |||
#include <chrono> | |||
#include <unistd.h> | |||
#include <mpi.h> | |||
//#include <functional> | |||
#include "config.h" | |||
@@ -286,7 +287,7 @@ struct Log { | |||
//! We provide logging via << operator | |||
template<typename T> | |||
Log &operator<<(T &&t) { | |||
if (session.verbose) { | |||
if (config.verbose) { | |||
if (line_) { | |||
std::cout << "[Log]: " << t; | |||
line_ = false; | |||
@@ -299,7 +300,7 @@ struct Log { | |||
// overload for special end line handling | |||
Log &operator<<(Endl e) { | |||
(void) e; | |||
if (session.verbose) { | |||
if (config.verbose) { | |||
std::cout << '\n'; | |||
line_ = true; | |||
} | |||
@@ -317,39 +318,71 @@ extern Log logger; | |||
*/ | |||
struct Timing { | |||
using Tpoint = std::chrono::steady_clock::time_point; | |||
using Tduration = std::chrono::microseconds; | |||
using microseconds = std::chrono::microseconds; | |||
using milliseconds = std::chrono::milliseconds; | |||
using seconds = std::chrono::seconds; | |||
//! Marks the starting point: stores steady_clock::now() as the reference for the next stop() and returns it | |||
Tpoint start() noexcept { return start_ = std::chrono::steady_clock::now(); } | |||
Tpoint start() noexcept { return mark_ = std::chrono::steady_clock::now(); } | |||
//! Marks the ending point: adds the time elapsed since the last start() to the accumulated duration and returns now() | |||
Tpoint stop() noexcept { return stop_ = std::chrono::steady_clock::now(); } | |||
Tpoint stop() noexcept { | |||
Tpoint now = std::chrono::steady_clock::now(); | |||
// Accumulate, so repeated start()/stop() pairs on the same timer sum up
duration_ += dt(now, mark_); | |||
return now; | |||
} | |||
auto dt() noexcept { | |||
return std::chrono::duration_cast<std::chrono::microseconds>(stop_ - start_).count(); | |||
//! Returns the interval t2 - t1 converted to Tduration (microseconds)
Tduration dt(Tpoint t2, Tpoint t1) noexcept { | |||
return std::chrono::duration_cast<Tduration>(t2 - t1); | |||
} | |||
//! Prints the measured time to stdout; the unit is [usec] below 10000 usec, [msec] below 10000 msec, otherwise [sec] | |||
void print_dt(const char *what) noexcept { | |||
if (session.timing) { | |||
auto t = stop_ - start_; | |||
if (std::chrono::duration_cast<microseconds>(t).count() < 10000) | |||
std::cout << "[Timing]: " << what << ": " | |||
<< std::to_string(std::chrono::duration_cast<microseconds>(t).count()) << " [usec]\n"; | |||
else if (std::chrono::duration_cast<milliseconds>(t).count() < 10000) | |||
std::cout << "[Timing]: " << what << ": " | |||
<< std::to_string(std::chrono::duration_cast<milliseconds>(t).count()) << " [msec]\n"; | |||
else | |||
std::cout << "[Timing]: " << what << ": " | |||
<< std::to_string(std::chrono::duration_cast<seconds>(t).count()) << " [sec]\n"; | |||
} | |||
//! Prints the accumulated duration, labelled with `what` and the given rank, using the same unit thresholds
void print_duration(const char *what, mpi_id_t rank) noexcept { | |||
if (std::chrono::duration_cast<microseconds>(duration_).count() < 10000) | |||
std::cout << "[Timing] (Rank " << rank << ") " << what << ": " | |||
<< std::to_string(std::chrono::duration_cast<microseconds>(duration_).count()) << " [usec]\n"; | |||
else if (std::chrono::duration_cast<milliseconds>(duration_).count() < 10000) | |||
std::cout << "[Timing] (Rank " << rank << ") " << what << ": " | |||
<< std::to_string(std::chrono::duration_cast<milliseconds>(duration_).count()) << " [msec]\n"; | |||
else | |||
std::cout << "[Timing] (Rank " << rank << ") " << what << ": " | |||
<< std::to_string(std::chrono::duration_cast<seconds>(duration_).count()) << " [sec]\n"; | |||
} | |||
private: | |||
Tpoint start_; | |||
Tpoint stop_; | |||
//! Time point recorded by the last start()
Tpoint mark_{}; | |||
//! Sum of all start()..stop() intervals measured so far
Tduration duration_{}; | |||
}; | |||
/*!
 * Utility macro that invokes a callable and adds its execution time to a timer.
 *
 * Expands to `Tim.start(); Func(args...); Tim.stop();` wrapped in a single
 * `do { } while (0)` statement. The wrapper makes the macro behave like one
 * statement, so it is safe as the body of an un-braced if/else or loop, and
 * it must be followed by a semicolon at the call site.
 * Any value returned by the call is discarded.
 *
 * @param Tim   Timer object; must provide start() and stop()
 * @param Func  Callable expression, invoked as Func(args...)
 * @param ...   The arguments passed to Func
 */
#define timeCall(Tim, Func, ...)  \
   do {                           \
      (Tim).start();              \
      Func(__VA_ARGS__);          \
      (Tim).stop();               \
   } while (0)
//template <typename Ret, typename Func, typename... Args> | |||
//auto timeCall_r(Ret& ret, Func&& func, Args&&... args) { | |||
// Timing timer; | |||
// | |||
// timer.start(); | |||
// ret = std::invoke(std::forward<Func>(func), std::forward<Args>(args)...); | |||
// timer.stop(); | |||
// | |||
// return timer.dt(); | |||
//} | |||
#endif /* UTILS_HPP_ */ |
@@ -9,6 +9,7 @@ | |||
#include "utils.hpp" | |||
#include "distsort.hpp" | |||
Timing TfullSort, Texchange, Tminmax, TelbowSort; | |||
bool isActive(mpi_id_t node, size_t nodes) { | |||
if (!((nodes > 0) && | |||
@@ -17,12 +17,12 @@ | |||
#include "distsort.hpp" | |||
// Global session data | |||
session_t session; | |||
// Global config data | |||
config_t config; | |||
MPI_t<> mpi; | |||
distBuffer_t Data; | |||
Log logger; | |||
Timing timer; | |||
Timing Ttotal; | |||
/*! | |||
* A small command line argument parser | |||
@@ -37,23 +37,23 @@ bool get_options(int argc, char* argv[]){ | |||
if (arg == "-q" || arg == "--array-size") { | |||
if (i+1 < argc) { | |||
session.arraySize = 1 << atoi(argv[++i]); | |||
config.arraySize = 1 << atoi(argv[++i]); | |||
} | |||
else { | |||
status = false; | |||
} | |||
} | |||
else if (arg == "--validation") { | |||
session.validation = true; | |||
config.validation = true; | |||
} | |||
else if (arg == "--ndebug") { | |||
session.ndebug = true; | |||
config.ndebug = true; | |||
} | |||
else if (arg == "-t" || arg == "--timing") { | |||
session.timing = true; | |||
else if (arg == "--perf") { | |||
config.perf = true; | |||
} | |||
else if (arg == "-v" || arg == "--verbose") { | |||
session.verbose = true; | |||
config.verbose = true; | |||
} | |||
else if (arg == "-h" || arg == "--help") { | |||
std::cout << "distbitonic/distbubbletonic - A distributed bitonic sort\n\n"; | |||
@@ -65,6 +65,8 @@ bool get_options(int argc, char* argv[]){ | |||
std::cout << "Options:\n\n"; | |||
std::cout << " -q | --array-size <N>\n"; | |||
std::cout << " Selects the array size according to size = 2^N\n\n"; | |||
std::cout << " --par-sort\n"; | |||
std::cout << " Request a parallel full sorting algorithm\n\n"; | |||
std::cout << " --validation\n"; | |||
std::cout << " Request a full validation at the end, performed by process rank 0\n\n"; | |||
std::cout << " --ndebug\n"; | |||
@@ -164,11 +166,11 @@ int main(int argc, char* argv[]) try { | |||
#else | |||
volatile bool sleep_wait = true; | |||
#endif | |||
while (sleep_wait && !session.ndebug) | |||
while (sleep_wait && !config.ndebug) | |||
sleep(1); | |||
#endif | |||
logger << "Initialize local array of " << session.arraySize << " elements" << logger.endl; | |||
logger << "Initialize local array of " << config.arraySize << " elements" << logger.endl; | |||
std::random_device rd; // Mersenne seeded from hw if possible. range: [type_min, type_max] | |||
std::mt19937 gen(rd()); | |||
std::uniform_int_distribution<distValue_t > dis( | |||
@@ -176,24 +178,31 @@ int main(int argc, char* argv[]) try { | |||
std::numeric_limits<distValue_t>::max() | |||
); | |||
// Fill vector | |||
Data.resize(session.arraySize); | |||
Data.resize(config.arraySize); | |||
std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); }); | |||
if (mpi.rank() == 0) | |||
logger << "Starting distributed sorting ... "; | |||
timer.start(); | |||
Ttotal.start(); | |||
#if CODE_VERSION == BUBBLETONIC | |||
distBubbletonic(Data, mpi.size(), mpi.rank()); | |||
#else | |||
distBitonic (Data, mpi.size(), mpi.rank()); | |||
#endif | |||
timer.stop(); | |||
Ttotal.stop(); | |||
if (mpi.rank() == 0) | |||
logger << " Done." << logger.endl; | |||
std::string timeMsg = "rank " + std::to_string(mpi.rank()); | |||
timer.print_dt(timeMsg.c_str()); | |||
if (session.validation) { | |||
if (config.perf) { | |||
Ttotal.print_duration("Total ", mpi.rank()); | |||
TfullSort.print_duration("Full-Sort ", mpi.rank()); | |||
Texchange.print_duration("Exchange ", mpi.rank()); | |||
Tminmax.print_duration("Min-Max ", mpi.rank()); | |||
TelbowSort.print_duration("Elbow-Sort", mpi.rank()); | |||
} | |||
if (config.validation) { | |||
// If requested, we have the chance to fail! | |||
if (mpi.rank() == 0) | |||
std::cout << "Results validation ..."; | |||