@@ -42,7 +42,7 @@ BUILD_DIR := bin | |||||
OBJ_DIR := $(BUILD_DIR)/obj | OBJ_DIR := $(BUILD_DIR)/obj | ||||
DEP_DIR := $(BUILD_DIR)/.dep | DEP_DIR := $(BUILD_DIR)/.dep | ||||
OUTPUT_DIR := out-rc3a | |||||
OUTPUT_DIR := out-rc3b | |||||
# ========== Compiler settings ========== | # ========== Compiler settings ========== | ||||
# Compiler flags for debug and release | # Compiler flags for debug and release | ||||
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 20 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 20 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 23 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 23 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 25 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 25 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 27 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 27 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 20 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 20 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 23 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 23 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 25 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 25 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 27 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 27 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 20 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 20 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 23 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 23 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 25 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 25 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 27 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 27 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 20 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 20 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 23 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 23 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 25 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8 |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 27 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8 |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 20 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 20 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 23 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 23 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 25 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8 |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 27 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8 |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 20 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 20 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 23 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 23 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 25 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 25 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 27 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 27 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 20 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 20 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 23 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 23 --perf --validation |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 25 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8 |
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n | |||||
# Suppress CUDA-aware support is disabled warning | # Suppress CUDA-aware support is disabled warning | ||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0 | export OMPI_MCA_opal_warn_on_missing_libcuda=0 | ||||
srun ./out-rc3a/distbitonic -q 27 --perf --validation | |||||
srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8 |
@@ -25,7 +25,14 @@ | |||||
#endif | #endif | ||||
// Default Data size (in case -q <N> is not present) | // Default Data size (in case -q <N> is not present) | ||||
#define DEFAULT_DATA_SIZE (1 << 16) | |||||
static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16; | |||||
// The maximum MPI size we support | |||||
static constexpr size_t MAX_MPI_SIZE = 1024UL; | |||||
// The maximum pipeline size we support | |||||
static constexpr size_t MAX_PIPELINE_SIZE = 64UL; | |||||
/*! | /*! | ||||
* Value type selection | * Value type selection | ||||
@@ -46,6 +53,7 @@ using distValue_t = uint32_t; | |||||
*/ | */ | ||||
struct config_t { | struct config_t { | ||||
size_t arraySize{DEFAULT_DATA_SIZE}; //!< The array size of the local data to sort. | size_t arraySize{DEFAULT_DATA_SIZE}; //!< The array size of the local data to sort. | ||||
size_t pipeline{1UL}; //!< Pipeline stages | |||||
bool validation{false}; //!< Request a full validation at the end, performed by process rank 0. | bool validation{false}; //!< Request a full validation at the end, performed by process rank 0. | ||||
bool ndebug{false}; //!< Skips debug trap on DEBUG builds. | bool ndebug{false}; //!< Skips debug trap on DEBUG builds. | ||||
bool perf{false}; //!< Enable performance timing measurements and prints. | bool perf{false}; //!< Enable performance timing measurements and prints. | ||||
@@ -233,24 +233,24 @@ void elbowSort(ShadowedDataT& data, bool ascending) noexcept { | |||||
/*!
 * Element-wise min/max selection between two sequences of equal length.
 *
 * For every index i in [0, count) the local item is replaced by either
 * min(local[i], remote[i]) or max(local[i], remote[i]). If the initial sequences
 * are bitonic, then the result is a bitonic sequence too.
 *
 * @tparam ValueT     The underlying type of the sequences
 *
 * @param local       [ValueT*]        Pointer to the local sequence (updated in place)
 * @param remote      [const ValueT*]  Pointer to the remote sequence (copied locally by MPI)
 * @param count       [size_t]         The number of items to process
 * @param keepSmall   [bool]           Flag to indicate if we keep the small items in local sequence
 *
 * @note
 *    Both pointers must address at least @c count items. A @c count of zero is a no-op.
 */
template<typename ValueT>
void keepMinOrMax(ValueT* local, const ValueT* remote, size_t count, bool keepSmall) noexcept {
    // The selection is the same for the whole range, so decide once outside
    // the element loop instead of testing the flag for every item.
    if (keepSmall) {
        std::transform(
            local, local + count,
            remote,
            local,
            [](const ValueT& a, const ValueT& b){ return std::min(a, b); });
    }
    else {
        std::transform(
            local, local + count,
            remote,
            local,
            [](const ValueT& a, const ValueT& b){ return std::max(a, b); });
    }
}
@@ -259,6 +259,60 @@ void keepMinOrMax(RangeT& local, const RangeT& remote, bool keepSmall) noexcept | |||||
* ============================== Sort algorithms ============================== | * ============================== Sort algorithms ============================== | ||||
*/ | */ | ||||
/*! | |||||
* A small tag generator tool to provide consistent encoding to tag communication | |||||
* | |||||
* @param depth The current algorithmic depth[bitonic] of the communication, if any | |||||
* @param step The current step on the current depth | |||||
* @param stage The stage of the pipeline. | |||||
* @return The tag to use. | |||||
* | |||||
* @note | |||||
* In case we call this function outside of the pipeline loop, we can omit | |||||
* @c stage argument and use the return value as starting tag for every communication | |||||
* of the pipeline loop. We need to increase the tags for each communication of | |||||
* the pipeline loop though! | |||||
*/ | |||||
size_t tagGenerator(size_t depth, size_t step, size_t stage = 0); | |||||
/*! | |||||
* A pipeline loop for mixing min-max process with mpi data exchange | |||||
* | |||||
* @tparam ShadowedDataT A Shadowed buffer type with random access iterator. | |||||
* | |||||
* @param data [ShadowedDataT&] Reference to the data to exchange | |||||
* @param partner [mpi_id_t] The partner for the exchange | |||||
* @param keepSmall [bool] Flag to indicate if we keep the small values | |||||
* @param tag [int] The init tag to use for the loop. | |||||
* | |||||
* @note | |||||
* The @c tag is increased inside the pipeline loop for each different data exchange | |||||
*/ | |||||
template<typename ShadowedDataT> | |||||
void exchangePipeline(ShadowedDataT& data, mpi_id_t partner, bool keepSmall, int tag) { | |||||
using Value_t = typename ShadowedDataT::value_type; | |||||
// Init counters and pointers | |||||
size_t count = data.size() / config.pipeline; | |||||
Value_t* active = data.getActive().data(); | |||||
Value_t* shadow = data.getShadow().data(); | |||||
// Pipeline | |||||
Texchange.start(); | |||||
mpi.exchange_start(active, shadow, count, partner, tag); | |||||
for (size_t stage = 0 ; stage < config.pipeline ; active += count, shadow += count) { | |||||
// Wait previous chunk | |||||
mpi.exchange_wait(); Texchange.stop(); | |||||
if (++stage < config.pipeline) { | |||||
// Start next chunk if there is a next one | |||||
Texchange.start(); | |||||
mpi.exchange_start(active + count, shadow + count, count, partner, ++tag); | |||||
} | |||||
// process the arrived data | |||||
timeCall(Tminmax, keepMinOrMax, active, shadow, count, keepSmall); | |||||
} | |||||
} | |||||
/*! | /*! | ||||
* A distributed version of the Bubbletonic sort algorithm. | * A distributed version of the Bubbletonic sort algorithm. | ||||
* | * | ||||
@@ -284,9 +338,8 @@ void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { | |||||
if ( isActive(rank, Processes) && | if ( isActive(rank, Processes) && | ||||
isActive(part, Processes) ) { | isActive(part, Processes) ) { | ||||
// Exchange with partner, keep min-or-max and sort - O(N) | // Exchange with partner, keep min-or-max and sort - O(N) | ||||
int tag = static_cast<int>(step); | |||||
timeCall(Texchange, mpi.exchange_data, data.getActive(), data.getShadow(), part, tag); | |||||
timeCall(Tminmax, keepMinOrMax, data.getActive(), data.getShadow(), ks); | |||||
int tag = static_cast<int>(tagGenerator(0, step)); | |||||
exchangePipeline(data, part, ks, tag); | |||||
timeCall(TelbowSort, elbowSort, data, ascending<SortMode::Bubbletonic>(rank, Processes)); | timeCall(TelbowSort, elbowSort, data, ascending<SortMode::Bubbletonic>(rank, Processes)); | ||||
} | } | ||||
} | } | ||||
@@ -324,9 +377,8 @@ void distBitonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) { | |||||
auto part = partner<SortMode::Bitonic>(rank, step); | auto part = partner<SortMode::Bitonic>(rank, step); | ||||
auto ks = keepSmall<SortMode::Bitonic>(rank, part, depth); | auto ks = keepSmall<SortMode::Bitonic>(rank, part, depth); | ||||
// Exchange with partner, keep min-or-max | // Exchange with partner, keep min-or-max | ||||
int tag = static_cast<int>( (2*p*depth) + step ); | |||||
timeCall(Texchange, mpi.exchange_data, data.getActive(), data.getShadow(), part, tag); | |||||
timeCall(Tminmax, keepMinOrMax, data.getActive(), data.getShadow(), ks); | |||||
int tag = static_cast<int>(tagGenerator(depth, step)); | |||||
exchangePipeline(data, part, ks, tag); | |||||
} | } | ||||
// sort - O(N) | // sort - O(N) | ||||
timeCall(TelbowSort, elbowSort, data, ascending<SortMode::Bitonic>(rank, depth)); | timeCall(TelbowSort, elbowSort, data, ascending<SortMode::Bitonic>(rank, depth)); | ||||
@@ -65,6 +65,10 @@ struct MPI_t { | |||||
mpi_throw(err, "(MPI) MPI_Comm_rank() - "); | mpi_throw(err, "(MPI) MPI_Comm_rank() - "); | ||||
size_ = static_cast<ID_t>(size_value); | size_ = static_cast<ID_t>(size_value); | ||||
rank_ = static_cast<ID_t>(rank_value); | rank_ = static_cast<ID_t>(rank_value); | ||||
if (size_ > static_cast<ID_t>(MAX_MPI_SIZE)) | |||||
throw std::runtime_error( | |||||
"(MPI) size - Not supported number of nodes [over " + std::to_string(MAX_MPI_SIZE) + "]\n" | |||||
); | |||||
// Get the name of the processor | // Get the name of the processor | ||||
char processor_name[MPI_MAX_PROCESSOR_NAME]; | char processor_name[MPI_MAX_PROCESSOR_NAME]; | ||||
@@ -74,63 +78,56 @@ struct MPI_t { | |||||
name_ = std::string (processor_name, name_len); | name_ = std::string (processor_name, name_len); | ||||
} | } | ||||
/*! | /*! | ||||
* Exchange data with partner as part of the sorting network of both bubbletonic or bitonic | |||||
* sorting algorithms. | |||||
* Initiate a data exchange data with partner using non-blocking Isend-Irecv, as part of the | |||||
* sorting network of both bubbletonic or bitonic sorting algorithms. | |||||
* | * | ||||
* This function matches a transmit and a receive in order for fully exchanged data between | * This function matches a transmit and a receive in order for fully exchanged data between | ||||
* current node and partner. | * current node and partner. | ||||
* @note | |||||
* This call MUST paired with exchange_wait() for each MPI_t object. | |||||
* Calling 2 consecutive exchange_start() for the same MPI_t object is undefined. | |||||
* | * | ||||
* @tparam T The inner valur type used in buffer | |||||
* @tparam ValueT The underlying value type used in buffers | |||||
* | * | ||||
* @param ldata [std::vector<T>] Reference to local data to send | |||||
* @param rdata [std::vector<T>] Reference to buffer to receive data from partner | |||||
* @param partner [mpi_id_t] The partner for the exchange | |||||
* @param tag [int] The tag to use for the MPI communication | |||||
* @param ldata [const ValueT*] Pointer to local data to send | |||||
* @param rdata [ValueT*] Pointer to buffer to receive data from partner | |||||
* @param count [size_t] The number of data to exchange | |||||
* @param partner [mpi_id_t] The partner for the exchange | |||||
* @param tag [int] The tag to use for the MPI communication | |||||
*/ | */ | ||||
template<typename T> | |||||
void exchange_data(const std::vector<T>& ldata, std::vector<T>& rdata, ID_t partner, int tag) { | |||||
template<typename ValueT> | |||||
void exchange_start(const ValueT* ldata, ValueT* rdata, size_t count, ID_t partner, int tag) { | |||||
if (tag < 0) | if (tag < 0) | ||||
throw std::runtime_error("(MPI) exchange_data() [tag] - Out of bound"); | throw std::runtime_error("(MPI) exchange_data() [tag] - Out of bound"); | ||||
MPI_Datatype datatype = MPI_TypeMapper<T>::getType(); | |||||
int count = static_cast<int>(ldata.size()); | |||||
MPI_Status status; | |||||
MPI_Datatype datatype = MPI_TypeMapper<ValueT>::getType(); | |||||
int err; | int err; | ||||
if ((err = MPI_Sendrecv( | |||||
ldata.data(), count, datatype, partner, tag, | |||||
rdata.data(), count, datatype, partner, tag, | |||||
MPI_COMM_WORLD, &status | |||||
)) != MPI_SUCCESS) | |||||
mpi_throw(err, "(MPI) MPI_Sendrecv() [data] - "); | |||||
err = MPI_Isend(ldata, count, datatype, partner, tag, MPI_COMM_WORLD, &handle_tx); | |||||
if (err != MPI_SUCCESS) | |||||
mpi_throw(err, "(MPI) MPI_Isend() - "); | |||||
err = MPI_Irecv(rdata, count, datatype, partner, tag, MPI_COMM_WORLD, &handle_rx); | |||||
if (err != MPI_SUCCESS) | |||||
mpi_throw(err, "(MPI) MPI_Irecv() - "); | |||||
} | } | ||||
/*! | /*! | ||||
* Exchange a data object with partner as part of the sorting network of both bubbletonic | |||||
* or bitonic sorting algorithms. | |||||
* | |||||
* This function matches a transmit and a receive in order for fully exchanged the data object | |||||
* between current node and partner. | |||||
* Block wait for the completion of the previously called exchange_start() | |||||
* | * | ||||
* @tparam T The object type | |||||
* | |||||
* @param local [const T&] Reference to the local object to send | |||||
* @param remote [T&] Reference to the object to receive data from partner | |||||
* @param partner [mpi_id_t] The partner for the exchange | |||||
* @param tag [int] The tag to use for the MPI communication | |||||
* @note | |||||
* This call MUST paired with exchange_start() for each MPI_t object. | |||||
* Calling 2 consecutive exchange_wait() for the same MPI_t object is undefined. | |||||
*/ | */ | ||||
template<typename T> | |||||
void exchange_it(const T& local, T& remote, ID_t partner, int tag) { | |||||
if (tag < 0) | |||||
throw std::runtime_error("(MPI) exchange_it() [tag] - Out of bound"); | |||||
void exchange_wait() { | |||||
MPI_Status status; | MPI_Status status; | ||||
int err; | int err; | ||||
if ((err = MPI_Sendrecv( | |||||
&local, sizeof(T), MPI_BYTE, partner, tag, | |||||
&remote, sizeof(T), MPI_BYTE, partner, tag, | |||||
MPI_COMM_WORLD, &status | |||||
)) != MPI_SUCCESS) | |||||
mpi_throw(err, "(MPI) MPI_Sendrecv() [item] - "); | |||||
if ((err = MPI_Wait(&handle_tx, &status)) != MPI_SUCCESS) | |||||
mpi_throw(err, "(MPI) MPI_Wait() [send] - "); | |||||
if ((err = MPI_Wait(&handle_rx, &status)) != MPI_SUCCESS) | |||||
mpi_throw(err, "(MPI) MPI_Wait() [recv] - "); | |||||
} | } | ||||
// Accessors | // Accessors | ||||
@@ -181,6 +178,8 @@ private: | |||||
ID_t size_{}; //!< MPI total size of the execution | ID_t size_{}; //!< MPI total size of the execution | ||||
std::string name_{}; //!< The name of the local machine | std::string name_{}; //!< The name of the local machine | ||||
bool initialized_{}; //!< RAII helper flag | bool initialized_{}; //!< RAII helper flag | ||||
MPI_Request handle_tx{}; //!< MPI async exchange handler for Transmission | |||||
MPI_Request handle_rx{}; //!< MPI async exchange handler for Receptions | |||||
}; | }; | ||||
/* | /* | ||||
@@ -377,9 +376,13 @@ struct Timing { | |||||
else if (std::chrono::duration_cast<milliseconds>(duration_).count() < 10000) | else if (std::chrono::duration_cast<milliseconds>(duration_).count() < 10000) | ||||
std::cout << "[Timing] (Rank " << rank << ") " << what << ": " | std::cout << "[Timing] (Rank " << rank << ") " << what << ": " | ||||
<< std::to_string(std::chrono::duration_cast<milliseconds>(duration_).count()) << " [msec]\n"; | << std::to_string(std::chrono::duration_cast<milliseconds>(duration_).count()) << " [msec]\n"; | ||||
else | |||||
std::cout << "[Timing] (Rank " << rank << ") " << what << ": " | |||||
<< std::to_string(std::chrono::duration_cast<seconds>(duration_).count()) << " [sec]\n"; | |||||
else { | |||||
char stime[26]; // fit ulong | |||||
auto sec = std::chrono::duration_cast<seconds>(duration_).count(); | |||||
auto msec = (std::chrono::duration_cast<milliseconds>(duration_).count() % 1000) / 10; // keep 2 digit | |||||
std::sprintf(stime, "%ld.%1ld", sec, msec); | |||||
std::cout << "[Timing] (Rank " << rank << ") " << what << ": " << stime << " [sec]\n"; | |||||
} | |||||
} | } | ||||
@@ -402,4 +405,17 @@ private: | |||||
Tim.stop(); \ | Tim.stop(); \ | ||||
/*!
 * A utility to check if a number is power of two
 *
 * Zero and negative values are never a power of two. The sign check is done
 * first, so @c x - 1 is only evaluated for positive values and can not overflow
 * for the minimum value of a signed type.
 *
 * @tparam Integral     The integral type of the number to check
 * @param x             The number to check
 * @return              True if it is power of 2, false otherwise
 */
template <typename Integral>
constexpr inline bool isPowerOfTwo(Integral x) noexcept {
    // A power of two has exactly one bit set, so clearing the lowest set bit leaves zero
    return (x > 0) && !(x & (x - 1));
}
#endif /* UTILS_HPP_ */ | #endif /* UTILS_HPP_ */ |
@@ -23,3 +23,13 @@ bool isActive(mpi_id_t node, size_t nodes) { | |||||
return (node >= 0) && (node < static_cast<mpi_id_t>(nodes)); | return (node >= 0) && (node < static_cast<mpi_id_t>(nodes)); | ||||
} | } | ||||
size_t tagGenerator(size_t depth, size_t step, size_t stage) { | |||||
auto stage_bits = static_cast<uint32_t>(std::log2(MAX_PIPELINE_SIZE)); | |||||
auto step_bits = static_cast<uint32_t>(std::log2(MAX_MPI_SIZE)); | |||||
// ^ We use MPI_SIZE room for steps to fit the bubbletonic version | |||||
size_t tag = stage | |||||
| (step << stage_bits) | |||||
| (depth << (stage_bits + step_bits)); | |||||
return tag; | |||||
} |
@@ -17,7 +17,7 @@ | |||||
#include "distsort.hpp" | #include "distsort.hpp" | ||||
// Global config data | |||||
// Global session data | |||||
config_t config; | config_t config; | ||||
MPI_t<> mpi; | MPI_t<> mpi; | ||||
distBuffer_t Data; | distBuffer_t Data; | ||||
@@ -43,36 +43,49 @@ bool get_options(int argc, char* argv[]){ | |||||
status = false; | status = false; | ||||
} | } | ||||
} | } | ||||
else if (arg == "--pipeline") { | |||||
if (i+1 < argc) { | |||||
auto stages = atoi(argv[++i]); | |||||
if (isPowerOfTwo(stages) && stages <= static_cast<int>(MAX_PIPELINE_SIZE)) | |||||
config.pipeline = stages; | |||||
else | |||||
status = false; | |||||
} | |||||
else { | |||||
status = false; | |||||
} | |||||
} | |||||
else if (arg == "--validation") { | else if (arg == "--validation") { | ||||
config.validation = true; | config.validation = true; | ||||
} | } | ||||
else if (arg == "--ndebug") { | |||||
config.ndebug = true; | |||||
} | |||||
else if (arg == "--perf") { | else if (arg == "--perf") { | ||||
config.perf = true; | config.perf = true; | ||||
} | } | ||||
else if (arg == "--ndebug") { | |||||
config.ndebug = true; | |||||
} | |||||
else if (arg == "-v" || arg == "--verbose") { | else if (arg == "-v" || arg == "--verbose") { | ||||
config.verbose = true; | config.verbose = true; | ||||
} | } | ||||
else if (arg == "-h" || arg == "--help") { | else if (arg == "-h" || arg == "--help") { | ||||
std::cout << "distbitonic/distbubbletonic - A distributed bitonic sort\n\n"; | std::cout << "distbitonic/distbubbletonic - A distributed bitonic sort\n\n"; | ||||
std::cout << "distbitonic -q <N> [--validation] [--ndebug] [-v]\n"; | |||||
std::cout << "distbitonic -q <N> [--pipeline N] [--validation] [--ndebug] [-v]\n"; | |||||
std::cout << "distbitonic -h\n"; | std::cout << "distbitonic -h\n"; | ||||
std::cout << "distbubbletonic -q <N> [--validation] [--ndebug] [-v]\n"; | |||||
std::cout << "distbubbletonic -q <N> [--pipeline N] [--validation] [--ndebug] [-v]\n"; | |||||
std::cout << "distbubbletonic -h\n"; | std::cout << "distbubbletonic -h\n"; | ||||
std::cout << '\n'; | std::cout << '\n'; | ||||
std::cout << "Options:\n\n"; | std::cout << "Options:\n\n"; | ||||
std::cout << " -q | --array-size <N>\n"; | std::cout << " -q | --array-size <N>\n"; | ||||
std::cout << " Selects the array size according to size = 2^N\n\n"; | std::cout << " Selects the array size according to size = 2^N\n\n"; | ||||
std::cout << " --par-sort\n"; | |||||
std::cout << " Request a parallel full sorting algorithm\n\n"; | |||||
std::cout << " --pipeline <N>\n"; | |||||
std::cout << " Request a pipeline of <N> stages for exchange-minmax\n"; | |||||
std::cout << " N must be power of 2 up to " << MAX_PIPELINE_SIZE << "\n\n"; | |||||
std::cout << " --validation\n"; | std::cout << " --validation\n"; | ||||
std::cout << " Request a full validation at the end, performed by process rank 0\n\n"; | std::cout << " Request a full validation at the end, performed by process rank 0\n\n"; | ||||
std::cout << " --perf\n"; | |||||
std::cout << " Request performance timing measurements to stdout.\n\n"; | |||||
std::cout << " --ndebug\n"; | std::cout << " --ndebug\n"; | ||||
std::cout << " Skip debug breakpoint when on debug build.\n\n"; | std::cout << " Skip debug breakpoint when on debug build.\n\n"; | ||||
std::cout << " -t | --timing\n"; | |||||
std::cout << " Request timing measurements output to stdout.\n\n"; | |||||
std::cout << " -v | --verbose\n"; | std::cout << " -v | --verbose\n"; | ||||
std::cout << " Request a more verbose output to stdout.\n\n"; | std::cout << " Request a more verbose output to stdout.\n\n"; | ||||
std::cout << " -h | --help\n"; | std::cout << " -h | --help\n"; | ||||
@@ -126,6 +126,49 @@ TEST_F(TMPIdistSort, distBubbletonic_test2) { | |||||
} | } | ||||
} | } | ||||
/* | |||||
* MPI: SysTest (acceptance) | |||||
* Each process executes distBubbletonic for uin32_t [1 << 16] with pipeline | |||||
*/ | |||||
TEST_F(TMPIdistSort, distBubbletonic_test3) { | |||||
// Create and fill vector | |||||
using tsValue_t = uint32_t; // Test parameters | |||||
size_t ts_buffer_size = 1 << 16; | |||||
ShadowedVec_t<tsValue_t> ts_Data; | |||||
std::uniform_int_distribution<tsValue_t > dis( | |||||
std::numeric_limits<tsValue_t>::min(), | |||||
std::numeric_limits<tsValue_t>::max() | |||||
); | |||||
ts_Data.resize(ts_buffer_size); | |||||
std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); }); | |||||
// Set pipeline | |||||
config.pipeline = 8; | |||||
// Execute function under test in all processes | |||||
distBubbletonic(ts_Data, ts_mpi.size(), ts_mpi.rank()); | |||||
// Local min and max | |||||
auto local_min = *std::min_element(ts_Data.begin(), ts_Data.end()); | |||||
auto local_max = *std::max_element(ts_Data.begin(), ts_Data.end()); | |||||
// Gather min/max to rank 0 | |||||
std::vector<tsValue_t> global_mins(ts_mpi.size()); | |||||
std::vector<tsValue_t> global_maxes(ts_mpi.size()); | |||||
MPI_Datatype datatype = MPI_TypeMapper<tsValue_t>::getType(); | |||||
MPI_Gather(&local_min, 1, datatype, global_mins.data(), 1, datatype, 0, MPI_COMM_WORLD); | |||||
MPI_Gather(&local_max, 1, datatype, global_maxes.data(), 1, datatype, 0, MPI_COMM_WORLD); | |||||
// Check results | |||||
EXPECT_EQ(std::is_sorted(ts_Data.begin(), ts_Data.end()), true); | |||||
if (ts_mpi.rank() == 0) { | |||||
for (size_t i = 1; i < global_mins.size(); ++i) { | |||||
EXPECT_LE(global_maxes[i - 1], global_mins[i]); | |||||
} | |||||
} | |||||
} | |||||
/* | /* | ||||
* MPI: SysTest (acceptance) | * MPI: SysTest (acceptance) | ||||
@@ -209,3 +252,46 @@ TEST_F(TMPIdistSort, distBitonic_test2) { | |||||
} | } | ||||
} | } | ||||
/* | |||||
* MPI: SysTest (acceptance) | |||||
* Each process executes distBitonic for uin32_t [1 << 16] with pipeline | |||||
*/ | |||||
TEST_F(TMPIdistSort, distBitonic_test3) { | |||||
// Create and fill vector | |||||
using tsValue_t = uint32_t; // Test parameters | |||||
size_t ts_buffer_size = 1 << 16; | |||||
ShadowedVec_t<tsValue_t> ts_Data; | |||||
std::uniform_int_distribution<tsValue_t > dis( | |||||
std::numeric_limits<tsValue_t>::min(), | |||||
std::numeric_limits<tsValue_t>::max() | |||||
); | |||||
ts_Data.resize(ts_buffer_size); | |||||
std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); }); | |||||
// Set pipeline | |||||
config.pipeline = 8; | |||||
// Execute function under test in all processes | |||||
distBitonic(ts_Data, ts_mpi.size(), ts_mpi.rank()); | |||||
// Local min and max | |||||
auto local_min = *std::min_element(ts_Data.begin(), ts_Data.end()); | |||||
auto local_max = *std::max_element(ts_Data.begin(), ts_Data.end()); | |||||
// Gather min/max to rank 0 | |||||
std::vector<tsValue_t> global_mins(ts_mpi.size()); | |||||
std::vector<tsValue_t> global_maxes(ts_mpi.size()); | |||||
MPI_Datatype datatype = MPI_TypeMapper<tsValue_t>::getType(); | |||||
MPI_Gather(&local_min, 1, datatype, global_mins.data(), 1, datatype, 0, MPI_COMM_WORLD); | |||||
MPI_Gather(&local_max, 1, datatype, global_maxes.data(), 1, datatype, 0, MPI_COMM_WORLD); | |||||
// Check results | |||||
EXPECT_EQ(std::is_sorted(ts_Data.begin(), ts_Data.end()), true); | |||||
if (ts_mpi.rank() == 0) { | |||||
for (size_t i = 1; i < global_mins.size(); ++i) { | |||||
EXPECT_LE(global_maxes[i - 1], global_mins[i]); | |||||
} | |||||
} | |||||
} |