HW2: RC3b - A MinMax-MPIexchange pipeline and small changes
This commit is contained in:
parent
c485d0db2d
commit
3bf4522448
@ -42,7 +42,7 @@ BUILD_DIR := bin
|
|||||||
OBJ_DIR := $(BUILD_DIR)/obj
|
OBJ_DIR := $(BUILD_DIR)/obj
|
||||||
DEP_DIR := $(BUILD_DIR)/.dep
|
DEP_DIR := $(BUILD_DIR)/.dep
|
||||||
|
|
||||||
OUTPUT_DIR := out-rc3a
|
OUTPUT_DIR := out-rc3b
|
||||||
|
|
||||||
# ========== Compiler settings ==========
|
# ========== Compiler settings ==========
|
||||||
# Compiler flags for debug and release
|
# Compiler flags for debug and release
|
||||||
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 20 --perf --validation
|
srun ./out-rc3b/distbitonic -q 20 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 23 --perf --validation
|
srun ./out-rc3b/distbitonic -q 23 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 25 --perf --validation
|
srun ./out-rc3b/distbitonic -q 25 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 27 --perf --validation
|
srun ./out-rc3b/distbitonic -q 27 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 20 --perf --validation
|
srun ./out-rc3b/distbitonic -q 20 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 23 --perf --validation
|
srun ./out-rc3b/distbitonic -q 23 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 25 --perf --validation
|
srun ./out-rc3b/distbitonic -q 25 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 27 --perf --validation
|
srun ./out-rc3b/distbitonic -q 27 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 20 --perf --validation
|
srun ./out-rc3b/distbitonic -q 20 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 23 --perf --validation
|
srun ./out-rc3b/distbitonic -q 23 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 25 --perf --validation
|
srun ./out-rc3b/distbitonic -q 25 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 27 --perf --validation
|
srun ./out-rc3b/distbitonic -q 27 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 20 --perf --validation
|
srun ./out-rc3b/distbitonic -q 20 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 23 --perf --validation
|
srun ./out-rc3b/distbitonic -q 23 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 25 --perf --validation
|
srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 27 --perf --validation
|
srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 20 --perf --validation
|
srun ./out-rc3b/distbitonic -q 20 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 23 --perf --validation
|
srun ./out-rc3b/distbitonic -q 23 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 25 --perf --validation
|
srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 27 --perf --validation
|
srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 20 --perf --validation
|
srun ./out-rc3b/distbitonic -q 20 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 23 --perf --validation
|
srun ./out-rc3b/distbitonic -q 23 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 25 --perf --validation
|
srun ./out-rc3b/distbitonic -q 25 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 27 --perf --validation
|
srun ./out-rc3b/distbitonic -q 27 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 20 --perf --validation
|
srun ./out-rc3b/distbitonic -q 20 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 23 --perf --validation
|
srun ./out-rc3b/distbitonic -q 23 --perf --validation
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 25 --perf --validation
|
srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8
|
@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
|
|||||||
# Suppress CUDA-aware support is disabled warning
|
# Suppress CUDA-aware support is disabled warning
|
||||||
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
export OMPI_MCA_opal_warn_on_missing_libcuda=0
|
||||||
|
|
||||||
srun ./out-rc3a/distbitonic -q 27 --perf --validation
|
srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8
|
@ -25,7 +25,14 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Default Data size (in case -q <N> is not present)
|
// Default Data size (in case -q <N> is not present)
|
||||||
#define DEFAULT_DATA_SIZE (1 << 16)
|
static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16;
|
||||||
|
|
||||||
|
// The maximum MPI size we support
|
||||||
|
static constexpr size_t MAX_MPI_SIZE = 1024UL;
|
||||||
|
|
||||||
|
// The maximum pipeline size we support
|
||||||
|
static constexpr size_t MAX_PIPELINE_SIZE = 64UL;
|
||||||
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* Value type selection
|
* Value type selection
|
||||||
@ -46,6 +53,7 @@ using distValue_t = uint32_t;
|
|||||||
*/
|
*/
|
||||||
struct config_t {
|
struct config_t {
|
||||||
size_t arraySize{DEFAULT_DATA_SIZE}; //!< The array size of the local data to sort.
|
size_t arraySize{DEFAULT_DATA_SIZE}; //!< The array size of the local data to sort.
|
||||||
|
size_t pipeline{1UL}; //!< Pipeline stages
|
||||||
bool validation{false}; //!< Request a full validation at the end, performed by process rank 0.
|
bool validation{false}; //!< Request a full validation at the end, performed by process rank 0.
|
||||||
bool ndebug{false}; //!< Skips debug trap on DEBUG builds.
|
bool ndebug{false}; //!< Skips debug trap on DEBUG builds.
|
||||||
bool perf{false}; //!< Enable performance timing measurements and prints.
|
bool perf{false}; //!< Enable performance timing measurements and prints.
|
||||||
|
@ -233,24 +233,24 @@ void elbowSort(ShadowedDataT& data, bool ascending) noexcept {
|
|||||||
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* Takes two sorted sequences where one is in increasing and the other is in decreasing order
|
* Takes two sequences and selects either the larger or the smaller items
|
||||||
* and selects either the larger or the smaller items in one-to-one comparison between them.
|
* in one-to-one comparison between them. If the initial sequences are bitonic, then
|
||||||
* The result is a bitonic sequence.
|
* the result is a bitonic sequence too!
|
||||||
*
|
*
|
||||||
* @tparam RangeT A range type with random access iterator
|
* @tparam ValueT The underlying type of the sequences
|
||||||
*
|
*
|
||||||
* @param local [RangeT] Reference to the local sequence
|
* @param local [ValueT*] Pointer to the local sequence
|
||||||
* @param remote [const RangeT] Reference to the remote sequence (copied locally by MPI)
|
* @param remote [const ValueT*] Pointer to the remote sequence (copied locally by MPI)
|
||||||
|
* @param count [size_t] The number of items to process
|
||||||
* @param keepSmall [bool] Flag to indicate if we keep the small items in local sequence
|
* @param keepSmall [bool] Flag to indicate if we keep the small items in local sequence
|
||||||
*/
|
*/
|
||||||
template<typename RangeT>
|
template<typename ValueT>
|
||||||
void keepMinOrMax(RangeT& local, const RangeT& remote, bool keepSmall) noexcept {
|
void keepMinOrMax(ValueT* local, const ValueT* remote, size_t count, bool keepSmall) noexcept {
|
||||||
using value_t = typename RangeT::value_type;
|
|
||||||
std::transform(
|
std::transform(
|
||||||
local.begin(), local.end(),
|
local, local + count,
|
||||||
remote.begin(),
|
remote,
|
||||||
local.begin(),
|
local,
|
||||||
[&keepSmall](const value_t& a, const value_t& b){
|
[&keepSmall](const ValueT& a, const ValueT& b){
|
||||||
return (keepSmall) ? std::min(a, b) : std::max(a, b);
|
return (keepSmall) ? std::min(a, b) : std::max(a, b);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -259,6 +259,60 @@ void keepMinOrMax(RangeT& local, const RangeT& remote, bool keepSmall) noexcept
|
|||||||
* ============================== Sort algorithms ==============================
|
* ============================== Sort algorithms ==============================
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* A small tag generator tool to provide consistent encoding to tag communication
|
||||||
|
*
|
||||||
|
* @param depth The current algorithmic depth[bitonic] of the communication, if any
|
||||||
|
* @param step The current step on the current depth
|
||||||
|
* @param stage The stage of the pipeline.
|
||||||
|
* @return The tag to use.
|
||||||
|
*
|
||||||
|
* @note
|
||||||
|
* In case we call this function outside of the pipeline loop, we can omit
|
||||||
|
* @c stage argument and use the return value as starting tag for every communication
|
||||||
|
* of the pipeline loop. We need to increase the tags for each communication of
|
||||||
|
* the pipeline loop though!
|
||||||
|
*/
|
||||||
|
size_t tagGenerator(size_t depth, size_t step, size_t stage = 0);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* A pipeline loop for mixing min-max process with mpi data exchange
|
||||||
|
*
|
||||||
|
* @tparam ShadowedDataT A Shadowed buffer type with random access iterator.
|
||||||
|
*
|
||||||
|
* @param data [ShadowedDataT&] Reference to the data to exchange
|
||||||
|
* @param partner [mpi_id_t] The partner for the exchange
|
||||||
|
* @param keepSmall [bool] Flag to indicate if we keep the small values
|
||||||
|
* @param tag [int] The init tag to use for the loop.
|
||||||
|
*
|
||||||
|
* @note
|
||||||
|
* The @c tag is increased inside the pipeline loop for each different data exchange
|
||||||
|
*/
|
||||||
|
template<typename ShadowedDataT>
|
||||||
|
void exchangePipeline(ShadowedDataT& data, mpi_id_t partner, bool keepSmall, int tag) {
|
||||||
|
using Value_t = typename ShadowedDataT::value_type;
|
||||||
|
|
||||||
|
// Init counters and pointers
|
||||||
|
size_t count = data.size() / config.pipeline;
|
||||||
|
Value_t* active = data.getActive().data();
|
||||||
|
Value_t* shadow = data.getShadow().data();
|
||||||
|
|
||||||
|
// Pipeline
|
||||||
|
Texchange.start();
|
||||||
|
mpi.exchange_start(active, shadow, count, partner, tag);
|
||||||
|
for (size_t stage = 0 ; stage < config.pipeline ; active += count, shadow += count) {
|
||||||
|
// Wait previous chunk
|
||||||
|
mpi.exchange_wait(); Texchange.stop();
|
||||||
|
if (++stage < config.pipeline) {
|
||||||
|
// Start next chunk if there is a next one
|
||||||
|
Texchange.start();
|
||||||
|
mpi.exchange_start(active + count, shadow + count, count, partner, ++tag);
|
||||||
|
}
|
||||||
|
// process the arrived data
|
||||||
|
timeCall(Tminmax, keepMinOrMax, active, shadow, count, keepSmall);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* A distributed version of the Bubbletonic sort algorithm.
|
* A distributed version of the Bubbletonic sort algorithm.
|
||||||
*
|
*
|
||||||
@ -284,9 +338,8 @@ void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) {
|
|||||||
if ( isActive(rank, Processes) &&
|
if ( isActive(rank, Processes) &&
|
||||||
isActive(part, Processes) ) {
|
isActive(part, Processes) ) {
|
||||||
// Exchange with partner, keep min-or-max and sort - O(N)
|
// Exchange with partner, keep min-or-max and sort - O(N)
|
||||||
int tag = static_cast<int>(step);
|
int tag = static_cast<int>(tagGenerator(0, step));
|
||||||
timeCall(Texchange, mpi.exchange_data, data.getActive(), data.getShadow(), part, tag);
|
exchangePipeline(data, part, ks, tag);
|
||||||
timeCall(Tminmax, keepMinOrMax, data.getActive(), data.getShadow(), ks);
|
|
||||||
timeCall(TelbowSort, elbowSort, data, ascending<SortMode::Bubbletonic>(rank, Processes));
|
timeCall(TelbowSort, elbowSort, data, ascending<SortMode::Bubbletonic>(rank, Processes));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -324,9 +377,8 @@ void distBitonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) {
|
|||||||
auto part = partner<SortMode::Bitonic>(rank, step);
|
auto part = partner<SortMode::Bitonic>(rank, step);
|
||||||
auto ks = keepSmall<SortMode::Bitonic>(rank, part, depth);
|
auto ks = keepSmall<SortMode::Bitonic>(rank, part, depth);
|
||||||
// Exchange with partner, keep min-or-max
|
// Exchange with partner, keep min-or-max
|
||||||
int tag = static_cast<int>( (2*p*depth) + step );
|
int tag = static_cast<int>(tagGenerator(depth, step));
|
||||||
timeCall(Texchange, mpi.exchange_data, data.getActive(), data.getShadow(), part, tag);
|
exchangePipeline(data, part, ks, tag);
|
||||||
timeCall(Tminmax, keepMinOrMax, data.getActive(), data.getShadow(), ks);
|
|
||||||
}
|
}
|
||||||
// sort - O(N)
|
// sort - O(N)
|
||||||
timeCall(TelbowSort, elbowSort, data, ascending<SortMode::Bitonic>(rank, depth));
|
timeCall(TelbowSort, elbowSort, data, ascending<SortMode::Bitonic>(rank, depth));
|
||||||
|
@ -65,6 +65,10 @@ struct MPI_t {
|
|||||||
mpi_throw(err, "(MPI) MPI_Comm_rank() - ");
|
mpi_throw(err, "(MPI) MPI_Comm_rank() - ");
|
||||||
size_ = static_cast<ID_t>(size_value);
|
size_ = static_cast<ID_t>(size_value);
|
||||||
rank_ = static_cast<ID_t>(rank_value);
|
rank_ = static_cast<ID_t>(rank_value);
|
||||||
|
if (size_ > static_cast<ID_t>(MAX_MPI_SIZE))
|
||||||
|
throw std::runtime_error(
|
||||||
|
"(MPI) size - Not supported number of nodes [over " + std::to_string(MAX_MPI_SIZE) + "]\n"
|
||||||
|
);
|
||||||
|
|
||||||
// Get the name of the processor
|
// Get the name of the processor
|
||||||
char processor_name[MPI_MAX_PROCESSOR_NAME];
|
char processor_name[MPI_MAX_PROCESSOR_NAME];
|
||||||
@ -74,63 +78,56 @@ struct MPI_t {
|
|||||||
name_ = std::string (processor_name, name_len);
|
name_ = std::string (processor_name, name_len);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* Exchange data with partner as part of the sorting network of both bubbletonic or bitonic
|
* Initiate a data exchange with partner using non-blocking Isend-Irecv, as part of the
|
||||||
* sorting algorithms.
|
* sorting network of both bubbletonic or bitonic sorting algorithms.
|
||||||
*
|
*
|
||||||
* This function matches a transmit and a receive in order for fully exchanged data between
|
* This function matches a transmit and a receive in order for fully exchanged data between
|
||||||
* current node and partner.
|
* current node and partner.
|
||||||
|
* @note
|
||||||
|
* This call MUST be paired with exchange_wait() for each MPI_t object.
|
||||||
|
* Calling 2 consecutive exchange_start() for the same MPI_t object is undefined.
|
||||||
*
|
*
|
||||||
* @tparam T The inner valur type used in buffer
|
* @tparam ValueT The underlying value type used in buffers
|
||||||
*
|
*
|
||||||
* @param ldata [std::vector<T>] Reference to local data to send
|
* @param ldata [const ValueT*] Pointer to local data to send
|
||||||
* @param rdata [std::vector<T>] Reference to buffer to receive data from partner
|
* @param rdata [ValueT*] Pointer to buffer to receive data from partner
|
||||||
|
* @param count [size_t] The number of data to exchange
|
||||||
* @param partner [mpi_id_t] The partner for the exchange
|
* @param partner [mpi_id_t] The partner for the exchange
|
||||||
* @param tag [int] The tag to use for the MPI communication
|
* @param tag [int] The tag to use for the MPI communication
|
||||||
*/
|
*/
|
||||||
template<typename T>
|
template<typename ValueT>
|
||||||
void exchange_data(const std::vector<T>& ldata, std::vector<T>& rdata, ID_t partner, int tag) {
|
void exchange_start(const ValueT* ldata, ValueT* rdata, size_t count, ID_t partner, int tag) {
|
||||||
if (tag < 0)
|
if (tag < 0)
|
||||||
throw std::runtime_error("(MPI) exchange_data() [tag] - Out of bound");
|
throw std::runtime_error("(MPI) exchange_data() [tag] - Out of bound");
|
||||||
|
|
||||||
MPI_Datatype datatype = MPI_TypeMapper<T>::getType();
|
MPI_Datatype datatype = MPI_TypeMapper<ValueT>::getType();
|
||||||
int count = static_cast<int>(ldata.size());
|
|
||||||
MPI_Status status;
|
|
||||||
int err;
|
int err;
|
||||||
if ((err = MPI_Sendrecv(
|
err = MPI_Isend(ldata, count, datatype, partner, tag, MPI_COMM_WORLD, &handle_tx);
|
||||||
ldata.data(), count, datatype, partner, tag,
|
if (err != MPI_SUCCESS)
|
||||||
rdata.data(), count, datatype, partner, tag,
|
mpi_throw(err, "(MPI) MPI_Isend() - ");
|
||||||
MPI_COMM_WORLD, &status
|
err = MPI_Irecv(rdata, count, datatype, partner, tag, MPI_COMM_WORLD, &handle_rx);
|
||||||
)) != MPI_SUCCESS)
|
if (err != MPI_SUCCESS)
|
||||||
mpi_throw(err, "(MPI) MPI_Sendrecv() [data] - ");
|
mpi_throw(err, "(MPI) MPI_Irecv() - ");
|
||||||
}
|
}
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* Exchange a data object with partner as part of the sorting network of both bubbletonic
|
* Block wait for the completion of the previously called exchange_start()
|
||||||
* or bitonic sorting algorithms.
|
|
||||||
*
|
*
|
||||||
* This function matches a transmit and a receive in order for fully exchanged the data object
|
* @note
|
||||||
* between current node and partner.
|
* This call MUST be paired with exchange_start() for each MPI_t object.
|
||||||
*
|
* Calling 2 consecutive exchange_wait() for the same MPI_t object is undefined.
|
||||||
* @tparam T The object type
|
|
||||||
*
|
|
||||||
* @param local [const T&] Reference to the local object to send
|
|
||||||
* @param remote [T&] Reference to the object to receive data from partner
|
|
||||||
* @param partner [mpi_id_t] The partner for the exchange
|
|
||||||
* @param tag [int] The tag to use for the MPI communication
|
|
||||||
*/
|
*/
|
||||||
template<typename T>
|
void exchange_wait() {
|
||||||
void exchange_it(const T& local, T& remote, ID_t partner, int tag) {
|
|
||||||
if (tag < 0)
|
|
||||||
throw std::runtime_error("(MPI) exchange_it() [tag] - Out of bound");
|
|
||||||
MPI_Status status;
|
MPI_Status status;
|
||||||
|
|
||||||
int err;
|
int err;
|
||||||
if ((err = MPI_Sendrecv(
|
if ((err = MPI_Wait(&handle_tx, &status)) != MPI_SUCCESS)
|
||||||
&local, sizeof(T), MPI_BYTE, partner, tag,
|
mpi_throw(err, "(MPI) MPI_Wait() [send] - ");
|
||||||
&remote, sizeof(T), MPI_BYTE, partner, tag,
|
|
||||||
MPI_COMM_WORLD, &status
|
if ((err = MPI_Wait(&handle_rx, &status)) != MPI_SUCCESS)
|
||||||
)) != MPI_SUCCESS)
|
mpi_throw(err, "(MPI) MPI_Wait() [recv] - ");
|
||||||
mpi_throw(err, "(MPI) MPI_Sendrecv() [item] - ");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Accessors
|
// Accessors
|
||||||
@ -181,6 +178,8 @@ private:
|
|||||||
ID_t size_{}; //!< MPI total size of the execution
|
ID_t size_{}; //!< MPI total size of the execution
|
||||||
std::string name_{}; //!< The name of the local machine
|
std::string name_{}; //!< The name of the local machine
|
||||||
bool initialized_{}; //!< RAII helper flag
|
bool initialized_{}; //!< RAII helper flag
|
||||||
|
MPI_Request handle_tx{}; //!< MPI async exchange handler for Transmission
|
||||||
|
MPI_Request handle_rx{}; //!< MPI async exchange handler for Receptions
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -377,9 +376,13 @@ struct Timing {
|
|||||||
else if (std::chrono::duration_cast<milliseconds>(duration_).count() < 10000)
|
else if (std::chrono::duration_cast<milliseconds>(duration_).count() < 10000)
|
||||||
std::cout << "[Timing] (Rank " << rank << ") " << what << ": "
|
std::cout << "[Timing] (Rank " << rank << ") " << what << ": "
|
||||||
<< std::to_string(std::chrono::duration_cast<milliseconds>(duration_).count()) << " [msec]\n";
|
<< std::to_string(std::chrono::duration_cast<milliseconds>(duration_).count()) << " [msec]\n";
|
||||||
else
|
else {
|
||||||
std::cout << "[Timing] (Rank " << rank << ") " << what << ": "
|
char stime[26]; // fit ulong
|
||||||
<< std::to_string(std::chrono::duration_cast<seconds>(duration_).count()) << " [sec]\n";
|
auto sec = std::chrono::duration_cast<seconds>(duration_).count();
|
||||||
|
auto msec = (std::chrono::duration_cast<milliseconds>(duration_).count() % 1000) / 10; // keep 2 digit
|
||||||
|
std::sprintf(stime, "%ld.%1ld", sec, msec);
|
||||||
|
std::cout << "[Timing] (Rank " << rank << ") " << what << ": " << stime << " [sec]\n";
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -402,4 +405,17 @@ private:
|
|||||||
Tim.stop(); \
|
Tim.stop(); \
|
||||||
|
|
||||||
|
|
||||||
|
/*!
 * A utility to check if a number is power of two
 *
 * @tparam Integral  The integral type of the number to check
 * @param x          The number to check
 * @return           True if it is power of 2, false otherwise.
 *                   Zero and negative values are never a power of two.
 */
template <typename Integral>
constexpr inline bool isPowerOfTwo(Integral x) noexcept {
   // Test the sign first: for the minimum value of a signed type (x - 1) overflows,
   // which is undefined behaviour and in practice used to report "power of two".
   // The short-circuit guarantees (x - 1) is only evaluated for x >= 1.
   return (x > 0) && ((x & (x - 1)) == 0);
}
|
||||||
|
|
||||||
|
|
||||||
#endif /* UTILS_HPP_ */
|
#endif /* UTILS_HPP_ */
|
||||||
|
@ -23,3 +23,13 @@ bool isActive(mpi_id_t node, size_t nodes) {
|
|||||||
return (node >= 0) && (node < static_cast<mpi_id_t>(nodes));
|
return (node >= 0) && (node < static_cast<mpi_id_t>(nodes));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t tagGenerator(size_t depth, size_t step, size_t stage) {
|
||||||
|
auto stage_bits = static_cast<uint32_t>(std::log2(MAX_PIPELINE_SIZE));
|
||||||
|
auto step_bits = static_cast<uint32_t>(std::log2(MAX_MPI_SIZE));
|
||||||
|
// ^ We use MPI_SIZE room for steps to fit the bubbletonic version
|
||||||
|
|
||||||
|
size_t tag = stage
|
||||||
|
| (step << stage_bits)
|
||||||
|
| (depth << (stage_bits + step_bits));
|
||||||
|
return tag;
|
||||||
|
}
|
||||||
|
@ -17,7 +17,7 @@
|
|||||||
#include "distsort.hpp"
|
#include "distsort.hpp"
|
||||||
|
|
||||||
|
|
||||||
// Global config data
|
// Global session data
|
||||||
config_t config;
|
config_t config;
|
||||||
MPI_t<> mpi;
|
MPI_t<> mpi;
|
||||||
distBuffer_t Data;
|
distBuffer_t Data;
|
||||||
@ -43,36 +43,49 @@ bool get_options(int argc, char* argv[]){
|
|||||||
status = false;
|
status = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else if (arg == "--pipeline") {
|
||||||
|
if (i+1 < argc) {
|
||||||
|
auto stages = atoi(argv[++i]);
|
||||||
|
if (isPowerOfTwo(stages) && stages <= static_cast<int>(MAX_PIPELINE_SIZE))
|
||||||
|
config.pipeline = stages;
|
||||||
|
else
|
||||||
|
status = false;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
status = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
else if (arg == "--validation") {
|
else if (arg == "--validation") {
|
||||||
config.validation = true;
|
config.validation = true;
|
||||||
}
|
}
|
||||||
else if (arg == "--ndebug") {
|
|
||||||
config.ndebug = true;
|
|
||||||
}
|
|
||||||
else if (arg == "--perf") {
|
else if (arg == "--perf") {
|
||||||
config.perf = true;
|
config.perf = true;
|
||||||
}
|
}
|
||||||
|
else if (arg == "--ndebug") {
|
||||||
|
config.ndebug = true;
|
||||||
|
}
|
||||||
else if (arg == "-v" || arg == "--verbose") {
|
else if (arg == "-v" || arg == "--verbose") {
|
||||||
config.verbose = true;
|
config.verbose = true;
|
||||||
}
|
}
|
||||||
else if (arg == "-h" || arg == "--help") {
|
else if (arg == "-h" || arg == "--help") {
|
||||||
std::cout << "distbitonic/distbubbletonic - A distributed bitonic sort\n\n";
|
std::cout << "distbitonic/distbubbletonic - A distributed bitonic sort\n\n";
|
||||||
std::cout << "distbitonic -q <N> [--validation] [--ndebug] [-v]\n";
|
std::cout << "distbitonic -q <N> [--pipeline N] [--validation] [--ndebug] [-v]\n";
|
||||||
std::cout << "distbitonic -h\n";
|
std::cout << "distbitonic -h\n";
|
||||||
std::cout << "distbubbletonic -q <N> [--validation] [--ndebug] [-v]\n";
|
std::cout << "distbubbletonic -q <N> [--pipeline N] [--validation] [--ndebug] [-v]\n";
|
||||||
std::cout << "distbubbletonic -h\n";
|
std::cout << "distbubbletonic -h\n";
|
||||||
std::cout << '\n';
|
std::cout << '\n';
|
||||||
std::cout << "Options:\n\n";
|
std::cout << "Options:\n\n";
|
||||||
std::cout << " -q | --array-size <N>\n";
|
std::cout << " -q | --array-size <N>\n";
|
||||||
std::cout << " Selects the array size according to size = 2^N\n\n";
|
std::cout << " Selects the array size according to size = 2^N\n\n";
|
||||||
std::cout << " --par-sort\n";
|
std::cout << " --pipeline <N>\n";
|
||||||
std::cout << " Request a parallel full sorting algorithm\n\n";
|
std::cout << " Request a pipeline of <N> stages for exchange-minmax\n";
|
||||||
|
std::cout << " N must be power of 2 up to " << MAX_PIPELINE_SIZE << "\n\n";
|
||||||
std::cout << " --validation\n";
|
std::cout << " --validation\n";
|
||||||
std::cout << " Request a full validation at the end, performed by process rank 0\n\n";
|
std::cout << " Request a full validation at the end, performed by process rank 0\n\n";
|
||||||
|
std::cout << " --perf\n";
|
||||||
|
std::cout << " Request performance timing measurements to stdout.\n\n";
|
||||||
std::cout << " --ndebug\n";
|
std::cout << " --ndebug\n";
|
||||||
std::cout << " Skip debug breakpoint when on debug build.\n\n";
|
std::cout << " Skip debug breakpoint when on debug build.\n\n";
|
||||||
std::cout << " -t | --timing\n";
|
|
||||||
std::cout << " Request timing measurements output to stdout.\n\n";
|
|
||||||
std::cout << " -v | --verbose\n";
|
std::cout << " -v | --verbose\n";
|
||||||
std::cout << " Request a more verbose output to stdout.\n\n";
|
std::cout << " Request a more verbose output to stdout.\n\n";
|
||||||
std::cout << " -h | --help\n";
|
std::cout << " -h | --help\n";
|
||||||
|
@ -126,6 +126,49 @@ TEST_F(TMPIdistSort, distBubbletonic_test2) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* MPI: SysTest (acceptance)
|
||||||
|
* Each process executes distBubbletonic for uint32_t [1 << 16] with pipeline
|
||||||
|
*/
|
||||||
|
TEST_F(TMPIdistSort, distBubbletonic_test3) { // Acceptance test: distBubbletonic on random data with config.pipeline set to 8
|
||||||
|
// Build this process's input: ts_buffer_size values drawn from dis over the full tsValue_t range
|
||||||
|
using tsValue_t = uint32_t; // Test parameters: element type of the buffer under test
|
||||||
|
size_t ts_buffer_size = 1 << 16; // 2^16 = 65536 elements in the local buffer
|
||||||
|
|
||||||
|
ShadowedVec_t<tsValue_t> ts_Data;
|
||||||
|
std::uniform_int_distribution<tsValue_t > dis(
|
||||||
|
std::numeric_limits<tsValue_t>::min(),
|
||||||
|
std::numeric_limits<tsValue_t>::max()
|
||||||
|
);
|
||||||
|
ts_Data.resize(ts_buffer_size);
|
||||||
|
std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); }); // gen is not declared in this test; presumably a fixture/file-level random engine - confirm
|
||||||
|
|
||||||
|
// Set pipeline to 8 (presumably the setting the --pipeline option controls - confirm)
|
||||||
|
config.pipeline = 8; // NOTE(review): not restored after the test; confirm it cannot leak into later tests
|
||||||
|
|
||||||
|
// Execute function under test in all processes, passing ts_mpi.size() and ts_mpi.rank()
|
||||||
|
distBubbletonic(ts_Data, ts_mpi.size(), ts_mpi.rank());
|
||||||
|
|
||||||
|
// Local min and max of this process's data after the call
|
||||||
|
auto local_min = *std::min_element(ts_Data.begin(), ts_Data.end());
|
||||||
|
auto local_max = *std::max_element(ts_Data.begin(), ts_Data.end());
|
||||||
|
|
||||||
|
// Gather min/max to rank 0: one value per process into global_mins / global_maxes
|
||||||
|
std::vector<tsValue_t> global_mins(ts_mpi.size());
|
||||||
|
std::vector<tsValue_t> global_maxes(ts_mpi.size());
|
||||||
|
MPI_Datatype datatype = MPI_TypeMapper<tsValue_t>::getType(); // MPI datatype the project's MPI_TypeMapper reports for tsValue_t
|
||||||
|
|
||||||
|
MPI_Gather(&local_min, 1, datatype, global_mins.data(), 1, datatype, 0, MPI_COMM_WORLD);
|
||||||
|
MPI_Gather(&local_max, 1, datatype, global_maxes.data(), 1, datatype, 0, MPI_COMM_WORLD);
|
||||||
|
|
||||||
|
// Check results: every process verifies its own data is sorted; rank 0 also verifies the ordering between neighbouring ranks
|
||||||
|
EXPECT_EQ(std::is_sorted(ts_Data.begin(), ts_Data.end()), true);
|
||||||
|
if (ts_mpi.rank() == 0) {
|
||||||
|
for (size_t i = 1; i < global_mins.size(); ++i) {
|
||||||
|
EXPECT_LE(global_maxes[i - 1], global_mins[i]); // max gathered from rank i-1 must not exceed min gathered from rank i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* MPI: SysTest (acceptance)
|
* MPI: SysTest (acceptance)
|
||||||
@ -209,3 +252,46 @@ TEST_F(TMPIdistSort, distBitonic_test2) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* MPI: SysTest (acceptance)
|
||||||
|
* Each process executes distBitonic for uint32_t [1 << 16] with pipeline
|
||||||
|
*/
|
||||||
|
TEST_F(TMPIdistSort, distBitonic_test3) { // Acceptance test: distBitonic on random data with config.pipeline set to 8
|
||||||
|
// Build this process's input: ts_buffer_size values drawn from dis over the full tsValue_t range
|
||||||
|
using tsValue_t = uint32_t; // Test parameters: element type of the buffer under test
|
||||||
|
size_t ts_buffer_size = 1 << 16; // 2^16 = 65536 elements in the local buffer
|
||||||
|
|
||||||
|
ShadowedVec_t<tsValue_t> ts_Data;
|
||||||
|
std::uniform_int_distribution<tsValue_t > dis(
|
||||||
|
std::numeric_limits<tsValue_t>::min(),
|
||||||
|
std::numeric_limits<tsValue_t>::max()
|
||||||
|
);
|
||||||
|
ts_Data.resize(ts_buffer_size);
|
||||||
|
std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); }); // gen is not declared in this test; presumably a fixture/file-level random engine - confirm
|
||||||
|
|
||||||
|
// Set pipeline to 8 (presumably the setting the --pipeline option controls - confirm)
|
||||||
|
config.pipeline = 8; // NOTE(review): not restored after the test; confirm it cannot leak into later tests
|
||||||
|
|
||||||
|
// Execute function under test in all processes, passing ts_mpi.size() and ts_mpi.rank()
|
||||||
|
distBitonic(ts_Data, ts_mpi.size(), ts_mpi.rank());
|
||||||
|
|
||||||
|
// Local min and max of this process's data after the call
|
||||||
|
auto local_min = *std::min_element(ts_Data.begin(), ts_Data.end());
|
||||||
|
auto local_max = *std::max_element(ts_Data.begin(), ts_Data.end());
|
||||||
|
|
||||||
|
// Gather min/max to rank 0: one value per process into global_mins / global_maxes
|
||||||
|
std::vector<tsValue_t> global_mins(ts_mpi.size());
|
||||||
|
std::vector<tsValue_t> global_maxes(ts_mpi.size());
|
||||||
|
MPI_Datatype datatype = MPI_TypeMapper<tsValue_t>::getType(); // MPI datatype the project's MPI_TypeMapper reports for tsValue_t
|
||||||
|
|
||||||
|
MPI_Gather(&local_min, 1, datatype, global_mins.data(), 1, datatype, 0, MPI_COMM_WORLD);
|
||||||
|
MPI_Gather(&local_max, 1, datatype, global_maxes.data(), 1, datatype, 0, MPI_COMM_WORLD);
|
||||||
|
|
||||||
|
// Check results: every process verifies its own data is sorted; rank 0 also verifies the ordering between neighbouring ranks
|
||||||
|
EXPECT_EQ(std::is_sorted(ts_Data.begin(), ts_Data.end()), true);
|
||||||
|
if (ts_mpi.rank() == 0) {
|
||||||
|
for (size_t i = 1; i < global_mins.size(); ++i) {
|
||||||
|
EXPECT_LE(global_maxes[i - 1], global_mins[i]); // max gathered from rank i-1 must not exceed min gathered from rank i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user