ソースを参照

HW2: RC1 - Model version

tags/v2.0
コミット
1bd0cbb8d0
7個のファイルの変更116行の追加70行の削除
  1. +7
    -7
      homework_2/Makefile
  2. +11
    -11
      homework_2/hpc/btN1P4T4Q24.sh
  3. +3
    -3
      homework_2/include/config.h
  4. +15
    -12
      homework_2/include/distsort.hpp
  5. +54
    -21
      homework_2/include/utils.hpp
  6. +1
    -0
      homework_2/src/distsort.cpp
  7. +25
    -16
      homework_2/src/main.cpp

+ 7
- 7
homework_2/Makefile ファイルの表示

@@ -45,18 +45,18 @@ DEP_DIR := $(BUILD_DIR)/.dep

# ========== Compiler settings ==========
# Compiler flags for debug and release
DEB_CFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c11
REL_CFLAGS := -Wall -Wextra -O3 -std=c11
DEB_CXXFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c++17
REL_CXXFLAGS := -Wall -Wextra -O3 -std=c++17
DEB_CFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c11 #-fopenmp
REL_CFLAGS := -Wall -Wextra -O3 -std=c11 #-fopenmp
DEB_CXXFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c++17 #-fopenmp
REL_CXXFLAGS := -Wall -Wextra -O3 -std=c++17 #-fopenmp

# Pre-defines
# PRE_DEFS := MYCAB=1729 SUPER_MODE
PRE_DEFS :=
PRE_DEFS := #_GLIBCXX_PARALLEL

# ============== Linker settings ==============
# Linker flags (example: -pthread -lm)
LDFLAGS := -pthread
LDFLAGS := -pthread # -fopenmp

# Map output file
MAP_FILE := output.map
@@ -228,7 +228,7 @@ perfbitonic: CC := mpicc
perfbitonic: CXX := mpic++
perfbitonic: CFLAGS := $(REL_CFLAGS) -g -DCODE_VERSION=BITONIC
perfbitonic: CXXFLAGS := $(REL_CXXFLAGS) -g -DCODE_VERSION=BITONIC
perfbitonic: TARGET := distbitonic
perfbitonic: TARGET := perfbitonic
perfbitonic: $(BUILD_DIR)/$(TARGET)
@mkdir -p out
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)


homework_2/hpc/ntasks16.sh → homework_2/hpc/btN1P4T4Q24.sh ファイルの表示

@@ -1,28 +1,28 @@
#! /usr/bin/env bash

#SBATCH --partition=batch
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=4
#SBATCH --nodes=4
#SBATCH --cpus-per-task=4
#SBATCH --time=1:00

# Use this as following
# $> sbatch <this file>
#
# NOTE:
# First compile with
# $> make -j hpc-build
#

module load gcc/9.2.0 openmpi/4.0.3
# Note:
# The above versions are matching w/ my system's
# versions, thus making compiling/debugging easier.

# Uncomment the following to compile the project
# Note:
# In order for the MPI to run properly (or to run entirely), we need
# to compile it in hpc using the loaded modules above.
# Note:
# Consider moving this to a separate stage before sbatch the tasks.
# make distbitonic
# make distbubbletonic

# Suppress unused UCX_ROOT warning
export UCX_WARN_UNUSED_ENV_VARS=n

# Suppress CUDA-aware support is disabled warning
export OMPI_MCA_opal_warn_on_missing_libcuda=0

srun ./bin/dist_v05
srun ./out/distbitonic -q 24 --perf --validation

+ 3
- 3
homework_2/include/config.h ファイルの表示

@@ -44,15 +44,15 @@ using distValue_t = uint32_t;
/*!
* Session option for each invocation of the executable
*/
struct session_t {
struct config_t {
size_t arraySize{DEFAULT_DATA_SIZE}; //!<
bool validation{false}; //!< Request a full validation at the end, performed by process rank 0
bool ndebug{false}; //!< Skips debug trap on DEBUG builds
bool timing{false}; //!< Enable timing measurements and prints
bool perf{false}; //!< Enable performance timing measurements and prints
bool verbose{false}; //!< Flag to enable verbose output to stdout
};

extern session_t session;
extern config_t config;


#endif /* CONFIG_H_ */

+ 15
- 12
homework_2/include/distsort.hpp ファイルの表示

@@ -12,6 +12,7 @@

#include <vector>
#include <algorithm>
//#include <parallel/algorithm>
#include <cmath>
#include <cstdint>
#if !defined DEBUG
@@ -20,8 +21,8 @@
#include <cassert>

#include "utils.hpp"
#include "config.h"

extern Timing TfullSort, Texchange, Tminmax, TelbowSort;

/*!
* Enumerator for the different versions of the sorting method
@@ -159,11 +160,13 @@ bool isActive(mpi_id_t node, size_t nodes);
*/
template<typename RangeT>
void fullSort(RangeT& data, bool ascending) noexcept {
// Use introsort from stdlib++ here, unless ...
if (ascending)
// Use introsort from stdlib++ here, unless ... __gnu_parallel
if (ascending) {
std::sort(data.begin(), data.end(), std::less<>());
else
}
else {
std::sort(data.begin(), data.end(), std::greater<>());
}
}

/*!
@@ -270,7 +273,7 @@ void minmax(RangeT& local, const RangeT& remote, bool keepSmall) noexcept {
template<typename ShadowedDataT>
void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) {
// Initially sort to create a half part of a bitonic sequence
fullSort(data, ascending<SortMode::Bubbletonic>(rank, 0));
timeCall(TfullSort, fullSort, data, ascending<SortMode::Bubbletonic>(rank, 0));

// Sort network (O(N) iterations)
for (size_t step = 0; step < static_cast<size_t>(Processes); ++step) {
@@ -280,9 +283,9 @@ void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) {
if ( isActive(rank, Processes) &&
isActive(part, Processes) ) {
// Exchange with partner, keep min-or-max and sort - O(N)
mpi.exchange(data.getActive(), data.getShadow(), part, step);
minmax(data.getActive(), data.getShadow(), ks);
elbowSort(data, ascending<SortMode::Bubbletonic>(rank, Processes));
timeCall(Texchange, mpi.exchange, data.getActive(), data.getShadow(), part, step);
timeCall(Tminmax, minmax, data.getActive(), data.getShadow(), ks);
timeCall(TelbowSort, elbowSort, data, ascending<SortMode::Bubbletonic>(rank, Processes));
}
}

@@ -308,7 +311,7 @@ void distBubbletonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) {
template<typename ShadowedDataT>
void distBitonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) {
// Initially sort to create a half part of a bitonic sequence
fullSort(data, ascending<SortMode::Bitonic>(rank, 0));
timeCall(TfullSort, fullSort, data, ascending<SortMode::Bitonic>(rank, 0));

// Run through sort network using elbow-sort ( O(LogN * LogN) iterations )
auto p = static_cast<uint32_t>(std::log2(Processes));
@@ -319,11 +322,11 @@ void distBitonic(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) {
auto part = partner<SortMode::Bitonic>(rank, step);
auto ks = keepSmall<SortMode::Bitonic>(rank, part, depth);
// Exchange with partner, keep min-or-max
mpi.exchange(data.getActive(), data.getShadow(), part, (depth << 8) | step);
minmax(data.getActive(), data.getShadow(), ks);
timeCall(Texchange, mpi.exchange, data.getActive(), data.getShadow(), part, (depth << 8) | step);
timeCall(Tminmax, minmax, data.getActive(), data.getShadow(), ks);
}
// sort - O(N)
elbowSort (data, ascending<SortMode::Bitonic>(rank, depth));
timeCall(TelbowSort, elbowSort, data, ascending<SortMode::Bitonic>(rank, depth));
}
}



+ 54
- 21
homework_2/include/utils.hpp ファイルの表示

@@ -14,6 +14,7 @@
#include <chrono>
#include <unistd.h>
#include <mpi.h>
//#include <functional>

#include "config.h"

@@ -286,7 +287,7 @@ struct Log {
//! We provide logging via << operator
template<typename T>
Log &operator<<(T &&t) {
if (session.verbose) {
if (config.verbose) {
if (line_) {
std::cout << "[Log]: " << t;
line_ = false;
@@ -299,7 +300,7 @@ struct Log {
// overload for special end line handling
Log &operator<<(Endl e) {
(void) e;
if (session.verbose) {
if (config.verbose) {
std::cout << '\n';
line_ = true;
}
@@ -317,39 +318,71 @@ extern Log logger;
*/
struct Timing {
using Tpoint = std::chrono::steady_clock::time_point;
using Tduration = std::chrono::microseconds;
using microseconds = std::chrono::microseconds;
using milliseconds = std::chrono::milliseconds;
using seconds = std::chrono::seconds;

//! tool to mark the starting point
Tpoint start() noexcept { return start_ = std::chrono::steady_clock::now(); }
Tpoint start() noexcept { return mark_ = std::chrono::steady_clock::now(); }

//! tool to mark the ending point
Tpoint stop() noexcept { return stop_ = std::chrono::steady_clock::now(); }
Tpoint stop() noexcept {
Tpoint now = std::chrono::steady_clock::now();
duration_ += dt(now, mark_);
return now;
}

auto dt() noexcept {
return std::chrono::duration_cast<std::chrono::microseconds>(stop_ - start_).count();
Tduration dt(Tpoint t2, Tpoint t1) noexcept {
return std::chrono::duration_cast<Tduration>(t2 - t1);
}

//! tool to print the time interval
void print_dt(const char *what) noexcept {
if (session.timing) {
auto t = stop_ - start_;
if (std::chrono::duration_cast<microseconds>(t).count() < 10000)
std::cout << "[Timing]: " << what << ": "
<< std::to_string(std::chrono::duration_cast<microseconds>(t).count()) << " [usec]\n";
else if (std::chrono::duration_cast<milliseconds>(t).count() < 10000)
std::cout << "[Timing]: " << what << ": "
<< std::to_string(std::chrono::duration_cast<milliseconds>(t).count()) << " [msec]\n";
else
std::cout << "[Timing]: " << what << ": "
<< std::to_string(std::chrono::duration_cast<seconds>(t).count()) << " [sec]\n";
}
void print_duration(const char *what, mpi_id_t rank) noexcept {
if (std::chrono::duration_cast<microseconds>(duration_).count() < 10000)
std::cout << "[Timing] (Rank " << rank << ") " << what << ": "
<< std::to_string(std::chrono::duration_cast<microseconds>(duration_).count()) << " [usec]\n";
else if (std::chrono::duration_cast<milliseconds>(duration_).count() < 10000)
std::cout << "[Timing] (Rank " << rank << ") " << what << ": "
<< std::to_string(std::chrono::duration_cast<milliseconds>(duration_).count()) << " [msec]\n";
else
std::cout << "[Timing] (Rank " << rank << ") " << what << ": "
<< std::to_string(std::chrono::duration_cast<seconds>(duration_).count()) << " [sec]\n";

}

private:
Tpoint start_;
Tpoint stop_;
Tpoint mark_{};
Tduration duration_{};
};

/*!
* Utility high level function to forward a function call to std::invoke and measure
* the execution time
*
* @tparam Func The function type
* @tparam Args The argument
* @param func
* @param args
* @return
*/


// Times a single call: calls Tim.start(), invokes Func(__VA_ARGS__), then calls
// Tim.stop(). Any value returned by Func is discarded.
//
// The body is wrapped in do { ... } while (0) so that the expansion is exactly
// one statement. Without the wrapper, using the macro as the body of an
// unbraced if/else or loop would only guard the Tim.start() call.
// Usage: timeCall(timer, function, arg1, arg2, ...);
#define timeCall(Tim, Func, ...) \
    do { \
        (Tim).start(); \
        Func(__VA_ARGS__); \
        (Tim).stop(); \
    } while (0)


//template <typename Ret, typename Func, typename... Args>
//auto timeCall_r(Ret& ret, Func&& func, Args&&... args) {
// Timing timer;
//
// timer.start();
// ret = std::invoke(std::forward<Func>(func), std::forward<Args>(args)...);
// timer.stop();
//
// return timer.dt();
//}

#endif /* UTILS_HPP_ */

+ 1
- 0
homework_2/src/distsort.cpp ファイルの表示

@@ -9,6 +9,7 @@
#include "utils.hpp"
#include "distsort.hpp"

Timing TfullSort, Texchange, Tminmax, TelbowSort;

bool isActive(mpi_id_t node, size_t nodes) {
if (!((nodes > 0) &&


+ 25
- 16
homework_2/src/main.cpp ファイルの表示

@@ -17,12 +17,12 @@
#include "distsort.hpp"


// Global session data
session_t session;
// Global config data
config_t config;
MPI_t<> mpi;
distBuffer_t Data;
Log logger;
Timing timer;
Timing Ttotal;

/*!
* A small command line argument parser
@@ -37,23 +37,23 @@ bool get_options(int argc, char* argv[]){

if (arg == "-q" || arg == "--array-size") {
if (i+1 < argc) {
session.arraySize = 1 << atoi(argv[++i]);
config.arraySize = 1 << atoi(argv[++i]);
}
else {
status = false;
}
}
else if (arg == "--validation") {
session.validation = true;
config.validation = true;
}
else if (arg == "--ndebug") {
session.ndebug = true;
config.ndebug = true;
}
else if (arg == "-t" || arg == "--timing") {
session.timing = true;
else if (arg == "--perf") {
config.perf = true;
}
else if (arg == "-v" || arg == "--verbose") {
session.verbose = true;
config.verbose = true;
}
else if (arg == "-h" || arg == "--help") {
std::cout << "distbitonic/distbubbletonic - A distributed bitonic sort\n\n";
@@ -65,6 +65,8 @@ bool get_options(int argc, char* argv[]){
std::cout << "Options:\n\n";
std::cout << " -q | --array-size <N>\n";
std::cout << " Selects the array size according to size = 2^N\n\n";
std::cout << " --par-sort\n";
std::cout << " Request a parallel full sorting algorithm\n\n";
std::cout << " --validation\n";
std::cout << " Request a full validation at the end, performed by process rank 0\n\n";
std::cout << " --ndebug\n";
@@ -164,11 +166,11 @@ int main(int argc, char* argv[]) try {
#else
volatile bool sleep_wait = true;
#endif
while (sleep_wait && !session.ndebug)
while (sleep_wait && !config.ndebug)
sleep(1);
#endif

logger << "Initialize local array of " << session.arraySize << " elements" << logger.endl;
logger << "Initialize local array of " << config.arraySize << " elements" << logger.endl;
std::random_device rd; // Mersenne seeded from hw if possible. range: [type_min, type_max]
std::mt19937 gen(rd());
std::uniform_int_distribution<distValue_t > dis(
@@ -176,24 +178,31 @@ int main(int argc, char* argv[]) try {
std::numeric_limits<distValue_t>::max()
);
// Fill vector
Data.resize(session.arraySize);
Data.resize(config.arraySize);
std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); });

if (mpi.rank() == 0)
logger << "Starting distributed sorting ... ";
timer.start();
Ttotal.start();
#if CODE_VERSION == BUBBLETONIC
distBubbletonic(Data, mpi.size(), mpi.rank());
#else
distBitonic (Data, mpi.size(), mpi.rank());
#endif
timer.stop();
Ttotal.stop();
if (mpi.rank() == 0)
logger << " Done." << logger.endl;
std::string timeMsg = "rank " + std::to_string(mpi.rank());
timer.print_dt(timeMsg.c_str());

if (session.validation) {

if (config.perf) {
Ttotal.print_duration("Total ", mpi.rank());
TfullSort.print_duration("Full-Sort ", mpi.rank());
Texchange.print_duration("Exchange ", mpi.rank());
Tminmax.print_duration("Min-Max ", mpi.rank());
TelbowSort.print_duration("Elbow-Sort", mpi.rank());
}
if (config.validation) {
// If requested, we have the chance to fail!
if (mpi.rank() == 0)
std::cout << "Results validation ...";


読み込み中…
キャンセル
保存