diff --git a/homework_2/hpc/N1P2T4Q20.sh b/homework_2/hpc/N1P2T4Q20.sh
index c6f1dbf..c80a517 100644
--- a/homework_2/hpc/N1P2T4Q20.sh
+++ b/homework_2/hpc/N1P2T4Q20.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=2
 #SBATCH --cpus-per-task=4
-#SBATCH --time=1:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 20 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 20 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N1P2T4Q23.sh b/homework_2/hpc/N1P2T4Q23.sh
index 8a82c70..bdb9db8 100644
--- a/homework_2/hpc/N1P2T4Q23.sh
+++ b/homework_2/hpc/N1P2T4Q23.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=2
 #SBATCH --cpus-per-task=4
-#SBATCH --time=1:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 23 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 23 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N1P2T4Q25.sh b/homework_2/hpc/N1P2T4Q25.sh
index 95858c3..fb55d5e 100644
--- a/homework_2/hpc/N1P2T4Q25.sh
+++ b/homework_2/hpc/N1P2T4Q25.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=2
 #SBATCH --cpus-per-task=4
-#SBATCH --time=1:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 25 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 25 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N1P2T4Q27.sh b/homework_2/hpc/N1P2T4Q27.sh
index beb0faf..0bf98ac 100644
--- a/homework_2/hpc/N1P2T4Q27.sh
+++ b/homework_2/hpc/N1P2T4Q27.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=2
 #SBATCH --cpus-per-task=4
-#SBATCH --time=1:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 27 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 27 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N1P4T4Q20.sh b/homework_2/hpc/N1P4T4Q20.sh
index d5dd27a..a0072c8 100644
--- a/homework_2/hpc/N1P4T4Q20.sh
+++ b/homework_2/hpc/N1P4T4Q20.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=1:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 20 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 20 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N1P4T4Q23.sh b/homework_2/hpc/N1P4T4Q23.sh
index dcd0e7a..82cb920 100644
--- a/homework_2/hpc/N1P4T4Q23.sh
+++ b/homework_2/hpc/N1P4T4Q23.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=1:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 23 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 23 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N1P4T4Q25.sh b/homework_2/hpc/N1P4T4Q25.sh
index a9619ff..c77a744 100644
--- a/homework_2/hpc/N1P4T4Q25.sh
+++ b/homework_2/hpc/N1P4T4Q25.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=1:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 25 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 25 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N1P4T4Q26.sh b/homework_2/hpc/N1P4T4Q26.sh
new file mode 100644
index 0000000..cd75bd1
--- /dev/null
+++ b/homework_2/hpc/N1P4T4Q26.sh
@@ -0,0 +1,28 @@
+#! /usr/bin/env bash
+
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=4
+#SBATCH --time=5:00
+
+# Use this as following
+# $> sbatch -p batch|rome
+#
+# NOTE:
+# First compile in aristotle with
+# $> module load gcc/9.2.0 openmpi/4.0.3
+# $> make -j hpc-build
+#
+
+module load gcc/9.2.0 openmpi/4.0.3
+# Note:
+# The above versions are matching w/ my system's
+# versions, thus making compiling/debugging easier.
+
+# Suppress unused UCX_ROOT warning
+export UCX_WARN_UNUSED_ENV_VARS=n
+
+# Suppress CUDA-aware support is disabled warning
+export OMPI_MCA_opal_warn_on_missing_libcuda=0
+
+srun ./out/distbitonic -q 26 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N1P4T4Q27.sh b/homework_2/hpc/N1P4T4Q27.sh
index e8742a4..0ae3358 100644
--- a/homework_2/hpc/N1P4T4Q27.sh
+++ b/homework_2/hpc/N1P4T4Q27.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=1:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 27 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 27 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N2P4T4Q20.sh b/homework_2/hpc/N2P4T4Q20.sh
index 93a5944..3ea9142 100644
--- a/homework_2/hpc/N2P4T4Q20.sh
+++ b/homework_2/hpc/N2P4T4Q20.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=2
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 20 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 20 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N2P4T4Q23.sh b/homework_2/hpc/N2P4T4Q23.sh
index e9a0851..61a66e1 100644
--- a/homework_2/hpc/N2P4T4Q23.sh
+++ b/homework_2/hpc/N2P4T4Q23.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=2
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 23 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 23 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N2P4T4Q25.sh b/homework_2/hpc/N2P4T4Q25.sh
index f03f569..4ce1466 100644
--- a/homework_2/hpc/N2P4T4Q25.sh
+++ b/homework_2/hpc/N2P4T4Q25.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=2
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 25 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 25 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N2P4T4Q27.sh b/homework_2/hpc/N2P4T4Q27.sh
index bf9f8dd..078be28 100644
--- a/homework_2/hpc/N2P4T4Q27.sh
+++ b/homework_2/hpc/N2P4T4Q27.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=2
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 27 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 27 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N4P16T4Q20.sh b/homework_2/hpc/N4P16T4Q20.sh
index 4ee3cd1..16c7e14 100644
--- a/homework_2/hpc/N4P16T4Q20.sh
+++ b/homework_2/hpc/N4P16T4Q20.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=16
 #SBATCH --cpus-per-task=4
-#SBATCH --time=5:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 20 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 20 --validation --perf 8 --exchange-opt
\ No newline at end of file
diff --git a/homework_2/hpc/N4P16T4Q22.sh b/homework_2/hpc/N4P16T4Q22.sh
new file mode 100644
index 0000000..d48dae4
--- /dev/null
+++ b/homework_2/hpc/N4P16T4Q22.sh
@@ -0,0 +1,28 @@
+#! /usr/bin/env bash
+
+#SBATCH --nodes=4
+#SBATCH --ntasks-per-node=16
+#SBATCH --cpus-per-task=4
+#SBATCH --time=10:00
+
+# Use this as following
+# $> sbatch -p batch|rome
+#
+# NOTE:
+# First compile in aristotle with
+# $> module load gcc/9.2.0 openmpi/4.0.3
+# $> make -j hpc-build
+#
+
+module load gcc/9.2.0 openmpi/4.0.3
+# Note:
+# The above versions are matching w/ my system's
+# versions, thus making compiling/debugging easier.
+
+# Suppress unused UCX_ROOT warning
+export UCX_WARN_UNUSED_ENV_VARS=n
+
+# Suppress CUDA-aware support is disabled warning
+export OMPI_MCA_opal_warn_on_missing_libcuda=0
+
+srun ./out/distbitonic -q 22 --validation --perf 8 --exchange-opt
\ No newline at end of file
diff --git a/homework_2/hpc/N4P16T4Q23.sh b/homework_2/hpc/N4P16T4Q23.sh
index a44b4fc..03f6b03 100644
--- a/homework_2/hpc/N4P16T4Q23.sh
+++ b/homework_2/hpc/N4P16T4Q23.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=16
 #SBATCH --cpus-per-task=4
-#SBATCH --time=5:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 23 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 23 --validation --perf 8 --exchange-opt
\ No newline at end of file
diff --git a/homework_2/hpc/N4P16T4Q25.sh b/homework_2/hpc/N4P16T4Q25.sh
index 2be0a7d..65b95c8 100644
--- a/homework_2/hpc/N4P16T4Q25.sh
+++ b/homework_2/hpc/N4P16T4Q25.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=16
 #SBATCH --cpus-per-task=4
-#SBATCH --time=5:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8
\ No newline at end of file
+srun ./out/distbitonic -q 25 --validation --perf 8 --exchange-opt
\ No newline at end of file
diff --git a/homework_2/hpc/N4P16T4Q27.sh b/homework_2/hpc/N4P16T4Q27.sh
index e7b2ba4..e0e95c6 100644
--- a/homework_2/hpc/N4P16T4Q27.sh
+++ b/homework_2/hpc/N4P16T4Q27.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=16
 #SBATCH --cpus-per-task=4
-#SBATCH --time=5:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8
\ No newline at end of file
+srun ./out/distbitonic -q 27 --validation --perf 8 --exchange-opt
\ No newline at end of file
diff --git a/homework_2/hpc/N4P32T4Q20.sh b/homework_2/hpc/N4P32T4Q20.sh
index 2b7b4bb..182c4e3 100644
--- a/homework_2/hpc/N4P32T4Q20.sh
+++ b/homework_2/hpc/N4P32T4Q20.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=32
 #SBATCH --cpus-per-task=4
-#SBATCH --time=5:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 20 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 20 --validation --perf 8 --exchange-opt
\ No newline at end of file
diff --git a/homework_2/hpc/N4P32T4Q21.sh b/homework_2/hpc/N4P32T4Q21.sh
new file mode 100644
index 0000000..f0d0037
--- /dev/null
+++ b/homework_2/hpc/N4P32T4Q21.sh
@@ -0,0 +1,28 @@
+#! /usr/bin/env bash
+
+#SBATCH --nodes=4
+#SBATCH --ntasks-per-node=32
+#SBATCH --cpus-per-task=4
+#SBATCH --time=10:00
+
+# Use this as following
+# $> sbatch -p batch|rome
+#
+# NOTE:
+# First compile in aristotle with
+# $> module load gcc/9.2.0 openmpi/4.0.3
+# $> make -j hpc-build
+#
+
+module load gcc/9.2.0 openmpi/4.0.3
+# Note:
+# The above versions are matching w/ my system's
+# versions, thus making compiling/debugging easier.
+
+# Suppress unused UCX_ROOT warning
+export UCX_WARN_UNUSED_ENV_VARS=n
+
+# Suppress CUDA-aware support is disabled warning
+export OMPI_MCA_opal_warn_on_missing_libcuda=0
+
+srun ./out/distbitonic -q 21 --validation --perf 8 --exchange-opt
\ No newline at end of file
diff --git a/homework_2/hpc/N4P32T4Q23.sh b/homework_2/hpc/N4P32T4Q23.sh
index 7db03b4..7c32e54 100644
--- a/homework_2/hpc/N4P32T4Q23.sh
+++ b/homework_2/hpc/N4P32T4Q23.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=32
 #SBATCH --cpus-per-task=4
-#SBATCH --time=5:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 23 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 23 --validation --perf 8 --exchange-opt
\ No newline at end of file
diff --git a/homework_2/hpc/N4P32T4Q25.sh b/homework_2/hpc/N4P32T4Q25.sh
index 06da205..e90e4f1 100644
--- a/homework_2/hpc/N4P32T4Q25.sh
+++ b/homework_2/hpc/N4P32T4Q25.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=32
 #SBATCH --cpus-per-task=4
-#SBATCH --time=5:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8
\ No newline at end of file
+srun ./out/distbitonic -q 25 --validation --perf 8 --exchange-opt
\ No newline at end of file
diff --git a/homework_2/hpc/N4P32T4Q27.sh b/homework_2/hpc/N4P32T4Q27.sh
index f42d08c..5de7e8d 100644
--- a/homework_2/hpc/N4P32T4Q27.sh
+++ b/homework_2/hpc/N4P32T4Q27.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=32
 #SBATCH --cpus-per-task=4
-#SBATCH --time=5:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8
\ No newline at end of file
+srun ./out/distbitonic -q 27 --validation --perf 8 --exchange-opt
\ No newline at end of file
diff --git a/homework_2/hpc/N4P4T4Q20.sh b/homework_2/hpc/N4P4T4Q20.sh
index 176ff23..777e221 100644
--- a/homework_2/hpc/N4P4T4Q20.sh
+++ b/homework_2/hpc/N4P4T4Q20.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 20 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 20 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N4P4T4Q23.sh b/homework_2/hpc/N4P4T4Q23.sh
index 58d3c99..10d823c 100644
--- a/homework_2/hpc/N4P4T4Q23.sh
+++ b/homework_2/hpc/N4P4T4Q23.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 23 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 23 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N4P4T4Q24.sh b/homework_2/hpc/N4P4T4Q24.sh
new file mode 100644
index 0000000..66fd3ef
--- /dev/null
+++ b/homework_2/hpc/N4P4T4Q24.sh
@@ -0,0 +1,28 @@
+#! /usr/bin/env bash
+
+#SBATCH --nodes=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=4
+#SBATCH --time=5:00
+
+# Use this as following
+# $> sbatch -p batch|rome
+#
+# NOTE:
+# First compile in aristotle with
+# $> module load gcc/9.2.0 openmpi/4.0.3
+# $> make -j hpc-build
+#
+
+module load gcc/9.2.0 openmpi/4.0.3
+# Note:
+# The above versions are matching w/ my system's
+# versions, thus making compiling/debugging easier.
+
+# Suppress unused UCX_ROOT warning
+export UCX_WARN_UNUSED_ENV_VARS=n
+
+# Suppress CUDA-aware support is disabled warning
+export OMPI_MCA_opal_warn_on_missing_libcuda=0
+
+srun ./out/distbitonic -q 24 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N4P4T4Q25.sh b/homework_2/hpc/N4P4T4Q25.sh
index ded9350..a758509 100644
--- a/homework_2/hpc/N4P4T4Q25.sh
+++ b/homework_2/hpc/N4P4T4Q25.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 25 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 25 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N4P4T4Q27.sh b/homework_2/hpc/N4P4T4Q27.sh
index d1d03cb..2cb3870 100644
--- a/homework_2/hpc/N4P4T4Q27.sh
+++ b/homework_2/hpc/N4P4T4Q27.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 27 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 27 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N4P8T4Q20.sh b/homework_2/hpc/N4P8T4Q20.sh
index ac2af75..a87809e 100644
--- a/homework_2/hpc/N4P8T4Q20.sh
+++ b/homework_2/hpc/N4P8T4Q20.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=8
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 20 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 20 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N4P8T4Q23.sh b/homework_2/hpc/N4P8T4Q23.sh
index 23a1cc5..c940ac1 100644
--- a/homework_2/hpc/N4P8T4Q23.sh
+++ b/homework_2/hpc/N4P8T4Q23.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=8
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 23 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 23 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N4P8T4Q25.sh b/homework_2/hpc/N4P8T4Q25.sh
index ec4a8be..cd9a427 100644
--- a/homework_2/hpc/N4P8T4Q25.sh
+++ b/homework_2/hpc/N4P8T4Q25.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=8
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8
\ No newline at end of file
+srun ./out/distbitonic -q 25 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N4P8T4Q27.sh b/homework_2/hpc/N4P8T4Q27.sh
index 18553df..51dc99e 100644
--- a/homework_2/hpc/N4P8T4Q27.sh
+++ b/homework_2/hpc/N4P8T4Q27.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=8
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8
\ No newline at end of file
+srun ./out/distbitonic -q 27 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/include/config.h b/homework_2/include/config.h
index 5c62edf..e2239b5 100644
--- a/homework_2/include/config.h
+++ b/homework_2/include/config.h
@@ -27,7 +27,7 @@
 // Default Data size (in case -q is not present)
 static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16;
 
-// The maximum MPI size we support
+// The maximum MPI size we support (in Nodes x Processes)
 static constexpr size_t MAX_MPI_SIZE = 1024UL;
 
 // The maximum pipeline size we support
@@ -61,7 +61,7 @@ struct config_t {
     bool validation{false}; //!< Request a full validation at the end, performed by process rank 0.
     bool ndebug{false};     //!< Skips debug trap on DEBUG builds.
     size_t perf{1};         //!< Enable performance timing measurements and prints and repeat
-                            //!< the performs the sorting times to average the measurements
+                            //!< the sorting times.
     bool verbose{false};    //!< Flag to enable verbose output to stdout.
 };
 
diff --git a/homework_2/include/utils.hpp b/homework_2/include/utils.hpp
index 6936d86..4b71366 100644
--- a/homework_2/include/utils.hpp
+++ b/homework_2/include/utils.hpp
@@ -129,7 +129,7 @@ struct MPI_t {
 * This function matches a transmit and a receive in order for fully exchanged data between
 * current node and partner.
 *
- * @tparam T The inner valur type used in buffer
+ * @tparam ValueT The value type used in buffer
 *
 * @param ldata [const ValueT*] Pointer to local data to send
 * @param rdata [ValueT*] Pointer to buffer to receive data from partner
@@ -163,7 +163,7 @@ struct MPI_t {
 * This call MUST paired with exchange_wait() for each MPI_t object.
 * Calling 2 consecutive exchange_start() for the same MPI_t object is undefined.
 *
- * @tparam ValueT The underlying value type used in buffers
+ * @tparam ValueT The value type used in buffers
 *
 * @param ldata [const ValueT*] Pointer to local data to send
 * @param rdata [ValueT*] Pointer to buffer to receive data from partner
@@ -267,8 +267,8 @@ using mpi_id_t = MPI_t<>::ID_t;
 /*!
 * @brief A std::vector wrapper with 2 vectors, an active and a shadow.
 *
- * This type exposes the standard vector
- * functionality of the active vector. The shadow can be used when we need to use the vector as mutable
+ * This type exposes the standard vector functionality of the active vector.
+ * The shadow can be used when we need to use the vector as mutable
 * data in algorithms that can not support "in-place" editing (like elbow-sort for example)
 *
 * @tparam Value_t the underlying data type of the vectors
@@ -418,7 +418,9 @@ private:
 extern Log logger;
 
 /*!
- * A small timing utility based on chrono.
+ * A small timing utility based on chrono that supports timing rounds
+ * and returning the median of them. Time can be accumulated into the measurement
+ * for each round.
 */
 struct Timing {
     using Tpoint = std::chrono::steady_clock::time_point;
@@ -485,7 +487,7 @@ private:
 };
 
 /*!
- * Utility "high level function"-like macro to forward a function call
+ * A "high level function"-like utility macro to forward a function call
 * and accumulate the execution time to the corresponding timing object.
 *
 * @param Tim The Timing object [Needs to have methods start() and stop()]
diff --git a/homework_2/src/main.cpp b/homework_2/src/main.cpp
index 1eded9f..c5f51ef 100644
--- a/homework_2/src/main.cpp
+++ b/homework_2/src/main.cpp
@@ -24,6 +24,10 @@ distBuffer_t Data;
 Log logger;
 distStat_t localStat, remoteStat;
 
+// Mersenne seeded from hw if possible. range: [type_min, type_max]
+std::random_device rd;
+std::mt19937 gen(rd());
+
 //! Performance timers for each one of the "costly" functions
 Timing Ttotal;
 Timing TfullSort;
@@ -106,9 +110,9 @@ bool get_options(int argc, char* argv[]){
 
         } else if (arg == "-h" || arg == "--help") {
             std::cout << "distbitonic/distbubbletonic - A distributed sort utility\n\n";
-            std::cout << "distbitonic -q [-e] [-p | --pipeline N] [--validation] [--perf] [--ndebug] [-v]\n";
+            std::cout << "distbitonic -q [-e] [-p | --pipeline N] [--validation] [--perf ] [--ndebug] [-v]\n";
             std::cout << "distbitonic -h\n";
-            std::cout << "distbubbletonic -q [-e] [-p | --pipeline N] [--validation] [--perf] [--ndebug] [-v]\n";
+            std::cout << "distbubbletonic -q [-e] [-p | --pipeline N] [--validation] [--perf ] [--ndebug] [-v]\n";
             std::cout << "distbubbletonic -h\n";
             std::cout << '\n';
             std::cout << "Options:\n\n";
@@ -123,7 +127,7 @@ bool get_options(int argc, char* argv[]){
             std::cout << " Request a full validation at the end, performed by process rank 0\n\n";
             std::cout << " --perf \n";
             std::cout << " Enable performance timing measurements and prints, and repeat\n";
-            std::cout << " the sorting times to average the measurements\n\n";
+            std::cout << " the sorting times.\n\n";
             std::cout << " --ndebug\n";
             std::cout << " Skip debug breakpoint when on debug build.\n\n";
             std::cout << " -v | --verbose\n";
@@ -190,59 +194,67 @@ bool validator(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) {
     return ret;
 }
 
-#if !defined TESTING
 /*!
- * @return Returns 0, but.... we may throw or exit(1)
+ * Initializes the environment, must be called from each process
+ *
+ * @param argc [int*] POINTER to main's argc argument
+ * @param argv [char***] POINTER to main's argv argument
 */
-int main(int argc, char* argv[]) try {
+void init(int* argc, char*** argv) {
     // Initialize MPI environment
-    mpi.init(&argc, &argv);
+    mpi.init(argc, argv);
 
     // try to read command line (after MPI parsing)
-    if (!get_options(argc, argv))
+    if (!get_options(*argc, *argv))
         exit(1);
 
-    logger << "MPI environment initialized." <<
-        " Rank: " << mpi.rank() <<
-        " Size: " << mpi.size() <<
-        logger.endl;
+    logger << "MPI environment initialized." << " Rank: " << mpi.rank() << " Size: " << mpi.size()
+           << logger.endl;
 
 #if defined DEBUG
 #if defined TESTING
-    /*
-     * In case of a debug build we will wait here until sleep_wait
-     * will reset via debugger. In order to do that the user must attach
-     * debugger to all processes. For example:
-     * $> mpirun -np 2 ./
-     * $> ps aux | grep
-     * $> gdb
-     * $> gdb
-     */
-    volatile bool sleep_wait = false;
+    /*
+     * In case of a debug build we will wait here until sleep_wait
+     * will reset via debugger. In order to do that the user must attach
+     * debugger to all processes. For example:
+     * $> mpirun -np 2 ./
+     * $> ps aux | grep
+     * $> gdb
+     * $> gdb
+     */
+    volatile bool sleep_wait = false;
 #else
-    volatile bool sleep_wait = true;
+    volatile bool sleep_wait = true;
 #endif
-    while (sleep_wait && !config.ndebug)
-        sleep(1);
+    while (sleep_wait && !config.ndebug)
+        sleep(1);
 #endif
 
-    // Initialize local data
-    logger << "Initialize local array of " << config.arraySize << " elements" << logger.endl;
-    std::random_device rd; // Mersenne seeded from hw if possible. range: [type_min, type_max]
-    std::mt19937 gen(rd());
-    std::uniform_int_distribution dis(
-        std::numeric_limits::min(),
-        std::numeric_limits::max()
-    );
-    // Fill vector
+    // Prepare vector and timing data
     Data.resize(config.arraySize);
-    std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); });
-
-    // Run distributed sort
-    if (mpi.rank() == 0)
-        logger << "Starting distributed sorting ... ";
     measurements_init();
+}
+
+#if !defined TESTING
+/*!
+ * @return Returns 0, but.... we may throw or exit(1)
+ */
+int main(int argc, char* argv[]) try {
+
+    // Init everything
+    init(&argc, &argv);
+    for (size_t it = 0 ; it < config.perf ; ++it) {
+        // Initialize local data
+        logger << "Initialize local array of " << config.arraySize << " elements" << logger.endl;
+        std::uniform_int_distribution dis(
+                std::numeric_limits::min(),
+                std::numeric_limits::max()
+        );
+        std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); });
+        // Run distributed sort
+        if (mpi.rank() == 0)
+            logger << "Starting distributed sorting ... ";
     Ttotal.start();
 #if CODE_VERSION == BUBBLETONIC
     distBubbletonic(Data, mpi.size(), mpi.rank());
@@ -251,9 +263,9 @@
 #endif
     Ttotal.stop();
     measurements_next();
+        if (mpi.rank() == 0)
+            logger << " Done." << logger.endl;
     }
-    if (mpi.rank() == 0)
-        logger << " Done." << logger.endl;
 
     // Print-outs and validation
     if (config.perf > 1) {
@@ -266,10 +278,10 @@
     if (config.validation) {
         // If requested, we have the chance to fail!
         if (mpi.rank() == 0)
-            std::cout << "Results validation ...";
+            std::cout << "[Validation] Results validation ...";
         bool val = validator(Data, mpi.size(), mpi.rank());
         if (mpi.rank() == 0)
-            std::cout << ((val) ? "\x1B[32m [PASS] \x1B[0m\n" : " \x1B[32m [FAIL] \x1B[0m\n");
+            std::cout << ((val) ? "\x1B[32m [PASSED] \x1B[0m\n" : " \x1B[32m [FAILED] \x1B[0m\n");
     }
     mpi.finalize();
     return 0;
diff --git a/homework_2/test/tests_CommonUtils.cpp b/homework_2/test/tests_CommonUtils.cpp
index 5f14989..4739e0a 100644
--- a/homework_2/test/tests_CommonUtils.cpp
+++ b/homework_2/test/tests_CommonUtils.cpp
@@ -91,3 +91,29 @@ TEST(TdistCommonUT, elbowSort_test3) {
     EXPECT_EQ((ts_data == ts_expected_des), true);
 }
 
+/*
+ * Tag generator test without stage calls
+ */
+TEST(TdistCommonUT, tagGenerator_test1) {
+    // The maximum MPI size we support
+    // static constexpr size_t MAX_MPI_SIZE = 1024UL;
+    // The maximum pipeline size we support
+    // static constexpr size_t MAX_PIPELINE_SIZE = 64UL;
+
+    std::vector<int> ts_tags;
+    auto ts_logSize = static_cast<size_t>(std::log2(MAX_MPI_SIZE));
+
+    for (size_t depth = 0; depth <= ts_logSize; ++depth) {
+        for (size_t step = 0 ; step < MAX_MPI_SIZE; ++step) {
+            int tag = static_cast<int>(tagGenerator(depth, step));
+            ts_tags.push_back(tag);         // Exchange optimization
+            for (size_t stage = 0; stage < MAX_PIPELINE_SIZE; ++stage) {
+                ts_tags.push_back(++tag);   // stages
+            }
+        }
+    }
+    std::sort(ts_tags.begin(), ts_tags.end());
+    for (size_t i = 0 ; i < ts_tags.size() - 1 ; ++i)
+        EXPECT_NE(ts_tags[i], ts_tags[i+1]);
+}
+
diff --git a/homework_2/test/tests_MPI.cpp b/homework_2/test/tests_MPI.cpp
index 2c79bee..64735d3 100644
--- a/homework_2/test/tests_MPI.cpp
+++ b/homework_2/test/tests_MPI.cpp
@@ -27,8 +27,8 @@ MPI_t<> ts_mpi;
 
 
 // Mersenne seeded from hw if possible. range: [type_min, type_max]
-std::random_device rd;
-std::mt19937 gen(rd());
+std::random_device ts_rd;
+std::mt19937 ts_gen(ts_rd());
 
 class TMPIdistSort : public ::testing::Test {
 protected:
@@ -59,7 +59,7 @@ TEST_F(TMPIdistSort, distBubbletonic_test1) {
         std::numeric_limits<tsValue_t>::max()
     );
     ts_Data.resize(ts_buffer_size);
-    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); });
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(ts_gen); });
 
     // Execute function under test in all processes
     distBubbletonic(ts_Data, ts_mpi.size(), ts_mpi.rank());
@@ -100,7 +100,7 @@ TEST_F(TMPIdistSort, distBubbletonic_test2) {
         std::numeric_limits<tsValue_t>::max()
    );
     ts_Data.resize(ts_buffer_size);
-    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); });
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(ts_gen); });
 
     // Execute function under test in all processes
     distBubbletonic(ts_Data, ts_mpi.size(), ts_mpi.rank());
@@ -141,7 +141,7 @@ TEST_F(TMPIdistSort, distBubbletonic_test3) {
         std::numeric_limits<tsValue_t>::max()
    );
     ts_Data.resize(ts_buffer_size);
-    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); });
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(ts_gen); });
 
     // Set pipeline
     config.pipeline = 8;
@@ -170,6 +170,96 @@ TEST_F(TMPIdistSort, distBubbletonic_test3) {
     }
 }
 
+/*
+ * MPI: SysTest (acceptance)
+ * Each process executes distBubbletonic for uint32_t [1 << 16] with exchange optimization
+ */
+TEST_F(TMPIdistSort, distBubbletonic_test4) {
+    // Create and fill vector
+    using tsValue_t = uint32_t;         // Test parameters
+    size_t ts_buffer_size = 1 << 16;
+
+    ShadowedVec_t ts_Data;
+    std::uniform_int_distribution<tsValue_t> dis(
+            std::numeric_limits<tsValue_t>::min(),
+            std::numeric_limits<tsValue_t>::max()
+    );
+    ts_Data.resize(ts_buffer_size);
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(ts_gen); });
+
+    // Set exchange optimization
+    config.exchangeOpt = true;
+
+    // Execute function under test in all processes
+    distBubbletonic(ts_Data, ts_mpi.size(), ts_mpi.rank());
+
+    // Local min and max
+    auto local_min = *std::min_element(ts_Data.begin(), ts_Data.end());
+    auto local_max = *std::max_element(ts_Data.begin(), ts_Data.end());
+
+    // Gather min/max to rank 0
+    std::vector<tsValue_t> global_mins(ts_mpi.size());
+    std::vector<tsValue_t> global_maxes(ts_mpi.size());
+    MPI_Datatype datatype = MPI_TypeMapper<tsValue_t>::getType();
+
+    MPI_Gather(&local_min, 1, datatype, global_mins.data(), 1, datatype, 0, MPI_COMM_WORLD);
+    MPI_Gather(&local_max, 1, datatype, global_maxes.data(), 1, datatype, 0, MPI_COMM_WORLD);
+
+    // Check results
+    EXPECT_EQ(std::is_sorted(ts_Data.begin(), ts_Data.end()), true);
+    if (ts_mpi.rank() == 0) {
+        for (size_t i = 1; i < global_mins.size(); ++i) {
+            EXPECT_LE(global_maxes[i - 1], global_mins[i]);
+        }
+    }
+}
+
+/*
+ * MPI: SysTest (acceptance)
+ * Each process executes distBubbletonic for uint32_t [1 << 16] with
+ * exchange optimization and pipeline
+ */
+TEST_F(TMPIdistSort, distBubbletonic_test5) {
+    // Create and fill vector
+    using tsValue_t = uint32_t;         // Test parameters
+    size_t ts_buffer_size = 1 << 16;
+
+    ShadowedVec_t ts_Data;
+    std::uniform_int_distribution<tsValue_t> dis(
+            std::numeric_limits<tsValue_t>::min(),
+            std::numeric_limits<tsValue_t>::max()
+    );
+    ts_Data.resize(ts_buffer_size);
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(ts_gen); });
+
+    // Set exchange optimization + pipeline
+    config.exchangeOpt = true;
+    config.pipeline = 8;
+
+    // Execute function under test in all processes
+    distBubbletonic(ts_Data, ts_mpi.size(), ts_mpi.rank());
+
+    // Local min and max
+    auto local_min = *std::min_element(ts_Data.begin(), ts_Data.end());
+    auto local_max = *std::max_element(ts_Data.begin(), ts_Data.end());
+
+    // Gather min/max to rank 0
+    std::vector<tsValue_t> global_mins(ts_mpi.size());
+    std::vector<tsValue_t> global_maxes(ts_mpi.size());
+    MPI_Datatype datatype = MPI_TypeMapper<tsValue_t>::getType();
+
+    MPI_Gather(&local_min, 1, datatype, global_mins.data(), 1, datatype, 0, MPI_COMM_WORLD);
+    MPI_Gather(&local_max, 1, datatype, global_maxes.data(), 1, datatype, 0, MPI_COMM_WORLD);
+
+    // Check results
+    EXPECT_EQ(std::is_sorted(ts_Data.begin(), ts_Data.end()), true);
+    if (ts_mpi.rank() == 0) {
+        for (size_t i = 1; i < global_mins.size(); ++i) {
+            EXPECT_LE(global_maxes[i - 1], global_mins[i]);
+        }
+    }
+}
+
 /*
 * MPI: SysTest (acceptance)
 * Each process executes distBitonic for uin8_t [16]
@@ -185,7 +275,7 @@ TEST_F(TMPIdistSort, distBitonic_test1) {
         std::numeric_limits<tsValue_t>::max()
    );
     ts_Data.resize(ts_buffer_size);
-    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); });
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(ts_gen); });
 
     // Execute function under test in all processes
     distBitonic(ts_Data, ts_mpi.size(), ts_mpi.rank());
@@ -226,7 +316,7 @@ TEST_F(TMPIdistSort, distBitonic_test2) {
         std::numeric_limits<tsValue_t>::max()
    );
     ts_Data.resize(ts_buffer_size);
-    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); });
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(ts_gen); });
 
     // Execute function under test in all processes
     distBitonic(ts_Data, ts_mpi.size(), ts_mpi.rank());
@@ -267,7 +357,7 @@ TEST_F(TMPIdistSort, distBitonic_test3) {
         std::numeric_limits<tsValue_t>::max()
    );
     ts_Data.resize(ts_buffer_size);
-    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); });
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(ts_gen); });
 
     // Set pipeline
     config.pipeline = 8;
@@ -295,3 +385,93 @@ TEST_F(TMPIdistSort, distBitonic_test3) {
         }
     }
 }
+
+/*
+ * MPI: SysTest (acceptance)
+ * Each process executes distBitonic for uint32_t [1 << 16] with exchange optimization
+ */
+TEST_F(TMPIdistSort, distBitonic_test4) {
+    // Create and fill vector
+    using tsValue_t = uint32_t;         // Test parameters
+    size_t ts_buffer_size = 1 << 16;
+
+    ShadowedVec_t ts_Data;
+    std::uniform_int_distribution<tsValue_t> dis(
+            std::numeric_limits<tsValue_t>::min(),
+            std::numeric_limits<tsValue_t>::max()
+    );
+    ts_Data.resize(ts_buffer_size);
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(ts_gen); });
+
+    // Set exchange optimization
+    config.exchangeOpt = true;
+
+    // Execute function under test in all processes
+    distBitonic(ts_Data, ts_mpi.size(), ts_mpi.rank());
+
+    // Local min and max
+    auto local_min = *std::min_element(ts_Data.begin(), ts_Data.end());
+    auto local_max = *std::max_element(ts_Data.begin(), ts_Data.end());
+
+    // Gather min/max to rank 0
+    std::vector<tsValue_t> global_mins(ts_mpi.size());
+    std::vector<tsValue_t> global_maxes(ts_mpi.size());
+    MPI_Datatype datatype = MPI_TypeMapper<tsValue_t>::getType();
+
+    MPI_Gather(&local_min, 1, datatype, global_mins.data(), 1, datatype, 0, MPI_COMM_WORLD);
+    MPI_Gather(&local_max, 1, datatype, global_maxes.data(), 1, datatype, 0, MPI_COMM_WORLD);
+
+    // Check results
+    EXPECT_EQ(std::is_sorted(ts_Data.begin(), ts_Data.end()), true);
+    if (ts_mpi.rank() == 0) {
+        for (size_t i = 1; i < global_mins.size(); ++i) {
+            EXPECT_LE(global_maxes[i - 1], global_mins[i]);
+        }
+    }
+}
+
+/*
+ * MPI: SysTest (acceptance)
+ * Each process executes distBitonic for uint32_t [1 << 16] with
+ * exchange optimization and pipeline
+ */
+TEST_F(TMPIdistSort, distBitonic_test5) {
+    // Create and fill vector
+    using tsValue_t = uint32_t;         // Test parameters
+    size_t ts_buffer_size = 1 << 16;
+
+    ShadowedVec_t ts_Data;
+    std::uniform_int_distribution<tsValue_t> dis(
+            std::numeric_limits<tsValue_t>::min(),
+            std::numeric_limits<tsValue_t>::max()
+    );
+    ts_Data.resize(ts_buffer_size);
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(ts_gen); });
+
+    // Set exchange optimization + pipeline
+    config.exchangeOpt = true;
+    config.pipeline = 8;
+
+    // Execute function under test in all processes
+    distBitonic(ts_Data, ts_mpi.size(), ts_mpi.rank());
+
+    // Local min and max
+    auto local_min = *std::min_element(ts_Data.begin(), ts_Data.end());
+    auto local_max = *std::max_element(ts_Data.begin(), ts_Data.end());
+
+    // Gather min/max to rank 0
+    std::vector<tsValue_t> global_mins(ts_mpi.size());
+    std::vector<tsValue_t> global_maxes(ts_mpi.size());
+    MPI_Datatype datatype = MPI_TypeMapper<tsValue_t>::getType();
+
+    MPI_Gather(&local_min, 1, datatype, global_mins.data(), 1, datatype, 0, MPI_COMM_WORLD);
+    MPI_Gather(&local_max, 1, datatype, global_maxes.data(), 1, datatype, 0, MPI_COMM_WORLD);
+
+    // Check results
+    EXPECT_EQ(std::is_sorted(ts_Data.begin(), ts_Data.end()), true);
+    if (ts_mpi.rank() == 0) {
+        for (size_t i = 1; i < global_mins.size(); ++i) {
+            EXPECT_LE(global_maxes[i - 1], global_mins[i]);
+        }
+    }
+}
\ No newline at end of file
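
For reference, below is a minimal sketch of how the updated job scripts above might be built and queued on the cluster. It is an illustration only and not part of the patch: the module versions, the hpc-build target, the ./out/distbitonic path and the batch/rome partitions come from the scripts themselves, while the particular selection of scripts and the squeue check are assumptions.

#! /usr/bin/env bash
# Illustrative usage only (not part of the patch).

# Build once, as the in-script NOTE describes
module load gcc/9.2.0 openmpi/4.0.3
make -j hpc-build                      # produces ./out/distbitonic

# Queue, for example, the 4-node / 4-ranks-per-node runs on the 'rome' partition
for job in N4P4T4Q20 N4P4T4Q23 N4P4T4Q24 N4P4T4Q25 N4P4T4Q27; do
    sbatch -p rome "homework_2/hpc/${job}.sh"
done

squeue -u "$USER"                      # confirm the jobs are queued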