diff --git a/homework_2/hpc/N1P2T4Q20.sh b/homework_2/hpc/N1P2T4Q20.sh
index c6f1dbf..c80a517 100644
--- a/homework_2/hpc/N1P2T4Q20.sh
+++ b/homework_2/hpc/N1P2T4Q20.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=2
 #SBATCH --cpus-per-task=4
-#SBATCH --time=1:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 20 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 20 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N1P2T4Q23.sh b/homework_2/hpc/N1P2T4Q23.sh
index 8a82c70..bdb9db8 100644
--- a/homework_2/hpc/N1P2T4Q23.sh
+++ b/homework_2/hpc/N1P2T4Q23.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=2
 #SBATCH --cpus-per-task=4
-#SBATCH --time=1:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 23 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 23 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N1P2T4Q25.sh b/homework_2/hpc/N1P2T4Q25.sh
index 95858c3..fb55d5e 100644
--- a/homework_2/hpc/N1P2T4Q25.sh
+++ b/homework_2/hpc/N1P2T4Q25.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=2
 #SBATCH --cpus-per-task=4
-#SBATCH --time=1:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 25 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 25 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N1P2T4Q27.sh b/homework_2/hpc/N1P2T4Q27.sh
index beb0faf..0bf98ac 100644
--- a/homework_2/hpc/N1P2T4Q27.sh
+++ b/homework_2/hpc/N1P2T4Q27.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=2
 #SBATCH --cpus-per-task=4
-#SBATCH --time=1:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 27 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 27 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N1P4T4Q20.sh b/homework_2/hpc/N1P4T4Q20.sh
index d5dd27a..a0072c8 100644
--- a/homework_2/hpc/N1P4T4Q20.sh
+++ b/homework_2/hpc/N1P4T4Q20.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=1:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 20 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 20 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N1P4T4Q23.sh b/homework_2/hpc/N1P4T4Q23.sh
index dcd0e7a..82cb920 100644
--- a/homework_2/hpc/N1P4T4Q23.sh
+++ b/homework_2/hpc/N1P4T4Q23.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=1:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 23 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 23 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N1P4T4Q25.sh b/homework_2/hpc/N1P4T4Q25.sh
index a9619ff..c77a744 100644
--- a/homework_2/hpc/N1P4T4Q25.sh
+++ b/homework_2/hpc/N1P4T4Q25.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=1:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 25 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 25 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N1P4T4Q26.sh b/homework_2/hpc/N1P4T4Q26.sh
new file mode 100644
index 0000000..cd75bd1
--- /dev/null
+++ b/homework_2/hpc/N1P4T4Q26.sh
@@ -0,0 +1,28 @@
+#! /usr/bin/env bash
+
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=4
+#SBATCH --time=5:00
+
+# Use this as following
+# $> sbatch -p batch|rome
+#
+# NOTE:
+# First compile in aristotle with
+# $> module load gcc/9.2.0 openmpi/4.0.3
+# $> make -j hpc-build
+#
+
+module load gcc/9.2.0 openmpi/4.0.3
+# Note:
+# The above versions are matching w/ my system's
+# versions, thus making compiling/debugging easier.
+
+# Suppress unused UCX_ROOT warning
+export UCX_WARN_UNUSED_ENV_VARS=n
+
+# Suppress CUDA-aware support is disabled warning
+export OMPI_MCA_opal_warn_on_missing_libcuda=0
+
+srun ./out/distbitonic -q 26 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N1P4T4Q27.sh b/homework_2/hpc/N1P4T4Q27.sh
index e8742a4..0ae3358 100644
--- a/homework_2/hpc/N1P4T4Q27.sh
+++ b/homework_2/hpc/N1P4T4Q27.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=1:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 27 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 27 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N2P4T4Q20.sh b/homework_2/hpc/N2P4T4Q20.sh
index 93a5944..3ea9142 100644
--- a/homework_2/hpc/N2P4T4Q20.sh
+++ b/homework_2/hpc/N2P4T4Q20.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=2
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 20 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 20 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N2P4T4Q23.sh b/homework_2/hpc/N2P4T4Q23.sh
index e9a0851..61a66e1 100644
--- a/homework_2/hpc/N2P4T4Q23.sh
+++ b/homework_2/hpc/N2P4T4Q23.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=2
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 23 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 23 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N2P4T4Q25.sh b/homework_2/hpc/N2P4T4Q25.sh
index f03f569..4ce1466 100644
--- a/homework_2/hpc/N2P4T4Q25.sh
+++ b/homework_2/hpc/N2P4T4Q25.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=2
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 25 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 25 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N2P4T4Q27.sh b/homework_2/hpc/N2P4T4Q27.sh
index bf9f8dd..078be28 100644
--- a/homework_2/hpc/N2P4T4Q27.sh
+++ b/homework_2/hpc/N2P4T4Q27.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=2
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 27 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 27 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N4P16T4Q20.sh b/homework_2/hpc/N4P16T4Q20.sh
index 4ee3cd1..16c7e14 100644
--- a/homework_2/hpc/N4P16T4Q20.sh
+++ b/homework_2/hpc/N4P16T4Q20.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=16
 #SBATCH --cpus-per-task=4
-#SBATCH --time=5:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 20 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 20 --validation --perf 8 --exchange-opt
\ No newline at end of file
diff --git a/homework_2/hpc/N4P16T4Q22.sh b/homework_2/hpc/N4P16T4Q22.sh
new file mode 100644
index 0000000..d48dae4
--- /dev/null
+++ b/homework_2/hpc/N4P16T4Q22.sh
@@ -0,0 +1,28 @@
+#! /usr/bin/env bash
+
+#SBATCH --nodes=4
+#SBATCH --ntasks-per-node=16
+#SBATCH --cpus-per-task=4
+#SBATCH --time=10:00
+
+# Use this as following
+# $> sbatch -p batch|rome
+#
+# NOTE:
+# First compile in aristotle with
+# $> module load gcc/9.2.0 openmpi/4.0.3
+# $> make -j hpc-build
+#
+
+module load gcc/9.2.0 openmpi/4.0.3
+# Note:
+# The above versions are matching w/ my system's
+# versions, thus making compiling/debugging easier.
+
+# Suppress unused UCX_ROOT warning
+export UCX_WARN_UNUSED_ENV_VARS=n
+
+# Suppress CUDA-aware support is disabled warning
+export OMPI_MCA_opal_warn_on_missing_libcuda=0
+
+srun ./out/distbitonic -q 22 --validation --perf 8 --exchange-opt
\ No newline at end of file
diff --git a/homework_2/hpc/N4P16T4Q23.sh b/homework_2/hpc/N4P16T4Q23.sh
index a44b4fc..03f6b03 100644
--- a/homework_2/hpc/N4P16T4Q23.sh
+++ b/homework_2/hpc/N4P16T4Q23.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=16
 #SBATCH --cpus-per-task=4
-#SBATCH --time=5:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 23 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 23 --validation --perf 8 --exchange-opt
\ No newline at end of file
diff --git a/homework_2/hpc/N4P16T4Q25.sh b/homework_2/hpc/N4P16T4Q25.sh
index 2be0a7d..65b95c8 100644
--- a/homework_2/hpc/N4P16T4Q25.sh
+++ b/homework_2/hpc/N4P16T4Q25.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=16
 #SBATCH --cpus-per-task=4
-#SBATCH --time=5:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8
\ No newline at end of file
+srun ./out/distbitonic -q 25 --validation --perf 8 --exchange-opt
\ No newline at end of file
diff --git a/homework_2/hpc/N4P16T4Q27.sh b/homework_2/hpc/N4P16T4Q27.sh
index e7b2ba4..e0e95c6 100644
--- a/homework_2/hpc/N4P16T4Q27.sh
+++ b/homework_2/hpc/N4P16T4Q27.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=16
 #SBATCH --cpus-per-task=4
-#SBATCH --time=5:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8
\ No newline at end of file
+srun ./out/distbitonic -q 27 --validation --perf 8 --exchange-opt
\ No newline at end of file
diff --git a/homework_2/hpc/N4P32T4Q20.sh b/homework_2/hpc/N4P32T4Q20.sh
index 2b7b4bb..182c4e3 100644
--- a/homework_2/hpc/N4P32T4Q20.sh
+++ b/homework_2/hpc/N4P32T4Q20.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=32
 #SBATCH --cpus-per-task=4
-#SBATCH --time=5:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 20 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 20 --validation --perf 8 --exchange-opt
\ No newline at end of file
diff --git a/homework_2/hpc/N4P32T4Q21.sh b/homework_2/hpc/N4P32T4Q21.sh
new file mode 100644
index 0000000..f0d0037
--- /dev/null
+++ b/homework_2/hpc/N4P32T4Q21.sh
@@ -0,0 +1,28 @@
+#! /usr/bin/env bash
+
+#SBATCH --nodes=4
+#SBATCH --ntasks-per-node=32
+#SBATCH --cpus-per-task=4
+#SBATCH --time=10:00
+
+# Use this as following
+# $> sbatch -p batch|rome
+#
+# NOTE:
+# First compile in aristotle with
+# $> module load gcc/9.2.0 openmpi/4.0.3
+# $> make -j hpc-build
+#
+
+module load gcc/9.2.0 openmpi/4.0.3
+# Note:
+# The above versions are matching w/ my system's
+# versions, thus making compiling/debugging easier.
+
+# Suppress unused UCX_ROOT warning
+export UCX_WARN_UNUSED_ENV_VARS=n
+
+# Suppress CUDA-aware support is disabled warning
+export OMPI_MCA_opal_warn_on_missing_libcuda=0
+
+srun ./out/distbitonic -q 21 --validation --perf 8 --exchange-opt
\ No newline at end of file
diff --git a/homework_2/hpc/N4P32T4Q23.sh b/homework_2/hpc/N4P32T4Q23.sh
index 7db03b4..7c32e54 100644
--- a/homework_2/hpc/N4P32T4Q23.sh
+++ b/homework_2/hpc/N4P32T4Q23.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=32
 #SBATCH --cpus-per-task=4
-#SBATCH --time=5:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 23 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 23 --validation --perf 8 --exchange-opt
\ No newline at end of file
diff --git a/homework_2/hpc/N4P32T4Q25.sh b/homework_2/hpc/N4P32T4Q25.sh
index 06da205..e90e4f1 100644
--- a/homework_2/hpc/N4P32T4Q25.sh
+++ b/homework_2/hpc/N4P32T4Q25.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=32
 #SBATCH --cpus-per-task=4
-#SBATCH --time=5:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8
\ No newline at end of file
+srun ./out/distbitonic -q 25 --validation --perf 8 --exchange-opt
\ No newline at end of file
diff --git a/homework_2/hpc/N4P32T4Q27.sh b/homework_2/hpc/N4P32T4Q27.sh
index f42d08c..5de7e8d 100644
--- a/homework_2/hpc/N4P32T4Q27.sh
+++ b/homework_2/hpc/N4P32T4Q27.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=32
 #SBATCH --cpus-per-task=4
-#SBATCH --time=5:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8
\ No newline at end of file
+srun ./out/distbitonic -q 27 --validation --perf 8 --exchange-opt
\ No newline at end of file
diff --git a/homework_2/hpc/N4P4T4Q20.sh b/homework_2/hpc/N4P4T4Q20.sh
index 176ff23..777e221 100644
--- a/homework_2/hpc/N4P4T4Q20.sh
+++ b/homework_2/hpc/N4P4T4Q20.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 20 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 20 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N4P4T4Q23.sh b/homework_2/hpc/N4P4T4Q23.sh
index 58d3c99..10d823c 100644
--- a/homework_2/hpc/N4P4T4Q23.sh
+++ b/homework_2/hpc/N4P4T4Q23.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 23 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 23 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N4P4T4Q24.sh b/homework_2/hpc/N4P4T4Q24.sh
new file mode 100644
index 0000000..66fd3ef
--- /dev/null
+++ b/homework_2/hpc/N4P4T4Q24.sh
@@ -0,0 +1,28 @@
+#! /usr/bin/env bash
+
+#SBATCH --nodes=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=4
+#SBATCH --time=5:00
+
+# Use this as following
+# $> sbatch -p batch|rome
+#
+# NOTE:
+# First compile in aristotle with
+# $> module load gcc/9.2.0 openmpi/4.0.3
+# $> make -j hpc-build
+#
+
+module load gcc/9.2.0 openmpi/4.0.3
+# Note:
+# The above versions are matching w/ my system's
+# versions, thus making compiling/debugging easier.
+
+# Suppress unused UCX_ROOT warning
+export UCX_WARN_UNUSED_ENV_VARS=n
+
+# Suppress CUDA-aware support is disabled warning
+export OMPI_MCA_opal_warn_on_missing_libcuda=0
+
+srun ./out/distbitonic -q 24 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N4P4T4Q25.sh b/homework_2/hpc/N4P4T4Q25.sh
index ded9350..a758509 100644
--- a/homework_2/hpc/N4P4T4Q25.sh
+++ b/homework_2/hpc/N4P4T4Q25.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 25 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 25 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N4P4T4Q27.sh b/homework_2/hpc/N4P4T4Q27.sh
index d1d03cb..2cb3870 100644
--- a/homework_2/hpc/N4P4T4Q27.sh
+++ b/homework_2/hpc/N4P4T4Q27.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=5:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 27 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 27 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N4P8T4Q20.sh b/homework_2/hpc/N4P8T4Q20.sh
index ac2af75..a87809e 100644
--- a/homework_2/hpc/N4P8T4Q20.sh
+++ b/homework_2/hpc/N4P8T4Q20.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=8
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 20 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 20 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N4P8T4Q23.sh b/homework_2/hpc/N4P8T4Q23.sh
index 23a1cc5..c940ac1 100644
--- a/homework_2/hpc/N4P8T4Q23.sh
+++ b/homework_2/hpc/N4P8T4Q23.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=8
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 23 --perf --validation
\ No newline at end of file
+srun ./out/distbitonic -q 23 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N4P8T4Q25.sh b/homework_2/hpc/N4P8T4Q25.sh
index ec4a8be..cd9a427 100644
--- a/homework_2/hpc/N4P8T4Q25.sh
+++ b/homework_2/hpc/N4P8T4Q25.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=8
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 25 --perf --validation --pipeline 8
\ No newline at end of file
+srun ./out/distbitonic -q 25 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/hpc/N4P8T4Q27.sh b/homework_2/hpc/N4P8T4Q27.sh
index 18553df..51dc99e 100644
--- a/homework_2/hpc/N4P8T4Q27.sh
+++ b/homework_2/hpc/N4P8T4Q27.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=4
 #SBATCH --ntasks-per-node=8
 #SBATCH --cpus-per-task=4
-#SBATCH --time=2:00
+#SBATCH --time=10:00
 
 # Use this as following
 # $> sbatch -p batch|rome
@@ -25,4 +25,4 @@ export UCX_WARN_UNUSED_ENV_VARS=n
 # Suppress CUDA-aware support is disabled warning
 export OMPI_MCA_opal_warn_on_missing_libcuda=0
 
-srun ./out-rc3b/distbitonic -q 27 --perf --validation --pipeline 8
\ No newline at end of file
+srun ./out/distbitonic -q 27 --validation --perf 8
\ No newline at end of file
diff --git a/homework_2/include/config.h b/homework_2/include/config.h
index 5c62edf..e2239b5 100644
--- a/homework_2/include/config.h
+++ b/homework_2/include/config.h
@@ -27,7 +27,7 @@
 // Default Data size (in case -q is not present)
 static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16;
 
-// The maximum MPI size we support
+// The maximum MPI size we support (in Nodes x Processes)
 static constexpr size_t MAX_MPI_SIZE = 1024UL;
 
 // The maximum pipeline size we support
@@ -61,7 +61,7 @@ struct config_t {
     bool validation{false}; //!< Request a full validation at the end, performed by process rank 0.
     bool ndebug{false};     //!< Skips debug trap on DEBUG builds.
     size_t perf{1};         //!< Enable performance timing measurements and prints and repeat
-                            //!< the performs the sorting times to average the measurements
+                            //!< the sorting times.
     bool verbose{false};    //!< Flag to enable verbose output to stdout.
 };
 
diff --git a/homework_2/include/utils.hpp b/homework_2/include/utils.hpp
index 6936d86..4b71366 100644
--- a/homework_2/include/utils.hpp
+++ b/homework_2/include/utils.hpp
@@ -129,7 +129,7 @@ struct MPI_t {
 * This function matches a transmit and a receive in order for fully exchanged data between
 * current node and partner.
 *
- * @tparam T The inner valur type used in buffer
+ * @tparam ValueT The value type used in buffer
 *
 * @param ldata [const ValueT*] Pointer to local data to send
 * @param rdata [ValueT*] Pointer to buffer to receive data from partner
@@ -163,7 +163,7 @@ struct MPI_t {
 * This call MUST paired with exchange_wait() for each MPI_t object.
 * Calling 2 consecutive exchange_start() for the same MPI_t object is undefined.
 *
- * @tparam ValueT The underlying value type used in buffers
+ * @tparam ValueT The value type used in buffers
 *
 * @param ldata [const ValueT*] Pointer to local data to send
 * @param rdata [ValueT*] Pointer to buffer to receive data from partner
@@ -267,8 +267,8 @@ using mpi_id_t = MPI_t<>::ID_t;
 /*!
 * @brief A std::vector wrapper with 2 vectors, an active and a shadow.
 *
- * This type exposes the standard vector
- * functionality of the active vector. The shadow can be used when we need to use the vector as mutable
+ * This type exposes the standard vector functionality of the active vector.
+ * The shadow can be used when we need to use the vector as mutable
 * data in algorithms that can not support "in-place" editing (like elbow-sort for example)
 *
 * @tparam Value_t the underlying data type of the vectors
@@ -418,7 +418,9 @@ private:
 extern Log logger;
 
 /*!
- * A small timing utility based on chrono.
+ * A small timing utility based on chrono that supports timing rounds
+ * and returning the median of them. Time can be accumulated into the measurement
+ * for each round.
 */
 struct Timing {
     using Tpoint = std::chrono::steady_clock::time_point;
@@ -485,7 +487,7 @@ private:
 };
 
 /*!
- * Utility "high level function"-like macro to forward a function call
+ * A "high level function"-like utility macro to forward a function call
 * and accumulate the execution time to the corresponding timing object.
 *
 * @param Tim The Timing object [Needs to have methods start() and stop()]
diff --git a/homework_2/src/main.cpp b/homework_2/src/main.cpp
index 1eded9f..c5f51ef 100644
--- a/homework_2/src/main.cpp
+++ b/homework_2/src/main.cpp
@@ -24,6 +24,10 @@ distBuffer_t Data;
 Log logger;
 distStat_t localStat, remoteStat;
 
+// Mersenne seeded from hw if possible. range: [type_min, type_max]
+std::random_device rd;
+std::mt19937 gen(rd());
+
 //! Performance timers for each one of the "costly" functions
 Timing Ttotal;
 Timing TfullSort;
@@ -106,9 +110,9 @@ bool get_options(int argc, char* argv[]){
 
         } else if (arg == "-h" || arg == "--help") {
             std::cout << "distbitonic/distbubbletonic - A distributed sort utility\n\n";
-            std::cout << "distbitonic -q [-e] [-p | --pipeline N] [--validation] [--perf] [--ndebug] [-v]\n";
+            std::cout << "distbitonic -q [-e] [-p | --pipeline N] [--validation] [--perf ] [--ndebug] [-v]\n";
             std::cout << "distbitonic -h\n";
-            std::cout << "distbubbletonic -q [-e] [-p | --pipeline N] [--validation] [--perf] [--ndebug] [-v]\n";
+            std::cout << "distbubbletonic -q [-e] [-p | --pipeline N] [--validation] [--perf ] [--ndebug] [-v]\n";
             std::cout << "distbubbletonic -h\n";
             std::cout << '\n';
             std::cout << "Options:\n\n";
@@ -123,7 +127,7 @@ bool get_options(int argc, char* argv[]){
             std::cout << " Request a full validation at the end, performed by process rank 0\n\n";
             std::cout << " --perf \n";
             std::cout << " Enable performance timing measurements and prints, and repeat\n";
-            std::cout << " the sorting times to average the measurements\n\n";
+            std::cout << " the sorting times.\n\n";
             std::cout << " --ndebug\n";
             std::cout << " Skip debug breakpoint when on debug build.\n\n";
             std::cout << " -v | --verbose\n";
@@ -190,59 +194,67 @@ bool validator(ShadowedDataT& data, mpi_id_t Processes, mpi_id_t rank) {
     return ret;
 }
 
-#if !defined TESTING
 /*!
- * @return Returns 0, but.... we may throw or exit(1)
+ * Initializes the environment, must be called from each process
+ *
+ * @param argc [int*] POINTER to main's argc argument
+ * @param argv [char***] POINTER to main's argv argument
 */
-int main(int argc, char* argv[]) try {
+void init(int* argc, char*** argv) {
     // Initialize MPI environment
-    mpi.init(&argc, &argv);
+    mpi.init(argc, argv);
 
     // try to read command line (after MPI parsing)
-    if (!get_options(argc, argv))
+    if (!get_options(*argc, *argv))
         exit(1);
 
-    logger << "MPI environment initialized." <<
-        " Rank: " << mpi.rank() <<
-        " Size: " << mpi.size() <<
-        logger.endl;
+    logger << "MPI environment initialized." << " Rank: " << mpi.rank() << " Size: " << mpi.size()
+           << logger.endl;
 
 #if defined DEBUG
 #if defined TESTING
-    /*
-     * In case of a debug build we will wait here until sleep_wait
-     * will reset via debugger. In order to do that the user must attach
-     * debugger to all processes. For example:
-     * $> mpirun -np 2 ./
-     * $> ps aux | grep
-     * $> gdb
-     * $> gdb
-     */
-    volatile bool sleep_wait = false;
+    /*
+     * In case of a debug build we will wait here until sleep_wait
+     * will reset via debugger. In order to do that the user must attach
+     * debugger to all processes. For example:
+     * $> mpirun -np 2 ./
+     * $> ps aux | grep
+     * $> gdb
+     * $> gdb
+     */
+    volatile bool sleep_wait = false;
 #else
-    volatile bool sleep_wait = true;
+    volatile bool sleep_wait = true;
 #endif
-    while (sleep_wait && !config.ndebug)
-        sleep(1);
+    while (sleep_wait && !config.ndebug)
+        sleep(1);
 #endif
 
-    // Initialize local data
-    logger << "Initialize local array of " << config.arraySize << " elements" << logger.endl;
-    std::random_device rd; // Mersenne seeded from hw if possible. range: [type_min, type_max]
-    std::mt19937 gen(rd());
-    std::uniform_int_distribution dis(
-        std::numeric_limits::min(),
-        std::numeric_limits::max()
-    );
-    // Fill vector
+    // Prepare vector and timing data
     Data.resize(config.arraySize);
-    std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); });
-
-    // Run distributed sort
-    if (mpi.rank() == 0)
-        logger << "Starting distributed sorting ... ";
     measurements_init();
+}
+
+#if !defined TESTING
+/*!
+ * @return Returns 0, but.... we may throw or exit(1)
+ */
+int main(int argc, char* argv[]) try {
+
+    // Init everything
+    init(&argc, &argv);
+    for (size_t it = 0 ; it < config.perf ; ++it) {
+        // Initialize local data
+        logger << "Initialize local array of " << config.arraySize << " elements" << logger.endl;
+        std::uniform_int_distribution dis(
+                std::numeric_limits::min(),
+                std::numeric_limits::max()
+        );
+        std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); });
+        // Run distributed sort
+        if (mpi.rank() == 0)
+            logger << "Starting distributed sorting ... ";
     Ttotal.start();
 #if CODE_VERSION == BUBBLETONIC
     distBubbletonic(Data, mpi.size(), mpi.rank());
@@ -251,9 +263,9 @@
 #endif
     Ttotal.stop();
     measurements_next();
+        if (mpi.rank() == 0)
+            logger << " Done." << logger.endl;
     }
-    if (mpi.rank() == 0)
-        logger << " Done." << logger.endl;
 
     // Print-outs and validation
     if (config.perf > 1) {
@@ -266,10 +278,10 @@
     if (config.validation) {
         // If requested, we have the chance to fail!
         if (mpi.rank() == 0)
-            std::cout << "Results validation ...";
+            std::cout << "[Validation] Results validation ...";
         bool val = validator(Data, mpi.size(), mpi.rank());
         if (mpi.rank() == 0)
-            std::cout << ((val) ? "\x1B[32m [PASS] \x1B[0m\n" : " \x1B[32m [FAIL] \x1B[0m\n");
+            std::cout << ((val) ? "\x1B[32m [PASSED] \x1B[0m\n" : " \x1B[32m [FAILED] \x1B[0m\n");
     }
     mpi.finalize();
     return 0;
diff --git a/homework_2/test/tests_CommonUtils.cpp b/homework_2/test/tests_CommonUtils.cpp
index 5f14989..4739e0a 100644
--- a/homework_2/test/tests_CommonUtils.cpp
+++ b/homework_2/test/tests_CommonUtils.cpp
@@ -91,3 +91,29 @@ TEST(TdistCommonUT, elbowSort_test3) {
     EXPECT_EQ((ts_data == ts_expected_des), true);
 }
 
+/*
+ * Tag generator test without stage calls
+ */
+TEST(TdistCommonUT, tagGenerator_test1) {
+    // The maximum MPI size we support
+    // static constexpr size_t MAX_MPI_SIZE = 1024UL;
+    // The maximum pipeline size we support
+    // static constexpr size_t MAX_PIPELINE_SIZE = 64UL;
+
+    std::vector<int> ts_tags;
+    auto ts_logSize = static_cast<size_t>(std::log2(MAX_MPI_SIZE));
+
+    for (size_t depth = 0; depth <= ts_logSize; ++depth) {
+        for (size_t step = 0 ; step < MAX_MPI_SIZE; ++step) {
+            int tag = static_cast<int>(tagGenerator(depth, step));
+            ts_tags.push_back(tag);         // Exchange optimization
+            for (size_t stage = 0; stage < MAX_PIPELINE_SIZE; ++stage) {
+                ts_tags.push_back(++tag);   // stages
+            }
+        }
+    }
+    std::sort(ts_tags.begin(), ts_tags.end());
+    for (size_t i = 0 ; i < ts_tags.size() - 1 ; ++i)
+        EXPECT_NE(ts_tags[i], ts_tags[i+1]);
+}
+
diff --git a/homework_2/test/tests_MPI.cpp b/homework_2/test/tests_MPI.cpp
index 2c79bee..64735d3 100644
--- a/homework_2/test/tests_MPI.cpp
+++ b/homework_2/test/tests_MPI.cpp
@@ -27,8 +27,8 @@ MPI_t<> ts_mpi;
 
 
 // Mersenne seeded from hw if possible. range: [type_min, type_max]
-std::random_device rd;
-std::mt19937 gen(rd());
+std::random_device ts_rd;
+std::mt19937 ts_gen(ts_rd());
 
 class TMPIdistSort : public ::testing::Test {
 protected:
@@ -59,7 +59,7 @@ TEST_F(TMPIdistSort, distBubbletonic_test1) {
         std::numeric_limits<tsValue_t>::max()
     );
     ts_Data.resize(ts_buffer_size);
-    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); });
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(ts_gen); });
 
     // Execute function under test in all processes
     distBubbletonic(ts_Data, ts_mpi.size(), ts_mpi.rank());
@@ -100,7 +100,7 @@ TEST_F(TMPIdistSort, distBubbletonic_test2) {
         std::numeric_limits<tsValue_t>::max()
    );
     ts_Data.resize(ts_buffer_size);
-    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); });
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(ts_gen); });
 
     // Execute function under test in all processes
     distBubbletonic(ts_Data, ts_mpi.size(), ts_mpi.rank());
@@ -141,7 +141,7 @@ TEST_F(TMPIdistSort, distBubbletonic_test3) {
         std::numeric_limits<tsValue_t>::max()
    );
     ts_Data.resize(ts_buffer_size);
-    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); });
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(ts_gen); });
 
     // Set pipeline
     config.pipeline = 8;
@@ -170,6 +170,96 @@ TEST_F(TMPIdistSort, distBubbletonic_test3) {
     }
 }
 
+/*
+ * MPI: SysTest (acceptance)
+ * Each process executes distBubbletonic for uint32_t [1 << 16] with exchange optimization
+ */
+TEST_F(TMPIdistSort, distBubbletonic_test4) {
+    // Create and fill vector
+    using tsValue_t = uint32_t;         // Test parameters
+    size_t ts_buffer_size = 1 << 16;
+
+    ShadowedVec_t ts_Data;
+    std::uniform_int_distribution<tsValue_t> dis(
+            std::numeric_limits<tsValue_t>::min(),
+            std::numeric_limits<tsValue_t>::max()
+    );
+    ts_Data.resize(ts_buffer_size);
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(ts_gen); });
+
+    // Set exchange optimization
+    config.exchangeOpt = true;
+
+    // Execute function under test in all processes
+    distBubbletonic(ts_Data, ts_mpi.size(), ts_mpi.rank());
+
+    // Local min and max
+    auto local_min = *std::min_element(ts_Data.begin(), ts_Data.end());
+    auto local_max = *std::max_element(ts_Data.begin(), ts_Data.end());
+
+    // Gather min/max to rank 0
+    std::vector<tsValue_t> global_mins(ts_mpi.size());
+    std::vector<tsValue_t> global_maxes(ts_mpi.size());
+    MPI_Datatype datatype = MPI_TypeMapper<tsValue_t>::getType();
+
+    MPI_Gather(&local_min, 1, datatype, global_mins.data(), 1, datatype, 0, MPI_COMM_WORLD);
+    MPI_Gather(&local_max, 1, datatype, global_maxes.data(), 1, datatype, 0, MPI_COMM_WORLD);
+
+    // Check results
+    EXPECT_EQ(std::is_sorted(ts_Data.begin(), ts_Data.end()), true);
+    if (ts_mpi.rank() == 0) {
+        for (size_t i = 1; i < global_mins.size(); ++i) {
+            EXPECT_LE(global_maxes[i - 1], global_mins[i]);
+        }
+    }
+}
+
+/*
+ * MPI: SysTest (acceptance)
+ * Each process executes distBubbletonic for uint32_t [1 << 16] with
+ * exchange optimization and pipeline
+ */
+TEST_F(TMPIdistSort, distBubbletonic_test5) {
+    // Create and fill vector
+    using tsValue_t = uint32_t;         // Test parameters
+    size_t ts_buffer_size = 1 << 16;
+
+    ShadowedVec_t ts_Data;
+    std::uniform_int_distribution<tsValue_t> dis(
+            std::numeric_limits<tsValue_t>::min(),
+            std::numeric_limits<tsValue_t>::max()
+    );
+    ts_Data.resize(ts_buffer_size);
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(ts_gen); });
+
+    // Set exchange optimization + pipeline
+    config.exchangeOpt = true;
+    config.pipeline = 8;
+
+    // Execute function under test in all processes
+    distBubbletonic(ts_Data, ts_mpi.size(), ts_mpi.rank());
+
+    // Local min and max
+    auto local_min = *std::min_element(ts_Data.begin(), ts_Data.end());
+    auto local_max = *std::max_element(ts_Data.begin(), ts_Data.end());
+
+    // Gather min/max to rank 0
+    std::vector<tsValue_t> global_mins(ts_mpi.size());
+    std::vector<tsValue_t> global_maxes(ts_mpi.size());
+    MPI_Datatype datatype = MPI_TypeMapper<tsValue_t>::getType();
+
+    MPI_Gather(&local_min, 1, datatype, global_mins.data(), 1, datatype, 0, MPI_COMM_WORLD);
+    MPI_Gather(&local_max, 1, datatype, global_maxes.data(), 1, datatype, 0, MPI_COMM_WORLD);
+
+    // Check results
+    EXPECT_EQ(std::is_sorted(ts_Data.begin(), ts_Data.end()), true);
+    if (ts_mpi.rank() == 0) {
+        for (size_t i = 1; i < global_mins.size(); ++i) {
+            EXPECT_LE(global_maxes[i - 1], global_mins[i]);
+        }
+    }
+}
+
 /*
 * MPI: SysTest (acceptance)
 * Each process executes distBitonic for uin8_t [16]
@@ -185,7 +275,7 @@ TEST_F(TMPIdistSort, distBitonic_test1) {
         std::numeric_limits<tsValue_t>::max()
    );
     ts_Data.resize(ts_buffer_size);
-    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); });
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(ts_gen); });
 
     // Execute function under test in all processes
     distBitonic(ts_Data, ts_mpi.size(), ts_mpi.rank());
@@ -226,7 +316,7 @@ TEST_F(TMPIdistSort, distBitonic_test2) {
         std::numeric_limits<tsValue_t>::max()
    );
     ts_Data.resize(ts_buffer_size);
-    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); });
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(ts_gen); });
 
     // Execute function under test in all processes
     distBitonic(ts_Data, ts_mpi.size(), ts_mpi.rank());
@@ -267,7 +357,7 @@ TEST_F(TMPIdistSort, distBitonic_test3) {
         std::numeric_limits<tsValue_t>::max()
    );
     ts_Data.resize(ts_buffer_size);
-    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(gen); });
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(ts_gen); });
 
     // Set pipeline
     config.pipeline = 8;
@@ -295,3 +385,93 @@ TEST_F(TMPIdistSort, distBitonic_test3) {
         }
     }
 }
+
+/*
+ * MPI: SysTest (acceptance)
+ * Each process executes distBitonic for uint32_t [1 << 16] with exchange optimization
+ */
+TEST_F(TMPIdistSort, distBitonic_test4) {
+    // Create and fill vector
+    using tsValue_t = uint32_t;         // Test parameters
+    size_t ts_buffer_size = 1 << 16;
+
+    ShadowedVec_t ts_Data;
+    std::uniform_int_distribution<tsValue_t> dis(
+            std::numeric_limits<tsValue_t>::min(),
+            std::numeric_limits<tsValue_t>::max()
+    );
+    ts_Data.resize(ts_buffer_size);
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(ts_gen); });
+
+    // Set exchange optimization
+    config.exchangeOpt = true;
+
+    // Execute function under test in all processes
+    distBitonic(ts_Data, ts_mpi.size(), ts_mpi.rank());
+
+    // Local min and max
+    auto local_min = *std::min_element(ts_Data.begin(), ts_Data.end());
+    auto local_max = *std::max_element(ts_Data.begin(), ts_Data.end());
+
+    // Gather min/max to rank 0
+    std::vector<tsValue_t> global_mins(ts_mpi.size());
+    std::vector<tsValue_t> global_maxes(ts_mpi.size());
+    MPI_Datatype datatype = MPI_TypeMapper<tsValue_t>::getType();
+
+    MPI_Gather(&local_min, 1, datatype, global_mins.data(), 1, datatype, 0, MPI_COMM_WORLD);
+    MPI_Gather(&local_max, 1, datatype, global_maxes.data(), 1, datatype, 0, MPI_COMM_WORLD);
+
+    // Check results
+    EXPECT_EQ(std::is_sorted(ts_Data.begin(), ts_Data.end()), true);
+    if (ts_mpi.rank() == 0) {
+        for (size_t i = 1; i < global_mins.size(); ++i) {
+            EXPECT_LE(global_maxes[i - 1], global_mins[i]);
+        }
+    }
+}
+
+/*
+ * MPI: SysTest (acceptance)
+ * Each process executes distBitonic for uint32_t [1 << 16] with
+ * exchange optimization and pipeline
+ */
+TEST_F(TMPIdistSort, distBitonic_test5) {
+    // Create and fill vector
+    using tsValue_t = uint32_t;         // Test parameters
+    size_t ts_buffer_size = 1 << 16;
+
+    ShadowedVec_t ts_Data;
+    std::uniform_int_distribution<tsValue_t> dis(
+            std::numeric_limits<tsValue_t>::min(),
+            std::numeric_limits<tsValue_t>::max()
+    );
+    ts_Data.resize(ts_buffer_size);
+    std::generate(ts_Data.begin(), ts_Data.end(), [&]() { return dis(ts_gen); });
+
+    // Set exchange optimization + pipeline
+    config.exchangeOpt = true;
+    config.pipeline = 8;
+
+    // Execute function under test in all processes
+    distBitonic(ts_Data, ts_mpi.size(), ts_mpi.rank());
+
+    // Local min and max
+    auto local_min = *std::min_element(ts_Data.begin(), ts_Data.end());
+    auto local_max = *std::max_element(ts_Data.begin(), ts_Data.end());
+
+    // Gather min/max to rank 0
+    std::vector<tsValue_t> global_mins(ts_mpi.size());
+    std::vector<tsValue_t> global_maxes(ts_mpi.size());
+    MPI_Datatype datatype = MPI_TypeMapper<tsValue_t>::getType();
+
+    MPI_Gather(&local_min, 1, datatype, global_mins.data(), 1, datatype, 0, MPI_COMM_WORLD);
+    MPI_Gather(&local_max, 1, datatype, global_maxes.data(), 1, datatype, 0, MPI_COMM_WORLD);
+
+    // Check results
+    EXPECT_EQ(std::is_sorted(ts_Data.begin(), ts_Data.end()), true);
+    if (ts_mpi.rank() == 0) {
+        for (size_t i = 1; i < global_mins.size(); ++i) {
+            EXPECT_LE(global_maxes[i - 1], global_mins[i]);
+        }
+    }
+}
\ No newline at end of file
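
For reference, below is a minimal sketch of how the updated job scripts above might be built and queued on the cluster. It is an illustration only and not part of the patch: the module versions, the hpc-build target, the ./out/distbitonic path and the batch/rome partitions come from the scripts themselves, while the particular selection of scripts and the squeue check are assumptions.

#! /usr/bin/env bash
# Illustrative usage only (not part of the patch).

# Build once, as the in-script NOTE describes
module load gcc/9.2.0 openmpi/4.0.3
make -j hpc-build                      # produces ./out/distbitonic

# Queue, for example, the 4-node / 4-ranks-per-node runs on the 'rome' partition
for job in N4P4T4Q20 N4P4T4Q23 N4P4T4Q24 N4P4T4Q25 N4P4T4Q27; do
    sbatch -p rome "homework_2/hpc/${job}.sh"
done

squeue -u "$USER"                      # confirm the jobs are queued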