@@ -15,6 +15,7 @@
#include <cmath>
#include <cstdint>
#include <utility>
#include <stdexcept>

#include "utils.hpp"
@@ -159,9 +160,9 @@ void bitonicSort(DataT& data) {

    Timer_memory.start();
    if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
-       throw std::runtime_error("[CUDA] - Can not allocate memory\n");
+       throw std::runtime_error("[CUDA] - Can not allocate memory");
    if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
-       throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
+       throw std::runtime_error("[CUDA] - Can not copy memory to device");
    Timer_memory.stop();

    size_t Nth = config.blockSize;
@@ -180,7 +181,7 @@ void bitonicSort(DataT& data) {

    Timer_memory.start();
    if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
-       throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
+       throw std::runtime_error("[CUDA] - Can not copy memory from device");
    cudaFree(dev_data);
    Timer_memory.stop();
}
@@ -247,6 +248,31 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
    }
}

/*!
 * This is the unrolled part of the bitonic double loop for the init phase, where the
 * entire double loop fits within a single block of threads.
 *
 * Each thread repeatedly applies the inter-block compare-exchange step directly on
 * global memory, synchronizing with the rest of its block after every step.
 *
 * @tparam ValueT The underlying data type of the array items
 * @param data [ValueT*] Pointer to data array
 * @param n [size_t] The total size of the array
 * @param stages [size_t] The number of stages to pre-execute
 * @param maxStages [size_t] The maximum number of stages for the entire sort
 */
template <typename ValueT>
__global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages) {
    for (size_t stage = 1; (stage <= stages) && (stage <= maxStages); ++stage) {
        for (size_t step = stage; step > 0; ) {
            --step;
            interBlockStep_(data, n, step, stage);
            __syncthreads();
        }
    }
}
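
// For illustration: with stages = 3 the kernel above walks the step sequences {1}, {2,1}
// and {3,2,1} internally, so a single launch along the lines of
//     prephase<<<Nbl, Nth>>>(dev_data, size, 3, Stages);
// performs six compare-exchange passes with only block-level synchronization between them.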

/*!
 * A CUDA version of the Bitonic sort algorithm.
 *
@@ -262,18 +288,22 @@ void bitonicSort(DataT& data) {

    Timer_memory.start();
    if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
-       throw std::runtime_error("[CUDA] - Can not allocate memory\n");
+       throw std::runtime_error("[CUDA] - Can not allocate memory");
    if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
-       throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
+       throw std::runtime_error("[CUDA] - Can not copy memory to device");
    Timer_memory.stop();

    size_t Nth = config.blockSize;
    size_t Nbl = NBlocks(size);

    auto Stages = static_cast<size_t>(log2(size));
-   auto InnerBlockSteps = static_cast<size_t>(log2(Nth)); //
+   auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
    size_t PrephaseStages = InnerBlockSteps + 1;
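    // PrephaseStages is InnerBlockSteps + 1 because each block's threads own
    // SizeToThreadsRatio * Nth = 2 * Nth consecutive elements, so every stage up to
    // log2(2 * Nth) = log2(Nth) + 1 keeps all compare-exchange partners inside one block.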

    Timer_sorting.start();
-   for (size_t stage = 1; stage <= Stages; ++stage) {
+   prephase<<<Nbl, Nth>>>(dev_data, size, PrephaseStages, Stages);
+   cudaDeviceSynchronize();
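    // Example figures: size = 1 << 20 and Nth = 256 give Stages = 20, InnerBlockSteps = 8,
    // PrephaseStages = 9. The pre-phase finishes stages 1..9 in one launch; each remaining
    // stage then issues (stage - 1 - 8) interBlockStep launches below, with the last 8 steps
    // presumably handled by an inBlockStep launch in the code that follows this hunk.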
+   for (size_t stage = PrephaseStages + 1; stage <= Stages; ++stage) {
        size_t step = stage - 1;
        for ( ; step > InnerBlockSteps; --step) {
            interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
@@ -286,7 +316,7 @@ void bitonicSort(DataT& data) {

    Timer_memory.start();
    if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
-       throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
+       throw std::runtime_error("[CUDA] - Can not copy memory from device");
    cudaFree(dev_data);
    Timer_memory.stop();
}
@@ -301,7 +331,9 @@ void bitonicSort(DataT& data) {
 * @note
 * Each block's thread collection can exchange twice its own number of data points.
 */
-inline size_t effectiveBlockSize() { return SizeToThreadsRatio * config.blockSize; }
+inline constexpr size_t effectiveBlockSize(size_t blockSize) {
+    return SizeToThreadsRatio * blockSize;
+}
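
// For illustration (block size assumed): with SizeToThreadsRatio == 2 and a block size of
// 256, effectiveBlockSize(256) == 512, i.e. every block stages 512 elements, two per thread.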
@@ -400,10 +432,70 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
        __syncthreads();
    }

-   // Write back to global memory
+   // Write back to global memory (no sync here, there will be sync from host)
    data[gIdx0] = shared_data[lIdx0];
    data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
}

/*!
 * This is the unrolled part of the bitonic double loop for the init phase, where the
 * entire double loop fits in one block using shared-memory access.
 *
 * First, each thread caches its two corresponding data points, one from the current and
 * one from the following data block. We then execute the pre-phase on the local copy and
 * write the results back to global memory.
 *
 * @tparam ValueT The underlying data type of the array items
 * @param data [ValueT*] Pointer to data array
 * @param n [size_t] The total size of the array
 * @param stages [size_t] The number of stages to pre-execute
 * @param maxStages [size_t] The maximum number of stages for the entire sort
 */
template <typename ValueT>
__global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages) {
    extern __shared__ ValueT shared_data[];

    /*
     * Global and local (shared) memory indices, calculated once.
     * Each block strides over SizeToThreadsRatio data blocks (so it skips one block when
     * SizeToThreadsRatio = 2) and caches the neighboring block's elements in local
     * (shared) memory as well.
     */
    threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
    threadId_t lIdx0 = toLocal(gIdx0, blockDim.x);

    if (gIdx0 + blockDim.x >= n) // Boundary check
        return;
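
    // The sort assumes a power-of-two size (Stages = log2(size)), so n is a multiple of the
    // effective block size and this check is uniform per block: no thread exits early while
    // the rest of its block continues into the __syncthreads() calls below.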

    // Fetch to local memory the entire effective block size (2 positions for each thread)
    shared_data[lIdx0] = data[gIdx0];
    shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x];
    __syncthreads();
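    // From this point on every thread can safely read its partner's cached element.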
    for (size_t stage = 1; (stage <= stages) && (stage <= maxStages); ++stage) {
        for (size_t step = stage; step > 0; ) {
            --step;

            // Init thread global and local indices
            threadId_t gIdx = gIdx0;
            threadId_t lIdx = lIdx0;
            // Find partner and keep-small configuration based on the global data positions
            threadId_t pIdx = partner(gIdx, step);
            if (gIdx > pIdx) {
                // Shift inside effective block
                gIdx += blockDim.x; // global
                pIdx += blockDim.x;
                lIdx += blockDim.x; // local
            }
            bool keep = keepSmall(gIdx, pIdx, stage);

            // Exchange data on local (shared) copy
            threadId_t lpIdx = toLocal(pIdx, blockDim.x);
            exchange(shared_data, lIdx, lpIdx, keep);
            __syncthreads();
        }
    }

    // Write back to global memory (no sync here, there will be sync from host)
    data[gIdx0] = shared_data[lIdx0];
    data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
}
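
// For illustration (assuming toLocal(g, blockDim.x) maps a global index into the
// [0, SizeToThreadsRatio * blockDim.x) window of the block's shared buffer): with
// blockDim.x = 4 and blockIdx.x = 1, threads 0..3 take gIdx0 = 8..11, cache data[8..15]
// into shared_data[0..7], run the pre-phase on that slice and write the same eight
// elements back.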

/*!
@@ -421,19 +513,23 @@ void bitonicSort(DataT& data) {

    Timer_memory.start();
    if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
-       throw std::runtime_error("[CUDA] - Can not allocate memory\n");
+       throw std::runtime_error("[CUDA] - Can not allocate memory");
    if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
-       throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
+       throw std::runtime_error("[CUDA] - Can not copy memory to device");
    Timer_memory.stop();

    size_t Nth = config.blockSize;
    size_t Nbl = NBlocks(size);
-   size_t kernelMemSize = effectiveBlockSize() * sizeof(value_t);
+   size_t kernelMemSize = effectiveBlockSize(config.blockSize) * sizeof(value_t);
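    // kernelMemSize is the dynamic shared-memory size per block, passed as the third
    // launch-configuration parameter below so that shared_data[] can hold the whole
    // effective block (two elements per thread).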

    auto Stages = static_cast<size_t>(log2(size));
    auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
    size_t PrephaseStages = InnerBlockSteps + 1;

    Timer_sorting.start();
-   for (size_t stage = 1; stage <= Stages; ++stage) {
+   prephase<<<Nbl, Nth, kernelMemSize>>>(dev_data, size, PrephaseStages, Stages);
+   cudaDeviceSynchronize();
+   for (size_t stage = PrephaseStages + 1; stage <= Stages; ++stage) {
        size_t step = stage - 1;
        for ( ; step > InnerBlockSteps; --step) {
            interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
@@ -446,7 +542,7 @@ void bitonicSort(DataT& data) {

    Timer_memory.start();
    if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
-       throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
+       throw std::runtime_error("[CUDA] - Can not copy memory from device");
    cudaFree(dev_data);
    Timer_memory.stop();
}