HW3: RC2 - A prephase added for v1 and v2

This commit is contained in:
Christos Choutouridis 2025-02-16 16:05:01 +02:00
parent 7a6f7f53b5
commit f749862193
68 changed files with 9039 additions and 24 deletions

View File

@ -20,4 +20,8 @@ various/
.vs/ .vs/
.vscode/ .vscode/
# nvidia
*.ncu-proj

View File

@ -0,0 +1,72 @@
Submitting: hpc/BitncV0Q20.sh
Submitted batch job 1914643
Submitting: hpc/BitncV0Q21.sh
Submitted batch job 1914644
Submitting: hpc/BitncV0Q22.sh
Submitted batch job 1914645
Submitting: hpc/BitncV0Q23.sh
Submitted batch job 1914646
Submitting: hpc/BitncV0Q24.sh
Submitted batch job 1914647
Submitting: hpc/BitncV0Q25.sh
Submitted batch job 1914648
Submitting: hpc/BitncV0Q26.sh
Submitted batch job 1914649
Submitting: hpc/BitncV0Q27.sh
Submitted batch job 1914650
Submitting: hpc/BitncV0Q28.sh
Submitted batch job 1914651
Submitting: hpc/BitncV0Q29.sh
Submitted batch job 1914652
Submitting: hpc/BitncV0Q30.sh
Submitted batch job 1914653
Submitting: hpc/BitncV1Q20.sh
Submitted batch job 1914654
Submitting: hpc/BitncV1Q21.sh
Submitted batch job 1914655
Submitting: hpc/BitncV1Q22.sh
Submitted batch job 1914656
Submitting: hpc/BitncV1Q23.sh
Submitted batch job 1914657
Submitting: hpc/BitncV1Q24.sh
Submitted batch job 1914658
Submitting: hpc/BitncV1Q25.sh
Submitted batch job 1914659
Submitting: hpc/BitncV1Q26.sh
Submitted batch job 1914660
Submitting: hpc/BitncV1Q27.sh
Submitted batch job 1914661
Submitting: hpc/BitncV1Q28.sh
Submitted batch job 1914662
Submitting: hpc/BitncV1Q29.sh
Submitted batch job 1914663
Submitting: hpc/BitncV1Q30.sh
Submitted batch job 1914664
Submitting: hpc/BitncV2Q20.sh
Submitted batch job 1914665
Submitting: hpc/BitncV2Q21.sh
Submitted batch job 1914666
Submitting: hpc/BitncV2Q22.sh
Submitted batch job 1914667
Submitting: hpc/BitncV2Q23.sh
Submitted batch job 1914668
Submitting: hpc/BitncV2Q24.sh
Submitted batch job 1914669
Submitting: hpc/BitncV2Q25.sh
Submitted batch job 1914670
Submitting: hpc/BitncV2Q26.sh
Submitted batch job 1914671
Submitting: hpc/BitncV2Q27.sh
Submitted batch job 1914672
Submitting: hpc/BitncV2Q28.sh
Submitted batch job 1914673
Submitting: hpc/BitncV2Q29.sh
Submitted batch job 1914674
Submitting: hpc/BitncV2Q30.sh
Submitted batch job 1914675

View File

@ -0,0 +1,70 @@
Submitting: hpc/BitncV0Q20.sh
Submitted batch job 1914677
Submitting: hpc/BitncV0Q21.sh
Submitted batch job 1914678
Submitting: hpc/BitncV0Q22.sh
Submitted batch job 1914679
Submitting: hpc/BitncV0Q23.sh
Submitted batch job 1914680
Submitting: hpc/BitncV0Q24.sh
Submitted batch job 1914681
Submitting: hpc/BitncV0Q25.sh
Submitted batch job 1914682
Submitting: hpc/BitncV0Q26.sh
Submitted batch job 1914683
Submitting: hpc/BitncV0Q27.sh
Submitted batch job 1914684
Submitting: hpc/BitncV0Q28.sh
Submitted batch job 1914685
Submitting: hpc/BitncV0Q29.sh
Submitted batch job 1914686
Submitting: hpc/BitncV0Q30.sh
Submitted batch job 1914687
Submitting: hpc/BitncV1Q20.sh
Submitted batch job 1914688
Submitting: hpc/BitncV1Q21.sh
Submitted batch job 1914689
Submitting: hpc/BitncV1Q22.sh
Submitted batch job 1914690
Submitting: hpc/BitncV1Q23.sh
Submitted batch job 1914691
Submitting: hpc/BitncV1Q24.sh
Submitted batch job 1914692
Submitting: hpc/BitncV1Q25.sh
Submitted batch job 1914693
Submitting: hpc/BitncV1Q26.sh
Submitted batch job 1914694
Submitting: hpc/BitncV1Q27.sh
Submitted batch job 1914695
Submitting: hpc/BitncV1Q28.sh
Submitted batch job 1914696
Submitting: hpc/BitncV1Q29.sh
Submitted batch job 1914697
Submitting: hpc/BitncV1Q30.sh
Submitted batch job 1914698
Submitting: hpc/BitncV2Q20.sh
Submitted batch job 1914699
Submitting: hpc/BitncV2Q21.sh
Submitted batch job 1914700
Submitting: hpc/BitncV2Q22.sh
Submitted batch job 1914701
Submitting: hpc/BitncV2Q23.sh
Submitted batch job 1914702
Submitting: hpc/BitncV2Q24.sh
Submitted batch job 1914703
Submitting: hpc/BitncV2Q25.sh
Submitted batch job 1914704
Submitting: hpc/BitncV2Q26.sh
Submitted batch job 1914705
Submitting: hpc/BitncV2Q27.sh
Submitted batch job 1914706
Submitting: hpc/BitncV2Q28.sh
Submitted batch job 1914707
Submitting: hpc/BitncV2Q29.sh
Submitted batch job 1914708
Submitting: hpc/BitncV2Q30.sh
Submitted batch job 1914709

View File

@ -0,0 +1,22 @@
[Log]: Array size: 1048576 (Q=20)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 7118 [usec]
[Timing] Mem-xch : 3881 [usec]
[Timing] Sorting : 3233 [usec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 2097152 (Q=21)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 7597 [usec]
[Timing] Mem-xch : 3359 [usec]
[Timing] Sorting : 4237 [usec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 4194304 (Q=22)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 10 [msec]
[Timing] Mem-xch : 4320 [usec]
[Timing] Sorting : 5982 [usec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 8388608 (Q=23)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 29 [msec]
[Timing] Mem-xch : 14 [msec]
[Timing] Sorting : 14 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 16777216 (Q=24)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 43 [msec]
[Timing] Mem-xch : 13 [msec]
[Timing] Sorting : 29 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 33554432 (Q=25)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 89 [msec]
[Timing] Mem-xch : 29 [msec]
[Timing] Sorting : 59 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 67108864 (Q=26)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 184 [msec]
[Timing] Mem-xch : 63 [msec]
[Timing] Sorting : 121 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 134217728 (Q=27)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 414 [msec]
[Timing] Mem-xch : 157 [msec]
[Timing] Sorting : 255 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 268435456 (Q=28)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 909 [msec]
[Timing] Mem-xch : 363 [msec]
[Timing] Sorting : 548 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 536870912 (Q=29)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 2005 [msec]
[Timing] Mem-xch : 840 [msec]
[Timing] Sorting : 1163 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 1073741824 (Q=30)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 3593 [msec]
[Timing] Mem-xch : 1137 [msec]
[Timing] Sorting : 2456 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 1048576 (Q=20)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 5607 [usec]
[Timing] Mem-xch : 4043 [usec]
[Timing] Sorting : 1562 [usec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 2097152 (Q=21)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 4605 [usec]
[Timing] Mem-xch : 2073 [usec]
[Timing] Sorting : 2367 [usec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 4194304 (Q=22)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 11 [msec]
[Timing] Mem-xch : 7261 [usec]
[Timing] Sorting : 3887 [usec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 8388608 (Q=23)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 16 [msec]
[Timing] Mem-xch : 8281 [usec]
[Timing] Sorting : 8624 [usec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 16777216 (Q=24)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 33 [msec]
[Timing] Mem-xch : 15 [msec]
[Timing] Sorting : 18 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 33554432 (Q=25)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 65 [msec]
[Timing] Mem-xch : 27 [msec]
[Timing] Sorting : 38 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 67108864 (Q=26)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 136 [msec]
[Timing] Mem-xch : 63 [msec]
[Timing] Sorting : 72 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 134217728 (Q=27)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 281 [msec]
[Timing] Mem-xch : 125 [msec]
[Timing] Sorting : 156 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 268435456 (Q=28)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 731 [msec]
[Timing] Mem-xch : 366 [msec]
[Timing] Sorting : 362 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 536870912 (Q=29)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 1378 [msec]
[Timing] Mem-xch : 632 [msec]
[Timing] Sorting : 753 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 1073741824 (Q=30)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 3177 [msec]
[Timing] Mem-xch : 1564 [msec]
[Timing] Sorting : 1580 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 1048576 (Q=20)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 3147 [usec]
[Timing] Mem-xch : 1491 [usec]
[Timing] Sorting : 1646 [usec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 2097152 (Q=21)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 4908 [usec]
[Timing] Mem-xch : 2369 [usec]
[Timing] Sorting : 2545 [usec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 4194304 (Q=22)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 8561 [usec]
[Timing] Mem-xch : 4249 [usec]
[Timing] Sorting : 4299 [usec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 8388608 (Q=23)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 17 [msec]
[Timing] Mem-xch : 8507 [usec]
[Timing] Sorting : 9197 [usec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 16777216 (Q=24)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 34 [msec]
[Timing] Mem-xch : 14 [msec]
[Timing] Sorting : 19 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 33554432 (Q=25)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 69 [msec]
[Timing] Mem-xch : 28 [msec]
[Timing] Sorting : 41 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 67108864 (Q=26)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 149 [msec]
[Timing] Mem-xch : 71 [msec]
[Timing] Sorting : 87 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 134217728 (Q=27)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 323 [msec]
[Timing] Mem-xch : 151 [msec]
[Timing] Sorting : 166 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 268435456 (Q=28)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 754 [msec]
[Timing] Mem-xch : 367 [msec]
[Timing] Sorting : 384 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 536870912 (Q=29)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 1425 [msec]
[Timing] Mem-xch : 639 [msec]
[Timing] Sorting : 796 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,22 @@
[Log]: Array size: 1073741824 (Q=30)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 3231 [msec]
[Timing] Mem-xch : 1532 [msec]
[Timing] Sorting : 1676 [msec]
[Validation] Results validation ... [PASSED] 

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,45 @@
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q20.sh
Submitted batch job 1914456
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q21.sh
Submitted batch job 1914457
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q22.sh
Submitted batch job 1914458
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q23.sh
Submitted batch job 1914459
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q24.sh
Submitted batch job 1914460
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q25.sh
Submitted batch job 1914461
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q26.sh
Submitted batch job 1914462
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q27.sh
Submitted batch job 1914463
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q28.sh
Submitted batch job 1914464
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q29.sh
Submitted batch job 1914465
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q30.sh
Submitted batch job 1914466
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q20.sh
Submitted batch job 1914467
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q21.sh
Submitted batch job 1914468
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q22.sh
Submitted batch job 1914469
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q23.sh
Submitted batch job 1914470
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q24.sh
Submitted batch job 1914471
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q25.sh
Submitted batch job 1914472
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q26.sh
Submitted batch job 1914473
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q27.sh
Submitted batch job 1914474
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q28.sh
Submitted batch job 1914475
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q29.sh
Submitted batch job 1914476
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q30.sh
Submitted batch job 1914477

View File

@ -0,0 +1,2 @@
[Timing] Total: 5920 [usec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 6571 [usec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 13 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 24 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 46 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 92 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 213 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 440 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 935 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 1847 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 3798 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 2843 [usec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 4979 [usec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 9909 [usec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 20 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 35 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 70 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 170 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 346 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 735 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 1522 [msec]
[Validation] Results validation ... [PASSED] 

View File

@ -0,0 +1,2 @@
[Timing] Total: 2950 [msec]
[Validation] Results validation ... [PASSED] 

23
homework_3/analyse/prof.sh Executable file
View File

@ -0,0 +1,23 @@
#!/usr/bin/env bash
#
# prof.sh <exec> <report.file>
#
# Profile <exec> with NVIDIA Nsight Compute (ncu) and store ncu's standard
# output in <report.file>.
#
# The profiled program is always started as:  <exec> -q 20 -b 512
# NOTE(review): -q/-b are presumably the array-size exponent and the threads
# per block - confirm against the program's option parser.
#
# ncu is started through sudo. The ncu binary can be overridden with the NCU
# environment variable; it defaults to the CUDA 11.4 installation path.
#

# Both arguments are mandatory: without <exec> ncu would be launched with an
# empty application, and without <report.file> the redirection itself fails.
if [[ $# -ne 2 ]]; then
    echo "Usage: $(basename "$0") <exec> <report.file>" >&2
    exit 1
fi

NCU="${NCU:-/usr/local/cuda-11.4/bin/ncu}"

# Metrics to collect, one per line so they are easy to add or remove.
METRICS=(
    "smsp__inst_executed"
    "smsp__cycles_active.avg"
    "smsp__cycles_active.sum"
    "gpu__time_duration.sum"
    "smsp__average_warp_latency_issue_stalled_barrier"
    "smsp__warp_issue_stalled_barrier_per_warp_active"
    "l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld"
    "l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st"
    "l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read"
    "l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write"
    "l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum"
    "l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum"
)

# Join the list with commas (no spaces, no trailing separator). The IFS change
# is confined to the command-substitution subshell.
metrics_csv="$(IFS=,; echo -n "${METRICS[*]}")"

sudo "$NCU" \
    --target-processes all \
    --metrics "$metrics_csv" \
    "$1" -q 20 -b 512 > "$2"

View File

@ -2,8 +2,8 @@
# Submission parameters # Submission parameters
QOS="small" QOS="small"
PARTITION="ampere" PARTITION="ampere" # ampere gpu
SCRIPT_DIR="hpc" # Directory containing the job scripts SCRIPT_DIR="hpc" # Directory containing the job scripts
# Range of values for the -q parameter # Range of values for the -q parameter
VERSIONS=("V0" "V1" "V2") VERSIONS=("V0" "V1" "V2")
@ -17,8 +17,9 @@ for version in "${VERSIONS[@]}"; do
script_path="${SCRIPT_DIR}/${script_name}" script_path="${SCRIPT_DIR}/${script_name}"
if [[ -f "$script_path" ]]; then if [[ -f "$script_path" ]]; then
echo "Submitting: $script_path"
sbatch --qos="$QOS" -p "$PARTITION" "$script_path" sbatch --qos="$QOS" -p "$PARTITION" "$script_path"
echo "Submitted: $script_path" #sbatch -p "$PARTITION" "$script_path"
else else
echo "Warning: File not found - $script_path" echo "Warning: File not found - $script_path"
fi fi

1917
homework_3/reportv1.3 Normal file

File diff suppressed because it is too large Load Diff

1917
homework_3/reportv2.3 Normal file

File diff suppressed because it is too large Load Diff

View File

@ -15,6 +15,7 @@
#include <cmath> #include <cmath>
#include <cstdint> #include <cstdint>
#include <utility> #include <utility>
#include <stdexcept>
#include "utils.hpp" #include "utils.hpp"
@ -159,9 +160,9 @@ void bitonicSort(DataT& data) {
Timer_memory.start(); Timer_memory.start();
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess) if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not allocate memory\n"); throw std::runtime_error("[CUDA] - Can not allocate memory");
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess) if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory to device\n"); throw std::runtime_error("[CUDA] - Can not copy memory to device");
Timer_memory.stop(); Timer_memory.stop();
size_t Nth = config.blockSize; size_t Nth = config.blockSize;
@ -180,7 +181,7 @@ void bitonicSort(DataT& data) {
Timer_memory.start(); Timer_memory.start();
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess) if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory from device\n"); throw std::runtime_error("[CUDA] - Can not copy memory from device");
cudaFree(dev_data); cudaFree(dev_data);
Timer_memory.stop(); Timer_memory.stop();
} }
@ -247,6 +248,31 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t st
} }
} }
/*!
 * Pre-phase kernel: runs the first stages of the bitonic double loop inside a single
 * kernel launch.
 *
 * Stages are executed in increasing order, starting from 1 and stopping at the smaller of
 * @c stages and @c maxStages. Within a stage the steps run from (stage - 1) down to 0.
 * Every step is delegated to interBlockStep_() on @c data and is followed by a
 * __syncthreads() barrier, so the threads of a block complete a step before any of them
 * starts the next one.
 *
 * @note __syncthreads() only synchronizes the threads of one block.
 *       NOTE(review): this presumably relies on all partners of the pre-phase steps being
 *       handled by the same block - confirm against interBlockStep_() and the launch setup.
 *
 * @tparam ValueT     The underlying data type of the array items
 * @param data        [ValueT*] Pointer to data array
 * @param n           [size_t]  The total size of the array
 * @param stages      [size_t]  The number of stages to pre-execute
 * @param maxStages   [size_t]  The maximum number of stages for the entire sort
 */
template <typename ValueT>
__global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages) {
    // Never run past the last stage of the complete sort.
    const size_t lastStage = (stages < maxStages) ? stages : maxStages;

    for (size_t stage = 1; stage <= lastStage; ++stage) {
        // Count down so that the executed step goes stage-1, stage-2, ..., 0.
        for (size_t remaining = stage; remaining != 0; --remaining) {
            interBlockStep_(data, n, remaining - 1, stage);
            __syncthreads();
        }
    }
}
/*! /*!
* A CUDA version of the Bitonic sort algorithm. * A CUDA version of the Bitonic sort algorithm.
* *
@ -262,18 +288,22 @@ void bitonicSort(DataT& data) {
Timer_memory.start(); Timer_memory.start();
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess) if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not allocate memory\n"); throw std::runtime_error("[CUDA] - Can not allocate memory");
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess) if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory to device\n"); throw std::runtime_error("[CUDA] - Can not copy memory to device");
Timer_memory.stop(); Timer_memory.stop();
size_t Nth = config.blockSize; size_t Nth = config.blockSize;
size_t Nbl = NBlocks(size); size_t Nbl = NBlocks(size);
auto Stages = static_cast<size_t>(log2(size)); auto Stages = static_cast<size_t>(log2(size));
auto InnerBlockSteps = static_cast<size_t>(log2(Nth)); // auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
size_t PrephaseStages= InnerBlockSteps + 1;
Timer_sorting.start(); Timer_sorting.start();
for (size_t stage = 1; stage <= Stages; ++stage) { prephase<<<Nbl, Nth>>>(dev_data, size, PrephaseStages, Stages);
cudaDeviceSynchronize();
for (size_t stage = PrephaseStages + 1; stage <= Stages; ++stage) {
size_t step = stage - 1; size_t step = stage - 1;
for ( ; step > InnerBlockSteps; --step) { for ( ; step > InnerBlockSteps; --step) {
interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage); interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
@ -286,7 +316,7 @@ void bitonicSort(DataT& data) {
Timer_memory.start(); Timer_memory.start();
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess) if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory from device\n"); throw std::runtime_error("[CUDA] - Can not copy memory from device");
cudaFree(dev_data); cudaFree(dev_data);
Timer_memory.stop(); Timer_memory.stop();
} }
@ -301,7 +331,9 @@ void bitonicSort(DataT& data) {
* @note * @note
* Each block thread collection can exchange twice the size of data points. * Each block thread collection can exchange twice the size of data points.
*/ */
inline size_t effectiveBlockSize() { return SizeToThreadsRatio * config.blockSize; } inline constexpr size_t effectiveBlockSize(size_t blockSize) {
return SizeToThreadsRatio * blockSize;
}
@ -400,10 +432,70 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t st
__syncthreads(); __syncthreads();
} }
// Write back to global memory // Write back to global memory (no sync here, there will be sync from host)
data[gIdx0] = shared_data[lIdx0]; data[gIdx0] = shared_data[lIdx0];
data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x]; data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
}
/*!
* This is the unrolled part of the bitonic double loop for the initial phase, where the entire
* double loop can fit in one block using shared memory access.
*
* First, each thread caches two data points in shared memory: its own and the one
* blockDim.x positions further, i.e. from the current and the following data block.
* After that we execute the pre-phase on the local (shared) data and then we write it back to global memory.
*
* @note NOTE(review): threads that fail the boundary check return before the __syncthreads()
*       barriers are reached; confirm that no block can ever be only partially active.
*
* @tparam ValueT The underlying data type of the array items
* @param data [ValueT*] Pointer to data array
* @param n [size_t] The total size of the array
* @param stages [size_t] The number of stages to pre-execute
* @param maxStages [size_t] The maximum number of stages for the entire sort
*/
template <typename ValueT>
__global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages) {
extern __shared__ ValueT shared_data[];
/*
* Global and local(shared) memory indices (calculated once)
* Here we skip blocks every time (one for SizeToThreadsRatio = 2)
* And we cache the neighbor block address indexes in local (shared) memory
*/
threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
threadId_t lIdx0 = toLocal(gIdx0, blockDim.x);
if (gIdx0 + blockDim.x >= n) // Boundary check
return;
// Fetch to local memory the entire effective block size (2 positions for each thread)
shared_data[lIdx0] = data[gIdx0];
shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x];
__syncthreads(); __syncthreads();
for (size_t stage = 1; (stage <= stages) && (stage <= maxStages); ++stage) {
for (size_t step = stage; step > 0; ) {
--step;
// Init thread global and local indices
threadId_t gIdx = gIdx0;
threadId_t lIdx = lIdx0;
// Find partner and keep-small configuration based on the global data positions
threadId_t pIdx = partner(gIdx, step);
if (gIdx > pIdx) {
// Shift inside effective block
gIdx += blockDim.x; // global
pIdx += blockDim.x;
lIdx += blockDim.x; // local
}
bool keep = keepSmall(gIdx, pIdx, stage);
// Exchange data on local(shared) copy
threadId_t lpIdx = toLocal(pIdx, blockDim.x);
exchange(shared_data, lIdx, lpIdx, keep);
__syncthreads();
}
}
// Write back to global memory (no sync here, there will be sync from host)
data[gIdx0] = shared_data[lIdx0];
data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
} }
/*! /*!
@ -421,19 +513,23 @@ void bitonicSort(DataT& data) {
Timer_memory.start(); Timer_memory.start();
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess) if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not allocate memory\n"); throw std::runtime_error("[CUDA] - Can not allocate memory");
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess) if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory to device\n"); throw std::runtime_error("[CUDA] - Can not copy memory to device");
Timer_memory.stop(); Timer_memory.stop();
size_t Nth = config.blockSize; size_t Nth = config.blockSize;
size_t Nbl = NBlocks(size); size_t Nbl = NBlocks(size);
size_t kernelMemSize = effectiveBlockSize() * sizeof(value_t); size_t kernelMemSize = effectiveBlockSize(config.blockSize) * sizeof(value_t);
auto Stages = static_cast<size_t>(log2(size)); auto Stages = static_cast<size_t>(log2(size));
auto InnerBlockSteps = static_cast<size_t>(log2(Nth)); auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
size_t PrephaseStages= InnerBlockSteps + 1;
Timer_sorting.start(); Timer_sorting.start();
for (size_t stage = 1; stage <= Stages; ++stage) { prephase<<<Nbl, Nth, kernelMemSize>>>(dev_data, size, PrephaseStages, Stages);
cudaDeviceSynchronize();
for (size_t stage = PrephaseStages + 1; stage <= Stages; ++stage) {
size_t step = stage - 1; size_t step = stage - 1;
for ( ; step > InnerBlockSteps; --step) { for ( ; step > InnerBlockSteps; --step) {
interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage); interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
@ -446,7 +542,7 @@ void bitonicSort(DataT& data) {
Timer_memory.start(); Timer_memory.start();
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess) if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory from device\n"); throw std::runtime_error("[CUDA] - Can not copy memory from device");
cudaFree(dev_data); cudaFree(dev_data);
Timer_memory.stop(); Timer_memory.stop();
} }

View File

@ -65,8 +65,8 @@ using ArraySize_t = uint64_t;
* The values of the members are set from the command line. * The values of the members are set from the command line.
*/ */
struct config_t { struct config_t {
ArraySize_t arraySize{DEFAULT_DATA_SIZE}; //!< The array size of the local data to sort. ArraySize_t arraySize{DEFAULT_DATA_SIZE}; //!< The array size of the local data to sort.
size_t blockSize{THREADS_PER_BLOCK}; //!< The block size (threads per block) for the session. size_t blockSize{THREADS_PER_BLOCK}; //!< The block size (threads per block) for the session.
bool validation{false}; //!< Request a full validation at the end, performed by process rank 0. bool validation{false}; //!< Request a full validation at the end, performed by process rank 0.
size_t perf{1}; //!< Enable performance timing measurements and prints. Repeat size_t perf{1}; //!< Enable performance timing measurements and prints. Repeat
//!< the sorting <perf> times to do so. //!< the sorting <perf> times to do so.

View File

@ -8,6 +8,7 @@
*/ */
#include <exception> #include <exception>
#include <stdexcept>
#include <iostream> #include <iostream>
#include <algorithm> #include <algorithm>
#include <random> #include <random>
@ -140,14 +141,14 @@ bool get_options(int argc, char* argv[]){
// Check configuration requirements // Check configuration requirements
if (config.blockSize % device.warpSize) if (config.blockSize % device.warpSize)
throw std::runtime_error("[Config] - Number of threads per block is not an exact multiple of warp size\n"); throw std::runtime_error("[Config] - Number of threads per block is not an exact multiple of warp size");
if (config.arraySize < 2*config.blockSize) if (config.arraySize < 2*config.blockSize)
throw std::runtime_error("[Config] - Unsupported array size (smaller than " throw std::runtime_error("[Config] - Unsupported array size (smaller than "
+ std::to_string(SizeToThreadsRatio*config.blockSize) + ")\n"); + std::to_string(SizeToThreadsRatio*config.blockSize) + ")");
if (device.totalGlobalMem < config.arraySize * sizeof(Value_t)) if (device.totalGlobalMem < config.arraySize * sizeof(Value_t))
throw std::runtime_error("[CUDA] - Unsupported array size: " throw std::runtime_error("[CUDA] - Unsupported array size: "
+ std::to_string(config.arraySize * sizeof(Value_t)) + std::to_string(config.arraySize * sizeof(Value_t))
+ " (larger than GPU's: " + std::to_string(device.totalGlobalMem) + ")\n"); + " (larger than GPU's: " + std::to_string(device.totalGlobalMem) + ")");
return status; return status;
} }
@ -197,6 +198,7 @@ int main(int argc, char* argv[]) try {
// Init everything // Init everything
init(&argc, &argv); init(&argc, &argv);
logger << "Code version: " << 'V' << STR(CODE_VERSION) << logger.endl;
logger << "Array size: " << config.arraySize << " (Q=" << static_cast<size_t>(log2(config.arraySize))<< ")" << logger.endl; logger << "Array size: " << config.arraySize << " (Q=" << static_cast<size_t>(log2(config.arraySize))<< ")" << logger.endl;
logger << "Repeated sorts: " << config.perf << logger.endl; logger << "Repeated sorts: " << config.perf << logger.endl;
logger << "GPU: " << device.name << logger.endl; logger << "GPU: " << device.name << logger.endl;
@ -213,7 +215,7 @@ int main(int argc, char* argv[]) try {
logger << " Done." << logger.endl; logger << " Done." << logger.endl;
// Run distributed sort // Run distributed sort
logger << "Start sorting ... "; logger << "Start sorting ... ";
Timer_total.start(); Timer_total.start();
bitonicSort(Data); bitonicSort(Data);
Timer_total.stop(); Timer_total.stop();