HW3: RC2 - A prephase added for v1 and v2
This commit is contained in:
parent
7a6f7f53b5
commit
f749862193
4
homework_3/.gitignore
vendored
4
homework_3/.gitignore
vendored
@ -20,4 +20,8 @@ various/
|
||||
.vs/
|
||||
.vscode/
|
||||
|
||||
# nvidia
|
||||
*.ncu-proj
|
||||
|
||||
|
||||
|
||||
|
72
homework_3/analyse/RC1-7a6f7f5/Pending-PIDs-ampere
Normal file
72
homework_3/analyse/RC1-7a6f7f5/Pending-PIDs-ampere
Normal file
@ -0,0 +1,72 @@
|
||||
Submitting: hpc/BitncV0Q20.sh
|
||||
Submitted batch job 1914643
|
||||
Submitting: hpc/BitncV0Q21.sh
|
||||
Submitted batch job 1914644
|
||||
Submitting: hpc/BitncV0Q22.sh
|
||||
Submitted batch job 1914645
|
||||
Submitting: hpc/BitncV0Q23.sh
|
||||
Submitted batch job 1914646
|
||||
Submitting: hpc/BitncV0Q24.sh
|
||||
Submitted batch job 1914647
|
||||
Submitting: hpc/BitncV0Q25.sh
|
||||
Submitted batch job 1914648
|
||||
Submitting: hpc/BitncV0Q26.sh
|
||||
Submitted batch job 1914649
|
||||
Submitting: hpc/BitncV0Q27.sh
|
||||
Submitted batch job 1914650
|
||||
Submitting: hpc/BitncV0Q28.sh
|
||||
Submitted batch job 1914651
|
||||
Submitting: hpc/BitncV0Q29.sh
|
||||
Submitted batch job 1914652
|
||||
Submitting: hpc/BitncV0Q30.sh
|
||||
Submitted batch job 1914653
|
||||
|
||||
|
||||
Submitting: hpc/BitncV1Q20.sh
|
||||
Submitted batch job 1914654
|
||||
Submitting: hpc/BitncV1Q21.sh
|
||||
Submitted batch job 1914655
|
||||
Submitting: hpc/BitncV1Q22.sh
|
||||
Submitted batch job 1914656
|
||||
Submitting: hpc/BitncV1Q23.sh
|
||||
Submitted batch job 1914657
|
||||
Submitting: hpc/BitncV1Q24.sh
|
||||
Submitted batch job 1914658
|
||||
Submitting: hpc/BitncV1Q25.sh
|
||||
Submitted batch job 1914659
|
||||
Submitting: hpc/BitncV1Q26.sh
|
||||
Submitted batch job 1914660
|
||||
Submitting: hpc/BitncV1Q27.sh
|
||||
Submitted batch job 1914661
|
||||
Submitting: hpc/BitncV1Q28.sh
|
||||
Submitted batch job 1914662
|
||||
Submitting: hpc/BitncV1Q29.sh
|
||||
Submitted batch job 1914663
|
||||
Submitting: hpc/BitncV1Q30.sh
|
||||
Submitted batch job 1914664
|
||||
|
||||
|
||||
Submitting: hpc/BitncV2Q20.sh
|
||||
Submitted batch job 1914665
|
||||
Submitting: hpc/BitncV2Q21.sh
|
||||
Submitted batch job 1914666
|
||||
Submitting: hpc/BitncV2Q22.sh
|
||||
Submitted batch job 1914667
|
||||
Submitting: hpc/BitncV2Q23.sh
|
||||
Submitted batch job 1914668
|
||||
Submitting: hpc/BitncV2Q24.sh
|
||||
Submitted batch job 1914669
|
||||
Submitting: hpc/BitncV2Q25.sh
|
||||
Submitted batch job 1914670
|
||||
Submitting: hpc/BitncV2Q26.sh
|
||||
Submitted batch job 1914671
|
||||
Submitting: hpc/BitncV2Q27.sh
|
||||
Submitted batch job 1914672
|
||||
Submitting: hpc/BitncV2Q28.sh
|
||||
Submitted batch job 1914673
|
||||
Submitting: hpc/BitncV2Q29.sh
|
||||
Submitted batch job 1914674
|
||||
Submitting: hpc/BitncV2Q30.sh
|
||||
Submitted batch job 1914675
|
||||
|
||||
|
70
homework_3/analyse/RC1-7a6f7f5/Pending-PIDs-gpu
Normal file
70
homework_3/analyse/RC1-7a6f7f5/Pending-PIDs-gpu
Normal file
@ -0,0 +1,70 @@
|
||||
Submitting: hpc/BitncV0Q20.sh
|
||||
Submitted batch job 1914677
|
||||
Submitting: hpc/BitncV0Q21.sh
|
||||
Submitted batch job 1914678
|
||||
Submitting: hpc/BitncV0Q22.sh
|
||||
Submitted batch job 1914679
|
||||
Submitting: hpc/BitncV0Q23.sh
|
||||
Submitted batch job 1914680
|
||||
Submitting: hpc/BitncV0Q24.sh
|
||||
Submitted batch job 1914681
|
||||
Submitting: hpc/BitncV0Q25.sh
|
||||
Submitted batch job 1914682
|
||||
Submitting: hpc/BitncV0Q26.sh
|
||||
Submitted batch job 1914683
|
||||
Submitting: hpc/BitncV0Q27.sh
|
||||
Submitted batch job 1914684
|
||||
Submitting: hpc/BitncV0Q28.sh
|
||||
Submitted batch job 1914685
|
||||
Submitting: hpc/BitncV0Q29.sh
|
||||
Submitted batch job 1914686
|
||||
Submitting: hpc/BitncV0Q30.sh
|
||||
Submitted batch job 1914687
|
||||
|
||||
|
||||
Submitting: hpc/BitncV1Q20.sh
|
||||
Submitted batch job 1914688
|
||||
Submitting: hpc/BitncV1Q21.sh
|
||||
Submitted batch job 1914689
|
||||
Submitting: hpc/BitncV1Q22.sh
|
||||
Submitted batch job 1914690
|
||||
Submitting: hpc/BitncV1Q23.sh
|
||||
Submitted batch job 1914691
|
||||
Submitting: hpc/BitncV1Q24.sh
|
||||
Submitted batch job 1914692
|
||||
Submitting: hpc/BitncV1Q25.sh
|
||||
Submitted batch job 1914693
|
||||
Submitting: hpc/BitncV1Q26.sh
|
||||
Submitted batch job 1914694
|
||||
Submitting: hpc/BitncV1Q27.sh
|
||||
Submitted batch job 1914695
|
||||
Submitting: hpc/BitncV1Q28.sh
|
||||
Submitted batch job 1914696
|
||||
Submitting: hpc/BitncV1Q29.sh
|
||||
Submitted batch job 1914697
|
||||
Submitting: hpc/BitncV1Q30.sh
|
||||
Submitted batch job 1914698
|
||||
|
||||
|
||||
Submitting: hpc/BitncV2Q20.sh
|
||||
Submitted batch job 1914699
|
||||
Submitting: hpc/BitncV2Q21.sh
|
||||
Submitted batch job 1914700
|
||||
Submitting: hpc/BitncV2Q22.sh
|
||||
Submitted batch job 1914701
|
||||
Submitting: hpc/BitncV2Q23.sh
|
||||
Submitted batch job 1914702
|
||||
Submitting: hpc/BitncV2Q24.sh
|
||||
Submitted batch job 1914703
|
||||
Submitting: hpc/BitncV2Q25.sh
|
||||
Submitted batch job 1914704
|
||||
Submitting: hpc/BitncV2Q26.sh
|
||||
Submitted batch job 1914705
|
||||
Submitting: hpc/BitncV2Q27.sh
|
||||
Submitted batch job 1914706
|
||||
Submitting: hpc/BitncV2Q28.sh
|
||||
Submitted batch job 1914707
|
||||
Submitting: hpc/BitncV2Q29.sh
|
||||
Submitted batch job 1914708
|
||||
Submitting: hpc/BitncV2Q30.sh
|
||||
Submitted batch job 1914709
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q20.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q20.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 1048576 (Q=20)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 7118 [usec]
|
||||
[Timing] Mem-xch : 3881 [usec]
|
||||
[Timing] Sorting : 3233 [usec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q21.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q21.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 2097152 (Q=21)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 7597 [usec]
|
||||
[Timing] Mem-xch : 3359 [usec]
|
||||
[Timing] Sorting : 4237 [usec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q22.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q22.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 4194304 (Q=22)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 10 [msec]
|
||||
[Timing] Mem-xch : 4320 [usec]
|
||||
[Timing] Sorting : 5982 [usec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q23.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q23.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 8388608 (Q=23)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 29 [msec]
|
||||
[Timing] Mem-xch : 14 [msec]
|
||||
[Timing] Sorting : 14 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q24.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q24.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 16777216 (Q=24)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 43 [msec]
|
||||
[Timing] Mem-xch : 13 [msec]
|
||||
[Timing] Sorting : 29 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q25.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q25.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 33554432 (Q=25)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 89 [msec]
|
||||
[Timing] Mem-xch : 29 [msec]
|
||||
[Timing] Sorting : 59 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q26.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q26.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 67108864 (Q=26)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 184 [msec]
|
||||
[Timing] Mem-xch : 63 [msec]
|
||||
[Timing] Sorting : 121 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q27.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q27.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 134217728 (Q=27)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 414 [msec]
|
||||
[Timing] Mem-xch : 157 [msec]
|
||||
[Timing] Sorting : 255 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q28.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q28.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 268435456 (Q=28)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 909 [msec]
|
||||
[Timing] Mem-xch : 363 [msec]
|
||||
[Timing] Sorting : 548 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q29.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q29.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 536870912 (Q=29)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 2005 [msec]
|
||||
[Timing] Mem-xch : 840 [msec]
|
||||
[Timing] Sorting : 1163 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q30.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q30.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 1073741824 (Q=30)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 3593 [msec]
|
||||
[Timing] Mem-xch : 1137 [msec]
|
||||
[Timing] Sorting : 2456 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q20.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q20.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 1048576 (Q=20)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 5607 [usec]
|
||||
[Timing] Mem-xch : 4043 [usec]
|
||||
[Timing] Sorting : 1562 [usec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q21.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q21.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 2097152 (Q=21)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 4605 [usec]
|
||||
[Timing] Mem-xch : 2073 [usec]
|
||||
[Timing] Sorting : 2367 [usec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q22.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q22.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 4194304 (Q=22)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 11 [msec]
|
||||
[Timing] Mem-xch : 7261 [usec]
|
||||
[Timing] Sorting : 3887 [usec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q23.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q23.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 8388608 (Q=23)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 16 [msec]
|
||||
[Timing] Mem-xch : 8281 [usec]
|
||||
[Timing] Sorting : 8624 [usec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q24.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q24.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 16777216 (Q=24)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 33 [msec]
|
||||
[Timing] Mem-xch : 15 [msec]
|
||||
[Timing] Sorting : 18 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q25.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q25.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 33554432 (Q=25)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 65 [msec]
|
||||
[Timing] Mem-xch : 27 [msec]
|
||||
[Timing] Sorting : 38 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q26.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q26.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 67108864 (Q=26)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 136 [msec]
|
||||
[Timing] Mem-xch : 63 [msec]
|
||||
[Timing] Sorting : 72 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q27.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q27.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 134217728 (Q=27)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 281 [msec]
|
||||
[Timing] Mem-xch : 125 [msec]
|
||||
[Timing] Sorting : 156 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q28.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q28.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 268435456 (Q=28)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 731 [msec]
|
||||
[Timing] Mem-xch : 366 [msec]
|
||||
[Timing] Sorting : 362 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q29.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q29.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 536870912 (Q=29)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 1378 [msec]
|
||||
[Timing] Mem-xch : 632 [msec]
|
||||
[Timing] Sorting : 753 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q30.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q30.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 1073741824 (Q=30)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 3177 [msec]
|
||||
[Timing] Mem-xch : 1564 [msec]
|
||||
[Timing] Sorting : 1580 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q20.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q20.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 1048576 (Q=20)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 3147 [usec]
|
||||
[Timing] Mem-xch : 1491 [usec]
|
||||
[Timing] Sorting : 1646 [usec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q21.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q21.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 2097152 (Q=21)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 4908 [usec]
|
||||
[Timing] Mem-xch : 2369 [usec]
|
||||
[Timing] Sorting : 2545 [usec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q22.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q22.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 4194304 (Q=22)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 8561 [usec]
|
||||
[Timing] Mem-xch : 4249 [usec]
|
||||
[Timing] Sorting : 4299 [usec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q23.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q23.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 8388608 (Q=23)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 17 [msec]
|
||||
[Timing] Mem-xch : 8507 [usec]
|
||||
[Timing] Sorting : 9197 [usec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q24.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q24.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 16777216 (Q=24)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 34 [msec]
|
||||
[Timing] Mem-xch : 14 [msec]
|
||||
[Timing] Sorting : 19 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q25.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q25.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 33554432 (Q=25)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 69 [msec]
|
||||
[Timing] Mem-xch : 28 [msec]
|
||||
[Timing] Sorting : 41 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q26.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q26.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 67108864 (Q=26)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 149 [msec]
|
||||
[Timing] Mem-xch : 71 [msec]
|
||||
[Timing] Sorting : 87 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q27.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q27.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 134217728 (Q=27)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 323 [msec]
|
||||
[Timing] Mem-xch : 151 [msec]
|
||||
[Timing] Sorting : 166 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q28.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q28.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 268435456 (Q=28)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 754 [msec]
|
||||
[Timing] Mem-xch : 367 [msec]
|
||||
[Timing] Sorting : 384 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q29.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q29.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 536870912 (Q=29)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 1425 [msec]
|
||||
[Timing] Mem-xch : 639 [msec]
|
||||
[Timing] Sorting : 796 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q30.out
Normal file
22
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q30.out
Normal file
@ -0,0 +1,22 @@
|
||||
[Log]: Array size: 1073741824 (Q=30)
|
||||
[Log]: Repeated sorts: 7
|
||||
[Log]: GPU: NVIDIA A100-SXM4-40GB
|
||||
[Log]: Block size: 512
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Log]: Initialize array ... Done.
|
||||
[Log]: Start sorting ... Done.
|
||||
[Timing] Total : 3231 [msec]
|
||||
[Timing] Mem-xch : 1532 [msec]
|
||||
[Timing] Sorting : 1676 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
2049
homework_3/analyse/RC1-7a6f7f5/profReportv1.txt
Normal file
2049
homework_3/analyse/RC1-7a6f7f5/profReportv1.txt
Normal file
File diff suppressed because it is too large
Load Diff
2049
homework_3/analyse/RC1-7a6f7f5/profreportv2.txt
Normal file
2049
homework_3/analyse/RC1-7a6f7f5/profreportv2.txt
Normal file
File diff suppressed because it is too large
Load Diff
45
homework_3/analyse/b31ca23/Pending-PIDs
Normal file
45
homework_3/analyse/b31ca23/Pending-PIDs
Normal file
@ -0,0 +1,45 @@
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q20.sh
|
||||
Submitted batch job 1914456
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q21.sh
|
||||
Submitted batch job 1914457
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q22.sh
|
||||
Submitted batch job 1914458
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q23.sh
|
||||
Submitted batch job 1914459
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q24.sh
|
||||
Submitted batch job 1914460
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q25.sh
|
||||
Submitted batch job 1914461
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q26.sh
|
||||
Submitted batch job 1914462
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q27.sh
|
||||
Submitted batch job 1914463
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q28.sh
|
||||
Submitted batch job 1914464
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q29.sh
|
||||
Submitted batch job 1914465
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q30.sh
|
||||
Submitted batch job 1914466
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q20.sh
|
||||
Submitted batch job 1914467
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q21.sh
|
||||
Submitted batch job 1914468
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q22.sh
|
||||
Submitted batch job 1914469
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q23.sh
|
||||
Submitted batch job 1914470
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q24.sh
|
||||
Submitted batch job 1914471
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q25.sh
|
||||
Submitted batch job 1914472
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q26.sh
|
||||
Submitted batch job 1914473
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q27.sh
|
||||
Submitted batch job 1914474
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q28.sh
|
||||
Submitted batch job 1914475
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q29.sh
|
||||
Submitted batch job 1914476
|
||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q30.sh
|
||||
Submitted batch job 1914477
|
||||
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 5920 [usec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 6571 [usec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 13 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 24 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 46 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 92 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 213 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 440 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 935 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 1847 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 3798 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 2843 [usec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 4979 [usec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 9909 [usec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 20 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 35 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 70 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 170 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 346 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 735 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 1522 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
@ -0,0 +1,2 @@
|
||||
[Timing] Total: 2950 [msec]
|
||||
[Validation] Results validation ...[32m [PASSED] [0m
|
23
homework_3/analyse/prof.sh
Executable file
23
homework_3/analyse/prof.sh
Executable file
@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
#
|
||||
# prof.sh <exec> <report.file>
|
||||
#
|
||||
|
||||
sudo /usr/local/cuda-11.4/bin/ncu \
|
||||
--target-processes all \
|
||||
--metrics "$(echo -n \
|
||||
"smsp__inst_executed,"\
|
||||
"smsp__cycles_active.avg,"\
|
||||
"smsp__cycles_active.sum,"\
|
||||
"gpu__time_duration.sum,"\
|
||||
"smsp__average_warp_latency_issue_stalled_barrier,"\
|
||||
"smsp__warp_issue_stalled_barrier_per_warp_active,"\
|
||||
"l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld,"\
|
||||
"l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st,"\
|
||||
"l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read,"\
|
||||
"l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write,"\
|
||||
"l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum,"\
|
||||
"l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum "\
|
||||
)" \
|
||||
"$1" -q 20 -b 512 > "$2"
|
@ -2,8 +2,8 @@
|
||||
|
||||
# Submission parameters
|
||||
QOS="small"
|
||||
PARTITION="ampere"
|
||||
SCRIPT_DIR="hpc" # Directory containing the job scripts
|
||||
PARTITION="ampere" # ampere gpu
|
||||
SCRIPT_DIR="hpc" # Directory containing the job scripts
|
||||
|
||||
# Range of values for the -q parameter
|
||||
VERSIONS=("V0" "V1" "V2")
|
||||
@ -17,8 +17,9 @@ for version in "${VERSIONS[@]}"; do
|
||||
script_path="${SCRIPT_DIR}/${script_name}"
|
||||
|
||||
if [[ -f "$script_path" ]]; then
|
||||
echo "Submitting: $script_path"
|
||||
sbatch --qos="$QOS" -p "$PARTITION" "$script_path"
|
||||
echo "Submitted: $script_path"
|
||||
#sbatch -p "$PARTITION" "$script_path"
|
||||
else
|
||||
echo "Warning: File not found - $script_path"
|
||||
fi
|
||||
|
1917
homework_3/reportv1.3
Normal file
1917
homework_3/reportv1.3
Normal file
File diff suppressed because it is too large
Load Diff
1917
homework_3/reportv2.3
Normal file
1917
homework_3/reportv2.3
Normal file
File diff suppressed because it is too large
Load Diff
@ -15,6 +15,7 @@
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <utility>
|
||||
#include <stdexcept>
|
||||
|
||||
#include "utils.hpp"
|
||||
|
||||
@ -159,9 +160,9 @@ void bitonicSort(DataT& data) {
|
||||
|
||||
Timer_memory.start();
|
||||
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
|
||||
throw std::runtime_error("[CUDA] - Can not allocate memory\n");
|
||||
throw std::runtime_error("[CUDA] - Can not allocate memory");
|
||||
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
|
||||
throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
|
||||
throw std::runtime_error("[CUDA] - Can not copy memory to device");
|
||||
Timer_memory.stop();
|
||||
|
||||
size_t Nth = config.blockSize;
|
||||
@ -180,7 +181,7 @@ void bitonicSort(DataT& data) {
|
||||
|
||||
Timer_memory.start();
|
||||
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
|
||||
throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
|
||||
throw std::runtime_error("[CUDA] - Can not copy memory from device");
|
||||
cudaFree(dev_data);
|
||||
Timer_memory.stop();
|
||||
}
|
||||
@ -247,6 +248,31 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t st
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*!
|
||||
 * This is the unrolled part of the bitonic double loop for the init phase, where the entire
|
||||
 * double loop can fit in one block with shared memory access.
|
||||
*
|
||||
 * Each thread executes every step of the pre-phase stages by calling interBlockStep_() on the
|
||||
 * data array, with a block-level barrier (__syncthreads) after each step.
|
||||
*
|
||||
* @tparam ValueT The underlying data type of the array items
|
||||
* @param data [ValueT*] Pointer to data array
|
||||
* @param n [size_t] The total size of the array
|
||||
* @param stages [size_t] The number of stages to pre execute
|
||||
* @param maxStages [size_t] The maximum number of stages for the entire sort
|
||||
*/
|
||||
template <typename ValueT>
|
||||
__global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages) {
|
||||
for (size_t stage = 1; (stage <= stages) && (stage <= maxStages); ++stage) {
|
||||
for (size_t step = stage; step > 0; ) {
|
||||
--step;
|
||||
interBlockStep_(data, n, step, stage);
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*!
|
||||
* A CUDA version of the Bitonic sort algorithm.
|
||||
*
|
||||
@ -262,18 +288,22 @@ void bitonicSort(DataT& data) {
|
||||
|
||||
Timer_memory.start();
|
||||
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
|
||||
throw std::runtime_error("[CUDA] - Can not allocate memory\n");
|
||||
throw std::runtime_error("[CUDA] - Can not allocate memory");
|
||||
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
|
||||
throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
|
||||
throw std::runtime_error("[CUDA] - Can not copy memory to device");
|
||||
Timer_memory.stop();
|
||||
|
||||
size_t Nth = config.blockSize;
|
||||
size_t Nbl = NBlocks(size);
|
||||
|
||||
auto Stages = static_cast<size_t>(log2(size));
|
||||
auto InnerBlockSteps = static_cast<size_t>(log2(Nth)); //
|
||||
auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
|
||||
size_t PrephaseStages= InnerBlockSteps + 1;
|
||||
|
||||
Timer_sorting.start();
|
||||
for (size_t stage = 1; stage <= Stages; ++stage) {
|
||||
prephase<<<Nbl, Nth>>>(dev_data, size, PrephaseStages, Stages);
|
||||
cudaDeviceSynchronize();
|
||||
for (size_t stage = PrephaseStages + 1; stage <= Stages; ++stage) {
|
||||
size_t step = stage - 1;
|
||||
for ( ; step > InnerBlockSteps; --step) {
|
||||
interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
|
||||
@ -286,7 +316,7 @@ void bitonicSort(DataT& data) {
|
||||
|
||||
Timer_memory.start();
|
||||
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
|
||||
throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
|
||||
throw std::runtime_error("[CUDA] - Can not copy memory from device");
|
||||
cudaFree(dev_data);
|
||||
Timer_memory.stop();
|
||||
}
|
||||
@ -301,7 +331,9 @@ void bitonicSort(DataT& data) {
|
||||
* @note
|
||||
* Each block thread collection can exchange twice the size of data points.
|
||||
*/
|
||||
inline size_t effectiveBlockSize() { return SizeToThreadsRatio * config.blockSize; }
|
||||
inline constexpr size_t effectiveBlockSize(size_t blockSize) {
|
||||
return SizeToThreadsRatio * blockSize;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -400,10 +432,70 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t st
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// Write back to global memory
|
||||
// Write back to global memory (no sync here, there will be sync from host)
|
||||
data[gIdx0] = shared_data[lIdx0];
|
||||
data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
|
||||
}
|
||||
|
||||
/*!
|
||||
 * This is the unrolled part of the bitonic double loop for the init phase, where the entire
|
||||
 * double loop can fit in one block with shared memory access.
|
||||
*
|
||||
 * First, each thread caches its two corresponding data points: one from the current and one from the following data block.
|
||||
* After that we execute the pre-phase on the local data and then we write back to global memory.
|
||||
*
|
||||
* @tparam ValueT The underlying data type of the array items
|
||||
* @param data [ValueT*] Pointer to data array
|
||||
* @param n [size_t] The total size of the array
|
||||
* @param stages [size_t] The number of stages to pre execute
|
||||
* @param maxStages [size_t] The maximum number of stages for the entire sort
|
||||
*/
|
||||
template <typename ValueT>
|
||||
__global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages) {
|
||||
extern __shared__ ValueT shared_data[];
|
||||
|
||||
/*
|
||||
* Global and local(shared) memory indices (calculated once)
|
||||
 * Each thread block covers SizeToThreadsRatio data blocks, so consecutive thread blocks skip data blocks (one for SizeToThreadsRatio = 2)
|
||||
* And we cache the neighbor block address indexes in local (shared) memory
|
||||
*/
|
||||
threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
|
||||
threadId_t lIdx0 = toLocal(gIdx0, blockDim.x);
|
||||
|
||||
if (gIdx0 + blockDim.x >= n) // Boundary check
|
||||
return;
|
||||
|
||||
// Fetch to local memory the entire effective block size (2 positions for each thread)
|
||||
shared_data[lIdx0] = data[gIdx0];
|
||||
shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x];
|
||||
__syncthreads();
|
||||
for (size_t stage = 1; (stage <= stages) && (stage <= maxStages); ++stage) {
|
||||
for (size_t step = stage; step > 0; ) {
|
||||
--step;
|
||||
|
||||
// Init thread global and local indices
|
||||
threadId_t gIdx = gIdx0;
|
||||
threadId_t lIdx = lIdx0;
|
||||
// Find partner and keep-small configuration based on the global data positions
|
||||
threadId_t pIdx = partner(gIdx, step);
|
||||
if (gIdx > pIdx) {
|
||||
// Shift inside effective block
|
||||
gIdx += blockDim.x; // global
|
||||
pIdx += blockDim.x;
|
||||
lIdx += blockDim.x; // local
|
||||
}
|
||||
bool keep = keepSmall(gIdx, pIdx, stage);
|
||||
|
||||
// Exchange data on local(shared) copy
|
||||
threadId_t lpIdx = toLocal(pIdx, blockDim.x);
|
||||
exchange(shared_data, lIdx, lpIdx, keep);
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
// Write back to global memory (no sync here, there will be sync from host)
|
||||
data[gIdx0] = shared_data[lIdx0];
|
||||
data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
|
||||
}
|
||||
|
||||
/*!
|
||||
@ -421,19 +513,23 @@ void bitonicSort(DataT& data) {
|
||||
|
||||
Timer_memory.start();
|
||||
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
|
||||
throw std::runtime_error("[CUDA] - Can not allocate memory\n");
|
||||
throw std::runtime_error("[CUDA] - Can not allocate memory");
|
||||
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
|
||||
throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
|
||||
throw std::runtime_error("[CUDA] - Can not copy memory to device");
|
||||
Timer_memory.stop();
|
||||
|
||||
size_t Nth = config.blockSize;
|
||||
size_t Nbl = NBlocks(size);
|
||||
size_t kernelMemSize = effectiveBlockSize() * sizeof(value_t);
|
||||
size_t kernelMemSize = effectiveBlockSize(config.blockSize) * sizeof(value_t);
|
||||
|
||||
auto Stages = static_cast<size_t>(log2(size));
|
||||
auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
|
||||
size_t PrephaseStages= InnerBlockSteps + 1;
|
||||
|
||||
Timer_sorting.start();
|
||||
for (size_t stage = 1; stage <= Stages; ++stage) {
|
||||
prephase<<<Nbl, Nth, kernelMemSize>>>(dev_data, size, PrephaseStages, Stages);
|
||||
cudaDeviceSynchronize();
|
||||
for (size_t stage = PrephaseStages + 1; stage <= Stages; ++stage) {
|
||||
size_t step = stage - 1;
|
||||
for ( ; step > InnerBlockSteps; --step) {
|
||||
interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
|
||||
@ -446,7 +542,7 @@ void bitonicSort(DataT& data) {
|
||||
|
||||
Timer_memory.start();
|
||||
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
|
||||
throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
|
||||
throw std::runtime_error("[CUDA] - Can not copy memory from device");
|
||||
cudaFree(dev_data);
|
||||
Timer_memory.stop();
|
||||
}
|
||||
|
@ -65,8 +65,8 @@ using ArraySize_t = uint64_t;
|
||||
* The values of the members are set from the command line.
|
||||
*/
|
||||
struct config_t {
|
||||
ArraySize_t arraySize{DEFAULT_DATA_SIZE}; //!< The array size of the local data to sort.
|
||||
size_t blockSize{THREADS_PER_BLOCK}; //!< The block size (threads per block) for the session.
|
||||
ArraySize_t arraySize{DEFAULT_DATA_SIZE}; //!< The array size of the local data to sort.
|
||||
size_t blockSize{THREADS_PER_BLOCK}; //!< The block size (threads per block) for the session.
|
||||
bool validation{false}; //!< Request a full validation at the end, performed by process rank 0.
|
||||
size_t perf{1}; //!< Enable performance timing measurements and prints. Repeat
|
||||
//!< the sorting <perf> times to do so.
|
||||
|
@ -8,6 +8,7 @@
|
||||
*/
|
||||
|
||||
#include <exception>
|
||||
#include <stdexcept>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <random>
|
||||
@ -140,14 +141,14 @@ bool get_options(int argc, char* argv[]){
|
||||
|
||||
// Check configuration requirements
|
||||
if (config.blockSize % device.warpSize)
|
||||
throw std::runtime_error("[Config] - Number of threads per block is not an exact multiple of warp size\n");
|
||||
throw std::runtime_error("[Config] - Number of threads per block is not an exact multiple of warp size");
|
||||
if (config.arraySize < 2*config.blockSize)
|
||||
throw std::runtime_error("[Config] - Unsupported array size (smaller than "
|
||||
+ std::to_string(SizeToThreadsRatio*config.blockSize) + ")\n");
|
||||
+ std::to_string(SizeToThreadsRatio*config.blockSize) + ")");
|
||||
if (device.totalGlobalMem < config.arraySize * sizeof(Value_t))
|
||||
throw std::runtime_error("[CUDA] - Unsupported array size: "
|
||||
+ std::to_string(config.arraySize * sizeof(Value_t))
|
||||
+ " (larger than GPU's: " + std::to_string(device.totalGlobalMem) + ")\n");
|
||||
+ " (larger than GPU's: " + std::to_string(device.totalGlobalMem) + ")");
|
||||
|
||||
return status;
|
||||
}
|
||||
@ -197,6 +198,7 @@ int main(int argc, char* argv[]) try {
|
||||
// Init everything
|
||||
init(&argc, &argv);
|
||||
|
||||
logger << "Code version: " << 'V' << STR(CODE_VERSION) << logger.endl;
|
||||
logger << "Array size: " << config.arraySize << " (Q=" << static_cast<size_t>(log2(config.arraySize))<< ")" << logger.endl;
|
||||
logger << "Repeated sorts: " << config.perf << logger.endl;
|
||||
logger << "GPU: " << device.name << logger.endl;
|
||||
@ -213,7 +215,7 @@ int main(int argc, char* argv[]) try {
|
||||
logger << " Done." << logger.endl;
|
||||
|
||||
 // Run the sort (timed by Timer_total)
|
||||
logger << "Start sorting ... ";
|
||||
logger << "Start sorting ... ";
|
||||
Timer_total.start();
|
||||
bitonicSort(Data);
|
||||
Timer_total.stop();
|
||||
|
Loading…
x
Reference in New Issue
Block a user