@@ -20,4 +20,8 @@ various/ | |||||
.vs/ | .vs/ | ||||
.vscode/ | .vscode/ | ||||
# nvidia | |||||
*.ncu-proj | |||||
@@ -0,0 +1,72 @@ | |||||
Submitting: hpc/BitncV0Q20.sh | |||||
Submitted batch job 1914643 | |||||
Submitting: hpc/BitncV0Q21.sh | |||||
Submitted batch job 1914644 | |||||
Submitting: hpc/BitncV0Q22.sh | |||||
Submitted batch job 1914645 | |||||
Submitting: hpc/BitncV0Q23.sh | |||||
Submitted batch job 1914646 | |||||
Submitting: hpc/BitncV0Q24.sh | |||||
Submitted batch job 1914647 | |||||
Submitting: hpc/BitncV0Q25.sh | |||||
Submitted batch job 1914648 | |||||
Submitting: hpc/BitncV0Q26.sh | |||||
Submitted batch job 1914649 | |||||
Submitting: hpc/BitncV0Q27.sh | |||||
Submitted batch job 1914650 | |||||
Submitting: hpc/BitncV0Q28.sh | |||||
Submitted batch job 1914651 | |||||
Submitting: hpc/BitncV0Q29.sh | |||||
Submitted batch job 1914652 | |||||
Submitting: hpc/BitncV0Q30.sh | |||||
Submitted batch job 1914653 | |||||
Submitting: hpc/BitncV1Q20.sh | |||||
Submitted batch job 1914654 | |||||
Submitting: hpc/BitncV1Q21.sh | |||||
Submitted batch job 1914655 | |||||
Submitting: hpc/BitncV1Q22.sh | |||||
Submitted batch job 1914656 | |||||
Submitting: hpc/BitncV1Q23.sh | |||||
Submitted batch job 1914657 | |||||
Submitting: hpc/BitncV1Q24.sh | |||||
Submitted batch job 1914658 | |||||
Submitting: hpc/BitncV1Q25.sh | |||||
Submitted batch job 1914659 | |||||
Submitting: hpc/BitncV1Q26.sh | |||||
Submitted batch job 1914660 | |||||
Submitting: hpc/BitncV1Q27.sh | |||||
Submitted batch job 1914661 | |||||
Submitting: hpc/BitncV1Q28.sh | |||||
Submitted batch job 1914662 | |||||
Submitting: hpc/BitncV1Q29.sh | |||||
Submitted batch job 1914663 | |||||
Submitting: hpc/BitncV1Q30.sh | |||||
Submitted batch job 1914664 | |||||
Submitting: hpc/BitncV2Q20.sh | |||||
Submitted batch job 1914665 | |||||
Submitting: hpc/BitncV2Q21.sh | |||||
Submitted batch job 1914666 | |||||
Submitting: hpc/BitncV2Q22.sh | |||||
Submitted batch job 1914667 | |||||
Submitting: hpc/BitncV2Q23.sh | |||||
Submitted batch job 1914668 | |||||
Submitting: hpc/BitncV2Q24.sh | |||||
Submitted batch job 1914669 | |||||
Submitting: hpc/BitncV2Q25.sh | |||||
Submitted batch job 1914670 | |||||
Submitting: hpc/BitncV2Q26.sh | |||||
Submitted batch job 1914671 | |||||
Submitting: hpc/BitncV2Q27.sh | |||||
Submitted batch job 1914672 | |||||
Submitting: hpc/BitncV2Q28.sh | |||||
Submitted batch job 1914673 | |||||
Submitting: hpc/BitncV2Q29.sh | |||||
Submitted batch job 1914674 | |||||
Submitting: hpc/BitncV2Q30.sh | |||||
Submitted batch job 1914675 | |||||
@@ -0,0 +1,70 @@ | |||||
Submitting: hpc/BitncV0Q20.sh | |||||
Submitted batch job 1914677 | |||||
Submitting: hpc/BitncV0Q21.sh | |||||
Submitted batch job 1914678 | |||||
Submitting: hpc/BitncV0Q22.sh | |||||
Submitted batch job 1914679 | |||||
Submitting: hpc/BitncV0Q23.sh | |||||
Submitted batch job 1914680 | |||||
Submitting: hpc/BitncV0Q24.sh | |||||
Submitted batch job 1914681 | |||||
Submitting: hpc/BitncV0Q25.sh | |||||
Submitted batch job 1914682 | |||||
Submitting: hpc/BitncV0Q26.sh | |||||
Submitted batch job 1914683 | |||||
Submitting: hpc/BitncV0Q27.sh | |||||
Submitted batch job 1914684 | |||||
Submitting: hpc/BitncV0Q28.sh | |||||
Submitted batch job 1914685 | |||||
Submitting: hpc/BitncV0Q29.sh | |||||
Submitted batch job 1914686 | |||||
Submitting: hpc/BitncV0Q30.sh | |||||
Submitted batch job 1914687 | |||||
Submitting: hpc/BitncV1Q20.sh | |||||
Submitted batch job 1914688 | |||||
Submitting: hpc/BitncV1Q21.sh | |||||
Submitted batch job 1914689 | |||||
Submitting: hpc/BitncV1Q22.sh | |||||
Submitted batch job 1914690 | |||||
Submitting: hpc/BitncV1Q23.sh | |||||
Submitted batch job 1914691 | |||||
Submitting: hpc/BitncV1Q24.sh | |||||
Submitted batch job 1914692 | |||||
Submitting: hpc/BitncV1Q25.sh | |||||
Submitted batch job 1914693 | |||||
Submitting: hpc/BitncV1Q26.sh | |||||
Submitted batch job 1914694 | |||||
Submitting: hpc/BitncV1Q27.sh | |||||
Submitted batch job 1914695 | |||||
Submitting: hpc/BitncV1Q28.sh | |||||
Submitted batch job 1914696 | |||||
Submitting: hpc/BitncV1Q29.sh | |||||
Submitted batch job 1914697 | |||||
Submitting: hpc/BitncV1Q30.sh | |||||
Submitted batch job 1914698 | |||||
Submitting: hpc/BitncV2Q20.sh | |||||
Submitted batch job 1914699 | |||||
Submitting: hpc/BitncV2Q21.sh | |||||
Submitted batch job 1914700 | |||||
Submitting: hpc/BitncV2Q22.sh | |||||
Submitted batch job 1914701 | |||||
Submitting: hpc/BitncV2Q23.sh | |||||
Submitted batch job 1914702 | |||||
Submitting: hpc/BitncV2Q24.sh | |||||
Submitted batch job 1914703 | |||||
Submitting: hpc/BitncV2Q25.sh | |||||
Submitted batch job 1914704 | |||||
Submitting: hpc/BitncV2Q26.sh | |||||
Submitted batch job 1914705 | |||||
Submitting: hpc/BitncV2Q27.sh | |||||
Submitted batch job 1914706 | |||||
Submitting: hpc/BitncV2Q28.sh | |||||
Submitted batch job 1914707 | |||||
Submitting: hpc/BitncV2Q29.sh | |||||
Submitted batch job 1914708 | |||||
Submitting: hpc/BitncV2Q30.sh | |||||
Submitted batch job 1914709 |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 1048576 (Q=20) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 7118 [usec] | |||||
[Timing] Mem-xch : 3881 [usec] | |||||
[Timing] Sorting : 3233 [usec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 2097152 (Q=21) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 7597 [usec] | |||||
[Timing] Mem-xch : 3359 [usec] | |||||
[Timing] Sorting : 4237 [usec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 4194304 (Q=22) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 10 [msec] | |||||
[Timing] Mem-xch : 4320 [usec] | |||||
[Timing] Sorting : 5982 [usec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 8388608 (Q=23) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 29 [msec] | |||||
[Timing] Mem-xch : 14 [msec] | |||||
[Timing] Sorting : 14 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 16777216 (Q=24) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 43 [msec] | |||||
[Timing] Mem-xch : 13 [msec] | |||||
[Timing] Sorting : 29 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 33554432 (Q=25) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 89 [msec] | |||||
[Timing] Mem-xch : 29 [msec] | |||||
[Timing] Sorting : 59 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 67108864 (Q=26) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 184 [msec] | |||||
[Timing] Mem-xch : 63 [msec] | |||||
[Timing] Sorting : 121 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 134217728 (Q=27) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 414 [msec] | |||||
[Timing] Mem-xch : 157 [msec] | |||||
[Timing] Sorting : 255 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 268435456 (Q=28) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 909 [msec] | |||||
[Timing] Mem-xch : 363 [msec] | |||||
[Timing] Sorting : 548 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 536870912 (Q=29) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 2005 [msec] | |||||
[Timing] Mem-xch : 840 [msec] | |||||
[Timing] Sorting : 1163 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 1073741824 (Q=30) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 3593 [msec] | |||||
[Timing] Mem-xch : 1137 [msec] | |||||
[Timing] Sorting : 2456 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 1048576 (Q=20) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 5607 [usec] | |||||
[Timing] Mem-xch : 4043 [usec] | |||||
[Timing] Sorting : 1562 [usec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 2097152 (Q=21) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 4605 [usec] | |||||
[Timing] Mem-xch : 2073 [usec] | |||||
[Timing] Sorting : 2367 [usec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 4194304 (Q=22) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 11 [msec] | |||||
[Timing] Mem-xch : 7261 [usec] | |||||
[Timing] Sorting : 3887 [usec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 8388608 (Q=23) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 16 [msec] | |||||
[Timing] Mem-xch : 8281 [usec] | |||||
[Timing] Sorting : 8624 [usec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 16777216 (Q=24) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 33 [msec] | |||||
[Timing] Mem-xch : 15 [msec] | |||||
[Timing] Sorting : 18 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 33554432 (Q=25) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 65 [msec] | |||||
[Timing] Mem-xch : 27 [msec] | |||||
[Timing] Sorting : 38 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 67108864 (Q=26) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 136 [msec] | |||||
[Timing] Mem-xch : 63 [msec] | |||||
[Timing] Sorting : 72 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 134217728 (Q=27) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 281 [msec] | |||||
[Timing] Mem-xch : 125 [msec] | |||||
[Timing] Sorting : 156 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 268435456 (Q=28) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 731 [msec] | |||||
[Timing] Mem-xch : 366 [msec] | |||||
[Timing] Sorting : 362 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 536870912 (Q=29) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 1378 [msec] | |||||
[Timing] Mem-xch : 632 [msec] | |||||
[Timing] Sorting : 753 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 1073741824 (Q=30) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 3177 [msec] | |||||
[Timing] Mem-xch : 1564 [msec] | |||||
[Timing] Sorting : 1580 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 1048576 (Q=20) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 3147 [usec] | |||||
[Timing] Mem-xch : 1491 [usec] | |||||
[Timing] Sorting : 1646 [usec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 2097152 (Q=21) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 4908 [usec] | |||||
[Timing] Mem-xch : 2369 [usec] | |||||
[Timing] Sorting : 2545 [usec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 4194304 (Q=22) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 8561 [usec] | |||||
[Timing] Mem-xch : 4249 [usec] | |||||
[Timing] Sorting : 4299 [usec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 8388608 (Q=23) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 17 [msec] | |||||
[Timing] Mem-xch : 8507 [usec] | |||||
[Timing] Sorting : 9197 [usec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 16777216 (Q=24) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 34 [msec] | |||||
[Timing] Mem-xch : 14 [msec] | |||||
[Timing] Sorting : 19 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 33554432 (Q=25) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 69 [msec] | |||||
[Timing] Mem-xch : 28 [msec] | |||||
[Timing] Sorting : 41 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 67108864 (Q=26) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 149 [msec] | |||||
[Timing] Mem-xch : 71 [msec] | |||||
[Timing] Sorting : 87 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 134217728 (Q=27) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 323 [msec] | |||||
[Timing] Mem-xch : 151 [msec] | |||||
[Timing] Sorting : 166 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 268435456 (Q=28) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 754 [msec] | |||||
[Timing] Mem-xch : 367 [msec] | |||||
[Timing] Sorting : 384 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 536870912 (Q=29) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 1425 [msec] | |||||
[Timing] Mem-xch : 639 [msec] | |||||
[Timing] Sorting : 796 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,22 @@ | |||||
[Log]: Array size: 1073741824 (Q=30) | |||||
[Log]: Repeated sorts: 7 | |||||
[Log]: GPU: NVIDIA A100-SXM4-40GB | |||||
[Log]: Block size: 512 | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Log]: Initialize array ... Done. | |||||
[Log]: Start sorting ... Done. | |||||
[Timing] Total : 3231 [msec] | |||||
[Timing] Mem-xch : 1532 [msec] | |||||
[Timing] Sorting : 1676 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,45 @@ | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q20.sh | |||||
Submitted batch job 1914456 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q21.sh | |||||
Submitted batch job 1914457 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q22.sh | |||||
Submitted batch job 1914458 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q23.sh | |||||
Submitted batch job 1914459 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q24.sh | |||||
Submitted batch job 1914460 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q25.sh | |||||
Submitted batch job 1914461 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q26.sh | |||||
Submitted batch job 1914462 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q27.sh | |||||
Submitted batch job 1914463 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q28.sh | |||||
Submitted batch job 1914464 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q29.sh | |||||
Submitted batch job 1914465 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q30.sh | |||||
Submitted batch job 1914466 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q20.sh | |||||
Submitted batch job 1914467 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q21.sh | |||||
Submitted batch job 1914468 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q22.sh | |||||
Submitted batch job 1914469 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q23.sh | |||||
Submitted batch job 1914470 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q24.sh | |||||
Submitted batch job 1914471 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q25.sh | |||||
Submitted batch job 1914472 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q26.sh | |||||
Submitted batch job 1914473 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q27.sh | |||||
Submitted batch job 1914474 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q28.sh | |||||
Submitted batch job 1914475 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q29.sh | |||||
Submitted batch job 1914476 | |||||
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q30.sh | |||||
Submitted batch job 1914477 | |||||
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 5920 [usec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 6571 [usec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 13 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 24 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 46 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 92 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 213 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 440 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 935 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 1847 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 3798 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 2843 [usec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 4979 [usec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 9909 [usec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 20 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 35 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 70 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 170 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 346 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 735 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 1522 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,2 @@ | |||||
[Timing] Total: 2950 [msec] | |||||
[Validation] Results validation ...[32m [PASSED] [0m |
@@ -0,0 +1,23 @@ | |||||
#!/usr/bin/env bash
#
# prof.sh <exec> <report.file>
#
# Profiles <exec> with NVIDIA Nsight Compute (ncu) and redirects ncu's standard
# output to <report.file>. The executable is always started with "-q 20 -b 512".
#
# The ncu binary defaults to the CUDA 11.4 installation path; export NCU_BIN to
# point to a different one.
#

# Both positional arguments are required; fail early with a usage hint instead
# of launching ncu with empty arguments.
if [[ $# -lt 2 ]]; then
    echo "Usage: $(basename "$0") <exec> <report.file>" >&2
    exit 1
fi

NCU_BIN="${NCU_BIN:-/usr/local/cuda-11.4/bin/ncu}"

# Metrics to collect, one per entry. They are joined with commas below, so the
# resulting list never contains stray whitespace regardless of how this file is
# indented.
METRICS=(
    "smsp__inst_executed"
    "smsp__cycles_active.avg"
    "smsp__cycles_active.sum"
    "gpu__time_duration.sum"
    "smsp__average_warp_latency_issue_stalled_barrier"
    "smsp__warp_issue_stalled_barrier_per_warp_active"
    "l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld"
    "l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st"
    "l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read"
    "l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write"
    "l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum"
    "l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum"
)

# Join the array into a single comma separated string (IFS change is confined
# to the subshell).
METRICS_CSV="$(IFS=,; echo "${METRICS[*]}")"

sudo "$NCU_BIN" \
    --target-processes all \
    --metrics "$METRICS_CSV" \
    "$1" -q 20 -b 512 > "$2"
@@ -2,8 +2,8 @@ | |||||
# Submission parameters | # Submission parameters | ||||
QOS="small" | QOS="small" | ||||
PARTITION="ampere" | |||||
SCRIPT_DIR="hpc" # Directory containing the job scripts | |||||
PARTITION="ampere" # ampere gpu | |||||
SCRIPT_DIR="hpc" # Directory containing the job scripts | |||||
# Range of values for the -q parameter | # Range of values for the -q parameter | ||||
VERSIONS=("V0" "V1" "V2") | VERSIONS=("V0" "V1" "V2") | ||||
@@ -17,8 +17,9 @@ for version in "${VERSIONS[@]}"; do | |||||
script_path="${SCRIPT_DIR}/${script_name}" | script_path="${SCRIPT_DIR}/${script_name}" | ||||
if [[ -f "$script_path" ]]; then | if [[ -f "$script_path" ]]; then | ||||
echo "Submitting: $script_path" | |||||
sbatch --qos="$QOS" -p "$PARTITION" "$script_path" | sbatch --qos="$QOS" -p "$PARTITION" "$script_path" | ||||
echo "Submitted: $script_path" | |||||
#sbatch -p "$PARTITION" "$script_path" | |||||
else | else | ||||
echo "Warning: File not found - $script_path" | echo "Warning: File not found - $script_path" | ||||
fi | fi | ||||
@@ -15,6 +15,7 @@ | |||||
#include <cmath> | #include <cmath> | ||||
#include <cstdint> | #include <cstdint> | ||||
#include <utility> | #include <utility> | ||||
#include <stdexcept> | |||||
#include "utils.hpp" | #include "utils.hpp" | ||||
@@ -159,9 +160,9 @@ void bitonicSort(DataT& data) { | |||||
Timer_memory.start(); | Timer_memory.start(); | ||||
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess) | if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess) | ||||
throw std::runtime_error("[CUDA] - Can not allocate memory\n"); | |||||
throw std::runtime_error("[CUDA] - Can not allocate memory"); | |||||
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess) | if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess) | ||||
throw std::runtime_error("[CUDA] - Can not copy memory to device\n"); | |||||
throw std::runtime_error("[CUDA] - Can not copy memory to device"); | |||||
Timer_memory.stop(); | Timer_memory.stop(); | ||||
size_t Nth = config.blockSize; | size_t Nth = config.blockSize; | ||||
@@ -180,7 +181,7 @@ void bitonicSort(DataT& data) { | |||||
Timer_memory.start(); | Timer_memory.start(); | ||||
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess) | if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess) | ||||
throw std::runtime_error("[CUDA] - Can not copy memory from device\n"); | |||||
throw std::runtime_error("[CUDA] - Can not copy memory from device"); | |||||
cudaFree(dev_data); | cudaFree(dev_data); | ||||
Timer_memory.stop(); | Timer_memory.stop(); | ||||
} | } | ||||
@@ -247,6 +248,31 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t st | |||||
} | } | ||||
} | } | ||||
/*! | |||||
* This is unrolled part of the bitonic double loop for the init phase where the entire | |||||
* double loop can fit in one block with shared memory access. | |||||
* | |||||
* First each thread caches its corresponding data point from the current and the following data block. | |||||
* After that we execute the pre-phase on the local data and then we write back to global memory. | |||||
* | |||||
* @tparam ValueT The underlying data type of the array items | |||||
* @param data [ValueT*] Pointer to data array | |||||
* @param n [size_t] The total size of the array | |||||
* @param stages [size_t] The number of stages to pre execute | |||||
* @param maxStages [size_t] The maximum number of stages for the entire sort | |||||
*/ | |||||
template <typename ValueT> | |||||
__global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages) { | |||||
for (size_t stage = 1; (stage <= stages) && (stage <= maxStages); ++stage) { | |||||
for (size_t step = stage; step > 0; ) { | |||||
--step; | |||||
interBlockStep_(data, n, step, stage); | |||||
__syncthreads(); | |||||
} | |||||
} | |||||
} | |||||
/*! | /*! | ||||
* A CUDA version of the Bitonic sort algorithm. | * A CUDA version of the Bitonic sort algorithm. | ||||
* | * | ||||
@@ -262,18 +288,22 @@ void bitonicSort(DataT& data) { | |||||
Timer_memory.start(); | Timer_memory.start(); | ||||
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess) | if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess) | ||||
throw std::runtime_error("[CUDA] - Can not allocate memory\n"); | |||||
throw std::runtime_error("[CUDA] - Can not allocate memory"); | |||||
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess) | if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess) | ||||
throw std::runtime_error("[CUDA] - Can not copy memory to device\n"); | |||||
throw std::runtime_error("[CUDA] - Can not copy memory to device"); | |||||
Timer_memory.stop(); | Timer_memory.stop(); | ||||
size_t Nth = config.blockSize; | size_t Nth = config.blockSize; | ||||
size_t Nbl = NBlocks(size); | size_t Nbl = NBlocks(size); | ||||
auto Stages = static_cast<size_t>(log2(size)); | auto Stages = static_cast<size_t>(log2(size)); | ||||
auto InnerBlockSteps = static_cast<size_t>(log2(Nth)); // | |||||
auto InnerBlockSteps = static_cast<size_t>(log2(Nth)); | |||||
size_t PrephaseStages= InnerBlockSteps + 1; | |||||
Timer_sorting.start(); | Timer_sorting.start(); | ||||
for (size_t stage = 1; stage <= Stages; ++stage) { | |||||
prephase<<<Nbl, Nth>>>(dev_data, size, PrephaseStages, Stages); | |||||
cudaDeviceSynchronize(); | |||||
for (size_t stage = PrephaseStages + 1; stage <= Stages; ++stage) { | |||||
size_t step = stage - 1; | size_t step = stage - 1; | ||||
for ( ; step > InnerBlockSteps; --step) { | for ( ; step > InnerBlockSteps; --step) { | ||||
interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage); | interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage); | ||||
@@ -286,7 +316,7 @@ void bitonicSort(DataT& data) { | |||||
Timer_memory.start(); | Timer_memory.start(); | ||||
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess) | if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess) | ||||
throw std::runtime_error("[CUDA] - Can not copy memory from device\n"); | |||||
throw std::runtime_error("[CUDA] - Can not copy memory from device"); | |||||
cudaFree(dev_data); | cudaFree(dev_data); | ||||
Timer_memory.stop(); | Timer_memory.stop(); | ||||
} | } | ||||
@@ -301,7 +331,9 @@ void bitonicSort(DataT& data) { | |||||
* @note | * @note | ||||
* Each block thread collection can exchange twice the size of data points. | * Each block thread collection can exchange twice the size of data points. | ||||
*/ | */ | ||||
inline size_t effectiveBlockSize() { return SizeToThreadsRatio * config.blockSize; } | |||||
inline constexpr size_t effectiveBlockSize(size_t blockSize) { | |||||
return SizeToThreadsRatio * blockSize; | |||||
} | |||||
@@ -400,10 +432,70 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t st | |||||
__syncthreads(); | __syncthreads(); | ||||
} | } | ||||
// Write back to global memory | |||||
// Write back to global memory (no sync here, there will be sync from host) | |||||
data[gIdx0] = shared_data[lIdx0]; | data[gIdx0] = shared_data[lIdx0]; | ||||
data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x]; | data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x]; | ||||
} | |||||
/*! | |||||
* This is unrolled part of the bitonic double loop for the init phase where the entire | |||||
* double loop can fit in one block with shared memory access. | |||||
* | |||||
* First each thread caches its corresponding data point from the current and the following data block. | |||||
* After that we execute the pre-phase on the local data and then we write back to global memory. | |||||
* | |||||
* @tparam ValueT The underlying data type of the array items | |||||
* @param data [ValueT*] Pointer to data array | |||||
* @param n [size_t] The total size of the array | |||||
* @param stages [size_t] The number of stages to pre execute | |||||
* @param maxStages [size_t] The maximum number of stages for the entire sort | |||||
*/ | |||||
template <typename ValueT> | |||||
__global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages) { | |||||
extern __shared__ ValueT shared_data[]; | |||||
/* | |||||
* Global and local(shared) memory indices (calculated once) | |||||
* Here we skip blocks every time (one for SizeToThreadsRatio = 2) | |||||
* And we cache the neighbor block address indexes in local (shared) memory | |||||
*/ | |||||
threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x; | |||||
threadId_t lIdx0 = toLocal(gIdx0, blockDim.x); | |||||
if (gIdx0 + blockDim.x >= n) // Boundary check | |||||
return; | |||||
// Fetch to local memory the entire effective block size (2 positions for each thread) | |||||
shared_data[lIdx0] = data[gIdx0]; | |||||
shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x]; | |||||
__syncthreads(); | __syncthreads(); | ||||
for (size_t stage = 1; (stage <= stages) && (stage <= maxStages); ++stage) { | |||||
for (size_t step = stage; step > 0; ) { | |||||
--step; | |||||
// Init thread global and local indices | |||||
threadId_t gIdx = gIdx0; | |||||
threadId_t lIdx = lIdx0; | |||||
// Find partner and keep-small configuration based on the global data positions | |||||
threadId_t pIdx = partner(gIdx, step); | |||||
if (gIdx > pIdx) { | |||||
// Shift inside effective block | |||||
gIdx += blockDim.x; // global | |||||
pIdx += blockDim.x; | |||||
lIdx += blockDim.x; // local | |||||
} | |||||
bool keep = keepSmall(gIdx, pIdx, stage); | |||||
// Exchange data on local(shared) copy | |||||
threadId_t lpIdx = toLocal(pIdx, blockDim.x); | |||||
exchange(shared_data, lIdx, lpIdx, keep); | |||||
__syncthreads(); | |||||
} | |||||
} | |||||
// Write back to global memory (no sync here, there will be sync from host) | |||||
data[gIdx0] = shared_data[lIdx0]; | |||||
data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x]; | |||||
} | } | ||||
/*! | /*! | ||||
@@ -421,19 +513,23 @@ void bitonicSort(DataT& data) { | |||||
Timer_memory.start(); | Timer_memory.start(); | ||||
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess) | if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess) | ||||
throw std::runtime_error("[CUDA] - Can not allocate memory\n"); | |||||
throw std::runtime_error("[CUDA] - Can not allocate memory"); | |||||
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess) | if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess) | ||||
throw std::runtime_error("[CUDA] - Can not copy memory to device\n"); | |||||
throw std::runtime_error("[CUDA] - Can not copy memory to device"); | |||||
Timer_memory.stop(); | Timer_memory.stop(); | ||||
size_t Nth = config.blockSize; | size_t Nth = config.blockSize; | ||||
size_t Nbl = NBlocks(size); | size_t Nbl = NBlocks(size); | ||||
size_t kernelMemSize = effectiveBlockSize() * sizeof(value_t); | |||||
size_t kernelMemSize = effectiveBlockSize(config.blockSize) * sizeof(value_t); | |||||
auto Stages = static_cast<size_t>(log2(size)); | auto Stages = static_cast<size_t>(log2(size)); | ||||
auto InnerBlockSteps = static_cast<size_t>(log2(Nth)); | auto InnerBlockSteps = static_cast<size_t>(log2(Nth)); | ||||
size_t PrephaseStages= InnerBlockSteps + 1; | |||||
Timer_sorting.start(); | Timer_sorting.start(); | ||||
for (size_t stage = 1; stage <= Stages; ++stage) { | |||||
prephase<<<Nbl, Nth, kernelMemSize>>>(dev_data, size, PrephaseStages, Stages); | |||||
cudaDeviceSynchronize(); | |||||
for (size_t stage = PrephaseStages + 1; stage <= Stages; ++stage) { | |||||
size_t step = stage - 1; | size_t step = stage - 1; | ||||
for ( ; step > InnerBlockSteps; --step) { | for ( ; step > InnerBlockSteps; --step) { | ||||
interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage); | interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage); | ||||
@@ -446,7 +542,7 @@ void bitonicSort(DataT& data) { | |||||
Timer_memory.start(); | Timer_memory.start(); | ||||
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess) | if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess) | ||||
throw std::runtime_error("[CUDA] - Can not copy memory from device\n"); | |||||
throw std::runtime_error("[CUDA] - Can not copy memory from device"); | |||||
cudaFree(dev_data); | cudaFree(dev_data); | ||||
Timer_memory.stop(); | Timer_memory.stop(); | ||||
} | } | ||||
@@ -65,8 +65,8 @@ using ArraySize_t = uint64_t; | |||||
* The values of the members are set from the command line. | * The values of the members are set from the command line. | ||||
*/ | */ | ||||
struct config_t { | struct config_t { | ||||
ArraySize_t arraySize{DEFAULT_DATA_SIZE}; //!< The array size of the local data to sort. | |||||
size_t blockSize{THREADS_PER_BLOCK}; //!< The block size (threads per block) for the session. | |||||
ArraySize_t arraySize{DEFAULT_DATA_SIZE}; //!< The array size of the local data to sort. | |||||
size_t blockSize{THREADS_PER_BLOCK}; //!< The block size (threads per block) for the session. | |||||
bool validation{false}; //!< Request a full validation at the end, performed by process rank 0. | bool validation{false}; //!< Request a full validation at the end, performed by process rank 0. | ||||
size_t perf{1}; //!< Enable performance timing measurements and prints. Repeat | size_t perf{1}; //!< Enable performance timing measurements and prints. Repeat | ||||
//!< the sorting <perf> times to do so. | //!< the sorting <perf> times to do so. | ||||
@@ -8,6 +8,7 @@ | |||||
*/ | */ | ||||
#include <exception> | #include <exception> | ||||
#include <stdexcept> | |||||
#include <iostream> | #include <iostream> | ||||
#include <algorithm> | #include <algorithm> | ||||
#include <random> | #include <random> | ||||
@@ -140,14 +141,14 @@ bool get_options(int argc, char* argv[]){ | |||||
// Check configuration requirements | // Check configuration requirements | ||||
if (config.blockSize % device.warpSize) | if (config.blockSize % device.warpSize) | ||||
throw std::runtime_error("[Config] - Number of threads per block is not an exact multiple of warp size\n"); | |||||
throw std::runtime_error("[Config] - Number of threads per block is not an exact multiple of warp size"); | |||||
if (config.arraySize < 2*config.blockSize) | if (config.arraySize < 2*config.blockSize) | ||||
throw std::runtime_error("[Config] - Unsupported array size (smaller than " | throw std::runtime_error("[Config] - Unsupported array size (smaller than " | ||||
+ std::to_string(SizeToThreadsRatio*config.blockSize) + ")\n"); | |||||
+ std::to_string(SizeToThreadsRatio*config.blockSize) + ")"); | |||||
if (device.totalGlobalMem < config.arraySize * sizeof(Value_t)) | if (device.totalGlobalMem < config.arraySize * sizeof(Value_t)) | ||||
throw std::runtime_error("[CUDA] - Unsupported array size: " | throw std::runtime_error("[CUDA] - Unsupported array size: " | ||||
+ std::to_string(config.arraySize * sizeof(Value_t)) | + std::to_string(config.arraySize * sizeof(Value_t)) | ||||
+ " (larger than GPU's: " + std::to_string(device.totalGlobalMem) + ")\n"); | |||||
+ " (larger than GPU's: " + std::to_string(device.totalGlobalMem) + ")"); | |||||
return status; | return status; | ||||
} | } | ||||
@@ -197,6 +198,7 @@ int main(int argc, char* argv[]) try { | |||||
// Init everything | // Init everything | ||||
init(&argc, &argv); | init(&argc, &argv); | ||||
logger << "Code version: " << 'V' << STR(CODE_VERSION) << logger.endl; | |||||
logger << "Array size: " << config.arraySize << " (Q=" << static_cast<size_t>(log2(config.arraySize))<< ")" << logger.endl; | logger << "Array size: " << config.arraySize << " (Q=" << static_cast<size_t>(log2(config.arraySize))<< ")" << logger.endl; | ||||
logger << "Repeated sorts: " << config.perf << logger.endl; | logger << "Repeated sorts: " << config.perf << logger.endl; | ||||
logger << "GPU: " << device.name << logger.endl; | logger << "GPU: " << device.name << logger.endl; | ||||
@@ -213,7 +215,7 @@ int main(int argc, char* argv[]) try { | |||||
logger << " Done." << logger.endl; | logger << " Done." << logger.endl; | ||||
// Run distributed sort | // Run distributed sort | ||||
logger << "Start sorting ... "; | |||||
logger << "Start sorting ... "; | |||||
Timer_total.start(); | Timer_total.start(); | ||||
bitonicSort(Data); | bitonicSort(Data); | ||||
Timer_total.stop(); | Timer_total.stop(); | ||||