Browse Source

HW3: RC2 - A prephase added for v1 and v2

HW3-RC3
parent
commit
f749862193
68 changed files with 9039 additions and 24 deletions
  1. +4
    -0
      homework_3/.gitignore
  2. +72
    -0
      homework_3/analyse/RC1-7a6f7f5/Pending-PIDs-ampere
  3. +70
    -0
      homework_3/analyse/RC1-7a6f7f5/Pending-PIDs-gpu
  4. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q20.out
  5. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q21.out
  6. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q22.out
  7. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q23.out
  8. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q24.out
  9. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q25.out
  10. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q26.out
  11. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q27.out
  12. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q28.out
  13. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q29.out
  14. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q30.out
  15. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q20.out
  16. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q21.out
  17. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q22.out
  18. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q23.out
  19. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q24.out
  20. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q25.out
  21. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q26.out
  22. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q27.out
  23. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q28.out
  24. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q29.out
  25. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q30.out
  26. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q20.out
  27. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q21.out
  28. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q22.out
  29. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q23.out
  30. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q24.out
  31. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q25.out
  32. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q26.out
  33. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q27.out
  34. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q28.out
  35. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q29.out
  36. +22
    -0
      homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q30.out
  37. +2049
    -0
      homework_3/analyse/RC1-7a6f7f5/profReportv1.txt
  38. +2049
    -0
      homework_3/analyse/RC1-7a6f7f5/profreportv2.txt
  39. +45
    -0
      homework_3/analyse/b31ca23/Pending-PIDs
  40. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q20-1914456.out
  41. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q21-1914457.out
  42. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q22-1914458.out
  43. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q23-1914459.out
  44. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q24-1914460.out
  45. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q25-1914461.out
  46. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q26-1914462.out
  47. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q27-1914463.out
  48. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q28-1914464.out
  49. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q29-1914465.out
  50. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q30-1914466.out
  51. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q20-1914467.out
  52. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q21-1914468.out
  53. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q22-1914469.out
  54. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q23-1914470.out
  55. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q24-1914471.out
  56. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q25-1914472.out
  57. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q26-1914473.out
  58. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q27-1914474.out
  59. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q28-1914475.out
  60. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q29-1914476.out
  61. +2
    -0
      homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q30-1914477.out
  62. +23
    -0
      homework_3/analyse/prof.sh
  63. +4
    -3
      homework_3/hpc/submitJobs.sh
  64. +1917
    -0
      homework_3/reportv1.3
  65. +1917
    -0
      homework_3/reportv2.3
  66. +111
    -15
      homework_3/src/bitonicsort.hpp
  67. +2
    -2
      homework_3/src/config.h
  68. +6
    -4
      homework_3/src/main.cpp

+ 4
- 0
homework_3/.gitignore View File

@@ -20,4 +20,8 @@ various/
.vs/
.vscode/

# nvidia
*.ncu-proj




+ 72
- 0
homework_3/analyse/RC1-7a6f7f5/Pending-PIDs-ampere View File

@@ -0,0 +1,72 @@
Submitting: hpc/BitncV0Q20.sh
Submitted batch job 1914643
Submitting: hpc/BitncV0Q21.sh
Submitted batch job 1914644
Submitting: hpc/BitncV0Q22.sh
Submitted batch job 1914645
Submitting: hpc/BitncV0Q23.sh
Submitted batch job 1914646
Submitting: hpc/BitncV0Q24.sh
Submitted batch job 1914647
Submitting: hpc/BitncV0Q25.sh
Submitted batch job 1914648
Submitting: hpc/BitncV0Q26.sh
Submitted batch job 1914649
Submitting: hpc/BitncV0Q27.sh
Submitted batch job 1914650
Submitting: hpc/BitncV0Q28.sh
Submitted batch job 1914651
Submitting: hpc/BitncV0Q29.sh
Submitted batch job 1914652
Submitting: hpc/BitncV0Q30.sh
Submitted batch job 1914653


Submitting: hpc/BitncV1Q20.sh
Submitted batch job 1914654
Submitting: hpc/BitncV1Q21.sh
Submitted batch job 1914655
Submitting: hpc/BitncV1Q22.sh
Submitted batch job 1914656
Submitting: hpc/BitncV1Q23.sh
Submitted batch job 1914657
Submitting: hpc/BitncV1Q24.sh
Submitted batch job 1914658
Submitting: hpc/BitncV1Q25.sh
Submitted batch job 1914659
Submitting: hpc/BitncV1Q26.sh
Submitted batch job 1914660
Submitting: hpc/BitncV1Q27.sh
Submitted batch job 1914661
Submitting: hpc/BitncV1Q28.sh
Submitted batch job 1914662
Submitting: hpc/BitncV1Q29.sh
Submitted batch job 1914663
Submitting: hpc/BitncV1Q30.sh
Submitted batch job 1914664


Submitting: hpc/BitncV2Q20.sh
Submitted batch job 1914665
Submitting: hpc/BitncV2Q21.sh
Submitted batch job 1914666
Submitting: hpc/BitncV2Q22.sh
Submitted batch job 1914667
Submitting: hpc/BitncV2Q23.sh
Submitted batch job 1914668
Submitting: hpc/BitncV2Q24.sh
Submitted batch job 1914669
Submitting: hpc/BitncV2Q25.sh
Submitted batch job 1914670
Submitting: hpc/BitncV2Q26.sh
Submitted batch job 1914671
Submitting: hpc/BitncV2Q27.sh
Submitted batch job 1914672
Submitting: hpc/BitncV2Q28.sh
Submitted batch job 1914673
Submitting: hpc/BitncV2Q29.sh
Submitted batch job 1914674
Submitting: hpc/BitncV2Q30.sh
Submitted batch job 1914675



+ 70
- 0
homework_3/analyse/RC1-7a6f7f5/Pending-PIDs-gpu View File

@@ -0,0 +1,70 @@
Submitting: hpc/BitncV0Q20.sh
Submitted batch job 1914677
Submitting: hpc/BitncV0Q21.sh
Submitted batch job 1914678
Submitting: hpc/BitncV0Q22.sh
Submitted batch job 1914679
Submitting: hpc/BitncV0Q23.sh
Submitted batch job 1914680
Submitting: hpc/BitncV0Q24.sh
Submitted batch job 1914681
Submitting: hpc/BitncV0Q25.sh
Submitted batch job 1914682
Submitting: hpc/BitncV0Q26.sh
Submitted batch job 1914683
Submitting: hpc/BitncV0Q27.sh
Submitted batch job 1914684
Submitting: hpc/BitncV0Q28.sh
Submitted batch job 1914685
Submitting: hpc/BitncV0Q29.sh
Submitted batch job 1914686
Submitting: hpc/BitncV0Q30.sh
Submitted batch job 1914687


Submitting: hpc/BitncV1Q20.sh
Submitted batch job 1914688
Submitting: hpc/BitncV1Q21.sh
Submitted batch job 1914689
Submitting: hpc/BitncV1Q22.sh
Submitted batch job 1914690
Submitting: hpc/BitncV1Q23.sh
Submitted batch job 1914691
Submitting: hpc/BitncV1Q24.sh
Submitted batch job 1914692
Submitting: hpc/BitncV1Q25.sh
Submitted batch job 1914693
Submitting: hpc/BitncV1Q26.sh
Submitted batch job 1914694
Submitting: hpc/BitncV1Q27.sh
Submitted batch job 1914695
Submitting: hpc/BitncV1Q28.sh
Submitted batch job 1914696
Submitting: hpc/BitncV1Q29.sh
Submitted batch job 1914697
Submitting: hpc/BitncV1Q30.sh
Submitted batch job 1914698


Submitting: hpc/BitncV2Q20.sh
Submitted batch job 1914699
Submitting: hpc/BitncV2Q21.sh
Submitted batch job 1914700
Submitting: hpc/BitncV2Q22.sh
Submitted batch job 1914701
Submitting: hpc/BitncV2Q23.sh
Submitted batch job 1914702
Submitting: hpc/BitncV2Q24.sh
Submitted batch job 1914703
Submitting: hpc/BitncV2Q25.sh
Submitted batch job 1914704
Submitting: hpc/BitncV2Q26.sh
Submitted batch job 1914705
Submitting: hpc/BitncV2Q27.sh
Submitted batch job 1914706
Submitting: hpc/BitncV2Q28.sh
Submitted batch job 1914707
Submitting: hpc/BitncV2Q29.sh
Submitted batch job 1914708
Submitting: hpc/BitncV2Q30.sh
Submitted batch job 1914709

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q20.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 1048576 (Q=20)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 7118 [usec]
[Timing] Mem-xch : 3881 [usec]
[Timing] Sorting : 3233 [usec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q21.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 2097152 (Q=21)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 7597 [usec]
[Timing] Mem-xch : 3359 [usec]
[Timing] Sorting : 4237 [usec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q22.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 4194304 (Q=22)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 10 [msec]
[Timing] Mem-xch : 4320 [usec]
[Timing] Sorting : 5982 [usec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q23.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 8388608 (Q=23)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 29 [msec]
[Timing] Mem-xch : 14 [msec]
[Timing] Sorting : 14 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q24.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 16777216 (Q=24)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 43 [msec]
[Timing] Mem-xch : 13 [msec]
[Timing] Sorting : 29 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q25.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 33554432 (Q=25)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 89 [msec]
[Timing] Mem-xch : 29 [msec]
[Timing] Sorting : 59 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q26.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 67108864 (Q=26)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 184 [msec]
[Timing] Mem-xch : 63 [msec]
[Timing] Sorting : 121 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q27.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 134217728 (Q=27)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 414 [msec]
[Timing] Mem-xch : 157 [msec]
[Timing] Sorting : 255 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q28.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 268435456 (Q=28)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 909 [msec]
[Timing] Mem-xch : 363 [msec]
[Timing] Sorting : 548 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q29.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 536870912 (Q=29)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 2005 [msec]
[Timing] Mem-xch : 840 [msec]
[Timing] Sorting : 1163 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q30.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 1073741824 (Q=30)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 3593 [msec]
[Timing] Mem-xch : 1137 [msec]
[Timing] Sorting : 2456 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q20.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 1048576 (Q=20)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 5607 [usec]
[Timing] Mem-xch : 4043 [usec]
[Timing] Sorting : 1562 [usec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q21.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 2097152 (Q=21)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 4605 [usec]
[Timing] Mem-xch : 2073 [usec]
[Timing] Sorting : 2367 [usec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q22.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 4194304 (Q=22)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 11 [msec]
[Timing] Mem-xch : 7261 [usec]
[Timing] Sorting : 3887 [usec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q23.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 8388608 (Q=23)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 16 [msec]
[Timing] Mem-xch : 8281 [usec]
[Timing] Sorting : 8624 [usec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q24.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 16777216 (Q=24)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 33 [msec]
[Timing] Mem-xch : 15 [msec]
[Timing] Sorting : 18 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q25.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 33554432 (Q=25)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 65 [msec]
[Timing] Mem-xch : 27 [msec]
[Timing] Sorting : 38 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q26.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 67108864 (Q=26)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 136 [msec]
[Timing] Mem-xch : 63 [msec]
[Timing] Sorting : 72 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q27.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 134217728 (Q=27)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 281 [msec]
[Timing] Mem-xch : 125 [msec]
[Timing] Sorting : 156 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q28.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 268435456 (Q=28)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 731 [msec]
[Timing] Mem-xch : 366 [msec]
[Timing] Sorting : 362 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q29.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 536870912 (Q=29)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 1378 [msec]
[Timing] Mem-xch : 632 [msec]
[Timing] Sorting : 753 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q30.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 1073741824 (Q=30)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 3177 [msec]
[Timing] Mem-xch : 1564 [msec]
[Timing] Sorting : 1580 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q20.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 1048576 (Q=20)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 3147 [usec]
[Timing] Mem-xch : 1491 [usec]
[Timing] Sorting : 1646 [usec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q21.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 2097152 (Q=21)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 4908 [usec]
[Timing] Mem-xch : 2369 [usec]
[Timing] Sorting : 2545 [usec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q22.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 4194304 (Q=22)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 8561 [usec]
[Timing] Mem-xch : 4249 [usec]
[Timing] Sorting : 4299 [usec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q23.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 8388608 (Q=23)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 17 [msec]
[Timing] Mem-xch : 8507 [usec]
[Timing] Sorting : 9197 [usec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q24.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 16777216 (Q=24)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 34 [msec]
[Timing] Mem-xch : 14 [msec]
[Timing] Sorting : 19 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q25.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 33554432 (Q=25)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 69 [msec]
[Timing] Mem-xch : 28 [msec]
[Timing] Sorting : 41 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q26.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 67108864 (Q=26)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 149 [msec]
[Timing] Mem-xch : 71 [msec]
[Timing] Sorting : 87 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q27.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 134217728 (Q=27)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 323 [msec]
[Timing] Mem-xch : 151 [msec]
[Timing] Sorting : 166 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q28.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 268435456 (Q=28)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 754 [msec]
[Timing] Mem-xch : 367 [msec]
[Timing] Sorting : 384 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q29.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 536870912 (Q=29)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 1425 [msec]
[Timing] Mem-xch : 639 [msec]
[Timing] Sorting : 796 [msec]
[Validation] Results validation ... [PASSED] 

+ 22
- 0
homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q30.out View File

@@ -0,0 +1,22 @@
[Log]: Array size: 1073741824 (Q=30)
[Log]: Repeated sorts: 7
[Log]: GPU: NVIDIA A100-SXM4-40GB
[Log]: Block size: 512
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Log]: Initialize array ... Done.
[Log]: Start sorting ... Done.
[Timing] Total : 3231 [msec]
[Timing] Mem-xch : 1532 [msec]
[Timing] Sorting : 1676 [msec]
[Validation] Results validation ... [PASSED] 

+ 2049
- 0
homework_3/analyse/RC1-7a6f7f5/profReportv1.txt
File diff suppressed because it is too large
View File


+ 2049
- 0
homework_3/analyse/RC1-7a6f7f5/profreportv2.txt
File diff suppressed because it is too large
View File


+ 45
- 0
homework_3/analyse/b31ca23/Pending-PIDs View File

@@ -0,0 +1,45 @@
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q20.sh
Submitted batch job 1914456
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q21.sh
Submitted batch job 1914457
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q22.sh
Submitted batch job 1914458
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q23.sh
Submitted batch job 1914459
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q24.sh
Submitted batch job 1914460
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q25.sh
Submitted batch job 1914461
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q26.sh
Submitted batch job 1914462
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q27.sh
Submitted batch job 1914463
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q28.sh
Submitted batch job 1914464
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q29.sh
Submitted batch job 1914465
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q30.sh
Submitted batch job 1914466
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q20.sh
Submitted batch job 1914467
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q21.sh
Submitted batch job 1914468
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q22.sh
Submitted batch job 1914469
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q23.sh
Submitted batch job 1914470
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q24.sh
Submitted batch job 1914471
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q25.sh
Submitted batch job 1914472
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q26.sh
Submitted batch job 1914473
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q27.sh
Submitted batch job 1914474
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q28.sh
Submitted batch job 1914475
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q29.sh
Submitted batch job 1914476
[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q30.sh
Submitted batch job 1914477


+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q20-1914456.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 5920 [usec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q21-1914457.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 6571 [usec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q22-1914458.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 13 [msec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q23-1914459.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 24 [msec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q24-1914460.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 46 [msec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q25-1914461.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 92 [msec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q26-1914462.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 213 [msec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q27-1914463.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 440 [msec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q28-1914464.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 935 [msec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q29-1914465.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 1847 [msec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q30-1914466.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 3798 [msec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q20-1914467.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 2843 [usec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q21-1914468.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 4979 [usec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q22-1914469.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 9909 [usec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q23-1914470.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 20 [msec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q24-1914471.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 35 [msec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q25-1914472.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 70 [msec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q26-1914473.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 170 [msec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q27-1914474.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 346 [msec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q28-1914475.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 735 [msec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q29-1914476.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 1522 [msec]
[Validation] Results validation ... [PASSED] 

+ 2
- 0
homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q30-1914477.out View File

@@ -0,0 +1,2 @@
[Timing] Total: 2950 [msec]
[Validation] Results validation ... [PASSED] 

+ 23
- 0
homework_3/analyse/prof.sh View File

@@ -0,0 +1,23 @@
#!/usr/bin/env bash

#
# prof.sh <exec> <report.file>
#
# Runs <exec> under NVIDIA Nsight Compute (ncu, CUDA 11.4 install path) and
# redirects ncu's standard output to <report.file>.
# The executable is always invoked as: <exec> -q 20 -b 512
#

# Both arguments are required. Without this check an empty "$1" or "$2" would
# reach ncu (or the output redirection) and fail with a confusing message.
if [[ $# -lt 2 ]]; then
    echo "Usage: $0 <exec> <report.file>" >&2
    exit 1
fi

# Metrics to collect, one per array entry.
# Keeping them in an array avoids the fragile backslash-continued string
# concatenation, which silently inserts blanks into the list as soon as the
# continuation lines are indented or end with a space.
metrics=(
    "smsp__inst_executed"
    "smsp__cycles_active.avg"
    "smsp__cycles_active.sum"
    "gpu__time_duration.sum"
    "smsp__average_warp_latency_issue_stalled_barrier"
    "smsp__warp_issue_stalled_barrier_per_warp_active"
    "l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld"
    "l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st"
    "l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read"
    "l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write"
    "l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum"
    "l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum"
)

# Join the entries with commas and no whitespace. IFS is changed only inside
# the command substitution's subshell, so the rest of the script is unaffected.
metrics_csv="$(IFS=,; echo -n "${metrics[*]}")"

sudo /usr/local/cuda-11.4/bin/ncu \
    --target-processes all \
    --metrics "$metrics_csv" \
    "$1" -q 20 -b 512 > "$2"

+ 4
- 3
homework_3/hpc/submitJobs.sh View File

@@ -2,8 +2,8 @@

# Submission parameters
QOS="small"
PARTITION="ampere"
SCRIPT_DIR="hpc" # Directory containing the job scripts
PARTITION="ampere" # ampere gpu
SCRIPT_DIR="hpc" # Directory containing the job scripts

# Range of values for the -q parameter
VERSIONS=("V0" "V1" "V2")
@@ -17,8 +17,9 @@ for version in "${VERSIONS[@]}"; do
script_path="${SCRIPT_DIR}/${script_name}"

if [[ -f "$script_path" ]]; then
echo "Submitting: $script_path"
sbatch --qos="$QOS" -p "$PARTITION" "$script_path"
echo "Submitted: $script_path"
#sbatch -p "$PARTITION" "$script_path"
else
echo "Warning: File not found - $script_path"
fi


+ 1917
- 0
homework_3/reportv1.3
File diff suppressed because it is too large
View File


+ 1917
- 0
homework_3/reportv2.3
File diff suppressed because it is too large
View File


+ 111
- 15
homework_3/src/bitonicsort.hpp View File

@@ -15,6 +15,7 @@
#include <cmath>
#include <cstdint>
#include <utility>
#include <stdexcept>

#include "utils.hpp"

@@ -159,9 +160,9 @@ void bitonicSort(DataT& data) {

Timer_memory.start();
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not allocate memory\n");
throw std::runtime_error("[CUDA] - Can not allocate memory");
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
throw std::runtime_error("[CUDA] - Can not copy memory to device");
Timer_memory.stop();

size_t Nth = config.blockSize;
@@ -180,7 +181,7 @@ void bitonicSort(DataT& data) {

Timer_memory.start();
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
throw std::runtime_error("[CUDA] - Can not copy memory from device");
cudaFree(dev_data);
Timer_memory.stop();
}
@@ -247,6 +248,31 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t st
}
}


/*!
 * Pre-phase kernel: executes the first stages of the bitonic double loop in a single kernel launch.
 *
 * Stages 1 .. min(stages, maxStages) are executed in order. Inside every stage the steps run
 * from (stage - 1) down to 0. Each step is delegated to interBlockStep_() on the global \p data
 * array and is followed by a block-level barrier.
 *
 * @note
 * __syncthreads() synchronizes only the threads of the same block.
 * NOTE(review): this presumably relies on every exchange of these early stages staying inside the
 * data handled by a single block - confirm against interBlockStep_() and the value the host passes
 * for \p stages.
 *
 * @tparam ValueT     The underlying data type of the array items
 * @param data        [ValueT*] Pointer to data array
 * @param n           [size_t]  The total size of the array
 * @param stages      [size_t]  The number of stages to pre execute
 * @param maxStages   [size_t]  The maximum number of stages for the entire sort
 */
template <typename ValueT>
__global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages) {
    // Never run past the last stage of the whole sort.
    const size_t lastStage = (stages < maxStages) ? stages : maxStages;

    for (size_t stage = 1; stage <= lastStage; ++stage) {
        // Unsigned count-down: the body sees step = stage-1, stage-2, ..., 0.
        for (size_t step = stage; step-- > 0; ) {
            interBlockStep_(data, n, step, stage);
            __syncthreads();
        }
    }
}

/*!
* A CUDA version of the Bitonic sort algorithm.
*
@@ -262,18 +288,22 @@ void bitonicSort(DataT& data) {

Timer_memory.start();
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not allocate memory\n");
throw std::runtime_error("[CUDA] - Can not allocate memory");
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
throw std::runtime_error("[CUDA] - Can not copy memory to device");
Timer_memory.stop();

size_t Nth = config.blockSize;
size_t Nbl = NBlocks(size);

auto Stages = static_cast<size_t>(log2(size));
auto InnerBlockSteps = static_cast<size_t>(log2(Nth)); //
auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
size_t PrephaseStages= InnerBlockSteps + 1;

Timer_sorting.start();
for (size_t stage = 1; stage <= Stages; ++stage) {
prephase<<<Nbl, Nth>>>(dev_data, size, PrephaseStages, Stages);
cudaDeviceSynchronize();
for (size_t stage = PrephaseStages + 1; stage <= Stages; ++stage) {
size_t step = stage - 1;
for ( ; step > InnerBlockSteps; --step) {
interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
@@ -286,7 +316,7 @@ void bitonicSort(DataT& data) {

Timer_memory.start();
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
throw std::runtime_error("[CUDA] - Can not copy memory from device");
cudaFree(dev_data);
Timer_memory.stop();
}
@@ -301,7 +331,9 @@ void bitonicSort(DataT& data) {
* @note
* Each block thread collection can exchange twice the size of data points.
*/
inline size_t effectiveBlockSize() { return SizeToThreadsRatio * config.blockSize; }
inline constexpr size_t effectiveBlockSize(size_t blockSize) {
    // Number of data points covered by one block of blockSize threads.
    return blockSize * SizeToThreadsRatio;
}



@@ -400,10 +432,70 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t st
__syncthreads();
}

// Write back to global memory
// Write back to global memory (no sync here, there will be sync from host)
data[gIdx0] = shared_data[lIdx0];
data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
}

/*!
* Pre-phase kernel (shared memory version): executes the first stages of the bitonic
* double loop on a block-local copy of the data.
*
* Each thread first copies two items to the shared buffer: the one at its own position
* (gIdx0) and the one blockDim.x positions further. Then stages 1 .. min(stages, maxStages)
* are executed on the shared copy, with the steps of every stage running from (stage - 1)
* down to 0 and a block-level barrier after each step. Finally each thread writes its two
* items back to global memory.
*
* @note
* The shared buffer is declared `extern`, so its size must be supplied at launch. The host
* code of this version launches the kernel with effectiveBlockSize(config.blockSize) * sizeof(value_t) bytes.
*
* @tparam ValueT The underlying data type of the array items
* @param data [ValueT*] Pointer to data array
* @param n [size_t] The total size of the array
* @param stages [size_t] The number of stages to pre execute
* @param maxStages [size_t] The maximum number of stages for the entire sort
*/
template <typename ValueT>
__global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages) {
// Dynamically sized shared buffer (size comes from the kernel launch configuration)
extern __shared__ ValueT shared_data[];

/*
* Global and local(shared) memory indices (calculated once)
* Here we skip blocks every time (one for SizeToThreadsRatio = 2)
* And we cache the neighbor block address indexes in local (shared) memory
*/
threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
threadId_t lIdx0 = toLocal(gIdx0, blockDim.x);

// NOTE(review): this return happens before the barriers below. That is only safe if all
// threads of a block leave together - presumably guaranteed by the array-size constraints
// checked on the host; confirm.
if (gIdx0 + blockDim.x >= n) // Boundary check
return;

// Fetch to local memory the entire effective block size (2 positions for each thread)
shared_data[lIdx0] = data[gIdx0];
shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x];
// Make the whole shared copy visible to the block before the first exchange
__syncthreads();
for (size_t stage = 1; (stage <= stages) && (stage <= maxStages); ++stage) {
// Unsigned count-down: step is decremented first, so the body sees stage-1 ... 0
for (size_t step = stage; step > 0; ) {
--step;

// Init thread global and local indices
threadId_t gIdx = gIdx0;
threadId_t lIdx = lIdx0;
// Find partner and keep-small configuration based on the global data positions
threadId_t pIdx = partner(gIdx, step);
if (gIdx > pIdx) {
// Shift inside effective block: this thread works on the pair that lies
// blockDim.x positions further (presumably so every pair is handled by one thread only)
gIdx += blockDim.x; // global
pIdx += blockDim.x;
lIdx += blockDim.x; // local
}
bool keep = keepSmall(gIdx, pIdx, stage);

// Exchange data on local(shared) copy
threadId_t lpIdx = toLocal(pIdx, blockDim.x);
exchange(shared_data, lIdx, lpIdx, keep);
// All exchanges of this step must finish before the next step reads the buffer
__syncthreads();
}
}

// Write back to global memory (no sync here, there will be sync from host)
data[gIdx0] = shared_data[lIdx0];
data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
}

/*!
@@ -421,19 +513,23 @@ void bitonicSort(DataT& data) {

Timer_memory.start();
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not allocate memory\n");
throw std::runtime_error("[CUDA] - Can not allocate memory");
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
throw std::runtime_error("[CUDA] - Can not copy memory to device");
Timer_memory.stop();

size_t Nth = config.blockSize;
size_t Nbl = NBlocks(size);
size_t kernelMemSize = effectiveBlockSize() * sizeof(value_t);
size_t kernelMemSize = effectiveBlockSize(config.blockSize) * sizeof(value_t);

auto Stages = static_cast<size_t>(log2(size));
auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
size_t PrephaseStages= InnerBlockSteps + 1;

Timer_sorting.start();
for (size_t stage = 1; stage <= Stages; ++stage) {
prephase<<<Nbl, Nth, kernelMemSize>>>(dev_data, size, PrephaseStages, Stages);
cudaDeviceSynchronize();
for (size_t stage = PrephaseStages + 1; stage <= Stages; ++stage) {
size_t step = stage - 1;
for ( ; step > InnerBlockSteps; --step) {
interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
@@ -446,7 +542,7 @@ void bitonicSort(DataT& data) {

Timer_memory.start();
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
throw std::runtime_error("[CUDA] - Can not copy memory from device");
cudaFree(dev_data);
Timer_memory.stop();
}


+ 2
- 2
homework_3/src/config.h View File

@@ -65,8 +65,8 @@ using ArraySize_t = uint64_t;
* The values of the members are set from the command line.
*/
struct config_t {
ArraySize_t arraySize{DEFAULT_DATA_SIZE}; //!< The array size of the local data to sort.
size_t blockSize{THREADS_PER_BLOCK}; //!< The block size (threads per block) for the session.
ArraySize_t arraySize{DEFAULT_DATA_SIZE}; //!< The array size of the local data to sort.
size_t blockSize{THREADS_PER_BLOCK}; //!< The block size (threads per block) for the session.
bool validation{false}; //!< Request a full validation at the end, performed by process rank 0.
size_t perf{1}; //!< Enable performance timing measurements and prints. Repeat
//!< the sorting <perf> times to do so.


+ 6
- 4
homework_3/src/main.cpp View File

@@ -8,6 +8,7 @@
*/

#include <exception>
#include <stdexcept>
#include <iostream>
#include <algorithm>
#include <random>
@@ -140,14 +141,14 @@ bool get_options(int argc, char* argv[]){

// Check configuration requirements
if (config.blockSize % device.warpSize)
throw std::runtime_error("[Config] - Number of threads per block is not an exact multiple of warp size\n");
throw std::runtime_error("[Config] - Number of threads per block is not an exact multiple of warp size");
if (config.arraySize < 2*config.blockSize)
throw std::runtime_error("[Config] - Unsupported array size (smaller than "
+ std::to_string(SizeToThreadsRatio*config.blockSize) + ")\n");
+ std::to_string(SizeToThreadsRatio*config.blockSize) + ")");
if (device.totalGlobalMem < config.arraySize * sizeof(Value_t))
throw std::runtime_error("[CUDA] - Unsupported array size: "
+ std::to_string(config.arraySize * sizeof(Value_t))
+ " (larger than GPU's: " + std::to_string(device.totalGlobalMem) + ")\n");
+ " (larger than GPU's: " + std::to_string(device.totalGlobalMem) + ")");

return status;
}
@@ -197,6 +198,7 @@ int main(int argc, char* argv[]) try {
// Init everything
init(&argc, &argv);

logger << "Code version: " << 'V' << STR(CODE_VERSION) << logger.endl;
logger << "Array size: " << config.arraySize << " (Q=" << static_cast<size_t>(log2(config.arraySize))<< ")" << logger.endl;
logger << "Repeated sorts: " << config.perf << logger.endl;
logger << "GPU: " << device.name << logger.endl;
@@ -213,7 +215,7 @@ int main(int argc, char* argv[]) try {
logger << " Done." << logger.endl;

// Run distributed sort
logger << "Start sorting ... ";
logger << "Start sorting ... ";
Timer_total.start();
bitonicSort(Data);
Timer_total.stop();


Loading…
Cancel
Save