diff --git a/homework_3/.gitignore b/homework_3/.gitignore index 004a010..a29ffc5 100644 --- a/homework_3/.gitignore +++ b/homework_3/.gitignore @@ -20,4 +20,8 @@ various/ .vs/ .vscode/ +# nvidia +*.ncu-proj + + diff --git a/homework_3/analyse/RC1-7a6f7f5/Pending-PIDs-ampere b/homework_3/analyse/RC1-7a6f7f5/Pending-PIDs-ampere new file mode 100644 index 0000000..ee3f7fd --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/Pending-PIDs-ampere @@ -0,0 +1,72 @@ +Submitting: hpc/BitncV0Q20.sh +Submitted batch job 1914643 +Submitting: hpc/BitncV0Q21.sh +Submitted batch job 1914644 +Submitting: hpc/BitncV0Q22.sh +Submitted batch job 1914645 +Submitting: hpc/BitncV0Q23.sh +Submitted batch job 1914646 +Submitting: hpc/BitncV0Q24.sh +Submitted batch job 1914647 +Submitting: hpc/BitncV0Q25.sh +Submitted batch job 1914648 +Submitting: hpc/BitncV0Q26.sh +Submitted batch job 1914649 +Submitting: hpc/BitncV0Q27.sh +Submitted batch job 1914650 +Submitting: hpc/BitncV0Q28.sh +Submitted batch job 1914651 +Submitting: hpc/BitncV0Q29.sh +Submitted batch job 1914652 +Submitting: hpc/BitncV0Q30.sh +Submitted batch job 1914653 + + +Submitting: hpc/BitncV1Q20.sh +Submitted batch job 1914654 +Submitting: hpc/BitncV1Q21.sh +Submitted batch job 1914655 +Submitting: hpc/BitncV1Q22.sh +Submitted batch job 1914656 +Submitting: hpc/BitncV1Q23.sh +Submitted batch job 1914657 +Submitting: hpc/BitncV1Q24.sh +Submitted batch job 1914658 +Submitting: hpc/BitncV1Q25.sh +Submitted batch job 1914659 +Submitting: hpc/BitncV1Q26.sh +Submitted batch job 1914660 +Submitting: hpc/BitncV1Q27.sh +Submitted batch job 1914661 +Submitting: hpc/BitncV1Q28.sh +Submitted batch job 1914662 +Submitting: hpc/BitncV1Q29.sh +Submitted batch job 1914663 +Submitting: hpc/BitncV1Q30.sh +Submitted batch job 1914664 + + +Submitting: hpc/BitncV2Q20.sh +Submitted batch job 1914665 +Submitting: hpc/BitncV2Q21.sh +Submitted batch job 1914666 +Submitting: hpc/BitncV2Q22.sh +Submitted batch job 1914667 +Submitting: hpc/BitncV2Q23.sh +Submitted batch job 1914668 +Submitting: hpc/BitncV2Q24.sh +Submitted batch job 1914669 +Submitting: hpc/BitncV2Q25.sh +Submitted batch job 1914670 +Submitting: hpc/BitncV2Q26.sh +Submitted batch job 1914671 +Submitting: hpc/BitncV2Q27.sh +Submitted batch job 1914672 +Submitting: hpc/BitncV2Q28.sh +Submitted batch job 1914673 +Submitting: hpc/BitncV2Q29.sh +Submitted batch job 1914674 +Submitting: hpc/BitncV2Q30.sh +Submitted batch job 1914675 + + diff --git a/homework_3/analyse/RC1-7a6f7f5/Pending-PIDs-gpu b/homework_3/analyse/RC1-7a6f7f5/Pending-PIDs-gpu new file mode 100644 index 0000000..2763c54 --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/Pending-PIDs-gpu @@ -0,0 +1,70 @@ +Submitting: hpc/BitncV0Q20.sh +Submitted batch job 1914677 +Submitting: hpc/BitncV0Q21.sh +Submitted batch job 1914678 +Submitting: hpc/BitncV0Q22.sh +Submitted batch job 1914679 +Submitting: hpc/BitncV0Q23.sh +Submitted batch job 1914680 +Submitting: hpc/BitncV0Q24.sh +Submitted batch job 1914681 +Submitting: hpc/BitncV0Q25.sh +Submitted batch job 1914682 +Submitting: hpc/BitncV0Q26.sh +Submitted batch job 1914683 +Submitting: hpc/BitncV0Q27.sh +Submitted batch job 1914684 +Submitting: hpc/BitncV0Q28.sh +Submitted batch job 1914685 +Submitting: hpc/BitncV0Q29.sh +Submitted batch job 1914686 +Submitting: hpc/BitncV0Q30.sh +Submitted batch job 1914687 + + +Submitting: hpc/BitncV1Q20.sh +Submitted batch job 1914688 +Submitting: hpc/BitncV1Q21.sh +Submitted batch job 1914689 +Submitting: hpc/BitncV1Q22.sh +Submitted batch job 1914690 +Submitting: 
hpc/BitncV1Q23.sh +Submitted batch job 1914691 +Submitting: hpc/BitncV1Q24.sh +Submitted batch job 1914692 +Submitting: hpc/BitncV1Q25.sh +Submitted batch job 1914693 +Submitting: hpc/BitncV1Q26.sh +Submitted batch job 1914694 +Submitting: hpc/BitncV1Q27.sh +Submitted batch job 1914695 +Submitting: hpc/BitncV1Q28.sh +Submitted batch job 1914696 +Submitting: hpc/BitncV1Q29.sh +Submitted batch job 1914697 +Submitting: hpc/BitncV1Q30.sh +Submitted batch job 1914698 + + +Submitting: hpc/BitncV2Q20.sh +Submitted batch job 1914699 +Submitting: hpc/BitncV2Q21.sh +Submitted batch job 1914700 +Submitting: hpc/BitncV2Q22.sh +Submitted batch job 1914701 +Submitting: hpc/BitncV2Q23.sh +Submitted batch job 1914702 +Submitting: hpc/BitncV2Q24.sh +Submitted batch job 1914703 +Submitting: hpc/BitncV2Q25.sh +Submitted batch job 1914704 +Submitting: hpc/BitncV2Q26.sh +Submitted batch job 1914705 +Submitting: hpc/BitncV2Q27.sh +Submitted batch job 1914706 +Submitting: hpc/BitncV2Q28.sh +Submitted batch job 1914707 +Submitting: hpc/BitncV2Q29.sh +Submitted batch job 1914708 +Submitting: hpc/BitncV2Q30.sh +Submitted batch job 1914709 diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q20.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q20.out new file mode 100644 index 0000000..5a2598a --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q20.out @@ -0,0 +1,22 @@ +[Log]: Array size: 1048576 (Q=20) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 7118 [usec] +[Timing] Mem-xch : 3881 [usec] +[Timing] Sorting : 3233 [usec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q21.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q21.out new file mode 100644 index 0000000..79c5b52 --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q21.out @@ -0,0 +1,22 @@ +[Log]: Array size: 2097152 (Q=21) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 7597 [usec] +[Timing] Mem-xch : 3359 [usec] +[Timing] Sorting : 4237 [usec] +[Validation] Results validation ... 
[PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q22.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q22.out new file mode 100644 index 0000000..f0e984f --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q22.out @@ -0,0 +1,22 @@ +[Log]: Array size: 4194304 (Q=22) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 10 [msec] +[Timing] Mem-xch : 4320 [usec] +[Timing] Sorting : 5982 [usec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q23.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q23.out new file mode 100644 index 0000000..f59fd49 --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q23.out @@ -0,0 +1,22 @@ +[Log]: Array size: 8388608 (Q=23) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 29 [msec] +[Timing] Mem-xch : 14 [msec] +[Timing] Sorting : 14 [msec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q24.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q24.out new file mode 100644 index 0000000..9177d3f --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q24.out @@ -0,0 +1,22 @@ +[Log]: Array size: 16777216 (Q=24) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 43 [msec] +[Timing] Mem-xch : 13 [msec] +[Timing] Sorting : 29 [msec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q25.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q25.out new file mode 100644 index 0000000..62267c0 --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q25.out @@ -0,0 +1,22 @@ +[Log]: Array size: 33554432 (Q=25) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. 
+[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 89 [msec] +[Timing] Mem-xch : 29 [msec] +[Timing] Sorting : 59 [msec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q26.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q26.out new file mode 100644 index 0000000..3c11832 --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q26.out @@ -0,0 +1,22 @@ +[Log]: Array size: 67108864 (Q=26) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 184 [msec] +[Timing] Mem-xch : 63 [msec] +[Timing] Sorting : 121 [msec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q27.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q27.out new file mode 100644 index 0000000..f33a2c6 --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q27.out @@ -0,0 +1,22 @@ +[Log]: Array size: 134217728 (Q=27) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 414 [msec] +[Timing] Mem-xch : 157 [msec] +[Timing] Sorting : 255 [msec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q28.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q28.out new file mode 100644 index 0000000..4ed678b --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q28.out @@ -0,0 +1,22 @@ +[Log]: Array size: 268435456 (Q=28) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 909 [msec] +[Timing] Mem-xch : 363 [msec] +[Timing] Sorting : 548 [msec] +[Validation] Results validation ... 
[PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q29.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q29.out new file mode 100644 index 0000000..e1c59bd --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q29.out @@ -0,0 +1,22 @@ +[Log]: Array size: 536870912 (Q=29) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 2005 [msec] +[Timing] Mem-xch : 840 [msec] +[Timing] Sorting : 1163 [msec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q30.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q30.out new file mode 100644 index 0000000..17c3212 --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV0Q30.out @@ -0,0 +1,22 @@ +[Log]: Array size: 1073741824 (Q=30) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 3593 [msec] +[Timing] Mem-xch : 1137 [msec] +[Timing] Sorting : 2456 [msec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q20.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q20.out new file mode 100644 index 0000000..d16038e --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q20.out @@ -0,0 +1,22 @@ +[Log]: Array size: 1048576 (Q=20) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 5607 [usec] +[Timing] Mem-xch : 4043 [usec] +[Timing] Sorting : 1562 [usec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q21.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q21.out new file mode 100644 index 0000000..73c1ed2 --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q21.out @@ -0,0 +1,22 @@ +[Log]: Array size: 2097152 (Q=21) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. 
+[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 4605 [usec] +[Timing] Mem-xch : 2073 [usec] +[Timing] Sorting : 2367 [usec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q22.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q22.out new file mode 100644 index 0000000..52c47c9 --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q22.out @@ -0,0 +1,22 @@ +[Log]: Array size: 4194304 (Q=22) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 11 [msec] +[Timing] Mem-xch : 7261 [usec] +[Timing] Sorting : 3887 [usec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q23.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q23.out new file mode 100644 index 0000000..37fd860 --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q23.out @@ -0,0 +1,22 @@ +[Log]: Array size: 8388608 (Q=23) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 16 [msec] +[Timing] Mem-xch : 8281 [usec] +[Timing] Sorting : 8624 [usec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q24.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q24.out new file mode 100644 index 0000000..6454e1e --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q24.out @@ -0,0 +1,22 @@ +[Log]: Array size: 16777216 (Q=24) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 33 [msec] +[Timing] Mem-xch : 15 [msec] +[Timing] Sorting : 18 [msec] +[Validation] Results validation ... 
[PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q25.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q25.out new file mode 100644 index 0000000..4e37052 --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q25.out @@ -0,0 +1,22 @@ +[Log]: Array size: 33554432 (Q=25) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 65 [msec] +[Timing] Mem-xch : 27 [msec] +[Timing] Sorting : 38 [msec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q26.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q26.out new file mode 100644 index 0000000..4e705e8 --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q26.out @@ -0,0 +1,22 @@ +[Log]: Array size: 67108864 (Q=26) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 136 [msec] +[Timing] Mem-xch : 63 [msec] +[Timing] Sorting : 72 [msec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q27.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q27.out new file mode 100644 index 0000000..78577bc --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q27.out @@ -0,0 +1,22 @@ +[Log]: Array size: 134217728 (Q=27) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 281 [msec] +[Timing] Mem-xch : 125 [msec] +[Timing] Sorting : 156 [msec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q28.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q28.out new file mode 100644 index 0000000..cba64c8 --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q28.out @@ -0,0 +1,22 @@ +[Log]: Array size: 268435456 (Q=28) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. 
+[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 731 [msec] +[Timing] Mem-xch : 366 [msec] +[Timing] Sorting : 362 [msec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q29.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q29.out new file mode 100644 index 0000000..3acbf6f --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q29.out @@ -0,0 +1,22 @@ +[Log]: Array size: 536870912 (Q=29) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 1378 [msec] +[Timing] Mem-xch : 632 [msec] +[Timing] Sorting : 753 [msec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q30.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q30.out new file mode 100644 index 0000000..e768588 --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV1Q30.out @@ -0,0 +1,22 @@ +[Log]: Array size: 1073741824 (Q=30) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 3177 [msec] +[Timing] Mem-xch : 1564 [msec] +[Timing] Sorting : 1580 [msec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q20.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q20.out new file mode 100644 index 0000000..e2403d4 --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q20.out @@ -0,0 +1,22 @@ +[Log]: Array size: 1048576 (Q=20) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 3147 [usec] +[Timing] Mem-xch : 1491 [usec] +[Timing] Sorting : 1646 [usec] +[Validation] Results validation ... 
[PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q21.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q21.out new file mode 100644 index 0000000..7e5f117 --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q21.out @@ -0,0 +1,22 @@ +[Log]: Array size: 2097152 (Q=21) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 4908 [usec] +[Timing] Mem-xch : 2369 [usec] +[Timing] Sorting : 2545 [usec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q22.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q22.out new file mode 100644 index 0000000..792ad6c --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q22.out @@ -0,0 +1,22 @@ +[Log]: Array size: 4194304 (Q=22) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 8561 [usec] +[Timing] Mem-xch : 4249 [usec] +[Timing] Sorting : 4299 [usec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q23.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q23.out new file mode 100644 index 0000000..14b51c2 --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q23.out @@ -0,0 +1,22 @@ +[Log]: Array size: 8388608 (Q=23) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 17 [msec] +[Timing] Mem-xch : 8507 [usec] +[Timing] Sorting : 9197 [usec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q24.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q24.out new file mode 100644 index 0000000..ae5e36e --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q24.out @@ -0,0 +1,22 @@ +[Log]: Array size: 16777216 (Q=24) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. 
+[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 34 [msec] +[Timing] Mem-xch : 14 [msec] +[Timing] Sorting : 19 [msec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q25.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q25.out new file mode 100644 index 0000000..7b02c36 --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q25.out @@ -0,0 +1,22 @@ +[Log]: Array size: 33554432 (Q=25) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 69 [msec] +[Timing] Mem-xch : 28 [msec] +[Timing] Sorting : 41 [msec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q26.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q26.out new file mode 100644 index 0000000..5851b86 --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q26.out @@ -0,0 +1,22 @@ +[Log]: Array size: 67108864 (Q=26) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 149 [msec] +[Timing] Mem-xch : 71 [msec] +[Timing] Sorting : 87 [msec] +[Validation] Results validation ... [PASSED]  diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q27.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q27.out new file mode 100644 index 0000000..eb944d2 --- /dev/null +++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q27.out @@ -0,0 +1,22 @@ +[Log]: Array size: 134217728 (Q=27) +[Log]: Repeated sorts: 7 +[Log]: GPU: NVIDIA A100-SXM4-40GB +[Log]: Block size: 512 +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Log]: Initialize array ... Done. +[Log]: Start sorting ... Done. +[Timing] Total : 323 [msec] +[Timing] Mem-xch : 151 [msec] +[Timing] Sorting : 166 [msec] +[Validation] Results validation ... 
[PASSED]
diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q28.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q28.out
new file mode 100644
index 0000000..18cd3d5
--- /dev/null
+++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q28.out
@@ -0,0 +1,22 @@
+[Log]: Array size: 268435456 (Q=28)
+[Log]: Repeated sorts: 7
+[Log]: GPU: NVIDIA A100-SXM4-40GB
+[Log]: Block size: 512
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Timing] Total : 754 [msec]
+[Timing] Mem-xch : 367 [msec]
+[Timing] Sorting : 384 [msec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q29.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q29.out
new file mode 100644
index 0000000..3e51390
--- /dev/null
+++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q29.out
@@ -0,0 +1,22 @@
+[Log]: Array size: 536870912 (Q=29)
+[Log]: Repeated sorts: 7
+[Log]: GPU: NVIDIA A100-SXM4-40GB
+[Log]: Block size: 512
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Timing] Total : 1425 [msec]
+[Timing] Mem-xch : 639 [msec]
+[Timing] Sorting : 796 [msec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q30.out b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q30.out
new file mode 100644
index 0000000..ab3960d
--- /dev/null
+++ b/homework_3/analyse/RC1-7a6f7f5/ampere/slurm-BitncV2Q30.out
@@ -0,0 +1,22 @@
+[Log]: Array size: 1073741824 (Q=30)
+[Log]: Repeated sorts: 7
+[Log]: GPU: NVIDIA A100-SXM4-40GB
+[Log]: Block size: 512
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Log]: Initialize array ... Done.
+[Log]: Start sorting ... Done.
+[Timing] Total : 3231 [msec]
+[Timing] Mem-xch : 1532 [msec]
+[Timing] Sorting : 1676 [msec]
+[Validation] Results validation ... [PASSED]
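Note on the slurm-Bitnc*.out files above: each run reports Total, Mem-xch and Sorting times, switching between [usec] and [msec] depending on magnitude, which makes comparing versions V0/V1/V2 across Q by eye tedious. A minimal tabulation sketch follows, assuming the .out files are read from analyse/RC1-7a6f7f5/ampere/ as laid out in this commit; the script, its regexes and the normalisation to milliseconds are illustrative additions and not part of the repository.

#!/usr/bin/env python3
# Hypothetical helper (not part of this commit): tabulate the [Timing] lines
# from the slurm-Bitnc*.out files, normalising usec/msec to milliseconds.
import re
from pathlib import Path

NAME_RE    = re.compile(r"slurm-BitncV(?P<ver>\d+)Q(?P<q>\d+)\.out")
TIMING_RE  = re.compile(r"\[Timing\]\s+(?P<what>\S+)\s*:\s*(?P<val>\d+)\s*\[(?P<unit>usec|msec)\]")
UNIT_TO_MS = {"usec": 1e-3, "msec": 1.0}

def collect(outdir: str):
    """Return sorted (version, Q, {label: milliseconds}) tuples, one per output file."""
    rows = []
    for path in sorted(Path(outdir).glob("slurm-Bitnc*.out")):
        name = NAME_RE.search(path.name)
        if name is None:
            continue
        timings = {}
        for line in path.read_text().splitlines():
            match = TIMING_RE.search(line)
            if match:
                timings[match["what"]] = int(match["val"]) * UNIT_TO_MS[match["unit"]]
        rows.append((int(name["ver"]), int(name["q"]), timings))
    return sorted(rows)

if __name__ == "__main__":
    # Directory assumed from the paths added in this diff.
    for ver, q, t in collect("analyse/RC1-7a6f7f5/ampere"):
        print(f"V{ver} Q{q:2d}  total={t.get('Total', 0):9.3f} ms  "
              f"mem-xch={t.get('Mem-xch', 0):9.3f} ms  "
              f"sorting={t.get('Sorting', 0):9.3f} ms")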
diff --git a/homework_3/analyse/RC1-7a6f7f5/profReportv1.txt b/homework_3/analyse/RC1-7a6f7f5/profReportv1.txt
new file mode 100644
index 0000000..07632fd
--- /dev/null
+++ b/homework_3/analyse/RC1-7a6f7f5/profReportv1.txt
@@ -0,0 +1,2049 @@
+==PROF== Connected to process 22938 (/home/hoo2/Work/AUTH/PDS/homework_3/out/v1/bitonicCUDA)
+==PROF== Profiling "prephase" - 1: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 2: 0%....50%....100% - 6 passes
+==PROF== Profiling "inBlockStep" - 3: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 4: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 5: 0%....50%....100% - 6 passes
+==PROF== Profiling "inBlockStep" - 6: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 7: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 8: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 9: 0%....50%....100% - 6 passes
+==PROF== Profiling "inBlockStep" - 10: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 11: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 12: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 13: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 14: 0%....50%....100% - 6 passes
+==PROF== Profiling "inBlockStep" - 15: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 16: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 17: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 18: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 19: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 20: 0%....50%....100% - 6 passes
+==PROF== Profiling "inBlockStep" - 21: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 22: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 23: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 24: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 25: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 26: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 27: 0%....50%....100% - 6 passes
+==PROF== Profiling "inBlockStep" - 28: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 29: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 30: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 31: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 32: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 33: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 34: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 35: 0%....50%....100% - 6 passes
+==PROF== Profiling "inBlockStep" - 36: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 37: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 38: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 39: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 40: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 41: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 42: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 43: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 44: 0%....50%....100% - 6 passes
+==PROF== Profiling "inBlockStep" - 45: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 46: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 47: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 48: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 49: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 50: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 51: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 52: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 53: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 54: 0%....50%....100% - 6 passes
+==PROF== Profiling "inBlockStep" - 55: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 56: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 57: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 58: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 59: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 60: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 61: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 62: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 63: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 64: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 65: 0%....50%....100% - 6 passes
+==PROF== Profiling "inBlockStep" - 66: 0%....50%....100% - 6 passes
+==PROF== Disconnected from process 22938
+[22938] bitonicCUDA@127.0.0.1
+  void prephase(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:17, Context 1, Stream 7
+    Section: Command line profiler metrics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    gpu__time_duration.sum                                                         msecond                           1.06
+    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum                                                             0
+    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum                           (!)
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.22 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.91 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 17.12 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 5.48 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 1,054,051.50 + smsp__average_warp_latency_issue_stalled_barrier.ratio 10,540.52 + smsp__inst_executed.avg inst 770,272.39 + smsp__inst_executed.max inst 782,344 + smsp__inst_executed.min inst 758,292 + smsp__inst_executed.sum inst 49,297,433 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 23.30 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.23 + smsp__cycles_active.avg cycle 1,462,946.80 + smsp__cycles_active.sum cycle 93,628,595 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:17, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.62 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,435.56 + smsp__inst_executed.max inst 12,845 + smsp__inst_executed.min inst 12,159 + smsp__inst_executed.sum inst 795,876 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,038.88 + smsp__cycles_active.sum cycle 4,674,488 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:17, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 183.52 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 10.83 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.47 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 14.71 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.71 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 159,299.29 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,592.99 + smsp__inst_executed.avg inst 132,203.94 + smsp__inst_executed.max inst 134,386 + smsp__inst_executed.min inst 130,049 + smsp__inst_executed.sum inst 8,461,052 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 20.87 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.21 + smsp__cycles_active.avg cycle 251,805.25 + smsp__cycles_active.sum cycle 16,115,536 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:17, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.82 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,300.55 + smsp__inst_executed.max inst 12,511 + smsp__inst_executed.min inst 11,976 + smsp__inst_executed.sum inst 787,235 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,663.48 + smsp__cycles_active.sum cycle 4,586,463 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:17, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.53 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.24 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.92 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,308.77 + smsp__inst_executed.max inst 12,590 + smsp__inst_executed.min inst 11,916 + smsp__inst_executed.sum inst 787,761 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,511.94 + smsp__cycles_active.sum cycle 4,576,764 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:17, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 185.38 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 10.83 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.47 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 14.71 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.71 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 157,819.92 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,578.20 + smsp__inst_executed.avg inst 132,202.20 + smsp__inst_executed.max inst 134,335 + smsp__inst_executed.min inst 130,058 + smsp__inst_executed.sum inst 8,460,941 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 20.63 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.21 + smsp__cycles_active.avg cycle 252,205.56 + smsp__cycles_active.sum cycle 16,141,156 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:17, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.62 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.42 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,295.12 + smsp__inst_executed.max inst 12,845 + smsp__inst_executed.min inst 11,576 + smsp__inst_executed.sum inst 786,888 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,152.91 + smsp__cycles_active.sum cycle 4,617,786 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:17, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.11 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.95 + smsp__inst_executed.max inst 12,735 + smsp__inst_executed.min inst 11,896 + smsp__inst_executed.sum inst 787,133 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,377.27 + smsp__cycles_active.sum cycle 4,568,145 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:17, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.66 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,309.39 + smsp__inst_executed.max inst 12,687 + smsp__inst_executed.min inst 11,876 + smsp__inst_executed.sum inst 787,801 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,765.78 + smsp__cycles_active.sum cycle 4,593,010 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:17, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 183.65 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 10.83 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.47 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 14.72 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.71 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 160,738.71 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,607.39 + smsp__inst_executed.avg inst 132,202.83 + smsp__inst_executed.max inst 134,317 + smsp__inst_executed.min inst 130,020 + smsp__inst_executed.sum inst 8,460,981 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 20.97 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.21 + smsp__cycles_active.avg cycle 253,056.44 + smsp__cycles_active.sum cycle 16,195,612 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:18, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.18 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.47 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.95 + smsp__inst_executed.max inst 12,831 + smsp__inst_executed.min inst 11,912 + smsp__inst_executed.sum inst 786,621 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,120.38 + smsp__cycles_active.sum cycle 4,487,704 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:18, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.53 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.42 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,294.12 + smsp__inst_executed.max inst 12,790 + smsp__inst_executed.min inst 11,908 + smsp__inst_executed.sum inst 786,824 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,326.78 + smsp__cycles_active.sum cycle 4,628,914 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:18, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.08 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.86 + smsp__inst_executed.max inst 12,545 + smsp__inst_executed.min inst 12,035 + smsp__inst_executed.sum inst 787,127 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,534.31 + smsp__cycles_active.sum cycle 4,578,196 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:18, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.59 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,307.75 + smsp__inst_executed.max inst 12,590 + smsp__inst_executed.min inst 11,663 + smsp__inst_executed.sum inst 787,696 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,304.95 + smsp__cycles_active.sum cycle 4,627,517 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:18, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 185.25 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 10.83 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.47 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 14.71 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.71 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 159,251.51 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,592.52 + smsp__inst_executed.avg inst 132,211.53 + smsp__inst_executed.max inst 134,374 + smsp__inst_executed.min inst 130,131 + smsp__inst_executed.sum inst 8,461,538 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 20.82 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.21 + smsp__cycles_active.avg cycle 252,324.94 + smsp__cycles_active.sum cycle 16,148,796 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:18, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.18 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.98 + smsp__inst_executed.max inst 12,542 + smsp__inst_executed.min inst 12,084 + smsp__inst_executed.sum inst 786,559 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,892.06 + smsp__cycles_active.sum cycle 4,537,092 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:18, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.05 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,291.59 + smsp__inst_executed.max inst 12,644 + smsp__inst_executed.min inst 11,864 + smsp__inst_executed.sum inst 786,662 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,680.12 + smsp__cycles_active.sum cycle 4,587,528 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:18, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.34 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.95 + smsp__inst_executed.max inst 12,565 + smsp__inst_executed.min inst 11,908 + smsp__inst_executed.sum inst 786,813 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,494.78 + smsp__cycles_active.sum cycle 4,639,666 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:18, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.92 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,299.05 + smsp__inst_executed.max inst 12,764 + smsp__inst_executed.min inst 11,918 + smsp__inst_executed.sum inst 787,139 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,412.25 + smsp__cycles_active.sum cycle 4,506,384 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:18, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.50 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.23 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,308.47 + smsp__inst_executed.max inst 12,770 + smsp__inst_executed.min inst 11,835 + smsp__inst_executed.sum inst 787,742 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,493.03 + smsp__cycles_active.sum cycle 4,639,554 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:19, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 183.30 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 10.83 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.47 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 14.71 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.71 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 160,924.14 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,609.24 + smsp__inst_executed.avg inst 132,203.48 + smsp__inst_executed.max inst 132,405 + smsp__inst_executed.min inst 132,014 + smsp__inst_executed.sum inst 8,461,023 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 21.00 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.21 + smsp__cycles_active.avg cycle 252,862.81 + smsp__cycles_active.sum cycle 16,183,220 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:19, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.14 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.66 + smsp__inst_executed.max inst 12,472 + smsp__inst_executed.min inst 12,102 + smsp__inst_executed.sum inst 786,474 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,593.94 + smsp__cycles_active.sum cycle 4,582,012 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:19, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.05 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.95 + smsp__inst_executed.max inst 12,688 + smsp__inst_executed.min inst 11,936 + smsp__inst_executed.sum inst 786,493 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,439.05 + smsp__cycles_active.sum cycle 4,572,099 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:19, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.89 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.47 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.78 + smsp__inst_executed.max inst 12,548 + smsp__inst_executed.min inst 11,926 + smsp__inst_executed.sum inst 786,610 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,595.69 + smsp__cycles_active.sum cycle 4,582,124 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:19, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.27 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.39 + smsp__inst_executed.max inst 12,562 + smsp__inst_executed.min inst 11,983 + smsp__inst_executed.sum inst 786,777 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,335.70 + smsp__cycles_active.sum cycle 4,629,485 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:19, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.95 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298 + smsp__inst_executed.max inst 12,523 + smsp__inst_executed.min inst 11,883 + smsp__inst_executed.sum inst 787,072 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,859.33 + smsp__cycles_active.sum cycle 4,598,997 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:19, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.75 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.23 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,307.64 + smsp__inst_executed.max inst 12,574 + smsp__inst_executed.min inst 12,091 + smsp__inst_executed.sum inst 787,689 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,764.64 + smsp__cycles_active.sum cycle 4,656,937 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:19, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 182.78 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 10.83 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.47 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 14.72 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.71 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 158,443.01 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,584.43 + smsp__inst_executed.avg inst 132,209.97 + smsp__inst_executed.max inst 134,328 + smsp__inst_executed.min inst 130,097 + smsp__inst_executed.sum inst 8,461,438 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 20.67 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.21 + smsp__cycles_active.avg cycle 252,761.25 + smsp__cycles_active.sum cycle 16,176,720 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:19, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.78 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.77 + smsp__inst_executed.max inst 12,668 + smsp__inst_executed.min inst 11,916 + smsp__inst_executed.sum inst 786,481 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,578.50 + smsp__cycles_active.sum cycle 4,645,024 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:19, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.11 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.92 + smsp__inst_executed.max inst 12,639 + smsp__inst_executed.min inst 12,088 + smsp__inst_executed.sum inst 786,491 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,972.80 + smsp__cycles_active.sum cycle 4,606,259 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:20, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.37 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.28 + smsp__inst_executed.max inst 12,596 + smsp__inst_executed.min inst 11,888 + smsp__inst_executed.sum inst 786,514 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,734.44 + smsp__cycles_active.sum cycle 4,591,004 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:20, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.08 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.47 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.53 + smsp__inst_executed.max inst 12,668 + smsp__inst_executed.min inst 11,966 + smsp__inst_executed.sum inst 786,530 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,884.67 + smsp__cycles_active.sum cycle 4,536,619 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:20, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.43 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,292.44 + smsp__inst_executed.max inst 12,568 + smsp__inst_executed.min inst 12,004 + smsp__inst_executed.sum inst 786,716 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,469.70 + smsp__cycles_active.sum cycle 4,574,061 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:20, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.95 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.23 + smsp__inst_executed.max inst 12,752 + smsp__inst_executed.min inst 12,082 + smsp__inst_executed.sum inst 787,087 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,690.11 + smsp__cycles_active.sum cycle 4,524,167 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:20, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.46 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.23 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,310.39 + smsp__inst_executed.max inst 12,698 + smsp__inst_executed.min inst 11,982 + smsp__inst_executed.sum inst 787,865 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,817.17 + smsp__cycles_active.sum cycle 4,660,299 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:20, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 183.30 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 10.83 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.47 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 14.71 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.71 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 160,300.52 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,603.01 + smsp__inst_executed.avg inst 132,192.88 + smsp__inst_executed.max inst 134,273 + smsp__inst_executed.min inst 130,123 + smsp__inst_executed.sum inst 8,460,344 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 20.96 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.21 + smsp__cycles_active.avg cycle 252,499.88 + smsp__cycles_active.sum cycle 16,159,992 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:20, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.46 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,287.98 + smsp__inst_executed.max inst 12,504 + smsp__inst_executed.min inst 12,076 + smsp__inst_executed.sum inst 786,431 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,785.80 + smsp__cycles_active.sum cycle 4,658,291 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:20, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.94 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.17 + smsp__inst_executed.max inst 12,684 + smsp__inst_executed.min inst 11,880 + smsp__inst_executed.sum inst 786,443 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,055.12 + smsp__cycles_active.sum cycle 4,547,528 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:20, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.43 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.56 + smsp__inst_executed.max inst 12,501 + smsp__inst_executed.min inst 11,924 + smsp__inst_executed.sum inst 786,468 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,654.25 + smsp__cycles_active.sum cycle 4,585,872 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:20, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.21 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.83 + smsp__inst_executed.max inst 12,524 + smsp__inst_executed.min inst 11,928 + smsp__inst_executed.sum inst 786,549 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,462.27 + smsp__cycles_active.sum cycle 4,573,585 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:20, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.05 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.45 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,291.52 + smsp__inst_executed.max inst 12,785 + smsp__inst_executed.min inst 11,986 + smsp__inst_executed.sum inst 786,657 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,521.95 + smsp__cycles_active.sum cycle 4,577,405 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:21, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.46 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.20 + smsp__inst_executed.max inst 12,707 + smsp__inst_executed.min inst 11,874 + smsp__inst_executed.sum inst 786,765 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,517.52 + smsp__cycles_active.sum cycle 4,577,121 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:21, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.14 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.66 + smsp__inst_executed.max inst 12,494 + smsp__inst_executed.min inst 12,032 + smsp__inst_executed.sum inst 787,114 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,726.52 + smsp__cycles_active.sum cycle 4,590,497 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:21, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.69 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,308.78 + smsp__inst_executed.max inst 12,573 + smsp__inst_executed.min inst 11,874 + smsp__inst_executed.sum inst 787,762 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,852.81 + smsp__cycles_active.sum cycle 4,598,580 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:21, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 185.66 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 10.83 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.47 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 14.72 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.71 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 158,707.09 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,587.07 + smsp__inst_executed.avg inst 132,223.38 + smsp__inst_executed.max inst 134,277 + smsp__inst_executed.min inst 130,147 + smsp__inst_executed.sum inst 8,462,296 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 20.80 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.21 + smsp__cycles_active.avg cycle 251,577.56 + smsp__cycles_active.sum cycle 16,100,964 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:21, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.55 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.16 + smsp__inst_executed.max inst 12,868 + smsp__inst_executed.min inst 11,716 + smsp__inst_executed.sum inst 786,442 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,504.88 + smsp__cycles_active.sum cycle 4,640,312 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:21, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.23 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.09 + smsp__inst_executed.max inst 12,496 + smsp__inst_executed.min inst 11,912 + smsp__inst_executed.sum inst 786,438 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,023.64 + smsp__cycles_active.sum cycle 4,609,513 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:21, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.69 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,287.92 + smsp__inst_executed.max inst 12,480 + smsp__inst_executed.min inst 11,908 + smsp__inst_executed.sum inst 786,427 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,365.19 + smsp__cycles_active.sum cycle 4,631,372 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:21, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.08 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.72 + smsp__inst_executed.max inst 12,515 + smsp__inst_executed.min inst 12,092 + smsp__inst_executed.sum inst 786,478 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,533.81 + smsp__cycles_active.sum cycle 4,578,164 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:21, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.02 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.28 + smsp__inst_executed.max inst 12,696 + smsp__inst_executed.min inst 11,904 + smsp__inst_executed.sum inst 786,450 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,528.86 + smsp__cycles_active.sum cycle 4,577,847 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:21, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.08 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.33 + smsp__inst_executed.max inst 12,884 + smsp__inst_executed.min inst 11,908 + smsp__inst_executed.sum inst 786,581 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,784.72 + smsp__cycles_active.sum cycle 4,530,222 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:22, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.85 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.06 + smsp__inst_executed.max inst 12,512 + smsp__inst_executed.min inst 12,014 + smsp__inst_executed.sum inst 786,756 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,642.33 + smsp__cycles_active.sum cycle 4,585,109 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:22, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.11 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,297.52 + smsp__inst_executed.max inst 12,537 + smsp__inst_executed.min inst 12,055 + smsp__inst_executed.sum inst 787,041 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,705.81 + smsp__cycles_active.sum cycle 4,525,172 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:22, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.66 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,310.97 + smsp__inst_executed.max inst 12,739 + smsp__inst_executed.min inst 12,075 + smsp__inst_executed.sum inst 787,902 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,633.72 + smsp__cycles_active.sum cycle 4,648,558 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:22, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 183.74 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 10.83 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.47 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 14.71 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.71 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 158,858.86 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,588.59 + smsp__inst_executed.avg inst 132,194.42 + smsp__inst_executed.max inst 134,318 + smsp__inst_executed.min inst 130,076 + smsp__inst_executed.sum inst 8,460,443 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 20.74 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.21 + smsp__cycles_active.avg cycle 252,719.28 + smsp__cycles_active.sum cycle 16,174,034 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:22, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 56.54 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,287.77 + smsp__inst_executed.max inst 12,500 + smsp__inst_executed.min inst 12,088 + smsp__inst_executed.sum inst 786,417 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 69,383.22 + smsp__cycles_active.sum cycle 4,440,526 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:22, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.62 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.28 + smsp__inst_executed.max inst 12,856 + smsp__inst_executed.min inst 11,904 + smsp__inst_executed.sum inst 786,450 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,261.28 + smsp__cycles_active.sum cycle 4,624,722 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:22, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.01 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.75 + smsp__inst_executed.max inst 12,500 + smsp__inst_executed.min inst 11,888 + smsp__inst_executed.sum inst 786,480 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,742.92 + smsp__cycles_active.sum cycle 4,591,547 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:22, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.50 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.27 + smsp__inst_executed.max inst 12,505 + smsp__inst_executed.min inst 11,888 + smsp__inst_executed.sum inst 786,449 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,612.34 + smsp__cycles_active.sum cycle 4,583,190 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:22, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.27 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.44 + smsp__inst_executed.max inst 12,539 + smsp__inst_executed.min inst 11,888 + smsp__inst_executed.sum inst 786,460 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,165.33 + smsp__cycles_active.sum cycle 4,554,581 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:22, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.14 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.31 + smsp__inst_executed.max inst 12,472 + smsp__inst_executed.min inst 12,108 + smsp__inst_executed.sum inst 786,516 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,451.98 + smsp__cycles_active.sum cycle 4,508,927 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:22, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.02 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,291.08 + smsp__inst_executed.max inst 12,576 + smsp__inst_executed.min inst 11,684 + smsp__inst_executed.sum inst 786,629 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,947.14 + smsp__cycles_active.sum cycle 4,540,617 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:23, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.01 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.42 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,294.75 + smsp__inst_executed.max inst 12,874 + smsp__inst_executed.min inst 11,724 + smsp__inst_executed.sum inst 786,864 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,199.77 + smsp__cycles_active.sum cycle 4,620,785 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:23, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.11 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,301.61 + smsp__inst_executed.max inst 12,569 + smsp__inst_executed.min inst 12,033 + smsp__inst_executed.sum inst 787,303 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,236.08 + smsp__cycles_active.sum cycle 4,559,109 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:23, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.56 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,303.88 + smsp__inst_executed.max inst 12,532 + smsp__inst_executed.min inst 12,088 + smsp__inst_executed.sum inst 787,448 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,734.66 + smsp__cycles_active.sum cycle 4,591,018 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:23, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 184.90 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a
+ l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 10.83
+ l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.47
+ l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 14.71
+ l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.71
+ l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
+ l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
+ l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
+ l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
+ l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
+ l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
+ l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
+ l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
+ smsp__average_warp_latency_issue_stalled_barrier.pct % 160,046.97
+ smsp__average_warp_latency_issue_stalled_barrier.ratio 1,600.47
+ smsp__inst_executed.avg inst 131,978.41
+ smsp__inst_executed.max inst 134,065
+ smsp__inst_executed.min inst 129,873
+ smsp__inst_executed.sum inst 8,446,618
+ smsp__warp_issue_stalled_barrier_per_warp_active.pct % 20.89
+ smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.21
+ smsp__cycles_active.avg cycle 252,655.50
+ smsp__cycles_active.sum cycle 16,169,952
+ ---------------------------------------------------------------------- --------------- ------------------------------
+
diff --git a/homework_3/analyse/RC1-7a6f7f5/profreportv2.txt b/homework_3/analyse/RC1-7a6f7f5/profreportv2.txt
new file mode 100644
index 0000000..109607b
--- /dev/null
+++ b/homework_3/analyse/RC1-7a6f7f5/profreportv2.txt
@@ -0,0 +1,2049 @@
+==PROF== Connected to process 23012 (/home/hoo2/Work/AUTH/PDS/homework_3/out/v2/bitonicCUDA)
+==PROF== Profiling "prephase" - 1: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 2: 0%....50%....100% - 6 passes
+==PROF== Profiling "inBlockStep" - 3: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 4: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 5: 0%....50%....100% - 6 passes
+==PROF== Profiling "inBlockStep" - 6: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 7: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 8: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 9: 0%....50%....100% - 6 passes
+==PROF== Profiling "inBlockStep" - 10: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 11: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 12: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 13: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 14: 0%....50%....100% - 6 passes
+==PROF== Profiling "inBlockStep" - 15: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 16: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 17: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 18: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 19: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 20: 0%....50%....100% - 6 passes
+==PROF== Profiling "inBlockStep" - 21: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 22: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 23: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 24: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 25: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 26: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 27: 0%....50%....100% - 6 passes
+==PROF== Profiling "inBlockStep" - 28: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 29: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 30: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 31: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 32: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 33: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 34: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 35: 0%....50%....100% - 6 passes
+==PROF== Profiling "inBlockStep" - 36: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 37: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 38: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 39: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 40: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 41: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 42: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 43: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 44: 0%....50%....100% - 6 passes
+==PROF== Profiling "inBlockStep" - 45: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 46: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 47: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 48: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 49: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 50: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 51: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 52: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 53: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 54: 0%....50%....100% - 6 passes
+==PROF== Profiling "inBlockStep" - 55: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 56: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 57: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 58: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 59: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 60: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 61: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 62: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 63: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 64: 0%....50%....100% - 6 passes
+==PROF== Profiling "interBlockStep" - 65: 0%....50%....100% - 6 passes
+==PROF== Profiling "inBlockStep" - 66: 0%....50%....100% - 6 passes
+==PROF== Disconnected from process 23012
+[23012] bitonicCUDA@127.0.0.1
+ void prephase(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:31, Context 1, Stream 7
+ Section: Command line profiler metrics
+ ---------------------------------------------------------------------- --------------- ------------------------------
+ gpu__time_duration.sum msecond 1.20
+ l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
+ l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!)
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 186,368 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 186,368 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 186,368 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 2,981,888 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 111,954.62 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 112,106 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 111,827 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 1,791,274 + smsp__average_warp_latency_issue_stalled_barrier.pct % 644,639.76 + smsp__average_warp_latency_issue_stalled_barrier.ratio 6,446.40 + smsp__inst_executed.avg inst 1,030,883.66 + smsp__inst_executed.max inst 1,031,104 + smsp__inst_executed.min inst 1,030,650 + smsp__inst_executed.sum inst 65,976,554 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.50 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 1,667,322.50 + smsp__cycles_active.sum cycle 106,708,640 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:31, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.94 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,308.62 + smsp__inst_executed.max inst 12,930 + smsp__inst_executed.min inst 12,094 + smsp__inst_executed.sum inst 787,752 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,839.28 + smsp__cycles_active.sum cycle 4,725,714 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:31, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 231.58 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,673.56 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 20,005 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,385 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,777 + smsp__average_warp_latency_issue_stalled_barrier.pct % 123,076.57 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,230.77 + smsp__inst_executed.avg inst 189,293.97 + smsp__inst_executed.max inst 192,369 + smsp__inst_executed.min inst 186,352 + smsp__inst_executed.sum inst 12,114,814 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.78 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 316,320.25 + smsp__cycles_active.sum cycle 20,244,496 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:31, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.08 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.77 + smsp__inst_executed.max inst 12,539 + smsp__inst_executed.min inst 12,060 + smsp__inst_executed.sum inst 787,121 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,962.83 + smsp__cycles_active.sum cycle 4,541,621 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:31, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.71 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,309.31 + smsp__inst_executed.max inst 12,697 + smsp__inst_executed.min inst 11,822 + smsp__inst_executed.sum inst 787,796 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,821.19 + smsp__cycles_active.sum cycle 4,660,556 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:31, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 232.45 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,692.06 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 20,017 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,412 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 315,073 + smsp__average_warp_latency_issue_stalled_barrier.pct % 124,072.53 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,240.73 + smsp__inst_executed.avg inst 189,295.75 + smsp__inst_executed.max inst 192,417 + smsp__inst_executed.min inst 186,276 + smsp__inst_executed.sum inst 12,114,928 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.89 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 316,238.97 + smsp__cycles_active.sum cycle 20,239,294 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:31, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.37 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.42 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,292.88 + smsp__inst_executed.max inst 12,554 + smsp__inst_executed.min inst 11,832 + smsp__inst_executed.sum inst 786,744 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,835.05 + smsp__cycles_active.sum cycle 4,597,443 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:32, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.89 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.73 + smsp__inst_executed.max inst 12,788 + smsp__inst_executed.min inst 11,840 + smsp__inst_executed.sum inst 787,119 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,345.14 + smsp__cycles_active.sum cycle 4,566,089 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:32, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.71 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.23 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,308.28 + smsp__inst_executed.max inst 12,596 + smsp__inst_executed.min inst 11,926 + smsp__inst_executed.sum inst 787,730 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,894.73 + smsp__cycles_active.sum cycle 4,729,263 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:32, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 229.41 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,675.88 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,994 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,371 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,814 + smsp__average_warp_latency_issue_stalled_barrier.pct % 124,117.44 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,241.17 + smsp__inst_executed.avg inst 189,295.52 + smsp__inst_executed.max inst 192,256 + smsp__inst_executed.min inst 186,332 + smsp__inst_executed.sum inst 12,114,913 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.85 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 317,343.97 + smsp__cycles_active.sum cycle 20,310,014 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:32, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.11 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.78 + smsp__inst_executed.max inst 12,480 + smsp__inst_executed.min inst 11,928 + smsp__inst_executed.sum inst 786,610 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,846.03 + smsp__cycles_active.sum cycle 4,534,146 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:32, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.59 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.23 + smsp__inst_executed.max inst 12,604 + smsp__inst_executed.min inst 11,836 + smsp__inst_executed.sum inst 786,767 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,106.22 + smsp__cycles_active.sum cycle 4,550,798 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:32, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.11 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.37 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.36 + smsp__inst_executed.max inst 12,513 + smsp__inst_executed.min inst 11,712 + smsp__inst_executed.sum inst 787,095 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,202.59 + smsp__cycles_active.sum cycle 4,492,966 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:32, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.52 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,308.52 + smsp__inst_executed.max inst 12,682 + smsp__inst_executed.min inst 11,859 + smsp__inst_executed.sum inst 787,745 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,621.48 + smsp__cycles_active.sum cycle 4,711,775 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:32, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 229.09 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,686.12 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,974 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,355 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,978 + smsp__average_warp_latency_issue_stalled_barrier.pct % 124,010.98 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,240.11 + smsp__inst_executed.avg inst 189,293.06 + smsp__inst_executed.max inst 192,343 + smsp__inst_executed.min inst 186,209 + smsp__inst_executed.sum inst 12,114,756 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.81 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 317,996.44 + smsp__cycles_active.sum cycle 20,351,772 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:32, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.02 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.22 + smsp__inst_executed.max inst 12,699 + smsp__inst_executed.min inst 11,910 + smsp__inst_executed.sum inst 786,510 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,268.28 + smsp__cycles_active.sum cycle 4,561,170 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:32, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.24 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.45 + smsp__inst_executed.max inst 12,669 + smsp__inst_executed.min inst 11,950 + smsp__inst_executed.sum inst 786,589 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,528.27 + smsp__cycles_active.sum cycle 4,513,809 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:33, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.50 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.23 + smsp__inst_executed.max inst 12,648 + smsp__inst_executed.min inst 11,996 + smsp__inst_executed.sum inst 786,767 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,717.06 + smsp__cycles_active.sum cycle 4,525,892 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:33, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.02 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.44 + smsp__inst_executed.max inst 12,776 + smsp__inst_executed.min inst 11,972 + smsp__inst_executed.sum inst 787,100 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,171.77 + smsp__cycles_active.sum cycle 4,490,993 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:33, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.58 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,309.16 + smsp__inst_executed.max inst 12,776 + smsp__inst_executed.min inst 12,048 + smsp__inst_executed.sum inst 787,786 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,678.91 + smsp__cycles_active.sum cycle 4,715,450 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:33, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 228.96 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,689.06 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 20,011 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,382 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 315,025 + smsp__average_warp_latency_issue_stalled_barrier.pct % 125,081.44 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,250.81 + smsp__inst_executed.avg inst 189,292.86 + smsp__inst_executed.max inst 192,415 + smsp__inst_executed.min inst 186,212 + smsp__inst_executed.sum inst 12,114,743 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.96 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 316,855.81 + smsp__cycles_active.sum cycle 20,278,772 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:33, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.97 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.30 + smsp__inst_executed.max inst 12,684 + smsp__inst_executed.min inst 11,920 + smsp__inst_executed.sum inst 786,451 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,684.17 + smsp__cycles_active.sum cycle 4,715,787 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:33, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.02 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.70 + smsp__inst_executed.max inst 12,656 + smsp__inst_executed.min inst 11,904 + smsp__inst_executed.sum inst 786,477 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,366.16 + smsp__cycles_active.sum cycle 4,567,434 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:33, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.95 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.47 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.27 + smsp__inst_executed.max inst 12,693 + smsp__inst_executed.min inst 12,042 + smsp__inst_executed.sum inst 786,577 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,688.91 + smsp__cycles_active.sum cycle 4,524,090 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:33, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.53 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.42 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.41 + smsp__inst_executed.max inst 12,585 + smsp__inst_executed.min inst 11,776 + smsp__inst_executed.sum inst 786,778 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,063.67 + smsp__cycles_active.sum cycle 4,548,075 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:33, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.46 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.94 + smsp__inst_executed.max inst 12,684 + smsp__inst_executed.min inst 11,776 + smsp__inst_executed.sum inst 787,132 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,037.02 + smsp__cycles_active.sum cycle 4,546,369 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.74 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,309.58 + smsp__inst_executed.max inst 12,726 + smsp__inst_executed.min inst 12,072 + smsp__inst_executed.sum inst 787,813 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,656 + smsp__cycles_active.sum cycle 4,713,984 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 231.42 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,682.75 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,995 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,336 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,924 + smsp__average_warp_latency_issue_stalled_barrier.pct % 124,533.56 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,245.34 + smsp__inst_executed.avg inst 189,299.56 + smsp__inst_executed.max inst 192,317 + smsp__inst_executed.min inst 186,295 + smsp__inst_executed.sum inst 12,115,172 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.91 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 316,917.50 + smsp__cycles_active.sum cycle 20,282,720 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.56 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,287.72 + smsp__inst_executed.max inst 12,664 + smsp__inst_executed.min inst 11,916 + smsp__inst_executed.sum inst 786,414 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,857.64 + smsp__cycles_active.sum cycle 4,598,889 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.97 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.48 + smsp__inst_executed.max inst 12,684 + smsp__inst_executed.min inst 11,892 + smsp__inst_executed.sum inst 786,463 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,070.30 + smsp__cycles_active.sum cycle 4,676,499 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.24 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.53 + smsp__inst_executed.max inst 12,851 + smsp__inst_executed.min inst 11,908 + smsp__inst_executed.sum inst 786,530 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,288.86 + smsp__cycles_active.sum cycle 4,498,487 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.92 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.62 + smsp__inst_executed.max inst 12,632 + smsp__inst_executed.min inst 12,036 + smsp__inst_executed.sum inst 786,600 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,317.92 + smsp__cycles_active.sum cycle 4,564,347 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.53 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.17 + smsp__inst_executed.max inst 12,673 + smsp__inst_executed.min inst 11,880 + smsp__inst_executed.sum inst 786,763 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,760.25 + smsp__cycles_active.sum cycle 4,592,656 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.34 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.23 + smsp__inst_executed.max inst 12,716 + smsp__inst_executed.min inst 11,876 + smsp__inst_executed.sum inst 787,087 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,216.77 + smsp__cycles_active.sum cycle 4,557,873 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.39 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.23 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,308.69 + smsp__inst_executed.max inst 12,831 + smsp__inst_executed.min inst 11,757 + smsp__inst_executed.sum inst 787,756 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,196.75 + smsp__cycles_active.sum cycle 4,684,592 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 228.77 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,694.50 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,913 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,365 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 315,112 + smsp__average_warp_latency_issue_stalled_barrier.pct % 123,085.89 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,230.86 + smsp__inst_executed.avg inst 189,320.77 + smsp__inst_executed.max inst 192,334 + smsp__inst_executed.min inst 186,279 + smsp__inst_executed.sum inst 12,116,529 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.78 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 316,332 + smsp__cycles_active.sum cycle 20,245,248 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.72 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.30 + smsp__inst_executed.max inst 12,488 + smsp__inst_executed.min inst 11,916 + smsp__inst_executed.sum inst 786,451 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,574.31 + smsp__cycles_active.sum cycle 4,580,756 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.20 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.91 + smsp__inst_executed.max inst 12,672 + smsp__inst_executed.min inst 11,910 + smsp__inst_executed.sum inst 786,490 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,645.88 + smsp__cycles_active.sum cycle 4,585,336 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.29 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.03 + smsp__inst_executed.max inst 12,492 + smsp__inst_executed.min inst 11,892 + smsp__inst_executed.sum inst 786,498 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,041.34 + smsp__cycles_active.sum cycle 4,674,646 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.27 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.23 + smsp__inst_executed.max inst 12,678 + smsp__inst_executed.min inst 12,066 + smsp__inst_executed.sum inst 786,511 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,712.69 + smsp__cycles_active.sum cycle 4,525,612 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.21 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.47 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.75 + smsp__inst_executed.max inst 12,526 + smsp__inst_executed.min inst 12,050 + smsp__inst_executed.sum inst 786,544 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,762.81 + smsp__cycles_active.sum cycle 4,592,820 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.59 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.42 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.14 + smsp__inst_executed.max inst 12,740 + smsp__inst_executed.min inst 11,704 + smsp__inst_executed.sum inst 786,761 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,097.56 + smsp__cycles_active.sum cycle 4,614,244 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.05 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.39 + smsp__inst_executed.max inst 12,695 + smsp__inst_executed.min inst 11,854 + smsp__inst_executed.sum inst 787,097 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,856.70 + smsp__cycles_active.sum cycle 4,534,829 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.30 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.20 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,309.80 + smsp__inst_executed.max inst 12,711 + smsp__inst_executed.min inst 11,932 + smsp__inst_executed.sum inst 787,827 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,244.53 + smsp__cycles_active.sum cycle 4,623,650 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 228.90 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,686.06 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,967 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,333 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,977 + smsp__average_warp_latency_issue_stalled_barrier.pct % 124,987.84 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,249.88 + smsp__inst_executed.avg inst 189,274.94 + smsp__inst_executed.max inst 192,335 + smsp__inst_executed.min inst 186,200 + smsp__inst_executed.sum inst 12,113,596 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.97 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 316,573.25 + smsp__cycles_active.sum cycle 20,260,688 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.42 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,287.78 + smsp__inst_executed.max inst 12,852 + smsp__inst_executed.min inst 11,520 + smsp__inst_executed.sum inst 786,418 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 75,112.75 + smsp__cycles_active.sum cycle 4,807,216 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.91 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.53 + smsp__inst_executed.max inst 12,679 + smsp__inst_executed.min inst 11,900 + smsp__inst_executed.sum inst 786,466 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,599.36 + smsp__cycles_active.sum cycle 4,646,359 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.59 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.06 + smsp__inst_executed.max inst 12,478 + smsp__inst_executed.min inst 12,100 + smsp__inst_executed.sum inst 786,436 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,386.41 + smsp__cycles_active.sum cycle 4,568,730 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.22 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.20 + smsp__inst_executed.max inst 12,496 + smsp__inst_executed.min inst 11,920 + smsp__inst_executed.sum inst 786,509 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,866.02 + smsp__cycles_active.sum cycle 4,727,425 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.59 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.89 + smsp__inst_executed.max inst 12,675 + smsp__inst_executed.min inst 11,914 + smsp__inst_executed.sum inst 786,553 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,482.77 + smsp__cycles_active.sum cycle 4,574,897 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.69 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,291.39 + smsp__inst_executed.max inst 12,501 + smsp__inst_executed.min inst 12,038 + smsp__inst_executed.sum inst 786,649 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,637.31 + smsp__cycles_active.sum cycle 4,584,788 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.56 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,294.56 + smsp__inst_executed.max inst 12,543 + smsp__inst_executed.min inst 11,712 + smsp__inst_executed.sum inst 786,852 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,770.67 + smsp__cycles_active.sum cycle 4,593,323 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.05 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,301.70 + smsp__inst_executed.max inst 12,527 + smsp__inst_executed.min inst 12,071 + smsp__inst_executed.sum inst 787,309 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,540.92 + smsp__cycles_active.sum cycle 4,578,619 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.23 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.23 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,314.02 + smsp__inst_executed.max inst 12,699 + smsp__inst_executed.min inst 11,912 + smsp__inst_executed.sum inst 788,097 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,509.19 + smsp__cycles_active.sum cycle 4,640,588 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 232.10 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,689.25 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 20,069 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,389 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 315,028 + smsp__average_warp_latency_issue_stalled_barrier.pct % 124,193.57 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,241.94 + smsp__inst_executed.avg inst 189,278.17 + smsp__inst_executed.max inst 192,324 + smsp__inst_executed.min inst 186,272 + smsp__inst_executed.sum inst 12,113,803 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.87 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 316,996.25 + smsp__cycles_active.sum cycle 20,287,760 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 56.90 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,287.95 + smsp__inst_executed.max inst 12,680 + smsp__inst_executed.min inst 11,896 + smsp__inst_executed.sum inst 786,429 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 68,417.62 + smsp__cycles_active.sum cycle 4,378,728 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.45 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,287.66 + smsp__inst_executed.max inst 12,672 + smsp__inst_executed.min inst 11,908 + smsp__inst_executed.sum inst 786,410 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 74,576.05 + smsp__cycles_active.sum cycle 4,772,867 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.85 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,287.80 + smsp__inst_executed.max inst 12,492 + smsp__inst_executed.min inst 12,088 + smsp__inst_executed.sum inst 786,419 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,754.67 + smsp__cycles_active.sum cycle 4,656,299 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.85 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,287.94 + smsp__inst_executed.max inst 12,660 + smsp__inst_executed.min inst 12,090 + smsp__inst_executed.sum inst 786,428 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,405.38 + smsp__cycles_active.sum cycle 4,633,944 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:37, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.90 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.11 + smsp__inst_executed.max inst 12,516 + smsp__inst_executed.min inst 11,872 + smsp__inst_executed.sum inst 786,503 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 74,029.31 + smsp__cycles_active.sum cycle 4,737,876 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:37, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.08 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.77 + smsp__inst_executed.max inst 12,648 + smsp__inst_executed.min inst 11,890 + smsp__inst_executed.sum inst 786,481 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,229.30 + smsp__cycles_active.sum cycle 4,494,675 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:37, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.27 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.17 + smsp__inst_executed.max inst 12,491 + smsp__inst_executed.min inst 12,062 + smsp__inst_executed.sum inst 786,571 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,537.89 + smsp__cycles_active.sum cycle 4,578,425 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:37, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.46 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.42 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.97 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,294.28 + smsp__inst_executed.max inst 12,521 + smsp__inst_executed.min inst 12,037 + smsp__inst_executed.sum inst 786,834 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,773.67 + smsp__cycles_active.sum cycle 4,593,515 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:37, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.95 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.47 + smsp__inst_executed.max inst 12,737 + smsp__inst_executed.min inst 11,886 + smsp__inst_executed.sum inst 787,102 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,021.17 + smsp__cycles_active.sum cycle 4,545,355 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:37, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.26 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,309.88 + smsp__inst_executed.max inst 12,759 + smsp__inst_executed.min inst 12,026 + smsp__inst_executed.sum inst 787,832 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,721.27 + smsp__cycles_active.sum cycle 4,654,161 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:37, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 231.97 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,679.75 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,990 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,275 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,876 + smsp__average_warp_latency_issue_stalled_barrier.pct % 125,400.79 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,254.01 + smsp__inst_executed.avg inst 189,032.33 + smsp__inst_executed.max inst 192,028 + smsp__inst_executed.min inst 186,044 + smsp__inst_executed.sum inst 12,098,069 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 13.03 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 316,085.12 + smsp__cycles_active.sum cycle 20,229,448 + ---------------------------------------------------------------------- --------------- ------------------------------ + diff --git a/homework_3/analyse/b31ca23/Pending-PIDs b/homework_3/analyse/b31ca23/Pending-PIDs new file mode 100644 index 0000000..3dfa31b --- /dev/null +++ b/homework_3/analyse/b31ca23/Pending-PIDs @@ -0,0 +1,45 @@ +[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q20.sh +Submitted batch job 1914456 +[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q21.sh +Submitted batch job 1914457 +[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q22.sh +Submitted batch job 1914458 +[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q23.sh +Submitted batch job 1914459 +[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q24.sh +Submitted batch job 1914460 +[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q25.sh +Submitted batch job 1914461 +[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q26.sh +Submitted batch job 1914462 +[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q27.sh +Submitted batch job 1914463 +[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q28.sh +Submitted batch job 1914464 +[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q29.sh +Submitted batch job 1914465 +[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV0Q30.sh +Submitted batch job 1914466 +[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q20.sh +Submitted batch job 1914467 +[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q21.sh +Submitted batch job 1914468 +[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q22.sh +Submitted batch job 1914469 +[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q23.sh +Submitted batch job 1914470 +[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q24.sh +Submitted batch job 1914471 +[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere 
hpc/BitncV1Q25.sh
+Submitted batch job 1914472
+[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q26.sh
+Submitted batch job 1914473
+[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q27.sh
+Submitted batch job 1914474
+[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q28.sh
+Submitted batch job 1914475
+[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q29.sh
+Submitted batch job 1914476
+[cchoutou@aristotle6 homework_3]$ sbatch --qos=small -p ampere hpc/BitncV1Q30.sh
+Submitted batch job 1914477
+
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q20-1914456.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q20-1914456.out
new file mode 100644
index 0000000..531bf7b
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q20-1914456.out
@@ -0,0 +1,2 @@
+[Timing] Total: 5920 [usec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q21-1914457.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q21-1914457.out
new file mode 100644
index 0000000..f2b9a27
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q21-1914457.out
@@ -0,0 +1,2 @@
+[Timing] Total: 6571 [usec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q22-1914458.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q22-1914458.out
new file mode 100644
index 0000000..7df3192
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q22-1914458.out
@@ -0,0 +1,2 @@
+[Timing] Total: 13 [msec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q23-1914459.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q23-1914459.out
new file mode 100644
index 0000000..d225a93
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q23-1914459.out
@@ -0,0 +1,2 @@
+[Timing] Total: 24 [msec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q24-1914460.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q24-1914460.out
new file mode 100644
index 0000000..0eb8356
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q24-1914460.out
@@ -0,0 +1,2 @@
+[Timing] Total: 46 [msec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q25-1914461.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q25-1914461.out
new file mode 100644
index 0000000..bdc2dfe
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q25-1914461.out
@@ -0,0 +1,2 @@
+[Timing] Total: 92 [msec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q26-1914462.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q26-1914462.out
new file mode 100644
index 0000000..a5246d8
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q26-1914462.out
@@ -0,0 +1,2 @@
+[Timing] Total: 213 [msec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q27-1914463.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q27-1914463.out
new file mode 100644
index 0000000..f13d2f8
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q27-1914463.out
@@ -0,0 +1,2 @@
+[Timing] Total: 440 [msec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q28-1914464.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q28-1914464.out
new file mode 100644
index 0000000..f14bee6
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q28-1914464.out
@@ -0,0 +1,2 @@
+[Timing] Total: 935 [msec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q29-1914465.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q29-1914465.out
new file mode 100644
index 0000000..587cf5b
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q29-1914465.out
@@ -0,0 +1,2 @@
+[Timing] Total: 1847 [msec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q30-1914466.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q30-1914466.out
new file mode 100644
index 0000000..0e6eb6c
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV0Q30-1914466.out
@@ -0,0 +1,2 @@
+[Timing] Total: 3798 [msec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q20-1914467.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q20-1914467.out
new file mode 100644
index 0000000..a7dcac6
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q20-1914467.out
@@ -0,0 +1,2 @@
+[Timing] Total: 2843 [usec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q21-1914468.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q21-1914468.out
new file mode 100644
index 0000000..5a9a5ff
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q21-1914468.out
@@ -0,0 +1,2 @@
+[Timing] Total: 4979 [usec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q22-1914469.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q22-1914469.out
new file mode 100644
index 0000000..930ad44
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q22-1914469.out
@@ -0,0 +1,2 @@
+[Timing] Total: 9909 [usec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q23-1914470.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q23-1914470.out
new file mode 100644
index 0000000..86ec2cd
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q23-1914470.out
@@ -0,0 +1,2 @@
+[Timing] Total: 20 [msec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q24-1914471.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q24-1914471.out
new file mode 100644
index 0000000..3109f65
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q24-1914471.out
@@ -0,0 +1,2 @@
+[Timing] Total: 35 [msec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q25-1914472.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q25-1914472.out
new file mode 100644
index 0000000..b2de5c8
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q25-1914472.out
@@ -0,0 +1,2 @@
+[Timing] Total: 70 [msec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q26-1914473.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q26-1914473.out
new file mode 100644
index 0000000..5a5ebd2
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q26-1914473.out
@@ -0,0 +1,2 @@
+[Timing] Total: 170 [msec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q27-1914474.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q27-1914474.out
new file mode 100644
index 0000000..cd85834
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q27-1914474.out
@@ -0,0 +1,2 @@
+[Timing] Total: 346 [msec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q28-1914475.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q28-1914475.out
new file mode 100644
index 0000000..0136e52
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q28-1914475.out
@@ -0,0 +1,2 @@
+[Timing] Total: 735 [msec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q29-1914476.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q29-1914476.out
new file mode 100644
index 0000000..34b8260
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q29-1914476.out
@@ -0,0 +1,2 @@
+[Timing] Total: 1522 [msec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q30-1914477.out b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q30-1914477.out
new file mode 100644
index 0000000..fbddd80
--- /dev/null
+++ b/homework_3/analyse/b31ca23/slurm-ampere-BitncV1Q30-1914477.out
@@ -0,0 +1,2 @@
+[Timing] Total: 2950 [msec]
+[Validation] Results validation ... [PASSED]
diff --git a/homework_3/analyse/prof.sh b/homework_3/analyse/prof.sh
new file mode 100755
index 0000000..3adfcd3
--- /dev/null
+++ b/homework_3/analyse/prof.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+#
+# prof.sh
+#
+
+sudo /usr/local/cuda-11.4/bin/ncu \
+  --target-processes all \
+  --metrics "$(echo -n \
+"smsp__inst_executed,"\
+"smsp__cycles_active.avg,"\
+"smsp__cycles_active.sum,"\
+"gpu__time_duration.sum,"\
+"smsp__average_warp_latency_issue_stalled_barrier,"\
+"smsp__warp_issue_stalled_barrier_per_warp_active,"\
+"l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld,"\
+"l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st,"\
+"l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read,"\
+"l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write,"\
+"l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum,"\
+"l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum "\
+  )" \
+  "$1" -q 20 -b 512 > "$2"
diff --git a/homework_3/hpc/submitJobs.sh b/homework_3/hpc/submitJobs.sh
index d717498..0b6ed8f 100755
--- a/homework_3/hpc/submitJobs.sh
+++ b/homework_3/hpc/submitJobs.sh
@@ -2,8 +2,8 @@
 # Submission parameters
 QOS="small"
-PARTITION="ampere"
-SCRIPT_DIR="hpc" # Directory containing the job scripts
+PARTITION="ampere" # ampere gpu
+SCRIPT_DIR="hpc" # Directory containing the job scripts
 # Range of values for the -q parameter
 VERSIONS=("V0" "V1" "V2")
@@ -17,8 +17,9 @@
 for version in "${VERSIONS[@]}"; do
     script_path="${SCRIPT_DIR}/${script_name}"
     if [[ -f "$script_path" ]]; then
+      echo "Submitting: $script_path"
       sbatch --qos="$QOS" -p "$PARTITION" "$script_path"
-      echo "Submitted: $script_path"
+      #sbatch -p "$PARTITION" "$script_path"
     else
       echo "Warning: File not found - $script_path"
     fi
diff --git a/homework_3/reportv1.3 b/homework_3/reportv1.3
new file mode 100644
index 0000000..1e77f44
--- /dev/null
+++ b/homework_3/reportv1.3
@@ -0,0 +1,1917 @@
+==PROF== Connected to process 19677 (/home/hoo2/Work/AUTH/PDS/homework_3/out/v1/bitonicCUDA)
+==PROF== Profiling "prephase" - 1: 0%....50%....100% - 5 passes
+==PROF== Profiling "interBlockStep" - 2: 0%....50%....100% - 5 passes
+==PROF== Profiling "inBlockStep" - 3: 0%....50%....100% - 5 passes
+==PROF== Profiling "interBlockStep" - 4: 0%....50%....100% - 5 passes
+==PROF== Profiling "interBlockStep" - 5: 0%....50%....100% - 5 passes
+==PROF== Profiling "inBlockStep" - 6: 0%....50%....100% - 5 passes
+==PROF== Profiling "interBlockStep" - 7: 0%....50%....100% - 5 passes
+==PROF== Profiling "interBlockStep" - 8: 0%....50%....100% - 5 passes
+==PROF== Profiling "interBlockStep" - 9: 0%....50%....100% - 5 passes
+==PROF== Profiling "inBlockStep" - 10: 0%....50%....100% - 5 passes
+==PROF== Profiling "interBlockStep" - 11: 0%....50%....100% - 5 passes
+==PROF== Profiling "interBlockStep" - 12: 0%....50%....100% - 5 passes
+==PROF== Profiling "interBlockStep" - 13: 0%....50%....100% - 5 passes
+==PROF== Profiling "interBlockStep" - 14: 0%....50%....100% - 5 passes
+==PROF== Profiling "inBlockStep" - 15: 0%....50%....100% - 5 passes
+==PROF== Profiling "interBlockStep" - 16: 0%....50%....100% - 5 passes
+==PROF== Profiling "interBlockStep" - 17: 0%....50%....100% - 5 passes
+==PROF== Profiling "interBlockStep" - 18: 0%....50%....100% - 5 passes
+==PROF== Profiling "interBlockStep" - 19: 0%....50%....100% - 5 passes
+==PROF== Profiling "interBlockStep" - 20: 0%....50%....100% - 5 passes
+==PROF== Profiling "inBlockStep" - 21: 0%....50%....100% - 5 passes
+==PROF== Profiling "interBlockStep" - 22: 0%....50%....100% - 5
passes +==PROF== Profiling "interBlockStep" - 23: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 24: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 25: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 26: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 27: 0%....50%....100% - 5 passes +==PROF== Profiling "inBlockStep" - 28: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 29: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 30: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 31: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 32: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 33: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 34: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 35: 0%....50%....100% - 5 passes +==PROF== Profiling "inBlockStep" - 36: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 37: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 38: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 39: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 40: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 41: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 42: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 43: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 44: 0%....50%....100% - 5 passes +==PROF== Profiling "inBlockStep" - 45: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 46: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 47: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 48: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 49: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 50: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 51: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 52: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 53: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 54: 0%....50%....100% - 5 passes +==PROF== Profiling "inBlockStep" - 55: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 56: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 57: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 58: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 59: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 60: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 61: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 62: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 63: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 64: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 65: 0%....50%....100% - 5 passes +==PROF== Profiling "inBlockStep" - 66: 0%....50%....100% - 5 passes +==PROF== Disconnected from process 19677 +[19677] bitonicCUDA@127.0.0.1 + void prephase(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:57, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum msecond 1.06 + 
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.22 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 1,054,215.71 + smsp__average_warp_latency_issue_stalled_barrier.ratio 10,542.16 + smsp__inst_executed.avg inst 770,278.16 + smsp__inst_executed.max inst 770,517 + smsp__inst_executed.min inst 770,078 + smsp__inst_executed.sum inst 49,297,802 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 23.29 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.23 + smsp__cycles_active.avg cycle 1,464,763.30 + smsp__cycles_active.sum cycle 93,744,851 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:57, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.59 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,434.70 + smsp__inst_executed.max inst 12,627 + smsp__inst_executed.min inst 12,202 + smsp__inst_executed.sum inst 795,821 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,275.05 + smsp__cycles_active.sum cycle 4,625,603 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:57, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 185.54 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 10.83 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.47 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 160,167.14 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,601.67 + smsp__inst_executed.avg inst 132,203.41 + smsp__inst_executed.max inst 134,386 + smsp__inst_executed.min inst 130,079 + smsp__inst_executed.sum inst 8,461,018 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 20.86 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.21 + smsp__cycles_active.avg cycle 253,150.12 + smsp__cycles_active.sum cycle 16,201,608 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:58, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.34 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,300.38 + smsp__inst_executed.max inst 12,564 + smsp__inst_executed.min inst 12,036 + smsp__inst_executed.sum inst 787,224 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,561.48 + smsp__cycles_active.sum cycle 4,579,935 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:58, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.62 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,308.33 + smsp__inst_executed.max inst 12,555 + smsp__inst_executed.min inst 12,038 + smsp__inst_executed.sum inst 787,733 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,921.98 + smsp__cycles_active.sum cycle 4,667,007 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:58, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 183.49 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 10.83 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.47 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 160,010.27 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,600.10 + smsp__inst_executed.avg inst 132,209.20 + smsp__inst_executed.max inst 134,250 + smsp__inst_executed.min inst 130,144 + smsp__inst_executed.sum inst 8,461,389 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 20.92 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.21 + smsp__cycles_active.avg cycle 252,547.31 + smsp__cycles_active.sum cycle 16,163,028 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:58, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.59 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,294.86 + smsp__inst_executed.max inst 12,694 + smsp__inst_executed.min inst 12,054 + smsp__inst_executed.sum inst 786,871 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,596.58 + smsp__cycles_active.sum cycle 4,582,181 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:58, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.98 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,299 + smsp__inst_executed.max inst 12,638 + smsp__inst_executed.min inst 11,881 + smsp__inst_executed.sum inst 787,136 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,894.47 + smsp__cycles_active.sum cycle 4,601,246 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:58, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.53 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,309.91 + smsp__inst_executed.max inst 12,636 + smsp__inst_executed.min inst 11,910 + smsp__inst_executed.sum inst 787,834 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,313.89 + smsp__cycles_active.sum cycle 4,564,089 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:58, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 184.90 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 10.83 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.47 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 158,555.84 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,585.56 + smsp__inst_executed.avg inst 132,207.33 + smsp__inst_executed.max inst 134,301 + smsp__inst_executed.min inst 130,116 + smsp__inst_executed.sum inst 8,461,269 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 20.73 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.21 + smsp__cycles_active.avg cycle 252,473.81 + smsp__cycles_active.sum cycle 16,158,324 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:58, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.98 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,291.09 + smsp__inst_executed.max inst 12,593 + smsp__inst_executed.min inst 11,856 + smsp__inst_executed.sum inst 786,630 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,576.33 + smsp__cycles_active.sum cycle 4,516,885 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:58, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.18 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.56 + smsp__inst_executed.max inst 12,684 + smsp__inst_executed.min inst 11,908 + smsp__inst_executed.sum inst 786,788 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,507.47 + smsp__cycles_active.sum cycle 4,576,478 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:58, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.73 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.97 + smsp__inst_executed.max inst 12,689 + smsp__inst_executed.min inst 11,912 + smsp__inst_executed.sum inst 787,134 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,018.25 + smsp__cycles_active.sum cycle 4,545,168 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:58, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.72 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,308.84 + smsp__inst_executed.max inst 12,686 + smsp__inst_executed.min inst 12,079 + smsp__inst_executed.sum inst 787,766 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,024.20 + smsp__cycles_active.sum cycle 4,609,549 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:59, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 185.95 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 10.83 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.47 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 157,276.34 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,572.76 + smsp__inst_executed.avg inst 132,205.28 + smsp__inst_executed.max inst 134,358 + smsp__inst_executed.min inst 130,024 + smsp__inst_executed.sum inst 8,461,138 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 20.55 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.21 + smsp__cycles_active.avg cycle 252,593.19 + smsp__cycles_active.sum cycle 16,165,964 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:59, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.30 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.47 + smsp__inst_executed.max inst 12,560 + smsp__inst_executed.min inst 12,088 + smsp__inst_executed.sum inst 786,526 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,795.55 + smsp__cycles_active.sum cycle 4,530,915 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:59, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.76 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.78 + smsp__inst_executed.max inst 12,745 + smsp__inst_executed.min inst 11,874 + smsp__inst_executed.sum inst 786,610 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,441.03 + smsp__cycles_active.sum cycle 4,508,226 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:59, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.56 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.70 + smsp__inst_executed.max inst 12,566 + smsp__inst_executed.min inst 12,056 + smsp__inst_executed.sum inst 786,797 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,597.62 + smsp__cycles_active.sum cycle 4,582,248 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:59, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.95 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,299.03 + smsp__inst_executed.max inst 12,648 + smsp__inst_executed.min inst 11,910 + smsp__inst_executed.sum inst 787,138 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,625.34 + smsp__cycles_active.sum cycle 4,520,022 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:59, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.53 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,308.55 + smsp__inst_executed.max inst 12,690 + smsp__inst_executed.min inst 12,090 + smsp__inst_executed.sum inst 787,747 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,911.34 + smsp__cycles_active.sum cycle 4,602,326 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:59, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 184.93 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 10.83 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.47 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 159,654.44 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,596.54 + smsp__inst_executed.avg inst 132,204.97 + smsp__inst_executed.max inst 134,424 + smsp__inst_executed.min inst 129,985 + smsp__inst_executed.sum inst 8,461,118 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 20.86 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.21 + smsp__cycles_active.avg cycle 252,486.12 + smsp__cycles_active.sum cycle 16,159,112 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:59, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.24 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.47 + smsp__inst_executed.max inst 12,834 + smsp__inst_executed.min inst 11,932 + smsp__inst_executed.sum inst 786,526 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,577.20 + smsp__cycles_active.sum cycle 4,516,941 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:59, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.24 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.45 + smsp__inst_executed.max inst 12,702 + smsp__inst_executed.min inst 11,912 + smsp__inst_executed.sum inst 786,525 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,559.42 + smsp__cycles_active.sum cycle 4,579,803 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:59, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.02 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,291.50 + smsp__inst_executed.max inst 12,638 + smsp__inst_executed.min inst 12,088 + smsp__inst_executed.sum inst 786,656 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,387.86 + smsp__cycles_active.sum cycle 4,568,823 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:35:59, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.56 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.50 + smsp__inst_executed.max inst 12,785 + smsp__inst_executed.min inst 11,630 + smsp__inst_executed.sum inst 786,784 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,530.47 + smsp__cycles_active.sum cycle 4,577,950 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:00, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.98 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.41 + smsp__inst_executed.max inst 12,716 + smsp__inst_executed.min inst 11,883 + smsp__inst_executed.sum inst 787,098 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,755.27 + smsp__cycles_active.sum cycle 4,528,337 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:00, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.53 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,310.11 + smsp__inst_executed.max inst 12,496 + smsp__inst_executed.min inst 11,901 + smsp__inst_executed.sum inst 787,847 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,085.02 + smsp__cycles_active.sum cycle 4,613,441 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:00, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 185.02 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 10.83 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.47 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 158,201.12 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,582.01 + smsp__inst_executed.avg inst 132,195.78 + smsp__inst_executed.max inst 134,319 + smsp__inst_executed.min inst 130,101 + smsp__inst_executed.sum inst 8,460,530 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 20.71 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.21 + smsp__cycles_active.avg cycle 251,923.50 + smsp__cycles_active.sum cycle 16,123,104 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:00, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.66 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.02 + smsp__inst_executed.max inst 12,668 + smsp__inst_executed.min inst 11,912 + smsp__inst_executed.sum inst 786,497 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,635.66 + smsp__cycles_active.sum cycle 4,648,682 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:00, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.37 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.31 + smsp__inst_executed.max inst 12,508 + smsp__inst_executed.min inst 11,924 + smsp__inst_executed.sum inst 786,452 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,858 + smsp__cycles_active.sum cycle 4,534,912 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:00, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.30 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.89 + smsp__inst_executed.max inst 12,659 + smsp__inst_executed.min inst 11,942 + smsp__inst_executed.sum inst 786,489 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,747.28 + smsp__cycles_active.sum cycle 4,527,826 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:00, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.98 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,291.03 + smsp__inst_executed.max inst 12,683 + smsp__inst_executed.min inst 11,982 + smsp__inst_executed.sum inst 786,626 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,881.03 + smsp__cycles_active.sum cycle 4,536,386 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:00, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.43 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.31 + smsp__inst_executed.max inst 12,752 + smsp__inst_executed.min inst 11,612 + smsp__inst_executed.sum inst 786,772 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,166.55 + smsp__cycles_active.sum cycle 4,554,659 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:00, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.05 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.28 + smsp__inst_executed.max inst 12,667 + smsp__inst_executed.min inst 11,870 + smsp__inst_executed.sum inst 787,090 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,893.25 + smsp__cycles_active.sum cycle 4,537,168 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:00, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.43 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,311.05 + smsp__inst_executed.max inst 12,751 + smsp__inst_executed.min inst 12,075 + smsp__inst_executed.sum inst 787,907 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,743.69 + smsp__cycles_active.sum cycle 4,591,596 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:00, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 185.66 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 10.83 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.47 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 161,553.58 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,615.54 + smsp__inst_executed.avg inst 132,193.28 + smsp__inst_executed.max inst 134,294 + smsp__inst_executed.min inst 130,087 + smsp__inst_executed.sum inst 8,460,370 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 21.09 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.21 + smsp__cycles_active.avg cycle 252,649.62 + smsp__cycles_active.sum cycle 16,169,576 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:00, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.17 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.12 + smsp__inst_executed.max inst 12,484 + smsp__inst_executed.min inst 12,084 + smsp__inst_executed.sum inst 786,504 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,022.83 + smsp__cycles_active.sum cycle 4,609,461 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:01, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.82 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.45 + smsp__inst_executed.max inst 12,672 + smsp__inst_executed.min inst 11,900 + smsp__inst_executed.sum inst 786,461 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,587.06 + smsp__cycles_active.sum cycle 4,581,572 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:01, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.53 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.42 + smsp__inst_executed.max inst 12,632 + smsp__inst_executed.min inst 12,096 + smsp__inst_executed.sum inst 786,459 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,582.89 + smsp__cycles_active.sum cycle 4,517,305 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:01, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.02 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.80 + smsp__inst_executed.max inst 12,500 + smsp__inst_executed.min inst 11,924 + smsp__inst_executed.sum inst 786,483 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,332.38 + smsp__cycles_active.sum cycle 4,565,272 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:01, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.92 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.08 + smsp__inst_executed.max inst 12,636 + smsp__inst_executed.min inst 11,868 + smsp__inst_executed.sum inst 786,565 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,497.30 + smsp__cycles_active.sum cycle 4,575,827 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:01, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.43 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,292.33 + smsp__inst_executed.max inst 12,709 + smsp__inst_executed.min inst 11,780 + smsp__inst_executed.sum inst 786,709 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,223.28 + smsp__cycles_active.sum cycle 4,622,290 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:01, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.08 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,296.56 + smsp__inst_executed.max inst 12,676 + smsp__inst_executed.min inst 11,885 + smsp__inst_executed.sum inst 786,980 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,705.17 + smsp__cycles_active.sum cycle 4,525,131 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:01, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.72 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,311.58 + smsp__inst_executed.max inst 12,710 + smsp__inst_executed.min inst 11,851 + smsp__inst_executed.sum inst 787,941 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,827.20 + smsp__cycles_active.sum cycle 4,596,941 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:01, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 185.02 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 10.83 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.47 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 159,138.76 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,591.39 + smsp__inst_executed.avg inst 132,167.45 + smsp__inst_executed.max inst 134,248 + smsp__inst_executed.min inst 130,050 + smsp__inst_executed.sum inst 8,458,717 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 20.84 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.21 + smsp__cycles_active.avg cycle 251,922.81 + smsp__cycles_active.sum cycle 16,123,060 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:01, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.58 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,287.83 + smsp__inst_executed.max inst 12,872 + smsp__inst_executed.min inst 11,524 + smsp__inst_executed.sum inst 786,421 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,310.11 + smsp__cycles_active.sum cycle 4,691,847 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:01, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.10 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.67 + smsp__inst_executed.max inst 12,488 + smsp__inst_executed.min inst 12,092 + smsp__inst_executed.sum inst 786,475 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,120.91 + smsp__cycles_active.sum cycle 4,615,738 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:01, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.85 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.33 + smsp__inst_executed.max inst 12,500 + smsp__inst_executed.min inst 11,728 + smsp__inst_executed.sum inst 786,453 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,726.75 + smsp__cycles_active.sum cycle 4,590,512 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:02, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.62 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.86 + smsp__inst_executed.max inst 12,522 + smsp__inst_executed.min inst 11,924 + smsp__inst_executed.sum inst 786,487 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,432.41 + smsp__cycles_active.sum cycle 4,571,674 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:02, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.18 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.25 + smsp__inst_executed.max inst 12,672 + smsp__inst_executed.min inst 11,898 + smsp__inst_executed.sum inst 786,512 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,698.97 + smsp__cycles_active.sum cycle 4,588,734 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:02, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.95 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.50 + smsp__inst_executed.max inst 12,484 + smsp__inst_executed.min inst 12,008 + smsp__inst_executed.sum inst 786,592 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,943.89 + smsp__cycles_active.sum cycle 4,604,409 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:02, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.40 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.12 + smsp__inst_executed.max inst 12,713 + smsp__inst_executed.min inst 11,621 + smsp__inst_executed.sum inst 786,760 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,649.19 + smsp__cycles_active.sum cycle 4,585,548 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:02, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.05 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.70 + smsp__inst_executed.max inst 12,725 + smsp__inst_executed.min inst 11,966 + smsp__inst_executed.sum inst 787,117 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,102.67 + smsp__cycles_active.sum cycle 4,550,571 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:02, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.50 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,309.14 + smsp__inst_executed.max inst 12,737 + smsp__inst_executed.min inst 12,018 + smsp__inst_executed.sum inst 787,785 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,724.20 + smsp__cycles_active.sum cycle 4,590,349 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:02, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 185.76 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 10.83 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.47 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 159,061.95 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,590.62 + smsp__inst_executed.avg inst 132,213.64 + smsp__inst_executed.max inst 134,321 + smsp__inst_executed.min inst 130,119 + smsp__inst_executed.sum inst 8,461,673 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 20.80 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.21 + smsp__cycles_active.avg cycle 252,290.12 + smsp__cycles_active.sum cycle 16,146,568 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:02, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 56.96 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.03 + smsp__inst_executed.max inst 12,684 + smsp__inst_executed.min inst 12,072 + smsp__inst_executed.sum inst 786,434 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 68,948.02 + smsp__cycles_active.sum cycle 4,412,673 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:02, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.71 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.23 + smsp__inst_executed.max inst 12,712 + smsp__inst_executed.min inst 11,696 + smsp__inst_executed.sum inst 786,447 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,574.20 + smsp__cycles_active.sum cycle 4,644,749 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:02, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.01 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.03 + smsp__inst_executed.max inst 12,668 + smsp__inst_executed.min inst 12,068 + smsp__inst_executed.sum inst 786,434 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,415.67 + smsp__cycles_active.sum cycle 4,634,603 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:02, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.14 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.58 + smsp__inst_executed.max inst 12,676 + smsp__inst_executed.min inst 11,938 + smsp__inst_executed.sum inst 786,469 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,550.84 + smsp__cycles_active.sum cycle 4,579,254 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:03, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.34 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.86 + smsp__inst_executed.max inst 12,476 + smsp__inst_executed.min inst 12,078 + smsp__inst_executed.sum inst 786,487 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,923.06 + smsp__cycles_active.sum cycle 4,539,076 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:03, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.14 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.56 + smsp__inst_executed.max inst 12,503 + smsp__inst_executed.min inst 11,928 + smsp__inst_executed.sum inst 786,532 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,782.67 + smsp__cycles_active.sum cycle 4,530,091 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:03, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.79 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.62 + smsp__inst_executed.max inst 12,556 + smsp__inst_executed.min inst 12,068 + smsp__inst_executed.sum inst 786,600 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,737.78 + smsp__cycles_active.sum cycle 4,527,218 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:03, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.75 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,294.31 + smsp__inst_executed.max inst 12,661 + smsp__inst_executed.min inst 11,903 + smsp__inst_executed.sum inst 786,836 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,611.56 + smsp__cycles_active.sum cycle 4,583,140 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:03, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.89 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,300.94 + smsp__inst_executed.max inst 12,703 + smsp__inst_executed.min inst 11,887 + smsp__inst_executed.sum inst 787,260 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,911.81 + smsp__cycles_active.sum cycle 4,538,356 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:03, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.11 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,305.08 + smsp__inst_executed.max inst 12,731 + smsp__inst_executed.min inst 11,780 + smsp__inst_executed.sum inst 787,525 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,733.38 + smsp__cycles_active.sum cycle 4,590,936 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:36:03, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 184.58 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 10.83 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.47 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 161,368.32 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,613.68 + smsp__inst_executed.avg inst 131,997.05 + smsp__inst_executed.max inst 134,093 + smsp__inst_executed.min inst 129,868 + smsp__inst_executed.sum inst 8,447,811 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 21.12 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.21 + smsp__cycles_active.avg cycle 251,939.98 + smsp__cycles_active.sum cycle 16,124,159 + ---------------------------------------------------------------------- --------------- ------------------------------ + diff --git a/homework_3/reportv2.3 b/homework_3/reportv2.3 new file mode 100644 index 0000000..1f50b79 --- /dev/null +++ b/homework_3/reportv2.3 @@ -0,0 +1,1917 @@ +==PROF== Connected to process 20279 (/home/hoo2/Work/AUTH/PDS/homework_3/out/v2/bitonicCUDA) +==PROF== Profiling "prephase" - 1: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 2: 0%....50%....100% - 5 passes +==PROF== Profiling "inBlockStep" - 3: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 4: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 5: 
0%....50%....100% - 5 passes +==PROF== Profiling "inBlockStep" - 6: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 7: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 8: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 9: 0%....50%....100% - 5 passes +==PROF== Profiling "inBlockStep" - 10: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 11: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 12: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 13: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 14: 0%....50%....100% - 5 passes +==PROF== Profiling "inBlockStep" - 15: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 16: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 17: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 18: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 19: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 20: 0%....50%....100% - 5 passes +==PROF== Profiling "inBlockStep" - 21: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 22: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 23: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 24: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 25: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 26: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 27: 0%....50%....100% - 5 passes +==PROF== Profiling "inBlockStep" - 28: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 29: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 30: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 31: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 32: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 33: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 34: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 35: 0%....50%....100% - 5 passes +==PROF== Profiling "inBlockStep" - 36: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 37: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 38: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 39: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 40: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 41: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 42: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 43: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 44: 0%....50%....100% - 5 passes +==PROF== Profiling "inBlockStep" - 45: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 46: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 47: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 48: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 49: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 50: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 51: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 52: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 53: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 54: 0%....50%....100% - 5 passes +==PROF== Profiling 
"inBlockStep" - 55: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 56: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 57: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 58: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 59: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 60: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 61: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 62: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 63: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 64: 0%....50%....100% - 5 passes +==PROF== Profiling "interBlockStep" - 65: 0%....50%....100% - 5 passes +==PROF== Profiling "inBlockStep" - 66: 0%....50%....100% - 5 passes +==PROF== Disconnected from process 20279 +[20279] bitonicCUDA@127.0.0.1 + void prephase(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:48, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum msecond 1.20 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 186,368 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 186,368 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 186,368 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 2,981,888 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 111,946.88 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 112,116 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 111,795 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 1,791,150 + smsp__average_warp_latency_issue_stalled_barrier.pct % 644,345.26 + smsp__average_warp_latency_issue_stalled_barrier.ratio 6,443.45 + smsp__inst_executed.avg inst 1,030,868.94 + smsp__inst_executed.max inst 1,031,062 + smsp__inst_executed.min inst 1,030,675 + smsp__inst_executed.sum inst 65,975,612 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.50 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.12 + smsp__cycles_active.avg cycle 1,666,829.12 + smsp__cycles_active.sum cycle 106,677,064 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:48, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.84 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,308.59 + smsp__inst_executed.max inst 12,538 + smsp__inst_executed.min inst 11,945 + smsp__inst_executed.sum inst 787,750 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,268.67 + smsp__cycles_active.sum cycle 4,689,195 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:48, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 231.30 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,642.38 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,963 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,322 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,278 + smsp__average_warp_latency_issue_stalled_barrier.pct % 123,392.55 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,233.93 + smsp__inst_executed.avg inst 189,292.45 + smsp__inst_executed.max inst 192,372 + smsp__inst_executed.min inst 186,246 + smsp__inst_executed.sum inst 12,114,717 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.81 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 316,267.31 + smsp__cycles_active.sum cycle 20,241,108 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:48, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.34 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.58 + smsp__inst_executed.max inst 12,667 + smsp__inst_executed.min inst 11,936 + smsp__inst_executed.sum inst 787,109 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,505.30 + smsp__cycles_active.sum cycle 4,512,339 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:48, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.55 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,309.17 + smsp__inst_executed.max inst 12,702 + smsp__inst_executed.min inst 11,606 + smsp__inst_executed.sum inst 787,787 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,897.17 + smsp__cycles_active.sum cycle 4,665,419 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:48, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 230.91 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,680 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 20,009 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,334 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,880 + smsp__average_warp_latency_issue_stalled_barrier.pct % 123,674.16 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,236.74 + smsp__inst_executed.avg inst 189,294.36 + smsp__inst_executed.max inst 192,238 + smsp__inst_executed.min inst 186,252 + smsp__inst_executed.sum inst 12,114,839 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.85 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 316,040.81 + smsp__cycles_active.sum cycle 20,226,612 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:48, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.72 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.78 + smsp__inst_executed.max inst 12,542 + smsp__inst_executed.min inst 11,960 + smsp__inst_executed.sum inst 786,802 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,235.28 + smsp__cycles_active.sum cycle 4,559,058 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:48, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.56 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.95 + smsp__inst_executed.max inst 12,560 + smsp__inst_executed.min inst 12,096 + smsp__inst_executed.sum inst 787,133 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,575.53 + smsp__cycles_active.sum cycle 4,516,834 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:48, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.42 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,308.61 + smsp__inst_executed.max inst 12,640 + smsp__inst_executed.min inst 12,096 + smsp__inst_executed.sum inst 787,751 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,641.39 + smsp__cycles_active.sum cycle 4,649,049 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:48, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 231.87 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,674.75 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 20,017 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,354 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,796 + smsp__average_warp_latency_issue_stalled_barrier.pct % 123,483.94 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,234.84 + smsp__inst_executed.avg inst 189,288.14 + smsp__inst_executed.max inst 192,081 + smsp__inst_executed.min inst 186,477 + smsp__inst_executed.sum inst 12,114,441 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.86 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 315,433.75 + smsp__cycles_active.sum cycle 20,187,760 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:49, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.14 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.34 + smsp__inst_executed.max inst 12,724 + smsp__inst_executed.min inst 12,076 + smsp__inst_executed.sum inst 786,582 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,402.61 + smsp__cycles_active.sum cycle 4,505,767 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:49, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.56 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,294.27 + smsp__inst_executed.max inst 12,717 + smsp__inst_executed.min inst 11,988 + smsp__inst_executed.sum inst 786,833 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,681.59 + smsp__cycles_active.sum cycle 4,523,622 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:49, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.05 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.42 + smsp__inst_executed.max inst 12,663 + smsp__inst_executed.min inst 11,882 + smsp__inst_executed.sum inst 787,099 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,688.28 + smsp__cycles_active.sum cycle 4,524,050 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:49, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.49 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,309.03 + smsp__inst_executed.max inst 12,686 + smsp__inst_executed.min inst 11,852 + smsp__inst_executed.sum inst 787,778 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,892.83 + smsp__cycles_active.sum cycle 4,665,141 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:49, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 231.33 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,677 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,976 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,331 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,832 + smsp__average_warp_latency_issue_stalled_barrier.pct % 123,882.24 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,238.82 + smsp__inst_executed.avg inst 189,292.19 + smsp__inst_executed.max inst 192,340 + smsp__inst_executed.min inst 186,215 + smsp__inst_executed.sum inst 12,114,700 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.86 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 316,203.25 + smsp__cycles_active.sum cycle 20,237,008 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:49, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.08 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.06 + smsp__inst_executed.max inst 12,694 + smsp__inst_executed.min inst 11,900 + smsp__inst_executed.sum inst 786,500 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,488.72 + smsp__cycles_active.sum cycle 4,511,278 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:49, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.27 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,291.25 + smsp__inst_executed.max inst 12,681 + smsp__inst_executed.min inst 12,008 + smsp__inst_executed.sum inst 786,640 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,605.89 + smsp__cycles_active.sum cycle 4,518,777 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:49, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.34 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,292.84 + smsp__inst_executed.max inst 12,543 + smsp__inst_executed.min inst 11,998 + smsp__inst_executed.sum inst 786,742 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,795.58 + smsp__cycles_active.sum cycle 4,530,917 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:49, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.02 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,299.95 + smsp__inst_executed.max inst 12,683 + smsp__inst_executed.min inst 11,720 + smsp__inst_executed.sum inst 787,197 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,136.48 + smsp__cycles_active.sum cycle 4,488,735 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:49, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.52 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,309.09 + smsp__inst_executed.max inst 12,613 + smsp__inst_executed.min inst 11,865 + smsp__inst_executed.sum inst 787,782 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,887.53 + smsp__cycles_active.sum cycle 4,664,802 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:49, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 231.30 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,682.56 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 20,017 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,315 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,921 + smsp__average_warp_latency_issue_stalled_barrier.pct % 124,910.64 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,249.11 + smsp__inst_executed.avg inst 189,291.42 + smsp__inst_executed.max inst 192,361 + smsp__inst_executed.min inst 186,192 + smsp__inst_executed.sum inst 12,114,651 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.97 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 316,146.12 + smsp__cycles_active.sum cycle 20,233,352 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:49, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.03 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.48 + smsp__inst_executed.max inst 12,672 + smsp__inst_executed.min inst 11,868 + smsp__inst_executed.sum inst 786,463 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,004.22 + smsp__cycles_active.sum cycle 4,672,270 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:50, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.08 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.81 + smsp__inst_executed.max inst 12,480 + smsp__inst_executed.min inst 12,068 + smsp__inst_executed.sum inst 786,548 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,790.83 + smsp__cycles_active.sum cycle 4,530,613 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:50, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.46 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.59 + smsp__inst_executed.max inst 12,701 + smsp__inst_executed.min inst 12,068 + smsp__inst_executed.sum inst 786,598 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,847.19 + smsp__cycles_active.sum cycle 4,534,220 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:50, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.27 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.72 + smsp__inst_executed.max inst 12,656 + smsp__inst_executed.min inst 12,038 + smsp__inst_executed.sum inst 786,798 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,747 + smsp__cycles_active.sum cycle 4,527,808 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:50, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.95 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.14 + smsp__inst_executed.max inst 12,645 + smsp__inst_executed.min inst 12,029 + smsp__inst_executed.sum inst 787,081 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,059.03 + smsp__cycles_active.sum cycle 4,483,778 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:50, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.58 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,308.86 + smsp__inst_executed.max inst 12,724 + smsp__inst_executed.min inst 11,654 + smsp__inst_executed.sum inst 787,767 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,813.80 + smsp__cycles_active.sum cycle 4,660,083 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:50, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 231.90 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,669.44 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,942 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,386 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,711 + smsp__average_warp_latency_issue_stalled_barrier.pct % 125,049.38 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,250.49 + smsp__inst_executed.avg inst 189,291.03 + smsp__inst_executed.max inst 192,313 + smsp__inst_executed.min inst 186,310 + smsp__inst_executed.sum inst 12,114,626 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.97 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 316,608.81 + smsp__cycles_active.sum cycle 20,262,964 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:50, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.78 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,287.95 + smsp__inst_executed.max inst 12,856 + smsp__inst_executed.min inst 11,904 + smsp__inst_executed.sum inst 786,429 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,331.70 + smsp__cycles_active.sum cycle 4,565,229 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:50, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.94 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.03 + smsp__inst_executed.max inst 12,488 + smsp__inst_executed.min inst 11,888 + smsp__inst_executed.sum inst 786,434 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,232.05 + smsp__cycles_active.sum cycle 4,686,851 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:50, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.27 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.50 + smsp__inst_executed.max inst 12,488 + smsp__inst_executed.min inst 12,072 + smsp__inst_executed.sum inst 786,528 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,846.25 + smsp__cycles_active.sum cycle 4,534,160 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:50, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.11 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.84 + smsp__inst_executed.max inst 12,564 + smsp__inst_executed.min inst 12,104 + smsp__inst_executed.sum inst 786,614 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,881.05 + smsp__cycles_active.sum cycle 4,536,387 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:50, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.40 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.73 + smsp__inst_executed.max inst 12,757 + smsp__inst_executed.min inst 11,970 + smsp__inst_executed.sum inst 786,799 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,142.94 + smsp__cycles_active.sum cycle 4,553,148 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:51, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.95 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.62 + smsp__inst_executed.max inst 12,553 + smsp__inst_executed.min inst 12,119 + smsp__inst_executed.sum inst 787,112 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,189.52 + smsp__cycles_active.sum cycle 4,492,129 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:51, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.71 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,309.52 + smsp__inst_executed.max inst 12,538 + smsp__inst_executed.min inst 12,074 + smsp__inst_executed.sum inst 787,809 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,879.23 + smsp__cycles_active.sum cycle 4,664,271 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:51, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 231.42 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,673 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 20,007 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,299 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,768 + smsp__average_warp_latency_issue_stalled_barrier.pct % 124,557.10 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,245.57 + smsp__inst_executed.avg inst 189,303.22 + smsp__inst_executed.max inst 192,317 + smsp__inst_executed.min inst 186,277 + smsp__inst_executed.sum inst 12,115,406 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.96 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 315,741.19 + smsp__cycles_active.sum cycle 20,207,436 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:51, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.40 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,287.92 + smsp__inst_executed.max inst 12,648 + smsp__inst_executed.min inst 11,912 + smsp__inst_executed.sum inst 786,427 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,978.88 + smsp__cycles_active.sum cycle 4,606,648 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:51, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.62 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.30 + smsp__inst_executed.max inst 12,848 + smsp__inst_executed.min inst 11,904 + smsp__inst_executed.sum inst 786,451 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,708.22 + smsp__cycles_active.sum cycle 4,589,326 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:51, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.19 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.11 + smsp__inst_executed.max inst 12,876 + smsp__inst_executed.min inst 11,688 + smsp__inst_executed.sum inst 786,503 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,332.14 + smsp__cycles_active.sum cycle 4,693,257 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:51, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.50 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.89 + smsp__inst_executed.max inst 12,507 + smsp__inst_executed.min inst 12,092 + smsp__inst_executed.sum inst 786,489 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,441.14 + smsp__cycles_active.sum cycle 4,508,233 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:51, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.30 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.69 + smsp__inst_executed.max inst 12,682 + smsp__inst_executed.min inst 11,866 + smsp__inst_executed.sum inst 786,604 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,768.55 + smsp__cycles_active.sum cycle 4,529,187 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:51, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.62 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.67 + smsp__inst_executed.max inst 12,534 + smsp__inst_executed.min inst 11,732 + smsp__inst_executed.sum inst 786,795 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,007.56 + smsp__cycles_active.sum cycle 4,544,484 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:51, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.05 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,299.09 + smsp__inst_executed.max inst 12,656 + smsp__inst_executed.min inst 11,912 + smsp__inst_executed.sum inst 787,142 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,781.25 + smsp__cycles_active.sum cycle 4,530,000 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:51, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.14 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,309.02 + smsp__inst_executed.max inst 12,707 + smsp__inst_executed.min inst 11,847 + smsp__inst_executed.sum inst 787,777 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,505.88 + smsp__cycles_active.sum cycle 4,640,376 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:52, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 231.14 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,666.06 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 20,013 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,348 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,657 + smsp__average_warp_latency_issue_stalled_barrier.pct % 124,275.15 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,242.75 + smsp__inst_executed.avg inst 189,315.86 + smsp__inst_executed.max inst 192,371 + smsp__inst_executed.min inst 186,294 + smsp__inst_executed.sum inst 12,116,215 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.90 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 316,297.72 + smsp__cycles_active.sum cycle 20,243,054 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:52, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.42 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.20 + smsp__inst_executed.max inst 12,484 + smsp__inst_executed.min inst 12,092 + smsp__inst_executed.sum inst 786,445 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 74,382.31 + smsp__cycles_active.sum cycle 4,760,468 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:52, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.88 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.11 + smsp__inst_executed.max inst 12,484 + smsp__inst_executed.min inst 11,716 + smsp__inst_executed.sum inst 786,439 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,860.06 + smsp__cycles_active.sum cycle 4,599,044 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:52, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.04 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.05 + smsp__inst_executed.max inst 12,664 + smsp__inst_executed.min inst 11,700 + smsp__inst_executed.sum inst 786,435 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,882.38 + smsp__cycles_active.sum cycle 4,600,472 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:52, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.13 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.81 + smsp__inst_executed.max inst 12,870 + smsp__inst_executed.min inst 11,908 + smsp__inst_executed.sum inst 786,484 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,247.75 + smsp__cycles_active.sum cycle 4,687,856 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:52, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.89 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.59 + smsp__inst_executed.max inst 12,494 + smsp__inst_executed.min inst 11,898 + smsp__inst_executed.sum inst 786,534 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,630.66 + smsp__cycles_active.sum cycle 4,520,362 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:52, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.14 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,291.27 + smsp__inst_executed.max inst 12,510 + smsp__inst_executed.min inst 12,082 + smsp__inst_executed.sum inst 786,641 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,548.77 + smsp__cycles_active.sum cycle 4,515,121 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:52, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.66 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,294.64 + smsp__inst_executed.max inst 12,656 + smsp__inst_executed.min inst 11,924 + smsp__inst_executed.sum inst 786,857 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,171.45 + smsp__cycles_active.sum cycle 4,554,973 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:52, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.86 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,301.05 + smsp__inst_executed.max inst 12,725 + smsp__inst_executed.min inst 11,871 + smsp__inst_executed.sum inst 787,267 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,490.50 + smsp__cycles_active.sum cycle 4,511,392 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:52, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.17 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,316.05 + smsp__inst_executed.max inst 12,594 + smsp__inst_executed.min inst 11,865 + smsp__inst_executed.sum inst 788,227 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,533.61 + smsp__cycles_active.sum cycle 4,642,151 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:52, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 231.55 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,681.88 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 20,120 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,332 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,910 + smsp__average_warp_latency_issue_stalled_barrier.pct % 123,982.60 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,239.83 + smsp__inst_executed.avg inst 189,283.48 + smsp__inst_executed.max inst 192,309 + smsp__inst_executed.min inst 186,242 + smsp__inst_executed.sum inst 12,114,143 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.88 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 316,209.50 + smsp__cycles_active.sum cycle 20,237,408 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:52, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 56.70 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,287.97 + smsp__inst_executed.max inst 12,492 + smsp__inst_executed.min inst 11,896 + smsp__inst_executed.sum inst 786,430 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 68,714 + smsp__cycles_active.sum cycle 4,397,696 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:53, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.64 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.14 + smsp__inst_executed.max inst 12,844 + smsp__inst_executed.min inst 11,528 + smsp__inst_executed.sum inst 786,441 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 74,171.33 + smsp__cycles_active.sum cycle 4,746,965 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:53, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.72 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.55 + smsp__inst_executed.max inst 12,684 + smsp__inst_executed.min inst 11,884 + smsp__inst_executed.sum inst 786,467 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,696.42 + smsp__cycles_active.sum cycle 4,588,571 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:53, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.94 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.08 + smsp__inst_executed.max inst 12,660 + smsp__inst_executed.min inst 11,724 + smsp__inst_executed.sum inst 786,437 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,640.89 + smsp__cycles_active.sum cycle 4,585,017 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:53, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.06 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.06 + smsp__inst_executed.max inst 12,524 + smsp__inst_executed.min inst 11,900 + smsp__inst_executed.sum inst 786,436 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,132.61 + smsp__cycles_active.sum cycle 4,680,487 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:53, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.08 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.61 + smsp__inst_executed.max inst 12,634 + smsp__inst_executed.min inst 11,884 + smsp__inst_executed.sum inst 786,535 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,620.73 + smsp__cycles_active.sum cycle 4,519,727 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:53, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.24 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,291.28 + smsp__inst_executed.max inst 12,704 + smsp__inst_executed.min inst 11,892 + smsp__inst_executed.sum inst 786,642 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,037.52 + smsp__cycles_active.sum cycle 4,546,401 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:53, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.82 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.23 + smsp__inst_executed.max inst 12,931 + smsp__inst_executed.min inst 11,840 + smsp__inst_executed.sum inst 786,767 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,840.56 + smsp__cycles_active.sum cycle 4,533,796 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:53, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.24 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.42 + smsp__inst_executed.max inst 12,587 + smsp__inst_executed.min inst 11,966 + smsp__inst_executed.sum inst 787,099 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,543.30 + smsp__cycles_active.sum cycle 4,514,771 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:53, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.39 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,309.44 + smsp__inst_executed.max inst 12,751 + smsp__inst_executed.min inst 11,714 + smsp__inst_executed.sum inst 787,804 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,313.14 + smsp__cycles_active.sum cycle 4,628,041 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 13:40:53, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 228.54 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) 
n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,691.25 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,988 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,367 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 315,060 + smsp__average_warp_latency_issue_stalled_barrier.pct % 123,962.42 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,239.62 + smsp__inst_executed.avg inst 189,051.73 + smsp__inst_executed.max inst 192,054 + smsp__inst_executed.min inst 186,060 + smsp__inst_executed.sum inst 12,099,311 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.83 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 317,268.88 + smsp__cycles_active.sum cycle 20,305,208 + ---------------------------------------------------------------------- --------------- ------------------------------ + diff --git a/homework_3/src/bitonicsort.hpp b/homework_3/src/bitonicsort.hpp index fbcf924..98c8904 100644 --- a/homework_3/src/bitonicsort.hpp +++ b/homework_3/src/bitonicsort.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include "utils.hpp" @@ -159,9 +160,9 @@ void bitonicSort(DataT& data) { Timer_memory.start(); if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess) - throw std::runtime_error("[CUDA] - Can not allocate memory\n"); + throw std::runtime_error("[CUDA] - Can not allocate memory"); if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess) - throw std::runtime_error("[CUDA] - Can not copy memory to device\n"); + throw std::runtime_error("[CUDA] - Can not copy memory to device"); Timer_memory.stop(); size_t Nth = config.blockSize; @@ -180,7 +181,7 @@ void bitonicSort(DataT& data) { Timer_memory.start(); if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess) - throw std::runtime_error("[CUDA] - Can not copy memory from device\n"); + throw std::runtime_error("[CUDA] - Can not copy memory from device"); cudaFree(dev_data); Timer_memory.stop(); } @@ -247,6 +248,31 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t st } } + +/*! + * This is unrolled part of the bitonic double loop for the init phase where the entire + * double loop can fit in one block with shared memory access. + * + * First each thread caches its corresponding data point from the current and the following data block. + * After that we execute the pre-phase on the local data and then we write back to global memory. 
+ * + * @tparam ValueT The underlying data type of the array items + * @param data [ValueT*] Pointer to data array + * @param n [size_t] The total size of the array + * @param stages [size_t] The number of stages to pre execute + * @param maxStages [size_t] The maximum number of stages for the entire sort + */ +template +__global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages) { + for (size_t stage = 1; (stage <= stages) && (stage <= maxStages); ++stage) { + for (size_t step = stage; step > 0; ) { + --step; + interBlockStep_(data, n, step, stage); + __syncthreads(); + } + } +} + /*! * A CUDA version of the Bitonic sort algorithm. * @@ -262,18 +288,22 @@ void bitonicSort(DataT& data) { Timer_memory.start(); if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess) - throw std::runtime_error("[CUDA] - Can not allocate memory\n"); + throw std::runtime_error("[CUDA] - Can not allocate memory"); if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess) - throw std::runtime_error("[CUDA] - Can not copy memory to device\n"); + throw std::runtime_error("[CUDA] - Can not copy memory to device"); Timer_memory.stop(); size_t Nth = config.blockSize; size_t Nbl = NBlocks(size); auto Stages = static_cast(log2(size)); - auto InnerBlockSteps = static_cast(log2(Nth)); // + auto InnerBlockSteps = static_cast(log2(Nth)); + size_t PrephaseStages= InnerBlockSteps + 1; + Timer_sorting.start(); - for (size_t stage = 1; stage <= Stages; ++stage) { + prephase<<>>(dev_data, size, PrephaseStages, Stages); + cudaDeviceSynchronize(); + for (size_t stage = PrephaseStages + 1; stage <= Stages; ++stage) { size_t step = stage - 1; for ( ; step > InnerBlockSteps; --step) { interBlockStep<<>>(dev_data, size, step, stage); @@ -286,7 +316,7 @@ void bitonicSort(DataT& data) { Timer_memory.start(); if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess) - throw std::runtime_error("[CUDA] - Can not copy memory from device\n"); + throw std::runtime_error("[CUDA] - Can not copy memory from device"); cudaFree(dev_data); Timer_memory.stop(); } @@ -301,7 +331,9 @@ void bitonicSort(DataT& data) { * @note * Each block thread collection can exchange twice the size of data points. */ -inline size_t effectiveBlockSize() { return SizeToThreadsRatio * config.blockSize; } +inline constexpr size_t effectiveBlockSize(size_t blockSize) { + return SizeToThreadsRatio * blockSize; +} @@ -400,10 +432,70 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t st __syncthreads(); } - // Write back to global memory + // Write back to global memory (no sync here, there will be sync from host) data[gIdx0] = shared_data[lIdx0]; data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x]; +} + +/*! + * This is unrolled part of the bitonic double loop for the init phase where the entire + * double loop can fit in one block with shared memory access. + * + * First each thread caches its corresponding data point from the current and the following data block. + * After that we execute the pre-phase on the local data and then we write back to global memory. 
+ * + * @tparam ValueT The underlying data type of the array items + * @param data [ValueT*] Pointer to data array + * @param n [size_t] The total size of the array + * @param stages [size_t] The number of stages to pre execute + * @param maxStages [size_t] The maximum number of stages for the entire sort + */ +template +__global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages) { + extern __shared__ ValueT shared_data[]; + + /* + * Global and local(shared) memory indices (calculated once) + * Here we skip blocks every time (one for SizeToThreadsRatio = 2) + * And we cache the neighbor block address indexes in local (shared) memory + */ + threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x; + threadId_t lIdx0 = toLocal(gIdx0, blockDim.x); + + if (gIdx0 + blockDim.x >= n) // Boundary check + return; + + // Fetch to local memory the entire effective block size (2 positions for each thread) + shared_data[lIdx0] = data[gIdx0]; + shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x]; __syncthreads(); + for (size_t stage = 1; (stage <= stages) && (stage <= maxStages); ++stage) { + for (size_t step = stage; step > 0; ) { + --step; + + // Init thread global and local indices + threadId_t gIdx = gIdx0; + threadId_t lIdx = lIdx0; + // Find partner and keep-small configuration based on the global data positions + threadId_t pIdx = partner(gIdx, step); + if (gIdx > pIdx) { + // Shift inside effective block + gIdx += blockDim.x; // global + pIdx += blockDim.x; + lIdx += blockDim.x; // local + } + bool keep = keepSmall(gIdx, pIdx, stage); + + // Exchange data on local(shared) copy + threadId_t lpIdx = toLocal(pIdx, blockDim.x); + exchange(shared_data, lIdx, lpIdx, keep); + __syncthreads(); + } + } + + // Write back to global memory (no sync here, there will be sync from host) + data[gIdx0] = shared_data[lIdx0]; + data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x]; } /*! 
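
Side note on the pre-phase stage arithmetic (illustrative, not part of the patch): with the default 512-thread blocks each block caches SizeToThreadsRatio * blockDim.x = 1024 elements, and every step of stages 1..log2(1024) = 10 only swaps partners inside that 1024-element tile, which is why PrephaseStages is set to InnerBlockSteps + 1. A minimal standalone check, assuming SizeToThreadsRatio == 2 and blockSize = 512 (both assumptions here):

    // Sanity check of the pre-phase stage count (illustrative values only).
    #include <cstddef>
    #include <cstdio>
    #include <cmath>

    int main() {
        const std::size_t blockSize       = 512;             // assumed config.blockSize
        const std::size_t tile            = 2 * blockSize;   // elements cached per block (SizeToThreadsRatio == 2)
        const std::size_t innerBlockSteps = static_cast<std::size_t>(std::log2(blockSize)); // 9
        const std::size_t prephaseStages  = innerBlockSteps + 1;                            // 10
        // Assuming the usual bitonic partner i ^ (1 << step): for steps 0..9 only bits 0..9
        // of the index flip, so stages 1..10 never leave a 1024-aligned tile and the whole
        // pre-phase can run on one block's shared-memory copy.
        std::printf("tile=%zu prephaseStages=%zu log2(tile)=%.0f\n",
                    tile, prephaseStages, std::log2(static_cast<double>(tile)));
        return 0;
    }
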
@@ -421,19 +513,23 @@ void bitonicSort(DataT& data) { Timer_memory.start(); if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess) - throw std::runtime_error("[CUDA] - Can not allocate memory\n"); + throw std::runtime_error("[CUDA] - Can not allocate memory"); if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess) - throw std::runtime_error("[CUDA] - Can not copy memory to device\n"); + throw std::runtime_error("[CUDA] - Can not copy memory to device"); Timer_memory.stop(); size_t Nth = config.blockSize; size_t Nbl = NBlocks(size); - size_t kernelMemSize = effectiveBlockSize() * sizeof(value_t); + size_t kernelMemSize = effectiveBlockSize(config.blockSize) * sizeof(value_t); auto Stages = static_cast(log2(size)); auto InnerBlockSteps = static_cast(log2(Nth)); + size_t PrephaseStages= InnerBlockSteps + 1; + Timer_sorting.start(); - for (size_t stage = 1; stage <= Stages; ++stage) { + prephase<<>>(dev_data, size, PrephaseStages, Stages); + cudaDeviceSynchronize(); + for (size_t stage = PrephaseStages + 1; stage <= Stages; ++stage) { size_t step = stage - 1; for ( ; step > InnerBlockSteps; --step) { interBlockStep<<>>(dev_data, size, step, stage); @@ -446,7 +542,7 @@ void bitonicSort(DataT& data) { Timer_memory.start(); if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess) - throw std::runtime_error("[CUDA] - Can not copy memory from device\n"); + throw std::runtime_error("[CUDA] - Can not copy memory from device"); cudaFree(dev_data); Timer_memory.stop(); } diff --git a/homework_3/src/config.h b/homework_3/src/config.h index 5423f1a..c2e9e0b 100644 --- a/homework_3/src/config.h +++ b/homework_3/src/config.h @@ -65,8 +65,8 @@ using ArraySize_t = uint64_t; * The values of the members are set from the command line. */ struct config_t { - ArraySize_t arraySize{DEFAULT_DATA_SIZE}; //!< The array size of the local data to sort. - size_t blockSize{THREADS_PER_BLOCK}; //!< The block size (threads per block) for the session. + ArraySize_t arraySize{DEFAULT_DATA_SIZE}; //!< The array size of the local data to sort. + size_t blockSize{THREADS_PER_BLOCK}; //!< The block size (threads per block) for the session. bool validation{false}; //!< Request a full validation at the end, performed by process rank 0. size_t perf{1}; //!< Enable performance timing measurements and prints. Repeat //!< the sorting times to do so. 
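
For scale (again illustrative, not part of the patch): kernelMemSize above is just the per-block shared tile in bytes. A tiny sketch of that arithmetic, assuming SizeToThreadsRatio == 2 and 32-bit array elements (the real value_t comes from the project's config and is an assumption here):

    // Dynamic shared-memory budget per block for the shared-memory kernels (assumed element type).
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    constexpr std::size_t SizeToThreadsRatio = 2;

    constexpr std::size_t effectiveBlockSize(std::size_t blockSize) {
        return SizeToThreadsRatio * blockSize;   // elements handled per block
    }

    int main() {
        using value_t = std::uint32_t;           // assumption: 32-bit elements
        const std::size_t blockSize     = 512;   // assumed config.blockSize
        const std::size_t kernelMemSize = effectiveBlockSize(blockSize) * sizeof(value_t);
        std::printf("shared memory per block: %zu bytes\n", kernelMemSize);  // 4096 bytes, well below CUDA's 48 KiB default
        return 0;
    }
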
diff --git a/homework_3/src/main.cpp b/homework_3/src/main.cpp index c93490f..ebd3c3b 100644 --- a/homework_3/src/main.cpp +++ b/homework_3/src/main.cpp @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -140,14 +141,14 @@ bool get_options(int argc, char* argv[]){ // Check configuration requirements if (config.blockSize % device.warpSize) - throw std::runtime_error("[Config] - Number of threads per block is not an exact multiple of warp size\n"); + throw std::runtime_error("[Config] - Number of threads per block is not an exact multiple of warp size"); if (config.arraySize < 2*config.blockSize) throw std::runtime_error("[Config] - Unsupported array size (smaller than " - + std::to_string(SizeToThreadsRatio*config.blockSize) + ")\n"); + + std::to_string(SizeToThreadsRatio*config.blockSize) + ")"); if (device.totalGlobalMem < config.arraySize * sizeof(Value_t)) throw std::runtime_error("[CUDA] - Unsupported array size: " + std::to_string(config.arraySize * sizeof(Value_t)) - + " (larger than GPU's: " + std::to_string(device.totalGlobalMem) + ")\n"); + + " (larger than GPU's: " + std::to_string(device.totalGlobalMem) + ")"); return status; } @@ -197,6 +198,7 @@ int main(int argc, char* argv[]) try { // Init everything init(&argc, &argv); + logger << "Code version: " << 'V' << STR(CODE_VERSION) << logger.endl; logger << "Array size: " << config.arraySize << " (Q=" << static_cast(log2(config.arraySize))<< ")" << logger.endl; logger << "Repeated sorts: " << config.perf << logger.endl; logger << "GPU: " << device.name << logger.endl; @@ -213,7 +215,7 @@ int main(int argc, char* argv[]) try { logger << " Done." << logger.endl; // Run distributed sort - logger << "Start sorting ... "; + logger << "Start sorting ... "; Timer_total.start(); bitonicSort(Data); Timer_total.stop();
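
One remark on the new "Code version" log line: STR is presumably the usual two-level stringification macro defined elsewhere in the repository; its definition is not part of this diff, so the following is only an assumed sketch of how that line ends up printing e.g. "Code version: V2":

    // Assumed definition of STR (the real one lives elsewhere in the project).
    // Two levels are needed so a macro argument such as CODE_VERSION=2 is expanded
    // before being turned into a string literal.
    #define STR_(x) #x
    #define STR(x)  STR_(x)
    // Example: compiled with -DCODE_VERSION=2, STR(CODE_VERSION) yields "2",
    // and the logger prints "Code version: V2".
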