From a5ae5c6bdb34e015ae07257fc0bc7854570ba85d Mon Sep 17 00:00:00 2001 From: Christos Choutouridis Date: Sun, 16 Feb 2025 23:16:38 +0200 Subject: [PATCH] HE3: RC3 - V2 code refactor version measurements --- .../analyse/RC3/profReportv2_A-regVals.txt | 2049 +++++++++++++++++ .../RC3/profReportv2_B-CodeRefactor.txt | 2049 +++++++++++++++++ homework_3/src/bitonicsort.hpp | 121 +- homework_3/src/config.h | 4 +- 4 files changed, 4196 insertions(+), 27 deletions(-) create mode 100644 homework_3/analyse/RC3/profReportv2_A-regVals.txt create mode 100644 homework_3/analyse/RC3/profReportv2_B-CodeRefactor.txt diff --git a/homework_3/analyse/RC3/profReportv2_A-regVals.txt b/homework_3/analyse/RC3/profReportv2_A-regVals.txt new file mode 100644 index 0000000..2e4181d --- /dev/null +++ b/homework_3/analyse/RC3/profReportv2_A-regVals.txt @@ -0,0 +1,2049 @@ +==PROF== Connected to process 97867 (/home/hoo2/Work/AUTH/PDS/homework_3/out/v2/bitonicCUDA) +==PROF== Profiling "prephase" - 1: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 2: 0%....50%....100% - 6 passes +==PROF== Profiling "inBlockStep" - 3: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 4: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 5: 0%....50%....100% - 6 passes +==PROF== Profiling "inBlockStep" - 6: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 7: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 8: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 9: 0%....50%....100% - 6 passes +==PROF== Profiling "inBlockStep" - 10: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 11: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 12: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 13: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 14: 0%....50%....100% - 6 passes +==PROF== Profiling "inBlockStep" - 15: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 16: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 17: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 18: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 19: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 20: 0%....50%....100% - 6 passes +==PROF== Profiling "inBlockStep" - 21: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 22: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 23: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 24: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 25: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 26: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 27: 0%....50%....100% - 6 passes +==PROF== Profiling "inBlockStep" - 28: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 29: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 30: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 31: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 32: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 33: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 34: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 35: 0%....50%....100% - 6 passes +==PROF== Profiling "inBlockStep" - 36: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 37: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 38: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 39: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 40: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 41: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 42: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 43: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 44: 0%....50%....100% - 6 passes +==PROF== Profiling "inBlockStep" - 45: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 46: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 47: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 48: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 49: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 50: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 51: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 52: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 53: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 54: 0%....50%....100% - 6 passes +==PROF== Profiling "inBlockStep" - 55: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 56: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 57: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 58: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 59: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 60: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 61: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 62: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 63: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 64: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 65: 0%....50%....100% - 6 passes +==PROF== Profiling "inBlockStep" - 66: 0%....50%....100% - 6 passes +==PROF== Disconnected from process 97867 +[97867] bitonicCUDA@127.0.0.1 + void prephase(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:25, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum msecond 2.74 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 424,497.75 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 424,572 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 424,430 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 6,791,964 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 307,967.38 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 308,221 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 307,721 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 4,927,478 + smsp__average_warp_latency_issue_stalled_barrier.pct % 1,710,758.26 + smsp__average_warp_latency_issue_stalled_barrier.ratio 17,107.58 + smsp__inst_executed.avg inst 2,094,137.72 + smsp__inst_executed.max inst 2,094,574 + smsp__inst_executed.min inst 2,093,816 + smsp__inst_executed.sum inst 134,024,814 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 14.50 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.14 + smsp__cycles_active.avg cycle 3,809,017.45 + smsp__cycles_active.sum cycle 243,777,117 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:25, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.81 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.40 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.97 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,098.91 + smsp__inst_executed.max inst 13,825 + smsp__inst_executed.min inst 12,525 + smsp__inst_executed.sum inst 838,330 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,635.02 + smsp__cycles_active.sum cycle 4,712,641 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:25, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 262.30 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 50,043.25 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 50,838 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 49,216 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 800,692 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 28,255.12 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 28,790 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 27,720 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 452,082 + smsp__average_warp_latency_issue_stalled_barrier.pct % 146,705.07 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,467.05 + smsp__inst_executed.avg inst 201,993.16 + smsp__inst_executed.max inst 205,329 + smsp__inst_executed.min inst 198,680 + smsp__inst_executed.sum inst 12,927,562 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 13.43 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 359,188.83 + smsp__cycles_active.sum cycle 22,988,085 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:25, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.21 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.99 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.37 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,076.97 + smsp__inst_executed.max inst 13,642 + smsp__inst_executed.min inst 12,644 + smsp__inst_executed.sum inst 836,926 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,829.16 + smsp__cycles_active.sum cycle 4,533,066 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:25, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.90 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.41 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.97 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,098.47 + smsp__inst_executed.max inst 13,695 + smsp__inst_executed.min inst 12,514 + smsp__inst_executed.sum inst 838,302 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,821.30 + smsp__cycles_active.sum cycle 4,724,563 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:26, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 261.92 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 50,031.25 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 50,842 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 49,220 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 800,500 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 28,252.81 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 28,772 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 27,700 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 452,045 + smsp__average_warp_latency_issue_stalled_barrier.pct % 146,886.83 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,468.87 + smsp__inst_executed.avg inst 201,980.12 + smsp__inst_executed.max inst 205,383 + smsp__inst_executed.min inst 198,773 + smsp__inst_executed.sum inst 12,926,728 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 13.42 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 359,825.25 + smsp__cycles_active.sum cycle 23,028,816 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:26, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.34 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.99 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,066.20 + smsp__inst_executed.max inst 13,614 + smsp__inst_executed.min inst 12,528 + smsp__inst_executed.sum inst 836,237 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,445.25 + smsp__cycles_active.sum cycle 4,572,496 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:26, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.14 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.45 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.98 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,078.12 + smsp__inst_executed.max inst 13,392 + smsp__inst_executed.min inst 12,753 + smsp__inst_executed.sum inst 837,000 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,808.16 + smsp__cycles_active.sum cycle 4,531,722 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:26, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.84 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.41 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.97 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,100.52 + smsp__inst_executed.max inst 13,544 + smsp__inst_executed.min inst 12,412 + smsp__inst_executed.sum inst 838,433 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,285.58 + smsp__cycles_active.sum cycle 4,690,277 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:26, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 262.56 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 50,043.50 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 50,864 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 49,218 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 800,696 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 28,272.62 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 28,842 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 27,682 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 452,362 + smsp__average_warp_latency_issue_stalled_barrier.pct % 146,681.13 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,466.81 + smsp__inst_executed.avg inst 201,994.86 + smsp__inst_executed.max inst 205,278 + smsp__inst_executed.min inst 198,784 + smsp__inst_executed.sum inst 12,927,671 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 13.42 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 359,214.95 + smsp__cycles_active.sum cycle 22,989,757 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:26, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.46 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,062.50 + smsp__inst_executed.max inst 13,331 + smsp__inst_executed.min inst 12,768 + smsp__inst_executed.sum inst 836,000 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,902.33 + smsp__cycles_active.sum cycle 4,537,749 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:26, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.37 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.99 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,065.06 + smsp__inst_executed.max inst 13,572 + smsp__inst_executed.min inst 12,398 + smsp__inst_executed.sum inst 836,164 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,989.39 + smsp__cycles_active.sum cycle 4,543,321 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:26, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.37 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.45 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.98 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,078.56 + smsp__inst_executed.max inst 13,535 + smsp__inst_executed.min inst 12,643 + smsp__inst_executed.sum inst 837,028 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,968 + smsp__cycles_active.sum cycle 4,541,952 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:26, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.87 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.40 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.97 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,101.59 + smsp__inst_executed.max inst 13,450 + smsp__inst_executed.min inst 12,686 + smsp__inst_executed.sum inst 838,502 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,303.42 + smsp__cycles_active.sum cycle 4,691,419 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:26, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 262.02 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 50,043.62 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 50,844 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 49,240 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 800,698 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 28,261.31 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 28,739 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 27,770 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 452,181 + smsp__average_warp_latency_issue_stalled_barrier.pct % 146,341.85 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,463.42 + smsp__inst_executed.avg inst 201,989.12 + smsp__inst_executed.max inst 205,318 + smsp__inst_executed.min inst 198,758 + smsp__inst_executed.sum inst 12,927,304 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 13.44 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 357,843.03 + smsp__cycles_active.sum cycle 22,901,954 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:27, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.66 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,058.53 + smsp__inst_executed.max inst 13,526 + smsp__inst_executed.min inst 12,306 + smsp__inst_executed.sum inst 835,746 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,787.86 + smsp__cycles_active.sum cycle 4,594,423 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:27, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.24 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.47 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,059.02 + smsp__inst_executed.max inst 13,478 + smsp__inst_executed.min inst 12,830 + smsp__inst_executed.sum inst 835,777 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,028.17 + smsp__cycles_active.sum cycle 4,545,803 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:27, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.43 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.99 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.44 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,065.78 + smsp__inst_executed.max inst 13,438 + smsp__inst_executed.min inst 12,574 + smsp__inst_executed.sum inst 836,210 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,904.80 + smsp__cycles_active.sum cycle 4,537,907 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:27, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.27 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.45 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.98 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,078.08 + smsp__inst_executed.max inst 13,449 + smsp__inst_executed.min inst 12,852 + smsp__inst_executed.sum inst 836,997 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,550.70 + smsp__cycles_active.sum cycle 4,515,245 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:27, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.81 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.40 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.97 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,101.52 + smsp__inst_executed.max inst 13,598 + smsp__inst_executed.min inst 12,493 + smsp__inst_executed.sum inst 838,497 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,532.58 + smsp__cycles_active.sum cycle 4,706,085 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:27, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 262.91 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 50,053.50 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 50,874 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 49,224 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 800,856 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 28,271 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 28,836 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 27,727 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 452,336 + smsp__average_warp_latency_issue_stalled_barrier.pct % 146,501.01 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,465.01 + smsp__inst_executed.avg inst 202,004.09 + smsp__inst_executed.max inst 205,252 + smsp__inst_executed.min inst 198,682 + smsp__inst_executed.sum inst 12,928,262 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 13.39 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 359,682.91 + smsp__cycles_active.sum cycle 23,019,706 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:27, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.35 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,057.83 + smsp__inst_executed.max inst 13,496 + smsp__inst_executed.min inst 12,644 + smsp__inst_executed.sum inst 835,701 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,331.45 + smsp__cycles_active.sum cycle 4,693,213 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:27, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.40 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,059.84 + smsp__inst_executed.max inst 13,458 + smsp__inst_executed.min inst 12,796 + smsp__inst_executed.sum inst 835,830 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,956.33 + smsp__cycles_active.sum cycle 4,541,205 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:27, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.21 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,061.11 + smsp__inst_executed.max inst 13,295 + smsp__inst_executed.min inst 12,718 + smsp__inst_executed.sum inst 835,911 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,879.30 + smsp__cycles_active.sum cycle 4,536,275 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:27, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.62 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.47 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.99 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.42 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,069.02 + smsp__inst_executed.max inst 13,573 + smsp__inst_executed.min inst 12,724 + smsp__inst_executed.sum inst 836,417 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,109.20 + smsp__cycles_active.sum cycle 4,550,989 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:27, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.46 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.45 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.98 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,077.70 + smsp__inst_executed.max inst 13,477 + smsp__inst_executed.min inst 12,610 + smsp__inst_executed.sum inst 836,973 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,790.67 + smsp__cycles_active.sum cycle 4,530,603 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:28, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.40 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.97 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,100.05 + smsp__inst_executed.max inst 13,648 + smsp__inst_executed.min inst 12,388 + smsp__inst_executed.sum inst 838,403 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,601.83 + smsp__cycles_active.sum cycle 4,710,517 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:28, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 262.14 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 50,043.62 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 50,840 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 49,234 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 800,698 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 28,256.06 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 28,782 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 27,767 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 452,097 + smsp__average_warp_latency_issue_stalled_barrier.pct % 146,621.11 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,466.21 + smsp__inst_executed.avg inst 201,979.69 + smsp__inst_executed.max inst 205,239 + smsp__inst_executed.min inst 198,743 + smsp__inst_executed.sum inst 12,926,700 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 13.42 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 359,167 + smsp__cycles_active.sum cycle 22,986,688 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:28, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.78 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,054.70 + smsp__inst_executed.max inst 13,370 + smsp__inst_executed.min inst 12,868 + smsp__inst_executed.sum inst 835,501 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,049 + smsp__cycles_active.sum cycle 4,611,136 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:28, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.06 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,056.12 + smsp__inst_executed.max inst 13,659 + smsp__inst_executed.min inst 12,536 + smsp__inst_executed.sum inst 835,592 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,524.42 + smsp__cycles_active.sum cycle 4,705,563 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:28, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.34 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,057.20 + smsp__inst_executed.max inst 13,566 + smsp__inst_executed.min inst 12,672 + smsp__inst_executed.sum inst 835,661 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,965.64 + smsp__cycles_active.sum cycle 4,541,801 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:28, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.14 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,061.50 + smsp__inst_executed.max inst 13,438 + smsp__inst_executed.min inst 12,632 + smsp__inst_executed.sum inst 835,936 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,155.02 + smsp__cycles_active.sum cycle 4,553,921 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:28, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.56 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.99 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,068.72 + smsp__inst_executed.max inst 13,420 + smsp__inst_executed.min inst 12,724 + smsp__inst_executed.sum inst 836,398 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,460.50 + smsp__cycles_active.sum cycle 4,573,472 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:28, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.37 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.99 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.37 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,078.17 + smsp__inst_executed.max inst 13,504 + smsp__inst_executed.min inst 12,664 + smsp__inst_executed.sum inst 837,003 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,057.59 + smsp__cycles_active.sum cycle 4,547,686 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:28, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.49 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.41 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.97 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.23 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.92 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,098.28 + smsp__inst_executed.max inst 13,569 + smsp__inst_executed.min inst 12,427 + smsp__inst_executed.sum inst 838,290 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,038.50 + smsp__cycles_active.sum cycle 4,674,464 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:28, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 261.92 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 50,049.25 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 50,926 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 49,284 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 800,788 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 28,288.69 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 28,900 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 27,826 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 452,619 + smsp__average_warp_latency_issue_stalled_barrier.pct % 147,924.16 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,479.24 + smsp__inst_executed.avg inst 202,010.41 + smsp__inst_executed.max inst 205,310 + smsp__inst_executed.min inst 198,801 + smsp__inst_executed.sum inst 12,928,666 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 13.53 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.14 + smsp__cycles_active.avg cycle 359,403.17 + smsp__cycles_active.sum cycle 23,001,803 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:28, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.88 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,056.27 + smsp__inst_executed.max inst 13,430 + smsp__inst_executed.min inst 12,688 + smsp__inst_executed.sum inst 835,601 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,117.38 + smsp__cycles_active.sum cycle 4,615,512 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:29, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.10 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,057.19 + smsp__inst_executed.max inst 13,468 + smsp__inst_executed.min inst 12,648 + smsp__inst_executed.sum inst 835,660 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,818.41 + smsp__cycles_active.sum cycle 4,596,378 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:29, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.97 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,056.58 + smsp__inst_executed.max inst 13,392 + smsp__inst_executed.min inst 12,796 + smsp__inst_executed.sum inst 835,621 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,603.70 + smsp__cycles_active.sum cycle 4,710,637 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:29, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.69 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,060.06 + smsp__inst_executed.max inst 13,759 + smsp__inst_executed.min inst 12,626 + smsp__inst_executed.sum inst 835,844 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,422.75 + smsp__cycles_active.sum cycle 4,571,056 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:29, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.34 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,061.48 + smsp__inst_executed.max inst 13,504 + smsp__inst_executed.min inst 12,727 + smsp__inst_executed.sum inst 835,935 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,423.25 + smsp__cycles_active.sum cycle 4,571,088 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:29, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.40 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.99 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,065.03 + smsp__inst_executed.max inst 13,452 + smsp__inst_executed.min inst 12,618 + smsp__inst_executed.sum inst 836,162 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,757.98 + smsp__cycles_active.sum cycle 4,528,511 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:29, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.34 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.45 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.98 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,077.34 + smsp__inst_executed.max inst 13,374 + smsp__inst_executed.min inst 12,676 + smsp__inst_executed.sum inst 836,950 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,080.64 + smsp__cycles_active.sum cycle 4,549,161 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:29, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.87 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.40 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.97 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,098.39 + smsp__inst_executed.max inst 13,471 + smsp__inst_executed.min inst 12,696 + smsp__inst_executed.sum inst 838,297 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,613.73 + smsp__cycles_active.sum cycle 4,711,279 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:29, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 262.14 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 50,044 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 50,924 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 49,230 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 800,704 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 28,279.81 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 28,930 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 27,741 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 452,477 + smsp__average_warp_latency_issue_stalled_barrier.pct % 147,875.18 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,478.75 + smsp__inst_executed.avg inst 201,980.91 + smsp__inst_executed.max inst 205,280 + smsp__inst_executed.min inst 198,678 + smsp__inst_executed.sum inst 12,926,778 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 13.53 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.14 + smsp__cycles_active.avg cycle 359,333.61 + smsp__cycles_active.sum cycle 22,997,351 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:29, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.90 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,054.95 + smsp__inst_executed.max inst 13,496 + smsp__inst_executed.min inst 12,616 + smsp__inst_executed.sum inst 835,517 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 74,933.08 + smsp__cycles_active.sum cycle 4,795,717 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:29, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.10 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,057.78 + smsp__inst_executed.max inst 13,340 + smsp__inst_executed.min inst 12,836 + smsp__inst_executed.sum inst 835,698 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,251.72 + smsp__cycles_active.sum cycle 4,624,110 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:30, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.10 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,057.56 + smsp__inst_executed.max inst 13,451 + smsp__inst_executed.min inst 12,476 + smsp__inst_executed.sum inst 835,684 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,830.58 + smsp__cycles_active.sum cycle 4,597,157 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:30, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.35 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,056.78 + smsp__inst_executed.max inst 13,655 + smsp__inst_executed.min inst 12,680 + smsp__inst_executed.sum inst 835,634 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,102.31 + smsp__cycles_active.sum cycle 4,678,548 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:30, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.82 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,058.23 + smsp__inst_executed.max inst 13,683 + smsp__inst_executed.min inst 12,462 + smsp__inst_executed.sum inst 835,727 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,104.28 + smsp__cycles_active.sum cycle 4,550,674 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:30, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.62 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,061.97 + smsp__inst_executed.max inst 13,440 + smsp__inst_executed.min inst 12,672 + smsp__inst_executed.sum inst 835,966 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,301.42 + smsp__cycles_active.sum cycle 4,563,291 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:30, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.62 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.99 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.44 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,066.27 + smsp__inst_executed.max inst 13,392 + smsp__inst_executed.min inst 12,467 + smsp__inst_executed.sum inst 836,241 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,628.64 + smsp__cycles_active.sum cycle 4,584,233 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:30, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.37 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.45 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.98 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,080.28 + smsp__inst_executed.max inst 13,453 + smsp__inst_executed.min inst 12,667 + smsp__inst_executed.sum inst 837,138 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,083.31 + smsp__cycles_active.sum cycle 4,549,332 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:30, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.71 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.40 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.97 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,107.33 + smsp__inst_executed.max inst 13,571 + smsp__inst_executed.min inst 12,788 + smsp__inst_executed.sum inst 838,869 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,305.81 + smsp__cycles_active.sum cycle 4,691,572 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:30, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 261.66 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 50,049.62 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 50,902 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 49,172 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 800,794 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 28,273.25 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 28,813 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 27,665 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 452,372 + smsp__average_warp_latency_issue_stalled_barrier.pct % 146,879.79 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,468.80 + smsp__inst_executed.avg inst 202,028.11 + smsp__inst_executed.max inst 205,339 + smsp__inst_executed.min inst 198,670 + smsp__inst_executed.sum inst 12,929,799 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 13.45 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 358,847.58 + smsp__cycles_active.sum cycle 22,966,245 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:30, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.34 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,055.41 + smsp__inst_executed.max inst 13,496 + smsp__inst_executed.min inst 12,632 + smsp__inst_executed.sum inst 835,546 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 69,485.11 + smsp__cycles_active.sum cycle 4,447,047 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:30, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.83 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,054.95 + smsp__inst_executed.max inst 13,280 + smsp__inst_executed.min inst 12,644 + smsp__inst_executed.sum inst 835,517 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 74,500.81 + smsp__cycles_active.sum cycle 4,768,052 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:31, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.66 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,056.09 + smsp__inst_executed.max inst 13,460 + smsp__inst_executed.min inst 12,580 + smsp__inst_executed.sum inst 835,590 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,786.02 + smsp__cycles_active.sum cycle 4,594,305 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:31, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.20 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,056.41 + smsp__inst_executed.max inst 13,448 + smsp__inst_executed.min inst 12,688 + smsp__inst_executed.sum inst 835,610 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,242.58 + smsp__cycles_active.sum cycle 4,623,525 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:31, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.16 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,056.45 + smsp__inst_executed.max inst 13,638 + smsp__inst_executed.min inst 12,668 + smsp__inst_executed.sum inst 835,613 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,627.94 + smsp__cycles_active.sum cycle 4,712,188 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:31, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.69 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,060.33 + smsp__inst_executed.max inst 13,688 + smsp__inst_executed.min inst 12,548 + smsp__inst_executed.sum inst 835,861 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,508 + smsp__cycles_active.sum cycle 4,576,512 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:31, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.66 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4.00 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,058.94 + smsp__inst_executed.max inst 13,522 + smsp__inst_executed.min inst 12,708 + smsp__inst_executed.sum inst 835,772 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,081.84 + smsp__cycles_active.sum cycle 4,549,238 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:31, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.69 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.99 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.44 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,069.42 + smsp__inst_executed.max inst 13,618 + smsp__inst_executed.min inst 12,322 + smsp__inst_executed.sum inst 836,443 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,235.78 + smsp__cycles_active.sum cycle 4,559,090 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:31, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.21 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.45 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.99 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,078.45 + smsp__inst_executed.max inst 13,414 + smsp__inst_executed.min inst 12,640 + smsp__inst_executed.sum inst 837,021 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,165.06 + smsp__cycles_active.sum cycle 4,554,564 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:31, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.55 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.40 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 3.97 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 13,100.33 + smsp__inst_executed.max inst 13,504 + smsp__inst_executed.min inst 12,711 + smsp__inst_executed.sum inst 838,421 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,508.81 + smsp__cycles_active.sum cycle 4,704,564 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:41:31, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 261.60 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 50,050.25 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 51,026 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 49,180 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 800,804 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 28,281.12 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 29,045 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 27,669 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 452,498 + smsp__average_warp_latency_issue_stalled_barrier.pct % 146,579.33 + smsp__average_warp_latency_issue_stalled_barrier.ratio 1,465.79 + smsp__inst_executed.avg inst 201,758.88 + smsp__inst_executed.max inst 205,041 + smsp__inst_executed.min inst 198,447 + smsp__inst_executed.sum inst 12,912,568 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 13.46 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13 + smsp__cycles_active.avg cycle 358,041.27 + smsp__cycles_active.sum cycle 22,914,641 + ---------------------------------------------------------------------- --------------- ------------------------------ + diff --git a/homework_3/analyse/RC3/profReportv2_B-CodeRefactor.txt b/homework_3/analyse/RC3/profReportv2_B-CodeRefactor.txt new file mode 100644 index 0000000..38cbdab --- /dev/null +++ b/homework_3/analyse/RC3/profReportv2_B-CodeRefactor.txt @@ -0,0 +1,2049 @@ +==PROF== Connected to process 100431 (/home/hoo2/Work/AUTH/PDS/homework_3/out/v2/bitonicCUDA) +==PROF== Profiling "prephase" - 1: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 2: 0%....50%....100% - 6 passes +==PROF== Profiling "inBlockStep" - 3: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 4: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 5: 0%....50%....100% - 6 passes +==PROF== Profiling "inBlockStep" - 6: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 7: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 8: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 9: 0%....50%....100% - 6 passes +==PROF== Profiling "inBlockStep" - 10: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 11: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 12: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 13: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 14: 0%....50%....100% - 6 passes +==PROF== Profiling "inBlockStep" - 15: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 16: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 17: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 18: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 19: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 20: 0%....50%....100% - 6 passes +==PROF== Profiling "inBlockStep" - 21: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 22: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 23: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 24: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 25: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 26: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 27: 0%....50%....100% - 6 passes +==PROF== Profiling "inBlockStep" - 28: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 29: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 30: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 31: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 32: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 33: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 34: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 35: 0%....50%....100% - 6 passes +==PROF== Profiling "inBlockStep" - 36: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 37: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 38: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 39: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 40: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 41: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 42: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 43: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 44: 0%....50%....100% - 6 passes +==PROF== Profiling "inBlockStep" - 45: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 46: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 47: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 48: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 49: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 50: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 51: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 52: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 53: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 54: 0%....50%....100% - 6 passes +==PROF== Profiling "inBlockStep" - 55: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 56: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 57: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 58: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 59: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 60: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 61: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 62: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 63: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 64: 0%....50%....100% - 6 passes +==PROF== Profiling "interBlockStep" - 65: 0%....50%....100% - 6 passes +==PROF== Profiling "inBlockStep" - 66: 0%....50%....100% - 6 passes +==PROF== Disconnected from process 100431 +[100431] bitonicCUDA@127.0.0.1 + void prephase(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:33, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum msecond 2.56 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 237,568 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 237,568 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 237,568 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 3,801,088 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 209,070.94 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 209,334 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 208,875 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 3,345,135 + smsp__average_warp_latency_issue_stalled_barrier.pct % 1,692,604.61 + smsp__average_warp_latency_issue_stalled_barrier.ratio 16,926.05 + smsp__inst_executed.avg inst 1,953,951.83 + smsp__inst_executed.max inst 1,954,175 + smsp__inst_executed.min inst 1,953,723 + smsp__inst_executed.sum inst 125,052,917 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 15.35 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.15 + smsp__cycles_active.avg cycle 3,559,774.03 + smsp__cycles_active.sum cycle 227,825,538 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:33, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.90 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,309.03 + smsp__inst_executed.max inst 12,569 + smsp__inst_executed.min inst 11,654 + smsp__inst_executed.sum inst 787,778 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,062.50 + smsp__cycles_active.sum cycle 4,676,000 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:33, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 435.49 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 31,913.38 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 32,394 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 31,370 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 510,614 + smsp__average_warp_latency_issue_stalled_barrier.pct % 353,960.85 + smsp__average_warp_latency_issue_stalled_barrier.ratio 3,539.61 + smsp__inst_executed.avg inst 313,131.16 + smsp__inst_executed.max inst 313,277 + smsp__inst_executed.min inst 312,868 + smsp__inst_executed.sum inst 20,040,394 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 19.30 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.19 + smsp__cycles_active.avg cycle 598,137.94 + smsp__cycles_active.sum cycle 38,280,828 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:33, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.24 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.58 + smsp__inst_executed.max inst 12,573 + smsp__inst_executed.min inst 12,056 + smsp__inst_executed.sum inst 787,109 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,585.64 + smsp__cycles_active.sum cycle 4,517,481 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:33, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.94 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.23 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,308.78 + smsp__inst_executed.max inst 12,547 + smsp__inst_executed.min inst 12,011 + smsp__inst_executed.sum inst 787,762 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,957.47 + smsp__cycles_active.sum cycle 4,669,278 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:33, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 434.34 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 31,894.31 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 32,021 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 31,779 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 510,309 + smsp__average_warp_latency_issue_stalled_barrier.pct % 351,861.41 + smsp__average_warp_latency_issue_stalled_barrier.ratio 3,518.61 + smsp__inst_executed.avg inst 313,124.58 + smsp__inst_executed.max inst 313,358 + smsp__inst_executed.min inst 312,849 + smsp__inst_executed.sum inst 20,039,973 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 19.19 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.19 + smsp__cycles_active.avg cycle 597,902.02 + smsp__cycles_active.sum cycle 38,265,729 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:33, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.30 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.42 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.72 + smsp__inst_executed.max inst 12,585 + smsp__inst_executed.min inst 11,832 + smsp__inst_executed.sum inst 786,798 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,044.09 + smsp__cycles_active.sum cycle 4,546,822 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:33, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.11 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,297.98 + smsp__inst_executed.max inst 12,581 + smsp__inst_executed.min inst 11,984 + smsp__inst_executed.sum inst 787,071 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,504.80 + smsp__cycles_active.sum cycle 4,512,307 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.74 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,309.89 + smsp__inst_executed.max inst 12,884 + smsp__inst_executed.min inst 11,903 + smsp__inst_executed.sum inst 787,833 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,146.27 + smsp__cycles_active.sum cycle 4,681,361 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 434.91 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 31,897.94 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 32,370 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 31,387 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 510,367 + smsp__average_warp_latency_issue_stalled_barrier.pct % 355,060.44 + smsp__average_warp_latency_issue_stalled_barrier.ratio 3,550.60 + smsp__inst_executed.avg inst 313,126.61 + smsp__inst_executed.max inst 313,456 + smsp__inst_executed.min inst 312,810 + smsp__inst_executed.sum inst 20,040,103 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 19.38 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.19 + smsp__cycles_active.avg cycle 597,783.91 + smsp__cycles_active.sum cycle 38,258,170 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.30 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.58 + smsp__inst_executed.max inst 12,562 + smsp__inst_executed.min inst 11,884 + smsp__inst_executed.sum inst 786,597 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,651.47 + smsp__cycles_active.sum cycle 4,521,694 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.34 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.42 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.06 + smsp__inst_executed.max inst 12,557 + smsp__inst_executed.min inst 11,928 + smsp__inst_executed.sum inst 786,756 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,246.70 + smsp__cycles_active.sum cycle 4,559,789 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.18 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.75 + smsp__inst_executed.max inst 12,670 + smsp__inst_executed.min inst 11,749 + smsp__inst_executed.sum inst 787,120 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,653.20 + smsp__cycles_active.sum cycle 4,521,805 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.78 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,309.36 + smsp__inst_executed.max inst 12,811 + smsp__inst_executed.min inst 11,692 + smsp__inst_executed.sum inst 787,799 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,714.31 + smsp__cycles_active.sum cycle 4,653,716 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 434.11 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 31,905.69 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 31,983 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 31,807 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 510,491 + smsp__average_warp_latency_issue_stalled_barrier.pct % 354,022.84 + smsp__average_warp_latency_issue_stalled_barrier.ratio 3,540.23 + smsp__inst_executed.avg inst 313,135.20 + smsp__inst_executed.max inst 318,231 + smsp__inst_executed.min inst 308,095 + smsp__inst_executed.sum inst 20,040,653 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 19.31 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.19 + smsp__cycles_active.avg cycle 597,940.59 + smsp__cycles_active.sum cycle 38,268,198 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.92 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.25 + smsp__inst_executed.max inst 12,640 + smsp__inst_executed.min inst 11,916 + smsp__inst_executed.sum inst 786,512 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,480.17 + smsp__cycles_active.sum cycle 4,510,731 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.43 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.56 + smsp__inst_executed.max inst 12,505 + smsp__inst_executed.min inst 12,076 + smsp__inst_executed.sum inst 786,596 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,671.28 + smsp__cycles_active.sum cycle 4,522,962 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.59 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,292.92 + smsp__inst_executed.max inst 12,757 + smsp__inst_executed.min inst 11,856 + smsp__inst_executed.sum inst 786,747 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,114.81 + smsp__cycles_active.sum cycle 4,551,348 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.98 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.41 + smsp__inst_executed.max inst 12,687 + smsp__inst_executed.min inst 11,920 + smsp__inst_executed.sum inst 787,098 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,416.27 + smsp__cycles_active.sum cycle 4,506,641 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.68 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,308.94 + smsp__inst_executed.max inst 12,697 + smsp__inst_executed.min inst 11,640 + smsp__inst_executed.sum inst 787,772 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,201.34 + smsp__cycles_active.sum cycle 4,684,886 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 433.86 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 31,913.81 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 31,996 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 31,782 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 510,621 + smsp__average_warp_latency_issue_stalled_barrier.pct % 354,697.47 + smsp__average_warp_latency_issue_stalled_barrier.ratio 3,546.97 + smsp__inst_executed.avg inst 313,117.39 + smsp__inst_executed.max inst 318,197 + smsp__inst_executed.min inst 308,095 + smsp__inst_executed.sum inst 20,039,513 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 19.33 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.19 + smsp__cycles_active.avg cycle 598,408.55 + smsp__cycles_active.sum cycle 38,298,147 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.03 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.61 + smsp__inst_executed.max inst 12,676 + smsp__inst_executed.min inst 11,864 + smsp__inst_executed.sum inst 786,471 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,508.12 + smsp__cycles_active.sum cycle 4,704,520 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.21 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.09 + smsp__inst_executed.max inst 12,660 + smsp__inst_executed.min inst 12,078 + smsp__inst_executed.sum inst 786,566 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,077.06 + smsp__cycles_active.sum cycle 4,548,932 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.24 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.83 + smsp__inst_executed.max inst 12,628 + smsp__inst_executed.min inst 11,908 + smsp__inst_executed.sum inst 786,549 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,525.67 + smsp__cycles_active.sum cycle 4,513,643 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.66 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.44 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.34 + smsp__inst_executed.max inst 12,521 + smsp__inst_executed.min inst 11,630 + smsp__inst_executed.sum inst 786,774 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,924.95 + smsp__cycles_active.sum cycle 4,539,197 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.92 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,297.78 + smsp__inst_executed.max inst 12,697 + smsp__inst_executed.min inst 12,067 + smsp__inst_executed.sum inst 787,058 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,093.91 + smsp__cycles_active.sum cycle 4,550,010 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.62 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,308.45 + smsp__inst_executed.max inst 12,732 + smsp__inst_executed.min inst 11,792 + smsp__inst_executed.sum inst 787,741 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,599.72 + smsp__cycles_active.sum cycle 4,646,382 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 434.43 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 31,869.75 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 32,359 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 31,446 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 509,916 + smsp__average_warp_latency_issue_stalled_barrier.pct % 352,782.28 + smsp__average_warp_latency_issue_stalled_barrier.ratio 3,527.82 + smsp__inst_executed.avg inst 313,121.81 + smsp__inst_executed.max inst 313,218 + smsp__inst_executed.min inst 312,990 + smsp__inst_executed.sum inst 20,039,796 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 19.20 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.19 + smsp__cycles_active.avg cycle 599,438.23 + smsp__cycles_active.sum cycle 38,364,047 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:35, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.07 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.05 + smsp__inst_executed.max inst 12,492 + smsp__inst_executed.min inst 11,906 + smsp__inst_executed.sum inst 786,435 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,595.95 + smsp__cycles_active.sum cycle 4,582,141 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.22 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.66 + smsp__inst_executed.max inst 12,512 + smsp__inst_executed.min inst 11,704 + smsp__inst_executed.sum inst 786,474 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,743.53 + smsp__cycles_active.sum cycle 4,655,586 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.18 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.08 + smsp__inst_executed.max inst 12,463 + smsp__inst_executed.min inst 11,886 + smsp__inst_executed.sum inst 786,501 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,541.78 + smsp__cycles_active.sum cycle 4,514,674 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.18 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.47 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.53 + smsp__inst_executed.max inst 12,514 + smsp__inst_executed.min inst 12,088 + smsp__inst_executed.sum inst 786,594 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,334.75 + smsp__cycles_active.sum cycle 4,501,424 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.62 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,293.53 + smsp__inst_executed.max inst 12,555 + smsp__inst_executed.min inst 11,987 + smsp__inst_executed.sum inst 786,786 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,984.45 + smsp__cycles_active.sum cycle 4,543,005 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.82 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.67 + smsp__inst_executed.max inst 12,569 + smsp__inst_executed.min inst 11,918 + smsp__inst_executed.sum inst 787,115 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,573.64 + smsp__cycles_active.sum cycle 4,516,713 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.30 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,308.23 + smsp__inst_executed.max inst 12,720 + smsp__inst_executed.min inst 11,718 + smsp__inst_executed.sum inst 787,727 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,732.94 + smsp__cycles_active.sum cycle 4,654,908 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 434.11 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 31,883.31 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 32,395 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 31,412 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 510,133 + smsp__average_warp_latency_issue_stalled_barrier.pct % 354,700.81 + smsp__average_warp_latency_issue_stalled_barrier.ratio 3,547.01 + smsp__inst_executed.avg inst 313,122.11 + smsp__inst_executed.max inst 318,197 + smsp__inst_executed.min inst 308,040 + smsp__inst_executed.sum inst 20,039,815 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 19.35 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.19 + smsp__cycles_active.avg cycle 597,850.22 + smsp__cycles_active.sum cycle 38,262,414 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.23 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,287.16 + smsp__inst_executed.max inst 12,688 + smsp__inst_executed.min inst 11,888 + smsp__inst_executed.sum inst 786,378 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,841.34 + smsp__cycles_active.sum cycle 4,597,846 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.75 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.83 + smsp__inst_executed.max inst 12,660 + smsp__inst_executed.min inst 11,928 + smsp__inst_executed.sum inst 786,485 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,607.83 + smsp__cycles_active.sum cycle 4,582,901 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.87 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.42 + smsp__inst_executed.max inst 12,700 + smsp__inst_executed.min inst 11,680 + smsp__inst_executed.sum inst 786,459 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,464.98 + smsp__cycles_active.sum cycle 4,701,759 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.21 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.50 + smsp__inst_executed.max inst 12,656 + smsp__inst_executed.min inst 11,732 + smsp__inst_executed.sum inst 786,528 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,751.97 + smsp__cycles_active.sum cycle 4,528,126 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.05 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.84 + smsp__inst_executed.max inst 12,520 + smsp__inst_executed.min inst 12,034 + smsp__inst_executed.sum inst 786,614 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,176.25 + smsp__cycles_active.sum cycle 4,555,280 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.56 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,292.58 + smsp__inst_executed.max inst 12,547 + smsp__inst_executed.min inst 11,776 + smsp__inst_executed.sum inst 786,725 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,971.50 + smsp__cycles_active.sum cycle 4,542,176 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.14 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.37 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.45 + smsp__inst_executed.max inst 12,555 + smsp__inst_executed.min inst 12,073 + smsp__inst_executed.sum inst 787,101 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,221.45 + smsp__cycles_active.sum cycle 4,494,173 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.36 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,308.89 + smsp__inst_executed.max inst 12,584 + smsp__inst_executed.min inst 11,684 + smsp__inst_executed.sum inst 787,769 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,809.75 + smsp__cycles_active.sum cycle 4,659,824 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 435.17 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 31,881.75 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 32,405 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 31,298 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 510,108 + smsp__average_warp_latency_issue_stalled_barrier.pct % 351,267.26 + smsp__average_warp_latency_issue_stalled_barrier.ratio 3,512.67 + smsp__inst_executed.avg inst 313,146.62 + smsp__inst_executed.max inst 318,289 + smsp__inst_executed.min inst 308,129 + smsp__inst_executed.sum inst 20,041,384 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 19.17 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.19 + smsp__cycles_active.avg cycle 597,515.77 + smsp__cycles_active.sum cycle 38,241,009 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.70 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.25 + smsp__inst_executed.max inst 12,664 + smsp__inst_executed.min inst 11,904 + smsp__inst_executed.sum inst 786,448 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 74,531.66 + smsp__cycles_active.sum cycle 4,770,026 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.82 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.38 + smsp__inst_executed.max inst 12,490 + smsp__inst_executed.min inst 12,092 + smsp__inst_executed.sum inst 786,456 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,103.58 + smsp__cycles_active.sum cycle 4,614,629 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.69 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.58 + smsp__inst_executed.max inst 12,486 + smsp__inst_executed.min inst 11,896 + smsp__inst_executed.sum inst 786,469 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,726.84 + smsp__cycles_active.sum cycle 4,590,518 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.86 + smsp__inst_executed.max inst 12,664 + smsp__inst_executed.min inst 11,716 + smsp__inst_executed.sum inst 786,487 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 73,256.52 + smsp__cycles_active.sum cycle 4,688,417 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.18 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,289.66 + smsp__inst_executed.max inst 12,852 + smsp__inst_executed.min inst 11,702 + smsp__inst_executed.sum inst 786,538 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,927.86 + smsp__cycles_active.sum cycle 4,539,383 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.37 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,291.56 + smsp__inst_executed.max inst 12,652 + smsp__inst_executed.min inst 11,912 + smsp__inst_executed.sum inst 786,660 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,620.88 + smsp__cycles_active.sum cycle 4,519,736 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.01 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.44 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,294.31 + smsp__inst_executed.max inst 12,766 + smsp__inst_executed.min inst 11,720 + smsp__inst_executed.sum inst 786,836 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,559.72 + smsp__cycles_active.sum cycle 4,579,822 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.18 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,301.61 + smsp__inst_executed.max inst 12,512 + smsp__inst_executed.min inst 12,076 + smsp__inst_executed.sum inst 787,303 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,958.73 + smsp__cycles_active.sum cycle 4,541,359 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.42 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,314.73 + smsp__inst_executed.max inst 12,723 + smsp__inst_executed.min inst 11,867 + smsp__inst_executed.sum inst 788,143 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,355.83 + smsp__cycles_active.sum cycle 4,630,773 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 433.79 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 31,889.94 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 32,543 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 31,317 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 510,239 + smsp__average_warp_latency_issue_stalled_barrier.pct % 353,670.79 + smsp__average_warp_latency_issue_stalled_barrier.ratio 3,536.71 + smsp__inst_executed.avg inst 313,134.42 + smsp__inst_executed.max inst 318,339 + smsp__inst_executed.min inst 308,068 + smsp__inst_executed.sum inst 20,040,603 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 19.31 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.19 + smsp__cycles_active.avg cycle 597,595.56 + smsp__cycles_active.sum cycle 38,246,116 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 56.96 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,287.81 + smsp__inst_executed.max inst 12,680 + smsp__inst_executed.min inst 11,900 + smsp__inst_executed.sum inst 786,420 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 68,992.86 + smsp__cycles_active.sum cycle 4,415,543 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.80 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.44 + smsp__inst_executed.max inst 12,672 + smsp__inst_executed.min inst 11,908 + smsp__inst_executed.sum inst 786,460 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 74,819.06 + smsp__cycles_active.sum cycle 4,788,420 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.14 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,287.36 + smsp__inst_executed.max inst 12,484 + smsp__inst_executed.min inst 12,088 + smsp__inst_executed.sum inst 786,391 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,808.34 + smsp__cycles_active.sum cycle 4,595,734 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.94 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.64 + smsp__inst_executed.max inst 12,656 + smsp__inst_executed.min inst 12,092 + smsp__inst_executed.sum inst 786,473 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,451.16 + smsp__cycles_active.sum cycle 4,636,874 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 60.54 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.56 + smsp__inst_executed.max inst 12,677 + smsp__inst_executed.min inst 11,720 + smsp__inst_executed.sum inst 786,468 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,851.78 + smsp__cycles_active.sum cycle 4,662,514 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 57.98 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,288.55 + smsp__inst_executed.max inst 12,492 + smsp__inst_executed.min inst 11,924 + smsp__inst_executed.sum inst 786,467 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,639.48 + smsp__cycles_active.sum cycle 4,520,927 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.11 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,290.67 + smsp__inst_executed.max inst 12,540 + smsp__inst_executed.min inst 12,048 + smsp__inst_executed.sum inst 786,603 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,134.56 + smsp__cycles_active.sum cycle 4,552,612 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:39, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.56 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,292.61 + smsp__inst_executed.max inst 12,727 + smsp__inst_executed.min inst 11,881 + smsp__inst_executed.sum inst 786,727 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 71,327.62 + smsp__cycles_active.sum cycle 4,564,968 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:39, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 58.30 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,298.61 + smsp__inst_executed.max inst 12,810 + smsp__inst_executed.min inst 11,926 + smsp__inst_executed.sum inst 787,111 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 70,631.98 + smsp__cycles_active.sum cycle 4,520,447 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void interBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:39, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 59.52 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0 + smsp__average_warp_latency_issue_stalled_barrier.pct % 0 + smsp__average_warp_latency_issue_stalled_barrier.ratio 0 + smsp__inst_executed.avg inst 12,309.06 + smsp__inst_executed.max inst 12,561 + smsp__inst_executed.min inst 12,043 + smsp__inst_executed.sum inst 787,780 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0 + smsp__cycles_active.avg cycle 72,662.08 + smsp__cycles_active.sum cycle 4,650,373 + ---------------------------------------------------------------------- --------------- ------------------------------ + + void inBlockStep(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:39, Context 1, Stream 7 + Section: Command line profiler metrics + ---------------------------------------------------------------------- --------------- ------------------------------ + gpu__time_duration.sum usecond 436.35 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0 + l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50 + l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 31,827.38 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 32,385 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 31,390 + l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 509,238 + smsp__average_warp_latency_issue_stalled_barrier.pct % 358,185.65 + smsp__average_warp_latency_issue_stalled_barrier.ratio 3,581.86 + smsp__inst_executed.avg inst 312,880.55 + smsp__inst_executed.max inst 317,962 + smsp__inst_executed.min inst 307,889 + smsp__inst_executed.sum inst 20,024,355 + smsp__warp_issue_stalled_barrier_per_warp_active.pct % 19.51 + smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.20 + smsp__cycles_active.avg cycle 598,707.28 + smsp__cycles_active.sum cycle 38,317,266 + ---------------------------------------------------------------------- --------------- ------------------------------ + diff --git a/homework_3/src/bitonicsort.hpp b/homework_3/src/bitonicsort.hpp index 98c8904..a4bb9be 100644 --- a/homework_3/src/bitonicsort.hpp +++ b/homework_3/src/bitonicsort.hpp @@ -112,8 +112,8 @@ template __device__ void exchange(ValueT* data, threadId_t tid, threadId_t partner, bool keepSmall) { if (( keepSmall && (data[tid] > data[partner])) || (!keepSmall && (data[tid] < data[partner])) ) { - ValueT temp = data[tid]; - data[tid] = data[partner]; + ValueT temp = data[tid]; + data[tid] = data[partner]; data[partner] = temp; } } @@ -378,6 +378,7 @@ __global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage } } + /*! * This is unrolled part of the bitonic double loop. * @@ -399,42 +400,44 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t st * Here we skip blocks every time (one for SizeToThreadsRatio = 2) * And we cache the neighbor block address indexes in local (shared) memory */ - threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x; - threadId_t lIdx0 = toLocal(gIdx0, blockDim.x); + threadId_t gIdx = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x; + threadId_t lIdx = toLocal(gIdx, blockDim.x); - if (gIdx0 + blockDim.x >= n) // Boundary check + if (gIdx + blockDim.x >= n) // Boundary check return; // Fetch to local memory the entire effective block size (2 positions for each thread) - shared_data[lIdx0] = data[gIdx0]; - shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x]; + shared_data[lIdx] = data[gIdx]; + shared_data[lIdx + blockDim.x] = data[gIdx + blockDim.x]; __syncthreads(); for (size_t step = innerSteps + 1; step > 0; ) { --step; - // Init thread global and local indices - threadId_t gIdx = gIdx0; - threadId_t lIdx = lIdx0; // Find partner and keep-small configuration based on the global data positions threadId_t pIdx = partner(gIdx, step); if (gIdx > pIdx) { - // Shift inside effective block - gIdx += blockDim.x; // global - pIdx += blockDim.x; - lIdx += blockDim.x; // local + // Work on the right site + bool keep = keepSmall(gIdx + blockDim.x, pIdx + blockDim.x, stage); + + // Exchange data on local(shared) copy + threadId_t lpIdx = toLocal(pIdx + blockDim.x, blockDim.x); + exchange(shared_data, lIdx + blockDim.x, lpIdx, keep); } - bool keep = keepSmall(gIdx, pIdx, stage); + else { + // Work on the left site + bool keep = keepSmall(gIdx, pIdx, stage); - // Exchange data on local(shared) copy - threadId_t lpIdx = toLocal(pIdx, blockDim.x); - exchange(shared_data, lIdx, lpIdx, keep); + // Exchange data on local(shared) copy + threadId_t lpIdx = toLocal(pIdx, blockDim.x); + exchange(shared_data, lIdx, lpIdx, keep); + } __syncthreads(); } // Write back to global memory (no sync here, there will be sync from host) - data[gIdx0] = shared_data[lIdx0]; - data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x]; + data[gIdx] = shared_data[lIdx]; + data[gIdx + blockDim.x] = shared_data[lIdx + blockDim.x]; } /*! @@ -459,6 +462,59 @@ __global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages * Here we skip blocks every time (one for SizeToThreadsRatio = 2) * And we cache the neighbor block address indexes in local (shared) memory */ + threadId_t gIdx = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x; + threadId_t lIdx = toLocal(gIdx, blockDim.x); + + if (gIdx + blockDim.x >= n) // Boundary check + return; + + // Fetch to local memory the entire effective block size (2 positions for each thread) + shared_data[lIdx] = data[gIdx]; + shared_data[lIdx + blockDim.x] = data[gIdx + blockDim.x]; + __syncthreads(); + + for (size_t stage = 1; (stage <= stages) && (stage <= maxStages); ++stage) { + for (size_t step = stage; step > 0; ) { + --step; + + // Find partner and keep-small configuration based on the global data positions + threadId_t pIdx = partner(gIdx, step); + if (gIdx > pIdx) { + // Work on the right site + bool keep = keepSmall(gIdx + blockDim.x, pIdx + blockDim.x, stage); + + // Exchange data on local(shared) copy + threadId_t lpIdx = toLocal(pIdx + blockDim.x, blockDim.x); + exchange(shared_data, lIdx + blockDim.x, lpIdx, keep); + } + else { + // Work on the left site + bool keep = keepSmall(gIdx, pIdx, stage); + + // Exchange data on local(shared) copy + threadId_t lpIdx = toLocal(pIdx, blockDim.x); + exchange(shared_data, lIdx, lpIdx, keep); + } + __syncthreads(); + } + } + + // Write back to global memory (no sync here, there will be sync from host) + data[gIdx] = shared_data[lIdx]; + data[gIdx + blockDim.x] = shared_data[lIdx + blockDim.x]; + +#if 0 + /* + * Idea: + * - Keep a register copy of data[gIdx0], and data[gIdx0 + blockDim.x] + * - Instead of exchange in shared_data, read in register the partner and exchange there. + * - Write back only if there was an exchange + * + * ^^ + * Unfortunately this breaks sequential consistency and register values (lValve) does not match with share_data + * or even lValueR0 and lValueL0. Maybe there is something to do with register spilling (lValue keeps spill + * on local mem). + */ threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x; threadId_t lIdx0 = toLocal(gIdx0, blockDim.x); @@ -466,29 +522,40 @@ __global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages return; // Fetch to local memory the entire effective block size (2 positions for each thread) - shared_data[lIdx0] = data[gIdx0]; - shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x]; + // also keep thread's init values (L and R) on register locations + ValueT lValueL0 = data[gIdx0]; + ValueT lValueR0 = data[gIdx0 + blockDim.x]; + shared_data[lIdx0] = lValueL0; + shared_data[lIdx0 + blockDim.x] = lValueR0; __syncthreads(); + for (size_t stage = 1; (stage <= stages) && (stage <= maxStages); ++stage) { for (size_t step = stage; step > 0; ) { --step; - // Init thread global and local indices + // Init thread global, local indices and active local register value threadId_t gIdx = gIdx0; threadId_t lIdx = lIdx0; + ValueT lValue = lValueL0; // "Me" on the left side of effective block + // Find partner and keep-small configuration based on the global data positions threadId_t pIdx = partner(gIdx, step); if (gIdx > pIdx) { // Shift inside effective block - gIdx += blockDim.x; // global + gIdx += blockDim.x; // global pIdx += blockDim.x; - lIdx += blockDim.x; // local + lIdx += blockDim.x; // local + lValue = lValueR0; // The other me (the right side) } bool keep = keepSmall(gIdx, pIdx, stage); // Exchange data on local(shared) copy threadId_t lpIdx = toLocal(pIdx, blockDim.x); - exchange(shared_data, lIdx, lpIdx, keep); + ValueT pValue = shared_data[lpIdx]; + if (exchangeVals(&lValue, &pValue, keep)) { + shared_data[lIdx] = lValue; + shared_data[lpIdx] = pValue; + } __syncthreads(); } } @@ -496,8 +563,10 @@ __global__ void prephase(ValueT* data, size_t n, size_t stages, size_t maxStages // Write back to global memory (no sync here, there will be sync from host) data[gIdx0] = shared_data[lIdx0]; data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x]; +#endif } + /*! * A CUDA version of the Bitonic sort algorithm. * diff --git a/homework_3/src/config.h b/homework_3/src/config.h index c2e9e0b..9d227a7 100644 --- a/homework_3/src/config.h +++ b/homework_3/src/config.h @@ -16,8 +16,10 @@ /* * Versioning: * - RC1: First version to test on HPC + * - RC2: A pre-phase added for v1 and v2 + * - RC3: */ -static constexpr char version[] = "0.1"; +static constexpr char version[] = "0.2"; /* * Defines for different version of the exercise