AUTH's THMMY "Parallel and distributed systems" course assignments.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

2050 lines
228 KiB

  1. ==PROF== Connected to process 100431 (/home/hoo2/Work/AUTH/PDS/homework_3/out/v2/bitonicCUDA)
  2. ==PROF== Profiling "prephase" - 1: 0%....50%....100% - 6 passes
  3. ==PROF== Profiling "interBlockStep" - 2: 0%....50%....100% - 6 passes
  4. ==PROF== Profiling "inBlockStep" - 3: 0%....50%....100% - 6 passes
  5. ==PROF== Profiling "interBlockStep" - 4: 0%....50%....100% - 6 passes
  6. ==PROF== Profiling "interBlockStep" - 5: 0%....50%....100% - 6 passes
  7. ==PROF== Profiling "inBlockStep" - 6: 0%....50%....100% - 6 passes
  8. ==PROF== Profiling "interBlockStep" - 7: 0%....50%....100% - 6 passes
  9. ==PROF== Profiling "interBlockStep" - 8: 0%....50%....100% - 6 passes
  10. ==PROF== Profiling "interBlockStep" - 9: 0%....50%....100% - 6 passes
  11. ==PROF== Profiling "inBlockStep" - 10: 0%....50%....100% - 6 passes
  12. ==PROF== Profiling "interBlockStep" - 11: 0%....50%....100% - 6 passes
  13. ==PROF== Profiling "interBlockStep" - 12: 0%....50%....100% - 6 passes
  14. ==PROF== Profiling "interBlockStep" - 13: 0%....50%....100% - 6 passes
  15. ==PROF== Profiling "interBlockStep" - 14: 0%....50%....100% - 6 passes
  16. ==PROF== Profiling "inBlockStep" - 15: 0%....50%....100% - 6 passes
  17. ==PROF== Profiling "interBlockStep" - 16: 0%....50%....100% - 6 passes
  18. ==PROF== Profiling "interBlockStep" - 17: 0%....50%....100% - 6 passes
  19. ==PROF== Profiling "interBlockStep" - 18: 0%....50%....100% - 6 passes
  20. ==PROF== Profiling "interBlockStep" - 19: 0%....50%....100% - 6 passes
  21. ==PROF== Profiling "interBlockStep" - 20: 0%....50%....100% - 6 passes
  22. ==PROF== Profiling "inBlockStep" - 21: 0%....50%....100% - 6 passes
  23. ==PROF== Profiling "interBlockStep" - 22: 0%....50%....100% - 6 passes
  24. ==PROF== Profiling "interBlockStep" - 23: 0%....50%....100% - 6 passes
  25. ==PROF== Profiling "interBlockStep" - 24: 0%....50%....100% - 6 passes
  26. ==PROF== Profiling "interBlockStep" - 25: 0%....50%....100% - 6 passes
  27. ==PROF== Profiling "interBlockStep" - 26: 0%....50%....100% - 6 passes
  28. ==PROF== Profiling "interBlockStep" - 27: 0%....50%....100% - 6 passes
  29. ==PROF== Profiling "inBlockStep" - 28: 0%....50%....100% - 6 passes
  30. ==PROF== Profiling "interBlockStep" - 29: 0%....50%....100% - 6 passes
  31. ==PROF== Profiling "interBlockStep" - 30: 0%....50%....100% - 6 passes
  32. ==PROF== Profiling "interBlockStep" - 31: 0%....50%....100% - 6 passes
  33. ==PROF== Profiling "interBlockStep" - 32: 0%....50%....100% - 6 passes
  34. ==PROF== Profiling "interBlockStep" - 33: 0%....50%....100% - 6 passes
  35. ==PROF== Profiling "interBlockStep" - 34: 0%....50%....100% - 6 passes
  36. ==PROF== Profiling "interBlockStep" - 35: 0%....50%....100% - 6 passes
  37. ==PROF== Profiling "inBlockStep" - 36: 0%....50%....100% - 6 passes
  38. ==PROF== Profiling "interBlockStep" - 37: 0%....50%....100% - 6 passes
  39. ==PROF== Profiling "interBlockStep" - 38: 0%....50%....100% - 6 passes
  40. ==PROF== Profiling "interBlockStep" - 39: 0%....50%....100% - 6 passes
  41. ==PROF== Profiling "interBlockStep" - 40: 0%....50%....100% - 6 passes
  42. ==PROF== Profiling "interBlockStep" - 41: 0%....50%....100% - 6 passes
  43. ==PROF== Profiling "interBlockStep" - 42: 0%....50%....100% - 6 passes
  44. ==PROF== Profiling "interBlockStep" - 43: 0%....50%....100% - 6 passes
  45. ==PROF== Profiling "interBlockStep" - 44: 0%....50%....100% - 6 passes
  46. ==PROF== Profiling "inBlockStep" - 45: 0%....50%....100% - 6 passes
  47. ==PROF== Profiling "interBlockStep" - 46: 0%....50%....100% - 6 passes
  48. ==PROF== Profiling "interBlockStep" - 47: 0%....50%....100% - 6 passes
  49. ==PROF== Profiling "interBlockStep" - 48: 0%....50%....100% - 6 passes
  50. ==PROF== Profiling "interBlockStep" - 49: 0%....50%....100% - 6 passes
  51. ==PROF== Profiling "interBlockStep" - 50: 0%....50%....100% - 6 passes
  52. ==PROF== Profiling "interBlockStep" - 51: 0%....50%....100% - 6 passes
  53. ==PROF== Profiling "interBlockStep" - 52: 0%....50%....100% - 6 passes
  54. ==PROF== Profiling "interBlockStep" - 53: 0%....50%....100% - 6 passes
  55. ==PROF== Profiling "interBlockStep" - 54: 0%....50%....100% - 6 passes
  56. ==PROF== Profiling "inBlockStep" - 55: 0%....50%....100% - 6 passes
  57. ==PROF== Profiling "interBlockStep" - 56: 0%....50%....100% - 6 passes
  58. ==PROF== Profiling "interBlockStep" - 57: 0%....50%....100% - 6 passes
  59. ==PROF== Profiling "interBlockStep" - 58: 0%....50%....100% - 6 passes
  60. ==PROF== Profiling "interBlockStep" - 59: 0%....50%....100% - 6 passes
  61. ==PROF== Profiling "interBlockStep" - 60: 0%....50%....100% - 6 passes
  62. ==PROF== Profiling "interBlockStep" - 61: 0%....50%....100% - 6 passes
  63. ==PROF== Profiling "interBlockStep" - 62: 0%....50%....100% - 6 passes
  64. ==PROF== Profiling "interBlockStep" - 63: 0%....50%....100% - 6 passes
  65. ==PROF== Profiling "interBlockStep" - 64: 0%....50%....100% - 6 passes
  66. ==PROF== Profiling "interBlockStep" - 65: 0%....50%....100% - 6 passes
  67. ==PROF== Profiling "inBlockStep" - 66: 0%....50%....100% - 6 passes
  68. ==PROF== Disconnected from process 100431
  69. [100431] bitonicCUDA@127.0.0.1
  70. void prephase<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:33, Context 1, Stream 7
  71. Section: Command line profiler metrics
  72. ---------------------------------------------------------------------- --------------- ------------------------------
  73. gpu__time_duration.sum msecond 2.56
  74. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  75. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  76. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  77. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  78. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  79. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  80. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 237,568
  81. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 237,568
  82. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 237,568
  83. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 3,801,088
  84. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 209,070.94
  85. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 209,334
  86. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 208,875
  87. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 3,345,135
  88. smsp__average_warp_latency_issue_stalled_barrier.pct % 1,692,604.61
  89. smsp__average_warp_latency_issue_stalled_barrier.ratio 16,926.05
  90. smsp__inst_executed.avg inst 1,953,951.83
  91. smsp__inst_executed.max inst 1,954,175
  92. smsp__inst_executed.min inst 1,953,723
  93. smsp__inst_executed.sum inst 125,052,917
  94. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 15.35
  95. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.15
  96. smsp__cycles_active.avg cycle 3,559,774.03
  97. smsp__cycles_active.sum cycle 227,825,538
  98. ---------------------------------------------------------------------- --------------- ------------------------------
  99. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:33, Context 1, Stream 7
  100. Section: Command line profiler metrics
  101. ---------------------------------------------------------------------- --------------- ------------------------------
  102. gpu__time_duration.sum usecond 59.90
  103. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  104. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  105. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  106. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  107. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21
  108. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  109. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  110. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  111. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  112. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  113. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  114. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  115. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  116. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  117. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  118. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  119. smsp__inst_executed.avg inst 12,309.03
  120. smsp__inst_executed.max inst 12,569
  121. smsp__inst_executed.min inst 11,654
  122. smsp__inst_executed.sum inst 787,778
  123. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  124. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  125. smsp__cycles_active.avg cycle 73,062.50
  126. smsp__cycles_active.sum cycle 4,676,000
  127. ---------------------------------------------------------------------- --------------- ------------------------------
  128. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:33, Context 1, Stream 7
  129. Section: Command line profiler metrics
  130. ---------------------------------------------------------------------- --------------- ------------------------------
  131. gpu__time_duration.sum usecond 435.49
  132. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  133. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  134. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  135. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  136. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  137. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  138. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  139. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  140. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  141. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  142. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 31,913.38
  143. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 32,394
  144. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 31,370
  145. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 510,614
  146. smsp__average_warp_latency_issue_stalled_barrier.pct % 353,960.85
  147. smsp__average_warp_latency_issue_stalled_barrier.ratio 3,539.61
  148. smsp__inst_executed.avg inst 313,131.16
  149. smsp__inst_executed.max inst 313,277
  150. smsp__inst_executed.min inst 312,868
  151. smsp__inst_executed.sum inst 20,040,394
  152. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 19.30
  153. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.19
  154. smsp__cycles_active.avg cycle 598,137.94
  155. smsp__cycles_active.sum cycle 38,280,828
  156. ---------------------------------------------------------------------- --------------- ------------------------------
  157. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:33, Context 1, Stream 7
  158. Section: Command line profiler metrics
  159. ---------------------------------------------------------------------- --------------- ------------------------------
  160. gpu__time_duration.sum usecond 58.24
  161. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  162. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  163. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  164. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  165. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35
  166. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95
  167. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  168. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  169. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  170. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  171. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  172. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  173. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  174. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  175. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  176. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  177. smsp__inst_executed.avg inst 12,298.58
  178. smsp__inst_executed.max inst 12,573
  179. smsp__inst_executed.min inst 12,056
  180. smsp__inst_executed.sum inst 787,109
  181. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  182. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  183. smsp__cycles_active.avg cycle 70,585.64
  184. smsp__cycles_active.sum cycle 4,517,481
  185. ---------------------------------------------------------------------- --------------- ------------------------------
  186. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:33, Context 1, Stream 7
  187. Section: Command line profiler metrics
  188. ---------------------------------------------------------------------- --------------- ------------------------------
  189. gpu__time_duration.sum usecond 59.94
  190. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  191. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  192. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  193. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  194. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.23
  195. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  196. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  197. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  198. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  199. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  200. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  201. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  202. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  203. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  204. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  205. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  206. smsp__inst_executed.avg inst 12,308.78
  207. smsp__inst_executed.max inst 12,547
  208. smsp__inst_executed.min inst 12,011
  209. smsp__inst_executed.sum inst 787,762
  210. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  211. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  212. smsp__cycles_active.avg cycle 72,957.47
  213. smsp__cycles_active.sum cycle 4,669,278
  214. ---------------------------------------------------------------------- --------------- ------------------------------
  215. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:33, Context 1, Stream 7
  216. Section: Command line profiler metrics
  217. ---------------------------------------------------------------------- --------------- ------------------------------
  218. gpu__time_duration.sum usecond 434.34
  219. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  220. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  221. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  222. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  223. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  224. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  225. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  226. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 32,768
  227. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,768
  228. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  229. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 31,894.31
  230. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 32,021
  231. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 31,779
  232. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 510,309
  233. smsp__average_warp_latency_issue_stalled_barrier.pct % 351,861.41
  234. smsp__average_warp_latency_issue_stalled_barrier.ratio 3,518.61
  235. smsp__inst_executed.avg inst 313,124.58
  236. smsp__inst_executed.max inst 313,358
  237. smsp__inst_executed.min inst 312,849
  238. smsp__inst_executed.sum inst 20,039,973
  239. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 19.19
  240. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.19
  241. smsp__cycles_active.avg cycle 597,902.02
  242. smsp__cycles_active.sum cycle 38,265,729
  243. ---------------------------------------------------------------------- --------------- ------------------------------
  244. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:33, Context 1, Stream 7
  245. Section: Command line profiler metrics
  246. ---------------------------------------------------------------------- --------------- ------------------------------
  247. gpu__time_duration.sum usecond 58.30
  248. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  249. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  250. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  251. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  252. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.42
  253. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  254. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  255. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  256. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  257. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  258. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  259. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  260. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  261. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  262. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  263. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  264. smsp__inst_executed.avg inst 12,293.72
  265. smsp__inst_executed.max inst 12,585
  266. smsp__inst_executed.min inst 11,832
  267. smsp__inst_executed.sum inst 786,798
  268. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  269. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  270. smsp__cycles_active.avg cycle 71,044.09
  271. smsp__cycles_active.sum cycle 4,546,822
  272. ---------------------------------------------------------------------- --------------- ------------------------------
  273. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:33, Context 1, Stream 7
  274. Section: Command line profiler metrics
  275. ---------------------------------------------------------------------- --------------- ------------------------------
  276. gpu__time_duration.sum usecond 58.11
  277. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  278. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  279. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  280. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  281. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35
  282. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95
  283. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  284. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  285. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  286. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  287. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  288. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  289. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  290. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  291. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  292. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  293. smsp__inst_executed.avg inst 12,297.98
  294. smsp__inst_executed.max inst 12,581
  295. smsp__inst_executed.min inst 11,984
  296. smsp__inst_executed.sum inst 787,071
  297. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  298. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  299. smsp__cycles_active.avg cycle 70,504.80
  300. smsp__cycles_active.sum cycle 4,512,307
  301. ---------------------------------------------------------------------- --------------- ------------------------------
  302. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7
  303. Section: Command line profiler metrics
  304. ---------------------------------------------------------------------- --------------- ------------------------------
  305. gpu__time_duration.sum usecond 59.74
  306. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  307. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  308. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  309. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  310. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21
  311. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  312. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  313. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  314. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  315. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  316. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  317. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  318. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  319. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  320. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  321. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  322. smsp__inst_executed.avg inst 12,309.89
  323. smsp__inst_executed.max inst 12,884
  324. smsp__inst_executed.min inst 11,903
  325. smsp__inst_executed.sum inst 787,833
  326. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  327. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  328. smsp__cycles_active.avg cycle 73,146.27
  329. smsp__cycles_active.sum cycle 4,681,361
  330. ---------------------------------------------------------------------- --------------- ------------------------------
  331. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7
  332. Section: Command line profiler metrics
  333. ---------------------------------------------------------------------- --------------- ------------------------------
  334. gpu__time_duration.sum usecond 434.91
  335. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  336. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  337. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  338. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  339. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  340. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  341. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  342. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  343. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  344. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  345. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 31,897.94
  346. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 32,370
  347. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 31,387
  348. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 510,367
  349. smsp__average_warp_latency_issue_stalled_barrier.pct % 355,060.44
  350. smsp__average_warp_latency_issue_stalled_barrier.ratio 3,550.60
  351. smsp__inst_executed.avg inst 313,126.61
  352. smsp__inst_executed.max inst 313,456
  353. smsp__inst_executed.min inst 312,810
  354. smsp__inst_executed.sum inst 20,040,103
  355. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 19.38
  356. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.19
  357. smsp__cycles_active.avg cycle 597,783.91
  358. smsp__cycles_active.sum cycle 38,258,170
  359. ---------------------------------------------------------------------- --------------- ------------------------------
  360. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7
  361. Section: Command line profiler metrics
  362. ---------------------------------------------------------------------- --------------- ------------------------------
  363. gpu__time_duration.sum usecond 58.30
  364. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  365. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  366. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  367. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  368. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46
  369. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  370. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  371. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  372. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  373. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  374. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  375. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  376. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  377. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  378. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  379. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  380. smsp__inst_executed.avg inst 12,290.58
  381. smsp__inst_executed.max inst 12,562
  382. smsp__inst_executed.min inst 11,884
  383. smsp__inst_executed.sum inst 786,597
  384. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  385. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  386. smsp__cycles_active.avg cycle 70,651.47
  387. smsp__cycles_active.sum cycle 4,521,694
  388. ---------------------------------------------------------------------- --------------- ------------------------------
  389. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7
  390. Section: Command line profiler metrics
  391. ---------------------------------------------------------------------- --------------- ------------------------------
  392. gpu__time_duration.sum usecond 58.34
  393. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  394. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  395. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  396. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  397. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.42
  398. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  399. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  400. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  401. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  402. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  403. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  404. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  405. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  406. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  407. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  408. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  409. smsp__inst_executed.avg inst 12,293.06
  410. smsp__inst_executed.max inst 12,557
  411. smsp__inst_executed.min inst 11,928
  412. smsp__inst_executed.sum inst 786,756
  413. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  414. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  415. smsp__cycles_active.avg cycle 71,246.70
  416. smsp__cycles_active.sum cycle 4,559,789
  417. ---------------------------------------------------------------------- --------------- ------------------------------
  418. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7
  419. Section: Command line profiler metrics
  420. ---------------------------------------------------------------------- --------------- ------------------------------
  421. gpu__time_duration.sum usecond 58.18
  422. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  423. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  424. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  425. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  426. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35
  427. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95
  428. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  429. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  430. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  431. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  432. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  433. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  434. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  435. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  436. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  437. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  438. smsp__inst_executed.avg inst 12,298.75
  439. smsp__inst_executed.max inst 12,670
  440. smsp__inst_executed.min inst 11,749
  441. smsp__inst_executed.sum inst 787,120
  442. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  443. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  444. smsp__cycles_active.avg cycle 70,653.20
  445. smsp__cycles_active.sum cycle 4,521,805
  446. ---------------------------------------------------------------------- --------------- ------------------------------
  447. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7
  448. Section: Command line profiler metrics
  449. ---------------------------------------------------------------------- --------------- ------------------------------
  450. gpu__time_duration.sum usecond 59.78
  451. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  452. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  453. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  454. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  455. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21
  456. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  457. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  458. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  459. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  460. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  461. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  462. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  463. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  464. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  465. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  466. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  467. smsp__inst_executed.avg inst 12,309.36
  468. smsp__inst_executed.max inst 12,811
  469. smsp__inst_executed.min inst 11,692
  470. smsp__inst_executed.sum inst 787,799
  471. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  472. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  473. smsp__cycles_active.avg cycle 72,714.31
  474. smsp__cycles_active.sum cycle 4,653,716
  475. ---------------------------------------------------------------------- --------------- ------------------------------
  476. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7
  477. Section: Command line profiler metrics
  478. ---------------------------------------------------------------------- --------------- ------------------------------
  479. gpu__time_duration.sum usecond 434.11
  480. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  481. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  482. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  483. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  484. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  485. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  486. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  487. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 32,768
  488. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,768
  489. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  490. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 31,905.69
  491. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 31,983
  492. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 31,807
  493. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 510,491
  494. smsp__average_warp_latency_issue_stalled_barrier.pct % 354,022.84
  495. smsp__average_warp_latency_issue_stalled_barrier.ratio 3,540.23
  496. smsp__inst_executed.avg inst 313,135.20
  497. smsp__inst_executed.max inst 318,231
  498. smsp__inst_executed.min inst 308,095
  499. smsp__inst_executed.sum inst 20,040,653
  500. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 19.31
  501. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.19
  502. smsp__cycles_active.avg cycle 597,940.59
  503. smsp__cycles_active.sum cycle 38,268,198
  504. ---------------------------------------------------------------------- --------------- ------------------------------
  505. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7
  506. Section: Command line profiler metrics
  507. ---------------------------------------------------------------------- --------------- ------------------------------
  508. gpu__time_duration.sum usecond 57.92
  509. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  510. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  511. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  512. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  513. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48
  514. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  515. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  516. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  517. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  518. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  519. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  520. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  521. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  522. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  523. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  524. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  525. smsp__inst_executed.avg inst 12,289.25
  526. smsp__inst_executed.max inst 12,640
  527. smsp__inst_executed.min inst 11,916
  528. smsp__inst_executed.sum inst 786,512
  529. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  530. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  531. smsp__cycles_active.avg cycle 70,480.17
  532. smsp__cycles_active.sum cycle 4,510,731
  533. ---------------------------------------------------------------------- --------------- ------------------------------
  534. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7
  535. Section: Command line profiler metrics
  536. ---------------------------------------------------------------------- --------------- ------------------------------
  537. gpu__time_duration.sum usecond 58.43
  538. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  539. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  540. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  541. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  542. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46
  543. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  544. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  545. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  546. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  547. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  548. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  549. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  550. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  551. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  552. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  553. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  554. smsp__inst_executed.avg inst 12,290.56
  555. smsp__inst_executed.max inst 12,505
  556. smsp__inst_executed.min inst 12,076
  557. smsp__inst_executed.sum inst 786,596
  558. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  559. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  560. smsp__cycles_active.avg cycle 70,671.28
  561. smsp__cycles_active.sum cycle 4,522,962
  562. ---------------------------------------------------------------------- --------------- ------------------------------
  563. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7
  564. Section: Command line profiler metrics
  565. ---------------------------------------------------------------------- --------------- ------------------------------
  566. gpu__time_duration.sum usecond 58.59
  567. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  568. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  569. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  570. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  571. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43
  572. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  573. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  574. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  575. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  576. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  577. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  578. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  579. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  580. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  581. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  582. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  583. smsp__inst_executed.avg inst 12,292.92
  584. smsp__inst_executed.max inst 12,757
  585. smsp__inst_executed.min inst 11,856
  586. smsp__inst_executed.sum inst 786,747
  587. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  588. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  589. smsp__cycles_active.avg cycle 71,114.81
  590. smsp__cycles_active.sum cycle 4,551,348
  591. ---------------------------------------------------------------------- --------------- ------------------------------
  592. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:34, Context 1, Stream 7
  593. Section: Command line profiler metrics
  594. ---------------------------------------------------------------------- --------------- ------------------------------
  595. gpu__time_duration.sum usecond 57.98
  596. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  597. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  598. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  599. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  600. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35
  601. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95
  602. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  603. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  604. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  605. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  606. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  607. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  608. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  609. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  610. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  611. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  612. smsp__inst_executed.avg inst 12,298.41
  613. smsp__inst_executed.max inst 12,687
  614. smsp__inst_executed.min inst 11,920
  615. smsp__inst_executed.sum inst 787,098
  616. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  617. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  618. smsp__cycles_active.avg cycle 70,416.27
  619. smsp__cycles_active.sum cycle 4,506,641
  620. ---------------------------------------------------------------------- --------------- ------------------------------
  621. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:35, Context 1, Stream 7
  622. Section: Command line profiler metrics
  623. ---------------------------------------------------------------------- --------------- ------------------------------
  624. gpu__time_duration.sum usecond 59.68
  625. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  626. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  627. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  628. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  629. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22
  630. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  631. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  632. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  633. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  634. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  635. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  636. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  637. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  638. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  639. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  640. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  641. smsp__inst_executed.avg inst 12,308.94
  642. smsp__inst_executed.max inst 12,697
  643. smsp__inst_executed.min inst 11,640
  644. smsp__inst_executed.sum inst 787,772
  645. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  646. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  647. smsp__cycles_active.avg cycle 73,201.34
  648. smsp__cycles_active.sum cycle 4,684,886
  649. ---------------------------------------------------------------------- --------------- ------------------------------
  650. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:35, Context 1, Stream 7
  651. Section: Command line profiler metrics
  652. ---------------------------------------------------------------------- --------------- ------------------------------
  653. gpu__time_duration.sum usecond 433.86
  654. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  655. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  656. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  657. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  658. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  659. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  660. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  661. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 32,768
  662. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,768
  663. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  664. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 31,913.81
  665. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 31,996
  666. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 31,782
  667. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 510,621
  668. smsp__average_warp_latency_issue_stalled_barrier.pct % 354,697.47
  669. smsp__average_warp_latency_issue_stalled_barrier.ratio 3,546.97
  670. smsp__inst_executed.avg inst 313,117.39
  671. smsp__inst_executed.max inst 318,197
  672. smsp__inst_executed.min inst 308,095
  673. smsp__inst_executed.sum inst 20,039,513
  674. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 19.33
  675. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.19
  676. smsp__cycles_active.avg cycle 598,408.55
  677. smsp__cycles_active.sum cycle 38,298,147
  678. ---------------------------------------------------------------------- --------------- ------------------------------
  679. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:35, Context 1, Stream 7
  680. Section: Command line profiler metrics
  681. ---------------------------------------------------------------------- --------------- ------------------------------
  682. gpu__time_duration.sum usecond 60.03
  683. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  684. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  685. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  686. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  687. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  688. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  689. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  690. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  691. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  692. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  693. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  694. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  695. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  696. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  697. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  698. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  699. smsp__inst_executed.avg inst 12,288.61
  700. smsp__inst_executed.max inst 12,676
  701. smsp__inst_executed.min inst 11,864
  702. smsp__inst_executed.sum inst 786,471
  703. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  704. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  705. smsp__cycles_active.avg cycle 73,508.12
  706. smsp__cycles_active.sum cycle 4,704,520
  707. ---------------------------------------------------------------------- --------------- ------------------------------
  708. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:35, Context 1, Stream 7
  709. Section: Command line profiler metrics
  710. ---------------------------------------------------------------------- --------------- ------------------------------
  711. gpu__time_duration.sum usecond 58.21
  712. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  713. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  714. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  715. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  716. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48
  717. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  718. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  719. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  720. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  721. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  722. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  723. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  724. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  725. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  726. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  727. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  728. smsp__inst_executed.avg inst 12,290.09
  729. smsp__inst_executed.max inst 12,660
  730. smsp__inst_executed.min inst 12,078
  731. smsp__inst_executed.sum inst 786,566
  732. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  733. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  734. smsp__cycles_active.avg cycle 71,077.06
  735. smsp__cycles_active.sum cycle 4,548,932
  736. ---------------------------------------------------------------------- --------------- ------------------------------
  737. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:35, Context 1, Stream 7
  738. Section: Command line profiler metrics
  739. ---------------------------------------------------------------------- --------------- ------------------------------
  740. gpu__time_duration.sum usecond 58.24
  741. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  742. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  743. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  744. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  745. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46
  746. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  747. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  748. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  749. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  750. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  751. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  752. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  753. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  754. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  755. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  756. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  757. smsp__inst_executed.avg inst 12,289.83
  758. smsp__inst_executed.max inst 12,628
  759. smsp__inst_executed.min inst 11,908
  760. smsp__inst_executed.sum inst 786,549
  761. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  762. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  763. smsp__cycles_active.avg cycle 70,525.67
  764. smsp__cycles_active.sum cycle 4,513,643
  765. ---------------------------------------------------------------------- --------------- ------------------------------
  766. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:35, Context 1, Stream 7
  767. Section: Command line profiler metrics
  768. ---------------------------------------------------------------------- --------------- ------------------------------
  769. gpu__time_duration.sum usecond 58.66
  770. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  771. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  772. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  773. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  774. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.44
  775. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  776. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  777. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  778. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  779. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  780. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  781. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  782. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  783. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  784. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  785. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  786. smsp__inst_executed.avg inst 12,293.34
  787. smsp__inst_executed.max inst 12,521
  788. smsp__inst_executed.min inst 11,630
  789. smsp__inst_executed.sum inst 786,774
  790. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  791. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  792. smsp__cycles_active.avg cycle 70,924.95
  793. smsp__cycles_active.sum cycle 4,539,197
  794. ---------------------------------------------------------------------- --------------- ------------------------------
  795. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:35, Context 1, Stream 7
  796. Section: Command line profiler metrics
  797. ---------------------------------------------------------------------- --------------- ------------------------------
  798. gpu__time_duration.sum usecond 57.92
  799. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  800. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  801. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  802. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  803. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36
  804. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96
  805. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  806. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  807. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  808. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  809. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  810. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  811. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  812. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  813. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  814. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  815. smsp__inst_executed.avg inst 12,297.78
  816. smsp__inst_executed.max inst 12,697
  817. smsp__inst_executed.min inst 12,067
  818. smsp__inst_executed.sum inst 787,058
  819. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  820. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  821. smsp__cycles_active.avg cycle 71,093.91
  822. smsp__cycles_active.sum cycle 4,550,010
  823. ---------------------------------------------------------------------- --------------- ------------------------------
  824. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:35, Context 1, Stream 7
  825. Section: Command line profiler metrics
  826. ---------------------------------------------------------------------- --------------- ------------------------------
  827. gpu__time_duration.sum usecond 59.62
  828. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  829. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  830. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  831. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  832. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21
  833. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  834. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  835. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  836. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  837. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  838. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  839. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  840. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  841. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  842. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  843. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  844. smsp__inst_executed.avg inst 12,308.45
  845. smsp__inst_executed.max inst 12,732
  846. smsp__inst_executed.min inst 11,792
  847. smsp__inst_executed.sum inst 787,741
  848. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  849. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  850. smsp__cycles_active.avg cycle 72,599.72
  851. smsp__cycles_active.sum cycle 4,646,382
  852. ---------------------------------------------------------------------- --------------- ------------------------------
  853. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:35, Context 1, Stream 7
  854. Section: Command line profiler metrics
  855. ---------------------------------------------------------------------- --------------- ------------------------------
  856. gpu__time_duration.sum usecond 434.43
  857. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  858. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  859. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  860. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  861. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  862. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  863. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  864. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  865. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  866. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  867. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 31,869.75
  868. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 32,359
  869. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 31,446
  870. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 509,916
  871. smsp__average_warp_latency_issue_stalled_barrier.pct % 352,782.28
  872. smsp__average_warp_latency_issue_stalled_barrier.ratio 3,527.82
  873. smsp__inst_executed.avg inst 313,121.81
  874. smsp__inst_executed.max inst 313,218
  875. smsp__inst_executed.min inst 312,990
  876. smsp__inst_executed.sum inst 20,039,796
  877. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 19.20
  878. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.19
  879. smsp__cycles_active.avg cycle 599,438.23
  880. smsp__cycles_active.sum cycle 38,364,047
  881. ---------------------------------------------------------------------- --------------- ------------------------------
  882. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:35, Context 1, Stream 7
  883. Section: Command line profiler metrics
  884. ---------------------------------------------------------------------- --------------- ------------------------------
  885. gpu__time_duration.sum usecond 59.07
  886. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  887. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  888. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  889. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  890. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  891. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  892. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  893. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  894. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  895. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  896. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  897. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  898. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  899. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  900. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  901. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  902. smsp__inst_executed.avg inst 12,288.05
  903. smsp__inst_executed.max inst 12,492
  904. smsp__inst_executed.min inst 11,906
  905. smsp__inst_executed.sum inst 786,435
  906. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  907. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  908. smsp__cycles_active.avg cycle 71,595.95
  909. smsp__cycles_active.sum cycle 4,582,141
  910. ---------------------------------------------------------------------- --------------- ------------------------------
  911. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7
  912. Section: Command line profiler metrics
  913. ---------------------------------------------------------------------- --------------- ------------------------------
  914. gpu__time_duration.sum usecond 60.22
  915. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  916. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  917. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  918. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  919. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  920. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  921. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  922. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  923. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  924. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  925. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  926. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  927. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  928. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  929. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  930. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  931. smsp__inst_executed.avg inst 12,288.66
  932. smsp__inst_executed.max inst 12,512
  933. smsp__inst_executed.min inst 11,704
  934. smsp__inst_executed.sum inst 786,474
  935. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  936. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  937. smsp__cycles_active.avg cycle 72,743.53
  938. smsp__cycles_active.sum cycle 4,655,586
  939. ---------------------------------------------------------------------- --------------- ------------------------------
  940. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7
  941. Section: Command line profiler metrics
  942. ---------------------------------------------------------------------- --------------- ------------------------------
  943. gpu__time_duration.sum usecond 58.18
  944. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  945. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  946. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  947. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  948. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48
  949. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  950. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  951. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  952. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  953. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  954. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  955. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  956. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  957. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  958. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  959. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  960. smsp__inst_executed.avg inst 12,289.08
  961. smsp__inst_executed.max inst 12,463
  962. smsp__inst_executed.min inst 11,886
  963. smsp__inst_executed.sum inst 786,501
  964. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  965. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  966. smsp__cycles_active.avg cycle 70,541.78
  967. smsp__cycles_active.sum cycle 4,514,674
  968. ---------------------------------------------------------------------- --------------- ------------------------------
  969. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7
  970. Section: Command line profiler metrics
  971. ---------------------------------------------------------------------- --------------- ------------------------------
  972. gpu__time_duration.sum usecond 58.18
  973. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  974. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  975. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  976. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  977. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.47
  978. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  979. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  980. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  981. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  982. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  983. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  984. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  985. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  986. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  987. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  988. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  989. smsp__inst_executed.avg inst 12,290.53
  990. smsp__inst_executed.max inst 12,514
  991. smsp__inst_executed.min inst 12,088
  992. smsp__inst_executed.sum inst 786,594
  993. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  994. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  995. smsp__cycles_active.avg cycle 70,334.75
  996. smsp__cycles_active.sum cycle 4,501,424
  997. ---------------------------------------------------------------------- --------------- ------------------------------
  998. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7
  999. Section: Command line profiler metrics
  1000. ---------------------------------------------------------------------- --------------- ------------------------------
  1001. gpu__time_duration.sum usecond 58.62
  1002. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1003. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1004. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1005. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1006. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43
  1007. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  1008. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1009. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1010. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1011. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1012. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1013. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1014. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1015. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1016. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1017. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1018. smsp__inst_executed.avg inst 12,293.53
  1019. smsp__inst_executed.max inst 12,555
  1020. smsp__inst_executed.min inst 11,987
  1021. smsp__inst_executed.sum inst 786,786
  1022. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1023. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1024. smsp__cycles_active.avg cycle 70,984.45
  1025. smsp__cycles_active.sum cycle 4,543,005
  1026. ---------------------------------------------------------------------- --------------- ------------------------------
  1027. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7
  1028. Section: Command line profiler metrics
  1029. ---------------------------------------------------------------------- --------------- ------------------------------
  1030. gpu__time_duration.sum usecond 57.82
  1031. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1032. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1033. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1034. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1035. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36
  1036. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95
  1037. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1038. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1039. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1040. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1041. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1042. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1043. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1044. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1045. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1046. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1047. smsp__inst_executed.avg inst 12,298.67
  1048. smsp__inst_executed.max inst 12,569
  1049. smsp__inst_executed.min inst 11,918
  1050. smsp__inst_executed.sum inst 787,115
  1051. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1052. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1053. smsp__cycles_active.avg cycle 70,573.64
  1054. smsp__cycles_active.sum cycle 4,516,713
  1055. ---------------------------------------------------------------------- --------------- ------------------------------
  1056. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7
  1057. Section: Command line profiler metrics
  1058. ---------------------------------------------------------------------- --------------- ------------------------------
  1059. gpu__time_duration.sum usecond 59.30
  1060. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1061. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1062. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1063. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1064. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22
  1065. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  1066. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1067. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1068. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1069. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1070. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1071. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1072. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1073. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1074. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1075. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1076. smsp__inst_executed.avg inst 12,308.23
  1077. smsp__inst_executed.max inst 12,720
  1078. smsp__inst_executed.min inst 11,718
  1079. smsp__inst_executed.sum inst 787,727
  1080. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1081. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1082. smsp__cycles_active.avg cycle 72,732.94
  1083. smsp__cycles_active.sum cycle 4,654,908
  1084. ---------------------------------------------------------------------- --------------- ------------------------------
  1085. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7
  1086. Section: Command line profiler metrics
  1087. ---------------------------------------------------------------------- --------------- ------------------------------
  1088. gpu__time_duration.sum usecond 434.11
  1089. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1090. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1091. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1092. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1093. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1094. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  1095. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  1096. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  1097. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  1098. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  1099. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 31,883.31
  1100. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 32,395
  1101. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 31,412
  1102. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 510,133
  1103. smsp__average_warp_latency_issue_stalled_barrier.pct % 354,700.81
  1104. smsp__average_warp_latency_issue_stalled_barrier.ratio 3,547.01
  1105. smsp__inst_executed.avg inst 313,122.11
  1106. smsp__inst_executed.max inst 318,197
  1107. smsp__inst_executed.min inst 308,040
  1108. smsp__inst_executed.sum inst 20,039,815
  1109. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 19.35
  1110. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.19
  1111. smsp__cycles_active.avg cycle 597,850.22
  1112. smsp__cycles_active.sum cycle 38,262,414
  1113. ---------------------------------------------------------------------- --------------- ------------------------------
  1114. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7
  1115. Section: Command line profiler metrics
  1116. ---------------------------------------------------------------------- --------------- ------------------------------
  1117. gpu__time_duration.sum usecond 59.23
  1118. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1119. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1120. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1121. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1122. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1123. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1124. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1125. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1126. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1127. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1128. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1129. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1130. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1131. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1132. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1133. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1134. smsp__inst_executed.avg inst 12,287.16
  1135. smsp__inst_executed.max inst 12,688
  1136. smsp__inst_executed.min inst 11,888
  1137. smsp__inst_executed.sum inst 786,378
  1138. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1139. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1140. smsp__cycles_active.avg cycle 71,841.34
  1141. smsp__cycles_active.sum cycle 4,597,846
  1142. ---------------------------------------------------------------------- --------------- ------------------------------
  1143. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7
  1144. Section: Command line profiler metrics
  1145. ---------------------------------------------------------------------- --------------- ------------------------------
  1146. gpu__time_duration.sum usecond 58.75
  1147. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1148. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1149. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1150. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1151. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1152. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1153. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1154. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1155. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1156. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1157. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1158. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1159. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1160. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1161. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1162. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1163. smsp__inst_executed.avg inst 12,288.83
  1164. smsp__inst_executed.max inst 12,660
  1165. smsp__inst_executed.min inst 11,928
  1166. smsp__inst_executed.sum inst 786,485
  1167. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1168. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1169. smsp__cycles_active.avg cycle 71,607.83
  1170. smsp__cycles_active.sum cycle 4,582,901
  1171. ---------------------------------------------------------------------- --------------- ------------------------------
  1172. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7
  1173. Section: Command line profiler metrics
  1174. ---------------------------------------------------------------------- --------------- ------------------------------
  1175. gpu__time_duration.sum usecond 59.87
  1176. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1177. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1178. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1179. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1180. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  1181. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1182. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1183. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1184. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1185. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1186. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1187. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1188. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1189. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1190. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1191. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1192. smsp__inst_executed.avg inst 12,288.42
  1193. smsp__inst_executed.max inst 12,700
  1194. smsp__inst_executed.min inst 11,680
  1195. smsp__inst_executed.sum inst 786,459
  1196. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1197. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1198. smsp__cycles_active.avg cycle 73,464.98
  1199. smsp__cycles_active.sum cycle 4,701,759
  1200. ---------------------------------------------------------------------- --------------- ------------------------------
  1201. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:36, Context 1, Stream 7
  1202. Section: Command line profiler metrics
  1203. ---------------------------------------------------------------------- --------------- ------------------------------
  1204. gpu__time_duration.sum usecond 58.21
  1205. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1206. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1207. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1208. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1209. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48
  1210. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  1211. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1212. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1213. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1214. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1215. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1216. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1217. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1218. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1219. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1220. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1221. smsp__inst_executed.avg inst 12,289.50
  1222. smsp__inst_executed.max inst 12,656
  1223. smsp__inst_executed.min inst 11,732
  1224. smsp__inst_executed.sum inst 786,528
  1225. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1226. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1227. smsp__cycles_active.avg cycle 70,751.97
  1228. smsp__cycles_active.sum cycle 4,528,126
  1229. ---------------------------------------------------------------------- --------------- ------------------------------
  1230. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7
  1231. Section: Command line profiler metrics
  1232. ---------------------------------------------------------------------- --------------- ------------------------------
  1233. gpu__time_duration.sum usecond 58.05
  1234. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1235. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1236. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1237. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1238. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46
  1239. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  1240. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1241. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1242. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1243. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1244. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1245. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1246. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1247. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1248. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1249. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1250. smsp__inst_executed.avg inst 12,290.84
  1251. smsp__inst_executed.max inst 12,520
  1252. smsp__inst_executed.min inst 12,034
  1253. smsp__inst_executed.sum inst 786,614
  1254. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1255. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1256. smsp__cycles_active.avg cycle 71,176.25
  1257. smsp__cycles_active.sum cycle 4,555,280
  1258. ---------------------------------------------------------------------- --------------- ------------------------------
  1259. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7
  1260. Section: Command line profiler metrics
  1261. ---------------------------------------------------------------------- --------------- ------------------------------
  1262. gpu__time_duration.sum usecond 58.56
  1263. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1264. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1265. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1266. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1267. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43
  1268. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  1269. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1270. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1271. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1272. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1273. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1274. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1275. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1276. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1277. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1278. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1279. smsp__inst_executed.avg inst 12,292.58
  1280. smsp__inst_executed.max inst 12,547
  1281. smsp__inst_executed.min inst 11,776
  1282. smsp__inst_executed.sum inst 786,725
  1283. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1284. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1285. smsp__cycles_active.avg cycle 70,971.50
  1286. smsp__cycles_active.sum cycle 4,542,176
  1287. ---------------------------------------------------------------------- --------------- ------------------------------
  1288. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7
  1289. Section: Command line profiler metrics
  1290. ---------------------------------------------------------------------- --------------- ------------------------------
  1291. gpu__time_duration.sum usecond 58.14
  1292. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1293. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1294. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1295. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1296. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.37
  1297. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96
  1298. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1299. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1300. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1301. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1302. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1303. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1304. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1305. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1306. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1307. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1308. smsp__inst_executed.avg inst 12,298.45
  1309. smsp__inst_executed.max inst 12,555
  1310. smsp__inst_executed.min inst 12,073
  1311. smsp__inst_executed.sum inst 787,101
  1312. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1313. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1314. smsp__cycles_active.avg cycle 70,221.45
  1315. smsp__cycles_active.sum cycle 4,494,173
  1316. ---------------------------------------------------------------------- --------------- ------------------------------
  1317. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7
  1318. Section: Command line profiler metrics
  1319. ---------------------------------------------------------------------- --------------- ------------------------------
  1320. gpu__time_duration.sum usecond 59.36
  1321. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1322. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1323. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1324. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1325. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21
  1326. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  1327. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1328. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1329. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1330. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1331. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1332. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1333. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1334. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1335. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1336. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1337. smsp__inst_executed.avg inst 12,308.89
  1338. smsp__inst_executed.max inst 12,584
  1339. smsp__inst_executed.min inst 11,684
  1340. smsp__inst_executed.sum inst 787,769
  1341. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1342. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1343. smsp__cycles_active.avg cycle 72,809.75
  1344. smsp__cycles_active.sum cycle 4,659,824
  1345. ---------------------------------------------------------------------- --------------- ------------------------------
  1346. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7
  1347. Section: Command line profiler metrics
  1348. ---------------------------------------------------------------------- --------------- ------------------------------
  1349. gpu__time_duration.sum usecond 435.17
  1350. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1351. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1352. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1353. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1354. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1355. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  1356. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  1357. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  1358. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  1359. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  1360. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 31,881.75
  1361. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 32,405
  1362. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 31,298
  1363. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 510,108
  1364. smsp__average_warp_latency_issue_stalled_barrier.pct % 351,267.26
  1365. smsp__average_warp_latency_issue_stalled_barrier.ratio 3,512.67
  1366. smsp__inst_executed.avg inst 313,146.62
  1367. smsp__inst_executed.max inst 318,289
  1368. smsp__inst_executed.min inst 308,129
  1369. smsp__inst_executed.sum inst 20,041,384
  1370. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 19.17
  1371. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.19
  1372. smsp__cycles_active.avg cycle 597,515.77
  1373. smsp__cycles_active.sum cycle 38,241,009
  1374. ---------------------------------------------------------------------- --------------- ------------------------------
  1375. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7
  1376. Section: Command line profiler metrics
  1377. ---------------------------------------------------------------------- --------------- ------------------------------
  1378. gpu__time_duration.sum usecond 60.70
  1379. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1380. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1381. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1382. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1383. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1384. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1385. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1386. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1387. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1388. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1389. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1390. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1391. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1392. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1393. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1394. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1395. smsp__inst_executed.avg inst 12,288.25
  1396. smsp__inst_executed.max inst 12,664
  1397. smsp__inst_executed.min inst 11,904
  1398. smsp__inst_executed.sum inst 786,448
  1399. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1400. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1401. smsp__cycles_active.avg cycle 74,531.66
  1402. smsp__cycles_active.sum cycle 4,770,026
  1403. ---------------------------------------------------------------------- --------------- ------------------------------
  1404. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7
  1405. Section: Command line profiler metrics
  1406. ---------------------------------------------------------------------- --------------- ------------------------------
  1407. gpu__time_duration.sum usecond 58.82
  1408. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1409. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1410. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1411. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1412. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1413. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1414. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1415. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1416. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1417. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1418. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1419. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1420. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1421. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1422. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1423. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1424. smsp__inst_executed.avg inst 12,288.38
  1425. smsp__inst_executed.max inst 12,490
  1426. smsp__inst_executed.min inst 12,092
  1427. smsp__inst_executed.sum inst 786,456
  1428. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1429. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1430. smsp__cycles_active.avg cycle 72,103.58
  1431. smsp__cycles_active.sum cycle 4,614,629
  1432. ---------------------------------------------------------------------- --------------- ------------------------------
  1433. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7
  1434. Section: Command line profiler metrics
  1435. ---------------------------------------------------------------------- --------------- ------------------------------
  1436. gpu__time_duration.sum usecond 58.69
  1437. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1438. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1439. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1440. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1441. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1442. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1443. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1444. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1445. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1446. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1447. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1448. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1449. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1450. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1451. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1452. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1453. smsp__inst_executed.avg inst 12,288.58
  1454. smsp__inst_executed.max inst 12,486
  1455. smsp__inst_executed.min inst 11,896
  1456. smsp__inst_executed.sum inst 786,469
  1457. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1458. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1459. smsp__cycles_active.avg cycle 71,726.84
  1460. smsp__cycles_active.sum cycle 4,590,518
  1461. ---------------------------------------------------------------------- --------------- ------------------------------
  1462. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7
  1463. Section: Command line profiler metrics
  1464. ---------------------------------------------------------------------- --------------- ------------------------------
  1465. gpu__time_duration.sum usecond 60
  1466. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1467. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1468. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1469. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1470. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  1471. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1472. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1473. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1474. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1475. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1476. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1477. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1478. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1479. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1480. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1481. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1482. smsp__inst_executed.avg inst 12,288.86
  1483. smsp__inst_executed.max inst 12,664
  1484. smsp__inst_executed.min inst 11,716
  1485. smsp__inst_executed.sum inst 786,487
  1486. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1487. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1488. smsp__cycles_active.avg cycle 73,256.52
  1489. smsp__cycles_active.sum cycle 4,688,417
  1490. ---------------------------------------------------------------------- --------------- ------------------------------
  1491. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7
  1492. Section: Command line profiler metrics
  1493. ---------------------------------------------------------------------- --------------- ------------------------------
  1494. gpu__time_duration.sum usecond 58.18
  1495. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1496. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1497. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1498. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1499. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48
  1500. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  1501. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1502. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1503. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1504. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1505. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1506. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1507. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1508. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1509. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1510. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1511. smsp__inst_executed.avg inst 12,289.66
  1512. smsp__inst_executed.max inst 12,852
  1513. smsp__inst_executed.min inst 11,702
  1514. smsp__inst_executed.sum inst 786,538
  1515. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1516. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1517. smsp__cycles_active.avg cycle 70,927.86
  1518. smsp__cycles_active.sum cycle 4,539,383
  1519. ---------------------------------------------------------------------- --------------- ------------------------------
  1520. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:37, Context 1, Stream 7
  1521. Section: Command line profiler metrics
  1522. ---------------------------------------------------------------------- --------------- ------------------------------
  1523. gpu__time_duration.sum usecond 58.37
  1524. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1525. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1526. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1527. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1528. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46
  1529. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  1530. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1531. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1532. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1533. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1534. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1535. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1536. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1537. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1538. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1539. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1540. smsp__inst_executed.avg inst 12,291.56
  1541. smsp__inst_executed.max inst 12,652
  1542. smsp__inst_executed.min inst 11,912
  1543. smsp__inst_executed.sum inst 786,660
  1544. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1545. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1546. smsp__cycles_active.avg cycle 70,620.88
  1547. smsp__cycles_active.sum cycle 4,519,736
  1548. ---------------------------------------------------------------------- --------------- ------------------------------
  1549. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7
  1550. Section: Command line profiler metrics
  1551. ---------------------------------------------------------------------- --------------- ------------------------------
  1552. gpu__time_duration.sum usecond 59.01
  1553. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1554. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1555. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1556. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1557. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.44
  1558. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  1559. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1560. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1561. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1562. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1563. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1564. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1565. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1566. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1567. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1568. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1569. smsp__inst_executed.avg inst 12,294.31
  1570. smsp__inst_executed.max inst 12,766
  1571. smsp__inst_executed.min inst 11,720
  1572. smsp__inst_executed.sum inst 786,836
  1573. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1574. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1575. smsp__cycles_active.avg cycle 71,559.72
  1576. smsp__cycles_active.sum cycle 4,579,822
  1577. ---------------------------------------------------------------------- --------------- ------------------------------
  1578. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7
  1579. Section: Command line profiler metrics
  1580. ---------------------------------------------------------------------- --------------- ------------------------------
  1581. gpu__time_duration.sum usecond 58.18
  1582. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1583. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1584. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1585. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1586. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35
  1587. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95
  1588. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1589. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1590. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1591. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1592. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1593. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1594. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1595. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1596. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1597. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1598. smsp__inst_executed.avg inst 12,301.61
  1599. smsp__inst_executed.max inst 12,512
  1600. smsp__inst_executed.min inst 12,076
  1601. smsp__inst_executed.sum inst 787,303
  1602. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1603. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1604. smsp__cycles_active.avg cycle 70,958.73
  1605. smsp__cycles_active.sum cycle 4,541,359
  1606. ---------------------------------------------------------------------- --------------- ------------------------------
  1607. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7
  1608. Section: Command line profiler metrics
  1609. ---------------------------------------------------------------------- --------------- ------------------------------
  1610. gpu__time_duration.sum usecond 59.42
  1611. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1612. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1613. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1614. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1615. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21
  1616. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  1617. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1618. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1619. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1620. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1621. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1622. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1623. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1624. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1625. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1626. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1627. smsp__inst_executed.avg inst 12,314.73
  1628. smsp__inst_executed.max inst 12,723
  1629. smsp__inst_executed.min inst 11,867
  1630. smsp__inst_executed.sum inst 788,143
  1631. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1632. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1633. smsp__cycles_active.avg cycle 72,355.83
  1634. smsp__cycles_active.sum cycle 4,630,773
  1635. ---------------------------------------------------------------------- --------------- ------------------------------
  1636. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7
  1637. Section: Command line profiler metrics
  1638. ---------------------------------------------------------------------- --------------- ------------------------------
  1639. gpu__time_duration.sum usecond 433.79
  1640. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1641. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1642. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1643. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1644. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1645. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  1646. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  1647. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  1648. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  1649. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  1650. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 31,889.94
  1651. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 32,543
  1652. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 31,317
  1653. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 510,239
  1654. smsp__average_warp_latency_issue_stalled_barrier.pct % 353,670.79
  1655. smsp__average_warp_latency_issue_stalled_barrier.ratio 3,536.71
  1656. smsp__inst_executed.avg inst 313,134.42
  1657. smsp__inst_executed.max inst 318,339
  1658. smsp__inst_executed.min inst 308,068
  1659. smsp__inst_executed.sum inst 20,040,603
  1660. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 19.31
  1661. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.19
  1662. smsp__cycles_active.avg cycle 597,595.56
  1663. smsp__cycles_active.sum cycle 38,246,116
  1664. ---------------------------------------------------------------------- --------------- ------------------------------
  1665. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7
  1666. Section: Command line profiler metrics
  1667. ---------------------------------------------------------------------- --------------- ------------------------------
  1668. gpu__time_duration.sum usecond 56.96
  1669. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1670. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1671. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1672. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1673. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1674. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  1675. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1676. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1677. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1678. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1679. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1680. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1681. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1682. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1683. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1684. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1685. smsp__inst_executed.avg inst 12,287.81
  1686. smsp__inst_executed.max inst 12,680
  1687. smsp__inst_executed.min inst 11,900
  1688. smsp__inst_executed.sum inst 786,420
  1689. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1690. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1691. smsp__cycles_active.avg cycle 68,992.86
  1692. smsp__cycles_active.sum cycle 4,415,543
  1693. ---------------------------------------------------------------------- --------------- ------------------------------
  1694. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7
  1695. Section: Command line profiler metrics
  1696. ---------------------------------------------------------------------- --------------- ------------------------------
  1697. gpu__time_duration.sum usecond 60.80
  1698. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1699. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1700. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1701. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1702. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1703. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1704. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1705. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1706. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1707. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1708. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1709. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1710. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1711. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1712. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1713. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1714. smsp__inst_executed.avg inst 12,288.44
  1715. smsp__inst_executed.max inst 12,672
  1716. smsp__inst_executed.min inst 11,908
  1717. smsp__inst_executed.sum inst 786,460
  1718. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1719. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1720. smsp__cycles_active.avg cycle 74,819.06
  1721. smsp__cycles_active.sum cycle 4,788,420
  1722. ---------------------------------------------------------------------- --------------- ------------------------------
  1723. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7
  1724. Section: Command line profiler metrics
  1725. ---------------------------------------------------------------------- --------------- ------------------------------
  1726. gpu__time_duration.sum usecond 59.14
  1727. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1728. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1729. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1730. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1731. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1732. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1733. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1734. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1735. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1736. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1737. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1738. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1739. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1740. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1741. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1742. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1743. smsp__inst_executed.avg inst 12,287.36
  1744. smsp__inst_executed.max inst 12,484
  1745. smsp__inst_executed.min inst 12,088
  1746. smsp__inst_executed.sum inst 786,391
  1747. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1748. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1749. smsp__cycles_active.avg cycle 71,808.34
  1750. smsp__cycles_active.sum cycle 4,595,734
  1751. ---------------------------------------------------------------------- --------------- ------------------------------
  1752. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7
  1753. Section: Command line profiler metrics
  1754. ---------------------------------------------------------------------- --------------- ------------------------------
  1755. gpu__time_duration.sum usecond 58.94
  1756. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1757. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1758. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1759. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1760. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  1761. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1762. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1763. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1764. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1765. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1766. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1767. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1768. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1769. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1770. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1771. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1772. smsp__inst_executed.avg inst 12,288.64
  1773. smsp__inst_executed.max inst 12,656
  1774. smsp__inst_executed.min inst 12,092
  1775. smsp__inst_executed.sum inst 786,473
  1776. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1777. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1778. smsp__cycles_active.avg cycle 72,451.16
  1779. smsp__cycles_active.sum cycle 4,636,874
  1780. ---------------------------------------------------------------------- --------------- ------------------------------
  1781. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7
  1782. Section: Command line profiler metrics
  1783. ---------------------------------------------------------------------- --------------- ------------------------------
  1784. gpu__time_duration.sum usecond 60.54
  1785. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1786. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1787. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1788. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1789. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  1790. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1791. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1792. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1793. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1794. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1795. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1796. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1797. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1798. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1799. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1800. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1801. smsp__inst_executed.avg inst 12,288.56
  1802. smsp__inst_executed.max inst 12,677
  1803. smsp__inst_executed.min inst 11,720
  1804. smsp__inst_executed.sum inst 786,468
  1805. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1806. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1807. smsp__cycles_active.avg cycle 72,851.78
  1808. smsp__cycles_active.sum cycle 4,662,514
  1809. ---------------------------------------------------------------------- --------------- ------------------------------
  1810. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7
  1811. Section: Command line profiler metrics
  1812. ---------------------------------------------------------------------- --------------- ------------------------------
  1813. gpu__time_duration.sum usecond 57.98
  1814. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1815. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1816. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1817. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1818. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48
  1819. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  1820. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1821. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1822. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1823. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1824. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1825. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1826. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1827. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1828. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1829. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1830. smsp__inst_executed.avg inst 12,288.55
  1831. smsp__inst_executed.max inst 12,492
  1832. smsp__inst_executed.min inst 11,924
  1833. smsp__inst_executed.sum inst 786,467
  1834. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1835. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1836. smsp__cycles_active.avg cycle 70,639.48
  1837. smsp__cycles_active.sum cycle 4,520,927
  1838. ---------------------------------------------------------------------- --------------- ------------------------------
  1839. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:38, Context 1, Stream 7
  1840. Section: Command line profiler metrics
  1841. ---------------------------------------------------------------------- --------------- ------------------------------
  1842. gpu__time_duration.sum usecond 58.11
  1843. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1844. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1845. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1846. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1847. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46
  1848. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  1849. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1850. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1851. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1852. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1853. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1854. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1855. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1856. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1857. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1858. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1859. smsp__inst_executed.avg inst 12,290.67
  1860. smsp__inst_executed.max inst 12,540
  1861. smsp__inst_executed.min inst 12,048
  1862. smsp__inst_executed.sum inst 786,603
  1863. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1864. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1865. smsp__cycles_active.avg cycle 71,134.56
  1866. smsp__cycles_active.sum cycle 4,552,612
  1867. ---------------------------------------------------------------------- --------------- ------------------------------
  1868. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:39, Context 1, Stream 7
  1869. Section: Command line profiler metrics
  1870. ---------------------------------------------------------------------- --------------- ------------------------------
  1871. gpu__time_duration.sum usecond 58.56
  1872. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1873. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1874. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1875. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1876. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43
  1877. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  1878. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1879. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1880. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1881. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1882. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1883. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1884. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1885. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1886. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1887. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1888. smsp__inst_executed.avg inst 12,292.61
  1889. smsp__inst_executed.max inst 12,727
  1890. smsp__inst_executed.min inst 11,881
  1891. smsp__inst_executed.sum inst 786,727
  1892. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1893. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1894. smsp__cycles_active.avg cycle 71,327.62
  1895. smsp__cycles_active.sum cycle 4,564,968
  1896. ---------------------------------------------------------------------- --------------- ------------------------------
  1897. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:39, Context 1, Stream 7
  1898. Section: Command line profiler metrics
  1899. ---------------------------------------------------------------------- --------------- ------------------------------
  1900. gpu__time_duration.sum usecond 58.30
  1901. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1902. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1903. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1904. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1905. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35
  1906. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95
  1907. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1908. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1909. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1910. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1911. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1912. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1913. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1914. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1915. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1916. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1917. smsp__inst_executed.avg inst 12,298.61
  1918. smsp__inst_executed.max inst 12,810
  1919. smsp__inst_executed.min inst 11,926
  1920. smsp__inst_executed.sum inst 787,111
  1921. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1922. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1923. smsp__cycles_active.avg cycle 70,631.98
  1924. smsp__cycles_active.sum cycle 4,520,447
  1925. ---------------------------------------------------------------------- --------------- ------------------------------
  1926. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:39, Context 1, Stream 7
  1927. Section: Command line profiler metrics
  1928. ---------------------------------------------------------------------- --------------- ------------------------------
  1929. gpu__time_duration.sum usecond 59.52
  1930. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1931. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1932. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1933. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1934. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22
  1935. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  1936. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1937. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1938. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1939. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1940. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1941. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1942. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1943. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1944. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1945. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1946. smsp__inst_executed.avg inst 12,309.06
  1947. smsp__inst_executed.max inst 12,561
  1948. smsp__inst_executed.min inst 12,043
  1949. smsp__inst_executed.sum inst 787,780
  1950. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1951. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1952. smsp__cycles_active.avg cycle 72,662.08
  1953. smsp__cycles_active.sum cycle 4,650,373
  1954. ---------------------------------------------------------------------- --------------- ------------------------------
  1955. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 21:59:39, Context 1, Stream 7
  1956. Section: Command line profiler metrics
  1957. ---------------------------------------------------------------------- --------------- ------------------------------
  1958. gpu__time_duration.sum usecond 436.35
  1959. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1960. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1961. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1962. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1963. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1964. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  1965. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  1966. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  1967. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  1968. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  1969. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 31,827.38
  1970. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 32,385
  1971. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 31,390
  1972. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 509,238
  1973. smsp__average_warp_latency_issue_stalled_barrier.pct % 358,185.65
  1974. smsp__average_warp_latency_issue_stalled_barrier.ratio 3,581.86
  1975. smsp__inst_executed.avg inst 312,880.55
  1976. smsp__inst_executed.max inst 317,962
  1977. smsp__inst_executed.min inst 307,889
  1978. smsp__inst_executed.sum inst 20,024,355
  1979. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 19.51
  1980. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.20
  1981. smsp__cycles_active.avg cycle 598,707.28
  1982. smsp__cycles_active.sum cycle 38,317,266
  1983. ---------------------------------------------------------------------- --------------- ------------------------------