AUTH's THMMY "Parallel and distributed systems" course assignments.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

2050 lines
228 KiB

  1. ==PROF== Connected to process 38875 (/home/hoo2/Work/AUTH/PDS/homework_3/out/v2/bitonicCUDA)
  2. ==PROF== Profiling "prephase" - 1: 0%....50%....100% - 6 passes
  3. ==PROF== Profiling "interBlockStep" - 2: 0%....50%....100% - 6 passes
  4. ==PROF== Profiling "inBlockStep" - 3: 0%....50%....100% - 6 passes
  5. ==PROF== Profiling "interBlockStep" - 4: 0%....50%....100% - 6 passes
  6. ==PROF== Profiling "interBlockStep" - 5: 0%....50%....100% - 6 passes
  7. ==PROF== Profiling "inBlockStep" - 6: 0%....50%....100% - 6 passes
  8. ==PROF== Profiling "interBlockStep" - 7: 0%....50%....100% - 6 passes
  9. ==PROF== Profiling "interBlockStep" - 8: 0%....50%....100% - 6 passes
  10. ==PROF== Profiling "interBlockStep" - 9: 0%....50%....100% - 6 passes
  11. ==PROF== Profiling "inBlockStep" - 10: 0%....50%....100% - 6 passes
  12. ==PROF== Profiling "interBlockStep" - 11: 0%....50%....100% - 6 passes
  13. ==PROF== Profiling "interBlockStep" - 12: 0%....50%....100% - 6 passes
  14. ==PROF== Profiling "interBlockStep" - 13: 0%....50%....100% - 6 passes
  15. ==PROF== Profiling "interBlockStep" - 14: 0%....50%....100% - 6 passes
  16. ==PROF== Profiling "inBlockStep" - 15: 0%....50%....100% - 6 passes
  17. ==PROF== Profiling "interBlockStep" - 16: 0%....50%....100% - 6 passes
  18. ==PROF== Profiling "interBlockStep" - 17: 0%....50%....100% - 6 passes
  19. ==PROF== Profiling "interBlockStep" - 18: 0%....50%....100% - 6 passes
  20. ==PROF== Profiling "interBlockStep" - 19: 0%....50%....100% - 6 passes
  21. ==PROF== Profiling "interBlockStep" - 20: 0%....50%....100% - 6 passes
  22. ==PROF== Profiling "inBlockStep" - 21: 0%....50%....100% - 6 passes
  23. ==PROF== Profiling "interBlockStep" - 22: 0%....50%....100% - 6 passes
  24. ==PROF== Profiling "interBlockStep" - 23: 0%....50%....100% - 6 passes
  25. ==PROF== Profiling "interBlockStep" - 24: 0%....50%....100% - 6 passes
  26. ==PROF== Profiling "interBlockStep" - 25: 0%....50%....100% - 6 passes
  27. ==PROF== Profiling "interBlockStep" - 26: 0%....50%....100% - 6 passes
  28. ==PROF== Profiling "interBlockStep" - 27: 0%....50%....100% - 6 passes
  29. ==PROF== Profiling "inBlockStep" - 28: 0%....50%....100% - 6 passes
  30. ==PROF== Profiling "interBlockStep" - 29: 0%....50%....100% - 6 passes
  31. ==PROF== Profiling "interBlockStep" - 30: 0%....50%....100% - 6 passes
  32. ==PROF== Profiling "interBlockStep" - 31: 0%....50%....100% - 6 passes
  33. ==PROF== Profiling "interBlockStep" - 32: 0%....50%....100% - 6 passes
  34. ==PROF== Profiling "interBlockStep" - 33: 0%....50%....100% - 6 passes
  35. ==PROF== Profiling "interBlockStep" - 34: 0%....50%....100% - 6 passes
  36. ==PROF== Profiling "interBlockStep" - 35: 0%....50%....100% - 6 passes
  37. ==PROF== Profiling "inBlockStep" - 36: 0%....50%....100% - 6 passes
  38. ==PROF== Profiling "interBlockStep" - 37: 0%....50%....100% - 6 passes
  39. ==PROF== Profiling "interBlockStep" - 38: 0%....50%....100% - 6 passes
  40. ==PROF== Profiling "interBlockStep" - 39: 0%....50%....100% - 6 passes
  41. ==PROF== Profiling "interBlockStep" - 40: 0%....50%....100% - 6 passes
  42. ==PROF== Profiling "interBlockStep" - 41: 0%....50%....100% - 6 passes
  43. ==PROF== Profiling "interBlockStep" - 42: 0%....50%....100% - 6 passes
  44. ==PROF== Profiling "interBlockStep" - 43: 0%....50%....100% - 6 passes
  45. ==PROF== Profiling "interBlockStep" - 44: 0%....50%....100% - 6 passes
  46. ==PROF== Profiling "inBlockStep" - 45: 0%....50%....100% - 6 passes
  47. ==PROF== Profiling "interBlockStep" - 46: 0%....50%....100% - 6 passes
  48. ==PROF== Profiling "interBlockStep" - 47: 0%....50%....100% - 6 passes
  49. ==PROF== Profiling "interBlockStep" - 48: 0%....50%....100% - 6 passes
  50. ==PROF== Profiling "interBlockStep" - 49: 0%....50%....100% - 6 passes
  51. ==PROF== Profiling "interBlockStep" - 50: 0%....50%....100% - 6 passes
  52. ==PROF== Profiling "interBlockStep" - 51: 0%....50%....100% - 6 passes
  53. ==PROF== Profiling "interBlockStep" - 52: 0%....50%....100% - 6 passes
  54. ==PROF== Profiling "interBlockStep" - 53: 0%....50%....100% - 6 passes
  55. ==PROF== Profiling "interBlockStep" - 54: 0%....50%....100% - 6 passes
  56. ==PROF== Profiling "inBlockStep" - 55: 0%....50%....100% - 6 passes
  57. ==PROF== Profiling "interBlockStep" - 56: 0%....50%....100% - 6 passes
  58. ==PROF== Profiling "interBlockStep" - 57: 0%....50%....100% - 6 passes
  59. ==PROF== Profiling "interBlockStep" - 58: 0%....50%....100% - 6 passes
  60. ==PROF== Profiling "interBlockStep" - 59: 0%....50%....100% - 6 passes
  61. ==PROF== Profiling "interBlockStep" - 60: 0%....50%....100% - 6 passes
  62. ==PROF== Profiling "interBlockStep" - 61: 0%....50%....100% - 6 passes
  63. ==PROF== Profiling "interBlockStep" - 62: 0%....50%....100% - 6 passes
  64. ==PROF== Profiling "interBlockStep" - 63: 0%....50%....100% - 6 passes
  65. ==PROF== Profiling "interBlockStep" - 64: 0%....50%....100% - 6 passes
  66. ==PROF== Profiling "interBlockStep" - 65: 0%....50%....100% - 6 passes
  67. ==PROF== Profiling "inBlockStep" - 66: 0%....50%....100% - 6 passes
  68. ==PROF== Disconnected from process 38875
  69. [38875] bitonicCUDA@127.0.0.1
  70. void prephase<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:22, Context 1, Stream 7
  71. Section: Command line profiler metrics
  72. ---------------------------------------------------------------------- --------------- ------------------------------
  73. gpu__time_duration.sum msecond 1.21
  74. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  75. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  76. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  77. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  78. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  79. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  80. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 186,368
  81. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 186,368
  82. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 186,368
  83. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 2,981,888
  84. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 111,970.88
  85. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 112,151
  86. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 111,739
  87. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 1,791,534
  88. smsp__average_warp_latency_issue_stalled_barrier.pct % 637,077.10
  89. smsp__average_warp_latency_issue_stalled_barrier.ratio 6,370.77
  90. smsp__inst_executed.avg inst 1,030,627.31
  91. smsp__inst_executed.max inst 1,030,849
  92. smsp__inst_executed.min inst 1,030,423
  93. smsp__inst_executed.sum inst 65,960,148
  94. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.38
  95. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.12
  96. smsp__cycles_active.avg cycle 1,665,720.27
  97. smsp__cycles_active.sum cycle 106,606,097
  98. ---------------------------------------------------------------------- --------------- ------------------------------
  99. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:22, Context 1, Stream 7
  100. Section: Command line profiler metrics
  101. ---------------------------------------------------------------------- --------------- ------------------------------
  102. gpu__time_duration.sum usecond 59.87
  103. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  104. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  105. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  106. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  107. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21
  108. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  109. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  110. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  111. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  112. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  113. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  114. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  115. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  116. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  117. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  118. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  119. smsp__inst_executed.avg inst 12,309.31
  120. smsp__inst_executed.max inst 12,551
  121. smsp__inst_executed.min inst 12,072
  122. smsp__inst_executed.sum inst 787,796
  123. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  124. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  125. smsp__cycles_active.avg cycle 72,981.36
  126. smsp__cycles_active.sum cycle 4,670,807
  127. ---------------------------------------------------------------------- --------------- ------------------------------
  128. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:22, Context 1, Stream 7
  129. Section: Command line profiler metrics
  130. ---------------------------------------------------------------------- --------------- ------------------------------
  131. gpu__time_duration.sum usecond 230.75
  132. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  133. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  134. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  135. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  136. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  137. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  138. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  139. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  140. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  141. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  142. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,677.81
  143. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,960
  144. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,401
  145. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,845
  146. smsp__average_warp_latency_issue_stalled_barrier.pct % 116,386.30
  147. smsp__average_warp_latency_issue_stalled_barrier.ratio 1,163.86
  148. smsp__inst_executed.avg inst 189,038.39
  149. smsp__inst_executed.max inst 192,131
  150. smsp__inst_executed.min inst 185,953
  151. smsp__inst_executed.sum inst 12,098,457
  152. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.15
  153. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.12
  154. smsp__cycles_active.avg cycle 315,800.30
  155. smsp__cycles_active.sum cycle 20,211,219
  156. ---------------------------------------------------------------------- --------------- ------------------------------
  157. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:22, Context 1, Stream 7
  158. Section: Command line profiler metrics
  159. ---------------------------------------------------------------------- --------------- ------------------------------
  160. gpu__time_duration.sum usecond 58.05
  161. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  162. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  163. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  164. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  165. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35
  166. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95
  167. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  168. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  169. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  170. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  171. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  172. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  173. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  174. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  175. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  176. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  177. smsp__inst_executed.avg inst 12,299.28
  178. smsp__inst_executed.max inst 12,618
  179. smsp__inst_executed.min inst 11,935
  180. smsp__inst_executed.sum inst 787,154
  181. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  182. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  183. smsp__cycles_active.avg cycle 70,789.86
  184. smsp__cycles_active.sum cycle 4,530,551
  185. ---------------------------------------------------------------------- --------------- ------------------------------
  186. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:22, Context 1, Stream 7
  187. Section: Command line profiler metrics
  188. ---------------------------------------------------------------------- --------------- ------------------------------
  189. gpu__time_duration.sum usecond 59.81
  190. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  191. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  192. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  193. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  194. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.20
  195. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.90
  196. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  197. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  198. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  199. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  200. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  201. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  202. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  203. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  204. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  205. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  206. smsp__inst_executed.avg inst 12,309.86
  207. smsp__inst_executed.max inst 12,524
  208. smsp__inst_executed.min inst 11,866
  209. smsp__inst_executed.sum inst 787,831
  210. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  211. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  212. smsp__cycles_active.avg cycle 74,000.58
  213. smsp__cycles_active.sum cycle 4,736,037
  214. ---------------------------------------------------------------------- --------------- ------------------------------
  215. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:22, Context 1, Stream 7
  216. Section: Command line profiler metrics
  217. ---------------------------------------------------------------------- --------------- ------------------------------
  218. gpu__time_duration.sum usecond 227.78
  219. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  220. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  221. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  222. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  223. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  224. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  225. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  226. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  227. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  228. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  229. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,678.44
  230. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,985
  231. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,420
  232. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,855
  233. smsp__average_warp_latency_issue_stalled_barrier.pct % 116,807.09
  234. smsp__average_warp_latency_issue_stalled_barrier.ratio 1,168.07
  235. smsp__inst_executed.avg inst 189,034.89
  236. smsp__inst_executed.max inst 191,946
  237. smsp__inst_executed.min inst 186,150
  238. smsp__inst_executed.sum inst 12,098,233
  239. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.15
  240. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.12
  241. smsp__cycles_active.avg cycle 316,724.77
  242. smsp__cycles_active.sum cycle 20,270,385
  243. ---------------------------------------------------------------------- --------------- ------------------------------
  244. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:22, Context 1, Stream 7
  245. Section: Command line profiler metrics
  246. ---------------------------------------------------------------------- --------------- ------------------------------
  247. gpu__time_duration.sum usecond 58.43
  248. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  249. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  250. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  251. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  252. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.42
  253. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  254. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  255. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  256. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  257. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  258. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  259. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  260. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  261. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  262. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  263. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  264. smsp__inst_executed.avg inst 12,292.47
  265. smsp__inst_executed.max inst 12,744
  266. smsp__inst_executed.min inst 12,048
  267. smsp__inst_executed.sum inst 786,718
  268. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  269. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  270. smsp__cycles_active.avg cycle 70,511.59
  271. smsp__cycles_active.sum cycle 4,512,742
  272. ---------------------------------------------------------------------- --------------- ------------------------------
  273. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:23, Context 1, Stream 7
  274. Section: Command line profiler metrics
  275. ---------------------------------------------------------------------- --------------- ------------------------------
  276. gpu__time_duration.sum usecond 58.14
  277. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  278. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  279. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  280. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  281. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35
  282. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95
  283. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  284. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  285. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  286. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  287. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  288. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  289. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  290. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  291. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  292. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  293. smsp__inst_executed.avg inst 12,298.75
  294. smsp__inst_executed.max inst 12,734
  295. smsp__inst_executed.min inst 11,912
  296. smsp__inst_executed.sum inst 787,120
  297. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  298. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  299. smsp__cycles_active.avg cycle 71,007.67
  300. smsp__cycles_active.sum cycle 4,544,491
  301. ---------------------------------------------------------------------- --------------- ------------------------------
  302. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:23, Context 1, Stream 7
  303. Section: Command line profiler metrics
  304. ---------------------------------------------------------------------- --------------- ------------------------------
  305. gpu__time_duration.sum usecond 59.55
  306. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  307. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  308. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  309. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  310. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21
  311. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  312. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  313. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  314. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  315. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  316. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  317. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  318. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  319. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  320. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  321. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  322. smsp__inst_executed.avg inst 12,309.34
  323. smsp__inst_executed.max inst 12,774
  324. smsp__inst_executed.min inst 11,741
  325. smsp__inst_executed.sum inst 787,798
  326. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  327. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  328. smsp__cycles_active.avg cycle 73,984.34
  329. smsp__cycles_active.sum cycle 4,734,998
  330. ---------------------------------------------------------------------- --------------- ------------------------------
  331. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:23, Context 1, Stream 7
  332. Section: Command line profiler metrics
  333. ---------------------------------------------------------------------- --------------- ------------------------------
  334. gpu__time_duration.sum usecond 228.67
  335. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  336. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  337. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  338. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  339. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  340. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  341. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  342. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  343. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  344. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  345. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,686.81
  346. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 20,018
  347. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,390
  348. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,989
  349. smsp__average_warp_latency_issue_stalled_barrier.pct % 117,091.55
  350. smsp__average_warp_latency_issue_stalled_barrier.ratio 1,170.92
  351. smsp__inst_executed.avg inst 189,041.44
  352. smsp__inst_executed.max inst 191,914
  353. smsp__inst_executed.min inst 186,023
  354. smsp__inst_executed.sum inst 12,098,652
  355. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.19
  356. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.12
  357. smsp__cycles_active.avg cycle 316,433.83
  358. smsp__cycles_active.sum cycle 20,251,765
  359. ---------------------------------------------------------------------- --------------- ------------------------------
  360. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:23, Context 1, Stream 7
  361. Section: Command line profiler metrics
  362. ---------------------------------------------------------------------- --------------- ------------------------------
  363. gpu__time_duration.sum usecond 58.27
  364. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  365. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  366. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  367. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  368. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46
  369. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  370. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  371. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  372. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  373. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  374. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  375. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  376. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  377. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  378. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  379. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  380. smsp__inst_executed.avg inst 12,290.77
  381. smsp__inst_executed.max inst 12,524
  382. smsp__inst_executed.min inst 12,028
  383. smsp__inst_executed.sum inst 786,609
  384. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  385. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  386. smsp__cycles_active.avg cycle 71,664.83
  387. smsp__cycles_active.sum cycle 4,586,549
  388. ---------------------------------------------------------------------- --------------- ------------------------------
  389. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:23, Context 1, Stream 7
  390. Section: Command line profiler metrics
  391. ---------------------------------------------------------------------- --------------- ------------------------------
  392. gpu__time_duration.sum usecond 58.53
  393. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  394. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  395. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  396. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  397. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43
  398. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  399. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  400. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  401. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  402. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  403. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  404. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  405. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  406. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  407. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  408. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  409. smsp__inst_executed.avg inst 12,292.66
  410. smsp__inst_executed.max inst 12,890
  411. smsp__inst_executed.min inst 11,524
  412. smsp__inst_executed.sum inst 786,730
  413. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  414. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  415. smsp__cycles_active.avg cycle 71,510.14
  416. smsp__cycles_active.sum cycle 4,576,649
  417. ---------------------------------------------------------------------- --------------- ------------------------------
  418. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:23, Context 1, Stream 7
  419. Section: Command line profiler metrics
  420. ---------------------------------------------------------------------- --------------- ------------------------------
  421. gpu__time_duration.sum usecond 58.14
  422. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  423. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  424. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  425. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  426. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.37
  427. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96
  428. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  429. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  430. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  431. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  432. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  433. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  434. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  435. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  436. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  437. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  438. smsp__inst_executed.avg inst 12,298.09
  439. smsp__inst_executed.max inst 12,729
  440. smsp__inst_executed.min inst 11,878
  441. smsp__inst_executed.sum inst 787,078
  442. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  443. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  444. smsp__cycles_active.avg cycle 70,087.36
  445. smsp__cycles_active.sum cycle 4,485,591
  446. ---------------------------------------------------------------------- --------------- ------------------------------
  447. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:23, Context 1, Stream 7
  448. Section: Command line profiler metrics
  449. ---------------------------------------------------------------------- --------------- ------------------------------
  450. gpu__time_duration.sum usecond 59.65
  451. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  452. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  453. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  454. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  455. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22
  456. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  457. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  458. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  459. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  460. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  461. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  462. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  463. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  464. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  465. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  466. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  467. smsp__inst_executed.avg inst 12,308.77
  468. smsp__inst_executed.max inst 12,772
  469. smsp__inst_executed.min inst 11,917
  470. smsp__inst_executed.sum inst 787,761
  471. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  472. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  473. smsp__cycles_active.avg cycle 72,945.83
  474. smsp__cycles_active.sum cycle 4,668,533
  475. ---------------------------------------------------------------------- --------------- ------------------------------
  476. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:23, Context 1, Stream 7
  477. Section: Command line profiler metrics
  478. ---------------------------------------------------------------------- --------------- ------------------------------
  479. gpu__time_duration.sum usecond 231.20
  480. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  481. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  482. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  483. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  484. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  485. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  486. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  487. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  488. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  489. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  490. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,679.50
  491. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,958
  492. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,366
  493. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,872
  494. smsp__average_warp_latency_issue_stalled_barrier.pct % 117,303.63
  495. smsp__average_warp_latency_issue_stalled_barrier.ratio 1,173.04
  496. smsp__inst_executed.avg inst 189,034.44
  497. smsp__inst_executed.max inst 192,047
  498. smsp__inst_executed.min inst 186,006
  499. smsp__inst_executed.sum inst 12,098,204
  500. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.29
  501. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.12
  502. smsp__cycles_active.avg cycle 314,652.92
  503. smsp__cycles_active.sum cycle 20,137,787
  504. ---------------------------------------------------------------------- --------------- ------------------------------
  505. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:23, Context 1, Stream 7
  506. Section: Command line profiler metrics
  507. ---------------------------------------------------------------------- --------------- ------------------------------
  508. gpu__time_duration.sum usecond 58.24
  509. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  510. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  511. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  512. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  513. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48
  514. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  515. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  516. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  517. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  518. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  519. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  520. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  521. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  522. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  523. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  524. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  525. smsp__inst_executed.avg inst 12,289.17
  526. smsp__inst_executed.max inst 12,704
  527. smsp__inst_executed.min inst 11,772
  528. smsp__inst_executed.sum inst 786,507
  529. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  530. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  531. smsp__cycles_active.avg cycle 70,558.91
  532. smsp__cycles_active.sum cycle 4,515,770
  533. ---------------------------------------------------------------------- --------------- ------------------------------
  534. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:23, Context 1, Stream 7
  535. Section: Command line profiler metrics
  536. ---------------------------------------------------------------------- --------------- ------------------------------
  537. gpu__time_duration.sum usecond 58.18
  538. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  539. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  540. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  541. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  542. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.47
  543. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  544. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  545. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  546. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  547. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  548. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  549. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  550. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  551. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  552. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  553. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  554. smsp__inst_executed.avg inst 12,290.81
  555. smsp__inst_executed.max inst 12,711
  556. smsp__inst_executed.min inst 11,886
  557. smsp__inst_executed.sum inst 786,612
  558. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  559. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  560. smsp__cycles_active.avg cycle 71,430.86
  561. smsp__cycles_active.sum cycle 4,571,575
  562. ---------------------------------------------------------------------- --------------- ------------------------------
  563. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:23, Context 1, Stream 7
  564. Section: Command line profiler metrics
  565. ---------------------------------------------------------------------- --------------- ------------------------------
  566. gpu__time_duration.sum usecond 58.46
  567. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  568. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  569. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  570. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  571. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43
  572. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  573. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  574. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  575. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  576. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  577. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  578. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  579. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  580. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  581. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  582. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  583. smsp__inst_executed.avg inst 12,293.34
  584. smsp__inst_executed.max inst 12,505
  585. smsp__inst_executed.min inst 11,800
  586. smsp__inst_executed.sum inst 786,774
  587. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  588. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  589. smsp__cycles_active.avg cycle 71,942.39
  590. smsp__cycles_active.sum cycle 4,604,313
  591. ---------------------------------------------------------------------- --------------- ------------------------------
  592. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:24, Context 1, Stream 7
  593. Section: Command line profiler metrics
  594. ---------------------------------------------------------------------- --------------- ------------------------------
  595. gpu__time_duration.sum usecond 58.18
  596. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  597. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  598. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  599. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  600. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36
  601. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95
  602. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  603. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  604. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  605. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  606. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  607. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  608. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  609. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  610. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  611. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  612. smsp__inst_executed.avg inst 12,298.81
  613. smsp__inst_executed.max inst 12,649
  614. smsp__inst_executed.min inst 12,114
  615. smsp__inst_executed.sum inst 787,124
  616. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  617. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  618. smsp__cycles_active.avg cycle 71,234.89
  619. smsp__cycles_active.sum cycle 4,559,033
  620. ---------------------------------------------------------------------- --------------- ------------------------------
  621. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:24, Context 1, Stream 7
  622. Section: Command line profiler metrics
  623. ---------------------------------------------------------------------- --------------- ------------------------------
  624. gpu__time_duration.sum usecond 59.39
  625. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  626. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  627. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  628. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  629. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22
  630. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  631. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  632. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  633. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  634. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  635. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  636. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  637. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  638. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  639. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  640. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  641. smsp__inst_executed.avg inst 12,308.92
  642. smsp__inst_executed.max inst 12,739
  643. smsp__inst_executed.min inst 11,660
  644. smsp__inst_executed.sum inst 787,771
  645. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  646. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  647. smsp__cycles_active.avg cycle 73,053.95
  648. smsp__cycles_active.sum cycle 4,675,453
  649. ---------------------------------------------------------------------- --------------- ------------------------------
  650. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:24, Context 1, Stream 7
  651. Section: Command line profiler metrics
  652. ---------------------------------------------------------------------- --------------- ------------------------------
  653. gpu__time_duration.sum usecond 228.06
  654. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  655. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  656. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  657. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  658. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  659. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  660. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  661. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  662. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  663. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  664. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,672.12
  665. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 20,021
  666. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,352
  667. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,754
  668. smsp__average_warp_latency_issue_stalled_barrier.pct % 116,983.48
  669. smsp__average_warp_latency_issue_stalled_barrier.ratio 1,169.83
  670. smsp__inst_executed.avg inst 189,031.84
  671. smsp__inst_executed.max inst 192,116
  672. smsp__inst_executed.min inst 185,910
  673. smsp__inst_executed.sum inst 12,098,038
  674. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.18
  675. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.12
  676. smsp__cycles_active.avg cycle 316,447.16
  677. smsp__cycles_active.sum cycle 20,252,618
  678. ---------------------------------------------------------------------- --------------- ------------------------------
  679. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:24, Context 1, Stream 7
  680. Section: Command line profiler metrics
  681. ---------------------------------------------------------------------- --------------- ------------------------------
  682. gpu__time_duration.sum usecond 60.10
  683. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  684. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  685. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  686. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  687. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  688. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  689. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  690. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  691. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  692. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  693. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  694. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  695. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  696. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  697. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  698. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  699. smsp__inst_executed.avg inst 12,288.73
  700. smsp__inst_executed.max inst 12,512
  701. smsp__inst_executed.min inst 12,088
  702. smsp__inst_executed.sum inst 786,479
  703. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  704. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  705. smsp__cycles_active.avg cycle 74,125.95
  706. smsp__cycles_active.sum cycle 4,744,061
  707. ---------------------------------------------------------------------- --------------- ------------------------------
  708. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:24, Context 1, Stream 7
  709. Section: Command line profiler metrics
  710. ---------------------------------------------------------------------- --------------- ------------------------------
  711. gpu__time_duration.sum usecond 58.21
  712. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  713. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  714. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  715. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  716. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48
  717. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  718. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  719. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  720. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  721. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  722. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  723. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  724. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  725. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  726. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  727. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  728. smsp__inst_executed.avg inst 12,289.52
  729. smsp__inst_executed.max inst 12,659
  730. smsp__inst_executed.min inst 12,064
  731. smsp__inst_executed.sum inst 786,529
  732. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  733. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  734. smsp__cycles_active.avg cycle 71,326.66
  735. smsp__cycles_active.sum cycle 4,564,906
  736. ---------------------------------------------------------------------- --------------- ------------------------------
  737. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:24, Context 1, Stream 7
  738. Section: Command line profiler metrics
  739. ---------------------------------------------------------------------- --------------- ------------------------------
  740. gpu__time_duration.sum usecond 58.27
  741. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  742. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  743. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  744. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  745. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.47
  746. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  747. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  748. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  749. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  750. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  751. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  752. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  753. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  754. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  755. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  756. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  757. smsp__inst_executed.avg inst 12,290.06
  758. smsp__inst_executed.max inst 12,821
  759. smsp__inst_executed.min inst 11,676
  760. smsp__inst_executed.sum inst 786,564
  761. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  762. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  763. smsp__cycles_active.avg cycle 71,406.72
  764. smsp__cycles_active.sum cycle 4,570,030
  765. ---------------------------------------------------------------------- --------------- ------------------------------
  766. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:24, Context 1, Stream 7
  767. Section: Command line profiler metrics
  768. ---------------------------------------------------------------------- --------------- ------------------------------
  769. gpu__time_duration.sum usecond 58.50
  770. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  771. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  772. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  773. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  774. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.42
  775. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.97
  776. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  777. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  778. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  779. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  780. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  781. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  782. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  783. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  784. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  785. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  786. smsp__inst_executed.avg inst 12,293.75
  787. smsp__inst_executed.max inst 12,525
  788. smsp__inst_executed.min inst 11,886
  789. smsp__inst_executed.sum inst 786,800
  790. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  791. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  792. smsp__cycles_active.avg cycle 72,203.08
  793. smsp__cycles_active.sum cycle 4,620,997
  794. ---------------------------------------------------------------------- --------------- ------------------------------
  795. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:24, Context 1, Stream 7
  796. Section: Command line profiler metrics
  797. ---------------------------------------------------------------------- --------------- ------------------------------
  798. gpu__time_duration.sum usecond 57.95
  799. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  800. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  801. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  802. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  803. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.37
  804. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96
  805. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  806. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  807. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  808. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  809. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  810. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  811. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  812. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  813. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  814. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  815. smsp__inst_executed.avg inst 12,298.52
  816. smsp__inst_executed.max inst 12,534
  817. smsp__inst_executed.min inst 12,060
  818. smsp__inst_executed.sum inst 787,105
  819. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  820. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  821. smsp__cycles_active.avg cycle 70,465.11
  822. smsp__cycles_active.sum cycle 4,509,767
  823. ---------------------------------------------------------------------- --------------- ------------------------------
  824. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:24, Context 1, Stream 7
  825. Section: Command line profiler metrics
  826. ---------------------------------------------------------------------- --------------- ------------------------------
  827. gpu__time_duration.sum usecond 59.39
  828. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  829. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  830. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  831. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  832. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22
  833. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  834. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  835. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  836. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  837. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  838. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  839. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  840. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  841. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  842. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  843. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  844. smsp__inst_executed.avg inst 12,309.41
  845. smsp__inst_executed.max inst 12,967
  846. smsp__inst_executed.min inst 11,668
  847. smsp__inst_executed.sum inst 787,802
  848. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  849. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  850. smsp__cycles_active.avg cycle 73,798.33
  851. smsp__cycles_active.sum cycle 4,723,093
  852. ---------------------------------------------------------------------- --------------- ------------------------------
  853. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:24, Context 1, Stream 7
  854. Section: Command line profiler metrics
  855. ---------------------------------------------------------------------- --------------- ------------------------------
  856. gpu__time_duration.sum usecond 228.83
  857. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  858. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  859. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  860. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  861. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  862. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  863. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  864. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  865. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  866. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  867. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,669.75
  868. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,971
  869. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,352
  870. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,716
  871. smsp__average_warp_latency_issue_stalled_barrier.pct % 118,094.62
  872. smsp__average_warp_latency_issue_stalled_barrier.ratio 1,180.95
  873. smsp__inst_executed.avg inst 189,037.22
  874. smsp__inst_executed.max inst 192,039
  875. smsp__inst_executed.min inst 186,030
  876. smsp__inst_executed.sum inst 12,098,382
  877. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.36
  878. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.12
  879. smsp__cycles_active.avg cycle 315,017.08
  880. smsp__cycles_active.sum cycle 20,161,093
  881. ---------------------------------------------------------------------- --------------- ------------------------------
  882. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:25, Context 1, Stream 7
  883. Section: Command line profiler metrics
  884. ---------------------------------------------------------------------- --------------- ------------------------------
  885. gpu__time_duration.sum usecond 58.72
  886. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  887. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  888. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  889. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  890. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  891. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  892. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  893. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  894. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  895. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  896. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  897. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  898. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  899. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  900. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  901. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  902. smsp__inst_executed.avg inst 12,288.30
  903. smsp__inst_executed.max inst 12,664
  904. smsp__inst_executed.min inst 11,904
  905. smsp__inst_executed.sum inst 786,451
  906. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  907. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  908. smsp__cycles_active.avg cycle 72,385.16
  909. smsp__cycles_active.sum cycle 4,632,650
  910. ---------------------------------------------------------------------- --------------- ------------------------------
  911. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:25, Context 1, Stream 7
  912. Section: Command line profiler metrics
  913. ---------------------------------------------------------------------- --------------- ------------------------------
  914. gpu__time_duration.sum usecond 60.03
  915. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  916. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  917. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  918. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  919. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  920. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  921. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  922. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  923. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  924. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  925. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  926. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  927. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  928. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  929. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  930. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  931. smsp__inst_executed.avg inst 12,288.97
  932. smsp__inst_executed.max inst 12,828
  933. smsp__inst_executed.min inst 11,696
  934. smsp__inst_executed.sum inst 786,494
  935. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  936. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  937. smsp__cycles_active.avg cycle 74,209.61
  938. smsp__cycles_active.sum cycle 4,749,415
  939. ---------------------------------------------------------------------- --------------- ------------------------------
  940. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:25, Context 1, Stream 7
  941. Section: Command line profiler metrics
  942. ---------------------------------------------------------------------- --------------- ------------------------------
  943. gpu__time_duration.sum usecond 57.89
  944. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  945. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  946. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  947. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  948. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48
  949. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  950. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  951. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  952. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  953. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  954. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  955. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  956. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  957. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  958. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  959. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  960. smsp__inst_executed.avg inst 12,289.61
  961. smsp__inst_executed.max inst 12,704
  962. smsp__inst_executed.min inst 11,952
  963. smsp__inst_executed.sum inst 786,535
  964. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  965. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  966. smsp__cycles_active.avg cycle 71,674.52
  967. smsp__cycles_active.sum cycle 4,587,169
  968. ---------------------------------------------------------------------- --------------- ------------------------------
  969. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:25, Context 1, Stream 7
  970. Section: Command line profiler metrics
  971. ---------------------------------------------------------------------- --------------- ------------------------------
  972. gpu__time_duration.sum usecond 58.37
  973. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  974. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  975. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  976. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  977. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46
  978. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  979. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  980. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  981. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  982. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  983. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  984. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  985. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  986. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  987. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  988. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  989. smsp__inst_executed.avg inst 12,291.02
  990. smsp__inst_executed.max inst 12,520
  991. smsp__inst_executed.min inst 12,108
  992. smsp__inst_executed.sum inst 786,625
  993. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  994. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  995. smsp__cycles_active.avg cycle 71,742.84
  996. smsp__cycles_active.sum cycle 4,591,542
  997. ---------------------------------------------------------------------- --------------- ------------------------------
  998. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:25, Context 1, Stream 7
  999. Section: Command line profiler metrics
  1000. ---------------------------------------------------------------------- --------------- ------------------------------
  1001. gpu__time_duration.sum usecond 58.62
  1002. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1003. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1004. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1005. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1006. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.42
  1007. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  1008. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1009. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1010. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1011. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1012. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1013. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1014. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1015. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1016. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1017. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1018. smsp__inst_executed.avg inst 12,293.81
  1019. smsp__inst_executed.max inst 12,676
  1020. smsp__inst_executed.min inst 12,021
  1021. smsp__inst_executed.sum inst 786,804
  1022. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1023. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1024. smsp__cycles_active.avg cycle 71,224
  1025. smsp__cycles_active.sum cycle 4,558,336
  1026. ---------------------------------------------------------------------- --------------- ------------------------------
  1027. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:25, Context 1, Stream 7
  1028. Section: Command line profiler metrics
  1029. ---------------------------------------------------------------------- --------------- ------------------------------
  1030. gpu__time_duration.sum usecond 58.27
  1031. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1032. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1033. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1034. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1035. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36
  1036. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95
  1037. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1038. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1039. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1040. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1041. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1042. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1043. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1044. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1045. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1046. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1047. smsp__inst_executed.avg inst 12,298.45
  1048. smsp__inst_executed.max inst 12,492
  1049. smsp__inst_executed.min inst 11,896
  1050. smsp__inst_executed.sum inst 787,101
  1051. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1052. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1053. smsp__cycles_active.avg cycle 71,438.62
  1054. smsp__cycles_active.sum cycle 4,572,072
  1055. ---------------------------------------------------------------------- --------------- ------------------------------
  1056. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:25, Context 1, Stream 7
  1057. Section: Command line profiler metrics
  1058. ---------------------------------------------------------------------- --------------- ------------------------------
  1059. gpu__time_duration.sum usecond 59.46
  1060. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1061. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1062. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1063. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1064. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22
  1065. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  1066. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1067. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1068. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1069. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1070. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1071. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1072. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1073. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1074. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1075. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1076. smsp__inst_executed.avg inst 12,309.52
  1077. smsp__inst_executed.max inst 12,762
  1078. smsp__inst_executed.min inst 11,951
  1079. smsp__inst_executed.sum inst 787,809
  1080. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1081. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1082. smsp__cycles_active.avg cycle 73,469.19
  1083. smsp__cycles_active.sum cycle 4,702,028
  1084. ---------------------------------------------------------------------- --------------- ------------------------------
  1085. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:25, Context 1, Stream 7
  1086. Section: Command line profiler metrics
  1087. ---------------------------------------------------------------------- --------------- ------------------------------
  1088. gpu__time_duration.sum usecond 228
  1089. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1090. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1091. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1092. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1093. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1094. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  1095. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  1096. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  1097. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  1098. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  1099. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,690.44
  1100. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,977
  1101. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,373
  1102. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 315,047
  1103. smsp__average_warp_latency_issue_stalled_barrier.pct % 116,796.43
  1104. smsp__average_warp_latency_issue_stalled_barrier.ratio 1,167.96
  1105. smsp__inst_executed.avg inst 189,032.39
  1106. smsp__inst_executed.max inst 192,016
  1107. smsp__inst_executed.min inst 186,010
  1108. smsp__inst_executed.sum inst 12,098,073
  1109. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.20
  1110. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.12
  1111. smsp__cycles_active.avg cycle 315,448.41
  1112. smsp__cycles_active.sum cycle 20,188,698
  1113. ---------------------------------------------------------------------- --------------- ------------------------------
  1114. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:25, Context 1, Stream 7
  1115. Section: Command line profiler metrics
  1116. ---------------------------------------------------------------------- --------------- ------------------------------
  1117. gpu__time_duration.sum usecond 58.66
  1118. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1119. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1120. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1121. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1122. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1123. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1124. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1125. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1126. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1127. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1128. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1129. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1130. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1131. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1132. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1133. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1134. smsp__inst_executed.avg inst 12,288.34
  1135. smsp__inst_executed.max inst 12,672
  1136. smsp__inst_executed.min inst 12,084
  1137. smsp__inst_executed.sum inst 786,454
  1138. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1139. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1140. smsp__cycles_active.avg cycle 72,637.31
  1141. smsp__cycles_active.sum cycle 4,648,788
  1142. ---------------------------------------------------------------------- --------------- ------------------------------
  1143. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:25, Context 1, Stream 7
  1144. Section: Command line profiler metrics
  1145. ---------------------------------------------------------------------- --------------- ------------------------------
  1146. gpu__time_duration.sum usecond 59.23
  1147. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1148. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1149. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1150. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1151. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  1152. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1153. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1154. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1155. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1156. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1157. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1158. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1159. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1160. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1161. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1162. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1163. smsp__inst_executed.avg inst 12,288.91
  1164. smsp__inst_executed.max inst 12,665
  1165. smsp__inst_executed.min inst 11,904
  1166. smsp__inst_executed.sum inst 786,490
  1167. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1168. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1169. smsp__cycles_active.avg cycle 72,738.33
  1170. smsp__cycles_active.sum cycle 4,655,253
  1171. ---------------------------------------------------------------------- --------------- ------------------------------
  1172. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:26, Context 1, Stream 7
  1173. Section: Command line profiler metrics
  1174. ---------------------------------------------------------------------- --------------- ------------------------------
  1175. gpu__time_duration.sum usecond 60.22
  1176. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1177. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1178. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1179. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1180. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  1181. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1182. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1183. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1184. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1185. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1186. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1187. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1188. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1189. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1190. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1191. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1192. smsp__inst_executed.avg inst 12,289.70
  1193. smsp__inst_executed.max inst 12,664
  1194. smsp__inst_executed.min inst 11,496
  1195. smsp__inst_executed.sum inst 786,541
  1196. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1197. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1198. smsp__cycles_active.avg cycle 74,202.39
  1199. smsp__cycles_active.sum cycle 4,748,953
  1200. ---------------------------------------------------------------------- --------------- ------------------------------
  1201. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:26, Context 1, Stream 7
  1202. Section: Command line profiler metrics
  1203. ---------------------------------------------------------------------- --------------- ------------------------------
  1204. gpu__time_duration.sum usecond 58.30
  1205. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1206. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1207. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1208. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1209. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  1210. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1211. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1212. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1213. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1214. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1215. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1216. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1217. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1218. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1219. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1220. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1221. smsp__inst_executed.avg inst 12,289.03
  1222. smsp__inst_executed.max inst 12,656
  1223. smsp__inst_executed.min inst 11,886
  1224. smsp__inst_executed.sum inst 786,498
  1225. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1226. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1227. smsp__cycles_active.avg cycle 71,574
  1228. smsp__cycles_active.sum cycle 4,580,736
  1229. ---------------------------------------------------------------------- --------------- ------------------------------
  1230. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:26, Context 1, Stream 7
  1231. Section: Command line profiler metrics
  1232. ---------------------------------------------------------------------- --------------- ------------------------------
  1233. gpu__time_duration.sum usecond 58.30
  1234. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1235. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1236. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1237. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1238. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46
  1239. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  1240. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1241. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1242. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1243. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1244. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1245. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1246. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1247. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1248. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1249. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1250. smsp__inst_executed.avg inst 12,290.56
  1251. smsp__inst_executed.max inst 12,493
  1252. smsp__inst_executed.min inst 12,052
  1253. smsp__inst_executed.sum inst 786,596
  1254. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1255. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1256. smsp__cycles_active.avg cycle 71,559.62
  1257. smsp__cycles_active.sum cycle 4,579,816
  1258. ---------------------------------------------------------------------- --------------- ------------------------------
  1259. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:26, Context 1, Stream 7
  1260. Section: Command line profiler metrics
  1261. ---------------------------------------------------------------------- --------------- ------------------------------
  1262. gpu__time_duration.sum usecond 58.53
  1263. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1264. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1265. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1266. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1267. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43
  1268. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  1269. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1270. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1271. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1272. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1273. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1274. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1275. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1276. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1277. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1278. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1279. smsp__inst_executed.avg inst 12,293.20
  1280. smsp__inst_executed.max inst 12,558
  1281. smsp__inst_executed.min inst 12,110
  1282. smsp__inst_executed.sum inst 786,765
  1283. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1284. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1285. smsp__cycles_active.avg cycle 71,728.27
  1286. smsp__cycles_active.sum cycle 4,590,609
  1287. ---------------------------------------------------------------------- --------------- ------------------------------
  1288. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:26, Context 1, Stream 7
  1289. Section: Command line profiler metrics
  1290. ---------------------------------------------------------------------- --------------- ------------------------------
  1291. gpu__time_duration.sum usecond 58.08
  1292. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1293. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1294. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1295. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1296. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36
  1297. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96
  1298. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1299. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1300. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1301. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1302. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1303. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1304. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1305. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1306. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1307. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1308. smsp__inst_executed.avg inst 12,297.78
  1309. smsp__inst_executed.max inst 12,685
  1310. smsp__inst_executed.min inst 12,030
  1311. smsp__inst_executed.sum inst 787,058
  1312. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1313. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1314. smsp__cycles_active.avg cycle 71,361.31
  1315. smsp__cycles_active.sum cycle 4,567,124
  1316. ---------------------------------------------------------------------- --------------- ------------------------------
  1317. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:26, Context 1, Stream 7
  1318. Section: Command line profiler metrics
  1319. ---------------------------------------------------------------------- --------------- ------------------------------
  1320. gpu__time_duration.sum usecond 59.58
  1321. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1322. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1323. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1324. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1325. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22
  1326. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  1327. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1328. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1329. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1330. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1331. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1332. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1333. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1334. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1335. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1336. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1337. smsp__inst_executed.avg inst 12,308.97
  1338. smsp__inst_executed.max inst 12,728
  1339. smsp__inst_executed.min inst 12,067
  1340. smsp__inst_executed.sum inst 787,774
  1341. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1342. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1343. smsp__cycles_active.avg cycle 73,261.38
  1344. smsp__cycles_active.sum cycle 4,688,728
  1345. ---------------------------------------------------------------------- --------------- ------------------------------
  1346. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:26, Context 1, Stream 7
  1347. Section: Command line profiler metrics
  1348. ---------------------------------------------------------------------- --------------- ------------------------------
  1349. gpu__time_duration.sum usecond 228.22
  1350. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1351. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1352. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1353. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1354. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1355. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  1356. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  1357. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  1358. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  1359. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  1360. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,686.12
  1361. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,951
  1362. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,409
  1363. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,978
  1364. smsp__average_warp_latency_issue_stalled_barrier.pct % 117,160.55
  1365. smsp__average_warp_latency_issue_stalled_barrier.ratio 1,171.61
  1366. smsp__inst_executed.avg inst 189,022.41
  1367. smsp__inst_executed.max inst 192,049
  1368. smsp__inst_executed.min inst 186,033
  1369. smsp__inst_executed.sum inst 12,097,434
  1370. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.23
  1371. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.12
  1372. smsp__cycles_active.avg cycle 315,736.44
  1373. smsp__cycles_active.sum cycle 20,207,132
  1374. ---------------------------------------------------------------------- --------------- ------------------------------
  1375. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:26, Context 1, Stream 7
  1376. Section: Command line profiler metrics
  1377. ---------------------------------------------------------------------- --------------- ------------------------------
  1378. gpu__time_duration.sum usecond 60.86
  1379. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1380. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1381. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1382. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1383. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1384. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1385. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1386. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1387. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1388. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1389. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1390. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1391. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1392. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1393. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1394. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1395. smsp__inst_executed.avg inst 12,288.12
  1396. smsp__inst_executed.max inst 12,680
  1397. smsp__inst_executed.min inst 11,716
  1398. smsp__inst_executed.sum inst 786,440
  1399. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1400. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1401. smsp__cycles_active.avg cycle 74,287.59
  1402. smsp__cycles_active.sum cycle 4,754,406
  1403. ---------------------------------------------------------------------- --------------- ------------------------------
  1404. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:26, Context 1, Stream 7
  1405. Section: Command line profiler metrics
  1406. ---------------------------------------------------------------------- --------------- ------------------------------
  1407. gpu__time_duration.sum usecond 58.78
  1408. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1409. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1410. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1411. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1412. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1413. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1414. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1415. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1416. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1417. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1418. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1419. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1420. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1421. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1422. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1423. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1424. smsp__inst_executed.avg inst 12,288.20
  1425. smsp__inst_executed.max inst 12,672
  1426. smsp__inst_executed.min inst 12,092
  1427. smsp__inst_executed.sum inst 786,445
  1428. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1429. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1430. smsp__cycles_active.avg cycle 72,712.78
  1431. smsp__cycles_active.sum cycle 4,653,618
  1432. ---------------------------------------------------------------------- --------------- ------------------------------
  1433. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:26, Context 1, Stream 7
  1434. Section: Command line profiler metrics
  1435. ---------------------------------------------------------------------- --------------- ------------------------------
  1436. gpu__time_duration.sum usecond 58.56
  1437. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1438. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1439. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1440. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1441. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1442. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1443. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1444. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1445. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1446. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1447. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1448. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1449. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1450. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1451. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1452. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1453. smsp__inst_executed.avg inst 12,287.97
  1454. smsp__inst_executed.max inst 12,860
  1455. smsp__inst_executed.min inst 12,094
  1456. smsp__inst_executed.sum inst 786,430
  1457. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1458. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1459. smsp__cycles_active.avg cycle 71,055.23
  1460. smsp__cycles_active.sum cycle 4,547,535
  1461. ---------------------------------------------------------------------- --------------- ------------------------------
  1462. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:27, Context 1, Stream 7
  1463. Section: Command line profiler metrics
  1464. ---------------------------------------------------------------------- --------------- ------------------------------
  1465. gpu__time_duration.sum usecond 60.32
  1466. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1467. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1468. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1469. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1470. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  1471. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1472. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1473. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1474. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1475. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1476. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1477. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1478. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1479. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1480. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1481. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1482. smsp__inst_executed.avg inst 12,288.66
  1483. smsp__inst_executed.max inst 12,509
  1484. smsp__inst_executed.min inst 11,904
  1485. smsp__inst_executed.sum inst 786,474
  1486. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1487. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1488. smsp__cycles_active.avg cycle 72,694.91
  1489. smsp__cycles_active.sum cycle 4,652,474
  1490. ---------------------------------------------------------------------- --------------- ------------------------------
  1491. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:27, Context 1, Stream 7
  1492. Section: Command line profiler metrics
  1493. ---------------------------------------------------------------------- --------------- ------------------------------
  1494. gpu__time_duration.sum usecond 58.08
  1495. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1496. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1497. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1498. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1499. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48
  1500. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  1501. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1502. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1503. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1504. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1505. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1506. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1507. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1508. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1509. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1510. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1511. smsp__inst_executed.avg inst 12,289.78
  1512. smsp__inst_executed.max inst 12,493
  1513. smsp__inst_executed.min inst 12,050
  1514. smsp__inst_executed.sum inst 786,546
  1515. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1516. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1517. smsp__cycles_active.avg cycle 70,318.98
  1518. smsp__cycles_active.sum cycle 4,500,415
  1519. ---------------------------------------------------------------------- --------------- ------------------------------
  1520. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:27, Context 1, Stream 7
  1521. Section: Command line profiler metrics
  1522. ---------------------------------------------------------------------- --------------- ------------------------------
  1523. gpu__time_duration.sum usecond 58.27
  1524. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1525. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1526. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1527. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1528. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46
  1529. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  1530. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1531. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1532. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1533. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1534. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1535. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1536. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1537. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1538. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1539. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1540. smsp__inst_executed.avg inst 12,290.84
  1541. smsp__inst_executed.max inst 12,691
  1542. smsp__inst_executed.min inst 12,005
  1543. smsp__inst_executed.sum inst 786,614
  1544. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1545. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1546. smsp__cycles_active.avg cycle 70,641.02
  1547. smsp__cycles_active.sum cycle 4,521,025
  1548. ---------------------------------------------------------------------- --------------- ------------------------------
  1549. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:27, Context 1, Stream 7
  1550. Section: Command line profiler metrics
  1551. ---------------------------------------------------------------------- --------------- ------------------------------
  1552. gpu__time_duration.sum usecond 58.59
  1553. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1554. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1555. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1556. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1557. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43
  1558. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  1559. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1560. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1561. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1562. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1563. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1564. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1565. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1566. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1567. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1568. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1569. smsp__inst_executed.avg inst 12,294.64
  1570. smsp__inst_executed.max inst 12,527
  1571. smsp__inst_executed.min inst 12,057
  1572. smsp__inst_executed.sum inst 786,857
  1573. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1574. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1575. smsp__cycles_active.avg cycle 72,218
  1576. smsp__cycles_active.sum cycle 4,621,952
  1577. ---------------------------------------------------------------------- --------------- ------------------------------
  1578. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:27, Context 1, Stream 7
  1579. Section: Command line profiler metrics
  1580. ---------------------------------------------------------------------- --------------- ------------------------------
  1581. gpu__time_duration.sum usecond 58.14
  1582. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1583. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1584. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1585. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1586. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.37
  1587. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96
  1588. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1589. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1590. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1591. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1592. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1593. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1594. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1595. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1596. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1597. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1598. smsp__inst_executed.avg inst 12,300.36
  1599. smsp__inst_executed.max inst 12,681
  1600. smsp__inst_executed.min inst 11,918
  1601. smsp__inst_executed.sum inst 787,223
  1602. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1603. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1604. smsp__cycles_active.avg cycle 71,114.91
  1605. smsp__cycles_active.sum cycle 4,551,354
  1606. ---------------------------------------------------------------------- --------------- ------------------------------
  1607. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:27, Context 1, Stream 7
  1608. Section: Command line profiler metrics
  1609. ---------------------------------------------------------------------- --------------- ------------------------------
  1610. gpu__time_duration.sum usecond 59.42
  1611. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1612. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1613. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1614. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1615. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22
  1616. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  1617. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1618. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1619. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1620. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1621. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1622. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1623. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1624. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1625. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1626. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1627. smsp__inst_executed.avg inst 12,315.06
  1628. smsp__inst_executed.max inst 12,754
  1629. smsp__inst_executed.min inst 11,731
  1630. smsp__inst_executed.sum inst 788,164
  1631. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1632. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1633. smsp__cycles_active.avg cycle 73,447.34
  1634. smsp__cycles_active.sum cycle 4,700,630
  1635. ---------------------------------------------------------------------- --------------- ------------------------------
  1636. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:27, Context 1, Stream 7
  1637. Section: Command line profiler metrics
  1638. ---------------------------------------------------------------------- --------------- ------------------------------
  1639. gpu__time_duration.sum usecond 231.01
  1640. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1641. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1642. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1643. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1644. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1645. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  1646. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  1647. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  1648. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  1649. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  1650. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,667.44
  1651. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,990
  1652. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,305
  1653. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,679
  1654. smsp__average_warp_latency_issue_stalled_barrier.pct % 118,065.21
  1655. smsp__average_warp_latency_issue_stalled_barrier.ratio 1,180.65
  1656. smsp__inst_executed.avg inst 189,024.22
  1657. smsp__inst_executed.max inst 192,046
  1658. smsp__inst_executed.min inst 186,039
  1659. smsp__inst_executed.sum inst 12,097,550
  1660. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.35
  1661. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.12
  1662. smsp__cycles_active.avg cycle 315,105.62
  1663. smsp__cycles_active.sum cycle 20,166,760
  1664. ---------------------------------------------------------------------- --------------- ------------------------------
  1665. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:27, Context 1, Stream 7
  1666. Section: Command line profiler metrics
  1667. ---------------------------------------------------------------------- --------------- ------------------------------
  1668. gpu__time_duration.sum usecond 56.86
  1669. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1670. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1671. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1672. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1673. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1674. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1675. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1676. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1677. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1678. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1679. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1680. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1681. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1682. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1683. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1684. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1685. smsp__inst_executed.avg inst 12,288.06
  1686. smsp__inst_executed.max inst 12,680
  1687. smsp__inst_executed.min inst 11,896
  1688. smsp__inst_executed.sum inst 786,436
  1689. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1690. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1691. smsp__cycles_active.avg cycle 69,539.08
  1692. smsp__cycles_active.sum cycle 4,450,501
  1693. ---------------------------------------------------------------------- --------------- ------------------------------
  1694. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:27, Context 1, Stream 7
  1695. Section: Command line profiler metrics
  1696. ---------------------------------------------------------------------- --------------- ------------------------------
  1697. gpu__time_duration.sum usecond 60.74
  1698. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1699. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1700. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1701. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1702. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1703. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1704. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1705. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1706. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1707. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1708. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1709. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1710. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1711. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1712. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1713. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1714. smsp__inst_executed.avg inst 12,288.19
  1715. smsp__inst_executed.max inst 12,488
  1716. smsp__inst_executed.min inst 11,700
  1717. smsp__inst_executed.sum inst 786,444
  1718. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1719. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1720. smsp__cycles_active.avg cycle 75,362.86
  1721. smsp__cycles_active.sum cycle 4,823,223
  1722. ---------------------------------------------------------------------- --------------- ------------------------------
  1723. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:27, Context 1, Stream 7
  1724. Section: Command line profiler metrics
  1725. ---------------------------------------------------------------------- --------------- ------------------------------
  1726. gpu__time_duration.sum usecond 58.75
  1727. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1728. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1729. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1730. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1731. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1732. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1733. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1734. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1735. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1736. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1737. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1738. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1739. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1740. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1741. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1742. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1743. smsp__inst_executed.avg inst 12,288.42
  1744. smsp__inst_executed.max inst 12,668
  1745. smsp__inst_executed.min inst 11,916
  1746. smsp__inst_executed.sum inst 786,459
  1747. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1748. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1749. smsp__cycles_active.avg cycle 72,588.78
  1750. smsp__cycles_active.sum cycle 4,645,682
  1751. ---------------------------------------------------------------------- --------------- ------------------------------
  1752. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:27, Context 1, Stream 7
  1753. Section: Command line profiler metrics
  1754. ---------------------------------------------------------------------- --------------- ------------------------------
  1755. gpu__time_duration.sum usecond 59.01
  1756. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1757. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1758. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1759. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1760. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  1761. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1762. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1763. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1764. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1765. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1766. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1767. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1768. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1769. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1770. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1771. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1772. smsp__inst_executed.avg inst 12,287.92
  1773. smsp__inst_executed.max inst 12,489
  1774. smsp__inst_executed.min inst 11,942
  1775. smsp__inst_executed.sum inst 786,427
  1776. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1777. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1778. smsp__cycles_active.avg cycle 72,949.64
  1779. smsp__cycles_active.sum cycle 4,668,777
  1780. ---------------------------------------------------------------------- --------------- ------------------------------
  1781. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:28, Context 1, Stream 7
  1782. Section: Command line profiler metrics
  1783. ---------------------------------------------------------------------- --------------- ------------------------------
  1784. gpu__time_duration.sum usecond 59.97
  1785. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1786. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1787. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1788. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1789. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  1790. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1791. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1792. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1793. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1794. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1795. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1796. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1797. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1798. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1799. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1800. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1801. smsp__inst_executed.avg inst 12,288.47
  1802. smsp__inst_executed.max inst 12,707
  1803. smsp__inst_executed.min inst 11,904
  1804. smsp__inst_executed.sum inst 786,462
  1805. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1806. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1807. smsp__cycles_active.avg cycle 74,444.11
  1808. smsp__cycles_active.sum cycle 4,764,423
  1809. ---------------------------------------------------------------------- --------------- ------------------------------
  1810. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:28, Context 1, Stream 7
  1811. Section: Command line profiler metrics
  1812. ---------------------------------------------------------------------- --------------- ------------------------------
  1813. gpu__time_duration.sum usecond 58.43
  1814. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1815. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1816. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1817. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1818. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48
  1819. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  1820. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1821. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1822. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1823. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1824. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1825. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1826. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1827. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1828. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1829. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1830. smsp__inst_executed.avg inst 12,289.03
  1831. smsp__inst_executed.max inst 12,510
  1832. smsp__inst_executed.min inst 11,724
  1833. smsp__inst_executed.sum inst 786,498
  1834. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1835. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1836. smsp__cycles_active.avg cycle 70,381.17
  1837. smsp__cycles_active.sum cycle 4,504,395
  1838. ---------------------------------------------------------------------- --------------- ------------------------------
  1839. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:28, Context 1, Stream 7
  1840. Section: Command line profiler metrics
  1841. ---------------------------------------------------------------------- --------------- ------------------------------
  1842. gpu__time_duration.sum usecond 58.34
  1843. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1844. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1845. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1846. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1847. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46
  1848. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  1849. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1850. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1851. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1852. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1853. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1854. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1855. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1856. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1857. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1858. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1859. smsp__inst_executed.avg inst 12,290.17
  1860. smsp__inst_executed.max inst 12,557
  1861. smsp__inst_executed.min inst 12,092
  1862. smsp__inst_executed.sum inst 786,571
  1863. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1864. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1865. smsp__cycles_active.avg cycle 70,414.30
  1866. smsp__cycles_active.sum cycle 4,506,515
  1867. ---------------------------------------------------------------------- --------------- ------------------------------
  1868. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:28, Context 1, Stream 7
  1869. Section: Command line profiler metrics
  1870. ---------------------------------------------------------------------- --------------- ------------------------------
  1871. gpu__time_duration.sum usecond 58.24
  1872. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1873. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1874. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1875. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1876. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43
  1877. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  1878. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1879. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1880. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1881. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1882. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1883. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1884. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1885. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1886. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1887. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1888. smsp__inst_executed.avg inst 12,293.06
  1889. smsp__inst_executed.max inst 12,927
  1890. smsp__inst_executed.min inst 11,694
  1891. smsp__inst_executed.sum inst 786,756
  1892. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1893. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1894. smsp__cycles_active.avg cycle 70,712.52
  1895. smsp__cycles_active.sum cycle 4,525,601
  1896. ---------------------------------------------------------------------- --------------- ------------------------------
  1897. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:28, Context 1, Stream 7
  1898. Section: Command line profiler metrics
  1899. ---------------------------------------------------------------------- --------------- ------------------------------
  1900. gpu__time_duration.sum usecond 57.92
  1901. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1902. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1903. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1904. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1905. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36
  1906. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95
  1907. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1908. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1909. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1910. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1911. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1912. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1913. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1914. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1915. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1916. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1917. smsp__inst_executed.avg inst 12,298.02
  1918. smsp__inst_executed.max inst 12,742
  1919. smsp__inst_executed.min inst 11,876
  1920. smsp__inst_executed.sum inst 787,073
  1921. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1922. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1923. smsp__cycles_active.avg cycle 70,394.77
  1924. smsp__cycles_active.sum cycle 4,505,265
  1925. ---------------------------------------------------------------------- --------------- ------------------------------
  1926. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:28, Context 1, Stream 7
  1927. Section: Command line profiler metrics
  1928. ---------------------------------------------------------------------- --------------- ------------------------------
  1929. gpu__time_duration.sum usecond 59.39
  1930. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1931. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1932. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1933. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1934. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.23
  1935. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  1936. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1937. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1938. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1939. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1940. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1941. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1942. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1943. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1944. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1945. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1946. smsp__inst_executed.avg inst 12,308.81
  1947. smsp__inst_executed.max inst 12,825
  1948. smsp__inst_executed.min inst 11,716
  1949. smsp__inst_executed.sum inst 787,764
  1950. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1951. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1952. smsp__cycles_active.avg cycle 72,582.98
  1953. smsp__cycles_active.sum cycle 4,645,311
  1954. ---------------------------------------------------------------------- --------------- ------------------------------
  1955. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 16:07:28, Context 1, Stream 7
  1956. Section: Command line profiler metrics
  1957. ---------------------------------------------------------------------- --------------- ------------------------------
  1958. gpu__time_duration.sum usecond 227.52
  1959. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1960. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1961. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1962. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1963. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1964. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  1965. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  1966. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  1967. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  1968. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  1969. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,692.19
  1970. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 20,059
  1971. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,488
  1972. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 315,075
  1973. smsp__average_warp_latency_issue_stalled_barrier.pct % 118,035.67
  1974. smsp__average_warp_latency_issue_stalled_barrier.ratio 1,180.36
  1975. smsp__inst_executed.avg inst 188,792.84
  1976. smsp__inst_executed.max inst 191,758
  1977. smsp__inst_executed.min inst 185,797
  1978. smsp__inst_executed.sum inst 12,082,742
  1979. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.34
  1980. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.12
  1981. smsp__cycles_active.avg cycle 315,152.44
  1982. smsp__cycles_active.sum cycle 20,169,756
  1983. ---------------------------------------------------------------------- --------------- ------------------------------