AUTH's THMMY "Parallel and distributed systems" course assignments.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

2050 lines
228 KiB

  1. ==PROF== Connected to process 23012 (/home/hoo2/Work/AUTH/PDS/homework_3/out/v2/bitonicCUDA)
  2. ==PROF== Profiling "prephase" - 1: 0%....50%....100% - 6 passes
  3. ==PROF== Profiling "interBlockStep" - 2: 0%....50%....100% - 6 passes
  4. ==PROF== Profiling "inBlockStep" - 3: 0%....50%....100% - 6 passes
  5. ==PROF== Profiling "interBlockStep" - 4: 0%....50%....100% - 6 passes
  6. ==PROF== Profiling "interBlockStep" - 5: 0%....50%....100% - 6 passes
  7. ==PROF== Profiling "inBlockStep" - 6: 0%....50%....100% - 6 passes
  8. ==PROF== Profiling "interBlockStep" - 7: 0%....50%....100% - 6 passes
  9. ==PROF== Profiling "interBlockStep" - 8: 0%....50%....100% - 6 passes
  10. ==PROF== Profiling "interBlockStep" - 9: 0%....50%....100% - 6 passes
  11. ==PROF== Profiling "inBlockStep" - 10: 0%....50%....100% - 6 passes
  12. ==PROF== Profiling "interBlockStep" - 11: 0%....50%....100% - 6 passes
  13. ==PROF== Profiling "interBlockStep" - 12: 0%....50%....100% - 6 passes
  14. ==PROF== Profiling "interBlockStep" - 13: 0%....50%....100% - 6 passes
  15. ==PROF== Profiling "interBlockStep" - 14: 0%....50%....100% - 6 passes
  16. ==PROF== Profiling "inBlockStep" - 15: 0%....50%....100% - 6 passes
  17. ==PROF== Profiling "interBlockStep" - 16: 0%....50%....100% - 6 passes
  18. ==PROF== Profiling "interBlockStep" - 17: 0%....50%....100% - 6 passes
  19. ==PROF== Profiling "interBlockStep" - 18: 0%....50%....100% - 6 passes
  20. ==PROF== Profiling "interBlockStep" - 19: 0%....50%....100% - 6 passes
  21. ==PROF== Profiling "interBlockStep" - 20: 0%....50%....100% - 6 passes
  22. ==PROF== Profiling "inBlockStep" - 21: 0%....50%....100% - 6 passes
  23. ==PROF== Profiling "interBlockStep" - 22: 0%....50%....100% - 6 passes
  24. ==PROF== Profiling "interBlockStep" - 23: 0%....50%....100% - 6 passes
  25. ==PROF== Profiling "interBlockStep" - 24: 0%....50%....100% - 6 passes
  26. ==PROF== Profiling "interBlockStep" - 25: 0%....50%....100% - 6 passes
  27. ==PROF== Profiling "interBlockStep" - 26: 0%....50%....100% - 6 passes
  28. ==PROF== Profiling "interBlockStep" - 27: 0%....50%....100% - 6 passes
  29. ==PROF== Profiling "inBlockStep" - 28: 0%....50%....100% - 6 passes
  30. ==PROF== Profiling "interBlockStep" - 29: 0%....50%....100% - 6 passes
  31. ==PROF== Profiling "interBlockStep" - 30: 0%....50%....100% - 6 passes
  32. ==PROF== Profiling "interBlockStep" - 31: 0%....50%....100% - 6 passes
  33. ==PROF== Profiling "interBlockStep" - 32: 0%....50%....100% - 6 passes
  34. ==PROF== Profiling "interBlockStep" - 33: 0%....50%....100% - 6 passes
  35. ==PROF== Profiling "interBlockStep" - 34: 0%....50%....100% - 6 passes
  36. ==PROF== Profiling "interBlockStep" - 35: 0%....50%....100% - 6 passes
  37. ==PROF== Profiling "inBlockStep" - 36: 0%....50%....100% - 6 passes
  38. ==PROF== Profiling "interBlockStep" - 37: 0%....50%....100% - 6 passes
  39. ==PROF== Profiling "interBlockStep" - 38: 0%....50%....100% - 6 passes
  40. ==PROF== Profiling "interBlockStep" - 39: 0%....50%....100% - 6 passes
  41. ==PROF== Profiling "interBlockStep" - 40: 0%....50%....100% - 6 passes
  42. ==PROF== Profiling "interBlockStep" - 41: 0%....50%....100% - 6 passes
  43. ==PROF== Profiling "interBlockStep" - 42: 0%....50%....100% - 6 passes
  44. ==PROF== Profiling "interBlockStep" - 43: 0%....50%....100% - 6 passes
  45. ==PROF== Profiling "interBlockStep" - 44: 0%....50%....100% - 6 passes
  46. ==PROF== Profiling "inBlockStep" - 45: 0%....50%....100% - 6 passes
  47. ==PROF== Profiling "interBlockStep" - 46: 0%....50%....100% - 6 passes
  48. ==PROF== Profiling "interBlockStep" - 47: 0%....50%....100% - 6 passes
  49. ==PROF== Profiling "interBlockStep" - 48: 0%....50%....100% - 6 passes
  50. ==PROF== Profiling "interBlockStep" - 49: 0%....50%....100% - 6 passes
  51. ==PROF== Profiling "interBlockStep" - 50: 0%....50%....100% - 6 passes
  52. ==PROF== Profiling "interBlockStep" - 51: 0%....50%....100% - 6 passes
  53. ==PROF== Profiling "interBlockStep" - 52: 0%....50%....100% - 6 passes
  54. ==PROF== Profiling "interBlockStep" - 53: 0%....50%....100% - 6 passes
  55. ==PROF== Profiling "interBlockStep" - 54: 0%....50%....100% - 6 passes
  56. ==PROF== Profiling "inBlockStep" - 55: 0%....50%....100% - 6 passes
  57. ==PROF== Profiling "interBlockStep" - 56: 0%....50%....100% - 6 passes
  58. ==PROF== Profiling "interBlockStep" - 57: 0%....50%....100% - 6 passes
  59. ==PROF== Profiling "interBlockStep" - 58: 0%....50%....100% - 6 passes
  60. ==PROF== Profiling "interBlockStep" - 59: 0%....50%....100% - 6 passes
  61. ==PROF== Profiling "interBlockStep" - 60: 0%....50%....100% - 6 passes
  62. ==PROF== Profiling "interBlockStep" - 61: 0%....50%....100% - 6 passes
  63. ==PROF== Profiling "interBlockStep" - 62: 0%....50%....100% - 6 passes
  64. ==PROF== Profiling "interBlockStep" - 63: 0%....50%....100% - 6 passes
  65. ==PROF== Profiling "interBlockStep" - 64: 0%....50%....100% - 6 passes
  66. ==PROF== Profiling "interBlockStep" - 65: 0%....50%....100% - 6 passes
  67. ==PROF== Profiling "inBlockStep" - 66: 0%....50%....100% - 6 passes
  68. ==PROF== Disconnected from process 23012
  69. [23012] bitonicCUDA@127.0.0.1
  70. void prephase<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:31, Context 1, Stream 7
  71. Section: Command line profiler metrics
  72. ---------------------------------------------------------------------- --------------- ------------------------------
  73. gpu__time_duration.sum msecond 1.20
  74. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  75. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  76. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  77. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  78. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  79. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  80. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 186,368
  81. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 186,368
  82. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 186,368
  83. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 2,981,888
  84. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 111,954.62
  85. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 112,106
  86. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 111,827
  87. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 1,791,274
  88. smsp__average_warp_latency_issue_stalled_barrier.pct % 644,639.76
  89. smsp__average_warp_latency_issue_stalled_barrier.ratio 6,446.40
  90. smsp__inst_executed.avg inst 1,030,883.66
  91. smsp__inst_executed.max inst 1,031,104
  92. smsp__inst_executed.min inst 1,030,650
  93. smsp__inst_executed.sum inst 65,976,554
  94. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.50
  95. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13
  96. smsp__cycles_active.avg cycle 1,667,322.50
  97. smsp__cycles_active.sum cycle 106,708,640
  98. ---------------------------------------------------------------------- --------------- ------------------------------
  99. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:31, Context 1, Stream 7
  100. Section: Command line profiler metrics
  101. ---------------------------------------------------------------------- --------------- ------------------------------
  102. gpu__time_duration.sum usecond 59.94
  103. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  104. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  105. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  106. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  107. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22
  108. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  109. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  110. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  111. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  112. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  113. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  114. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  115. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  116. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  117. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  118. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  119. smsp__inst_executed.avg inst 12,308.62
  120. smsp__inst_executed.max inst 12,930
  121. smsp__inst_executed.min inst 12,094
  122. smsp__inst_executed.sum inst 787,752
  123. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  124. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  125. smsp__cycles_active.avg cycle 73,839.28
  126. smsp__cycles_active.sum cycle 4,725,714
  127. ---------------------------------------------------------------------- --------------- ------------------------------
  128. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:31, Context 1, Stream 7
  129. Section: Command line profiler metrics
  130. ---------------------------------------------------------------------- --------------- ------------------------------
  131. gpu__time_duration.sum usecond 231.58
  132. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  133. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  134. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  135. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  136. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  137. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  138. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  139. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  140. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  141. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  142. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,673.56
  143. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 20,005
  144. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,385
  145. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,777
  146. smsp__average_warp_latency_issue_stalled_barrier.pct % 123,076.57
  147. smsp__average_warp_latency_issue_stalled_barrier.ratio 1,230.77
  148. smsp__inst_executed.avg inst 189,293.97
  149. smsp__inst_executed.max inst 192,369
  150. smsp__inst_executed.min inst 186,352
  151. smsp__inst_executed.sum inst 12,114,814
  152. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.78
  153. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13
  154. smsp__cycles_active.avg cycle 316,320.25
  155. smsp__cycles_active.sum cycle 20,244,496
  156. ---------------------------------------------------------------------- --------------- ------------------------------
  157. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:31, Context 1, Stream 7
  158. Section: Command line profiler metrics
  159. ---------------------------------------------------------------------- --------------- ------------------------------
  160. gpu__time_duration.sum usecond 58.08
  161. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  162. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  163. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  164. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  165. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36
  166. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95
  167. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  168. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  169. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  170. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  171. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  172. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  173. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  174. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  175. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  176. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  177. smsp__inst_executed.avg inst 12,298.77
  178. smsp__inst_executed.max inst 12,539
  179. smsp__inst_executed.min inst 12,060
  180. smsp__inst_executed.sum inst 787,121
  181. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  182. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  183. smsp__cycles_active.avg cycle 70,962.83
  184. smsp__cycles_active.sum cycle 4,541,621
  185. ---------------------------------------------------------------------- --------------- ------------------------------
  186. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:31, Context 1, Stream 7
  187. Section: Command line profiler metrics
  188. ---------------------------------------------------------------------- --------------- ------------------------------
  189. gpu__time_duration.sum usecond 59.71
  190. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  191. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  192. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  193. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  194. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21
  195. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  196. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  197. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  198. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  199. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  200. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  201. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  202. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  203. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  204. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  205. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  206. smsp__inst_executed.avg inst 12,309.31
  207. smsp__inst_executed.max inst 12,697
  208. smsp__inst_executed.min inst 11,822
  209. smsp__inst_executed.sum inst 787,796
  210. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  211. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  212. smsp__cycles_active.avg cycle 72,821.19
  213. smsp__cycles_active.sum cycle 4,660,556
  214. ---------------------------------------------------------------------- --------------- ------------------------------
  215. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:31, Context 1, Stream 7
  216. Section: Command line profiler metrics
  217. ---------------------------------------------------------------------- --------------- ------------------------------
  218. gpu__time_duration.sum usecond 232.45
  219. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  220. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  221. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  222. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  223. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  224. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  225. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  226. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  227. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  228. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  229. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,692.06
  230. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 20,017
  231. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,412
  232. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 315,073
  233. smsp__average_warp_latency_issue_stalled_barrier.pct % 124,072.53
  234. smsp__average_warp_latency_issue_stalled_barrier.ratio 1,240.73
  235. smsp__inst_executed.avg inst 189,295.75
  236. smsp__inst_executed.max inst 192,417
  237. smsp__inst_executed.min inst 186,276
  238. smsp__inst_executed.sum inst 12,114,928
  239. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.89
  240. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13
  241. smsp__cycles_active.avg cycle 316,238.97
  242. smsp__cycles_active.sum cycle 20,239,294
  243. ---------------------------------------------------------------------- --------------- ------------------------------
  244. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:31, Context 1, Stream 7
  245. Section: Command line profiler metrics
  246. ---------------------------------------------------------------------- --------------- ------------------------------
  247. gpu__time_duration.sum usecond 58.37
  248. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  249. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  250. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  251. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  252. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.42
  253. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  254. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  255. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  256. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  257. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  258. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  259. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  260. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  261. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  262. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  263. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  264. smsp__inst_executed.avg inst 12,292.88
  265. smsp__inst_executed.max inst 12,554
  266. smsp__inst_executed.min inst 11,832
  267. smsp__inst_executed.sum inst 786,744
  268. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  269. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  270. smsp__cycles_active.avg cycle 71,835.05
  271. smsp__cycles_active.sum cycle 4,597,443
  272. ---------------------------------------------------------------------- --------------- ------------------------------
  273. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:32, Context 1, Stream 7
  274. Section: Command line profiler metrics
  275. ---------------------------------------------------------------------- --------------- ------------------------------
  276. gpu__time_duration.sum usecond 57.89
  277. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  278. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  279. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  280. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  281. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36
  282. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95
  283. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  284. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  285. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  286. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  287. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  288. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  289. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  290. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  291. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  292. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  293. smsp__inst_executed.avg inst 12,298.73
  294. smsp__inst_executed.max inst 12,788
  295. smsp__inst_executed.min inst 11,840
  296. smsp__inst_executed.sum inst 787,119
  297. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  298. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  299. smsp__cycles_active.avg cycle 71,345.14
  300. smsp__cycles_active.sum cycle 4,566,089
  301. ---------------------------------------------------------------------- --------------- ------------------------------
  302. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:32, Context 1, Stream 7
  303. Section: Command line profiler metrics
  304. ---------------------------------------------------------------------- --------------- ------------------------------
  305. gpu__time_duration.sum usecond 59.71
  306. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  307. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  308. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  309. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  310. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.23
  311. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  312. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  313. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  314. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  315. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  316. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  317. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  318. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  319. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  320. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  321. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  322. smsp__inst_executed.avg inst 12,308.28
  323. smsp__inst_executed.max inst 12,596
  324. smsp__inst_executed.min inst 11,926
  325. smsp__inst_executed.sum inst 787,730
  326. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  327. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  328. smsp__cycles_active.avg cycle 73,894.73
  329. smsp__cycles_active.sum cycle 4,729,263
  330. ---------------------------------------------------------------------- --------------- ------------------------------
  331. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:32, Context 1, Stream 7
  332. Section: Command line profiler metrics
  333. ---------------------------------------------------------------------- --------------- ------------------------------
  334. gpu__time_duration.sum usecond 229.41
  335. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  336. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  337. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  338. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  339. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  340. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  341. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  342. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  343. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  344. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  345. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,675.88
  346. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,994
  347. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,371
  348. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,814
  349. smsp__average_warp_latency_issue_stalled_barrier.pct % 124,117.44
  350. smsp__average_warp_latency_issue_stalled_barrier.ratio 1,241.17
  351. smsp__inst_executed.avg inst 189,295.52
  352. smsp__inst_executed.max inst 192,256
  353. smsp__inst_executed.min inst 186,332
  354. smsp__inst_executed.sum inst 12,114,913
  355. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.85
  356. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13
  357. smsp__cycles_active.avg cycle 317,343.97
  358. smsp__cycles_active.sum cycle 20,310,014
  359. ---------------------------------------------------------------------- --------------- ------------------------------
  360. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:32, Context 1, Stream 7
  361. Section: Command line profiler metrics
  362. ---------------------------------------------------------------------- --------------- ------------------------------
  363. gpu__time_duration.sum usecond 58.11
  364. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  365. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  366. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  367. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  368. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46
  369. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  370. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  371. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  372. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  373. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  374. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  375. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  376. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  377. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  378. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  379. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  380. smsp__inst_executed.avg inst 12,290.78
  381. smsp__inst_executed.max inst 12,480
  382. smsp__inst_executed.min inst 11,928
  383. smsp__inst_executed.sum inst 786,610
  384. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  385. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  386. smsp__cycles_active.avg cycle 70,846.03
  387. smsp__cycles_active.sum cycle 4,534,146
  388. ---------------------------------------------------------------------- --------------- ------------------------------
  389. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:32, Context 1, Stream 7
  390. Section: Command line profiler metrics
  391. ---------------------------------------------------------------------- --------------- ------------------------------
  392. gpu__time_duration.sum usecond 58.59
  393. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  394. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  395. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  396. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  397. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43
  398. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  399. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  400. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  401. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  402. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  403. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  404. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  405. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  406. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  407. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  408. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  409. smsp__inst_executed.avg inst 12,293.23
  410. smsp__inst_executed.max inst 12,604
  411. smsp__inst_executed.min inst 11,836
  412. smsp__inst_executed.sum inst 786,767
  413. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  414. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  415. smsp__cycles_active.avg cycle 71,106.22
  416. smsp__cycles_active.sum cycle 4,550,798
  417. ---------------------------------------------------------------------- --------------- ------------------------------
  418. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:32, Context 1, Stream 7
  419. Section: Command line profiler metrics
  420. ---------------------------------------------------------------------- --------------- ------------------------------
  421. gpu__time_duration.sum usecond 58.11
  422. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  423. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  424. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  425. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  426. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.37
  427. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96
  428. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  429. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  430. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  431. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  432. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  433. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  434. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  435. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  436. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  437. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  438. smsp__inst_executed.avg inst 12,298.36
  439. smsp__inst_executed.max inst 12,513
  440. smsp__inst_executed.min inst 11,712
  441. smsp__inst_executed.sum inst 787,095
  442. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  443. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  444. smsp__cycles_active.avg cycle 70,202.59
  445. smsp__cycles_active.sum cycle 4,492,966
  446. ---------------------------------------------------------------------- --------------- ------------------------------
  447. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:32, Context 1, Stream 7
  448. Section: Command line profiler metrics
  449. ---------------------------------------------------------------------- --------------- ------------------------------
  450. gpu__time_duration.sum usecond 59.52
  451. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  452. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  453. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  454. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  455. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.22
  456. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  457. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  458. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  459. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  460. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  461. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  462. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  463. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  464. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  465. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  466. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  467. smsp__inst_executed.avg inst 12,308.52
  468. smsp__inst_executed.max inst 12,682
  469. smsp__inst_executed.min inst 11,859
  470. smsp__inst_executed.sum inst 787,745
  471. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  472. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  473. smsp__cycles_active.avg cycle 73,621.48
  474. smsp__cycles_active.sum cycle 4,711,775
  475. ---------------------------------------------------------------------- --------------- ------------------------------
  476. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:32, Context 1, Stream 7
  477. Section: Command line profiler metrics
  478. ---------------------------------------------------------------------- --------------- ------------------------------
  479. gpu__time_duration.sum usecond 229.09
  480. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  481. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  482. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  483. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  484. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  485. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  486. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  487. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  488. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  489. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  490. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,686.12
  491. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,974
  492. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,355
  493. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,978
  494. smsp__average_warp_latency_issue_stalled_barrier.pct % 124,010.98
  495. smsp__average_warp_latency_issue_stalled_barrier.ratio 1,240.11
  496. smsp__inst_executed.avg inst 189,293.06
  497. smsp__inst_executed.max inst 192,343
  498. smsp__inst_executed.min inst 186,209
  499. smsp__inst_executed.sum inst 12,114,756
  500. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.81
  501. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13
  502. smsp__cycles_active.avg cycle 317,996.44
  503. smsp__cycles_active.sum cycle 20,351,772
  504. ---------------------------------------------------------------------- --------------- ------------------------------
  505. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:32, Context 1, Stream 7
  506. Section: Command line profiler metrics
  507. ---------------------------------------------------------------------- --------------- ------------------------------
  508. gpu__time_duration.sum usecond 58.02
  509. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  510. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  511. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  512. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  513. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  514. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  515. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  516. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  517. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  518. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  519. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  520. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  521. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  522. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  523. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  524. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  525. smsp__inst_executed.avg inst 12,289.22
  526. smsp__inst_executed.max inst 12,699
  527. smsp__inst_executed.min inst 11,910
  528. smsp__inst_executed.sum inst 786,510
  529. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  530. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  531. smsp__cycles_active.avg cycle 71,268.28
  532. smsp__cycles_active.sum cycle 4,561,170
  533. ---------------------------------------------------------------------- --------------- ------------------------------
  534. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:32, Context 1, Stream 7
  535. Section: Command line profiler metrics
  536. ---------------------------------------------------------------------- --------------- ------------------------------
  537. gpu__time_duration.sum usecond 58.24
  538. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  539. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  540. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  541. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  542. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46
  543. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  544. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  545. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  546. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  547. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  548. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  549. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  550. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  551. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  552. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  553. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  554. smsp__inst_executed.avg inst 12,290.45
  555. smsp__inst_executed.max inst 12,669
  556. smsp__inst_executed.min inst 11,950
  557. smsp__inst_executed.sum inst 786,589
  558. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  559. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  560. smsp__cycles_active.avg cycle 70,528.27
  561. smsp__cycles_active.sum cycle 4,513,809
  562. ---------------------------------------------------------------------- --------------- ------------------------------
  563. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:33, Context 1, Stream 7
  564. Section: Command line profiler metrics
  565. ---------------------------------------------------------------------- --------------- ------------------------------
  566. gpu__time_duration.sum usecond 58.50
  567. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  568. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  569. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  570. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  571. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43
  572. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  573. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  574. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  575. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  576. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  577. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  578. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  579. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  580. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  581. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  582. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  583. smsp__inst_executed.avg inst 12,293.23
  584. smsp__inst_executed.max inst 12,648
  585. smsp__inst_executed.min inst 11,996
  586. smsp__inst_executed.sum inst 786,767
  587. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  588. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  589. smsp__cycles_active.avg cycle 70,717.06
  590. smsp__cycles_active.sum cycle 4,525,892
  591. ---------------------------------------------------------------------- --------------- ------------------------------
  592. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:33, Context 1, Stream 7
  593. Section: Command line profiler metrics
  594. ---------------------------------------------------------------------- --------------- ------------------------------
  595. gpu__time_duration.sum usecond 58.02
  596. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  597. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  598. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  599. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  600. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36
  601. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96
  602. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  603. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  604. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  605. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  606. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  607. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  608. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  609. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  610. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  611. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  612. smsp__inst_executed.avg inst 12,298.44
  613. smsp__inst_executed.max inst 12,776
  614. smsp__inst_executed.min inst 11,972
  615. smsp__inst_executed.sum inst 787,100
  616. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  617. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  618. smsp__cycles_active.avg cycle 70,171.77
  619. smsp__cycles_active.sum cycle 4,490,993
  620. ---------------------------------------------------------------------- --------------- ------------------------------
  621. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:33, Context 1, Stream 7
  622. Section: Command line profiler metrics
  623. ---------------------------------------------------------------------- --------------- ------------------------------
  624. gpu__time_duration.sum usecond 59.58
  625. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  626. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  627. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  628. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  629. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21
  630. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  631. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  632. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  633. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  634. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  635. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  636. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  637. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  638. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  639. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  640. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  641. smsp__inst_executed.avg inst 12,309.16
  642. smsp__inst_executed.max inst 12,776
  643. smsp__inst_executed.min inst 12,048
  644. smsp__inst_executed.sum inst 787,786
  645. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  646. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  647. smsp__cycles_active.avg cycle 73,678.91
  648. smsp__cycles_active.sum cycle 4,715,450
  649. ---------------------------------------------------------------------- --------------- ------------------------------
  650. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:33, Context 1, Stream 7
  651. Section: Command line profiler metrics
  652. ---------------------------------------------------------------------- --------------- ------------------------------
  653. gpu__time_duration.sum usecond 228.96
  654. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  655. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  656. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  657. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  658. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  659. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  660. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  661. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  662. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  663. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  664. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,689.06
  665. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 20,011
  666. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,382
  667. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 315,025
  668. smsp__average_warp_latency_issue_stalled_barrier.pct % 125,081.44
  669. smsp__average_warp_latency_issue_stalled_barrier.ratio 1,250.81
  670. smsp__inst_executed.avg inst 189,292.86
  671. smsp__inst_executed.max inst 192,415
  672. smsp__inst_executed.min inst 186,212
  673. smsp__inst_executed.sum inst 12,114,743
  674. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.96
  675. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13
  676. smsp__cycles_active.avg cycle 316,855.81
  677. smsp__cycles_active.sum cycle 20,278,772
  678. ---------------------------------------------------------------------- --------------- ------------------------------
  679. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:33, Context 1, Stream 7
  680. Section: Command line profiler metrics
  681. ---------------------------------------------------------------------- --------------- ------------------------------
  682. gpu__time_duration.sum usecond 59.97
  683. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  684. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  685. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  686. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  687. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  688. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  689. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  690. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  691. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  692. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  693. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  694. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  695. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  696. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  697. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  698. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  699. smsp__inst_executed.avg inst 12,288.30
  700. smsp__inst_executed.max inst 12,684
  701. smsp__inst_executed.min inst 11,920
  702. smsp__inst_executed.sum inst 786,451
  703. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  704. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  705. smsp__cycles_active.avg cycle 73,684.17
  706. smsp__cycles_active.sum cycle 4,715,787
  707. ---------------------------------------------------------------------- --------------- ------------------------------
  708. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:33, Context 1, Stream 7
  709. Section: Command line profiler metrics
  710. ---------------------------------------------------------------------- --------------- ------------------------------
  711. gpu__time_duration.sum usecond 58.02
  712. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  713. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  714. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  715. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  716. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  717. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  718. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  719. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  720. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  721. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  722. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  723. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  724. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  725. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  726. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  727. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  728. smsp__inst_executed.avg inst 12,288.70
  729. smsp__inst_executed.max inst 12,656
  730. smsp__inst_executed.min inst 11,904
  731. smsp__inst_executed.sum inst 786,477
  732. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  733. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  734. smsp__cycles_active.avg cycle 71,366.16
  735. smsp__cycles_active.sum cycle 4,567,434
  736. ---------------------------------------------------------------------- --------------- ------------------------------
  737. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:33, Context 1, Stream 7
  738. Section: Command line profiler metrics
  739. ---------------------------------------------------------------------- --------------- ------------------------------
  740. gpu__time_duration.sum usecond 57.95
  741. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  742. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  743. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  744. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  745. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.47
  746. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  747. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  748. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  749. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  750. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  751. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  752. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  753. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  754. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  755. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  756. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  757. smsp__inst_executed.avg inst 12,290.27
  758. smsp__inst_executed.max inst 12,693
  759. smsp__inst_executed.min inst 12,042
  760. smsp__inst_executed.sum inst 786,577
  761. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  762. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  763. smsp__cycles_active.avg cycle 70,688.91
  764. smsp__cycles_active.sum cycle 4,524,090
  765. ---------------------------------------------------------------------- --------------- ------------------------------
  766. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:33, Context 1, Stream 7
  767. Section: Command line profiler metrics
  768. ---------------------------------------------------------------------- --------------- ------------------------------
  769. gpu__time_duration.sum usecond 58.53
  770. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  771. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  772. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  773. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  774. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.42
  775. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  776. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  777. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  778. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  779. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  780. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  781. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  782. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  783. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  784. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  785. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  786. smsp__inst_executed.avg inst 12,293.41
  787. smsp__inst_executed.max inst 12,585
  788. smsp__inst_executed.min inst 11,776
  789. smsp__inst_executed.sum inst 786,778
  790. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  791. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  792. smsp__cycles_active.avg cycle 71,063.67
  793. smsp__cycles_active.sum cycle 4,548,075
  794. ---------------------------------------------------------------------- --------------- ------------------------------
  795. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:33, Context 1, Stream 7
  796. Section: Command line profiler metrics
  797. ---------------------------------------------------------------------- --------------- ------------------------------
  798. gpu__time_duration.sum usecond 58.46
  799. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  800. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  801. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  802. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  803. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36
  804. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96
  805. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  806. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  807. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  808. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  809. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  810. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  811. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  812. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  813. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  814. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  815. smsp__inst_executed.avg inst 12,298.94
  816. smsp__inst_executed.max inst 12,684
  817. smsp__inst_executed.min inst 11,776
  818. smsp__inst_executed.sum inst 787,132
  819. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  820. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  821. smsp__cycles_active.avg cycle 71,037.02
  822. smsp__cycles_active.sum cycle 4,546,369
  823. ---------------------------------------------------------------------- --------------- ------------------------------
  824. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7
  825. Section: Command line profiler metrics
  826. ---------------------------------------------------------------------- --------------- ------------------------------
  827. gpu__time_duration.sum usecond 59.74
  828. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  829. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  830. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  831. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  832. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21
  833. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  834. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  835. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  836. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  837. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  838. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  839. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  840. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  841. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  842. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  843. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  844. smsp__inst_executed.avg inst 12,309.58
  845. smsp__inst_executed.max inst 12,726
  846. smsp__inst_executed.min inst 12,072
  847. smsp__inst_executed.sum inst 787,813
  848. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  849. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  850. smsp__cycles_active.avg cycle 73,656
  851. smsp__cycles_active.sum cycle 4,713,984
  852. ---------------------------------------------------------------------- --------------- ------------------------------
  853. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7
  854. Section: Command line profiler metrics
  855. ---------------------------------------------------------------------- --------------- ------------------------------
  856. gpu__time_duration.sum usecond 231.42
  857. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  858. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  859. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  860. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  861. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  862. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  863. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  864. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  865. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  866. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  867. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,682.75
  868. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,995
  869. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,336
  870. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,924
  871. smsp__average_warp_latency_issue_stalled_barrier.pct % 124,533.56
  872. smsp__average_warp_latency_issue_stalled_barrier.ratio 1,245.34
  873. smsp__inst_executed.avg inst 189,299.56
  874. smsp__inst_executed.max inst 192,317
  875. smsp__inst_executed.min inst 186,295
  876. smsp__inst_executed.sum inst 12,115,172
  877. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.91
  878. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13
  879. smsp__cycles_active.avg cycle 316,917.50
  880. smsp__cycles_active.sum cycle 20,282,720
  881. ---------------------------------------------------------------------- --------------- ------------------------------
  882. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7
  883. Section: Command line profiler metrics
  884. ---------------------------------------------------------------------- --------------- ------------------------------
  885. gpu__time_duration.sum usecond 58.56
  886. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  887. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  888. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  889. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  890. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  891. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  892. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  893. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  894. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  895. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  896. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  897. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  898. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  899. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  900. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  901. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  902. smsp__inst_executed.avg inst 12,287.72
  903. smsp__inst_executed.max inst 12,664
  904. smsp__inst_executed.min inst 11,916
  905. smsp__inst_executed.sum inst 786,414
  906. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  907. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  908. smsp__cycles_active.avg cycle 71,857.64
  909. smsp__cycles_active.sum cycle 4,598,889
  910. ---------------------------------------------------------------------- --------------- ------------------------------
  911. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7
  912. Section: Command line profiler metrics
  913. ---------------------------------------------------------------------- --------------- ------------------------------
  914. gpu__time_duration.sum usecond 59.97
  915. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  916. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  917. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  918. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  919. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  920. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  921. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  922. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  923. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  924. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  925. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  926. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  927. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  928. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  929. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  930. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  931. smsp__inst_executed.avg inst 12,288.48
  932. smsp__inst_executed.max inst 12,684
  933. smsp__inst_executed.min inst 11,892
  934. smsp__inst_executed.sum inst 786,463
  935. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  936. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  937. smsp__cycles_active.avg cycle 73,070.30
  938. smsp__cycles_active.sum cycle 4,676,499
  939. ---------------------------------------------------------------------- --------------- ------------------------------
  940. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7
  941. Section: Command line profiler metrics
  942. ---------------------------------------------------------------------- --------------- ------------------------------
  943. gpu__time_duration.sum usecond 58.24
  944. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  945. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  946. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  947. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  948. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48
  949. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  950. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  951. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  952. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  953. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  954. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  955. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  956. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  957. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  958. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  959. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  960. smsp__inst_executed.avg inst 12,289.53
  961. smsp__inst_executed.max inst 12,851
  962. smsp__inst_executed.min inst 11,908
  963. smsp__inst_executed.sum inst 786,530
  964. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  965. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  966. smsp__cycles_active.avg cycle 70,288.86
  967. smsp__cycles_active.sum cycle 4,498,487
  968. ---------------------------------------------------------------------- --------------- ------------------------------
  969. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7
  970. Section: Command line profiler metrics
  971. ---------------------------------------------------------------------- --------------- ------------------------------
  972. gpu__time_duration.sum usecond 57.92
  973. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  974. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  975. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  976. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  977. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46
  978. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  979. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  980. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  981. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  982. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  983. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  984. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  985. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  986. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  987. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  988. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  989. smsp__inst_executed.avg inst 12,290.62
  990. smsp__inst_executed.max inst 12,632
  991. smsp__inst_executed.min inst 12,036
  992. smsp__inst_executed.sum inst 786,600
  993. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  994. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  995. smsp__cycles_active.avg cycle 71,317.92
  996. smsp__cycles_active.sum cycle 4,564,347
  997. ---------------------------------------------------------------------- --------------- ------------------------------
  998. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7
  999. Section: Command line profiler metrics
  1000. ---------------------------------------------------------------------- --------------- ------------------------------
  1001. gpu__time_duration.sum usecond 58.53
  1002. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1003. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1004. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1005. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1006. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43
  1007. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  1008. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1009. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1010. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1011. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1012. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1013. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1014. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1015. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1016. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1017. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1018. smsp__inst_executed.avg inst 12,293.17
  1019. smsp__inst_executed.max inst 12,673
  1020. smsp__inst_executed.min inst 11,880
  1021. smsp__inst_executed.sum inst 786,763
  1022. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1023. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1024. smsp__cycles_active.avg cycle 71,760.25
  1025. smsp__cycles_active.sum cycle 4,592,656
  1026. ---------------------------------------------------------------------- --------------- ------------------------------
  1027. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7
  1028. Section: Command line profiler metrics
  1029. ---------------------------------------------------------------------- --------------- ------------------------------
  1030. gpu__time_duration.sum usecond 58.34
  1031. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1032. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1033. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1034. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1035. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36
  1036. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96
  1037. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1038. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1039. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1040. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1041. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1042. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1043. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1044. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1045. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1046. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1047. smsp__inst_executed.avg inst 12,298.23
  1048. smsp__inst_executed.max inst 12,716
  1049. smsp__inst_executed.min inst 11,876
  1050. smsp__inst_executed.sum inst 787,087
  1051. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1052. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1053. smsp__cycles_active.avg cycle 71,216.77
  1054. smsp__cycles_active.sum cycle 4,557,873
  1055. ---------------------------------------------------------------------- --------------- ------------------------------
  1056. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7
  1057. Section: Command line profiler metrics
  1058. ---------------------------------------------------------------------- --------------- ------------------------------
  1059. gpu__time_duration.sum usecond 59.39
  1060. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1061. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1062. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1063. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1064. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.23
  1065. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  1066. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1067. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1068. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1069. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1070. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1071. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1072. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1073. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1074. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1075. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1076. smsp__inst_executed.avg inst 12,308.69
  1077. smsp__inst_executed.max inst 12,831
  1078. smsp__inst_executed.min inst 11,757
  1079. smsp__inst_executed.sum inst 787,756
  1080. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1081. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1082. smsp__cycles_active.avg cycle 73,196.75
  1083. smsp__cycles_active.sum cycle 4,684,592
  1084. ---------------------------------------------------------------------- --------------- ------------------------------
  1085. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7
  1086. Section: Command line profiler metrics
  1087. ---------------------------------------------------------------------- --------------- ------------------------------
  1088. gpu__time_duration.sum usecond 228.77
  1089. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1090. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1091. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1092. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1093. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1094. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  1095. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  1096. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  1097. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  1098. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  1099. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,694.50
  1100. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,913
  1101. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,365
  1102. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 315,112
  1103. smsp__average_warp_latency_issue_stalled_barrier.pct % 123,085.89
  1104. smsp__average_warp_latency_issue_stalled_barrier.ratio 1,230.86
  1105. smsp__inst_executed.avg inst 189,320.77
  1106. smsp__inst_executed.max inst 192,334
  1107. smsp__inst_executed.min inst 186,279
  1108. smsp__inst_executed.sum inst 12,116,529
  1109. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.78
  1110. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13
  1111. smsp__cycles_active.avg cycle 316,332
  1112. smsp__cycles_active.sum cycle 20,245,248
  1113. ---------------------------------------------------------------------- --------------- ------------------------------
  1114. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:34, Context 1, Stream 7
  1115. Section: Command line profiler metrics
  1116. ---------------------------------------------------------------------- --------------- ------------------------------
  1117. gpu__time_duration.sum usecond 58.72
  1118. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1119. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1120. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1121. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1122. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1123. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1124. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1125. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1126. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1127. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1128. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1129. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1130. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1131. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1132. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1133. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1134. smsp__inst_executed.avg inst 12,288.30
  1135. smsp__inst_executed.max inst 12,488
  1136. smsp__inst_executed.min inst 11,916
  1137. smsp__inst_executed.sum inst 786,451
  1138. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1139. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1140. smsp__cycles_active.avg cycle 71,574.31
  1141. smsp__cycles_active.sum cycle 4,580,756
  1142. ---------------------------------------------------------------------- --------------- ------------------------------
  1143. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7
  1144. Section: Command line profiler metrics
  1145. ---------------------------------------------------------------------- --------------- ------------------------------
  1146. gpu__time_duration.sum usecond 59.20
  1147. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1148. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1149. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1150. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1151. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1152. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1153. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1154. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1155. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1156. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1157. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1158. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1159. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1160. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1161. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1162. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1163. smsp__inst_executed.avg inst 12,288.91
  1164. smsp__inst_executed.max inst 12,672
  1165. smsp__inst_executed.min inst 11,910
  1166. smsp__inst_executed.sum inst 786,490
  1167. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1168. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1169. smsp__cycles_active.avg cycle 71,645.88
  1170. smsp__cycles_active.sum cycle 4,585,336
  1171. ---------------------------------------------------------------------- --------------- ------------------------------
  1172. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7
  1173. Section: Command line profiler metrics
  1174. ---------------------------------------------------------------------- --------------- ------------------------------
  1175. gpu__time_duration.sum usecond 60.29
  1176. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1177. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1178. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1179. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1180. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  1181. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1182. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1183. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1184. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1185. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1186. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1187. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1188. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1189. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1190. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1191. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1192. smsp__inst_executed.avg inst 12,289.03
  1193. smsp__inst_executed.max inst 12,492
  1194. smsp__inst_executed.min inst 11,892
  1195. smsp__inst_executed.sum inst 786,498
  1196. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1197. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1198. smsp__cycles_active.avg cycle 73,041.34
  1199. smsp__cycles_active.sum cycle 4,674,646
  1200. ---------------------------------------------------------------------- --------------- ------------------------------
  1201. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7
  1202. Section: Command line profiler metrics
  1203. ---------------------------------------------------------------------- --------------- ------------------------------
  1204. gpu__time_duration.sum usecond 58.27
  1205. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1206. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1207. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1208. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1209. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48
  1210. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  1211. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1212. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1213. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1214. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1215. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1216. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1217. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1218. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1219. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1220. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1221. smsp__inst_executed.avg inst 12,289.23
  1222. smsp__inst_executed.max inst 12,678
  1223. smsp__inst_executed.min inst 12,066
  1224. smsp__inst_executed.sum inst 786,511
  1225. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1226. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1227. smsp__cycles_active.avg cycle 70,712.69
  1228. smsp__cycles_active.sum cycle 4,525,612
  1229. ---------------------------------------------------------------------- --------------- ------------------------------
  1230. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7
  1231. Section: Command line profiler metrics
  1232. ---------------------------------------------------------------------- --------------- ------------------------------
  1233. gpu__time_duration.sum usecond 58.21
  1234. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1235. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1236. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1237. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1238. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.47
  1239. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  1240. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1241. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1242. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1243. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1244. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1245. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1246. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1247. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1248. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1249. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1250. smsp__inst_executed.avg inst 12,289.75
  1251. smsp__inst_executed.max inst 12,526
  1252. smsp__inst_executed.min inst 12,050
  1253. smsp__inst_executed.sum inst 786,544
  1254. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1255. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1256. smsp__cycles_active.avg cycle 71,762.81
  1257. smsp__cycles_active.sum cycle 4,592,820
  1258. ---------------------------------------------------------------------- --------------- ------------------------------
  1259. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7
  1260. Section: Command line profiler metrics
  1261. ---------------------------------------------------------------------- --------------- ------------------------------
  1262. gpu__time_duration.sum usecond 58.59
  1263. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1264. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1265. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1266. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1267. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.42
  1268. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  1269. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1270. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1271. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1272. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1273. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1274. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1275. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1276. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1277. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1278. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1279. smsp__inst_executed.avg inst 12,293.14
  1280. smsp__inst_executed.max inst 12,740
  1281. smsp__inst_executed.min inst 11,704
  1282. smsp__inst_executed.sum inst 786,761
  1283. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1284. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1285. smsp__cycles_active.avg cycle 72,097.56
  1286. smsp__cycles_active.sum cycle 4,614,244
  1287. ---------------------------------------------------------------------- --------------- ------------------------------
  1288. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7
  1289. Section: Command line profiler metrics
  1290. ---------------------------------------------------------------------- --------------- ------------------------------
  1291. gpu__time_duration.sum usecond 58.05
  1292. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1293. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1294. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1295. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1296. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36
  1297. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95
  1298. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1299. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1300. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1301. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1302. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1303. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1304. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1305. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1306. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1307. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1308. smsp__inst_executed.avg inst 12,298.39
  1309. smsp__inst_executed.max inst 12,695
  1310. smsp__inst_executed.min inst 11,854
  1311. smsp__inst_executed.sum inst 787,097
  1312. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1313. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1314. smsp__cycles_active.avg cycle 70,856.70
  1315. smsp__cycles_active.sum cycle 4,534,829
  1316. ---------------------------------------------------------------------- --------------- ------------------------------
  1317. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7
  1318. Section: Command line profiler metrics
  1319. ---------------------------------------------------------------------- --------------- ------------------------------
  1320. gpu__time_duration.sum usecond 59.30
  1321. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1322. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1323. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1324. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1325. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.20
  1326. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  1327. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1328. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1329. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1330. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1331. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1332. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1333. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1334. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1335. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1336. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1337. smsp__inst_executed.avg inst 12,309.80
  1338. smsp__inst_executed.max inst 12,711
  1339. smsp__inst_executed.min inst 11,932
  1340. smsp__inst_executed.sum inst 787,827
  1341. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1342. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1343. smsp__cycles_active.avg cycle 72,244.53
  1344. smsp__cycles_active.sum cycle 4,623,650
  1345. ---------------------------------------------------------------------- --------------- ------------------------------
  1346. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7
  1347. Section: Command line profiler metrics
  1348. ---------------------------------------------------------------------- --------------- ------------------------------
  1349. gpu__time_duration.sum usecond 228.90
  1350. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1351. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1352. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1353. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1354. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1355. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  1356. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  1357. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  1358. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  1359. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  1360. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,686.06
  1361. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,967
  1362. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,333
  1363. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,977
  1364. smsp__average_warp_latency_issue_stalled_barrier.pct % 124,987.84
  1365. smsp__average_warp_latency_issue_stalled_barrier.ratio 1,249.88
  1366. smsp__inst_executed.avg inst 189,274.94
  1367. smsp__inst_executed.max inst 192,335
  1368. smsp__inst_executed.min inst 186,200
  1369. smsp__inst_executed.sum inst 12,113,596
  1370. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.97
  1371. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13
  1372. smsp__cycles_active.avg cycle 316,573.25
  1373. smsp__cycles_active.sum cycle 20,260,688
  1374. ---------------------------------------------------------------------- --------------- ------------------------------
  1375. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7
  1376. Section: Command line profiler metrics
  1377. ---------------------------------------------------------------------- --------------- ------------------------------
  1378. gpu__time_duration.sum usecond 60.42
  1379. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1380. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1381. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1382. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1383. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1384. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1385. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1386. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1387. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1388. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1389. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1390. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1391. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1392. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1393. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1394. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1395. smsp__inst_executed.avg inst 12,287.78
  1396. smsp__inst_executed.max inst 12,852
  1397. smsp__inst_executed.min inst 11,520
  1398. smsp__inst_executed.sum inst 786,418
  1399. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1400. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1401. smsp__cycles_active.avg cycle 75,112.75
  1402. smsp__cycles_active.sum cycle 4,807,216
  1403. ---------------------------------------------------------------------- --------------- ------------------------------
  1404. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7
  1405. Section: Command line profiler metrics
  1406. ---------------------------------------------------------------------- --------------- ------------------------------
  1407. gpu__time_duration.sum usecond 58.91
  1408. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1409. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1410. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1411. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1412. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1413. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1414. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1415. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1416. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1417. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1418. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1419. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1420. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1421. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1422. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1423. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1424. smsp__inst_executed.avg inst 12,288.53
  1425. smsp__inst_executed.max inst 12,679
  1426. smsp__inst_executed.min inst 11,900
  1427. smsp__inst_executed.sum inst 786,466
  1428. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1429. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1430. smsp__cycles_active.avg cycle 72,599.36
  1431. smsp__cycles_active.sum cycle 4,646,359
  1432. ---------------------------------------------------------------------- --------------- ------------------------------
  1433. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:35, Context 1, Stream 7
  1434. Section: Command line profiler metrics
  1435. ---------------------------------------------------------------------- --------------- ------------------------------
  1436. gpu__time_duration.sum usecond 58.59
  1437. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1438. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1439. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1440. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1441. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1442. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1443. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1444. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1445. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1446. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1447. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1448. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1449. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1450. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1451. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1452. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1453. smsp__inst_executed.avg inst 12,288.06
  1454. smsp__inst_executed.max inst 12,478
  1455. smsp__inst_executed.min inst 12,100
  1456. smsp__inst_executed.sum inst 786,436
  1457. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1458. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1459. smsp__cycles_active.avg cycle 71,386.41
  1460. smsp__cycles_active.sum cycle 4,568,730
  1461. ---------------------------------------------------------------------- --------------- ------------------------------
  1462. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7
  1463. Section: Command line profiler metrics
  1464. ---------------------------------------------------------------------- --------------- ------------------------------
  1465. gpu__time_duration.sum usecond 60.22
  1466. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1467. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1468. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1469. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1470. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  1471. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1472. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1473. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1474. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1475. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1476. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1477. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1478. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1479. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1480. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1481. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1482. smsp__inst_executed.avg inst 12,289.20
  1483. smsp__inst_executed.max inst 12,496
  1484. smsp__inst_executed.min inst 11,920
  1485. smsp__inst_executed.sum inst 786,509
  1486. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1487. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1488. smsp__cycles_active.avg cycle 73,866.02
  1489. smsp__cycles_active.sum cycle 4,727,425
  1490. ---------------------------------------------------------------------- --------------- ------------------------------
  1491. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7
  1492. Section: Command line profiler metrics
  1493. ---------------------------------------------------------------------- --------------- ------------------------------
  1494. gpu__time_duration.sum usecond 58.59
  1495. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1496. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1497. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1498. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1499. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48
  1500. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1501. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1502. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1503. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1504. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1505. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1506. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1507. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1508. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1509. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1510. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1511. smsp__inst_executed.avg inst 12,289.89
  1512. smsp__inst_executed.max inst 12,675
  1513. smsp__inst_executed.min inst 11,914
  1514. smsp__inst_executed.sum inst 786,553
  1515. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1516. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1517. smsp__cycles_active.avg cycle 71,482.77
  1518. smsp__cycles_active.sum cycle 4,574,897
  1519. ---------------------------------------------------------------------- --------------- ------------------------------
  1520. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7
  1521. Section: Command line profiler metrics
  1522. ---------------------------------------------------------------------- --------------- ------------------------------
  1523. gpu__time_duration.sum usecond 58.69
  1524. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1525. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1526. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1527. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1528. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46
  1529. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  1530. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1531. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1532. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1533. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1534. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1535. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1536. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1537. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1538. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1539. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1540. smsp__inst_executed.avg inst 12,291.39
  1541. smsp__inst_executed.max inst 12,501
  1542. smsp__inst_executed.min inst 12,038
  1543. smsp__inst_executed.sum inst 786,649
  1544. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1545. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1546. smsp__cycles_active.avg cycle 71,637.31
  1547. smsp__cycles_active.sum cycle 4,584,788
  1548. ---------------------------------------------------------------------- --------------- ------------------------------
  1549. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7
  1550. Section: Command line profiler metrics
  1551. ---------------------------------------------------------------------- --------------- ------------------------------
  1552. gpu__time_duration.sum usecond 58.56
  1553. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1554. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1555. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1556. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1557. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.43
  1558. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.98
  1559. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1560. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1561. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1562. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1563. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1564. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1565. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1566. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1567. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1568. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1569. smsp__inst_executed.avg inst 12,294.56
  1570. smsp__inst_executed.max inst 12,543
  1571. smsp__inst_executed.min inst 11,712
  1572. smsp__inst_executed.sum inst 786,852
  1573. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1574. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1575. smsp__cycles_active.avg cycle 71,770.67
  1576. smsp__cycles_active.sum cycle 4,593,323
  1577. ---------------------------------------------------------------------- --------------- ------------------------------
  1578. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7
  1579. Section: Command line profiler metrics
  1580. ---------------------------------------------------------------------- --------------- ------------------------------
  1581. gpu__time_duration.sum usecond 58.05
  1582. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1583. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1584. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1585. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1586. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.35
  1587. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.95
  1588. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1589. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1590. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1591. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1592. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1593. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1594. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1595. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1596. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1597. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1598. smsp__inst_executed.avg inst 12,301.70
  1599. smsp__inst_executed.max inst 12,527
  1600. smsp__inst_executed.min inst 12,071
  1601. smsp__inst_executed.sum inst 787,309
  1602. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1603. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1604. smsp__cycles_active.avg cycle 71,540.92
  1605. smsp__cycles_active.sum cycle 4,578,619
  1606. ---------------------------------------------------------------------- --------------- ------------------------------
  1607. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7
  1608. Section: Command line profiler metrics
  1609. ---------------------------------------------------------------------- --------------- ------------------------------
  1610. gpu__time_duration.sum usecond 59.23
  1611. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1612. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1613. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1614. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1615. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.23
  1616. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  1617. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1618. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1619. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1620. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1621. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1622. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1623. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1624. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1625. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1626. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1627. smsp__inst_executed.avg inst 12,314.02
  1628. smsp__inst_executed.max inst 12,699
  1629. smsp__inst_executed.min inst 11,912
  1630. smsp__inst_executed.sum inst 788,097
  1631. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1632. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1633. smsp__cycles_active.avg cycle 72,509.19
  1634. smsp__cycles_active.sum cycle 4,640,588
  1635. ---------------------------------------------------------------------- --------------- ------------------------------
  1636. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7
  1637. Section: Command line profiler metrics
  1638. ---------------------------------------------------------------------- --------------- ------------------------------
  1639. gpu__time_duration.sum usecond 232.10
  1640. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1641. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1642. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1643. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1644. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1645. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  1646. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  1647. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  1648. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  1649. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  1650. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,689.25
  1651. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 20,069
  1652. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,389
  1653. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 315,028
  1654. smsp__average_warp_latency_issue_stalled_barrier.pct % 124,193.57
  1655. smsp__average_warp_latency_issue_stalled_barrier.ratio 1,241.94
  1656. smsp__inst_executed.avg inst 189,278.17
  1657. smsp__inst_executed.max inst 192,324
  1658. smsp__inst_executed.min inst 186,272
  1659. smsp__inst_executed.sum inst 12,113,803
  1660. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 12.87
  1661. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13
  1662. smsp__cycles_active.avg cycle 316,996.25
  1663. smsp__cycles_active.sum cycle 20,287,760
  1664. ---------------------------------------------------------------------- --------------- ------------------------------
  1665. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7
  1666. Section: Command line profiler metrics
  1667. ---------------------------------------------------------------------- --------------- ------------------------------
  1668. gpu__time_duration.sum usecond 56.90
  1669. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1670. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1671. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1672. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1673. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1674. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1675. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1676. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1677. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1678. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1679. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1680. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1681. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1682. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1683. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1684. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1685. smsp__inst_executed.avg inst 12,287.95
  1686. smsp__inst_executed.max inst 12,680
  1687. smsp__inst_executed.min inst 11,896
  1688. smsp__inst_executed.sum inst 786,429
  1689. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1690. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1691. smsp__cycles_active.avg cycle 68,417.62
  1692. smsp__cycles_active.sum cycle 4,378,728
  1693. ---------------------------------------------------------------------- --------------- ------------------------------
  1694. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7
  1695. Section: Command line profiler metrics
  1696. ---------------------------------------------------------------------- --------------- ------------------------------
  1697. gpu__time_duration.sum usecond 60.45
  1698. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1699. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1700. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1701. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1702. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1703. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1704. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1705. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1706. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1707. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1708. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1709. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1710. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1711. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1712. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1713. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1714. smsp__inst_executed.avg inst 12,287.66
  1715. smsp__inst_executed.max inst 12,672
  1716. smsp__inst_executed.min inst 11,908
  1717. smsp__inst_executed.sum inst 786,410
  1718. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1719. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1720. smsp__cycles_active.avg cycle 74,576.05
  1721. smsp__cycles_active.sum cycle 4,772,867
  1722. ---------------------------------------------------------------------- --------------- ------------------------------
  1723. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7
  1724. Section: Command line profiler metrics
  1725. ---------------------------------------------------------------------- --------------- ------------------------------
  1726. gpu__time_duration.sum usecond 58.85
  1727. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1728. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1729. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1730. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1731. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1732. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1733. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1734. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1735. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1736. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1737. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1738. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1739. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1740. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1741. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1742. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1743. smsp__inst_executed.avg inst 12,287.80
  1744. smsp__inst_executed.max inst 12,492
  1745. smsp__inst_executed.min inst 12,088
  1746. smsp__inst_executed.sum inst 786,419
  1747. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1748. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1749. smsp__cycles_active.avg cycle 72,754.67
  1750. smsp__cycles_active.sum cycle 4,656,299
  1751. ---------------------------------------------------------------------- --------------- ------------------------------
  1752. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:36, Context 1, Stream 7
  1753. Section: Command line profiler metrics
  1754. ---------------------------------------------------------------------- --------------- ------------------------------
  1755. gpu__time_duration.sum usecond 58.85
  1756. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1757. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1758. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1759. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1760. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  1761. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1762. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1763. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1764. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1765. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1766. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1767. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1768. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1769. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1770. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1771. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1772. smsp__inst_executed.avg inst 12,287.94
  1773. smsp__inst_executed.max inst 12,660
  1774. smsp__inst_executed.min inst 12,090
  1775. smsp__inst_executed.sum inst 786,428
  1776. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1777. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1778. smsp__cycles_active.avg cycle 72,405.38
  1779. smsp__cycles_active.sum cycle 4,633,944
  1780. ---------------------------------------------------------------------- --------------- ------------------------------
  1781. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:37, Context 1, Stream 7
  1782. Section: Command line profiler metrics
  1783. ---------------------------------------------------------------------- --------------- ------------------------------
  1784. gpu__time_duration.sum usecond 59.90
  1785. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1786. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1787. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1788. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1789. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.49
  1790. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4.00
  1791. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1792. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1793. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1794. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1795. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1796. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1797. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1798. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1799. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1800. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1801. smsp__inst_executed.avg inst 12,289.11
  1802. smsp__inst_executed.max inst 12,516
  1803. smsp__inst_executed.min inst 11,872
  1804. smsp__inst_executed.sum inst 786,503
  1805. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1806. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1807. smsp__cycles_active.avg cycle 74,029.31
  1808. smsp__cycles_active.sum cycle 4,737,876
  1809. ---------------------------------------------------------------------- --------------- ------------------------------
  1810. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:37, Context 1, Stream 7
  1811. Section: Command line profiler metrics
  1812. ---------------------------------------------------------------------- --------------- ------------------------------
  1813. gpu__time_duration.sum usecond 58.08
  1814. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1815. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1816. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1817. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1818. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.48
  1819. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  1820. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1821. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1822. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1823. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1824. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1825. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1826. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1827. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1828. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1829. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1830. smsp__inst_executed.avg inst 12,288.77
  1831. smsp__inst_executed.max inst 12,648
  1832. smsp__inst_executed.min inst 11,890
  1833. smsp__inst_executed.sum inst 786,481
  1834. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1835. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1836. smsp__cycles_active.avg cycle 70,229.30
  1837. smsp__cycles_active.sum cycle 4,494,675
  1838. ---------------------------------------------------------------------- --------------- ------------------------------
  1839. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:37, Context 1, Stream 7
  1840. Section: Command line profiler metrics
  1841. ---------------------------------------------------------------------- --------------- ------------------------------
  1842. gpu__time_duration.sum usecond 58.27
  1843. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1844. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1845. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1846. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1847. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.46
  1848. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.99
  1849. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1850. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1851. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1852. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1853. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1854. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1855. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1856. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1857. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1858. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1859. smsp__inst_executed.avg inst 12,290.17
  1860. smsp__inst_executed.max inst 12,491
  1861. smsp__inst_executed.min inst 12,062
  1862. smsp__inst_executed.sum inst 786,571
  1863. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1864. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1865. smsp__cycles_active.avg cycle 71,537.89
  1866. smsp__cycles_active.sum cycle 4,578,425
  1867. ---------------------------------------------------------------------- --------------- ------------------------------
  1868. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:37, Context 1, Stream 7
  1869. Section: Command line profiler metrics
  1870. ---------------------------------------------------------------------- --------------- ------------------------------
  1871. gpu__time_duration.sum usecond 58.46
  1872. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1873. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1874. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1875. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1876. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.42
  1877. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.97
  1878. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1879. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1880. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1881. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1882. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1883. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1884. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1885. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1886. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1887. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1888. smsp__inst_executed.avg inst 12,294.28
  1889. smsp__inst_executed.max inst 12,521
  1890. smsp__inst_executed.min inst 12,037
  1891. smsp__inst_executed.sum inst 786,834
  1892. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1893. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1894. smsp__cycles_active.avg cycle 71,773.67
  1895. smsp__cycles_active.sum cycle 4,593,515
  1896. ---------------------------------------------------------------------- --------------- ------------------------------
  1897. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:37, Context 1, Stream 7
  1898. Section: Command line profiler metrics
  1899. ---------------------------------------------------------------------- --------------- ------------------------------
  1900. gpu__time_duration.sum usecond 57.95
  1901. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1902. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1903. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1904. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1905. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.36
  1906. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.96
  1907. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1908. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1909. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1910. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1911. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1912. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1913. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1914. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1915. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1916. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1917. smsp__inst_executed.avg inst 12,298.47
  1918. smsp__inst_executed.max inst 12,737
  1919. smsp__inst_executed.min inst 11,886
  1920. smsp__inst_executed.sum inst 787,102
  1921. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1922. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1923. smsp__cycles_active.avg cycle 71,021.17
  1924. smsp__cycles_active.sum cycle 4,545,355
  1925. ---------------------------------------------------------------------- --------------- ------------------------------
  1926. void interBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:37, Context 1, Stream 7
  1927. Section: Command line profiler metrics
  1928. ---------------------------------------------------------------------- --------------- ------------------------------
  1929. gpu__time_duration.sum usecond 59.26
  1930. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1931. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1932. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1933. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1934. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.21
  1935. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 3.91
  1936. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 0
  1937. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 0
  1938. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 0
  1939. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 0
  1940. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 0
  1941. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 0
  1942. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 0
  1943. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 0
  1944. smsp__average_warp_latency_issue_stalled_barrier.pct % 0
  1945. smsp__average_warp_latency_issue_stalled_barrier.ratio 0
  1946. smsp__inst_executed.avg inst 12,309.88
  1947. smsp__inst_executed.max inst 12,759
  1948. smsp__inst_executed.min inst 12,026
  1949. smsp__inst_executed.sum inst 787,832
  1950. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 0
  1951. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0
  1952. smsp__cycles_active.avg cycle 72,721.27
  1953. smsp__cycles_active.sum cycle 4,654,161
  1954. ---------------------------------------------------------------------- --------------- ------------------------------
  1955. void inBlockStep<unsigned int>(T1 *, unsigned long, unsigned long, unsigned long), 2025-Feb-16 14:04:37, Context 1, Stream 7
  1956. Section: Command line profiler metrics
  1957. ---------------------------------------------------------------------- --------------- ------------------------------
  1958. gpu__time_duration.sum usecond 231.97
  1959. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
  1960. l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum (!) n/a
  1961. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
  1962. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request 4
  1963. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.pct % 12.50
  1964. l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request 4
  1965. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.avg 32,768
  1966. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.max 33,280
  1967. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.min 32,256
  1968. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_read.sum 524,288
  1969. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.avg 19,679.75
  1970. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.max 19,990
  1971. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.min 19,275
  1972. l1tex__data_pipe_lsu_wavefronts_mem_shared_cmd_write.sum 314,876
  1973. smsp__average_warp_latency_issue_stalled_barrier.pct % 125,400.79
  1974. smsp__average_warp_latency_issue_stalled_barrier.ratio 1,254.01
  1975. smsp__inst_executed.avg inst 189,032.33
  1976. smsp__inst_executed.max inst 192,028
  1977. smsp__inst_executed.min inst 186,044
  1978. smsp__inst_executed.sum inst 12,098,069
  1979. smsp__warp_issue_stalled_barrier_per_warp_active.pct % 13.03
  1980. smsp__warp_issue_stalled_barrier_per_warp_active.ratio 0.13
  1981. smsp__cycles_active.avg cycle 316,085.12
  1982. smsp__cycles_active.sum cycle 20,229,448
  1983. ---------------------------------------------------------------------- --------------- ------------------------------