A triangle counting assignment for A.U.TH Parallel and distributed systems class.
Du kannst nicht mehr als 25 Themen auswählen Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.
 
 
 
 
 
 

344 Zeilen
10 KiB

  1. /*!
  2. * \file v4.cpp
  3. * \brief vv3 part of the exercise.
  4. *
  5. * \author
  6. * Christos Choutouridis AEM:8997
  7. * <cchoutou@ece.auth.gr>
  8. */
  9. #include <v4.h>
  10. namespace v4 {
  11. #if defined CILK
  12. /*!
  13. * Utility function to get/set the number of threads.
  14. *
  15. * The number of threads are controlled via environment variable \c CILK_NWORKERS
  16. *
  17. * \return The number of threads used.
  18. * \note
  19. * The user can reduce the number with the command option \c --max_threads.
  20. * If so the requested number will be used even if the environment has more threads available.
  21. */
  22. int nworkers() {
  23. if (session.max_threads)
  24. return (session.max_threads < __cilkrts_get_nworkers()) ?
  25. session.max_threads : __cilkrts_get_nworkers();
  26. else
  27. return __cilkrts_get_nworkers();
  28. }
  29. /*!
  30. * Calculate and return a vertex-wise count vector.
  31. *
  32. * 1
  33. * vector = --- * (A.* (A*B))*ones_N
  34. * 2
  35. * We squeezed all that to one function for performance. The row*column multiplication
  36. * uses the inner CSC structure of sparse matrix and follows only non-zero members
  37. * with a time complexity of \$ O(nnz1 + nnz2) \$
  38. *
  39. * \param A The first matrix to use.
  40. * \param B The second matrix to use (they can be the same).
  41. * \return The count vector. RVO is used here.
  42. * \note
  43. * We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
  44. * - A full matrix calculation
  45. * - A lower triangular matrix
  46. * \warning
  47. * The later(--triangular_only) produce correct results ONLY if we are after the total count.
  48. */
  49. std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
  50. std::vector<value_t> c(A.size());
  51. cilk_for (int i=0 ; i<A.size() ; ++i) {
  52. for (auto j = A.getRow(i); j.index() != j.end() ; ++j){
  53. c[i] += A.getRow(i)*B.getCol(j.index());
  54. }
  55. if (session.makeSymmetric) c[i] /= 2;
  56. }
  57. return c;
  58. }
  59. /*!
  60. * A sum utility to use as spawn function for parallelized sum.
  61. * \return The sum of \c v from \c begin to \c end.
  62. */
  63. void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) {
  64. for (auto i =begin ; i != end ; ++i)
  65. out_sum += v[i];
  66. }
  67. /*!
  68. * A parallelized version of sum. Just because ;)
  69. * \return The total sum of vector \c v
  70. */
  71. value_t sum (std::vector<value_t>& v) {
  72. int n = nworkers();
  73. std::vector<value_t> sum_v(n, 0); // result of each do_sum invocation.
  74. // We spawn workers in a more statically way.
  75. for (index_t i =0 ; i < n ; ++i) {
  76. cilk_spawn do_sum(sum_v[i], v, i*v.size()/n, (i+1)*v.size()/n);
  77. }
  78. cilk_sync;
  79. // sum the sums (a sum to rule them all)
  80. value_t s =0; for (auto& it : sum_v) s += it;
  81. return s;
  82. }
  83. #elif defined OMP
  84. /*!
  85. * Utility function to get/set the number of threads.
  86. *
  87. * The number of threads are controlled via environment variable \c OMP_NUM_THREADS
  88. *
  89. * \return The number of threads used.
  90. * \note
  91. * The user can reduce the number with the command option \c --max_threads.
  92. * If so the requested number will be used even if the environment has more threads available.
  93. */
  94. int nworkers() {
  95. if (session.max_threads && session.max_threads < (size_t)omp_get_max_threads()) {
  96. omp_set_dynamic(0);
  97. omp_set_num_threads(session.max_threads);
  98. return session.max_threads;
  99. }
  100. else {
  101. omp_set_dynamic(1);
  102. return omp_get_max_threads();
  103. }
  104. }
  105. /*!
  106. * Calculate and return a vertex-wise count vector.
  107. *
  108. * 1
  109. * vector = --- * (A.* (A*B))*ones_N
  110. * 2
  111. * We squeezed all that to one function for performance. The row*column multiplication
  112. * uses the inner CSC structure of sparse matrix and follows only non-zero members
  113. * with a time complexity of \$ O(nnz1 + nnz2) \$
  114. *
  115. * \param A The first matrix to use.
  116. * \param B The second matrix to use (they can be the same).
  117. * \return The count vector. RVO is used here.
  118. * \note
  119. * We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
  120. * - A full matrix calculation
  121. * - A lower triangular matrix
  122. * \warning
  123. * The later(--triangular_only) produce correct results ONLY if we are after the total count.
  124. */
  125. std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
  126. std::vector<value_t> c(A.size());
  127. // OMP schedule selection
  128. if (session.dynamic) omp_set_schedule (omp_sched_dynamic, 0);
  129. else omp_set_schedule (omp_sched_static, 0);
  130. #pragma omp parallel for shared(c) schedule(runtime)
  131. for (int i=0 ; i<A.size() ; ++i) {
  132. for (auto j = A.getRow(i); j.index() != j.end() ; ++j) {
  133. c[i] += A.getRow(i)*B.getCol(j.index());
  134. }
  135. if (session.makeSymmetric) c[i] /= 2;
  136. }
  137. return c;
  138. }
  139. /*!
  140. * A parallelized version of sum. Just because ;)
  141. * \return The total sum of vector \c v
  142. */
  143. value_t sum (std::vector<value_t>& v) {
  144. value_t s =0;
  145. #pragma omp parallel for reduction(+:s)
  146. for (auto i =0u ; i<v.size() ; ++i)
  147. s += v[i];
  148. return s;
  149. }
  150. #elif defined THREADS
  151. /*!
  152. * Utility function to get/set the number of threads.
  153. *
  154. * The number of threads are inherited by the environment via std::thread::hardware_concurrency()
  155. *
  156. * \return The number of threads used.
  157. * \note
  158. * The user can reduce the number with the command option \c --max_threads.
  159. * If so the requested number will be used even if the environment has more threads available.
  160. */
  161. int nworkers() {
  162. if (session.max_threads)
  163. return (session.max_threads < std::thread::hardware_concurrency()) ?
  164. session.max_threads : std::thread::hardware_concurrency();
  165. else
  166. return std::thread::hardware_concurrency();
  167. }
  168. /*!
  169. * A spawn function to calculate and return a vertex-wise count vector.
  170. *
  171. * 1
  172. * vector(begin..end) = --- * (A.* (A*B))*ones_N
  173. * 2
  174. *
  175. * We squeezed all that to one function for performance. The row*column multiplication
  176. * uses the inner CSC structure of sparse matrix and follows only non-zero members
  177. * with a time complexity of \$ O(nnz1 + nnz2) \$
  178. *
  179. * \param out Reference to output vector
  180. * \param A The first matrix to use.
  181. * \param B The second matrix to use (they can be the same).
  182. * \param iton vector containing the range with the columns to use (it can be shuffled).
  183. * \return The count vector. RVO is used here.
  184. * \note
  185. * We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
  186. * - A full matrix calculation
  187. * - A lower triangular matrix
  188. * \warning
  189. * The later(--triangular_only) produce correct results ONLY if we are after the total count.
  190. */
  191. std::vector<value_t> mmacc_v_rng(
  192. std::vector<value_t>& out, matrix& A, matrix& B, std::vector<index_t>& iton, index_t begin, index_t end) {
  193. for (index_t i=begin ; i<end ; ++i) {
  194. index_t ii = iton[i];
  195. for (auto j = A.getRow(ii); j.index() != j.end() ; ++j){
  196. out[ii] += A.getRow(ii)*B.getCol(j.index());
  197. }
  198. if (session.makeSymmetric) out[ii] /= 2;
  199. }
  200. return out;
  201. }
  202. /*!
  203. * Calculate and return a vertex-wise count vector.
  204. *
  205. * \param A The first matrix to use.
  206. * \param B The second matrix to use (they can be the same).
  207. * \return The count vector. RVO is used here.
  208. */
  209. std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
  210. std::vector<std::thread> workers;
  211. std::vector<value_t> c(A.size());
  212. int n = nworkers();
  213. std::vector<index_t> iton(A.size()); // Create a 0 .. N range for outer loop
  214. std::iota(iton.begin(), iton.end(), 0);
  215. if (session.dynamic) // in case of dynamic scheduling, shuffle the range
  216. std::shuffle(iton.begin(), iton.end(), std::mt19937{std::random_device{}()});
  217. for (index_t i=0 ; i<n ; ++i) // dispatch the workers and hold them in a vector
  218. workers.push_back (
  219. std::thread (mmacc_v_rng, std::ref(c), std::ref(A), std::ref(B), std::ref(iton), i*A.size()/n, (i+1)*A.size()/n)
  220. );
  221. // a for to join them all...
  222. std::for_each(workers.begin(), workers.end(), [](std::thread& t){
  223. t.join();
  224. });
  225. return c;
  226. }
  227. /*!
  228. * A sum utility to use as spawn function for parallelized sum.
  229. * \return The sum of \c v from \c begin to \c end.
  230. */
  231. void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) {
  232. for (auto i =begin ; i != end ; ++i)
  233. out_sum += v[i];
  234. }
  235. /*!
  236. * A parallelized version of sum. Just because ;)
  237. * \return The total sum of vector \c v
  238. */
  239. value_t sum (std::vector<value_t>& v) {
  240. int n = nworkers();
  241. std::vector<value_t> sum_v(n, 0); // result of each do_sum invocation.
  242. std::vector<std::thread> workers;
  243. // We spawn workers in a more statically way.
  244. for (index_t i =0 ; i < n ; ++i)
  245. workers.push_back (std::thread (do_sum, std::ref(sum_v[i]), std::ref(v), i*v.size()/n, (i+1)*v.size()/n));
  246. std::for_each(workers.begin(), workers.end(), [](std::thread& t){
  247. t.join();
  248. });
  249. // sum the sums (a sum to rule them all)
  250. value_t s =0; for (auto& it : sum_v) s += it;
  251. return s;
  252. }
  253. #else
  254. //! Return the number of workers.
  255. //! \note This function is just for completion
  256. int nworkers() { return 1; }
  257. /*!
  258. * Calculate and return a vertex-wise count vector.
  259. *
  260. * 1
  261. * vector = --- * (A.* (A*B))*ones_N
  262. * 2
  263. *
  264. * We squeezed all that to one function for performance. The row*column multiplication
  265. * uses the inner CSC structure of sparse matrix and follows only non-zero members
  266. * with a time complexity of \$ O(nnz1 + nnz2) \$
  267. *
  268. * \param A The first matrix to use.
  269. * \param B The second matrix to use (they can be the same).
  270. * \return The count vector. RVO is used here.
  271. * \note
  272. * We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
  273. * - A full matrix calculation
  274. * - A lower triangular matrix
  275. * \warning
  276. * The later(--triangular_only) produce correct results ONLY if we are after the total count.
  277. */
  278. std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
  279. std::vector<value_t> c(A.size());
  280. for (int i=0 ; i<A.size() ; ++i) {
  281. for (auto j = A.getRow(i); j.index() != j.end() ; ++j){
  282. c[i] += A.getRow(i)*B.getCol(j.index());
  283. }
  284. if (session.makeSymmetric) c[i] /= 2;
  285. }
  286. return c;
  287. }
  288. /*!
  289. * Summation functionality.
  290. * \return The total sum of vector \c v
  291. */
  292. value_t sum (std::vector<value_t>& v) {
  293. value_t s =0;
  294. for (auto& it : v)
  295. s += it;
  296. return s;
  297. }
  298. #endif
  299. //! Polymorphic interface function for count vector
  300. std::vector<value_t> triang_v(matrix& A) {
  301. return mmacc_v(A, A);
  302. }
  303. //! Polymorphic interface function for sum results
  304. value_t triang_count (std::vector<value_t>& c) {
  305. return (session.makeSymmetric) ? sum(c)/3 : sum(c);
  306. }
  307. } // namespace v4