A triangle counting assignment for A.U.TH Parallel and distributed systems class.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

339 lines
11 KiB

  1. /*!
  2. * \file v4.cpp
* \brief v4 part of the exercise.
  4. *
  5. * \author
  6. * Christos Choutouridis AEM:8997
  7. * <cchoutou@ece.auth.gr>
  8. */
  9. #include <v4.h>
  10. namespace v4 {
  11. #if defined CILK
  12. /*!
  13. * Utility function to get/set the number of threads.
  14. *
  15. * The number of threads are controlled via environment variable \c CILK_NWORKERS
  16. *
  17. * \return The number of threads used.
  18. * \note
  19. * The user can reduce the number with the command option \c --max_threads.
  20. * If so the requested number will be used even if the environment has more threads available.
  21. */
  22. int nworkers() {
  23. if (session.max_threads)
  24. return (session.max_threads < __cilkrts_get_nworkers()) ?
  25. session.max_threads : __cilkrts_get_nworkers();
  26. else
  27. return __cilkrts_get_nworkers();
  28. }
/*!
 * Calculate and return a vertex-wise count vector.
 *
 *              1
 *   vector = --- * (A .* (A*B)) * ones_N
 *              2
 *
 * We squeezed all that to one function for performance. The row*column
 * multiplication uses the inner CSC structure of the sparse matrix and
 * follows only non-zero members.
 *
 * \param A The first matrix to use.
 * \param B The second matrix to use (they can be the same).
 * \return  The count vector. RVO is used here.
 * \note
 * We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
 * - A full matrix calculation which updates only c[i]
 * - A lower triangular matrix which updates c[i], c[j], c[k]. This is way faster.
 * \warning
 * The latter (--triangular_only) produces correct results ONLY if we are after the total count.
 */
std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
   std::vector<value_t> c(A.size());
   // Parallel outer loop: each iteration writes only c[i], so no locking is needed.
   cilk_for (int i=0 ; i<A.size() ; ++i) {
      // Walk only the non-zero entries of row i; j yields their column indexes.
      for (auto j = A.getRow(i); j.index() != j.end() ; ++j){
         // Sparse inner product: row i of A with column j.index() of B.
         c[i] += A.getRow(i)*B.getCol(j.index());
      }
      // Apply the formula's 1/2 factor in full (symmetric) matrix mode.
      if (session.makeSymmetric) c[i] /= 2;
   }
   return c;
}
  58. /*!
  59. * A sum utility to use as spawn function for parallelized sum.
  60. * \return The sum of \c v from \c begin to \c end.
  61. */
  62. void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) {
  63. for (auto i =begin ; i != end ; ++i)
  64. out_sum += v[i];
  65. }
/*!
 * A parallelized version of sum. Just because ;)
 *
 * Splits \c v into nworkers() near-equal contiguous chunks and spawns one
 * do_sum() task per chunk. Each task accumulates into its own slot of
 * \c sum_v, so the workers never write to the same location.
 *
 * \param v The vector to sum.
 * \return  The total sum of vector \c v
 */
value_t sum (std::vector<value_t>& v) {
   int n = nworkers();
   std::vector<value_t> sum_v(n, 0); // result of each do_sum invocation (one slot per worker).
   // We spawn workers in a more static way: worker i sums [i*size/n, (i+1)*size/n).
   for (index_t i =0 ; i < n ; ++i) {
      cilk_spawn do_sum(sum_v[i], v, i*v.size()/n, (i+1)*v.size()/n);
   }
   cilk_sync;   // wait for every spawned worker before reading sum_v
   // sum the sums (a sum to rule them all)
   value_t s =0; for (auto& it : sum_v) s += it;
   return s;
}
  82. #elif defined OMP
  83. /*!
  84. * Utility function to get/set the number of threads.
  85. *
  86. * The number of threads are controlled via environment variable \c OMP_NUM_THREADS
  87. *
  88. * \return The number of threads used.
  89. * \note
  90. * The user can reduce the number with the command option \c --max_threads.
  91. * If so the requested number will be used even if the environment has more threads available.
  92. */
  93. int nworkers() {
  94. if (session.max_threads && session.max_threads < (size_t)omp_get_max_threads()) {
  95. omp_set_dynamic(0);
  96. omp_set_num_threads(session.max_threads);
  97. return session.max_threads;
  98. }
  99. else {
  100. omp_set_dynamic(1);
  101. return omp_get_max_threads();
  102. }
  103. }
/*!
 * Calculate and return a vertex-wise count vector.
 *
 *              1
 *   vector = --- * (A .* (A*B)) * ones_N
 *              2
 *
 * We squeezed all that to one function for performance. The row*column
 * multiplication uses the inner CSC structure of the sparse matrix and
 * follows only non-zero members.
 *
 * \param A The first matrix to use.
 * \param B The second matrix to use (they can be the same).
 * \return  The count vector. RVO is used here.
 * \note
 * We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
 * - A full matrix calculation which updates only c[i]
 * - A lower triangular matrix which updates c[i], c[j], c[k]. This is way faster.
 * \warning
 * The latter (--triangular_only) produces correct results ONLY if we are after the total count.
 */
std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
   std::vector<value_t> c(A.size());
   // OMP schedule selection: --dynamic requests dynamic scheduling, default is static.
   // The pragma below uses schedule(runtime), so this choice takes effect there.
   if (session.dynamic) omp_set_schedule (omp_sched_dynamic, 0);
   else omp_set_schedule (omp_sched_static, 0);
   // Each iteration writes only c[i], so the shared vector needs no locking.
   #pragma omp parallel for shared(c) schedule(runtime)
   for (int i=0 ; i<A.size() ; ++i) {
      // Walk only the non-zero entries of row i.
      for (auto j = A.getRow(i); j.index() != j.end() ; ++j) {
         c[i] += A.getRow(i)*B.getCol(j.index());
      }
      // Apply the formula's 1/2 factor in full (symmetric) matrix mode.
      if (session.makeSymmetric) c[i] /= 2;
   }
   return c;
}
  137. /*!
  138. * A parallelized version of sum. Just because ;)
  139. * \return The total sum of vector \c v
  140. */
  141. value_t sum (std::vector<value_t>& v) {
  142. value_t s =0;
  143. #pragma omp parallel for reduction(+:s)
  144. for (auto i =0u ; i<v.size() ; ++i)
  145. s += v[i];
  146. return s;
  147. }
  148. #elif defined THREADS
  149. /*!
  150. * Utility function to get/set the number of threads.
  151. *
  152. * The number of threads are inherited by the environment via std::thread::hardware_concurrency()
  153. *
  154. * \return The number of threads used.
  155. * \note
  156. * The user can reduce the number with the command option \c --max_threads.
  157. * If so the requested number will be used even if the environment has more threads available.
  158. */
  159. int nworkers() {
  160. if (session.max_threads)
  161. return (session.max_threads < std::thread::hardware_concurrency()) ?
  162. session.max_threads : std::thread::hardware_concurrency();
  163. else
  164. return std::thread::hardware_concurrency();
  165. }
/*!
 * A spawn function to calculate part of the vertex-wise count vector.
 *
 *                          1
 *   vector(begin..end) = --- * (A .* (A*B)) * ones_N
 *                          2
 *
 * We squeezed all that to one function for performance. The row*column
 * multiplication uses the inner CSC structure of the sparse matrix and
 * follows only non-zero members.
 *
 * \param out   Reference to the shared output vector.
 * \param A     The first matrix to use.
 * \param B     The second matrix to use (they can be the same).
 * \param iton  Vector containing the row order to use (it can be shuffled).
 * \param begin First iton position this worker handles.
 * \param end   One-past-last iton position this worker handles.
 * \return      A copy of \c out (discarded when invoked through std::thread).
 * \note
 * Workers on disjoint [begin, end) slices write disjoint out[ii] slots as
 * long as \c iton is a permutation (it is built with iota/shuffle in
 * mmacc_v), so no locking is needed.
 * \warning
 * In --triangular_only mode the per-vertex values are correct ONLY for the total count.
 */
std::vector<value_t> mmacc_v_rng(
      std::vector<value_t>& out, matrix& A, matrix& B, std::vector<index_t>& iton, index_t begin, index_t end) {
   for (index_t i=begin ; i<end ; ++i) {
      index_t ii = iton[i];   // the actual row handled in this iteration
      // Walk only the non-zero entries of row ii.
      for (auto j = A.getRow(ii); j.index() != j.end() ; ++j){
         out[ii] += A.getRow(ii)*B.getCol(j.index());
      }
      // Apply the formula's 1/2 factor in full (symmetric) matrix mode.
      if (session.makeSymmetric) out[ii] /= 2;
   }
   // NOTE(review): returns by value — a full copy of `out`; the std::thread
   // callers in mmacc_v ignore it, so the copy is pure overhead.
   return out;
}
/*!
 * Calculate and return a vertex-wise count vector.
 *
 * Dispatches nworkers() std::thread workers, each running mmacc_v_rng()
 * over a contiguous slice of the (optionally shuffled) row order.
 *
 * \param A The first matrix to use.
 * \param B The second matrix to use (they can be the same).
 * \return  The count vector. RVO is used here.
 */
std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
   std::vector<std::thread> workers;
   std::vector<value_t> c(A.size());
   int n = nworkers();
   std::vector<index_t> iton(A.size());   // Create a 0 .. N range for the outer loop
   std::iota(iton.begin(), iton.end(), 0);
   if (session.dynamic)   // in case of dynamic scheduling, shuffle the range
      std::shuffle(iton.begin(), iton.end(), std::mt19937{std::random_device{}()});
   // Dispatch the workers and hold them in a vector. Worker i handles the
   // half-open slice [i*size/n, (i+1)*size/n) of iton; c/A/B/iton are passed
   // by std::ref because std::thread copies its arguments otherwise.
   for (index_t i=0 ; i<n ; ++i)
      workers.push_back (
         std::thread (mmacc_v_rng, std::ref(c), std::ref(A), std::ref(B), std::ref(iton), i*A.size()/n, (i+1)*A.size()/n)
      );
   // a for to join them all...
   std::for_each(workers.begin(), workers.end(), [](std::thread& t){
      t.join();
   });
   return c;
}
  224. /*!
  225. * A sum utility to use as spawn function for parallelized sum.
  226. * \return The sum of \c v from \c begin to \c end.
  227. */
  228. void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) {
  229. for (auto i =begin ; i != end ; ++i)
  230. out_sum += v[i];
  231. }
/*!
 * A parallelized version of sum. Just because ;)
 *
 * Splits \c v into nworkers() near-equal contiguous chunks and runs one
 * do_sum() thread per chunk. Each thread owns a private slot of \c sum_v,
 * so no synchronization beyond join() is required.
 *
 * \param v The vector to sum.
 * \return  The total sum of vector \c v
 */
value_t sum (std::vector<value_t>& v) {
   int n = nworkers();
   std::vector<value_t> sum_v(n, 0); // result of each do_sum invocation (one slot per worker).
   std::vector<std::thread> workers;
   // We spawn workers in a more static way: thread i sums [i*size/n, (i+1)*size/n).
   for (index_t i =0 ; i < n ; ++i)
      workers.push_back (std::thread (do_sum, std::ref(sum_v[i]), std::ref(v), i*v.size()/n, (i+1)*v.size()/n));
   std::for_each(workers.begin(), workers.end(), [](std::thread& t){
      t.join();   // all partial sums are complete after this loop
   });
   // sum the sums (a sum to rule them all)
   value_t s =0; for (auto& it : sum_v) s += it;
   return s;
}
  250. #else
  251. //! Return the number of workers.
  252. //! \note This function is just for completion
  253. int nworkers() { return 1; }
  254. /*!
  255. * Calculate and return a vertex-wise count vector.
  256. *
  257. * 1
  258. * vector = --- * (A.* (A*B))*ones_N
  259. * 2
  260. * We squeezed all that to one function for performance. The row*column multiplication
  261. * uses the inner CSC structure of sparse matrix and follows only non-zero members.
  262. *
  263. * \param A The first matrix to use.
  264. * \param B The second matrix to use (they can be the same).
  265. * \return The count vector. RVO is used here.
  266. * \note
  267. * We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
  268. * - A full matrix calculation which update only c[i]
  269. * - A lower triangular matrix which update c[i], c[j], c[k]. This is wayyy faster.
  270. * \warning
  271. * The later(--triangular_only) produce correct results ONLY if we are after the total count.
  272. */
  273. std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
  274. std::vector<value_t> c(A.size());
  275. for (int i=0 ; i<A.size() ; ++i) {
  276. for (auto j = A.getRow(i); j.index() != j.end() ; ++j){
  277. c[i] += A.getRow(i)*B.getCol(j.index());
  278. }
  279. if (session.makeSymmetric) c[i] /= 2;
  280. }
  281. return c;
  282. }
  283. /*!
  284. * Summation functionality.
  285. * \return The total sum of vector \c v
  286. */
  287. value_t sum (std::vector<value_t>& v) {
  288. value_t s =0;
  289. for (auto& it : v)
  290. s += it;
  291. return s;
  292. }
  293. #endif
  294. //! Polymorphic interface function for count vector
  295. std::vector<value_t> triang_v(matrix& A) {
  296. return mmacc_v(A, A);
  297. }
  298. //! Polymorphic interface function for sum results
  299. value_t triang_count (std::vector<value_t>& c) {
  300. return (session.makeSymmetric) ? sum(c)/3 : sum(c);
  301. }
  302. } // namespace v4