A triangle counting assignment for A.U.TH Parallel and distributed systems class.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

344 lines
10 KiB

  1. /*!
  2. * \file v4.cpp
  3. * \brief vv3 part of the exercise.
  4. *
  5. * \author
  6. * Christos Choutouridis AEM:8997
  7. * <cchoutou@ece.auth.gr>
  8. */
  9. #include <v4.h>
  10. namespace v4 {
  11. #if defined CILK
  12. /*!
  13. * Utility function to get/set the number of threads.
  14. *
  15. * The number of threads are controlled via environment variable \c CILK_NWORKERS
  16. *
  17. * \return The number of threads used.
  18. * \note
  19. * The user can reduce the number with the command option \c --max_threads.
  20. * If so the requested number will be used even if the environment has more threads available.
  21. */
  22. int nworkers() {
  23. if (session.max_threads)
  24. return (session.max_threads < __cilkrts_get_nworkers()) ?
  25. session.max_threads : __cilkrts_get_nworkers();
  26. else
  27. return __cilkrts_get_nworkers();
  28. }
  29. /*!
  30. * Calculate and return a vertex-wise count vector.
  31. *
  32. * 1
  33. * vector = --- * (A.* (A*B))*ones_N
  34. * 2
  35. * We squeezed all that to one function for performance. The row*column multiplication
  36. * uses the inner CSC structure of sparse matrix and follows only non-zero members
  37. * with a time complexity of \$ O(nnz1 + nnz2) \$
  38. *
  39. * \param A The first matrix to use.
  40. * \param B The second matrix to use (they can be the same).
  41. * \return The count vector. RVO is used here.
  42. * \note
  43. * We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
  44. * - A full matrix calculation
  45. * - A lower triangular matrix
  46. * \warning
  47. * The later(--triangular_only) produce correct results ONLY if we are after the total count.
  48. */
  49. std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
  50. std::vector<value_t> c(A.size());
  51. cilk_for (int i=0 ; i<A.size() ; ++i) {
  52. for (auto j = A.getRow(i); j.index() != j.end() ; ++j){
  53. c[i] += A.getRow(i)*B.getCol(j.index());
  54. }
  55. if (session.makeSymmetric) c[i] /= 2;
  56. }
  57. return c;
  58. }
  59. /*!
  60. * A sum utility to use as spawn function for parallelized sum.
  61. * \return The sum of \c v from \c begin to \c end.
  62. */
  63. void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) {
  64. for (auto i =begin ; i != end ; ++i)
  65. out_sum += v[i];
  66. }
  67. /*!
  68. * A parallelized version of sum. Just because ;)
  69. * \return The total sum of vector \c v
  70. */
  71. value_t sum (std::vector<value_t>& v) {
  72. int n = nworkers();
  73. std::vector<value_t> sum_v(n, 0); // result of each do_sum invocation.
  74. // We spawn workers in a more statically way.
  75. for (index_t i =0 ; i < n ; ++i) {
  76. cilk_spawn do_sum(sum_v[i], v, i*v.size()/n, (i+1)*v.size()/n);
  77. }
  78. cilk_sync;
  79. // sum the sums (a sum to rule them all)
  80. value_t s =0; for (auto& it : sum_v) s += it;
  81. return s;
  82. }
  83. #elif defined OMP
  84. /*!
  85. * Utility function to get/set the number of threads.
  86. *
  87. * The number of threads are controlled via environment variable \c OMP_NUM_THREADS
  88. *
  89. * \return The number of threads used.
  90. * \note
  91. * The user can reduce the number with the command option \c --max_threads.
  92. * If so the requested number will be used even if the environment has more threads available.
  93. */
  94. int nworkers() {
  95. if (session.max_threads && session.max_threads < (size_t)omp_get_max_threads()) {
  96. omp_set_dynamic(0);
  97. omp_set_num_threads(session.max_threads);
  98. return session.max_threads;
  99. }
  100. else {
  101. omp_set_dynamic(1);
  102. return omp_get_max_threads();
  103. }
  104. }
  105. /*!
  106. * Calculate and return a vertex-wise count vector.
  107. *
  108. * 1
  109. * vector = --- * (A.* (A*B))*ones_N
  110. * 2
  111. * We squeezed all that to one function for performance. The row*column multiplication
  112. * uses the inner CSC structure of sparse matrix and follows only non-zero members
  113. * with a time complexity of \$ O(nnz1 + nnz2) \$
  114. *
  115. * \param A The first matrix to use.
  116. * \param B The second matrix to use (they can be the same).
  117. * \return The count vector. RVO is used here.
  118. * \note
  119. * We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
  120. * - A full matrix calculation
  121. * - A lower triangular matrix
  122. * \warning
  123. * The later(--triangular_only) produce correct results ONLY if we are after the total count.
  124. */
  125. std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
  126. std::vector<value_t> c(A.size());
  127. // OMP schedule selection
  128. if (session.dynamic) omp_set_schedule (omp_sched_dynamic, 0);
  129. else omp_set_schedule (omp_sched_static, 0);
  130. #pragma omp parallel for shared(c) schedule(runtime)
  131. for (int i=0 ; i<A.size() ; ++i) {
  132. for (auto j = A.getRow(i); j.index() != j.end() ; ++j) {
  133. c[i] += A.getRow(i)*B.getCol(j.index());
  134. }
  135. if (session.makeSymmetric) c[i] /= 2;
  136. }
  137. return c;
  138. }
  139. /*!
  140. * A parallelized version of sum. Just because ;)
  141. * \return The total sum of vector \c v
  142. */
  143. value_t sum (std::vector<value_t>& v) {
  144. value_t s =0;
  145. #pragma omp parallel for reduction(+:s)
  146. for (auto i =0u ; i<v.size() ; ++i)
  147. s += v[i];
  148. return s;
  149. }
  150. #elif defined THREADS
  151. /*!
  152. * Utility function to get/set the number of threads.
  153. *
  154. * The number of threads are inherited by the environment via std::thread::hardware_concurrency()
  155. *
  156. * \return The number of threads used.
  157. * \note
  158. * The user can reduce the number with the command option \c --max_threads.
  159. * If so the requested number will be used even if the environment has more threads available.
  160. */
  161. int nworkers() {
  162. if (session.max_threads)
  163. return (session.max_threads < std::thread::hardware_concurrency()) ?
  164. session.max_threads : std::thread::hardware_concurrency();
  165. else
  166. return std::thread::hardware_concurrency();
  167. }
  168. /*!
  169. * A spawn function to calculate and return a vertex-wise count vector.
  170. *
  171. * 1
  172. * vector(begin..end) = --- * (A.* (A*B))*ones_N
  173. * 2
  174. *
  175. * We squeezed all that to one function for performance. The row*column multiplication
  176. * uses the inner CSC structure of sparse matrix and follows only non-zero members
  177. * with a time complexity of \$ O(nnz1 + nnz2) \$
  178. *
  179. * \param out Reference to output vector
  180. * \param A The first matrix to use.
  181. * \param B The second matrix to use (they can be the same).
  182. * \param iton vector containing the range with the columns to use (it can be shuffled).
  183. * \return The count vector. RVO is used here.
  184. * \note
  185. * We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
  186. * - A full matrix calculation
  187. * - A lower triangular matrix
  188. * \warning
  189. * The later(--triangular_only) produce correct results ONLY if we are after the total count.
  190. */
  191. std::vector<value_t> mmacc_v_rng(
  192. std::vector<value_t>& out, matrix& A, matrix& B, std::vector<index_t>& iton, index_t begin, index_t end) {
  193. for (index_t i=begin ; i<end ; ++i) {
  194. index_t ii = iton[i];
  195. for (auto j = A.getRow(ii); j.index() != j.end() ; ++j){
  196. out[ii] += A.getRow(ii)*B.getCol(j.index());
  197. }
  198. if (session.makeSymmetric) out[ii] /= 2;
  199. }
  200. return out;
  201. }
  202. /*!
  203. * Calculate and return a vertex-wise count vector.
  204. *
  205. * \param A The first matrix to use.
  206. * \param B The second matrix to use (they can be the same).
  207. * \return The count vector. RVO is used here.
  208. */
  209. std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
  210. std::vector<std::thread> workers;
  211. std::vector<value_t> c(A.size());
  212. int n = nworkers();
  213. std::vector<index_t> iton(A.size()); // Create a 0 .. N range for outer loop
  214. std::iota(iton.begin(), iton.end(), 0);
  215. if (session.dynamic) // in case of dynamic scheduling, shuffle the range
  216. std::shuffle(iton.begin(), iton.end(), std::mt19937{std::random_device{}()});
  217. for (index_t i=0 ; i<n ; ++i) // dispatch the workers and hold them in a vector
  218. workers.push_back (
  219. std::thread (mmacc_v_rng, std::ref(c), std::ref(A), std::ref(B), std::ref(iton), i*A.size()/n, (i+1)*A.size()/n)
  220. );
  221. // a for to join them all...
  222. std::for_each(workers.begin(), workers.end(), [](std::thread& t){
  223. t.join();
  224. });
  225. return c;
  226. }
  227. /*!
  228. * A sum utility to use as spawn function for parallelized sum.
  229. * \return The sum of \c v from \c begin to \c end.
  230. */
  231. void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) {
  232. for (auto i =begin ; i != end ; ++i)
  233. out_sum += v[i];
  234. }
  235. /*!
  236. * A parallelized version of sum. Just because ;)
  237. * \return The total sum of vector \c v
  238. */
  239. value_t sum (std::vector<value_t>& v) {
  240. int n = nworkers();
  241. std::vector<value_t> sum_v(n, 0); // result of each do_sum invocation.
  242. std::vector<std::thread> workers;
  243. // We spawn workers in a more statically way.
  244. for (index_t i =0 ; i < n ; ++i)
  245. workers.push_back (std::thread (do_sum, std::ref(sum_v[i]), std::ref(v), i*v.size()/n, (i+1)*v.size()/n));
  246. std::for_each(workers.begin(), workers.end(), [](std::thread& t){
  247. t.join();
  248. });
  249. // sum the sums (a sum to rule them all)
  250. value_t s =0; for (auto& it : sum_v) s += it;
  251. return s;
  252. }
  253. #else
  254. //! Return the number of workers.
  255. //! \note This function is just for completion
  256. int nworkers() { return 1; }
  257. /*!
  258. * Calculate and return a vertex-wise count vector.
  259. *
  260. * 1
  261. * vector = --- * (A.* (A*B))*ones_N
  262. * 2
  263. *
  264. * We squeezed all that to one function for performance. The row*column multiplication
  265. * uses the inner CSC structure of sparse matrix and follows only non-zero members
  266. * with a time complexity of \$ O(nnz1 + nnz2) \$
  267. *
  268. * \param A The first matrix to use.
  269. * \param B The second matrix to use (they can be the same).
  270. * \return The count vector. RVO is used here.
  271. * \note
  272. * We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
  273. * - A full matrix calculation
  274. * - A lower triangular matrix
  275. * \warning
  276. * The later(--triangular_only) produce correct results ONLY if we are after the total count.
  277. */
  278. std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
  279. std::vector<value_t> c(A.size());
  280. for (int i=0 ; i<A.size() ; ++i) {
  281. for (auto j = A.getRow(i); j.index() != j.end() ; ++j){
  282. c[i] += A.getRow(i)*B.getCol(j.index());
  283. }
  284. if (session.makeSymmetric) c[i] /= 2;
  285. }
  286. return c;
  287. }
  288. /*!
  289. * Summation functionality.
  290. * \return The total sum of vector \c v
  291. */
  292. value_t sum (std::vector<value_t>& v) {
  293. value_t s =0;
  294. for (auto& it : v)
  295. s += it;
  296. return s;
  297. }
  298. #endif
  299. //! Polymorphic interface function for count vector
  300. std::vector<value_t> triang_v(matrix& A) {
  301. return mmacc_v(A, A);
  302. }
  303. //! Polymorphic interface function for sum results
  304. value_t triang_count (std::vector<value_t>& c) {
  305. return (session.makeSymmetric) ? sum(c)/3 : sum(c);
  306. }
  307. } // namespace v4