/*!
 * \file   v4.cpp
 * \brief  v4 part of the exercise.
 *
 * \author
 *    Christos Choutouridis AEM:8997
 *
 */
#include "v4.h"   // NOTE: the original include target was lost; "v4.h" is assumed to
                  //       declare matrix, session, value_t, index_t and pull in the std headers.

namespace v4 {

#if defined CILK

/*!
 * Utility function to get/set the number of threads.
 *
 * The number of threads is controlled via the environment variable \c CILK_NWORKERS.
 *
 * \return  The number of threads used.
 * \note
 *    The user can reduce the number with the command option \c --max_threads.
 *    If so, the requested number will be used even if the environment has more threads available.
 */
int nworkers() {
   if (session.max_threads)
      return (session.max_threads < __cilkrts_get_nworkers()) ?
         session.max_threads : __cilkrts_get_nworkers();
   else
      return __cilkrts_get_nworkers();
}

/*!
 * Calculate and return a vertex-wise count vector.
 *
 *              1
 *    vector = --- * (A .* (A*B)) * ones_N
 *              2
 *
 * We squeezed all that into one function for performance. The row*column multiplication
 * uses the inner CSC structure of the sparse matrix and follows only the non-zero members,
 * with a time complexity of \$ O(nnz1 + nnz2) \$.
 *
 * \param  A  The first matrix to use.
 * \param  B  The second matrix to use (they can be the same).
 * \return    The count vector. RVO is used here.
 * \note
 *    We use two methods of calculation based on \c --make_symmetric or \c --triangular_only:
 *    - A full matrix calculation
 *    - A lower triangular matrix
 * \warning
 *    The latter (\c --triangular_only) produces correct results ONLY if we are after the total count.
 */
std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
   std::vector<value_t> c(A.size());
   cilk_for (int i=0 ; i<A.size() ; ++i) {
      // NOTE: reconstructed body. `row_col_accumulate` stands for an assumed matrix
      //       member that accumulates the sparse row_i(A)*col_j(B) products over the
      //       non-zero A(i,j) of the CSC structure (O(nnz1 + nnz2) per pair).
      c[i] = A.row_col_accumulate(i, B);
      if (session.makeSymmetric)
         c[i] /= 2;     // the full (symmetric) matrix sees each neighbour pair twice
   }
   return c;
}

/*!
 * A partial-sum worker. Accumulates v[begin..end) into \c out_sum.
 */
void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) {
   for (auto i =begin ; i != end ; ++i)
      out_sum += v[i];
}

/*!
 * A parallelized version of sum. Just because ;)
 * \return  The total sum of vector \c v
 */
value_t sum (std::vector<value_t>& v) {
   int n = nworkers();
   std::vector<value_t> sum_v(n, 0);   // result of each do_sum invocation.

   // We spawn the workers statically.
   for (index_t i =0 ; i < n ; ++i) {
      cilk_spawn do_sum(sum_v[i], v, i*v.size()/n, (i+1)*v.size()/n);
   }
   cilk_sync;

   // sum the sums (a sum to rule them all)
   value_t s =0;
   for (auto& it : sum_v) s += it;
   return s;
}

#elif defined OMP

/*!
 * Utility function to get/set the number of threads.
 *
 * The number of threads is controlled via the environment variable \c OMP_NUM_THREADS.
 *
 * \return  The number of threads used.
 * \note
 *    The user can reduce the number with the command option \c --max_threads.
 *    If so, the requested number will be used even if the environment has more threads available.
 */
int nworkers() {
   if (session.max_threads && session.max_threads < (size_t)omp_get_max_threads()) {
      omp_set_dynamic(0);
      omp_set_num_threads(session.max_threads);
      return session.max_threads;
   }
   else {
      omp_set_dynamic(1);
      return omp_get_max_threads();
   }
}

/*!
 * Calculate and return a vertex-wise count vector.
 *
 *              1
 *    vector = --- * (A .* (A*B)) * ones_N
 *              2
 *
 * We squeezed all that into one function for performance. The row*column multiplication
 * uses the inner CSC structure of the sparse matrix and follows only the non-zero members,
 * with a time complexity of \$ O(nnz1 + nnz2) \$.
 *
 * \param  A  The first matrix to use.
 * \param  B  The second matrix to use (they can be the same).
 * \return    The count vector. RVO is used here.
 * \note
 *    We use two methods of calculation based on \c --make_symmetric or \c --triangular_only:
 *    - A full matrix calculation
 *    - A lower triangular matrix
 * \warning
 *    The latter (\c --triangular_only) produces correct results ONLY if we are after the total count.
 */
std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
   std::vector<value_t> c(A.size());

   // OMP schedule selection
   if (session.dynamic) omp_set_schedule (omp_sched_dynamic, 0);
   else                 omp_set_schedule (omp_sched_static, 0);
   #pragma omp parallel for shared(c) schedule(runtime)
   for (int i=0 ; i<A.size() ; ++i) {
      // [reconstructed] same assumed row/column accumulation as in the CILK version
      c[i] = A.row_col_accumulate(i, B);
      if (session.makeSymmetric)
         c[i] /= 2;
   }
   return c;
}

/*!
 * A parallelized version of sum using an OpenMP reduction.
 * \return  The total sum of vector \c v
 */
value_t sum (std::vector<value_t>& v) {
   value_t s =0;
   #pragma omp parallel for reduction(+:s)
   for (auto i =0u ; i < v.size() ; ++i)
      s += v[i];
   return s;
}

#elif defined THREADS

/*!
 * Utility function to get/set the number of threads.
 *
 * \return  The number of threads used.
 * \note
 *    The user can reduce the number with the command option \c --max_threads.
 *    [Reconstructed: assumed to mirror the CILK version, based on std::thread::hardware_concurrency().]
 */
int nworkers() {
   size_t n = std::thread::hardware_concurrency();
   return (session.max_threads && session.max_threads < n) ?
      session.max_threads : n;
}

/*!
 * A worker function that calculates the counts for the rows iton[begin..end).
 */
void mmacc_v_rng(
      std::vector<value_t>& out, matrix& A, matrix& B,
      std::vector<index_t>& iton, index_t begin, index_t end) {
   for (index_t i=begin ; i < end ; ++i) {
      // [reconstructed] same assumed row/column accumulation as in the CILK version
      out[iton[i]] = A.row_col_accumulate(iton[i], B);
      if (session.makeSymmetric)
         out[iton[i]] /= 2;
   }
}

/*!
 * Calculate and return a vertex-wise count vector (same formula and flags as the
 * versions above), using native std::thread workers.
 */
std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
   std::vector<std::thread> workers;
   std::vector<value_t> c(A.size());
   int n = nworkers();

   std::vector<index_t> iton(A.size());   // Create a 0 .. N range for the outer loop
   std::iota(iton.begin(), iton.end(), 0);
   if (session.dynamic)   // in case of dynamic scheduling, shuffle the range
      std::shuffle(iton.begin(), iton.end(), std::mt19937{std::random_device{}()});

   // [reconstructed] spawn/join assumed to mirror the sum() below
   for (index_t i=0 ; i < n ; ++i)
      workers.push_back (std::thread (mmacc_v_rng, std::ref(c), std::ref(A), std::ref(B),
                                      std::ref(iton), i*A.size()/n, (i+1)*A.size()/n));
   std::for_each(workers.begin(), workers.end(), [](std::thread& t){ t.join(); });
   return c;
}

/*!
 * A partial-sum worker. Accumulates v[begin..end) into \c out_sum.
 */
void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) {
   for (auto i =begin ; i != end ; ++i)
      out_sum += v[i];
}

/*!
 * A parallelized version of sum. Just because ;)
 * \return  The total sum of vector \c v
 */
value_t sum (std::vector<value_t>& v) {
   int n = nworkers();
   std::vector<value_t> sum_v(n, 0);   // result of each do_sum invocation.
   std::vector<std::thread> workers;

   // We spawn the workers statically.
   for (index_t i =0 ; i < n ; ++i)
      workers.push_back (std::thread (do_sum, std::ref(sum_v[i]), std::ref(v),
                                      i*v.size()/n, (i+1)*v.size()/n));
   std::for_each(workers.begin(), workers.end(), [](std::thread& t){ t.join(); });

   // sum the sums (a sum to rule them all)
   value_t s =0;
   for (auto& it : sum_v) s += it;
   return s;
}

#else

//! Return the number of workers.
//! \note This function is here just for completeness.
int nworkers() { return 1; }

/*!
 * Calculate and return a vertex-wise count vector.
 *
 *              1
 *    vector = --- * (A .* (A*B)) * ones_N
 *              2
 *
 * We squeezed all that into one function for performance. The row*column multiplication
 * uses the inner CSC structure of the sparse matrix and follows only the non-zero members,
 * with a time complexity of \$ O(nnz1 + nnz2) \$.
 *
 * \param  A  The first matrix to use.
 * \param  B  The second matrix to use (they can be the same).
 * \return    The count vector. RVO is used here.
 * \note
 *    We use two methods of calculation based on \c --make_symmetric or \c --triangular_only:
 *    - A full matrix calculation
 *    - A lower triangular matrix
 * \warning
 *    The latter (\c --triangular_only) produces correct results ONLY if we are after the total count.
 */
std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
   std::vector<value_t> c(A.size());
   for (int i=0 ; i<A.size() ; ++i) {
      // [reconstructed] same assumed row/column accumulation as in the parallel versions
      c[i] = A.row_col_accumulate(i, B);
      if (session.makeSymmetric)
         c[i] /= 2;
   }
   return c;
}

/*!
 * A plain serial sum.
 * \return  The total sum of vector \c v
 */
value_t sum (std::vector<value_t>& v) {
   value_t s =0;
   for (auto& it : v) s += it;
   return s;
}

#endif

//! Polymorphic interface function for the count vector
std::vector<value_t> triang_v(matrix& A) {
   return mmacc_v(A, A);
}

//! Polymorphic interface function for the sum results
value_t triang_count (std::vector<value_t>& c) {
   return (session.makeSymmetric) ? sum(c)/3 : sum(c);
}

} // namespace v4
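
/*
 * The block below is an optional, self-contained sketch, NOT part of the exercise
 * build: the V4_FORMULA_DEMO guard, the dense 4x4 adjacency matrix and the local
 * main() are ours, added only to illustrate the counting formula documented above.
 * It computes  c = 1/2 * (A .* (A*A)) * ones_N  on a tiny dense graph and shows
 * that sum(c)/3 equals the number of triangles, which is exactly what triang_v()
 * and triang_count() do over the project's sparse CSC `matrix` type, in parallel.
 */
#ifdef V4_FORMULA_DEMO
#include <cstdio>

int main() {
   constexpr int N = 4;
   // 4-vertex graph with exactly two triangles: {0,1,2} and {0,2,3}
   const int A[N][N] = {
      {0,1,1,1},
      {1,0,1,0},
      {1,1,0,1},
      {1,0,1,0}
   };
   double c[N] = {0};
   for (int i=0 ; i<N ; ++i) {
      for (int j=0 ; j<N ; ++j) {
         int paths2 = 0;                     // (A*A)[i][j]: number of 2-step paths i -> k -> j
         for (int k=0 ; k<N ; ++k)
            paths2 += A[i][k]*A[k][j];
         c[i] += A[i][j]*paths2;             // Hadamard product A .* (A*A), summed over row i
      }
      c[i] /= 2;                             // each neighbour pair is counted twice
   }
   double total = 0;
   for (int i=0 ; i<N ; ++i) total += c[i];
   std::printf("per-vertex: %g %g %g %g  triangles: %g\n",
               c[0], c[1], c[2], c[3], total/3);   // prints: 2 1 2 1  triangles: 2
   return 0;
}
#endif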