// NOTE(review): this copy of the file looks damaged -- confirm against version control before building.
//  - The text is collapsed onto two physical lines, so every `//` comment below now runs to the end
//    of its line and hides all statements that follow it.
//  - Text that was presumably enclosed in angle brackets appears to be missing: the `#include` has no
//    target, `std::vector` / `std::atomic` show up without template arguments (`std::vector> c(A.size())`),
//    and several loops break off right where a `<` comparison would be (`for (int i=0 ; i& v, ...`).
//  - Visible structure: a `#if defined CILK` section, a `#elif defined OMP` section and a further section
//    closed by `#endif` (its `#else` is presumably among the lost text), followed by triang_count(),
//    which returns sum(c)/3.
/*! * \file v3.cpp * \brief vv3 part of the exercise. * * \author * Christos Choutouridis AEM:8997 * */ #include namespace v3 { #if defined CILK /*! * Utility function to get/set the number of threads. * * The number of threads are controlled via environment variable \c CILK_NWORKERS * * \return The number of threads used. * \note * The user can reduce the number with the command option \c --max_threads. * If so the requested number will be used even if the environment has more threads available. */ int nworkers() { if (session.max_threads) return (session.max_threads < __cilkrts_get_nworkers()) ? session.max_threads : __cilkrts_get_nworkers(); else return __cilkrts_get_nworkers(); } /*! * Calculate and return a vertex-wise count vector. * * \param A The matrix to use. * \return The count vector. RVO is used here. * \note * We use two methods of calculation based on \c --make_symmetric or \c --triangular_only * - A full matrix calculation which update only c[i] * - A lower triangular matrix which update c[i], c[j], c[k]. This is wayyy faster. */ std::vector triang_v(matrix& A) { std::vector> c(A.size()); // atomic for c[j], c[k] only std::vector ret(A.size()); // unrestricted c[i] access cilk_for (int i=0 ; i& v, index_t begin, index_t end) { for (auto i =begin ; i != end ; ++i) out_sum += v[i]; } /*! * A parallelized version of sum. Just because ;) * \return The total sum of vector \c v */ value_t sum (std::vector& v) { int n = nworkers(); std::vector sum_v(n, 0); // result of each do_sum invocation. // We spawn workers in a more statically way. for (index_t i =0 ; i < n ; ++i) { cilk_spawn do_sum(sum_v[i], v, i*v.size()/n, (i+1)*v.size()/n); } cilk_sync; // sum the sums (a sum to rule them all) value_t s =0; for (auto& it : sum_v) s += it; return s; } #elif defined OMP /*! * A "simple" user defined OpenMP reduction for vector * \note * Not used. Reason: The atomic version of the code performs better. 
*/ #pragma omp declare reduction(vec_value_plus : std::vector : \ std::transform( \ omp_out.begin(), omp_out.end(), omp_in.begin(), omp_out.begin(), std::plus() \ ) \ ) \ initializer(omp_priv = decltype(omp_orig)(omp_orig.size())) /*! * Utility function to get/set the number of threads. * * The number of threads are controlled via environment variable \c OMP_NUM_THREADS * * \return The number of threads used. * \note * The user can reduce the number with the command option \c --max_threads. * If so the requested number will be used even if the environment has more threads available. */ int nworkers() { if (session.max_threads && session.max_threads < (size_t)omp_get_max_threads()) { omp_set_dynamic(0); omp_set_num_threads(session.max_threads); return session.max_threads; } else { omp_set_dynamic(1); return omp_get_max_threads(); } } /*! * Calculate and return a vertex-wise count vector. * * \param A The matrix to use. * \return The count vector. RVO is used here. * \note * We use two methods of calculation based on \c --make_symmetric or \c --triangular_only * - A full matrix calculation which update only c[i] * - A lower triangular matrix which update c[i], c[j], c[k]. This is waaayyy faster. */ std::vector triang_v(matrix& A) { std::vector> c(A.size()); // atomic for c[j], c[k] only std::vector ret(A.size()); // unrestricted c[i] access // OMP schedule selection if (session.dynamic) omp_set_schedule (omp_sched_dynamic, 0); else omp_set_schedule (omp_sched_static, 0); #pragma omp parallel for schedule(runtime) //reduction(vec_value_plus : c) for (int i=0 ; i& v) { value_t s =0; #pragma omp parallel for reduction(+:s) for (auto i =0u ; i triang_v(matrix& A) { std::vector c(A.size()); for (int i=0 ; i& v) { value_t s =0; for (auto& it : v) s += it; return s; } #endif //! Polymorphic interface function for sum results value_t triang_count (std::vector& c) { return sum(c)/3; } }