A triangle-counting assignment for the A.U.TH Parallel and Distributed Systems class.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

244 lines
7.6 KiB

  1. /*!
  2. * \file v3.cpp
  3. * \brief vv3 part of the exercise.
  4. *
  5. * \author
  6. * Christos Choutouridis AEM:8997
  7. * <cchoutou@ece.auth.gr>
  8. */
  9. #include <v3.h>
  10. namespace v3 {
  11. #if defined CILK
  12. /*!
  13. * Utility function to get/set the number of threads.
  14. *
  15. * The number of threads are controlled via environment variable \c CILK_NWORKERS
  16. *
  17. * \return The number of threads used.
  18. * \note
  19. * The user can reduce the number with the command option \c --max_threads.
  20. * If so the requested number will be used even if the environment has more threads available.
  21. */
  22. int nworkers() {
  23. if (session.max_threads)
  24. return (session.max_threads < __cilkrts_get_nworkers()) ?
  25. session.max_threads : __cilkrts_get_nworkers();
  26. else
  27. return __cilkrts_get_nworkers();
  28. }
  29. /*!
  30. * Calculate and return a vertex-wise count vector.
  31. *
  32. * \param A The matrix to use.
  33. * \return The count vector. RVO is used here.
  34. * \note
  35. * We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
  36. * - A full matrix calculation which update only c[i]
  37. * - A lower triangular matrix which update c[i], c[j], c[k]. This is wayyy faster.
  38. */
  39. std::vector<value_t> triang_v(matrix& A) {
  40. std::vector<std::atomic<value_t>> c(A.size()); // atomic for c[j], c[k] only
  41. std::vector<value_t> ret(A.size()); // unrestricted c[i] access
  42. cilk_for (int i=0 ; i<A.size() ; ++i) {
  43. for (auto j = A.getCol(i); j.index() != j.end() ; ++j) {
  44. // j list all the edges with i
  45. for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k) {
  46. // k list all the edges with j
  47. if (A.get(k.index(), i)) {
  48. ++ret[i];
  49. c[j.index()] += (!session.makeSymmetric)? 1:0;
  50. c[k.index()] += (!session.makeSymmetric)? 1:0;
  51. }
  52. }
  53. }
  54. if (session.makeSymmetric) {
  55. ret[i] = ret[i]/2;
  56. c[i] = c[i]/2;
  57. }
  58. }
  59. // merge c to ret and return it
  60. for (index_t i =0 ; i<A.size() ; ++i) ret[i] += c[i];
  61. return ret;
  62. }
  63. /*!
  64. * A sum utility to use as spawn function for parallelized sum.
  65. * \return The sum of \c v from \c begin to \c end.
  66. */
  67. void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) {
  68. for (auto i =begin ; i != end ; ++i)
  69. out_sum += v[i];
  70. }
  71. /*!
  72. * A parallelized version of sum. Just because ;)
  73. * \return The total sum of vector \c v
  74. */
  75. value_t sum (std::vector<value_t>& v) {
  76. int n = nworkers();
  77. std::vector<value_t> sum_v(n, 0); // result of each do_sum invocation.
  78. // We spawn workers in a more statically way.
  79. for (index_t i =0 ; i < n ; ++i) {
  80. cilk_spawn do_sum(sum_v[i], v, i*v.size()/n, (i+1)*v.size()/n);
  81. }
  82. cilk_sync;
  83. // sum the sums (a sum to rule them all)
  84. value_t s =0; for (auto& it : sum_v) s += it;
  85. return s;
  86. }
  87. #elif defined OMP
  88. /*!
  89. * A "simple" user defined OpenMP reduction for vector<value_t>
  90. * \note
  91. * Not used. Reason: The atomic version of the code performs better.
  92. */
  93. #pragma omp declare reduction(vec_value_plus : std::vector<value_t> : \
  94. std::transform( \
  95. omp_out.begin(), omp_out.end(), omp_in.begin(), omp_out.begin(), std::plus<value_t>() \
  96. ) \
  97. ) \
  98. initializer(omp_priv = decltype(omp_orig)(omp_orig.size()))
  99. /*!
  100. * Utility function to get/set the number of threads.
  101. *
  102. * The number of threads are controlled via environment variable \c OMP_NUM_THREADS
  103. *
  104. * \return The number of threads used.
  105. * \note
  106. * The user can reduce the number with the command option \c --max_threads.
  107. * If so the requested number will be used even if the environment has more threads available.
  108. */
  109. int nworkers() {
  110. if (session.max_threads && session.max_threads < (size_t)omp_get_max_threads()) {
  111. omp_set_dynamic(0);
  112. omp_set_num_threads(session.max_threads);
  113. return session.max_threads;
  114. }
  115. else {
  116. omp_set_dynamic(1);
  117. return omp_get_max_threads();
  118. }
  119. }
  120. /*!
  121. * Calculate and return a vertex-wise count vector.
  122. *
  123. * \param A The matrix to use.
  124. * \return The count vector. RVO is used here.
  125. * \note
  126. * We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
  127. * - A full matrix calculation which update only c[i]
  128. * - A lower triangular matrix which update c[i], c[j], c[k]. This is waaayyy faster.
  129. */
  130. std::vector<value_t> triang_v(matrix& A) {
  131. std::vector<std::atomic<value_t>> c(A.size()); // atomic for c[j], c[k] only
  132. std::vector<value_t> ret(A.size()); // unrestricted c[i] access
  133. // OMP schedule selection
  134. if (session.dynamic) omp_set_schedule (omp_sched_dynamic, 0);
  135. else omp_set_schedule (omp_sched_static, 0);
  136. #pragma omp parallel for schedule(runtime) //reduction(vec_value_plus : c)
  137. for (int i=0 ; i<A.size() ; ++i) {
  138. for (auto j = A.getCol(i); j.index() != j.end() ; ++j) {
  139. // j list all the edges with i
  140. for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k) {
  141. // k list all the edges with j
  142. if (A.get(k.index(), i)) {
  143. ++ret[i];
  144. c[j.index()] += (!session.makeSymmetric)? 1:0;
  145. c[k.index()] += (!session.makeSymmetric)? 1:0;
  146. }
  147. }
  148. }
  149. if (session.makeSymmetric) {
  150. ret[i] = ret[i]/2;
  151. c[i] = c[i]/2;
  152. }
  153. }
  154. // merge c to ret and return it
  155. for (index_t i =0 ; i<A.size() ; ++i) ret[i] += c[i];
  156. return ret;
  157. }
  158. /*!
  159. * A parallelized version of sum. Just because ;)
  160. * \return The total sum of vector \c v
  161. */
  162. value_t sum (std::vector<value_t>& v) {
  163. value_t s =0;
  164. #pragma omp parallel for reduction(+:s)
  165. for (auto i =0u ; i<v.size() ; ++i)
  166. s += v[i];
  167. return s;
  168. }
  169. #else
  170. //! Return the number of workers.
  171. //! \note This function is just for completion
  172. int nworkers() { return 1; }
  173. /*!
  174. * Calculate and return a vertex-wise count vector.
  175. *
  176. * \param A The matrix to use.
  177. * \return The count vector. RVO is used here.
  178. * \note
  179. * We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
  180. * - A full matrix calculation which update only c[i]
  181. * - A lower triangular matrix which update c[i], c[j], c[k]. This is waaayyy faster.
  182. */
  183. std::vector<value_t> triang_v(matrix& A) {
  184. std::vector<value_t> c(A.size());
  185. for (int i=0 ; i<A.size() ; ++i) {
  186. for (auto j = A.getCol(i); j.index() != j.end() ; ++j) {
  187. // j list all the edges with i
  188. for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k) {
  189. // k list all the edges with j
  190. if (A.get(k.index(), i)) {
  191. ++c[i];
  192. c[j.index()] += (!session.makeSymmetric)? 1:0;
  193. c[k.index()] += (!session.makeSymmetric)? 1:0;
  194. //^ We set other nodes in case of lower triangular
  195. }
  196. }
  197. }
  198. if (session.makeSymmetric) c[i] /= 2;
  199. //^ We don't have to divide by 2 in case of lower triangular
  200. }
  201. return c;
  202. }
  203. /*!
  204. * Summation functionality.
  205. * \return The total sum of vector \c v
  206. */
  207. value_t sum (std::vector<value_t>& v) {
  208. value_t s =0;
  209. for (auto& it : v)
  210. s += it;
  211. return s;
  212. }
  213. #endif
  214. //! Polymorphic interface function for sum results
  215. value_t triang_count (std::vector<value_t>& c) {
  216. return sum(c)/3;
  217. }
  218. }