A 2nd implementation conforming to vector creation [need optimization]

4年前 · e3b96138c8
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,9 @@
 # project
 resources/
 bin/
 out/
 mat/
 mtx/

 # eclipse
 .project
--- a/+ 61
+++ b/+ 61
@@ -38,9 +38,10 @@ DEP_DIR         := $(BUILD_DIR)/.dep
 # ========== Compiler settings ==========
 # Compiler flags for debug and release
 DEB_CFLAGS      := -DDEBUG -g3 -Wall -Wextra -std=c++14
 REL_CFLAGS      := -Wall -Wextra -O2 -std=c++14
 REL_CFLAGS      := -DDEBUG -g3 -Wall -Wextra -O2 -std=c++14
 # Pre-defines
 # PRE_DEFS := MYCAB=1729 SUPER_MODE
 PRE_DEFS		:=

 # ============== Linker settings ==============
 # Linker flags (example: -pthread -lm)
@@ -105,7 +106,7 @@ DEP     := $(foreach file,$(SRC:%.cpp=%.d),$(DEP_DIR)/$(file))
 # Invoke cpp to create makefile rules with dependencies for each source file
 $(DEP_DIR)/%.d: %.cpp
 	@mkdir -p $(@D)
 	@$(DOCKER) $(CXX) -E $(CFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.cpp=.o) -MF $@ $<
 	$(DOCKER) $(CXX) -E $(CFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.cpp=.o) -MF $@ $<

 # objects depent on .cpp AND dependency files, which have an empty recipe
 $(OBJ_DIR)/%.o: %.cpp $(DEP_DIR)/%.d
@@ -137,29 +138,82 @@ clean:
 	@rm -rf $(DEP_DIR)
 	@rm -rf $(BUILD_DIR)


 #
 # ================ Local build rules =================
 # examples:
 # make debug

 .PHONY: debug
 debug: CFLAGS := $(DEB_CFLAGS)
 debug: $(BUILD_DIR)/$(TARGET)

 .PHONY: release
 release: CFLAGS := $(REL_CFLAGS)
 release: $(BUILD_DIR)/$(TARGET)

 .PHONY: all
 all: release

 local_v3: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V3
 local_v3: TARGET := local_v3
 local_v3: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)

 local_v4: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V4
 local_v4: TARGET := local_v4
 local_v4: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)

 v3: DOCKER := $(DOCKER_CMD)
 v3: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V3
 v3: TARGET := tcount_v3
 v3: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)

 v3_cilk: DOCKER := $(DOCKER_CMD)
 v3_cilk: CXX    := /usr/local/OpenCilk-9.0.1-Linux/bin/clang++
 v3_cilk: CFLAGS := $(REL_CFLAGS) -fcilkplus -DCODE_VERSION=V3 -DCILK
 v3_cilk: LDFLAGS += -fcilkplus
 v3_cilk: TARGET := tcount_cilkv3
 v3_cilk: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)

 v3_omp: DOCKER := $(DOCKER_CMD)
 v3_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=V3 -DOMP
 v3_omp: LDFLAGS += -fopenmp
 v3_omp: TARGET := tcount_ompv3
 v3_omp: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)

 v4: DOCKER := $(DOCKER_CMD)
 v4: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V4
 v4: TARGET := tcount_v4
 v4: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)

 v4_cilk: DOCKER := $(DOCKER_CMD)
 v4_cilk: CXX    := /usr/local/OpenCilk-9.0.1-Linux/bin/clang++
 v4_cilk: CFLAGS := $(REL_CFLAGS) -fcilkplus -DCODE_VERSION=V4 -DCILK
 v4_cilk: LDFLAGS += -fcilkplus
 v4_cilk: TARGET := tcount_cilkv4
 v4_cilk: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)

 v4_omp: DOCKER := $(DOCKER_CMD)
 v4_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=V4 -DOMP
 v4_omp: LDFLAGS += -fopenmp
 v4_omp: TARGET := tcount_ompv4
 v4_omp: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)

 v4_pthreads: DOCKER := $(DOCKER_CMD)
 v4_pthreads: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V4 -DTHREADS
 v4_pthreads: TARGET := tcount_pthv4
 v4_pthreads: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)

 #
 # ================ Docker based rules ================
 # examples:
 # make IMAGE="gcc:8.3" dock
 #
 .PHONY: dock
 dock: DOCKER := $(DOCKER_CMD)
 dock: CFLAGS := $(REL_CFLAGS)
 dock: $(BUILD_DIR)/$(TARGET)
--- a/inc/config.h
+++ b/inc/config.h
@@ -14,14 +14,19 @@
 #include <v3.h>
 #include <v4.h>

 /*
 * Defines for different version of the exercise
 */
 #define  V12   2
 #define  V3    3
 #define  V4    4

 // Fail-safe verision selection
 #if !defined CODE_VERSION
 #define CODE_VERSION   V4
 #endif

 // matrix alias template dispatcher based on pre-define flag from compiler (see Makefile)
 #if   CODE_VERSION == V12
   using namespace v12;
   using matrix = v12::matrix;
--- a/inc/impl.hpp
+++ b/inc/impl.hpp
@@ -168,25 +168,42 @@ Matrix<DataType, IndexType, Type> make_Matrix(IndexType s) {
   return Matrix<DataType, IndexType, Type>(new DataType[Matrix<DataType, IndexType, Type>::capacity(s)], s);
 }

 template<typename DataType, typename IndexType>
 using CooVal = std::tuple<DataType, IndexType, IndexType>;

 /**
 * A simple sparse matrix implementation.
 *
 * We use CSC format and provide get/set functionalities for each (i,j) item
 * on the matrix. We also provide a () overload using a proxy SpMatVal object.
 * This way the user can:
 * \code
 *    auto v = A(3,4);
 *    A(3, 4) = 7;
 * \endcode
 *
 * We also provide getCol() and getRow() functions witch return a viwer/iterator to rows and
 * columns of the matrix. In the case of a symmetric matrix instead of a row we return the
 * equivalent column. This way we gain speed due to CSC format nature.
 *
 * @tparam DataType  The type for values
 * @tparam IndexType The type for indexes
 * @tparam Type      The Matrix type (FULL or SYMMETRIC)
 */
 template<typename DataType, typename IndexType, MatrixType Type>
 struct SpMat {
   using dataType    = DataType;                   //!< meta:export of underling data type
   using indexType   = IndexType;                  //!< meta:export of underling index type
   static constexpr MatrixType matrixType = Type;  //!< export of array type

   friend class SpMatCol<DataType, IndexType>;
   friend class SpMatRow<DataType, IndexType>;
   friend class SpMatVal<DataType, IndexType>;
   friend struct SpMatCol<DataType, IndexType>;
   friend struct SpMatRow<DataType, IndexType>;
   friend struct SpMatVal<DataType, IndexType>;

   /*!
    * \name Obj lifetime
    */
   //! @{

   //! allocation with init value ctor
   //! Default ctor with optional memory allocations
   SpMat(IndexType n=IndexType{}, IndexType nnz=IndexType{}) :
      values(nnz, DataType{}),
      rows(nnz, IndexType{}),
@@ -194,6 +211,7 @@ struct SpMat {
      N(n),
      NNZ(nnz) { }

   //! A ctor using csc array data
   SpMat(IndexType n, IndexType nnz, const IndexType* row, const IndexType* col) :
      values(nnz, 1),
      rows(row, row+nnz),
@@ -201,6 +219,7 @@ struct SpMat {
      N(n),
      NNZ(nnz) { }

   //! ctor using csc array data with value array
   SpMat(IndexType n, IndexType nnz, const DataType* v, const IndexType* row, const IndexType* col) :
      values(v, v+nnz),
      rows(row, row+nnz),
@@ -208,6 +227,7 @@ struct SpMat {
      N(n),
      NNZ(nnz) { }

   //! ctor vectors of row/col and default value for values array
   SpMat(IndexType n, IndexType nnz, const DataType v, const std::vector<IndexType>& row, const std::vector<IndexType>& col) :
      values(nnz, v),
      rows (row),
@@ -219,38 +239,72 @@ struct SpMat {
   SpMat(SpMat&& a) noexcept { moves(std::move(a)); }
   //! move
   SpMat& operator=(SpMat&& a) noexcept { moves(std::move(a)); return *this; }
   SpMat(const SpMat& a)             = delete;  //!< No copy ctor
   SpMat& operator=(const SpMat& a)  = delete;  //!< No copy assignment
   SpMat(const SpMat& a)             = delete;  //!< make sure there are no copies
   SpMat& operator=(const SpMat& a)  = delete;  //!< make sure there are no copies
   //! @}

   //! \name Data exposure
   //! @{

   //! \return the dimension of the matrix
   IndexType size()     noexcept { return N; }
   //! After construction size configuration tool
   IndexType size(IndexType n) {
      col_ptr.resize(n+1);
      return N = n;
   }
   //! \return the NNZ of the matrix
   IndexType capacity() noexcept { return NNZ; }
   //! After construction NNZ size configuration tool
   IndexType capacity(IndexType nnz) {
      values.reserve(nnz);
      rows.reserve(nnz);
      return NNZ;
   }
   // getters for row arrays of the struct (unused)
   std::vector<DataType>&  getValues() noexcept { return values; }
   std::vector<IndexType>& getRows() noexcept { return rows; }
   std::vector<IndexType>& getCols() noexcept { return col_ptr; }

   /*!
    * Return a proxy SpMatVal object with read and write capabilities.
    * @param i    The row number
    * @param j    The column number
    * @return     tHE SpMatVal object
    */
   SpMatVal<DataType, IndexType> operator()(IndexType i, IndexType j) {
      return SpMatVal<DataType, IndexType>(this, get(i, j), i, j);
   }

   /*!
    * A read item functionality using binary search to find the correct row
    *
    * @param i    The row number
    * @param j    The column number
    * @return     The value of the item or DataType{} if is not present.
    */
   DataType get(IndexType i, IndexType j) {
      IndexType idx;
      bool found;
      std::tie(idx, found) =find_idx(rows, col_ptr[j], col_ptr[j+1], i);
      return (found) ? values[idx] : 0;
      IndexType end, idx =find_idx(rows, col_ptr[j], end=col_ptr[j+1], i);
      return (idx != end) ? values[idx] : 0;
   }

   /*!
    * A write item functionality.
    *
    * First we search if the matrix has already a value in (i, j) position.
    * If so we just change it to a new value. If not we add the item on the matrix.
    *
    * @note
    *    We don't increase the NNZ value of the struct. We expect the user has already
    *    change the NNZ value to the right one using @see capacity() function.
    *
    * @param i    The row number
    * @param j    The column number
    * @return     The new value of the item .
    */
   DataType set(DataType v, IndexType i, IndexType j) {
      IndexType idx; bool found;
      std::tie(idx, found) = find_idx(rows, col_ptr[j], col_ptr[j+1], i);
      std::tie(idx, found) = find2_idx(rows, col_ptr[j], col_ptr[j+1], i);
      if (found)
         return values[idx] = v;    // we don't change NNZ even if we write "0"
      else {
@@ -264,34 +318,47 @@ struct SpMat {
      }
   }


   /*!
    * Get a view of a CSC column
    * @param j    The column to get
    * @return     The SpMatCol object @see SpMatCol
    */
   SpMatCol<DataType, IndexType> getCol(IndexType j) {
      return SpMatCol<DataType, IndexType>(this, col_ptr[j], col_ptr[j+1]);
   }

   /*!
    * Get a view of a CSC row
    *
    * In case of a SYMMETRIC matrix we can return a column instead.
    *
    * @param j    The row to get
    * @return     The SpMatCol object @see SpMatCol
    */
   template<MatrixType AT= Type>
   std::enable_if_t<AT==MatrixType::SYMMETRIC, SpMatCol<DataType, IndexType>>
   getRow(IndexType i) {
      return getCol(i);
   }

   /*!
    * Get a view of a CSC row
    *
    * @param j    The row to get
    * @return     The SpMatRow object @see SpMatRow
    */
   template<MatrixType AT= Type>
   std::enable_if_t<AT==MatrixType::FULL, SpMatCol<DataType, IndexType>>
   getRow(IndexType i) {
      return SpMatRow<DataType, IndexType>(this, i);
   }

   // iterator support
   // values only iterator support
   DataType* begin() noexcept { return values.begin(); }
   DataType* end()   noexcept { return values.end(); }
   //! @}

   /*!
    * \name Safe iteration API
    *
    * This api automates the iteration over the array based on
    * MatrixType
    */
   //! @{

   //! A small iteration helper
   template<typename F, typename... Args>
   void for_each_in (IndexType begin, IndexType end, F&& lambda, Args&&... args) {
      for (IndexType it=begin ; it<end ; ++it) {
@@ -299,15 +366,45 @@ struct SpMat {
      }
   }

   //! @}

   // operations
   // friend operations for printing
   template<typename D, typename I, MatrixType T> friend void print(SpMat<D, I, T>& mat);
   template<typename D, typename I, MatrixType T> friend void print_dense(SpMat<D, I, T>& mat);

 private:
   // index-find helper
   std::pair<IndexType, bool> find_idx(const std::vector<IndexType>& v, IndexType begin, IndexType end, IndexType match) {
   /*!
    * A small binary search implementation using index for begin-end instead of iterators.
    *
    * \param   v     Reference to vector to search
    * \param   begin The vector's index to begin
    * \param   end   The vector's index to end
    * \param   match What to search
    * @return        The index of the item or end on failure.
    */
   IndexType find_idx(const std::vector<IndexType>& v, IndexType begin, IndexType end, IndexType match) {
      IndexType b = begin, e = end-1;
      while (true) {
         IndexType m = (b+e)/2;
         if       (v[m] == match)   return  m;
         else if  (b >= e)          return  end;
         else {
            if    (v[m] <  match)   b = m +1;
            else                    e   = m -1;
         }
      }
      return end;
   }
   /*!
    * find helper for set using index for begin-end instead of iterators.
    *
    * We search for the item or a place to add the item.
    * So we return the index if we find it. Otherwise we return the place to add it.
    *
    * \param   v     Reference to vector to search
    * \param   begin The vector's index to begin
    * \param   end   The vector's index to end
    * \param   match What to search
    */
   std::pair<IndexType, bool> find2_idx(const std::vector<IndexType>& v, IndexType begin, IndexType end, IndexType match) {
      for ( ; begin < end ; ++begin) {
         if       (match == v[begin])   return std::make_pair(begin, true);
         else if  (match <  v[begin])   return std::make_pair(begin, false);
@@ -324,38 +421,58 @@ private:
   }
   //! \name Data
   //! @{
   std::vector<DataType>   values {};
   std::vector<IndexType>  rows{};
   std::vector<IndexType>  col_ptr{1,0};
   IndexType   N{0};
   IndexType   NNZ{0};
   std::vector<DataType>   values {};     //!< vector to store the values of the matrix
   std::vector<IndexType>  rows{};        //!< vector to store the row information
   std::vector<IndexType>  col_ptr{1,0};  //!< vector to stor the column pointers
   IndexType   N{0};                      //!< The dimension of the matrix (square)
   IndexType   NNZ{0};                    //!< The NNZ (capacity of the matrix)
   //! @}
 };

 /*!
 * A view/iterator hybrid object for SpMat columns.
 *
 * This object provides access to a column of a SpMat. The public functionalities
 * allow data access using indexes instead of iterators. We prefer indexes over iterators
 * because we can apply the same index to different inner vector of SpMat without conversion.
 *
 * @tparam DataType
 * @tparam IndexType
 */
 template<typename DataType, typename IndexType>
 struct SpMatCol {
   using owner_t = SpMat<DataType, IndexType>;

   /*!
    * ctor using column pointers for begin-end. own is pointer to SpMat.
    */
   SpMatCol(owner_t* own, const IndexType begin, const IndexType end) noexcept :
      owner_(own), index_(begin), begin_(begin), end_(end) {
      vindex_ = vIndexCalc(index_);
   }
   SpMatCol()                          = default;
   SpMatCol(const SpMatCol&)           = delete;
   SpMatCol& operator=(const SpMatCol&)= delete;
   SpMatCol(const SpMatCol&)           = delete;   //!< make sure there are no copies
   SpMatCol& operator=(const SpMatCol&)= delete;   //!< make sure there are no copies
   SpMatCol(SpMatCol&&)                = default;
   SpMatCol& operator=(SpMatCol&&)     = default;

   //! a simple dereference operator, like an iterator
   DataType operator* () {
      return get();
   }
   //! Increment operator acts on index(), like an iterator
   SpMatCol& operator++ ()    { advance(); return *this; }
   SpMatCol& operator++ (int) { SpMatCol& p = *this; advance(); return p; }

   //! () operator acts as member access (like a view)
   DataType operator()(IndexType x) {
      return (x == index())? get() : DataType{};
   }
   //! = operator acts as member assignment (like a view)
   DataType operator= (DataType v) { return owner_->values[index_] = v; }
   // iterator like handlers
   // these return a virtual index value based on the items position on the full matrix
   // but the move of the index is just a ++ away.
   IndexType       index()       noexcept { return vindex_; }
   const IndexType index() const noexcept { return vindex_; }
   IndexType       begin()       noexcept { return vIndexCalc(begin_); }
@@ -363,6 +480,13 @@ struct SpMatCol {
   IndexType       end()         noexcept { return owner_->N; }
   const IndexType end()   const noexcept { return owner_->N; }

   /*!
    * Multiplication operator
    * @tparam C   Universal reference for the type right half site column
    *
    * @param c    The right hand site matrix
    * @return     The value of the inner product of two vectors
    */
   template <typename C>
   DataType operator* (C&& c) {
      static_assert(std::is_same<remove_cvref_t<C>, SpMatCol<DataType, IndexType>>(), "");
@@ -380,25 +504,42 @@ struct SpMatCol {
   }

 private:
   //! small tool to increase the index pointers to SpMat matrix
   void advance() noexcept {
      ++index_;
      vindex_ = vIndexCalc(index_);
   }
   //! tool to translate between col_ptr indexes and SpMat "virtual" full matrix indexes
   IndexType vIndexCalc(IndexType idx) {
      return (idx < end_) ? owner_->rows[idx] : end();
   }
   //! small get tool
   DataType get() { return owner_->values[index_]; }
   owner_t*    owner_{nullptr};
   IndexType   vindex_  {IndexType{}};
   IndexType   index_{IndexType{}};
   IndexType   begin_{IndexType{}};
   IndexType   end_{IndexType{}};

   owner_t*    owner_   {nullptr};     //!< Pointer to owner SpMat. SpMatCol is just a view
   IndexType   vindex_  {IndexType{}}; //!< Virtual index of full matrix
   IndexType   index_   {IndexType{}}; //!< index to SpMat::rows
   IndexType   begin_   {IndexType{}}; //!< beginning index of the column in SpMat::rows
   IndexType   end_     {IndexType{}}; //!< ending index of the column in SpMat::rows
 };

 /*!
 * A view/iterator hybrid object for SpMat rows.
 *
 * This object provides access to a column of a SpMat. The public functionalities
 * allow data access using indexes instead of iterators. We prefer indexes over iterators
 * because we can apply the same index to different inner vector of SpMat without conversion.
 *
 * @tparam DataType
 * @tparam IndexType
 */
 template<typename DataType, typename IndexType>
 struct SpMatRow {
   using owner_t = SpMat<DataType, IndexType>;

   /*!
    * ctor using virtual full matrix row index. own is pointer to SpMat.
    */
   SpMatRow(owner_t* own, const IndexType row) noexcept :
      owner_(own), vindex_(IndexType{}), row_(row), index_(IndexType{}),
      begin_(IndexType{}), end_(owner_->NNZ) {
@@ -410,20 +551,29 @@ struct SpMatRow {
         advance();
   }
   SpMatRow()                          = default;
   SpMatRow(const SpMatRow&)           = delete;
   SpMatRow& operator=(const SpMatRow&)= delete;
   SpMatRow(const SpMatRow&)           = delete;   //!< make sure there are no copies
   SpMatRow& operator=(const SpMatRow&)= delete;   //!< make sure there are no copies
   SpMatRow(SpMatRow&&)                = default;
   SpMatRow& operator=(SpMatRow&&)     = default;

   //! a simple dereference operator, like an iterator
   DataType operator* () {
      return get();
   }
   //! Increment operator acts on index(), like an iterator
   //! here the increment is a O(N) process.
   SpMatRow& operator++ ()    { advance(); return *this; }
   SpMatRow& operator++ (int) { SpMatRow& p = *this; advance(); return p; }

   //! () operator acts as member access (like a view)
   DataType operator()(IndexType x) {
      return (x == index())? get() : DataType{};
   }
   //! = operator acts as member assignment (like a view)
   DataType operator= (DataType v) { return owner_->values[index_] = v; }
   // iterator like handlers
   // these return a virtual index value based on the items position on the full matrix
   // but the move of the index is just a ++ away.
   IndexType       index()       noexcept { return vindex_; }
   const IndexType index() const noexcept { return vindex_; }
   IndexType       begin()       noexcept { return vIndexCalc(begin_); }
@@ -431,6 +581,13 @@ struct SpMatRow {
   IndexType       end()         noexcept { return owner_->N; }
   const IndexType end()   const noexcept { return owner_->N; }

   /*!
    * Multiplication operator
    * @tparam C   Universal reference for the type right half site column
    *
    * @param c    The right hand site matrix
    * @return     The value of the inner product of two vectors
    */
   template <typename C>
   DataType operator* (C&& c) {
      static_assert(std::is_same<remove_cvref_t<C>, SpMatCol<DataType, IndexType>>(), "");
@@ -447,70 +604,96 @@ struct SpMatRow {
      return v;
   }
 private:
   //! small tool to increase the index pointers to SpMat matrix
   //! We have to search the entire rows vector in SpMat to find the next
   //! virtual row position.
   //! time complexity O(N)
   void advance() noexcept {
      do
         ++index_;
      while(index_ != end_ && owner_->rows[index_] != row_);
      vindex_ = vIndexCalc(index_);
   }
   //! tool to translate between col_ptr indexes and SpMat "virtual" full matrix indexes
   IndexType vIndexCalc(IndexType idx) {
      for(IndexType i =0 ; i<(owner_->N+1) ; ++i)
         if (idx < owner_->col_ptr[i])
            return i-1;
      return end();
   }
   //! small get tool
   DataType get() { return owner_->values[index_]; }
   owner_t*    owner_   {nullptr};
   IndexType   vindex_  {IndexType{}};
   IndexType   row_     {IndexType{}};
   IndexType   index_   {IndexType{}};
   IndexType   begin_   {IndexType{}};
   IndexType   end_     {IndexType{}};

   owner_t*    owner_   {nullptr};     //!< Pointer to owner SpMat. SpMatCol is just a view
   IndexType   vindex_  {IndexType{}}; //!< Virtual index of full matrix
   IndexType   row_     {IndexType{}}; //!< The virtual full matrix row of the object
   IndexType   index_   {IndexType{}}; //!< index to SpMat::rows
   IndexType   begin_   {IndexType{}}; //!< beginning index of the column in SpMat::rows
   IndexType   end_     {IndexType{}}; //!< ending index of the column in SpMat::rows
 };

 /*!
 * A proxy SpMat value object/view.
 *
 * This object acts as proxy to provide read/write access to an SpMat item.
 *
 * @tparam DataType     The type of the values of the SpMat matrix
 * @tparam IndexType    The type of the indexes of the SpMat matrix
 */
 template<typename DataType, typename IndexType>
 struct SpMatVal {
   using owner_t = SpMat<DataType, IndexType>;

   //!< ctor using all value-row-column data, plus a pointer to owner SpMat object
   SpMatVal(owner_t* own, DataType v, IndexType i, IndexType j) :
      owner_(own), v_(v), i_(i), j_(j) { }
   SpMatVal()                           = default;
   SpMatVal(const SpMatVal&)            = delete;
   SpMatVal& operator=(const SpMatVal&) = delete;
   SpMatVal(const SpMatVal&)            = delete;  //!< make sure there are no copies
   SpMatVal& operator=(const SpMatVal&) = delete;  //!< make sure there are no copies
   SpMatVal(SpMatVal&&)                 = default;
   SpMatVal& operator=(SpMatVal&&)      = default;

   //! Operator to return the DataType value implicitly
   operator DataType() { return v_; }
   //! Operator to write back to owner the assigned value
   //! for ex: A(2,3) = 5;
   SpMatVal& operator=(DataType v) {
      v_ = v;
      owner_->set(v_, i_, j_);
      return *this;
   }
 private:
   owner_t*    owner_{nullptr};;
   DataType    v_{DataType{}};
   IndexType   i_{IndexType{}};
   IndexType   j_{IndexType{}};
   owner_t*    owner_{nullptr};  //!< Pointer to owner SpMat. SpMatVal is just a view.
   DataType    v_{DataType{}};   //!< The value of the row-column pair (for speed)
   IndexType   i_{IndexType{}};  //!< The row
   IndexType   j_{IndexType{}};  //!< the column
 };


 enum class InputMatrix{
   GENERATE,
   MTX
 };
 //! enumerator for input matrix type.
 enum class InputMatrix{ UNSPECIFIED, GENERATE, MTX };
 //! enumerator for output handling
 enum class OutputMode{ STD, FILE };

 /*!
 * Session option for each invocation of the executable
 */
 struct session_t {
   std::size_t    size           {0};
   double         probability    {0};
   InputMatrix    inputMatrix    {InputMatrix::GENERATE};
   std::ifstream  mtxFile        {};
   std::size_t    print_size     {80};
   bool           timing         {false};
   bool           print          {false};
   bool           makeSymmetric  {false};
   InputMatrix    inputMatrix    {InputMatrix::UNSPECIFIED};   //!< Source of the matrix
   std::ifstream  mtxFile        {};                           //!< matrix file in MatrixMarket format
   std::size_t    gen_size       {};                           //!< size of the matrix if we select to generate a random one
   double         gen_prob       {};                           //!< probability of the binomial distribution for the matrix
                                                               //!< if we generate one
   OutputMode     outputMode     {OutputMode::STD};            //!< Type of the output file
   std::ofstream  outFile        {};                           //!< File to use for output
   std::size_t    max_threads    {};                           //!< Maximum threads to use
   bool           timing         {false};                      //!< Enable timing prints of the program
   bool           verbose        {false};                      //!< Flag to enable verbose output to stdout
   bool           makeSymmetric  {true};                       //!< symmetric matrix creation flag (true by default)
   bool           validate_mtx   {false};                      //!< Flag to request mtx input data triangular validation.
   bool           print_count    {false};                      //!< Flag to request total count printing
   bool           mtx_print      {false};                      //!< matrix print flag
   std::size_t    mtx_print_size {};                           //!< matrix print size
 };

 extern session_t session;
--- a/inc/utils.h
+++ b/inc/utils.h
@@ -17,6 +17,10 @@
 #include <impl.hpp>
 #include <config.h>

 /*!
 * A small RAII utility to memory allocation arrays.
 * @tparam T   The type of pointer for the memory
 */
 template <typename T>
 struct buffer_t {
   buffer_t(size_t s) { p = new T[s]; }
@@ -34,8 +38,14 @@ private:
   T* p{nullptr};
 };

 /*!
 * A toolbox for MatrixMarket format handling
 */
 struct Mtx {

   /*!
    * A template version of the coo2csc function provided by PDS lab stuff.
    */
   template<typename I>
   static void coo2csc(I *row, I *col, I const* row_coo, I const* col_coo, I nnz, I n, I isOneBased) {
      // ----- cannot assume that input is already 0!
@@ -70,6 +80,12 @@ struct Mtx {
      }
   }

   /*!
    * Utility to check if a matrix input is strictly triangular or not.
    * @tparam I   Index type
    * @param file Reference to input file stream
    * @return     The status of the operation
    */
   template<typename I>
   static bool is_triangular (std::ifstream& file) {
      std::string line, token;
@@ -105,6 +121,14 @@ struct Mtx {
      return true;
   }

   /*!
    * A utility to load an MatrixMarket file to memory
    * @tparam DataT     The data type
    * @tparam IndexT    The indexes type
    * @param M          Reference to matrix for output
    * @param file       Reference to input file stream to read from
    * @return           The status of the operation
    */
   template<typename DataT, typename IndexT, MatrixType MatrixT>
   static bool load (SpMat<DataT, IndexT, MatrixT>& M, std::ifstream& file) {
      std::string line, token;
@@ -139,60 +163,119 @@ struct Mtx {
            if (line[0] == '%')  continue;
            IndexT i, j;
            ss >> i >> j;
            if (session.makeSymmetric) {
               if (LU == Z) {
                  LU = (i<j) ? UPPER: LOWER;
               }
               if ((LU==LOWER && j<i) || (LU==UPPER && i<j)) {
                  coo_row[cnt]   = i;
                  coo_col[cnt++] = j;
                  coo_row[cnt]   = j;
                  coo_col[cnt++] = i;
               }
            if (LU == Z) {
               LU = (i<j) ? UPPER: LOWER;
            }
            else {
            // ignore all values outside the triangle area
            if ((LU==LOWER && j<i) || (LU==UPPER && i<j)) {
               coo_row[cnt]   = i;
               coo_col[cnt++] = j;
               if (session.makeSymmetric) {
                  coo_row[cnt]   = j;
                  coo_col[cnt++] = i;
               }
            }
            break;
         }
      }
      coo2csc(&row[0], &col[0], &coo_row[0], &coo_col[0], cnt, n1, 1);
      M = SpMat<DataT, IndexT, MatrixT>(n1, cnt, &row[0], &col[0]);
      return true;
      if (cnt) {
         // convert and construct
         coo2csc(&row[0], &col[0], &coo_row[0], &coo_col[0], cnt, n1, 1);
         M = SpMat<DataT, IndexT, MatrixT>(n1, cnt, &row[0], &col[0]);
         return true;
      }
      return false;
   }
 };

 /*!
 * A small timing utility based on chrono.
 */
 struct Timing{
   using Tpoint = std::chrono::steady_clock::time_point;
   using microseconds = std::chrono::microseconds;
   using milliseconds = std::chrono::milliseconds;
   using seconds = std::chrono::seconds;


   //! tool to mark the starting point
   Tpoint start () noexcept { return start_ = std::chrono::steady_clock::now(); }
   //! tool to mark the ending point
   Tpoint stop ()  noexcept { return stop_ = std::chrono::steady_clock::now(); }

   auto dt () noexcept {
      return std::chrono::duration_cast<std::chrono::microseconds>(stop_ - start_).count();
   }
   void print_dt () noexcept {
      auto t = stop_ - start_;
      if       (std::chrono::duration_cast<microseconds>(t).count() < 10000)
         std::cout << "time: " << std::to_string(std::chrono::duration_cast<microseconds>(t).count()) << " [usec]\n";
      else if (std::chrono::duration_cast<milliseconds>(t).count() < 10000)
         std::cout << "time: " << std::to_string(std::chrono::duration_cast<milliseconds>(t).count()) << " [msec]\n";
      else
         std::cout << "time: " << std::to_string(std::chrono::duration_cast<seconds>(t).count()) << " [sec]\n";

   //! tool to print the time interval
   void print_dt (const char* what) noexcept {
      if (session.timing) {
         auto t = stop_ - start_;
         if       (std::chrono::duration_cast<microseconds>(t).count() < 10000)
            std::cout << "[Timing]: " << what << ": " << std::to_string(std::chrono::duration_cast<microseconds>(t).count()) << " [usec]\n";
         else if (std::chrono::duration_cast<milliseconds>(t).count() < 10000)
            std::cout << "[Timing]: " << what << ": " << std::to_string(std::chrono::duration_cast<milliseconds>(t).count()) << " [msec]\n";
         else
            std::cout << "[Timing]: " << what << ": " << std::to_string(std::chrono::duration_cast<seconds>(t).count()) << " [sec]\n";
      }
   }
 private:
   Tpoint start_;
   Tpoint stop_;
 };

 /*!
 * A Logger for entire programm.
 */
 struct Log {
   struct Endl {} endl;    //!< a tag objec to to use it as a new line request.

   //! We provide logging via << operator
   template<typename T>
   Log& operator<< (T&& t) {
      if (session.verbose) {
         if (line_) {
            std::cout << "[Log]: " << t;
            line_ = false;
         }
         else
            std::cout << t;
      }
      return *this;
   }
   // overload for special end line handling
   Log& operator<< (Endl e) { (void)e;
      if (session.verbose) {
         std::cout << '\n';
         line_ = true;
      }
      return *this;
   }
 private:
   bool line_ {true};
 };

 extern Log logger;

 //! Total count result printing function
 template<typename F>
 void triangle_out (value_t s, F&& f) {
   f << "Total triangles: " << s << '\n';
 }

 //! vector out result printing function
 template<typename F>
 void vector_out (std::vector<value_t>& v, F&& f) {
   size_t idx{};
   f << "id,c3\n";
   for (auto& it : v) f << idx++ <<',' << it << '\n';
   f << '\n';
 }

 /*
 * Public non-template api.
 * We use matrix alias template. So it has to be defined somewhere
 */
 void init_ER_graph (matrix& A, double p);
 void print_ER_graph (matrix& A);
 void print_graph (matrix& A);
 void threads_info ();

 #endif /* UTILS_H_ */
--- a/inc/v3.h
+++ b/inc/v3.h
@@ -9,13 +9,39 @@
 #ifndef V3_H_
 #define V3_H_

 #include <iostream>
 #include <mutex>
 #include <impl.hpp>

 #if defined CILK
 #include <cilk/cilk.h>
 #include <cilk/cilk_api.h>
 #include <cilk/reducer_opadd.h>

 #elif defined OMP
 #include <omp.h>

 #elif defined THREADS
 #include <thread>

 #else
 #endif

 namespace v3 {

 //! Select a data representation suited for V3.
 using matrix = SpMat<int, int>;

 int triang_count (matrix& A);

 using index_t  = typename matrix::indexType; //!< syntactic sugar alias for index type
 using value_t  = typename matrix::dataType;  //!< syntactic sugar alias for value type

 /*
 *  Common api for all the versions
 */
 int nworkers();
 std::vector<value_t> triang_v(matrix& A);
 value_t triang_count (std::vector<value_t>& c);

 };
 #endif /* V3_H_ */
--- a/inc/v4.h
+++ b/inc/v4.h
@@ -9,13 +9,40 @@
 #ifndef V4_H_
 #define V4_H_

 #include <iostream>
 #include <string>
 #include <mutex>
 #include <impl.hpp>

 #if defined CILK
 #include <cilk/cilk.h>
 #include <cilk/cilk_api.h>
 #include <cilk/reducer_opadd.h>

 #elif defined OMP
 #include <omp.h>

 #elif defined THREADS
 #include <thread>

 #else
 #endif

 namespace v4 {

 //! Select a data representation suited for V3.
 using matrix = SpMat<int, int>;

 int triang_count (matrix& A);

 using index_t  = typename matrix::indexType; //!< syntactic sugar alias for index type
 using value_t  = typename matrix::dataType;  //!< syntactic sugar alias for value type

 /*
 *  Common api for all the versions
 */
 int nworkers();
 std::vector<value_t> triang_v(matrix& A);
 value_t triang_count (std::vector<value_t>& c);

 };
 #endif /* V4_H_ */
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -16,6 +16,7 @@

 // Global session data
 session_t session;
 Log       logger;

 /*!
 * A small command line argument parser
@@ -35,74 +36,123 @@ bool get_options(int argc, char* argv[]){
         else
            status = false;
      }
      else if (arg == "-g" || arg == "--generate")
      else if (arg == "-o" || arg == "--output") {
         session.outputMode = OutputMode::FILE;
         if (i+1 < argc)
            session.outFile = std::ofstream(argv[++i]);
         else
            status = false;
      }
      else if (arg == "-g" || arg == "--generate") {
         session.inputMatrix = InputMatrix::GENERATE;
      else if (arg == "-s" || arg == "--size")
         session.size = (i+1 < argc) ? std::atoi(argv[++i]) : session.size;
      else if (arg == "-p" || arg == "--probability")
         session.probability = (i+1 < argc) ? std::atof(argv[++i]) : session.probability;
      else if (arg == "--print") {
         session.print = true;
         session.print_size = (i+1 < argc) ? std::atoi(argv[++i]) : session.print_size;
         if (i+2 < argc) {
            session.gen_size = std::atoi(argv[++i]);
            session.gen_prob = std::atof(argv[++i]);
         }
         else
            status = false;
      }
      else if (arg == "-n" || arg == "--max_trheads") {
         session.max_threads = (i+1 < argc) ? std::atoi(argv[++i]) : session.max_threads;
      }
      else if (arg == "--make_symmetric")
         session.makeSymmetric = true;
      else if (arg == "-t" || arg == "--timing")
         session.timing = true;
      else if (arg == "-v" || arg == "--verbose")
         session.verbose = true;
      else if (arg == "--triangular_only")
         session.makeSymmetric = false;
      else if (arg == "--validate_mtx")
         session.validate_mtx = true;
      else if (arg == "--print_count")
         session.print_count = true;
      else if (arg == "--print_graph") {
         session.mtx_print = true;
         session.mtx_print_size = (i+1 < argc) ? std::atoi(argv[++i]) : session.mtx_print_size;
      }
      else if (arg == "-h" || arg == "--help") {
         std::cout << "Help message\n";
         exit(0);
      }
      else {
      else {   // parse error
         std::cout << "Error message\n";
         status = false;
      }
   }

   // Input checkers
   if (session.inputMatrix == InputMatrix::UNSPECIFIED) {
      std::cout << "Error message\n";
      status = false;
   }
   return status;
 }


 int main(int argc, char* argv[]) try {
   Timing timer;
   matrix A;

   // try to read command line
   if (!get_options(argc, argv))
      exit(1);

   // get or generate matrix
 /*!
 * get or generate matrix
 * \param   A     Reference to matrix for output (move using RVO)
 * \param   timer Reference to timer utility to access time printing functionality
 */
 void prepare_matrix (matrix& A, Timing& timer) {
   if (session.inputMatrix == InputMatrix::GENERATE) {
      std::cout << "Initialize matrix with size: " << session.size << " and probability: " << session.probability << '\n';
      logger << "Initialize matrix with size: " << session.gen_size << " and probability: " << session.gen_prob << logger.endl;
      timer.start();
      A.size(session.size);
      init_ER_graph(A, session.probability);
      A.size(session.gen_size);
      init_ER_graph(A, session.gen_prob);
      timer.stop();
      if (session.timing)  timer.print_dt();
      timer.print_dt("generate matrix");
   }
   else {
      std::cout << "Read matrix from file\n";
      logger << "Read matrix from file" << logger.endl;
      timer.start();
      if (session.makeSymmetric && !Mtx::is_triangular<matrix::indexType> (session.mtxFile))
      if (session.validate_mtx && !Mtx::is_triangular<matrix::indexType> (session.mtxFile))
         throw std::runtime_error("Error: Matrix is not strictly upper or lower");
      if (!Mtx::load (A, session.mtxFile)) {
         throw std::runtime_error("Error: fail to load matrix");
      }
      timer.stop();
      std::cout << "Matrix size: " << A.size() << " and capacity: " << A.capacity() <<'\n';
      if (session.timing)  timer.print_dt();
      logger << "Matrix size: " << A.size() << " and capacity: " << A.capacity() << logger.endl;
      timer.print_dt("load matrix");
   }

   if (session.print) {
      std::cout << "Array A:\n";
      print_ER_graph (A);
   if (session.verbose && session.mtx_print) {
      logger << "\nMatrix:" << logger.endl;
      print_graph (A);
   }
 }

 /*
 * main program
 */
 int main(int argc, char* argv[]) try {
   Timing timer;
   matrix A;
   std::vector<value_t> c;
   index_t s;

   std::cout << "count triangles\n";
   // try to read command line
   if (!get_options(argc, argv))
      exit(1);

   prepare_matrix(A, timer);
   threads_info();
   logger << "Create count vector" << logger.endl;
   timer.start();
   std::cout << "There are " << triang_count(A) << " triangles\n";
   c = triang_v (A);
   timer.stop();
   if (session.timing)  timer.print_dt();

   timer.print_dt("create count vector");
   if (session.print_count) {
      logger << "Calculate total triangles" << logger.endl;
      timer.start();
      s = triang_count(c);
      timer.stop();
      logger << "There are " << s << " triangles" << logger.endl;
      timer.print_dt("calculate sum");
   }
   // output results
   if (session.print_count)
      triangle_out (s, (session.outputMode == OutputMode::FILE) ? session.outFile : std::cout);
   else
      vector_out (c, (session.outputMode == OutputMode::FILE) ? session.outFile : std::cout);
 	return 0;
 }
 catch (std::exception& e) {
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -40,8 +40,8 @@ void init_ER_graph (matrix& A, double p) {
 /*!
 * Utility to print the graph to sdtout
 */
 void print_ER_graph (matrix& A) {
   matrix::indexType N = (A.size() < (matrix::indexType)session.print_size) ? A.size() : session.print_size;
 void print_graph (matrix& A) {
   matrix::indexType N = (A.size() < (matrix::indexType)session.mtx_print_size) ? A.size() : session.mtx_print_size;

   A.for_each_in(0, N, [&](auto i){
      A.for_each_in(0, N, [&](auto j) {
@@ -51,3 +51,20 @@ void print_ER_graph (matrix& A) {
   });
 }

 /*!
 * Utility to print to logger thread information
 */
 void threads_info () {
 #if defined CILK
   logger << "Running with max threads: " << __cilkrts_get_nworkers() << logger.endl;
   logger << "Utilizing " << __cilkrts_get_nworkers() << " threads for calculating vector." << logger.endl;
   logger << "Utilizing " << nworkers() << " threads for calculating sum." << logger.endl;
 #elif defined OMP
   logger << "Running with max threads: " << nworkers() << logger.endl;
 #elif defined THREADS
   logger << "Running with max threads: " << nworkers() << logger.endl;
 #else
   logger << "Running the serial version of the algorithm." << logger.endl;
 #endif
 }

--- a/src/v3.cpp
+++ b/src/v3.cpp
@@ -6,30 +6,128 @@
 *    Christos Choutouridis AEM:8997
 *    <cchoutou@ece.auth.gr>
 */

 #include <iostream>
 #include <random>
 #include <v3.h>

 //   for (int i=0 ; i<A.size() ; ++i) {
 //      for (int j = A.col_ptr[i]; j<A.col_ptr[i+1] ; ++j) {
 //         int j_idx = A.rows[j];
 //         for (int k = A.col_ptr[j_idx] ; k<A.col_ptr[j_idx+1] ; ++k) {
 //            int k_idx = A.rows[k];
 //            if (A.get(k_idx, i)) {
 //               ++c[i];
 //            }
 //         }
 //      }
 //   }

 namespace v3 {

 using index_t  = typename matrix::indexType;
 using value_t  = typename matrix::dataType;
 #if defined CILK

 /*!
 * A naive triangle counting algorithm
 * \param A    The adjacency matrix
 * \return     The number of triangles
 // export CILK_NWORKERS=<num>
 int nworkers() {
   if (session.max_threads)
      return (session.max_threads < __cilkrts_get_nworkers()) ?
            session.max_threads : __cilkrts_get_nworkers();
   else
      return __cilkrts_get_nworkers();
 }

 std::vector<value_t> triang_v(matrix& A) {
   std::vector<value_t> c(A.size());

   cilk_for (int i=0 ; i<A.size() ; ++i) {
      for (auto j = A.getCol(i); j.index() != j.end() ; ++j)               // j list all the edges with i
         for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k)    // k list all the edges with j
            if (A.get(k.index(), i))                                       // search for i-k edge
               ++c[i];
   }
   if (session.makeSymmetric)
      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
         return x/2;
      });
   return c;
 }

 void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) {
   for (auto i =begin ; i != end ; ++i)
      out_sum += v[i];
 }

 value_t sum (std::vector<value_t>& v) {
   int n = nworkers();
   std::vector<value_t> sum_v(n, 0);

   for (index_t i =0 ; i < n ; ++i) {
      cilk_spawn do_sum(sum_v[i], v, i*v.size()/n, (i+1)*v.size()/n);
   }
   cilk_sync;

   value_t s =0;
   for (auto& it : sum_v) s += it;
   return s;
 }

 #elif defined OMP

 /*
 // export OMP_NUM_THREADS=<num>
 */
 int nworkers() {
   if (session.max_threads && session.max_threads < (size_t)omp_get_max_threads()) {
      omp_set_dynamic(0);
      omp_set_num_threads(session.max_threads);
      return session.max_threads;
   }
   else {
      omp_set_dynamic(1);
      return omp_get_max_threads();
   }
 }

 std::vector<value_t> triang_v(matrix& A) {
   std::vector<value_t> c(A.size());

   #pragma omp parallel for shared(c)
   for (int i=0 ; i<A.size() ; ++i) {
      for (auto j = A.getCol(i); j.index() != j.end() ; ++j)               // j list all the edges with i
         for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k)    // k list all the edges with j
            if (A.get(k.index(), i))                                       // search for i-k edge
               ++c[i];
   }
   if (session.makeSymmetric)
      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
         return x/2;
      });
   return c;
 }

 value_t sum (std::vector<value_t>& v) {
   value_t s =0;

   #pragma omp parallel for reduction(+:s)
   for (auto i =0u ; i<v.size() ; ++i)
      s += v[i];
   return s;
 }

 #else

 int nworkers() { return 1; }

 std::vector<value_t> triang_v(matrix& A) {
   std::vector<value_t> c(A.size());

   for (int i=0 ; i<A.size() ; ++i) {
      for (auto j = A.getCol(i); j.index() != j.end() ; ++j)               // j list all the edges with i
         for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k)    // k list all the edges with j
            for (auto ii = A.getCol(i) ; ii.index() <= k.index() ; ++ii)   // search for i-k edge (this could be binary search)
               if (ii.index() == k.index())  ++c[i];
            if (A.get(k.index(), i))                                       // search for i-k edge
               ++c[i];
   }
   if (session.makeSymmetric)
      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
         return x/2;
      });
   return c;
 }

@@ -40,9 +138,11 @@ value_t sum (std::vector<value_t>& v) {
   return s;
 }

 value_t triang_count (matrix& A) {
   auto v = triang_v(A);
   return sum(v);
 #endif


 value_t triang_count (std::vector<value_t>& c) {
   return (session.makeSymmetric) ? sum(c)/3 : sum(c);
 }

 }
--- a/src/v4.cpp
+++ b/src/v4.cpp
@@ -6,27 +6,176 @@
 *    Christos Choutouridis AEM:8997
 *    <cchoutou@ece.auth.gr>
 */

 #include <iostream>
 #include <random>
 #include <v4.h>

 namespace v4 {

 using index_t  = typename matrix::indexType;
 using value_t  = typename matrix::dataType;
 #if defined CILK

 // export CILK_NWORKERS=<num>
 int nworkers() {
   if (session.max_threads)
      return (session.max_threads < __cilkrts_get_nworkers()) ?
            session.max_threads : __cilkrts_get_nworkers();
   else
      return __cilkrts_get_nworkers();
 }

 std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
   std::vector<value_t> c(A.size());
   for (int i=0 ; i<A.size() ; ++i) {

   cilk_for (int i=0 ; i<A.size() ; ++i) {
      for (auto j = A.getRow(i); j.index() != j.end() ; ++j){
         c[i] += A.getRow(i)*B.getCol(j.index());
      }
   }
   if (session.makeSymmetric)
      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
         return x/2;
      });
   return c;
 }

 void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) {
   for (auto i =begin ; i != end ; ++i)
      out_sum += v[i];
 }

 value_t sum (std::vector<value_t>& v) {
   int n = nworkers();
   std::vector<value_t> sum_v(n, 0);

   for (index_t i =0 ; i < n ; ++i) {
      cilk_spawn do_sum(sum_v[i], v, i*v.size()/n, (i+1)*v.size()/n);
   }
   cilk_sync;

   value_t s =0;
   for (auto& it : sum_v) s += it;
   return s;
 }

 #elif defined OMP

 /*
 // export OMP_NUM_THREADS=<num>
 */
 int nworkers() {
   if (session.max_threads && session.max_threads < (size_t)omp_get_max_threads()) {
      omp_set_dynamic(0);
      omp_set_num_threads(session.max_threads);
      return session.max_threads;
   }
   else {
      omp_set_dynamic(1);
      return omp_get_max_threads();
   }
 }

 std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
   std::vector<value_t> c(A.size());

   #pragma omp parallel for shared(c)
   for (int i=0 ; i<A.size() ; ++i) {
      for (auto j = A.getRow(i); j.index() != j.end() ; ++j) {
         c[i] += A.getRow(i)*B.getCol(j.index());
      }
   }
   if (session.makeSymmetric)
      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
         return x/2;
      });
   return c;
 }

 value_t sum (std::vector<value_t>& v) {
   value_t s =0;

   #pragma omp parallel for reduction(+:s)
   for (auto i =0u ; i<v.size() ; ++i)
      s += v[i];
   return s;
 }

 #elif defined THREADS

 /*
 * std::thread::hardware_concurrency()
 */
 int nworkers() {
   if (session.max_threads)
      return (session.max_threads < std::thread::hardware_concurrency()) ?
            session.max_threads : std::thread::hardware_concurrency();
   else
      return std::thread::hardware_concurrency();
 }

 std::vector<value_t> mmacc_v_rng(std::vector<value_t>& out, matrix& A, matrix& B, index_t begin, index_t end) {
   for (index_t i=begin ; i<end ; ++i) {
      for (auto j = A.getRow(i); j.index() != j.end() ; ++j){
         out[i] += A.getRow(i)*B.getCol(j.index());
      }
   }
   return out;
 }

 std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
   std::vector<std::thread> workers;
   std::vector<value_t> c(A.size());
   int n = nworkers();

   for (index_t i=0 ; i<n ; ++i)
      workers.push_back (std::thread (mmacc_v_rng, std::ref(c), std::ref(A), std::ref(B), i*c.size()/n, (i+1)*c.size()/n));

   std::for_each(workers.begin(), workers.end(), [](std::thread& t){
      t.join();
   });
   if (session.makeSymmetric)
      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
         return x/2;
      });
   return c;
 }

 void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) {
   for (auto i =begin ; i != end ; ++i)
      out_sum += v[i];
 }

 value_t sum (std::vector<value_t>& v) {
   int n = nworkers();
   std::vector<value_t> sum_v(n, 0);
   std::vector<std::thread> workers;

   for (index_t i =0 ; i < n ; ++i)
      workers.push_back (std::thread (do_sum, std::ref(sum_v[i]), std::ref(v), i*v.size()/n, (i+1)*v.size()/n));

   std::for_each(workers.begin(), workers.end(), [](std::thread& t){
      t.join();
   });

   value_t s =0;
   for (auto& it : sum_v) s += it;
   return s;
 }

 #else

 int nworkers() { return 1; }

 std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
   std::vector<value_t> c(A.size());
   for (int i=0 ; i<A.size() ; ++i) {
      for (auto j = A.getRow(i); j.index() != j.end() ; ++j){
         c[i] += A.getRow(i)*B.getCol(j.index());
      }
   }
   if (session.makeSymmetric)
      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
         return x/2;
      });
   return c;
 }

 value_t sum (std::vector<value_t>& v) {
   value_t s =0;
@@ -35,9 +184,14 @@ value_t sum (std::vector<value_t>& v) {
   return s;
 }

 value_t triang_count (matrix& A) {
   auto v = mmacc_v(A, A);
   return (session.makeSymmetric) ? sum(v)/6 : sum(v);
 #endif

 std::vector<value_t> triang_v(matrix& A) {
   return mmacc_v(A, A);
 }

 value_t triang_count (std::vector<value_t>& c) {
   return (session.makeSymmetric) ? sum(c)/3 : sum(c);
 }

 }  // namespace v4