A measurement ready version

2020-12-04 19:12:13 +02:00 · 2020-12-04 19:12:13 +02:00 · 8b5112dc2a
commit 8b5112dc2a
parent fd8a3a2bc3
28 changed files with 10166 additions and 112 deletions
--- a/.gitignore
+++ b/.gitignore
@ -5,6 +5,10 @@ out/
 mat/
 mtx/

+# hpc related
+exclude
+hpc_auth_sync.sh
+
 # eclipse
 .project
 .cproject
--- a/77
+++ b/77
@ -38,7 +38,7 @@ DEP_DIR         := $(BUILD_DIR)/.dep
 # ========== Compiler settings ==========
 # Compiler flags for debug and release
 DEB_CFLAGS      := -DDEBUG -g3 -Wall -Wextra -std=c++14
-REL_CFLAGS      := -DDEBUG -g3 -Wall -Wextra -O2 -std=c++14
+REL_CFLAGS      := -Wall -Wextra -O3 -std=c++14
 # Pre-defines
 # PRE_DEFS := MYCAB=1729 SUPER_MODE
 PRE_DEFS		:=
@ -151,39 +151,50 @@ release: $(BUILD_DIR)/$(TARGET)

 all: release

-local_v3: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V3
+local_v3: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=3
 local_v3: TARGET := local_v3
 local_v3: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)

-local_v4: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V4
+local_v4: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=4
 local_v4: TARGET := local_v4
 local_v4: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)

+# Debug build with -DELEARNING defined (main() then only runs the e-learning
+# self-test and exits). The binary is copied to out/elearn.
+elearn: CFLAGS := $(DEB_CFLAGS) -DELEARNING
+elearn: TARGET := elearn
+elearn: $(BUILD_DIR)/$(TARGET)
+	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
+	
+# Local release-flags build of version 4 with -pg passed to both the compile
+# and the link step (profiling instrumentation). Copied to out/local_v4_opt.
+local_v4_opt: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=4 -pg
+local_v4_opt: LDFLAGS += -pg
+local_v4_opt: TARGET := local_v4_opt
+local_v4_opt: $(BUILD_DIR)/$(TARGET)
+	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
+
 v3: DOCKER := $(DOCKER_CMD)
-v3: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V3
+v3: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=3
 v3: TARGET := tcount_v3
 v3: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)

 v3_cilk: DOCKER := $(DOCKER_CMD)
 v3_cilk: CXX    := /usr/local/OpenCilk-9.0.1-Linux/bin/clang++
-v3_cilk: CFLAGS := $(REL_CFLAGS) -fcilkplus -DCODE_VERSION=V3 -DCILK
+v3_cilk: CFLAGS := $(REL_CFLAGS) -fcilkplus -DCODE_VERSION=3 -DCILK
 v3_cilk: LDFLAGS += -fcilkplus
 v3_cilk: TARGET := tcount_cilkv3
 v3_cilk: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)

 v3_omp: DOCKER := $(DOCKER_CMD)
-v3_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=V3 -DOMP
+v3_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=3 -DOMP
 v3_omp: LDFLAGS += -fopenmp
 v3_omp: TARGET := tcount_ompv3
 v3_omp: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)

 v4: DOCKER := $(DOCKER_CMD)
-v4: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V4
+v4: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=4
 v4: TARGET := tcount_v4
 v4: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
@ -197,25 +208,61 @@ v4_cilk: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)

 v4_omp: DOCKER := $(DOCKER_CMD)
-v4_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=V4 -DOMP
+v4_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=4 -DOMP
 v4_omp: LDFLAGS += -fopenmp
 v4_omp: TARGET := tcount_ompv4
 v4_omp: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)

 v4_pthreads: DOCKER := $(DOCKER_CMD)
-v4_pthreads: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V4 -DTHREADS
+v4_pthreads: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=4 -DTHREADS
 v4_pthreads: TARGET := tcount_pthv4
 v4_pthreads: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)

+
 #
-# ================ Docker based rules ================
-# examples:
-# make IMAGE="gcc:8.3" dock
+# ================ hpc build rules =================
 #
-dock: DOCKER := $(DOCKER_CMD)
-dock: CFLAGS := $(REL_CFLAGS)
-dock: $(BUILD_DIR)/$(TARGET)
+# Serial version 3, release flags, no DOCKER wrapper. Output: out/hpc_v3
+hpc_v3_ser: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=3
+hpc_v3_ser: TARGET := hpc_v3
+hpc_v3_ser: $(BUILD_DIR)/$(TARGET)
+	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
+
+# OpenMP version 3 (-fopenmp on compile and link, -DOMP). Output: out/hpc_ompv3
+hpc_v3_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=3 -DOMP
+hpc_v3_omp: LDFLAGS += -fopenmp
+hpc_v3_omp: TARGET := hpc_ompv3
+hpc_v3_omp: $(BUILD_DIR)/$(TARGET)
+	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
+
+# Cilk version 3 (-fcilkplus on compile and link, -DCILK), built with the
+# clang++ found in PATH. Output: out/hpc_cilkv3
+hpc_v3_cilk: CXX    := clang++
+hpc_v3_cilk: CFLAGS := $(REL_CFLAGS) -fcilkplus -DCODE_VERSION=3 -DCILK
+hpc_v3_cilk: LDFLAGS += -fcilkplus
+hpc_v3_cilk: TARGET := hpc_cilkv3
+hpc_v3_cilk: $(BUILD_DIR)/$(TARGET)
+	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
+
+# Serial version 4, release flags, no DOCKER wrapper. Output: out/hpc_v4
+hpc_v4_ser: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=4
+hpc_v4_ser: TARGET := hpc_v4
+hpc_v4_ser: $(BUILD_DIR)/$(TARGET)
+	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
+
+# OpenMP version 4 (-fopenmp on compile and link, -DOMP). Output: out/hpc_ompv4
+hpc_v4_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=4 -DOMP
+hpc_v4_omp: LDFLAGS += -fopenmp
+hpc_v4_omp: TARGET := hpc_ompv4
+hpc_v4_omp: $(BUILD_DIR)/$(TARGET)
+	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
+
+# Cilk version 4 (-fcilkplus on compile and link, -DCILK), built with the
+# clang++ found in PATH. Output: out/hpc_cilkv4
+hpc_v4_cilk: CXX    := clang++
+hpc_v4_cilk: CFLAGS := $(REL_CFLAGS) -fcilkplus -DCODE_VERSION=4 -DCILK
+hpc_v4_cilk: LDFLAGS += -fcilkplus
+hpc_v4_cilk: TARGET := hpc_cilkv4
+hpc_v4_cilk: $(BUILD_DIR)/$(TARGET)
+	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
+
+# Threads version 4 (-DTHREADS). Output: out/hpc_pthv4
+hpc_v4_pth: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=4 -DTHREADS
+hpc_v4_pth: TARGET := hpc_pthv4
+hpc_v4_pth: $(BUILD_DIR)/$(TARGET)
+	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)


--- a/hpc-results/ntasks1.out
+++ b/hpc-results/ntasks1.out
--- a/hpc-results/ntasks1.sh
+++ b/hpc-results/ntasks1.sh
@ -0,0 +1,19 @@
+#! /usr/bin/env bash
+#
+# Slurm batch job: run the whole benchmark set on one node with 1 task.
+# Relative paths (./runall.sh, mtx/...) are resolved against the job's
+# working directory. Job output is written to ntasks1.out.
+
+#SBATCH --time=20:00
+#SBATCH --partition=batch
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=ntasks1.out
+
+module load gcc/9.2.0 openmpi/3.1.6
+# Export explicitly so the benchmark binaries always inherit the extra library
+# directory, and do not create an empty leading entry when the variable is unset.
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$HOME/lib"
+
+# One OpenMP thread / Cilk worker per allocated Slurm task.
+export OMP_NUM_THREADS=$SLURM_NTASKS
+export CILK_NWORKERS=$SLURM_NTASKS
+
+# runall.sh <matrix file> <repetitions>: every matrix is measured 8 times.
+./runall.sh mtx/belgium_osm.mtx 8
+./runall.sh mtx/com-Youtube.mtx 8
+./runall.sh mtx/dblp-2010.mtx 8
+./runall.sh mtx/mycielskian13.mtx 8
+./runall.sh mtx/NACA0015.mtx 8
--- a/hpc-results/ntasks10.out
+++ b/hpc-results/ntasks10.out
--- a/hpc-results/ntasks10.sh
+++ b/hpc-results/ntasks10.sh
@ -0,0 +1,19 @@
+#! /usr/bin/env bash
+#
+# Slurm batch job: run the whole benchmark set on one node with 10 tasks.
+# Relative paths (./runall.sh, mtx/...) are resolved against the job's
+# working directory. Job output is written to ntasks10.out.
+
+#SBATCH --time=20:00
+#SBATCH --partition=batch
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=10
+#SBATCH --output=ntasks10.out
+
+module load gcc/9.2.0 openmpi/3.1.6
+# Export explicitly so the benchmark binaries always inherit the extra library
+# directory, and do not create an empty leading entry when the variable is unset.
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$HOME/lib"
+
+# One OpenMP thread / Cilk worker per allocated Slurm task.
+export OMP_NUM_THREADS=$SLURM_NTASKS
+export CILK_NWORKERS=$SLURM_NTASKS
+
+# runall.sh <matrix file> <repetitions>: every matrix is measured 8 times.
+./runall.sh mtx/belgium_osm.mtx 8
+./runall.sh mtx/com-Youtube.mtx 8
+./runall.sh mtx/dblp-2010.mtx 8
+./runall.sh mtx/mycielskian13.mtx 8
+./runall.sh mtx/NACA0015.mtx 8
--- a/hpc-results/ntasks15.out
+++ b/hpc-results/ntasks15.out
--- a/hpc-results/ntasks15.sh
+++ b/hpc-results/ntasks15.sh
@ -0,0 +1,19 @@
+#! /usr/bin/env bash
+#
+# Slurm batch job: run the whole benchmark set on one node with 15 tasks.
+# Relative paths (./runall.sh, mtx/...) are resolved against the job's
+# working directory. Job output is written to ntasks15.out.
+
+#SBATCH --time=20:00
+#SBATCH --partition=batch
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=15
+#SBATCH --output=ntasks15.out
+
+module load gcc/9.2.0 openmpi/3.1.6
+# Export explicitly so the benchmark binaries always inherit the extra library
+# directory, and do not create an empty leading entry when the variable is unset.
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$HOME/lib"
+
+# One OpenMP thread / Cilk worker per allocated Slurm task.
+export OMP_NUM_THREADS=$SLURM_NTASKS
+export CILK_NWORKERS=$SLURM_NTASKS
+
+# runall.sh <matrix file> <repetitions>: every matrix is measured 8 times.
+./runall.sh mtx/belgium_osm.mtx 8
+./runall.sh mtx/com-Youtube.mtx 8
+./runall.sh mtx/dblp-2010.mtx 8
+./runall.sh mtx/mycielskian13.mtx 8
+./runall.sh mtx/NACA0015.mtx 8
--- a/hpc-results/ntasks2.out
+++ b/hpc-results/ntasks2.out
--- a/hpc-results/ntasks2.sh
+++ b/hpc-results/ntasks2.sh
@ -0,0 +1,19 @@
+#! /usr/bin/env bash
+#
+# Slurm batch job: run the whole benchmark set on one node with 2 tasks.
+# Relative paths (./runall.sh, mtx/...) are resolved against the job's
+# working directory. Job output is written to ntasks2.out.
+
+#SBATCH --time=20:00
+#SBATCH --partition=batch
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=2
+#SBATCH --output=ntasks2.out
+
+module load gcc/9.2.0 openmpi/3.1.6
+# Export explicitly so the benchmark binaries always inherit the extra library
+# directory, and do not create an empty leading entry when the variable is unset.
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$HOME/lib"
+
+# One OpenMP thread / Cilk worker per allocated Slurm task.
+export OMP_NUM_THREADS=$SLURM_NTASKS
+export CILK_NWORKERS=$SLURM_NTASKS
+
+# runall.sh <matrix file> <repetitions>: every matrix is measured 8 times.
+./runall.sh mtx/belgium_osm.mtx 8
+./runall.sh mtx/com-Youtube.mtx 8
+./runall.sh mtx/dblp-2010.mtx 8
+./runall.sh mtx/mycielskian13.mtx 8
+./runall.sh mtx/NACA0015.mtx 8
--- a/hpc-results/ntasks20.out
+++ b/hpc-results/ntasks20.out
--- a/hpc-results/ntasks20.sh
+++ b/hpc-results/ntasks20.sh
@ -0,0 +1,19 @@
+#! /usr/bin/env bash
+#
+# Slurm batch job: run the whole benchmark set on one node with 20 tasks.
+# Relative paths (./runall.sh, mtx/...) are resolved against the job's
+# working directory. Job output is written to ntasks20.out.
+
+#SBATCH --time=20:00
+#SBATCH --partition=batch
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=20
+#SBATCH --output=ntasks20.out
+
+module load gcc/9.2.0 openmpi/3.1.6
+# Export explicitly so the benchmark binaries always inherit the extra library
+# directory, and do not create an empty leading entry when the variable is unset.
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$HOME/lib"
+
+# One OpenMP thread / Cilk worker per allocated Slurm task.
+export OMP_NUM_THREADS=$SLURM_NTASKS
+export CILK_NWORKERS=$SLURM_NTASKS
+
+# runall.sh <matrix file> <repetitions>: every matrix is measured 8 times.
+./runall.sh mtx/belgium_osm.mtx 8
+./runall.sh mtx/com-Youtube.mtx 8
+./runall.sh mtx/dblp-2010.mtx 8
+./runall.sh mtx/mycielskian13.mtx 8
+./runall.sh mtx/NACA0015.mtx 8
--- a/hpc-results/ntasks4.out
+++ b/hpc-results/ntasks4.out
--- a/hpc-results/ntasks4.sh
+++ b/hpc-results/ntasks4.sh
@ -0,0 +1,19 @@
+#! /usr/bin/env bash
+#
+# Slurm batch job: run the whole benchmark set on one node with 4 tasks.
+# Relative paths (./runall.sh, mtx/...) are resolved against the job's
+# working directory. Job output is written to ntasks4.out.
+
+#SBATCH --time=20:00
+#SBATCH --partition=batch
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=4
+#SBATCH --output=ntasks4.out
+
+module load gcc/9.2.0 openmpi/3.1.6
+# Export explicitly so the benchmark binaries always inherit the extra library
+# directory, and do not create an empty leading entry when the variable is unset.
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$HOME/lib"
+
+# One OpenMP thread / Cilk worker per allocated Slurm task.
+export OMP_NUM_THREADS=$SLURM_NTASKS
+export CILK_NWORKERS=$SLURM_NTASKS
+
+# runall.sh <matrix file> <repetitions>: every matrix is measured 8 times.
+./runall.sh mtx/belgium_osm.mtx 8
+./runall.sh mtx/com-Youtube.mtx 8
+./runall.sh mtx/dblp-2010.mtx 8
+./runall.sh mtx/mycielskian13.mtx 8
+./runall.sh mtx/NACA0015.mtx 8
--- a/hpc-results/ntasks5.out
+++ b/hpc-results/ntasks5.out
--- a/hpc-results/ntasks5.sh
+++ b/hpc-results/ntasks5.sh
@ -0,0 +1,19 @@
+#! /usr/bin/env bash
+#
+# Slurm batch job: run the whole benchmark set on one node with 5 tasks.
+# Relative paths (./runall.sh, mtx/...) are resolved against the job's
+# working directory. Job output is written to ntasks5.out.
+
+#SBATCH --time=20:00
+#SBATCH --partition=batch
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=5
+#SBATCH --output=ntasks5.out
+
+module load gcc/9.2.0 openmpi/3.1.6
+# Export explicitly so the benchmark binaries always inherit the extra library
+# directory, and do not create an empty leading entry when the variable is unset.
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$HOME/lib"
+
+# One OpenMP thread / Cilk worker per allocated Slurm task.
+export OMP_NUM_THREADS=$SLURM_NTASKS
+export CILK_NWORKERS=$SLURM_NTASKS
+
+# runall.sh <matrix file> <repetitions>: every matrix is measured 8 times.
+./runall.sh mtx/belgium_osm.mtx 8
+./runall.sh mtx/com-Youtube.mtx 8
+./runall.sh mtx/dblp-2010.mtx 8
+./runall.sh mtx/mycielskian13.mtx 8
+./runall.sh mtx/NACA0015.mtx 8
--- a/hpc-results/ntasks8.out
+++ b/hpc-results/ntasks8.out
--- a/hpc-results/ntasks8.sh
+++ b/hpc-results/ntasks8.sh
@ -0,0 +1,19 @@
+#! /usr/bin/env bash
+#
+# Slurm batch job: run the whole benchmark set on one node with 8 tasks.
+# Relative paths (./runall.sh, mtx/...) are resolved against the job's
+# working directory. Job output is written to ntasks8.out.
+
+#SBATCH --time=20:00
+#SBATCH --partition=batch
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=8
+#SBATCH --output=ntasks8.out
+
+module load gcc/9.2.0 openmpi/3.1.6
+# Export explicitly so the benchmark binaries always inherit the extra library
+# directory, and do not create an empty leading entry when the variable is unset.
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$HOME/lib"
+
+# One OpenMP thread / Cilk worker per allocated Slurm task.
+export OMP_NUM_THREADS=$SLURM_NTASKS
+export CILK_NWORKERS=$SLURM_NTASKS
+
+# runall.sh <matrix file> <repetitions>: every matrix is measured 8 times.
+./runall.sh mtx/belgium_osm.mtx 8
+./runall.sh mtx/com-Youtube.mtx 8
+./runall.sh mtx/dblp-2010.mtx 8
+./runall.sh mtx/mycielskian13.mtx 8
+./runall.sh mtx/NACA0015.mtx 8
--- a/inc/config.h
+++ b/inc/config.h
@ -13,6 +13,7 @@
 #include <v12.h>
 #include <v3.h>
 #include <v4.h>
+#include <elearn.h>

 /*
 * Defines for different version of the exercise
--- a/inc/elearn.h
+++ b/inc/elearn.h
@ -0,0 +1,17 @@
+/*!
+ * \file    elearn.h
+ * \brief   e-learning version of the exercise.
+ *
+ * \author
+ *    Christos Choutouridis AEM:8997
+ *    <cchoutou@ece.auth.gr>
+ */
+#ifndef ELEARN_H_
+#define ELEARN_H_
+
+#include <impl.hpp>
+
+/*!
+ * Self-test of the e-learning implementation.
+ * \return  1 when the computed counts match the reference vector, 0 otherwise.
+ */
+uint32_t elearn_test (void) ;
+
+
+#endif /* ELEARN_H_ */
--- a/inc/impl.hpp
+++ b/inc/impl.hpp
@ -284,8 +284,22 @@ struct SpMat {
    * @return     The value of the item or DataType{} if is not present.
    */
   DataType get(IndexType i, IndexType j) {
-      IndexType end, idx =find_idx(rows, col_ptr[j], end=col_ptr[j+1], i);
-      return (idx != end) ? values[idx] : 0;
+      IndexType idx; bool found;
+      std::tie(idx, found) =find_idx(rows, col_ptr[j], col_ptr[j+1], i);
+      return (found) ? values[idx] : 0;
+   }
+
+   /*!
+    * A read item functionality, same contract as get() but delegating the
+    * row look-up to find2_idx() instead of find_idx().
+    *
+    * @param i    The row number
+    * @param j    The column number
+    * @return     The value of the item or 0 (converted to DataType) if is not present.
+    * @note       NOTE(review): find2_idx() is not visible from here; the original
+    *             comment described it as a binary search - confirm.
+    */
+   DataType get2(IndexType i, IndexType j) {
+      IndexType idx; bool found;
+      // Search only column j's slice of rows: [col_ptr[j], col_ptr[j+1])
+      std::tie(idx, found) =find2_idx(rows, col_ptr[j], col_ptr[j+1], i);
+      return (found) ? values[idx] : 0;
   }

   /*!
@ -380,18 +394,18 @@ private:
    * \param   match What to search
    * @return        The index of the item or end on failure.
    */
-   IndexType find_idx(const std::vector<IndexType>& v, IndexType begin, IndexType end, IndexType match) {
+   std::pair<IndexType, bool> find_idx(const std::vector<IndexType>& v, IndexType begin, IndexType end, IndexType match) {
      IndexType b = begin, e = end-1;
      while (true) {
         IndexType m = (b+e)/2;
-         if       (v[m] == match)   return  m;
-         else if  (b >= e)          return  end;
+         if       (v[m] == match)   return  std::make_pair(m, true);
+         else if  (b >= e)          return  std::make_pair(end, false);
         else {
            if    (v[m] <  match)   b = m +1;
            else                    e   = m -1;
         }
      }
-      return end;
+      return std::make_pair(end, false);;
   }
   /*!
    * find helper for set using index for begin-end instead of iterators.
@ -687,13 +701,19 @@ struct session_t {
   OutputMode     outputMode     {OutputMode::STD};            //!< Type of the output file
   std::ofstream  outFile        {};                           //!< File to use for output
   std::size_t    max_threads    {};                           //!< Maximum threads to use
+   std::size_t    repeat         {1};                          //!< How many times we execute the calculations part
   bool           timing         {false};                      //!< Enable timing prints of the program
   bool           verbose        {false};                      //!< Flag to enable verbose output to stdout
+#if CODE_VERSION == 3
+   bool           makeSymmetric  {false};                      //!< symmetric matrix creation flag (true by default)
+#else
   bool           makeSymmetric  {true};                       //!< symmetric matrix creation flag (true by default)
+#endif
   bool           validate_mtx   {false};                      //!< Flag to request mtx input data triangular validation.
   bool           print_count    {false};                      //!< Flag to request total count printing
   bool           mtx_print      {false};                      //!< matrix print flag
   std::size_t    mtx_print_size {};                           //!< matrix print size
+   bool           dynamic        {false};                      //!< Selects dynamic scheduling for OpenMP and pthreads.
 };

 extern session_t session;
--- a/inc/v3.h
+++ b/inc/v3.h
@ -11,6 +11,7 @@

 #include <iostream>
 #include <mutex>
+#include <atomic>
 #include <impl.hpp>

 #if defined CILK
--- a/inc/v4.h
+++ b/inc/v4.h
@ -24,6 +24,8 @@

 #elif defined THREADS
 #include <thread>
+#include <numeric>
+#include <random>

 #else
 #endif
--- a/runall.sh
+++ b/runall.sh
@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+#
+# Run every executable found in out/ against one or more matrix files.
+#
+# usage: runall.sh <mtx file>... <iterations>
+#
+# For each executable and matrix two runs are made (output to /dev/null and
+# total triangle count). Executables listed in `dynamics` are additionally
+# run with --dynamic.
+
+if [[ $# -lt 2 ]]; then
+   echo "Error: You must pass the matrix files and the number of iterations"
+   echo "example $ runall.sh mtx/s12.mtx 5"
+   exit 1
+fi
+
+# The last argument is the repeat count, everything before it is a matrix file.
+reps="${@: -1}"
+files=("${@:1:$#-1}")
+
+# Executables that are also measured with the --dynamic option.
+dynamics=("out/hpc_ompv3" "out/hpc_ompv4" "out/hpc_pthv4")
+
+# Succeeds when $1 is one of the executables listed in `dynamics`.
+supports_dynamic() {
+   local candidate
+   for candidate in "${dynamics[@]}"; do
+      [[ $1 == "$candidate" ]] && return 0
+   done
+   return 1
+}
+
+# Echo the command line and then execute it. No eval: arguments are passed
+# verbatim, so paths with spaces or shell metacharacters are safe.
+run() {
+   echo "running $*"
+   "$@"
+}
+
+for ex in out/*; do
+   # Skip anything that is not a runnable file (also covers an empty out/,
+   # where the glob stays unexpanded).
+   [[ -f $ex && -x $ex ]] || continue
+
+   echo "-------------------------------------------"
+   echo "executable: $ex"
+   for file in "${files[@]}"; do
+      run "$ex" -i "$file" -r "$reps" --timing -o /dev/null
+      run "$ex" -i "$file" -r "$reps" --timing --print_count
+
+      if supports_dynamic "$ex"; then
+         run "$ex" -i "$file" -r "$reps" --timing -o /dev/null --dynamic
+         run "$ex" -i "$file" -r "$reps" --timing --print_count --dynamic
+      fi
+   done
+done
+
--- a/src/elearn.cpp
+++ b/src/elearn.cpp
@ -0,0 +1,126 @@
+/*!
+ * \file    elearn.cpp
+ * \brief   e-learning version of the exercise.
+ *
+ * \author
+ *    Christos Choutouridis AEM:8997
+ *    <cchoutou@ece.auth.gr>
+ */
+#include <elearn.h>
+
+//------- e-learning code start ---------
+
+//! Credits to PDS team
+/*!
+ * Convert a matrix from COO to CSC form.
+ *
+ * \param row        [out] row index of every non-zero, grouped by column (nnz items)
+ * \param col        [out] column pointers (n+1 items); col[n] is set to nnz
+ * \param row_coo    COO row indices (nnz items)
+ * \param col_coo    COO column indices (nnz items)
+ * \param nnz        number of non-zero items
+ * \param n          number of columns
+ * \param isOneBased value subtracted from every input index (1 for one-based input)
+ */
+static void coo2csc_e(
+      uint32_t *row, uint32_t *col, uint32_t const* row_coo, uint32_t const* col_coo, uint32_t nnz, uint32_t n, uint32_t isOneBased
+      ) {
+   // Step 0: the output is not assumed to be cleared, zero all n+1 pointers
+   for (uint32_t slot = n+1; slot-- > 0; )
+      col[slot] = 0;
+
+   // Step 1: histogram - how many items live in each column
+   for (uint32_t item = 0; item != nnz; ++item)
+      ++col[col_coo[item] - isOneBased];
+
+   // Step 2: exclusive prefix sum turns the counts into start offsets
+   uint32_t running = 0;
+   for (uint32_t c = 0; c < n; ++c) {
+      const uint32_t count = col[c];
+      col[c]   = running;
+      running += count;
+   }
+   col[n] = nnz;
+
+   // Step 3: scatter the row indices; col[c] is used as a write cursor here
+   for (uint32_t item = 0; item != nnz; ++item) {
+      const uint32_t c = col_coo[item] - isOneBased;
+      row[col[c]] = row_coo[item] - isOneBased;
+      ++col[c];
+   }
+
+   // Step 4: every cursor now sits at the start of the next column,
+   //         shift them one place to the right to restore the start offsets
+   uint32_t previous = 0;
+   for (uint32_t c = 0; c < n; ++c) {
+      const uint32_t cursor = col[c];
+      col[c]   = previous;
+      previous = cursor;
+   }
+}
+
+/*!
+ * A small binary search utility over the half-open range [begin, end) of \c v.
+ *
+ * \param v       The array to search. The searched range must be sorted ascending.
+ * \param begin   Index of the first item of the range
+ * \param end     Index one past the last item of the range
+ * \param match   The value to search for
+ * \return        The index of \c match, or \c end when it is not present
+ *                (including when the range is empty).
+ *
+ * \note
+ *    The bounds are kept half-open so no unsigned index is ever decremented
+ *    below \c begin. This avoids wrap-around at index 0 and guarantees that
+ *    no item outside [begin, end) is read or reported as a match.
+ */
+uint32_t find_idx(const uint32_t* v, uint32_t begin, uint32_t end, uint32_t match) {
+   uint32_t lo = begin, hi = end;
+   while (lo < hi) {
+      uint32_t mid = lo + (hi - lo)/2;    // overflow-safe midpoint
+      if       (v[mid] == match)   return  mid;
+      else if  (v[mid] <  match)   lo = mid +1;
+      else                         hi = mid;
+   }
+   return end;
+}
+
+/*!
+ * Sparse matrix item accessor for a pattern-only CSC matrix.
+ *
+ * \param R    CSC row indices
+ * \param C    CSC column pointers
+ * \param i    The row number
+ * \param j    The column number
+ * \return     1 if item (i, j) is stored in the matrix, 0 otherwise
+ */
+uint32_t get(uint32_t* R, uint32_t* C, uint32_t i, uint32_t j) {
+   const uint32_t first = C[j];
+   const uint32_t last  = C[j+1];
+   // find_idx reports "not found" by returning the end of the searched range
+   if (find_idx(R, first, last, i) == last)
+      return 0;
+   return 1;
+}
+
+/*!
+ * Calculate the vertex-wise triangle count vector of a graph given in COO form.
+ *
+ * \param coo_row    pointer to coo row data (one-based indices)
+ * \param coo_col    pointer to coo_column data (one-based indices)
+ * \param n          the size of matrix
+ * \param nz         the number of non-zero items
+ * \return           The vertex-wise count vector (n items), or a null pointer if
+ *                   memory allocation fails. The caller owns the vector and
+ *                   must release it with free().
+ * \note
+ *    The look-up of each closing edge uses a binary search, so the row indices
+ *    of each column must be in ascending order in the input.
+ *    NOTE(review): every match increments all three vertices, which presumably
+ *    expects a triangular (row > col) input as in elearn_test() - confirm.
+ */
+uint32_t* vertexWiseTriangleCounts (uint32_t *coo_row, uint32_t *coo_col, uint32_t n, uint32_t nz) {
+   // The counters are incremented in place, so they must start from zero (calloc).
+   // At least one item is requested so a null pointer always means failure.
+   uint32_t* v = (uint32_t*)calloc(n ? n : 1, sizeof(uint32_t));
+   uint32_t* R = (uint32_t*)malloc(sizeof(uint32_t)*(nz ? nz : 1));
+   // n+1 column pointers (the multiplication covers the whole n+1, not just n)
+   uint32_t* C = (uint32_t*)malloc(sizeof(uint32_t)*(static_cast<size_t>(n)+1));
+
+   if (!v || !R || !C) {
+      free(v);
+      free(R);
+      free(C);
+      return nullptr;
+   }
+
+   // convert input
+   coo2csc_e (R, C, coo_row, coo_col, nz, n, 1);
+
+   for (uint32_t i=0 ; i<n ; ++i) {
+      for (uint32_t j = C[i]; j<C[i+1] ; ++j) {
+         // j lists all the edges of i
+         uint32_t j_idx = R[j];
+         for (uint32_t k = C[j_idx] ; k<C[j_idx+1] ; ++k) {
+            // k lists all the edges of j, search for the closing i-k edge
+            uint32_t k_idx = R[k];
+            if (get(R, C, k_idx, i)) {
+               ++v[i];
+               ++v[j_idx];
+               ++v[k_idx];
+            }
+         }
+      }
+   }
+
+   // The CSC work buffers are not needed any more, only v is handed to the caller.
+   free(R);
+   free(C);
+   return v;
+}
+
+//------- e-learning code end ---------
+
+/*!
+ * A unit-test like functionality to check our implementation.
+ *
+ * Runs vertexWiseTriangleCounts() on a fixed 12-vertex graph with 19 edges and
+ * compares the result against the reference vector \c c3.
+ *
+ * \return  1 on success (all counts match), 0 on failure.
+ */
+uint32_t elearn_test (void) {
+   uint32_t CooR[] = { 2, 4, 6, 7, 3, 5, 6, 8, 11, 12, 4, 11, 12, 7, 6, 7, 9, 10, 12};
+   uint32_t CooC[] = { 1, 1, 1, 1, 2, 2, 2, 2,  2,  2, 3,  3,  3, 4, 5, 6, 8,  8, 11};
+   uint32_t    N   = 12;
+   uint32_t   NZ   = 19;
+   uint32_t   c3[] = { 3, 5, 3, 1, 1, 3, 2, 0, 0, 0, 3, 3 };
+
+   uint32_t* tc3 = vertexWiseTriangleCounts(CooR, CooC, N, NZ);   // call
+   if (!tc3)
+      return 0;      // no result to validate: fail
+
+   uint32_t pass = 1;
+   for (uint32_t i=0 ; i<N ; ++i) {                               // validate
+      if (tc3[i] != c3[i]) {
+         pass = 0;   // fail
+         break;
+      }
+   }
+   free(tc3);        // the result vector is owned by us (allocated by the callee)
+   return pass;
+}
+
+
--- a/src/main.cpp
+++ b/src/main.cpp
@ -55,26 +55,72 @@ bool get_options(int argc, char* argv[]){
      else if (arg == "-n" || arg == "--max_trheads") {
         session.max_threads = (i+1 < argc) ? std::atoi(argv[++i]) : session.max_threads;
      }
+      else if (arg == "-r" || arg == "--repeat") {
+         session.repeat = (i+1 < argc) ? std::atoi(argv[++i]) : session.repeat;
+      }
      else if (arg == "-t" || arg == "--timing")
         session.timing = true;
      else if (arg == "-v" || arg == "--verbose")
         session.verbose = true;
+      else if (arg == "--make_symmetric")
+         session.makeSymmetric = true;
      else if (arg == "--triangular_only")
         session.makeSymmetric = false;
      else if (arg == "--validate_mtx")
         session.validate_mtx = true;
-      else if (arg == "--print_count")
+      else if (arg == "--dynamic")
+         session.dynamic = true;
+      else if (arg == "--print_count") {
         session.print_count = true;
+         session.makeSymmetric = false;
+      }
      else if (arg == "--print_graph") {
         session.mtx_print = true;
         session.mtx_print_size = (i+1 < argc) ? std::atoi(argv[++i]) : session.mtx_print_size;
      }
      else if (arg == "-h" || arg == "--help") {
-         std::cout << "Help message\n";
+         std::cout << "vertex-wise triangular count utility.\n\n";
+         std::cout << "tcount -i <file> | -g <size> <probability> [-o <file>] [-n <threads>] [--dynamic] [-r <times>] [-t] [-v]\n";
+         std::cout << "        [--make_symmetric] [--triangular_only] [--print_count] [--validate_mtx] [--print_graph <size>]\n";
+         std::cout << '\n';
+         std::cout << "Options:\n\n";
+         std::cout << "   -i | --input <file>\n";
+         std::cout << "      Path to mtx file to load.\n\n";
+         std::cout << "   -g | --generate <size> <probability>\n";
+         std::cout << "      Request a random generated graph with size <size> and probability <probability>.\n";
+         std::cout << "      This is very slow, use it with care.\n\n";
+         std::cout << "   -o | --output <file>\n";
+         std::cout << "      Select <file> as output file. Default is stdout.\n\n";
+         std::cout << "   -n | --max_trheads <threads>\n";
+         std::cout << "      Reduce the thread number for the execution to <threads>. <threads> must be less or equal to available CPUs.\n\n";
+         std::cout << "   --dynamic\n";
+         std::cout << "      Request of dynamic scheduling for OpenMP and pthreads. Does not affect cilk versions.\n\n";
+         std::cout << "   -r | --repeat <times>\n";
+         std::cout << "      Repeat the vector calculation <times> times.\n\n";
+         std::cout << "   -t | --timing\n";
+         std::cout << "      Request timing measurements output to stdout.\n\n";
+         std::cout << "   -v | --verbose\n";
+         std::cout << "      Request a more verbose output to stdout.\n\n";
+         std::cout << "   --make_symmetric\n";
+         std::cout << "      Explicitly request a symmetric graph generation. This affects only V3 versions where by default a lower\n";
+         std::cout << "      triangular matrix is used.\n\n";
+         std::cout << "   --triangular_only\n";
+         std::cout << "      NOTE: Requires also \"--print_count\".\n";
+         std::cout << "      Explicitly request to use a lower triangular matrix. This affects only V4 versions where a symmetric\n";
+         std::cout << "      matrix is used by default and produce correct answer ONLY for total triangle counting (--print_count).\n\n";
+         std::cout << "   --print_count\n";
+         std::cout << "      NOTE: When used, also implies \"---triangular_only\" for all versions.\n";
+         std::cout << "      Request a total triangle counting output.\n\n";
+         std::cout << "   --validate_mtx\n";
+         std::cout << "      Request an input matrix validation before execution.\n\n";
+         std::cout << "   --print_graph <size>\n";
+         std::cout << "      Prints the first <size> x <size> part of the matrix to stdout.\n\n";
+         std::cout << "   -h | --help <size>\n";
+         std::cout << "      Prints this and exit.\n";
         exit(0);
      }
      else {   // parse error
-         std::cout << "Error message\n";
+         std::cout << "Invokation error. Try -h for details.\n";
         status = false;
      }
   }
@ -84,6 +130,12 @@ bool get_options(int argc, char* argv[]){
      std::cout << "Error message\n";
      status = false;
   }
+#if CODE_VERSION == V4
+   else if (!session.makeSymmetric && !session.print_count) {
+      std::cout << "\"--triangular_only\" requires \"--print_count\"\n";
+      status = false;
+   }
+#endif
   return status;
 }

@ -129,17 +181,26 @@ int main(int argc, char* argv[]) try {
   std::vector<value_t> c;
   index_t s;

+   #if defined ELEARNING
+   if (!elearn_test())  std::cout << "E-learning test: FAIL\n";
+   else                 std::cout << "E-learning test: PASS\n";
+   exit(0);
+   #endif
+
   // try to read command line
   if (!get_options(argc, argv))
      exit(1);

   prepare_matrix(A, timer);
   threads_info();
-   logger << "Create count vector" << logger.endl;
+   for (size_t i =0 ; i<session.repeat ; ++i) {
+      // repeat calculations as requested by user
+      logger << "Create vector" << logger.endl;
      timer.start();
      c = triang_v (A);
      timer.stop();
-   timer.print_dt("create count vector");
+      timer.print_dt("create vector");
+   }
   if (session.print_count) {
      logger << "Calculate total triangles" << logger.endl;
      timer.start();
--- a/src/v3.cpp
+++ b/src/v3.cpp
@ -8,23 +8,20 @@
 */
 #include <v3.h>

-//   for (int i=0 ; i<A.size() ; ++i) {
-//      for (int j = A.col_ptr[i]; j<A.col_ptr[i+1] ; ++j) {
-//         int j_idx = A.rows[j];
-//         for (int k = A.col_ptr[j_idx] ; k<A.col_ptr[j_idx+1] ; ++k) {
-//            int k_idx = A.rows[k];
-//            if (A.get(k_idx, i)) {
-//               ++c[i];
-//            }
-//         }
-//      }
-//   }
-
 namespace v3 {

 #if defined CILK

-// export CILK_NWORKERS=<num>
+/*!
+ * Utility function to get/set the number of threads.
+ *
+ * The number of threads are controlled via environment variable \c CILK_NWORKERS
+ *
+ * \return  The number of threads used.
+ * \note
+ *    The user can reduce the number with the command option \c --max_threads.
+ *    If so the requested number will be used even if the environment has more threads available.
+ */
 int nworkers() {
   if (session.max_threads)
      return (session.max_threads < __cilkrts_get_nworkers()) ?
@ -33,45 +30,93 @@ int nworkers() {
      return __cilkrts_get_nworkers();
 }

+/*!
+ * Calculate and return a vertex-wise count vector.
+ *
+ * \param   A  The matrix to use.
+ * \return  The count vector. RVO is used here.
+ * \note
+ *    We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
+ *    - A full matrix calculation which update only c[i]
+ *    - A lower triangular matrix which update c[i], c[j], c[k]. This is wayyy faster.
+ */
 std::vector<value_t> triang_v(matrix& A) {
-   std::vector<value_t> c(A.size());
+   std::vector<std::atomic<value_t>> c(A.size());
+   std::vector<value_t> ret(A.size());

   cilk_for (int i=0 ; i<A.size() ; ++i) {
-      for (auto j = A.getCol(i); j.index() != j.end() ; ++j)               // j list all the edges with i
-         for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k)    // k list all the edges with j
-            if (A.get(k.index(), i))                                       // search for i-k edge
-               ++c[i];
+      for (auto j = A.getCol(i); j.index() != j.end() ; ++j) {
+         // j list all the edges with i
+         for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k) {
+            // k list all the edges with j
+            if (A.get(k.index(), i)) {
+               ++ret[i];
+               c[j.index()] += (!session.makeSymmetric)? 1:0;
+               c[k.index()] += (!session.makeSymmetric)? 1:0;
            }
-   if (session.makeSymmetric)
-      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
-         return x/2;
-      });
-   return c;
+         }
+      }
+      if (session.makeSymmetric) {
+         ret[i] = ret[i]/2;
+         c[i] = c[i]/2;
+      }
+   }
+   for (index_t i =0 ; i<A.size() ; ++i)   ret[i] += c[i];
+   return ret;
 }

+/*!
+ * A sum utility to use as spawn function for parallelized sum.
+ * \return  The sum of \c v from \c begin to \c end.
+ */
 void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) {
   for (auto i =begin ; i != end ; ++i)
      out_sum += v[i];
 }

+/*!
+ * A parallelized version of sum. Just because ;)
+ * \return     The total sum of vector \c v
+ */
 value_t sum (std::vector<value_t>& v) {
   int n = nworkers();
-   std::vector<value_t> sum_v(n, 0);
+   std::vector<value_t> sum_v(n, 0);   // result of each do_sum invokation.

+   // We spawn workers in a more statically way.
   for (index_t i =0 ; i < n ; ++i) {
      cilk_spawn do_sum(sum_v[i], v, i*v.size()/n, (i+1)*v.size()/n);
   }
   cilk_sync;

-   value_t s =0;
-   for (auto& it : sum_v) s += it;
+   // sum the sums (a sum to rule them all)
+   value_t s =0; for (auto& it : sum_v) s += it;
   return s;
 }

 #elif defined OMP

-/*
-// export OMP_NUM_THREADS=<num>
+/*!
+ * A "simple" user defined OpenMP reduction for vector<value_t>
+ * \note
+ *    Not used. Reason: The atomic version of the code performs better.
+ */
+#pragma omp declare reduction(vec_value_plus : std::vector<value_t> :                                    \
+         std::transform(                                                                                 \
+               omp_out.begin(), omp_out.end(), omp_in.begin(), omp_out.begin(), std::plus<value_t>()     \
+         )                                                                                               \
+      )                                                                                                  \
+      initializer(omp_priv = decltype(omp_orig)(omp_orig.size()))
+
+
+/*!
+ * Utility function to get/set the number of threads.
+ *
+ * The number of threads are controlled via environment variable \c OMP_NUM_THREADS
+ *
+ * \return  The number of threads used.
+ * \note
+ *    The user can reduce the number with the command option \c --max_threads.
+ *    If so the requested number will be used even if the environment has more threads available.
 */
 int nworkers() {
   if (session.max_threads && session.max_threads < (size_t)omp_get_max_threads()) {
@ -85,23 +130,49 @@ int nworkers() {
   }
 }

+/*!
+ * Calculate and return a vertex-wise count vector.
+ *
+ * \param   A  The matrix to use.
+ * \return  The count vector. RVO is used here.
+ * \note
+ *    We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
+ *    - A full matrix calculation which update only c[i]
+ *    - A lower triangular matrix which update c[i], c[j], c[k]. This is wayyy faster.
+ */
 std::vector<value_t> triang_v(matrix& A) {
-   std::vector<value_t> c(A.size());
+   std::vector<std::atomic<value_t>> c(A.size());
+   std::vector<value_t> ret(A.size());

-   #pragma omp parallel for shared(c)
+   // OMP schedule selection
+   if (session.dynamic)    omp_set_schedule (omp_sched_dynamic, 0);
+   else                    omp_set_schedule (omp_sched_static, 0);
+   #pragma omp parallel for schedule(runtime) //reduction(vec_value_plus : c)
   for (int i=0 ; i<A.size() ; ++i) {
-      for (auto j = A.getCol(i); j.index() != j.end() ; ++j)               // j list all the edges with i
-         for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k)    // k list all the edges with j
-            if (A.get(k.index(), i))                                       // search for i-k edge
-               ++c[i];
+      for (auto j = A.getCol(i); j.index() != j.end() ; ++j) {
+         // j list all the edges with i
+         for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k) {
+            // k list all the edges with j
+            if (A.get(k.index(), i)) {
+               ++ret[i];
+               c[j.index()] += (!session.makeSymmetric)? 1:0;
+               c[k.index()] += (!session.makeSymmetric)? 1:0;
            }
-   if (session.makeSymmetric)
-      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
-         return x/2;
-      });
-   return c;
+         }
+      }
+      if (session.makeSymmetric) {
+         ret[i] = ret[i]/2;
+         c[i] = c[i]/2;
+      }
+   }
+   for (index_t i =0 ; i<A.size() ; ++i)   ret[i] += c[i];
+   return ret;
 }

+/*!
+ * A parallelized version of sum. Just because ;)
+ * \return     The total sum of vector \c v
+ */
 value_t sum (std::vector<value_t>& v) {
   value_t s =0;

@ -113,24 +184,44 @@ value_t sum (std::vector<value_t>& v) {

 #else

+//! Return the number of workers.
+//! \note   This function exists just for completeness
 int nworkers() { return 1; }

+/*!
+ * Calculate and return a vertex-wise count vector.
+ *
+ * \param   A  The matrix to use.
+ * \return  The count vector. RVO is used here.
+ * \note
+ *    We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
+ *    - A full matrix calculation which updates only c[i]
+ *    - A lower triangular matrix calculation which updates c[i], c[j], c[k]. This is much faster.
+ */
 std::vector<value_t> triang_v(matrix& A) {
   std::vector<value_t> c(A.size());

   for (int i=0 ; i<A.size() ; ++i) {
-      for (auto j = A.getCol(i); j.index() != j.end() ; ++j)               // j list all the edges with i
-         for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k)    // k list all the edges with j
-            if (A.get(k.index(), i))                                       // search for i-k edge
+      for (auto j = A.getCol(i); j.index() != j.end() ; ++j) {
+         // j list all the edges with i
+         for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k) {
+            // k list all the edges with j
+            if (A.get(k.index(), i)) {
               ++c[i];
+               c[j.index()] += (!session.makeSymmetric)? 1:0;
+               c[k.index()] += (!session.makeSymmetric)? 1:0;
+            }
+         }
+      }
+      if (session.makeSymmetric) c[i] /= 2;
   }
-   if (session.makeSymmetric)
-      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
-         return x/2;
-      });
   return c;
 }

+/*!
+ * Summation functionality.
+ * \return     The total sum of vector \c v
+ */
 value_t sum (std::vector<value_t>& v) {
   value_t s =0;
   for (auto& it : v)
@ -140,9 +231,9 @@ value_t sum (std::vector<value_t>& v) {

 #endif

-
+//! Polymorphic interface function for sum results
 value_t triang_count (std::vector<value_t>& c) {
-   return (session.makeSymmetric) ? sum(c)/3 : sum(c);
+   return sum(c)/3;
 }

 }
--- a/src/v4.cpp
+++ b/src/v4.cpp
@ -12,7 +12,16 @@ namespace v4 {

 #if defined CILK

-// export CILK_NWORKERS=<num>
+/*!
+ * Utility function to get/set the number of threads.
+ *
+ * The number of threads is controlled via the environment variable \c CILK_NWORKERS
+ *
+ * \return  The number of threads used.
+ * \note
+ *    The user can reduce the number with the command option \c --max_threads.
+ *    If so the requested number will be used even if the environment has more threads available.
+ */
 int nworkers() {
   if (session.max_threads)
      return (session.max_threads < __cilkrts_get_nworkers()) ?
@ -21,6 +30,25 @@ int nworkers() {
      return __cilkrts_get_nworkers();
 }

+/*!
+ * Calculate and return a vertex-wise count vector.
+ *
+ *           1
+ * vector = --- * (A.* (A*B))*ones_N
+ *           2
+ * We squeezed all that to one function for performance. The row*column multiplication
+ * uses the inner CSC structure of sparse matrix and follows only non-zero members.
+ *
+ * \param   A  The first matrix to use.
+ * \param   B  The second matrix to use (they can be the same).
+ * \return  The count vector. RVO is used here.
+ * \note
+ *    We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
+ *    - A full matrix calculation which updates only c[i]
+ *    - A lower triangular matrix calculation which updates c[i], c[j], c[k]. This is much faster.
+ * \warning
+ *    The latter (--triangular_only) produces correct results ONLY if we are after the total count.
+ */
 std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
   std::vector<value_t> c(A.size());

@ -28,37 +56,50 @@ std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
      for (auto j = A.getRow(i); j.index() != j.end() ; ++j){
         c[i] += A.getRow(i)*B.getCol(j.index());
      }
+      if (session.makeSymmetric) c[i] /= 2;
   }
-   if (session.makeSymmetric)
-      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
-         return x/2;
-      });
   return c;
 }

+/*!
+ * A sum utility to use as spawn function for parallelized sum.
+ * \note  The sum of \c v over [\c begin, \c end) is added to \c out_sum; nothing is returned.
+ */
 void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) {
   for (auto i =begin ; i != end ; ++i)
      out_sum += v[i];
 }

+/*!
+ * A parallelized version of sum. Just because ;)
+ * \return     The total sum of vector \c v
+ */
 value_t sum (std::vector<value_t>& v) {
   int n = nworkers();
-   std::vector<value_t> sum_v(n, 0);
+   std::vector<value_t> sum_v(n, 0);   // result of each do_sum invocation.

+   // We spawn the workers statically: one per contiguous slice of v.
   for (index_t i =0 ; i < n ; ++i) {
      cilk_spawn do_sum(sum_v[i], v, i*v.size()/n, (i+1)*v.size()/n);
   }
   cilk_sync;

-   value_t s =0;
-   for (auto& it : sum_v) s += it;
+   // sum the sums (a sum to rule them all)
+   value_t s =0; for (auto& it : sum_v) s += it;
   return s;
 }

 #elif defined OMP

-/*
-// export OMP_NUM_THREADS=<num>
+/*!
+ * Utility function to get/set the number of threads.
+ *
+ * The number of threads is controlled via the environment variable \c OMP_NUM_THREADS
+ *
+ * \return  The number of threads used.
+ * \note
+ *    The user can reduce the number with the command option \c --max_threads.
+ *    If so the requested number will be used even if the environment has more threads available.
 */
 int nworkers() {
   if (session.max_threads && session.max_threads < (size_t)omp_get_max_threads()) {
@ -72,22 +113,45 @@ int nworkers() {
   }
 }

+/*!
+ * Calculate and return a vertex-wise count vector.
+ *
+ *           1
+ * vector = --- * (A.* (A*B))*ones_N
+ *           2
+ * We squeezed all that to one function for performance. The row*column multiplication
+ * uses the inner CSC structure of sparse matrix and follows only non-zero members.
+ *
+ * \param   A  The first matrix to use.
+ * \param   B  The second matrix to use (they can be the same).
+ * \return  The count vector. RVO is used here.
+ * \note
+ *    We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
+ *    - A full matrix calculation which updates only c[i]
+ *    - A lower triangular matrix calculation which updates c[i], c[j], c[k]. This is much faster.
+ * \warning
+ *    The latter (--triangular_only) produces correct results ONLY if we are after the total count.
+ */
 std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
   std::vector<value_t> c(A.size());

-   #pragma omp parallel for shared(c)
+   // OMP schedule selection
+   if (session.dynamic)    omp_set_schedule (omp_sched_dynamic, 0);
+   else                    omp_set_schedule (omp_sched_static, 0);
+   #pragma omp parallel for shared(c) schedule(runtime)
   for (int i=0 ; i<A.size() ; ++i) {
      for (auto j = A.getRow(i); j.index() != j.end() ; ++j) {
         c[i] += A.getRow(i)*B.getCol(j.index());
      }
+      if (session.makeSymmetric) c[i] /= 2;
   }
-   if (session.makeSymmetric)
-      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
-         return x/2;
-      });
   return c;
 }

+/*!
+ * A parallelized version of sum. Just because ;)
+ * \return     The total sum of vector \c v
+ */
 value_t sum (std::vector<value_t>& v) {
   value_t s =0;

@ -99,8 +163,15 @@ value_t sum (std::vector<value_t>& v) {

 #elif defined THREADS

-/*
- * std::thread::hardware_concurrency()
+/*!
+ * Utility function to get/set the number of threads.
+ *
+ * The number of threads is inherited from the environment via std::thread::hardware_concurrency()
+ *
+ * \return  The number of threads used.
+ * \note
+ *    The user can reduce the number with the command option \c --max_threads.
+ *    If so the requested number will be used even if the environment has more threads available.
 */
 int nworkers() {
   if (session.max_threads)
@ -110,43 +181,89 @@ int nworkers() {
      return std::thread::hardware_concurrency();
 }

-std::vector<value_t> mmacc_v_rng(std::vector<value_t>& out, matrix& A, matrix& B, index_t begin, index_t end) {
+/*!
+ * A spawn function to calculate and return a vertex-wise count vector.
+ *
+ *                       1
+ * vector(begin..end) = --- * (A.* (A*B))*ones_N
+ *                       2
+ *
+ * We squeezed all that to one function for performance. The row*column multiplication
+ * uses the inner CSC structure of sparse matrix and follows only non-zero members.
+ *
+ * \param   out   Reference to output vector
+ * \param   A     The first matrix to use.
+ * \param   B     The second matrix to use (they can be the same).
+ * \param   iton  vector containing the range with the columns to use (it can be shuffled).
+ * \return  A copy of \c out (the referenced vector is returned by value).
+ * \note
+ *    We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
+ *    - A full matrix calculation which updates only c[i]
+ *    - A lower triangular matrix calculation which updates c[i], c[j], c[k]. This is much faster.
+ * \warning
+ *    The latter (--triangular_only) produces correct results ONLY if we are after the total count.
+ */
+std::vector<value_t> mmacc_v_rng(
+      std::vector<value_t>& out, matrix& A, matrix& B, std::vector<index_t>& iton, index_t begin, index_t end) {
   for (index_t i=begin ; i<end ; ++i) {
-      for (auto j = A.getRow(i); j.index() != j.end() ; ++j){
-         out[i] += A.getRow(i)*B.getCol(j.index());
+      index_t ii = iton[i];
+      for (auto j = A.getRow(ii); j.index() != j.end() ; ++j){
+         out[ii] += A.getRow(ii)*B.getCol(j.index());
      }
+      if (session.makeSymmetric) out[ii] /= 2;
   }
   return out;
 }

+/*!
+ * Calculate and return a vertex-wise count vector.
+ *
+ * \param   A  The first matrix to use.
+ * \param   B  The second matrix to use (they can be the same).
+ * \return  The count vector. RVO is used here.
+ */
 std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
   std::vector<std::thread> workers;
   std::vector<value_t> c(A.size());
   int n = nworkers();

-   for (index_t i=0 ; i<n ; ++i)
-      workers.push_back (std::thread (mmacc_v_rng, std::ref(c), std::ref(A), std::ref(B), i*c.size()/n, (i+1)*c.size()/n));
+   std::vector<index_t> iton(A.size());      // Create a 0 .. N range for outer loop
+   std::iota(iton.begin(), iton.end(), 0);
+   if (session.dynamic)                      // in case of dynamic scheduling, shuffle the range
+      std::shuffle(iton.begin(), iton.end(), std::mt19937{std::random_device{}()});

+   for (index_t i=0 ; i<n ; ++i)             // dispatch the workers and hold them in a vector
+      workers.push_back (
+         std::thread (mmacc_v_rng, std::ref(c), std::ref(A), std::ref(B), std::ref(iton), i*A.size()/n, (i+1)*A.size()/n)
+      );
+
+   // a for to join them all...
   std::for_each(workers.begin(), workers.end(), [](std::thread& t){
      t.join();
   });
-   if (session.makeSymmetric)
-      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
-         return x/2;
-      });
+
   return c;
 }

+/*!
+ * A sum utility to use as spawn function for parallelized sum.
+ * \note  The sum of \c v over [\c begin, \c end) is added to \c out_sum; nothing is returned.
+ */
 void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) {
   for (auto i =begin ; i != end ; ++i)
      out_sum += v[i];
 }

+/*!
+ * A parallelized version of sum. Just because ;)
+ * \return     The total sum of vector \c v
+ */
 value_t sum (std::vector<value_t>& v) {
   int n = nworkers();
-   std::vector<value_t> sum_v(n, 0);
+   std::vector<value_t> sum_v(n, 0);   // result of each do_sum invocation.
   std::vector<std::thread> workers;

+   // We spawn the workers statically: one per contiguous slice of v.
   for (index_t i =0 ; i < n ; ++i)
      workers.push_back (std::thread (do_sum, std::ref(sum_v[i]), std::ref(v), i*v.size()/n, (i+1)*v.size()/n));

@ -154,29 +271,51 @@ value_t sum (std::vector<value_t>& v) {
      t.join();
   });

-   value_t s =0;
-   for (auto& it : sum_v) s += it;
+   // sum the sums (a sum to rule them all)
+   value_t s =0; for (auto& it : sum_v) s += it;
   return s;
 }

 #else

+//! Return the number of workers.
+//! \note   This function exists just for completeness
 int nworkers() { return 1; }

+/*!
+ * Calculate and return a vertex-wise count vector.
+ *
+ *           1
+ * vector = --- * (A.* (A*B))*ones_N
+ *           2
+ * We squeezed all that to one function for performance. The row*column multiplication
+ * uses the inner CSC structure of sparse matrix and follows only non-zero members.
+ *
+ * \param   A  The first matrix to use.
+ * \param   B  The second matrix to use (they can be the same).
+ * \return  The count vector. RVO is used here.
+ * \note
+ *    We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
+ *    - A full matrix calculation which updates only c[i]
+ *    - A lower triangular matrix calculation which updates c[i], c[j], c[k]. This is much faster.
+ * \warning
+ *    The latter (--triangular_only) produces correct results ONLY if we are after the total count.
+ */
 std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
   std::vector<value_t> c(A.size());
   for (int i=0 ; i<A.size() ; ++i) {
      for (auto j = A.getRow(i); j.index() != j.end() ; ++j){
         c[i] += A.getRow(i)*B.getCol(j.index());
      }
+      if (session.makeSymmetric) c[i] /= 2;
   }
-   if (session.makeSymmetric)
-      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
-         return x/2;
-      });
   return c;
 }

+/*!
+ * Summation functionality.
+ * \return     The total sum of vector \c v
+ */
 value_t sum (std::vector<value_t>& v) {
   value_t s =0;
   for (auto& it : v)
@ -186,10 +325,12 @@ value_t sum (std::vector<value_t>& v) {

 #endif

+//! Polymorphic interface function for count vector
 std::vector<value_t> triang_v(matrix& A) {
   return mmacc_v(A, A);
 }

+//! Polymorphic interface function for sum results
 value_t triang_count (std::vector<value_t>& c) {
   return (session.makeSymmetric) ? sum(c)/3 : sum(c);
 }