Compare commits

...

6 Commits

35 changed files with 2765 additions and 201 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
# Assessments
assessments/

View File

@ -9,8 +9,16 @@ mtx/
exclude
hpc_auth_sync.sh
# IDEs
.idea/
.clangd
# eclipse
.project
.cproject
.settings/
.vs/
.vscode/

View File

@ -27,8 +27,8 @@ SRC_DIR_LIST := src gtest
# Include directories list (space separated). Makefile-relative path.
INC_DIR_LIST := inc \
src \
/usr/include/hdf5/serial/ \
gtest \
/usr/include/hdf5/serial/
# Libs/MATLAB/R2019b/include/ \
# Exclude files list (space separated). Filenames only.
@ -179,7 +179,7 @@ hpc-clean:
rm hpc-results/post
#
# ================ Local via docker build rules =================
# ================ Local (and/or) via docker build rules =================
#
# examples:
# make IMAGE=hpcimage v0
@ -191,18 +191,27 @@ local_v0: TARGET := local_v0
local_v0: $(BUILD_DIR)/$(TARGET)
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
local_v0_opt: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=0
local_v0_opt: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=0
local_v0_opt: TARGET := local_v0_opt
local_v0_opt: $(BUILD_DIR)/$(TARGET)
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
local_v1: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=1
local_v1: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=1
local_v1: TARGET := local_v1
local_v1: $(BUILD_DIR)/$(TARGET)
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
local_v1_omp: CFLAGS := $(DEB_CFLAGS) -fopenmp -DCODE_VERSION=1 -DOMP
local_v1_omp: CXXFLAGS := $(DEB_CXXFLAGS) -fopenmp -DCODE_VERSION=1 -DOMP
local_v1_omp: LDFLAGS += -fopenmp
local_v1_omp: TARGET := local_v1_omp
local_v1_omp: $(BUILD_DIR)/$(TARGET)
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
local_v1_pth: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=1 -DPTHREADS
local_v1_pth: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=1 -DPTHREADS
local_v1_pth: TARGET := local_v1_pth
local_v1_pth: $(BUILD_DIR)/$(TARGET)
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
v0: DOCKER := $(DOCKER_CMD)
v0: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=0
v0: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=0
@ -215,7 +224,7 @@ v1_cilk: CXX := /usr/local/OpenCilk-9.0.1-Linux/bin/clang++
v1_cilk: CFLAGS := $(REL_CFLAGS) -fcilkplus -DCODE_VERSION=1 -DCILK
v1_cilk: CXXFLAGS := $(REL_CXXFLAGS) -fcilkplus -DCODE_VERSION=1 -DCILK
v1_cilk: LDFLAGS += -fcilkplus
v1_cilk: TARGET := knnsearch_cilkv1
v1_cilk: TARGET := knnsearch_v1_cilk
v1_cilk: $(BUILD_DIR)/$(TARGET)
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
@ -223,10 +232,17 @@ v1_omp: DOCKER := $(DOCKER_CMD)
v1_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=1 -DOMP
v1_omp: CXXFLAGS := $(REL_CXXFLAGS) -fopenmp -DCODE_VERSION=1 -DOMP
v1_omp: LDFLAGS += -fopenmp
v1_omp: TARGET := knnsearch_ompv1
v1_omp: TARGET := knnsearch_v1_omp
v1_omp: $(BUILD_DIR)/$(TARGET)
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
v1_pth: DOCKER := $(DOCKER_CMD)
v1_pth: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=1 -DPTHREADS
v1_pth: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=1 -DPTHREADS
v1_pth: TARGET := knnsearch_v1_pth
v1_pth: $(BUILD_DIR)/$(TARGET)
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
v1: DOCKER := $(DOCKER_CMD)
v1: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=1
v1: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=1
@ -240,6 +256,11 @@ tests: TARGET := tests
tests: $(BUILD_DIR)/$(TARGET)
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
tests_rel: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=0 -DTESTING
tests_rel: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=0 -DTESTING
tests_rel: TARGET := tests
tests_rel: $(BUILD_DIR)/$(TARGET)
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
#
# ========= Inside CSAL Image build rules ===========

View File

@ -19,6 +19,8 @@
using matrix_t = mtx::Matrix<int>;
extern void loadMtx(MatrixDst& Corpus, MatrixDst& Query);
extern void storeMtx(MatrixIdx& Idx, MatrixDst& Dst);
// =====================================
// C1, Q1
@ -140,11 +142,44 @@ TEST(Tv0_UT, pdist2_test2) {
}
// Verify v0::pdist2 on the all-to-all case: distances of the 16x4 corpus C2
// against itself must match the 16x16 reference matrix element-wise.
TEST(Tv0_UT, pdist2_test3) {
// Reference distances (4 significant digits) — presumably generated with
// MATLAB's pdist2 (see the repo's dist2.m test script); TODO confirm.
mtx::Matrix<double> D2_exp(16, 16, {
0, 0.7433, 0.6868, 0.8846, 0.6342, 0.4561, 0.5118, 0.6341, 0.5461, 0.7322, 0.6974, 0.4330, 0.7028, 0.6303, 0.6826, 0.4179,
0.7433, 0, 0.3400, 0.4555, 0.4207, 0.9736, 0.9690, 0.7386, 1.1055, 0.5462, 0.5345, 0.6576, 0.8677, 1.0291, 0.5393, 0.8106,
0.6868, 0.3400, 0, 0.5380, 0.6268, 0.9512, 1.0234, 0.8403, 0.9843, 0.8187, 0.3091, 0.7829, 0.5759, 0.9411, 0.7239, 0.9186,
0.8846, 0.4555, 0.5380, 0, 0.6796, 1.1672, 1.0460, 1.1016, 1.1139, 0.7542, 0.6480, 0.9304, 1.0568, 1.3482, 0.8316, 0.9750,
0.6342, 0.4207, 0.6268, 0.6796, 0, 0.9267, 0.8772, 0.4847, 0.9317, 0.4093, 0.8351, 0.4215, 0.9736, 0.9007, 0.5999, 0.5291,
0.4561, 0.9736, 0.9512, 1.1672, 0.9267, 0, 0.3903, 0.7795, 0.9308, 0.8429, 0.8436, 0.5672, 0.9284, 0.7064, 0.6435, 0.5975,
0.5118, 0.9690, 1.0234, 1.0460, 0.8772, 0.3903, 0, 0.8920, 0.9253, 0.7060, 0.9427, 0.5728, 1.1515, 0.9907, 0.6471, 0.4811,
0.6341, 0.7386, 0.8403, 1.1016, 0.4847, 0.7795, 0.8920, 0, 0.9824, 0.6416, 0.9844, 0.3398, 0.9355, 0.5428, 0.6536, 0.5309,
0.5461, 1.1055, 0.9843, 1.1139, 0.9317, 0.9308, 0.9253, 0.9824, 0, 1.1517, 1.0541, 0.8746, 0.8506, 0.8777, 1.2036, 0.7607,
0.7322, 0.5462, 0.8187, 0.7542, 0.4093, 0.8429, 0.7060, 0.6416, 1.1517, 0, 0.9106, 0.4245, 1.2071, 1.0738, 0.3745, 0.5170,
0.6974, 0.5345, 0.3091, 0.6480, 0.8351, 0.8436, 0.9427, 0.9844, 1.0541, 0.9106, 0, 0.8647, 0.5941, 0.9954, 0.7148, 0.9876,
0.4330, 0.6576, 0.7829, 0.9304, 0.4215, 0.5672, 0.5728, 0.3398, 0.8746, 0.4245, 0.8647, 0, 0.9590, 0.6782, 0.4586, 0.2525,
0.7028, 0.8677, 0.5759, 1.0568, 0.9736, 0.9284, 1.1515, 0.9355, 0.8506, 1.2071, 0.5941, 0.9590, 0, 0.6838, 1.0517, 1.0675,
0.6303, 1.0291, 0.9411, 1.3482, 0.9007, 0.7064, 0.9907, 0.5428, 0.8777, 1.0738, 0.9954, 0.6782, 0.6838, 0, 0.9482, 0.7937,
0.6826, 0.5393, 0.7239, 0.8316, 0.5999, 0.6435, 0.6471, 0.6536, 1.2036, 0.3745, 0.7148, 0.4586, 1.0517, 0.9482, 0, 0.6345,
0.4179, 0.8106, 0.9186, 0.9750, 0.5291, 0.5975, 0.4811, 0.5309, 0.7607, 0.5170, 0.9876, 0.2525, 1.0675, 0.7937, 0.6345, 0
});
mtx::Matrix<double> D (16,16);
v0::pdist2(C2, C2, D);
// Element-wise tolerance check: require |D(i,j) - D2_exp(i,j)| < 0.01,
// expressed as two one-sided comparisons.
for (size_t i = 0 ; i< D.rows() ; ++i)
for (size_t j = 0 ; j<D.columns() ; ++j) {
EXPECT_EQ (D2_exp.get(i ,j) + 0.01 > D(i, j), true);
EXPECT_EQ (D2_exp.get(i ,j) - 0.01 < D(i, j), true);
}
}
/*
* ==========================================
* v0::knn
*/
TEST(Tv0_UT, knn_test1) {
TEST(Tv0_UT, knn_v0_test1) {
size_t k = 3;
mtx::Matrix<uint32_t> Idx_exp(5, k, {
5, 8, 9,
@ -177,7 +212,7 @@ TEST(Tv0_UT, knn_test1) {
}
TEST(Tv0_UT, knn_test2) {
TEST(Tv0_UT, knn_v0_test2) {
size_t k = 3;
mtx::Matrix<uint32_t> Idx_exp(8, k, {
14, 13, 1,
@ -220,7 +255,7 @@ TEST(Tv0_UT, knn_test2) {
* ==========================================
* v1::knn
*/
TEST(Tv1_UT, knn_test1) {
TEST(Tv1_UT, knn_v1_1slice) {
size_t k = 3;
mtx::Matrix<uint32_t> Idx_exp(8, k, {
14, 13, 1,
@ -247,7 +282,46 @@ TEST(Tv1_UT, knn_test1) {
mtx::Matrix<uint32_t> Idx(8, k);
mtx::Matrix<double> Dst(8, k);
v1::knnsearch(C2, Q2, 0, k, k, Idx, Dst);
v1::knnsearch(C2, Q2, 1, k, k, Idx, Dst);
for (size_t i = 0 ; i< Idx.rows() ; ++i)
for (size_t j = 0 ; j<Idx.columns() ; ++j) {
EXPECT_EQ (Idx_exp(i ,j) == Idx(i, j) + 1, true); // matlab starts from 1
EXPECT_EQ (Dst_exp.get(i ,j) + 0.01 > Dst(i, j), true);
EXPECT_EQ (Dst_exp.get(i ,j) - 0.01 < Dst(i, j), true);
}
}
TEST(Tv1_UT, knn_v1_2slice) {
size_t k = 3;
mtx::Matrix<uint32_t> Idx_exp(8, k, {
14, 13, 1,
15, 10, 12,
14, 8, 12,
4, 1, 3,
8, 12, 5,
4, 3, 2,
10, 2, 4,
1, 11, 9
});
mtx::Matrix<double> Dst_exp(8, k, {
0.1939, 0.5768, 0.6020,
0.2708, 0.3686, 0.3740,
0.2684, 0.3103, 0.5277,
0.4709, 0.6050, 0.6492,
0.4397, 0.6842, 0.7074,
0.3402, 0.4360, 0.4475,
0.3708, 0.4037, 0.4417,
0.6352, 0.6653, 0.6758
});
mtx::Matrix<uint32_t> Idx(8, k);
mtx::Matrix<double> Dst(8, k);
v1::knnsearch(C2, Q2, 2, k, k, Idx, Dst);
for (size_t i = 0 ; i< Idx.rows() ; ++i)
@ -260,7 +334,7 @@ TEST(Tv1_UT, knn_test1) {
}
// all-to-all
TEST(Tv1_UT, knn_test2) {
TEST(Tv1_UT, knn_v1_4slice) {
size_t k = 3;
mtx::Matrix<uint32_t> Idx_exp(16, k, {
1, 16, 12,
@ -303,7 +377,7 @@ TEST(Tv1_UT, knn_test2) {
mtx::Matrix<uint32_t> Idx(16, k);
mtx::Matrix<double> Dst(16, k);
v1::knnsearch(C2, C2, 0, k, k, Idx, Dst);
v1::knnsearch(C2, C2, 4, k, k, Idx, Dst);
for (size_t i = 0 ; i< Idx.rows() ; ++i)
@ -315,3 +389,130 @@ TEST(Tv1_UT, knn_test2) {
}
/*
* ============== Live hdf5 tests ===============
*
* In order to run these tests we need the following hdf5 files in the ./mtx directory:
*
* - fasion-mnist-784-euclidean.hdf5
* - mnist-784-euclidean.hdf5
* - sift-128-euclidean.hdf5
* - gist-960-euclidean.hdf5
*
*/
// Live smoke test: run the serial (v0) all-to-all kNN over the SIFT corpus
// and write the results to disk. No correctness assertion is made — the test
// passes if loading, searching and storing complete without failure.
// Requires mtx/sift-128-euclidean.hdf5 to be present.
TEST(Tlive_UT, knn_v0_sift_test) {
// Instantiate matrices
MatrixDst Corpus;
MatrixDst Query;
MatrixIdx Idx;
MatrixDst Dst;
// setup environment
session.corpusMtxFile = "mtx/sift-128-euclidean.hdf5";
session.corpusDataSet = "/test";
session.queryMtx = false; // no separate query file: Corpus is searched against itself
session.k = 100;
size_t m = session.k; // accuracy parameter m equals k (full accuracy)
session.timing = true;
session.outMtxFile = "test/knn_v0.hdf5";
loadMtx(Corpus, Query);
// Prepare output memory (there is no Query, so size from Corpus)
Idx.resize(Corpus.rows(), session.k);
Dst.resize(Corpus.rows(), session.k);
v0::knnsearch(Corpus, Corpus, 0, session.k, m, Idx, Dst);
storeMtx(Idx, Dst);
EXPECT_EQ(true, true); // placeholder assertion: success == no crash above
}
// Live smoke test: v1 kNN over the SIFT corpus with 0 slices (the v1 path's
// degenerate/serial configuration). No correctness assertion is made — the
// test passes if loading, searching and storing complete without failure.
// Requires mtx/sift-128-euclidean.hdf5 to be present.
TEST(Tlive_UT, knn_v1_sift_test_1slice) {
// Instantiate matrices
MatrixDst Corpus;
MatrixDst Query;
MatrixIdx Idx;
MatrixDst Dst;
// setup environment
session.corpusMtxFile = "mtx/sift-128-euclidean.hdf5";
session.corpusDataSet = "/test";
session.queryMtx = false; // no separate query file: Corpus is searched against itself
session.k = 100;
size_t m = session.k; // accuracy parameter m equals k (full accuracy)
session.timing = true;
session.outMtxFile = "test/knn_v1ser.hdf5";
loadMtx(Corpus, Query);
// Prepare output memory (there is no Query, so size from Corpus)
Idx.resize(Corpus.rows(), session.k);
Dst.resize(Corpus.rows(), session.k);
v1::knnsearch(Corpus, Corpus, 0, session.k, m, Idx, Dst);
storeMtx(Idx, Dst);
EXPECT_EQ(true, true); // placeholder assertion: success == no crash above
}
// Live smoke test: v1 kNN over the SIFT corpus with 2 slices. No correctness
// assertion is made — the test passes if loading, searching and storing
// complete without failure. Requires mtx/sift-128-euclidean.hdf5.
TEST(Tlive_UT, knn_v1_sift_test_2slice) {
// Instantiate matrices
MatrixDst Corpus;
MatrixDst Query;
MatrixIdx Idx;
MatrixDst Dst;
// setup environment
session.corpusMtxFile = "mtx/sift-128-euclidean.hdf5";
session.corpusDataSet = "/test";
session.queryMtx = false; // no separate query file: Corpus is searched against itself
session.k = 100;
size_t m = session.k; // accuracy parameter m equals k (full accuracy)
session.timing = true;
// NOTE(review): same output file as the 1-slice test — later runs overwrite
// earlier results; confirm this is intentional.
session.outMtxFile = "test/knn_v1ser.hdf5";
loadMtx(Corpus, Query);
// Prepare output memory (there is no Query, so size from Corpus)
Idx.resize(Corpus.rows(), session.k);
Dst.resize(Corpus.rows(), session.k);
v1::knnsearch(Corpus, Corpus, 2, session.k, m, Idx, Dst);
storeMtx(Idx, Dst);
EXPECT_EQ(true, true); // placeholder assertion: success == no crash above
}
// Live smoke test: v1 kNN over the SIFT corpus with 4 slices. No correctness
// assertion is made — the test passes if loading, searching and storing
// complete without failure. Requires mtx/sift-128-euclidean.hdf5.
TEST(Tlive_UT, knn_v1_sift_test_4slice) {
// Instantiate matrices
MatrixDst Corpus;
MatrixDst Query;
MatrixIdx Idx;
MatrixDst Dst;
// setup environment
session.corpusMtxFile = "mtx/sift-128-euclidean.hdf5";
session.corpusDataSet = "/test";
session.queryMtx = false; // no separate query file: Corpus is searched against itself
session.k = 100;
size_t m = session.k; // accuracy parameter m equals k (full accuracy)
session.timing = true;
// NOTE(review): same output file as the 1- and 2-slice tests — later runs
// overwrite earlier results; confirm this is intentional.
session.outMtxFile = "test/knn_v1ser.hdf5";
loadMtx(Corpus, Query);
// Prepare output memory (there is no Query, so size from Corpus)
Idx.resize(Corpus.rows(), session.k);
Dst.resize(Corpus.rows(), session.k);
v1::knnsearch(Corpus, Corpus, 4, session.k, m, Idx, Dst);
storeMtx(Idx, Dst);
EXPECT_EQ(true, true); // placeholder assertion: success == no crash above
}

View File

@ -55,7 +55,9 @@ struct session_t {
std::string outMtxFile {"out.hdf5"}; //!< output matrix file name in HDF5 format
std::string outMtxIdxDataSet {"/Idx"}; //!< Index output dataset name in HDF5 matrix file
std::string outMtxDstDataSet {"/Dst"}; //!< Distance output dataset name in HDF5 matrix file
std::size_t max_threads {}; //!< Maximum threads to use
std::size_t max_threads {0}; //!< Maximum threads to use
std::size_t slices {0}; //!< Slices/threads to use
std::size_t accuracy {100}; //!< The neighbor finding accuracy
bool timing {false}; //!< Enable timing prints of the program
bool verbose {false}; //!< Flag to enable verbose output to stdout
};

View File

@ -134,6 +134,9 @@ struct Matrix {
Matrix& operator=(Matrix&& m) noexcept { moves(std::move(m)); return *this; }
Matrix(const Matrix& m) = delete; //!< No copy ctor
Matrix& operator=(const Matrix& m) = delete; //!< No copy
//Matrix(const Matrix& m);
//Matrix& operator=(const Matrix& m) { copy(m); }
//! @}
//! \name Data exposure
@ -233,6 +236,11 @@ struct Matrix {
// a basic serial iterator support
DataType* data() noexcept { return data_; }
DataType* begin() noexcept { return data_; }
const DataType* begin() const noexcept { return data_; }
DataType* end() noexcept { return data_ + capacity(rows_, cols_); }
const DataType* end() const noexcept { return data_ + capacity(rows_, cols_); }
// IndexType begin_idx() noexcept { return 0; }
// IndexType end_idx() noexcept { return capacity(rows_, cols_); }
@ -265,17 +273,19 @@ struct Matrix {
std::swap(rows_, src.rows_);
std::swap(cols_, src.cols_);
}
private:
//! move helper
void moves(Matrix&& src) noexcept {
data_ = std::move(src.vector_storage_);
data_ = std::move(src.raw_storage_);
vector_storage_ = std::move(src.vector_storage_);
raw_storage_ = std::move(src.raw_storage_);
data_ = std::move(src.data_);
data_ = std::move(src.use_vector_);
use_vector_ = std::move(src.use_vector_);
rows_ = std::move(src.rows_);
cols_ = std::move(src.cols_);
}
// Storage
std::vector<DataType>
vector_storage_; //!< Internal storage (if used).
DataType* raw_storage_; //!< External storage (if used).
@ -528,125 +538,6 @@ private:
};
template<typename ...> struct Matrix_view { };
/*!
* @struct Matrix_view
* @tparam MatrixType
*/
template<template <typename, typename, MatrixType, MatrixOrder, bool> class Matrix,
typename DataType,
typename IndexType,
MatrixType Type,
MatrixOrder Order>
struct Matrix_view<Matrix<DataType, IndexType, Type, Order, false>> {
using owner_t = Matrix<DataType, IndexType, Type, Order, false>;
using dataType = DataType; //!< meta:export of underling data type
using indexType = IndexType; //!< meta:export of underling index type
static constexpr MatrixOrder matrixOrder = Order; //!< meta:export of array order
static constexpr MatrixType matrixType = Type; //!< meta:export of array type
/*!
* \name Obj lifetime
*/
//! @{
//! Construct a matrix view to entire matrix
Matrix_view(const owner_t* owner) noexcept :
owner_(owner), m_(owner->data()), rows_(owner->rows()), cols_(owner->columns()) { }
Matrix_view(const owner_t* owner, IndexType begin, IndexType end) noexcept :
owner_(owner) {
if constexpr (Order == MatrixOrder::ROWMAJOR) {
m_ = owner->data() + begin * owner->columns();
rows_ = end - begin;
cols_ = owner->columns();
} else if (Order == MatrixOrder::COLMAJOR) {
m_ = owner->data() + begin * owner->rows();
rows_ = owner->rows();
cols_ = end - begin;
}
}
Matrix_view(Matrix_view&& m) = delete; //! No move
Matrix_view& operator=(Matrix_view&& m) = delete;
Matrix_view(const Matrix_view& m) = delete; //!< No copy
Matrix_view& operator=(const Matrix_view& m) = delete;
//! @}
//! Get/Set the size of each dimension
const IndexType rows() const noexcept { return rows_; }
const IndexType columns() const noexcept { return cols_; }
//! Get the interface size of the Matrix (what appears to be the size)
IndexType size() const {
return rows_ * cols_;
}
//! Actual memory capacity of the symmetric matrix
static constexpr IndexType capacity(IndexType M, IndexType N) {
return M*N;
}
/*
* virtual 2D accessors
*/
const DataType get (IndexType i, IndexType j) const {
if constexpr (Order == MatrixOrder::COLMAJOR)
return m_[i + j*rows_];
else
return m_[i*cols_ + j];
}
DataType set (DataType v, IndexType i, IndexType j) {
if constexpr (Order == MatrixOrder::COLMAJOR)
return m_[i + j*rows_] = v;
else
return m_[i*cols_ + j] = v;
}
// DataType operator()(IndexType i, IndexType j) { return get(i, j); }
/*!
* Return a proxy MatVal object with read and write capabilities.
* @param i The row number
* @param j The column number
* @return tHE MatVal object
*/
MatVal<Matrix_view> operator()(IndexType i, IndexType j) noexcept {
return MatVal<Matrix_view>(this, get(i, j), i, j);
}
// a basic serial iterator support
DataType* data() noexcept { return m_.data(); }
// IndexType begin_idx() noexcept { return 0; }
// IndexType end_idx() noexcept { return capacity(rows_, cols_); }
const DataType* data() const noexcept { return m_; }
const IndexType begin_idx() const noexcept { return 0; }
const IndexType end_idx() const noexcept { return capacity(rows_, cols_); }
//! @}
/*!
* \name Safe iteration API
*
* This api automates the iteration over the array based on
* MatrixType
*/
//! @{
template<typename F, typename... Args>
void for_each_in (IndexType begin, IndexType end, F&& lambda, Args&&... args) {
for (IndexType it=begin ; it<end ; ++it) {
std::forward<F>(lambda)(std::forward<Args>(args)..., it);
}
}
//! @}
//!
private:
const owner_t* owner_ {nullptr}; //!< Pointer to Matrix
DataType* m_ {nullptr}; //!< Starting address of the slice/view
IndexType rows_{}; //!< the virtual size of rows.
IndexType cols_{}; //!< the virtual size of columns.
};
/*!
* A view/iterator hybrid object for Matrix columns.
*

View File

@ -54,12 +54,21 @@ void pdist2(const Matrix& X, const Matrix& Y, Matrix& D2) {
for (int i = 0; i < M ; ++i) {
for (int j = 0; j < N; ++j) {
D2.set(D2.get(i, j) + X_norms[i] + Y_norms[j], i, j);
//D2.set(std::max(D2.get(i, j), 0.0), i, j); // Ensure non-negative
D2.set(std::max(D2.get(i, j), 0.0), i, j); // Ensure non-negative
D2.set(std::sqrt(D2.get(i, j)), i, j); // Take the square root of each
}
}
M++;
}
/*!
* Quick select implementation
* \fn void quickselect(std::vector<std::pair<DataType,IndexType>>&, int)
* \tparam DataType
* \tparam IndexType
 * \param vec Vector of pairs (distance, index) to partially sort over distance
* \param k The number of elements to sort-select
*/
template<typename DataType, typename IndexType>
void quickselect(std::vector<std::pair<DataType, IndexType>>& vec, int k) {
std::nth_element(
@ -75,6 +84,7 @@ void quickselect(std::vector<std::pair<DataType, IndexType>>& vec, int k) {
/*!
* \param C Is a MxD matrix (Corpus)
* \param Q Is a NxD matrix (Query)
* \param idx_offset The offset of the indexes for output (to match with the actual Corpus indexes)
* \param k The number of nearest neighbors needed
* \param idx Is the Nxk matrix with the k indexes of the C points, that are
* neighbors of the nth point of Q
@ -82,7 +92,7 @@ void quickselect(std::vector<std::pair<DataType, IndexType>>& vec, int k) {
* point of Q
*/
template<typename MatrixD, typename MatrixI>
void knnsearch(const MatrixD& C, const MatrixD& Q, size_t idx_offset, size_t k, size_t m, MatrixI& idx, MatrixD& dst) {
void knnsearch(MatrixD& C, MatrixD& Q, size_t idx_offset, size_t k, size_t m, MatrixI& idx, MatrixD& dst) {
using DstType = typename MatrixD::dataType;
using IdxType = typename MatrixI::dataType;

View File

@ -1,5 +1,5 @@
/**
* \file v0.hpp
* \file v1.hpp
* \brief
*
* \author
@ -16,8 +16,44 @@
#include "v0.hpp"
#include "config.h"
#if defined CILK
#include <cilk/cilk.h>
#include <cilk/cilk_api.h>
//#include <cilk/reducer_opadd.h>
#elif defined OMP
#include <omp.h>
#elif defined PTHREADS
#include <thread>
#include <numeric>
#include <functional>
//#include <random>
#else
#endif
void init_workers();
namespace v1 {
/*!
*
* Merge knnsearch results and select the closest neighbors
*
* \tparam DataType
* \tparam IndexType
* \param N1 Neighbors results from one knnsearch
 * \param D1 Distances results from one knnsearch
* \param N2 Neighbors results from second knnsearch
* \param D2 Distances results from second knnsearch
* \param k How many
* \param m How accurate
* \param N Output for Neighbors
* \param D Output for Distances
*/
template <typename DataType, typename IndexType>
void mergeResultsWithM(mtx::Matrix<IndexType>& N1, mtx::Matrix<DataType>& D1,
mtx::Matrix<IndexType>& N2, mtx::Matrix<DataType>& D2,
@ -57,49 +93,118 @@ void mergeResultsWithM(mtx::Matrix<IndexType>& N1, mtx::Matrix<DataType>& D1,
}
}
/*!
* The main parallelizable body
*/
template<typename MatrixD, typename MatrixI>
void knnsearch(const MatrixD& C, const MatrixD& Q, size_t idx_offset, size_t k, size_t m, MatrixI& idx, MatrixD& dst) {
void worker_body (std::vector<MatrixD>& corpus_slices,
std::vector<MatrixD>& query_slices,
MatrixI& idx,
MatrixD& dst,
size_t slice,
size_t num_slices, size_t corpus_slice_size, size_t query_slice_size,
size_t k,
size_t m) {
// "load" types
using DstType = typename MatrixD::dataType;
using IdxType = typename MatrixI::dataType;
if (C.rows() <= 8 || Q.rows() <= 4) {
// Base case: Call knnsearch directly
v0::knnsearch(C, Q, idx_offset, k, m, idx, dst);
return;
for (size_t ci = 0; ci < num_slices; ++ci) {
size_t idx_offset = ci * corpus_slice_size;
// Intermediate matrixes for intermediate results
MatrixI temp_idx(query_slices[slice].rows(), k);
MatrixD temp_dst(query_slices[slice].rows(), k);
// kNN for each combination
v0::knnsearch(corpus_slices[ci], query_slices[slice], idx_offset, k, m, temp_idx, temp_dst);
// Merge temporary results to final results
MatrixI idx_slice((IdxType*)idx.data(), slice * query_slice_size, query_slices[slice].rows(), k);
MatrixD dst_slice((DstType*)dst.data(), slice * query_slice_size, query_slices[slice].rows(), k);
mergeResultsWithM(idx_slice, dst_slice, temp_idx, temp_dst, k, m, idx_slice, dst_slice);
}
}
/*!
* \param C Is a MxD matrix (Corpus)
* \param Q Is a NxD matrix (Query)
 * \param num_slices How many slices to split the Corpus/Query into
* \param k The number of nearest neighbors needed
* \param m accuracy
* \param idx Is the Nxk matrix with the k indexes of the C points, that are
* neighbors of the nth point of Q
* \param dst Is the Nxk matrix with the k distances to the C points of the nth
* point of Q
*/
template<typename MatrixD, typename MatrixI>
void knnsearch(MatrixD& C, MatrixD& Q, size_t num_slices, size_t k, size_t m, MatrixI& idx, MatrixD& dst) {
using DstType = typename MatrixD::dataType;
using IdxType = typename MatrixI::dataType;
//Slice calculations
size_t corpus_slice_size = C.rows() / ((num_slices == 0)? 1:num_slices);
size_t query_slice_size = Q.rows() / ((num_slices == 0)? 1:num_slices);
// Make slices
std::vector<MatrixD> corpus_slices;
std::vector<MatrixD> query_slices;
for (size_t i = 0; i < num_slices; ++i) {
corpus_slices.emplace_back(
(DstType*)C.data(),
i * corpus_slice_size,
(i == num_slices - 1 ? C.rows() - i * corpus_slice_size : corpus_slice_size),
C.columns());
query_slices.emplace_back(
(DstType*)Q.data(),
i * query_slice_size,
(i == num_slices - 1 ? Q.rows() - i * query_slice_size : query_slice_size),
Q.columns());
}
// Divide Corpus and Query into subsets
IdxType midC = C.rows() / 2;
IdxType midQ = Q.rows() / 2;
// Initialize results
for (size_t i = 0; i < dst.rows(); ++i) {
for (size_t j = 0; j < dst.columns(); ++j) {
dst.set(std::numeric_limits<DstType>::infinity(), i, j);
idx.set(static_cast<IdxType>(-1), i, j);
}
}
// Slice corpus and query matrixes
MatrixD C1((DstType*)C.data(), 0, midC, C.columns());
MatrixD C2((DstType*)C.data(), midC, midC, C.columns());
MatrixD Q1((DstType*)Q.data(), 0, midQ, Q.columns());
MatrixD Q2((DstType*)Q.data(), midQ, midQ, Q.columns());
// Main loop
#if defined OMP
#pragma omp parallel for
for (size_t qi = 0; qi < num_slices; ++qi) {
worker_body (corpus_slices, query_slices, idx, dst, qi, num_slices, corpus_slice_size, query_slice_size, k, m);
}
#elif defined CILK
cilk_for (size_t qi = 0; qi < num_slices; ++qi) {
worker_body (corpus_slices, query_slices, idx, dst, qi, num_slices, corpus_slice_size, query_slice_size, k, m);
}
#elif defined PTHREADS
std::vector<std::thread> workers;
for (size_t qi = 0; qi < num_slices; ++qi) {
workers.push_back(
std::thread (worker_body<MatrixD, MatrixI>,
std::ref(corpus_slices), std::ref(query_slices),
std::ref(idx), std::ref(dst),
qi,
num_slices, corpus_slice_size, query_slice_size,
k, m)
);
}
// Join threads
std::for_each(workers.begin(), workers.end(), [](std::thread& t){
t.join();
});
// Allocate temporary matrixes for all permutations
MatrixI N1_1(midQ, k), N1_2(midQ, k), N2_1(midQ, k), N2_2(midQ, k);
MatrixD D1_1(midQ, k), D1_2(midQ, k), D2_1(midQ, k), D2_2(midQ, k);
// Recursive calls
knnsearch(C1, Q1, idx_offset, k, m, N1_1, D1_1);
knnsearch(C2, Q1, idx_offset + midC, k, m, N1_2, D1_2);
knnsearch(C1, Q2, idx_offset, k, m, N2_1, D2_1);
knnsearch(C2, Q2, idx_offset + midC, k, m, N2_2, D2_2);
// slice output matrixes
MatrixI N1((IdxType*)idx.data(), 0, midQ, k);
MatrixI N2((IdxType*)idx.data(), midQ, midQ, k);
MatrixD D1((DstType*)dst.data(), 0, midQ, k);
MatrixD D2((DstType*)dst.data(), midQ, midQ, k);
// Merge results in place
mergeResultsWithM(N1_1, D1_1, N1_2, D1_2, k, m, N1, D1);
mergeResultsWithM(N2_1, D2_1, N2_2, D2_2, k, m, N2, D2);
#else
for (size_t qi = 0; qi < num_slices; ++qi) {
worker_body (corpus_slices, query_slices, idx, dst, qi, num_slices, corpus_slice_size, query_slice_size, k, m);
}
#endif
}

View File

@ -12,7 +12,10 @@ function [D2] = dist2(X, Y)
if d1 ~= d2
error('X,Y column dimensions must match');
end
%D2 = sqrt((X.^2)*ones(d,m) -2*X*Y' + ones(n,d)*(Y.^2)');
% debug
%X_norm = sum(X.^2, 2);
%Y_norm = sum(Y.^2, 2)';
%XY = 2 * X*Y';
D2 = max(sum(X.^2, 2) - 2 * X*Y' + sum(Y.^2, 2)', 0);
D2 = sqrt(D2);

View File

@ -0,0 +1,78 @@
% Plot measurements
accuracy = [100 80 60 40 20 10];
ser_sift_acc = [ 4395 4365 4384 4315 4295 4246 ];
ser_mnist_ser_acc = [ 7936 7924 7886 7903 7844 7801 ];
omp_sift_acc = [
1162 1157 1148 1138 1134 1107
];
omp_mnist_acc = [
2044 2036 2028 2007 1993 1973
];
cilk_sift_acc = [
1148 1133 1127 1096 1079 1057
];
cilk_mnist_acc = [
2008 2024 1974 1965 1959 1947
];
pth_sift_acc = [
1157 1159 1121 1100 1084 1075
];
pth_mnist_acc = [
2050 2086 2040 2020 2004 1979
];
% Plot 1: OMP
figure;
set(gcf, 'Position', [100, 100, 1280, 720]); % Set the figure size to HD
plot(accuracy, ser_sift_acc, '-o', 'DisplayName', 'Serial SIFT');
hold on;
plot(accuracy, ser_mnist_ser_acc, '-s', 'DisplayName', 'Serial MNIST');
plot(accuracy, omp_sift_acc, '-^', 'DisplayName', 'OMP SIFT');
plot(accuracy, omp_mnist_acc, '-d', 'DisplayName', 'OMP MNIST');
hold off;
title('OMP');
xlabel('Accuracy (%)');
ylabel('Execution Time [msec]');
set(gca, 'XDir', 'reverse'); % reverse x
legend('Location', 'northwest');
grid on;
print(gcf, 'OMP_over_accuracy.png', '-dpng', '-r300');
% Plot 2: CILK
figure;
set(gcf, 'Position', [100, 100, 1280, 720]); % Set the figure size to HD
plot(accuracy, ser_sift_acc, '-o', 'DisplayName', 'Serial SIFT');
hold on;
plot(accuracy, ser_mnist_ser_acc, '-s', 'DisplayName', 'Serial MNIST');
plot(accuracy, cilk_sift_acc, '-^', 'DisplayName', 'CILK SIFT');
plot(accuracy, cilk_mnist_acc, '-d', 'DisplayName', 'CILK MNIST');
hold off;
title('CILK');
xlabel('Accuracy (%)');
ylabel('Execution Time [msec]');
set(gca, 'XDir', 'reverse'); % reverse x
legend('Location', 'northwest');
grid on;
print(gcf, 'CILK_over_accuracy.png', '-dpng', '-r300');
% Plot 3: Pthreads
figure;
set(gcf, 'Position', [100, 100, 1280, 720]); % Set the figure size to HD
plot(accuracy, ser_sift_acc, '-o', 'DisplayName', 'Serial SIFT');
hold on;
plot(accuracy, ser_mnist_ser_acc, '-s', 'DisplayName', 'Serial MNIST');
plot(accuracy, pth_sift_acc, '-^', 'DisplayName', 'Pthreads SIFT');
plot(accuracy, pth_mnist_acc, '-d', 'DisplayName', 'Pthreads MNIST');
hold off;
title('Pthreads');
xlabel('Accuracy (%)');
ylabel('Execution Time [msec]');
set(gca, 'XDir', 'reverse'); % reverse x
legend('Location', 'northwest');
grid on;
print(gcf, 'Pthreads_over_accuracy.png', '-dpng', '-r300');

View File

@ -0,0 +1,75 @@
% Plot measurements
threads = [1 2 4 6 8 10 12];
ser_sift_threads = [ 4418 4418 4418 4418 4418 4418 4418 ];
ser_mnist_ser_threads = [ 7924 7924 7924 7924 7924 7924 7924 ];
omp_sift_th = [
4374 2194 1148 751 724 637 596
];
omp_mnist_th = [
7918 3941 2031 1395 1415 1330 1292
];
cilk_sift_th = [
4183 2076 1114 886 706 742 636
];
cilk_mnist_th = [
7745 3883 2020 1454 1436 1342 1292
];
pth_sift_th = [
4254 2155 1133 877 724 640 682
];
pth_mnist_th = [
7889 3963 2058 1445 1496 1379 1352
];
% Plot 1: OMP
figure;
set(gcf, 'Position', [100, 100, 1280, 720]); % Set the figure size to HD
plot(threads, ser_sift_threads, '-o', 'DisplayName', 'Serial SIFT');
hold on;
plot(threads, ser_mnist_ser_threads, '-s', 'DisplayName', 'Serial MNIST');
plot(threads, omp_sift_th, '-^', 'DisplayName', 'OMP SIFT');
plot(threads, omp_mnist_th, '-d', 'DisplayName', 'OMP MNIST');
hold off;
title('OMP');
xlabel('Threads');
ylabel('Execution Time [msec]');
legend('Location', 'northeast');
grid on;
print(gcf, 'OMP_over_threads.png', '-dpng', '-r300');
% Plot 2: CILK
figure;
set(gcf, 'Position', [100, 100, 1280, 720]); % Set the figure size to HD
plot(threads, ser_sift_threads, '-o', 'DisplayName', 'Serial SIFT');
hold on;
plot(threads, ser_mnist_ser_threads, '-s', 'DisplayName', 'Serial MNIST');
plot(threads, cilk_sift_th, '-^', 'DisplayName', 'CILK SIFT');
plot(threads, cilk_mnist_th, '-d', 'DisplayName', 'CILK MNIST');
hold off;
title('CILK');
xlabel('Threads');
ylabel('Execution Time [msec]');
legend('Location', 'northeast');
grid on;
print(gcf, 'CILK_over_threads.png', '-dpng', '-r300');
% Plot 3: Pthreads
figure;
set(gcf, 'Position', [100, 100, 1280, 720]); % Set the figure size to HD
plot(threads, ser_sift_threads, '-o', 'DisplayName', 'Serial SIFT');
hold on;
plot(threads, ser_mnist_ser_threads, '-s', 'DisplayName', 'Serial MNIST');
plot(threads, pth_sift_th, '-^', 'DisplayName', 'Pthreads SIFT');
plot(threads, pth_mnist_th, '-d', 'DisplayName', 'Pthreads MNIST');
hold off;
title('Pthreads');
xlabel('Threads');
ylabel('Execution Time [msec]');
legend('Location', 'northeast');
grid on;
print(gcf, 'Pthreads_over_threads.png', '-dpng', '-r300');

View File

@ -2,6 +2,106 @@
%
%
%
%
%
C1 = [
0.8147 0.1576;
0.9058 0.9706;
0.1270 0.9572;
0.9134 0.4854;
0.6324 0.8003;
0.0975 0.1419;
0.2785 0.4218;
0.5469 0.9157;
0.9575 0.7922;
0.9649 0.9595 ];
Q1 = [
0.6557 0.7577;
0.0357 0.7431;
0.8491 0.3922;
0.9340 0.6555;
0.6787 0.1712 ];
C2 = [
0.7060 0.4456 0.5060 0.6160;
0.0318 0.6463 0.6991 0.4733;
0.2769 0.7094 0.8909 0.3517;
0.0462 0.7547 0.9593 0.8308;
0.0971 0.2760 0.5472 0.5853;
0.8235 0.6797 0.1386 0.5497;
0.6948 0.6551 0.1493 0.9172;
0.3171 0.1626 0.2575 0.2858;
0.9502 0.1190 0.8407 0.7572;
0.0344 0.4984 0.2543 0.7537;
0.4387 0.9597 0.8143 0.3804;
0.3816 0.3404 0.2435 0.5678;
0.7655 0.5853 0.9293 0.0759;
0.7952 0.2238 0.3500 0.0540;
0.1869 0.7513 0.1966 0.5308;
0.4898 0.2551 0.2511 0.7792 ];
Q2 = [
0.9340 0.3112 0.4505 0.0782;
0.1299 0.5285 0.0838 0.4427;
0.5688 0.1656 0.2290 0.1067;
0.4694 0.6020 0.9133 0.9619;
0.0119 0.2630 0.1524 0.0046;
0.3371 0.6541 0.8258 0.7749;
0.1622 0.6892 0.5383 0.8173;
0.7943 0.7482 0.9961 0.8687 ];
D1_exp = [
0.6208 0.9745 0.2371 0.5120 0.1367;
0.3284 0.8993 0.5811 0.3164 0.8310;
0.5651 0.2327 0.9169 0.8616 0.9603;
0.3749 0.9147 0.1132 0.1713 0.3921;
0.0485 0.5994 0.4621 0.3346 0.6308;
0.8312 0.6044 0.7922 0.9815 0.5819;
0.5052 0.4028 0.5714 0.6959 0.4722;
0.1919 0.5395 0.6045 0.4665 0.7561;
0.3037 0.9231 0.4144 0.1387 0.6807;
0.3692 0.9540 0.5790 0.3056 0.8386 ];
D2_exp = [
0.6020 0.7396 0.6583 0.6050 1.0070 0.5542 0.6298 0.6352;
1.0696 0.6348 0.9353 0.6914 0.8160 0.4475 0.4037 0.9145;
0.9268 0.8450 0.9376 0.6492 0.9671 0.4360 0.5956 0.7400;
1.3455 0.9876 1.2953 0.4709 1.2557 0.3402 0.4417 0.7500;
0.9839 0.5476 0.7517 0.7216 0.7074 0.5605 0.4784 0.9954;
0.6839 0.7200 0.7305 0.9495 1.0628 0.8718 0.8178 0.9179;
0.9850 0.7514 0.9585 0.7996 1.2054 0.7784 0.6680 0.8591;
0.6950 0.4730 0.3103 1.0504 0.4397 0.8967 0.8140 1.2066;
0.8065 1.2298 0.9722 0.7153 1.3933 0.8141 1.0204 0.6758;
1.1572 0.3686 0.9031 0.8232 0.7921 0.6656 0.3708 1.0970;
0.9432 0.9049 1.0320 0.6905 1.1167 0.5094 0.6455 0.6653;
0.7672 0.3740 0.5277 0.8247 0.6842 0.6945 0.5648 0.9968;
0.5768 1.1210 0.8403 0.9345 1.1316 0.8292 1.0380 0.8127;
0.1939 0.8703 0.2684 1.1794 0.8103 1.0683 1.1115 1.1646;
1.0106 0.2708 0.8184 0.8954 0.7402 0.6982 0.4509 1.0594;
0.8554 0.5878 0.6834 0.7699 0.9155 0.7161 0.6162 0.9481 ];
% tests
D1 = dist2(C1, Q1);
if norm (D1-pdist2(C1, Q1), 'fro') > 0.01
disp('Error in dist2(C1, Q1)');
end
D2 = dist2(C2, Q2);
if norm (D2-pdist2(C2, Q2), 'fro') > 0.01
disp('Error in dist2(C2, Q2)');
end
D2 = dist2(C2, C2);
if norm (D2-pdist2(C2, C2), 'fro') > 0.01
disp('Error in dist2(C2, C2)');
end
%C = rand(10000, 2); % Corpus
%Q = rand(10000, 2); % Queries
C = rand(20000, 2); % Two clusters

Binary file not shown.

After

Width:  |  Height:  |  Size: 93 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 97 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 97 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 99 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 100 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 105 KiB

View File

@ -0,0 +1,122 @@
# Serial-sift over acc
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1 -c mtx/sift-128-euclidean.hdf5 /test -s 1 -a 100 -k 100 -t
[Timing]: knnsearch: 4395 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1 -c mtx/sift-128-euclidean.hdf5 /test -s 1 -a 80 -k 100 -t
[Timing]: knnsearch: 4365 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1 -c mtx/sift-128-euclidean.hdf5 /test -s 1 -a 60 -k 100 -t
[Timing]: knnsearch: 4384 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1 -c mtx/sift-128-euclidean.hdf5 /test -s 1 -a 40 -k 100 -t
[Timing]: knnsearch: 4315 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1 -c mtx/sift-128-euclidean.hdf5 /test -s 1 -a 20 -k 100 -t
[Timing]: knnsearch: 4295 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1 -c mtx/sift-128-euclidean.hdf5 /test -s 1 -a 10 -k 100 -t
[Timing]: knnsearch: 4246 [msec]
# Serial-mnist over acc
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1 -c mtx/mnist-784-euclidean.hdf5 /test -s 1 -a 100 -k 100 -t
[Timing]: knnsearch: 7936 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1 -c mtx/mnist-784-euclidean.hdf5 /test -s 1 -a 80 -k 100 -t
[Timing]: knnsearch: 7924 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1 -c mtx/mnist-784-euclidean.hdf5 /test -s 1 -a 60 -k 100 -t
[Timing]: knnsearch: 7886 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1 -c mtx/mnist-784-euclidean.hdf5 /test -s 1 -a 40 -k 100 -t
[Timing]: knnsearch: 7903 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1 -c mtx/mnist-784-euclidean.hdf5 /test -s 1 -a 20 -k 100 -t
[Timing]: knnsearch: 7844 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1 -c mtx/mnist-784-euclidean.hdf5 /test -s 1 -a 10 -k 100 -t
[Timing]: knnsearch: 7801 [msec]
# OMP-sift over acc
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/sift-128-euclidean.hdf5 /test -s 4 -a 100 -k 100 -t
[Timing]: knnsearch: 1162 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/sift-128-euclidean.hdf5 /test -s 4 -a 80 -k 100 -t
[Timing]: knnsearch: 1157 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/sift-128-euclidean.hdf5 /test -s 4 -a 60 -k 100 -t
[Timing]: knnsearch: 1148 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/sift-128-euclidean.hdf5 /test -s 4 -a 40 -k 100 -t
[Timing]: knnsearch: 1138 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/sift-128-euclidean.hdf5 /test -s 4 -a 20 -k 100 -t
[Timing]: knnsearch: 1134 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/sift-128-euclidean.hdf5 /test -s 4 -a 10 -k 100 -t
[Timing]: knnsearch: 1107 [msec]
# OMP-mnist over acc
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -a 100 -k 100 -t
[Timing]: knnsearch: 2044 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -a 80 -k 100 -t
[Timing]: knnsearch: 2036 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -a 60 -k 100 -t
[Timing]: knnsearch: 2028 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -a 40 -k 100 -t
[Timing]: knnsearch: 2007 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -a 20 -k 100 -t
[Timing]: knnsearch: 1993 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -a 10 -k 100 -t
[Timing]: knnsearch: 1973 [msec]
# CILK-sift over acc
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/sift-128-euclidean.hdf5 /test -s 4 -a 100 -k 100 -t
[Timing]: knnsearch: 1148 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/sift-128-euclidean.hdf5 /test -s 4 -a 80 -k 100 -t
[Timing]: knnsearch: 1133 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/sift-128-euclidean.hdf5 /test -s 4 -a 60 -k 100 -t
[Timing]: knnsearch: 1127 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/sift-128-euclidean.hdf5 /test -s 4 -a 40 -k 100 -t
[Timing]: knnsearch: 1096 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/sift-128-euclidean.hdf5 /test -s 4 -a 20 -k 100 -t
[Timing]: knnsearch: 1079 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/sift-128-euclidean.hdf5 /test -s 4 -a 10 -k 100 -t
[Timing]: knnsearch: 1057 [msec]
# CILK-mnist over acc
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -a 100 -k 100 -t
[Timing]: knnsearch: 2008 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -a 80 -k 100 -t
[Timing]: knnsearch: 2024 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -a 60 -k 100 -t
[Timing]: knnsearch: 1974 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -a 40 -k 100 -t
[Timing]: knnsearch: 1965 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -a 20 -k 100 -t
[Timing]: knnsearch: 1959 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -a 10 -k 100 -t
[Timing]: knnsearch: 1947 [msec]
# Pthreads-sift over acc
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/sift-128-euclidean.hdf5 /test -s 4 -a 100 -k 100 -t
[Timing]: knnsearch: 1157 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/sift-128-euclidean.hdf5 /test -s 4 -a 80 -k 100 -t
[Timing]: knnsearch: 1159 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/sift-128-euclidean.hdf5 /test -s 4 -a 60 -k 100 -t
[Timing]: knnsearch: 1121 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/sift-128-euclidean.hdf5 /test -s 4 -a 40 -k 100 -t
[Timing]: knnsearch: 1100 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/sift-128-euclidean.hdf5 /test -s 4 -a 20 -k 100 -t
[Timing]: knnsearch: 1084 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/sift-128-euclidean.hdf5 /test -s 4 -a 10 -k 100 -t
[Timing]: knnsearch: 1075 [msec]
# Pthreads-mnist over acc
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -a 100 -k 100 -t
[Timing]: knnsearch: 2050 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -a 80 -k 100 -t
[Timing]: knnsearch: 2086 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -a 60 -k 100 -t
[Timing]: knnsearch: 2040 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -a 40 -k 100 -t
[Timing]: knnsearch: 2020 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -a 20 -k 100 -t
[Timing]: knnsearch: 2004 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -a 10 -k 100 -t
[Timing]: knnsearch: 1979 [msec]

View File

@ -0,0 +1,103 @@
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1 -c mtx/sift-128-euclidean.hdf5 /test -s 1 -k 100 -t
[Timing]: knnsearch: 4418 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1 -c mtx/mnist-784-euclidean.hdf5 /test -s 1 -k 100 -t
[Timing]: knnsearch: 7924 [msec]
# OMP-sift over threads
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/sift-128-euclidean.hdf5 /test -s 1 -k 100 -t
[Timing]: knnsearch: 4374 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/sift-128-euclidean.hdf5 /test -s 2 -k 100 -t
[Timing]: knnsearch: 2194 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/sift-128-euclidean.hdf5 /test -s 4 -k 100 -t
[Timing]: knnsearch: 1148 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/sift-128-euclidean.hdf5 /test -s 6 -k 100 -t
[Timing]: knnsearch: 751 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/sift-128-euclidean.hdf5 /test -s 8 -k 100 -t
[Timing]: knnsearch: 724 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/sift-128-euclidean.hdf5 /test -s 10 -k 100 -t
[Timing]: knnsearch: 637 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/sift-128-euclidean.hdf5 /test -s 12 -k 100 -t
[Timing]: knnsearch: 596 [msec]
# OMP-mnist over threads
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/mnist-784-euclidean.hdf5 /test -s 1 -k 100 -t
[Timing]: knnsearch: 7918 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/mnist-784-euclidean.hdf5 /test -s 2 -k 100 -t
[Timing]: knnsearch: 3941 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -k 100 -t
[Timing]: knnsearch: 2031 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/mnist-784-euclidean.hdf5 /test -s 6 -k 100 -t
[Timing]: knnsearch: 1395 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/mnist-784-euclidean.hdf5 /test -s 8 -k 100 -t
[Timing]: knnsearch: 1415 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/mnist-784-euclidean.hdf5 /test -s 10 -k 100 -t
[Timing]: knnsearch: 1330 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_omp -c mtx/mnist-784-euclidean.hdf5 /test -s 12 -k 100 -t
[Timing]: knnsearch: 1292 [msec]
# CILK-sift over threads
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/sift-128-euclidean.hdf5 /test -s 1 -k 100 -t
[Timing]: knnsearch: 4183 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/sift-128-euclidean.hdf5 /test -s 2 -k 100 -t
[Timing]: knnsearch: 2076 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/sift-128-euclidean.hdf5 /test -s 4 -k 100 -t
[Timing]: knnsearch: 1114 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/sift-128-euclidean.hdf5 /test -s 6 -k 100 -t
[Timing]: knnsearch: 886 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/sift-128-euclidean.hdf5 /test -s 8 -k 100 -t
[Timing]: knnsearch: 706 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/sift-128-euclidean.hdf5 /test -s 10 -k 100 -t
[Timing]: knnsearch: 742 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/sift-128-euclidean.hdf5 /test -s 12 -k 100 -t
[Timing]: knnsearch: 636 [msec]
# CILK-mnist over threads
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/mnist-784-euclidean.hdf5 /test -s 1 -k 100 -t
[Timing]: knnsearch: 7745 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/mnist-784-euclidean.hdf5 /test -s 2 -k 100 -t
[Timing]: knnsearch: 3883 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -k 100 -t
[Timing]: knnsearch: 2020 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/mnist-784-euclidean.hdf5 /test -s 6 -k 100 -t
[Timing]: knnsearch: 1454 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/mnist-784-euclidean.hdf5 /test -s 8 -k 100 -t
[Timing]: knnsearch: 1436 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/mnist-784-euclidean.hdf5 /test -s 10 -k 100 -t
[Timing]: knnsearch: 1342 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_cilk -c mtx/mnist-784-euclidean.hdf5 /test -s 12 -k 100 -t
[Timing]: knnsearch: 1292 [msec]
# Pthreads-sift over threads
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/sift-128-euclidean.hdf5 /test -s 1 -k 100 -t
[Timing]: knnsearch: 4254 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/sift-128-euclidean.hdf5 /test -s 2 -k 100 -t
[Timing]: knnsearch: 2155 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/sift-128-euclidean.hdf5 /test -s 4 -k 100 -t
[Timing]: knnsearch: 1133 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/sift-128-euclidean.hdf5 /test -s 6 -k 100 -t
[Timing]: knnsearch: 877 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/sift-128-euclidean.hdf5 /test -s 8 -k 100 -t
[Timing]: knnsearch: 724 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/sift-128-euclidean.hdf5 /test -s 10 -k 100 -t
[Timing]: knnsearch: 640 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/sift-128-euclidean.hdf5 /test -s 12 -k 100 -t
[Timing]: knnsearch: 682 [msec]
# Pthreads-mnist over threads
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/mnist-784-euclidean.hdf5 /test -s 1 -k 100 -t
[Timing]: knnsearch: 7889 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/mnist-784-euclidean.hdf5 /test -s 2 -k 100 -t
[Timing]: knnsearch: 3963 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/mnist-784-euclidean.hdf5 /test -s 4 -k 100 -t
[Timing]: knnsearch: 2058 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/mnist-784-euclidean.hdf5 /test -s 6 -k 100 -t
[Timing]: knnsearch: 1445 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/mnist-784-euclidean.hdf5 /test -s 8 -k 100 -t
[Timing]: knnsearch: 1496 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/mnist-784-euclidean.hdf5 /test -s 10 -k 100 -t
[Timing]: knnsearch: 1379 [msec]
hoo2@shirka:~/Work/AUTH/PDS/homework_1$ eval $DOCK ./out/knnsearch_v1_pth -c mtx/mnist-784-euclidean.hdf5 /test -s 12 -k 100 -t
[Timing]: knnsearch: 1352 [msec]

View File

@ -46,5 +46,197 @@
\section{Εισαγωγή}
Η παρούσα εργασία αφορά τον παραλληλισμό συστημάτων με κοινή μνήμη.
Το αντικείμενό της είναι ο παραλληλισμός του αλγορίθμου εύρεσης κοντινότερων γειτόνων \textbf{knnsearch}.
Στην εργασία αυτή υλοποιούμε τόσο μια σειριακή προσέγγιση όσο και μια που παραλληλοποιείται με \textbf{pthreads}, με \textbf{open-cilk} και με \textbf{open-mp}.
Στην παραλληλοποιήσιμη έκδοση δεν μας ενδιαφέρει η ακρίβεια του αλγορίθμου, κάτι που μας δίνει την δυνατότητα να βελτιώσουμε τον χρόνο εκτέλεσης.
\section{Παραδοτέα}
Τα παραδοτέα της εργασίας αποτελούνται από:
\begin{itemize}
\item Την παρούσα αναφορά.
\item Το \href{https://git.hoo2.net/hoo2/PDS/src/branch/master/homework_1}{σύνδεσμο με το αποθετήριο} που περιέχει τον κώδικα για την παραγωγή των εκτελέσιμων, της αναφοράς και τις μετρήσεις.
\item Τον \href{https://hub.docker.com/r/hoo2/hpcimage}{σύνδεσμο} με το image με το οποίο μεταγλωττίσαμε και εκτελέσαμε τις μετρήσεις.
\end{itemize}
\section {Υλοποίηση}
Πριν ξεκινήσουμε να αναλύουμε τον παραλληλισμό και τις μετρήσεις καλό θα ήταν να αναφερθούμε στην υλοποίηση.
Για την παρούσα εργασία χρησιμοποιήσαμε τη γλώσσα \textbf{C++} και πέρα από τις βιβλιοθήκες που ζητούνται από την εκφώνηση (cilk/omp) έγινε και χρήση της \textbf{openblas}.
Βοηθητικά έγινε επίσης χρήση του \textbf{bash} και της \textbf{matlab} κυρίως για την λήψη και οπτικοποίηση των αποτελεσμάτων.
Στην παρούσα εργασία γίνεται ευρεία χρήση πινάκων τόσο για εισαγωγή και εξαγωγή των δεδομένων, όσο και για τους υπολογισμούς.
Για το λόγο αυτό θεωρήσαμε ότι έπρεπε πρώτα να υλοποιήσουμε κάποιες αφαιρέσεις που θα μας επέτρεπαν να χειριστούμε ευκολότερα το πρόβλημα
\subsection{Ο τύπος Matrix<>}
Η πιο βασική αφαίρεση που υλοποιήσαμε είναι ο τύπος \textbf{Matrix<>}.
Πρόκειται για μια αναπαράσταση πινάκων, είτε μονοδιάστατων είτε δισδιάστατων.
Η υλοποίησή του έγινε με χρήση templates και έτσι μπορούμε να κατασκευάζουμε πίνακες οποιουδήποτε τύπου.
Η αφαίρεση εσωτερικά κάνει χρήση μονοδιάστατης αποθήκευσης των δεδομένων με αποτέλεσμα να έχουμε την δυνατότητα να διαλέγουμε εμείς το ordering του πίνακα.
Αν είναι δηλαδή column-major ή row-major.
Στην αφαίρεση δημιουργούμε πίνακες στους οποίους τον έλεγχο της μνήμης τον έχουμε εσωτερικά στο αντικείμενο.
Για την αποθήκευση χρησιμοποιούμε \textbf{std::vector} κάτι που μας δίνει ευελιξία στο μέγεθος αλλά και στη χρήση.
\par
Η κύρια όμως λειτουργικότητα που καθιστά την αφαίρεση βολική για την συγκεκριμένη εργασία είναι η δυνατότητα να κατασκευάσουμε αντικείμενα των οποίων η μνήμη είναι και εξωτερικά του αντικειμένου.
Να λειτουργήσει δηλαδή ο ίδιος ο τύπος ώς viewer, όπως για παράδειγμα το \href{https://en.cppreference.com/w/cpp/header/string_view}{string view}, χωρίς όμως να χρειάζεται να έχουμε διαφορετικό τύπο για αυτή τη δουλειά.
Σε αυτή την περίπτωση στο εσωτερικό storage δεν αποθηκεύουμε παρά μόνο μια διεύθυνση στα δεδομένα του άλλου πίνακα.
Μπορούμε όμως να χρησιμοποιήσουμε όλη την λειτουργικότητα του πίνακα.
Αυτό μας δίνει τη δυνατότητα να δημιουργούμε αντικείμενα που “βλέπουν” σε ολόκληρους ή και σε \textbf{slices }από πίνακες.
Για παράδειγμα μπορούμε να γράψουμε:
\begin{verbatim}
Matrix<double> A(4,5); // Πίνακας 4x5
Matrix<double> vA(A, 0, 2, 5); // View στις 2 πρώτες γραμμές του Α
for (int i=0 ; i<vA.rows() ; ++i)
for (int j=0 ; j<vA.columns() ; ++j)
std::cout << A(i, j);
\end{verbatim}
Στον παραπάνω παράδειγμα ο πίνακας με τα δεδομένα είναι ο \textbf{Α}.
o \textbf{vA}, είναι ένα \textit{slice} του A, που δείχνει μόνο στις 2 πρώτες γραμμές του Α.
\subsection{knnsearch version 0}
Για το πρώτο σκέλος της άσκησης είχαμε να παρουσιάσουμε την knnsearch με σειριακό τρόπο.
Για το σκοπό αυτό υλοποιήσαμε την \textbf{v0::knnsearch}.
Η συνάρτηση αυτή κάνει χρήση του παραπάνω τύπου (Matrix) και επομένως είναι και η ίδια template.
Η λειτουργία της χωρίζεται σε 3 βασικά μέρη:
\begin{itemize}
\item Τον υπολογισμό του \textbf{πίνακα αποστάσεων}.
Εδώ βρίσκεται ο κύριος υπολογιστικός όγκος της συνάρτησης, λόγω του πολλαπλασιασμού πινάκων που απαιτείται.
Για το λόγο αυτό κάνουμε χρήση της openblas.
\item Την \textbf{αντιστοίχηση των αποστάσεων} των σημείων \textbf{με τους δείκτες (indexes)} αυτών των αποστάσεων στον αρχικό πίνακα Corpus.
Εδώ ουσιαστικά \textit{“ζευγαρώνουμε”} τον κάθε δείκτη με τη απόσταση που του αναλογεί σε μια δομή \textbf{std::pair<>}.
Αυτό το κάνουμε ώστε ταξινομώντας ένα διάνυσμα με τέτοια ζευγάρια ώς προς τη μικρότερη απόσταση, να ταξινομούνται αυτόματα και οι δείκτες.
Γιαυτό το σκοπό κατασκευάζουμε για κάθε σημείο του Query, ένα διάνυσμα τέτοιων ζευγαριών.
\item Την \textbf{μερική ταξινόμηση} του παραπάνω διανύσματος.
Εδώ κάνουμε χρήση του αλγορίθμου quick-select.
Ουσιαστικά ταξινομούμε μόνο ένα κομμάτι του διανύσματος όσα τα στοιχεία που θέλουμε (τον αριθμό των γειτόνων που ψάχνουμε).
Ο αλγόριθμος αυτός είναι γραμμικός, περιορίζοντας έτσι λιγάκι τη “χασούρα”.
Για το σκοπό αυτό χρησιμοποιήσαμε την std::nth\_element().
\end{itemize}
Στην υλοποίησή μας τα Corpus-Query είναι διαφορετικά, κάτι που δεν μας περιορίζει να λύσουμε το πρόβλημα όπου το Corpus == Query φυσικά.
\subsection{knnsearch version 1}
Για το δεύτερο σκέλος της εργασίας είχαμε να παρουσιάσουμε την knnsearch την οποία θα παραλληλοποιήσουμε.
Για το σκοπό αυτό υλοποιήσαμε την \textbf{v1::knnsearch}.
Η αρχική μας προσέγγιση ήταν \textbf{αναδρομική}.
Ουσιαστικά χωρίζαμε τα Corpus-Queries σε slices και για καθένα από αυτά καλούσαμε ξανά τον εαυτό μας.
Η αναδρομή τερματιζόταν όταν τα slices ήταν αρκετά μικρά ώστε να τρέξει ο σειριακός αλγόριθμος.
Για να μην χάνονται όμως γείτονες η knnsearch έπρεπε να τρέχει για όλους τους συνδυασμούς.
Το κάθε Corpus με τα δύο διαφορετικά Queries και το κάθε Query αντίστοιχα.
Έπειτα συνενώναμε τα αποτελέσματα.
Αν και αυτή η προσέγγιση ήταν η ακριβής, μας έδινε όμως τη δυνατότητα να “φτηνύνουμε” τον τρόπο με τον οποίο ενώναμε τα αποτελέσματα.
Επίσης για την περίπτωση που το Query ήταν ίδιο με το Corpus, μας προσφέρεται η δυνατότητα να αποφύγουμε έναν από τους 4 συνδυασμούς.
\par
Η προσέγγιση αυτή, αν και απλή, δεν βολεύει για παραλληλοποίηση, καθώς τα νήματα που πρέπει να δημιουργηθούν βρίσκονται μέσα στην αναδρομή, καθιστώντας δύσχρηστη(όχι αδύνατη) τη χρήση των εργαλείων της cilk και της openMP.
Ακόμα και το “μέρος” στο οποίο θα έμπαινε η loop-α με τις join για την περίπτωση των pthreads ήταν περίπλοκο.
Για το σκοπό αυτό \textbf{“ανοίξαμε” την αναδρομή} σε βρόχο επανάληψης.
Σε αυτή την υλοποίηση τα Corpus-Queries, χωρίζονται σε ένα αριθμό από slices που μας δίνεται από την γραμμή εντολών.
Για όλα αυτά σε ένα διπλό βρόχο καλούμε την σειριακή knnsearch.
Ο διπλός βρόχος ουσιαστικά δημιουργεί όλους τους συνδυασμούς των slice από το Corpus με τα slices από το Query.
Έτσι πάλι δεν χάνουμε γείτονες.
Τα αποτελέσματα συνενώνονται όπου και κρατούνται οι κοντινότεροι γείτονες.
Η διαδικασία επιλογής είναι ίδια με τη σειριακή.
Γίνεται δηλαδή πάλι χρήση της quick-select.
Για να μειώσουμε την ακρίβεια και εδώ μπορούμε να “φτωχύνουμε” το merge.
Για το σκοπό αυτό έχουμε ένα όρισμα στη συνάρτηση το οποίο μας κόβει τον αριθμό των κοντινότερων γειτόνων.
Σε αυτή την προσέγγιση υπάρχει ακόμα ένα ουσιαστικότερο όφελος.
Ο χωρισμός σε slices μας δίνει την δυνατότητα να δημιουργήσουμε threads που δεν θα χρειαστούν ποτέ κομμάτια μνήμης από άλλα threads.
Αυτό ο διαχωρισμός γίνεται στον έξω βρόχο επανάληψης της knnsearch
\begin{verbatim}
#pragma omp parallel for
for (size_t qi = 0; qi < num_slices; ++qi)
worker_body (corpus_slices, query_slices, idx, dst, qi,
num_slices, corpus_slice_size, query_slice_size, k, m);
\end{verbatim}
Για τα το παραπάνω snippet (που είναι και “καρδιά” του αλγόριθμου), ο κάθε worker δουλεύει σε δικό του corpus και query slice και αποθηκεύει τα δεδομένα σε δικό του slice πίσω στους idx και dst.
Με αυτό τον τρόπο δεν χρειαζόμαστε κανέναν συγχρονισμό για τα threads.
\subsection{Μετρήσεις}
Για την παραγωγή των εκτελέσιμων χρησιμοποιήσαμε το προσωπικό μας(μου) laptop.
Στον κατάλογο με τον πηγαίο κώδικα υπάρχει ότι είναι απαραίτητο για την μεταγλώττιση και εκτέλεση του κώδικα.
Για τη μεταγλώττιση κάναμε χρήση του image hpcimage που βρίσκεται \href{https://hub.docker.com/r/hoo2/hpcimage}{εδώ}.
Για την μεταγλώττιση και εκτέλεση μπορεί κάποιος να δώσει τις παρακάτω εντολές στο root κατάλογο “homework\_1”.
\begin{verbatim}
DOCK="docker run --rm -v <PATH_to_homework_1>:/usr/src/PDS_homework_1 \
-w /usr/src/PDS_homework_1/ hoo2/hpcimage"
make -j IMAGE=hoo2/hpcimage v0
make -j IMAGE=hoo2/hpcimage v1
make -j IMAGE=hoo2/hpcimage v1_cilk
make -j IMAGE=hoo2/hpcimage v1_omp
make -j IMAGE=hoo2/hpcimage v1_pth
# run pthreads version with 4 slices
eval $DOCK ./out/knnsearch_v1_pth -c <path to hdf5> /test -k 100 -s 4 -t
\end{verbatim}
\section{Αποτελέσματα}
Παρακάτω παραθέτουμε τα αποτελέσματα των μετρήσεων.
Για αυτές:
\begin{itemize}
\item Εκτελέσαμε την κάθε έκδοση του προγράμματος της v1 για\textbf{ διαφορετικό αριθμό από threads} για δύο διαφορετικούς πίνακες(all-to-all) ώστε να δούμε την συμπεριφορά.
Τα threads μπορούν να περαστούν από τη γραμμή εντολών.
\item Εκτελέσαμε την κάθε έκδοση του προγράμματος για δύο διαφορετικούς πίνακες(all-to-all) για \textbf{διαφορετική επιθυμητή ακρίβεια}.
\end{itemize}
Για αντιπαραβολή παραθέτουμε ότι στο ίδιο μηχάνημα στο οποίο πήραμε τις μετρήσεις η έκδοση knnsearch του \textbf{matlab} κάνει:
\begin{itemize}
\item sift: \textbf{3.223628 [sec]}
\item mnist: \textbf{20.634987 [sec]}
\end{itemize}
\subsection{Επίδραση των threads - OpenMP}
Για την έκδοση με την openMP, χρησιμοποιώντας διαφορετικό αριθμό threads έχουμε: \\
\includegraphics[width=\textwidth]{../measurements/OMP_over_threads.png}
\captionof{figure}{Βελτιστοποίηση από την παραλληλοποίηση [openMP].}
\label{fig:openMP_th}
\subsection{Επίδραση των threads - openCilk}
Για την έκδοση με την Cilk έχουμε: \\
\includegraphics[width=\textwidth]{../measurements/CILK_over_threads.png}
\captionof{figure}{Βελτιστοποίηση από την παραλληλοποίηση [open-cilk].}
\label{fig:cilk_th}
\subsection{Επίδραση των threads - pthreads}
Για την έκδοση με τα pthreads έχουμε: \\
\includegraphics[width=\textwidth]{../measurements/Pthreads_over_threads.png}
\captionof{figure}{Βελτιστοποίηση από την παραλληλοποίηση [pthreads].}
\label{fig:pthreads_th}
\subsection{Επίδραση της ακρίβειας - OpenMP}
Για την έκδοση με την openMP όσο αλλάζουμε την ακρίβεια έχουμε: \\
\includegraphics[width=\textwidth]{../measurements/OMP_over_accuracy.png}
\captionof{figure}{Βελτιστοποίηση λόγω μειωμένης ακρίβειας [openMP].}
\label{fig:openMP_acc}
\subsection{Επίδραση της ακρίβειας - openCilk}
Για την έκδοση με την Cilk όσο αλλάζουμε την ακρίβεια έχουμε: \\
\includegraphics[width=\textwidth]{../measurements/CILK_over_accuracy.png}
\captionof{figure}{Βελτιστοποίηση λόγω μειωμένης ακρίβειας [open-cilk].}
\label{fig:cilk_acc}
\subsection{Επίδραση της ακρίβειας - pthreads}
Για την έκδοση με τα pthreads όσο αλλάζουμε την ακρίβεια έχουμε: \\
\includegraphics[width=\textwidth]{../measurements/Pthreads_over_accuracy.png}
\captionof{figure}{Βελτιστοποίηση λόγω μειωμένης ακρίβειας [pthreads].}
\label{fig:pthreads_acc}
\section{Συμπεράσματα}
%\justifying
Απ' ότι βλέπουμε παραπάνω, ενώ ο χρόνος βελτιώνεται καθώς χρησιμοποιούμε παραπάνω threads, αυτό δεν συμβαίνει όταν μειώνουμε την ακρίβεια.
Τουλάχιστον όχι όσο θα θέλαμε.
Φυσικά αυτό έχει να κάνει με την υλοποίηση μας, καθώς δεν επιλέξαμε καλή τεχνική συνένωσης.
Στην υλοποίησή μας δεν έγινε πολύ επιθετική στρατηγική μείωσης της ακρίβειας των αποτελεσμάτων.
Να πούμε επίσης ότι ο συγκεκριμένος αλγόριθμος, λόγω της ουσιαστικά “μη” επικοινωνίας που χρειάζονται τα threads, θα μπορούσε να χρησιμοποιηθεί και σε απομακρυσμένα συστήματα κάνοντας χρήση του MPI.
\end{document}

View File

@ -65,9 +65,15 @@ bool get_options(int argc, char* argv[]){
else if (arg == "-k") {
session.k = (i+1 < argc) ? std::atoi(argv[++i]) : session.k;
}
else if (arg == "-n" || arg == "--max_trheads") {
else if (arg == "-n" || arg == "--max_threads") {
session.max_threads = (i+1 < argc) ? std::atoi(argv[++i]) : session.max_threads;
}
else if (arg == "-s" || arg == "--slices") {
session.slices = (i+1 < argc) ? std::atoi(argv[++i]) : session.slices;
}
else if (arg == "-a" || arg == "--accuracy") {
session.accuracy = (i+1 < argc) ? std::atoi(argv[++i]) : session.accuracy;
}
else if (arg == "-t" || arg == "--timing")
session.timing = true;
else if (arg == "-v" || arg == "--verbose")
@ -87,7 +93,12 @@ bool get_options(int argc, char* argv[]){
std::cout << " -k <number>\n";
std::cout << " Set the number of closest neighbors to find. \n\n";
std::cout << " -n | --max_trheads <threads>\n";
std::cout << " Reduce the thread number for the execution to <threads>. <threads> must be less or equal to available CPUs.\n\n";
std::cout << " Reduce the thread number for the execution to <threads>. <threads> should be less or equal to available CPUs.\n\n";
std::cout << " -s | --slices <slices/threads>\n";
std::cout << " The number of slices to the Corpus matrix. In the parallel version this setting affects the number of threads\n";
std::cout << " <threads> should be less or equal to available CPUs\n\n";
std::cout << " -a | --accuracy <accuracy>\n";
std::cout << " Reduce the accuracy of neighbor finding. The accuracy should be between 1-100 \n\n";
std::cout << " -t | --timing\n";
std::cout << " Request timing measurements output to stdout.\n\n";
std::cout << " -v | --verbose\n";
@ -109,6 +120,42 @@ bool get_options(int argc, char* argv[]){
return status;
}
/*!
 * Matrix load
 *
 * Loads the session's input matrices from HDF5 and clears any stale
 * output file so a subsequent store starts from scratch.
 *
 * \fn void loadMtx(MatrixDst&, MatrixDst&)
 * \param Corpus matrix to load to
 * \param Query matrix to load to
 */
void loadMtx(MatrixDst& Corpus, MatrixDst& Query) {
    // Remove a pre-existing output file before (re)running the search.
    if (access(session.outMtxFile.c_str(), F_OK) == 0)
        std::remove(session.outMtxFile.c_str());
//    timer.start();
    Mtx::load<MatrixDst, DstHDF5Type>(session.corpusMtxFile, session.corpusDataSet, Corpus);
    if (session.queryMtx)
        // NOTE(review): this loads the *corpus* file/dataset into Query as well.
        // If the session carries a separate query file/dataset, it looks like it
        // should be used here instead — confirm against the session definition.
        Mtx::load<MatrixDst, DstHDF5Type>(session.corpusMtxFile, session.corpusDataSet, Query);
//    timer.stop();
//    timer.print_dt("Load hdf5 files");
}
/*!
 * Matrix store
 *
 * Writes both result parts of the search into the session's output HDF5
 * file: first the neighbor indexes, then the corresponding distances.
 *
 * \fn void storeMtx(MatrixIdx&, MatrixDst&)
 * \param Idx Index part(neighbors) of the matrix to store
 * \param Dst Distances part of the matrix to store
 */
void storeMtx(MatrixIdx& Idx, MatrixDst& Dst) {
    // Index dataset goes out first, distances follow into the same file.
    Mtx::store<MatrixIdx, IdxHDF5Type>(session.outMtxFile, session.outMtxIdxDataSet, Idx);
    Mtx::store<MatrixDst, DstHDF5Type>(session.outMtxFile, session.outMtxDstDataSet, Dst);
}
#ifndef TESTING
int main(int argc, char* argv[]) try {
// Instantiate matrixes
@ -127,38 +174,26 @@ int main(int argc, char* argv[]) try {
if (!get_options(argc, argv))
exit(1);
if (access(session.outMtxFile.c_str(), F_OK) == 0)
std::remove(session.outMtxFile.c_str());
init_workers();
// Load data
timer.start();
Mtx::load<MatrixDst, DstHDF5Type>(session.corpusMtxFile, session.corpusDataSet, Corpus);
if (session.queryMtx)
Mtx::load<MatrixDst, DstHDF5Type>(session.corpusMtxFile, session.corpusDataSet, Query);
timer.stop();
timer.print_dt("Load hdf5 files");
loadMtx(Corpus, Query);
// Prepare output memory
Idx.resize(Query.rows(), session.k);
Dst.resize(Query.rows(), session.k);
Idx.resize((session.queryMtx) ? Query.rows() : Corpus.rows(), session.k);
Dst.resize((session.queryMtx) ? Query.rows() : Corpus.rows(), session.k);
// Do the search
logger << "Start knnsearch ...";
timer.start();
if (session.queryMtx)
knnsearch(Corpus, Query, 0, session.k, session.k, Idx, Dst);
else
knnsearch(Corpus, Corpus, 0, session.k, session.k, Idx, Dst);
size_t selected_neighbors = (size_t)(session.k*(session.accuracy/100.0));
knnsearch(Corpus, (session.queryMtx) ? Query : Corpus, session.slices, session.k, selected_neighbors, Idx, Dst);
timer.stop();
logger << " Done" << logger.endl;
timer.print_dt("knnsearch");
// Store data
timer.start();
Mtx::store<MatrixIdx, IdxHDF5Type>(session.outMtxFile, session.outMtxIdxDataSet, Idx);
Mtx::store<MatrixDst, DstHDF5Type>(session.outMtxFile, session.outMtxDstDataSet, Dst);
timer.stop();
timer.print_dt("Store hdf5 files");
storeMtx(Idx, Dst);
return 0;
}

54
homework_1/src/v1.cpp Normal file
View File

@ -0,0 +1,54 @@
/**
 * \file v1.cpp
 * \brief Implementation file for the v1 (parallel) knnsearch support routines.
 *
 * \author
 * Christos Choutouridis AEM:8997
 * <cchoutou@ece.auth.gr>
 */
#include "v1.hpp"
/*!
 * \fn void init_workers()
 *
 * Initialize worker settings.
 *
 * Resolves session.max_threads and session.slices against each other and
 * against the runtime's available workers, depending on which threading
 * back-end (CILK / OMP / PTHREADS) the binary was built with.
 */
void init_workers() {
#if defined CILK
    // Cilk: the runtime picks the worker count; only fill in our defaults.
    size_t cilk_w = __cilkrts_get_nworkers();
    if (!session.max_threads)
        session.max_threads = (session.slices) ? (session.slices) : cilk_w;
//    else if (session.max_threads < cilk_w)
//       __cilkrts_set_param("nworkers", "4");
    // else ignored by cilk
#elif defined OMP
//    omp_set_dynamic(1);
    // OpenMP: an explicit -n request below the pool size caps the pool.
    size_t omp_w = (size_t)omp_get_max_threads();
    if (!session.max_threads) {
        session.max_threads = (session.slices) ? (session.slices) : omp_w;
//        omp_set_dynamic(1);
    }
    else if (session.max_threads < omp_w) {
//        omp_set_dynamic(0);
        omp_set_num_threads(session.max_threads);
    }
    // else ignored by omp
#elif defined PTHREADS
    // pthreads: default to one thread per hardware context.
    size_t pth_w = std::thread::hardware_concurrency();
    if (!session.max_threads)
        session.max_threads = (session.slices) ? (session.slices) : pth_w;
#else
    // Serial build: nothing to configure.
#endif
    // No explicit slicing requested: default to one slice per worker thread.
    if (!session.slices)
        session.slices = session.max_threads;
    openblas_set_num_threads(1); // Limit OpenBLAS to 1 thread
}

23
homework_2/.gitignore vendored Normal file
View File

@ -0,0 +1,23 @@
# project
bin/
out/
mat/
mtx/
.unused/
various/
# hpc
# IDEs
.idea/
.clangd
# eclipse
.project
.cproject
.settings/
.vs/
.vscode/

221
homework_2/Makefile Normal file
View File

@ -0,0 +1,221 @@
#
# PDS homework_2 Makefile
#
# Copyright (C) 2024 Christos Choutouridis <christos@choutouridis.net>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# ============== Project settings ==============
# Project's name
PROJECT := PDS_homework_2
# Executable's name
TARGET := distbitonic
# Source directories list(space seperated). Makefile-relative path, UNDER current directory.
SRC_DIR_LIST := src
# Include directories list(space seperated). Makefile-relative path.
INC_DIR_LIST := include \
/usr/lib/x86_64-linux-gnu/openmpi/include/ \
src
# Exclude files list(space seperated). Filenames only.
# EXC_FILE_LIST := bad.cpp old.cpp
# Build directories
BUILD_DIR := bin
OBJ_DIR := $(BUILD_DIR)/obj
DEP_DIR := $(BUILD_DIR)/.dep
# ========== Compiler settings ==========
# Compiler flags for debug and release
DEB_CFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c11
REL_CFLAGS := -Wall -Wextra -O3 -std=c11
DEB_CXXFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c++17
REL_CXXFLAGS := -Wall -Wextra -O3 -std=c++17
# Pre-defines
# PRE_DEFS := MYCAB=1729 SUPER_MODE
PRE_DEFS :=
# ============== Linker settings ==============
# Linker flags (example: -pthread -lm)
LDFLAGS := -pthread
# Map output file
MAP_FILE := output.map
MAP_FLAG := -Xlinker -Map=$(BUILD_DIR)/$(MAP_FILE)
# ============== Docker settings ==============
# We need:
# - Bind the entire project directory (the dir that includes all the code) as volume.
# - In docker instance, change to working directory(where the makefile is).
DOCKER_VOL_DIR := $(shell pwd)
DOCKER_WRK_DIR :=
DOCKER_RUN := docker run --rm
DOCKER_FLAGS := -v $(DOCKER_VOL_DIR):/usr/src/$(PROJECT) -w /usr/src/$(PROJECT)/$(DOCKER_WRK_DIR)
# docker invoke mechanism (edit with care)
# note:
# Here, `DOCKER` variable is empty. Rules can assign `DOCKER := DOCKER_CMD` when docker
# functionality is needed.
DOCKER_CMD = $(DOCKER_RUN) $(DOCKER_FLAGS) $(IMAGE)
DOCKER :=
# ============== Tool selection ==============
# compiler and compiler flags.
CSIZE := size
CFLAGS := $(DEB_CFLAGS)
CXXFLAGS := $(DEB_CXXFLAGS)
CXX := g++ #mpic++
CC := gcc #mpicc
#
# =========== Main body and Patterns ===========
#
#ifeq ($(OS), Windows_NT)
# TARGET := $(TARGET).exe
#endif
INC := $(foreach dir,$(INC_DIR_LIST),-I$(dir))
DEF := $(foreach def,$(PRE_DEFS),-D$(def))
EXC := $(foreach fil,$(EXC_FILE_LIST), \
$(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/$(fil))) \
)
# source files. object and dependencies list
# recursive search into current and source directories
SRC := $(wildcard *.cpp)
SRC += $(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/*.cpp))
SRC += $(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/**/*.cpp))
SRC := $(filter-out $(EXC),$(SRC))
#SRC := $(abspath $(SRC))
OBJ := $(foreach file,$(SRC:%.cpp=%.o),$(OBJ_DIR)/$(file))
DEP := $(foreach file,$(SRC:%.cpp=%.d),$(DEP_DIR)/$(file))
# Make Dependencies pattern.
# This little trick enables recompilation only when dependencies change
# and it does so for changes both in source AND header files ;)
#
# It is based on Tom Tromey's method.
#
# Invoke cpp to create makefile rules with dependencies for each source file
$(DEP_DIR)/%.d: %.c
@mkdir -p $(@D)
@$(DOCKER) $(CC) -E $(CFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.c=.o) -MF $@ $<
# c file objects depent on .c AND dependency files, which have an empty recipe
$(OBJ_DIR)/%.o: %.c $(DEP_DIR)/%.d
@mkdir -p $(@D)
@$(DOCKER) $(CC) -c $(CFLAGS) $(INC) $(DEF) -o $@ $<
$(DEP_DIR)/%.d: %.cpp
@mkdir -p $(@D)
@$(DOCKER) $(CXX) -E $(CXXFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.cpp=.o) -MF $@ $<
# cpp file objects depent on .cpp AND dependency files, which have an empty recipe
$(OBJ_DIR)/%.o: %.cpp $(DEP_DIR)/%.d
@mkdir -p $(@D)
@$(DOCKER) $(CXX) -c $(CXXFLAGS) $(INC) $(DEF) -o $@ $<
# empty recipe for dependency files. This prevents make errors
$(DEP):
# now include all dependencies
# After all they are makefile dependency rules ;)
include $(wildcard $(DEP))
# main target rule
$(BUILD_DIR)/$(TARGET): $(OBJ)
@mkdir -p $(@D)
@echo Linking to target: $(TARGET)
@echo $(DOCKER) $(CXX) '$$(OBJ)' $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET)
@$(DOCKER) $(CXX) $(OBJ) $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET)
@echo
@echo Print size information
@$(CSIZE) $(@D)/$(TARGET)
@echo Done
.PHONY: clean
clean:
@echo Cleaning build directories
@rm -rf $(OBJ_DIR)
@rm -rf $(DEP_DIR)
@rm -rf $(BUILD_DIR)
#
# ================ Local build rules =================
# example:
# make debug
debug: CFLAGS := $(DEB_CFLAGS)
debug: $(BUILD_DIR)/$(TARGET)
release: CFLAGS := $(REL_CFLAGS)
release: $(BUILD_DIR)/$(TARGET)
all: release
hpc-results/post:
$(CXX) $(CFLAGS) -o $@ hpc-results/main.cpp
hpc-clean:
rm hpc-results/post
#
# ================ Local (and/or) via docker build rules =================
#
# examples:
# make IMAGE=hpcimage v0
# make IMAGE=hpcimage v1_cilk
#
dist_v05: CC := mpicc
dist_v05: CXX := mpic++
dist_v05: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=50
dist_v05: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=50
dist_v05: TARGET := dist_v05
dist_v05: $(BUILD_DIR)/$(TARGET)
dist_v1: CC := mpicc
dist_v1: CXX := mpic++
dist_v1: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=100
dist_v1: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=100
dist_v1: TARGET := dist_v1
dist_v1: $(BUILD_DIR)/$(TARGET)
#
# ========= Inside CSAL Image build rules ===========
#
# 1) first jump into image (make sure you are in the directory where Makefile is):
# > docker run -it -v ${PWD}:/usr/src/exercise_1 -w /usr/src/exercise_1/ hpcimage
# 2) Clean binaries first **important**
# > make clean
# 3) for v4 cilk for example:
# > make csal_v4_cilk
# 4) run executables from `bin/`. Examples:
# > ./bin/tcount_ompv3 -i mtx/NACA0015.mtx --timing -r 3 -o /dev/null
# > ./bin/tcount_pthv4 -i mtx/com_Youtube.mtx --timing --dynamic --print_count
#
# ======== Run from container =========
#
# examples:
#
# make IMAGE=hpcimage EXEC=knnsearch_v1 run
# make IMAGE=hpcimage EXEC=knnsearch_v1 run
#

33
homework_2/exersize.md Normal file
View File

@ -0,0 +1,33 @@
Parallel & Distributed Computer Systems HW2
December 6, 2024
Write a distributed program that sorts $N$ integers in ascending order, using MPI. The inter-process communications should be defined by the Bitonic sort algorithm as presented in our class notes.
The program must perform the following tasks:
- The user specifies two positive integers $q$ and $p$.
- Start $2^p$ processes, each holding an array of $2^q$ random integers.
- Sort all $N = \left( 2^{(q + p)} \right)$ elements in ascending order.
- Check the correctness of the final result.
Your implementation should be based on the following steps:
- Start p processes, each process gets n/p data and sorts (ascending or descending depending on the process id) them using sort from any library. Use two buffers, to sort from one to the other, to send from one and receive in the other.
- Repeat $O((log_2(p))^2)$:
* Exchange data with the corresponding partner and keep the min or max elements, depending on the process id and phase of the computation
* Sort locally the bitonic sequence using a modification of merge sort, starting from the "elbow" of the bitonic.
You may use the C standard library function stdlib qsort() to check the correctness of your results and as the initial local sorting routine for each process.
You must deliver:
- A report (about $3-4$ pages) that describes your parallel algorithm and implementation.
- Your comments on the speed of your parallel program compared to the serial sort, after trying your program on aristotelis for $p = [1:7]$ and $q = [20:27]$.
- The source code of your program uploaded online.
Ethics: If you use code found on the web or by an LLM, you should mention your source and the changes you made. You may work in pairs; both partners must submit a single report with both names.
Deadline: 7 January, $2025$.

View File

@ -0,0 +1,39 @@
/*!
 * \file config.h
 * \brief Build configuration file.
 *
 * \author
 *    Christos Choutouridis AEM:8997
 *    <cchoutou@ece.auth.gr>
 */
#ifndef CONFIG_H_
#define CONFIG_H_

#include <iostream>
#include <string>

/*
 * Defines for the different versions of the exercise
 */
#define V50    (50)
#define V100   (100)

// Fail-safe version selection: default to the final (v1) version.
// Fix: the previous default was `V1`, an identifier that is never defined,
// so every `#if CODE_VERSION == Vxx` comparison silently evaluated it as 0
// and matched no version.
#if !defined CODE_VERSION
#define CODE_VERSION V100
#endif

/*!
 * Session options for each invocation of the executable
 */
struct session_t {
   bool timing {false};    //!< Enable timing reports (inferred from name; confirm at call sites)
   bool verbose {false};   //!< Flag to enable verbose output to stdout
};

extern session_t session;  //!< Single global session object, defined elsewhere

#endif /* CONFIG_H_ */

View File

@ -0,0 +1,804 @@
/**
* \file matrix.hpp
* \brief A matrix abstraction implementation
*
* \author
* Christos Choutouridis AEM:8997
* <cchoutou@ece.auth.gr>
*/
#ifndef MATRIX_HPP_
#define MATRIX_HPP_
#include <type_traits>
#include <utility>
#include <algorithm>
#include <vector>
#include <tuple>
namespace mtx {
using std::size_t;
/*
 * Small helper that strips cv- and reference-qualifiers from a type
 * (a stand-in for C++20's std::remove_cvref).
 */
template<typename T>
struct remove_cvref {
   using type = std::remove_cv_t<std::remove_reference_t<T>>;
};

//! Convenience alias for remove_cvref<T>::type
template<typename T>
using remove_cvref_t = typename remove_cvref<T>::type;
/*!
 * Enumerator to denote the storage type of the array to use.
 */
enum class MatrixType {
   DENSE,      /*!< Matrix is dense */
   SPARSE,     /*!< Matrix is sparse */
};

/*!
 * Enumerator to denote the memory layout (element ordering) of the array to use.
 */
enum class MatrixOrder {
   COLMAJOR,   /*!< Matrix is column major */
   ROWMAJOR,   /*!< Matrix is row major */
};
/*
* Forward type declarations
*/
template<typename MatrixType> struct MatCol;
template<typename MatrixType> struct MatRow;
template<typename MatrixType> struct MatVal;
/*!
 * A 2-D matrix functionality over a 1-D array
 *
 * This is a very thin abstraction layer over a native array.
 * This is tested using compiler explorer and our template produce
 * almost identical assembly.
 *
 * The penalty hit we have is due to the fact that we use a one dimension array
 * and we have to calculate the actual position from an (i,j) pair.
 * The use of 1D array was our intention from the beginning, so the penalty
 * was pretty much unavoidable.
 *
 * \tparam DataType  The underling data type of the array
 * \tparam IndexType The underling type for the index variables and sizes
 * \tparam Type      The storage type of the array (this primary template handles DENSE)
 * \tparam Order     Memory layout: ROWMAJOR or COLMAJOR
 * \tparam Symmetric When true, only one triangle is stored
 *                   (upper triangle for row-major, lower for column-major)
 */
template<typename DataType,
         typename IndexType = size_t,
         MatrixType Type = MatrixType::DENSE,
         MatrixOrder Order = MatrixOrder::ROWMAJOR,
         bool Symmetric = false>
struct Matrix {
   using dataType = DataType;                          //!< meta:export of underling data type
   using indexType = IndexType;                        //!< meta:export of underling index type
   static constexpr MatrixOrder matrixOrder = Order;   //!< meta:export of array order
   static constexpr MatrixType matrixType = Type;      //!< meta:export of array type
   static constexpr bool symmetric = Symmetric;        //!< meta:export symmetric flag

   /*!
    * \name Obj lifetime
    */
   //! @{

   //! Construct an empty matrix with dimensions rows x columns (owning storage)
   Matrix(IndexType rows = IndexType{}, IndexType columns = IndexType{}) noexcept
      : vector_storage_(capacity(rows, columns)),
        raw_storage_(nullptr),
        use_vector_(true),
        rows_(rows),
        cols_(columns) {
      data_ = vector_storage_.data();
   }

   //! Construct a matrix VIEW over existing external data (non-owning; no copy
   //! is made). major_start/major_length select a band of major-order slots
   //! (rows for row-major, columns for col-major) inside `data`.
   Matrix(DataType* data, IndexType major_start, IndexType major_length, IndexType minor_length) noexcept
      : vector_storage_(),
        raw_storage_ (data + major_start * minor_length),
        use_vector_ (false) {
      if constexpr (Order == MatrixOrder::ROWMAJOR) {
         rows_ = major_length;
         cols_ = minor_length;
      }
      else {
         rows_ = minor_length;
         cols_ = major_length;
      }
      data_ = raw_storage_;
   }

   //! Construct a matrix using an initializer list (throws if the list size
   //! does not match the storage capacity for rows x columns)
   Matrix(IndexType rows, IndexType columns, std::initializer_list<DataType> list)
      : vector_storage_(list),
        raw_storage_(nullptr),
        use_vector_(true),
        rows_(rows),
        cols_(columns) {
      if (list.size() != capacity(rows, columns)) {
         throw std::invalid_argument("Matrix initializer list size does not match matrix dimensions.");
      }
      data_ = vector_storage_.data();
   }

   //! move ctor
   Matrix(Matrix&& m) noexcept { moves(std::move(m)); }
   //! move
   Matrix& operator=(Matrix&& m) noexcept { moves(std::move(m)); return *this; }

   Matrix(const Matrix& m) = delete;               //!< No copy ctor
   Matrix& operator=(const Matrix& m) = delete;    //!< No copy
   //Matrix(const Matrix& m);
   //Matrix& operator=(const Matrix& m) { copy(m); }
   //! @}

   //! \name Data exposure
   //! @{

   //! Get/Set the size of each dimension
   IndexType rows() const noexcept { return rows_; }
   IndexType columns() const noexcept { return cols_; }

   //! Get the interface size of the Matrix (what appears to be the size)
   IndexType size() const {
      return rows_ * cols_;
   }
   //! Set the interface size of the Matrix (what appears to be the size).
   //! NOTE(review): a no-op on non-owning (external-storage) matrices; the
   //! returned value is the storage capacity, not rows*columns.
   IndexType resize(IndexType rows, IndexType columns) {
      if (use_vector_) {
         rows_ = rows;
         cols_ = columns;
         vector_storage_.resize(capacity(rows_, cols_));
         data_ = vector_storage_.data();
      }
      return capacity(rows_, cols_);
   }

   //! Actual memory capacity of the symmetric matrix
   //! (triangular-number storage; presumably assumes M == N in the
   //! symmetric case — confirm with callers)
   static constexpr IndexType capacity(IndexType M, IndexType N) {
      if constexpr (Symmetric)
         return (M+1)*N/2;
      else
         return M*N;
   }

   /*
    * virtual 2D accessors
    */
   //! Read element (i,j), folding symmetric accesses onto the stored triangle
   DataType get (IndexType i, IndexType j) {
      if constexpr (Symmetric) {
         auto T = [](size_t i)->size_t { return i*(i+1)/2; };  // Triangular number of i
         if constexpr (Order == MatrixOrder::COLMAJOR) {
            // In column major we use the lower triangle of the matrix
            if (i>=j) return data_[j*rows_ - T(j) + i];  // Lower, use our notation
            else      return data_[i*rows_ - T(i) + j];  // Upper, use opposite index
         }
         else {
            // In row major we use the upper triangle of the matrix
            if (i<=j) return data_[i*cols_ - T(i) + j];  // Upper, use our notation
            else      return data_[j*cols_ - T(j) + i];  // Lower, use opposite index
         }
      }
      else {
         if constexpr (Order == MatrixOrder::COLMAJOR)
            return data_[i + j*rows_];
         else
            return data_[i*cols_ + j];
      }
   }

   /*!
    * \fn DataType set(DataType, IndexType, IndexType)
    * Write element (i,j); symmetric accesses are folded onto the stored
    * triangle exactly like get().
    * \param v The value to write
    * \param i The row number
    * \param j The column number
    * \return  The value written
    */
   DataType set (DataType v, IndexType i, IndexType j) {
      if constexpr (Symmetric) {
         auto T = [](size_t i)->size_t { return i*(i+1)/2; };  // Triangular number of i
         if constexpr (Order == MatrixOrder::COLMAJOR) {
            // In column major we use the lower triangle of the matrix
            if (i>=j) return data_[j*rows_ - T(j) + i] = v;  // Lower, use our notation
            else      return data_[i*rows_ - T(i) + j] = v;  // Upper, use opposite index
         }
         else {
            // In row major we use the upper triangle of the matrix
            if (i<=j) return data_[i*cols_ - T(i) + j] = v;  // Upper, use our notation
            else      return data_[j*cols_ - T(j) + i] = v;  // Lower, use opposite index
         }
      }
      else {
         if constexpr (Order == MatrixOrder::COLMAJOR)
            return data_[i + j*rows_] = v;
         else
            return data_[i*cols_ + j] = v;
      }
   }

// DataType operator()(IndexType i, IndexType j) { return get(i, j); }

   /*!
    * Return a proxy MatVal object with read and write capabilities.
    * @param i The row number
    * @param j The column number
    * @return  The MatVal object
    */
   MatVal<Matrix> operator()(IndexType i, IndexType j) noexcept {
      return MatVal<Matrix>(this, get(i, j), i, j);
   }

   // a basic serial iterator support (iterates raw storage, not virtual (i,j) order)
   DataType* data() noexcept { return data_; }
   DataType* begin() noexcept { return data_; }
   const DataType* begin() const noexcept { return data_; }
   DataType* end() noexcept { return data_ + capacity(rows_, cols_); }
   const DataType* end() const noexcept { return data_ + capacity(rows_, cols_); }

// IndexType begin_idx() noexcept { return 0; }
// IndexType end_idx() noexcept { return capacity(rows_, cols_); }

   const DataType* data() const noexcept { return data_; }
   const IndexType begin_idx() const noexcept { return 0; }
   const IndexType end_idx() const noexcept { return capacity(rows_, cols_); }
   //! @}

   /*!
    * \name Safe iteration API
    *
    * This api automates the iteration over the array based on
    * MatrixType
    */
   //! @{
   //! Apply `lambda(args..., it)` for every index in [begin, end)
   template<typename F, typename... Args>
   void for_each_in (IndexType begin, IndexType end, F&& lambda, Args&&... args) {
      for (IndexType it=begin ; it<end ; ++it) {
         std::forward<F>(lambda)(std::forward<Args>(args)..., it);
      }
   }
   //! @}

   //! Swap all state (storage, pointers and dimensions) with another matrix
   void swap(Matrix& src) noexcept {
      std::swap(vector_storage_, src.vector_storage_);
      std::swap(raw_storage_, src.raw_storage_);
      std::swap(data_, src.data_);
      std::swap(use_vector_, src.use_vector_);
      std::swap(rows_, src.rows_);
      std::swap(cols_, src.cols_);
   }

private:
   //! move helper (data_ remains valid: vector moves preserve the buffer)
   void moves(Matrix&& src) noexcept {
      vector_storage_ = std::move(src.vector_storage_);
      raw_storage_ = std::move(src.raw_storage_);
      data_ = std::move(src.data_);
      use_vector_ = std::move(src.use_vector_);
      rows_ = std::move(src.rows_);
      cols_ = std::move(src.cols_);
   }

   // Storage
   std::vector<DataType>
              vector_storage_;  //!< Internal storage (if used).
   DataType*  raw_storage_;     //!< External storage (if used).
   DataType*  data_;            //!< Pointer to active storage.
   bool       use_vector_;      //!< True if using vector storage, false for raw pointer.
   IndexType  rows_{};          //!< the virtual size of rows.
   IndexType  cols_{};          //!< the virtual size of columns.
};
/**
 * A simple sparse matrix specialization.
 *
 * We use CSC format and provide get/set functionalities for each (i,j) item
 * on the matrix. We also provide a () overload using a proxy MatVal object.
 * This way the user can:
 * \code
 *    auto v = A(3,4);
 *    A(3, 4) = 7;
 * \endcode
 *
 * We also provide getCol() and getRow() functions which return a viewer/iterator to rows and
 * columns of the matrix. In the case of a symmetric matrix instead of a row we return the
 * equivalent column. This way we gain speed due to CSC format nature.
 *
 * @tparam DataType  The type for values
 * @tparam IndexType The type for indexes
 * @tparam Order     Memory-order tag (kept for interface symmetry with the dense case)
 * @tparam Symmetric Symmetric flag (rows can then be served as columns)
 */
template<typename DataType, typename IndexType,
         MatrixOrder Order,
         bool Symmetric>
struct Matrix<DataType, IndexType, MatrixType::SPARSE, Order, Symmetric> {
   using dataType = DataType;                          //!< meta:export of underling data type
   using indexType = IndexType;                        //!< meta:export of underling index type
   static constexpr MatrixOrder matrixOrder = Order;   //!< meta:export of array order
   static constexpr MatrixType matrixType = MatrixType::SPARSE;  //!< meta:export of array type
   static constexpr bool symmetric = Symmetric;        //!< meta:export symmetric flag

   friend struct MatCol<Matrix>;
   friend struct MatRow<Matrix>;
   friend struct MatVal<Matrix>;

   /*!
    * \name Obj lifetime
    */
   //! @{

   //! Default ctor with optional memory allocations
   Matrix(IndexType n=IndexType{}) noexcept:
      values{},
      rows{},
      col_ptr((n)? n+1:2, IndexType{}),
      N(n),
      NNZ(0) { }

   //! A ctor using csc array data (all values default to 1)
   Matrix(IndexType n, IndexType nnz, const IndexType* row, const IndexType* col) noexcept:
      values(nnz, 1),
      rows(row, row+nnz),
      col_ptr(col, col+n+1),
      N(n),
      NNZ(nnz) { }

   //! ctor using csc array data with value array
   Matrix(IndexType n, IndexType nnz, const DataType* v, const IndexType* row, const IndexType* col) noexcept:
      values(v, v+nnz),
      rows(row, row+nnz),
      col_ptr(col, col+n+1),
      N(n),
      NNZ(nnz) { }

   //! ctor vectors of row/col and default value for values array
   Matrix(IndexType n, IndexType nnz, const DataType v,
          const std::vector<IndexType>& row, const std::vector<IndexType>& col) noexcept:
      values(nnz, v),
      rows (row),
      col_ptr(col),
      N(n),
      NNZ(nnz) { }

   //! move ctor
   Matrix(Matrix&& m) noexcept { moves(std::move(m)); }
   //! move
   Matrix& operator=(Matrix&& m) noexcept { moves(std::move(m)); return *this; }

   Matrix(const Matrix& m) = delete;               //!< make sure there are no copies
   Matrix& operator=(const Matrix& m) = delete;    //!< make sure there are no copies
   //! @}

   //! \name Data exposure
   //! @{

   //! \return the dimension of the matrix (square: N x N)
   IndexType size() noexcept { return N; }
   //! After construction size configuration tool
   IndexType resize(IndexType n) {
      col_ptr.resize(n+1);
      return N = n;
   }
   //! \return the NNZ of the matrix
   IndexType capacity() noexcept { return NNZ; }
   //! After construction NNZ size configuration tool (reserves, does not grow NNZ)
   IndexType capacity(IndexType nnz) noexcept {
      values.reserve(nnz);
      rows.reserve(nnz);
      return NNZ;
   }
   // getters for row arrays of the struct (unused)
   std::vector<DataType>& getValues() noexcept { return values; }
   std::vector<IndexType>& getRows() noexcept { return rows; }
   std::vector<IndexType>& getCols() noexcept { return col_ptr; }

   /*!
    * Return a proxy MatVal object with read and write capabilities.
    * @param i The row number
    * @param j The column number
    * @return  The MatVal object
    */
   MatVal<Matrix> operator()(IndexType i, IndexType j) noexcept {
      return MatVal<Matrix>(this, get(i, j), i, j);
   }

   /*!
    * A read item functionality using binary search to find the correct row
    *
    * @param i The row number
    * @param j The column number
    * @return  The value of the item or DataType{} if is not present.
    */
   DataType get(IndexType i, IndexType j) noexcept {
      IndexType idx; bool found;
      std::tie(idx, found) = find_idx(rows, col_ptr[j], col_ptr[j+1], i);
      return (found) ? values[idx] : 0;
   }

   /*!
    * A write item functionality.
    *
    * First we search if the matrix has already a value in (i, j) position.
    * If so we just change it to a new value. If not we add the item on the matrix.
    *
    * @note
    * When change a value, we don't increase the NNZ value of the struct. We expect the user has already
    * change the NNZ value to the right one using @see capacity() function. When adding a value we
    * increase the NNZ.
    *
    * @param i The row number
    * @param j The column number
    * @return  The new value of the item .
    */
   DataType set(DataType v, IndexType i, IndexType j) {
      IndexType idx; bool found;
      std::tie(idx, found) = find_idx(rows, col_ptr[j], col_ptr[j+1], i);
      if (found)
         return values[idx] = v;    // we don't change NNZ even if we write "0"
      else {
         // Fix: insert at the sorted position inside column j (find_idx's miss
         // index is always the column end, which would break the sorted-row
         // invariant that find_idx's binary search depends on).
         idx = static_cast<IndexType>(
            std::lower_bound(rows.begin() + col_ptr[j], rows.begin() + col_ptr[j+1], i)
            - rows.begin());
         values.insert(values.begin()+idx, v);
         rows.insert(rows.begin()+idx, i);
         // shift every subsequent column start by one
         std::transform(col_ptr.begin()+j+1, col_ptr.end(), col_ptr.begin()+j+1, [](IndexType it) {
            return ++it;
         });
         ++NNZ;                     // we increase the NNZ even if we write "0"
         return v;
      }
   }

   /*!
    * Get a view of a CSC column
    * @param j The column to get
    * @return  The MatCol object @see MatCol
    */
   MatCol<Matrix> getCol(IndexType j) noexcept {
      return MatCol<Matrix>(this, col_ptr[j], col_ptr[j+1]);
   }

   /*!
    * Get a view of a CSC row
    *
    * In case of a SYMMETRIC matrix we can return a column instead.
    *
    * @param i The row to get
    * @return  On symmetric matrix MatCol otherwise a MatRow
    *
    * @note
    * Fix: the return type is deduced. The previous fixed MatCol return type
    * made the non-symmetric branch (returning MatRow) ill-formed when
    * instantiated with Symmetric == false.
    */
   auto getRow(IndexType i) noexcept {
      if constexpr (Symmetric)
         return getCol(i);
      else
         return MatRow<Matrix>(this, i);
   }

   // values only iterator support
   // Fix: std::vector's iterator is a class type on common standard libraries
   // and is not convertible to DataType*; expose the contiguous buffer instead.
   DataType* begin() noexcept { return values.data(); }
   DataType* end() noexcept { return values.data() + values.size(); }
   //! @}

   //! A small iteration helper: apply `lambda(args..., it)` for it in [begin, end)
   template<typename F, typename... Args>
   void for_each_in (IndexType begin, IndexType end, F&& lambda, Args&&... args) {
      for (IndexType it=begin ; it<end ; ++it) {
         std::forward<F>(lambda)(std::forward<Args>(args)..., it);
      }
   }

private:
   /*!
    * A small binary search implementation using index for begin-end instead of iterators.
    *
    * \param v      Reference to vector to search
    * \param begin  The vector's index to begin
    * \param end    The vector's index to end
    * \param match  What to search
    * \return       An <index, status> pair.
    *               index is the index of the item or end if not found
    *               status is true if found, false otherwise
    */
   std::pair<IndexType, bool> find_idx(const std::vector<IndexType>& v, IndexType begin, IndexType end, IndexType match) {
      if (v.capacity() != 0 && begin < end) {
         IndexType b = begin, e = end-1;
         while (b <= e) {
            IndexType m = (b+e)/2;
            if      (v[m] == match) return std::make_pair(m, true);
            else if (b >= e)        return std::make_pair(end, false);
            else {
               if (v[m] < match) b = m +1;
               else              e = m -1;
            }
         }
      }
      return std::make_pair(end, false);
   }

   // move helper
   void moves(Matrix&& src) noexcept {
      values  = std::move(src.values);
      rows    = std::move(src.rows);
      col_ptr = std::move(src.col_ptr);
      N       = std::move(src.N);    // redundant for primitives
      NNZ     = std::move(src.NNZ);  //
   }

   //! \name Data
   //! @{
   std::vector<DataType>  values {};      //!< vector to store the values of the matrix
   std::vector<IndexType> rows{};         //!< vector to store the row information
   std::vector<IndexType> col_ptr{1,0};   //!< vector to store the column pointers
   IndexType N{0};                        //!< The dimension of the matrix (square)
   IndexType NNZ{0};                      //!< The NNZ (capacity of the matrix)
   //! @}
};
/*!
 * A view/iterator hybrid object for Matrix columns.
 *
 * This object provides access to a column of a Matrix. The public functionalities
 * allow data access using indexes instead of iterators. We prefer indexes over iterators
 * because we can apply the same index to different inner vector of Matrix without conversion.
 *
 * @tparam MatrixType The owner matrix type (exports dataType/indexType)
 */
template<typename MatrixType>
struct MatCol {
   using owner_t = MatrixType;
   using DataType = typename MatrixType::dataType;
   using IndexType = typename MatrixType::indexType;

   /*!
    * ctor using column pointers for begin-end. own is pointer to Matrix.
    */
   MatCol(owner_t* own, const IndexType begin, const IndexType end) noexcept :
      owner_(own), index_(begin), begin_(begin), end_(end) {
      vindex_ = vIndexCalc(index_);
   }
   MatCol() = default;
   MatCol(const MatCol&) = delete;        //!< make sure there are no copies
   MatCol& operator=(const MatCol&)= delete;  //!< make sure there are no copies
   MatCol(MatCol&&) = default;
   MatCol& operator=(MatCol&&) = default;

   //! a simple dereference operator, like an iterator
   DataType operator* () {
      return get();
   }
   //! Increment operator acts on index(), like an iterator
   MatCol& operator++ () { advance(); return *this; }
   //! NOTE(review): post-increment binds a REFERENCE to *this and advances,
   //! so it behaves exactly like pre-increment (a true copy cannot be
   //! returned because copying is deleted).
   MatCol& operator++ (int) { MatCol& p = *this; advance(); return p; }

   //! () operator acts as member access (like a view):
   //! returns the value only when x matches the current virtual index
   DataType operator()(IndexType x) {
      return (x == index())? get() : DataType{};
   }
   //! = operator acts as member assignment (like a view)
   DataType operator= (DataType v) { return owner_->values[index_] = v; }

   // iterator like handlers
   // these return a virtual index value based on the items position on the full matrix
   // but the move of the index is just a ++ away.
   IndexType       index()       noexcept { return vindex_; }
   const IndexType index() const noexcept { return vindex_; }
   IndexType       begin()       noexcept { return vIndexCalc(begin_); }
   const IndexType begin() const noexcept { return vIndexCalc(begin_); }
   IndexType       end()       noexcept { return owner_->N; }
   const IndexType end() const noexcept { return owner_->N; }

   /*!
    * Multiplication operator
    *
    * We follow only the non-zero values and multiply only the common indexes.
    *
    * @tparam C Universal reference for the type right half site column
    *
    * @param c  The right hand site matrix
    * @return   The value of the inner product of two vectors
    * @note     The time complexity is \$ O(nnz1+nnz2) \$.
    *           Where the nnz is the max NNZ elements of the column of the matrix
    */
   template <typename C>
   DataType operator* (C&& c) {
      static_assert(std::is_same<remove_cvref_t<C>, MatCol<MatrixType>>(), "");
      DataType v{};
      while (index() != end() && c.index() != c.end()) {
         if      (index() < c.index()) advance();  // advance me
         else if (index() > c.index()) ++c;        // advance other
         else { //index() == c.index()
            v += get() * *c;                       // multiply and advance both
            ++c;
            advance();
         }
      }
      return v;
   }

private:
   //! small tool to increase the index pointers to Matrix
   void advance() noexcept {
      ++index_;
      vindex_ = vIndexCalc(index_);
   }
   //! tool to translate between col_ptr indexes and Matrix "virtual" full matrix indexes
   IndexType vIndexCalc(IndexType idx) {
      return (idx < end_) ? owner_->rows[idx] : end();
   }
   //! small get tool
   DataType get() { return owner_->values[index_]; }

   owner_t*  owner_   {nullptr};     //!< Pointer to owner Matrix. MatCol is just a view
   IndexType vindex_  {IndexType{}}; //!< Virtual index of full matrix
   IndexType index_   {IndexType{}}; //!< index to Matrix::rows
   IndexType begin_   {IndexType{}}; //!< beginning index of the column in Matrix::rows
   IndexType end_     {IndexType{}}; //!< ending index of the column in Matrix::rows
};
/*!
* A view/iterator hybrid object for Matrix rows.
*
* This object provides access to a column of a Matrix. The public functionalities
* allow data access using indexes instead of iterators. We prefer indexes over iterators
* because we can apply the same index to different inner vector of Matrix without conversion.
*
* @tparam DataType
* @tparam IndexType
*/
template<typename MatrixType>
struct MatRow {
using owner_t = MatrixType;
using DataType = typename MatrixType::dataType;
using IndexType = typename MatrixType::indexType;
/*!
* ctor using virtual full matrix row index. own is pointer to Matrix.
*/
MatRow(owner_t* own, const IndexType row) noexcept :
owner_(own), vindex_(IndexType{}), row_(row), index_(IndexType{}),
begin_(IndexType{}), end_(owner_->NNZ) {
// place begin
while(begin_ != end_ && owner_->rows[begin_] != row_)
++begin_;
// place index_ and vindex_
if (owner_->rows[index_] != row_)
advance();
}
MatRow() = default;
MatRow(const MatRow&) = delete; //!< make sure there are no copies
MatRow& operator=(const MatRow&)= delete; //!< make sure there are no copies
MatRow(MatRow&&) = default;
MatRow& operator=(MatRow&&) = default;
//! a simple dereference operator, like an iterator
DataType operator* () {
return get();
}
//! Increment operator acts on index(), like an iterator
//! here the increment is a O(N) process.
MatRow& operator++ () { advance(); return *this; }
MatRow& operator++ (int) { MatRow& p = *this; advance(); return p; }
//! () operator acts as member access (like a view)
DataType operator()(IndexType x) {
return (x == index())? get() : DataType{};
}
//! = operator acts as member assignment (like a view)
DataType operator= (DataType v) { return owner_->values[index_] = v; }
// iterator like handlers
// these return a virtual index value based on the items position on the full matrix
// but the move of the index is just a ++ away.
IndexType index() noexcept { return vindex_; }
const IndexType index() const noexcept { return vindex_; }
IndexType begin() noexcept { return vIndexCalc(begin_); }
const IndexType begin() const noexcept { return vIndexCalc(begin_); }
IndexType end() noexcept { return owner_->N; }
const IndexType end() const noexcept { return owner_->N; }
/*!
 * Inner product with a sparse column view.
 *
 * Walks only the stored elements of both views (classic sorted two-pointer
 * merge) and multiplies the entries whose virtual indexes coincide.
 *
 * @tparam C The column view type; must be MatCol<MatrixType> (enforced below)
 * @param c The right hand side column view
 * @return The inner product of the two sparse vectors
 * @note The time complexity is \$ O(N+nnz2) \$ and way heavier than the
 *       ColxCol multiplication, because every advance() here scans Matrix::rows.
 */
template <typename C>
DataType operator* (C&& c) {
    static_assert(std::is_same<remove_cvref_t<C>, MatCol<MatrixType>>(), "");
    DataType acc {};
    for (;;) {
        const IndexType mine   = index();
        const IndexType theirs = c.index();
        if (mine == end() || theirs == c.end())
            break;                      // one side exhausted: done
        if (mine < theirs)
            advance();                  // we are behind: catch up
        else if (theirs < mine)
            ++c;                        // they are behind: catch up
        else {                          // common index: multiply, advance both
            acc += get() * *c;
            ++c;
            advance();
        }
    }
    return acc;
}
private:
//! small tool to increase the index pointer into Matrix::rows.
//! We have to search the rows vector in Matrix to find the next stored
//! element of this virtual row. Time complexity O(N).
//! Fix: the previous do-while stepped past end_ and kept reading
//! owner_->rows out of bounds when called with index_ == end_ (e.g. an
//! extra ++ on an exhausted row); this version never scans past end_.
void advance() noexcept {
    while (index_ != end_) {
        ++index_;
        if (index_ == end_ || owner_->rows[index_] == row_)
            break;
    }
    vindex_ = vIndexCalc(index_);
}
//! tool to translate between col_ptr indexes and Matrix "virtual" full matrix indexes.
//! Returns the virtual column of the element at rows-array position idx,
//! i.e. (first i with idx < col_ptr[i]) - 1, or end() when there is none.
//! Improvement: col_ptr is non-decreasing (compressed-storage invariant —
//! TODO confirm it is always built sorted), so a binary search gives the
//! same answer in O(logN) instead of the previous linear O(N) scan.
IndexType vIndexCalc(IndexType idx) {
    IndexType lo = 0, hi = owner_->N + 1;       // search range [0, N+1)
    while (lo < hi) {
        IndexType mid = lo + (hi - lo) / 2;
        if (idx < owner_->col_ptr[mid])
            hi = mid;
        else
            lo = mid + 1;
    }
    // lo is the first position with idx < col_ptr[lo], or N+1 if none —
    // matching the original scan's `return i-1` / `return end()` behavior.
    return (lo <= owner_->N) ? lo - 1 : end();
}
//! small get tool: fetch the currently referenced non-zero value from the owner
DataType get() { return owner_->values[index_]; }
owner_t* owner_ {nullptr}; //!< Pointer to owner Matrix. MatRow is just a view
IndexType vindex_ {IndexType{}}; //!< Virtual (full-matrix) column index of the current element
IndexType row_ {IndexType{}}; //!< The virtual full matrix row this view walks
IndexType index_ {IndexType{}}; //!< current index into Matrix::rows
IndexType begin_ {IndexType{}}; //!< first index in Matrix::rows belonging to this row
IndexType end_ {IndexType{}}; //!< scan limit (owner_->NNZ); one past the last stored element
};
/*!
 * A proxy Matrix value object/view.
 *
 * Provides read/write access to a single Matrix element: reads come from the
 * cached value, writes are forwarded to the owner through Matrix::set().
 *
 * @tparam MatrixType The Matrix specialization this proxy belongs to
 */
template<typename MatrixType>
struct MatVal {
    using owner_t = MatrixType;
    using DataType = typename MatrixType::dataType;
    using IndexType = typename MatrixType::indexType;

    //! ctor: cache the value plus its (row, column) location and the owner pointer
    MatVal(owner_t* own, DataType v, IndexType i, IndexType j) :
        owner_(own), value_(v), row_(i), col_(j) { }
    MatVal() = default;
    MatVal(const MatVal&) = delete;             //!< non-copyable: this is a view
    MatVal& operator=(const MatVal&) = delete;  //!< non-copyable: this is a view
    MatVal(MatVal&&) = default;
    MatVal& operator=(MatVal&&) = default;

    //! implicit read access: using A(2,3) as a value yields the cached element
    operator DataType() { return value_; }

    //! write access, for ex: A(2,3) = 5;
    //! stores locally and writes through to the owner matrix
    MatVal& operator=(DataType v) {
        value_ = v;
        owner_->set(value_, row_, col_);
        return *this;
    }
private:
    owner_t* owner_{nullptr};       //!< Pointer to owner Matrix. MatVal is just a view.
    DataType value_{DataType{}};    //!< cached value of the row-column pair (for speed)
    IndexType row_{IndexType{}};    //!< element row
    IndexType col_{IndexType{}};    //!< element column
};
} // namespace mtx
#endif /* MATRIX_HPP_ */

View File

@ -0,0 +1,87 @@
/**
* \file utils.hpp
* \brief Utilities header
*
* \author
* Christos Choutouridis AEM:8997
* <cchoutou@ece.auth.gr>
*/
#ifndef UTILS_HPP_
#define UTILS_HPP_
#include <iostream>
#include <chrono>
#include <unistd.h>
#include "matrix.hpp"
#include "config.h"
/*!
 * A Logger for entire program.
 */
struct Log {
    struct Endl {} endl;    //!< tag object used to request a new line

    //! Logging via the << operator. Output is gated by session.verbose and
    //! the first item of every line is prefixed with "[Log]: ".
    template<typename T>
    Log& operator<< (T&& t) {
        if (!session.verbose)
            return *this;
        if (line_) {
            std::cout << "[Log]: ";
            line_ = false;
        }
        std::cout << t;
        return *this;
    }
    //! overload for the Endl tag: emit the newline and re-arm the prefix
    Log& operator<< (Endl e) {
        (void)e;
        if (session.verbose) {
            std::cout << '\n';
            line_ = true;
        }
        return *this;
    }
private:
    bool line_ {true};  //!< true while the next item starts a fresh line
};
extern Log logger;
/*!
 * A small timing utility based on chrono.
 */
struct Timing{
    using Tpoint = std::chrono::steady_clock::time_point;
    using microseconds = std::chrono::microseconds;
    using milliseconds = std::chrono::milliseconds;
    using seconds = std::chrono::seconds;

    //! mark the starting point of the measured interval
    Tpoint start () noexcept { return start_ = std::chrono::steady_clock::now(); }
    //! mark the ending point of the measured interval
    Tpoint stop () noexcept { return stop_ = std::chrono::steady_clock::now(); }
    //! elapsed time between start() and stop() in microseconds
    auto dt () noexcept {
        return std::chrono::duration_cast<microseconds>(stop_ - start_).count();
    }
    //! print the interval (if session.timing), auto-selecting usec/msec/sec
    void print_dt (const char* what) noexcept {
        if (!session.timing)
            return;
        const auto elapsed = stop_ - start_;
        const auto us = std::chrono::duration_cast<microseconds>(elapsed).count();
        const auto ms = std::chrono::duration_cast<milliseconds>(elapsed).count();
        if (us < 10000)
            std::cout << "[Timing]: " << what << ": " << std::to_string(us) << " [usec]\n";
        else if (ms < 10000)
            std::cout << "[Timing]: " << what << ": " << std::to_string(ms) << " [msec]\n";
        else
            std::cout << "[Timing]: " << what << ": " << std::to_string(std::chrono::duration_cast<seconds>(elapsed).count()) << " [sec]\n";
    }
private:
    Tpoint start_;  //!< start mark
    Tpoint stop_;   //!< stop mark
};
#endif /* UTILS_HPP_ */

View File

@ -0,0 +1,114 @@
#
# ---------------------------------------------
# Bitonic v0.5 functionality
#
# Exchange partner of `node` at `step`: nodes where (node+step) is even pair
# upwards (node+1), the others pair downwards (node-1).
function partner(node, step)
    return node + (iseven(node + step) ? 1 : -1)
end
# A node id takes part in the current step only when it lies in [0, p).
function active(id, p)
    return 0 <= id < p
end
# Stub for the MPI exchange step: with all data in one array there is nothing
# to transfer, so we only trace the call when `verbose` is set.
function exchange(localid, remoteid)
    verbose && println("Exchange local data from $localid with partner $remoteid")
    nothing # We have all data here ;)
end
# Keep the elementwise min (or max) of the two rows on the local row and leave
# the complement on the remote row. Ids are zero-based, array rows are 1-based.
function minmax(data, localid, remoteid, keepsmall)
    li = localid + 1
    ri = remoteid + 1
    saved = copy(data[li, :])   # local row before it gets overwritten
    if keepsmall
        data[li, :] .= min.(saved, data[ri, :])
        data[ri, :] .= max.(saved, data[ri, :])
    else
        data[li, :] .= max.(saved, data[ri, :])
        data[ri, :] .= min.(saved, data[ri, :])
    end
end
"""
distbubletonic!(p, data)
distributed bitonic v0.5 sort using a "bubble sort"-like functionality to propagate large and small
items between nodes.
p: The number of processes
data: (p, N/p) array
"""
function distbubletonic!(p, data)
pid = 0:p-1
ascending = mod.(pid,2) .== 0
if verbose
println("ascending: $ascending")
end
# local full sort here (run all MPI nodes)
for i in 1:p
sort!(view(data, i, :), rev = !ascending[i])
end
for step in 0:p-2
direction = [true for x = 1:p]
partnerid = partner.(pid, step)
activeids = active.(partnerid, p)
keepsmall = pid .< partnerid
if verbose
println("step: $step | active ids: $activeids | partner: $partnerid | keepsmall: $keepsmall")
end
# exchange with partner and keep small or large (run all MPI nodes)
for i in 0:p-1
l_idx = i+1
r_idx = partnerid[i+1]+1
if activeids[l_idx] && i < partnerid[l_idx]
exchange(i, partnerid[l_idx])
minmax(data, i, partnerid[l_idx], keepsmall[l_idx])
sort!(view(data, l_idx, :), rev = !ascending[l_idx]) # elbow sort here
sort!(view(data, r_idx, :), rev = !ascending[r_idx]) # elbow sort here
end
end
end
# [optional] reverse the odd positions (run all MPI nodes)
for i in 1:p
if !ascending[i]
sort!(view(data, i, :))
end
end
nothing
end
#
# Homework setup
# ---------------------------------------------
#
p::Int8 = 3         # The order of number of "processors" (P = 2^p)
q::Int8 = 8         # The data size order (power of 2) of each "processor"
verbose = false;    # set to true to trace exchanges and per-step state

# Run Script
# ---------------------------------------------
P::Int = 2^p        # number of emulated MPI nodes
Q::Int = 2^q        # data length per node
N::Int = 2^(q+p)    # total data length
println("Distributed Bubbletonic (v0.5) test")
println("p: $p -> Number of processors: $P")
println("q: $q -> Data length for each node: $Q, Total: $(P*Q)")
println("Create an $P x $Q array")
Data = rand(Int8, P, Q)
println("Sort array with $P (MPI) nodes")
@time distbubletonic!(P, Data)

# Test: rows concatenated in node order must form one sorted sequence
if issorted(vec(permutedims(Data)))
    println("Test: Passed")
else
    println("Test: Failed")
end

View File

@ -0,0 +1,113 @@
#
# ---------------------------------------------
# Bitonic v0.5 functionality
#
# Stub for the MPI exchange step: with all data in one array there is nothing
# to transfer, so we only trace the call when `verbose` is set.
function exchange(localid, remoteid)
    verbose && println("Exchange local data from $localid with partner $remoteid")
    nothing # We have all data here ;)
end
# Keep the elementwise min (or max) of the two rows on the local row and leave
# the complement on the remote row. Ids are zero-based, array rows are 1-based.
function minmax(data, localid, remoteid, keepsmall)
    li = localid + 1
    ri = remoteid + 1
    saved = copy(data[li, :])   # local row before it gets overwritten
    if keepsmall
        data[li, :] .= min.(saved, data[ri, :])
        data[ri, :] .= max.(saved, data[ri, :])
    else
        data[li, :] .= max.(saved, data[ri, :])
        data[ri, :] .= min.(saved, data[ri, :])
    end
end
# One bitonic exchange network pass over all `n` nodes at the given `depth`:
# for step = depth-1 .. 0 each node pairs with (node xor 2^step) and keeps the
# small or large halves according to its sort direction.
function sort_network!(data, n, depth)
    nodes = 0:n-1
    for step = depth-1:-1:0
        partnerid = nodes .⊻ (1 << step)
        # Per-node sort direction: ascending while bit `depth` is clear.
        # Fix/cleanup: the original appended `.& (nodes .< partnerid)`, but
        # Julia's `&` binds tighter than `==`, so that parsed as
        # `... .== (0 .& (nodes .< partnerid))` i.e. `... .== 0` — the extra
        # conjunct was a no-op. This is the effective (and correct) formula.
        direction = (nodes .& (1 << depth)) .== 0
        keepsmall = ((nodes .< partnerid) .& direction) .| ((nodes .> partnerid) .& .!direction)
        if verbose
            println("depth: $depth | step: $step | partner: $partnerid | keepsmall: $keepsmall")
        end
        # exchange with partner and keep small or large (run all MPI nodes)
        for i in 0:n-1
            if (i < partnerid[i+1])
                exchange(i, partnerid[i+1])
                minmax(data, i, partnerid[i+1], keepsmall[i+1])
            end
        end
    end
end
"""
distbitonic!(p, data)
distributed bitonic sort v1 using elbow merge locally except for the first step
p: The number of processes
data: (p, N/p) array
"""
function distbitonic!(p, data)
q = Int(log2(p)) # CPU order
pid = 0:p-1
ascending = mod.(pid,2) .== 0
if verbose
println("ascending: $ascending")
end
# local full sort here (run all MPI nodes)
for i in 1:p
sort!(view(data, i, :), rev = !ascending[i])
end
for depth = 1:q
sort_network!(data, p, depth)
ascending = (pid .& (1 << depth)) .== 0
if verbose
println("ascending: $ascending")
end
# local elbowmerge here (run all MPI nodes)
for i in 1:p
sort!(view(data, i, :), rev = !ascending[i])
end
end
nothing
end
#
# Homework setup
# ---------------------------------------------
#
p::Int8 = 3         # The order of number of "processors" (P = 2^p)
q::Int8 = 8         # The data size order (power of 2) of each "processor"
verbose = false;    # set to true to trace exchanges and per-step state

# Run Script
# ---------------------------------------------
P::Int = 2^p        # number of emulated MPI nodes
Q::Int = 2^q        # data length per node
N::Int = 2^(q+p)    # total data length
println("Distributed bitonic (v1) test")
println("p: $p -> Number of processors: $P")
println("q: $q -> Data length for each node: $Q, Total: $(P*Q)")
println("Create an $P x $Q array")
Data = rand(Int8, P, Q)
println("Sort array with $P (MPI) nodes")
@time distbitonic!(P, Data)

# Test: rows concatenated in node order must form one sorted sequence
if issorted(vec(permutedims(Data)))
    println("Test: Passed")
else
    println("Test: Failed")
end

View File

@ -0,0 +1,27 @@
# distributed bitonic sort using elbow merge locally except for the first step
# (dry-run sketch: only prints the exchange pattern, no data is moved)
function distbitonic!(p)
    q = Int(log2(p))    # p is assumed to be a power of 2
    pid = 0:p-1
    ascending = mod.(pid,2) .== 0
    println("ascending: $ascending")
    # local full sort here
    for k = 1:q
        kk = 1 << k
        for j = k-1:-1:0
            jj = 1 << j
            partnerid = pid .⊻ jj
            # NOTE(review): Julia's `&` binds tighter than `==`, so the next
            # line parses as `(pid .& kk) .== (0 .& (pid .< partnerid))`,
            # making the `.& (pid .< partnerid)` part a no-op; the effective
            # value is `(pid .& kk) .== 0` — confirm that is what is intended.
            direction = (pid .& kk) .== 0 .& (pid .< partnerid)
            keepsmall = ((pid .< partnerid) .& direction) .| ((pid .> partnerid) .& .!direction)
            println("k: $k | j: $j | partner: $partnerid | keepsmall: $keepsmall")
            # exchange with partner and keep small or large
        end
        ascending = (pid .& kk) .== 0
        println("ascending: $ascending")
        # local elbowmerge here
    end
    nothing
end

View File

@ -0,0 +1,23 @@
# given a bitonic sequence b, merge it into a sorted sequence s:
# start both cursors at the (cyclic) minimum and walk them outwards —
# `left` downwards, `right` upwards — always emitting the smaller element.
@inbounds function elbowmerge!(s, b)
    n = length(b)
    left = argmin(b)
    right = left == n ? 1 : left + 1
    for pos in 1:n
        if b[left] < b[right]
            s[pos] = b[left]
            left = left == 1 ? n : left - 1     # wrap around the low end
        else
            s[pos] = b[right]
            right = right == n ? 1 : right + 1  # wrap around the high end
        end
    end
    nothing
end

78
homework_2/src/main.cpp Normal file
View File

@ -0,0 +1,78 @@
/*!
* \file main.cpp
* \brief Main application file for PDS HW2 (MPI)
*
* \author
* Christos Choutouridis AEM:8997
* <cchoutou@ece.auth.gr>
*/
#include <exception>
#include <iostream>
#include <mpi.h>
#include "matrix.hpp"
#include "utils.hpp"
#include "config.h"
// Global session data
session_t session;
/*!
 * A small command line argument parser
 *
 * Recognized switches: -h/--help (prints usage and exits),
 * -v/--verbose, -x/--xxxxx (placeholder, expects two values).
 *
 * \return The status of the operation (false on any parse error)
 */
bool get_options(int argc, char* argv[]){
    bool ok = true;
    // walk over every passed argument
    for (int i = 1; i < argc; ++i) {
        const std::string arg(argv[i]);     // current argument
        if (arg == "-h" || arg == "--help") {
            std::cout << "distbitonic - A distributed bitonic sort\n\n";
            std::cout << "distbitonic -x <> [-v]\n";
            std::cout << '\n';
            std::cout << "Options:\n\n";
            std::cout << " -v | --verbose\n";
            std::cout << " Request a more verbose output to stdout.\n\n";
            std::cout << " -h | --help\n";
            std::cout << " Prints this and exit.\n\n";
            std::cout << "Examples:\n\n";
            std::cout << " ...Example case...:\n";
            std::cout << " > distbitonic -x <xxxxx> \n\n";
            exit(0);
        }
        else if (arg == "-v" || arg == "--verbose") {
            session.verbose = true;
        }
        else if (arg == "-x" || arg == "--xxxxx") {
            if (i+2 < argc) {   // placeholder: would consume the next two args
                // session.corpusMtxFile = std::string(argv[++i]);
                // session.corpusDataSet = std::string(argv[++i]);
            }
            else
                ok = false;
        }
        else {  // unknown switch: report it and flag the failure
            std::cout << "Invocation error. Try -h for details.\n";
            ok = false;
        }
    }
    return ok;
}
//! Program entry: parse the command line and report any escaped exception.
int main(int argc, char* argv[]) {
    try {
        // try to read command line
        if (!get_options(argc, argv))
            exit(1);
        return 0;
    }
    catch (const std::exception& e) {
        //we probably pollute the user's screen. Comment `cerr << ...` if you don't like it.
        std::cerr << "Error: " << e.what() << '\n';
        exit(1);
    }
}