@@ -5,6 +5,10 @@ out/ | |||||
mat/ | mat/ | ||||
mtx/ | mtx/ | ||||
# hpc related | |||||
exclude | |||||
hpc_auth_sync.sh | |||||
# eclipse | # eclipse | ||||
.project | .project | ||||
.cproject | .cproject | ||||
@@ -38,7 +38,7 @@ DEP_DIR := $(BUILD_DIR)/.dep | |||||
# ========== Compiler settings ========== | # ========== Compiler settings ========== | ||||
# Compiler flags for debug and release | # Compiler flags for debug and release | ||||
DEB_CFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c++14 | DEB_CFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c++14 | ||||
REL_CFLAGS := -DDEBUG -g3 -Wall -Wextra -O2 -std=c++14 | |||||
REL_CFLAGS := -Wall -Wextra -O3 -std=c++14 | |||||
# Pre-defines | # Pre-defines | ||||
# PRE_DEFS := MYCAB=1729 SUPER_MODE | # PRE_DEFS := MYCAB=1729 SUPER_MODE | ||||
PRE_DEFS := | PRE_DEFS := | ||||
@@ -151,39 +151,50 @@ release: $(BUILD_DIR)/$(TARGET) | |||||
all: release | all: release | ||||
local_v3: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V3 | |||||
local_v3: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=3 | |||||
local_v3: TARGET := local_v3 | local_v3: TARGET := local_v3 | ||||
local_v3: $(BUILD_DIR)/$(TARGET) | local_v3: $(BUILD_DIR)/$(TARGET) | ||||
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | ||||
local_v4: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V4 | |||||
local_v4: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=4 | |||||
local_v4: TARGET := local_v4 | local_v4: TARGET := local_v4 | ||||
local_v4: $(BUILD_DIR)/$(TARGET) | local_v4: $(BUILD_DIR)/$(TARGET) | ||||
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | ||||
elearn: CFLAGS := $(DEB_CFLAGS) -DELEARNING | |||||
elearn: TARGET := elearn | |||||
elearn: $(BUILD_DIR)/$(TARGET) | |||||
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | |||||
local_v4_opt: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=4 -pg | |||||
local_v4_opt: LDFLAGS += -pg | |||||
local_v4_opt: TARGET := local_v4_opt | |||||
local_v4_opt: $(BUILD_DIR)/$(TARGET) | |||||
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | |||||
v3: DOCKER := $(DOCKER_CMD) | v3: DOCKER := $(DOCKER_CMD) | ||||
v3: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V3 | |||||
v3: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=3 | |||||
v3: TARGET := tcount_v3 | v3: TARGET := tcount_v3 | ||||
v3: $(BUILD_DIR)/$(TARGET) | v3: $(BUILD_DIR)/$(TARGET) | ||||
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | ||||
v3_cilk: DOCKER := $(DOCKER_CMD) | v3_cilk: DOCKER := $(DOCKER_CMD) | ||||
v3_cilk: CXX := /usr/local/OpenCilk-9.0.1-Linux/bin/clang++ | v3_cilk: CXX := /usr/local/OpenCilk-9.0.1-Linux/bin/clang++ | ||||
v3_cilk: CFLAGS := $(REL_CFLAGS) -fcilkplus -DCODE_VERSION=V3 -DCILK | |||||
v3_cilk: CFLAGS := $(REL_CFLAGS) -fcilkplus -DCODE_VERSION=3 -DCILK | |||||
v3_cilk: LDFLAGS += -fcilkplus | v3_cilk: LDFLAGS += -fcilkplus | ||||
v3_cilk: TARGET := tcount_cilkv3 | v3_cilk: TARGET := tcount_cilkv3 | ||||
v3_cilk: $(BUILD_DIR)/$(TARGET) | v3_cilk: $(BUILD_DIR)/$(TARGET) | ||||
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | ||||
v3_omp: DOCKER := $(DOCKER_CMD) | v3_omp: DOCKER := $(DOCKER_CMD) | ||||
v3_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=V3 -DOMP | |||||
v3_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=3 -DOMP | |||||
v3_omp: LDFLAGS += -fopenmp | v3_omp: LDFLAGS += -fopenmp | ||||
v3_omp: TARGET := tcount_ompv3 | v3_omp: TARGET := tcount_ompv3 | ||||
v3_omp: $(BUILD_DIR)/$(TARGET) | v3_omp: $(BUILD_DIR)/$(TARGET) | ||||
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | ||||
v4: DOCKER := $(DOCKER_CMD) | v4: DOCKER := $(DOCKER_CMD) | ||||
v4: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V4 | |||||
v4: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=4 | |||||
v4: TARGET := tcount_v4 | v4: TARGET := tcount_v4 | ||||
v4: $(BUILD_DIR)/$(TARGET) | v4: $(BUILD_DIR)/$(TARGET) | ||||
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | ||||
@@ -197,25 +208,61 @@ v4_cilk: $(BUILD_DIR)/$(TARGET) | |||||
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | ||||
v4_omp: DOCKER := $(DOCKER_CMD) | v4_omp: DOCKER := $(DOCKER_CMD) | ||||
v4_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=V4 -DOMP | |||||
v4_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=4 -DOMP | |||||
v4_omp: LDFLAGS += -fopenmp | v4_omp: LDFLAGS += -fopenmp | ||||
v4_omp: TARGET := tcount_ompv4 | v4_omp: TARGET := tcount_ompv4 | ||||
v4_omp: $(BUILD_DIR)/$(TARGET) | v4_omp: $(BUILD_DIR)/$(TARGET) | ||||
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | ||||
v4_pthreads: DOCKER := $(DOCKER_CMD) | v4_pthreads: DOCKER := $(DOCKER_CMD) | ||||
v4_pthreads: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V4 -DTHREADS | |||||
v4_pthreads: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=4 -DTHREADS | |||||
v4_pthreads: TARGET := tcount_pthv4 | v4_pthreads: TARGET := tcount_pthv4 | ||||
v4_pthreads: $(BUILD_DIR)/$(TARGET) | v4_pthreads: $(BUILD_DIR)/$(TARGET) | ||||
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | ||||
# | # | ||||
# ================ Docker based rules ================ | |||||
# examples: | |||||
# make IMAGE="gcc:8.3" dock | |||||
# ================ hpc build rules ================= | |||||
# | # | ||||
dock: DOCKER := $(DOCKER_CMD) | |||||
dock: CFLAGS := $(REL_CFLAGS) | |||||
dock: $(BUILD_DIR)/$(TARGET) | |||||
hpc_v3_ser: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=3 | |||||
hpc_v3_ser: TARGET := hpc_v3 | |||||
hpc_v3_ser: $(BUILD_DIR)/$(TARGET) | |||||
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | |||||
hpc_v3_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=3 -DOMP | |||||
hpc_v3_omp: LDFLAGS += -fopenmp | |||||
hpc_v3_omp: TARGET := hpc_ompv3 | |||||
hpc_v3_omp: $(BUILD_DIR)/$(TARGET) | |||||
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | |||||
hpc_v3_cilk: CXX := clang++ | |||||
hpc_v3_cilk: CFLAGS := $(REL_CFLAGS) -fcilkplus -DCODE_VERSION=3 -DCILK | |||||
hpc_v3_cilk: LDFLAGS += -fcilkplus | |||||
hpc_v3_cilk: TARGET := hpc_cilkv3 | |||||
hpc_v3_cilk: $(BUILD_DIR)/$(TARGET) | |||||
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | |||||
hpc_v4_ser: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=4 | |||||
hpc_v4_ser: TARGET := hpc_v4 | |||||
hpc_v4_ser: $(BUILD_DIR)/$(TARGET) | |||||
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | |||||
hpc_v4_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=4 -DOMP | |||||
hpc_v4_omp: LDFLAGS += -fopenmp | |||||
hpc_v4_omp: TARGET := hpc_ompv4 | |||||
hpc_v4_omp: $(BUILD_DIR)/$(TARGET) | |||||
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | |||||
hpc_v4_cilk: CXX := clang++ | |||||
hpc_v4_cilk: CFLAGS := $(REL_CFLAGS) -fcilkplus -DCODE_VERSION=4 -DCILK | |||||
hpc_v4_cilk: LDFLAGS += -fcilkplus | |||||
hpc_v4_cilk: TARGET := hpc_cilkv4 | |||||
hpc_v4_cilk: $(BUILD_DIR)/$(TARGET) | |||||
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | |||||
hpc_v4_pth: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=4 -DTHREADS | |||||
hpc_v4_pth: TARGET := hpc_pthv4 | |||||
hpc_v4_pth: $(BUILD_DIR)/$(TARGET) | |||||
cp $(BUILD_DIR)/$(TARGET) out/$(TARGET) | |||||
@@ -0,0 +1,19 @@ | |||||
#! /usr/bin/env bash
# SLURM job: single task per node, runs the full benchmark suite.
#SBATCH --time=20:00
#SBATCH --partition=batch
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --output=ntasks1.out

module load gcc/9.2.0 openmpi/3.1.6

# export: a plain assignment only affects this shell; the spawned
# runall.sh and the benchmark binaries need it in their environment too
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/lib
export OMP_NUM_THREADS=$SLURM_NTASKS
export CILK_NWORKERS=$SLURM_NTASKS

./runall.sh mtx/belgium_osm.mtx 8
./runall.sh mtx/com-Youtube.mtx 8
./runall.sh mtx/dblp-2010.mtx 8
./runall.sh mtx/mycielskian13.mtx 8
./runall.sh mtx/NACA0015.mtx 8
@@ -0,0 +1,19 @@ | |||||
#! /usr/bin/env bash
# SLURM job: 10 tasks per node, runs the full benchmark suite.
#SBATCH --time=20:00
#SBATCH --partition=batch
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=10
#SBATCH --output=ntasks10.out

module load gcc/9.2.0 openmpi/3.1.6

# export: a plain assignment only affects this shell; the spawned
# runall.sh and the benchmark binaries need it in their environment too
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/lib
export OMP_NUM_THREADS=$SLURM_NTASKS
export CILK_NWORKERS=$SLURM_NTASKS

./runall.sh mtx/belgium_osm.mtx 8
./runall.sh mtx/com-Youtube.mtx 8
./runall.sh mtx/dblp-2010.mtx 8
./runall.sh mtx/mycielskian13.mtx 8
./runall.sh mtx/NACA0015.mtx 8
@@ -0,0 +1,19 @@ | |||||
#! /usr/bin/env bash
# SLURM job: 15 tasks per node, runs the full benchmark suite.
#SBATCH --time=20:00
#SBATCH --partition=batch
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=15
#SBATCH --output=ntasks15.out

module load gcc/9.2.0 openmpi/3.1.6

# export: a plain assignment only affects this shell; the spawned
# runall.sh and the benchmark binaries need it in their environment too
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/lib
export OMP_NUM_THREADS=$SLURM_NTASKS
export CILK_NWORKERS=$SLURM_NTASKS

./runall.sh mtx/belgium_osm.mtx 8
./runall.sh mtx/com-Youtube.mtx 8
./runall.sh mtx/dblp-2010.mtx 8
./runall.sh mtx/mycielskian13.mtx 8
./runall.sh mtx/NACA0015.mtx 8
@@ -0,0 +1,19 @@ | |||||
#! /usr/bin/env bash
# SLURM job: 2 tasks per node, runs the full benchmark suite.
#SBATCH --time=20:00
#SBATCH --partition=batch
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH --output=ntasks2.out

module load gcc/9.2.0 openmpi/3.1.6

# export: a plain assignment only affects this shell; the spawned
# runall.sh and the benchmark binaries need it in their environment too
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/lib
export OMP_NUM_THREADS=$SLURM_NTASKS
export CILK_NWORKERS=$SLURM_NTASKS

./runall.sh mtx/belgium_osm.mtx 8
./runall.sh mtx/com-Youtube.mtx 8
./runall.sh mtx/dblp-2010.mtx 8
./runall.sh mtx/mycielskian13.mtx 8
./runall.sh mtx/NACA0015.mtx 8
@@ -0,0 +1,19 @@ | |||||
#! /usr/bin/env bash
# SLURM job: 20 tasks per node, runs the full benchmark suite.
#SBATCH --time=20:00
#SBATCH --partition=batch
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=20
#SBATCH --output=ntasks20.out

module load gcc/9.2.0 openmpi/3.1.6

# export: a plain assignment only affects this shell; the spawned
# runall.sh and the benchmark binaries need it in their environment too
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/lib
export OMP_NUM_THREADS=$SLURM_NTASKS
export CILK_NWORKERS=$SLURM_NTASKS

./runall.sh mtx/belgium_osm.mtx 8
./runall.sh mtx/com-Youtube.mtx 8
./runall.sh mtx/dblp-2010.mtx 8
./runall.sh mtx/mycielskian13.mtx 8
./runall.sh mtx/NACA0015.mtx 8
@@ -0,0 +1,19 @@ | |||||
#! /usr/bin/env bash
# SLURM job: 4 tasks per node, runs the full benchmark suite.
#SBATCH --time=20:00
#SBATCH --partition=batch
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=4
#SBATCH --output=ntasks4.out

module load gcc/9.2.0 openmpi/3.1.6

# export: a plain assignment only affects this shell; the spawned
# runall.sh and the benchmark binaries need it in their environment too
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/lib
export OMP_NUM_THREADS=$SLURM_NTASKS
export CILK_NWORKERS=$SLURM_NTASKS

./runall.sh mtx/belgium_osm.mtx 8
./runall.sh mtx/com-Youtube.mtx 8
./runall.sh mtx/dblp-2010.mtx 8
./runall.sh mtx/mycielskian13.mtx 8
./runall.sh mtx/NACA0015.mtx 8
@@ -0,0 +1,19 @@ | |||||
#! /usr/bin/env bash
# SLURM job: 5 tasks per node, runs the full benchmark suite.
#SBATCH --time=20:00
#SBATCH --partition=batch
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=5
#SBATCH --output=ntasks5.out

module load gcc/9.2.0 openmpi/3.1.6

# export: a plain assignment only affects this shell; the spawned
# runall.sh and the benchmark binaries need it in their environment too
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/lib
export OMP_NUM_THREADS=$SLURM_NTASKS
export CILK_NWORKERS=$SLURM_NTASKS

./runall.sh mtx/belgium_osm.mtx 8
./runall.sh mtx/com-Youtube.mtx 8
./runall.sh mtx/dblp-2010.mtx 8
./runall.sh mtx/mycielskian13.mtx 8
./runall.sh mtx/NACA0015.mtx 8
@@ -0,0 +1,19 @@ | |||||
#! /usr/bin/env bash
# SLURM job: 8 tasks per node, runs the full benchmark suite.
#SBATCH --time=20:00
#SBATCH --partition=batch
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --output=ntasks8.out

module load gcc/9.2.0 openmpi/3.1.6

# export: a plain assignment only affects this shell; the spawned
# runall.sh and the benchmark binaries need it in their environment too
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/lib
export OMP_NUM_THREADS=$SLURM_NTASKS
export CILK_NWORKERS=$SLURM_NTASKS

./runall.sh mtx/belgium_osm.mtx 8
./runall.sh mtx/com-Youtube.mtx 8
./runall.sh mtx/dblp-2010.mtx 8
./runall.sh mtx/mycielskian13.mtx 8
./runall.sh mtx/NACA0015.mtx 8
@@ -13,6 +13,7 @@ | |||||
#include <v12.h> | #include <v12.h> | ||||
#include <v3.h> | #include <v3.h> | ||||
#include <v4.h> | #include <v4.h> | ||||
#include <elearn.h> | |||||
/* | /* | ||||
* Defines for different version of the exercise | * Defines for different version of the exercise | ||||
@@ -0,0 +1,17 @@ | |||||
/*! | |||||
* \file elearn.h | |||||
* \brief e-learning version of the exercise. | |||||
* | |||||
* \author | |||||
* Christos Choutouridis AEM:8997 | |||||
* <cchoutou@ece.auth.gr> | |||||
*/ | |||||
#ifndef ELEARN_H_ | |||||
#define ELEARN_H_ | |||||
#include <impl.hpp> | |||||
uint32_t elearn_test (void) ; | |||||
#endif /* ELEARN_H_ */ |
@@ -284,8 +284,22 @@ struct SpMat { | |||||
* @return The value of the item or DataType{} if is not present. | * @return The value of the item or DataType{} if is not present. | ||||
*/ | */ | ||||
DataType get(IndexType i, IndexType j) { | DataType get(IndexType i, IndexType j) { | ||||
IndexType end, idx =find_idx(rows, col_ptr[j], end=col_ptr[j+1], i); | |||||
return (idx != end) ? values[idx] : 0; | |||||
IndexType idx; bool found; | |||||
std::tie(idx, found) =find_idx(rows, col_ptr[j], col_ptr[j+1], i); | |||||
return (found) ? values[idx] : 0; | |||||
} | |||||
/*! | |||||
* A read item functionality using binary search to find the correct row | |||||
* | |||||
* @param i The row number | |||||
* @param j The column number | |||||
* @return The value of the item or DataType{} if is not present. | |||||
*/ | |||||
DataType get2(IndexType i, IndexType j) { | |||||
IndexType idx; bool found; | |||||
std::tie(idx, found) =find2_idx(rows, col_ptr[j], col_ptr[j+1], i); | |||||
return (found) ? values[idx] : 0; | |||||
} | } | ||||
/*! | /*! | ||||
@@ -380,18 +394,18 @@ private: | |||||
* \param match What to search | * \param match What to search | ||||
* @return The index of the item or end on failure. | * @return The index of the item or end on failure. | ||||
*/ | */ | ||||
IndexType find_idx(const std::vector<IndexType>& v, IndexType begin, IndexType end, IndexType match) { | |||||
std::pair<IndexType, bool> find_idx(const std::vector<IndexType>& v, IndexType begin, IndexType end, IndexType match) { | |||||
IndexType b = begin, e = end-1; | IndexType b = begin, e = end-1; | ||||
while (true) { | while (true) { | ||||
IndexType m = (b+e)/2; | IndexType m = (b+e)/2; | ||||
if (v[m] == match) return m; | |||||
else if (b >= e) return end; | |||||
if (v[m] == match) return std::make_pair(m, true); | |||||
else if (b >= e) return std::make_pair(end, false); | |||||
else { | else { | ||||
if (v[m] < match) b = m +1; | if (v[m] < match) b = m +1; | ||||
else e = m -1; | else e = m -1; | ||||
} | } | ||||
} | } | ||||
return end; | |||||
return std::make_pair(end, false);; | |||||
} | } | ||||
/*! | /*! | ||||
* find helper for set using index for begin-end instead of iterators. | * find helper for set using index for begin-end instead of iterators. | ||||
@@ -687,13 +701,19 @@ struct session_t { | |||||
OutputMode outputMode {OutputMode::STD}; //!< Type of the output file | OutputMode outputMode {OutputMode::STD}; //!< Type of the output file | ||||
std::ofstream outFile {}; //!< File to use for output | std::ofstream outFile {}; //!< File to use for output | ||||
std::size_t max_threads {}; //!< Maximum threads to use | std::size_t max_threads {}; //!< Maximum threads to use | ||||
std::size_t repeat {1}; //!< How many times we execute the calculations part | |||||
bool timing {false}; //!< Enable timing prints of the program | bool timing {false}; //!< Enable timing prints of the program | ||||
bool verbose {false}; //!< Flag to enable verbose output to stdout | bool verbose {false}; //!< Flag to enable verbose output to stdout | ||||
#if CODE_VERSION == 3 | |||||
bool makeSymmetric {false}; //!< symmetric matrix creation flag (true by default) | |||||
#else | |||||
bool makeSymmetric {true}; //!< symmetric matrix creation flag (true by default) | bool makeSymmetric {true}; //!< symmetric matrix creation flag (true by default) | ||||
#endif | |||||
bool validate_mtx {false}; //!< Flag to request mtx input data triangular validation. | bool validate_mtx {false}; //!< Flag to request mtx input data triangular validation. | ||||
bool print_count {false}; //!< Flag to request total count printing | bool print_count {false}; //!< Flag to request total count printing | ||||
bool mtx_print {false}; //!< matrix print flag | bool mtx_print {false}; //!< matrix print flag | ||||
std::size_t mtx_print_size {}; //!< matrix print size | std::size_t mtx_print_size {}; //!< matrix print size | ||||
bool dynamic {false}; //!< Selects dynamic scheduling for OpenMP and pthreads. | |||||
}; | }; | ||||
extern session_t session; | extern session_t session; | ||||
@@ -11,6 +11,7 @@ | |||||
#include <iostream> | #include <iostream> | ||||
#include <mutex> | #include <mutex> | ||||
#include <atomic> | |||||
#include <impl.hpp> | #include <impl.hpp> | ||||
#if defined CILK | #if defined CILK | ||||
@@ -24,6 +24,8 @@ | |||||
#elif defined THREADS | #elif defined THREADS | ||||
#include <thread> | #include <thread> | ||||
#include <numeric> | |||||
#include <random> | |||||
#else | #else | ||||
#endif | #endif | ||||
@@ -0,0 +1,31 @@ | |||||
#!/usr/bin/env bash
# Run every executable in out/ against the given matrix files.
# usage: runall.sh <matrix file>... <iterations>

if [[ $# -lt 2 ]]; then
   echo "Error: You must pass the matrix files and the number of iterations"
   echo "example $ runall.sh mtx/s12.mtx 5"
   exit 1;
fi

# executables that also understand the --dynamic scheduling flag
dynamics=("out/hpc_ompv3" "out/hpc_ompv4" "out/hpc_pthv4")

for ex in out/*; do
   echo "-------------------------------------------"
   echo "executable: $ex"
   for file in "$@"; do
      # the last positional argument is the iteration count, not a matrix file
      if [[ $file == "${@: -1}" ]]; then
         continue
      fi
      echo "running $ex -i $file -r ${@: -1} --timing -o /dev/null"
      eval $ex -i $file -r ${@: -1} --timing -o /dev/null
      echo "running $ex -i $file -r ${@: -1} --timing --print_count"
      eval $ex -i $file -r ${@: -1} --timing --print_count
      if [[ $ex == ${dynamics[0]} || $ex == ${dynamics[1]} || $ex == ${dynamics[2]} ]]; then
         echo "running $ex -i $file -r ${@: -1} --timing -o /dev/null --dynamic"
         eval $ex -i $file -r ${@: -1} --timing -o /dev/null --dynamic
         echo "running $ex -i $file -r ${@: -1} --timing --print_count --dynamic"
         eval $ex -i $file -r ${@: -1} --timing --print_count --dynamic
      fi
   done
done
@@ -0,0 +1,126 @@ | |||||
/*! | |||||
* \file elearn.cpp | |||||
* \brief e-learning version of the exercise. | |||||
* | |||||
* \author | |||||
* Christos Choutouridis AEM:8997 | |||||
* <cchoutou@ece.auth.gr> | |||||
*/ | |||||
#include <elearn.h> | |||||
//------- e-learning code start --------- | |||||
//! Credits to PDS team | |||||
/*!
 * Convert a COO-encoded sparse matrix to CSC.
 *
 * \param row        [out] CSC row indices (size nnz), 0-based
 * \param col        [out] CSC column pointers (size n+1), 0-based
 * \param row_coo    [in]  COO row indices
 * \param col_coo    [in]  COO column indices
 * \param nnz        number of non-zero entries
 * \param n          matrix dimension
 * \param isOneBased 1 if the COO input is 1-based, 0 otherwise
 */
static void coo2csc_e(
    uint32_t *row, uint32_t *col, uint32_t const* row_coo, uint32_t const* col_coo, uint32_t nnz, uint32_t n, uint32_t isOneBased
) {
   // ----- cannot assume that input is already 0!
   for (uint32_t l = 0; l < n+1; l++) col[l] = 0;
   // ----- find the correct column sizes
   for (uint32_t l = 0; l < nnz; l++)
      col[col_coo[l] - isOneBased]++;
   // ----- cumulative sum
   for (uint32_t i = 0, cumsum = 0; i < n; i++) {
      uint32_t temp = col[i];
      col[i] = cumsum;
      cumsum += temp;
   }
   col[n] = nnz;
   // ----- copy the row indices to the correct place
   for (uint32_t l = 0; l < nnz; l++) {
      uint32_t col_l = col_coo[l] - isOneBased;
      uint32_t dst = col[col_l];
      row[dst] = row_coo[l] - isOneBased;
      col[col_l]++;
   }
   // ----- revert the column pointers
   for (uint32_t i = 0, last = 0; i < n; i++) {
      uint32_t temp = col[i];
      col[i] = last;
      last = temp;
   }
}

/*!
 * A small binary search utility over the half-open range [begin, end).
 *
 * NOTE: the previous closed-range form underflowed (unsigned arithmetic)
 * on empty columns or when the probe reached index 0, reading out of
 * bounds and occasionally reporting a false positive at v[begin-1].
 *
 * \return The index of \p match inside v[begin..end), or \p end on failure.
 */
uint32_t find_idx(const uint32_t* v, uint32_t begin, uint32_t end, uint32_t match) {
   uint32_t lo = begin, hi = end;
   while (lo < hi) {
      uint32_t m = lo + (hi - lo)/2;   // midpoint without lo+hi overflow
      if (v[m] == match) return m;
      if (v[m] < match)  lo = m + 1;
      else               hi = m;
   }
   return end;                          // not found (also covers empty range)
}

/*!
 * Sparse matrix item accessor: 1 if entry (i, j) exists in CSC (R, C), else 0.
 */
uint32_t get(uint32_t* R, uint32_t* C, uint32_t i, uint32_t j) {
   uint32_t e = C[j+1];
   return (find_idx(R, C[j], e, i) != e) ? 1 : 0;
}

/*!
 * \param coo_row pointer to coo row data (1-based)
 * \param coo_col pointer to coo column data (1-based)
 * \param n       the size of the matrix
 * \param nz      the number of non-zero items
 * \return The vertex-wise triangle count vector (heap allocated, size n).
 *         The caller owns it and must free() it.
 */
uint32_t* vertexWiseTriangleCounts (uint32_t *coo_row, uint32_t *coo_col, uint32_t n, uint32_t nz) {
   // calloc: the counters must start at zero (malloc'ed memory is uninitialized)
   uint32_t* v = (uint32_t*)calloc(n, sizeof(uint32_t));
   uint32_t* R = (uint32_t*)malloc(sizeof(uint32_t)*nz);
   uint32_t* C = (uint32_t*)malloc(sizeof(uint32_t)*(n+1));   // was *n+1: one element short

   // convert input to CSC
   coo2csc_e (R, C, coo_row, coo_col, nz, n, 1);

   // For each edge (i, j_idx) and each edge (j_idx, k_idx), a closing
   // edge (k_idx, i) completes a triangle; credit all three vertices.
   for (uint32_t i=0 ; i<n ; ++i) {
      for (uint32_t j = C[i]; j<C[i+1] ; ++j) {
         uint32_t j_idx = R[j];
         for (uint32_t k = C[j_idx] ; k<C[j_idx+1] ; ++k) {
            uint32_t k_idx = R[k];
            if (get(R, C, k_idx, i)) {
               ++v[i];
               ++v[j_idx];
               ++v[k_idx];
            }
         }
      }
   }
   // the CSC scratch buffers are no longer needed (previously leaked)
   free(R);
   free(C);
   return v;
}
//------- e-learning code end --------- | |||||
/*! | |||||
* A unit-test like functionality to check our implementation. | |||||
* \return | |||||
*/ | |||||
uint32_t elearn_test (void) { | |||||
uint32_t CooR[] = { 2, 4, 6, 7, 3, 5, 6, 8, 11, 12, 4, 11, 12, 7, 6, 7, 9, 10, 12}; | |||||
uint32_t CooC[] = { 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 5, 6, 8, 8, 11}; | |||||
uint32_t N = 12; | |||||
uint32_t NZ = 19; | |||||
uint32_t c3[] = { 3, 5, 3, 1, 1, 3, 2, 0, 0, 0, 3, 3 }; | |||||
uint32_t* tc3 = vertexWiseTriangleCounts(CooR, CooC, N, NZ); // call | |||||
for (uint32_t i=0 ; i<N ; ++i) // validate | |||||
if (tc3[i] != c3[i]) | |||||
return 0; // fail | |||||
return 1; // pass | |||||
} | |||||
@@ -55,26 +55,72 @@ bool get_options(int argc, char* argv[]){ | |||||
else if (arg == "-n" || arg == "--max_trheads") { | else if (arg == "-n" || arg == "--max_trheads") { | ||||
session.max_threads = (i+1 < argc) ? std::atoi(argv[++i]) : session.max_threads; | session.max_threads = (i+1 < argc) ? std::atoi(argv[++i]) : session.max_threads; | ||||
} | } | ||||
else if (arg == "-r" || arg == "--repeat") { | |||||
session.repeat = (i+1 < argc) ? std::atoi(argv[++i]) : session.repeat; | |||||
} | |||||
else if (arg == "-t" || arg == "--timing") | else if (arg == "-t" || arg == "--timing") | ||||
session.timing = true; | session.timing = true; | ||||
else if (arg == "-v" || arg == "--verbose") | else if (arg == "-v" || arg == "--verbose") | ||||
session.verbose = true; | session.verbose = true; | ||||
else if (arg == "--make_symmetric") | |||||
session.makeSymmetric = true; | |||||
else if (arg == "--triangular_only") | else if (arg == "--triangular_only") | ||||
session.makeSymmetric = false; | session.makeSymmetric = false; | ||||
else if (arg == "--validate_mtx") | else if (arg == "--validate_mtx") | ||||
session.validate_mtx = true; | session.validate_mtx = true; | ||||
else if (arg == "--print_count") | |||||
else if (arg == "--dynamic") | |||||
session.dynamic = true; | |||||
else if (arg == "--print_count") { | |||||
session.print_count = true; | session.print_count = true; | ||||
session.makeSymmetric = false; | |||||
} | |||||
else if (arg == "--print_graph") { | else if (arg == "--print_graph") { | ||||
session.mtx_print = true; | session.mtx_print = true; | ||||
session.mtx_print_size = (i+1 < argc) ? std::atoi(argv[++i]) : session.mtx_print_size; | session.mtx_print_size = (i+1 < argc) ? std::atoi(argv[++i]) : session.mtx_print_size; | ||||
} | } | ||||
else if (arg == "-h" || arg == "--help") { | else if (arg == "-h" || arg == "--help") { | ||||
std::cout << "Help message\n"; | |||||
std::cout << "vertex-wise triangular count utility.\n\n"; | |||||
std::cout << "tcount -i <file> | -g <size> <probability> [-o <file>] [-n <threads>] [--dynamic] [-r <times>] [-t] [-v]\n"; | |||||
std::cout << " [--make_symmetric] [--triangular_only] [--print_count] [--validate_mtx] [--print_graph <size>]\n"; | |||||
std::cout << '\n'; | |||||
std::cout << "Options:\n\n"; | |||||
std::cout << " -i | --input <file>\n"; | |||||
std::cout << " Path to mtx file to load.\n\n"; | |||||
std::cout << " -g | --generate <size> <probability>\n"; | |||||
std::cout << " Request a random generated graph with size <size> and probability <probability>.\n"; | |||||
std::cout << " This is very slow, use it with care.\n\n"; | |||||
std::cout << " -o | --output <file>\n"; | |||||
std::cout << " Select <file> as output file. Default is stdout.\n\n"; | |||||
std::cout << " -n | --max_trheads <threads>\n"; | |||||
std::cout << " Reduce the thread number for the execution to <threads>. <threads> must be less or equal to available CPUs.\n\n"; | |||||
std::cout << " --dynamic\n"; | |||||
std::cout << " Request of dynamic scheduling for OpenMP and pthreads. Does not affect cilk versions.\n\n"; | |||||
std::cout << " -r | --repeat <times>\n"; | |||||
std::cout << " Repeat the vector calculation <times> times.\n\n"; | |||||
std::cout << " -t | --timing\n"; | |||||
std::cout << " Request timing measurements output to stdout.\n\n"; | |||||
std::cout << " -v | --verbose\n"; | |||||
std::cout << " Request a more verbose output to stdout.\n\n"; | |||||
std::cout << " --make_symmetric\n"; | |||||
std::cout << " Explicitly request a symmetric graph generation. This affects only V3 versions where by default a lower\n"; | |||||
std::cout << " triangular matrix is used.\n\n"; | |||||
std::cout << " --triangular_only\n"; | |||||
std::cout << " NOTE: Requires also \"--print_count\".\n"; | |||||
std::cout << " Explicitly request to use a lower triangular matrix. This affects only V4 versions where a symmetric\n"; | |||||
std::cout << " matrix is used by default and produce correct answer ONLY for total triangle counting (--print_count).\n\n"; | |||||
std::cout << " --print_count\n"; | |||||
std::cout << " NOTE: When used, also implies \"---triangular_only\" for all versions.\n"; | |||||
std::cout << " Request a total triangle counting output.\n\n"; | |||||
std::cout << " --validate_mtx\n"; | |||||
std::cout << " Request an input matrix validation before execution.\n\n"; | |||||
std::cout << " --print_graph <size>\n"; | |||||
std::cout << " Prints the first <size> x <size> part of the matrix to stdout.\n\n"; | |||||
std::cout << " -h | --help <size>\n"; | |||||
std::cout << " Prints this and exit.\n"; | |||||
exit(0); | exit(0); | ||||
} | } | ||||
else { // parse error | else { // parse error | ||||
std::cout << "Error message\n"; | |||||
std::cout << "Invokation error. Try -h for details.\n"; | |||||
status = false; | status = false; | ||||
} | } | ||||
} | } | ||||
@@ -84,6 +130,12 @@ bool get_options(int argc, char* argv[]){ | |||||
std::cout << "Error message\n"; | std::cout << "Error message\n"; | ||||
status = false; | status = false; | ||||
} | } | ||||
#if CODE_VERSION == V4 | |||||
else if (!session.makeSymmetric && !session.print_count) { | |||||
std::cout << "\"--triangular_only\" requires \"--print_count\"\n"; | |||||
status = false; | |||||
} | |||||
#endif | |||||
return status; | return status; | ||||
} | } | ||||
@@ -129,17 +181,26 @@ int main(int argc, char* argv[]) try { | |||||
std::vector<value_t> c; | std::vector<value_t> c; | ||||
index_t s; | index_t s; | ||||
#if defined ELEARNING | |||||
if (!elearn_test()) std::cout << "E-learning test: FAIL\n"; | |||||
else std::cout << "E-learning test: PASS\n"; | |||||
exit(0); | |||||
#endif | |||||
// try to read command line | // try to read command line | ||||
if (!get_options(argc, argv)) | if (!get_options(argc, argv)) | ||||
exit(1); | exit(1); | ||||
prepare_matrix(A, timer); | prepare_matrix(A, timer); | ||||
threads_info(); | threads_info(); | ||||
logger << "Create count vector" << logger.endl; | |||||
timer.start(); | |||||
c = triang_v (A); | |||||
timer.stop(); | |||||
timer.print_dt("create count vector"); | |||||
for (size_t i =0 ; i<session.repeat ; ++i) { | |||||
// repeat calculations as requested by user | |||||
logger << "Create vector" << logger.endl; | |||||
timer.start(); | |||||
c = triang_v (A); | |||||
timer.stop(); | |||||
timer.print_dt("create vector"); | |||||
} | |||||
if (session.print_count) { | if (session.print_count) { | ||||
logger << "Calculate total triangles" << logger.endl; | logger << "Calculate total triangles" << logger.endl; | ||||
timer.start(); | timer.start(); | ||||
@@ -156,7 +217,7 @@ int main(int argc, char* argv[]) try { | |||||
return 0; | return 0; | ||||
} | } | ||||
catch (std::exception& e) { | catch (std::exception& e) { | ||||
//we probably pollute the user's screen. Comment `cerr << ...` if you don't like it. | |||||
std::cerr << e.what() << '\n'; | |||||
//we probably pollute the user's screen. Comment `cerr << ...` if you don't like it. | |||||
std::cerr << e.what() << '\n'; | |||||
exit(1); | exit(1); | ||||
} | } |
@@ -8,23 +8,20 @@ | |||||
*/ | */ | ||||
#include <v3.h> | #include <v3.h> | ||||
// for (int i=0 ; i<A.size() ; ++i) { | |||||
// for (int j = A.col_ptr[i]; j<A.col_ptr[i+1] ; ++j) { | |||||
// int j_idx = A.rows[j]; | |||||
// for (int k = A.col_ptr[j_idx] ; k<A.col_ptr[j_idx+1] ; ++k) { | |||||
// int k_idx = A.rows[k]; | |||||
// if (A.get(k_idx, i)) { | |||||
// ++c[i]; | |||||
// } | |||||
// } | |||||
// } | |||||
// } | |||||
namespace v3 { | namespace v3 { | ||||
#if defined CILK | #if defined CILK | ||||
// export CILK_NWORKERS=<num> | |||||
/*! | |||||
* Utility function to get/set the number of threads. | |||||
* | |||||
* The number of threads are controlled via environment variable \c CILK_NWORKERS | |||||
* | |||||
* \return The number of threads used. | |||||
* \note | |||||
* The user can reduce the number with the command option \c --max_threads. | |||||
* If so the requested number will be used even if the environment has more threads available. | |||||
*/ | |||||
int nworkers() { | int nworkers() { | ||||
if (session.max_threads) | if (session.max_threads) | ||||
return (session.max_threads < __cilkrts_get_nworkers()) ? | return (session.max_threads < __cilkrts_get_nworkers()) ? | ||||
@@ -33,45 +30,93 @@ int nworkers() { | |||||
return __cilkrts_get_nworkers(); | return __cilkrts_get_nworkers(); | ||||
} | } | ||||
/*! | |||||
* Calculate and return a vertex-wise count vector. | |||||
* | |||||
* \param A The matrix to use. | |||||
* \return The count vector. RVO is used here. | |||||
* \note | |||||
* We use two methods of calculation based on \c --make_symmetric or \c --triangular_only | |||||
* - A full matrix calculation which update only c[i] | |||||
* - A lower triangular matrix which update c[i], c[j], c[k]. This is wayyy faster. | |||||
*/ | |||||
std::vector<value_t> triang_v(matrix& A) { | std::vector<value_t> triang_v(matrix& A) { | ||||
std::vector<value_t> c(A.size()); | |||||
std::vector<std::atomic<value_t>> c(A.size()); | |||||
std::vector<value_t> ret(A.size()); | |||||
cilk_for (int i=0 ; i<A.size() ; ++i) { | cilk_for (int i=0 ; i<A.size() ; ++i) { | ||||
for (auto j = A.getCol(i); j.index() != j.end() ; ++j) // j list all the edges with i | |||||
for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k) // k list all the edges with j | |||||
if (A.get(k.index(), i)) // search for i-k edge | |||||
++c[i]; | |||||
for (auto j = A.getCol(i); j.index() != j.end() ; ++j) { | |||||
// j list all the edges with i | |||||
for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k) { | |||||
// k list all the edges with j | |||||
if (A.get(k.index(), i)) { | |||||
++ret[i]; | |||||
c[j.index()] += (!session.makeSymmetric)? 1:0; | |||||
c[k.index()] += (!session.makeSymmetric)? 1:0; | |||||
} | |||||
} | |||||
} | |||||
if (session.makeSymmetric) { | |||||
ret[i] = ret[i]/2; | |||||
c[i] = c[i]/2; | |||||
} | |||||
} | } | ||||
if (session.makeSymmetric) | |||||
std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) { | |||||
return x/2; | |||||
}); | |||||
return c; | |||||
for (index_t i =0 ; i<A.size() ; ++i) ret[i] += c[i]; | |||||
return ret; | |||||
} | } | ||||
/*! | |||||
* A sum utility to use as spawn function for parallelized sum. | |||||
* \return The sum of \c v from \c begin to \c end. | |||||
*/ | |||||
void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) { | void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) { | ||||
for (auto i =begin ; i != end ; ++i) | for (auto i =begin ; i != end ; ++i) | ||||
out_sum += v[i]; | out_sum += v[i]; | ||||
} | } | ||||
/*! | |||||
* A parallelized version of sum. Just because ;) | |||||
* \return The total sum of vector \c v | |||||
*/ | |||||
value_t sum (std::vector<value_t>& v) { | value_t sum (std::vector<value_t>& v) { | ||||
int n = nworkers(); | int n = nworkers(); | ||||
std::vector<value_t> sum_v(n, 0); | |||||
std::vector<value_t> sum_v(n, 0); // result of each do_sum invokation. | |||||
// We spawn workers in a more statically way. | |||||
for (index_t i =0 ; i < n ; ++i) { | for (index_t i =0 ; i < n ; ++i) { | ||||
cilk_spawn do_sum(sum_v[i], v, i*v.size()/n, (i+1)*v.size()/n); | cilk_spawn do_sum(sum_v[i], v, i*v.size()/n, (i+1)*v.size()/n); | ||||
} | } | ||||
cilk_sync; | cilk_sync; | ||||
value_t s =0; | |||||
for (auto& it : sum_v) s += it; | |||||
// sum the sums (a sum to rule them all) | |||||
value_t s =0; for (auto& it : sum_v) s += it; | |||||
return s; | return s; | ||||
} | } | ||||
#elif defined OMP | #elif defined OMP | ||||
/* | |||||
// export OMP_NUM_THREADS=<num> | |||||
/*! | |||||
* A "simple" user defined OpenMP reduction for vector<value_t> | |||||
* \note | |||||
* Not used. Reason: The atomic version of the code performs better. | |||||
*/ | |||||
#pragma omp declare reduction(vec_value_plus : std::vector<value_t> : \ | |||||
std::transform( \ | |||||
omp_out.begin(), omp_out.end(), omp_in.begin(), omp_out.begin(), std::plus<value_t>() \ | |||||
) \ | |||||
) \ | |||||
initializer(omp_priv = decltype(omp_orig)(omp_orig.size())) | |||||
/*! | |||||
* Utility function to get/set the number of threads. | |||||
* | |||||
* The number of threads are controlled via environment variable \c OMP_NUM_THREADS | |||||
* | |||||
* \return The number of threads used. | |||||
* \note | |||||
* The user can reduce the number with the command option \c --max_threads. | |||||
* If so the requested number will be used even if the environment has more threads available. | |||||
*/ | */ | ||||
int nworkers() { | int nworkers() { | ||||
if (session.max_threads && session.max_threads < (size_t)omp_get_max_threads()) { | if (session.max_threads && session.max_threads < (size_t)omp_get_max_threads()) { | ||||
@@ -85,23 +130,49 @@ int nworkers() { | |||||
} | } | ||||
} | } | ||||
/*! | |||||
* Calculate and return a vertex-wise count vector. | |||||
* | |||||
* \param A The matrix to use. | |||||
* \return The count vector. RVO is used here. | |||||
* \note | |||||
* We use two methods of calculation based on \c --make_symmetric or \c --triangular_only | |||||
* - A full matrix calculation which update only c[i] | |||||
* - A lower triangular matrix which update c[i], c[j], c[k]. This is wayyy faster. | |||||
*/ | |||||
std::vector<value_t> triang_v(matrix& A) { | std::vector<value_t> triang_v(matrix& A) { | ||||
std::vector<value_t> c(A.size()); | |||||
std::vector<std::atomic<value_t>> c(A.size()); | |||||
std::vector<value_t> ret(A.size()); | |||||
#pragma omp parallel for shared(c) | |||||
// OMP schedule selection | |||||
if (session.dynamic) omp_set_schedule (omp_sched_dynamic, 0); | |||||
else omp_set_schedule (omp_sched_static, 0); | |||||
#pragma omp parallel for schedule(runtime) //reduction(vec_value_plus : c) | |||||
for (int i=0 ; i<A.size() ; ++i) { | for (int i=0 ; i<A.size() ; ++i) { | ||||
for (auto j = A.getCol(i); j.index() != j.end() ; ++j) // j list all the edges with i | |||||
for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k) // k list all the edges with j | |||||
if (A.get(k.index(), i)) // search for i-k edge | |||||
++c[i]; | |||||
for (auto j = A.getCol(i); j.index() != j.end() ; ++j) { | |||||
// j list all the edges with i | |||||
for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k) { | |||||
// k list all the edges with j | |||||
if (A.get(k.index(), i)) { | |||||
++ret[i]; | |||||
c[j.index()] += (!session.makeSymmetric)? 1:0; | |||||
c[k.index()] += (!session.makeSymmetric)? 1:0; | |||||
} | |||||
} | |||||
} | |||||
if (session.makeSymmetric) { | |||||
ret[i] = ret[i]/2; | |||||
c[i] = c[i]/2; | |||||
} | |||||
} | } | ||||
if (session.makeSymmetric) | |||||
std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) { | |||||
return x/2; | |||||
}); | |||||
return c; | |||||
for (index_t i =0 ; i<A.size() ; ++i) ret[i] += c[i]; | |||||
return ret; | |||||
} | } | ||||
/*! | |||||
* A parallelized version of sum. Just because ;) | |||||
* \return The total sum of vector \c v | |||||
*/ | |||||
value_t sum (std::vector<value_t>& v) { | value_t sum (std::vector<value_t>& v) { | ||||
value_t s =0; | value_t s =0; | ||||
@@ -113,24 +184,44 @@ value_t sum (std::vector<value_t>& v) { | |||||
#else | #else | ||||
//! Return the number of workers (always 1 — serial build).
//! \note This function exists only for interface completeness.
int nworkers() { return 1; }
/*! | |||||
* Calculate and return a vertex-wise count vector. | |||||
* | |||||
* \param A The matrix to use. | |||||
* \return The count vector. RVO is used here. | |||||
* \note | |||||
* We use two methods of calculation based on \c --make_symmetric or \c --triangular_only | |||||
* - A full matrix calculation which update only c[i] | |||||
* - A lower triangular matrix which update c[i], c[j], c[k]. This is wayyy faster. | |||||
*/ | |||||
std::vector<value_t> triang_v(matrix& A) { | std::vector<value_t> triang_v(matrix& A) { | ||||
std::vector<value_t> c(A.size()); | std::vector<value_t> c(A.size()); | ||||
for (int i=0 ; i<A.size() ; ++i) { | for (int i=0 ; i<A.size() ; ++i) { | ||||
for (auto j = A.getCol(i); j.index() != j.end() ; ++j) // j list all the edges with i | |||||
for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k) // k list all the edges with j | |||||
if (A.get(k.index(), i)) // search for i-k edge | |||||
for (auto j = A.getCol(i); j.index() != j.end() ; ++j) { | |||||
// j list all the edges with i | |||||
for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k) { | |||||
// k list all the edges with j | |||||
if (A.get(k.index(), i)) { | |||||
++c[i]; | ++c[i]; | ||||
c[j.index()] += (!session.makeSymmetric)? 1:0; | |||||
c[k.index()] += (!session.makeSymmetric)? 1:0; | |||||
} | |||||
} | |||||
} | |||||
if (session.makeSymmetric) c[i] /= 2; | |||||
} | } | ||||
if (session.makeSymmetric) | |||||
std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) { | |||||
return x/2; | |||||
}); | |||||
return c; | return c; | ||||
} | } | ||||
/*! | |||||
* Summation functionality. | |||||
* \return The total sum of vector \c v | |||||
*/ | |||||
value_t sum (std::vector<value_t>& v) { | value_t sum (std::vector<value_t>& v) { | ||||
value_t s =0; | value_t s =0; | ||||
for (auto& it : v) | for (auto& it : v) | ||||
@@ -140,9 +231,9 @@ value_t sum (std::vector<value_t>& v) { | |||||
#endif | #endif | ||||
//! Polymorphic interface function for sum results.
//! Each triangle is credited to all three of its vertices, so the
//! total of the count vector is always 3x the number of triangles.
value_t triang_count (std::vector<value_t>& c) {
   return sum(c)/3;
}
} | } |
@@ -12,7 +12,16 @@ namespace v4 { | |||||
#if defined CILK | #if defined CILK | ||||
// export CILK_NWORKERS=<num> | |||||
/*! | |||||
* Utility function to get/set the number of threads. | |||||
* | |||||
* The number of threads are controlled via environment variable \c CILK_NWORKERS | |||||
* | |||||
* \return The number of threads used. | |||||
* \note | |||||
* The user can reduce the number with the command option \c --max_threads. | |||||
* If so the requested number will be used even if the environment has more threads available. | |||||
*/ | |||||
int nworkers() { | int nworkers() { | ||||
if (session.max_threads) | if (session.max_threads) | ||||
return (session.max_threads < __cilkrts_get_nworkers()) ? | return (session.max_threads < __cilkrts_get_nworkers()) ? | ||||
@@ -21,6 +30,25 @@ int nworkers() { | |||||
return __cilkrts_get_nworkers(); | return __cilkrts_get_nworkers(); | ||||
} | } | ||||
/*! | |||||
* Calculate and return a vertex-wise count vector. | |||||
* | |||||
* 1 | |||||
* vector = --- * (A.* (A*B))*ones_N | |||||
* 2 | |||||
* We squeezed all that to one function for performance. The row*column multiplication | |||||
* uses the inner CSC structure of sparse matrix and follows only non-zero members. | |||||
* | |||||
* \param A The first matrix to use. | |||||
* \param B The second matrix to use (they can be the same). | |||||
* \return The count vector. RVO is used here. | |||||
* \note | |||||
* We use two methods of calculation based on \c --make_symmetric or \c --triangular_only | |||||
* - A full matrix calculation which update only c[i] | |||||
* - A lower triangular matrix which update c[i], c[j], c[k]. This is wayyy faster. | |||||
* \warning | |||||
* The later(--triangular_only) produce correct results ONLY if we are after the total count. | |||||
*/ | |||||
std::vector<value_t> mmacc_v(matrix& A, matrix& B) { | std::vector<value_t> mmacc_v(matrix& A, matrix& B) { | ||||
std::vector<value_t> c(A.size()); | std::vector<value_t> c(A.size()); | ||||
@@ -28,37 +56,50 @@ std::vector<value_t> mmacc_v(matrix& A, matrix& B) { | |||||
for (auto j = A.getRow(i); j.index() != j.end() ; ++j){ | for (auto j = A.getRow(i); j.index() != j.end() ; ++j){ | ||||
c[i] += A.getRow(i)*B.getCol(j.index()); | c[i] += A.getRow(i)*B.getCol(j.index()); | ||||
} | } | ||||
if (session.makeSymmetric) c[i] /= 2; | |||||
} | } | ||||
if (session.makeSymmetric) | |||||
std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) { | |||||
return x/2; | |||||
}); | |||||
return c; | return c; | ||||
} | } | ||||
/*! | |||||
* A sum utility to use as spawn function for parallelized sum. | |||||
* \return The sum of \c v from \c begin to \c end. | |||||
*/ | |||||
void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) { | void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) { | ||||
for (auto i =begin ; i != end ; ++i) | for (auto i =begin ; i != end ; ++i) | ||||
out_sum += v[i]; | out_sum += v[i]; | ||||
} | } | ||||
/*! | |||||
* A parallelized version of sum. Just because ;) | |||||
* \return The total sum of vector \c v | |||||
*/ | |||||
value_t sum (std::vector<value_t>& v) { | value_t sum (std::vector<value_t>& v) { | ||||
int n = nworkers(); | int n = nworkers(); | ||||
std::vector<value_t> sum_v(n, 0); | |||||
std::vector<value_t> sum_v(n, 0); // result of each do_sum invokation. | |||||
// We spawn workers in a more statically way. | |||||
for (index_t i =0 ; i < n ; ++i) { | for (index_t i =0 ; i < n ; ++i) { | ||||
cilk_spawn do_sum(sum_v[i], v, i*v.size()/n, (i+1)*v.size()/n); | cilk_spawn do_sum(sum_v[i], v, i*v.size()/n, (i+1)*v.size()/n); | ||||
} | } | ||||
cilk_sync; | cilk_sync; | ||||
value_t s =0; | |||||
for (auto& it : sum_v) s += it; | |||||
// sum the sums (a sum to rule them all) | |||||
value_t s =0; for (auto& it : sum_v) s += it; | |||||
return s; | return s; | ||||
} | } | ||||
#elif defined OMP | #elif defined OMP | ||||
/* | |||||
// export OMP_NUM_THREADS=<num> | |||||
/*! | |||||
* Utility function to get/set the number of threads. | |||||
* | |||||
* The number of threads are controlled via environment variable \c OMP_NUM_THREADS | |||||
* | |||||
* \return The number of threads used. | |||||
* \note | |||||
* The user can reduce the number with the command option \c --max_threads. | |||||
* If so the requested number will be used even if the environment has more threads available. | |||||
*/ | */ | ||||
int nworkers() { | int nworkers() { | ||||
if (session.max_threads && session.max_threads < (size_t)omp_get_max_threads()) { | if (session.max_threads && session.max_threads < (size_t)omp_get_max_threads()) { | ||||
@@ -72,22 +113,45 @@ int nworkers() { | |||||
} | } | ||||
} | } | ||||
/*! | |||||
* Calculate and return a vertex-wise count vector. | |||||
* | |||||
* 1 | |||||
* vector = --- * (A.* (A*B))*ones_N | |||||
* 2 | |||||
* We squeezed all that to one function for performance. The row*column multiplication | |||||
* uses the inner CSC structure of sparse matrix and follows only non-zero members. | |||||
* | |||||
* \param A The first matrix to use. | |||||
* \param B The second matrix to use (they can be the same). | |||||
* \return The count vector. RVO is used here. | |||||
* \note | |||||
* We use two methods of calculation based on \c --make_symmetric or \c --triangular_only | |||||
* - A full matrix calculation which update only c[i] | |||||
* - A lower triangular matrix which update c[i], c[j], c[k]. This is wayyy faster. | |||||
* \warning | |||||
* The later(--triangular_only) produce correct results ONLY if we are after the total count. | |||||
*/ | |||||
std::vector<value_t> mmacc_v(matrix& A, matrix& B) { | std::vector<value_t> mmacc_v(matrix& A, matrix& B) { | ||||
std::vector<value_t> c(A.size()); | std::vector<value_t> c(A.size()); | ||||
#pragma omp parallel for shared(c) | |||||
// OMP schedule selection | |||||
if (session.dynamic) omp_set_schedule (omp_sched_dynamic, 0); | |||||
else omp_set_schedule (omp_sched_static, 0); | |||||
#pragma omp parallel for shared(c) schedule(runtime) | |||||
for (int i=0 ; i<A.size() ; ++i) { | for (int i=0 ; i<A.size() ; ++i) { | ||||
for (auto j = A.getRow(i); j.index() != j.end() ; ++j) { | for (auto j = A.getRow(i); j.index() != j.end() ; ++j) { | ||||
c[i] += A.getRow(i)*B.getCol(j.index()); | c[i] += A.getRow(i)*B.getCol(j.index()); | ||||
} | } | ||||
if (session.makeSymmetric) c[i] /= 2; | |||||
} | } | ||||
if (session.makeSymmetric) | |||||
std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) { | |||||
return x/2; | |||||
}); | |||||
return c; | return c; | ||||
} | } | ||||
/*! | |||||
* A parallelized version of sum. Just because ;) | |||||
* \return The total sum of vector \c v | |||||
*/ | |||||
value_t sum (std::vector<value_t>& v) { | value_t sum (std::vector<value_t>& v) { | ||||
value_t s =0; | value_t s =0; | ||||
@@ -99,8 +163,15 @@ value_t sum (std::vector<value_t>& v) { | |||||
#elif defined THREADS | #elif defined THREADS | ||||
/* | |||||
* std::thread::hardware_concurrency() | |||||
/*! | |||||
* Utility function to get/set the number of threads. | |||||
* | |||||
* The number of threads are inherited by the environment via std::thread::hardware_concurrency() | |||||
* | |||||
* \return The number of threads used. | |||||
* \note | |||||
* The user can reduce the number with the command option \c --max_threads. | |||||
* If so the requested number will be used even if the environment has more threads available. | |||||
*/ | */ | ||||
int nworkers() { | int nworkers() { | ||||
if (session.max_threads) | if (session.max_threads) | ||||
@@ -110,43 +181,89 @@ int nworkers() { | |||||
return std::thread::hardware_concurrency(); | return std::thread::hardware_concurrency(); | ||||
} | } | ||||
std::vector<value_t> mmacc_v_rng(std::vector<value_t>& out, matrix& A, matrix& B, index_t begin, index_t end) { | |||||
/*! | |||||
* A spawn function to calculate and return a vertex-wise count vector. | |||||
* | |||||
* 1 | |||||
* vector(begin..end) = --- * (A.* (A*B))*ones_N | |||||
* 2 | |||||
* | |||||
* We squeezed all that to one function for performance. The row*column multiplication | |||||
* uses the inner CSC structure of sparse matrix and follows only non-zero members. | |||||
* | |||||
* \param out Reference to output vector | |||||
* \param A The first matrix to use. | |||||
* \param B The second matrix to use (they can be the same). | |||||
* \param iton vector containing the range with the columns to use (it can be shuffled). | |||||
* \return The count vector. RVO is used here. | |||||
* \note | |||||
* We use two methods of calculation based on \c --make_symmetric or \c --triangular_only | |||||
* - A full matrix calculation which update only c[i] | |||||
* - A lower triangular matrix which update c[i], c[j], c[k]. This is wayyy faster. | |||||
* \warning | |||||
* The later(--triangular_only) produce correct results ONLY if we are after the total count. | |||||
*/ | |||||
std::vector<value_t> mmacc_v_rng( | |||||
std::vector<value_t>& out, matrix& A, matrix& B, std::vector<index_t>& iton, index_t begin, index_t end) { | |||||
for (index_t i=begin ; i<end ; ++i) { | for (index_t i=begin ; i<end ; ++i) { | ||||
for (auto j = A.getRow(i); j.index() != j.end() ; ++j){ | |||||
out[i] += A.getRow(i)*B.getCol(j.index()); | |||||
index_t ii = iton[i]; | |||||
for (auto j = A.getRow(ii); j.index() != j.end() ; ++j){ | |||||
out[ii] += A.getRow(ii)*B.getCol(j.index()); | |||||
} | } | ||||
if (session.makeSymmetric) out[ii] /= 2; | |||||
} | } | ||||
return out; | return out; | ||||
} | } | ||||
/*! | |||||
* Calculate and return a vertex-wise count vector. | |||||
* | |||||
* \param A The first matrix to use. | |||||
* \param B The second matrix to use (they can be the same). | |||||
* \return The count vector. RVO is used here. | |||||
*/ | |||||
std::vector<value_t> mmacc_v(matrix& A, matrix& B) { | std::vector<value_t> mmacc_v(matrix& A, matrix& B) { | ||||
std::vector<std::thread> workers; | std::vector<std::thread> workers; | ||||
std::vector<value_t> c(A.size()); | std::vector<value_t> c(A.size()); | ||||
int n = nworkers(); | int n = nworkers(); | ||||
for (index_t i=0 ; i<n ; ++i) | |||||
workers.push_back (std::thread (mmacc_v_rng, std::ref(c), std::ref(A), std::ref(B), i*c.size()/n, (i+1)*c.size()/n)); | |||||
std::vector<index_t> iton(A.size()); // Create a 0 .. N range for outer loop | |||||
std::iota(iton.begin(), iton.end(), 0); | |||||
if (session.dynamic) // in case of dynamic scheduling, shuffle the range | |||||
std::shuffle(iton.begin(), iton.end(), std::mt19937{std::random_device{}()}); | |||||
for (index_t i=0 ; i<n ; ++i) // dispatch the workers and hold them in a vector | |||||
workers.push_back ( | |||||
std::thread (mmacc_v_rng, std::ref(c), std::ref(A), std::ref(B), std::ref(iton), i*A.size()/n, (i+1)*A.size()/n) | |||||
); | |||||
// a for to join them all... | |||||
std::for_each(workers.begin(), workers.end(), [](std::thread& t){ | std::for_each(workers.begin(), workers.end(), [](std::thread& t){ | ||||
t.join(); | t.join(); | ||||
}); | }); | ||||
if (session.makeSymmetric) | |||||
std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) { | |||||
return x/2; | |||||
}); | |||||
return c; | return c; | ||||
} | } | ||||
/*! | |||||
* A sum utility to use as spawn function for parallelized sum. | |||||
* \return The sum of \c v from \c begin to \c end. | |||||
*/ | |||||
void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) { | void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) { | ||||
for (auto i =begin ; i != end ; ++i) | for (auto i =begin ; i != end ; ++i) | ||||
out_sum += v[i]; | out_sum += v[i]; | ||||
} | } | ||||
/*! | |||||
* A parallelized version of sum. Just because ;) | |||||
* \return The total sum of vector \c v | |||||
*/ | |||||
value_t sum (std::vector<value_t>& v) { | value_t sum (std::vector<value_t>& v) { | ||||
int n = nworkers(); | int n = nworkers(); | ||||
std::vector<value_t> sum_v(n, 0); | |||||
std::vector<value_t> sum_v(n, 0); // result of each do_sum invokation. | |||||
std::vector<std::thread> workers; | std::vector<std::thread> workers; | ||||
// We spawn workers in a more statically way. | |||||
for (index_t i =0 ; i < n ; ++i) | for (index_t i =0 ; i < n ; ++i) | ||||
workers.push_back (std::thread (do_sum, std::ref(sum_v[i]), std::ref(v), i*v.size()/n, (i+1)*v.size()/n)); | workers.push_back (std::thread (do_sum, std::ref(sum_v[i]), std::ref(v), i*v.size()/n, (i+1)*v.size()/n)); | ||||
@@ -154,29 +271,51 @@ value_t sum (std::vector<value_t>& v) { | |||||
t.join(); | t.join(); | ||||
}); | }); | ||||
value_t s =0; | |||||
for (auto& it : sum_v) s += it; | |||||
// sum the sums (a sum to rule them all) | |||||
value_t s =0; for (auto& it : sum_v) s += it; | |||||
return s; | return s; | ||||
} | } | ||||
#else | #else | ||||
//! Return the number of workers. | |||||
//! \note This function is just for completion | |||||
int nworkers() { return 1; } | int nworkers() { return 1; } | ||||
/*! | |||||
* Calculate and return a vertex-wise count vector. | |||||
* | |||||
* 1 | |||||
* vector = --- * (A.* (A*B))*ones_N | |||||
* 2 | |||||
* We squeezed all that to one function for performance. The row*column multiplication | |||||
* uses the inner CSC structure of sparse matrix and follows only non-zero members. | |||||
* | |||||
* \param A The first matrix to use. | |||||
* \param B The second matrix to use (they can be the same). | |||||
* \return The count vector. RVO is used here. | |||||
* \note | |||||
* We use two methods of calculation based on \c --make_symmetric or \c --triangular_only | |||||
* - A full matrix calculation which update only c[i] | |||||
* - A lower triangular matrix which update c[i], c[j], c[k]. This is wayyy faster. | |||||
* \warning | |||||
* The later(--triangular_only) produce correct results ONLY if we are after the total count. | |||||
*/ | |||||
std::vector<value_t> mmacc_v(matrix& A, matrix& B) { | std::vector<value_t> mmacc_v(matrix& A, matrix& B) { | ||||
std::vector<value_t> c(A.size()); | std::vector<value_t> c(A.size()); | ||||
for (int i=0 ; i<A.size() ; ++i) { | for (int i=0 ; i<A.size() ; ++i) { | ||||
for (auto j = A.getRow(i); j.index() != j.end() ; ++j){ | for (auto j = A.getRow(i); j.index() != j.end() ; ++j){ | ||||
c[i] += A.getRow(i)*B.getCol(j.index()); | c[i] += A.getRow(i)*B.getCol(j.index()); | ||||
} | } | ||||
if (session.makeSymmetric) c[i] /= 2; | |||||
} | } | ||||
if (session.makeSymmetric) | |||||
std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) { | |||||
return x/2; | |||||
}); | |||||
return c; | return c; | ||||
} | } | ||||
/*! | |||||
* Summation functionality. | |||||
* \return The total sum of vector \c v | |||||
*/ | |||||
value_t sum (std::vector<value_t>& v) { | value_t sum (std::vector<value_t>& v) { | ||||
value_t s =0; | value_t s =0; | ||||
for (auto& it : v) | for (auto& it : v) | ||||
@@ -186,10 +325,12 @@ value_t sum (std::vector<value_t>& v) { | |||||
#endif | #endif | ||||
//! Polymorphic interface function for the count vector.
//! Delegates to mmacc_v with A used as both operands.
std::vector<value_t> triang_v(matrix& A) {
   return mmacc_v(A, A);
}
//! Polymorphic interface function for sum results.
//! In symmetric mode each triangle is credited to all three of its
//! vertices, hence the division by 3.
value_t triang_count (std::vector<value_t>& c) {
   return (session.makeSymmetric) ? sum(c)/3 : sum(c);
}