A measurement ready version

This commit is contained in:
  parent fd8a3a2bc3
  commit 122a205892
.gitignore (vendored): 4 lines changed

@@ -5,6 +5,10 @@ out/
 mat/
 mtx/
 
+# hpc related
+exclude
+hpc_auth_sync.sh
+
 # eclipse
 .project
 .cproject
Makefile: 77 lines changed

@@ -38,7 +38,7 @@ DEP_DIR := $(BUILD_DIR)/.dep
 # ========== Compiler settings ==========
 # Compiler flags for debug and release
 DEB_CFLAGS := -DDEBUG -g3 -Wall -Wextra -std=c++14
-REL_CFLAGS := -DDEBUG -g3 -Wall -Wextra -O2 -std=c++14
+REL_CFLAGS := -Wall -Wextra -O3 -std=c++14
 # Pre-defines
 # PRE_DEFS := MYCAB=1729 SUPER_MODE
 PRE_DEFS :=

@@ -151,39 +151,50 @@ release: $(BUILD_DIR)/$(TARGET)
 
 all: release
 
-local_v3: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V3
+local_v3: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=3
 local_v3: TARGET := local_v3
 local_v3: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
 
-local_v4: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V4
+local_v4: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=4
 local_v4: TARGET := local_v4
 local_v4: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
 
+elearn: CFLAGS := $(DEB_CFLAGS) -DELEARNING
+elearn: TARGET := elearn
+elearn: $(BUILD_DIR)/$(TARGET)
+	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
+
+local_v4_opt: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=4 -pg
+local_v4_opt: LDFLAGS += -pg
+local_v4_opt: TARGET := local_v4_opt
+local_v4_opt: $(BUILD_DIR)/$(TARGET)
+	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
+
 v3: DOCKER := $(DOCKER_CMD)
-v3: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V3
+v3: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=3
 v3: TARGET := tcount_v3
 v3: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
 
 v3_cilk: DOCKER := $(DOCKER_CMD)
 v3_cilk: CXX := /usr/local/OpenCilk-9.0.1-Linux/bin/clang++
-v3_cilk: CFLAGS := $(REL_CFLAGS) -fcilkplus -DCODE_VERSION=V3 -DCILK
+v3_cilk: CFLAGS := $(REL_CFLAGS) -fcilkplus -DCODE_VERSION=3 -DCILK
 v3_cilk: LDFLAGS += -fcilkplus
 v3_cilk: TARGET := tcount_cilkv3
 v3_cilk: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
 
 v3_omp: DOCKER := $(DOCKER_CMD)
-v3_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=V3 -DOMP
+v3_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=3 -DOMP
 v3_omp: LDFLAGS += -fopenmp
 v3_omp: TARGET := tcount_ompv3
 v3_omp: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
 
 v4: DOCKER := $(DOCKER_CMD)
-v4: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V4
+v4: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=4
 v4: TARGET := tcount_v4
 v4: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)

@@ -197,25 +208,61 @@ v4_cilk: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
 
 v4_omp: DOCKER := $(DOCKER_CMD)
-v4_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=V4 -DOMP
+v4_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=4 -DOMP
 v4_omp: LDFLAGS += -fopenmp
 v4_omp: TARGET := tcount_ompv4
 v4_omp: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
 
 v4_pthreads: DOCKER := $(DOCKER_CMD)
-v4_pthreads: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V4 -DTHREADS
+v4_pthreads: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=4 -DTHREADS
 v4_pthreads: TARGET := tcount_pthv4
 v4_pthreads: $(BUILD_DIR)/$(TARGET)
 	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
 
 
 #
-# ================ Docker based rules ================
-# examples:
-# make IMAGE="gcc:8.3" dock
+# ================ hpc build rules =================
 #
-dock: DOCKER := $(DOCKER_CMD)
-dock: CFLAGS := $(REL_CFLAGS)
-dock: $(BUILD_DIR)/$(TARGET)
+hpc_v3_ser: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=3
+hpc_v3_ser: TARGET := hpc_v3
+hpc_v3_ser: $(BUILD_DIR)/$(TARGET)
+	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
+
+hpc_v3_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=3 -DOMP
+hpc_v3_omp: LDFLAGS += -fopenmp
+hpc_v3_omp: TARGET := hpc_ompv3
+hpc_v3_omp: $(BUILD_DIR)/$(TARGET)
+	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
+
+hpc_v3_cilk: CXX := clang++
+hpc_v3_cilk: CFLAGS := $(REL_CFLAGS) -fcilkplus -DCODE_VERSION=3 -DCILK
+hpc_v3_cilk: LDFLAGS += -fcilkplus
+hpc_v3_cilk: TARGET := hpc_cilkv3
+hpc_v3_cilk: $(BUILD_DIR)/$(TARGET)
+	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
+
+hpc_v4_ser: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=4
+hpc_v4_ser: TARGET := hpc_v4
+hpc_v4_ser: $(BUILD_DIR)/$(TARGET)
+	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
+
+hpc_v4_omp: CFLAGS := $(REL_CFLAGS) -fopenmp -DCODE_VERSION=4 -DOMP
+hpc_v4_omp: LDFLAGS += -fopenmp
+hpc_v4_omp: TARGET := hpc_ompv4
+hpc_v4_omp: $(BUILD_DIR)/$(TARGET)
+	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
+
+hpc_v4_cilk: CXX := clang++
+hpc_v4_cilk: CFLAGS := $(REL_CFLAGS) -fcilkplus -DCODE_VERSION=4 -DCILK
+hpc_v4_cilk: LDFLAGS += -fcilkplus
+hpc_v4_cilk: TARGET := hpc_cilkv4
+hpc_v4_cilk: $(BUILD_DIR)/$(TARGET)
+	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
+
+hpc_v4_pth: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=4 -DTHREADS
+hpc_v4_pth: TARGET := hpc_pthv4
+hpc_v4_pth: $(BUILD_DIR)/$(TARGET)
+	cp $(BUILD_DIR)/$(TARGET) out/$(TARGET)
+
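Note on the CODE_VERSION switch: the rules above move from symbolic values (-DCODE_VERSION=V3/V4) to plain numbers, which is what the new #if CODE_VERSION == 3 test in inc/impl.hpp relies on (a bare identifier such as V3 evaluates to 0 inside #if unless it is defined elsewhere). Below is a minimal, illustrative sketch of how a numeric version define can be consumed; only the CODE_VERSION macro name comes from the Makefile, everything else is made up for the example.

// Illustrative only: a numeric -DCODE_VERSION can be compared by the preprocessor.
#include <iostream>

#ifndef CODE_VERSION
#define CODE_VERSION 3      // assumed default for this sketch
#endif

int main() {
#if CODE_VERSION == 3
    std::cout << "built as v3 (lower triangular matrix by default)\n";
#elif CODE_VERSION == 4
    std::cout << "built as v4 (symmetric matrix by default)\n";
#else
    std::cout << "unknown CODE_VERSION\n";
#endif
    return 0;
}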
hpc-results/ntasks1.out: 1170 lines (new file; diff suppressed because it is too large)

hpc-results/ntasks1.sh: 19 lines (new file)

@@ -0,0 +1,19 @@
+#! /usr/bin/env bash
+
+#SBATCH --time=20:00
+#SBATCH --partition=batch
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=ntasks1.out
+
+module load gcc/9.2.0 openmpi/3.1.6
+LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/lib
+
+export OMP_NUM_THREADS=$SLURM_NTASKS
+export CILK_NWORKERS=$SLURM_NTASKS
+
+./runall.sh mtx/belgium_osm.mtx 8
+./runall.sh mtx/com-Youtube.mtx 8
+./runall.sh mtx/dblp-2010.mtx 8
+./runall.sh mtx/mycielskian13.mtx 8
+./runall.sh mtx/NACA0015.mtx 8

hpc-results/ntasks2.out, ntasks4.out, ntasks5.out, ntasks8.out, ntasks10.out, ntasks15.out, ntasks20.out: 1170 lines each (new files; diffs suppressed because they are too large)

hpc-results/ntasks2.sh, ntasks4.sh, ntasks5.sh, ntasks8.sh, ntasks10.sh, ntasks15.sh, ntasks20.sh: 19 lines each (new files)
Each of these scripts is identical to ntasks1.sh apart from two lines: ntasksN.sh uses "#SBATCH --ntasks-per-node=N" and "#SBATCH --output=ntasksN.out".
(file name not captured in this view)

@@ -13,6 +13,7 @@
 #include <v12.h>
 #include <v3.h>
 #include <v4.h>
+#include <elearn.h>
 
 /*
  * Defines for different version of the exercise
inc/elearn.h: 17 lines (new file)

@@ -0,0 +1,17 @@
+/*!
+ * \file elearn.h
+ * \brief e-learning version of the exercise.
+ *
+ * \author
+ *    Christos Choutouridis AEM:8997
+ *    <cchoutou@ece.auth.gr>
+ */
+#ifndef ELEARN_H_
+#define ELEARN_H_
+
+#include <impl.hpp>
+
+uint32_t elearn_test (void) ;
+
+
+#endif /* ELEARN_H_ */
inc/impl.hpp: 32 lines changed

@@ -284,8 +284,22 @@ struct SpMat {
    * @return The value of the item or DataType{} if is not present.
    */
   DataType get(IndexType i, IndexType j) {
-     IndexType end, idx =find_idx(rows, col_ptr[j], end=col_ptr[j+1], i);
-     return (idx != end) ? values[idx] : 0;
+     IndexType idx; bool found;
+     std::tie(idx, found) =find_idx(rows, col_ptr[j], col_ptr[j+1], i);
+     return (found) ? values[idx] : 0;
+  }
+
+  /*!
+   * A read item functionality using binary search to find the correct row
+   *
+   * @param i The row number
+   * @param j The column number
+   * @return The value of the item or DataType{} if is not present.
+   */
+  DataType get2(IndexType i, IndexType j) {
+     IndexType idx; bool found;
+     std::tie(idx, found) =find2_idx(rows, col_ptr[j], col_ptr[j+1], i);
+     return (found) ? values[idx] : 0;
   }
 
   /*!

@@ -380,18 +394,18 @@ private:
    * \param match What to search
    * @return The index of the item or end on failure.
    */
-  IndexType find_idx(const std::vector<IndexType>& v, IndexType begin, IndexType end, IndexType match) {
+  std::pair<IndexType, bool> find_idx(const std::vector<IndexType>& v, IndexType begin, IndexType end, IndexType match) {
      IndexType b = begin, e = end-1;
      while (true) {
         IndexType m = (b+e)/2;
-        if (v[m] == match) return m;
-        else if (b >= e) return end;
+        if (v[m] == match) return std::make_pair(m, true);
+        else if (b >= e) return std::make_pair(end, false);
         else {
            if (v[m] < match) b = m +1;
            else e = m -1;
         }
      }
-     return end;
+     return std::make_pair(end, false);;
   }
   /*!
    * find helper for set using index for begin-end instead of iterators.

@@ -687,13 +701,19 @@ struct session_t {
    OutputMode outputMode {OutputMode::STD};  //!< Type of the output file
    std::ofstream outFile {};                 //!< File to use for output
    std::size_t max_threads {};               //!< Maximum threads to use
+   std::size_t repeat {1};                   //!< How many times we execute the calculations part
    bool timing {false};                      //!< Enable timing prints of the program
    bool verbose {false};                     //!< Flag to enable verbose output to stdout
+#if CODE_VERSION == 3
+   bool makeSymmetric {false};               //!< symmetric matrix creation flag (true by default)
+#else
    bool makeSymmetric {true};                //!< symmetric matrix creation flag (true by default)
+#endif
    bool validate_mtx {false};                //!< Flag to request mtx input data triangular validation.
    bool print_count {false};                 //!< Flag to request total count printing
    bool mtx_print {false};                   //!< matrix print flag
    std::size_t mtx_print_size {};            //!< matrix print size
+   bool dynamic {false};                     //!< Selects dynamic scheduling for OpenMP and pthreads.
 };
 
 extern session_t session;
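The find_idx change above replaces the "end means not found" convention with an explicit (index, found) pair, which also removes the end=col_ptr[j+1] assignment that was hidden inside the old call from get(). Below is a small self-contained sketch of the same pattern, assuming plain uint32_t/std::size_t instead of the template IndexType and a half-open search interval; it is an illustration, not the project's exact loop.

// Sketch of a pair-returning binary search over the row indices of one CSC column.
#include <cstdint>
#include <iostream>
#include <tuple>
#include <utility>
#include <vector>

std::pair<std::size_t, bool>
find_idx(const std::vector<uint32_t>& v, std::size_t begin, std::size_t end, uint32_t match) {
   std::size_t b = begin, e = end;              // search in [begin, end)
   while (b < e) {
      std::size_t m = b + (e - b) / 2;
      if (v[m] == match) return {m, true};      // hit: report the position
      if (v[m] < match)  b = m + 1;
      else               e = m;
   }
   return {end, false};                         // miss: index is not meaningful
}

int main() {
   std::vector<uint32_t> rows {1, 4, 7, 9};     // row indices of one column
   std::size_t idx; bool found;
   std::tie(idx, found) = find_idx(rows, 0, rows.size(), 7);
   std::cout << (found ? "found at index " : "not found, index ") << idx << '\n';
}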
inc/v3.h: 1 line changed

@@ -11,6 +11,7 @@
 
 #include <iostream>
 #include <mutex>
+#include <atomic>
 #include <impl.hpp>
 
 #if defined CILK
inc/v4.h: 2 lines changed

@@ -24,6 +24,8 @@
 
 #elif defined THREADS
 #include <thread>
+#include <numeric>
+#include <random>
 
 #else
 #endif
runall.sh: 31 lines (new executable file)

@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+if [[ $# -lt 2 ]]; then
+   echo "Error: You must pass the matrix files and the number of iterations"
+   echo "example $ runnall.sh mtx/s12.mtx 5"
+   exit 1;
+fi
+
+dynamics=("out/hpc_ompv3" "out/hpc_ompv4" "out/hpc_pthv4")
+
+for ex in out/*; do
+   echo "-------------------------------------------"
+   echo "executable: $ex"
+   for file in "$@"; do
+      if [[ $file == ${@: -1} ]];then
+         continue
+      fi
+      echo "running $ex -i $file -r ${@: -1} --timing -o /dev/null"
+      eval $ex -i $file -r ${@: -1} --timing -o /dev/null
+      echo "running $ex -i $file -r ${@: -1} --timing --print_count"
+      eval $ex -i $file -r ${@: -1} --timing --print_count
+
+      if [[ $ex == ${dynamics[0]} || $ex == ${dynamics[1]} || $ex == ${dynamics[2]} ]]; then
+         echo "running $ex -i $file -r ${@: -1} --timing -o /dev/null --dynamic"
+         eval $ex -i $file -r ${@: -1} --timing -o /dev/null --dynamic
+         echo "running $ex -i $file -r ${@: -1} --timing --print_count --dynamic"
+         eval $ex -i $file -r ${@: -1} --timing --print_count --dynamic
+      fi
+   done
+done
+
src/elearn.cpp: 126 lines (new file)

@@ -0,0 +1,126 @@
+/*!
+ * \file elearn.cpp
+ * \brief e-learning version of the exercise.
+ *
+ * \author
+ *    Christos Choutouridis AEM:8997
+ *    <cchoutou@ece.auth.gr>
+ */
+#include <elearn.h>
+
+//------- e-learning code start ---------
+
+//! Credits to PDS team
+static void coo2csc_e(
+   uint32_t *row, uint32_t *col, uint32_t const* row_coo, uint32_t const* col_coo, uint32_t nnz, uint32_t n, uint32_t isOneBased
+) {
+   // ----- cannot assume that input is already 0!
+   for (uint32_t l = 0; l < n+1; l++) col[l] = 0;
+
+   // ----- find the correct column sizes
+   for (uint32_t l = 0; l < nnz; l++)
+      col[col_coo[l] - isOneBased]++;
+
+   // ----- cumulative sum
+   for (uint32_t i = 0, cumsum = 0; i < n; i++) {
+      uint32_t temp = col[i];
+      col[i] = cumsum;
+      cumsum += temp;
+   }
+   col[n] = nnz;
+   // ----- copy the row indices to the correct place
+   for (uint32_t l = 0; l < nnz; l++) {
+      uint32_t col_l;
+      col_l = col_coo[l] - isOneBased;
+
+      uint32_t dst = col[col_l];
+      row[dst] = row_coo[l] - isOneBased;
+
+      col[col_l]++;
+   }
+   // ----- revert the column pointers
+   for (uint32_t i = 0, last = 0; i < n; i++) {
+      uint32_t temp = col[i];
+      col[i] = last;
+      last = temp;
+   }
+}
+
+/*!
+ * A small binary search utility
+ */
+uint32_t find_idx(const uint32_t* v, uint32_t begin, uint32_t end, uint32_t match) {
+   uint32_t b = begin, e = end-1;
+   while (1) {
+      uint32_t m = (b+e)/2;
+      if (v[m] == match) return m;
+      else if (b >= e) return end;
+      else {
+         if (v[m] < match) b = m +1;
+         else e = m -1;
+      }
+   }
+   return end;
+}
+
+/*!
+ * Sparse matrix item accessor
+ */
+uint32_t get(uint32_t* R, uint32_t* C, uint32_t i, uint32_t j) {
+   uint32_t e = C[j+1];
+   return (find_idx(R, C[j], e, i) != e) ? 1 : 0;
+}
+
+/*!
+ * \param coo_row pointer to coo row data
+ * \param coo_col pointer to coo_column data
+ * \param n the size of matrix
+ * \param nz the number of non-zero items
+ * \return The vertex-wise count vector
+ */
+uint32_t* vertexWiseTriangleCounts (uint32_t *coo_row, uint32_t *coo_col, uint32_t n, uint32_t nz) {
+   uint32_t* v = (uint32_t*)malloc(sizeof(uint32_t)*n);
+   uint32_t* R = (uint32_t*)malloc(sizeof(uint32_t)*nz);
+   uint32_t* C = (uint32_t*)malloc(sizeof(uint32_t)*n+1);
+
+   // convert input
+   coo2csc_e (R, C, coo_row, coo_col, nz, n, 1);
+
+   for (uint32_t i=0 ; i<n ; ++i) {
+      for (uint32_t j = C[i]; j<C[i+1] ; ++j) {
+         uint32_t j_idx = R[j];
+         for (uint32_t k = C[j_idx] ; k<C[j_idx+1] ; ++k) {
+            uint32_t k_idx = R[k];
+            if (get(R, C, k_idx, i)) {
+               ++v[i];
+               ++v[j_idx];
+               ++v[k_idx];
+            }
+         }
+      }
+   }
+   return v;
+}
+
+//------- e-learning code end ---------
+
+/*!
+ * A unit-test like functionality to check our implementation.
+ * \return
+ */
+uint32_t elearn_test (void) {
+   uint32_t CooR[] = { 2, 4, 6, 7, 3, 5, 6, 8, 11, 12, 4, 11, 12, 7, 6, 7, 9, 10, 12};
+   uint32_t CooC[] = { 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 5, 6, 8, 8, 11};
+   uint32_t N = 12;
+   uint32_t NZ = 19;
+   uint32_t c3[] = { 3, 5, 3, 1, 1, 3, 2, 0, 0, 0, 3, 3 };
+
+   uint32_t* tc3 = vertexWiseTriangleCounts(CooR, CooC, N, NZ);   // call
+
+   for (uint32_t i=0 ; i<N ; ++i)   // validate
+      if (tc3[i] != c3[i])
+         return 0;   // fail
+   return 1;         // pass
+}
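For reference, a hedged sketch of how the e-learning entry point might be driven outside of elearn_test(): the signature of vertexWiseTriangleCounts is taken from the file above, while the tiny 3-vertex input and the printing are made up for illustration. The returned buffer is malloc-allocated inside the function, so the caller releases it with free.

// Illustrative caller only; the single-triangle COO input below is not project data.
#include <cstdint>
#include <cstdio>
#include <cstdlib>

uint32_t* vertexWiseTriangleCounts(uint32_t* coo_row, uint32_t* coo_col, uint32_t n, uint32_t nz);

int main() {
   // 1-based, column-sorted lower-triangle COO of the triangle 1-2-3:
   // edges (2,1), (3,1), (3,2)
   uint32_t row[] = {2, 3, 3};
   uint32_t col[] = {1, 1, 2};
   uint32_t* counts = vertexWiseTriangleCounts(row, col, 3, 3);
   for (uint32_t i = 0; i < 3; ++i)
      std::printf("vertex %u: count %u\n", i + 1, counts[i]);
   std::free(counts);   // caller owns the returned buffer
   return 0;
}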
src/main.cpp: 71 lines changed

@@ -55,26 +55,72 @@ bool get_options(int argc, char* argv[]){
    else if (arg == "-n" || arg == "--max_trheads") {
       session.max_threads = (i+1 < argc) ? std::atoi(argv[++i]) : session.max_threads;
    }
+   else if (arg == "-r" || arg == "--repeat") {
+      session.repeat = (i+1 < argc) ? std::atoi(argv[++i]) : session.repeat;
+   }
    else if (arg == "-t" || arg == "--timing")
       session.timing = true;
    else if (arg == "-v" || arg == "--verbose")
       session.verbose = true;
+   else if (arg == "--make_symmetric")
+      session.makeSymmetric = true;
    else if (arg == "--triangular_only")
       session.makeSymmetric = false;
    else if (arg == "--validate_mtx")
       session.validate_mtx = true;
-   else if (arg == "--print_count")
+   else if (arg == "--dynamic")
+      session.dynamic = true;
+   else if (arg == "--print_count") {
       session.print_count = true;
+      session.makeSymmetric = false;
+   }
    else if (arg == "--print_graph") {
       session.mtx_print = true;
       session.mtx_print_size = (i+1 < argc) ? std::atoi(argv[++i]) : session.mtx_print_size;
    }
    else if (arg == "-h" || arg == "--help") {
-      std::cout << "Help message\n";
+      std::cout << "vertex-wise triangular count utility.\n\n";
+      std::cout << "tcount -i <file> | -g <size> <probability> [-o <file>] [-n <threads>] [--dynamic] [-r <times>] [-t] [-v]\n";
+      std::cout << "       [--make_symmetric] [--triangular_only] [--print_count] [--validate_mtx] [--print_graph <size>]\n";
+      std::cout << '\n';
+      std::cout << "Options:\n\n";
+      std::cout << " -i | --input <file>\n";
+      std::cout << "    Path to mtx file to load.\n\n";
+      std::cout << " -g | --generate <size> <probability>\n";
+      std::cout << "    Request a random generated graph with size <size> and probability <probability>.\n";
+      std::cout << "    This is very slow, use it with care.\n\n";
+      std::cout << " -o | --output <file>\n";
+      std::cout << "    Select <file> as output file. Default is stdout.\n\n";
+      std::cout << " -n | --max_trheads <threads>\n";
+      std::cout << "    Reduce the thread number for the execution to <threads>. <threads> must be less or equal to available CPUs.\n\n";
+      std::cout << " --dynamic\n";
+      std::cout << "    Request of dynamic scheduling for OpenMP and pthreads. Does not affect cilk versions.\n\n";
+      std::cout << " -r | --repeat <times>\n";
+      std::cout << "    Repeat the vector calculation <times> times.\n\n";
+      std::cout << " -t | --timing\n";
+      std::cout << "    Request timing measurements output to stdout.\n\n";
+      std::cout << " -v | --verbose\n";
+      std::cout << "    Request a more verbose output to stdout.\n\n";
+      std::cout << " --make_symmetric\n";
+      std::cout << "    Explicitly request a symmetric graph generation. This affects only V3 versions where by default a lower\n";
+      std::cout << "    triangular matrix is used.\n\n";
+      std::cout << " --triangular_only\n";
+      std::cout << "    NOTE: Requires also \"--print_count\".\n";
+      std::cout << "    Explicitly request to use a lower triangular matrix. This affects only V4 versions where a symmetric\n";
+      std::cout << "    matrix is used by default and produce correct answer ONLY for total triangle counting (--print_count).\n\n";
+      std::cout << " --print_count\n";
+      std::cout << "    NOTE: When used, also implies \"---triangular_only\" for all versions.\n";
+      std::cout << "    Request a total triangle counting output.\n\n";
+      std::cout << " --validate_mtx\n";
+      std::cout << "    Request an input matrix validation before execution.\n\n";
+      std::cout << " --print_graph <size>\n";
+      std::cout << "    Prints the first <size> x <size> part of the matrix to stdout.\n\n";
+      std::cout << " -h | --help <size>\n";
+      std::cout << "    Prints this and exit.\n";
       exit(0);
    }
    else {  // parse error
-      std::cout << "Error message\n";
+      std::cout << "Invokation error. Try -h for details.\n";
       status = false;
    }
 }

@@ -84,6 +130,12 @@ bool get_options(int argc, char* argv[]){
       std::cout << "Error message\n";
       status = false;
    }
+#if CODE_VERSION == V4
+   else if (!session.makeSymmetric && !session.print_count) {
+      std::cout << "\"--triangular_only\" requires \"--print_count\"\n";
+      status = false;
+   }
+#endif
    return status;
 }
 

@@ -129,17 +181,26 @@ int main(int argc, char* argv[]) try {
    std::vector<value_t> c;
    index_t s;
 
+#if defined ELEARNING
+   if (!elearn_test()) std::cout << "E-learning test: FAIL\n";
+   else std::cout << "E-learning test: PASS\n";
+   exit(0);
+#endif
+
    // try to read command line
    if (!get_options(argc, argv))
       exit(1);
 
    prepare_matrix(A, timer);
    threads_info();
-   logger << "Create count vector" << logger.endl;
-   timer.start();
-   c = triang_v (A);
-   timer.stop();
-   timer.print_dt("create count vector");
+   for (size_t i =0 ; i<session.repeat ; ++i) {
+      // repeat calculations as requested by user
+      logger << "Create vector" << logger.endl;
+      timer.start();
+      c = triang_v (A);
+      timer.stop();
+      timer.print_dt("create vector");
+   }
    if (session.print_count) {
       logger << "Calculate total triangles" << logger.endl;
       timer.start();
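The new -r/--repeat option reruns only the count-vector calculation so the timing prints can be averaged over several passes. Below is a stand-alone sketch of that measure-inside-the-loop pattern using std::chrono; run_once() and the hard-coded repeat count are placeholders, since the project's own timer and logger classes are not part of this diff.

// Illustrative repeat-and-time loop; not project code.
#include <chrono>
#include <cstddef>
#include <iostream>
#include <vector>

static std::vector<unsigned> run_once() {
   return std::vector<unsigned>(1000, 1);    // stands in for triang_v(A)
}

int main() {
   std::size_t repeat = 5;                   // what "-r 5" would set
   using clock = std::chrono::steady_clock;
   std::vector<unsigned> c;
   for (std::size_t i = 0; i < repeat; ++i) {
      auto t0 = clock::now();
      c = run_once();                        // only the calculation is repeated
      auto t1 = clock::now();
      std::cout << "create vector: "
                << std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count()
                << " us\n";
   }
   return c.empty();
}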
src/v3.cpp: 187 lines changed

@@ -8,23 +8,20 @@
  */
 #include <v3.h>
 
-// for (int i=0 ; i<A.size() ; ++i) {
-//    for (int j = A.col_ptr[i]; j<A.col_ptr[i+1] ; ++j) {
-//       int j_idx = A.rows[j];
-//       for (int k = A.col_ptr[j_idx] ; k<A.col_ptr[j_idx+1] ; ++k) {
-//          int k_idx = A.rows[k];
-//          if (A.get(k_idx, i)) {
-//             ++c[i];
-//          }
-//       }
-//    }
-// }
-
 namespace v3 {
 
 #if defined CILK
 
-// export CILK_NWORKERS=<num>
+/*!
+ * Utility function to get/set the number of threads.
+ *
+ * The number of threads are controlled via environment variable \c CILK_NWORKERS
+ *
+ * \return The number of threads used.
+ * \note
+ *    The user can reduce the number with the command option \c --max_threads.
+ *    If so the requested number will be used even if the environment has more threads available.
+ */
 int nworkers() {
    if (session.max_threads)
       return (session.max_threads < __cilkrts_get_nworkers()) ?

@@ -33,45 +30,93 @@ int nworkers() {
    return __cilkrts_get_nworkers();
 }
 
+/*!
+ * Calculate and return a vertex-wise count vector.
+ *
+ * \param A The matrix to use.
+ * \return The count vector. RVO is used here.
+ * \note
+ *    We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
+ *    - A full matrix calculation which update only c[i]
+ *    - A lower triangular matrix which update c[i], c[j], c[k]. This is wayyy faster.
+ */
 std::vector<value_t> triang_v(matrix& A) {
-   std::vector<value_t> c(A.size());
+   std::vector<std::atomic<value_t>> c(A.size());
+   std::vector<value_t> ret(A.size());
+
    cilk_for (int i=0 ; i<A.size() ; ++i) {
-      for (auto j = A.getCol(i); j.index() != j.end() ; ++j)            // j list all the edges with i
-         for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k) // k list all the edges with j
-            if (A.get(k.index(), i))                                    // search for i-k edge
-               ++c[i];
+      for (auto j = A.getCol(i); j.index() != j.end() ; ++j) {
+         // j list all the edges with i
+         for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k) {
+            // k list all the edges with j
+            if (A.get(k.index(), i)) {
+               ++ret[i];
+               c[j.index()] += (!session.makeSymmetric)? 1:0;
+               c[k.index()] += (!session.makeSymmetric)? 1:0;
+            }
+         }
+      }
+      if (session.makeSymmetric) {
+         ret[i] = ret[i]/2;
+         c[i] = c[i]/2;
+      }
    }
-   if (session.makeSymmetric)
-      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
-         return x/2;
-      });
-   return c;
+   for (index_t i =0 ; i<A.size() ; ++i) ret[i] += c[i];
+   return ret;
 }
 
+/*!
+ * A sum utility to use as spawn function for parallelized sum.
+ * \return The sum of \c v from \c begin to \c end.
+ */
 void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) {
    for (auto i =begin ; i != end ; ++i)
      out_sum += v[i];
 }
 
+/*!
+ * A parallelized version of sum. Just because ;)
+ * \return The total sum of vector \c v
+ */
 value_t sum (std::vector<value_t>& v) {
    int n = nworkers();
-   std::vector<value_t> sum_v(n, 0);
+   std::vector<value_t> sum_v(n, 0);   // result of each do_sum invokation.
+
+   // We spawn workers in a more statically way.
    for (index_t i =0 ; i < n ; ++i) {
      cilk_spawn do_sum(sum_v[i], v, i*v.size()/n, (i+1)*v.size()/n);
    }
    cilk_sync;
 
-   value_t s =0;
-   for (auto& it : sum_v) s += it;
+   // sum the sums (a sum to rule them all)
+   value_t s =0; for (auto& it : sum_v) s += it;
    return s;
 }
 
 #elif defined OMP
 
-/*
-// export OMP_NUM_THREADS=<num>
+/*!
+ * A "simple" user defined OpenMP reduction for vector<value_t>
+ * \note
+ *    Not used. Reason: The atomic version of the code performs better.
+ */
+#pragma omp declare reduction(vec_value_plus : std::vector<value_t> : \
+                  std::transform( \
+                     omp_out.begin(), omp_out.end(), omp_in.begin(), omp_out.begin(), std::plus<value_t>() \
+                  ) \
+               ) \
+               initializer(omp_priv = decltype(omp_orig)(omp_orig.size()))
+
+
+/*!
+ * Utility function to get/set the number of threads.
+ *
+ * The number of threads are controlled via environment variable \c OMP_NUM_THREADS
+ *
+ * \return The number of threads used.
+ * \note
+ *    The user can reduce the number with the command option \c --max_threads.
+ *    If so the requested number will be used even if the environment has more threads available.
 */
 int nworkers() {
    if (session.max_threads && session.max_threads < (size_t)omp_get_max_threads()) {

@@ -85,23 +130,49 @@ int nworkers() {
    }
 }
 
+/*!
+ * Calculate and return a vertex-wise count vector.
+ *
+ * \param A The matrix to use.
+ * \return The count vector. RVO is used here.
+ * \note
+ *    We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
+ *    - A full matrix calculation which update only c[i]
+ *    - A lower triangular matrix which update c[i], c[j], c[k]. This is wayyy faster.
+ */
 std::vector<value_t> triang_v(matrix& A) {
-   std::vector<value_t> c(A.size());
+   std::vector<std::atomic<value_t>> c(A.size());
+   std::vector<value_t> ret(A.size());
 
-   #pragma omp parallel for shared(c)
+   // OMP schedule selection
+   if (session.dynamic) omp_set_schedule (omp_sched_dynamic, 0);
+   else                 omp_set_schedule (omp_sched_static, 0);
+   #pragma omp parallel for schedule(runtime) //reduction(vec_value_plus : c)
    for (int i=0 ; i<A.size() ; ++i) {
-      for (auto j = A.getCol(i); j.index() != j.end() ; ++j)            // j list all the edges with i
-         for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k) // k list all the edges with j
-            if (A.get(k.index(), i))                                    // search for i-k edge
-               ++c[i];
+      for (auto j = A.getCol(i); j.index() != j.end() ; ++j) {
+         // j list all the edges with i
+         for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k) {
+            // k list all the edges with j
+            if (A.get(k.index(), i)) {
+               ++ret[i];
+               c[j.index()] += (!session.makeSymmetric)? 1:0;
+               c[k.index()] += (!session.makeSymmetric)? 1:0;
+            }
+         }
+      }
+      if (session.makeSymmetric) {
+         ret[i] = ret[i]/2;
+         c[i] = c[i]/2;
+      }
   }
-   if (session.makeSymmetric)
-      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
-         return x/2;
-      });
-   return c;
+   for (index_t i =0 ; i<A.size() ; ++i) ret[i] += c[i];
+   return ret;
 }
 
+/*!
+ * A parallelized version of sum. Just because ;)
+ * \return The total sum of vector \c v
+ */
 value_t sum (std::vector<value_t>& v) {
    value_t s =0;
 

@@ -113,24 +184,44 @@ value_t sum (std::vector<value_t>& v) {
 
 #else
 
+//! Return the number of workers.
+//! \note This function is just for completion
 int nworkers() { return 1; }
 
+/*!
+ * Calculate and return a vertex-wise count vector.
+ *
+ * \param A The matrix to use.
+ * \return The count vector. RVO is used here.
+ * \note
+ *    We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
+ *    - A full matrix calculation which update only c[i]
+ *    - A lower triangular matrix which update c[i], c[j], c[k]. This is wayyy faster.
+ */
 std::vector<value_t> triang_v(matrix& A) {
    std::vector<value_t> c(A.size());
 
    for (int i=0 ; i<A.size() ; ++i) {
-      for (auto j = A.getCol(i); j.index() != j.end() ; ++j)            // j list all the edges with i
-         for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k) // k list all the edges with j
-            if (A.get(k.index(), i))                                    // search for i-k edge
-               ++c[i];
+      for (auto j = A.getCol(i); j.index() != j.end() ; ++j) {
+         // j list all the edges with i
+         for (auto k = A.getCol(j.index()); k.index() != k.end() ; ++k) {
+            // k list all the edges with j
+            if (A.get(k.index(), i)) {
+               ++c[i];
+               c[j.index()] += (!session.makeSymmetric)? 1:0;
+               c[k.index()] += (!session.makeSymmetric)? 1:0;
+            }
+         }
+      }
+      if (session.makeSymmetric) c[i] /= 2;
   }
-   if (session.makeSymmetric)
-      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
-         return x/2;
-      });
   return c;
 }
 
+/*!
+ * Summation functionality.
+ * \return The total sum of vector \c v
+ */
 value_t sum (std::vector<value_t>& v) {
    value_t s =0;
    for (auto& it : v)

@@ -140,9 +231,9 @@ value_t sum (std::vector<value_t>& v) {
 
 #endif
 
+//! Polymorphic interface function for sum results
 value_t triang_count (std::vector<value_t>& c) {
-   return (session.makeSymmetric) ? sum(c)/3 : sum(c);
+   return sum(c)/3;
 }
 
 }
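In the parallel v3 kernels above, the triangular-only path now updates neighbouring vertices (c[j], c[k]) as well as the owned one, so the shared counters become std::vector<std::atomic<value_t>> while the per-iteration c[i] updates stay in a plain ret vector that is merged at the end. Below is a reduced sketch of that counter layout with std::thread standing in for the cilk_for/OpenMP loop and a dummy neighbour update in place of the triangle search; value_t is assumed to be unsigned here.

// Sketch: own-slot updates in a plain vector, cross-slot updates through atomics, merged at the end.
#include <atomic>
#include <cstddef>
#include <iostream>
#include <thread>
#include <vector>

using value_t = unsigned;

int main() {
   const std::size_t n = 8;
   std::vector<std::atomic<value_t>> shared(n);   // c[j], c[k] style updates (may collide)
   std::vector<value_t> own(n, 0);                // ret[i] style updates (single writer)

   auto work = [&](std::size_t begin, std::size_t end) {
      for (std::size_t i = begin; i < end; ++i) {
         ++own[i];                                // only this thread touches own[i]
         shared[(i + 1) % n] += 1;                // another thread may hit the same slot
      }
   };
   std::thread t1(work, 0, n / 2), t2(work, n / 2, n);
   t1.join(); t2.join();

   for (std::size_t i = 0; i < n; ++i)            // merge, as the diff does with ret[i] += c[i]
      own[i] += shared[i];
   for (value_t v : own) std::cout << v << ' ';
   std::cout << '\n';
}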
src/v4.cpp: 207 lines changed

@@ -12,7 +12,16 @@ namespace v4 {
 
 #if defined CILK
 
-// export CILK_NWORKERS=<num>
+/*!
+ * Utility function to get/set the number of threads.
+ *
+ * The number of threads are controlled via environment variable \c CILK_NWORKERS
+ *
+ * \return The number of threads used.
+ * \note
+ *    The user can reduce the number with the command option \c --max_threads.
+ *    If so the requested number will be used even if the environment has more threads available.
+ */
 int nworkers() {
    if (session.max_threads)
       return (session.max_threads < __cilkrts_get_nworkers()) ?

@@ -21,6 +30,25 @@ int nworkers() {
    return __cilkrts_get_nworkers();
 }
 
+/*!
+ * Calculate and return a vertex-wise count vector.
+ *
+ *            1
+ *  vector = --- * (A.* (A*B))*ones_N
+ *            2
+ * We squeezed all that to one function for performance. The row*column multiplication
+ * uses the inner CSC structure of sparse matrix and follows only non-zero members.
+ *
+ * \param A The first matrix to use.
+ * \param B The second matrix to use (they can be the same).
+ * \return The count vector. RVO is used here.
+ * \note
+ *    We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
+ *    - A full matrix calculation which update only c[i]
+ *    - A lower triangular matrix which update c[i], c[j], c[k]. This is wayyy faster.
+ * \warning
+ *    The later(--triangular_only) produce correct results ONLY if we are after the total count.
+ */
 std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
    std::vector<value_t> c(A.size());
 

@@ -28,37 +56,50 @@ std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
       for (auto j = A.getRow(i); j.index() != j.end() ; ++j){
          c[i] += A.getRow(i)*B.getCol(j.index());
       }
+      if (session.makeSymmetric) c[i] /= 2;
    }
-   if (session.makeSymmetric)
-      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
-         return x/2;
-      });
    return c;
 }
 
+/*!
+ * A sum utility to use as spawn function for parallelized sum.
+ * \return The sum of \c v from \c begin to \c end.
+ */
 void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) {
    for (auto i =begin ; i != end ; ++i)
      out_sum += v[i];
 }
 
+/*!
+ * A parallelized version of sum. Just because ;)
+ * \return The total sum of vector \c v
+ */
 value_t sum (std::vector<value_t>& v) {
    int n = nworkers();
-   std::vector<value_t> sum_v(n, 0);
+   std::vector<value_t> sum_v(n, 0);   // result of each do_sum invokation.
+
+   // We spawn workers in a more statically way.
    for (index_t i =0 ; i < n ; ++i) {
      cilk_spawn do_sum(sum_v[i], v, i*v.size()/n, (i+1)*v.size()/n);
    }
    cilk_sync;
 
-   value_t s =0;
-   for (auto& it : sum_v) s += it;
+   // sum the sums (a sum to rule them all)
+   value_t s =0; for (auto& it : sum_v) s += it;
    return s;
 }
 
 #elif defined OMP
 
-/*
-// export OMP_NUM_THREADS=<num>
+/*!
+ * Utility function to get/set the number of threads.
+ *
+ * The number of threads are controlled via environment variable \c OMP_NUM_THREADS
+ *
+ * \return The number of threads used.
+ * \note
+ *    The user can reduce the number with the command option \c --max_threads.
+ *    If so the requested number will be used even if the environment has more threads available.
 */
 int nworkers() {
    if (session.max_threads && session.max_threads < (size_t)omp_get_max_threads()) {

@@ -72,22 +113,45 @@ int nworkers() {
    }
 }
 
+/*!
+ * Calculate and return a vertex-wise count vector.
+ *
+ *            1
+ *  vector = --- * (A.* (A*B))*ones_N
+ *            2
+ * We squeezed all that to one function for performance. The row*column multiplication
+ * uses the inner CSC structure of sparse matrix and follows only non-zero members.
+ *
+ * \param A The first matrix to use.
+ * \param B The second matrix to use (they can be the same).
+ * \return The count vector. RVO is used here.
+ * \note
+ *    We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
+ *    - A full matrix calculation which update only c[i]
+ *    - A lower triangular matrix which update c[i], c[j], c[k]. This is wayyy faster.
+ * \warning
+ *    The later(--triangular_only) produce correct results ONLY if we are after the total count.
+ */
 std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
    std::vector<value_t> c(A.size());
 
-   #pragma omp parallel for shared(c)
+   // OMP schedule selection
+   if (session.dynamic) omp_set_schedule (omp_sched_dynamic, 0);
+   else                 omp_set_schedule (omp_sched_static, 0);
+   #pragma omp parallel for shared(c) schedule(runtime)
    for (int i=0 ; i<A.size() ; ++i) {
       for (auto j = A.getRow(i); j.index() != j.end() ; ++j) {
          c[i] += A.getRow(i)*B.getCol(j.index());
       }
+      if (session.makeSymmetric) c[i] /= 2;
    }
-   if (session.makeSymmetric)
-      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
-         return x/2;
-      });
    return c;
 }
 
+/*!
+ * A parallelized version of sum. Just because ;)
+ * \return The total sum of vector \c v
+ */
 value_t sum (std::vector<value_t>& v) {
    value_t s =0;
 

@@ -99,8 +163,15 @@ value_t sum (std::vector<value_t>& v) {
 
 #elif defined THREADS
 
-/*
- * std::thread::hardware_concurrency()
+/*!
+ * Utility function to get/set the number of threads.
+ *
+ * The number of threads are inherited by the environment via std::thread::hardware_concurrency()
+ *
+ * \return The number of threads used.
+ * \note
+ *    The user can reduce the number with the command option \c --max_threads.
+ *    If so the requested number will be used even if the environment has more threads available.
 */
 int nworkers() {
    if (session.max_threads)

@@ -110,43 +181,89 @@ int nworkers() {
    return std::thread::hardware_concurrency();
 }
 
-std::vector<value_t> mmacc_v_rng(std::vector<value_t>& out, matrix& A, matrix& B, index_t begin, index_t end) {
+/*!
+ * A spawn function to calculate and return a vertex-wise count vector.
+ *
+ *                        1
+ *  vector(begin..end) = --- * (A.* (A*B))*ones_N
+ *                        2
+ *
+ * We squeezed all that to one function for performance. The row*column multiplication
+ * uses the inner CSC structure of sparse matrix and follows only non-zero members.
+ *
+ * \param out Reference to output vector
+ * \param A The first matrix to use.
+ * \param B The second matrix to use (they can be the same).
+ * \param iton vector containing the range with the columns to use (it can be shuffled).
+ * \return The count vector. RVO is used here.
+ * \note
+ *    We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
+ *    - A full matrix calculation which update only c[i]
+ *    - A lower triangular matrix which update c[i], c[j], c[k]. This is wayyy faster.
+ * \warning
+ *    The later(--triangular_only) produce correct results ONLY if we are after the total count.
+ */
+std::vector<value_t> mmacc_v_rng(
+      std::vector<value_t>& out, matrix& A, matrix& B, std::vector<index_t>& iton, index_t begin, index_t end) {
    for (index_t i=begin ; i<end ; ++i) {
-      for (auto j = A.getRow(i); j.index() != j.end() ; ++j){
-         out[i] += A.getRow(i)*B.getCol(j.index());
+      index_t ii = iton[i];
+      for (auto j = A.getRow(ii); j.index() != j.end() ; ++j){
+         out[ii] += A.getRow(ii)*B.getCol(j.index());
       }
+      if (session.makeSymmetric) out[ii] /= 2;
    }
    return out;
 }
 
+/*!
+ * Calculate and return a vertex-wise count vector.
+ *
+ * \param A The first matrix to use.
+ * \param B The second matrix to use (they can be the same).
+ * \return The count vector. RVO is used here.
+ */
 std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
    std::vector<std::thread> workers;
    std::vector<value_t> c(A.size());
    int n = nworkers();
 
-   for (index_t i=0 ; i<n ; ++i)
-      workers.push_back (std::thread (mmacc_v_rng, std::ref(c), std::ref(A), std::ref(B), i*c.size()/n, (i+1)*c.size()/n));
+   std::vector<index_t> iton(A.size());   // Create a 0 .. N range for outer loop
+   std::iota(iton.begin(), iton.end(), 0);
+   if (session.dynamic)    // in case of dynamic scheduling, shuffle the range
+      std::shuffle(iton.begin(), iton.end(), std::mt19937{std::random_device{}()});
+
+   for (index_t i=0 ; i<n ; ++i)    // dispatch the workers and hold them in a vector
+      workers.push_back (
+         std::thread (mmacc_v_rng, std::ref(c), std::ref(A), std::ref(B), std::ref(iton), i*A.size()/n, (i+1)*A.size()/n)
+      );
+
+   // a for to join them all...
    std::for_each(workers.begin(), workers.end(), [](std::thread& t){
      t.join();
    });
-   if (session.makeSymmetric)
-      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
-         return x/2;
-      });
    return c;
 }
 
+/*!
+ * A sum utility to use as spawn function for parallelized sum.
+ * \return The sum of \c v from \c begin to \c end.
+ */
 void do_sum (value_t& out_sum, std::vector<value_t>& v, index_t begin, index_t end) {
    for (auto i =begin ; i != end ; ++i)
      out_sum += v[i];
 }
 
+/*!
+ * A parallelized version of sum. Just because ;)
+ * \return The total sum of vector \c v
+ */
 value_t sum (std::vector<value_t>& v) {
    int n = nworkers();
-   std::vector<value_t> sum_v(n, 0);
+   std::vector<value_t> sum_v(n, 0);   // result of each do_sum invokation.
    std::vector<std::thread> workers;
 
+   // We spawn workers in a more statically way.
    for (index_t i =0 ; i < n ; ++i)
      workers.push_back (std::thread (do_sum, std::ref(sum_v[i]), std::ref(v), i*v.size()/n, (i+1)*v.size()/n));
 

@@ -154,29 +271,51 @@ value_t sum (std::vector<value_t>& v) {
      t.join();
    });
 
-   value_t s =0;
-   for (auto& it : sum_v) s += it;
+   // sum the sums (a sum to rule them all)
+   value_t s =0; for (auto& it : sum_v) s += it;
    return s;
 }
 
 #else
 
+//! Return the number of workers.
+//! \note This function is just for completion
 int nworkers() { return 1; }
 
+/*!
+ * Calculate and return a vertex-wise count vector.
+ *
+ *            1
+ *  vector = --- * (A.* (A*B))*ones_N
+ *            2
+ * We squeezed all that to one function for performance. The row*column multiplication
+ * uses the inner CSC structure of sparse matrix and follows only non-zero members.
+ *
+ * \param A The first matrix to use.
+ * \param B The second matrix to use (they can be the same).
+ * \return The count vector. RVO is used here.
+ * \note
+ *    We use two methods of calculation based on \c --make_symmetric or \c --triangular_only
+ *    - A full matrix calculation which update only c[i]
+ *    - A lower triangular matrix which update c[i], c[j], c[k]. This is wayyy faster.
+ * \warning
+ *    The later(--triangular_only) produce correct results ONLY if we are after the total count.
+ */
 std::vector<value_t> mmacc_v(matrix& A, matrix& B) {
    std::vector<value_t> c(A.size());
    for (int i=0 ; i<A.size() ; ++i) {
      for (auto j = A.getRow(i); j.index() != j.end() ; ++j){
         c[i] += A.getRow(i)*B.getCol(j.index());
      }
+     if (session.makeSymmetric) c[i] /= 2;
    }
-   if (session.makeSymmetric)
-      std::transform (c.begin(), c.end(), c.begin(), [] (value_t& x) {
-         return x/2;
-      });
   return c;
 }
 
+/*!
+ * Summation functionality.
+ * \return The total sum of vector \c v
+ */
 value_t sum (std::vector<value_t>& v) {
    value_t s =0;
    for (auto& it : v)

@@ -186,10 +325,12 @@ value_t sum (std::vector<value_t>& v) {
 
 #endif
 
+//! Polymorphic interface function for count vector
 std::vector<value_t> triang_v(matrix& A) {
    return mmacc_v(A, A);
 }
 
+//! Polymorphic interface function for sum results
 value_t triang_count (std::vector<value_t>& c) {
    return (session.makeSymmetric) ? sum(c)/3 : sum(c);
 }
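The pthreads v4 path above now walks the outer loop through an index vector (iton) that is shuffled when --dynamic is set, so each worker still gets a contiguous slice of iton while expensive rows end up spread across the slices. Below is a minimal sketch of that shuffled static partition with a dummy per-index payload in place of the sparse row*column products; the names and sizes are illustrative only.

// Sketch: build 0..N-1, optionally shuffle, hand each worker a contiguous slice.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <random>
#include <thread>
#include <vector>

int main() {
   const std::size_t n = 16;
   bool dynamic = true;                      // what "--dynamic" would set

   std::vector<std::size_t> iton(n);         // 0 .. N-1 range for the outer loop
   std::iota(iton.begin(), iton.end(), 0);
   if (dynamic)                              // spread heavy indices across slices
      std::shuffle(iton.begin(), iton.end(), std::mt19937{std::random_device{}()});

   std::vector<std::size_t> out(n, 0);
   auto work = [&](std::size_t begin, std::size_t end) {
      for (std::size_t i = begin; i < end; ++i) {
         std::size_t ii = iton[i];           // the real code indexes matrix row ii
         out[ii] = ii * ii;                  // dummy payload
      }
   };

   const std::size_t nw = 4;                 // worker count (nworkers() in the project)
   std::vector<std::thread> workers;
   for (std::size_t w = 0; w < nw; ++w)
      workers.emplace_back(work, w * n / nw, (w + 1) * n / nw);
   for (auto& t : workers) t.join();

   for (std::size_t v : out) std::cout << v << ' ';
   std::cout << '\n';
}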