Compare commits
No commits in common. "7a6f7f53b5d10db2277e691c45c36162c0faa6ff" and "e7be3a10ddfa08d791384b9c9998bd6c791fc306" have entirely different histories.
7a6f7f53b5
...
e7be3a10dd
23
homework_3/.gitignore
vendored
23
homework_3/.gitignore
vendored
@ -1,23 +0,0 @@
|
|||||||
# project
|
|
||||||
bin/
|
|
||||||
out/
|
|
||||||
mat/
|
|
||||||
mtx/
|
|
||||||
.unused/
|
|
||||||
various/
|
|
||||||
|
|
||||||
# hpc
|
|
||||||
|
|
||||||
# IDEs
|
|
||||||
.idea/
|
|
||||||
.clangd
|
|
||||||
|
|
||||||
# eclipse
|
|
||||||
.project
|
|
||||||
.cproject
|
|
||||||
.settings/
|
|
||||||
|
|
||||||
.vs/
|
|
||||||
.vscode/
|
|
||||||
|
|
||||||
|
|
@ -1,236 +0,0 @@
|
|||||||
#
|
|
||||||
# PDS HW3 Makefile
|
|
||||||
#
|
|
||||||
# Copyright (C) 2025 Christos Choutouridis <christos@choutouridis.net>
|
|
||||||
#
|
|
||||||
# This program is free software: you can redistribute it and/or modify
|
|
||||||
# it under the terms of the GNU Lesser General Public License as
|
|
||||||
# published by the Free Software Foundation, either version 3
|
|
||||||
# of the License, or (at your option) any later version.
|
|
||||||
#
|
|
||||||
# This program is distributed in the hope that it will be useful,
|
|
||||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
# GNU Lesser General Public License for more details.
|
|
||||||
#
|
|
||||||
# You should have received a copy of the GNU Lesser General Public License
|
|
||||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
||||||
#
|
|
||||||
|
|
||||||
# ============== Project settings ==============
|
|
||||||
# Project's name
|
|
||||||
PROJECT := PDS_homework_3
|
|
||||||
|
|
||||||
# Excecutable's name
|
|
||||||
TARGET := bitonicCUDA
|
|
||||||
|
|
||||||
# Source directories list(space seperated). Makefile-relative path, UNDER current directory.
|
|
||||||
SRC_DIR_LIST := src #test test/gtest
|
|
||||||
|
|
||||||
# Include directories list(space seperated). Makefile-relative path.
|
|
||||||
INC_DIR_LIST := src
|
|
||||||
# test \
|
|
||||||
# test/gtest/ \
|
|
||||||
|
|
||||||
|
|
||||||
# Exclude files list(space seperated). Filenames only.
|
|
||||||
# EXC_FILE_LIST := bad.cpp old.cpp
|
|
||||||
|
|
||||||
# Build directories
|
|
||||||
BUILD_DIR := bin
|
|
||||||
OBJ_DIR := $(BUILD_DIR)/obj
|
|
||||||
DEP_DIR := $(BUILD_DIR)/.dep
|
|
||||||
|
|
||||||
OUTPUT_DIR := out
|
|
||||||
|
|
||||||
# ========== Compiler settings ==========
|
|
||||||
# Compiler flags for debug and release
|
|
||||||
DEB_CFLAGS := -DDEBUG -std=c11 -Xcompiler "-Wall -Wextra -g -DDEBUG"
|
|
||||||
REL_CFLAGS := -O3 -std=c11 -Xcompiler "-Wall -Wextra"
|
|
||||||
DEB_CXXFLAGS := -DDEBUG -std=c++17 -Xcompiler "-Wall -Wextra -g -DDEBUG"
|
|
||||||
REL_CXXFLAGS := -O3 -std=c++17 -Xcompiler "-Wall -Wextra"
|
|
||||||
|
|
||||||
# Pre-defines
|
|
||||||
# PRE_DEFS := MYCAB=1729 SUPER_MODE
|
|
||||||
PRE_DEFS := TARGET=$(TARGET)
|
|
||||||
|
|
||||||
# ============== Linker settings ==============
|
|
||||||
# Linker flags (example: -pthread -lm)
|
|
||||||
LDFLAGS :=
|
|
||||||
|
|
||||||
# Map output file
|
|
||||||
MAP_FILE := # output.map
|
|
||||||
MAP_FLAG := # -Xlinker -Map=$(BUILD_DIR)/$(MAP_FILE)
|
|
||||||
|
|
||||||
# ============== Docker settings ==============
|
|
||||||
# We need:
|
|
||||||
# - Bind the entire project directory(the dir that includes all the code) as volume.
|
|
||||||
# - In docker instance, change to working directory(where the makefile is).
|
|
||||||
DOCKER_VOL_DIR := $(shell pwd)
|
|
||||||
DOCKER_WRK_DIR :=
|
|
||||||
DOCKER_RUN := docker run --rm
|
|
||||||
DOCKER_FLAGS := -v $(DOCKER_VOL_DIR):/usr/src/$(PROJECT) -w /usr/src/$(PROJECT)/$(DOCKER_WRK_DIR)
|
|
||||||
|
|
||||||
# docker invoke mechanism (edit with care)
|
|
||||||
# note:
|
|
||||||
# Here, `DOCKER` variable is empty. Rules can assign `DOCKER := DOCKER_CMD` when docker
|
|
||||||
# functionality is needed.
|
|
||||||
DOCKER_CMD = $(DOCKER_RUN) $(DOCKER_FLAGS) $(IMAGE)
|
|
||||||
DOCKER :=
|
|
||||||
|
|
||||||
# ============== Tool selection ==============
|
|
||||||
# compiler and compiler flags.
|
|
||||||
CSIZE := size
|
|
||||||
CFLAGS := $(DEB_CFLAGS)
|
|
||||||
CXXFLAGS := $(DEB_CXXFLAGS)
|
|
||||||
CXX := g++
|
|
||||||
CC := gcc
|
|
||||||
LINKER := g++
|
|
||||||
|
|
||||||
#
|
|
||||||
# =========== Main body and Patterns ===========
|
|
||||||
#
|
|
||||||
|
|
||||||
INC := $(foreach dir,$(INC_DIR_LIST),-I$(dir))
|
|
||||||
DEF := $(foreach def,$(PRE_DEFS),-D$(def))
|
|
||||||
EXC := $(foreach fil,$(EXC_FILE_LIST), \
|
|
||||||
$(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/$(fil))) \
|
|
||||||
)
|
|
||||||
# source files. object and dependencies list
|
|
||||||
# recursive search into current and source directories
|
|
||||||
SRC := $(wildcard *.cpp)
|
|
||||||
SRC += $(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/*.cpp))
|
|
||||||
SRC += $(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/**/*.cpp))
|
|
||||||
SRC := $(filter-out $(EXC),$(SRC))
|
|
||||||
#SRC := $(abspath $(SRC))
|
|
||||||
|
|
||||||
OBJ := $(foreach file,$(SRC:%.cpp=%.o),$(OBJ_DIR)/$(file))
|
|
||||||
DEP := $(foreach file,$(SRC:%.cpp=%.d),$(DEP_DIR)/$(file))
|
|
||||||
|
|
||||||
|
|
||||||
# c file objects depent on .c AND dependency files, which have an empty recipe
|
|
||||||
$(OBJ_DIR)/%.o: %.c
|
|
||||||
@mkdir -p $(@D)
|
|
||||||
$(DOCKER) $(CC) -c $(CFLAGS) $(INC) $(DEF) -o $@ $<
|
|
||||||
|
|
||||||
# cpp file objects depend on .cpp AND dependency files, which have an empty recipe
|
|
||||||
$(OBJ_DIR)/%.o: %.cpp
|
|
||||||
@mkdir -p $(@D)
|
|
||||||
$(DOCKER) $(CXX) -c $(CXXFLAGS) $(INC) $(DEF) -o $@ $<
|
|
||||||
|
|
||||||
# main target rule
|
|
||||||
$(BUILD_DIR)/$(TARGET): $(OBJ)
|
|
||||||
@mkdir -p $(@D)
|
|
||||||
@echo Linking to target: $(TARGET)
|
|
||||||
@echo $(DOCKER) $(LINKER) '$$(OBJ)' $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET)
|
|
||||||
@$(DOCKER) $(LINKER) $(OBJ) $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET)
|
|
||||||
@echo
|
|
||||||
@echo Print size information
|
|
||||||
@$(CSIZE) $(@D)/$(TARGET)
|
|
||||||
@echo Done
|
|
||||||
|
|
||||||
|
|
||||||
#
|
|
||||||
# ================ Default local build rules =================
|
|
||||||
# example:
|
|
||||||
# make debug
|
|
||||||
|
|
||||||
.DEFAULT_GOAL := all
|
|
||||||
|
|
||||||
.PHONY: clean
|
|
||||||
clean:
|
|
||||||
@echo Cleaning build directories
|
|
||||||
@rm -rf $(OBJ_DIR)
|
|
||||||
@rm -rf $(DEP_DIR)
|
|
||||||
@rm -rf $(BUILD_DIR)
|
|
||||||
|
|
||||||
debug: CFLAGS := $(DEB_CFLAGS)
|
|
||||||
debug: $(BUILD_DIR)/$(TARGET)
|
|
||||||
|
|
||||||
release: CFLAGS := $(REL_CFLAGS)
|
|
||||||
release: $(BUILD_DIR)/$(TARGET)
|
|
||||||
|
|
||||||
#
|
|
||||||
# ================ Build rules =================
|
|
||||||
#
|
|
||||||
|
|
||||||
bitonic_v0deb: CC := nvcc -G -g -x cu
|
|
||||||
bitonic_v0deb: CXX := nvcc -G -g -x cu
|
|
||||||
bitonic_v0deb: LINKER := nvcc
|
|
||||||
bitonic_v0deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V0
|
|
||||||
bitonic_v0deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V0
|
|
||||||
bitonic_v0deb: OUTPUT_DIR := $(OUTPUT_DIR)/v0
|
|
||||||
bitonic_v0deb: $(BUILD_DIR)/$(TARGET)
|
|
||||||
@mkdir -p $(OUTPUT_DIR)
|
|
||||||
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
|
|
||||||
|
|
||||||
|
|
||||||
bitonic_v1deb: CC := nvcc -G -g -x cu
|
|
||||||
bitonic_v1deb: CXX := nvcc -G -g -x cu
|
|
||||||
bitonic_v1deb: LINKER := nvcc
|
|
||||||
bitonic_v1deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V1
|
|
||||||
bitonic_v1deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V1
|
|
||||||
bitonic_v1deb: OUTPUT_DIR := $(OUTPUT_DIR)/v1
|
|
||||||
bitonic_v1deb: $(BUILD_DIR)/$(TARGET)
|
|
||||||
@mkdir -p $(OUTPUT_DIR)
|
|
||||||
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
|
|
||||||
|
|
||||||
|
|
||||||
bitonic_v2deb: CC := nvcc -G -g -x cu
|
|
||||||
bitonic_v2deb: CXX := nvcc -G -g -x cu
|
|
||||||
bitonic_v2deb: LINKER := nvcc
|
|
||||||
bitonic_v2deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V2
|
|
||||||
bitonic_v2deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V2
|
|
||||||
bitonic_v2deb: OUTPUT_DIR := $(OUTPUT_DIR)/v2
|
|
||||||
bitonic_v2deb: $(BUILD_DIR)/$(TARGET)
|
|
||||||
@mkdir -p $(OUTPUT_DIR)
|
|
||||||
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
bitonic_v0: CC := nvcc -x cu
|
|
||||||
bitonic_v0: CXX := nvcc -x cu
|
|
||||||
bitonic_v0: LINKER := nvcc
|
|
||||||
bitonic_v0: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V0
|
|
||||||
bitonic_v0: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V0
|
|
||||||
bitonic_v0: OUTPUT_DIR := $(OUTPUT_DIR)/v0
|
|
||||||
bitonic_v0: $(BUILD_DIR)/$(TARGET)
|
|
||||||
@mkdir -p $(OUTPUT_DIR)
|
|
||||||
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
|
|
||||||
|
|
||||||
|
|
||||||
bitonic_v1: CC := nvcc -x cu
|
|
||||||
bitonic_v1: CXX := nvcc -x cu
|
|
||||||
bitonic_v1: LINKER := nvcc
|
|
||||||
bitonic_v1: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V1
|
|
||||||
bitonic_v1: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V1
|
|
||||||
bitonic_v1: OUTPUT_DIR := $(OUTPUT_DIR)/v1
|
|
||||||
bitonic_v1: $(BUILD_DIR)/$(TARGET)
|
|
||||||
@mkdir -p $(OUTPUT_DIR)
|
|
||||||
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
|
|
||||||
|
|
||||||
bitonic_v2: CC := nvcc -x cu
|
|
||||||
bitonic_v2: CXX := nvcc -x cu
|
|
||||||
bitonic_v2: LINKER := nvcc
|
|
||||||
bitonic_v2: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V2
|
|
||||||
bitonic_v2: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V2
|
|
||||||
bitonic_v2: OUTPUT_DIR := $(OUTPUT_DIR)/v2
|
|
||||||
bitonic_v2: $(BUILD_DIR)/$(TARGET)
|
|
||||||
@mkdir -p $(OUTPUT_DIR)
|
|
||||||
cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
|
|
||||||
|
|
||||||
|
|
||||||
hpc-build:
|
|
||||||
make clean
|
|
||||||
make bitonic_v0
|
|
||||||
make clean
|
|
||||||
make bitonic_v1
|
|
||||||
make clean
|
|
||||||
make bitonic_v2
|
|
||||||
|
|
||||||
|
|
||||||
all: debug bitonic_v0
|
|
||||||
# Note:
|
|
||||||
# Add a gcc based make rule here in order for clangd to successfully scan the project files.
|
|
||||||
# Otherwise we do not need the gcc build.
|
|
||||||
|
|
@ -1,33 +0,0 @@
|
|||||||
Parallel & Distributed Computer Systems HW3
|
|
||||||
|
|
||||||
January, 2025
|
|
||||||
|
|
||||||
Write a program that sorts $N$ integers in ascending order, using CUDA.
|
|
||||||
|
|
||||||
The program must perform the following tasks:
|
|
||||||
|
|
||||||
- The user specifies a positive integers $q$.
|
|
||||||
|
|
||||||
- Start a process with an array of $N = 2^q$ random integers is each processes.
|
|
||||||
|
|
||||||
- Sort all $N$ elements int ascending order.
|
|
||||||
|
|
||||||
- Check the correctness of the final result.
|
|
||||||
|
|
||||||
Your implementation should be based on the following steps:
|
|
||||||
|
|
||||||
V0. A kernel where each thread only compares and exchanges. This "eliminates" the 1:n innermost loop. Easy to write, but too many function calls and global synchronizations.
|
|
||||||
|
|
||||||
V1. Include the k inner loop in the kernel function. How do we handle the synchronization? Fewer calls, fewer global synchronizations. Faster than V0!
|
|
||||||
|
|
||||||
V2. Modify the kernel of V1 to work with local memory instead of global.
|
|
||||||
|
|
||||||
You must deliver:
|
|
||||||
- A report (about $3-4$ pages) that describes your parallel algorithm and implementation.
|
|
||||||
|
|
||||||
- Your comments on the speed of your parallel program compared to the serial sort, after trying you program on aristotelis for $q = [20:27]$.
|
|
||||||
|
|
||||||
- The source code of your program uploaded online.
|
|
||||||
|
|
||||||
Ethics: If you use code found on the web or by an LLM, you should mention your source and the changes you made. You may work in pairs; both partners must submit a single report with both names.
|
|
||||||
Deadline: 2 February, $2025$.
|
|
@ -1,26 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
# Parameters
|
|
||||||
versions=("v0" "v1" "v2")
|
|
||||||
q_values=(20 21 22 23 24 25 26 27 28 29 30)
|
|
||||||
|
|
||||||
# Make scripts
|
|
||||||
for version in "${versions[@]}"; do
|
|
||||||
for q in "${q_values[@]}"; do
|
|
||||||
filename="Bitnc${version^^}Q${q}.sh" # Convert v0 -> V0 etc...
|
|
||||||
cat > "$filename" <<EOL
|
|
||||||
#! /usr/bin/env bash
|
|
||||||
|
|
||||||
#SBATCH --job-name=Bitnc${version^^}Q${q}
|
|
||||||
#SBATCH --nodes=1
|
|
||||||
#SBATCH --gres=gpu:1
|
|
||||||
#SBATCH --time=10:00
|
|
||||||
|
|
||||||
module load gcc/9.2.0 cuda/11.1.0
|
|
||||||
|
|
||||||
./out/${version}/bitonicCUDA -v --validation --perf 7 -b 512 -q ${q}
|
|
||||||
|
|
||||||
EOL
|
|
||||||
echo "Create: $filename"
|
|
||||||
done
|
|
||||||
done
|
|
@ -1,26 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
# Submission parameters
|
|
||||||
QOS="small"
|
|
||||||
PARTITION="ampere"
|
|
||||||
SCRIPT_DIR="hpc" # Directory containing the job scripts
|
|
||||||
|
|
||||||
# Range of values for the -q parameter
|
|
||||||
VERSIONS=("V0" "V1" "V2")
|
|
||||||
Q_START=20
|
|
||||||
Q_END=30
|
|
||||||
|
|
||||||
# Submitting the jobs
|
|
||||||
for version in "${VERSIONS[@]}"; do
|
|
||||||
for ((q = Q_START; q <= Q_END; q++)); do
|
|
||||||
script_name="Bitnc${version}Q${q}.sh"
|
|
||||||
script_path="${SCRIPT_DIR}/${script_name}"
|
|
||||||
|
|
||||||
if [[ -f "$script_path" ]]; then
|
|
||||||
sbatch --qos="$QOS" -p "$PARTITION" "$script_path"
|
|
||||||
echo "Submitted: $script_path"
|
|
||||||
else
|
|
||||||
echo "Warning: File not found - $script_path"
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
done
|
|
@ -1,456 +0,0 @@
|
|||||||
/*!
|
|
||||||
* \file
|
|
||||||
* \brief Bitonic sort CUDA implementation header
|
|
||||||
*
|
|
||||||
* \author
|
|
||||||
* Christos Choutouridis AEM:8997
|
|
||||||
* <cchoutou@ece.auth.gr>
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef BITONICSORTCUDA_H_
|
|
||||||
#define BITONICSORTCUDA_H_
|
|
||||||
|
|
||||||
#include <cuda_runtime.h>
|
|
||||||
#include <vector>
|
|
||||||
#include <cmath>
|
|
||||||
#include <cstdint>
|
|
||||||
#include <utility>
|
|
||||||
|
|
||||||
#include "utils.hpp"
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Exported timers
|
|
||||||
*/
|
|
||||||
extern Timing Timer_total, Timer_memory, Timer_sorting;
|
|
||||||
|
|
||||||
using threadId_t = size_t;
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
* ============================== Sort utilities ==============================
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* Returns the ascending or descending configuration (up/down phase) of the thread id
|
|
||||||
* depending on the current depth
|
|
||||||
*
|
|
||||||
* @param tid [threadId_t] The current thread
|
|
||||||
* @param stage [size_t] The current stage of the sorting network (same for each step)
|
|
||||||
* @return [bool] True if we need ascending configuration, false otherwise
|
|
||||||
*/
|
|
||||||
__device__ inline bool ascending(threadId_t tid, size_t stage) noexcept {
|
|
||||||
return !(tid & (1 << stage));
|
|
||||||
}
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* Returns the thread's partner for data exchange during the sorting network iterations
|
|
||||||
* of Bitonic
|
|
||||||
*
|
|
||||||
* @param tid [threadId_t] The current node
|
|
||||||
* @param step [size_t] The step of the sorting network
|
|
||||||
* @return [threadId_t] The node id of the partner for data exchange
|
|
||||||
*/
|
|
||||||
__device__ inline threadId_t partner(threadId_t tid, size_t step) noexcept {
|
|
||||||
return (tid ^ (1 << step));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* Predicate to check if a node keeps the small numbers during the bitonic sort network exchange.
|
|
||||||
*
|
|
||||||
* @param tid [threadId_t] The node for which we check
|
|
||||||
* @param partner [threadId_t] The partner of the data exchange
|
|
||||||
* @param stage [size_t] The current stage of the sorting network (same for each step)
|
|
||||||
* @return [bool] True if the node should keep the small values, false otherwise
|
|
||||||
*/
|
|
||||||
|
|
||||||
__device__ inline bool keepSmall(threadId_t tid, threadId_t partner, size_t stage) {
|
|
||||||
return ascending(tid, stage) == (tid < partner);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
* ============================== Sort algorithms ==============================
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* Each thread can handle 2 points in the array. For each of these 2 points it may
|
|
||||||
* - compare and exchange if needed
|
|
||||||
* - copy data to local and back if needed
|
|
||||||
*/
|
|
||||||
static constexpr size_t SizeToThreadsRatio = 2;
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* Calculates the blocks needed for the entire sorting process
|
|
||||||
*
|
|
||||||
* @note
|
|
||||||
* This "redundant" little trick makes sure blocks are allocated for arraySizes that are not exact
|
|
||||||
* multipliers of config.blockSize.
|
|
||||||
* Even if we don't need it, we keep it in case we experiment with weird sizes in the future!
|
|
||||||
*
|
|
||||||
* @param arraySize [ArraySize_t] The size of the entire array (in points)
|
|
||||||
* @return [size_t] The number of blocks
|
|
||||||
*/
|
|
||||||
inline size_t NBlocks(ArraySize_t arraySize) {
|
|
||||||
return (((arraySize + config.blockSize - 1) / config.blockSize) / SizeToThreadsRatio);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* Exchange utility
|
|
||||||
*
|
|
||||||
* @tparam ValueT The underlying data type of the array items
|
|
||||||
*
|
|
||||||
* @param data [ValueT*] Pointer to data array
|
|
||||||
* @param tid [threadId_t] Current thread's index to data
|
|
||||||
* @param pid [threadId_t] Parents's index to data
|
|
||||||
* @param keepSmall [bool] Flag to indicate if current threads is keeping the small
|
|
||||||
*/
|
|
||||||
template <typename ValueT>
|
|
||||||
__device__ void exchange(ValueT* data, threadId_t tid, threadId_t partner, bool keepSmall) {
|
|
||||||
if (( keepSmall && (data[tid] > data[partner])) ||
|
|
||||||
(!keepSmall && (data[tid] < data[partner])) ) {
|
|
||||||
ValueT temp = data[tid];
|
|
||||||
data[tid] = data[partner];
|
|
||||||
data[partner] = temp;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#if CODE_VERSION == V0
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* This is the body of each thread. This function compare and exchange data
|
|
||||||
*
|
|
||||||
* @tparam ValueT The underlying data type of the array items
|
|
||||||
* @param data [ValueT*] Pointer to data array
|
|
||||||
* @param n [size_t] The total size of the array
|
|
||||||
* @param step [size_t] The current step of the current stage of bitonic sort
|
|
||||||
* @param stage [size_t] The current stage of bitonic sort
|
|
||||||
*/
|
|
||||||
template <typename ValueT>
|
|
||||||
__global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) {
|
|
||||||
threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x; // Keep contiguous addressing to the first half of the array
|
|
||||||
threadId_t pid = partner(tid, step);
|
|
||||||
if (tid > pid) {
|
|
||||||
// Shift to the other half of the array for global data
|
|
||||||
tid += n / SizeToThreadsRatio;
|
|
||||||
pid += n / SizeToThreadsRatio;
|
|
||||||
}
|
|
||||||
if ((tid < n) && (pid < n)) { // Boundary check
|
|
||||||
bool keep = keepSmall(tid, pid, stage);
|
|
||||||
exchange(data, tid, pid, keep);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* A CUDA version of the Bitonic sort algorithm.
|
|
||||||
*
|
|
||||||
* @tparam DataT A container type to hold data array. Should have .data() and .size() methods
|
|
||||||
* @param data [DataT&] Reference to the container to sort
|
|
||||||
*/
|
|
||||||
template <typename DataT>
|
|
||||||
void bitonicSort(DataT& data) {
|
|
||||||
using value_t = typename DataT::value_type;
|
|
||||||
|
|
||||||
value_t* dev_data;
|
|
||||||
auto size = data.size();
|
|
||||||
|
|
||||||
Timer_memory.start();
|
|
||||||
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
|
|
||||||
throw std::runtime_error("[CUDA] - Can not allocate memory\n");
|
|
||||||
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
|
|
||||||
throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
|
|
||||||
Timer_memory.stop();
|
|
||||||
|
|
||||||
size_t Nth = config.blockSize;
|
|
||||||
size_t Nbl = NBlocks(size);
|
|
||||||
|
|
||||||
size_t Stages = static_cast<size_t>(log2(size));
|
|
||||||
Timer_sorting.start();
|
|
||||||
for (size_t stage = 1; stage <= Stages; ++stage) {
|
|
||||||
for (size_t step = stage; step > 0; ) {
|
|
||||||
--step;
|
|
||||||
bitonicStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
|
|
||||||
cudaDeviceSynchronize();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Timer_sorting.stop();
|
|
||||||
|
|
||||||
Timer_memory.start();
|
|
||||||
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
|
|
||||||
throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
|
|
||||||
cudaFree(dev_data);
|
|
||||||
Timer_memory.stop();
|
|
||||||
}
|
|
||||||
|
|
||||||
#elif CODE_VERSION == V1
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* This is the body of each thread. This function compare and exchange data
|
|
||||||
*
|
|
||||||
* @tparam ValueT The underlying data type of the array items
|
|
||||||
* @param data [ValueT*] Pointer to data array
|
|
||||||
* @param n [size_t] The total size of the array
|
|
||||||
* @param step [size_t] The current step of the current stage of bitonic sort
|
|
||||||
* @param stage [size_t] The current stage of bitonic sort
|
|
||||||
*/
|
|
||||||
template <typename ValueT>
|
|
||||||
__device__ void interBlockStep_(ValueT* data, size_t n, size_t step, size_t stage) {
|
|
||||||
/*
|
|
||||||
* Here we skip blocks every time (one for SizeToThreadsRatio = 2)
|
|
||||||
* And we use the neighbor block address indices for the other half of the threads
|
|
||||||
*/
|
|
||||||
threadId_t tid = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
|
|
||||||
threadId_t pid = partner(tid, step);
|
|
||||||
if (tid > pid) {
|
|
||||||
// Shift to the other half of the array for global data
|
|
||||||
tid += blockDim.x;
|
|
||||||
pid += blockDim.x;
|
|
||||||
}
|
|
||||||
if ((tid < n) && (pid < n)) { // Boundary check
|
|
||||||
bool keep = keepSmall(tid, pid, stage);
|
|
||||||
exchange(data, tid, pid, keep);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* This is the version of the body that is called outside of the loop unrolling
|
|
||||||
*
|
|
||||||
* @tparam ValueT The underlying data type of the array items
|
|
||||||
* @param data [ValueT*] Pointer to data array
|
|
||||||
* @param n [size_t] The total size of the array
|
|
||||||
* @param step [size_t] The current step of the current stage of bitonic sort
|
|
||||||
* @param stage [size_t] The current stage of bitonic sort
|
|
||||||
*/
|
|
||||||
template <typename ValueT>
|
|
||||||
__global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
|
|
||||||
interBlockStep_(data, n, step, stage);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* This is unrolled part of the bitonic double loop.
|
|
||||||
*
|
|
||||||
* @tparam ValueT The underlying data type of the array items
|
|
||||||
* @param data [ValueT*] Pointer to data array
|
|
||||||
* @param n [size_t] The total size of the array
|
|
||||||
* @param step [size_t] The current step of the current stage of bitonic sort
|
|
||||||
* @param stage [size_t] The current stage of bitonic sort
|
|
||||||
*/
|
|
||||||
template <typename ValueT>
|
|
||||||
__global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
|
|
||||||
for (size_t step = innerSteps + 1; step > 0; ) {
|
|
||||||
--step;
|
|
||||||
interBlockStep_(data, n, step, stage);
|
|
||||||
__syncthreads();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* A CUDA version of the Bitonic sort algorithm.
|
|
||||||
*
|
|
||||||
* @tparam DataT A container type to hold data array. Should have .data() and .size() methods
|
|
||||||
* @param data [DataT&] Reference to the container to sort
|
|
||||||
*/
|
|
||||||
template <typename DataT>
|
|
||||||
void bitonicSort(DataT& data) {
|
|
||||||
using value_t = typename DataT::value_type;
|
|
||||||
|
|
||||||
value_t* dev_data;
|
|
||||||
auto size = data.size();
|
|
||||||
|
|
||||||
Timer_memory.start();
|
|
||||||
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
|
|
||||||
throw std::runtime_error("[CUDA] - Can not allocate memory\n");
|
|
||||||
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
|
|
||||||
throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
|
|
||||||
Timer_memory.stop();
|
|
||||||
|
|
||||||
size_t Nth = config.blockSize;
|
|
||||||
size_t Nbl = NBlocks(size);
|
|
||||||
|
|
||||||
auto Stages = static_cast<size_t>(log2(size));
|
|
||||||
auto InnerBlockSteps = static_cast<size_t>(log2(Nth)); //
|
|
||||||
Timer_sorting.start();
|
|
||||||
for (size_t stage = 1; stage <= Stages; ++stage) {
|
|
||||||
size_t step = stage - 1;
|
|
||||||
for ( ; step > InnerBlockSteps; --step) {
|
|
||||||
interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
|
|
||||||
cudaDeviceSynchronize();
|
|
||||||
}
|
|
||||||
inBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
|
|
||||||
cudaDeviceSynchronize();
|
|
||||||
}
|
|
||||||
Timer_sorting.stop();
|
|
||||||
|
|
||||||
Timer_memory.start();
|
|
||||||
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
|
|
||||||
throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
|
|
||||||
cudaFree(dev_data);
|
|
||||||
Timer_memory.stop();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#elif CODE_VERSION == V2
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* @return The memory that each block local threads can affect.
|
|
||||||
*
|
|
||||||
* @note
|
|
||||||
* Each block thread collection can exchange twice the size of data points.
|
|
||||||
*/
|
|
||||||
inline size_t effectiveBlockSize() { return SizeToThreadsRatio * config.blockSize; }
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* Converts the global address of the data to the local shared memory array which is used
|
|
||||||
* as cached memory to the unrolled part of the bitonic sort loop.
|
|
||||||
*
|
|
||||||
* @note
|
|
||||||
* Each block's thread collection can exchange twice the size of data points.
|
|
||||||
* These points get copied (cached) in the shared memory location. We use contiguous blocks
|
|
||||||
* both in global data memory and the shared memory buffer.
|
|
||||||
*
|
|
||||||
* @param gIndex The global array index
|
|
||||||
* @param blockDim The block size (threads per block)
|
|
||||||
* @return The equivalent local address of the shared memory
|
|
||||||
*/
|
|
||||||
__device__ inline size_t toLocal(size_t gIndex, size_t blockDim) {
|
|
||||||
return gIndex % (SizeToThreadsRatio * blockDim);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* This is the version of the body that is called outside of the loop unrolling
|
|
||||||
*
|
|
||||||
* @tparam ValueT The underlying data type of the array items
|
|
||||||
* @param data [ValueT*] Pointer to data array
|
|
||||||
* @param n [size_t] The total size of the array
|
|
||||||
* @param step [size_t] The current step of the current stage of bitonic sort
|
|
||||||
* @param stage [size_t] The current stage of bitonic sort
|
|
||||||
*/
|
|
||||||
template <typename ValueT>
|
|
||||||
__global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
|
|
||||||
threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x; // Keep contiguous addressing to the first half of the array
|
|
||||||
threadId_t pid = partner(tid, step);
|
|
||||||
if (tid > pid) {
|
|
||||||
// Shift to the other half of the array for global data
|
|
||||||
tid += n / SizeToThreadsRatio;
|
|
||||||
pid += n / SizeToThreadsRatio;
|
|
||||||
}
|
|
||||||
if ((tid < n) && (pid < n)) { // Boundary check
|
|
||||||
bool keep = keepSmall(tid, pid, stage);
|
|
||||||
exchange(data, tid, pid, keep);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* This is unrolled part of the bitonic double loop.
|
|
||||||
*
|
|
||||||
* First each thread caches its corresponding data point from the current and the following data block.
|
|
||||||
* After that we execute the loop unrolling on the local data and then we write back to global memory.
|
|
||||||
*
|
|
||||||
* @tparam ValueT The underlying data type of the array items
|
|
||||||
* @param data [ValueT*] Pointer to data array
|
|
||||||
* @param n [size_t] The total size of the array
|
|
||||||
* @param step [size_t] The current step of the current stage of bitonic sort
|
|
||||||
* @param stage [size_t] The current stage of bitonic sort
|
|
||||||
*/
|
|
||||||
template <typename ValueT>
|
|
||||||
__global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
|
|
||||||
extern __shared__ ValueT shared_data[];
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Global and local(shared) memory indices (calculated once)
|
|
||||||
* Here we skip blocks every time (one for SizeToThreadsRatio = 2)
|
|
||||||
* And we cache the neighbor block address indexes in local (shared) memory
|
|
||||||
*/
|
|
||||||
threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
|
|
||||||
threadId_t lIdx0 = toLocal(gIdx0, blockDim.x);
|
|
||||||
|
|
||||||
if (gIdx0 + blockDim.x >= n) // Boundary check
|
|
||||||
return;
|
|
||||||
|
|
||||||
// Fetch to local memory the entire effective block size (2 positions for each thread)
|
|
||||||
shared_data[lIdx0] = data[gIdx0];
|
|
||||||
shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x];
|
|
||||||
__syncthreads();
|
|
||||||
|
|
||||||
for (size_t step = innerSteps + 1; step > 0; ) {
|
|
||||||
--step;
|
|
||||||
|
|
||||||
// Init thread global and local indices
|
|
||||||
threadId_t gIdx = gIdx0;
|
|
||||||
threadId_t lIdx = lIdx0;
|
|
||||||
// Find partner and keep-small configuration based on the global data positions
|
|
||||||
threadId_t pIdx = partner(gIdx, step);
|
|
||||||
if (gIdx > pIdx) {
|
|
||||||
// Shift inside effective block
|
|
||||||
gIdx += blockDim.x; // global
|
|
||||||
pIdx += blockDim.x;
|
|
||||||
lIdx += blockDim.x; // local
|
|
||||||
}
|
|
||||||
bool keep = keepSmall(gIdx, pIdx, stage);
|
|
||||||
|
|
||||||
// Exchange data on local(shared) copy
|
|
||||||
threadId_t lpIdx = toLocal(pIdx, blockDim.x);
|
|
||||||
exchange(shared_data, lIdx, lpIdx, keep);
|
|
||||||
__syncthreads();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Write back to global memory
|
|
||||||
data[gIdx0] = shared_data[lIdx0];
|
|
||||||
data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
|
|
||||||
__syncthreads();
|
|
||||||
}
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* A CUDA version of the Bitonic sort algorithm.
|
|
||||||
*
|
|
||||||
* @tparam DataT A container type to hold data array. Should have .data() and .size() methods
|
|
||||||
* @param data [DataT&] Reference to the container to sort
|
|
||||||
*/
|
|
||||||
template <typename DataT>
|
|
||||||
void bitonicSort(DataT& data) {
|
|
||||||
using value_t = typename DataT::value_type;
|
|
||||||
|
|
||||||
value_t* dev_data;
|
|
||||||
auto size = data.size();
|
|
||||||
|
|
||||||
Timer_memory.start();
|
|
||||||
if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
|
|
||||||
throw std::runtime_error("[CUDA] - Can not allocate memory\n");
|
|
||||||
if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
|
|
||||||
throw std::runtime_error("[CUDA] - Can not copy memory to device\n");
|
|
||||||
Timer_memory.stop();
|
|
||||||
|
|
||||||
size_t Nth = config.blockSize;
|
|
||||||
size_t Nbl = NBlocks(size);
|
|
||||||
size_t kernelMemSize = effectiveBlockSize() * sizeof(value_t);
|
|
||||||
|
|
||||||
auto Stages = static_cast<size_t>(log2(size));
|
|
||||||
auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
|
|
||||||
Timer_sorting.start();
|
|
||||||
for (size_t stage = 1; stage <= Stages; ++stage) {
|
|
||||||
size_t step = stage - 1;
|
|
||||||
for ( ; step > InnerBlockSteps; --step) {
|
|
||||||
interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
|
|
||||||
cudaDeviceSynchronize();
|
|
||||||
}
|
|
||||||
inBlockStep<<<Nbl, Nth, kernelMemSize>>>(dev_data, size, step, stage);
|
|
||||||
cudaDeviceSynchronize();
|
|
||||||
}
|
|
||||||
Timer_sorting.stop();
|
|
||||||
|
|
||||||
Timer_memory.start();
|
|
||||||
if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
|
|
||||||
throw std::runtime_error("[CUDA] - Can not copy memory from device\n");
|
|
||||||
cudaFree(dev_data);
|
|
||||||
Timer_memory.stop();
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif //BITONICSORTCUDA_H_
|
|
@ -1,82 +0,0 @@
|
|||||||
/*!
|
|
||||||
* \file
|
|
||||||
* \brief Build and runtime configuration file.
|
|
||||||
*
|
|
||||||
* \author
|
|
||||||
* Christos Choutouridis AEM:8997
|
|
||||||
* <cchoutou@ece.auth.gr>
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef CONFIG_H_
|
|
||||||
#define CONFIG_H_
|
|
||||||
|
|
||||||
#include <cstdint>
#include <vector>

#include <cuda_runtime.h>
|
|
||||||
|
|
||||||
/*
 * Versioning:
 *  - RC1: First version to test on HPC
 */
static constexpr char version[] = "0.1";

/*
 * Defines for different version of the exercise
 */
#define V0 0
#define V1 1
#define V2 2

// Fail-safe version selection: default to the latest variant when the
// build system does not pass -DCODE_VERSION
#if !defined CODE_VERSION
#define CODE_VERSION V2
#endif

// Default Data size (in case -q <N> is not present)
static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16;

// Placeholder default (actual default comes from device properties read at initialization)
static constexpr size_t THREADS_PER_BLOCK = 1024;


/*!
 * Value and Buffer type selection
 *
 * We support the following compiler types or the <cstdint> that translate to them:
 *    char       - unsigned char
 *    short      - unsigned short
 *    int        - unsigned int
 *    long       - unsigned long
 *    long long  - unsigned long long
 *    float
 *    double
 */
using Value_t = uint32_t;
using Data_t = std::vector<Value_t>;    // NOTE(review): std::vector needs <vector>; header relies on a transitive include — confirm

/*!
 * In theory we can support large arrays ;)
 */
using ArraySize_t = uint64_t;
|
|
||||||
|
|
||||||
/*!
 * Session option for each invocation of the executable.
 *
 * @note
 *    The values of the members are set from the command line (see get_options()).
 */
struct config_t {
    ArraySize_t arraySize{DEFAULT_DATA_SIZE};   //!< The array size of the local data to sort.
    size_t blockSize{THREADS_PER_BLOCK};        //!< The block size (threads per block) for the session.
    bool validation{false};                     //!< Request a full validation at the end.
    size_t perf{1};                             //!< Enable performance timing measurements and prints. Repeat
                                                //!< the sorting <perf> times to do so.
    bool verbose{false};                        //!< Flag to enable verbose output to stdout.
};
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Exported data types
|
|
||||||
*/
|
|
||||||
extern config_t config;
|
|
||||||
extern cudaDeviceProp device;
|
|
||||||
|
|
||||||
#endif /* CONFIG_H_ */
|
|
@ -1,260 +0,0 @@
|
|||||||
/*!
|
|
||||||
* \file
|
|
||||||
* \brief Main application file for PDS HW3 (CUDA)
|
|
||||||
*
|
|
||||||
* \author
|
|
||||||
* Christos Choutouridis AEM:8997
|
|
||||||
* <cchoutou@ece.auth.gr>
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <algorithm>
#include <cmath>
#include <exception>
#include <iostream>
#include <limits>
#include <random>

#include <cuda_runtime.h>

#include "utils.hpp"
#include "config.h"
#include "bitonicsort.hpp"
|
|
||||||
|
|
||||||
|
|
||||||
// Global session data
Data_t Data;            //!< Host-side array to sort; resized in init() to config.arraySize.
config_t config;        //!< Runtime configuration, filled by get_options().
Log logger;             //!< Verbose-mode logger (prints only when -v is given).
cudaDeviceProp device;  //!< Properties of CUDA device 0, read in init().

// Mersenne seeded from hw if possible. range: [type_min, type_max]
std::random_device rd;
std::mt19937 gen(rd());

//! Performance timers for each one of the "costly" functions
Timing Timer_total, Timer_memory, Timer_sorting;
|
|
||||||
|
|
||||||
|
|
||||||
//! Init timing objects for extra rounds
|
|
||||||
void measurements_init() {
|
|
||||||
if (config.perf > 1) {
|
|
||||||
Timer_total.init(config.perf);
|
|
||||||
Timer_memory.init(config.perf);
|
|
||||||
Timer_sorting.init(config.perf);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//! iterate ot the next round of measurements for all measurement objects
|
|
||||||
void measurements_next() {
|
|
||||||
if (config.perf > 1) {
|
|
||||||
Timer_total.next();
|
|
||||||
Timer_memory.next();
|
|
||||||
Timer_sorting.next();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* A small command line argument parser
|
|
||||||
* \return The status of the operation
|
|
||||||
*/
|
|
||||||
bool get_options(int argc, char* argv[]){
|
|
||||||
bool status =true;
|
|
||||||
|
|
||||||
// iterate over the passed arguments
|
|
||||||
for (int i=1 ; i<argc ; ++i) {
|
|
||||||
std::string arg(argv[i]); // get current argument
|
|
||||||
|
|
||||||
if (arg == "-q" || arg == "--array-size") {
|
|
||||||
if (i+1 < argc) {
|
|
||||||
config.arraySize = (ArraySize_t)1 << atoi(argv[++i]);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
status = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (arg == "-b" || arg == "--block-size") {
|
|
||||||
if (i+1 < argc) {
|
|
||||||
config.blockSize = atoi(argv[++i]);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
status = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (arg == "--validation") {
|
|
||||||
config.validation = true;
|
|
||||||
}
|
|
||||||
else if (arg == "--perf") {
|
|
||||||
if (i+1 < argc) {
|
|
||||||
config.perf = atoi(argv[++i]);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
status = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (arg == "-v" || arg == "--verbose") {
|
|
||||||
config.verbose = true;
|
|
||||||
}
|
|
||||||
else if (arg == "--version") {
|
|
||||||
std::cout << STR(TARGET) << " - A GPU accelerated bitonic sort utility (V" << STR(CODE_VERSION)<< ") \n";
|
|
||||||
std::cout << "version: " << version << "\n\n";
|
|
||||||
exit(0);
|
|
||||||
}
|
|
||||||
else if (arg == "-h" || arg == "--help") {
|
|
||||||
std::cout << STR(TARGET) << " - A GPU accelerated bitonic sort utility (V" << STR(CODE_VERSION)<< ") \n\n";
|
|
||||||
std::cout << " " << STR(TARGET) << " -q <N> -b <N> [--validation] [--perf <N>] [-v]\n";
|
|
||||||
std::cout << " " << STR(TARGET) << " -h\n";
|
|
||||||
std::cout << " " << STR(TARGET) << " --version\n";
|
|
||||||
std::cout << '\n';
|
|
||||||
std::cout << "Options:\n\n";
|
|
||||||
std::cout << " -q | --array-size <N>\n";
|
|
||||||
std::cout << " Selects the array size according to size = 2^N\n";
|
|
||||||
std::cout << " [Size must be larger than 2 * blockSize]\n";
|
|
||||||
std::cout << " [Default is 2^16]\n\n";
|
|
||||||
std::cout << " -b | --block-size <N>\n";
|
|
||||||
std::cout << " Selects the number of CUDA threads per block\n";
|
|
||||||
std::cout << " [Size has to be multiple of device's warp size (usually 32)\n";
|
|
||||||
std::cout << " [Default is the maximum device supported number. For ex: (GTX 1650) block-size=1024]\n\n";
|
|
||||||
std::cout << " --validation\n";
|
|
||||||
std::cout << " Request a full validation at the end\n\n";
|
|
||||||
std::cout << " --perf <N> \n";
|
|
||||||
std::cout << " Enable performance timing measurements and prints, and repeat\n";
|
|
||||||
std::cout << " the sorting <N> times.\n\n";
|
|
||||||
std::cout << " -v | --verbose\n";
|
|
||||||
std::cout << " Request a more verbose output to stdout.\n\n";
|
|
||||||
std::cout << " -h | --help\n";
|
|
||||||
std::cout << " Prints this and exit.\n\n";
|
|
||||||
std::cout << " --version\n";
|
|
||||||
std::cout << " Prints version and exit.\n\n";
|
|
||||||
std::cout << "Examples:\n\n";
|
|
||||||
std::cout << " " << STR(TARGET) << " -q 24\n";
|
|
||||||
std::cout << " Runs bitonic sort on an 2^24 points array, using GPU acceleration\n\n";
|
|
||||||
std::cout << " " << STR(TARGET) << " --validation --perf 5 -b 512 -q 26\n";
|
|
||||||
std::cout << " Runs bitonic sort on an 2^26 points array 5 times, using GPU acceleration with\n";
|
|
||||||
std::cout << " 512 threads per block, performs a validation check at the end and prints the time\n";
|
|
||||||
std::cout << " of the median.\n\n";
|
|
||||||
|
|
||||||
exit(0);
|
|
||||||
}
|
|
||||||
else { // parse error
|
|
||||||
std::cout << "Invocation error. Try -h for details.\n";
|
|
||||||
status = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check configuration requirements
|
|
||||||
if (config.blockSize % device.warpSize)
|
|
||||||
throw std::runtime_error("[Config] - Number of threads per block is not an exact multiple of warp size\n");
|
|
||||||
if (config.arraySize < 2*config.blockSize)
|
|
||||||
throw std::runtime_error("[Config] - Unsupported array size (smaller than "
|
|
||||||
+ std::to_string(SizeToThreadsRatio*config.blockSize) + ")\n");
|
|
||||||
if (device.totalGlobalMem < config.arraySize * sizeof(Value_t))
|
|
||||||
throw std::runtime_error("[CUDA] - Unsupported array size: "
|
|
||||||
+ std::to_string(config.arraySize * sizeof(Value_t))
|
|
||||||
+ " (larger than GPU's: " + std::to_string(device.totalGlobalMem) + ")\n");
|
|
||||||
|
|
||||||
return status;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*!
 * A simple validator for the sorted result.
 *
 * @tparam DataT A buffer type with random access iterator.
 *
 * @param data [DataT] The data
 * @return [bool] True if sorted in ascending order
 */
template<typename DataT>
bool validator(DataT& data) {
    return std::is_sorted(std::begin(data), std::end(data));
}
|
|
||||||
|
|
||||||
/*!
 * Initializes the environment; must be called once before any other work.
 *
 * Queries CUDA device 0, seeds the default block size from the device
 * limit, parses the command line, and allocates the host data buffer.
 *
 * @param argc [int*] POINTER to main's argc argument
 * @param argv [char***] POINTER to main's argv argument
 *
 * @note The pointer-style signature mirrors an MPI_Init-like API —
 *       presumably kept for symmetry with the sibling homeworks; the
 *       arguments are only dereferenced, never modified here.
 */
void init(int* argc, char*** argv) {

    // Get device configuration of device 0 (throws if no usable GPU is present)
    if (cudaGetDeviceProperties(&device, 0) != cudaSuccess)
        throw std::runtime_error("[CUDA] - Can not read GPU");

    // Default block size = device maximum; -b on the command line may override it below
    config.blockSize = static_cast<size_t>(device.maxThreadsPerBlock);

    // try to read command line (exits on malformed invocation)
    if (!get_options(*argc, *argv))
        exit(1);

    // Prepare vector and timing data
    Data.resize(config.arraySize);
    measurements_init();
}
|
|
||||||
|
|
||||||
#if !defined TESTING
|
|
||||||
/*!
 * Entry point: initializes the session, repeats the GPU sort over freshly
 * randomized data config.perf times, then reports timing medians and
 * (optionally) validates the final result.
 *
 * @return Returns 0, but.... we may throw or exit(0) / exit(1)
 */
int main(int argc, char* argv[]) try {

    // Init everything (GPU query, CLI parsing, host buffer allocation)
    init(&argc, &argv);

    logger << "Array size: " << config.arraySize << " (Q=" << static_cast<size_t>(log2(config.arraySize))<< ")" << logger.endl;
    logger << "Repeated sorts: " << config.perf << logger.endl;
    logger << "GPU: " << device.name << logger.endl;
    logger << "Block size: " << config.blockSize << logger.endl;

    for (size_t it = 0 ; it < config.perf ; ++it) {
        // Initialize local data: fresh uniform random values over the full Value_t range
        logger << "Initialize array ... ";
        std::uniform_int_distribution<Value_t > dis(
            std::numeric_limits<Value_t>::min(),
            std::numeric_limits<Value_t>::max()
        );
        std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); });
        logger << " Done." << logger.endl;

        // Run the GPU sort; Timer_total wraps the whole call (memory + kernels)
        logger << "Start sorting ... ";
        Timer_total.start();
        bitonicSort(Data);
        Timer_total.stop();
        measurements_next();    // advance all timers to the next measurement slot
        logger << " Done." << logger.endl;
    }

    // Print-outs and validation
    if (config.perf > 1) {
        // Median over the repeated rounds smooths run-to-run noise
        Timing::print_duration(Timer_total.median(), "Total ");
        Timing::print_duration(Timer_memory.median(), "Mem-xch ");
        Timing::print_duration(Timer_sorting.median(),"Sorting ");
    }
    if (config.validation) {
        // If requested, we have the chance to fail!
        std::cout << "[Validation] Results validation ...";
        bool val = validator(Data);
        std::cout << ((val) ? "\x1B[32m [PASSED] \x1B[0m\n" : " \x1B[31m [FAILED] \x1B[0m\n");
    }
    return 0;
}
catch (std::exception& e) {
    // we probably pollute the user's screen. Comment `cerr << ...` if you don't like it.
    std::cerr << "Error: " << e.what() << '\n';
    exit(1);
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
#include <gtest/gtest.h>
|
|
||||||
#include <exception>
|
|
||||||
|
|
||||||
/*!
 * The testing version of our program: forwards straight to the
 * googletest runner.
 */
GTEST_API_ int main(int argc, char **argv) try {
    testing::InitGoogleTest(&argc, argv);
    return RUN_ALL_TESTS();
}
catch (std::exception& e) {
    // NOTE(review): control falls off the end after a caught exception,
    // which for main() returns 0 — a throwing test run reports success.
    // Consider exit(1) here.
    std::cout << "Exception: " << e.what() << '\n';
}
|
|
||||||
|
|
||||||
#endif
|
|
@ -1,157 +0,0 @@
|
|||||||
/**
|
|
||||||
* \file
|
|
||||||
* \brief Utilities header
|
|
||||||
*
|
|
||||||
* \author
|
|
||||||
* Christos Choutouridis AEM:8997
|
|
||||||
* <cchoutou@ece.auth.gr>
|
|
||||||
*/
|
|
||||||
#ifndef UTILS_HPP_
|
|
||||||
#define UTILS_HPP_
|
|
||||||
|
|
||||||
#include <algorithm>
#include <chrono>
#include <cstdio>
#include <iostream>
#include <vector>

#include <unistd.h>

#include "config.h"
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* Stringify preprocessor util
|
|
||||||
*/
|
|
||||||
#define STR(s) STR_(s)
|
|
||||||
#define STR_(s) #s
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* A Logger for entire program.
|
|
||||||
*/
|
|
||||||
struct Log {
|
|
||||||
struct Endl {} endl; //!< a tag object to to use it as a new line request.
|
|
||||||
|
|
||||||
//! We provide logging via << operator
|
|
||||||
template<typename T>
|
|
||||||
Log &operator<<(T &&t) {
|
|
||||||
if (config.verbose) {
|
|
||||||
if (line_) {
|
|
||||||
std::cout << "[Log]: " << t;
|
|
||||||
line_ = false;
|
|
||||||
} else
|
|
||||||
std::cout << t;
|
|
||||||
}
|
|
||||||
return *this;
|
|
||||||
}
|
|
||||||
|
|
||||||
// overload for special end line handling
|
|
||||||
Log &operator<<(Endl e) {
|
|
||||||
(void) e;
|
|
||||||
if (config.verbose) {
|
|
||||||
std::cout << '\n';
|
|
||||||
line_ = true;
|
|
||||||
}
|
|
||||||
return *this;
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
bool line_{true};
|
|
||||||
};
|
|
||||||
|
|
||||||
extern Log logger;
|
|
||||||
|
|
||||||
/*!
 * A small timing utility based on chrono that supports timing rounds
 * and returning the median of them. Time can accumulate to the measurement
 * for each round.
 */
struct Timing {
    using Tpoint = std::chrono::steady_clock::time_point;
    using Tduration = std::chrono::microseconds;
    using microseconds = std::chrono::microseconds;
    using milliseconds = std::chrono::milliseconds;
    using seconds = std::chrono::seconds;

    //! Setup measurement rounds (one zeroed accumulator per round)
    void init(size_t rounds) {
        duration_.resize(rounds);
        for (auto& d : duration_)
            d = Tduration::zero();
    }

    //! tool to mark the starting point
    Tpoint start() noexcept { return mark_ = std::chrono::steady_clock::now(); }

    //! tool to mark the ending point; accumulates into the current round
    Tpoint stop() noexcept {
        Tpoint now = std::chrono::steady_clock::now();
        duration_[current_] += dt(now, mark_);
        return now;
    }

    //! Switch timing slot (wraps around after the last round)
    void next() noexcept {
        ++current_;
        current_ %= duration_.size();
    }

    //! @return The median of the recorded rounds.
    //! @note Sorts the rounds in place, so the per-round order is lost.
    Tduration& median() noexcept {
        std::sort(duration_.begin(), duration_.end());
        return duration_[duration_.size()/2];
    }

    //! A duration calculation utility
    static Tduration dt(Tpoint t2, Tpoint t1) noexcept {
        return std::chrono::duration_cast<Tduration>(t2 - t1);
    }

    //! Tool to print the time interval in a human-friendly unit
    static void print_duration(const Tduration& duration, const char *what) noexcept {
        if (std::chrono::duration_cast<microseconds>(duration).count() < 10000)
            std::cout << "[Timing] " << what << ": "
                << std::to_string(std::chrono::duration_cast<microseconds>(duration).count()) << " [usec]\n";
        else if (std::chrono::duration_cast<milliseconds>(duration).count() < 10000)
            std::cout << "[Timing] " << what << ": "
                << std::to_string(std::chrono::duration_cast<milliseconds>(duration).count()) << " [msec]\n";
        else {
            char stime[32]; // fits a 64-bit count plus '.' and two fraction digits
            auto sec  = static_cast<long long>(std::chrono::duration_cast<seconds>(duration).count());
            auto csec = static_cast<long long>((std::chrono::duration_cast<milliseconds>(duration).count() % 1000) / 10); // keep 2 digits
            // fix: %02lld zero-pads the fraction ("12.05" used to print as "12.5"
            // with %1ld); long long + snprintf match chrono's rep portably and
            // can never overrun the buffer
            std::snprintf(stime, sizeof(stime), "%lld.%02lld", sec, csec);
            std::cout << "[Timing] " << what << ": " << stime << " [sec]\n";
        }

    }

private:
    size_t current_{0};                     //!< index of the active round
    Tpoint mark_{};                         //!< last start() timestamp
    std::vector<Tduration> duration_{1};    //!< accumulated time per round
};
|
|
||||||
|
|
||||||
/*!
 * A "high level function"-like utility macro to forward a function call
 * and accumulate the execution time to the corresponding timing object.
 *
 * @param Tim The Timing object [Needs to have methods start() and stop()]
 * @param Func The function name
 * @param ... The arguments to pass to function (the preprocessor way)
 *
 * @note Expands to three separate statements — wrap the invocation in
 *       braces and do not use it as the sole body of an unbraced if/else.
 */
#define timeCall(Tim, Func, ...) \
    Tim.start(); \
    Func(__VA_ARGS__); \
    Tim.stop(); \
|
|
||||||
|
|
||||||
|
|
||||||
/*!
 * A utility to check if a number is power of two
 *
 * @tparam Integral The integral type of the number to check
 * @param x The number to check
 * @return True if it is power of 2, false otherwise
 */
template <typename Integral>
constexpr inline bool isPowerOfTwo(Integral x) noexcept {
    // A power of two has exactly one set bit, so clearing the lowest
    // set bit (x & (x-1)) must yield zero — and x itself must be non-zero.
    return (x != 0) && ((x & (x - 1)) == 0);
}
|
|
||||||
|
|
||||||
|
|
||||||
#endif /* UTILS_HPP_ */
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,33 +0,0 @@
|
|||||||
/**
|
|
||||||
* \file
|
|
||||||
* \brief PDS HW3 tests
|
|
||||||
*
|
|
||||||
* To run these test execute:
|
|
||||||
* ...
|
|
||||||
*
|
|
||||||
* \author
|
|
||||||
* Christos Choutouridis AEM:8997
|
|
||||||
* <cchoutou@ece.auth.gr>
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <gtest/gtest.h>
|
|
||||||
|
|
||||||
/*
 * Global fixtures
 */

// Shared gtest fixture for the CUDA bitonic sort test suite.
// The suite-wide setup/teardown hooks are intentionally empty placeholders.
class TCUDAbitonic : public ::testing::Test {
protected:
    static void SetUpTestSuite() { }

    static void TearDownTestSuite() { }
};
|
|
||||||
|
|
||||||
|
|
||||||
/*
 * Placeholder smoke test: verifies only that the gtest harness builds
 * and runs — no sorting is exercised yet.
 */
TEST_F(TCUDAbitonic, test1) {

    EXPECT_EQ(true, true);
}
|
|
Loading…
x
Reference in New Issue
Block a user