diff --git a/homework_3/Makefile b/homework_3/Makefile
index 4856d50..3773f8f 100644
--- a/homework_3/Makefile
+++ b/homework_3/Makefile
@@ -45,9 +45,9 @@ OUTPUT_DIR := out
 
 # ========== Compiler settings ==========
 # Compiler flags for debug and release
-DEB_CFLAGS := -DDEBUG -g3 -std=c11 -Xcompiler "-Wall -Wextra"
+DEB_CFLAGS := -DDEBUG -std=c11 -Xcompiler "-Wall -Wextra -g3 -DDEBUG"
 REL_CFLAGS := -O3 -std=c11 -Xcompiler "-Wall -Wextra"
-DEB_CXXFLAGS := -DDEBUG -g3 -std=c++17 -Xcompiler "-Wall -Wextra"
+DEB_CXXFLAGS := -DDEBUG -std=c++17 -Xcompiler "-Wall -Wextra -g3 -DDEBUG"
 REL_CXXFLAGS := -O3 -std=c++17 -Xcompiler "-Wall -Wextra"
 
 # Pre-defines
@@ -186,15 +186,40 @@ bitonic_v0: LINKER := nvcc
 bitonic_v0: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V0
 bitonic_v0: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V0
 bitonic_v0: OUTPUT_DIR := $(OUTPUT_DIR)/v0
-bitonic_v0: TARGET := bitonic_v0
 bitonic_v0: $(BUILD_DIR)/$(TARGET)
 	@mkdir -p $(OUTPUT_DIR)
 	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
 
+bitonic_v1: CC := nvcc -x cu
+bitonic_v1: CXX := nvcc -x cu
+bitonic_v1: LINKER := nvcc
+bitonic_v1: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V1
+bitonic_v1: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V1
+bitonic_v1: OUTPUT_DIR := $(OUTPUT_DIR)/v1
+bitonic_v1: $(BUILD_DIR)/$(TARGET)
+	@mkdir -p $(OUTPUT_DIR)
+	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
+
+
+bitonic_v2: CC := nvcc -G -g -x cu
+bitonic_v2: CXX := nvcc -G -g -x cu
+bitonic_v2: LINKER := nvcc
+bitonic_v2: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V2
+bitonic_v2: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V2
+bitonic_v2: OUTPUT_DIR := $(OUTPUT_DIR)/v2
+bitonic_v2: $(BUILD_DIR)/$(TARGET)
+	@mkdir -p $(OUTPUT_DIR)
+	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
+
+
 hpc-build:
 	make clean
 	make bitonic_v0
+	make clean
+	make bitonic_v1
+	make clean
+	make bitonic_v2
 
 all: debug bitonic_v0
 
diff --git a/homework_3/src/bitonicsort.hpp b/homework_3/src/bitonicsort.hpp
index 100cb2d..447e82c 100644
--- a/homework_3/src/bitonicsort.hpp
+++ b/homework_3/src/bitonicsort.hpp
@@ -76,7 +76,7 @@ __device__ inline bool keepSmall(threadId_t tid, threadId_t partner, size_t stag
 
 
 template <typename ValueT>
-__device__ void cudaExchange(ValueT* data, int tid, int partner, bool keepSmall) {
+__device__ void exchange(ValueT* data, int tid, int partner, bool keepSmall) {
   if (( keepSmall && (data[tid] > data[partner])) ||
       (!keepSmall && (data[tid] < data[partner])) ) {
     ValueT temp = data[tid];
@@ -85,16 +85,18 @@ __device__ void cudaExchange(ValueT* data, int tid, int partner, bool keepSmall)
   }
 }
 
-
+#if CODE_VERSION == V0
 template <typename ValueT>
 __global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) {
   threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;  // Compute global thread ID
-  if (tid < n) {
-    threadId_t pid = partner(tid, step);
-    if (pid < n) {
-      bool keep = keepSmall(tid, pid, stage);
-      cudaExchange(data, tid, pid, keep);
-    }
+  threadId_t pid = partner(tid, step);
+  if (tid > pid) {
+    tid += n >> 1;
+    pid += n >> 1;
+  }
+  if ((tid < n) && (pid < n)) {   // Boundary check
+    bool keep = keepSmall(tid, pid, stage);
+    exchange(data, tid, pid, keep);
   }
 }
 
@@ -122,14 +124,14 @@ void bitonicSort(DataT& data) {
   cudaMalloc(&dev_data, size * sizeof(value_t));
   cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);
 
-  int Nthreads = 1024;
-  int Nblocks = (size + Nthreads - 1) / Nthreads;
+  int Nthreads = THREADS_PER_BLOCK;
+  int HalfNblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
 
-  size_t max_depth = static_cast<size_t>(log2(size));
-  for (size_t stage = 1; stage <= max_depth; ++stage) {
+  size_t Stages = static_cast<size_t>(log2(size));
+  for (size_t stage = 1; stage <= Stages; ++stage) {
     for (size_t step = stage; step > 0; ) {
       --step;
-      bitonicStep<<<Nblocks, Nthreads>>>(dev_data, size, step, stage);
+      bitonicStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
       cudaDeviceSynchronize();
     }
   }
@@ -138,8 +140,182 @@ void bitonicSort(DataT& data) {
   cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost);
   cudaFree(dev_data);
 }
+#elif CODE_VERSION == V1
+
+template <typename ValueT>
+__device__ void interBlockStep_(ValueT* data, size_t n, size_t step, size_t stage) {
+  threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;  // Compute global thread ID
+  threadId_t pid = partner(tid, step);
+  if (tid > pid) {
+    tid += n >> 1;
+    pid += n >> 1;
+  }
+  if ((tid < n) && (pid < n)) {   // Boundary check
+    bool keep = keepSmall(tid, pid, stage);
+    exchange(data, tid, pid, keep);
+  }
+}
+
+template <typename ValueT>
+__global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
+  interBlockStep_(data, n, step, stage);
+}
+
+
+template <typename ValueT>
+__global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
+  for (size_t step = innerSteps + 1; step > 0; ) {
+    --step;
+    interBlockStep_(data, n, step, stage);
+    __syncthreads();
+  }
+}
+
+/*!
+ * A CUDA version of the Bitonic sort algorithm (V1).
+ *
+ * @note
+ * Each stage is split into inter-block steps, launched as one kernel per
+ * step, followed by a single in-block kernel that loops over the remaining
+ * steps and synchronizes with __syncthreads() between them.
+ *
+ * @tparam DataT A buffer type with a random access iterator.
+ *
+ * @param data [DataT] The data to sort in place
+ */
+
+template <typename DataT>
+void bitonicSort(DataT& data) {
+  using value_t = typename DataT::value_type;
+
+  value_t* dev_data;
+  auto size = data.size();
+
+  cudaMalloc(&dev_data, size * sizeof(value_t));
+  cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);
+
+  int Nthreads = THREADS_PER_BLOCK;
+  int HalfNblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
+
+  auto Stages = static_cast<size_t>(log2(size));
+  auto InnerBlockSteps = static_cast<size_t>(log2(IN_BLOCK_THRESHOLD));
+  for (size_t stage = 1; stage <= Stages; ++stage) {
+    size_t step = stage - 1;
+    for ( ; step > InnerBlockSteps; --step) {
+      interBlockStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
+      cudaDeviceSynchronize();
+    }
+    inBlockStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
+    cudaDeviceSynchronize();
+  }
+
+  cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost);
+  cudaFree(dev_data);
+}
+#elif CODE_VERSION == V2
+
+template <typename ValueT>
+__global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
+  threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;  // Compute global thread ID
+  threadId_t pid = partner(tid, step);
+  if (tid > pid) {
+    tid += n >> 1;
+    pid += n >> 1;
+  }
+  if ((tid < n) && (pid < n)) {   // Boundary check
+    bool keep = keepSmall(tid, pid, stage);
+    exchange(data, tid, pid, keep);
+  }
+}
+
+
+template <typename ValueT>
+__global__ void inBlockStep(ValueT* data, size_t n, size_t nthreads, size_t innerSteps, size_t stage) {
+  extern __shared__ ValueT shared_data[];
+  ValueT* lowerHalf = (ValueT*) shared_data;
+  ValueT* upperHalf = (ValueT*) &shared_data[nthreads];
+
+  // Global memory thread and partner ids
+  threadId_t thread_id = threadIdx.x + blockIdx.x * blockDim.x;
+
+  // Shared memory thread and partner ids
+  threadId_t local_tid = threadIdx.x;
+
+  if ( thread_id < (n >> 1) ) {
+    // Fetch both halves into shared memory
+    lowerHalf[local_tid] = data[thread_id];
+    upperHalf[local_tid] = data[thread_id + (n >> 1)];
+    __syncthreads();
+
+    for (size_t step = innerSteps + 1; step > 0; ) {
+      --step;
+      // Find partner and localize it
+      threadId_t partner_id = partner(thread_id, step);
+      threadId_t local_pid = partner_id % nthreads;
+
+      if (local_tid < local_pid) {
+        // exchange on the low-side buffer (half of the threads)
+        bool keep = keepSmall(thread_id, partner_id, stage);
+        exchange(lowerHalf, local_tid, local_pid, keep);
+      }
+      else {
+        // exchange on the high-side buffer (other half of the threads)
+        bool keep = keepSmall(thread_id, partner_id, stage);
+        exchange(upperHalf, local_tid, local_pid, keep);
+      }
+      __syncthreads();
+    }
+    // Write back to global memory
+    data[thread_id] = lowerHalf[local_tid];
+    data[thread_id + (n >> 1)] = upperHalf[local_tid];
+    __syncthreads();
+  }
+}
+
+/*!
+ * A CUDA version of the Bitonic sort algorithm (V2).
+ *
+ * @note
+ * Like V1, but the in-block steps work in shared memory: each block stages
+ * the lower and upper half of its slice in shared buffers, exchanges there,
+ * and writes the result back to global memory.
+ *
+ * @tparam DataT A buffer type with a random access iterator.
+ *
+ * @param data [DataT] The data to sort in place
+ */
+
+template <typename DataT>
+void bitonicSort(DataT& data) {
+  using value_t = typename DataT::value_type;
+
+  value_t* dev_data;
+  auto size = data.size();
+
+  cudaMalloc(&dev_data, size * sizeof(value_t));
+  cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);
+
+  int Nthreads = THREADS_PER_BLOCK;
+  int Nblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
+
+  auto Stages = static_cast<size_t>(log2(size));
+  auto InnerBlockSteps = static_cast<size_t>(log2(IN_BLOCK_THRESHOLD));
+  for (size_t stage = 1; stage <= Stages; ++stage) {
+    size_t step = stage - 1;
+    for ( ; step > InnerBlockSteps; --step) {
+      interBlockStep<<<Nblocks, Nthreads>>>(dev_data, size, step, stage);
+      cudaDeviceSynchronize();
+    }
+    inBlockStep<<<Nblocks, Nthreads, 2 * Nthreads * sizeof(value_t)>>>(dev_data, size, Nthreads, step, stage);
+    cudaDeviceSynchronize();
+  }
+
+  cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost);
+  cudaFree(dev_data);
+}
 
 
 
+#endif
 #endif //BITONICSORTCUDA_H_
 
diff --git a/homework_3/src/config.h b/homework_3/src/config.h
index 3ff1235..2cd4b49 100644
--- a/homework_3/src/config.h
+++ b/homework_3/src/config.h
@@ -27,12 +27,14 @@ static constexpr char version[] = "0.0";
 
 // Fail-safe version selection
 #if !defined CODE_VERSION
-#define CODE_VERSION V0
+#define CODE_VERSION V2
 #endif
 
 // Default Data size (in case -q is not present)
 static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16;
 
+static constexpr size_t THREADS_PER_BLOCK = 8;
+static constexpr size_t IN_BLOCK_THRESHOLD = 4;
 
 /*!
  * Value and Buffer type selection
  *
@@ -46,7 +48,7 @@ static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16;
  * float
  * double
  */
-using Value_t = uint32_t;
+using Value_t = uint8_t;
 using Data_t = std::vector<Value_t>;
 
 /*!
diff --git a/homework_3/src/main.cpp b/homework_3/src/main.cpp
index 64cba4e..bd533c5 100644
--- a/homework_3/src/main.cpp
+++ b/homework_3/src/main.cpp
@@ -185,6 +185,13 @@ int main(int argc, char* argv[]) try {
     std::cout << "[Validation] Results validation ...";
     bool val = validator(Data);
     std::cout << ((val) ? "\x1B[32m [PASSED] \x1B[0m\n" : " \x1B[32m [FAILED] \x1B[0m\n");
+    if (Data.size() < 128) {
+      std::cout << "Data: ";
+      for (auto& v : Data) {
+        std::cout << (int)v << ", ";
+      }
+      std::cout << '\n';
+    }
   }
   return 0;
 }
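
For reference, a minimal host-side sketch (not part of the patch) of the launch geometry the V1/V2 drivers above compute, assuming the values this diff sets in config.h (THREADS_PER_BLOCK = 8, IN_BLOCK_THRESHOLD = 4, Value_t = uint8_t, DEFAULT_DATA_SIZE = 1 << 16). Half of the usual ceil(size / Nthreads) blocks are launched because each thread services a pair of elements, and the shared-memory variant needs room for both the lowerHalf and upperHalf staging buffers:

// Illustration only: launch geometry for the V1/V2 drivers in this diff.
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  using Value_t = std::uint8_t;                                        // as set in config.h by this patch
  const std::size_t size = 1 << 16;                                    // DEFAULT_DATA_SIZE
  const std::size_t Nthreads = 8;                                      // THREADS_PER_BLOCK
  const std::size_t HalfNblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
  const auto Stages = static_cast<std::size_t>(std::log2(size));       // 16 stages for 2^16 elements
  const auto InnerBlockSteps = static_cast<std::size_t>(std::log2(4)); // IN_BLOCK_THRESHOLD
  const std::size_t sharedBytes = 2 * Nthreads * sizeof(Value_t);      // lowerHalf + upperHalf (V2 only)

  std::printf("grid=%zu block=%zu stages=%zu in-block steps<=%zu shared=%zu bytes\n",
              HalfNblocks, Nthreads, Stages, InnerBlockSteps, sharedBytes);

  // The kernels from bitonicsort.hpp would then be launched along the lines of:
  //   interBlockStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
  //   inBlockStep<<<HalfNblocks, Nthreads, sharedBytes>>>(dev_data, size, Nthreads, step, stage);
  return 0;
}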