#
# ---------------------------------------------
# Bitonic v0.5 functionality
#

function partner(node, step)
    partner = node + ((((node+step) % 2) == 0) ? 1 : -1)
end

function active(id, p)
    ret =  (id >= 0) && (id < p)
end

function exchange(localid,  remoteid)
    if verbose
        println("Exchange local data from $localid with partner $remoteid")
    end
    nothing # We have all data here ;)
end

function minmax(data, localid, remoteid, keepsmall)
    # Keep min-max on local data
    temp = copy(data[localid+1, :])
    if (keepsmall)
        view(data, localid+1, :)  .= min.(temp, data[remoteid+1, :])
        view(data, remoteid+1, :) .= max.(temp, data[remoteid+1, :])
    else
        view(data, localid+1, :)  .= max.(temp, data[remoteid+1, :])
        view(data, remoteid+1, :) .= min.(temp, data[remoteid+1, :])
    end
end

"""
    distbubletonic!(p, data)

distributed bitonic v0.5 sort using a "bubble sort"-like functionality to propagate large and small
items between nodes.

p:    The number of processes
data: (p, N/p) array
"""
function distbubletonic!(p, data) 
  
    pid = 0:p-1
    ascending = mod.(pid,2) .== 0
    if verbose
        println("ascending: $ascending")
    end
    # local full sort here (run all MPI nodes)
    for i in 1:p
        sort!(view(data, i, :), rev = !ascending[i])
    end
    for step in 0:p-2
        direction = [true for x = 1:p]
        partnerid = partner.(pid, step)
        activeids = active.(partnerid, p)
        keepsmall = pid .< partnerid
        if verbose
            println("step: $step | active ids: $activeids | partner: $partnerid | keepsmall: $keepsmall")
        end
        # exchange with partner and keep small or large (run all MPI nodes)
        for i in 0:p-1
            l_idx = i+1
            r_idx = partnerid[i+1]+1
            if activeids[l_idx] && i < partnerid[l_idx]
                exchange(i, partnerid[l_idx])
                minmax(data, i, partnerid[l_idx], keepsmall[l_idx])
                sort!(view(data, l_idx, :), rev = !ascending[l_idx])    # elbow sort here
                sort!(view(data, r_idx, :), rev = !ascending[r_idx])    # elbow sort here
            end
        end
    end
    # [optional] reverse the odd positions (run all MPI nodes)
    for i in 1:p
        if !ascending[i]
            sort!(view(data, i, :))
        end
    end
    nothing
end


#
# Homework setup
# ---------------------------------------------
#
p::Int8 = 3  # The order of number of "processors"
q::Int8 = 8  # The data size order (power of 2) of each "processor"
verbose = false;


# Run Script
# ---------------------------------------------
P::Int = 2^p
Q::Int = 2^q
N::Int = 2^(q+p)

println("Distributed Bubbletonic (v0.5) test")
println("p: $p -> Number of processors: $P")
println("q: $q -> Data length for each node: $Q, Total: $(P*Q)")

println("Create an $P x $Q array")
Data = rand(Int8, P, Q)

println("Sort array with $P (MPI) nodes")
@time distbubletonic!(P, Data)

# Test
if issorted(vec(permutedims(Data)))
    println("Test: Passed")
else
    println("Test: Failed")
end