/*
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

#include "NvParticlesGridCuda.h"
#include "cuda_utils.h"
#include "Profiler.h"

#if defined(NVPARTICLES_SPATIAL_GRID_USE_THRUST)

#include <thrust/iterator/counting_iterator.h>
#include <thrust/binary_search.h>
#include <thrust/transform_reduce.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/device_vector.h>
#include <thrust/adjacent_difference.h>
#include <thrust/remove.h>

#elif defined(NVPARTICLES_SPATIAL_GRID_USE_CUDPP)

/// TODO:
// add appropriate headers here.

#else

#include "radixsort.cuh"

#endif

#include "math_utils.h"
//#include "cutil_math.h"
#include "NvParticlesGrid.h"

namespace Easy
{
namespace NvParticles
{
namespace SpatialGrid
{

//------------------------------------------------------------------------------------------
/// Find start and end item-indices of each cell in the sorted particle-list by
/// comparing with previous cellIndex value.
///
/// One thread handles one item of the range [start, start+count).
/// A thread whose cell index differs from its predecessor's marks the beginning of
/// its own cell and the (exclusive) end of the predecessor's cell.
///
/// @param start                first item index of the range to process.
/// @param count                number of items in the range (threads beyond it exit).
/// @param nCells               number of entries in the two output tables; cell indices
///                             outside [0, nCells) are treated as invalid and never written.
/// @param out_cellsStartIndex  per-cell index of the first item in the cell.
/// @param out_cellsEndIndex    per-cell index one past the last item in the cell.
/// @param itemCellIndices      cell index of every item. Without THRUST/CUDPP the buffer is
///                             read as uint2 pairs with the cell index in the .x component.
///
/// Only cells that actually contain items are written; this kernel does not clear
/// either table.
///
__global__
void computeCellRangesKernel(int start, int count, int nCells, uint *out_cellsStartIndex, uint *out_cellsEndIndex, uint *itemCellIndices)
{
    int i = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (i >= count)
        return;
    i += start;

#if !defined(NVPARTICLES_SPATIAL_GRID_USE_CUDPP) && !defined(NVPARTICLES_SPATIAL_GRID_USE_THRUST)
    const uint cellIndex = ((uint2*)itemCellIndices)[i].x;
#else
    const uint cellIndex = itemCellIndices[i];
#endif

    // An item is only usable if its cell lies inside the tables. This also rejects the
    // uint(-1) "invalid" marker (which shouldn't happen!) because nCells is a positive int.
    const bool cellIsValid = (cellIndex < uint(nCells));

    if (i == start)
    {
        // first item of the range: there is no predecessor to compare with.
        if (cellIsValid)
            out_cellsStartIndex[cellIndex] = i;
    }
    else // i > start
    {
        // Kept volatile as in the original code; note that this kernel itself never
        // writes to itemCellIndices.
#if !defined(NVPARTICLES_SPATIAL_GRID_USE_CUDPP) && !defined(NVPARTICLES_SPATIAL_GRID_USE_THRUST)
        volatile uint prevCellIndex = ((uint2*)itemCellIndices)[i-1].x;
#else
        volatile uint prevCellIndex = itemCellIndices[i-1];
#endif
        if (cellIndex != prevCellIndex)
        {
            if (cellIsValid)
                out_cellsStartIndex[cellIndex] = i;

            // Close the previous cell even when the current item is invalid, otherwise
            // the last valid cell before a run of invalid items would never get an end.
            if (prevCellIndex < uint(nCells))
                out_cellsEndIndex[prevCellIndex] = i;
        }
    }

    // the last item of the range closes its own cell.
    if (cellIsValid && i == start+count-1)
    {
        out_cellsEndIndex[cellIndex] = i+1;
    }
}

//------------------------------------------------------------------------------------------
/// Find the start item-index of each cell by comparing every item's cellIndex with
/// the cellIndex of the item before it.
///
/// One thread handles one item of the range [start, start+count).
/// Assumes items belonging to the same cell are contiguous in itemCellIndices
/// (i.e. the list has been sorted by cell) -- TODO confirm against callers.
///
/// @param start                first item index of the range to process.
/// @param count                number of items in the range (threads beyond it exit).
/// @param out_cellsStartIndex  per-cell index of the first item in the cell; only cells
///                             that contain items are written.
/// @param itemCellIndices      cell index of every item. Without THRUST/CUDPP the buffer is
///                             read as uint2 pairs with the cell index in the .x component.
///
__global__
void computeCellStartKernel(int start, int count, uint *out_cellsStartIndex, uint* itemCellIndices)
{
    int i = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (i >= count)
        return;
    i += start;

#if !defined(NVPARTICLES_SPATIAL_GRID_USE_CUDPP) && !defined(NVPARTICLES_SPATIAL_GRID_USE_THRUST)
    uint cellIndex = ((uint2*)itemCellIndices)[i].x;
#else
    uint cellIndex = itemCellIndices[i];
#endif

    // An item flagged with uint(-1) has no cell; using it as a table index would
    // write far outside out_cellsStartIndex, so skip it (same guard as
    // computeCellRangesKernel).
    if (cellIndex == uint(-1))
        return;

    if (i == start)
    {
        // first item of the range: there is no predecessor to compare with.
        out_cellsStartIndex[cellIndex] = i;
    }
    else // i > start
    {
#if !defined(NVPARTICLES_SPATIAL_GRID_USE_CUDPP) && !defined(NVPARTICLES_SPATIAL_GRID_USE_THRUST)
        volatile uint prevCellIndex = ((uint2*)itemCellIndices)[i-1].x;
#else
        volatile uint prevCellIndex = itemCellIndices[i-1];
#endif
        // a change of cell index marks the first item of a new cell.
        if (cellIndex != prevCellIndex)
        {
            out_cellsStartIndex[cellIndex] = i;
        }
    }
}

//------------------------------------------------------------------------------------------
/// Host entry point: mark every cell as unused and then launch
/// computeCellRangesKernel over the items [start, start+count) on the given stream.
///
/// @param start                first item index to process.
/// @param count                number of items; nothing is done when count <= 0.
/// @param itemCellIndices      device buffer holding the cell index of every item.
/// @param nCells               number of entries in out_cellsStartIndex that get cleared.
/// @param out_cellsStartIndex  device table receiving the first item index of each cell.
/// @param out_cellsEndIndex    device table receiving the end item index of each cell.
/// @param stream               stream used for the memset and the kernel launch.
///
/// NOTE(review): only out_cellsStartIndex is cleared here, and it is not cleared at all
/// when count <= 0 -- confirm that callers do not read stale entries in that case.
///
extern "C"
void gridComputeCellRanges(int start, int count, uint* itemCellIndices, uint nCells, uint* out_cellsStartIndex, uint* out_cellsEndIndex, cudaStream_t stream)
{
    if (count > 0)
    {
        assert(nCells>0);
        assert(itemCellIndices);
        assert(out_cellsStartIndex);
        assert(out_cellsEndIndex);

        // launch layout: blocks of up to 256 threads covering `count` items.
        uint threadsPerBlock, blockCount;
        computeGridSize(count, 256, blockCount, threadsPerBlock);

        // every byte set to 0xFF makes each uint entry equal to uint(-1), the marker for an unused cell.
        NVPARTICLES_CUDA_SAFE_CALL(cudaMemsetAsync(out_cellsStartIndex, -1, nCells*sizeof(uint), stream));

        computeCellRangesKernel<<< blockCount, threadsPerBlock, 0, stream >>>(start, count, nCells, out_cellsStartIndex, out_cellsEndIndex, itemCellIndices);
        NVPARTICLES_CUDA_CHECK_ERROR("Kernel execution failed: computeCellRangesKernel");
    }
}

//------------------------------------------------------------------------------------------
/// Host entry point: mark every cell as unused and then launch
/// computeCellStartKernel over the items [start, start+count) on the given stream.
///
/// @param start                first item index to process.
/// @param count                number of items. When count <= 0 the table is still
///                             cleared, but no kernel is launched.
/// @param itemCellIndices      device buffer holding the cell index of every item.
/// @param nCells               number of entries in out_cellsStartIndex.
/// @param out_cellsStartIndex  device table receiving the first item index of each cell.
/// @param stream               stream used for the memset and the kernel launch.
///
extern "C"
void gridComputeCellStarts(int start, int count, uint* itemCellIndices, uint nCells, uint* out_cellsStartIndex, cudaStream_t stream)
{
    assert(nCells>0);
    assert(itemCellIndices);
    assert(out_cellsStartIndex);

    // clear to -1 to mark unused cells
    NVPARTICLES_CUDA_SAFE_CALL(cudaMemsetAsync(out_cellsStartIndex, -1, nCells*sizeof(uint), stream));

    // With no items every cell simply stays unused. Return before sizing the launch:
    // a grid computed for zero items is not a valid kernel configuration, and this was
    // previously only guarded by an assert (i.e. not at all in release builds).
    if (count <= 0)
        return;

    uint numThreads, numBlocks;
    computeGridSize(count, 256, numBlocks, numThreads);

    computeCellStartKernel<<< numBlocks, numThreads, 0, stream >>>(start, count, out_cellsStartIndex, itemCellIndices);
    NVPARTICLES_CUDA_CHECK_ERROR("Kernel execution failed: computeCellStartKernel");
}

//------------------------------------------------------------------------------------------
/// Sort the items by their cellIndex.
///
/// The sorting back-end is chosen at compile time:
///  - CUDPP:   waits for `stream`, then calls cudppSort on the key/value buffers.
///  - THRUST:  waits for `stream`, then calls thrust::sort_by_key with
///             data.itemCellIndices as keys and data.sortedItemCellIndices as values.
///  - default: calls RadixSort on data.itemCellIndices interpreted as KeyValuePair
///             entries, enqueued on `stream`.
///
/// @param start     first item index of the range to sort.
/// @param count     number of items to sort.
/// @param maxItems  offset (in KeyValuePair entries) of the second buffer handed to
///                  RadixSort; presumably the ping-pong/temporary storage -- TODO confirm.
///                  Unused by the CUDPP and THRUST paths.
/// @param maxCells  number of cells; only used by the CUDPP path to derive the number of
///                  significant key bits.
/// @param data      grid buffers holding the keys (and values) to sort.
/// @param stream    stream the sort is ordered against (see the back-end notes above).
///
void gridSort(int start, int count, int maxItems, int maxCells, SpatialGridData data, cudaStream_t stream)
{
#if defined(NVPARTICLES_SPATIAL_GRID_USE_CUDPP)
    // Number of key bits needed to tell maxCells cells apart: the smallest b with
    // 2^b >= maxCells. Done with integers because float(maxCells) is inexact above
    // 2^24 and log2() of a non-positive value is not a usable bit count.
    const unsigned long long cellCount = (maxCells > 0) ? (unsigned long long)maxCells : 1ull;
    int sortBits = 0;
    while (sortBits < 32 && (1ull << sortBits) < cellCount)
        ++sortBits;

    NVPARTICLES_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
    cudppSort(sortHandle, data.itemCellIndices + start, data.sortedItemCellIndices + start, sortBits, count);
#elif defined(NVPARTICLES_SPATIAL_GRID_USE_THRUST)
    // the sort below is not issued on `stream`, so wait for pending work on it first.
    NVPARTICLES_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
    thrust::device_ptr<uint> d_sortedIndicesPtr((uint *)data.sortedItemCellIndices + start);
    thrust::device_ptr<uint> d_bucketIdsPtr((uint *)data.itemCellIndices + start);
    thrust::sort_by_key(d_bucketIdsPtr, d_bucketIdsPtr + count, d_sortedIndicesPtr);
#else
    /// CAVEAT:
    // sort bits must be even so that the ping-ponging ends up with the result in the first buffer!
    // Rather than rounding the required bit count up to an even number, the full
    // 32-bit key is always sorted.
    const int sortBits = 32;
    RadixSort((KeyValuePair *)data.itemCellIndices + start, ((KeyValuePair*)data.itemCellIndices) + maxItems, count, sortBits, stream);
#endif
}

//------------------------------------------------------------------------------------------
/// Reorder one buffer according to the ordering stored in spatialGridData.
///
/// One thread per output element: element `dst` of outBuffer receives the element of
/// inBuffer whose index is given by
/// NVPARTICLES_SPATIAL_GRID_GET_SORTED_ITEM_INDEX(spatialGridData, dst).
///
/// @param numElements      number of elements to write; surplus threads do nothing.
/// @param inBuffer         source elements (read at the looked-up index).
/// @param outBuffer        destination elements (written at the thread's own index).
/// @param spatialGridData  grid data providing the sorted item indices.
///
template <class T>
__global__
void sortBufferKernel(int numElements, T* inBuffer, T* outBuffer, SpatialGrid::SpatialGridData spatialGridData )
{
    const uint dstIndex = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (dstIndex < numElements)
    {
        // gather: look up which source element belongs at this position.
        const uint srcIndex = NVPARTICLES_SPATIAL_GRID_GET_SORTED_ITEM_INDEX(spatialGridData, dstIndex);
        outBuffer[dstIndex] = inBuffer[srcIndex];
    }
}

//------------------------------------------------------------------------------------------
/// Reorder a buffer of fixed-size items of arbitrary byte size according to the
/// ordering stored in spatialGridData.
///
/// One thread per item: the thread copies the elementBytes bytes of the source item
/// selected by NVPARTICLES_SPATIAL_GRID_GET_SORTED_ITEM_INDEX into its own slot of
/// outBuffer. Used as the fallback when no typed kernel matches the item size.
///
__global__
void sortBufferBytesKernel(int numElements, int elementBytes, const unsigned char* inBuffer, unsigned char* outBuffer, SpatialGrid::SpatialGridData spatialGridData)
{
    uint i = (blockIdx.x * blockDim.x) + threadIdx.x;
    if(i >= numElements)
        return;
    uint sortedIndex = NVPARTICLES_SPATIAL_GRID_GET_SORTED_ITEM_INDEX(spatialGridData, i);

    // byte offsets are computed in size_t so large buffers cannot overflow 32 bits.
    const unsigned char* src = inBuffer + size_t(sortedIndex) * size_t(elementBytes);
    unsigned char* dst = outBuffer + size_t(i) * size_t(elementBytes);
    for(int b = 0; b < elementBytes; ++b)
        dst[b] = src[b];
}

//------------------------------------------------------------------------------------------
/// Host entry point: reorder `numElements` items of `elementBytes` bytes each from
/// inData into outData using the ordering stored in spatialGridData.
///
/// Items of sizeof(float4) and sizeof(float) bytes are moved with typed kernels; any
/// other item size is copied byte by byte, one whole item per thread.
///
/// @param numElements      number of items; nothing is done when it is 0.
/// @param elementBytes     size of one item in bytes (must be > 0).
/// @param inData           device buffer with the unsorted items.
/// @param outData          device buffer receiving the reordered items.
/// @param spatialGridData  grid data providing the sorted item indices.
/// @param stream           stream the kernel is launched on.
///
extern "C"
void gridSortBuffer(int numElements, int elementBytes, void* inData, void* outData, SpatialGrid::SpatialGridData spatialGridData, cudaStream_t stream)
{
    if(numElements == 0)
        return;
    assert(inData && outData && elementBytes>0 && numElements>0);

    // every path uses one thread per item, so a single launch layout serves all of them.
    uint numThreads, numBlocks;
    computeGridSize(numElements, 256, numBlocks, numThreads);

    if(elementBytes == sizeof(float4))
    {
        sortBufferKernel<float4> <<< numBlocks, numThreads, 0, stream>>> (
                numElements,
                (float4*)inData,
                (float4*)outData,
                spatialGridData);
        NVPARTICLES_CUDA_CHECK_ERROR("Kernel execution failed: sortBufferKernel<float4>");
    }
    else if(elementBytes == sizeof(float))
    {
        sortBufferKernel<float> <<< numBlocks, numThreads, 0, stream>>> (
                numElements,
                (float*)inData,
                (float*)outData,
                spatialGridData);
        NVPARTICLES_CUDA_CHECK_ERROR("Kernel execution failed: sortBufferKernel<float>");
    }
    else
    {
        // Generic item size: move each item as a run of elementBytes bytes.
        // (Reusing sortBufferKernel<unsigned char> here would treat byte offsets as item
        // indices and copy single bytes from the wrong place.)
        sortBufferBytesKernel <<< numBlocks, numThreads, 0, stream>>> (
                numElements,
                elementBytes,
                (const unsigned char*)inData,
                (unsigned char*)outData,
                spatialGridData);
        NVPARTICLES_CUDA_CHECK_ERROR("Kernel execution failed: sortBufferBytesKernel");
    }
}

//------------------------------------------------------------------------------------------
}
}
}
