/*
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

#include "cuda_utils.h"
#include "NvParticlesConfig.h"
#include "CuBuffer.h"

namespace Easy
{
namespace NvParticles
{

//------------------------------------------------------------------------------------------
// Block-wise tree reduction.
//
// Each thread block combines up to blockDim.x consecutive elements of g_idata
// with ReductionFunc and writes a single partial result to g_odata[blockIdx.x].
// Blocks that lie entirely beyond the n valid elements write nothing.
//
// Launch requirements:
//   - 1D grid of 1D blocks covering at least n threads in total.
//   - Dynamic shared memory of at least blockDim.x * sizeof(T) bytes.
//
// ReductionFunc must be default-constructible and callable on the device as
// T operator()(T, T). Elements are combined pairwise in index order.
//
// g_idata and g_odata may alias only if no other block of the same launch
// reads the element that this block writes (g_odata[blockIdx.x]).
template<class T, class ReductionFunc>
__global__ 
void computeReductionKernel(int n, T* g_idata, T* g_odata)
{
    // The dynamic shared array is declared as raw bytes and then cast: an
    // "extern __shared__ T name[]" inside a template clashes as soon as the
    // kernel is instantiated for two different T in one translation unit.
    extern __shared__ __align__(16) unsigned char reductionSharedBytes[];
    T* sdata = reinterpret_cast<T*>(reductionSharedBytes);

    const unsigned int tid = threadIdx.x;
    const unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    // Same signed-to-unsigned conversion the comparisons against n have always used.
    const unsigned int count = static_cast<unsigned int>(n);

    // Each in-range thread loads one element from global to shared memory.
    // Out-of-range threads must not touch g_idata, but they stay alive so that
    // every thread of the block reaches every barrier below.
    if (i < count)
        sdata[tid] = g_idata[i];
    __syncthreads();

    // Reduce in shared memory: at step s, thread tid folds in the partial
    // result held by thread tid+s.
    for (unsigned int s=1; s < blockDim.x; s *= 2)
    {
        const bool isCollector = (tid % (2*s)) == 0;
        // The partner must exist inside this block and inside the valid range,
        // otherwise we would collect data that was never loaded.
        const bool partnerValid = ((tid + s) < blockDim.x) && ((i + s) < count);

        if (isCollector && partnerValid)
        {
            sdata[tid] = ReductionFunc()(sdata[tid], sdata[tid + s]);
        }
        __syncthreads();
    }

    // Write the result for this block (only if the block held any valid data).
    if (tid == 0 && i < count)
        g_odata[blockIdx.x] = sdata[0];
}

//------------------------------------------------------------------------------------------
// Reduce n elements of d_idata to a single value using ReductionFunc.
//
// n         - number of input elements; nothing is done (and resultPtr is left
//             untouched) when n is 0.
// d_idata   - device pointer to the n input elements (not modified).
// d_odata   - device scratch buffer; must hold at least iDivUp(n, 128) elements,
//             one partial result per block of the first pass. Its contents are
//             overwritten.
// resultPtr - device pointer that receives the final reduced value
//             (copied device-to-device).
// stream    - stream on which all kernels and the final copy are enqueued; the
//             function does not synchronize, so the result is only valid once
//             the work queued on this stream has completed.
//
// Inputs smaller than one block go through the same path as large ones; the
// kernel bounds-checks every access against n.
template<class T, class ReductionFunc>
void computeReduction_CUDA(uint n, T* d_idata, T* d_odata, T* resultPtr, cudaStream_t stream=0)
{
    if (n == 0)
        return;

    uint numThreads = 128; // block size (power of two)
    uint smemSize = numThreads * sizeof(T);

    // First pass: d_idata -> one partial result per block in d_odata.
    uint numBlocks = iDivUp(n, numThreads);
    computeReductionKernel<T, ReductionFunc> <<< numBlocks, numThreads, smemSize, stream >>>(n, d_idata, d_odata);
    NVPARTICLES_CUDA_CHECK_ERROR("computeReductionKernel");
    n = numBlocks;

    // Remaining passes reduce the partial results in place inside d_odata.
    while (n > 1)
    {
        numBlocks = iDivUp(n, numThreads);

        // Block b reads d_odata[b*numThreads .. b*numThreads+numThreads-1] and
        // writes d_odata[b]. Running all blocks in one launch would let block b
        // overwrite an element that block b/numThreads has not read yet.
        // Instead the pass is issued as stream-ordered launches over the block
        // ranges [0,1), [1,numThreads), [numThreads,numThreads^2), ...
        // Within a range [lo,hi) the writes land in [lo,hi) while the reads
        // start at lo*numThreads >= hi, and earlier ranges have already
        // finished reading, so no launch can clobber data that is still needed.
        uint blockLo = 0;
        uint blockHi = 1;
        while (blockLo < numBlocks)
        {
            if (blockHi > numBlocks)
                blockHi = numBlocks;

            uint firstElement = blockLo * numThreads;

            computeReductionKernel<T, ReductionFunc> <<< blockHi - blockLo, numThreads, smemSize, stream >>>(
                n - firstElement, d_odata + firstElement, d_odata + blockLo);
            NVPARTICLES_CUDA_CHECK_ERROR("computeReductionKernel");

            blockLo = blockHi;
            blockHi = blockLo * numThreads;
        }

        n = numBlocks;
    }

    // The final value now sits in d_odata[0]; hand it to the caller.
    NVPARTICLES_CUDA_SAFE_CALL( cudaMemcpyAsync( resultPtr, d_odata, sizeof(T), cudaMemcpyDeviceToDevice, stream));
}

//------------------------------------------------------------------------------------------
}
}
