/*
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

#include "cuda_utils.h"
#include "math_utils.h"

#include "NvParticlesConfig.h"
#include "NvParticlesTypes.h"
#include "NvParticlesReduceCuda.h"

namespace Easy
{
namespace NvParticles
{

//------------------------------------------------------------------------------------------
struct ReduceMinFunc
{
    /// Binary reduction operator: per-component minimum of x, y and z.
    ///
    /// An operand whose w component compares equal to zero is skipped and the
    /// other operand is returned untouched. `a` is examined first, so when both
    /// operands have w == 0 the result is `b`. When both operands are kept,
    /// the result inherits its w component from `a`.
    inline NVPARTICLES_CUDA_EXPORT
    float4 operator()(float4 a, float4 b)
    {
        const bool skipA = (a.w == 0);
        const bool skipB = (b.w == 0);
        if(skipA || skipB)
            return skipA ? b : a;

        // Start from a copy of `a` so that w is carried over, then lower x/y/z.
        float4 lowest = a;
        lowest.x = ::fminf(a.x, b.x);
        lowest.y = ::fminf(a.y, b.y);
        lowest.z = ::fminf(a.z, b.z);
        return lowest;
    }
};

//------------------------------------------------------------------------------------------
struct ReduceMaxFunc
{
    /// Binary reduction operator: per-component maximum of x, y and z.
    ///
    /// An operand whose w component compares equal to zero is skipped and the
    /// other operand is returned untouched. `a` is examined first, so when both
    /// operands have w == 0 the result is `b`. When both operands are kept,
    /// the result inherits its w component from `a`.
    inline NVPARTICLES_CUDA_EXPORT
    float4 operator()(float4 a, float4 b)
    {
        const bool skipA = (a.w == 0);
        const bool skipB = (b.w == 0);
        if(skipA || skipB)
            return skipA ? b : a;

        // Start from a copy of `a` so that w is carried over, then raise x/y/z.
        float4 highest = a;
        highest.x = ::fmaxf(a.x, b.x);
        highest.y = ::fmaxf(a.y, b.y);
        highest.z = ::fmaxf(a.z, b.z);
        return highest;
    }
};

//------------------------------------------------------------------------------------------
/// Compute the per-axis minimum and maximum (x, y, z) of an array of n float4
/// positions. Elements whose w component is exactly zero are ignored by
/// ReduceMinFunc / ReduceMaxFunc.
///
/// Two code paths exist:
///  - n < 128: `stream` is synchronized and the reduction is done on the host.
///    h_outMin[0..2] and h_outMax[0..2] are valid when the function returns;
///    d_outMin, d_outMax and tempBuf are not used. If n <= 0, or no element has
///    a non-zero w, both outputs are set to (0, 0, 0).
///  - n >= 128: two reductions are issued on `stream`, followed by asynchronous
///    copies of sizeof(float4) bytes into h_outMin and h_outMax. The caller must
///    synchronize `stream` before reading the host outputs.
///
/// NOTE: the device path writes four floats to each host output while the host
/// path writes three, so h_outMin / h_outMax must have room for a full float4.
///
/// @param h_outMin     host destination for the minimum.
/// @param h_outMax     host destination for the maximum.
/// @param d_outMin     device destination for the minimum (float4), n >= 128 only.
/// @param d_outMax     device destination for the maximum (float4), n >= 128 only.
/// @param n            number of float4 elements in d_positions.
/// @param d_positions  positions, interpreted as n float4 values.
/// @param tempBuf      scratch storage handed to computeReduction_CUDA (shared
///                     by both reductions; they are ordered by `stream`).
/// @param stream       stream on which all work is issued.
extern "C"
void calculateBoundsAsync_CUDA(float* h_outMin, float* h_outMax,
                                float* d_outMin, float* d_outMax,
                                int n, float* d_positions, float* tempBuf,
                                cudaStream_t stream)
{
    if(n < 128)
    {
        /// HACK:
        // until we fix the code in computeReduction_CUDA to handle size < 128... do it on the host.

        // Wait for any pending work on the stream before the positions are read.
        cudaStreamSynchronize(stream);

        // Fully initialise the accumulators: w == 0 marks them as "empty" for the
        // reduce functors, and x/y/z must be defined in case nothing is accumulated.
        float4 h_min;
        h_min.x = 0.0f; h_min.y = 0.0f; h_min.z = 0.0f; h_min.w = 0.0f;
        float4 h_max;
        h_max.x = 0.0f; h_max.y = 0.0f; h_max.z = 0.0f; h_max.w = 0.0f;

        // Skip the mapping entirely for an empty (or invalid negative) count;
        // a negative n would otherwise wrap to a huge byte size.
        if(n > 0)
        {
            Cu::Buffer d_pos(Cu::Buffer::CUDA, n*sizeof(float4), d_positions);
            Cu::BufferMapper<float4> h_pos(Cu::Buffer::HOST, d_pos);

            for(int i=0; i<n; ++i)
            {
                h_min = ReduceMinFunc()(h_pos[i], h_min);
                h_max = ReduceMaxFunc()(h_pos[i], h_max);
            }
        }

        h_outMin[0] = h_min.x;
        h_outMin[1] = h_min.y;
        h_outMin[2] = h_min.z;
        h_outMax[0] = h_max.x;
        h_outMax[1] = h_max.y;
        h_outMax[2] = h_max.z;
        return;
    }

    // Both reductions reuse tempBuf; issuing them on the same stream serialises them.
    computeReduction_CUDA<float4, ReduceMinFunc>(n, (float4*)d_positions, (float4*)tempBuf, (float4*)d_outMin, stream);
    computeReduction_CUDA<float4, ReduceMaxFunc>(n, (float4*)d_positions, (float4*)tempBuf, (float4*)d_outMax, stream);

    // Queue the read-backs behind the reductions; results are valid only after
    // the caller synchronizes the stream.
    cudaMemcpyAsync(h_outMin, d_outMin, sizeof(float4), cudaMemcpyDeviceToHost, stream);
    cudaMemcpyAsync(h_outMax, d_outMax, sizeof(float4), cudaMemcpyDeviceToHost, stream);
}

//------------------------------------------------------------------------------------------
}
}
