/*
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

#include "NvParticlesGridCudaInline.h"

//------------------------------------------------------------------------------------------
/// Parameter block describing the field; uploadFieldParameters() copies it
/// into the device symbol d_fieldParameters.
struct FieldParameters
{
    // NOTE(review): the meaning of the five members below is inferred from
    // their names only (field extents, resolution and bucket range) - confirm
    // against the caller that fills them in.
    float3 fieldMin;
    float3 fieldMax;
    uint3 fieldCount;
    uint3 bucketMin;
    uint3 bucketMax;

    /// Values derived from the radius by uploadFieldParameters(), where
    /// a, b and c are the polynomial coefficients defined in that function.
    struct
    {
        float radius2;          ///< radius^2
        float a_invRadius6;     ///< a / radius^6
        float b_invRadius4;     ///< b / radius^4
        float c_invRadius2;     ///< c / radius^2
        float a6_invRadius6;    ///< 6 * a / radius^6
        float b4_invRadius4;    ///< 4 * b / radius^4
        float c2_invRadius2;    ///< 2 * c / radius^2
    } cache;
};

// Device-side copy of the field parameters, declared in constant memory.
__device__ __constant__ FieldParameters d_fieldParameters;
// Host-side copy: uploadFieldParameters() fills this in (including the derived
// cache values) and then copies it to d_fieldParameters.
static FieldParameters h_fieldParameters;

//------------------------------------------------------------------------------------------
/// Copies *parameters into the host-side h_fieldParameters, fills in the
/// radius-derived cache values and queues an asynchronous copy of the whole
/// structure to the device symbol d_fieldParameters.
///
/// @param radius      radius the cached polynomial terms are derived from.
/// @param parameters  source parameter block; dereferenced unconditionally.
/// @param stream      stream the host-to-device copy is issued on.
static void uploadFieldParameters(float radius, const FieldParameters* parameters, cudaStream_t stream=0)
{
    // polynomial coefficients of the field function.
    const float coeffA = -0.444444f;
    const float coeffB = 1.888889f;
    const float coeffC = -2.444444f;

    // even powers of the radius.
    const float r2 = radius * radius;
    const float r4 = r2 * r2;
    const float r6 = r4 * r2;

    h_fieldParameters = *parameters;

    // precalculate some constants: each coefficient divided by its power of the radius...
    h_fieldParameters.cache.radius2 = r2;
    h_fieldParameters.cache.a_invRadius6 = coeffA / r6;
    h_fieldParameters.cache.b_invRadius4 = coeffB / r4;
    h_fieldParameters.cache.c_invRadius2 = coeffC / r2;

    // ...and the same terms scaled by 6, 4 and 2.
    h_fieldParameters.cache.a6_invRadius6 = 6.0f * h_fieldParameters.cache.a_invRadius6;
    h_fieldParameters.cache.b4_invRadius4 = 4.0f * h_fieldParameters.cache.b_invRadius4;
    h_fieldParameters.cache.c2_invRadius2 = 2.0f * h_fieldParameters.cache.c_invRadius2;

    NVPARTICLES_CUDA_SAFE_CALL( cudaMemcpyToSymbolAsync(d_fieldParameters, &h_fieldParameters, sizeof(FieldParameters), 0, cudaMemcpyHostToDevice, stream));
}

//-----------------------------------------------------------------------------------
/// Evaluates the polynomial falloff contributed by a single particle.
///
/// With r = |position_ij| and R = radius, the value is
///     f(r) = a*(r/R)^6 + b*(r/R)^4 + c*(r/R)^2 + 1    for r < R
///     f(r) = 0                                        otherwise
/// using a = -0.444444, b = 1.888889, c = -2.444444 (so f is ~0 at r == R).
///
/// @tparam USE_NORMALS  when true the gradient of f with respect to
///                      position_ij is written to xyz; when false xyz stay zero.
/// @param position_ij   offset from the particle to the sample point.
/// @param radius        support radius R of the falloff.
/// @return the gradient in xyz and the value in w.
///
template <bool USE_NORMALS>
inline NVPARTICLES_CUDA_EXPORT
float4 metaball(const float3 position_ij, const float radius)
{
    float4 field = make_float4(0.f);

    const float radiusPow2 = radius * radius;
    const float rPow2 = dot(position_ij, position_ij);

    if (rPow2 < radiusPow2)
    {
        // the coefficients are only needed once the sample is inside the support.
        const float a = -0.444444f;
        const float b = 1.888889f;
        const float c = -2.444444f;
        const float radiusPow4 = radiusPow2 * radiusPow2;
        const float radiusPow6 = radiusPow4 * radiusPow2;
        const float a_invRadiusPow6 = a / radiusPow6;
        const float b_invRadiusPow4 = b / radiusPow4;
        const float c_invRadiusPow2 = c / radiusPow2;

        if (USE_NORMALS)
        {
            const float a6_invRadiusPow6 = 6.0f * a_invRadiusPow6;
            const float b4_invRadiusPow4 = 4.0f * b_invRadiusPow4;
            const float c2_invRadiusPow2 = 2.0f * c_invRadiusPow2;

            // grad f = (6a*r^4/R^6 + 4b*r^2/R^4 + 2c/R^2) * position_ij.
            // The factor has to be evaluated at r^2 (rPow2): evaluating it at
            // R^2 gives (6a + 4b + 2c)/R^2, which is ~0 for these coefficients
            // and would make the gradient vanish everywhere.
            const float a6b4c2 = ( (a6_invRadiusPow6 * rPow2 + b4_invRadiusPow4) * rPow2 + c2_invRadiusPow2);
            field.x = a6b4c2 * position_ij.x;
            field.y = a6b4c2 * position_ij.y;
            field.z = a6b4c2 * position_ij.z;
        }

        const float rPow4 = rPow2 * rPow2;
        const float rPow6 = rPow2 * rPow4;
        field.w += (rPow6 * a_invRadiusPow6) + (rPow4 * b_invRadiusPow4) + (rPow2 * c_invRadiusPow2) + 1.f;
    }

    return field;
}

//-----------------------------------------------------------------------------------
/// Neighbour-iteration functor that sums metaball() over the visited particles
/// and writes the result as one uchar4 per index.
///
/// @tparam USE_NORMALS  when true, xyz of the output hold the encoded
///                      normalised gradient; when false they repeat the value.
template <bool USE_NORMALS>
struct ComputeFieldIterator : public IteratorFunctorBase3<float4, true>
{
    typedef ComputeFieldIterator Iter;

    float4* positions;  ///< particle positions, fetched by neighbour index.
    uchar4 *outField;   ///< output, one element written per index_i in post().
    float4 field;       ///< accumulator: gradient in xyz, value in w.
    float radius;       ///< radius handed to metaball().

    /// Clears the accumulator.
    static NVPARTICLES_CUDA_EXPORT
    void pre(Iter &it, uint const &index_i)
    {
        it.field = make_float4(0);
    }

    /// Adds the contribution of particle index_j at the sample position_i.
    /// Always returns true (presumably "keep iterating" - confirm against iterateNeibs).
    static NVPARTICLES_CUDA_EXPORT
    bool item(Iter &it, uint const &index_i, uint const &index_j, float3 const &position_i)
    {
        float3 position_j = make_float3(NVPARTICLES_SYSTEM_FETCH(it, positions, index_j));
        // boundary mode type 2 shifts the neighbour by the iterator's periodic displacement.
        if(NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_TYPE(NVPARTICLES_SYSTEM_PARAM(boundaryMode)) == 2)
            position_j += it.periodicDisplacement;

        float3 position_ij = (position_i - position_j);
        position_ij *= NVPARTICLES_SYSTEM_PARAM(invScale);

        it.field += metaball<USE_NORMALS>(position_ij, it.radius);

        return true;
    }

    /// Quantises the accumulated field and stores it in outField[index_i].
    /// The value is clamped to [0,1] and scaled by 255; each normal component
    /// n is stored as (n + 1) * 127.
    static NVPARTICLES_CUDA_EXPORT
    void post(Iter &it, uint index_i)
    {
        const unsigned char value = (unsigned char)(max(0.f, min(1.f, it.field.w)) * 255.f);

        // assemble the result locally so that it is written with a single
        // store and never read back from the output buffer.
        uchar4 packed;
        packed.w = value;

        if (USE_NORMALS)
        {
            const float3 gradient = make_float3(it.field.x, it.field.y, it.field.z);

            // a zero-length gradient (e.g. no contributing particle) cannot be
            // normalised - that would produce NaN; store a zero normal instead.
            float3 norm = make_float3(0.f, 0.f, 0.f);
            if (dot(gradient, gradient) > 0.f)
                norm = normalize(gradient);

            packed.x = (unsigned char)((norm.x + 1.f) * 127.f);
            packed.y = (unsigned char)((norm.y + 1.f) * 127.f);
            packed.z = (unsigned char)((norm.z + 1.f) * 127.f);
        }
        else
        {
            packed.x = value;
            packed.y = value;
            packed.z = value;
        }

        it.outField[index_i] = packed;
    }
};

//-----------------------------------------------------------------------------------
/// Evaluates the field at the centre of each grid cell, one thread per cell.
///
/// Expects a 1D launch covering at least nFieldCells threads; surplus threads
/// do nothing. The result for a cell is written by ComputeFieldIterator<false>.
__global__
void computeFieldKernel(uint nFieldCells, float4* positions, uchar4* outField, float radius,
                                SpatialGrid::SpatialGridData const accel
                               )
{
    typedef ComputeFieldIterator<false> Iter;

    uint cellIndex = threadIdx.x + (blockDim.x * blockIdx.x);

    if (cellIndex < nFieldCells)
    {
        Iter it;
        it.radius = radius; // this must be the smoothingRadius.
        it.outField = outField;
        it.positions = positions;

        // position of the cell, moved by half a cell along each axis to its centre.
        float4 center = Iter::cellToPos(Iter::hashToCell(cellIndex));
        center.x += NVPARTICLES_SPATIAL_GRID_PARAM(cellSize).x / 2;
        center.y += NVPARTICLES_SPATIAL_GRID_PARAM(cellSize).y / 2;
        center.z += NVPARTICLES_SPATIAL_GRID_PARAM(cellSize).z / 2;

        float3 samplePoint = make_float3(center.x, center.y, center.z);

        /// CAVEAT:
        // note that we can't use the pre-calculated neibs because there might not be a particle in cell[index]
        iterateNeibs<Iter, true, 1>(it, cellIndex, samplePoint, accel);
    }
}

//-----------------------------------------------------------------------------------
/// Launches computeFieldKernel over nFieldCells cells on the given stream.
///
/// @param nFieldCells   number of cells to evaluate; nothing is launched when it is <= 0.
/// @param radius        radius forwarded to the kernel.
/// @param positions     particle positions forwarded to the kernel.
/// @param outFieldData  output buffer forwarded to the kernel; nothing is launched when it is null.
/// @param accel         spatial grid data forwarded to the kernel.
/// @param parameters    currently unused.
/// @param stream        stream the kernel is launched on.
void computeField_CUDA(int nFieldCells, float radius, float4* positions, uchar4* outFieldData,
                    SpatialGrid::SpatialGridData accel,
                    const InternalParameters* parameters, cudaStream_t stream)
{
    (void)parameters; // not needed by the kernel.

    // A negative count must be rejected as well as zero: the kernel receives
    // the count as a uint, so a negative value would wrap to a huge one.
    // The output check matches computeHeightField_CUDA.
    if(nFieldCells <= 0 || outFieldData == 0)
        return;

    uint nThreads, nBlocks;
    computeGridSize(nFieldCells, 256, nBlocks, nThreads);

    computeFieldKernel<<<nBlocks, nThreads, 0, stream>>>(
        nFieldCells,
        positions,
        outFieldData,
        radius,
        accel
    );
    NVPARTICLES_CUDA_CHECK_ERROR("Kernel execution failed: computeFieldKernel");
}

//-----------------------------------------------------------------------------------
/// Neighbour-iteration functor that computes a weighted average of the y
/// coordinate of the surface-tagged particles within radius of the sample.
struct ComputeHeightFieldIterator : public IteratorFunctorBase3<float4, true>
{
    typedef ComputeHeightFieldIterator Iter;

    float4* positions;  ///< particle positions, fetched by neighbour index.
    uint* tags;         ///< per-particle tag bits, fetched by neighbour index.
    float avgHeight;    ///< weighted height sum; the average (or 999999) after post().
    float w;            ///< sum of the weights.
    float radius;       ///< cut-off radius of the weighting.

    /// Clears both accumulators.
    static NVPARTICLES_CUDA_EXPORT
    void pre(Iter &it, uint const &index_i)
    {
        // we are getting the minimum surface.
        it.w = 0;
        it.avgHeight = 0;
    }

    /// Accumulates particle index_j if it carries the surface tag and lies
    /// within the radius. Always returns true.
    static NVPARTICLES_CUDA_EXPORT
    bool item(Iter &it, uint const &index_i, uint const &index_j, float3 const &position_i)
    {
        const uint tag_j = NVPARTICLES_SYSTEM_FETCH(it, tags, index_j);

        // only surface particles contribute.
        if (!(tag_j&NVPARTICLES_WCSPH_TAG_SURFACE))
            return true;

        float3 neighbor = make_float3(NVPARTICLES_SYSTEM_FETCH(it, positions, index_j));
        // boundary mode type 2 shifts the neighbour by the iterator's periodic displacement.
        if(NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_TYPE(NVPARTICLES_SYSTEM_PARAM(boundaryMode)) == 2)
            neighbor += it.periodicDisplacement;

        float3 delta = (position_i - neighbor);
        delta *= NVPARTICLES_SYSTEM_PARAM(invScale);

        const float dist2 = dot(delta, delta);
        const float radius2 = it.radius*it.radius;

        if (dist2 < radius2)
        {
            // weight = (1 - d^2/R^2)^3
            const float falloff = 1.0f - dist2 / radius2;
            const float weight = falloff*falloff*falloff;

            it.avgHeight += neighbor.y * weight;
            it.w += weight;
        }

        return true;
    }

    /// Turns the weighted sum into an average, or into the sentinel 999999
    /// when nothing was accumulated.
    static NVPARTICLES_CUDA_EXPORT
    void post(Iter &it, uint index_i)
    {
        if (it.w != 0)
            it.avgHeight /= it.w; // normalize.
        else
            it.avgHeight = 999999; /// just in case there were no particles; this needs to be a different value!
    }
};

//-----------------------------------------------------------------------------------
/// Computes, for every (x, z) column of grid cells, the minimum of the
/// per-cell heights produced by ComputeHeightFieldIterator, and smooths it
/// into the output field.
///
/// Expects a 1D launch covering bucketCount.x * bucketCount.z threads, one
/// per column. field is indexed in two consecutive planes of that size: the
/// first is passed to criticallyDampedSmooth as the value, the second as its delta.
__global__
void computeHeightFieldKernel(float4* positions, uint* tags, float* field, float radius,
                                SpatialGrid::SpatialGridData const accel
                               )
{
    typedef ComputeHeightFieldIterator Iter;

    uint cellIndex = threadIdx.x + (blockDim.x * blockIdx.x);
    if (cellIndex >= NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).x*NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).z)
        return;

    // column coordinates of this thread.
    int cx = cellIndex % NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).x;
    int cz = cellIndex / NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).x;

    Iter it;
    it.radius = radius;
    it.tags = tags;
    it.positions = positions;

    float lowest = 999999;

    // for each horizontal layer of cells...
    for (int cy=0; cy<NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).y; ++cy)
    {
        // position of the cell, moved by half a cell along each axis to its centre.
        float4 center = Iter::cellToPos(make_int3(cx, cy, cz));
        center.x += NVPARTICLES_SPATIAL_GRID_PARAM(cellSize).x / 2;
        center.y += NVPARTICLES_SPATIAL_GRID_PARAM(cellSize).y / 2;
        center.z += NVPARTICLES_SPATIAL_GRID_PARAM(cellSize).z / 2;

        /// CAVEAT:
        // note that we can't use the pre-calculated neibs because there might not be a particle in cell[index]
        //iterateNeibs<Iter, true, 1>(it, 0, make_float3(cellCenter), accel);

        // just get the particles inside this cell.
        iterateNeibs<Iter, false, 0>(it, 0, make_float3(center.x, center.y, center.z), accel);

        lowest = min(lowest, it.avgHeight);
    }

    // the sentinel is still in place: no layer produced a height, leave the field untouched.
    if (lowest == 999999)
        return;

    float& minSurfaceHeight = field[cx+cz*NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).x];
    float& minSurfaceHeightDelta = field[cx+cz*NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).x + (NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).x*NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).z)];

    // NOTE(review): the meaning of the trailing arguments (4, 1) is not visible here - confirm.
    criticallyDampedSmooth(minSurfaceHeight, minSurfaceHeightDelta, lowest, 4, 1);
}

//-----------------------------------------------------------------------------------
/// Launches computeHeightFieldKernel with one thread per (x, z) column of the
/// spatial grid, on the given stream.
///
/// Returns without launching when the grid has no columns or outField is null.
void computeHeightField_CUDA(float4* positions, uint* tags, float* outField, float radius,
                    SpatialGrid::SpatialGridData accel,
                    cudaStream_t stream)
{
    // number of cells in one horizontal layer of the grid.
    int layerCells = NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).x * NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).z;

    if (layerCells == 0)
        return;
    if (outField == 0)
        return;

    uint nBlocks, nThreads;
    computeGridSize(layerCells, 256, nBlocks, nThreads);

    computeHeightFieldKernel<<<nBlocks, nThreads, 0, stream>>>(positions, tags, outField, radius, accel);
    NVPARTICLES_CUDA_CHECK_ERROR("Kernel execution failed: computeHeightFieldKernel");
}

//-----------------------------------------------------------------------------------


