/*
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

#include "NvParticlesIteratorCudaInline.h"
#include "NvParticlesColorMathCudaInline.h"
#include "NvParticlesGridCuda.h"

//------------------------------------------------------------------------------------------
// Simulation parameters as seen by device code. This symbol is filled from
// h_parameters by uploadParticleParameters() using cudaMemcpyToSymbolAsync.
__device__ __constant__	InternalParameters d_parameters;
// Host-side copy of the same parameters (file-local); also holds the derived
// "cache" constants computed in uploadParticleParameters().
static InternalParameters h_parameters;

// NVPARTICLES_SYSTEM_PARAM(x) reads member x from the copy that matches the
// current compilation pass: d_parameters when __CUDA_ARCH__ is defined (device
// code), h_parameters otherwise (host code). This lets host/device inline
// helpers share one spelling for parameter access.
#ifdef __CUDA_ARCH__
#define NVPARTICLES_SYSTEM_PARAM(x) d_parameters.x
#else
#define NVPARTICLES_SYSTEM_PARAM(x) h_parameters.x
#endif

//------------------------------------------------------------------------------------------
// NOTE: the includes below must come after the parameter declarations above.
//#include "particles/NvParticlesParticleStructCudaInline.cc"
#include "NvParticlesGridCudaInline.h"
#include "NvParticlesPrimitivesCudaInline.h"
#include "NvParticlesForcesCudaInline.h"

//-----------------------------------------------------------------------------------
/// Equation-of-state helper: converts a particle density into a pressure.
///
/// The equation used is selected at compile time:
///  - with NVPARTICLES_WCSPH_USE_TAIT_EQUATION: B * ((density / restDensity)^eosTaitPower - 1)
///  - otherwise:                               restPressure + speedOfSound^2 * (density - restDensity)
///
/// All constants are read through NVPARTICLES_SYSTEM_PARAM, so the function
/// uses the device or host parameter copy depending on where it is compiled.
///
inline NVPARTICLES_CUDA_EXPORT
float computePressure(const float density)
{
#if !defined(NVPARTICLES_WCSPH_USE_TAIT_EQUATION)
    // Ideal Gas Equation-of-State (Muller et al.)
    const float pressure =
        NVPARTICLES_SYSTEM_PARAM(restPressure)
        + NVPARTICLES_SYSTEM_PARAM(cache.speedOfSoundPow2) * (density - NVPARTICLES_SYSTEM_PARAM(restDensity));
#else
    // Tait Equation-of-State.
    const float pressure =
        NVPARTICLES_SYSTEM_PARAM(cache.eosTaitConstantB)
        * (powf(density / NVPARTICLES_SYSTEM_PARAM(restDensity), NVPARTICLES_SYSTEM_PARAM(eosTaitPower)) - 1.f);
#endif

    return pressure;
}

//-----------------------------------------------------------------------------------

// Particle tag values.
// NOTE(review): these are not referenced in the code visible in this file;
// presumably they are consumed by the Wcsph_* headers included below - confirm.
#define NVPARTICLES_WCSPH_TAG_NONE (0)
// Bit 1 (value 2). Bit 0 is not used by the tags defined here.
#define NVPARTICLES_WCSPH_TAG_SURFACE (1<<1)

#include "Wcsph_Kernels_CudaInline.h"
#include "Wcsph_Density_CudaInline.h"
#include "Wcsph_Forces_CudaInline.h"
#include "Wcsph_BilateralFilter_CudaInline.h"
#include "Wcsph_Rasterizer_CudaInline.h"
#include "Wcsph_Integrate_CudaInline.h"

//------------------------------------------------------------------------------------------
/// Refreshes the host parameter copy, derives the cached constants from it and
/// queues the uploads that make the same data visible to device code.
///
/// Steps, in order:
///  1. copy *parameters into h_parameters;
///  2. fill h_parameters.cache with values derived from the smoothing length,
///     speed of sound, time step and velocity limit (plus the kernel constants
///     selected by the NVPARTICLES_WCSPH_USE_* compile-time switches);
///  3. copy h_parameters to the d_parameters symbol on `stream`;
///  4. rebuild the three wall kernel lookup tables on the host and copy each to
///     its device symbol on `stream`.
///
/// @param parameters  source parameters; must be non-null (it is dereferenced
///                    unconditionally).
/// @param stream      stream on which the asynchronous symbol copies are issued.
void uploadParticleParameters(const InternalParameters* parameters, cudaStream_t stream=0)
{
    h_parameters = *parameters;

    // Short alias for the host copy; every write below lands in h_parameters.
    InternalParameters& hp = h_parameters;

    // ---- derived constants -------------------------------------------------

    hp.cache.velocityLimitPow2 = parameters->velocityLimit * parameters->velocityLimit;

#if !defined(NVPARTICLES_WCSPH_USE_ARTIFICIAL_VISCOSITY)
    hp.cache.viscosityKernelConstant = hp.viscosity * sph::Viscosity::laplaceConstant(hp.smoothingLength);
#else
    // The viscosity coefficient is intentionally not folded in here.
    hp.cache.viscosityKernelConstant = sph::Wendland::gradientConstant(hp.smoothingLength);
#endif

    hp.cache.smoothingLengthPow2 = hp.smoothingLength * hp.smoothingLength;
    hp.cache.smoothingLengthPow3 = hp.cache.smoothingLengthPow2 * hp.smoothingLength;
    hp.cache.smoothingLengthPow4 = hp.cache.smoothingLengthPow2 * hp.cache.smoothingLengthPow2;
    hp.cache.invSmoothingLength  = 1.f/hp.smoothingLength;
    hp.cache.speedOfSoundPow2    = hp.speedOfSound * hp.speedOfSound;
    hp.cache.deltaTimePow2       = hp.deltaTime * hp.deltaTime;

    // Epsilon (0.01*h^2) that keeps the artificial viscosity free of singularities.
    // (from M. Becker & M. Teschner / Weakly compressible SPH for free surface flows)
    hp.cache.epsArtificialViscosity = 0.01f*hp.smoothingLength*hp.smoothingLength;

    hp.cache.densityKernelConstant = sph::Poly6::kernelConstant(hp.smoothingLength);

#if defined(NVPARTICLES_WCSPH_USE_TAIT_EQUATION)
    hp.cache.eosTaitConstantB = (hp.cache.speedOfSoundPow2 * parameters->restDensity) / parameters->eosTaitPower;
    hp.speedOfSoundPower = (parameters->eosTaitPower - 1)/2;
#endif

#if !defined(NVPARTICLES_WCSPH_USE_MULLER_PRESSURE)
    // Monaghan pressure:
    hp.cache.pressureKernelConstant = sph::Wspiky::gradientConstant(hp.smoothingLength);
#else
    // Muller fast pressure.
    hp.cache.pressureKernelConstant = 0.5f * sph::Spiky::gradientConstant(hp.smoothingLength);
#endif

    // ---- parameter upload --------------------------------------------------

    NVPARTICLES_CUDA_SAFE_CALL(
        cudaMemcpyToSymbolAsync(d_parameters, &hp, sizeof(InternalParameters), 0, cudaMemcpyHostToDevice, stream));

    // ---- wall kernel lookup tables -----------------------------------------
    // Each table is rebuilt on the host and then copied to its device symbol.

    const size_t lookupBytes = sizeof(float)*NVPARTICLES_WCSPH_KERNEL_LOOKUP_SIZE;

    buildKernelLookup<KernelFunctor_Poly6_3D_Kernel, float>(
        hp.particleRestDistance, hp.smoothingLength,
        NVPARTICLES_WCSPH_KERNEL_LOOKUP_SIZE, h_wallLookup_Poly6_3D_Kernel);
    NVPARTICLES_CUDA_SAFE_CALL(
        cudaMemcpyToSymbolAsync(d_wallLookup_Poly6_3D_Kernel, h_wallLookup_Poly6_3D_Kernel,
                                lookupBytes, 0, cudaMemcpyHostToDevice, stream));

    buildKernelLookup<KernelFunctor_Viscosity_3D_Laplace, float>(
        hp.particleRestDistance, hp.smoothingLength,
        NVPARTICLES_WCSPH_KERNEL_LOOKUP_SIZE, h_wallLookup_Viscosity_3D_Laplace);
    NVPARTICLES_CUDA_SAFE_CALL(
        cudaMemcpyToSymbolAsync(d_wallLookup_Viscosity_3D_Laplace, h_wallLookup_Viscosity_3D_Laplace,
                                lookupBytes, 0, cudaMemcpyHostToDevice, stream));

    buildKernelLookup<KernelFunctor_Poly6_3D_Gradient, float>(
        hp.particleRestDistance, hp.smoothingLength,
        NVPARTICLES_WCSPH_KERNEL_LOOKUP_SIZE, h_wallLookup_Poly6_3D_Gradient);
    NVPARTICLES_CUDA_SAFE_CALL(
        cudaMemcpyToSymbolAsync(d_wallLookup_Poly6_3D_Gradient, h_wallLookup_Poly6_3D_Gradient,
                                lookupBytes, 0, cudaMemcpyHostToDevice, stream));
}


#if defined(NVPARTICLES_WCSPH_USE_NEIBS_LIST)

//------------------------------------------------------------------------------------------
/// Neighbour-list builder that only keeps candidates lying inside the SPH
/// smoothing radius, which keeps the adjacency-list short.
///
/// item() is invoked for a particle pair (index_i, index_j). It always returns
/// true. For a candidate within the smoothing radius:
///  - if the list of particle i still has room (counter < adjacencyListMaxItems),
///    index_j is stored with the periodic wrap flags OR-ed into it;
///  - the counter is incremented whether or not the entry was stored.
///
struct SphBuildNeighborFunctor3 : public BuildNeibsIterator3
{
    inline NVPARTICLES_CUDA_EXPORT static
    bool item (SphBuildNeighborFunctor3 &it, uint const &index_i, uint const &index_j, float3 const &position_i)
    {
        const float3 position_j = make_float3(it.positions[index_j]);

        // Separation of the pair, corrected by the iterator's periodic
        // displacement when the boundary-mode type is 2, then rescaled by invScale.
        float3 relPos = (position_i - position_j);
        if (NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_TYPE(NVPARTICLES_SYSTEM_PARAM(boundaryMode)) == 2)
            relPos -= it.periodicDisplacement;
        relPos *= NVPARTICLES_SYSTEM_PARAM(invScale);

        const float distPow2 = dot(relPos, relPos);

        // Outside the smoothing radius: nothing to record.
        if (!(distPow2 < NVPARTICLES_SYSTEM_PARAM(cache.smoothingLengthPow2)))
            return true;

        if (it.counter < it.data.adjacencyListMaxItems)
        {
            // Encode the periodic sign of each axis into the stored index
            // (so we can reconstruct it later).
            uint taggedIndex = index_j;

            if (it.periodicSign.x == -1)
                taggedIndex |= NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPXMINUS;
            else if (it.periodicSign.x == 1)
                taggedIndex |= NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPXPLUS;

            if (it.periodicSign.y == -1)
                taggedIndex |= NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPYMINUS;
            else if (it.periodicSign.y == 1)
                taggedIndex |= NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPYPLUS;

            if (it.periodicSign.z == -1)
                taggedIndex |= NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPZMINUS;
            else if (it.periodicSign.z == 1)
                taggedIndex |= NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPZPLUS;

#if !defined(NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE)
            // One contiguous row per particle.
            it.data.adjacencyListPtr[index_i * it.data.adjacencyListPitch + it.counter] = taggedIndex;
#else
            // Interleaved layout: particles are grouped INTERLEAVE at a time and
            // the entries of one group are stored side by side.
            const uint group = index_i/NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE;
            const uint member = index_i&(NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE-1);
            it.data.adjacencyListPtr[group * it.data.adjacencyListPitch + it.counter*NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE + member] = taggedIndex;
#endif
        }

        // Counted even when the list was full and the entry was dropped.
        it.counter++;

        return true;
    }
};

#endif // defined(NVPARTICLES_WCSPH_USE_NEIBS_LIST)

//-----------------------------------------------------------------------------------
/// Format the particle data for rendering.
///
/// Launch layout: 1D grid of 1D blocks, one thread per particle. Threads whose
/// flat index is >= nParticles exit immediately, so the grid may be rounded up.
///
/// Every thread writes data.warpedPosition[index]. The value starts as a copy
/// of the simulation position. When a displacement size is configured
/// (glTexDisplacementSize != 0) AND a height field is bound (data.field != 0),
/// the y coordinate is adjusted:
///  - the height-field value of the particle's (x,z) cell is subtracted, and
///  - if data.displacement is bound, the y of the looked-up displacement texel
///    is added,
/// both scaled by a weight w in [0,1] derived from the squared (x,z) distance
/// in boundary space (w == 0 at the centre, w == 1 once that distance reaches 1).
///
/// The height field is checked for null because the host launcher treats it as
/// optional and passes it through unconditionally; dereferencing it blindly
/// would fault on the device.
///
/// @param nParticles  number of particles to process.
/// @param data        particle buffers: reads position, field and displacement,
///                    writes warpedPosition.
__global__
void updateRenderPositionKernel(const int nParticles,
                                ParticleData data
                               )
{
    const int index = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (index >= nParticles)
        return;

    float4 simPosition = NVPARTICLES_SYSTEM_FETCH_NOTEX(data, position, index);

    // simple copy for now... (the previous warped position is not needed, so it
    // is not fetched from memory)
    float4 warpedPosition = simPosition;

    const int displacementWidth = NVPARTICLES_SYSTEM_PARAM(glTexDisplacementSize);

    if (displacementWidth != 0 && data.field != 0)
    {
        // Locate the height-field cell of this particle, clamped to the grid.
        int3 cell = ComputeHeightFieldIterator::posToCell(make_float4(simPosition.x, simPosition.y, simPosition.z, simPosition.w));
        cell.x = min(NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).x-1, cell.x);
        cell.y = min(NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).y-1, cell.y);
        cell.z = min(NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).z-1, cell.z);
        cell.x = max(0, cell.x);
        cell.y = max(0, cell.y);
        cell.z = max(0, cell.z);

        // The field is a 2D (x,z) map with bucketCount.x entries per row.
        const float minSurfaceHeight = data.field[cell.x+cell.z*NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).x];

        if (NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_TYPE(NVPARTICLES_SYSTEM_PARAM(boundaryMode)) & 3)
        {
            // Particle position expressed in boundary space.
            vec3f Pw = make_vec3f(simPosition.x, simPosition.y, simPosition.z);
            vec3f Pb = NVPARTICLES_SYSTEM_PARAM(boundaryMatrixInv).multiplyPoint(Pw);

            // Blend weight: w = 1 - (1 - r^18)^9 with r^2 clamped to [0,1].
            float rPow2 = (Pb.x*Pb.x+Pb.z*Pb.z);
            rPow2 = clamp(rPow2, 0.f, 1.f);
            float w = 1.0f - powf(rPow2, 9.0f);
            w = 1.0f - powf(w, 9.0f);

            warpedPosition.y -= minSurfaceHeight * w;

            if (data.displacement != 0)
            {
                // Map boundary-space x/z from [-1,1] to a texel of the square
                // displacement map, clamped to its extent. Single-precision
                // literals keep this arithmetic out of double precision.
                int ix = (int)((displacementWidth-1)*(Pb.x*0.5f+0.5f));
                int iy = (int)((displacementWidth-1)*(Pb.z*0.5f+0.5f));
                ix = min((displacementWidth-1), ix);
                iy = min((displacementWidth-1), iy);
                ix = max(0, ix);
                iy = max(0, iy);
                const float4 geomPw = data.displacement[ix+iy*displacementWidth];

                warpedPosition.y += geomPw.y * w;
            }
        }
    }

    data.warpedPosition[index] = warpedPosition;
}

//------------------------------------------------------------------------------------------
/// Host launcher that refreshes the render (warped) positions of the particles.
///
/// When a height field is supplied and a displacement size is configured, the
/// field is first denoised with a bilateral filter into `blurredField` and the
/// result is copied back over `field`. Then updateRenderPositionKernel is
/// launched over all particles. All work is queued on `stream`; nothing here
/// waits for completion explicitly.
/// NOTE(review): NVPARTICLES_CUDA_CHECK_ERROR may synchronize in some build
/// configurations - its definition is not visible here.
///
/// Every launch and runtime call is checked, so a failure in the filtering
/// stage is reported at its origin instead of surfacing later under the name of
/// the final kernel.
///
/// @param start         unused by this function.
/// @param nParticles    number of particles; nothing is done when <= 0.
/// @param parameters    unused by this function (the host parameter copy is read
///                      through NVPARTICLES_SYSTEM_PARAM instead).
/// @param data          particle buffers; data.field is overwritten with `field`.
/// @param field         device height field (bucketCount.x * bucketCount.z
///                      floats), or null to skip the filtering stage.
/// @param blurredField  device scratch buffer of the same size as `field`; must
///                      be valid whenever the filtering stage runs.
/// @param stream        stream on which all work is issued.
void updateRenderPosition(int start, int nParticles, InternalParameters* parameters, ParticleData& data, float* field, float* blurredField, cudaStream_t stream)
{
    // A negative count would otherwise be turned into a bogus launch size.
    if (nParticles <= 0)
        return;

    uint nThreads, nBlocks;

    const int displacementWidth = NVPARTICLES_SYSTEM_PARAM(glTexDisplacementSize);

    if (field && displacementWidth)
    {
        // do a bilateral blur on the data to denoise the missing surface values...
        dim3 gridSize((NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).x + 16 - 1) / 16, (NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).z + 16 - 1) / 16);
        dim3 blockSize(16, 16);
        d_bilateral_filter <<<gridSize, blockSize, 0, stream>>> (field, blurredField, NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).x, NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).z, 2);
        NVPARTICLES_CUDA_CHECK_ERROR("Kernel execution failed: d_bilateral_filter");

        // Copy the filtered result back so the kernel below reads it from `field`.
        NVPARTICLES_CUDA_SAFE_CALL( cudaMemcpyAsync(field, blurredField, NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).x*NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount).z*sizeof(float), cudaMemcpyDeviceToDevice, stream));
    }

    // Block size is chosen per architecture at compile time.
#if defined(NVPARTICLES_HAS_FERMI)
    computeGridSize(nParticles, 256, nBlocks, nThreads);
#else
    computeGridSize(nParticles, 128, nBlocks, nThreads);
#endif

    // The kernel reads the height field through the particle data structure.
    data.field = field;
    updateRenderPositionKernel <<<nBlocks, nThreads, 0, stream>>>(
        nParticles,
        data
    );
    NVPARTICLES_CUDA_CHECK_ERROR("Kernel execution failed: updateRenderPositionKernel");
}

//------------------------------------------------------------------------------------------
