/*
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

//-----------------------------------------------------------------------------------
/// Speed of sound as a function of density.
///
/// Evaluates speedOfSound * (density / restDensity) ^ speedOfSoundPower, with all
/// three constants read through NVPARTICLES_SYSTEM_PARAM.
///
/// @param density  density to evaluate at.
/// @return the scaled speed of sound for that density.
///
inline NVPARTICLES_CUDA_EXPORT
float computeSpeedOfSound(const float density)
{
    // density relative to the rest density.
    const float densityRatio = density / NVPARTICLES_SYSTEM_PARAM(restDensity);

    // power-law scaling of the reference speed.
    const float powerTerm = powf(densityRatio, NVPARTICLES_SYSTEM_PARAM(speedOfSoundPower));

    return NVPARTICLES_SYSTEM_PARAM(speedOfSound) * powerTerm;
}

//-----------------------------------------------------------------------------------
/// Argument bundle for the internal-forces kernel.
///
/// One instance is filled on the host and copied into the matching __constant__
/// instance that the kernel reads.
///
struct CalcInternalForcesKernelParams
{
    /// number of particles to process (kernel threads at or beyond this index are idle).
    uint numParticles;

    /// particle attribute arrays that are read (and written) by the kernel.
    ParticleData inData;

    /// neighbor lookup structure; the concrete type depends on whether a
    /// pre-built adjacency list is compiled in.
#if defined(NVPARTICLES_WCSPH_USE_NEIBS_LIST)
    SpatialGrid::AdjacencyListData spatialGridData;
#else
    SpatialGrid::SpatialGridData spatialGridData;
#endif

    /// location the kernel atomically updates with the maximum CFL value.
    float* maxCflPtr;

    /// Default construction leaves every member untouched.
    /// NOTE: keep this body empty - the type is instantiated as a __constant__
    /// object (d_CalcInternalForcesKernelParams), and CUDA does not support
    /// dynamic initialization of such variables.
    CalcInternalForcesKernelParams()
    {
    }

    /// Member-wise construction from the individual arguments.
    CalcInternalForcesKernelParams(
        const uint particleCount,
        const ParticleData particleData,
#if defined(NVPARTICLES_WCSPH_USE_NEIBS_LIST)
        const SpatialGrid::AdjacencyListData gridData,
#else
        const SpatialGrid::SpatialGridData gridData,
#endif
        float* cflOutput
        )
        : numParticles(particleCount)
        , inData(particleData)
        , spatialGridData(gridData)
        , maxCflPtr(cflOutput)
    {
    }
};

// The kernel arguments exist twice: a __constant__ device instance and a host-side
// instance. The host instance is filled in and then copied over the device instance
// before the kernel is launched.
__device__ __constant__	CalcInternalForcesKernelParams d_CalcInternalForcesKernelParams;
static CalcInternalForcesKernelParams h_CalcInternalForcesKernelParams;

// NVPARTICLES_KERNEL_ARG(x) names member x of whichever instance is valid for the
// current compilation pass. Refuse to continue if an earlier definition of the
// macro is still live, rather than silently redefining it.
#ifdef NVPARTICLES_KERNEL_ARG
#error NVPARTICLES_KERNEL_ARG already defined!
#endif
#ifdef __CUDA_ARCH__
// device pass: read from the __constant__ instance.
#define NVPARTICLES_KERNEL_ARG(x) d_CalcInternalForcesKernelParams.x
#else
// host pass: read from the host-side instance.
#define NVPARTICLES_KERNEL_ARG(x) h_CalcInternalForcesKernelParams.x
#endif

//-----------------------------------------------------------------------------------
/// Neighbor-iteration functor that accumulates the internal SPH forces on particle i:
/// symmetrized pressure, viscosity, and (optionally) surface tension and XSPH.
///
/// The three static hooks are handed to the neighbor iteration (presumably called as
/// pre -> item per neighbor -> post; confirm against iterateNeibs):
///   pre()  loads the state of particle i and zeroes the accumulators,
///   item() adds the contribution of one neighbor j,
///   post() applies the factors hoisted out of the inner loop and stores the result.
///
/// Template flags select which terms are evaluated:
///   USE_PRESSURE  - pressure force
///   USE_VISCOSITY - viscosity force (artificial or Laplacian form, chosen by macro)
///   USE_TENSION   - color-field surface tension
///   USE_XSPH      - XSPH velocity term
///
/// NOTE(review): `periodicDisplacement` and `position_i` are used on `it` but are not
/// declared in this struct; presumably they come from IteratorFunctorBase3 - confirm.
///
template <bool USE_PRESSURE, bool USE_VISCOSITY, bool USE_TENSION, bool USE_XSPH>
struct CalcInternalForcesIterator : public IteratorFunctorBase3<float4, true>
{
    // state of particle i, loaded once in pre().
    float3 velocity_i;
    float density_i;
    float pressure_i;
    float speedOfSound_i;

    // per-neighbor accumulators.
    float3 viscosityForce;
    float3 pressureForce;
    float3 colorGradient;
    float colorLaplacian;

    // sum of all internal forces, published by post() for the CFL computation.
    float3 totalForce;
#if defined(NVPARTICLES_WCSPH_USE_XSPH)
    float3 xsphVelocity;
#endif
#if defined(NVPARTICLES_WCSPH_USE_TAG_SURFACE)
    // xyz: weighted sum of neighbor positions, w: sum of the weights.
    float4 massCenter;
#endif
    ParticleData inData;


    /// Load the per-particle values for index_i and reset the accumulators that the
    /// enabled terms are going to use.
    static NVPARTICLES_CUDA_EXPORT
    void pre(CalcInternalForcesIterator &it, uint index_i)
    {
        it.density_i = NVPARTICLES_SYSTEM_FETCH_NOTEX(it.inData, density, index_i);

        if (USE_PRESSURE)
        {
#if defined(NVPARTICLES_WCSPH_STORE_PRESSURE)
            it.pressure_i = NVPARTICLES_SYSTEM_FETCH_NOTEX(it.inData, pressure, index_i);
#else
            it.pressure_i = computePressure(it.density_i);
#endif

            it.pressureForce = make_float3(0);
        }

        // item() builds velocity_ij from velocity_i for the viscosity term AND for the
        // XSPH term, so the velocity has to be loaded when either of them is enabled.
        if (USE_VISCOSITY || USE_XSPH)
        {
            it.velocity_i = make_float3(NVPARTICLES_SYSTEM_FETCH_NOTEX(it.inData, veleval, index_i));
        }

        if (USE_VISCOSITY)
        {
            it.viscosityForce = make_float3(0);
        }

#if defined(NVPARTICLES_WCSPH_USE_XSPH)
        if (USE_XSPH)
        {
            it.xsphVelocity = make_float3(0);
        }
#endif

        if (USE_TENSION)
        {
            it.colorGradient = make_float3(0);
            it.colorLaplacian = 0;
        }

#if defined(NVPARTICLES_WCSPH_USE_ADAPTIVE_TIMESTEP) || (NVPARTICLES_WCSPH_USE_VISCOSITY == NVPARTICLES_WCSPH_VISCOSITY_ARTIFICIAL)
        it.speedOfSound_i = computeSpeedOfSound(it.density_i);
#endif

#if defined(NVPARTICLES_WCSPH_USE_TAG_SURFACE)
        it.massCenter = make_float4(0);
#endif
    }

    /// Accumulate the contribution of neighbor index_j onto particle index_i.
    /// Neighbors at or beyond the smoothing length (and the particle itself) add nothing.
    /// Always returns true.
    static NVPARTICLES_CUDA_EXPORT
    bool item(CalcInternalForcesIterator &it, uint const &index_i, uint const &index_j, float3 const &position_i)
    {
        // the same item doesn't impose a force on itself!
        if (index_j != index_i)
        {
            float3 position_j = make_float3(NVPARTICLES_SYSTEM_FETCH(it.inData, position, index_j));
            if (NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_TYPE(NVPARTICLES_SYSTEM_PARAM(boundaryMode)) == 2)
                position_j += it.periodicDisplacement;

            // get relative position (scaled by invScale).
            float3 position_ij = (position_i - position_j) * NVPARTICLES_SYSTEM_PARAM(invScale);
            float rPow2 = dot(position_ij, position_ij);

            if (rPow2 < NVPARTICLES_SYSTEM_PARAM(cache.smoothingLengthPow2))
            {
                // load all the data we need...
                float density_j = NVPARTICLES_SYSTEM_FETCH_NOTEX(it.inData, density,  index_j);

                float3 velocity_j = make_float3(NVPARTICLES_SYSTEM_FETCH(it.inData, veleval, index_j));
                float3 velocity_ij = it.velocity_i - velocity_j;

                float relVelDotRelPos = dot(velocity_ij, position_ij);

#if defined(NVPARTICLES_WCSPH_USE_TAG_SURFACE)
                // sum the mass centers...
                float weight = (it.density_i / density_j);
                it.massCenter += make_float4(weight * position_j, weight);
#endif

                /// CAVEAT:
                // The gradient calcs divide by r, so (r == 0) is a problem;
                // the epsilon keeps r strictly positive.
                float r = sqrtf(rPow2) + NVPARTICLES_EPSILON;

                if (USE_PRESSURE)
                {
                    // density_i should NEVER be zero here; if a nearby neighbor
                    // was found then density MUST logically be > 0.
#if defined(NVPARTICLES_WCSPH_STORE_PRESSURE)
                    float pressure_j = NVPARTICLES_SYSTEM_FETCH_NOTEX(it.inData, pressure, index_j);
#else
                    float pressure_j = computePressure(density_j);
#endif
                    // reduce the negative pressure...
                    if (pressure_j < 0)
                        pressure_j *= NVPARTICLES_SYSTEM_PARAM(negativePressureFactor);

#if defined(NVPARTICLES_WCSPH_USE_MULLER_PRESSURE)
                    // A faster method by balancing with the arithmetic-mean-pressure.
                    // NOTE: mass and the divide-by-two are hoisted out of the inner loop.

                    it.pressureForce -= position_ij * ((it.pressure_i + pressure_j) / density_j)
                                        * sph::Spiky::gradientVariable(NVPARTICLES_SYSTEM_PARAM(smoothingLength), r);
#else
                    // NOTE: only mass is hoisted out of this formula.
                    /// OPTIMIZATION:
                    // we could precalc the particle_i component of this formula outside of this inner-loop (1 div and 2 muls).

                    it.pressureForce -= position_ij * ( (it.pressure_i/(it.density_i*it.density_i)) + (pressure_j/(density_j*density_j)) )
                                     * sph::Spiky::gradientVariable(NVPARTICLES_SYSTEM_PARAM(smoothingLength), r);
#endif
                }

                if (USE_VISCOSITY)
                {
#if defined(NVPARTICLES_WCSPH_USE_ARTIFICIAL_VISCOSITY)

                    // only approaching particles (negative relative velocity along
                    // the separation) receive artificial viscosity.
                    float artificalViscosity = 0;
                    if (relVelDotRelPos < 0)
                    {
                        const float mu = relVelDotRelPos * NVPARTICLES_SYSTEM_PARAM(smoothingLength) / (rPow2 + NVPARTICLES_SYSTEM_PARAM(cache.epsArtificialViscosity));
                        // use the calculated speedOfSound.
                        //artificalViscosity = (mu * viscosity * (it.speedOfSound_i + computeSpeedOfSound(density_j))) / (density_i + density_j);
                        // ... or use the average speedOfSound.
                        artificalViscosity = (mu * NVPARTICLES_SYSTEM_PARAM(artificialViscosity) * 2.0f * NVPARTICLES_SYSTEM_PARAM(speedOfSound)) / (it.density_i + density_j);
                    }

                    it.viscosityForce += position_ij * artificalViscosity *
                        sph::Wendland::gradientVariable(NVPARTICLES_SYSTEM_PARAM(smoothingLength), r);

#else
                    it.viscosityForce -= ( (velocity_ij) / (density_j) )
                        * sph::Viscosity::laplaceVariable(NVPARTICLES_SYSTEM_PARAM(smoothingLength), r);
#endif
                }


#if defined(NVPARTICLES_WCSPH_USE_XSPH)
                // gated by the template flag, matching pre() and post().
                if (USE_XSPH)
                {
                    it.xsphVelocity -= ((velocity_ij) / (density_j + it.density_i))
                        * sph::Poly6::kernelVariable(NVPARTICLES_SYSTEM_PARAM(cache.smoothingLengthPow2), rPow2);
                }
#endif

#if defined(NVPARTICLES_WCSPH_USE_SURFACE_TENSION)
                if (USE_TENSION)
                {
                    // color-field for smoothing...
                    it.colorGradient += position_ij * ( 1.0f / (density_j) )
                        * sph::Poly6::gradientVariable(NVPARTICLES_SYSTEM_PARAM(smoothingLength), NVPARTICLES_SYSTEM_PARAM(cache.smoothingLengthPow2), r);

                    it.colorLaplacian += ( 1.0f / (density_j) )
                        * sph::Poly6::laplaceVariable(NVPARTICLES_SYSTEM_PARAM(smoothingLength), NVPARTICLES_SYSTEM_PARAM(cache.smoothingLengthPow2), r, rPow2);
                }
#endif
            }
        }

        return true;
    }

    /// Finish particle index_i: apply the hoisted constants, optionally tag surface
    /// particles, and write the force (and XSPH velocity / tag) back to inData.
    static NVPARTICLES_CUDA_EXPORT
    void post(CalcInternalForcesIterator &it, uint index_i)
    {
        float3 totalForce = make_float3(0);

#if defined(NVPARTICLES_WCSPH_USE_TAG_SURFACE)

        uint tag_i = NVPARTICLES_WCSPH_TAG_NONE;

        // too little accumulated neighbor weight: treat the particle as surface.
        if (it.massCenter.w <= 2.0f)
        {
            tag_i = NVPARTICLES_WCSPH_TAG_SURFACE;
        }
        else
        {
            // normalize the weighted position sum into the neighbors' center of mass.
            it.massCenter.x /= it.massCenter.w;
            it.massCenter.y /= it.massCenter.w;
            it.massCenter.z /= it.massCenter.w;
            float distToMassCenter = length(it.position_i - make_float3(it.massCenter));

            if (NVPARTICLES_SYSTEM_PARAM(surfaceDistance) > 0 && distToMassCenter > NVPARTICLES_SYSTEM_PARAM(surfaceDistance))
            {
                tag_i = NVPARTICLES_WCSPH_TAG_SURFACE;
            }
        }

        float wallPenetration = 0;

        if (NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_TYPE(NVPARTICLES_SYSTEM_PARAM(boundaryMode)) & 3)
        {
            // add the density of the wall (if any)...
            vec3f wallContact;
            const float wallCollisionRadius = (NVPARTICLES_SYSTEM_PARAM(smoothingLength)) / NVPARTICLES_SYSTEM_PARAM(invScale);
            vec3f wallNormal;

            wallPenetration = computeWallPenetration(
                                    make_vec3f(it.position_i.x, it.position_i.y, it.position_i.z),
                                    wallCollisionRadius,
                                    wallContact, wallNormal
                                    );

            /// HACK:
            // hack to make sure the walls don't create a surface.
            /// TODO:
            // ideally this should get the center of mass of the static wall particles,
            // and then incorporate these into the center of mass computation.
            // theoretically this should prevent a surface being created.
            if (wallPenetration > 0)
                tag_i = NVPARTICLES_WCSPH_TAG_NONE;
        }

        it.inData.tag[index_i] = tag_i;
#endif

        // we assume particle_i and particle_j have the same mass,
        // so mass has been hoisted out of the inner loop into here.

        if (USE_PRESSURE)
        {
#if defined(NVPARTICLES_WCSPH_USE_MULLER_PRESSURE)
            it.pressureForce *=  NVPARTICLES_SYSTEM_PARAM(particleMass) * NVPARTICLES_SYSTEM_PARAM(cache.pressureKernelConstant) / it.density_i;
#else
            // density_i is handled inside inner-loop.
            it.pressureForce *=  NVPARTICLES_SYSTEM_PARAM(particleMass) * NVPARTICLES_SYSTEM_PARAM(cache.pressureKernelConstant);
#endif
            totalForce += it.pressureForce;
        }

        if (USE_VISCOSITY)
        {
#if defined(NVPARTICLES_WCSPH_USE_ARTIFICIAL_VISCOSITY)
            // density_i is handled inside inner-loop.
            it.viscosityForce *= NVPARTICLES_SYSTEM_PARAM(particleMass) * NVPARTICLES_SYSTEM_PARAM(cache.viscosityKernelConstant);
#else
            it.viscosityForce *= NVPARTICLES_SYSTEM_PARAM(particleMass) * NVPARTICLES_SYSTEM_PARAM(cache.viscosityKernelConstant) / it.density_i;
#endif
            totalForce += it.viscosityForce;
        }

#if defined(NVPARTICLES_WCSPH_USE_XSPH)
        if (USE_XSPH)
        {
            it.xsphVelocity *= NVPARTICLES_SYSTEM_PARAM(particleMass) * 2.0f * sph::Poly6::kernelConstant(NVPARTICLES_SYSTEM_PARAM(smoothingLength));
            it.inData.xsphVelocity[index_i] = make_float4(it.xsphVelocity, 0);
        }
#endif

#if defined(NVPARTICLES_WCSPH_USE_SURFACE_TENSION)
        if (USE_TENSION)
        {
            it.colorGradient *= NVPARTICLES_SYSTEM_PARAM(particleMass) * sph::Poly6::gradientConstant(NVPARTICLES_SYSTEM_PARAM(smoothingLength));

            /// HACK:
            // get the colorfieldGradient from the wall in the w attribute!
            //float3 wallColorGradient = make_float3(it.inData.color[index_i])*2;//*NVPARTICLES_SYSTEM_PARAM(colorScale);
            //it.colorGradient += wallColorGradient;

            // calculate the surface-tension forces...
            float colorGradientLen = length(it.colorGradient);

            // the gradient is normalized below, so a zero length must be rejected even
            // when the threshold is configured as zero (otherwise 0/0 yields NaN forces).
            if (colorGradientLen >= NVPARTICLES_SYSTEM_PARAM(surfaceTensionThreshold) && colorGradientLen > 0.0f)
            {
                it.colorLaplacian *= NVPARTICLES_SYSTEM_PARAM(particleMass) * sph::Poly6::laplaceConstant(NVPARTICLES_SYSTEM_PARAM(smoothingLength));
                float3 tensionForce = (-NVPARTICLES_SYSTEM_PARAM(surfaceTension) * it.colorLaplacian * (it.colorGradient / colorGradientLen)) / it.density_i;
                totalForce += tensionForce;
            }
        }
#endif

        // pass this out because we need it for the CFL condition.
        it.totalForce = totalForce;

#if defined(NVPARTICLES_WCSPH_USE_WALL_WEIGHT)
        // because we are initializing it in the wall density calc we accumulate rather than set it.
        it.inData.force[index_i] += make_float4(totalForce, 0);
#else
        // store the total internal-force...
        it.inData.force[index_i] = make_float4(totalForce, 0);
#endif
    }
};

//-----------------------------------------------------------------------------------
/// Internal-forces kernel: one thread per particle.
///
/// Launch layout: 1D grid of 1D blocks. Thread (blockIdx.x * blockDim.x + threadIdx.x)
/// processes the particle with that index; threads at or beyond numParticles do no
/// particle work but still take part in the block reduction.
///
/// Shared memory: when NVPARTICLES_WCSPH_USE_ADAPTIVE_TIMESTEP is defined the launch
/// must provide blockDim.x floats of dynamic shared memory. Any block size works; it
/// does not have to be a power of two.
///
/// Arguments are read through NVPARTICLES_KERNEL_ARG (the __constant__ parameter
/// instance while compiling device code); `args` itself is never dereferenced.
///
/// With adaptive time-stepping, each block folds its largest CFL value into
/// *maxCflPtr via atomicMax on the floatFlip-encoded bits.
template <bool USE_PRESSURE, bool USE_VISCOSITY, bool USE_TENSION, bool USE_XSPH>
__global__ void
computeInternalForcesKernel(CalcInternalForcesKernelParams* args)
{
    const uint index = (blockIdx.x * blockDim.x) + threadIdx.x;
    const bool valid = (index < NVPARTICLES_KERNEL_ARG(numParticles));

#if defined(NVPARTICLES_WCSPH_USE_ADAPTIVE_TIMESTEP)
    extern __shared__ float smem_maxCfl[];
    // idle threads contribute zero to the maximum.
    smem_maxCfl[threadIdx.x] = 0.0f;
#endif

    if (valid)
    {
        typedef CalcInternalForcesIterator<USE_PRESSURE, USE_VISCOSITY, USE_TENSION, USE_XSPH> Iter;
        Iter it;
        it.inData = NVPARTICLES_KERNEL_ARG(inData);
        float3 position_i = make_float3(NVPARTICLES_SYSTEM_FETCH_NOTEX(it.inData, position, index));

        iterateNeibs<Iter, true, 1>(it, index, position_i, NVPARTICLES_KERNEL_ARG(spatialGridData));

#if defined(NVPARTICLES_WCSPH_USE_ADAPTIVE_TIMESTEP)
        smem_maxCfl[threadIdx.x] = max(length(it.totalForce), (it.speedOfSound_i*it.speedOfSound_i) / NVPARTICLES_SYSTEM_PARAM(smoothingLength));
#endif
    }

#if defined(NVPARTICLES_WCSPH_USE_ADAPTIVE_TIMESTEP)
    // do a reduction to get the max CFL value within this block.
    // (kept outside the `valid` branch so every thread reaches each barrier.)
    __syncthreads();

    // Fold the upper part of the live range [0, active) onto the lower part. The
    // stride is rounded UP so that, for an odd range, the middle element simply
    // carries over to the next pass instead of being dropped; a plain
    // blockDim.x/2 halving loses elements whenever blockDim.x is not a power of two.
    uint active = blockDim.x;
    while (active > 1)
    {
        const uint stride = (active + 1) / 2;
        if (threadIdx.x + stride < active)
        {
            smem_maxCfl[threadIdx.x] = max(smem_maxCfl[threadIdx.x + stride], smem_maxCfl[threadIdx.x]);
        }
        __syncthreads();
        active = stride;
    }

    if (threadIdx.x == 0)
    {
        atomicMax((unsigned int*)NVPARTICLES_KERNEL_ARG(maxCflPtr), floatFlip((unsigned int&)smem_maxCfl[0]));
    }
#endif
}

// Device-side accumulator for the maximum CFL value of a launch. Its address is
// passed to the kernel as maxCflPtr. Although declared float, the kernel updates it
// with atomicMax on the bits reinterpreted as unsigned int (after floatFlip), and
// the host decodes it again with invFloatFlip.
__device__ float d_globalMaxForce;

//------------------------------------------------------------------------------------------
/// Host entry point for the internal-forces pass.
///
/// Uploads the kernel arguments to the __constant__ parameter instance, launches
/// computeInternalForcesKernel on `stream`, and - when adaptive time-stepping is
/// compiled in and cflFactor is non-zero - waits for the stream and converts the
/// maximum CFL value found by the kernel into a time-step.
///
/// @param useSurfaceTension  selects the kernel instantiation with the surface-tension
///                           term enabled (true) or disabled (false).
/// @param spatialGridData    neighbor lookup data (its adjacencyData member is used
///                           when NVPARTICLES_WCSPH_USE_NEIBS_LIST is defined).
/// @param numParticles       number of particles; nothing is done when <= 0.
/// @param inData             particle arrays handed to the kernel.
/// @param cflDeltaTime       out: CFL-limited time-step. Only written when adaptive
///                           time-stepping is compiled in, cflFactor is non-zero and
///                           the pointer is non-null.
/// @param stream             stream used for the upload, the reset and the launch.
static void computeInternalForces(
    bool useSurfaceTension,
    SpatialGrid::SpatialGridData spatialGridData,
    int numParticles,
    ParticleData inData,
    float* cflDeltaTime,
    cudaStream_t stream=0)
{
    // numParticles is signed: reject negative counts as well as zero.
    if (numParticles <= 0)
        return;

    // per-configuration tuning points (currently all the same value).
    uint BLOCK_SIZE;
#if defined(NVPARTICLES_WCSPH_USE_NEIBS_LIST)
    BLOCK_SIZE = 128;
#else
    BLOCK_SIZE = 128;
#endif
#if defined(NVPARTICLES_HAS_FERMI)
    BLOCK_SIZE = 128;
#endif

    uint nThreads, nBlocks;
    computeGridSize(numParticles, BLOCK_SIZE, nBlocks, nThreads);

    // nThreads/nBlocks are outputs of computeGridSize, so the requested block size is
    // what has to grow; changing nThreads would be overwritten and loop forever.
    // Doubling keeps the block size a power of two (which a halving shared-memory
    // reduction relies on) and is guaranteed to shrink the grid. An oversized block
    // is reported by the launch error check below.
    while (nBlocks >= 64*1024)
    {
        std::cout << "ALERT: have to rescale blockSize due to too large gridSize >=65536\n";
        BLOCK_SIZE *= 2;
        computeGridSize(numParticles, BLOCK_SIZE, nBlocks, nThreads);
    }

#if defined(NVPARTICLES_WCSPH_USE_NEIBS_LIST)
    const SpatialGrid::AdjacencyListData& spatialData = spatialGridData.adjacencyData;
#else
    const SpatialGrid::SpatialGridData& spatialData = spatialGridData;
#endif

    // reset the device-side maximum; issued on `stream` so it is ordered after any
    // earlier work queued there and before the kernel launched below.
    float* maxCflPtr = 0;
    NVPARTICLES_CUDA_SAFE_CALL( cudaGetSymbolAddress((void**)&maxCflPtr, d_globalMaxForce) );
    NVPARTICLES_CUDA_SAFE_CALL( cudaMemsetAsync(maxCflPtr, 0, sizeof(float), stream) );

    // stage the arguments on the host, then copy them into constant memory.
    h_CalcInternalForcesKernelParams = CalcInternalForcesKernelParams(numParticles, inData, spatialData, maxCflPtr);
    NVPARTICLES_CUDA_SAFE_CALL( cudaMemcpyToSymbolAsync(d_CalcInternalForcesKernelParams, &h_CalcInternalForcesKernelParams, sizeof(CalcInternalForcesKernelParams), 0, cudaMemcpyHostToDevice, stream));

    // dynamic shared memory: one float per thread for the block-wide CFL reduction.
    const size_t smemBytes = nThreads * sizeof(float);

    // template arguments: <USE_PRESSURE, USE_VISCOSITY, USE_TENSION, USE_XSPH>
    if (useSurfaceTension)
    {
        computeInternalForcesKernel<true, true, true, false> <<<nBlocks, nThreads, smemBytes, stream>>>( &d_CalcInternalForcesKernelParams );
        NVPARTICLES_CUDA_CHECK_ERROR("Kernel execution failed: computeInternalForcesKernel");
    }
    else
    {
        computeInternalForcesKernel<true, true, false, false> <<<nBlocks, nThreads, smemBytes, stream>>>( &d_CalcInternalForcesKernelParams );
        NVPARTICLES_CUDA_CHECK_ERROR("Kernel execution failed: computeInternalForcesKernel");
    }


#if defined(NVPARTICLES_WCSPH_USE_ADAPTIVE_TIMESTEP)

    if (NVPARTICLES_SYSTEM_PARAM(cflFactor) && cflDeltaTime != 0)
    {
        // the result is needed on the host, so wait for the kernel to finish.
        NVPARTICLES_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));

        // the kernel stored floatFlip-encoded bits; decode them back into a float.
        unsigned int maxCflUint;
        NVPARTICLES_CUDA_SAFE_CALL( cudaMemcpy(&maxCflUint, maxCflPtr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
        maxCflUint = invFloatFlip(maxCflUint);
        float cflMax = (float&)maxCflUint;

        *cflDeltaTime = NVPARTICLES_SYSTEM_PARAM(cflFactor) * sqrtf(NVPARTICLES_SYSTEM_PARAM(smoothingLength) / cflMax);
    }
#endif
}

// Release the kernel-argument accessor defined earlier in this file, so that a later
// definition of the same macro does not hit an "already defined" #error guard.
#undef NVPARTICLES_KERNEL_ARG

//------------------------------------------------------------------------------------------
