/*
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */


//-----------------------------------------------------------------------------------
// Number of entries in each wall lookup table declared below.
// calcKernelLookup() clamps its indices to [0, NVPARTICLES_WCSPH_KERNEL_LOOKUP_SIZE-1].
#define NVPARTICLES_WCSPH_KERNEL_LOOKUP_SIZE 16
// Wall contribution table for the Poly6 kernel: a device copy in constant memory
// and a host-side array of the same size.
// NOTE(review): the host arrays are presumably filled with buildKernelLookup() and
// uploaded to the matching device symbols elsewhere -- not shown in this file section; confirm.
__device__ __constant__ float d_wallLookup_Poly6_3D_Kernel[NVPARTICLES_WCSPH_KERNEL_LOOKUP_SIZE];
static float h_wallLookup_Poly6_3D_Kernel[NVPARTICLES_WCSPH_KERNEL_LOOKUP_SIZE];

// Wall contribution table for the viscosity Laplacian (device constant + host array).
__device__ __constant__ float d_wallLookup_Viscosity_3D_Laplace[NVPARTICLES_WCSPH_KERNEL_LOOKUP_SIZE];
static float h_wallLookup_Viscosity_3D_Laplace[NVPARTICLES_WCSPH_KERNEL_LOOKUP_SIZE];

// Wall contribution table for the Poly6 gradient (device constant + host array).
__device__ __constant__ float d_wallLookup_Poly6_3D_Gradient[NVPARTICLES_WCSPH_KERNEL_LOOKUP_SIZE];
static float h_wallLookup_Poly6_3D_Gradient[NVPARTICLES_WCSPH_KERNEL_LOOKUP_SIZE];

//------------------------------------------------------------------------------------------
/// Accumulation policy for buildKernelLookup(): sums the Poly6 kernel value
/// (variable part times constant part) over every sample handed to item().
struct KernelFunctor_Poly6_3D_Kernel
{
	/// Clear the accumulator before a new table entry is summed.
	static void init(float& sum)
	{
		sum = 0.0f;
	}

	/// Add the contribution of one sample at distance r for smoothing length h.
	/// R_ij is part of the common functor signature but is not used here.
	static void item(float& sum, const vec3f& R_ij, float r, float h)
	{
		const float variablePart = sph::Poly6::kernelVariable(h*h, r*r);
		const float constantPart = sph::Poly6::kernelConstant(h);
		sum += variablePart * constantPart;
	}
};

//-----------------------------------------------------------------------------------
/// Accumulation policy for buildKernelLookup(): sums the viscosity kernel
/// Laplacian (variable part times constant part) over every sample handed to item().
struct KernelFunctor_Viscosity_3D_Laplace
{
	/// Clear the accumulator before a new table entry is summed.
	static void init(float& sum)
	{
		sum = 0.0f;
	}

	/// Add the contribution of one sample at distance r for smoothing length h.
	/// R_ij is part of the common functor signature but is not used here.
	static void item(float& sum, const vec3f& R_ij, float r, float h)
	{
		const float variablePart = sph::Viscosity::laplaceVariable(h, r);
		const float constantPart = sph::Viscosity::laplaceConstant(h);
		sum += variablePart * constantPart;
	}
};

//-----------------------------------------------------------------------------------
/// Accumulation policy for buildKernelLookup(): sums the z-component of the
/// Poly6 gradient over every sample handed to item().
struct KernelFunctor_Poly6_3D_Gradient
{
	/// Clear the accumulator before a new table entry is summed.
	static void init(float& sum)
	{
		sum = 0.0f;
	}

	/// Add the contribution of one sample with offset R_ij, distance r and
	/// smoothing length h. Only the z-component of R_ij is used.
	static void item(float& sum, const vec3f& R_ij, float r, float h)
	{
		const float variablePart = sph::Poly6::gradientVariable(h, h*h, r);
		const float constantPart = sph::Poly6::gradientConstant(h);
		// keep the original left-to-right multiplication order.
		sum += (R_ij.z * variablePart) * constantPart;
	}
};

//-----------------------------------------------------------------------------------
/// Fill a lookup table by brute-force summation over a regular lattice of samples.
///
/// For each table entry a probe point is placed on the z-axis, between 0 and
/// 2*ceil(h/dx) lattice cells from the origin, and KernelFunctor::item() is called
/// for every lattice point (spacing dx, indices in [-iradius, iradius) on each axis)
/// that lies closer than h to the probe.
/// Entries are written in reverse: the probe at z=0 ends up in the last slot.
///
/// @param dx         lattice spacing.
/// @param h          smoothing radius.
/// @param tableSize  number of entries to write to outTable.
/// @param outTable   destination array with at least tableSize elements.
template <class KernelFunctor, class T>
void buildKernelLookup(float dx, float h, int tableSize, T* outTable)
{
	const int iradius = (int)ceilf(h/dx);
	const int size = iradius*2;
	const float hPow2 = h*h;
	const int lastEntry = tableSize-1;

	for (int entry=0; entry<tableSize; ++entry)
	{
		// probe height (in lattice cells) for this entry.
		const float fi = size * (float(entry)/lastEntry);
		const vec3f probe = make_vec3f(0, 0, fi);

		T sum;
		KernelFunctor::init(sum);

		for (int iz=-iradius; iz<iradius; ++iz)
		for (int iy=-iradius; iy<iradius; ++iy)
		for (int ix=-iradius; ix<iradius; ++ix)
		{
			const vec3f R_ij = (probe - make_vec3f(ix,iy,iz))*dx;
			const float rPow2 = dot(R_ij, R_ij);

			// skip samples that are not strictly inside the smoothing radius.
			if (!(rPow2 < hPow2))
				continue;

			KernelFunctor::item(sum, R_ij, sqrtf(rPow2), h);
		}

		outTable[lastEntry-entry] = sum;
	}
}

//-----------------------------------------------------------------------------------
/// Linearly interpolate a wall lookup table at the given penetration.
///
/// The penetration is multiplied by the invScale system parameter and divided by
/// twice the smoothing length to obtain a normalized coordinate, which is clamped
/// to [0,1] and mapped onto the table indices [0, NVPARTICLES_WCSPH_KERNEL_LOOKUP_SIZE-1].
/// (The table goes from completely outside to completely inside the wall.)
///
/// @param penetration  penetration depth; negative values are treated as zero.
/// @param table        table with NVPARTICLES_WCSPH_KERNEL_LOOKUP_SIZE entries.
/// @return             interpolated table value.
template <class T>
inline NVPARTICLES_CUDA_EXPORT 
T calcKernelLookup(float penetration, T* table)
{
	const int lastIndex = NVPARTICLES_WCSPH_KERNEL_LOOKUP_SIZE-1;

	// normalize the penetration; negative distance is not supported.
	float normalized = (penetration * NVPARTICLES_SYSTEM_PARAM(invScale)) / (NVPARTICLES_SYSTEM_PARAM(smoothingLength)*2); 
	normalized = max(normalized, 0.f);

	// map onto the table and clamp to the last entry.
	float tablePos = normalized * lastIndex;
	tablePos = min(tablePos, float(lastIndex));

	// the two neighbouring entries (the upper one is clamped at the table end).
	int lower = floorf(tablePos);
	int upper = min(lower+1, lastIndex);

	float fraction = tablePos - lower;
	return (1-fraction)*table[lower] + (fraction)*table[upper];
}

//-----------------------------------------------------------------------------------
/// Compute the pressure force exerted by a wall on a particle.
/// (from Harada et al. - "Smoothed Particle Hydrodynamics on GPUs")
///
/// @param distance  distance to the wall; values <= 0 are treated as 0.
/// @param N         wall normal used as the force direction.
/// @return          zero if the particle is at least half a rest-distance away,
///                  otherwise (separation * N) / deltaTime^2.
inline NVPARTICLES_CUDA_EXPORT 
vec3f computeWallPressureForce(float distance, const vec3f& N)
{
	// negative distances are clamped to the wall surface.
	const float wallDistance = (distance <= 0) ? 0.0f : distance;

	// how far the particle is inside the half-rest-distance shell around the wall.
	const float halfRestDistance = NVPARTICLES_SYSTEM_PARAM(particleRestDistance)/2;
	const float separation = halfRestDistance - wallDistance*NVPARTICLES_SYSTEM_PARAM(invScale);

	if (separation <= 0)
		return make_vec3f(0);

	// the particle mass is intentionally not applied here.
	return (separation * N) / NVPARTICLES_SYSTEM_PARAM(cache.deltaTimePow2);
}

//-----------------------------------------------------------------------------------
/// Calculate the penetration with the boundary walls (taking account of any periodic walls).
///
/// The point is transformed into primitive-space with boundaryMatrixInv and tested
/// against the six half-spaces of the boundary box. Walls whose boundary-mode
/// "wrap" flag is set are skipped. The deepest penetration wins.
///
/// @param Pw             point to test (transformed by boundaryMatrixInv before testing).
/// @param collisionDist  collision distance around the point.
/// @param contactPoint   [out] contact point of the deepest wall, transformed back with
///                       boundaryMatrix; set to Pw when no wall is penetrated.
/// @param contactNormal  [out] normalized normal of the deepest wall; zero vector when
///                       no wall is penetrated.
/// @return               the largest penetration found, or 0 if there is none.
inline static NVPARTICLES_CUDA_EXPORT
float computeWallPenetration(const vec3f Pw, const float collisionDist, vec3f& contactPoint, vec3f& contactNormal)
{
	// transform point into primitive-space.
	vec3f P = NVPARTICLES_SYSTEM_PARAM(boundaryMatrixInv).multiplyPoint(Pw);

	// TODO: transform collision-distance into primitive-space. (creating a non-uniform scaled sphere)
	vec3f collisionRadius = NVPARTICLES_SYSTEM_PARAM(boundaryMatrixInv).multiply(make_vec3f(collisionDist));

	// lengths of the boundary axes, used to scale each penetration result.
	// NOTE(review): presumably this converts primitive-space depths back to world units -- confirm.
	float xlen = length(NVPARTICLES_SYSTEM_PARAM(boundaryMatrix).X());
	float ylen = length(NVPARTICLES_SYSTEM_PARAM(boundaryMatrix).Y());
	float zlen = length(NVPARTICLES_SYSTEM_PARAM(boundaryMatrix).Z());

	// entries for skipped (wrapped) walls are never written; they are only read
	// below when their penetration is > 0, which cannot happen for a skipped wall.
	vec3f Pcontact[6];
	vec3f Ncontact[6];
	float penetration[6] = {0,0,0,0,0,0};

    /// HACK:
    // ignore the positive-y penetration because we will kill particles that leave the domain through the top.
	if (!NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAPYPOS(NVPARTICLES_SYSTEM_PARAM(boundaryMode)))
		penetration[0] = Collision::HalfSpace(P, collisionRadius.y, make_vec4f(0,1,0,1), Pcontact[0], Ncontact[0]) * ylen;
	if (!NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAPYNEG(NVPARTICLES_SYSTEM_PARAM(boundaryMode)))
		penetration[1] = Collision::HalfSpace(P, collisionRadius.y, make_vec4f(0,-1,0,1), Pcontact[1], Ncontact[1]) * ylen;

	if (!NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAPXPOS(NVPARTICLES_SYSTEM_PARAM(boundaryMode)))
		penetration[2] = Collision::HalfSpace(P, collisionRadius.x, make_vec4f(1,0,0,1), Pcontact[2], Ncontact[2]) * xlen;
	if (!NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAPXNEG(NVPARTICLES_SYSTEM_PARAM(boundaryMode)))
		penetration[3] = Collision::HalfSpace(P, collisionRadius.x, make_vec4f(-1,0,0,1), Pcontact[3], Ncontact[3]) * xlen;

	if (!NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAPZPOS(NVPARTICLES_SYSTEM_PARAM(boundaryMode)))
		penetration[4] = Collision::HalfSpace(P, collisionRadius.z, make_vec4f(0,0,1,1), Pcontact[4], Ncontact[4]) * zlen;
	if (!NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAPZNEG(NVPARTICLES_SYSTEM_PARAM(boundaryMode)))
		penetration[5] = Collision::HalfSpace(P, collisionRadius.z, make_vec4f(0,0,-1,1), Pcontact[5], Ncontact[5]) * zlen;

	// pick the wall with the deepest (strictly positive) penetration.
	int maxIndex = -1;
	float maxPenetration = 0;
	for(int i=0; i<6; ++i)
	{
		if (penetration[i] > maxPenetration)
		{
			maxPenetration = penetration[i];
			maxIndex = i;
		}
	}

	if (maxIndex < 0)
	{
		// No wall is penetrated. Do not touch Pcontact/Ncontact: the entries of
		// skipped walls are uninitialized. Report "no contact" explicitly instead.
		contactPoint = Pw;
		contactNormal = make_vec3f(0);
		return 0.0f;
	}

	// transform the winning contact back out of primitive-space.
    contactPoint = NVPARTICLES_SYSTEM_PARAM(boundaryMatrix).multiplyPoint(Pcontact[maxIndex]);
    contactNormal = normalize(NVPARTICLES_SYSTEM_PARAM(boundaryMatrixInv).multiplyTranspose(Ncontact[maxIndex]));

	return maxPenetration;
}

//-----------------------------------------------------------------------------------
/// Iterator to calculate density using the discrete summation approach.
///
/// Used with iterateNeibs(): pre() resets the accumulator, item() is called per
/// neighbour and adds the variable part of the Poly6 kernel, and post() applies
/// the hoisted constants (particle mass and kernel constant) and stores the result.
/// When NVPARTICLES_WCSPH_USE_WALL_WEIGHT is defined, post() also adds the wall's
/// density contribution and initializes the particle's force with the wall forces.
struct CalcDensitySummationIterator : public IteratorFunctorBase3<float4, true>
{
	typedef CalcDensitySummationIterator Iter;
    float totalDensity;     ///< running sum of kernel weights for particle i.
    ParticleData inData;    ///< particle arrays read and written by this pass.

    /// Reset the accumulator for particle index_i.
    static NVPARTICLES_CUDA_EXPORT 
	void pre(Iter &it, uint const index_i)
    {
        it.totalDensity = 0;
    }
	
    /// Accumulate the contribution of neighbour index_j to particle index_i.
    /// Always returns true.
    /// NOTE(review): the return value presumably tells iterateNeibs to keep iterating -- confirm.
    static NVPARTICLES_CUDA_EXPORT 
	bool item(Iter &it, uint const &index_i, uint const &index_j, float3 const& position_i)
    {
        float3 position_j = make_float3(NVPARTICLES_SYSTEM_FETCH(it.inData, position, index_j));
		// boundary-mode type 2: shift the neighbour by the periodic displacement.
		if (NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_TYPE(NVPARTICLES_SYSTEM_PARAM(boundaryMode)) == 2)
			position_j += it.periodicDisplacement;

		// get relative position.
		float3 position_ij = (position_i - position_j) * NVPARTICLES_SYSTEM_PARAM(invScale);
        float rPow2 = dot(position_ij, position_ij);

        if (rPow2 < NVPARTICLES_SYSTEM_PARAM(cache.smoothingLengthPow2))
        {
			// particle-mass and kernel-constants have been hoisted outside of the loop.
			/// CAVEAT: this assumes the same particle-mass for all particles.

			it.totalDensity += sph::Poly6::kernelVariable(NVPARTICLES_SYSTEM_PARAM(cache.smoothingLengthPow2), rPow2);
		}

        return true;
    }

    /// Finalize the density of particle index_i and write it (and optionally the
    /// wall force and the pressure) back to inData.
    static NVPARTICLES_CUDA_EXPORT 
	void post(Iter &it, const uint index_i)
    {
        // get the mass-density at this particle...
        float density_i = (it.totalDensity * NVPARTICLES_SYSTEM_PARAM(particleMass) * NVPARTICLES_SYSTEM_PARAM(cache.densityKernelConstant));

#if defined(NVPARTICLES_WCSPH_USE_WALL_WEIGHT)

		float3 velocity_i = make_float3(it.inData.velocity[index_i]);

		float wallDensity = 0;
		vec3f wallViscosity = make_vec3f(0);
		vec3f wallColorFieldGradient = make_vec3f(0);
		const float wallCollisionRadius = NVPARTICLES_SYSTEM_PARAM(smoothingLength)/NVPARTICLES_SYSTEM_PARAM(invScale);
		// Must be initialized: it is passed to computeWallPressureForce() below even
		// when the boundary-mode branch is skipped and no wall test has been done.
		vec3f wallNormal = make_vec3f(0);
		float wallPenetration = 0;

		if (NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_TYPE(NVPARTICLES_SYSTEM_PARAM(boundaryMode)) & 3)
		{
			// add the density of the wall (if any)...
			vec3f Pcontact;
			wallPenetration = computeWallPenetration(
									make_vec3f(it.position_i.x, it.position_i.y, it.position_i.z), 
									wallCollisionRadius, 
									Pcontact, wallNormal
									);

			if (wallPenetration >= NVPARTICLES_EPSILON)
			{
				wallDensity = NVPARTICLES_SYSTEM_PARAM(particleMass)
						* calcKernelLookup<float>(wallPenetration, d_wallLookup_Poly6_3D_Kernel);

				// viscosity against a wall velocity of zero.
				wallViscosity = (NVPARTICLES_SYSTEM_PARAM(particleMass)/NVPARTICLES_SYSTEM_PARAM(restDensity))
						* NVPARTICLES_SYSTEM_PARAM(viscosity) * (make_vec3f(0)-make_vec3f(velocity_i.x,velocity_i.y,velocity_i.z))
						* calcKernelLookup<float>(wallPenetration, d_wallLookup_Viscosity_3D_Laplace);

				// currently only consumed by the disabled color output below.
				wallColorFieldGradient = (NVPARTICLES_SYSTEM_PARAM(particleMass)/NVPARTICLES_SYSTEM_PARAM(restDensity))
						* calcKernelLookup<float>(wallPenetration, d_wallLookup_Poly6_3D_Gradient) * wallNormal;
			}
		}

		density_i += wallDensity;

		// initialize the internal-forces with the wall pressure!
		vec3f wallForce = computeWallPressureForce(wallCollisionRadius-wallPenetration, wallNormal);// / density_i;
		wallForce += wallViscosity / density_i;
		it.inData.force[index_i] = make_float4(wallForce.x, wallForce.y, wallForce.z, 0);
		
		//it.inData.color[index_i] = make_float4(wallColorFieldGradient.x, wallColorFieldGradient.y, wallColorFieldGradient.z, 1);

#endif
        // store the density.
        it.inData.density[index_i] = density_i;

#if defined(NVPARTICLES_WCSPH_STORE_PRESSURE)
        // Since we have the density already loaded, we can initialize the particle's pressure,
        // potentially reducing register pressure on the computeInternalForces kernel.
        // BUT this may incur a penalty as we are doing an uncoalesced read of BOTH
        // density AND pressure in the next pass. Hence the option.
        it.inData.pressure[index_i] = computePressure(density_i);
#endif
    }
};

//-----------------------------------------------------------------------------------
/// Density kernel: one thread per particle on a 1D grid of 1D blocks.
/// Each in-range thread runs CalcDensitySummationIterator over the neighbours of
/// its particle via iterateNeibs(); threads with index >= numParticles do nothing.
__global__ void computeDensityKernel(uint numParticles, ParticleData inData,
#if defined(NVPARTICLES_WCSPH_USE_NEIBS_LIST)
	SpatialGrid::AdjacencyListData const spatialGridData
#else
	SpatialGrid::SpatialGridData const spatialGridData
#endif
                                 )
{
    // flat global thread index == particle index.
    uint particleIndex = threadIdx.x + (blockIdx.x * blockDim.x);

    if (particleIndex < numParticles)
    {
        CalcDensitySummationIterator densityIt;
        densityIt.inData = inData;

        float3 particlePos = make_float3(NVPARTICLES_SYSTEM_FETCH_NOTEX(inData, position, particleIndex));

        iterateNeibs<CalcDensitySummationIterator, true, 1>(densityIt, particleIndex, particlePos, spatialGridData);
    }
}

//------------------------------------------------------------------------------------------
/// Host-side launcher for computeDensityKernel.
///
/// @param spatialGridData  spatial grid; its adjacencyData member is passed to the kernel
///                         when NVPARTICLES_WCSPH_USE_NEIBS_LIST is defined.
/// @param n                number of particles; nothing is launched when n <= 0.
/// @param inData           particle arrays handed to the kernel.
/// @param stream           CUDA stream to launch on (default stream if omitted).
static void computeDensity(
	SpatialGrid::SpatialGridData spatialGridData, 
	int n, 
	ParticleData inData, 
	cudaStream_t stream=0
	)
{
    // A negative count would be implicitly converted to a huge unsigned value
    // for the kernel's particle count, so reject it along with zero.
    if(n <= 0)
        return;

    // starting block size (the same with and without the neighbour list).
    uint threadsPerBlock = 128;

#ifdef NVPARTICLES_HAS_FERMI
    threadsPerBlock = 224;
#endif

    uint numThreads, numBlocks;
    computeGridSize(n, threadsPerBlock, numBlocks, numThreads);

    while(numBlocks >= 64*1024)
    {
        // have to rescale threadsPerBlock due to too large grid size >=65536
        threadsPerBlock += 32;
        computeGridSize(n, threadsPerBlock, numBlocks, numThreads);
    }

#if defined(NVPARTICLES_WCSPH_USE_NEIBS_LIST)
	const SpatialGrid::AdjacencyListData& spatialData = spatialGridData.adjacencyData;
#else
	const SpatialGrid::SpatialGridData& spatialData = spatialGridData;
#endif

    computeDensityKernel<<<numBlocks, numThreads, 0, stream>>>(n, inData, spatialData);
    NVPARTICLES_CUDA_CHECK_ERROR("Kernel execution failed: computeDensityKernel");
}

//-----------------------------------------------------------------------------------
