/*
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

#ifndef NVPARTICLES_SPATIAL_GRID_CUDA_INLINE_CC_INCLUDED
#define NVPARTICLES_SPATIAL_GRID_CUDA_INLINE_CC_INCLUDED

#include "NvParticlesGridCuda.h"

//-----------------------------------------------------------------------------------
/// NB.
/// There is nothing in this file specific to a particle system.
/// i.e. This can be included in any nvcc-compiled code.
/// The only caveat is that the textures are only visible to kernels
/// within the same object file.

//-----------------------------------------------------------------------------------
/// constants and definitions.
///

// Decoders for a packed boundary-mode value:
//   bits 0-3 : boundary type
//   bits 4-9 : one wrap-enable flag per face (+X, -X, +Y, -Y, +Z, -Z in ascending bit order)
// The macro argument is parenthesized so that expression arguments (e.g. `a|b`)
// are decoded as a whole rather than being split by operator precedence.
#define NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_TYPE(x) ((x)&15)
#define NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAP(x) (((x)>>4)&63)
#define NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAPXPOS(x) ((NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAP(x)&1)!=0)
#define NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAPXNEG(x) ((NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAP(x)&2)!=0)
#define NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAPYPOS(x) ((NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAP(x)&4)!=0)
#define NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAPYNEG(x) ((NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAP(x)&8)!=0)
#define NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAPZPOS(x) ((NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAP(x)&16)!=0)
#define NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAPZNEG(x) ((NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAP(x)&32)!=0)

// Flags OR-ed into the top six bits (26..31) of an adjacency-list entry to record
// through which periodic face the neighbour was reached.
#define NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPZMINUS				(1U<<31)
#define NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPZPLUS				(1U<<30)
#define NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPYMINUS				(1U<<29)
#define NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPYPLUS				(1U<<28)
#define NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPXMINUS				(1U<<27)
#define NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPXPLUS				(1U<<26)
// Mask that clears every wrap flag, leaving only the item index.
// (The "NOWARP" spelling is kept because existing code refers to it by this name.)
#define NVPARTICLES_SPATIAL_GRID_BOUNDARY_NOWARP					(~(NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPXPLUS|NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPXMINUS|NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPYPLUS|NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPYMINUS|NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPZPLUS|NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPZMINUS))

// Access a member of the grid parameter block.
// In the device compilation pass this reads the __constant__ copy; in the host
// pass it reads the host-side mirror.
#ifdef __CUDA_ARCH__
#define NVPARTICLES_SPATIAL_GRID_PARAM(x) d_spatialGridParams.x
#else
#define NVPARTICLES_SPATIAL_GRID_PARAM(x) h_spatialGridParams.x
#endif

// Device (constant memory) and host copies of the grid parameters.
// Both are written by uploadSpatialGridParameters().
__device__ __constant__	SpatialGrid::SpatialGridParameters d_spatialGridParams;
SpatialGrid::SpatialGridParameters h_spatialGridParams;

// Read element `i` of the array member `t` of `a`.
// Both variants currently expand to the same plain array access.
#define NVPARTICLES_SPATIAL_GRID_FETCH(a, t, i) a.t[i]
#define NVPARTICLES_SPATIAL_GRID_FETCH_NOTEX(a, t, i) a.t[i]

//------------------------------------------------------------------------------------------
/// Upload the SpatialGrid's parameters.
///
/// Stores a copy in the host-side mirror (h_spatialGridParams) and queues a
/// host-to-device copy into the constant-memory block (d_spatialGridParams).
///
/// \param  p       Parameters to upload; dereferenced unconditionally, so must not be null.
/// \param  stream  Stream on which the copy to constant memory is issued.
///
/// NB. Declared with plain internal linkage: a declaration inside an `extern "C"`
/// linkage specification may not also name a storage class, so the previous
/// `extern "C" static` combination is rejected by conforming compilers.
/// Callers in this translation unit are unaffected.
///
static
void uploadSpatialGridParameters(const SpatialGrid::SpatialGridParameters* p, cudaStream_t stream=0)
{
    h_spatialGridParams = *p;
    NVPARTICLES_CUDA_SAFE_CALL( cudaMemcpyToSymbolAsync(d_spatialGridParams, p, sizeof(SpatialGrid::SpatialGridParameters), 0, cudaMemcpyHostToDevice, stream));
}

//-----------------------------------------------------------------------------------
/// Base for the neighbour-iteration functors.
///
/// Holds the per-item state filled in by the iterate functions, and provides the
/// conversions between positions, cell coordinates and cell hashes using the
/// current grid parameters.
///
/// T may be float3 or float4 (or any other type which has x,y,z member variables).
/// IS_PERIODIC selects the transform-based mapping (xform / xformInv with bucketCount);
/// otherwise the mapping uses the grid's `low` and `countPerCell` parameters.
///
template<class T, bool IS_PERIODIC>
struct IteratorFunctorBase3
{
	/// displacement for the neighbour cell currently being visited.
	float3 periodicDisplacement;
	/// per-axis sign of the periodic wrap for the neighbour cell currently being visited.
	vec3f periodicSign;
	/// position of the item whose neighbours are being visited.
	float3 position_i;

    /// Map a position to its integer cell coordinate.
    inline static NVPARTICLES_CUDA_EXPORT
    int3 posToCell(T p)
    {
        if (!IS_PERIODIC)
            return SpatialGrid::PosCellFunctor<T>::getCoord(p, NVPARTICLES_SPATIAL_GRID_PARAM(low), NVPARTICLES_SPATIAL_GRID_PARAM(countPerCell));

        return SpatialGrid::PosCellFunctor<T>::getCoord(p, NVPARTICLES_SPATIAL_GRID_PARAM(xformInv), NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount));
    }

    /// Map an integer cell coordinate back to a position.
    inline static NVPARTICLES_CUDA_EXPORT
    T cellToPos(int3 c)
    {
        if (!IS_PERIODIC)
            return SpatialGrid::PosCellFunctor<T>::getPos(c, NVPARTICLES_SPATIAL_GRID_PARAM(low), NVPARTICLES_SPATIAL_GRID_PARAM(countPerCell));

        return SpatialGrid::PosCellFunctor<T>::getPos(c,NVPARTICLES_SPATIAL_GRID_PARAM(xform), NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount));
    }

    /// Map an integer cell coordinate to the hash used to index the cell arrays.
    inline static NVPARTICLES_CUDA_EXPORT
    uint cellToHash(int3 c)
    {
        typedef SpatialGrid::CellHashFunctor<IS_PERIODIC, false> Hasher;
        return Hasher::getHash(c, NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount));
    }

    /// Inverse of cellToHash.
    inline static NVPARTICLES_CUDA_EXPORT
    int3 hashToCell(uint h)
    {
        typedef SpatialGrid::CellHashFunctor<IS_PERIODIC, false> Hasher;
        return Hasher::getCoord(h, NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount));
    }
};

//-----------------------------------------------------------------------------------
/// Visit every item stored in one cell.
///
/// Reads the cell's first/last item indices from `grid` and calls
/// Iterator::item() for each index in [first, last).
///
/// \return false as soon as Iterator::item() returns false;
///         true otherwise (including when the cell is empty).
///
template<class Iterator, typename T>
inline static NVPARTICLES_CUDA_EXPORT
bool iterateCell(
    uint const &cellHash,
    Iterator &it,
    uint const &index_i,
    T const &position_i,
    SpatialGrid::SpatialGridData const &grid
)
{
    volatile uint firstItem = NVPARTICLES_SPATIAL_GRID_FETCH(grid, cellFirstItemIndices, cellHash);

    // An empty cell is marked by a first index of uint(-1).
    if (firstItem == uint(-1))
        return true;

    volatile uint lastItem = NVPARTICLES_SPATIAL_GRID_FETCH(grid, cellLastItemIndices, cellHash);

    uint index_j = firstItem;
    while (index_j < lastItem)
    {
        // the functor asks us to stop by returning false.
        if (!Iterator::item(it, index_i, index_j, position_i))
            return false;

        ++index_j;
    }

    return true;
}

//-----------------------------------------------------------------------------------
/// Determine across which wrap-enabled faces of the grid the cell coordinate (x,y,z) lies.
///
/// \return a per-axis sign: -1 when the coordinate is below zero and wrapping is
///         enabled on the negative face, +1 when it is >= bucketCount and wrapping
///         is enabled on the positive face, 0 otherwise.
///         Always all-zero unless the boundary type is 2.
///
inline static NVPARTICLES_CUDA_EXPORT
vec3f periodicDisplacement(int x, int y, int z)
{
	vec3f sign = make_vec3f(0);

	if (NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_TYPE(NVPARTICLES_SPATIAL_GRID_PARAM(boundaryMode)) == 2)
	{
		// X axis
		if (x < 0 && NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAPXNEG(NVPARTICLES_SPATIAL_GRID_PARAM(boundaryMode)))
			sign.x = -1;
		else if (x >= NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount.x) && NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAPXPOS(NVPARTICLES_SPATIAL_GRID_PARAM(boundaryMode)))
			sign.x = 1;

		// Y axis
		if (y < 0 && NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAPYNEG(NVPARTICLES_SPATIAL_GRID_PARAM(boundaryMode)))
			sign.y = -1;
		else if (y >= NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount.y) && NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAPYPOS(NVPARTICLES_SPATIAL_GRID_PARAM(boundaryMode)))
			sign.y = 1;

		// Z axis
		if (z < 0 && NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAPZNEG(NVPARTICLES_SPATIAL_GRID_PARAM(boundaryMode)))
			sign.z = -1;
		else if (z >= NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount.z) && NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_WRAPZPOS(NVPARTICLES_SPATIAL_GRID_PARAM(boundaryMode)))
			sign.z = 1;
	}

	return sign;
}

//-----------------------------------------------------------------------------------
/// Visit the items in the cell containing position_i and in every cell within
/// RADIUS cells of it along each axis, i.e. a (2*RADIUS+1)^3 block of cells
/// (3x3x3 for RADIUS=1).
///
/// Iterator::pre() is called before the first cell and Iterator::post() after the
/// last one. The traversal stops early (post() is still called) as soon as
/// Iterator::item() returns false.
/// When IS_PERIODIC is set, it.periodicSign and it.periodicDisplacement are
/// refreshed for each cell before its items are visited.
///
template<class Iterator, bool IS_PERIODIC, int RADIUS>
inline static NVPARTICLES_CUDA_EXPORT
void iterateNeibs(
    Iterator &it,
    uint const &index_i,
    float3 const &position_i,
    SpatialGrid::SpatialGridData const& data
    )
{
	it.position_i = position_i;
    Iterator::pre(it, index_i);

	const int3 centre = Iterator::posToCell(make_float4(position_i.x, position_i.y, position_i.z, 0));

	it.periodicDisplacement = make_float3(0);

    // cleared when the functor asks to stop; unwinds all three loops.
    bool keepGoing = true;

    for (int z = centre.z-RADIUS; keepGoing && z <= centre.z+RADIUS; ++z)
    {
        for (int y = centre.y-RADIUS; keepGoing && y <= centre.y+RADIUS; ++y)
        {
            for (int x = centre.x-RADIUS; keepGoing && x <= centre.x+RADIUS; ++x)
            {
				if (IS_PERIODIC)
				{
					// is this neighbor past the grid bounds?
					// (this only says the cell is beyond the edge of the grid domain,
					// not that it is wrapped around the entire particle boundary)
					it.periodicSign = periodicDisplacement(x, y, z);

					// scaled by 2 because our box is from (-1 to 1)
					const vec3f boxShift = NVPARTICLES_SPATIAL_GRID_PARAM(xform).multiply(it.periodicSign*2.0f);
					it.periodicDisplacement = make_float3(boxShift.x, boxShift.y, boxShift.z);
				}

				const uint cellHash = Iterator::cellToHash(make_int3(x,y,z));

                keepGoing = iterateCell<Iterator, float3>(cellHash, it, index_i, position_i, data);
            }
        }
    }

    Iterator::post(it, index_i);
}

#if defined(NVPARTICLES_SPATIAL_GRID_USE_ADJACENCY_LIST)

//------------------------------------------------------------------------------------------
/// Iterator to build adjacency data.
///
/// For each neighbour visited, item() appends the neighbour's index (with the
/// periodic wrap flags OR-ed into its top bits) to the adjacency list of item
/// index_i. `counter` keeps counting past adjacencyListMaxItems so that the
/// caller can detect lists that overflowed; only the first adjacencyListMaxItems
/// entries are stored.
///
/// post() terminates a list that is not full with 0xffffffff. This is done for
/// both storage layouts, because the reader stops on that value in either layout.
///
struct BuildNeibsIterator3 : public IteratorFunctorBase3<float4, true>
{
    typedef BuildNeibsIterator3 Iter;
    SpatialGrid::AdjacencyListData data;
    uint counter;
    float4* positions;

    /// Reset the neighbour count before the traversal starts.
    inline static NVPARTICLES_CUDA_EXPORT
	void pre(Iter& it, uint const &index_i)
    {
        it.counter = 0;
    }

    /// Record neighbour index_j for item index_i. Always returns true (never stops the traversal).
    inline static NVPARTICLES_CUDA_EXPORT
	bool item(Iter& it, uint const &index_i, uint const &index_j, float3 const &position_i)
    {
		if (it.counter < it.data.adjacencyListMaxItems)
		{
			// store the periodic sign within the adjacentIndex (so we can reconstruct later)
			uint indexWrap = index_j;

			if (it.periodicSign.x == 1)
				indexWrap |= NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPXPLUS;
			else if (it.periodicSign.x == -1)
				indexWrap |= NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPXMINUS;
			if (it.periodicSign.y == 1)
				indexWrap |= NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPYPLUS;
			else if (it.periodicSign.y == -1)
				indexWrap |= NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPYMINUS;
			if (it.periodicSign.z == 1)
				indexWrap |= NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPZPLUS;
			else if (it.periodicSign.z == -1)
				indexWrap |= NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPZMINUS;

#if defined(NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE)

			const uint lane = index_i/NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE;
			const uint offset = index_i&(NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE-1);
            it.data.adjacencyListPtr[lane * it.data.adjacencyListPitch + it.counter*NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE + offset] = indexWrap;
#else
			it.data.adjacencyListPtr[index_i * it.data.adjacencyListPitch + it.counter] = indexWrap;
#endif
		}

		// count every neighbour, even those that did not fit in the list.
	    it.counter++;

        return true;
    }

    /// Terminate the list of item index_i if it is not already full.
    inline static NVPARTICLES_CUDA_EXPORT
	void post(Iter& it, uint index_i)
    {
        // invalidate any remaining neighbours.
        if (it.counter < it.data.adjacencyListMaxItems)
        {
#if defined(NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE)
			const uint lane = index_i/NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE;
			const uint offset = index_i&(NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE-1);

			it.data.adjacencyListPtr[lane * it.data.adjacencyListPitch + it.counter*NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE + offset] = 0xffffffff;
#else
			// same slot that item() would have written next in the non-interleaved layout.
			it.data.adjacencyListPtr[index_i * it.data.adjacencyListPitch + it.counter] = 0xffffffff;
#endif
        }
    }
};

//-----------------------------------------------------------------------------------
/// Iterate over the neighbours of an item using a previously built adjacency list.
///
/// Calls Iterator::pre(), then Iterator::item() for each stored neighbour until
/// the terminator value uint(-1) is read, the loop bound is reached, or item()
/// returns false; finally calls Iterator::post().
///
/// NB. RADIUS is unused here (it is applied when the list is built).
/// IS_PERIODIC controls whether the wrap flags packed into each entry are decoded
/// and stripped; when it is false the entry is passed to item() unchanged.
///
/// NB. the list pitch is read from the grid parameter block
/// (NVPARTICLES_SPATIAL_GRID_PARAM), not from the adjacencyData argument.
///
template<class Iterator, bool IS_PERIODIC, int RADIUS>
inline static NVPARTICLES_CUDA_EXPORT
void iterateNeibs(
    Iterator &it,
    uint const &index_i,
    float3 const &position_i,
    SpatialGrid::AdjacencyListData const& adjacencyData
)
{
	it.position_i = position_i;
    Iterator::pre(it, index_i);

#if defined(NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE)

	// interleaved layout: the entries of one item are INTERLEAVE elements apart,
	// starting at (lane*pitch + offset).
	const uint lane = index_i/NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE;
	const uint offset = index_i&(NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE-1);

    for(uint counter = 0; counter < NVPARTICLES_SPATIAL_GRID_PARAM(adjacencyData.adjacencyListPitch); counter += NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE)
	{
        const uint index_j = NVPARTICLES_SPATIAL_GRID_FETCH(adjacencyData, adjacencyListPtr, lane*NVPARTICLES_SPATIAL_GRID_PARAM(adjacencyData.adjacencyListPitch) + counter + offset);
#else
    // non-interleaved layout: the entries of one item are contiguous, starting at (index_i*pitch).
    for(uint counter = 0; counter < adjacencyData.adjacencyListMaxItems; counter++)
	{
        const uint index_j = NVPARTICLES_SPATIAL_GRID_FETCH(adjacencyData, adjacencyListPtr, index_i*NVPARTICLES_SPATIAL_GRID_PARAM(adjacencyData.adjacencyListPitch)+counter);
#endif

        // uint(-1) marks the end of this item's list.
        if(index_j == uint(-1))
            break;

		uint indexNoWrap_j = index_j;
		vec3f periodicSign = make_vec3f(0);
		it.periodicDisplacement = make_float3(0);

		if (IS_PERIODIC)
		{
			// decode the per-axis wrap flags stored in the top bits of the entry.
			if (index_j & NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPXPLUS)
				periodicSign.x = 1;
			else if (index_j & NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPXMINUS)
				periodicSign.x = -1;
			if (index_j & NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPYPLUS)
				periodicSign.y = 1;
			else if (index_j & NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPYMINUS)
				periodicSign.y = -1;
			if (index_j & NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPZPLUS)
				periodicSign.z = 1;
			else if (index_j & NVPARTICLES_SPATIAL_GRID_BOUNDARY_WRAPZMINUS)
				periodicSign.z = -1;

			// remove the wrap-bits
			indexNoWrap_j = index_j&NVPARTICLES_SPATIAL_GRID_BOUNDARY_NOWARP;

			// reconstruct the periodicDisplacement.
			// (the sign is scaled by 2 before being transformed by xform)
			vec3f boxDisplacement = NVPARTICLES_SPATIAL_GRID_PARAM(xform).multiply(periodicSign*2.0f);
			it.periodicDisplacement = make_float3(boxDisplacement.x, boxDisplacement.y, boxDisplacement.z);
		}

        // the functor asks us to stop by returning false.
        if (!Iterator::item(it, index_i, indexNoWrap_j, position_i))
            break;
    }

    Iterator::post(it, index_i);
}

//------------------------------------------------------------------------------------------
/// Kernel to build adjacency data.
///
/// Each thread builds the adjacency list of one item per pass by traversing the
/// grid with iterateNeibs(), and tracks the sum and maximum of the neighbour
/// counts it has seen.
///
/// Launch layout: 1D blocks of exactly BLOCK_SIZE threads (the item index and the
/// shared-memory arrays are both sized with BLOCK_SIZE).
/// The halving reduction only covers every element when blockDim.x is a power of two.
///
/// \param  nGrids       Number of passes; pass `grid` handles items offset by grid*gridSize.
/// \param  gridSize     Number of items covered by one pass.
/// \param  start        Index of the first item.
/// \param  count        Number of items.
/// \param  positions    Item positions, indexed by item index.
/// \param  uniformGrid  Grid to traverse; its adjacencyData receives the lists.
/// \param  maxNeibsPtr  Receives (via atomicMax) the largest neighbour count.
/// \param  sumNeibsPtr  Receives (via atomicAdd) the total neighbour count.
///
/// NB. the reduction into maxNeibsPtr / sumNeibsPtr is only compiled when
/// NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE is defined.
///
template<int BLOCK_SIZE, class Iterator, bool IS_PERIODIC, int RADIUS>
__global__
void gridBuildNeibsKernel (
    int nGrids,
    int gridSize,
    int start,
    int count,
    float4* positions,
    SpatialGrid::SpatialGridData uniformGrid,
    uint* maxNeibsPtr,
    uint* sumNeibsPtr
)
{
    uint sumNeibs = 0;
    uint maxNeibs = 0;

    for (int grid = 0; grid < nGrids; ++grid)
    {
        const int index = (threadIdx.x) + (blockIdx.x*BLOCK_SIZE) + (grid*gridSize) + start;
        const bool valid = (index-start < count);

        // NB. break rather than return: every thread must still reach the
        // __syncthreads() calls in the reduction below.
        if (!valid)
            break;

        Iterator it;
        it.counter = 0;
        it.positions = positions;
        it.data = uniformGrid.adjacencyData;

        float3 position_i = make_float3(positions[index]);

        iterateNeibs<Iterator, IS_PERIODIC, RADIUS> (it, index, position_i, uniformGrid);

        sumNeibs += it.counter;
        maxNeibs = max(maxNeibs, it.counter);
    }


#if defined(NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE)

	// Shared memory reduction...
	__shared__ volatile uint smem_sumNeibs[BLOCK_SIZE];
	__shared__ volatile uint smem_maxNeibs[BLOCK_SIZE];

	const uint tid = threadIdx.x;

	smem_sumNeibs[tid] = sumNeibs;
	smem_maxNeibs[tid] = maxNeibs;
	__syncthreads();

    for (uint i=blockDim.x/2; i>0; i/=2)
	{
		if (tid < i)
		{
            smem_sumNeibs[tid] += smem_sumNeibs[tid + i];

            // get max...
            // (compared as uint: the counts are integers, and a float round-trip
            // is inexact above 2^24)
			const uint a = smem_maxNeibs[tid];
			const uint b = smem_maxNeibs[tid + i];
			if (b > a)
				smem_maxNeibs[tid] = b;
		}
		__syncthreads();
	}

	// one atomic per block publishes the block's result.
	if (tid == 0)
	{
		atomicMax(maxNeibsPtr, smem_maxNeibs[0]);
        atomicAdd(sumNeibsPtr, smem_sumNeibs[0]);
	}
#endif
}

// Device-side accumulators for the adjacency build: gridBuildNeibs_CUDA zeroes
// them, passes their addresses to gridBuildNeibsKernel and reads them back.
__device__ uint d_globalMaxNeibs;
__device__ uint d_globalSumNeibs;

//------------------------------------------------------------------------------------------
/// Build adjacency data using CUDA.
///
/// Launches gridBuildNeibsKernel over `count` items starting at `start`, then
/// reads back the maximum and total neighbour counts and prints a warning if the
/// maximum exceeds NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_MAX_NEIBS.
///
/// The call is blocking: the stream is synchronized before the launch and again
/// before the statistics are inspected.
///
/// \param  start      Index of the first item.
/// \param  count      Number of items; nothing is done when <= 0.
/// \param  positions  Device pointer to the item positions.
/// \param  accel      Grid data (its adjacencyData receives the lists).
/// \param  stream     Stream used for the memsets, the kernel and the read-back.
///
template<class Iterator>
static
void gridBuildNeibs_CUDA (
    int start,
    int count,
    float4* positions,
    SpatialGrid::SpatialGridData accel,
    cudaStream_t stream=0
)
{
    if (count <= 0)
        return;

    NVPARTICLES_CUDA_SAFE_CALL( cudaStreamSynchronize(stream) );

    unsigned int nThreads, nBlocks;
#if defined(NVPARTICLES_HAS_FERMI)
	const int BLOCK_SIZE = 256;
#else
    const int BLOCK_SIZE = 128;
#endif

    // NB. only nBlocks is used; the block dimension is always BLOCK_SIZE because
    // the kernel is templated on it.
    computeGridSize(count, BLOCK_SIZE, nBlocks, nThreads);
	dim3 dimBlock = dim3( BLOCK_SIZE, 1, 1 );
	dim3 dimGrid = dim3( nBlocks, 1, 1 );

	const int gridSize = BLOCK_SIZE * nBlocks;
	const int nGrids = (count + gridSize-1) / gridSize;

    // reset the device-side statistics.
    unsigned int* maxNeibsPtr = 0;
    NVPARTICLES_CUDA_SAFE_CALL( cudaGetSymbolAddress((void**)&maxNeibsPtr, d_globalMaxNeibs) );
    NVPARTICLES_CUDA_SAFE_CALL( cudaMemsetAsync(maxNeibsPtr, 0, sizeof(unsigned int), stream) );

    unsigned int* sumNeibsPtr = 0;
    NVPARTICLES_CUDA_SAFE_CALL( cudaGetSymbolAddress((void**)&sumNeibsPtr, d_globalSumNeibs) );
    NVPARTICLES_CUDA_SAFE_CALL( cudaMemsetAsync(sumNeibsPtr, 0, sizeof(unsigned int), stream) );

	// invalidate the neighbours list.
	///NVPARTICLES_CUDA_SAFE_CALL( cudaMemsetAsync(accel.adjacencyData.adjacencyListPtr, 0xffffffff, accel.adjacencyData.adjacencyListSize * sizeof(uint), stream) );

	gridBuildNeibsKernel<BLOCK_SIZE, Iterator, true, 1> <<< dimGrid, dimBlock, 0, stream>>> (
        nGrids,
        gridSize,
        start,
 		count,
 		positions,
 		accel,
        maxNeibsPtr,
        sumNeibsPtr
 		);
    NVPARTICLES_CUDA_CHECK_ERROR("gridBuildNeibsKernel");
    NVPARTICLES_CUDA_SAFE_CALL( cudaStreamSynchronize(stream) );

    // read the statistics back; the values are only valid after the final synchronize.
    unsigned int maxNeibs = 0;
    NVPARTICLES_CUDA_SAFE_CALL( cudaMemcpyAsync(&maxNeibs, maxNeibsPtr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );

    unsigned int sumNeibs = 0;
    NVPARTICLES_CUDA_SAFE_CALL( cudaMemcpyAsync(&sumNeibs, sumNeibsPtr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );

    NVPARTICLES_CUDA_SAFE_CALL( cudaStreamSynchronize(stream) );

    if (maxNeibs > NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_MAX_NEIBS)
    {
        // %u: maxNeibs and sumNeibs/count are both unsigned int.
        printf("Warning: Too many neibs! maxNeibs(%u) maxAllowed(%d) avgNeibs(%u)\n", maxNeibs, NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_MAX_NEIBS, sumNeibs/count);
    }
}


//------------------------------------------------------------------------------------------
/// Debug helper: copy the adjacency list to the host and print the neighbours of
/// `count` items starting at `start`, one line per item.
///
/// Entries equal to -1 are skipped; all other entries are printed as stored.
/// The stream is synchronized before the (blocking) copy.
/// Does nothing if the host buffer cannot be allocated.
///
/// NB. the list is always indexed with the interleaved layout.
///
inline
void gridDumpNeibs(
    const char* title,
    int start,
    int count,
    SpatialGrid::SpatialGridData grid,
    cudaStream_t stream=0)
{
    const SpatialGrid::AdjacencyListData& adjacencyData = grid.adjacencyData;

    int* h_neibs = (int*)malloc(adjacencyData.adjacencyListSize * sizeof(int));
    if (!h_neibs)
        return;

    NVPARTICLES_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
    NVPARTICLES_CUDA_SAFE_CALL(cudaMemcpy(h_neibs, adjacencyData.adjacencyListPtr, adjacencyData.adjacencyListSize * sizeof(int), cudaMemcpyDeviceToHost));

    if (title)
        printf("%s:\n", title);

    for (int n = 0; n < count; ++n)
    {
        const int index_i = start + n;

        printf("neibs[%d]: ", index_i);

        const int lane = index_i/NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE;
        const int offset = index_i&(NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE-1);

        for (int slot = 0; slot < adjacencyData.adjacencyListMaxItems; ++slot)
        {
            const int index_j = h_neibs[lane*adjacencyData.adjacencyListPitch + slot*NVPARTICLES_SPATIAL_GRID_ADJACENCY_LIST_INTERLEAVE + offset];

            if (index_j == -1)
                continue;

            printf("%d ", index_j);
        }
        printf("\n");
    }

    free(h_neibs);
}

//------------------------------------------------------------------------------------------
/// Build adjacency data on CPU.
///
/// Traverses the grid for every item in [0, numItems) with a freshly constructed
/// iterator whose `data` is set to the grid's adjacency data.
///
/// \param  numItems   Number of items.
/// \param  positions  Item positions, indexed by item index (dereferenced on the host).
/// \param  data       Grid data (its adjacencyData receives the lists).
///
/// The loop index is unsigned to match numItems and the index type taken by
/// iterateNeibs(), avoiding a signed/unsigned comparison.
///
/// NOTE(review): unlike gridBuildNeibsKernel, the iterator's `positions` member is
/// not assigned here - confirm that no iterator used with this path reads it.
///
template<class Iterator>
static
void gridBuildNeibs_CPU (
    uint numItems,
    float4* positions,
    SpatialGrid::SpatialGridData data
)
{
    for(uint index_i=0; index_i<numItems; ++index_i)
    {
        Iterator it;
        it.data = data.adjacencyData;
        float3 position_i = make_float3(positions[index_i]);

        iterateNeibs<Iterator, true, 1> (it, index_i, position_i, data);
    }
}

#endif // defined(NVPARTICLES_SPATIAL_GRID_USE_ADJACENCY_LIST)


//------------------------------------------------------------------------------------------
/// Kernel arguments.
///
/// Bundles the inputs of the grid sort kernels so they can be read from a
/// constant-memory variable (via NVPARTICLES_KERNEL_ARG) instead of being passed
/// as kernel launch arguments.
///
struct GridSortDataKernelParams
{
	// source buffers.
	ParticleData inData;
	// destination buffers.
	ParticleData outData;
    // grid whose sorted item indices drive the reordering.
    SpatialGrid::SpatialGridData uniformGrid;
    // written by gridSortDataIndexKernel as outSortedIndices[sortedIndex] = i;
    // gridSortData passes 0 unless NVPARTICLES_WCSPH_USE_MULTI_LEVEL is defined.
    uint* outSortedIndices;

	// Intentionally does nothing.
	// NOTE(review): presumably kept empty because the type is used for a
	// __constant__ variable - confirm before adding any initialisation here.
	GridSortDataKernelParams()
	{
	}

	// Member-wise copy of the given values.
	GridSortDataKernelParams(
		const ParticleData _inData,
		const ParticleData _outData,
        const SpatialGrid::SpatialGridData _uniformGrid,
        uint* _outSortedIndices
        )
		:
			inData(_inData),
			outData(_outData),
            uniformGrid(_uniformGrid),
            outSortedIndices(_outSortedIndices)
	{
	}
};

// Device (constant memory) and host copies of the sort-kernel arguments.
// gridSortData() fills the host copy and uploads it to the device copy before launching.
__device__ __constant__	GridSortDataKernelParams d_GridSortDataKernelParams;
static GridSortDataKernelParams h_GridSortDataKernelParams;

// Access a member of the sort-kernel arguments: the constant-memory copy in the
// device compilation pass, the host copy otherwise.
// The macro is local to the sort code and is #undef'd after gridSortData().
#ifdef NVPARTICLES_KERNEL_ARG
#error NVPARTICLES_KERNEL_ARG already defined!
#endif
#ifdef __CUDA_ARCH__
#define NVPARTICLES_KERNEL_ARG(x) d_GridSortDataKernelParams.x
#else
#define NVPARTICLES_KERNEL_ARG(x) h_GridSortDataKernelParams.x
#endif

//------------------------------------------------------------------------------------------
/// Reorder multiple buffers using the sorted item order stored in the grid.
///
/// One thread per item. The buffers and the grid are not kernel arguments; they
/// are read through NVPARTICLES_KERNEL_ARG from the sort-kernel parameter block.
/// For each item the thread looks up the item's sorted index and, if that index
/// is not negative, calls F::exchange() with it.
///
/// \param  start       The start in the buffer, (and the offset for the sorted item).
/// \param  count       The number of items to process.
///
template<class F, class T>
__global__
void gridSortDataKernel(const int start, const int count)
{
    const int i = (blockIdx.x * blockDim.x) + threadIdx.x + start;
    const bool inRange = (i-start < count);

    if (inRange)
    {
        const int sortedIndex = NVPARTICLES_SPATIAL_GRID_GET_SORTED_ITEM_INDEX(NVPARTICLES_KERNEL_ARG(uniformGrid), i);

        // a negative sorted index is treated as out of bounds and silently skipped.
        if (sortedIndex >= 0)
        {
            F::exchange(sortedIndex, NVPARTICLES_KERNEL_ARG(inData), i, NVPARTICLES_KERNEL_ARG(outData), NVPARTICLES_KERNEL_ARG(uniformGrid));
        }
    }
}

//------------------------------------------------------------------------------------------
/// Record, for every item, which item ends up at each sorted slot:
/// outSortedIndices[sortedIndex] = i.
///
/// One thread per item. The grid and the output array are read through
/// NVPARTICLES_KERNEL_ARG from the sort-kernel parameter block.
/// Items whose sorted index is negative are skipped.
///
/// \param  start       Index of the first item.
/// \param  count       The number of items to process.
///
__global__
void gridSortDataIndexKernel(const int start, const int count)
{
    const int i = (blockIdx.x * blockDim.x) + threadIdx.x + start;
    const bool inRange = (i-start < count);

    if (inRange)
    {
        const int sortedIndex = NVPARTICLES_SPATIAL_GRID_GET_SORTED_ITEM_INDEX(NVPARTICLES_KERNEL_ARG(uniformGrid), i);

        // a negative sorted index is treated as out of bounds and silently skipped.
        if (sortedIndex >= 0)
        {
            NVPARTICLES_KERNEL_ARG(outSortedIndices[sortedIndex]) = i;
        }
    }
}

//------------------------------------------------------------------------------------------
/// Reorder `count` items starting at `start` from inData into outData using the
/// sorted order stored in uniformGrid.
///
/// The kernel inputs are packed into the host parameter block, uploaded to the
/// constant-memory copy on `stream`, and the sort kernel(s) are launched on the
/// same stream with blocks sized by computeGridSize(count, 256, ...).
/// When NVPARTICLES_WCSPH_USE_MULTI_LEVEL is defined, gridSortDataIndexKernel is
/// launched first and writes into outData.sortedIndex.
/// Nothing is done when count <= 0.
///
template<class DataSortFunctor, class T>
inline static
void gridSortData(const int start, const int count, T inData, T outData, SpatialGrid::SpatialGridData uniformGrid, cudaStream_t stream=0)
{
    if (count <= 0)
        return;

    uint threadsPerBlock, blockCount;
    computeGridSize(count, 256, blockCount, threadsPerBlock);

    // the index output is only wired up for the multi-level build.
#if defined(NVPARTICLES_WCSPH_USE_MULTI_LEVEL)
    uint* const sortedIndexBuffer = outData.sortedIndex;
#else
    uint* const sortedIndexBuffer = 0;
#endif

    h_GridSortDataKernelParams = GridSortDataKernelParams(inData, outData, uniformGrid, sortedIndexBuffer);

    // the kernels read their inputs from constant memory, so upload before launching.
	NVPARTICLES_CUDA_SAFE_CALL( cudaMemcpyToSymbolAsync(d_GridSortDataKernelParams, &h_GridSortDataKernelParams, sizeof(GridSortDataKernelParams), 0, cudaMemcpyHostToDevice, stream) );

#if defined(NVPARTICLES_WCSPH_USE_MULTI_LEVEL)
    gridSortDataIndexKernel<<< blockCount, threadsPerBlock, 0, stream>>> (start, count);
    NVPARTICLES_CUDA_CHECK_ERROR("Kernel execution failed: gridSortDataIndexKernel");
#endif

    gridSortDataKernel<DataSortFunctor, T><<< blockCount, threadsPerBlock, 0, stream>>> (start, count);
    NVPARTICLES_CUDA_CHECK_ERROR("Kernel execution failed: gridSortDataKernel");
}

#undef NVPARTICLES_KERNEL_ARG

/// Default functor for assigning an item to a grid cell.
///
/// Items with p.w <= 0 are treated as dead and sent to an extra "hidden" cell
/// whose index is bucketCount.x*bucketCount.y*bucketCount.z; all other items get
/// the hash of the cell containing their position (periodic mapping).
///
struct DefaultUniformGridAssignment
{
    static inline NVPARTICLES_CUDA_EXPORT
    uint cell(int index, float4 p)
    {
        typedef IteratorFunctorBase3<float4, true> GridMapping;

        if (p.w <= 0.f)
        {
            // this is a hack to make an extra "hidden" grid level.
            // a dead particle is parked in the hidden level for later rebirth!
            return NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount.x)*NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount.y)*NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount.z);
        }

        const int3 coord = GridMapping::posToCell(p);
        return GridMapping::cellToHash(coord);
    }
};

//-----------------------------------------------------------------------------------
/// Assign each item to a cell.
///
/// One thread per item: thread n (n < count) handles item start+n, asks
/// Functor::cell() for the item's cell index and stores it in the grid data.
/// (With the default functor, DEAD items end up in the hidden cell.)
///
template <class T, typename Functor>
__global__
void gridInsertKernel(int start, int count, T *positions, SpatialGrid::SpatialGridData spatialGridData)
{
    const uint threadIndex = (blockIdx.x * blockDim.x) + threadIdx.x;

    if (threadIndex < count)
    {
        const uint i = threadIndex + start;
        const uint itemCellIndex = Functor::cell(i, positions[i]);

        NVPARTICLES_SPATIAL_GRID_SET_ITEM_CELL_INDEX(spatialGridData, i, itemCellIndex);
    }
}

//------------------------------------------------------------------------------------------
/// Assign `count` items starting at `start` to grid cells by launching
/// gridInsertKernel on `stream`.
///
/// \param  start            Index of the first item.
/// \param  count            Number of items; nothing is done when <= 0
///                          (same guard as the other launch wrappers in this file,
///                          so a negative count never reaches the launch configuration).
/// \param  d_particlesPos   Device pointer to the item positions (asserted non-null).
/// \param  spatialGridData  Grid data; its itemCellIndices is asserted non-null.
/// \param  stream           Stream on which the kernel is launched.
///
template <typename Functor>
static
void gridInsert(int start, int count, float4 *d_particlesPos, SpatialGrid::SpatialGridData spatialGridData, cudaStream_t stream=0)
{
    if(count <= 0)
        return;
    assert(d_particlesPos);
    assert(spatialGridData.itemCellIndices);

    uint numThreads, numBlocks;
    computeGridSize(count, 256, numBlocks, numThreads);

    gridInsertKernel <float4, Functor> <<< numBlocks, numThreads, 0, stream >>>(
        start, count, d_particlesPos,
        spatialGridData);
    // report the kernel that was actually launched.
    NVPARTICLES_CUDA_CHECK_ERROR("Kernel execution failed: gridInsertKernel");
}

//------------------------------------------------------------------------------------------
/// Update grid
///
/// Rebuilds the spatial grid for `count` items starting at `start`:
///   1. upload the grid parameters,
///   2. assign every item to a cell (gridInsert with GridInsertFunctor),
///   3. sort the items by cell (skipped when count <= 0),
///   4. compute the first/last item index of every cell.
/// `count` is clamped to maxParticles. Nothing is done when `grid` is null.
/// When dumpCount > 0 the intermediate and final grid contents are dumped
/// (the stream is synchronized before the first dump).
///
/// `positions` is reinterpreted as an array of float4.
///
template <typename GridInsertFunctor>
inline static
void updateSpatialGrid(ParticleGrid* grid, int start, int count, int maxParticles, float* positions, cudaStream_t stream=0, int dumpCount=0)
{
    if (count > maxParticles)
        count = maxParticles;

    if (!grid)
        return;

    const bool wantDump = (dumpCount > 0);

    uploadSpatialGridParameters(grid->getParameters(), stream);

    SpatialGrid::SpatialGridData spatialGridData = grid->deviceData();

    // each step has its own scope so that its timer only covers that step.
    {
        NVPARTICLES_SCOPED_TIMER("gridInsert", stream);

        gridInsert<GridInsertFunctor>(start, count, (float4*)positions, spatialGridData, stream);
    }

    if (wantDump)
    {
        NVPARTICLES_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
#if !defined(NVPARTICLES_SPATIAL_GRID_USE_CUDPP) && !defined(NVPARTICLES_SPATIAL_GRID_USE_THRUST)
        grid->d_particlesBucketIdMem.DumpAs<uint2>("uniformGrid.items[cell,oldIdx]", start+count+4, (start+count)/dumpCount);
#else
        grid->d_particlesBucketIdMem.DumpAs<uint>("uniformGrid.items[cell]", start+count+4, (start+count)/dumpCount);
        grid->d_particlesSortedIndexMem.DumpAs<uint>("uniformGrid.items[oldIdx]", start+count+4, (start+count)/dumpCount);
#endif
    }

    if (count > 0)
    {
        NVPARTICLES_SCOPED_TIMER("gridSort", stream);

        SpatialGrid::gridSort(start, count, maxParticles, grid->numCells, grid->deviceData(), stream);

        // the device data has changed, so any host copy is now stale.
        grid->_isGridDataCopiedToHost = false;
    }

    {
        // get the ranges for ALL particles (even potentially dead ones)
        NVPARTICLES_SCOPED_TIMER("gridComputeCellRanges", stream);

        SpatialGrid::gridComputeCellRanges(start, count, spatialGridData.itemCellIndices, grid->numCells, spatialGridData.cellFirstItemIndices, spatialGridData.cellLastItemIndices, stream);
    }

    if (wantDump)
    {
        grid->dump(start+count+4, (start+count)/dumpCount, (long)stream);
    }
}

//-----------------------------------------------------------------------------------
#endif // NVPARTICLES_SPATIAL_GRID_CUDA_INLINE_CC_INCLUDED
