/*
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

#include "Wcsph.h"

#include "NvParticlesConfig.h"
#include "NvParticlesProfiler.h"

#include "gl_utils.h"
#include "cuda_utils.h"
#include "cuda_std_utils.h"
#include "math_utils_cuda.h"

#include "NvParticlesParticleModifier.h"
#include "NvParticlesForces.h"
#include "NvParticlesGrid.h"
#include "NvParticlesPrimitives.h"
#include "NvParticlesGridCuda.h"

// Compile-time feature switches for the WCSPH solver.
// A feature is tested with `#if defined(...)`; comment its line out to disable it.
#define NVPARTICLES_HAS_FERMI
//#define NVPARTICLES_WCSPH_USE_NEIBS_LIST // pre-calculate the adjacent particles.
#define NVPARTICLES_WCSPH_USE_TAG_SURFACE // determine the surface using the mass-center.
#define NVPARTICLES_WCSPH_USE_SURFACE_TENSION // use color-field to create artificial surface-tension.
//#define NVPARTICLES_WCSPH_USE_XSPH // calculate xsph velocity to smooth velocities.
#define NVPARTICLES_WCSPH_STORE_PRESSURE // cache the pressure to reduce cuda-kernel register-pressure.
#define NVPARTICLES_WCSPH_USE_MULLER_PRESSURE // use faster symmetric pressure kernel.
#define NVPARTICLES_WCSPH_USE_LEAPFROG_EULER // use leapfrog-euler integration.
#define NVPARTICLES_WCSPH_USE_ADAPTIVE_TIMESTEP // use an adaptive timestep.
#define NVPARTICLES_WCSPH_USE_ARTIFICIAL_VISCOSITY // use the artificial viscosity from muller
#define NVPARTICLES_WCSPH_USE_WALL_WEIGHT // use the harada wall-weight calc.

// Element accessors: read element `i` of member array `t` of the struct `a`.
// Both variants currently expand to the same plain array read.
#define NVPARTICLES_SYSTEM_FETCH(a, t, i) a.t[i]
#define NVPARTICLES_SYSTEM_FETCH_NOTEX(a, t, i) a.t[i]

namespace Easy
{
namespace NvParticles
{
namespace Wcsph
{

//------------------------------------------------------------------------------------------
/// Bundle of raw per-particle array pointers used by the solver routines.
///
/// Solver::evaluate() fills two instances (the unsorted set and the sorted set)
/// from the device pointers of the locked particle buffers and passes them to
/// routines such as computeWrap() and gridSortData(). The struct only carries
/// pointers; nothing in this file allocates or frees memory through it.
struct ParticleData
{
    // "id" buffer.
    uint* id;
    // "position" buffer; a w component of 0 marks a dead particle
    // (see UniformGridAssignmentFunctor::cell).
    float4* position;
    // "color" buffer.
    float4* color;
    // "velocity" buffer.
    float4* velocity;
    // "birthTime" buffer.
    float* birthTime;
    // "velocityPrediction" buffer.
    float4* veleval;
    // "static-color" buffer.
    float4* staticColor;
    // "force" buffer.
    float4* force;
    // "density" buffer.
    float* density;

#if defined(NVPARTICLES_WCSPH_STORE_PRESSURE)
    // "pressure" buffer (only present when pressure caching is compiled in).
    float* pressure;
#endif

#if defined(NVPARTICLES_WCSPH_USE_XSPH)
    // "xsphVelocity" buffer (only present when XSPH is compiled in).
    float4* xsphVelocity;
#endif

    // "tag" buffer.
	uint* tag;

    // Mapped pointer of the registered OpenGL displacement resource, or null when
    // no displacement resource is configured.
    float4* displacement;
    // Data pointer of the solver's field buffer, or null when glTexDisplacementSize <= 0.
    // NOTE(review): the visible part of evaluate() assigns this only on the unsorted set.
    float* field;

    // "warpedPosition" buffer.
    // NOTE(review): the visible part of evaluate() assigns this only on the unsorted set.
    float4* warpedPosition;
};

//-----------------------------------------------------------------------------------
/// Moves one particle's sorted attributes from the source set to the destination set.
///
struct ParticleDataSorter
{
    /// Copy particle `index` of `inData` into slot `outIndex` of `outData`.
    ///
    /// Only id, position, velocity, veleval and birthTime are transferred;
    /// color is deliberately left out (see the disabled line below).
    /// `grid` is accepted but not used here.
    static NVPARTICLES_CUDA_EXPORT void exchange(uint index, const ParticleData &inData, uint outIndex, const ParticleData &outData, const SpatialGrid::SpatialGridData& grid)
    {
        // gather the source particle's attributes...
        const uint srcId = NVPARTICLES_SYSTEM_FETCH_NOTEX(inData, id, index);
        const float4 srcPosition = NVPARTICLES_SYSTEM_FETCH_NOTEX(inData, position, index);
        const float4 srcVelocity = NVPARTICLES_SYSTEM_FETCH_NOTEX(inData, velocity, index);
        const float4 srcVeleval = NVPARTICLES_SYSTEM_FETCH_NOTEX(inData, veleval, index);
        const float srcBirthTime = NVPARTICLES_SYSTEM_FETCH_NOTEX(inData, birthTime, index);

        // ...and scatter them into the destination slot.
        outData.id[outIndex] = srcId;
        outData.position[outIndex] = srcPosition;
        outData.velocity[outIndex] = srcVelocity;
        outData.veleval[outIndex] = srcVeleval;
        //outData.color[outIndex] = NVPARTICLES_SYSTEM_FETCH_NOTEX(inData, color, index);
        outData.birthTime[outIndex] = srcBirthTime;
    }
};

//------------------------------------------------------------------------------------------
// Solver parameter block.
//
// Defaults are assigned in Solver::Solver(), values are refreshed by
// Solver::updateParameters(), and the block is handed to
// uploadParticleParameters() from Solver::evaluate().
//
struct InternalParameters
{
    // OpenGL object id that evaluate() registers with CUDA for the displacement data.
    int glTexDisplacement;
    // evaluate() asserts the mapped data holds at least size*size float4 elements.
    int glTexDisplacementSize;

    float invScale; // world-space to simulation-space scale (reciprocal of the "internalScale" parameter)
    float time; // read from the "time" attribute
    float deltaTime; // per-substep timestep (frame duration / subSteps after updateParameters)
    float worldParticleRadius; // particle radius in world space

    float surfaceDistance;
    float negativePressureFactor; // how much do we allow negative pressure.

    int maxParticles; // copied from the owning container in evaluate()

    float lifespan;
    int lifespanMode;
    float lifespanRandom;
    float particleMass; // restDensity * particleRestDistance^3 when derived from the radius
    float colorScale;
    uint seed;

    float gravity;

    float particleRestDistance; // 2 * worldParticleRadius * invScale
    float smoothingLength; // smoothingFactor * 2 * particleRestDistance

    float boundaryStiffness;
    float boundaryDamping;
    int boundaryMode; // grid boundary code (already mapped from the "boundaryMode" enum index)
    mat44f boundaryMatrix;
    mat44f boundaryMatrixInv; // inverseAffine() of boundaryMatrix

    float restDensity;
    float restPressure;

    float viscosity;
    float artificialViscosity;

    float velocityLimit;

    float surfaceTension;
    float surfaceTensionThreshold;

    float speedOfSound; //(about 10x the wave water speed)
    float speedOfSoundPower;
    float cflFactor; // forced to 0 by updateParameters when "deltaTimeUseCfl" is off

#if defined(NVPARTICLES_WCSPH_USE_XSPH)
    float xsphFactor;
#endif
    float densityThreshold;

    // Derived values.
    // NOTE(review): nothing in the visible part of this file writes these; presumably
    // they are computed before upload (e.g. in uploadParticleParameters) - confirm.
    struct
    {
        float epsArtificialViscosity;
        float smoothingLengthPow2;
        float smoothingLengthPow3;
        float smoothingLengthPow4;
        float densityKernelConstant;
        float pressureKernelConstant;
        float viscosityKernelConstant;
        float invSmoothingLength;
        float velocityLimitPow2;
        float speedOfSoundPow2;
		float deltaTimePow2;
    } cache;
};

//------------------------------------------------------------------------------------------
#include "Wcsph_CudaInline.h"

//------------------------------------------------------------------------------------------
void* Solver::creator()
{
    // Heap-allocate a new solver and return it as an untyped pointer.
    Solver* const instance = new Solver();
    return instance;
}

//------------------------------------------------------------------------------------------
//virtual
/// Register the solver's user-facing parameters and its particle buffers.
///
/// Every parameter is declared with a name, a type, a default value and optional
/// metadata (description, UI limits, enum labels). The buffers follow, grouped by
/// their flags (PERSISTENT / RENDERABLE / STATIC / none).
void Solver::initialize()
{
    // --- solver identification ---
    defineParameter("__description", ParameterSpec::STRING, std::string("Weakly Compressible SPH"));
    defineParameter("__url", ParameterSpec::STRING, std::string("solvers/wcsph/index.html"));

    // --- particle lifetime ---
    defineParameter("lifespan", ParameterSpec::FLOAT, float(10), Parameters().set("description", std::string("Lifespan of a particle.")));
    defineParameter("lifespanMode", ParameterSpec::INT, int(0), Parameters().set("description", std::string("Lifespan mode.")).set("enum", "forever,constant"));
    defineParameter("lifespanRandom", ParameterSpec::FLOAT, float(0), Parameters().set("description", std::string("Lifespan jitter.")));

    // --- display colouring ---
    defineParameter("colorStyle", ParameterSpec::INT, int(0), Parameters().set("description", std::string("Dynamic color style.")).set("ui-min", int(0)).set("ui-max", int(9)));
    defineParameter("colorScale", ParameterSpec::FLOAT,  float(1), Parameters().set("description", std::string("Scale the dynamic color.")).set("ui-softmin", float(0)).set("ui-softmax", float(50)));

    // updateParameters() stores the reciprocal of this value as invScale.
    defineParameter("internalScale", ParameterSpec::FLOAT,  float(1), Parameters().set("description", std::string("Scale the simulation up or down (default=100%)")).set("ui-min", float(0)));

    // --- OpenGL displacement input ---
    defineParameter("glTexDisplacement", ParameterSpec::INT, int(0), Parameters().set("description", std::string("OpenGL texture ID.")).set("ui-min", int(0)));
    defineParameter("glTexDisplacementSize", ParameterSpec::INT, int(0), Parameters().set("description", std::string("OpenGL texture size.")).set("ui-min", int(0)));

    // --- time stepping ---
    defineParameter("deltaTime", ParameterSpec::FLOAT, float(0.01), Parameters().set("description", std::string("Keep value low or instability (seconds)")).set("ui-min", float(0)));
    defineParameter("deltaTimeUseCfl", ParameterSpec::BOOL, bool(false), Parameters().set("description", std::string("Use the CFL condition to determine the maximum (safe) timestep")));
    defineParameter("cflFactor", ParameterSpec::FLOAT, float(0.3), Parameters().set("description", std::string("A factor for the CFL timestep")).set("ui-min", float(0)));


    // --- particle size / fluid rest state ---
    defineParameter("particleRadius", ParameterSpec::FLOAT, float(0.5), Parameters().set("description", std::string("Radius of particle (m), set to ZERO to scale using the particleMass")));
    defineParameter("particleMass", ParameterSpec::FLOAT, float(0.05), Parameters().set("description", std::string("Mass of unit volume (kg)")).set("ui-min", float(0)));
    // NOTE(review): the description quotes water=600 kg/m^3 while the registered default is 1000.
    defineParameter("restDensity", ParameterSpec::FLOAT,  float(1000), Parameters().set("description", std::string("kg / m^3 (water=600 kg/m^3)")).set("ui-min", float(0)));
    defineParameter("restPressure", ParameterSpec::FLOAT, float(0), Parameters().set("description", std::string("What pressure the rest state has. (water=0)")).set("ui-min", float(0)));
    defineParameter("negativePressureFactor", ParameterSpec::FLOAT, float(0), Parameters().set("description", std::string("How much we allow negative pressure.")).set("ui-min", float(0)));
    defineParameter("surfaceDistance", ParameterSpec::FLOAT, float(0), Parameters().set("description", std::string("Distance to the surface.")).set("ui-min", float(0)));

    // --- fluid forces ---
    defineParameter("artificialViscosity", ParameterSpec::FLOAT, float(0.5), Parameters().set("description", std::string("pascal-second (Pa.s) = 1 kg m^-1 s^-1 (water=0.2)")).set("ui-min", float(0)));
    defineParameter("viscosity", ParameterSpec::FLOAT, float(3.5), Parameters().set("description", std::string("pascal-second (Pa.s) = 1 kg m^-1 s^-1 (water=0.2)")).set("ui-min", float(0)));
    defineParameter("velocityLimit", ParameterSpec::FLOAT, float(200), Parameters().set("description", std::string("m/s")).set("ui-min", float(0)));
    defineParameter("surfaceTension", ParameterSpec::FLOAT, float(0.0), Parameters().set("description", std::string("Surface tension force (water=0.1)")).set("ui-min", float(0)));
    defineParameter("surfaceTensionThreshold", ParameterSpec::FLOAT, float(7), Parameters().set("description", std::string("Surface tension force threshold (def=7.0)")).set("ui-min", float(0)));
    defineParameter("gravity", ParameterSpec::FLOAT, float(9.8), Parameters().set("description", std::string("Gravitational acceleration. (Earth=9.8)")));

    // --- boundary ---
    defineParameter("boundaryDamping", ParameterSpec::FLOAT, float(1), Parameters().set("description", std::string("Boundary damping coefficient (0=preserve normal velocity, 1=remove normal velocity).")).set("ui-min", float(0)));
    defineParameter("boundaryStiffness", ParameterSpec::FLOAT, float(1000), Parameters().set("description", std::string("Boundary stiffness coefficient.")).set("ui-min", float(0)));
    // updateParameters() treats this value as an index into a 3-entry code table
    // (none, solid, periodic); the description additionally lists raw boundary codes.
    defineParameter("boundaryMode", ParameterSpec::INT, int(0), Parameters().set("description", std::string("Boundary mode none=0, solid=1, periodic-X=50, periodic-Y=194, periodic-Z=770, periodic-XZ=818, periodic-XY=137, periodic-XYZ=1010.")).set("enum", "none,solid,periodic"));
    defineParameter("boundaryMatrix", ParameterSpec::MATRIX, mat44f::scale(10,10,10), Parameters().set("description", std::string("Boundary is from -1 to +1, and is transformed by this matrix.")));

    // --- SPH kernel / equation of state / diagnostics ---
    defineParameter("smoothingFactor", ParameterSpec::FLOAT, float(1), Parameters().set("description", std::string("Boost Radius of SPH kernel (m)")).set("ui-min", float(0)));
    // Only read by updateParameters() when NVPARTICLES_WCSPH_USE_TAIT_EQUATION is defined.
    defineParameter("eosTaitPower", ParameterSpec::FLOAT, float(7), Parameters().set("description", std::string("default=7")).set("ui-min", float(0)));
    defineParameter("speedOfSound", ParameterSpec::FLOAT, float(40), Parameters().set("description", std::string("Should be about 10*|Vmax|")).set("ui-min", float(0)));
    defineParameter("debugLevel", ParameterSpec::INT, int(0), Parameters().set("description", std::string("Print info on command-line.")).set("ui-min", int(0)).set("ui-max", int(2)));

    // Only read by updateParameters() when NVPARTICLES_WCSPH_USE_XSPH is defined.
    defineParameter("xsph", ParameterSpec::FLOAT, float(0), Parameters().set("description", std::string("How much velocity smoothing to use.")).set("ui-min", float(0)));

    // NOTE(review): the description quotes default=400 while the registered default is 0.
    defineParameter("densityThreshold", ParameterSpec::FLOAT, float(0), Parameters().set("description", std::string("Kill particle when density is less than this. (default=400)")).set("ui-min", float(0)));


    // --- persistent per-particle state ---
    defineBuffer("id", ParticleBufferSpec::UINT, ParticleBufferSpec::PERSISTENT);
    defineBuffer("position", ParticleBufferSpec::FLOAT4, ParticleBufferSpec::PERSISTENT);
    defineBuffer("velocity", ParticleBufferSpec::FLOAT4,  ParticleBufferSpec::PERSISTENT | ParticleBufferSpec::RENDERABLE);
    defineBuffer("velocityPrediction", ParticleBufferSpec::FLOAT4, ParticleBufferSpec::PERSISTENT);
    defineBuffer("color", ParticleBufferSpec::FLOAT4, ParticleBufferSpec::PERSISTENT | ParticleBufferSpec::RENDERABLE);
    defineBuffer("birthTime", ParticleBufferSpec::FLOAT, ParticleBufferSpec::PERSISTENT);

	defineBuffer("static-color", ParticleBufferSpec::FLOAT4, ParticleBufferSpec::STATIC);

    // --- buffers registered with no flags ---
    defineBuffer("density", ParticleBufferSpec::FLOAT, 0);
	defineBuffer("tag", ParticleBufferSpec::UINT, 0);
    defineBuffer("force", ParticleBufferSpec::FLOAT4, 0);
#if defined(NVPARTICLES_WCSPH_STORE_PRESSURE)
    defineBuffer("pressure", ParticleBufferSpec::FLOAT, 0);
#endif
#if defined(NVPARTICLES_WCSPH_USE_XSPH)
    defineBuffer("xsphVelocity", ParticleBufferSpec::FLOAT4, 0);
#endif

    // NOTE(review): the trailing "position" argument presumably ties this renderable
    // buffer to the position buffer - confirm against defineBuffer().
    defineBuffer("warpedPosition", ParticleBufferSpec::FLOAT4, ParticleBufferSpec::RENDERABLE, "position");
}

//------------------------------------------------------------------------------------------
/// Construct a solver and give every tunable its built-in default.
///
/// The parameter block is value-initialised, so members that are not assigned
/// explicitly below (time, maxParticles, the derived `cache` values, ...) start
/// at zero instead of holding indeterminate values until a later update.
Solver::Solver()
{
    colorStyle = 0;

    // no OpenGL displacement resource is registered yet.
    glResourceDisplacement = 0;
    oldGlTexDisplacement = 0;
    oldGlTexDisplacementSize = 0;

    // the trailing () requests value-initialisation (zero-fill of the plain members).
    parameters = new InternalParameters();

    InternalParameters* params = parameters;

    params->glTexDisplacement = 0;
    params->glTexDisplacementSize = 0;

    params->negativePressureFactor = 0;
    params->surfaceDistance = 0;

    params->densityThreshold = 0.0f;

    params->speedOfSound = 40.0f;
#if !defined(NVPARTICLES_WCSPH_USE_TAIT_EQUATION)
    params->speedOfSoundPower = 2.0f;
#endif
    params->seed = 0;

    // world-to-simulation scale; 1 means both spaces coincide.
    params->invScale = 1;

#if defined(NVPARTICLES_WCSPH_USE_TAIT_EQUATION)
    params->deltaTime = 0.000452f;
#else
    params->deltaTime = 0.01f;
#endif
    params->cflFactor = 0.3f;

    params->lifespan = 100;
    params->lifespanMode = 0;
    params->lifespanRandom = 0;
    params->colorScale = 1.f;

    params->worldParticleRadius = 0.5f;
    params->restDensity = 1000.0f; // kg / m^3
    params->restPressure = 0.0f;

    params->boundaryStiffness = 256.0f;
    params->boundaryDamping = 0.0f;
    params->boundaryMode = 1;
    params->boundaryMatrix = mat44f::identity();
    params->boundaryMatrixInv = mat44f::identity();

    params->artificialViscosity = 0.5f;
    params->viscosity = 3.5f; // pascal-second (Pa.s) = 1 kg m^-1 s^-1
    params->velocityLimit = 200.0f; // m / s
    params->surfaceTension = 0.0f;
    params->surfaceTensionThreshold = 7.0f;

    params->gravity = 9.80665f;

#if defined(NVPARTICLES_WCSPH_USE_TAIT_EQUATION)
    // NOTE(review): InternalParameters as declared in this file has no eosTaitPower
    // member, so this branch needs that member added before the switch can be enabled.
    params->eosTaitPower = 7.0f;
#endif

    // pre-computed values...

    // get the unit cell length (particle diameter in simulation space).
    params->particleRestDistance = params->worldParticleRadius * 2 * params->invScale;
    // mass of one particle = volume of its unit cell * rest density.
    float particleVolume = powf(params->particleRestDistance, 3.0f );
    params->particleMass = particleVolume * params->restDensity;

    // kernel support is two rest distances by default.
    params->smoothingLength = 2.0f * params->particleRestDistance;

#if defined(NVPARTICLES_WCSPH_USE_XSPH)
    params->xsphFactor = 0.5f;
#endif
}

//------------------------------------------------------------------------------------------
/// Release the parameter block and the registered OpenGL displacement resource.
///
/// A failed unregister is reported and then cleared from the CUDA runtime's
/// last-error state, so it cannot be picked up later by an unrelated
/// cudaGetLastError() check. The destructor never aborts on that failure.
Solver::~Solver()
{
    delete parameters;
    parameters = 0;

    if (glResourceDisplacement)
    {
        const cudaError_t status = cudaGraphicsUnregisterResource(glResourceDisplacement);
        if (status != cudaSuccess)
        {
            std::cerr << "Wcsph::Solver: cudaGraphicsUnregisterResource failed: " << cudaGetErrorString(status) << std::endl;
            // reading the last error also resets it.
            cudaGetLastError();
        }
        glResourceDisplacement = 0;
    }
}

//------------------------------------------------------------------------------------------
//virtual
/// Read the solver parameters from `inAttr` and refresh every derived value.
///
/// For most entries the value currently held in the parameter block is passed as
/// the second argument of inputValue() (presumably the fallback used when the
/// attribute is not supplied). After the raw values are read this function:
///   - derives particleRestDistance, particleMass and worldParticleRadius from
///     "particleRadius" when it is > 0, otherwise from "particleMass";
///   - derives smoothingLength from the rest distance and "smoothingFactor";
///   - replaces deltaTime with cflDeltaTime when CFL stepping is on and that value is positive;
///   - quantises deltaTime so that a whole number of subSteps spans one frame.
///
/// Invalid input is tolerated rather than propagated:
///   - a non-positive "internalScale" keeps the previous scale;
///   - a "boundaryMode" outside the enum table is taken as a raw boundary code;
///   - a non-positive timestep or frame rate cannot reach the float-to-int
///     conversion that produces subSteps.
///
/// @param inAttr  attribute set; also supplies "time", "seed" and "frameRate".
void Solver::updateParameters(Parameters& inAttr)
{
    InternalParameters* params = parameters;

    // standard solver parameters:
    params->time = inAttr.asFloat("time", 0.f);

    params->seed = inAttr.asInt("seed", 0);

    debugLevel = inAttr.inputValue(getParameter("debugLevel"), 0)->asInt();

    // invScale is the reciprocal of the user scale; a zero, negative or NaN scale
    // would poison every derived length, so the previous scale is kept instead.
    float internalToWorldScale = inAttr.inputValue(getParameter("internalScale"), 1.0f/params->invScale)->asFloat();
    if (internalToWorldScale > 0.0f)
        params->invScale = 1.0f / internalToWorldScale;

    params->deltaTime = inAttr.inputValue(getParameter("deltaTime"), params->deltaTime)->asFloat();

    bool useCFL = inAttr.inputValue(getParameter("deltaTimeUseCfl"), false)->asBool();
    params->cflFactor = inAttr.inputValue(getParameter("cflFactor"), params->cflFactor)->asFloat();
    if (!useCFL)
        params->cflFactor = 0;

    params->glTexDisplacement = inAttr.inputValue(getParameter("glTexDisplacement"), params->glTexDisplacement)->asInt();
    params->glTexDisplacementSize = inAttr.inputValue(getParameter("glTexDisplacementSize"), params->glTexDisplacementSize)->asInt();

    params->negativePressureFactor = inAttr.inputValue(getParameter("negativePressureFactor"), params->negativePressureFactor)->asFloat();
    params->surfaceDistance = inAttr.inputValue(getParameter("surfaceDistance"), params->surfaceDistance)->asFloat();

    params->gravity = inAttr.inputValue(getParameter("gravity"), params->gravity)->asFloat();

    params->velocityLimit = inAttr.inputValue(getParameter("velocityLimit"), params->velocityLimit)->asFloat();
    FORCE_MAX(params->velocityLimit, 0);
#if defined(NVPARTICLES_WCSPH_USE_TAIT_EQUATION)
    params->eosTaitPower = inAttr.inputValue(getParameter("eosTaitPower"), params->eosTaitPower)->asFloat();
    FORCE_MAX(params->eosTaitPower, 1.f);
#endif

    params->restPressure = inAttr.inputValue(getParameter("restPressure"), params->restPressure)->asFloat();
    FORCE_MAX(params->restPressure, 0);
    params->restDensity = inAttr.inputValue(getParameter("restDensity"), params->restDensity)->asFloat();
    FORCE_MAX(params->restDensity, 0);
    params->densityThreshold = inAttr.inputValue(getParameter("densityThreshold"), params->densityThreshold)->asFloat();

    params->viscosity = inAttr.inputValue(getParameter("viscosity"), params->viscosity)->asFloat();
    FORCE_MAX(params->viscosity, 0);

    params->artificialViscosity = inAttr.inputValue(getParameter("artificialViscosity"), params->artificialViscosity)->asFloat();
    FORCE_MAX(params->artificialViscosity, 0);

    float smoothingFactor = inAttr.inputValue(getParameter("smoothingFactor"), 1.0f)->asFloat();

    // a radius of zero (also the fallback) selects the mass-driven sizing below.
    params->worldParticleRadius = inAttr.inputValue(getParameter("particleRadius"), 0.f)->asFloat();

    params->surfaceTension = inAttr.inputValue(getParameter("surfaceTension"), params->surfaceTension)->asFloat();
    params->surfaceTensionThreshold = inAttr.inputValue(getParameter("surfaceTensionThreshold"), params->surfaceTensionThreshold)->asFloat();

    params->speedOfSound = inAttr.inputValue(getParameter("speedOfSound"), params->speedOfSound)->asFloat();

#if defined(NVPARTICLES_WCSPH_USE_XSPH)
    params->xsphFactor = inAttr.inputValue(getParameter("xsph"), params->xsphFactor)->asFloat();
#endif

    params->boundaryStiffness = inAttr.inputValue(getParameter("boundaryStiffness"), params->boundaryStiffness)->asFloat();
    params->boundaryDamping = inAttr.inputValue(getParameter("boundaryDamping"), params->boundaryDamping)->asFloat();

    // Convert the enum index (none, solid, periodic) to the actual boundary code.
    // The stored field holds the converted code (e.g. 818 for periodic) and is also
    // what is handed back to inputValue() above, so the incoming value may already
    // be a code rather than an index: anything outside the table is passed through
    // unchanged instead of being used to index past the end of the table.
    static const int boundaryCode[] = {0, 1, 2 | ((1|2|16|32)<<4)};
    const int boundaryCodeCount = int(sizeof(boundaryCode) / sizeof(boundaryCode[0]));
    const int requestedBoundaryMode = inAttr.inputValue(getParameter("boundaryMode"), params->boundaryMode)->asInt();
    if (requestedBoundaryMode >= 0 && requestedBoundaryMode < boundaryCodeCount)
        params->boundaryMode = boundaryCode[requestedBoundaryMode];
    else if (requestedBoundaryMode > 0)
        params->boundaryMode = requestedBoundaryMode;
    else
        params->boundaryMode = 0; // negative input: no boundary

    params->boundaryMatrix = inAttr.inputValue(getParameter("boundaryMatrix"), params->boundaryMatrix)->asMatrix44();
    params->boundaryMatrixInv = params->boundaryMatrix.inverseAffine();

    if (params->worldParticleRadius <= 0)
    {
        // size the particle from its mass: volume = mass / density, spacing = cbrt(volume).
        params->particleMass = inAttr.inputValue(getParameter("particleMass"), params->particleMass)->asFloat();
        float particleVolume = params->particleMass / params->restDensity;
        params->particleRestDistance = powf(particleVolume, 1/3.0f );
        params->worldParticleRadius = (params->particleRestDistance * 0.5f) / params->invScale;
    }
    else
    {
        // size the particle from its radius: spacing = diameter, mass = volume * density.
        params->particleRestDistance = params->worldParticleRadius * 2 * params->invScale;
        float particleVolume = powf(params->particleRestDistance, 3.0f );
        params->particleMass = particleVolume * params->restDensity;
    }

    params->smoothingLength = smoothingFactor * 2.0f * params->particleRestDistance;

    // these values are in world-space...
    params->lifespan = inAttr.inputValue(getParameter("lifespan"), params->lifespan)->asFloat();
    params->lifespanRandom = inAttr.inputValue(getParameter("lifespanRandom"), params->lifespanRandom)->asFloat();
    params->lifespanMode = inAttr.inputValue(getParameter("lifespanMode"), params->lifespanMode)->asInt();

    params->colorScale = inAttr.inputValue(getParameter("colorScale"), params->colorScale)->asFloat();

    colorStyle = inAttr.inputValue(getParameter("colorStyle"), colorStyle)->asInt();

    // NOTE(review): cflDeltaTime is not assigned anywhere in this function; confirm it
    // is initialised before the first call.
    if (useCFL && cflDeltaTime > 0)
    {
        params->deltaTime = cflDeltaTime;
    }

    // determine the deltaTime from the integer-quantized subSteps.
    float frameRate = inAttr.asFloat("frameRate", 24.f);
    if (!(frameRate > 0.0f))
        frameRate = 24.f; // reject zero, negative and NaN frame rates
    const float frameDuration = 1.0f / frameRate;

    if (params->deltaTime > 0.0f)
        subSteps = std::max(1, (int)ceilf(frameDuration / params->deltaTime));
    else
        subSteps = 1; // no usable timestep: take the whole frame in one step
    params->deltaTime = frameDuration / subSteps;
}

//------------------------------------------------------------------------------------------
//virtual
float Solver::particleSpacing()
{
    // The rest distance is stored pre-multiplied by invScale; divide it back out.
    const InternalParameters* const current = parameters;
    const float scaledSpacing = current->particleRestDistance;
    const float spacing = scaledSpacing / current->invScale;
    return spacing;
}

//-----------------------------------------------------------------------------------
struct UniformGridAssignmentFunctor
{
    /// Return the grid cell index a particle at `p` is assigned to.
    ///
    /// A particle with p.w == 0 is treated as dead and assigned to the extra cell
    /// whose index equals the total bucket count. `index` is not used.
    static inline NVPARTICLES_CUDA_EXPORT
    uint cell(int index, float4 p)
    {
        if (p.w == 0)
        {
            // All dead particles share this final cell, so the sort-by-cellIndex
            // puts them in one contiguous block and the cell range computation
            // yields the offset of the first dead item.
            const uint deadCellIndex = NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount.x)*NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount.y)*NVPARTICLES_SPATIAL_GRID_PARAM(bucketCount.z);
            return deadCellIndex;
        }

        // live particle: position -> cell coordinate -> hashed cell index.
        typedef IteratorFunctorBase3<float4, true> GridIterator;
        const int3 cellCoord = GridIterator::posToCell(p);
        const uint liveCellIndex = GridIterator::cellToHash(cellCoord);
        return liveCellIndex;
    }
};

//------------------------------------------------------------------------------------------
//virtual
void Solver::evaluate(SimulatorContext_GPU& cxt)
{
    if (cxt.maxParticles == 0)
        return;

    cudaStream_t stream = (cudaStream_t)cxt.stream;

    NVPARTICLES_SCOPED_TIMER("Wcsph::evaluate", stream);

	// get the buffers...

    ParticleBuffer* densityBuffer = cxt.owner->getBuffer("density");
    ParticleBuffer* pressureBuffer = cxt.owner->getBuffer("pressure");
    ParticleBuffer* forceBuffer = cxt.owner->getBuffer("force");
    ParticleBuffer* tagBuffer = cxt.owner->getBuffer("tag");
    ParticleBuffer* idBuffer = cxt.owner->getBuffer("id");
    ParticleBuffer* positionBuffer = cxt.owner->getBuffer("position");
    ParticleBuffer* velocityBuffer = cxt.owner->getBuffer("velocity");
    ParticleBuffer* colorBuffer = cxt.owner->getBuffer("color");
    ParticleBuffer* velocityPredictionBuffer = cxt.owner->getBuffer("velocityPrediction");
    ParticleBuffer* birthTimeBuffer = cxt.owner->getBuffer("birthTime");
    ParticleBuffer* xsphVelocityBuffer = cxt.owner->getBuffer("xsphVelocity");
    ParticleBuffer* staticColorBuffer = cxt.owner->getBuffer("static-color");
    ParticleBuffer* idSortedBuffer = cxt.owner->getBuffer("idSorted");
    ParticleBuffer* positionSortedBuffer = cxt.owner->getBuffer("positionSorted");
    ParticleBuffer* velocitySortedBuffer = cxt.owner->getBuffer("velocitySorted");
    ParticleBuffer* colorSortedBuffer = cxt.owner->getBuffer("colorSorted");
    ParticleBuffer* velocityPredictionSortedBuffer = cxt.owner->getBuffer("velocityPredictionSorted");
    ParticleBuffer* birthTimeSortedBuffer = cxt.owner->getBuffer("birthTimeSorted");

    //ParticleBuffer* warpedPositionBuffer = cxt.owner->getBuffer("warpedPositionRender");
    ParticleBuffer* warpedPositionBuffer = cxt.owner->getBuffer("warpedPosition");

    ParticleGrid* grid = cxt.owner->particleGrid;

    parameters->maxParticles = cxt.owner->maxParticles;

    grid->boundaryMode = parameters->boundaryMode;
    //grid->xform = mat44f::fromVectors(make_vec4f(0),make_vec4f(0),make_vec4f(0),make_vec4f(0));

	boundingbox4f containerBbox;

	if (NVPARTICLES_SPATIAL_GRID_BOUNDARYMODE_TYPE(parameters->boundaryMode) & 0x03)
    {
		// use the bounds defined by the boundaryMatrix...
		vec3f low = parameters->boundaryMatrix.multiplyPoint(make_vec3f(-1,-1,-1));
		vec3f high = parameters->boundaryMatrix.multiplyPoint(make_vec3f(1,1,1));
		containerBbox = boundingbox4f(make_vec4f(low.x, low.y, low.z, 0),
										make_vec4f(high.x, high.y, high.z, 0));
	}
	else
	{
		// use the container's bbox...
		containerBbox = boundingbox4f(make_vec4f(cxt.boundsLow.x, cxt.boundsLow.y, cxt.boundsLow.z, 0),
										make_vec4f(cxt.boundsHigh.x, cxt.boundsHigh.y, cxt.boundsHigh.z, 0));

		vec3f scale = (cxt.boundsHigh - cxt.boundsLow)/2;
		parameters->boundaryMatrix = mat44f::translate(cxt.boundsLow.x+scale.x,cxt.boundsLow.y+scale.y,cxt.boundsLow.z+scale.z) * mat44f::scale(scale.x,scale.y,scale.z);
		if (debugLevel > 0)
			STDERR(parameters->boundaryMatrix);
	}

    grid->xform = parameters->boundaryMatrix;
	grid->setCellSize(containerBbox, make_vec4f(parameters->smoothingLength / parameters->invScale));

    // prepare the ocean displacement buffers...

    void* displacementPtr = 0;

    if (parameters->glTexDisplacement != oldGlTexDisplacement || parameters->glTexDisplacementSize != oldGlTexDisplacementSize)
    {
        if (glResourceDisplacement != 0)
            cudaGraphicsUnregisterResource(glResourceDisplacement);
        glResourceDisplacement = 0;

        if (parameters->glTexDisplacement)
        {
            NVPARTICLES_CUDA_SAFE_CALL( cudaGraphicsGLRegisterBuffer(&glResourceDisplacement, parameters->glTexDisplacement, cudaGraphicsRegisterFlagsReadOnly) );
            oldGlTexDisplacement = parameters->glTexDisplacement;
            oldGlTexDisplacementSize = parameters->glTexDisplacementSize;
        }
    }

    if (parameters->glTexDisplacement && glResourceDisplacement)
    {
        NVPARTICLES_CUDA_SAFE_CALL( cudaGraphicsMapResources (1, &glResourceDisplacement, stream) );
        size_t displacementSize = 0;

        NVPARTICLES_CUDA_SAFE_CALL( cudaGraphicsResourceGetMappedPointer((void**)&displacementPtr, (size_t*)&displacementSize, glResourceDisplacement) );
        assert(displacementSize >= oldGlTexDisplacementSize*oldGlTexDisplacementSize*sizeof(float4));
    }


    void* fieldPtr = 0;
    void* fieldPtr2 = 0;

    if (parameters->glTexDisplacementSize > 0)
    {
        if (fieldBuffer.Size() != grid->bucketCount.x*grid->bucketCount.z*sizeof(float)*2)
        {
            fieldBuffer.Allocate(Cu::Buffer::CUDA, grid->bucketCount.x*grid->bucketCount.z*sizeof(float)*2);
            fieldBuffer.Clear();
        }

        if (fieldBuffer2.Size() != grid->bucketCount.x*grid->bucketCount.z*sizeof(float))
        {
            fieldBuffer2.Allocate(Cu::Buffer::CUDA, grid->bucketCount.x*grid->bucketCount.z*sizeof(float));
            fieldBuffer2.Clear();
        }

        fieldPtr = fieldBuffer.Data();
        fieldPtr2 = fieldBuffer2.Data();
    }

    // update constant data...
    //
    if(1)
    {
        NVPARTICLES_SCOPED_TIMER("upload constants", stream);

        uploadParticleParameters(parameters, stream);

        if (cxt.owner->particleForces)
        {
            cxt.owner->particleForces->forces.scale = parameters->invScale;
            cxt.owner->particleForces->forces.internalScale = parameters->invScale;
            uploadForces(cxt.owner->particleForces->forces, stream);
        }

        if (cxt.owner->primitives)
        {
            cxt.owner->primitives->group.deltaTime = parameters->deltaTime;
		    cxt.owner->primitives->group.particleRadius = parameters->particleRestDistance / (2*parameters->invScale);
            cxt.owner->primitives->group.restitution = 0;
            cxt.owner->primitives->group.friction = 0.5;
            cxt.owner->primitives->group.stiffness = parameters->boundaryStiffness / parameters->invScale; // put into internal-space
            cxt.owner->primitives->group.damping = parameters->boundaryDamping;
            cxt.owner->primitives->group.internalScale = parameters->invScale;
            uploadPrimitives(cxt.owner->primitives->group, stream);
        }
    }

    // map buffers...

    birthTimeBuffer->lock((long)stream);
    idBuffer->lock((long)stream);
    idSortedBuffer->lock((long)stream);
    positionBuffer->lock((long)stream);
    velocityBuffer->lock((long)stream);
    colorBuffer->lock((long)stream);
    velocityPredictionBuffer->lock((long)stream);
    birthTimeSortedBuffer->lock((long)stream);
    positionSortedBuffer->lock((long)stream);
    velocitySortedBuffer->lock((long)stream);
    colorSortedBuffer->lock((long)stream);
    velocityPredictionSortedBuffer->lock((long)stream);
    tagBuffer->lock((long)stream);
    forceBuffer->lock((long)stream);
    densityBuffer->lock((long)stream);
    staticColorBuffer->lock((long)stream);
    warpedPositionBuffer->lock((long)stream);
#if defined(NVPARTICLES_WCSPH_STORE_PRESSURE)
    pressureBuffer->lock((long)stream);
#endif
#if defined(NVPARTICLES_WCSPH_USE_XSPH)
    xsphVelocityBuffer->lock((long)stream);
#endif

    // package up the pointers into the structs...

    ParticleData data[2];

    data[0].id = (uint*)idBuffer->devicePointer();
    data[0].birthTime = (float*)birthTimeBuffer->devicePointer();
    data[0].position = (float4*)positionBuffer->devicePointer();
    data[0].velocity = (float4*)velocityBuffer->devicePointer();
    data[0].veleval = (float4*)velocityPredictionBuffer->devicePointer();
    data[0].color = (float4*)colorBuffer->devicePointer();
    data[0].tag = (uint*)tagBuffer->devicePointer();
    data[0].force = (float4*)forceBuffer->devicePointer();
    data[0].density = (float*)densityBuffer->devicePointer();
    data[0].staticColor = (float4*)staticColorBuffer->devicePointer();
    data[0].displacement = (float4*)displacementPtr;
    data[0].warpedPosition = (float4*)warpedPositionBuffer->devicePointer();
    data[0].field = (float*)fieldPtr;
#if defined(NVPARTICLES_WCSPH_STORE_PRESSURE)
    data[0].pressure = (float*)pressureBuffer->devicePointer();
#endif
#if defined(NVPARTICLES_WCSPH_USE_XSPH)
    data[0].xsphVelocity = (float4*)xsphVelocityBuffer->devicePointer();
#endif

    data[1].id = (uint*)idSortedBuffer->devicePointer();
    data[1].birthTime = (float*)birthTimeSortedBuffer->devicePointer();
    data[1].position = (float4*)positionSortedBuffer->devicePointer();
    data[1].velocity = (float4*)velocitySortedBuffer->devicePointer();
    data[1].veleval = (float4*)velocityPredictionSortedBuffer->devicePointer();
    data[1].color = (float4*)colorSortedBuffer->devicePointer();
    data[1].tag = (uint*)tagBuffer->devicePointer();
    data[1].force = (float4*)forceBuffer->devicePointer();
    data[1].density = (float*)densityBuffer->devicePointer();
    data[1].staticColor = (float4*)staticColorBuffer->devicePointer();
    data[1].displacement = (float4*)displacementPtr;
#if defined(NVPARTICLES_WCSPH_STORE_PRESSURE)
    data[1].pressure = (float*)pressureBuffer->devicePointer();
#endif
#if defined(NVPARTICLES_WCSPH_USE_XSPH)
    data[1].xsphVelocity = (float4*)xsphVelocityBuffer->devicePointer();
#endif

    for (int subStep=0; subStep<subSteps; ++subStep)
    {
        if (debugLevel > 0)
            std::cout << "dt(" << parameters->deltaTime << ") subStep(" << subStep << ") nSubSteps(" << subSteps << ")" << std::endl;

        if(1)
        {
            // get the interpolated transform.
            if(subSteps > 1)
                cxt.owner->primitiveResidentData->updateTransforms(cxt.owner->primitives, float(subStep)/(subSteps-1), (long)stream);
            else
                cxt.owner->primitiveResidentData->updateTransforms(cxt.owner->primitives, 1.0f, (long)stream);

            // because we changed the data we need to update the pointers in the constant memory.
            uploadPrimitives(cxt.owner->primitives->group, stream);
        }

        if(1)
        {
            NVPARTICLES_SCOPED_TIMER("computeWrap", stream);

		    computeWrap(cxt.numParticles, data[0], stream);
        }

        if(1)
        {
            // update the acceleration structure...
            updateSpatialGrid<UniformGridAssignmentFunctor>(grid, 0, cxt.numParticles+1, cxt.maxParticles, (float*)data[0].position, stream);

			// After sorting, the firstDeadIndex is known.
			// Note that there may be dead particles in the (0 - firstDeadIndex) range.
			if(cxt.firstDeadIndexPtr)
				cxt.firstDeadIndexPtr->Copy(grid->d_cellsParticleStartMem, 0, (grid->numCells-1)*sizeof(uint), sizeof(uint), Cu::Buffer::CopyOptions().SetStream((long)stream));


            /// OPTIMIZE:
            // we could skip the sort when it is not needed.

            /// OPTIMIZE:
            // NB. The buffer is alive up to numParticles, but
            // the previous numParticles might be larger due to deaths.
            // This means we need to sort with a count of the previous (numParticles+emissionCount) so we don't leave living particles around!

            gridSortData<ParticleDataSorter, ParticleData>(0, cxt.numParticles, data[0], data[1], grid->deviceData(), stream);
#if 0
            NVPARTICLES_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
            positionSortedBuffer->buffer()->DumpAs<float4>("sorted-P", min(cxt.numParticles,60));
#endif
        }

#if defined(NVPARTICLES_WCSPH_USE_NEIBS_LIST)
		if(1)
		{
			NVPARTICLES_SCOPED_TIMER("gridBuildNeibs_CUDA", stream);

			gridBuildNeibs_CUDA<SphBuildNeighborFunctor3>(0, cxt.numParticles, (float4*)data[1].position, grid->deviceData(), stream);
		}
#endif

        if (1)
        {
            NVPARTICLES_SCOPED_TIMER("computeDensity", stream);

            computeDensity(grid->deviceData(), cxt.numParticles, data[1], stream);
#if 0
            NVPARTICLES_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
            densityBuffer->buffer()->DumpAs<float>("density", min(cxt.numParticles,60));
#endif
        }

        if (1)
        {
            NVPARTICLES_SCOPED_TIMER("computeInternalForces", stream);

            computeInternalForces((parameters->surfaceTension > 0), grid->deviceData(), cxt.numParticles, data[1], &cflDeltaTime, stream);
#if 0
            NVPARTICLES_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
            forceBuffer->buffer()->DumpAs<float4>("forces", min(cxt.numParticles,60));
#endif
        }

        if (1)
        {
            NVPARTICLES_SCOPED_TIMER("integrate", stream);

	        int cs = colorStyle%8;

	        bool useXsph = false;
#if defined(NVPARTICLES_WCSPH_USE_XSPH)
	        useXsph = (parameters->xsphFactor > 0);
#endif

	        if(cs == 0)
		        integratePass< (int)COLOR_SOURCE_COLOR, 0> (useXsph, 0, cxt.numParticles, data[1], data[0], grid->deviceData(), parameters, stream);
	        else if(cs == 1)
		        integratePass< (int)COLOR_SOURCE_ID, (int)COLOR_GRADIENT_BLACK_TO_WHITE> (useXsph, 0, cxt.numParticles, data[1], data[0], grid->deviceData(), parameters, stream);
	        else if(cs == 2)
		        integratePass< (int)COLOR_SOURCE_AGE, (int)COLOR_GRADIENT_WHITE_TO_BLACK> (useXsph, 0, cxt.numParticles, data[1], data[0], grid->deviceData(), parameters, stream);
	        else if(cs == 3)
		        integratePass< (int)COLOR_SOURCE_VELOCITY, (int)COLOR_GRADIENT_BLUE_TO_RED_HSV> (useXsph, 0, cxt.numParticles, data[1], data[0], grid->deviceData(), parameters, stream);
	        else if(cs == 4)
		        integratePass< (int)COLOR_SOURCE_PRESSURE, (int)COLOR_GRADIENT_TEMPERATURE> (useXsph, 0, cxt.numParticles, data[1], data[0], grid->deviceData(), parameters, stream);
	        else if(cs == 5)
		        integratePass< (int)COLOR_SOURCE_DENSITY, (int)COLOR_GRADIENT_TEMPERATURE> (useXsph, 0, cxt.numParticles, data[1], data[0], grid->deviceData(), parameters, stream);
	        else if(cs == 6)
		        integratePass< (int)COLOR_SOURCE_TAG, 0> (useXsph, 0, cxt.numParticles, data[1], data[0], grid->deviceData(), parameters, stream);
	        else if(cs == 7)
		        integratePass< (int)COLOR_SOURCE_FORCE, (int)COLOR_GRADIENT_TEMPERATURE> (useXsph, 0, cxt.numParticles, data[1], data[0], grid->deviceData(), parameters, stream);
	        else
		        integratePass< 0, 0> (useXsph, 0, cxt.numParticles, data[1], data[0], grid->deviceData(), parameters, stream);
#if 0
            NVPARTICLES_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
            positionBuffer->buffer()->DumpAs<float4>("new-positions", min(cxt.numParticles,60));
#endif
        }

    } // end of substep

    if (1)
    {
        NVPARTICLES_SCOPED_TIMER("computeHeightField", stream);

        computeHeightField_CUDA(data[0].position, data[0].tag, (float*)fieldPtr, parameters->smoothingLength, grid->deviceData(), stream);
    }

    if (1)
    {
        NVPARTICLES_SCOPED_TIMER("updateRenderPosition", stream);

        updateRenderPosition(0, cxt.numParticles, parameters, data[0], (float*)fieldPtr, (float*)fieldPtr2, stream);
#if 0
        NVPARTICLES_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
        warpedPositionBuffer->Buffer()->DumpAs<float4>("render-P", min(cxt.numParticles,60));
#endif
    }

    if (parameters->glTexDisplacement != 0)
    {
        NVPARTICLES_CUDA_SAFE_CALL( cudaGraphicsUnmapResources (1, &glResourceDisplacement, stream) );
    }

    // unmap all the buffers...

    idBuffer->unlock();
    positionBuffer->unlock();
    velocityBuffer->unlock();
    colorBuffer->unlock();
    velocityPredictionBuffer->unlock();
    birthTimeBuffer->unlock();
    idSortedBuffer->unlock();
    positionSortedBuffer->unlock();
    velocitySortedBuffer->unlock();
    colorSortedBuffer->unlock();
    velocityPredictionSortedBuffer->unlock();
    birthTimeSortedBuffer->unlock();
	tagBuffer->unlock();
    forceBuffer->unlock();
    densityBuffer->unlock();
    staticColorBuffer->unlock();
    warpedPositionBuffer->unlock();
#if defined(NVPARTICLES_WCSPH_STORE_PRESSURE)
    pressureBuffer->unlock();
#endif
#if defined(NVPARTICLES_WCSPH_USE_XSPH)
    xsphVelocityBuffer->unlock();
#endif
}

//------------------------------------------------------------------------------------------
} // end namespace Wcsph
}
}

