#include "NvParticlesParticleModifier.h"
#include "NvParticlesConfig.h"
#include "NvParticlesManager.h"
#include <climits>
#include <cfloat>
#include <cstring>
#include <algorithm>
#include <iostream>

#include "std_utils.h"
#include "cuda_utils.h"
#include "CudaScheduler.h"
#include "NvParticlesProfiler.h"

#include "NvParticlesParticleContainer.h"
#include "NvParticlesParticleSolverImpl.h"
#include "NvParticlesParticleRenderer.h"
#include "NvParticlesPrimitives.h"
#include "NvParticlesForces.h"
#include "NvParticlesGrid.h"

#include "cuda_std_utils.h"
#include "math_utils_cuda.h"

#include "cutil.h"
#include <nvToolsExtCudaRt.h>

// Note:
// A ParticleModifier must be always be used inside a thread which has a cuda context.

namespace Easy
{
namespace NvParticles
{

//-----------------------------------------------------------------------------------
class ModifierInitializeJob : public CudaSchedulerJob
{
    ParticleModifier* p;
public:
    ModifierInitializeJob(ParticleModifier* _p)
        : p(_p) {}

    bool onExecute(CudaSchedulerTask *task)
    {
        p->initialize();
        return true;
    }
};

//-----------------------------------------------------------------------------------
class ModifierDestroyJob : public CudaSchedulerJob
{
    ParticleModifier* p;
public:
    ModifierDestroyJob(ParticleModifier* _p)
        : p(_p) {}

    bool onExecute(CudaSchedulerTask *task)
    {
        p->destroy();
        return true;
    }
};

//-----------------------------------------------------------------------------------
class ModifierSyncContainerJob : public CudaSchedulerJob
{
    ParticleModifier* p;

public:
    ModifierSyncContainerJob(ParticleModifier* _p)
        : p(_p) {}

    bool onExecute(CudaSchedulerTask *task)
    {
        p->syncContainer();
        return true;
    }
};

//-----------------------------------------------------------------------------------
class ModifierSolveJob : public CudaSchedulerJob
{
    ParticleModifier* p;

public:
    ModifierSolveJob(ParticleModifier* _p)
        : p(_p) {}

    bool onExecute(CudaSchedulerTask *task)
    {
		p->initialize();
        p->updateAsync();
        return true;
    }
};

//-----------------------------------------------------------------------------------
class ModifierResetJob : public CudaSchedulerJob
{
    ParticleModifier* p;

public:
    ModifierResetJob(ParticleModifier* _p)
        : p(_p) {}

    bool onExecute(CudaSchedulerTask *task)
    {
		p->initialize();
        p->reset();
        return true;
    }
};

//-----------------------------------------------------------------------------------
// Class-wide debug verbosity. Methods in this file emit extra log output when
// it is non-zero (some messages require it to be > 0). Defaults to silent.
int ParticleModifier::debugLevel = 0;

//------------------------------------------------------------------------------------------
/// Construct an uninitialized modifier bound to a CUDA device index.
///
/// No GPU resources are created here; that happens in initialize().
///
/// @param cudaDevice  device index stored in deviceIndex.
ParticleModifier::ParticleModifier(int cudaDevice)
{
    deviceIndex = cudaDevice;

    particleForces = 0;
    primitives = 0;
    //particleRenderer = 0;
    primitiveResidentData = 0;

    //fieldBufferTexture = 0;

    container = 0;
    _init();

    // _init() already clears this; kept explicit to make the starting state obvious.
    initialized = false;

    _glRenderSync[0] = 0;
    _glRenderSync[1] = 0;
    _renderSyncIndex = 0;

    pendingMaxParticles = 0;
    pendingSolverMethod = "";
    updateFlags = 0;

    // These flags are read by syncContainer() / updateAsync() but were never
    // given a starting value in this file, so the first read could see garbage.
    // NOTE(review): assumes no base-class constructor sets them to a different
    // initial value — confirm against the class declaration.
    dirty = false;
    isSolving = false;

    _currentUpdateState = requestedUpdateState = 0;
}

//-----------------------------------------------------------------------------------
/// Put the per-initialization state back to its "nothing allocated" values.
/// Called from the constructor and at the end of destroy(). It does not touch
/// the pending (requested) particle count or solver method.
void ParticleModifier::_init()
{
    // lifecycle flags
    initialized = false;
    dirtyBounds = true;
    enableSorting = true;

    // current configuration
    solverMethod = "";
    maxParticles = 0;
    numParticles = 0;

    // render bookkeeping, one slot per render page
    for (int page = 0; page < 2; ++page)
        _renderNumParticles[page] = 0;

    // owned resources (pointers only; nothing is freed here)
    particleGrid = 0;
    cuStream = 0;
}

//-----------------------------------------------------------------------------------
//! clean-up all data.
//!
void ParticleModifier::destroy()
{
    if (cuStream)
    {
        printInfo(Stringf("Destroying particle-simulator-context[%p]", this));
    }

	NVPARTICLES_CUDA_SAFE_CALL(cudaThreadSynchronize());

    if (cuStream)
    {
        cuStream->sync();
        delete cuStream;
		cuStream = 0;
        NVPARTICLES_CUDA_CHECK_ERROR("sync and destroy");
    }

    if (_glRenderSync[0])
        glDeleteSync(_glRenderSync[0]);
    _glRenderSync[0] = 0;

    if (_glRenderSync[1])
        glDeleteSync(_glRenderSync[1]);
    _glRenderSync[1] = 0;

    /// CAVEAT:
    // It is important to clear any GPU memory while the context is active!

    delete particleGrid;
    particleGrid = 0;

    _buffers.clear();

    reduceTempBuffer.Free();

	dirtyBounds = true;

	solver.setNull();

    delete primitiveResidentData;
    primitiveResidentData = 0;

    //attributes.clear();

    h_firstDeadIndexBuffer.Free();
    d_firstDeadIndexBuffer.Free();

    d_posMinBuffer.Free();
    d_posMaxBuffer.Free();
    h_posMinBuffer.Free();
    h_posMaxBuffer.Free();

    _init();
}

//-----------------------------------------------------------------------------------
/// Destructor. Frees nothing itself; it only reports when destroy() was not
/// called beforehand.
ParticleModifier::~ParticleModifier()
{
    /// CAVEAT:
    // destroy() is intentionally NOT called from here because it requires an
    // active GPU context.

    // A live stream is used as the indicator that the object is still set up.
    const bool stillSetUp = (cuStream != 0);
    if (!stillSetUp)
        return;

    printError(Stringf("You have not destroyed the particle-simulator-context[%p]", this));
}

//-----------------------------------------------------------------------------------
void ParticleModifier::setParameters(const Parameters& params)
{
    attributes = params;
}

//-----------------------------------------------------------------------------------
/// Bring the modifier in line with the pending particle count and solver method.
///
/// Does nothing (and returns true) when the pending values already match the
/// current ones. Otherwise everything is destroyed and rebuilt: stream, solver,
/// solver buffers, helper buffers, particle grid and primitive data.
///
/// @return true when the modifier is usable (or nothing had to change);
///         false when any step failed, in which case destroy() has been called.
bool ParticleModifier::initialize()
{
    // do we need to re-initialize?
    if (pendingMaxParticles == maxParticles && pendingSolverMethod == solverMethod)
        return true;

    int cudaDevice = -1;
    NVPARTICLES_CUDA_SAFE_CALL(cudaGetDevice(&cudaDevice));

    destroy();
    // now we can assume the entire object is synced and then uninitialized...

    if (debugLevel > 0)
        printInfo(Stringf("Initializing GPU particle-simulator-context[%p]: device(%d) method(%s) N(%d)", this, cudaDevice, pendingSolverMethod.c_str(), pendingMaxParticles));

    cuStream = new Cu::Stream();

    maxParticles = pendingMaxParticles;
    solverMethod = pendingSolverMethod;

    bool rc = true;

    if (!solverMethod.empty())
    {
        solver = Manager::getSingleton().createSolver(solverMethod);
        if (!solver)
        {
            printError("Unknown particle-solver: " + solverMethod);
            rc = false;
        }
        else
        {
            solver->initialize();

            rc = _ensureBuffers("", solver->getBufferDefinitions(), false);
        }
    }

    // misc buffers:
    if (rc)
        rc &= d_firstDeadIndexBuffer.Allocate(Cu::Buffer::CUDA, sizeof(uint), 0, "d_firstDeadIndex");
    if (rc)
        rc &= h_firstDeadIndexBuffer.Allocate(Cu::Buffer::HOST, sizeof(uint), Cu::Buffer::HOST_PINNED, "h_firstDeadIndex");
    if (rc)
        rc &= d_posMinBuffer.Allocate(Cu::Buffer::CUDA, sizeof(float4), 0, "d_posMin");
    if (rc)
        rc &= d_posMaxBuffer.Allocate(Cu::Buffer::CUDA, sizeof(float4), 0, "d_posMax");
    if (rc)
        rc &= h_posMinBuffer.Allocate(Cu::Buffer::HOST, sizeof(float4), Cu::Buffer::HOST_PINNED, "h_posMin");
    if (rc)
        rc &= h_posMaxBuffer.Allocate(Cu::Buffer::HOST, sizeof(float4), Cu::Buffer::HOST_PINNED, "h_posMax");

    // allocate a buffer to hold the blocks we reduce into for bbox calc...
    if (rc)
    {
        /// CAVEAT:
        // One float4 per reduction block. The block count is rounded UP so a
        // particle count that is not a multiple of the block size still gets a
        // slot for its last, partial block (the previous truncating division
        // could allocate fewer bytes than whole float4 slots).
        // NOTE(review): 128 is taken from the original expression as the thread
        // count of the reduction kernel — confirm against calculateBoundsAsync_CUDA.
        const size_t reduceThreads = 128;
        const size_t reduceBlocks = ((size_t)maxParticles + reduceThreads - 1) / reduceThreads;

        rc &= reduceTempBuffer.Allocate(Cu::Buffer::CUDA, reduceBlocks * sizeof(float4), 0, "d_reduceTemp");

        // only touch the buffer if it really exists.
        if (rc)
            reduceTempBuffer.Clear(0);
    }

    if (rc)
    {
        particleGrid = new ParticleGrid();
        rc = particleGrid->setup(maxParticles);
    }

    if (rc)
    {
        primitiveResidentData = new PrimitiveResidentData;
    }

    if (rc)
    {
        initialized = true;

        if (solver)
        {
            attributes.setSpecs("", &solver->parameterDefinitions());
            solver->updateParameters(attributes);
        }

        // this has to happen to set the flags on the particles.
        reset();
        return true;
    }

    // if anything goes wrong, then uninitialize the entire solver.
    printError(Stringf("Failed to initialize GPU particle-simulator-context[%p]: device(%d) method(%s) N(%d)", this, cudaDevice, pendingSolverMethod.c_str(), pendingMaxParticles));
    destroy();
    return false;
}

//------------------------------------------------------------------------------------------
/// Create a buffer.
///
/// Create a particle buffer and allocate storage for every page it holds.
///
/// @param spec  description of the buffer (name, element size, ...).
/// @param n     number of elements to allocate per page.
/// @param vbo   when true the pages are allocated as Cu::Buffer::VBO with the
///              write-discard graphics registration flag; otherwise as CUDA memory.
/// @return the new buffer, or a null pointer when any page failed to allocate.
ParticleBufferPtr ParticleModifier::_createBuffer(const ParticleBufferSpec& spec, int n, bool vbo)
{
    int f = 0;
    Cu::Buffer::MemType t = Cu::Buffer::CUDA;

    if (vbo)
    {
        t = Cu::Buffer::VBO;
        f = cudaGraphicsRegisterFlagsWriteDiscard;

        if (debugLevel > 0)
            printInfo("particle-solver allocating vbo: " + spec.name);
    }
    else
    {
        if (debugLevel > 0)
            printInfo("particle-solver allocating cuda: " + spec.name);
    }

    ParticleBufferPtr buf = new ParticleBuffer(spec);

    // Stop at the first page that fails: previously the result of an earlier
    // page was overwritten by later ones, so a failure could go unnoticed, and
    // Clear() was called even on a page that had not been allocated.
    bool rc = true;
    for (int i = 0; rc && i < buf->_nBuffers; ++i)
    {
        rc = buf->_buffers[i].Allocate(t, spec.elementBytes*n, f, spec.name.c_str());
        if (rc)
            buf->_buffers[i].Clear();
    }

    if (!rc)
    {
        printError("particle-solver unable to allocate: " + spec.name);
        buf.setNull();
    }

    return buf;
}

//------------------------------------------------------------------------------------------
/// Add a buffer and all its supplementary buffers.
///
bool ParticleModifier::addBuffer(const ParticleBufferSpec& spec)
{
	ParticleBufferPtr buf;
	std::string newName;
	ParticleBufferSpec newSpec;

	newName = spec.name;
	newSpec = spec;

    ParticleBuffers::const_iterator it = _buffers.find(newName);

	if (it == _buffers.end() || it->second->spec != newSpec)
	{
		buf = _createBuffer(newSpec, maxParticles);
		if (!buf.valid())
			return false;

		setBuffer(newName, buf);
	}

    if (spec.flags & ParticleBufferSpec::PERSISTENT)
    {
        if(enableSorting)
        {
            newName = spec.name + "Sorted";
			newSpec = ParticleBufferSpec(newName, spec.type, 0);

            it = _buffers.find(newName);
			if (it == _buffers.end() || it->second->spec != newSpec)
			{
				buf = _createBuffer(newSpec, maxParticles);
				if (!buf.valid())
					return false;

				setBuffer(newName, buf);
			}
        }
    }

    if(spec.flags & ParticleBufferSpec::RENDERABLE)
    {
        newName = spec.name + "Render";
		newSpec = ParticleBufferSpec(newName, spec.type, ParticleBufferSpec::DOUBLEBUFFER);

        it = _buffers.find(newName);
        if (it == _buffers.end() || it->second->spec != newSpec)
		{
			buf = _createBuffer(newSpec, maxParticles, true);
			if (!buf.valid())
				return false;

			setBuffer(newName, buf);
		}
    }

    return true;
}

//------------------------------------------------------------------------------------------
/// Make dynamic buffers for this simulator.
///
bool ParticleModifier::_ensureBuffers(const std::string& prefix, const ParticleBufferSpecs& bufferSpecs, bool clearFirst)
{
    if (clearFirst)
        _buffers.clear();

    if (bufferSpecs.size() == 0)
        return true;

    if(maxParticles == 0)
        return false;

	bool rc = true;
    // allocate a sorted variant of all the persistent buffers...
    for (ParticleBufferSpecs::const_iterator it=bufferSpecs.begin(); rc && it!=bufferSpecs.end(); ++it)
    {
		ParticleBufferSpec spec = it->second;
		if (prefix != "")
			spec.name = prefix + "_" + spec.name;
		rc = addBuffer(spec);
    }

	if(rc)
		return true;

    // this is bad news.
	return false;
}

//-----------------------------------------------------------------------------------
/// Attach (or detach) the particle container this modifier works on.
///
/// @param c  container to use; may be null to detach. updateAsync() and
///           syncContainer() already treat a null container as "nothing to do".
///
/// When a container is given, its capacity becomes the pending particle count,
/// which the next initialize() call picks up. The modifier is marked dirty in
/// either case.
void ParticleModifier::setContainer(ParticleContainer* c)
{
    container = c;

    // only a real container can tell us a capacity; the previous code
    // dereferenced c unconditionally and crashed on a null argument.
    if (c)
        pendingMaxParticles = c->maxParticles;

    dirty = true;
}

//-----------------------------------------------------------------------------------
/// reset.
///
/// Reset the particle state: wait for pending stream work, clear the buffers
/// through resetBuffers(), and zero the per-page render particle counts.
/// Does nothing while the modifier is not initialized.
void ParticleModifier::reset()
{
    if (initialized)
    {
        // we must finish the current processing before touching the buffers.
        if (cuStream)
            cuStream->sync();

        resetBuffers();

        // ...and wait again after the reset.
        if (cuStream)
            cuStream->sync();

        for (int page = 0; page < 2; ++page)
            _renderNumParticles[page] = 0;
    }
}



// Bounds routine with C linkage, defined outside this file.
// calculateBoundsAsync() calls it with: the host min/max result buffers, the
// device min/max buffers, the particle count, the device pointer of the
// "position" buffer, the reduction scratch buffer and the modifier's stream.
// NOTE(review): presumably it enqueues the reduction and the download of the
// result into h_outMin/h_outMax on `stream` — confirm against its definition.
extern "C" void calculateBoundsAsync_CUDA(float* h_outMin, float* h_outMax,
                                float* d_outMin, float* d_outMax,
                                int n, float* positions, float* tempBuf,
                                cudaStream_t stream);

//------------------------------------------------------------------------------------------
/// update the bounds.
///
void ParticleModifier::calculateBoundsAsync()
{
	if(!dirtyBounds)
		return;

	dirtyBounds = false;

    if(numParticles == 0 || !getBuffer("position"))
    {
        vec3f* v;
        v = (vec3f*)h_posMinBuffer.Data();
        *v = make_vec3f(0);
        v = (vec3f*)h_posMaxBuffer.Data();
        *v = make_vec3f(0);
        return;
    }

    NVPARTICLES_SCOPED_TIMER("calculateBoundsAsync", *cuStream);

    calculateBoundsAsync_CUDA((float*)h_posMinBuffer.Data(), (float*)h_posMaxBuffer.Data(),
                    (float*)d_posMinBuffer.Data(), (float*)d_posMaxBuffer.Data(),
                    numParticles, (float*)getBuffer("position")->devicePointer(), (float*)reduceTempBuffer.Data(),
                    (cudaStream_t)*cuStream);
}

//------------------------------------------------------------------------------------------
/// Blocking bounds query.
///
/// Triggers calculateBoundsAsync(), waits for the stream, then copies the
/// host-side min/max into the output arguments.
///
/// @param minCorner  receives the lower corner.
/// @param maxCorner  receives the upper corner.
///
/// When the modifier is not initialized both outputs are left untouched.
void ParticleModifier::computeBounds(vec3f& minCorner, vec3f& maxCorner)
{
    if (initialized)
    {
        calculateBoundsAsync();

        // wait for everything to finish.
        cuStream->sync();

        // get the data that has been computed...
        float* hostMin = (float*)h_posMinBuffer.Data();
        float* hostMax = (float*)h_posMaxBuffer.Data();

        minCorner = vec3f::fromArray(hostMin);
        maxCorner = vec3f::fromArray(hostMax);
    }
}

//-----------------------------------------------------------------------------------
/// Clear all particle buffers and reset the bounds.
///
void ParticleModifier::resetBuffers()
{
	if (!initialized)
		return;

    // reset the bounds
    // mark the bounds as dirty, so we clear them.
	//dirtyBounds = true;
    *((float4*)h_posMinBuffer.Data()) = make_float4(0,0,0,1);
    *((float4*)h_posMaxBuffer.Data()) = make_float4(0,0,0,1);

    // make sure the pending firstDeadIndex is cleared to zero.
    h_firstDeadIndexBuffer.Clear();

	numParticles = 0;

	// we don't need this anymore because we do all this in the particle-container
	if(1)
	{
		if (getBuffer("position"))
		{
			// set particles to DEAD...
			Cu::BufferMapper<float4> p_position(Cu::Buffer::HOST, *getBuffer("position")->buffer());
			for (int i=0; i<maxParticles; ++i)
				p_position[i] = make_float4(0);
		}

		if (getBuffer("id"))
		{
			// initialize IDs as sequential numbers...
			Cu::BufferMapper<uint> p_id(Cu::Buffer::HOST, *getBuffer("id")->buffer());
			for (int i=0; i<maxParticles; ++i)
				p_id[i] = i;
		}
	}

}

//-----------------------------------------------------------------------------------
/// Request a new particle capacity.
///
/// The value is only recorded as pending; the next initialize() call applies
/// it. A request equal to the current capacity is ignored.
///
/// NOTE(review): because an equal request is ignored, asking for X and then
/// for the current capacity again leaves X pending — confirm this is intended.
void ParticleModifier::setMaxParticles(int n)
{
    if (maxParticles == n)
        return;

    pendingMaxParticles = n;
    initialized = false;
}

//-----------------------------------------------------------------------------------
/// Request a different solver method.
///
/// The name is only recorded as pending; the next initialize() call applies
/// it. A request equal to the current method is ignored.
///
/// NOTE(review): because an equal request is ignored, asking for another
/// method and then for the current one again leaves the other method pending
/// — confirm this is intended.
void ParticleModifier::setSolverType(std::string type)
{
    if (solverMethod == type)
        return;

    pendingSolverMethod = type;
    initialized = false;
}

//----------------------------------------------------------------------------------------------
/// Print the particle count and dump the buffers for debugging.
///
/// @param count  number of particles to dump; size_t(-1) means "all current particles".
/// @param step   forwarded unchanged to HasParticleBuffers::dump().
void ParticleModifier::dump(size_t count, size_t step)
{
    // The stream only exists between initialize() and destroy(); guard it the
    // same way reset() does so a dump on an uninitialized modifier cannot crash.
    if (cuStream)
        cuStream->sync();

    printf("Particles = %d\n", numParticles);

    if (count == size_t(-1))
        count = numParticles;
    HasParticleBuffers::dump(count, step);
}

//------------------------------------------------------------------------------------------
/// Decide whether name @p s passes an include/exclude filter.
///
/// @param s            the name to test.
/// @param useIncludes  when true, @p s must be contained in StringArray(includes).
/// @param includes     include list, parsed by StringArray.
/// @param useExcludes  when true, @p s must NOT be contained in StringArray(excludes).
/// @param excludes     exclude list, parsed by StringArray.
/// @return true when the name passes both active filters. The exclude list is
///         checked first.
bool CheckIncludesExcludesString(std::string s, bool useIncludes, std::string includes, bool useExcludes, std::string excludes)
{
    const bool isExcluded = useExcludes && StringArray(excludes).contains(s);
    if (isExcluded)
        return false;

    const bool isNotIncluded = useIncludes && !StringArray(includes).contains(s);
    return !isNotIncluded;
}

//-----------------------------------------------------------------------------------
/// Copy particle data from a container into this modifier on its stream.
///
/// "id" and "position" are copied from element 0 for `maxcount` elements
/// (the ordering of the whole range matters); every other container buffer is
/// copied for the requested range. Missing destination buffers are created
/// on demand. Afterwards numParticles is set to d_offset + copied count and
/// the bounds are flagged dirty.
///
/// @param container    source container.
/// @param d_offset     first destination element.
/// @param s_offset     first source element.
/// @param count        number of elements; size_t(-1) means "all particles of
///                     the container".
/// @param useIncludes / includes  optional include filter on buffer names.
/// @param useExcludes / excludes  optional exclude filter on buffer names.
/// @return the number of elements read for the ranged buffers, or 0 when the
///         modifier could not be initialized or the clamped range is empty.
size_t ParticleModifier::readContainerAsync(
    ParticleContainer* container,
    size_t d_offset, size_t s_offset, size_t count,
    bool useIncludes, std::string includes,
    bool useExcludes, std::string excludes
    )
{
    if (!initialize())
        return 0;

    NVPARTICLES_SCOPED_TIMER("ParticleModifier::ReadContainerAsync", *cuStream);

    int numcount = count;
    int maxcount = count;

    if (count == size_t(-1))
    {
        maxcount = maxParticles;
        numcount = container->numParticles;
    }

    // clamp to what the destination and the source can hold.
    if (numcount+(int)d_offset > maxParticles)
        numcount = maxParticles - d_offset;

    if (numcount+(int)s_offset > container->maxParticles)
        numcount = container->maxParticles - s_offset;

    if (maxcount > maxParticles)
        maxcount = maxParticles;

    Cu::Buffer::CopyOptions co;
    co.SetStream(*cuStream);

    // "id" and "position" are always transferred from the start of the
    // buffer for maxcount elements, because the ordering is important.
    static const char* const wholeRangeBuffers[] = { "id", "position" };
    for (int k = 0; k < 2; ++k)
    {
        ParticleBuffer* srcBuf = container->getBuffer(wholeRangeBuffers[k]);
        if (!srcBuf)
            continue;

        if (!CheckIncludesExcludesString(wholeRangeBuffers[k], useIncludes, includes, useExcludes, excludes))
            continue;

        addBuffer(srcBuf->spec);
        ParticleBuffer* dstBuf = getBuffer(srcBuf->spec.name);
        if (!dstBuf)
        {
            // addBuffer() failed; an assert alone would be compiled out in
            // release builds and the copy below would dereference null.
            printError("Can't create solver buffer: " + srcBuf->spec.name);
            continue;
        }

        dstBuf->copy(*srcBuf, 0, 0, maxcount, co);
    }

    if (numcount > 0)
    {
        // if we are storing the body on the host then
        // transfer all dynamic buffers from the container for processing...
        for (ParticleBuffers::const_iterator bit=container->_buffers.begin(); bit!=container->_buffers.end(); ++bit)
        {
            if (!CheckIncludesExcludesString(bit->second->spec.name, useIncludes, includes, useExcludes, excludes))
                continue;

            // already handled above.
            if (bit->second->spec.name == "id" || bit->second->spec.name == "position")
                continue;

            ParticleBuffer* destBuf = getBuffer(bit->second->spec.name);

            if (!destBuf)
            {
                // create the buffer if it doesn't exist...
                addBuffer(bit->second->spec);
                destBuf = getBuffer(bit->second->spec.name);
            }

            if (!destBuf)
            {
                printError("Can't create solver buffer: " + bit->second->spec.name);
                continue;
            }

            if (destBuf->copy(*bit->second.pointer(), d_offset, s_offset, numcount, co) == size_t(-1))
            {
                printError("Can't copy body to solver");
            }
        }
    }

    // set the numParticles to the amount we have read in.
    numParticles = d_offset+numcount;

    /// OPTIMIZE:
    // positions will have likely changed, so flag the bounds as dirty.
    // (we COULD only update the dirty flag if position HAS changed.)
    dirtyBounds = true;

    // The clamps above can drive numcount negative when an offset lies beyond
    // the capacity; report that as "nothing read" instead of letting it wrap
    // to a huge unsigned value.
    return (numcount > 0) ? (size_t)numcount : 0;
}

//-----------------------------------------------------------------------------------
/// Append the particles the container has gained since the last transfer.
///
/// The range [numParticles, container->numParticles) is copied from every
/// container buffer except "id" into the matching modifier buffer (created on
/// demand), using the modifier's stream. numParticles grows by the number of
/// copied elements and the bounds are flagged dirty.
///
/// @param container    source container.
/// @param useIncludes / includes  optional include filter on buffer names.
/// @param useExcludes / excludes  optional exclude filter on buffer names.
/// @return number of emitted particles; 0 when there is nothing to emit or the
///         modifier could not be initialized.
size_t ParticleModifier::emitContainerAsync(
    ParticleContainer* container,
    //size_t d_offset, size_t s_offset, size_t count,
    bool useIncludes, std::string includes,
    bool useExcludes, std::string excludes
    )
{
    if (!initialize())
        return 0;

    const int s_offset = numParticles;
    const int d_offset = numParticles;
    int numcount = container->numParticles-numParticles;

    if (numcount < 0)
    {
        // the modifier has more particles than the container,
        // presumably because we created new particles inside the solver.
        return 0;
    }

    // ensure we can read the range.
    if (numcount+s_offset > container->maxParticles)
        numcount = container->maxParticles - s_offset;

    // ensure we can write the range.
    if (numcount+d_offset > maxParticles)
        numcount = maxParticles - d_offset;

    if (numcount <= 0)
        return 0;

    if (debugLevel > 0)
    {
        printInfo(Stringf("particle-container - emission into simulator[%p] - emission=%d, d_offset=%d, s_offset=%d", this, numcount, d_offset, s_offset));
    }

    NVPARTICLES_SCOPED_TIMER("ParticleModifier::emitContainerAsync", *cuStream);

    Cu::Buffer::CopyOptions co;
    co.SetStream(*cuStream);

    // if we are storing the body on the host then
    // transfer all dynamic buffers from the container for processing...
    for (ParticleBuffers::const_iterator bit=container->_buffers.begin(); bit != container->_buffers.end(); ++bit)
    {
        if (!CheckIncludesExcludesString(bit->second->spec.name, useIncludes, includes, useExcludes, excludes))
            continue;

        // ids are never emitted.
        if (bit->second->spec.name == "id")
            continue;

        ParticleBuffer* destBuf = getBuffer(bit->second->spec.name);

        if (!destBuf)
        {
            // create the buffer if it doesn't exist...
            addBuffer(ParticleBufferSpec(bit->second->spec.name, bit->second->spec.type));
            destBuf = getBuffer(bit->second->spec.name);
        }

        if (!destBuf)
        {
            // creation failed; previously this fell through and dereferenced null.
            printError("Can't create solver buffer: " + bit->second->spec.name);
            continue;
        }

        if (destBuf->copy(*bit->second, d_offset, s_offset, numcount, co) == size_t(-1))
        {
            printError("Can't emit into simulator");
        }
    }

    // treat the uploaded numParticles as an emission.
    numParticles += numcount;

    // positions will have likely changed, so flag the bounds as dirty.
    // (we COULD only update the dirty flag if position HAS changed.)
    dirtyBounds = true;

    return numcount;
}

//-----------------------------------------------------------------------------------
/// Copy modifier buffers back into a container on the modifier's stream.
///
/// @param container        destination container.
/// @param storeBodyOnHost  when true, the whole "id" buffer plus every
///                         PERSISTENT container buffer that is not yet at the
///                         requested update state are downloaded.
///
/// Independently of that flag, every modifier buffer the container is
/// exporting is copied (the destination buffer is created on demand).
///
/// @return the modifier's particle count, or 0 when the modifier could not be
///         initialized.
size_t ParticleModifier::writeContainerAsync(ParticleContainer* container, bool storeBodyOnHost)
{
    if (!initialize())
        return 0;

    NVPARTICLES_SCOPED_TIMER("ParticleModifier::writeContainerAsync", *cuStream);

    Cu::Buffer::CopyOptions co;
    co.SetStream(*cuStream);

    if (storeBodyOnHost)
    {
        if (debugLevel)
        {
            printInfo(Stringf("particle-simulator[%p] - downloading to container", this));
        }

        /// TODO:
        // we need to copy the entire id array!
        // Make this more efficient.
        if (container->getBuffer("id"))
        {
            if (getBuffer("id"))
                container->getBuffer("id")->copy(*getBuffer("id"), 0, 0, maxParticles, co);
            else
                printInfo(Stringf("particle-simulator[%p] - missing id buffer", this));
        }

        // transfer all persistent buffers to the container...
        for (ParticleBuffers::const_iterator it=container->_buffers.begin(); it != container->_buffers.end(); ++it)
        {
            if (it->second->spec.flags&ParticleBufferSpec::PERSISTENT && it->second->spec.name != "id")
            {
                if (getBuffer(it->second->spec.name))
                {
                    // skip buffers that are already at the requested state.
                    if (it->second->_currentUpdateState != requestedUpdateState)
                    {
                        it->second->copy(*getBuffer(it->second->spec.name), 0, 0, (debugLevel)?maxParticles:numParticles, co);
                        it->second->_currentUpdateState = requestedUpdateState;
                    }
                }
                else
                    printError(Stringf("particle-simulator[%p] - missing buffer: %s", this, it->second->spec.name.c_str()));
            }
        }
    }

    if (!container->_exportRequests.empty())
    {
        // if we are exporting to host...
        // transfer all persistent buffers to the container...
        for (ParticleBuffers::const_iterator bit=_buffers.begin(); bit!=_buffers.end(); ++bit)
        {
            // just copy what we ask for...
            if (!container->isExportingBuffer(bit->second->spec.name))
                continue;

            ParticleBuffer* destBuf = container->getBuffer(bit->second->spec.name);

            if (!destBuf)
            {
                // create the buffer if it doesn't exist...
                container->addBuffer(ParticleBufferSpec(bit->second->spec.name, bit->second->spec.type, 0, bit->second->spec.renderSemantic));
                destBuf = container->getBuffer(bit->second->spec.name);
                printInfo(Stringf("particle-container[%p] - creating buffer: %s", container, bit->second->spec.name.c_str()));
            }

            if (!destBuf)
            {
                // creation failed; previously this fell through and dereferenced null.
                printError(Stringf("particle-container[%p] - unable to create buffer: %s", container, bit->second->spec.name.c_str()));
                continue;
            }

            if (destBuf->getUpdateId() != requestedUpdateState)
            {
                destBuf->copy(*bit->second, 0, 0, (debugLevel)?maxParticles:numParticles, co);
                destBuf->setUpdateId(requestedUpdateState);

                if (debugLevel > 0)
                {
                    printInfo(Stringf("particle-simulator[%p] - exporting buffer: %s", this, bit->second->spec.name.c_str()));
                }
            }
        }
    }

    assert(container->numParticles == numParticles);

    return numParticles;
}

//-----------------------------------------------------------------------------------
/// Run one asynchronous update step for the attached container.
///
/// Steps: pull new (or, the first time, all) particles from the container,
/// refresh the bounds, run the solver when solving is requested, recompute the
/// bounds, and either write results back or mark the container buffers as
/// up to date. Work is issued on the modifier's stream; syncContainer() must
/// be called afterwards (the modifier is flagged dirty for that purpose).
///
/// Returns immediately when the modifier is not initialized, has no
/// container, or the container is already at the requested update state.
void ParticleModifier::updateAsync()
{
    if (!initialized)
        return;

    if (!container)
        return;

    if (container->_currentUpdateState == requestedUpdateState)
        return;

    // we need a sync after this method!
    dirty = true;

    if (debugLevel)
    {
        printInfo(Stringf("particle-simulator[%p] - updating container", this));
    }

    nvtxRangePushA("updateAsync");
    NVPARTICLES_SCOPED_TIMER("ParticleModifier::updateAsync", *cuStream);

    if (!container->_firstTime)
    {
        // read just the new particles from the container.
        emitContainerAsync(container);
    }
    else
    {
        // read the entire container.
        readContainerAsync(container);
        container->_firstTime = false;
    }

    //cuStream->sync();
    //getBuffer("position")->dump("solver-position", numParticles, numParticles/100);

    if (dirtyBounds)
    {
        nvtxRangePushA("getBounds1");
        // we may have emitted new particles...
        calculateBoundsAsync();
        // the host copies of the bounds are read right below, so wait here.
        cuStream->sync();
        nvtxRangePop();
    }

    container->boundsLow = *(vec3f*)h_posMinBuffer.Data();
    container->boundsHigh = *(vec3f*)h_posMaxBuffer.Data();

    bool contentsChanged = false;

/*
    if (numParticles)
    {
        /// CAVEAT:
        // make a box for the grid in world units
        boundingbox4f bbox;
        bbox = boundingbox4f(make_float4(container->boundsLow.x, container->boundsLow.y, container->boundsLow.z,0),
                make_float4(container->boundsHigh.x, container->boundsHigh.y, container->boundsHigh.z,0));
        particleGrid->SetCellSize(bbox, particleGrid->bucketSize);
        //STDERR2(particleGrid->particles_bbox.low, particleGrid->particles_bbox.high);
        //STDERR2(particleGrid->bucketSize, particleGrid->bucketCount);
    }
*/

    // assume the size doesn't change!
    *((int*)h_firstDeadIndexBuffer.Data()) = numParticles;

    if (numParticles && isSolving)
    {
        // initialize() accepts an empty solver method, in which case no solver
        // object exists; skip solving instead of dereferencing a null solver.
        if (solver)
        {
            SimulatorContext_GPU cxt;
            cxt.owner = this;
            cxt.stream = *cuStream;
            cxt.numParticles = numParticles;
            cxt.maxParticles = maxParticles;
            cxt.firstDeadIndexPtr = (Cu::Buffer*)&h_firstDeadIndexBuffer;
            cxt.boundsLow = container->boundsLow;
            cxt.boundsHigh = container->boundsHigh;

            attributes.setSpecs("", &solver->parameterDefinitions());
            solver->updateParameters(attributes);
            solver->evaluate(cxt);

            dirtyBounds = true;
            contentsChanged = true;
        }
    }

    nvtxRangePushA("getBounds2");
    // update the bounding-box and download to host...
    calculateBoundsAsync();
    nvtxRangePop();

    // if we are storing data inside the containers,
    // or if require export, then download after processing...
    if (contentsChanged)
    {
        writeContainerAsync(container, false);
    }
    else
    {
        // mark all container buffers as up-to-date. (because we haven't touched them!)
        for (ParticleBuffers::const_iterator bit=container->_buffers.begin(); bit!=container->_buffers.end(); ++bit)
            bit->second->setUpdateId(requestedUpdateState);
    }

    nvtxRangePop();
}

//-----------------------------------------------------------------------------------
/// Ensure the previous frame is complete and get the current data.
///
void ParticleModifier::syncContainer()
{
	if(!initialize())
		return;

    if(!dirty)
		return;

    nvtxRangePushA("sync");

    // wait for stream to finish.
    cuStream->sync();

    nvtxRangePushA("blit");
    _renderLock.claim();

    dirty = false;

    // get the data that has been computed...

    int firstDeadIndex = *((uint*)h_firstDeadIndexBuffer.Data());

    if(firstDeadIndex == -1)
    {
        // WARNING: it could be -1...
        // there are NO dead particles.
        // or
        // if we have forgotten to set the dead particles.w to NVPARTICLES_PARTICLE_DEAD_VALUE !
        firstDeadIndex = maxParticles;
        //assert(false && "no dead particles");
    }

	if (firstDeadIndex >= maxParticles)
		firstDeadIndex = maxParticles;

    int numDeaths = (numParticles - firstDeadIndex);


	if(numDeaths)
	{
	    if(debugLevel)
            STDERR3(numParticles, firstDeadIndex, numDeaths);
	}

    if(container)
    {

        if(debugLevel)
        {
            printInfo(Stringf("particle-simulator[%p] - syncing container", this));
        }

        // blit the latest simulation buffers into the render buffers...
        // this is essential because we want the rendering engine to render these particles
        // at the same time as we modify these buffers during the simulation update...
        //
        // NB. We could double-buffer the render-buffers and then this method could occur during the render.
        //

        //solver->Blit(*cuStream);

        // if there are no particles, then wipe the buffers with the dead particles.
        int numToUpdate = firstDeadIndex;
        if (numToUpdate == 0)
            numToUpdate = numParticles;

        if (1)
        {

            // ensure that openGL is finished with the buffers we are writing to...

            if (1)
            {
                //printInfo(Stringf("syncing [%d]", _renderSyncIndex));

                if (glIsSync(_glRenderSync[_renderSyncIndex]))
                {
                    NVPARTICLES_SCOPED_TIMER("solver-glsync", *cuStream);

                    GLenum glSyncRc;
                    do
                    {
                        //printInfo("_glRenderSync wait...");
                        // wait forever until sync is signaled...
                        glSyncRc = glClientWaitSync(_glRenderSync[_renderSyncIndex], GL_SYNC_FLUSH_COMMANDS_BIT, 1000);

                    } while (glSyncRc == GL_TIMEOUT_EXPIRED);

                    if (glSyncRc == GL_ALREADY_SIGNALED)
                        {}//printInfo("_glRenderSync was already signaled.");
                    else if (glSyncRc == GL_CONDITION_SATISFIED)
                        printInfo(Stringf("_glRenderSync[%d] blocked until signaled.", _renderSyncIndex));
                    else
                        printInfo(Stringf("_glRenderSync[%d] error or timeout.", _renderSyncIndex));

                    if (_glRenderSync[_renderSyncIndex])
                        glDeleteSync(_glRenderSync[_renderSyncIndex]);
                    _glRenderSync[_renderSyncIndex] = 0;
                }

#if defined(NVPARTICLES_USE_DOUBLEBUFFERING)
                _renderSyncIndex = (++_renderSyncIndex)%2;
#endif

                glCheckErrors();
            }

            if (1)
            {
                NVPARTICLES_SCOPED_TIMER("solver-blit", *cuStream);

                for (ParticleBuffers::const_iterator it=_buffers.begin(); it != _buffers.end(); ++it)
                {
                    if (it->second->spec.flags&ParticleBufferSpec::RENDERABLE)
                    {
                        // we are handling this in the solver!
                        //if (it->second->spec.renderSemantic == "position")
                        //    continue;

                        // get render buffer for this buffer
                        ParticleBuffer* renderBuf = getBuffer(it->second->spec.name+"Render");
				        assert(renderBuf);

                        if (numToUpdate)
                        {
                            ParticleBuffer* sourceBuf = getBuffer(it->second->spec.name);
				            assert(sourceBuf);

                            //printInfo(Stringf("rendering to buffer: %s [%d]", renderBuf->spec.name.c_str(), renderBuf->page()));

                            renderBuf->lock(*cuStream);
                            renderBuf->copy(*sourceBuf, 0, 0, numToUpdate, Cu::Buffer::CopyOptions().SetStream(*cuStream));
                            renderBuf->unlock();
                        }

#if defined(NVPARTICLES_USE_DOUBLEBUFFERING)
				        renderBuf->flip();
#endif
                    }
                }
                cuStream->sync();
            }
        }

        _renderNumParticles[_renderSyncIndex] = firstDeadIndex;
    }

    numParticles = firstDeadIndex;

    if(container)
    {
        // update the container...

		container->boundsLow = *(vec3f*)h_posMinBuffer.Data();
		container->boundsHigh = *(vec3f*)h_posMaxBuffer.Data();
        container->lastNumParticles = firstDeadIndex;
        container->numParticles = firstDeadIndex;
        // mark container as clean.
        container->_currentUpdateState = requestedUpdateState;

        // let other threads know that this is finished.
        container->readyLock.release();

        if (numDeaths)
        {
            if (debugLevel > 0)
                printInfo(Stringf("particle-simulator[%p] - syncing: nParticles=%d, nDeaths=%d, bounds=(%f,%f,%f) (%f,%f,%f)",
                    this, container->numParticles, numDeaths, container->boundsLow.x, container->boundsLow.y, container->boundsLow.z,
                    container->boundsHigh.x, container->boundsHigh.y, container->boundsHigh.z));
        }
	}

    /// CAVEAT:
    // if we want to access this data, then we need to copy it inside this thread.
	//if (particleGrid)
	//	particleGrid->CopyGridDataToHost();

    if (isSolving)
		isSolving = false;

    // mark this simulator as clean.
    _currentUpdateState = requestedUpdateState;

    _renderLock.release();
    nvtxRangePop(); // renderlock

    nvtxRangePop();
}

//-----------------------------------------------------------------------------------
// Publish the render-side buffer handles and renderer attributes.
//
// outAttr    - receives "__particleRadius": half of the solver's particle
//              spacing, or 1.0f when no solver exists.
// outBuffers - receives one "<renderSemantic>-vbo" entry (the VBO id of the
//              "<name>Render" buffer) for every buffer flagged RENDERABLE,
//              plus "numParticles".
//
// Does nothing while the modifier is not initialized.
void ParticleModifier::updateRenderBuffers(NvParticles::Parameters& outAttr, NvParticles::Parameters& outBuffers)
{
    if (!initialized)
        return;

    for (ParticleBuffers::const_iterator it=_buffers.begin(); it!=_buffers.end(); ++it)
    {
        if (it->second->spec.flags&ParticleBufferSpec::RENDERABLE)
        {
            // every renderable buffer has a companion "<name>Render" buffer.
            ParticleBuffer* renderBuf = getBuffer(it->second->spec.name+"Render");
            assert(renderBuf);

            //printInfo(Stringf("rendering from buffer: %s [%d]", renderBuf->spec.name.c_str(), renderBuf->page()));

            std::string outname = Stringf("%s-vbo", it->second->spec.renderSemantic.c_str());
            outBuffers[outname] = (int)(*renderBuf)[0].Vbo();
        }
    }

#if defined(NVPARTICLES_USE_DOUBLEBUFFERING)
    // double-buffered: read the count from the slot other than _renderSyncIndex.
    outBuffers["numParticles"] = _renderNumParticles[(_renderSyncIndex+1)%2];
#else
    // single-buffered: _renderSyncIndex is never advanced, so the count must be
    // read from the very slot it was stored in (reading the other slot would
    // return a value that is never updated).
    outBuffers["numParticles"] = _renderNumParticles[_renderSyncIndex];
#endif

    /// TODO:
    // these should be in particles.
    // as should the solver!
    if (solver)
    {
        outAttr["__particleRadius"] = solver->particleSpacing()/2;
    }
    else
    {
        outAttr["__particleRadius"] = 1.0f;
    }
}

//------------------------------------------------------------------------------------------
// Construct a solver that is not yet bound to a CUDA device, GL context or
// particle container. Only the force and primitive collections are allocated.
ParticleSolver::ParticleSolver()
{
    // no device / scheduler / GL configuration yet.
    _context = 0;
    _cudaScheduler = 0;
    _cudaDevice = -1;
    _display = 0;
    _glcontext = 0;

    // update bookkeeping starts out clean and idle.
    _pendingUpdateState = 0;
    _currentUpdateState = _pendingUpdateState;
    _isUpdating = false;

    // nothing attached, time at zero, debugging off.
    _container = 0;
    _debugLevel = 0;
    _currentTime = 0;

    // collections released again in destroy().
    _forces = new ParticleForces();
    _primitives = new Primitives();
}

//------------------------------------------------------------------------------------------
// Tear the solver down; all of the cleanup work lives in destroy().
ParticleSolver::~ParticleSolver()
{
    this->destroy();
}

//----------------------------------------------------------------------------------------------
void ParticleSolver::destroy()
{
    sync();

    if (_context)
    {
        // destroy the context...
        _cudaScheduler->add(new ModifierDestroyJob(_context));
        _cudaScheduler->waitAllDone();
        delete _cudaScheduler;
        _cudaScheduler = 0;
        _cudaDevice = -1;
        delete _context;
        _context = 0;
    }

    delete _forces;
    _forces = 0;
    delete _primitives;
    _primitives = 0;
}

//----------------------------------------------------------------------------------------------
// Bind the solver to a CUDA device and GL display/context.
//
// Returns false when the requested configuration equals the current one
// (nothing is touched); otherwise everything is destroyed, a fresh scheduler,
// context, force collection and primitive collection are created, and true is
// returned.
bool ParticleSolver::setCudaDeviceConfig(int cudaDevice, void* display, void* glcontext)
{
    // no need to reset the device when nothing changed.
    const bool sameConfig = (_cudaDevice == cudaDevice)
                         && (_display == display)
                         && (_glcontext == glcontext);
    if (sameConfig)
        return false;

    // the old context is out of date: drop it and everything that goes with it.
    destroy();

#ifdef _WIN32
    wglMakeCurrent((HDC)display, 0);
    _cudaScheduler = new CudaScheduler(cudaDevice, 1, (HDC)display, (HGLRC)glcontext);
    wglMakeCurrent((HDC)display, (HGLRC)glcontext);
#else
    _cudaScheduler = new CudaScheduler(cudaDevice, 1, (Display*)display, (GLXContext)glcontext);
#endif

    _context = new ParticleModifier(cudaDevice);

    // remember the configuration so an identical request can be skipped.
    _cudaDevice = cudaDevice;
    _display = display;
    _glcontext = glcontext;

    // destroy() released these; start over with new collections.
    _forces = new ParticleForces();
    _primitives = new Primitives();

    return true;
}

//------------------------------------------------------------------------------------------
// Attach a different particle container. When the container actually changes,
// any running update is finished first and the pending state is bumped.
void ParticleSolver::setContainer(ParticleContainer* container)
{
    if (_container == container)
        return;

    sync();
    _container = container;
    ++_pendingUpdateState;
}

//-----------------------------------------------------------------------------------
// Select the solver method by name. When the name actually changes, any
// running update is finished first and the pending state is bumped.
void ParticleSolver::setSolverType(std::string solverMethod)
{
    if (_solverMethod == solverMethod)
        return;

    sync();
    _solverMethod = solverMethod;
    ++_pendingUpdateState;
}

//-----------------------------------------------------------------------------------
void ParticleSolver::setParameters(const Parameters& parameters)
{
    _parameters = parameters;
    ++_pendingUpdateState;
}

//-----------------------------------------------------------------------------------
// Register a named force. Any running update is finished first; the pending
// state is bumped only when the force collection reports the item as added.
void ParticleSolver::addForce(const std::string& name, const ForceData& item)
{
    sync();

    if (_forces->add(name, item))
    {
        ++_pendingUpdateState;
    }
}

//-----------------------------------------------------------------------------------
// Register a named primitive. Any running update is finished first; the pending
// state is bumped only when the primitive collection reports the item as added.
void ParticleSolver::addPrimitive(const std::string& name, const Primitive& item)
{
    sync();

    if (_primitives->add(name, item))
    {
        ++_pendingUpdateState;
    }
}

//-----------------------------------------------------------------------------------
// Set the current time. When the value actually changes, any running update
// is finished first and the pending state is bumped.
void ParticleSolver::setTime(float t)
{
    if (_currentTime == t)
        return;

    sync();
    _currentTime = t;
    ++_pendingUpdateState;
}

//------------------------------------------------------------------------------------------
void ParticleSolver::updateContext()
{
	if (!_context)
		return;

	_context->setSolverType(_solverMethod);
    _context->setParameters(_parameters);
	_context->primitives = _primitives;
	_context->particleForces = _forces;
    if (_container)
        _context->setMaxParticles(_container->maxParticles);

    // ensures the solver's context is allocated and initialized
    // which gives us access to the solver implementation object.
    _cudaScheduler->add(new ModifierInitializeJob(_context));
    _cudaScheduler->waitAllDone();
}

//------------------------------------------------------------------------------------------
// Return the particle spacing reported by the context's solver
// implementation, or 1 when there is no context or no solver.
float ParticleSolver::getParticleSpacing()
{
    const bool haveSolver = (_context != 0) && (_context->solver != 0);
    if (!haveSolver)
        return 1;

    return _context->solver->particleSpacing();
}

//------------------------------------------------------------------------------------------
// Start an update of the attached container on the scheduler.
//
// solve - stored in the context's isSolving flag for this update.
// force - run even when the solver / container state is already up to date.
//
// Returns immediately when there is no context or no container. Solve and
// sync-container jobs are only queued when a solver method is set and the
// container has simulation enabled; otherwise the container is simply marked
// as up to date.
void ParticleSolver::updateAsync(bool solve, bool force)
{
    if (!_context || !_container)
        return;

    NVPARTICLES_PROFILE("ParticleSolver::updateAsync");

    // finish any previous update.
    sync();

    // nothing changed since the last completed update?
    const bool solverIsCurrent = (_currentUpdateState == _pendingUpdateState);
    if (!force && solverIsCurrent)
        return;

    if (_debugLevel > 0)
        printInfo(Stringf("ParticleSolver - update[%d] - time = %f", _pendingUpdateState, _currentTime));

    // let the system know we are running an update.
    _isUpdating = true;

    // the container itself may already be at the requested state.
    if (!force && _container->_currentUpdateState == _pendingUpdateState)
        return;

    const bool canSimulate = (_solverMethod != "") && _container->enableSimulation;
    if (!canSimulate)
    {
        // nothing to do...
        _container->_currentUpdateState = _pendingUpdateState;
        return;
    }

    // prepare this solver for work on this container.
    _context->setContainer(_container);
    _context->setSolverType(_solverMethod);
    _context->setParameters(_parameters);
    _context->setMaxParticles(_container->maxParticles);
    _context->primitives = _primitives;
    _context->particleForces = _forces;
    _context->requestedUpdateState = _pendingUpdateState;
    _context->_currentTime = _currentTime;
    _context->isSolving = solve;

    // claim the container's ready-lock before the jobs are queued.
    _container->readyLock.claim();
    _container->dirty = false;
    _container->requestedUpdateState = _pendingUpdateState;

    // solve first, then sync the results back to the container.
    _cudaScheduler->add(new ModifierSolveJob(_context));
    _cudaScheduler->add(new ModifierSyncContainerJob(_context));
}

//-----------------------------------------------------------------------------------
// Block until every queued scheduler job has completed and, if an update was
// in flight, mark it as finished. Does nothing without a context.
void ParticleSolver::sync()
{
    if (_context == 0)
        return;

    // ensure the solver has completed.
    _cudaScheduler->waitAllDone();

    if (!_isUpdating)
        return;

    _isUpdating = false;

    if (_debugLevel > 0)
        printInfo(Stringf("ParticleSolver - update[%d] finished", _pendingUpdateState));

    // the pending state is now the current one.
    _currentUpdateState = _pendingUpdateState;
}

//-----------------------------------------------------------------------------------
// Draw the particles (and optionally the primitives and container bounds)
// using the given renderer.
//
// particleRenderer - renderer that receives the buffer and parameter updates
//                    and performs the draw; must not be null.
// drawBounds       - draw the container's bounding box as a wire box.
// drawGrid         - currently unused (the grid rendering is commented out).
// drawPrimitives   - render the context's primitives.
//
// The context's render lock is held for the duration of the call. A GL fence
// is stored in the context after the particle draw commands have been issued.
// Does nothing when no context exists.
void ParticleSolver::render(ParticleRenderer* particleRenderer, bool drawBounds, bool drawGrid, bool drawPrimitives)
{
    assert(particleRenderer);

    if (_context == 0)
        return;

    nvtxRangePushA("render");

    // make sure that the thread's buffers are ready...
    _context->_renderLock.claim();

    // get the updated buffer info and rendering parameters...
    Parameters rendererParams, bufferParams;
    _context->updateRenderBuffers(rendererParams, bufferParams);

    particleRenderer->update(bufferParams);

    /// HACK:
    // silly that I do this here, but the solver may have vital rendering info!
    particleRenderer->updateParameters(rendererParams);

    particleRenderer->render();

/*
    // you can only call this if we elected to transfer the grid data to
    // the host. This can only be copied while in the thread!
    glColor4f(1,1,0,1);
	if (_context->particleGrid && drawGrid)
    {
        _context->particleGrid->Render(true,true,true);
    }
*/
    glColor4f(1,0,0,1);
    if (_context->primitives && drawPrimitives)
        _context->primitives->render();

    // Fence the draw commands issued above. When render() runs more than once
    // before the fence is consumed, the slot still holds the previous fence;
    // delete it first, otherwise that sync object is leaked.
#if defined(NVPARTICLES_USE_DOUBLEBUFFERING)
    if (_context->_glRenderSync[(_context->_renderSyncIndex+1)%2])
        glDeleteSync(_context->_glRenderSync[(_context->_renderSyncIndex+1)%2]);
    _context->_glRenderSync[(_context->_renderSyncIndex+1)%2] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
#else
    if (_context->_glRenderSync[0])
        glDeleteSync(_context->_glRenderSync[0]);
    _context->_glRenderSync[0] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
#endif

    if (_context->container && drawBounds)
    {
        glColor3f(0, 1, 0);
        gl::drawWireBox((float *)&_context->container->boundsLow, (float *)&_context->container->boundsHigh);
    }

    glCheckErrors();

    _context->_renderLock.release();

    nvtxRangePop();
}

//-----------------------------------------------------------------------------------
// Forward a dump request (count, step) to the context once any running
// update has finished. Does nothing without a context.
void ParticleSolver::dump(size_t count, size_t step)
{
    if (_context)
    {
        sync();
        _context->dump(count, step);
    }
}

//----------------------------------------------------------------------------------------------
// Set the debug level of this solver and propagate the same value to the
// class-wide debug settings of the scheduler, modifier, container,
// primitives and forces.
void ParticleSolver::setDebugLevel(int v)
{
    // class-wide settings of the collaborating components.
    CudaScheduler::debugging = v;
    ParticleModifier::debugLevel = v;
    ParticleContainer::debugLevel = v;
    Primitives::debugLevel = v;
    ParticleForces::debugLevel = v;

    // this solver's own level.
    _debugLevel = v;
}

//-----------------------------------------------------------------------------------
}
}
