/* ---------------------------------------------------------------------------
 * This software is in the public domain, furnished "as is", without technical
 * support, and with no warranty, express or implied, as to its usefulness for
 * any purpose.

 * Author: Wil Braithwaite.
 *
 */

/* This is a simple maths header for vectors, matrices, and quaternions.
*/

#ifndef NVPARTICLES_MATH_H
#define NVPARTICLES_MATH_H

#ifndef NOMINMAX
#define NOMINMAX
#endif

#include <stdio.h>
#include <float.h>

// taken from vector_types.h
// __cuda_builtin_vector_align8(tag, members) declares `struct tag` holding
// `members`. Two variants are provided:
//  * 32-bit Windows host compilers (not nvcc): the members are wrapped in an
//    anonymous union next to a `long long` bit-field; MSVC warnings 4201 and
//    4408 are disabled for this.
//  * everything else: the struct is tagged with __align__(8).
// NOTE(review): the `#pragma warning(push)` below has no matching `pop` in the
// visible part of this file - confirm whether that is intended.
#if !defined(__CUDACC__) && !defined(__CUDABE__) && defined(_WIN32) && !defined(_WIN64)

#pragma warning(push)
#pragma warning(disable: 4201 4408)

#define __cuda_builtin_vector_align8(tag, members) \
        struct tag {                               \
          union {                                  \
            struct { members };                    \
            struct { long long int :1,:0; };       \
          };                                       \
        }

#else /* !__CUDACC__ && !__CUDABE__ && _WIN32 && !_WIN64 */

#define __cuda_builtin_vector_align8(tag, members) \
        struct __align__(8) tag {                  \
          members                                  \
        }

#endif /* !__CUDACC__ && !__CUDABE__ && _WIN32 && !_WIN64 */

#if defined(__CUDACC__) || defined(__CUDABE__)
#include "host_defines.h"
#endif

// taken from host_defines.h

// Compiler-portable spellings of function/type attributes.
// GCC-compatible compilers use __attribute__((...)); Windows compilers use
// __declspec(...). Any other compiler stops the build with #error.
#if defined(__GNUC__)

// Marks a function that never returns.
#define NVPARTICLES_NO_RETURN \
        __attribute__((noreturn))
#define NVPARTICLES_INLINE \
        inline
// Asks the compiler not to inline the function.
#define NVPARTICLES_NO_INLINE \
        __attribute__((noinline))
// On this branch "force inline" is a plain `inline`; the always_inline
// alternatives below are left disabled.
#define NVPARTICLES_FORCE_INLINE \
        inline
        //NVPARTICLES_INLINE inline
        //__attribute__((always_inline))
// Requests n-byte alignment for the annotated type or variable.
#define NVPARTICLES_ALIGN(n) \
        __attribute__((aligned(n)))
// Attaches an arbitrary attribute `a`.
#define NVPARTICLES_ANNOTATE(a) \
        __attribute__((a))
#define NVPARTICLES_LOCATION(a) \
        NVPARTICLES_ANNOTATE(a)

#elif defined(_WIN32)

// Marks a function that never returns.
#define NVPARTICLES_NO_RETURN \
        __declspec(noreturn)
#define NVPARTICLES_INLINE \
        __inline
// Asks the compiler not to inline the function.
#define NVPARTICLES_NO_INLINE \
        __declspec(noinline)
#define NVPARTICLES_FORCE_INLINE \
        __forceinline
// Requests n-byte alignment for the annotated type or variable.
#define NVPARTICLES_ALIGN(n) \
        __declspec(align(n))
// Attaches an arbitrary __declspec attribute `a`.
#define NVPARTICLES_ANNOTATE(a) \
        __declspec(a)
// Same as NVPARTICLES_ANNOTATE but wraps the name in double underscores
// (e.g. `device` becomes `__device__`).
#define NVPARTICLES_LOCATION(a) \
        NVPARTICLES_ANNOTATE(__##a##__)

#else

#error --- !!! UNKNOWN COMPILER !!! ---

#endif

// NVPARTICLES_CUDA_EXPORT decorates functions that must be callable from both
// host and device code: it expands to `__host__ __device__` under the CUDA
// compiler and to nothing otherwise.
#if defined(__CUDACC__) || defined(__CUDABE__)
#define NVPARTICLES_CUDA_EXPORT __host__ __device__
#define NVPARTICLES_CUDA_DEVICE_MEMORY
#else
// NOTE(review): NVPARTICLES_CUDA_DEVICE_MEMORY is not defined on this branch -
// confirm nothing uses it in host-only builds.
#define NVPARTICLES_CUDA_EXPORT
#endif


#ifndef __CUDA_ARCH__
#include <iostream>
#endif

#ifndef __CUDACC__
#include <math.h>
#include <stdlib.h>
#endif

namespace Easy
{
#ifdef _WIN32
typedef unsigned int uint;
#endif

#ifndef __CUDACC__

////////////////////////////////////////////////////////////////////////////////
// host implementations of CUDA functions
////////////////////////////////////////////////////////////////////////////////

/// Host stand-in for CUDA's fminf: yields `a` when a < b, otherwise `b`.
NVPARTICLES_FORCE_INLINE
float fminf(float a, float b)
{
  if (a < b)
    return a;
  return b;
}

/// Host stand-in for CUDA's fmaxf: yields `a` when a > b, otherwise `b`.
NVPARTICLES_FORCE_INLINE
float fmaxf(float a, float b)
{
  if (a > b)
    return a;
  return b;
}

#if !defined(_WIN32)

/// Larger of two ints (host build only; not compiled on Windows).
NVPARTICLES_FORCE_INLINE
int max(int a, int b)
{
  if (a > b)
    return a;
  return b;
}

/// Smaller of two ints (host build only; not compiled on Windows).
NVPARTICLES_FORCE_INLINE
int min(int a, int b)
{
  if (a < b)
    return a;
  return b;
}

#endif

/// Host stand-in for CUDA's rsqrtf: reciprocal of the square root of x.
NVPARTICLES_FORCE_INLINE
float rsqrtf(float x)
{
    const float root = sqrtf(x);
    return 1.0f / root;
}
#else

#include "math_functions.h"
#endif

/// Smallest of three values, compared with operator<.
template<typename T>
NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT
T min3(T a, T b, T c)
{
  if (a < b)
    return (a < c) ? a : c;
  return (b < c) ? b : c;
}

/// Largest of three values, compared with operator>.
template<typename T>
NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT
T max3(T a, T b, T c)
{
  if (a > b)
    return (a > c) ? a : c;
  return (b > c) ? b : c;
}

#ifndef FLT_MAX
#define FLT_MAX 9999999.f
#endif

//---------------------------------------------------------------------------------------
// definitions:

#ifndef PI
// Single-precision value of pi (3.14159265...); skipped when a PI macro
// already exists.
const float PI = 3.14159265f;
#endif

#ifndef DEGTORAD

/// Converts an angle from degrees to radians (x * PI / 180).
NVPARTICLES_FORCE_INLINE
NVPARTICLES_CUDA_EXPORT float DEGTORAD(float x)
{
    const float scaled = x * PI;
    return scaled / 180.f;
}
#endif


//---------------------------------------------------------------------------------------
// Vec3f

/// Three-component float vector (x, y, z) usable from host and device code.
/// It has no constructors; values are built with the make_vec3f() helpers.
struct vec3f
{
    /// Builds a vector from three consecutive floats starting at `v`.
    NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT static vec3f fromArray(float* v);
    // NOTE(review): no definition of this member is visible in this part of
    // the file (only free is_zero() overloads are) - confirm it exists.
    NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT uint is_zero();
    /// Index (0 = x, 1 = y, 2 = z) of the component with the largest magnitude.
    NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT uint get_dominant_axis();

    /// Scales this vector to unit length in place; a zero-length vector becomes (0,0,1).
    NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT void normalize();
    /// Returns a unit-length copy; a zero-length vector yields (0,0,1).
    NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT vec3f normalized() const;

    /// Prints "<name> = x y z" with printf in host code; does nothing in device code.
    NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT void Dump(const char *name=NULL);

    float x,y,z;
};

/// Four-component float vector (x, y, z, w), aligned to 16 bytes.
/// It has no constructors; values are built with the make_vec4f() helpers.
struct NVPARTICLES_ALIGN(16) vec4f
{
    /// (1,0,0,0)
    NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT static vec4f unitX();
    /// (0,1,0,0)
    NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT static vec4f unitY();
    /// (0,0,1,0)
    NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT static vec4f unitZ();

    /// Scales x,y,z in place by their 3D length and leaves w untouched;
    /// a zero-length x,y,z becomes (0,0,1).
    NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT void normalize();
    /// Returns a copy with x,y,z scaled to unit 3D length and w set to 0.
    NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT vec4f normalized() const;

    /// Builds a vector from four consecutive floats starting at `v`.
    NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT static vec4f fromArray(float* v);

    float x,y,z,w;
};

typedef struct vec3f vec3f;
typedef struct vec4f vec4f;

//---------------------------------------------------------------------------------------
/// Exact (no tolerance) equality of all four components.
NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT
bool operator== (const vec4f& a, const vec4f& b)
{
    if (a.x != b.x) return false;
    if (a.y != b.y) return false;
    if (a.z != b.z) return false;
    return a.w == b.w;
}

//---------------------------------------------------------------------------------------
/// True when at least one of the four components differs.
NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT
bool operator!= (const vec4f& a, const vec4f& b)
{
    if (a.x != b.x) return true;
    if (a.y != b.y) return true;
    if (a.z != b.z) return true;
    return a.w != b.w;
}

//---------------------------------------------------------------------------------------
/// Identity overload: returns a copy of `v`.
NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT
vec3f make_vec3f(vec3f v)
{
    vec3f copy = v;
    return copy;
}

//---------------------------------------------------------------------------------------
/// Takes the x, y and z components of a vec4f and drops w.
NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT
vec3f make_vec3f(const vec4f& v)
{
    vec3f out = { v.x, v.y, v.z };
    return out;
}

/// Builds a vec3f from ptr[0], ptr[1] and ptr[2]; `ptr` is not checked.
NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT
vec3f vec3f::fromArray(float* ptr)
{
    vec3f out = { ptr[0], ptr[1], ptr[2] };
    return out;
}
/// Builds a vec4f from ptr[0] .. ptr[3]; `ptr` is not checked.
NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT
vec4f vec4f::fromArray(float* ptr)
{
    vec4f out = { ptr[0], ptr[1], ptr[2], ptr[3] };
    return out;
}

//---------------------------------------------------------------------------------------
/// Builds a vec3f from its three components.
NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT
vec3f make_vec3f(float vx, float vy, float vz)
{
    vec3f out = { vx, vy, vz };
    return out;
}

//---------------------------------------------------------------------------------------
/// Builds a vec3f with all three components set to `v`.
NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT
vec3f make_vec3f(float v)
{
    vec3f out = { v, v, v };
    return out;
}

//---------------------------------------------------------------------------------------
/// Builds a vec4f from its four components.
NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT
vec4f make_vec4f(float vx, float vy, float vz, float vw)
{
    vec4f out = { vx, vy, vz, vw };
    return out;
}

//---------------------------------------------------------------------------------------
/// Builds a vec4f with all four components set to `v`.
NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT
vec4f make_vec4f(float v)
{
    vec4f out = { v, v, v, v };
    return out;
}

//---------------------------------------------------------------------------------------
/// Extends a vec3f with an explicit w component.
NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT
vec4f make_vec4f(vec3f v, float vw)
{
    vec4f out = { v.x, v.y, v.z, vw };
    return out;
}
//---------------------------------------------------------------------------------------

//------------------------------------------------------------------------------------------
/// Component-wise floor of a vec4f (all four components, including w).
NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT
vec4f ffloorf(vec4f a)
{
    // Host and device builds both use the global floorf.
    a.x = ::floorf(a.x);
    a.y = ::floorf(a.y);
    a.z = ::floorf(a.z);
    a.w = ::floorf(a.w);
    return a;
}

/// Component-wise minimum of two vec3f values.
NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT
vec3f fminf(vec3f a, vec3f b)
{
    vec3f lo;
#ifndef __CUDACC__
    // Host build: use the scalar fallback declared in this namespace.
    lo.x = Easy::fminf(a.x, b.x);
    lo.y = Easy::fminf(a.y, b.y);
    lo.z = Easy::fminf(a.z, b.z);
#else
    // CUDA build: use the global scalar fminf.
    lo.x = ::fminf(a.x, b.x);
    lo.y = ::fminf(a.y, b.y);
    lo.z = ::fminf(a.z, b.z);
#endif
    return lo;
}

/// Component-wise minimum of two vec4f values (w included).
NVPARTICLES_FORCE_INLINE NVPARTICLES_CUDA_EXPORT
vec4f fminf(vec4f a, vec4f b)
{
    vec4f lo;
#ifndef __CUDACC__
    // Host build: use the scalar fallback declared in this namespace.
    lo.x = Easy::fminf(a.x, b.x);
    lo.y = Easy::fminf(a.y, b.y);
    lo.z = Easy::fminf(a.z, b.z);
    lo.w = Easy::fminf(a.w, b.w);
#else
    // CUDA build: use the global scalar fminf.
    lo.x = ::fminf(a.x, b.x);
    lo.y = ::fminf(a.y, b.y);
    lo.z = ::fminf(a.z, b.z);
    lo.w = ::fminf(a.w, b.w);
#endif
    return lo;
}

/// Component-wise maximum of two vec3f values.
inline NVPARTICLES_CUDA_EXPORT vec3f fmaxf(vec3f a, vec3f b)
{
    vec3f hi;
#ifndef __CUDACC__
    // Host build: use the scalar fallback declared in this namespace.
    hi.x = Easy::fmaxf(a.x, b.x);
    hi.y = Easy::fmaxf(a.y, b.y);
    hi.z = Easy::fmaxf(a.z, b.z);
#else
    // CUDA build: use the global scalar fmaxf.
    hi.x = ::fmaxf(a.x, b.x);
    hi.y = ::fmaxf(a.y, b.y);
    hi.z = ::fmaxf(a.z, b.z);
#endif
    return hi;
}

/// Component-wise maximum of two vec4f values (w included).
inline NVPARTICLES_CUDA_EXPORT vec4f fmaxf(vec4f a, vec4f b)
{
    vec4f hi;
#ifndef __CUDACC__
    // Host build: use the scalar fallback declared in this namespace.
    hi.x = Easy::fmaxf(a.x, b.x);
    hi.y = Easy::fmaxf(a.y, b.y);
    hi.z = Easy::fmaxf(a.z, b.z);
    hi.w = Easy::fmaxf(a.w, b.w);
#else
    // CUDA build: use the global scalar fmaxf.
    hi.x = ::fmaxf(a.x, b.x);
    hi.y = ::fmaxf(a.y, b.y);
    hi.z = ::fmaxf(a.z, b.z);
    hi.w = ::fmaxf(a.w, b.w);
#endif
    return hi;
}
//------------------------------------------------------------------------------------------
/// Prints "<name> = x y z" to stdout; a null name prints as an empty string.
/// Compiles to an empty body in device code.
inline NVPARTICLES_CUDA_EXPORT void vec3f::Dump(const char *name)
{
#ifndef __CUDA_ARCH__
    const char *label = name ? name : "";
    printf("%s = %f %f %f\n", label, x, y, z);
#endif
}

//------------------------------------------------------------------------------------------
/// Exact (no tolerance) equality of all three components.
inline NVPARTICLES_CUDA_EXPORT bool operator==(vec3f a, vec3f b)
{
    if (a.x != b.x) return false;
    if (a.y != b.y) return false;
    return a.z == b.z;
}
/// True only when every component of `a` is strictly less than the matching
/// component of `b` (this is not a lexicographic ordering).
inline NVPARTICLES_CUDA_EXPORT bool operator<(vec3f a, vec3f b)
{
    if (!(a.x < b.x)) return false;
    if (!(a.y < b.y)) return false;
    return a.z < b.z;
}
//------------------------------------------------------------------------------------------
/// Component-wise sum of two vec3f values.
inline NVPARTICLES_CUDA_EXPORT vec3f operator+(vec3f a, vec3f b)
{
    a.x += b.x;
    a.y += b.y;
    a.z += b.z;
    return a;
}
/// Adds `b` to `a` component-wise, in place.
inline NVPARTICLES_CUDA_EXPORT void operator+=(vec3f &a, vec3f b)
{
    a.x = a.x + b.x;
    a.y = a.y + b.y;
    a.z = a.z + b.z;
}
/// Adds the scalar `b` to every component of `a`.
inline NVPARTICLES_CUDA_EXPORT vec3f operator+(vec3f a, float b)
{
    a.x += b;
    a.y += b;
    a.z += b;
    return a;
}
/// Adds the scalar `b` to every component of `a`, in place.
inline NVPARTICLES_CUDA_EXPORT void operator+=(vec3f &a, float b)
{
    a.x = a.x + b;
    a.y = a.y + b;
    a.z = a.z + b;
}
/// Scalar-on-the-left form: adds `b` to every component of `a`.
inline NVPARTICLES_CUDA_EXPORT vec3f operator+(float b, vec3f a)
{
    a.x += b;
    a.y += b;
    a.z += b;
    return a;
}
/// Component-wise sum of two vec4f values (w included).
inline NVPARTICLES_CUDA_EXPORT vec4f operator+(vec4f a, vec4f b)
{
    a.x += b.x;
    a.y += b.y;
    a.z += b.z;
    a.w += b.w;
    return a;
}
/// Adds `b` to `a` component-wise, in place (w included).
inline NVPARTICLES_CUDA_EXPORT void operator+=(vec4f &a, vec4f b)
{
    a.x = a.x + b.x;
    a.y = a.y + b.y;
    a.z = a.z + b.z;
    a.w = a.w + b.w;
}
/// Adds the scalar `b` to every component of `a` (w included).
inline NVPARTICLES_CUDA_EXPORT vec4f operator+(vec4f a, float b)
{
    a.x += b;
    a.y += b;
    a.z += b;
    a.w += b;
    return a;
}
/// Scalar-on-the-left form: adds `b` to every component of `a` (w included).
inline NVPARTICLES_CUDA_EXPORT vec4f operator+(float b, vec4f a)
{
    a.x += b;
    a.y += b;
    a.z += b;
    a.w += b;
    return a;
}
/// Adds the scalar `b` to every component of `a`, in place (w included).
inline NVPARTICLES_CUDA_EXPORT void operator+=(vec4f &a, float b)
{
    a.x = a.x + b;
    a.y = a.y + b;
    a.z = a.z + b;
    a.w = a.w + b;
}

//------------------------------------------------------------------------------------------

/// Component-wise difference a - b.
inline NVPARTICLES_CUDA_EXPORT
vec3f operator-(vec3f a, vec3f b)
{
    a.x -= b.x;
    a.y -= b.y;
    a.z -= b.z;
    return a;
}

/// Subtracts `b` from `a` component-wise, in place.
inline NVPARTICLES_CUDA_EXPORT
void operator-=(vec3f &a, vec3f b)
{
    a.x = a.x - b.x;
    a.y = a.y - b.y;
    a.z = a.z - b.z;
}

/// Subtracts the scalar `b` from every component of `a`.
inline NVPARTICLES_CUDA_EXPORT
vec3f operator-(vec3f a, float b)
{
    a.x -= b;
    a.y -= b;
    a.z -= b;
    return a;
}

/// Scalar-on-the-left form: each result component is b minus the component of `a`.
inline NVPARTICLES_CUDA_EXPORT
vec3f operator-(float b, vec3f a)
{
    a.x = b - a.x;
    a.y = b - a.y;
    a.z = b - a.z;
    return a;
}

/// Subtracts the scalar `b` from every component of `a`, in place.
inline NVPARTICLES_CUDA_EXPORT
void operator-=(vec3f &a, float b)
{
    a.x = a.x - b;
    a.y = a.y - b;
    a.z = a.z - b;
}


/// Component-wise difference a - b (w included).
inline NVPARTICLES_CUDA_EXPORT
vec4f operator-(vec4f a, vec4f b)
{
    a.x -= b.x;
    a.y -= b.y;
    a.z -= b.z;
    a.w -= b.w;
    return a;
}
/// Subtracts `b` from `a` component-wise, in place (w included).
inline NVPARTICLES_CUDA_EXPORT
void operator-=(vec4f &a, vec4f b)
{
    a.x = a.x - b.x;
    a.y = a.y - b.y;
    a.z = a.z - b.z;
    a.w = a.w - b.w;
}
/// Subtracts the scalar `b` from every component of `a` (w included).
inline NVPARTICLES_CUDA_EXPORT vec4f operator-(vec4f a, float b)
{
    a.x -= b;
    a.y -= b;
    a.z -= b;
    a.w -= b;
    return a;
}
/// Subtracts the scalar `b` from every component of `a`, in place (w included).
inline NVPARTICLES_CUDA_EXPORT void operator-=(vec4f &a, float b)
{
    a.x = a.x - b;
    a.y = a.y - b;
    a.z = a.z - b;
    a.w = a.w - b;
}

//------------------------------------------------------------------------------------------

/// Unary minus: negates every component.
inline NVPARTICLES_CUDA_EXPORT vec3f operator-(const vec3f &a)
{
    vec3f neg;
    neg.x = -a.x;
    neg.y = -a.y;
    neg.z = -a.z;
    return neg;
}
/// Unary minus: negates every component (w included).
inline NVPARTICLES_CUDA_EXPORT vec4f operator-(const vec4f &a)
{
    vec4f neg;
    neg.x = -a.x;
    neg.y = -a.y;
    neg.z = -a.z;
    neg.w = -a.w;
    return neg;
}

//------------------------------------------------------------------------------------------
/// Component-wise product of two vec3f values (not a dot or cross product).
inline NVPARTICLES_CUDA_EXPORT vec3f operator*(vec3f a, vec3f b)
{
    a.x *= b.x;
    a.y *= b.y;
    a.z *= b.z;
    return a;
}
/// Multiplies `a` by `b` component-wise, in place.
inline NVPARTICLES_CUDA_EXPORT void operator*=(vec3f &a, vec3f b)
{
    a.x = a.x * b.x;
    a.y = a.y * b.y;
    a.z = a.z * b.z;
}
/// Scales every component of `a` by the scalar `b`.
inline NVPARTICLES_CUDA_EXPORT vec3f operator*(vec3f a, float b)
{
    a.x *= b;
    a.y *= b;
    a.z *= b;
    return a;
}
/// Scalar-on-the-left form: scales every component of `a` by `b`.
inline NVPARTICLES_CUDA_EXPORT vec3f operator*(float b, vec3f a)
{
    a.x = b * a.x;
    a.y = b * a.y;
    a.z = b * a.z;
    return a;
}
/// Scales every component of `a` by the scalar `b`, in place.
inline NVPARTICLES_CUDA_EXPORT void operator*=(vec3f &a, float b)
{
    a.x = a.x * b;
    a.y = a.y * b;
    a.z = a.z * b;
}

/// Component-wise product of two vec4f values (w included).
inline NVPARTICLES_CUDA_EXPORT vec4f operator*(vec4f a, vec4f b)
{
    a.x *= b.x;
    a.y *= b.y;
    a.z *= b.z;
    a.w *= b.w;
    return a;
}
/// Multiplies `a` by `b` component-wise, in place (w included).
inline NVPARTICLES_CUDA_EXPORT void operator*=(vec4f &a, vec4f b)
{
    a.x = a.x * b.x;
    a.y = a.y * b.y;
    a.z = a.z * b.z;
    a.w = a.w * b.w;
}
/// Scales every component of `a` by the scalar `b` (w included).
inline NVPARTICLES_CUDA_EXPORT vec4f operator*(vec4f a, float b)
{
    a.x *= b;
    a.y *= b;
    a.z *= b;
    a.w *= b;
    return a;
}
/// Scalar-on-the-left form: scales every component of `a` by `b` (w included).
inline NVPARTICLES_CUDA_EXPORT vec4f operator*(float b, vec4f a)
{
    a.x = b * a.x;
    a.y = b * a.y;
    a.z = b * a.z;
    a.w = b * a.w;
    return a;
}
/// Scales every component of `a` by the scalar `b`, in place (w included).
inline NVPARTICLES_CUDA_EXPORT void operator*=(vec4f &a, float b)
{
    a.x = a.x * b;
    a.y = a.y * b;
    a.z = a.z * b;
    a.w = a.w * b;
}
//------------------------------------------------------------------------------------------
/// Component-wise quotient a / b; zero divisors are not checked.
inline NVPARTICLES_CUDA_EXPORT vec3f operator/(vec3f a, vec3f b)
{
    a.x /= b.x;
    a.y /= b.y;
    a.z /= b.z;
    return a;
}
/// Divides `a` by `b` component-wise, in place; zero divisors are not checked.
inline NVPARTICLES_CUDA_EXPORT void operator/=(vec3f &a, vec3f b)
{
    a.x = a.x / b.x;
    a.y = a.y / b.y;
    a.z = a.z / b.z;
}
/// Divides every component of `a` by the scalar `b`; b == 0 is not checked.
inline NVPARTICLES_CUDA_EXPORT vec3f operator/(vec3f a, float b)
{
    a.x /= b;
    a.y /= b;
    a.z /= b;
    return a;
}
/// Divides every component of `a` by the scalar `b`, in place; b == 0 is not checked.
inline NVPARTICLES_CUDA_EXPORT void operator/=(vec3f &a, float b)
{
    a.x = a.x / b;
    a.y = a.y / b;
    a.z = a.z / b;
}
/// Scalar-on-the-left form: each result component is b divided by the
/// matching component of `a`; zero components are not checked.
inline NVPARTICLES_CUDA_EXPORT vec3f operator/(float b, vec3f a)
{
    a.x = b / a.x;
    a.y = b / a.y;
    a.z = b / a.z;
    return a;
}

/// Component-wise quotient a / b (w included); zero divisors are not checked.
inline NVPARTICLES_CUDA_EXPORT vec4f operator/(vec4f a, vec4f b)
{
    a.x /= b.x;
    a.y /= b.y;
    a.z /= b.z;
    a.w /= b.w;
    return a;
}
/// Divides `a` by `b` component-wise, in place (w included); zero divisors are not checked.
inline NVPARTICLES_CUDA_EXPORT void operator/=(vec4f &a, vec4f b)
{
    a.x = a.x / b.x;
    a.y = a.y / b.y;
    a.z = a.z / b.z;
    a.w = a.w / b.w;
}
/// Divides every component of `a` (w included) by the scalar `b`; b == 0 is not checked.
inline NVPARTICLES_CUDA_EXPORT vec4f operator/(vec4f a, float b)
{
    a.x /= b;
    a.y /= b;
    a.z /= b;
    a.w /= b;
    return a;
}
/// Divides every component of `a` (w included) by the scalar `b`, in place.
inline NVPARTICLES_CUDA_EXPORT void operator/=(vec4f &a, float b)
{
    a.x = a.x / b;
    a.y = a.y / b;
    a.z = a.z / b;
    a.w = a.w / b;
}
/// Scalar-on-the-left form: each result component is b divided by the
/// matching component of `a` (w included); zero components are not checked.
inline NVPARTICLES_CUDA_EXPORT vec4f operator/(float b, vec4f a)
{
    a.x = b / a.x;
    a.y = b / a.y;
    a.z = b / a.z;
    a.w = b / a.w;
    return a;
}

//------------------------------------------------------------------------------------------
/// Three-component dot product.
inline NVPARTICLES_CUDA_EXPORT float dot(vec3f a, vec3f b)
{
    float sum = a.x * b.x;
    sum += a.y * b.y;
    sum += a.z * b.z;
    return sum;
}

/// Four-component dot product (w takes part).
inline NVPARTICLES_CUDA_EXPORT float dot(vec4f a, vec4f b)
{
    float sum = a.x * b.x;
    sum += a.y * b.y;
    sum += a.z * b.z;
    sum += a.w * b.w;
    return sum;
}

//------------------------------------------------------------------------------------------
/// Cross product a x b.
inline NVPARTICLES_CUDA_EXPORT vec3f cross(vec3f a, vec3f b)
{
    vec3f c;
    c.x = a.y*b.z - a.z*b.y;
    c.y = a.z*b.x - a.x*b.z;
    c.z = a.x*b.y - a.y*b.x;
    return c;
}

/// Cross product of the x,y,z parts of `a` and `b`; the inputs' w is ignored.
/// NOTE(review): the result's w is set to 1 - confirm callers expect 1 rather
/// than 0 for a direction vector.
inline NVPARTICLES_CUDA_EXPORT vec4f cross(vec4f a, vec4f b)
{
    vec4f c;
    c.x = a.y*b.z - a.z*b.y;
    c.y = a.z*b.x - a.x*b.z;
    c.z = a.x*b.y - a.y*b.x;
    c.w = 1;
    return c;
}

//------------------------------------------------------------------------------------------
/// Euclidean length of a vec3f.
inline NVPARTICLES_CUDA_EXPORT float length(vec3f v)
{
    const float squared = v.x*v.x + v.y*v.y + v.z*v.z;
    return sqrtf(squared);
}

/// Euclidean length of the x,y,z part of a vec4f; w does not contribute.
inline NVPARTICLES_CUDA_EXPORT float length(vec4f v)
{
    const float squared = v.x*v.x + v.y*v.y + v.z*v.z;
    return sqrtf(squared);
}

//------------------------------------------------------------------------------------------
/// Returns `v` scaled to unit length; a zero-length input yields (0,0,1).
inline NVPARTICLES_CUDA_EXPORT
vec3f normalize(vec3f v)
{
    const float len = sqrtf(v.x*v.x + v.y*v.y + v.z*v.z);
    if (len != 0)
        return v / len;
    // No direction to preserve: fall back to +Z.
    return make_vec3f(0,0,1);
}

/// Divides all four components of `v` by the 3D length of its x,y,z part;
/// a zero-length x,y,z yields (0,0,1,0).
/// NOTE(review): w is divided as well here, whereas vec4f::normalize() leaves
/// w alone and vec4f::normalized() zeroes it - confirm which is intended.
inline NVPARTICLES_CUDA_EXPORT
vec4f normalize(vec4f v)
{
    const float len = sqrtf(v.x*v.x + v.y*v.y + v.z*v.z);
    if (len != 0)
        return v / len;
    // No direction to preserve: fall back to +Z.
    return make_vec4f(0,0,1,0);
}

//------------------------------------------------------------------------------------------
/// Scales x,y,z in place to unit 3D length; w is left as it is.
/// A zero-length x,y,z is replaced by (0,0,1).
inline NVPARTICLES_CUDA_EXPORT
void vec4f::normalize()
{
    const float len = sqrtf(x*x + y*y + z*z);
    if (len != 0)
    {
        x /= len;
        y /= len;
        z /= len;
        return;
    }
    // No direction to preserve: fall back to +Z.
    x = 0;
    y = 0;
    z = 1;
}

//------------------------------------------------------------------------------------------
/// Returns a copy whose x,y,z are scaled to unit 3D length and whose w is 0.
/// A zero-length x,y,z yields (0,0,1,0).
inline NVPARTICLES_CUDA_EXPORT
vec4f vec4f::normalized() const
{
    const float len = sqrtf(x*x + y*y + z*z);
    vec4f unit;
    unit.w = 0;
    if (len != 0)
    {
        unit.x = x/len;
        unit.y = y/len;
        unit.z = z/len;
    }
    else
    {
        // No direction to preserve: fall back to +Z.
        unit.x = 0;
        unit.y = 0;
        unit.z = 1;
    }
    return unit;
}

//------------------------------------------------------------------------------------------
/// Scales this vector in place to unit length; a zero-length vector is
/// replaced by (0,0,1).
inline NVPARTICLES_CUDA_EXPORT
void vec3f::normalize()
{
    const float len = sqrtf(x*x + y*y + z*z);
    if (len != 0)
    {
        x /= len;
        y /= len;
        z /= len;
        return;
    }
    // No direction to preserve: fall back to +Z.
    x = 0;
    y = 0;
    z = 1;
}

//------------------------------------------------------------------------------------------
/// Returns a unit-length copy of this vector; a zero-length vector yields (0,0,1).
inline NVPARTICLES_CUDA_EXPORT
vec3f vec3f::normalized() const
{
    const float len = sqrtf(x*x + y*y + z*z);
    vec3f unit;
    if (len != 0)
    {
        unit.x = x/len;
        unit.y = y/len;
        unit.z = z/len;
    }
    else
    {
        // No direction to preserve: fall back to +Z.
        unit.x = 0;
        unit.y = 0;
        unit.z = 1;
    }
    return unit;
}

//------------------------------------------------------------------------------------------
#ifndef __CUDA_ARCH__
/// Streams the vector as "x y z" (space separated, no trailing newline).
inline std::ostream &operator<< (std::ostream &s, const vec3f &v)
{
    return s << v.x << " " << v.y << " " << v.z;
}

//------------------------------------------------------------------------------------------
/// Streams the vector as "x y z w" (space separated, no trailing newline).
inline std::ostream &operator<< (std::ostream &s, const vec4f &v)
{
    return s << v.x << " " << v.y << " " << v.z << " " << v.w;
}
#endif

/*inline NVPARTICLES_CUDA_EXPORT vec3f::operator float3() const
{
    return make_vec3f(x,y,z);
}
//---------------------------------------------------------------------------------------
inline NVPARTICLES_CUDA_EXPORT vec4f::operator float3() const
{
    return make_vec3f(x,y,z);
}
//---------------------------------------------------------------------------------------
inline NVPARTICLES_CUDA_EXPORT vec4f::operator float4() const
{
    return make_float4(x,y,z,w);
}
//---------------------------------------------------------------------------------------
inline NVPARTICLES_CUDA_EXPORT void vec3f::operator=(const float3 v)
{
    x = v.x;
    y = v.y;
    z = v.z;
}
//---------------------------------------------------------------------------------------
inline NVPARTICLES_CUDA_EXPORT void vec4f::operator=(const float4 v)
{
    x = v.x;
    y = v.y;
    z = v.z;
    w = v.w;
}
*/
//---------------------------------------------------------------------------------------
/// The X axis as a vec4f: (1,0,0,0).
inline NVPARTICLES_CUDA_EXPORT vec4f vec4f::unitX()
{
    vec4f axis = { 1.0f, 0.0f, 0.0f, 0.0f };
    return axis;
}
//---------------------------------------------------------------------------------------
/// The Y axis as a vec4f: (0,1,0,0).
inline NVPARTICLES_CUDA_EXPORT vec4f vec4f::unitY()
{
    vec4f axis = { 0.0f, 1.0f, 0.0f, 0.0f };
    return axis;
}
//---------------------------------------------------------------------------------------
/// The Z axis as a vec4f: (0,0,1,0).
inline NVPARTICLES_CUDA_EXPORT vec4f vec4f::unitZ()
{
    vec4f axis = { 0.0f, 0.0f, 1.0f, 0.0f };
    return axis;
}
//---------------------------------------------------------------------------------------
/// Returns 1 when |x|, |y| and |z| are all below 1e-5, otherwise 0.
/// The w component is not examined.
inline NVPARTICLES_CUDA_EXPORT uint is_zero(vec4f v)
{
    const float e = 1.0e-5F;
    if (!(fabsf(v.x) < e)) return 0;
    if (!(fabsf(v.y) < e)) return 0;
    if (!(fabsf(v.z) < e)) return 0;
    return 1;
}
//---------------------------------------------------------------------------------------
/// Returns 1 when |x|, |y| and |z| are all below 1e-5, otherwise 0.
inline NVPARTICLES_CUDA_EXPORT uint is_zero(vec3f v)
{
    const float e = 1.0e-5F;
    if (!(fabsf(v.x) < e)) return 0;
    if (!(fabsf(v.y) < e)) return 0;
    if (!(fabsf(v.z) < e)) return 0;
    return 1;
}
//---------------------------------------------------------------------------------------
/// Returns the index of the component with the largest magnitude:
/// 0 for x, 1 for y, 2 for z.
/// Ties resolve as the comparisons below dictate: x must be strictly larger
/// than y to be considered, and z wins any tie against the x/y winner.
inline NVPARTICLES_CUDA_EXPORT uint vec3f::get_dominant_axis()
{
    // fabsf keeps the magnitudes in single precision, matching is_zero().
    const float ax = fabsf(x);
    const float ay = fabsf(y);
    const float az = fabsf(z);

    if (ax > ay)
        return (ax > az) ? 0 : 2;
    return (ay > az) ? 1 : 2;
}

//---------------------------------------------------------------------------------------
// quatf
//---------------------------------------------------------------------------------------
/// Quaternion stored in the inherited vec4f fields: x,y,z hold the vector
/// part and w the scalar part (see fromAxisAngle, which writes cos(angle/2)
/// into w). Aligned to 16 bytes.
struct NVPARTICLES_ALIGN(16) quatf : public vec4f
{
	/// Rotation taking the direction of `a` onto the direction of `b`.
	inline NVPARTICLES_CUDA_EXPORT static quatf fromVectors(vec3f a, vec3f b);
	/// Rotation built from yaw, pitch and roll angles.
	inline NVPARTICLES_CUDA_EXPORT static quatf fromEuler(float yaw, float pitch, float roll);
    /// Rotation of `radians` about `axis`.
    inline NVPARTICLES_CUDA_EXPORT static quatf fromAxisAngle(vec3f axis, float radians);
};

//---------------------------------------------------------------------------------------
/// Builds a quatf from raw components. The FIRST argument is stored in w;
/// the remaining three are stored in x, y and z.
/// NOTE(review): with all defaults the result is (x=0, y=0, z=1, w=0), which
/// is not the identity quaternion (0,0,0,1) - confirm that is intended.
inline NVPARTICLES_CUDA_EXPORT
quatf make_quatf(float angle=0, float vx=0, float vy=0, float vz=1)
{
    quatf q;
    q.w = angle;
    q.x = vx;
    q.y = vy;
    q.z = vz;
    return q;
}

#if 1
/*
//---------------------------------------------------------------------------------------
inline NVPARTICLES_CUDA_EXPORT
float mat_get(vec3f m00,vec3f m10,vec3f m20, int x,int y)
{
    switch (x)
    {
    case 0:
        if (y==0) return m00.x;
        else if (y==1) return m00.y;
        else return m00.z;
    case 1:
        if (y==0) return m10.x;
        else if (y==1) return m10.y;
        else return m10.z;
    case 2:
        if (y==0) return m20.x;
        else if (y==1) return m20.y;
        else return m20.z;
    }
    return 0.f;
}
*/
//---------------------------------------------------------------------------------------
/// Computes one axis of the rotation described by `q` (the three entries
/// s*(xz - wy), s*(yz + wx), 1 - s*(xx + yy) with s = 2 / |q|^2) and returns
/// it in x,y,z with w = 0. A zero quaternion is not guarded against.
inline NVPARTICLES_CUDA_EXPORT
quatf getAxisZ(quatf q)
{
    const float wSq = q.w*q.w;
    const float xSq = q.x*q.x;
    const float ySq = q.y*q.y;
    const float zSq = q.z*q.z;

    // Dividing by the squared norm makes the result valid for non-unit q.
    const float scale = 2.0f/(wSq + xSq + ySq + zSq);

    const float xz = q.x*q.z;
    const float yz = q.y*q.z;
    const float wx = q.w*q.x;
    const float wy = q.w*q.y;

    quatf axis;
    axis.x = scale * (xz - wy);
    axis.y = scale * (yz + wx);
    axis.z = 1.0f - scale * (xSq + ySq);
    axis.w = 0.0f;
    return axis;
}

//---------------------------------------------------------------------------------------
/// Builds the quaternion for a rotation of `radians` about `axis`.
///
/// The axis does not need to be unit length: it is normalised unless its
/// squared length is already within 1e-5 of 1. If the squared length of the
/// axis is 1e-5 or less there is no usable direction and the identity
/// rotation (x = y = z = 0, w = 1) is returned.
inline NVPARTICLES_CUDA_EXPORT
quatf quatf::fromAxisAngle(vec3f axis, float radians)
{
    quatf r;
    const float sumOfSquares = axis.x * axis.x + axis.y * axis.y + axis.z * axis.z;

    if (sumOfSquares <= 1.0e-5F)
    {
        // Written field by field: make_quatf() takes the scalar part first,
        // so make_quatf(0,0,0,1) would NOT produce the identity.
        r.x = 0.0f;
        r.y = 0.0f;
        r.z = 0.0f;
        r.w = 1.0f;
        return r;
    }

    const float halfAngle = radians * 0.5f;
    r.w = cosf(halfAngle);

    float commonFactor = sinf(halfAngle);
    // Skip the square root when the axis is already unit length.
    if (fabsf(sumOfSquares - 1.0f) > 1.0e-5F)
        commonFactor /= sqrtf(sumOfSquares);

    r.x = commonFactor * axis.x;
    r.y = commonFactor * axis.y;
    r.z = commonFactor * axis.z;
    return r;
}

//---------------------------------------------------------------------------------------
/// Builds the rotation that turns the direction of `a` onto the direction of
/// `b`. Neither input needs to be unit length.
///
/// Special cases:
///  * either input (nearly) zero length -> identity rotation;
///  * inputs pointing the same way      -> identity rotation;
///  * inputs pointing opposite ways     -> a 180 degree turn about an axis
///    perpendicular to `a` (any such axis is valid; one is picked from the
///    dominant component of `a`).
inline NVPARTICLES_CUDA_EXPORT
quatf quatf::fromVectors(vec3f a, vec3f b)
{
    const float eps = 1.0e-5F;

    // Identity rotation, written field by field because make_quatf() takes
    // the scalar part first.
    quatf q;
    q.x = 0.0f;
    q.y = 0.0f;
    q.z = 0.0f;
    q.w = 1.0f;

    const float factor = length(a) * length(b);
    if (!(factor > eps))
        return q;                       // no direction to work with

    // Cosine of the angle between the inputs, clamped against rounding.
    float d = dot(a,b) / factor;
    if (d < -1.0f)
        d = -1.0f;
    else if (d > 1.0f)
        d = 1.0f;

    // Dividing by |a||b| makes the axis length sin(angle), so the
    // degeneracy test below does not depend on the size of the inputs.
    const vec3f pivotVector = cross(a,b) / factor;
    if (dot(pivotVector, pivotVector) > eps)
        return quatf::fromAxisAngle(pivotVector, acosf(d));

    // The inputs are (anti)parallel, so the cross product gives no axis.
    if (d > 0.0f)
        return q;                       // same direction

    // Opposite directions: build a vector perpendicular to `a` by swapping
    // its dominant component with the next one and negating one of them.
    const float ax = fabsf(a.x);
    const float ay = fabsf(a.y);
    const float az = fabsf(a.z);
    vec3f perp;
    if (ax > ay && ax > az)
        perp = make_vec3f(-a.y, a.x, 0.0f);
    else if (ay >= ax && ay > az)
        perp = make_vec3f(0.0f, -a.z, a.y);
    else
        perp = make_vec3f(a.z, 0.0f, -a.x);
    perp = perp / length(perp);

    // Half-angle of 90 degrees: cos = 0, sin = 1.
    q.x = perp.x;
    q.y = perp.y;
    q.z = perp.z;
    q.w = 0.0f;
    return q;
}

//---------------------------------------------------------------------------------------
/// Builds a quaternion from yaw, pitch and roll angles (radians).
/// With only one angle non-zero the result is a rotation about Y for yaw,
/// about X for pitch and about Z for roll. All-zero angles give the identity
/// (x = y = z = 0, w = 1).
inline NVPARTICLES_CUDA_EXPORT
quatf quatf::fromEuler(float yaw, float pitch, float roll)
{
    const float cy = cosf(yaw * 0.5f);
    const float sy = sinf(yaw * 0.5f);
    const float cp = cosf(pitch * 0.5f);
    const float sp = sinf(pitch * 0.5f);
    const float cr = cosf(roll * 0.5f);
    const float sr = sinf(roll * 0.5f);

    // Components are assigned directly: make_quatf() takes the scalar (w)
    // part FIRST, so passing them in x,y,z,w order would scramble them.
    quatf q;
    q.x = cr*sp*cy + sr*cp*sy;
    q.y = cr*cp*sy - sr*sp*cy;
    q.z = sr*cp*cy - cr*sp*sy;
    q.w = cr*cp*cy + sr*sp*sy;
    return q;
}

//---------------------------------------------------------------------------------------
/// Quaternion product a * b:
///   vector part = a.xyz x b.xyz + a.w * b.xyz + b.w * a.xyz
///   scalar part = a.w * b.w - a.xyz . b.xyz
inline NVPARTICLES_CUDA_EXPORT
quatf multiply(quatf a, quatf b)
{
    quatf product;
    product.x = (a.y*b.z - a.z*b.y) + a.w*b.x + b.w*a.x;
    product.y = (a.z*b.x - a.x*b.z) + a.w*b.y + b.w*a.y;
    product.z = (a.x*b.y - a.y*b.x) + a.w*b.z + b.w*a.z;
    product.w = a.w*b.w - (a.x*b.x + a.y*b.y + a.z*b.z);
    return product;
}

//---------------------------------------------------------------------------------------
/// Rotates `vector` by `q`. The computation is done in two steps: first the
/// product t = q * (vector, 0), then t multiplied by the conjugate of q,
/// keeping only the vector part. `q` is not normalised here.
inline NVPARTICLES_CUDA_EXPORT
vec3f rotateVector(quatf q, vec3f vector)
{
    // t = q * (vector, 0)
    const float tw =-q.x * vector.x - q.y * vector.y - q.z * vector.z;
    const float tx = q.w * vector.x + q.y * vector.z - q.z * vector.y;
    const float ty = q.w * vector.y + q.z * vector.x - q.x * vector.z;
    const float tz = q.w * vector.z + q.x * vector.y - q.y * vector.x;

    // vector part of t * conjugate(q)
    vec3f rotated;
    rotated.x = - tw * q.x +  tx * q.w - ty * q.z + tz * q.y;
    rotated.y = - tw * q.y +  ty * q.w - tz * q.x + tx * q.z;
    rotated.z = - tw * q.z +  tz * q.w - tx * q.y + ty * q.x;
    return rotated;
}

/// Rotates the x,y,z part of `vector` by `q`; the input w is ignored and the
/// result's w is 0.
inline NVPARTICLES_CUDA_EXPORT
vec4f rotateVector(quatf q, vec4f vector)
{
    const vec3f direction = make_vec3f(vector.x, vector.y, vector.z);
    const vec3f rotated = rotateVector(q, direction);
    return make_vec4f(rotated, 0);
}

/// Spherical linear interpolation between `from` (t = 0) and `to` (t = 1).
/// When the two quaternions are on opposite hemispheres the target is
/// negated so the shorter arc is taken. When they are nearly identical the
/// blend falls back to plain linear interpolation. The result is not
/// re-normalised.
inline NVPARTICLES_CUDA_EXPORT
quatf slerp(quatf from, quatf to, float t)
{
    // Cosine of the angle between the quaternions (4D dot product).
    float cosom = from.x * to.x + from.y * to.y + from.z * to.z  + from.w * to.w;

    // A negative cosine means the target must be negated; that is applied
    // further down by flipping the sign of its blend weight.
    const bool negateTarget = cosom < 0.0f;
    if (negateTarget)
        cosom = -cosom;

    float scale0, scale1;
    if ( (1.0f - cosom) > 1.0e-5F )
    {
        // Standard case: weights from the sine of the partial angles.
        const float omega = acos(cosom);
        const float sinom = sin(omega);
        scale0 = sin((1.0f - t) * omega) / sinom;
        scale1 = sin(t * omega) / sinom;
    }
    else
    {
        // Quaternions are very close: linear weights avoid dividing by ~0.
        scale0 = 1.0f - t;
        scale1 = t;
    }

    if (negateTarget)
        scale1 = -scale1;

    quatf res;
    res.x = scale0 * from.x + scale1 * to.x;
    res.y = scale0 * from.y + scale1 * to.y;
    res.z = scale0 * from.z + scale1 * to.z;
    res.w = scale0 * from.w + scale1 * to.w;
    return res;
}

/// Returns `q` with its x,y,z part divided by that part's 3D length; w is
/// copied unchanged. If that length is 1e-5 or less, make_quatf(0) is
/// returned instead, i.e. (x=0, y=0, z=1, w=0).
/// NOTE(review): length(q) resolves to the vec4f overload, which ignores w,
/// so this is not a 4D quaternion normalisation - confirm that is intended.
inline NVPARTICLES_CUDA_EXPORT
quatf normalize(const quatf& q)
{
    const float axisLength = length(q);
    if ( !(axisLength > 1.0e-5F) )
        return make_quatf(0);
    return make_quatf(q.w, q.x/axisLength, q.y/axisLength, q.z/axisLength);
}
#endif

}

#include "mat44f.h"
#include "mat33f.h"

namespace Easy
{
///---------------------------------------------------------------------------------------
/// random functions:
///---------------------------------------------------------------------------------------

//---------------------------------------------------------------------------------------
/// from http://www.concentric.net/~Ttwang/tech/inthash.htm
///
/// Scrambles a 32-bit value with six add/xor/shift rounds. The same input
/// always produces the same output; all arithmetic wraps modulo 2^32.
inline NVPARTICLES_CUDA_EXPORT
unsigned int hashSeed(unsigned int seed)
{
  unsigned int s = seed;
  unsigned int shifted;

  // Each round captures the shifted value first because it must be taken
  // from the state as it was before the constant is mixed in.
  shifted = s << 12;  s += 0x7ed55d16;  s += shifted;
  shifted = s >> 19;  s ^= 0xc761c23c;  s ^= shifted;
  shifted = s << 5;   s += 0x165667b1;  s += shifted;
  shifted = s << 9;   s += 0xd3a2646c;  s ^= shifted;
  shifted = s << 3;   s += 0xfd7046c5;  s += shifted;
  shifted = s >> 16;  s ^= 0xb55a4f09;  s ^= shifted;
  return s;
}

//---------------------------------------------------------------------------------------
/// Maps a 32-bit integer onto a float in (0, 1]: (ival + 1) / 2^32.
/// Inputs near the top of the range round to exactly 1.0f.
inline NVPARTICLES_CUDA_EXPORT
float hashIntToFloat(unsigned int ival)
{
    // The +1 is done in 64 bits: in 32 bits 0xFFFFFFFF + 1 wraps to 0, which
    // would map the largest input to 0 instead of 1.
    const unsigned long long shifted = (unsigned long long)ival + 1ull;
    return float(shifted) / 4294967296.0f;
}

//---------------------------------------------------------------------------------------
/// Advances `seed` with hashSeed() and returns the new seed converted to a
/// float by hashIntToFloat(). `seed` is updated in place so that successive
/// calls produce a sequence.
inline NVPARTICLES_CUDA_EXPORT
float random(unsigned int& seed)
{
    const unsigned int next = hashSeed(seed);
    seed = next;
    return hashIntToFloat(next);
}

//---------------------------------------------------------------------------------------
/// Draws one value from random(seed) and maps it linearly onto the span
/// between `low` and `high`. `seed` is advanced in place.
inline NVPARTICLES_CUDA_EXPORT
float randomRange(float low, float high, unsigned int& seed)
{
    const float unit = random(seed);
    const float span = high - low;
    return unit * span + low;
}

//---------------------------------------------------------------------------------------
/// smoothstep.
///
template<typename T>
inline NVPARTICLES_CUDA_EXPORT
T smoothstep( T edge0, T edge1, T x )
{
  // Position of x between the edges, clamped to [0, 1].
  // edge0 == edge1 is not guarded against.
  const T span = edge1 - edge0;
  T u = (x - edge0) / span;
  if (u < 0)
      u = 0;
  else if (u > 1)
      u = 1;

  // Hermite blend 3u^2 - 2u^3: 0 at u = 0, 1 at u = 1.
  const T blend = (T)3.0 - (T)2.0*u;
  return u*u * blend;
}

//---------------------------------------------------------------------------------------
/// critically-smoothed damping (over time).
///
/// Moves `current_x` one time step towards `desired_x` using a critically
/// damped spring, updating the velocity `current_dx` as well.
///
/// @param current_x   value being smoothed; updated in place.
/// @param current_dx  rate of change of current_x; updated in place.
/// @param desired_x   target value.
/// @param smoothTime  response time; values below 0.001 are treated as 0.001.
/// @param deltaTime   length of the time step.
/// @return the updated current_x.
template<class T>
inline NVPARTICLES_CUDA_EXPORT
T criticallyDampedSmooth(T &current_x, T &current_dx, const T &desired_x,
        const float smoothTime, const float deltaTime=0.04f)
{
    // Clamp in float with a plain comparison: the only max() visible in this
    // namespace on the host takes ints, which would truncate both operands
    // (turning e.g. 0.5 into 0 and the division below into 2/0).
    const float clampedTime = (smoothTime > 0.001f) ? smoothTime : 0.001f;

    const float omega = 2.f / clampedTime;
    const float x = omega * deltaTime;
    // Polynomial approximation of exp(-x).
    const float decay = 1.f/(1.f + x + 0.48f*x*x + 0.235f*x*x*x);

    const T dx = current_x - desired_x;
    const T temp = (current_dx + dx*omega);
    current_dx = (current_dx - temp*deltaTime*omega) * decay;
    current_x = desired_x + (dx + temp*deltaTime)*decay;

    return current_x;
}

//------------------------------------------------------------------------------------------
/// Maps a scalar `t` onto an RGB colour built from smoothstep ramps:
/// blue rises then falls around t = 0.25, green rises up to t = 0.5, holds
/// until 0.75 and then falls, and red rises between 0.5 and 0.75.
inline NVPARTICLES_CUDA_EXPORT
vec3f pseudoTemperature( float t )
{
    float b;
    if (t < 0.25f)
        b = smoothstep( -0.25f, 0.25f, t );
    else
        b = 1.0f-smoothstep( 0.25f, 0.5f, t );

    float g;
    if (t < 0.5f)
        g = smoothstep( 0.0f, 0.5f, t );
    else if (t < 0.75f)
        g = 1.0f;
    else
        g = 1.0f-smoothstep( 0.75f, 1.0f, t );

    const float r = smoothstep( 0.5f, 0.75f, t );
    return make_vec3f( r, g, b );
}

//---------------------------------------------------------------------------------------
/// from http://www.cs.rit.edu/~ncs/color/t_convert.html
/// The hue value H runs from 0 to 360 degrees.
/// The saturation S is the degree of strength or purity and is from 0 to 1.
/// Purity is how much white is added to the color, so S=1 makes the purest color (no white).
/// Brightness V also ranges from 0 to 1, where 0 is the black.
///
/// Converts an HSV colour to RGB.
///
/// @param h  hue in degrees. Any value is accepted and wrapped into
///           [0, 360), so 360 gives the same colour as 0.
/// @param s  saturation; 0 produces a grey (v, v, v) regardless of hue.
/// @param v  value (brightness).
/// @return (r, g, b) packed in a vec3f.
inline NVPARTICLES_CUDA_EXPORT
vec3f hsvToRgb(float h, float s, float v )
{
    if( s == 0 ) {
        // achromatic (grey)
        return make_vec3f(v,v,v);
    }

    // Wrap the hue into [0, 360). fmodf keeps the sign of h, so negative
    // results are shifted up by one turn.
    h = fmodf(h, 360.f);
    if( h < 0 )
        h += 360.f;
    // A tiny negative hue can round up to exactly 360 after the shift.
    if( h >= 360.f )
        h = 0;

    h /= 60.f;                          // sector 0 to 5
    const int i = (int)floorf( h );
    const float f = h - i;              // fractional part of h
    const float p = v * ( 1.0f - s );
    const float q = v * ( 1.0f - s * f );
    const float t = v * ( 1.0f - s * ( 1.0f - f ) );

    float r=0,g=0,b=0;
    switch( i ) {
        case 0:
            r = v;  g = t;  b = p;
            break;
        case 1:
            r = q;  g = v;  b = p;
            break;
        case 2:
            r = p;  g = v;  b = t;
            break;
        case 3:
            r = p;  g = q;  b = v;
            break;
        case 4:
            r = t;  g = p;  b = v;
            break;
        default:        // case 5:
            r = v;  g = p;  b = q;
            break;
    }

    return make_vec3f(r,g,b);
}

//---------------------------------------------------------------------------------------
/// Convert an RGB colour to HSV.
/// Adapted from http://www.cs.rit.edu/~ncs/color/t_convert.html
/// r,g,b values are from 0 to 1
/// h = [0,360], s = [0,1], v = [0,1]
///
/// The result is returned as a vec3f with x = hue (degrees), y = saturation
/// and z = value (the largest of r, g, b).
///
/// The hue is undefined when the colour has no chroma, i.e. for black and for
/// every grey (r == g == b). In that case the function returns hue = -1 and
/// saturation = 0. Previously only black was handled and any other grey
/// produced a NaN hue from a 0/0 division.
///
inline NVPARTICLES_CUDA_EXPORT vec3f rgbToHsv(float r, float g, float b)
{
	vec3f hsv;

	const float lo = min3( r, g, b );
	const float hi = max3( r, g, b );
	const float delta = hi - lo;

	hsv.z = hi;				// v

	if( hi == 0 || delta == 0 ) {
		// hi == 0    : black, value is 0
		// delta == 0 : grey, all components equal
		// Either way s = 0 and h is undefined; -1 is the "no hue" marker.
		hsv.y = 0;
		hsv.x = -1;
		return hsv;
	}

	hsv.y = delta / hi;			// s

	if( r == hi )
		hsv.x = ( g - b ) / delta;		// between yellow & magenta
	else if( g == hi )
		hsv.x = 2 + ( b - r ) / delta;	// between cyan & yellow
	else
		hsv.x = 4 + ( r - g ) / delta;	// between magenta & cyan

	hsv.x *= 60;				// degrees
	if( hsv.x < 0 )
		hsv.x += 360;

	return hsv;
}

//---------------------------------------------------------------------------------------
/// Component-wise ceiling of a four-component vector.
/// Each of x, y, z and w is rounded up independently.
inline NVPARTICLES_CUDA_EXPORT vec4f ceil(const vec4f v)
{
    // The leading :: selects the global scalar ceil rather than this overload.
    const float cx = ::ceil(v.x);
    const float cy = ::ceil(v.y);
    const float cz = ::ceil(v.z);
    const float cw = ::ceil(v.w);

    return make_vec4f(cx, cy, cz, cw);
}

//---------------------------------------------------------------------------------------
/// Unpack a 32-bit value holding four 8-bit channels into a vec4f.
/// The bytes are passed to make_vec4f from least significant (bits 0-7) to
/// most significant (bits 24-31), and the vector is then divided by 255 so
/// that a channel byte of 255 becomes 1.
inline NVPARTICLES_CUDA_EXPORT vec4f rgbaIntToFloat(uint rgba)
{
    const float byte0 = float((rgba >>  0) & 0xff);
    const float byte1 = float((rgba >>  8) & 0xff);
    const float byte2 = float((rgba >> 16) & 0xff);
    const float byte3 = float((rgba >> 24) & 0xff);

    return make_vec4f(byte0, byte1, byte2, byte3)/255.f;
}
//---------------------------------------------------------------------------------------
// Spread the low 10 bits of the input apart so that two zero bits sit between
// each pair of original bits (input bit k ends up at output bit 3*k).
// See: p317, Real-Time Collision Detection, Christer Ericson
inline NVPARTICLES_CUDA_EXPORT uint separateBy2(uint n)
{
    // bits = ----------------------9876543210 : Bits initially
    // bits = ------98----------------76543210 : After (1)
    // bits = ------98--------7654--------3210 : After (2)
    // bits = ------98----76----54----32----10 : After (3)
    // bits = ----9--8--7--6--5--4--3--2--1--0 : After (4)
    uint bits = n;

    // Each step merges a shifted copy into the value and then masks away
    // everything that is not yet in its final group.
    bits ^= (bits << 16);
    bits &= 0xff0000ff; // (1)

    bits ^= (bits <<  8);
    bits &= 0x0300f00f; // (2)

    bits ^= (bits <<  4);
    bits &= 0x030c30c3; // (3)

    bits ^= (bits <<  2);
    bits &= 0x09249249; // (4)

    return bits;
}
//---------------------------------------------------------------------------------------
// Convert a 3D position into a linear 1D address in Morton (Z-curve) order.
// Takes three 10-bit numbers and interleaves their bits into one 30-bit number.
inline NVPARTICLES_CUDA_EXPORT uint morton3(uint x, uint y, uint z)
{
    // Each coordinate is spread to every third bit and then moved to its own lane:
    // --x--x--x--x--x--x--x--x--x--x : laneX
    // -y--y--y--y--y--y--y--y--y--y- : laneY
    // z--z--z--z--z--z--z--z--z--z-- : laneZ
    // zyxzyxzyxzyxzyxzyxzyxzyxzyxzyx : Final result
    const uint laneX = separateBy2(x);
    const uint laneY = separateBy2(y) << 1;
    const uint laneZ = separateBy2(z) << 2;

    return laneZ | laneY | laneX;
}
//---------------------------------------------------------------------------------------
/// Reinterpret the bits of an int as a float (no numeric conversion).
/// The value is written to the int member of a volatile union and read back
/// through the float member.
inline NVPARTICLES_CUDA_EXPORT
float intAsFloat(int a)
{
    union FloatIntBits
    {
        float f;
        int i;
    };

    volatile FloatIntBits bits;
    bits.i = a;

    return bits.f;
}
//---------------------------------------------------------------------------------------
/// Reinterpret the bits of a float as an int (no numeric conversion).
/// The value is written to the float member of a volatile union and read back
/// through the int member.
inline NVPARTICLES_CUDA_EXPORT
int floatAsInt(float a)
{
    union FloatIntBits
    {
        float f;
        int i;
    };

    volatile FloatIntBits bits;
    bits.f = a;

    return bits.i;
}
//---------------------------------------------------------------------------------------
/// Convert a 32-bit float to a 16-bit half-precision bit pattern, rounding to
/// nearest with ties going to the even mantissa.
///
/// Returns the half as raw bits: sign in bit 15, 5 exponent bits, 10 mantissa
/// bits. Any NaN input yields 0x7fff; magnitudes too large for a finite half
/// yield a signed infinity; magnitudes that round to nothing yield a signed zero.
inline NVPARTICLES_CUDA_EXPORT
unsigned short float2half_rn(float f)
{
  unsigned int x = floatAsInt(f);
  // u is the float's bit pattern with the sign bit cleared (the magnitude).
  unsigned int u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
  unsigned int sign, exponent, mantissa;

  // Get rid of the NaN case first: anything above the infinity pattern is a NaN.
  if (u > 0x7f800000) {
    return 0x7fff;
  }

  // Move the sign from bit 31 of the float to bit 15 of the half.
  sign = ((x >> 16) & 0x8000);

  // 0x477fefff is the last pattern that still rounds to the largest finite
  // half; everything above it (including infinity) becomes a signed infinity.
  if (u > 0x477fefff) {
    return sign | 0x7c00;
  }
  // 0x33000000 is 2^-25, half of the smallest half subnormal; it and anything
  // smaller (including zero) rounds to a signed zero.
  if (u < 0x33000001) {
    return sign | 0x0000;
  }

  exponent = ((u >> 23) & 0xff);
  mantissa = (u & 0x7fffff);

  if (exponent > 0x70) {
    // Result is a normal half: drop 13 mantissa bits (23 -> 10) and change
    // the exponent bias from 127 to 15 (0x70 == 127 - 15).
    shift = 13;
    exponent -= 0x70;
  } else {
    // Result is a half subnormal: make the implicit leading 1 explicit and
    // shift further right the smaller the input exponent is.
    shift = 0x7e - exponent;
    exponent = 0;
    mantissa |= 0x800000;
  }
  lsb = (1 << shift);       // weight of the lowest mantissa bit that is kept
  lsb_s1 = (lsb >> 1);      // exactly half of that weight (the tie point)
  lsb_m1 = (lsb - 1);       // mask of the bits that are discarded

  // Round to nearest even.
  remainder = (mantissa & lsb_m1);
  mantissa >>= shift;
  if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
    ++mantissa;
    // The increment carried out of the 10-bit mantissa field: bump the
    // exponent instead (this also turns the largest subnormal into the
    // smallest normal).
    if (!(mantissa & 0x3ff)) {
      ++exponent;
      mantissa = 0;
    }
  }

  return sign | (exponent << 10) | mantissa;
}
//---------------------------------------------------------------------------------------
/// Expand a 16-bit half-precision bit pattern into a 32-bit float.
///
/// Half infinities stay infinities with their sign, every half NaN becomes the
/// positive float pattern 0x7fffffff, half subnormals are renormalised, and
/// zeros keep their sign.
inline NVPARTICLES_CUDA_EXPORT
float half2float(unsigned short h)
{
  unsigned int signBit = ((h >> 15) & 1);
  unsigned int expBits = ((h >> 10) & 0x1f);
  // The 10 half mantissa bits are placed at the top of the 23-bit float mantissa.
  unsigned int fracBits = ((h & 0x3ff) << 13);

  if (expBits == 0x1f) {
    // Inf or NaN: a NaN loses its sign and gets an all-ones mantissa.
    if (fracBits) {
      signBit = 0;
      fracBits = 0x7fffff;
    }
    expBits = 0xff;
  } else if (expBits != 0) {
    // Normal number: change the exponent bias from 15 to 127.
    expBits += 0x70;
  } else if (fracBits != 0) {
    // Subnormal: shift left until the leading 1 has moved out of the mantissa
    // field, lowering the exponent by one for every shift.
    expBits = 0x71;
    for (;;) {
      const unsigned int topBit = (fracBits & 0x400000);
      fracBits <<= 1;
      --expBits;
      if (topBit) {
        break;
      }
    }
    fracBits &= 0x7fffff; // the leading 1 is implicit in a normal float
  }
  // Remaining case (zero exponent and zero mantissa) is a signed zero.

  return intAsFloat((signBit << 31) | (expBits << 23) | fracBits);
}

//---------------------------------------------------------------------------------------
/// Remap a float's raw bit pattern: if the top (sign) bit is set every bit is
/// inverted, otherwise only the top bit is inverted.
/// NOTE(review): this looks like the usual transform that makes unsigned
/// integer comparison agree with float ordering (e.g. for radix sorting) -
/// confirm against the callers. invFloatFlip undoes it.
inline NVPARTICLES_CUDA_EXPORT
unsigned int floatFlip(unsigned int f)
{
	const bool topBitSet = ((f >> 31) != 0);
	const unsigned int mask = topBitSet ? 0xffffffffu : 0x80000000u;

	return f ^ mask;
}

//---------------------------------------------------------------------------------------
/// Inverse of floatFlip: if the top bit is set only that bit is inverted,
/// otherwise every bit is inverted.
inline NVPARTICLES_CUDA_EXPORT
unsigned int invFloatFlip(unsigned int f)
{
	const bool topBitSet = ((f >> 31) != 0);
	const unsigned int mask = topBitSet ? 0x80000000u : 0xffffffffu;

	return f ^ mask;
}

//---------------------------------------------------------------------------------------
/// A box described by two vec4f corners, declared with 16-byte alignment.
struct NVPARTICLES_ALIGN(16) boundingbox4f
{
    vec4f low;   ///< lower corner
    vec4f high;  ///< upper corner

    /// Default box: low is +FLT_MAX and high is -FLT_MAX on x, y and z, with
    /// w set to 0 on both corners.
    /// NOTE(review): presumably an "empty" box intended to be grown by
    /// min/max merging - confirm against the callers.
    boundingbox4f()
        : low(make_vec4f(FLT_MAX, FLT_MAX, FLT_MAX, 0)),
          high(make_vec4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0))
    {
    }

    /// Box with explicitly given corners; the values are stored as passed.
    boundingbox4f(const vec4f& _low, const vec4f& _high)
        : low(_low),
          high(_high)
    {
    }
};

//---------------------------------------------------------------------------------------
} // end of namespace

#if !defined(__CUDACC__) && !defined(__CUDABE__) && \
    defined(_WIN32) && !defined(_WIN64)

#pragma warning(pop)

#endif /* !__CUDACC__ && !__CUDABE__ && _WIN32 && !_WIN64 */


#endif // NVPARTICLES_MATH_H
