/* ---------------------------------------------------------------------------
 * This software is in the public domain, furnished "as is", without technical
 * support, and with no warranty, express or implied, as to its usefulness for
 * any purpose.

 * Author: Wil Braithwaite.
 *
 */

#ifndef MAT44F_H_INCLUDED
#define MAT44F_H_INCLUDED

#include "math_utils.h"
#include "mat33f.h"

namespace Easy
{

//------------------------------------------------------------------------------------------
/// mat44f
///
/// data is stored as column-vectors in column-major layout (like OpenGL).
///
/// M = 1 0 0 tx   V = x
///     0 1 0 ty       y
///     0 0 1 tz       z
///     0 0 0 1        1
///
/// operations are taken from the right (like OpenGL). i.e. S * R * T = translate, then rotate, then scale.
///
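/// NOTE(review): this comment originally stated that all data needs to be transposed before use in OpenGL
/// because the memory layout is row-major. That conflicts with the column-major statement above and with
/// element(), which indexes _array[row | (col << 2)]; confirm which is intended before relying on either.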
///
struct NVPARTICLES_ALIGN(16) mat44f
{
    /// storage: four vec4f entries (m) overlaid on a flat array of 16 floats (_array).
    /// element(row, col) indexes _array[row | (col << 2)], so m[i] holds the values that
    /// element() addresses as (0..3, i) -- assuming vec4f is four packed floats (x, y, z, w).
    union NVPARTICLES_ALIGN(16)
    {
        vec4f m[4];
        float _array[16];
    };

    /// access a single value as _array[row | (col << 2)]; no bounds checking is performed.
    inline NVPARTICLES_CUDA_EXPORT float& element (int row, int col);
    inline NVPARTICLES_CUDA_EXPORT float element (int row, int col) const;
    /// same as element(iRow, iCol).
    inline NVPARTICLES_CUDA_EXPORT float& operator() (int iRow, int iCol);
    inline NVPARTICLES_CUDA_EXPORT float operator() (int iRow, int iCol) const;

    /// implicit conversion to a pointer to the 16 floats in _array.
    inline NVPARTICLES_CUDA_EXPORT operator float*();
    inline NVPARTICLES_CUDA_EXPORT operator const float*() const;

    /// overwrite m[row] with v.
    /// NOTE(review): m[row] is the storage that element() addresses as column `row`; confirm the
    /// intended meaning of "row" with callers.
    inline NVPARTICLES_CUDA_EXPORT void set_row (int row, vec4f v);

    // constructions:
    /// store X, Y, Z, T in m[0], m[1], m[2], m[3] respectively.
    inline NVPARTICLES_CUDA_EXPORT static mat44f fromVectors(const vec4f& X, const vec4f& Y, const vec4f& Z, const vec4f& T=make_vec4f(0));
    /// as above, with w = 0 for X, Y, Z and w = 1 for T.
    inline NVPARTICLES_CUDA_EXPORT static mat44f fromVectors(const vec3f& X, const vec3f& Y, const vec3f& Z, const vec3f& T=make_vec3f(0));
    /// all 16 values set to zero.
    inline NVPARTICLES_CUDA_EXPORT static mat44f zero();
    /// ones on the diagonal, zeros elsewhere.
    inline NVPARTICLES_CUDA_EXPORT static mat44f identity();
    /// copy 16 floats from s into _array in order; s must point to at least 16 floats.
    inline NVPARTICLES_CUDA_EXPORT static mat44f fromArray(float *s);

    /// returns a matrix that rotates anti-clockwise around X-axis.
    inline NVPARTICLES_CUDA_EXPORT static mat44f rotateX(float rad);
    /// returns a matrix that rotates anti-clockwise around Y-axis.
    inline NVPARTICLES_CUDA_EXPORT static mat44f rotateY(float rad);
    /// returns a matrix that rotates anti-clockwise around Z-axis.
    inline NVPARTICLES_CUDA_EXPORT static mat44f rotateZ(float rad);
    /// returns a matrix that rotates using a quaternion.
    inline NVPARTICLES_CUDA_EXPORT static mat44f rotateQ(vec4f quat);
    /// returns a matrix that scales.
    inline NVPARTICLES_CUDA_EXPORT static mat44f scale(const float sx,const float sy,const float sz);
    inline NVPARTICLES_CUDA_EXPORT static mat44f scale(const float s);
    /// returns a matrix that translates.
    inline NVPARTICLES_CUDA_EXPORT static mat44f translate(const float tx, const float ty, const float tz);
    /// returns a matrix that projects into frustum bounds.
    inline NVPARTICLES_CUDA_EXPORT static mat44f frustum(const float l, const float r, const float b, const float t, const float n, const float f);
    /// returns a matrix that projects into a frustum using simple camera attributes.
    inline NVPARTICLES_CUDA_EXPORT static mat44f perspective(const float fovy, const float aspect, const float n, const float f);
    /// returns a matrix that looks along a ray with optional roll around the ray's axis.
    inline NVPARTICLES_CUDA_EXPORT static mat44f lookAt(vec3f from, vec3f to, float roll);

    // operations:
    /// determinant of the upper-left 3x3 block only (indices 0..2 in both directions).
    inline NVPARTICLES_CUDA_EXPORT float determinant() const;
    /// general inverse by Gaussian elimination; returns identity when the matrix is detected as singular.
    inline NVPARTICLES_CUDA_EXPORT mat44f inverse() const;
    /// inverse that only considers the upper-left 3x3 block and m[3].xyz; the remaining values are written as 0, 0, 0, 1.
    inline NVPARTICLES_CUDA_EXPORT mat44f inverseAffine() const;
    /// as inverseAffine(), but inverts the 3x3 block by transposing it (only valid when that block is orthonormal).
    inline NVPARTICLES_CUDA_EXPORT mat44f inverseAffineNoScale() const;
    /// copy of this matrix with element(r, c) and element(c, r) exchanged.
    inline NVPARTICLES_CUDA_EXPORT mat44f transposed() const;
    /// transform a vector with the matrix.
    /// the vec3f overload extends v with w = 0, so m[3] does not contribute.
    inline NVPARTICLES_CUDA_EXPORT vec3f multiply(vec3f v) const;
    inline NVPARTICLES_CUDA_EXPORT vec4f multiply(vec4f v) const;
    /// transform (v, 1) and divide x, y, z by the resulting w.
    inline NVPARTICLES_CUDA_EXPORT vec3f multiplyNormalized(vec3f v) const;
    /// component k of the result is dot(v, m[k]); the vec3f overload extends v with w = 0.
    inline NVPARTICLES_CUDA_EXPORT vec3f multiplyTranspose(vec3f v) const;
    inline NVPARTICLES_CUDA_EXPORT vec4f multiplyTranspose(vec4f v) const;
    /// transform (v, 1) and drop the resulting w without dividing by it.
	inline NVPARTICLES_CUDA_EXPORT vec3f multiplyPoint(vec3f v) const;

    /// pre-multiply by a matrix B (on the right, i.e. M * B).
    /// implemented as multiplyTranspose(*this, b.transposed()).
    inline NVPARTICLES_CUDA_EXPORT void preMultiply(const mat44f &b);
    /// pre-multiply by a translation matrix.
    inline NVPARTICLES_CUDA_EXPORT void preTranslate(float tx, float ty, float tz);
    /// pre-multiply by a scaling matrix.
    inline NVPARTICLES_CUDA_EXPORT void preScale(float sx, float sy, float sz);
    /// post-multiply by a matrix B (on the left, i.e. B * M).
    /// implemented as multiplyTranspose(b, this->transposed()).
    inline NVPARTICLES_CUDA_EXPORT void postMultiply(const mat44f &b);
    /// make this matrix orthogonal (rebuilds m[0..2]; m[3] is left untouched).
    inline NVPARTICLES_CUDA_EXPORT void orthogonalise();


    // get components:
    /// m[0], m[1] and m[2] respectively.
    inline NVPARTICLES_CUDA_EXPORT const vec4f& X() const;
    inline NVPARTICLES_CUDA_EXPORT const vec4f& Y() const;
    inline NVPARTICLES_CUDA_EXPORT const vec4f& Z() const;
    /// m[3].
    inline NVPARTICLES_CUDA_EXPORT const vec4f& translation() const;
    /// copy of this matrix with m[3] replaced by (0, 0, 0, 1).
    inline NVPARTICLES_CUDA_EXPORT mat44f rotation() const;

    // static operations:
    /// returns multiplyTranspose(a, b.transposed()).
    inline NVPARTICLES_CUDA_EXPORT static mat44f multiply(const mat44f &a, const mat44f &b);
    /// result.m[i] = (dot(a.m[i], tb.m[0]), dot(a.m[i], tb.m[1]), dot(a.m[i], tb.m[2]), dot(a.m[i], tb.m[3])).
    inline NVPARTICLES_CUDA_EXPORT static mat44f multiplyTranspose(const mat44f &a, const mat44f &tb);
    /// returns m.transposed().
    inline NVPARTICLES_CUDA_EXPORT static mat44f transpose(const mat44f& m);
    /// interpolate m[0..2] with mat33f::slerp and m[3] linearly, at parameter t (t = 0 gives a's m[3], t = 1 gives b's).
    inline NVPARTICLES_CUDA_EXPORT static mat44f slerp(float t, const mat44f& a, const mat44f& b);
};

/// post-multiply matrix A by matrix B (i.e. B * A).
/// implemented as mat44f::multiply(b, a) -- note the swapped argument order.
inline NVPARTICLES_CUDA_EXPORT mat44f operator*(const mat44f &a, const mat44f &b);
/// transform v by m; forwards to m.multiply(v) (the vec3f overload treats w as 0).
inline NVPARTICLES_CUDA_EXPORT vec3f operator*(const mat44f &m, const vec3f &v);
/// transform v by m; forwards to m.multiply(v).
inline NVPARTICLES_CUDA_EXPORT vec4f operator*(const mat44f &m, const vec4f &v);

//------------------------------------------------------------------------------------------
/// Read-only access to the first stored vector, m[0].
inline NVPARTICLES_CUDA_EXPORT const vec4f& mat44f::X() const
{
    return this->m[0];
}

/// Read-only access to the second stored vector, m[1].
inline NVPARTICLES_CUDA_EXPORT const vec4f& mat44f::Y() const
{
    return this->m[1];
}

/// Read-only access to the third stored vector, m[2].
inline NVPARTICLES_CUDA_EXPORT const vec4f& mat44f::Z() const
{
    return this->m[2];
}

//------------------------------------------------------------------------------------------
#ifndef __CUDA_ARCH__
/// Write the matrix to a stream as four bracketed groups on a single line.
/// Group r lists v(r, 0), v(r, 1), v(r, 2), v(r, 3), each followed by a space.
/// No newline is emitted.
inline std::ostream &operator<< (std::ostream &s, const mat44f &v)
{
    for (int rowIndex = 0; rowIndex != 4; ++rowIndex)
    {
        s << "[ ";
        for (int colIndex = 0; colIndex != 4; ++colIndex)
            s << v(rowIndex, colIndex) << " ";
        s << "] ";
    }
    return s;
}
#endif
//------------------------------------------------------------------------------------------
/// Implicit conversion to a read-only pointer to the first of the 16 floats.
inline NVPARTICLES_CUDA_EXPORT mat44f::operator const float*() const
{
    return &_array[0];
}
//------------------------------------------------------------------------------------------
/// Implicit conversion to a writable pointer to the first of the 16 floats.
inline NVPARTICLES_CUDA_EXPORT mat44f::operator float*()
{
    return &_array[0];
}
//------------------------------------------------------------------------------------------
/// Writable access to one value. The flat index is row | (col << 2), which equals
/// row + 4 * col when row is in 0..3. No range checking is performed.
inline NVPARTICLES_CUDA_EXPORT float& mat44f::element (int row, int col)
{
    const int flatIndex = row | (col<<2);
    return _array[flatIndex];
}
//------------------------------------------------------------------------------------------
/// Read one value by (row, col). The flat index is row | (col << 2), which equals
/// row + 4 * col when row is in 0..3. No range checking is performed.
inline NVPARTICLES_CUDA_EXPORT float mat44f::element (int row, int col) const
{
    const int flatIndex = row | (col<<2);
    return _array[flatIndex];
}
//------------------------------------------------------------------------------------------
/// Read one value by (row, col); same indexing as element(row, col).
inline NVPARTICLES_CUDA_EXPORT float mat44f::operator() (int row, int col) const
{
    return _array[row | (col<<2)];
}
//------------------------------------------------------------------------------------------
/// Writable access to one value by (row, col); same indexing as element(row, col).
inline NVPARTICLES_CUDA_EXPORT float& mat44f::operator() (int row, int col)
{
    return _array[row | (col<<2)];
}

//------------------------------------------------------------------------------------------
/// Build the identity matrix: m[i] has a 1 in component i and 0 elsewhere.
inline NVPARTICLES_CUDA_EXPORT mat44f mat44f::identity()
{
    mat44f result;
    for (int i = 0; i < 4; ++i)
    {
        result.m[i] = make_vec4f(i == 0 ? 1 : 0,
                                 i == 1 ? 1 : 0,
                                 i == 2 ? 1 : 0,
                                 i == 3 ? 1 : 0);
    }
    return result;
}

//------------------------------------------------------------------------------------------
/// Build a matrix whose four stored vectors are all make_vec4f(0).
inline NVPARTICLES_CUDA_EXPORT mat44f mat44f::zero()
{
    mat44f result;
    for (int i = 0; i < 4; ++i)
        result.m[i] = make_vec4f(0);
    return result;
}

//------------------------------------------------------------------------------------------
/// Build a matrix by copying 16 consecutive floats from s into _array, in order.
/// s must point to at least 16 readable floats; it is not checked for null.
inline NVPARTICLES_CUDA_EXPORT mat44f mat44f::fromArray(float *s)
{
    mat44f result;
    const float* src = s;
    float* dst = result._array;
    for (int remaining = 16; remaining > 0; --remaining)
        *dst++ = *src++;
    return result;
}

//------------------------------------------------------------------------------------------
/// Build a matrix whose stored vectors m[0], m[1], m[2], m[3] are copies of X, Y, Z, T.
inline NVPARTICLES_CUDA_EXPORT mat44f mat44f::fromVectors(const vec4f& X, const vec4f& Y, const vec4f& Z, const vec4f& T)
{
    const vec4f* const sources[4] = { &X, &Y, &Z, &T };

    mat44f result;
    for (int i = 0; i < 4; ++i)
        result.m[i] = *sources[i];
    return result;
}

//------------------------------------------------------------------------------------------
/// Build a matrix from three axis vectors and a translation.
/// X, Y, Z are stored in m[0..2] with w = 0; T is stored in m[3] with w = 1.
inline NVPARTICLES_CUDA_EXPORT mat44f mat44f::fromVectors(const vec3f& X, const vec3f& Y, const vec3f& Z, const vec3f& T)
{
    mat44f result;
    // translation first, then the three axes; the assignments are independent.
    result.m[3] = make_vec4f(T.x,T.y,T.z,1);
    result.m[2] = make_vec4f(Z.x,Z.y,Z.z,0);
    result.m[1] = make_vec4f(Y.x,Y.y,Y.z,0);
    result.m[0] = make_vec4f(X.x,X.y,X.z,0);
    return result;
}

//------------------------------------------------------------------------------------------
/// Read-only access to the fourth stored vector, m[3] (where translate() puts tx, ty, tz).
inline NVPARTICLES_CUDA_EXPORT const vec4f& mat44f::translation() const
{
    return this->m[3];
}

//------------------------------------------------------------------------------------------
/// Return a copy of this matrix with m[3] replaced by (0, 0, 0, 1).
/// m[0..2] are copied unchanged, including their w components.
inline NVPARTICLES_CUDA_EXPORT mat44f mat44f::rotation() const
{
    return fromVectors(m[0], m[1], m[2], make_vec4f(0,0,0,1));
}

//------------------------------------------------------------------------------------------
/// Rebuild m[0..2] as a normalised, mutually perpendicular set; m[3] is left untouched.
/// m[2] keeps its direction (normalised), m[1] becomes normalize(cross(m[2], m[0])) and
/// m[0] becomes normalize(cross(m[1], m[2])).
/// NOTE(review): relies on the project's vec4f cross()/normalize(); how they treat w is not visible here.
inline NVPARTICLES_CUDA_EXPORT void mat44f::orthogonalise()
{
    vec4f oldX = m[0];
    vec4f newZ = m[2].normalized();
    vec4f newY = normalize(cross(newZ,oldX));

    m[0] = normalize(cross(newY,newZ));
    m[1] = newY;
    m[2] = newZ;
}

//------------------------------------------------------------------------------------------
/// Rotation by rad radians about the X axis.
/// Stores m[1] = (0, cos, sin, 0) and m[2] = (0, -sin, cos, 0); m[0] and m[3] are identity vectors.
inline NVPARTICLES_CUDA_EXPORT mat44f mat44f::rotateX(float rad)
{
    const float c = cosf(rad);
    const float s = sinf(rad);

    mat44f rot;
    rot.m[0] = make_vec4f(1,0,0,0);
    rot.m[1] = make_vec4f(0,c,s,0);
    rot.m[2] = make_vec4f(0,-s,c,0);
    rot.m[3] = make_vec4f(0,0,0,1);
    return rot;
}
//------------------------------------------------------------------------------------------
/// Rotation by rad radians about the Y axis.
/// Stores m[0] = (cos, 0, -sin, 0) and m[2] = (sin, 0, cos, 0); m[1] and m[3] are identity vectors.
inline NVPARTICLES_CUDA_EXPORT mat44f mat44f::rotateY(float rad)
{
    const float c = cosf(rad);
    const float s = sinf(rad);

    mat44f rot;
    rot.m[0] = make_vec4f(c,0,-s,0);
    rot.m[1] = make_vec4f(0,1,0,0);
    rot.m[2] = make_vec4f(s,0,c,0);
    rot.m[3] = make_vec4f(0,0,0,1);
    return rot;
}
//------------------------------------------------------------------------------------------
/// Rotation by rad radians about the Z axis.
/// Stores m[0] = (cos, sin, 0, 0) and m[1] = (-sin, cos, 0, 0); m[2] and m[3] are identity vectors.
inline NVPARTICLES_CUDA_EXPORT mat44f mat44f::rotateZ(float rad)
{
    const float c = cosf(rad);
    const float s = sinf(rad);

    mat44f rot;
    rot.m[0] = make_vec4f(c,s,0,0);
    rot.m[1] = make_vec4f(-s,c,0,0);
    rot.m[2] = make_vec4f(0,0,1,0);
    rot.m[3] = make_vec4f(0,0,0,1);
    return rot;
}
//------------------------------------------------------------------------------------------
/// Build a rotation matrix from the quaternion q = (x, y, z, w).
/// q does not need to be unit length: the factor s = 2 / (w^2 + x^2 + y^2 + z^2) normalises it.
/// A quaternion with all components zero makes s a division by zero.
/// NOTE(review): for q = (0, 0, sin(a/2), cos(a/2)) this stores m[0].y = -sin(a), whereas
/// rotateZ(a) stores m[0].y = +sin(a); confirm the intended rotation sense with callers.
inline NVPARTICLES_CUDA_EXPORT mat44f mat44f::rotateQ(vec4f q)
{
    // squared components and the normalising factor.
    const float ww = q.w*q.w;
    const float xx = q.x*q.x;
    const float yy = q.y*q.y;
    const float zz = q.z*q.z;
    const float s = 2.f/(ww + xx + yy + zz);

    // pairwise products.
    const float xy = q.x*q.y;
    const float xz = q.x*q.z;
    const float yz = q.y*q.z;
    const float wx = q.w*q.x;
    const float wy = q.w*q.y;
    const float wz = q.w*q.z;

    // vectors are multiplied on the right (pre-multiply).
    mat44f rot;
    rot.m[0] = make_vec4f(1.f - s * (yy + zz), s * (xy - wz), s * (xz + wy), 0.f);
    rot.m[1] = make_vec4f(s * (xy + wz), 1.f - s * (xx + zz), s * (yz - wx), 0.f);
    rot.m[2] = make_vec4f(s * (xz - wy), s * (yz + wx), 1.f - s * (xx + yy), 0.f);
    rot.m[3] = make_vec4f(0.f, 0.f, 0.f, 1.f);
    return rot;
}
//------------------------------------------------------------------------------------------
/// Scaling matrix with sx, sy, sz on the first three diagonal entries and 1 on the fourth.
inline NVPARTICLES_CUDA_EXPORT mat44f mat44f::scale(const float sx,const float sy,const float sz)
{
    mat44f result;
    result.m[3] = make_vec4f(0,0,0,1);
    result.m[2] = make_vec4f(0,0,sz,0);
    result.m[1] = make_vec4f(0,sy,0,0);
    result.m[0] = make_vec4f(sx,0,0,0);
    return result;
}
//------------------------------------------------------------------------------------------
/// Uniform scaling matrix: s on the first three diagonal entries and 1 on the fourth.
inline NVPARTICLES_CUDA_EXPORT mat44f mat44f::scale(const float s)
{
    // identical to the three-factor overload with all factors equal.
    return scale(s, s, s);
}
//------------------------------------------------------------------------------------------
/// Translation matrix: identity in m[0..2], with m[3] = (tx, ty, tz, 1).
inline NVPARTICLES_CUDA_EXPORT mat44f mat44f::translate(const float tx, const float ty, const float tz)
{
    mat44f result = identity();
    result.m[3] = make_vec4f(tx,ty,tz,1.f);
    return result;
}
//------------------------------------------------------------------------------------------
/// Perspective projection for the view volume bounded by left/right (l, r), bottom/top (b, t)
/// and near/far (n, f). No checks are made for r == l, t == b or f == n (division by zero).
/// NOTE(review): the values are stored with the (r+l)/(r-l), (t+b)/(t-b) terms in the z components
/// of m[0], m[1] and the -1 in m[3].z, i.e. transposed relative to the layout that translate() and
/// element() use and to the glFrustum matrix; confirm how callers consume this.
inline NVPARTICLES_CUDA_EXPORT mat44f mat44f::frustum(const float l, const float r, const float b, const float t, const float n, const float f)
{
    const float twoNear = 2.f*n;
    const float width  = r-l;
    const float height = t-b;
    const float depth  = f-n;

    mat44f proj;
    proj.m[0] = make_vec4f(twoNear / width, 0.f, (r+l) / width, 0.f);
    proj.m[1] = make_vec4f(0.f, twoNear / height, (t+b) / height, 0.f);
    proj.m[2] = make_vec4f(0.f, 0.f, -(f+n) / depth, -(2.f*f*n) / depth);
    proj.m[3] = make_vec4f(0.f, 0.f, -1.f, 0.f);
    return proj;
}

//------------------------------------------------------------------------------------------
/// Symmetric perspective projection from a vertical field of view.
/// fovyrad is the full vertical angle in radians, aspect multiplies the vertical extent to get
/// the horizontal one, and n / f are the near / far distances. Delegates to frustum().
inline NVPARTICLES_CUDA_EXPORT mat44f mat44f::perspective(const float fovyrad, const float aspect, const float n, const float f)
{
    const float top = n * tanf(fovyrad * 0.5f);
    const float bottom = -top;
    const float left = bottom * aspect;
    const float right = top * aspect;
    return frustum(left, right, bottom, top, n, f);
}

//------------------------------------------------------------------------------------------
/// Replace *this with mat44f::multiply(*this, b).
/// NOTE(review): preMultiply(translate(tx, ty, tz)) gives m[3].x + tx * m[3].w (and likewise for
/// y, z), whereas preTranslate(tx, ty, tz) adds tx*m[0] + ty*m[1] + tz*m[2] to m[3]; the two do
/// not agree for matrices with off-diagonal terms -- confirm which order is intended.
inline NVPARTICLES_CUDA_EXPORT void mat44f::preMultiply(const mat44f &b)
{
    *this = multiply(*this, b);
}

//------------------------------------------------------------------------------------------
/// Replace *this with mat44f::multiply(b, *this).
inline NVPARTICLES_CUDA_EXPORT void mat44f::postMultiply(const mat44f &b)
{
    // the product is fully evaluated into a temporary before being copied back.
    const mat44f product = multiplyTranspose(b, transposed());
    for (int i = 0; i < 4; ++i)
        m[i] = product.m[i];
}

//------------------------------------------------------------------------------------------
/// Multiply two matrices and return the result.
/// result.m[i] component k is dot(a.m[i], b.transposed().m[k]).
inline NVPARTICLES_CUDA_EXPORT mat44f mat44f::multiply(const mat44f &a, const mat44f &b)
{
    return multiplyTranspose(a, b.transposed());
}

//------------------------------------------------------------------------------------------
/// Multiply a by a matrix supplied already transposed (tb).
/// result.m[i] = (dot(a.m[i], tb.m[0]), dot(a.m[i], tb.m[1]), dot(a.m[i], tb.m[2]), dot(a.m[i], tb.m[3])).
inline NVPARTICLES_CUDA_EXPORT mat44f mat44f::multiplyTranspose(const mat44f &a, const mat44f &tb)
{
    mat44f product;
    for (int i = 0; i != 4; ++i)
    {
        const vec4f& lhs = a.m[i];
        vec4f& out = product.m[i];
        out.x = dot(lhs, tb.m[0]);
        out.y = dot(lhs, tb.m[1]);
        out.z = dot(lhs, tb.m[2]);
        out.w = dot(lhs, tb.m[3]);
    }
    return product;
}

//------------------------------------------------------------------------------------------
/// Matrix product operator; evaluates mat44f::multiply(b, a) (arguments swapped).
inline NVPARTICLES_CUDA_EXPORT mat44f operator*(const mat44f &a, const mat44f &b)
{
    const mat44f product = mat44f::multiply(b, a);
    return product;
}

//------------------------------------------------------------------------------------------
/// Transform a vec4f by m; forwards to m.multiply(v).
inline NVPARTICLES_CUDA_EXPORT vec4f operator*(const mat44f &m, const vec4f &v)
{
    const vec4f transformed = m.multiply(v);
    return transformed;
}

//------------------------------------------------------------------------------------------
/// Transform a vec3f by m; forwards to m.multiply(v), which extends v with w = 0.
inline NVPARTICLES_CUDA_EXPORT vec3f operator*(const mat44f &m, const vec3f &v)
{
    const vec3f transformed = m.multiply(v);
    return transformed;
}

//------------------------------------------------------------------------------------------
/// Component k of the result is dot(v, m[k]) for k = 0..3.
NVPARTICLES_CUDA_EXPORT inline vec4f mat44f::multiplyTranspose(vec4f v) const
{
    vec4f out;
    out.w = dot(v, m[3]);
    out.z = dot(v, m[2]);
    out.y = dot(v, m[1]);
    out.x = dot(v, m[0]);
    return out;
}

//------------------------------------------------------------------------------------------
/// Extend v with w = 0 and return (dot(v, m[0]), dot(v, m[1]), dot(v, m[2])).
/// The fourth dot product is not computed and no division by w is applied.
NVPARTICLES_CUDA_EXPORT inline vec3f mat44f::multiplyTranspose(vec3f v) const
{
    vec4f extended = make_vec4f(v, 0.0f);

    vec3f out;
    out.x = dot(extended, m[0]);
    out.y = dot(extended, m[1]);
    out.z = dot(extended, m[2]);
    return out;
}

//------------------------------------------------------------------------------------------
/// Transform v: component k of the result is dot(v, (m[0][k], m[1][k], m[2][k], m[3][k])).
/// Implemented by transposing first and then taking dot products with the stored vectors.
NVPARTICLES_CUDA_EXPORT inline vec4f mat44f::multiply(vec4f v) const
{
    const mat44f flipped = transposed();
    return flipped.multiplyTranspose(v);
}

//------------------------------------------------------------------------------------------
/// Transform v as a point: extend it with w = 1, transform, and return x, y, z of the result.
/// The resulting w is discarded without dividing by it.
NVPARTICLES_CUDA_EXPORT inline vec3f mat44f::multiplyPoint(vec3f v) const
{
    const mat44f flipped = transposed();
    return make_vec3f(flipped.multiplyTranspose(make_vec4f(v,1)));
}

//------------------------------------------------------------------------------------------
/// Transform v as a direction: multiplyTranspose(vec3f) extends it with w = 0, so m[3]
/// does not contribute to the result.
NVPARTICLES_CUDA_EXPORT inline vec3f mat44f::multiply(vec3f v) const
{
    const mat44f flipped = transposed();
    return flipped.multiplyTranspose(v);
}

//------------------------------------------------------------------------------------------
/// Transform (v, 1) and divide x, y, z by the resulting w.
/// A resulting w of zero is not checked for.
NVPARTICLES_CUDA_EXPORT inline vec3f mat44f::multiplyNormalized(vec3f v) const
{
    const vec4f homogeneous = multiply(make_vec4f(v, 1.0f));
    const float w = homogeneous.w;
    return make_vec3f(homogeneous.x/w, homogeneous.y/w, homogeneous.z/w);
}

//------------------------------------------------------------------------------------------
/// Multiply this matrix in place by a scaling matrix S = diag(sx, sy, sz, 1), as M * S.
/// That scales the whole stored vectors m[0], m[1], m[2] by sx, sy, sz; m[3] is unchanged.
/// This is the same multiplication order that preTranslate() implements for translations.
///
/// Fix: only m[0].x, m[1].y and m[2].z used to be scaled, which is not a product with a scale
/// matrix in either order once the matrix has off-diagonal terms (e.g. after a rotation).
/// For matrices whose m[0..2] are purely diagonal the result is unchanged.
NVPARTICLES_CUDA_EXPORT inline void mat44f::preScale(float sx, float sy, float sz)
{
    m[0] = sx * m[0];
    m[1] = sy * m[1];
    m[2] = sz * m[2];
}

//------------------------------------------------------------------------------------------
/// Add tx*m[0] + ty*m[1] + tz*m[2] to m[3] in place; m[0..2] are unchanged.
NVPARTICLES_CUDA_EXPORT inline void mat44f::preTranslate(float tx, float ty, float tz)
{
    vec4f offset = tx*m[0] + ty*m[1] + tz*m[2];
    m[3] += offset;
}

//----------------------------------------------------------------------------
/// Interpolate between matrices a and b at parameter t.
/// m[0..2] of both inputs are passed to mat33f::slerp; m[3] is blended linearly as
/// t * b.m[3] + (1 - t) * a.m[3]. The interpolated axes are stored with w = 0.
inline NVPARTICLES_CUDA_EXPORT mat44f mat44f::slerp(float t, const mat44f& a, const mat44f& b)
{
    // rotational part.
    mat33f rotA = mat33f::fromVectors(a.m[0], a.m[1], a.m[2]);
    mat33f rotB = mat33f::fromVectors(b.m[0], b.m[1], b.m[2]);
    mat33f rotT = mat33f::slerp(t, rotA, rotB);

    // positional part.
    vec4f posA = a.m[3];
    vec4f posB = b.m[3];
    vec4f posT = t*posB + (1-t)*posA;

    vec3f axisX = make_vec3f(rotT.m[0]);
    vec3f axisY = make_vec3f(rotT.m[1]);
    vec3f axisZ = make_vec3f(rotT.m[2]);

    return fromVectors(make_vec4f(axisX,0), make_vec4f(axisY,0), make_vec4f(axisZ,0), posT);
}

//------------------------------------------------------------------------------------------
/// Build a frame located at `from` whose Z vector points towards `to`, combined with a
/// rotateZ(twist) via operator*.
/// Z = normalize(to - from), X = normalize(cross(Z, (0,1,0))), Y = normalize(cross(Z, X)).
/// NOTE(review): when to - from is parallel to (0,1,0) the first cross product is the zero
/// vector; the outcome then depends on how normalize() handles it.
inline NVPARTICLES_CUDA_EXPORT mat44f mat44f::lookAt(vec3f from, vec3f to, float twist)
{
    vec3f forward = normalize(to-from);
    vec3f side = normalize(cross(forward,make_vec3f(0,1,0)));
    vec3f up = normalize(cross(forward,side));

    return mat44f::rotateZ(twist) * mat44f::fromVectors(side,up,forward,from);
}

///------------------------------------------------------------------------------------------
/// quaternion:
///------------------------------------------------------------------------------------------
/// Write the rotation matrix for quaternion q into mat (all 16 values are overwritten).
/// q does not need to be unit length: s = 2 / (w^2 + x^2 + y^2 + z^2) normalises it.
/// A quaternion with all components zero makes s a division by zero.
inline NVPARTICLES_CUDA_EXPORT void quatf_to_mat44f(quatf q, mat44f &mat)
{
    // squared components and the normalising factor.
    const float ww = q.w*q.w;
    const float xx = q.x*q.x;
    const float yy = q.y*q.y;
    const float zz = q.z*q.z;
    const float s = 2.0f/(ww + xx + yy + zz);

    // pairwise products.
    const float xy = q.x*q.y;
    const float xz = q.x*q.z;
    const float yz = q.y*q.z;
    const float wx = q.w*q.x;
    const float wy = q.w*q.y;
    const float wz = q.w*q.z;

    // vectors are multiplied on the left (pre-multiply).
    mat.m[0] = make_vec4f(1.0f - s * (yy + zz), s * (xy - wz), s * (xz + wy), 0.0f);
    mat.m[1] = make_vec4f(s * (xy + wz), 1.0f - s * (xx + zz), s * (yz - wx), 0.0f);
    mat.m[2] = make_vec4f(s * (xz - wy), s * (yz + wx), 1.0f - s * (xx + yy), 0.0f);
    mat.m[3] = make_vec4f(0.0f, 0.0f, 0.0f, 1.0f);
}

/// Extract a quaternion from the upper-left 3x3 block of m.
/// Algorithm from Ken Shoemake's 1987 SIGGRAPH course notes article
/// "Quaternion Calculus and Fast Animation".
/// When the trace is positive, w is derived first. Otherwise the largest of the three
/// diagonal values selects which of x, y, z is derived first, and the rest follow from it.
inline NVPARTICLES_CUDA_EXPORT quatf quatf_from_mat44f(mat44f m)
{
    quatf q;
    const float trace = m(0,0) + m(1,1) + m(2,2);

    if ( trace > 0.0f )
    {
        // |w| > 1/2, may as well choose w > 1/2
        const float twoW = sqrtf(trace + 1.0f);     // 2w
        const float invFourW = (0.5f)/twoW;         // 1/(4w)
        q.w = (0.5f)*twoW;
        q.x = (m(2,1)-m(1,2))*invFourW;
        q.y = (m(0,2)-m(2,0))*invFourW;
        q.z = (m(1,0)-m(0,1))*invFourW;
        return q;
    }

    // |w| <= 1/2: pick the largest diagonal entry (i), with j and k the next two indices cyclically.
    int i = 0;
    if ( m(1,1) > m(0,0) )
        i = 1;
    if ( m(2,2) > m(i,i) )
        i = 2;
    const int j = (i+1)%3;
    const int k = (j+1)%3;

    // index 0, 1, 2 selects x, y, z of the result.
    float* const component[3] = { &q.x, &q.y, &q.z };

    const float twoMajor = sqrtf(m(i,i)-m(j,j)-m(k,k)+1.0f);
    const float invFourMajor = (0.5f)/twoMajor;

    *component[i] = (0.5f)*twoMajor;
    q.w = (m(k,j)-m(j,k))*invFourMajor;
    *component[j] = (m(j,i)+m(i,j))*invFourMajor;
    *component[k] = (m(k,i)+m(i,k))*invFourMajor;

    return q;
}

//------------------------------------------------------------------------------------------
/// Quaternion form of mat44f::lookAt(from, to, twist), obtained through quatf_from_mat44f.
inline NVPARTICLES_CUDA_EXPORT quatf quatf_look_at(vec3f from, vec3f to, float twist)
{
    const mat44f frame = mat44f::lookAt(from,to,twist);
    return quatf_from_mat44f(frame);
}
//------------------------------------------------------------------------------------------
/// Overwrite the stored vector m[row] with v; row is not range-checked.
/// NOTE(review): m[row] is the storage element() addresses as column `row`; confirm the
/// intended meaning of "row" with callers.
NVPARTICLES_CUDA_EXPORT inline void mat44f::set_row(int row, const vec4f v)
{
    this->m[row] = v;
}
//------------------------------------------------------------------------------------------
/// Return the determinant of the upper-left 3x3 block only.
/// Only _array[0..2], [4..6] and [8..10] are read; the fourth stored vector and all w
/// components are ignored, so this is not the full 4x4 determinant in general.
NVPARTICLES_CUDA_EXPORT inline float mat44f::determinant() const
{
    // components of the first three stored vectors.
    const float c0x = _array[0];
    const float c0y = _array[1];
    const float c0z = _array[2];
    const float c1x = _array[4];
    const float c1y = _array[5];
    const float c1z = _array[6];
    const float c2x = _array[8];
    const float c2y = _array[9];
    const float c2z = _array[10];

    // rule of Sarrus: three positive products, then three negative ones.
    float det = c0x * c1y * c2z;
    det += c1x * c2y * c0z;
    det += c2x * c0y * c1z;
    det -= c2x * c1y * c0z;
    det -= c1x * c0y * c2z;
    det -= c0x * c2y * c1z;
    return det;
}
//------------------------------------------------------------------------------------------
/// Return the transpose of the matrix: result._array[4*i + j] = _array[4*j + i].
NVPARTICLES_CUDA_EXPORT inline mat44f mat44f::transposed() const
{
    mat44f flipped;
    for (int i = 0; i < 4; ++i)
    {
        for (int j = 0; j < 4; ++j)
            flipped._array[4*i + j] = _array[4*j + i];
    }
    return flipped;
}

//------------------------------------------------------------------------------------------
/// Static form of transposed(): returns the transpose of m without modifying it.
inline NVPARTICLES_CUDA_EXPORT mat44f mat44f::transpose(const mat44f& m)
{
    const mat44f flipped = m.transposed();
    return flipped;
}

//------------------------------------------------------------------------------------------
/// Inverse for a matrix whose upper-left 3x3 block is orthonormal (rotation only, no scale).
/// The 3x3 block is transposed and the negated m[3].xyz is transformed by that transposed block.
/// The w components are written as 0, 0, 0, 1 regardless of the input.
NVPARTICLES_CUDA_EXPORT inline mat44f mat44f::inverseAffineNoScale() const
{
    // negated translation.
    const float ntx = -m[3].x;
    const float nty = -m[3].y;
    const float ntz = -m[3].z;

    mat44f inv;

    // transposed 3x3 block.
    inv.m[0] = make_vec4f(m[0].x, m[1].x, m[2].x, 0.f);
    inv.m[1] = make_vec4f(m[0].y, m[1].y, m[2].y, 0.f);
    inv.m[2] = make_vec4f(m[0].z, m[1].z, m[2].z, 0.f);

    // translation of the inverse.
    inv.m[3] = make_vec4f(inv.m[0].x*ntx+inv.m[1].x*nty+inv.m[2].x*ntz,
                          inv.m[0].y*ntx+inv.m[1].y*nty+inv.m[2].y*ntz,
                          inv.m[0].z*ntx+inv.m[1].z*nty+inv.m[2].z*ntz,
                          1.f);
    return inv;
}
//------------------------------------------------------------------------------------------
/// Inverse that considers only the upper-left 3x3 block and m[3].xyz.
/// The 3x3 block is inverted through its cofactors divided by determinant(); the translation
/// of the result is the negated m[3].xyz transformed by that inverse block. The w components
/// are written as 0, 0, 0, 1 regardless of the input.
/// A zero determinant is not checked for (division by zero).
NVPARTICLES_CUDA_EXPORT inline mat44f mat44f::inverseAffine() const
{
    const float idet = 1.0f / determinant();

    // components of the first three stored vectors, and the translation.
    const float c0x = _array[0],  c0y = _array[1],  c0z = _array[2];
    const float c1x = _array[4],  c1y = _array[5],  c1z = _array[6];
    const float c2x = _array[8],  c2y = _array[9],  c2z = _array[10];
    const float tx  = _array[12], ty  = _array[13], tz  = _array[14];

    mat44f ret;
    float* const o = ret._array;

    o[0]  =  (c1y * c2z - c2y * c1z) * idet;
    o[1]  = -(c0y * c2z - c2y * c0z) * idet;
    o[2]  =  (c0y * c1z - c1y * c0z) * idet;
    o[3]  = 0.0f;

    o[4]  = -(c1x * c2z - c2x * c1z) * idet;
    o[5]  =  (c0x * c2z - c2x * c0z) * idet;
    o[6]  = -(c0x * c1z - c1x * c0z) * idet;
    o[7]  = 0.0f;

    o[8]  =  (c1x * c2y - c2x * c1y) * idet;
    o[9]  = -(c0x * c2y - c2x * c0y) * idet;
    o[10] =  (c0x * c1y - c1x * c0y) * idet;
    o[11] = 0.0f;

    // translation uses the inverse block computed above.
    o[12] = -(tx * o[0] + ty * o[4] + tz * o[8]);
    o[13] = -(tx * o[1] + ty * o[5] + tz * o[9]);
    o[14] = -(tx * o[2] + ty * o[6] + tz * o[10]);
    o[15] = 1.0f;
    return ret;
}
//------------------------------------------------------------------------------------------
/// Return the inverse of the matrix, computed by Gaussian elimination with scaled partial
/// pivoting on a 4x8 augmented system [M | I].
///
/// If the matrix is detected as singular (an all-zero row, or a zero pivot after row
/// selection) the identity matrix is returned instead.
///
/// Changes from the previous version:
///  - the `register` storage class (ill-formed since C++17) is removed;
///  - a zero pivot is now detected at every elimination step, not only the last one;
///    previously a zero pivot at steps 0..2 divided by zero and propagated NaNs;
///  - fabsf() is used instead of float(fabs()).
NVPARTICLES_CUDA_EXPORT inline mat44f mat44f::inverse() const
{
    mat44f minv = mat44f::identity();

    // augmented rows; s[] holds row pointers so that pivoting is a pointer swap.
    float r1[8], r2[8], r3[8], r4[8];
    float *s[4], *tmprow;

    s[0] = &r1[0];
    s[1] = &r2[0];
    s[2] = &r3[0];
    s[3] = &r4[0];

    int i,j,p,jj;

    // left half: this matrix; right half: identity.
    for (i=0;i<4;i++)
    {
        for (j=0;j<4;j++)
        {
            s[i][j] = element(i,j);
            s[i][j+4] = (i==j) ? 1.0f : 0.0f;
        }
    }

    // per-row scale factors: the largest absolute value in each row.
    float scp[4];
    for (i=0;i<4;i++)
    {
        scp[i] = fabsf(s[i][0]);
        for (j=1;j<4;j++)
            if (fabsf(s[i][j]) > scp[i]) scp[i] = fabsf(s[i][j]);
        if (scp[i] == 0.0f) return minv; // singular matrix!
    }

    int pivot_to;
    float scp_max;
    for (i=0;i<4;i++)
    {
        // select pivot row
        pivot_to = i;
        scp_max = fabsf(s[i][i]/scp[i]);
        // find out which row should be on top
        for (p=i+1;p<4;p++)
            if (fabsf(s[p][i]/scp[p]) > scp_max)
            {
                scp_max = fabsf(s[p][i]/scp[p]);
                pivot_to = p;
            }
        // Pivot if necessary
        if (pivot_to != i)
        {
            tmprow = s[i];
            s[i] = s[pivot_to];
            s[pivot_to] = tmprow;
            float tmpscp;
            tmpscp = scp[i];
            scp[i] = scp[pivot_to];
            scp[pivot_to] = tmpscp;
        }

        // the best available pivot is zero: every candidate in this column is zero.
        // (for i == 3 this is the same test the code previously made after the loop.)
        if (s[i][i] == 0.0f)
            return minv; // singular matrix!

        float mji;
        // perform gaussian elimination
        for (j=i+1;j<4;j++)
        {
            mji = s[j][i]/s[i][i];
            s[j][i] = 0.0f;
            for (jj=i+1;jj<8;jj++)
                s[j][jj] -= mji*s[i][jj];
        }
    }

    // back substitution: clear the entries above each pivot.
    float mij;
    for (i=3;i>0;i--)
    {
        for (j=i-1;j > -1; j--)
        {
            mij = s[j][i]/s[i][i];
            for (jj=j+1;jj<8;jj++)
                s[j][jj] -= mij*s[i][jj];
        }
    }

    // the right half divided by the pivots is the inverse.
    for (i=0;i<4;i++)
        for (j=0;j<4;j++)
            minv.element(i,j) = s[i][j+4] / s[i][i];


    return minv;
}

//---------------------------------------------------------------------------------------
/// Free-function form of mat44f::inverse(): returns the inverse of a without modifying it.
inline NVPARTICLES_CUDA_EXPORT mat44f inverse(const mat44f& a)
{
    const mat44f inverted = a.inverse();
    return inverted;
}

//------------------------------------------------------------------------------------------
#ifndef __CUDA_ARCH__
/*inline std::ostream &operator<< (std::ostream &s, const mat33f &v)
{
    s << v.x << " " << v.y << " " << v.z;
    return s;
}*/
/*
//------------------------------------------------------------------------------------------
inline std::ostream &operator<< (std::ostream &s, const mat44f &v)
{
    s << v.m[0] << "\n" << v.m[1] << "\n" << v.m[2] << "\n" << v.m[3];
    return s;
}
*/
//---------------------------------------------------------------------------------------
#endif

//------------------------------------------------------------------------------------------
}

#endif // MAT44F_H_INCLUDED
