Index: source/maths/Matrix3D.h
===================================================================
--- source/maths/Matrix3D.h
+++ source/maths/Matrix3D.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2019 Wildfire Games.
+/* Copyright (C) 2021 Wildfire Games.
  * This file is part of 0 A.D.
  *
  * 0 A.D. is free software: you can redistribute it and/or modify
@@ -23,6 +23,7 @@
 #ifndef INCLUDED_MATRIX3D
 #define INCLUDED_MATRIX3D
 
+#include "lib/sysdep/arch/x86_x64/simd.h"
 #include "maths/Vector3D.h"
 #include "maths/Vector4D.h"
 
@@ -100,6 +101,53 @@
 	// matrix multiplication
 	CMatrix3D operator*(const CMatrix3D &matrix) const
 	{
+#if OS_WIN && COMPILER_HAS_SSE
+		// Each group of four floats of the result is a linear combination of this matrix's
+		// four groups, weighted by the entries of the matching column of 'matrix'.
+		// Assumes _data2d[c] stores column c (_1c, _2c, _3c, _4c) contiguously - TODO confirm.
+		CMatrix3D result;
+		__m128 col1 = _mm_loadu_ps(_data2d[0]);
+		__m128 col2 = _mm_loadu_ps(_data2d[1]);
+		__m128 col3 = _mm_loadu_ps(_data2d[2]);
+		__m128 col4 = _mm_loadu_ps(_data2d[3]);
+
+		__m128 mvec = _mm_set_ps1(matrix._11);
+		__m128 vec = _mm_mul_ps(mvec, col1);
+		mvec = _mm_set_ps1(matrix._21);
+		vec = _mm_add_ps(_mm_mul_ps(mvec, col2), vec);
+		mvec = _mm_set_ps1(matrix._31);
+		vec = _mm_add_ps(_mm_mul_ps(mvec, col3), vec);
+		mvec = _mm_set_ps1(matrix._41);
+		_mm_storeu_ps(result._data2d[0], _mm_add_ps(_mm_mul_ps(mvec, col4), vec));
+
+		mvec = _mm_set_ps1(matrix._12);
+		vec = _mm_mul_ps(mvec, col1);
+		mvec = _mm_set_ps1(matrix._22);
+		vec = _mm_add_ps(_mm_mul_ps(mvec, col2), vec);
+		mvec = _mm_set_ps1(matrix._32);
+		vec = _mm_add_ps(_mm_mul_ps(mvec, col3), vec);
+		mvec = _mm_set_ps1(matrix._42);
+		_mm_storeu_ps(result._data2d[1], _mm_add_ps(_mm_mul_ps(mvec, col4), vec));
+
+		mvec = _mm_set_ps1(matrix._13);
+		vec = _mm_mul_ps(mvec, col1);
+		mvec = _mm_set_ps1(matrix._23);
+		vec = _mm_add_ps(_mm_mul_ps(mvec, col2), vec);
+		mvec = _mm_set_ps1(matrix._33);
+		vec = _mm_add_ps(_mm_mul_ps(mvec, col3), vec);
+		mvec = _mm_set_ps1(matrix._43);
+		_mm_storeu_ps(result._data2d[2], _mm_add_ps(_mm_mul_ps(mvec, col4), vec));
+
+		mvec = _mm_set_ps1(matrix._14);
+		vec = _mm_mul_ps(mvec, col1);
+		mvec = _mm_set_ps1(matrix._24);
+		vec = _mm_add_ps(_mm_mul_ps(mvec, col2), vec);
+		mvec = _mm_set_ps1(matrix._34);
+		vec = _mm_add_ps(_mm_mul_ps(mvec, col3), vec);
+		mvec = _mm_set_ps1(matrix._44);
+		_mm_storeu_ps(result._data2d[3], _mm_add_ps(_mm_mul_ps(mvec, col4), vec));
+		return result;
+#else
 		return CMatrix3D(
 			_11*matrix._11 + _12*matrix._21 + _13*matrix._31 + _14*matrix._41,
 			_11*matrix._12 + _12*matrix._22 + _13*matrix._32 + _14*matrix._42,
@@ -121,6 +169,7 @@
 			_41*matrix._13 + _42*matrix._23 + _43*matrix._33 + _44*matrix._43,
 			_41*matrix._14 + _42*matrix._24 + _43*matrix._34 + _44*matrix._44
 		);
+#endif
 	}
 
 	// matrix multiplication/assignment
@@ -196,19 +245,60 @@
 	// blend matrix using only 4x3 subset
 	void Blend(const CMatrix3D& m, float f)
 	{
+#if OS_WIN && COMPILER_HAS_SSE
+		// Lanes 0-2 of every group are overwritten with m*f; lane 3 keeps its current bits.
+		// The lanes are selected with bit masks rather than multiplied by 0/1, because
+		// 0 * NaN (or 0 * Inf) is NaN and would leak stale or invalid values into the result.
+		const __m128 fvec = _mm_set_ps1(f);
+		const __m128 keep = _mm_cmpneq_ps(_mm_set_ps(1.f, 0.f, 0.f, 0.f), _mm_setzero_ps());
+
+		__m128 col = _mm_loadu_ps(_data2d[0]);
+		__m128 scaled = _mm_mul_ps(_mm_loadu_ps(m._data2d[0]), fvec);
+		_mm_storeu_ps(_data2d[0], _mm_or_ps(_mm_and_ps(keep, col), _mm_andnot_ps(keep, scaled)));
+		col = _mm_loadu_ps(_data2d[1]);
+		scaled = _mm_mul_ps(_mm_loadu_ps(m._data2d[1]), fvec);
+		_mm_storeu_ps(_data2d[1], _mm_or_ps(_mm_and_ps(keep, col), _mm_andnot_ps(keep, scaled)));
+		col = _mm_loadu_ps(_data2d[2]);
+		scaled = _mm_mul_ps(_mm_loadu_ps(m._data2d[2]), fvec);
+		_mm_storeu_ps(_data2d[2], _mm_or_ps(_mm_and_ps(keep, col), _mm_andnot_ps(keep, scaled)));
+		col = _mm_loadu_ps(_data2d[3]);
+		scaled = _mm_mul_ps(_mm_loadu_ps(m._data2d[3]), fvec);
+		_mm_storeu_ps(_data2d[3], _mm_or_ps(_mm_and_ps(keep, col), _mm_andnot_ps(keep, scaled)));
+#else
 		_11 = m._11*f; _21 = m._21*f; _31 = m._31*f;
 		_12 = m._12*f; _22 = m._22*f; _32 = m._32*f;
 		_13 = m._13*f; _23 = m._23*f; _33 = m._33*f;
 		_14 = m._14*f; _24 = m._24*f; _34 = m._34*f;
+#endif
 	}
 
 	// blend matrix using only 4x3 and add onto existing blend
 	void AddBlend(const CMatrix3D& m, float f)
 	{
+#if OS_WIN && COMPILER_HAS_SSE
+		// Lanes 0-2 of every group accumulate m*f; lane 3 is written back bit-for-bit.
+		// Adding m*0 to lane 3 instead would turn it into NaN whenever m holds NaN/Inf there.
+		const __m128 fvec = _mm_set_ps1(f);
+		const __m128 keep = _mm_cmpneq_ps(_mm_set_ps(1.f, 0.f, 0.f, 0.f), _mm_setzero_ps());
+
+		__m128 col = _mm_loadu_ps(_data2d[0]);
+		__m128 sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(m._data2d[0]), fvec), col);
+		_mm_storeu_ps(_data2d[0], _mm_or_ps(_mm_and_ps(keep, col), _mm_andnot_ps(keep, sum)));
+		col = _mm_loadu_ps(_data2d[1]);
+		sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(m._data2d[1]), fvec), col);
+		_mm_storeu_ps(_data2d[1], _mm_or_ps(_mm_and_ps(keep, col), _mm_andnot_ps(keep, sum)));
+		col = _mm_loadu_ps(_data2d[2]);
+		sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(m._data2d[2]), fvec), col);
+		_mm_storeu_ps(_data2d[2], _mm_or_ps(_mm_and_ps(keep, col), _mm_andnot_ps(keep, sum)));
+		col = _mm_loadu_ps(_data2d[3]);
+		sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(m._data2d[3]), fvec), col);
+		_mm_storeu_ps(_data2d[3], _mm_or_ps(_mm_and_ps(keep, col), _mm_andnot_ps(keep, sum)));
+#else
 		_11 += m._11*f; _21 += m._21*f; _31 += m._31*f;
 		_12 += m._12*f; _22 += m._22*f; _32 += m._32*f;
 		_13 += m._13*f; _23 += m._23*f; _33 += m._33*f;
 		_14 += m._14*f; _24 += m._24*f; _34 += m._34*f;
+#endif
 	}
 
 	// set this matrix to a rotation matrix for a rotation about X axis of given angle