Index: source/maths/Matrix3D.h
===================================================================
--- source/maths/Matrix3D.h
+++ source/maths/Matrix3D.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2019 Wildfire Games.
+/* Copyright (C) 2021 Wildfire Games.
  * This file is part of 0 A.D.
  *
  * 0 A.D. is free software: you can redistribute it and/or modify
@@ -97,30 +97,14 @@
 		return _data[idx];
 	}
 
+	static CMatrix3D(*Multiply)(const CMatrix3D& source, const CMatrix3D& matrix); // routine behind operator*: returns source * matrix; Matrix3D.cpp points it at MultiplicationFallback by default
+	static void(*BlendMat)(CMatrix3D* s, const CMatrix3D& m, float f); // routine behind Blend(): writes m * f into the 4x3 subset of *s
+	static void(*AddBlendMat)(CMatrix3D* s, const CMatrix3D& m, float f); // routine behind AddBlend(): adds m * f onto the 4x3 subset of *s
+
 	// matrix multiplication
-	CMatrix3D operator*(const CMatrix3D &matrix) const
+	CMatrix3D operator* (const CMatrix3D& matrix) const
 	{
-		return CMatrix3D(
-			_11*matrix._11 + _12*matrix._21 + _13*matrix._31 + _14*matrix._41,
-			_11*matrix._12 + _12*matrix._22 + _13*matrix._32 + _14*matrix._42,
-			_11*matrix._13 + _12*matrix._23 + _13*matrix._33 + _14*matrix._43,
-			_11*matrix._14 + _12*matrix._24 + _13*matrix._34 + _14*matrix._44,
-
-			_21*matrix._11 + _22*matrix._21 + _23*matrix._31 + _24*matrix._41,
-			_21*matrix._12 + _22*matrix._22 + _23*matrix._32 + _24*matrix._42,
-			_21*matrix._13 + _22*matrix._23 + _23*matrix._33 + _24*matrix._43,
-			_21*matrix._14 + _22*matrix._24 + _23*matrix._34 + _24*matrix._44,
-
-			_31*matrix._11 + _32*matrix._21 + _33*matrix._31 + _34*matrix._41,
-			_31*matrix._12 + _32*matrix._22 + _33*matrix._32 + _34*matrix._42,
-			_31*matrix._13 + _32*matrix._23 + _33*matrix._33 + _34*matrix._43,
-			_31*matrix._14 + _32*matrix._24 + _33*matrix._34 + _34*matrix._44,
-
-			_41*matrix._11 + _42*matrix._21 + _43*matrix._31 + _44*matrix._41,
-			_41*matrix._12 + _42*matrix._22 + _43*matrix._32 + _44*matrix._42,
-			_41*matrix._13 + _42*matrix._23 + _43*matrix._33 + _44*matrix._43,
-			_41*matrix._14 + _42*matrix._24 + _43*matrix._34 + _44*matrix._44
-		);
+		return Multiply(*this, matrix); // forwards to whichever routine the Multiply pointer currently holds
 	}
 
 	// matrix multiplication/assignment
@@ -196,19 +180,13 @@
 	// blend matrix using only 4x3 subset
 	void Blend(const CMatrix3D& m, float f)
 	{
-		_11 = m._11*f; _21 = m._21*f; _31 = m._31*f;
-		_12 = m._12*f; _22 = m._22*f; _32 = m._32*f;
-		_13 = m._13*f; _23 = m._23*f; _33 = m._33*f;
-		_14 = m._14*f; _24 = m._24*f; _34 = m._34*f;
+		BlendMat(this, m, f); // forwards to whichever routine the BlendMat pointer currently holds
 	}
 
 	// blend matrix using only 4x3 and add onto existing blend
 	void AddBlend(const CMatrix3D& m, float f)
 	{
-		_11 += m._11*f; _21 += m._21*f; _31 += m._31*f;
-		_12 += m._12*f; _22 += m._22*f; _32 += m._32*f;
-		_13 += m._13*f; _23 += m._23*f; _33 += m._33*f;
-		_14 += m._14*f; _24 += m._24*f; _34 += m._34*f;
+		AddBlendMat(this, m, f); // forwards to whichever routine the AddBlendMat pointer currently holds
 	}
 
 	// set this matrix to a rotation matrix for a rotation about X axis of given angle
@@ -322,4 +300,6 @@
 	CVector3D RotateTransposed(const CVector3D& vector) const;
 };
 
+extern void Matrix3DActivateFastImpl(); // repoints Multiply/BlendMat/AddBlendMat at the SSE routines when the build and the host support SSE; otherwise leaves them alone
+
 #endif // INCLUDED_MATRIX3D
Index: source/maths/Matrix3D.cpp
===================================================================
--- source/maths/Matrix3D.cpp
+++ source/maths/Matrix3D.cpp
@@ -1,4 +1,4 @@
-/* Copyright (C) 2019 Wildfire Games.
+/* Copyright (C) 2021 Wildfire Games.
  * This file is part of 0 A.D.
  *
  * 0 A.D. 
is free software: you can redistribute it and/or modify
@@ -22,10 +22,16 @@
 
 #include "precompiled.h"
 
+#include "lib/sysdep/compiler.h"
 #include "Matrix3D.h"
 #include "Quaternion.h"
 #include "Vector4D.h"
 
+#if COMPILER_HAS_SSE
+# include "lib/sse.h"
+# include <xmmintrin.h>
+#endif
+
 //Sets the identity matrix
 void CMatrix3D::SetIdentity ()
 {
@@ -442,3 +448,169 @@
 	// Negate the return angle to match the SetYRotation convention
 	return -atan2(axis.Z, axis.X);
 }
+
+#if COMPILER_HAS_SSE
+
+// matrix SSE multiplication
+// Column c of the result is the sum of the columns of 'source', each scaled
+// by the matching entry of column c of 'matrix'.
+CMatrix3D MultiplicationSSE(const CMatrix3D& source, const CMatrix3D& matrix)
+{
+	CMatrix3D result;
+	__m128 col1 = _mm_loadu_ps(source._data2d[0]);
+	__m128 col2 = _mm_loadu_ps(source._data2d[1]);
+	__m128 col3 = _mm_loadu_ps(source._data2d[2]);
+	__m128 col4 = _mm_loadu_ps(source._data2d[3]);
+
+	__m128 mvec = _mm_set_ps1(matrix._11);
+	__m128 vec = _mm_mul_ps(mvec, col1);
+	mvec = _mm_set_ps1(matrix._21);
+	vec = _mm_add_ps(_mm_mul_ps(mvec, col2), vec);
+	mvec = _mm_set_ps1(matrix._31);
+	vec = _mm_add_ps(_mm_mul_ps(mvec, col3), vec);
+	mvec = _mm_set_ps1(matrix._41);
+	_mm_storeu_ps(result._data2d[0], _mm_add_ps(_mm_mul_ps(mvec, col4), vec));
+
+	mvec = _mm_set_ps1(matrix._12);
+	vec = _mm_mul_ps(mvec, col1);
+	mvec = _mm_set_ps1(matrix._22);
+	vec = _mm_add_ps(_mm_mul_ps(mvec, col2), vec);
+	mvec = _mm_set_ps1(matrix._32);
+	vec = _mm_add_ps(_mm_mul_ps(mvec, col3), vec);
+	mvec = _mm_set_ps1(matrix._42);
+	_mm_storeu_ps(result._data2d[1], _mm_add_ps(_mm_mul_ps(mvec, col4), vec));
+
+	mvec = _mm_set_ps1(matrix._13);
+	vec = _mm_mul_ps(mvec, col1);
+	mvec = _mm_set_ps1(matrix._23);
+	vec = _mm_add_ps(_mm_mul_ps(mvec, col2), vec);
+	mvec = _mm_set_ps1(matrix._33);
+	vec = _mm_add_ps(_mm_mul_ps(mvec, col3), vec);
+	mvec = _mm_set_ps1(matrix._43);
+	_mm_storeu_ps(result._data2d[2], _mm_add_ps(_mm_mul_ps(mvec, col4), vec));
+
+	mvec = _mm_set_ps1(matrix._14);
+	vec = _mm_mul_ps(mvec, col1);
+	mvec = _mm_set_ps1(matrix._24);
+	vec = _mm_add_ps(_mm_mul_ps(mvec, col2), vec);
+	mvec = _mm_set_ps1(matrix._34);
+	vec = _mm_add_ps(_mm_mul_ps(mvec, col3), vec);
+	mvec = _mm_set_ps1(matrix._44);
+	_mm_storeu_ps(result._data2d[3], _mm_add_ps(_mm_mul_ps(mvec, col4), vec));
+	return result;
+}
+
+// Lane mask with all bits set in lanes 0-2 and clear in lane 3, i.e. the three
+// rows of a column that the 4x3 blends modify. It is produced by a compare so
+// that only SSE1 intrinsics are needed.
+static inline __m128 BlendRowMaskSSE()
+{
+	return _mm_cmpneq_ps(_mm_set_ps(0.0f, 1.0f, 1.0f, 1.0f), _mm_setzero_ps());
+}
+
+// Lane-wise select: the bits of 'a' where 'mask' is set, the bits of 'b' elsewhere.
+static inline __m128 SelectSSE(__m128 mask, __m128 a, __m128 b)
+{
+	return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
+}
+
+// blend matrix SSE using only 4x3 and add onto existing blend
+// The fourth element of each column of 's' is stored back unchanged and the
+// fourth element of 'm' never reaches the result (a 0 * Inf/NaN product would
+// otherwise poison it), which matches AddBlendFallback.
+void AddBlendSSE(CMatrix3D* s, const CMatrix3D& m, float f)
+{
+	const __m128 fvec = _mm_set_ps1(f);
+	const __m128 rows = BlendRowMaskSSE();
+
+	for (int c = 0; c < 4; ++c)
+	{
+		const __m128 col = _mm_loadu_ps(s->_data2d[c]);
+		const __m128 mcol = _mm_loadu_ps(m._data2d[c]);
+		const __m128 sum = _mm_add_ps(_mm_mul_ps(mcol, fvec), col);
+		_mm_storeu_ps(s->_data2d[c], SelectSSE(rows, sum, col));
+	}
+}
+
+// blend matrix SSE using only 4x3 subset
+// The previous contents of the overwritten elements are dropped with bit
+// operations instead of being multiplied by zero, so a stale NaN/Inf in 's'
+// cannot leak into the result; BlendFallback never reads those elements
+// either. The fourth element of each column of 's' is stored back unchanged.
+void BlendSSE(CMatrix3D* s, const CMatrix3D& m, float f)
+{
+	const __m128 fvec = _mm_set_ps1(f);
+	const __m128 rows = BlendRowMaskSSE();
+
+	for (int c = 0; c < 4; ++c)
+	{
+		const __m128 col = _mm_loadu_ps(s->_data2d[c]);
+		const __m128 mcol = _mm_loadu_ps(m._data2d[c]);
+		_mm_storeu_ps(s->_data2d[c], SelectSSE(rows, _mm_mul_ps(mcol, fvec), col));
+	}
+}
+#endif
+
+// matrix multiplication, portable routine; CMatrix3D::Multiply starts out pointing here
+CMatrix3D MultiplicationFallback(const CMatrix3D& source, const CMatrix3D& matrix)
+{
+	return CMatrix3D(
+		source._11 * matrix._11 + source._12 * matrix._21 + source._13 * matrix._31 + source._14 * matrix._41,
+		source._11 * matrix._12 + source._12 * matrix._22 + source._13 * matrix._32 + source._14 * matrix._42,
+		source._11 * matrix._13 + source._12 * matrix._23 + source._13 * matrix._33 + source._14 * matrix._43,
+		source._11 * matrix._14 + source._12 * matrix._24 + source._13 * matrix._34 + source._14 * matrix._44,
+
+		source._21 * matrix._11 + source._22 * matrix._21 + source._23 * matrix._31 + source._24 * matrix._41,
+		source._21 * matrix._12 + source._22 * matrix._22 + source._23 * matrix._32 + source._24 * matrix._42,
+		source._21 * matrix._13 + source._22 * matrix._23 + source._23 * matrix._33 + source._24 * matrix._43,
+		source._21 * matrix._14 + source._22 * matrix._24 + source._23 * matrix._34 + source._24 * matrix._44,
+
+		source._31 * matrix._11 + source._32 * matrix._21 + source._33 * matrix._31 + source._34 * matrix._41,
+		source._31 * matrix._12 + source._32 * matrix._22 + source._33 * matrix._32 + source._34 * matrix._42,
+		source._31 * matrix._13 + source._32 * matrix._23 + source._33 * matrix._33 + source._34 * matrix._43,
+		source._31 * matrix._14 + source._32 * matrix._24 + source._33 * matrix._34 + source._34 * matrix._44,
+
+		source._41 * matrix._11 + source._42 * matrix._21 + source._43 * matrix._31 + source._44 * matrix._41,
+		source._41 * matrix._12 + source._42 * matrix._22 + source._43 * matrix._32 + source._44 * matrix._42,
+		source._41 * matrix._13 + source._42 * matrix._23 + source._43 * matrix._33 + source._44 * matrix._43,
+		source._41 * matrix._14 + source._42 * matrix._24 + source._43 * matrix._34 + source._44 * matrix._44
+	);
+}
+
+// blend using only the 4x3 subset, portable routine; _41.._44 of *s are not touched
+void BlendFallback(CMatrix3D* s, const CMatrix3D& m, float f)
+{
+	s->_11 = m._11 * f; s->_21 = m._21 * f; s->_31 = m._31 * f;
+	s->_12 = m._12 * f; s->_22 = m._22 * f; s->_32 = m._32 * f;
+	s->_13 = m._13 * f; s->_23 = m._23 * f; s->_33 = m._33 * f;
+	s->_14 = m._14 * f; s->_24 = m._24 * f; s->_34 = m._34 * f;
+}
+
+// add-blend using only the 4x3 subset, portable routine; _41.._44 of *s are not touched
+void AddBlendFallback(CMatrix3D* s, const CMatrix3D& m, float f)
+{
+	s->_11 += m._11 * f; s->_21 += m._21 * f; s->_31 += m._31 * f;
+	s->_12 += m._12 * f; s->_22 += m._22 * f; s->_32 += m._32 * f;
+	s->_13 += m._13 * f; s->_23 += m._23 * f; s->_33 += m._33 * f;
+	s->_14 += m._14 * f; s->_24 += m._24 * f; s->_34 += m._34 * f;
+}
+
+// Implementation pointers declared in CMatrix3D; they start out on the portable routines.
+CMatrix3D(*CMatrix3D::Multiply)(const CMatrix3D& source, const CMatrix3D& matrix) = MultiplicationFallback;
+void(*CMatrix3D::BlendMat)(CMatrix3D* s, const CMatrix3D& m, float f) = BlendFallback;
+void(*CMatrix3D::AddBlendMat)(CMatrix3D* s, const CMatrix3D& m, float f) = AddBlendFallback;
+
+// Switches the CMatrix3D implementation pointers to the SSE routines when the
+// build has SSE support and HostHasSSE() reports it; otherwise changes nothing.
+void Matrix3DActivateFastImpl()
+{
+#if COMPILER_HAS_SSE
+	if (HostHasSSE())
+	{
+		CMatrix3D::Multiply = MultiplicationSSE;
+		CMatrix3D::AddBlendMat = AddBlendSSE;
+		CMatrix3D::BlendMat = BlendSSE;
+		return;
+	}
+#endif
+}
Index: source/maths/tests/test_Matrix3d.h
===================================================================
--- source/maths/tests/test_Matrix3d.h
+++ source/maths/tests/test_Matrix3d.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011 Wildfire Games.
+/* Copyright (C) 2021 Wildfire Games.
  * This file is part of 0 A.D.
  *
  * 0 A.D. 
is free software: you can redistribute it and/or modify
@@ -21,10 +21,315 @@
 #include <cstdlib>
 #include "maths/Matrix3D.h"
 #include "maths/Quaternion.h"
+#include <algorithm>
+#include <cmath>
+#include <iostream>
+#include <vector>
+
+namespace
+{
+	// Returns true when all 16 elements of the two matrices differ by at most EPS.
+	bool EqualsWithEpsilon(const CMatrix3D& m1, const CMatrix3D& m2)
+	{
+		const float EPS = 0.01f;
+		return
+			std::abs(m1._11 - m2._11) <= EPS &&
+			std::abs(m1._12 - m2._12) <= EPS &&
+			std::abs(m1._13 - m2._13) <= EPS &&
+			std::abs(m1._14 - m2._14) <= EPS &&
+			std::abs(m1._21 - m2._21) <= EPS &&
+			std::abs(m1._22 - m2._22) <= EPS &&
+			std::abs(m1._23 - m2._23) <= EPS &&
+			std::abs(m1._24 - m2._24) <= EPS &&
+			std::abs(m1._31 - m2._31) <= EPS &&
+			std::abs(m1._32 - m2._32) <= EPS &&
+			std::abs(m1._33 - m2._33) <= EPS &&
+			std::abs(m1._34 - m2._34) <= EPS &&
+			std::abs(m1._41 - m2._41) <= EPS &&
+			std::abs(m1._42 - m2._42) <= EPS &&
+			std::abs(m1._43 - m2._43) <= EPS &&
+			std::abs(m1._44 - m2._44) <= EPS;
+	}
+
+	// Elapsed time between two timer_Time() readings.
+	// NOTE(review): the tests print the accumulated value with an "ms" suffix;
+	// confirm that this matches the unit timer_Time() reports.
+	double GetDuration(double start, double finish)
+	{
+		return finish - start;
+	}
+
+	// Puts the CMatrix3D implementation pointers back to the values they had
+	// on construction. Matrix3DActivateFastImpl() changes them globally, so
+	// without this every test after the first would run its "SSE (off)" pass
+	// on the fast path and compare that path against itself.
+	class ScopedMatrixImpl
+	{
+	public:
+		ScopedMatrixImpl() :
+			m_Multiply(CMatrix3D::Multiply),
+			m_BlendMat(CMatrix3D::BlendMat),
+			m_AddBlendMat(CMatrix3D::AddBlendMat)
+		{
+		}
+
+		~ScopedMatrixImpl()
+		{
+			CMatrix3D::Multiply = m_Multiply;
+			CMatrix3D::BlendMat = m_BlendMat;
+			CMatrix3D::AddBlendMat = m_AddBlendMat;
+		}
+
+	private:
+		CMatrix3D(*m_Multiply)(const CMatrix3D& source, const CMatrix3D& matrix);
+		void(*m_BlendMat)(CMatrix3D* s, const CMatrix3D& m, float f);
+		void(*m_AddBlendMat)(CMatrix3D* s, const CMatrix3D& m, float f);
+	};
+}
 
 class TestMatrix : public CxxTest::TestSuite
 {
 public:
+
+	// Times operator* before and after Matrix3DActivateFastImpl() and checks that both passes agree.
+	void test_MatrixMultiplicationPerformance()
+	{
+		// Undo Matrix3DActivateFastImpl() when this test finishes.
+		const ScopedMatrixImpl restore_impl;
+
+		const size_t number_of_samples = 200;
+		const size_t number_of_iteration = 1000000;
+
+		std::cout << std::endl;
+		std::cout << "matrix multiplication" << std::endl;
+		std::cout << "number_of_samples = " << number_of_samples << std::endl;
+		std::cout << "number_of_iteration = " << number_of_iteration << std::endl;
+#if COMPILER_HAS_SSE
+		const bool have_sse = true;
+#else
+		const bool have_sse = false;
+#endif
+		std::cout << "have_sse = " << std::boolalpha << have_sse << std::endl;
+		std::vector<CMatrix3D> source_mat1(number_of_iteration);
+		std::vector<CMatrix3D> source_mat2(number_of_iteration);
+		std::vector<CMatrix3D> dest_mat1(number_of_iteration);
+		std::vector<CMatrix3D> dest_mat2(number_of_iteration);
+
+		CMatrix3D m;
+		srand(0);
+		for (size_t i = 0; i < number_of_iteration; ++i)
+		{
+			for (size_t j = 0; j < 16; ++j)
+			{
+				m._data[j] = -1.0f + 2.0f * (rand() / (float)RAND_MAX);
+			}
+			source_mat1[i] = m;
+
+			for (size_t j = 0; j < 16; ++j)
+			{
+				m._data[j] = -1.0f + 2.0f * (rand() / (float)RAND_MAX);
+			}
+			source_mat2[i] = m;
+		}
+
+		double duration1 = 0.0;
+		double duration2 = 0.0;
+
+		for (size_t sample = 0; sample < number_of_samples; ++sample)
+		{
+			double start1 = timer_Time();
+			for (size_t i = 0; i < number_of_iteration; ++i)
+				dest_mat1[i] = source_mat1[i] * source_mat2[i];
+			double finish1 = timer_Time();
+
+			duration1 += GetDuration(start1, finish1);
+
+			std::reverse(source_mat1.begin(), source_mat1.end());
+			std::reverse(source_mat2.begin(), source_mat2.end());
+		}
+
+		Matrix3DActivateFastImpl();
+
+		for (size_t sample = 0; sample < number_of_samples; ++sample)
+		{
+			double start2 = timer_Time();
+			for (size_t i = 0; i < number_of_iteration; ++i)
+				dest_mat2[i] = source_mat1[i] * source_mat2[i];
+			double finish2 = timer_Time();
+
+			duration2 += GetDuration(start2, finish2);
+
+			std::reverse(source_mat1.begin(), source_mat1.end());
+			std::reverse(source_mat2.begin(), source_mat2.end());
+		}
+
+		for (size_t i = 0; i < number_of_iteration; ++i)
+			TS_ASSERT(EqualsWithEpsilon(dest_mat1[i], dest_mat2[i]));
+
+		std::cout << "SSE (off): " << duration1 << "ms" << std::endl;
+		std::cout << "SSE (on): " << duration2 << "ms" << std::endl;
+	}
+
+	// Times Blend() before and after Matrix3DActivateFastImpl() and checks that both passes agree.
+	void test_MatrixBlendPerformance()
+	{
+		// Undo Matrix3DActivateFastImpl() when this test finishes.
+		const ScopedMatrixImpl restore_impl;
+
+		const size_t number_of_samples = 200;
+		const size_t number_of_iteration = 1000000;
+
+		std::cout << std::endl;
+		std::cout << "matrix Blend" << std::endl;
+		std::cout << "number_of_samples = " << number_of_samples << std::endl;
+		std::cout << "number_of_iteration = " << number_of_iteration << std::endl;
+#if COMPILER_HAS_SSE
+		const bool have_sse = true;
+#else
+		const bool have_sse = false;
+#endif
+		std::cout << "have_sse = " << std::boolalpha << have_sse << std::endl;
+		std::vector<CMatrix3D> source_mat(number_of_iteration);
+		std::vector<float> source_float(number_of_iteration);
+		// Both passes must start from the same defined destination: Blend()
+		// leaves the fourth row alone and the comparison below reads all 16
+		// elements.
+		CMatrix3D identity;
+		identity.SetIdentity();
+		std::vector<CMatrix3D> dest_mat1(number_of_iteration, identity);
+		std::vector<CMatrix3D> dest_mat2(number_of_iteration, identity);
+
+		CMatrix3D m;
+		srand(0);
+		for (size_t i = 0; i < number_of_iteration; ++i)
+		{
+			for (size_t j = 0; j < 16; ++j)
+			{
+				m._data[j] = -1.0f + 2.0f * (rand() / (float)RAND_MAX);
+			}
+			source_mat[i] = m;
+
+			source_float[i] = -1.0f + 2.0f * (rand() / (float)RAND_MAX);
+		}
+
+		double duration1 = 0.0;
+		double duration2 = 0.0;
+
+		for (size_t sample = 0; sample < number_of_samples; ++sample)
+		{
+			double start1 = timer_Time();
+			for (size_t i = 0; i < number_of_iteration; ++i)
+				dest_mat1[i].Blend(source_mat[i], source_float[i]);
+			double finish1 = timer_Time();
+
+			duration1 += GetDuration(start1, finish1);
+
+			std::reverse(source_mat.begin(), source_mat.end());
+			std::reverse(source_float.begin(), source_float.end());
+		}
+
+		Matrix3DActivateFastImpl();
+
+		for (size_t sample = 0; sample < number_of_samples; ++sample)
+		{
+			double start2 = timer_Time();
+			for (size_t i = 0; i < number_of_iteration; ++i)
+				dest_mat2[i].Blend(source_mat[i], source_float[i]);
+			double finish2 = timer_Time();
+
+			duration2 += GetDuration(start2, finish2);
+
+			std::reverse(source_mat.begin(), source_mat.end());
+			std::reverse(source_float.begin(), source_float.end());
+		}
+
+		for (size_t i = 0; i < number_of_iteration; ++i)
+			TS_ASSERT(EqualsWithEpsilon(dest_mat1[i], dest_mat2[i]));
+
+		std::cout << "SSE (off): " << duration1 << "ms" << std::endl;
+		std::cout << "SSE (on): " << duration2 << "ms" << std::endl;
+	}
+
+	// Times AddBlend() before and after Matrix3DActivateFastImpl() and checks that both passes agree.
+	void test_MatrixAddBlendPerformance()
+	{
+		// Undo Matrix3DActivateFastImpl() when this test finishes.
+		const ScopedMatrixImpl restore_impl;
+
+		const size_t number_of_samples = 200;
+		const size_t number_of_iteration = 1000000;
+
+		std::cout << std::endl;
+		std::cout << "matrix AddBlend" << std::endl;
+		std::cout << "number_of_samples = " << number_of_samples << std::endl;
+		std::cout << "number_of_iteration = " << number_of_iteration << std::endl;
+#if COMPILER_HAS_SSE
+		const bool have_sse = true;
+#else
+		const bool have_sse = false;
+#endif
+		std::cout << "have_sse = " << std::boolalpha << have_sse << std::endl;
+		std::vector<CMatrix3D> source_mat(number_of_iteration);
+		std::vector<float> source_float(number_of_iteration);
+		// Both passes must start from the same defined destination: AddBlend()
+		// accumulates onto what is already there and the comparison below
+		// reads all 16 elements.
+		CMatrix3D identity;
+		identity.SetIdentity();
+		std::vector<CMatrix3D> dest_mat1(number_of_iteration, identity);
+		std::vector<CMatrix3D> dest_mat2(number_of_iteration, identity);
+
+		CMatrix3D m;
+		srand(0);
+		for (size_t i = 0; i < number_of_iteration; ++i)
+		{
+			for (size_t j = 0; j < 16; ++j)
+			{
+				m._data[j] = -1.0f + 2.0f * (rand() / (float)RAND_MAX);
+			}
+			source_mat[i] = m;
+
+			source_float[i] = -1.0f + 2.0f * (rand() / (float)RAND_MAX);
+		}
+
+		double duration1 = 0.0;
+		double duration2 = 0.0;
+
+		for (size_t sample = 0; sample < number_of_samples; ++sample)
+		{
+			double start1 = timer_Time();
+			for (size_t i = 0; i < number_of_iteration; ++i)
+				dest_mat1[i].AddBlend(source_mat[i], source_float[i]);
+			double finish1 = timer_Time();
+
+			duration1 += GetDuration(start1, finish1);
+
+			std::reverse(source_mat.begin(), source_mat.end());
+			std::reverse(source_float.begin(), source_float.end());
+		}
+
+		Matrix3DActivateFastImpl();
+
+		for (size_t sample = 0; sample < number_of_samples; ++sample)
+		{
+			double start2 = timer_Time();
+			for (size_t i = 0; i < number_of_iteration; ++i)
+				dest_mat2[i].AddBlend(source_mat[i], source_float[i]);
+			double finish2 = timer_Time();
+
+			duration2 += GetDuration(start2, finish2);
+
+			std::reverse(source_mat.begin(), source_mat.end());
+			std::reverse(source_float.begin(), source_float.end());
+		}
+
+		for (size_t i = 0; i < number_of_iteration; ++i)
+			TS_ASSERT(EqualsWithEpsilon(dest_mat1[i], dest_mat2[i]));
+
+		std::cout << "SSE (off): " << duration1 << "ms" << std::endl;
+		std::cout << "SSE (on): " << duration2 << "ms" << std::endl;
+	}
+
 	void test_inverse()
 	{
 		CMatrix3D m;
Index: source/ps/GameSetup/GameSetup.cpp
===================================================================
--- source/ps/GameSetup/GameSetup.cpp
+++ source/ps/GameSetup/GameSetup.cpp
@@ -1,4 +1,4 @@
-/* Copyright (C) 2020 Wildfire Games.
+/* Copyright (C) 2021 Wildfire Games.
  * This file is part of 0 A.D.
  *
  * 0 A.D. 
is free software: you can redistribute it and/or modify @@ -40,6 +40,7 @@ #include "gui/GUIManager.h" #include "i18n/L10n.h" #include "maths/MathUtil.h" +#include "maths/Matrix3D.h" #include "network/NetServer.h" #include "network/NetClient.h" #include "network/NetMessage.h" @@ -613,6 +614,7 @@ vp.m_Width = g_xres; vp.m_Height = g_yres; g_Renderer.SetViewport(vp); + Matrix3DActivateFastImpl(); ModelDefActivateFastImpl(); ColorActivateFastImpl(); ModelRenderer::Init();