Index: source/maths/Matrix3D.h
===================================================================
--- source/maths/Matrix3D.h
+++ source/maths/Matrix3D.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2019 Wildfire Games.
+/* Copyright (C) 2021 Wildfire Games.
  * This file is part of 0 A.D.
  *
  * 0 A.D. is free software: you can redistribute it and/or modify
@@ -97,30 +97,14 @@
 		return _data[idx];
 	}
 
+	static CMatrix3D(*Multiply)(const CMatrix3D& source, const CMatrix3D& matrix); // called by operator*
+	static void(*BlendMat)(CMatrix3D* s, const CMatrix3D& m, float f); // called by Blend()
+	static void(*AddBlendMat)(CMatrix3D* s, const CMatrix3D& m, float f); // called by AddBlend()
+
 	// matrix multiplication
-	CMatrix3D operator*(const CMatrix3D &matrix) const
+	CMatrix3D operator* (const CMatrix3D& matrix) const
 	{
-		return CMatrix3D(
-			_11*matrix._11 + _12*matrix._21 + _13*matrix._31 + _14*matrix._41,
-			_11*matrix._12 + _12*matrix._22 + _13*matrix._32 + _14*matrix._42,
-			_11*matrix._13 + _12*matrix._23 + _13*matrix._33 + _14*matrix._43,
-			_11*matrix._14 + _12*matrix._24 + _13*matrix._34 + _14*matrix._44,
-
-			_21*matrix._11 + _22*matrix._21 + _23*matrix._31 + _24*matrix._41,
-			_21*matrix._12 + _22*matrix._22 + _23*matrix._32 + _24*matrix._42,
-			_21*matrix._13 + _22*matrix._23 + _23*matrix._33 + _24*matrix._43,
-			_21*matrix._14 + _22*matrix._24 + _23*matrix._34 + _24*matrix._44,
-
-			_31*matrix._11 + _32*matrix._21 + _33*matrix._31 + _34*matrix._41,
-			_31*matrix._12 + _32*matrix._22 + _33*matrix._32 + _34*matrix._42,
-			_31*matrix._13 + _32*matrix._23 + _33*matrix._33 + _34*matrix._43,
-			_31*matrix._14 + _32*matrix._24 + _33*matrix._34 + _34*matrix._44,
-
-			_41*matrix._11 + _42*matrix._21 + _43*matrix._31 + _44*matrix._41,
-			_41*matrix._12 + _42*matrix._22 + _43*matrix._32 + _44*matrix._42,
-			_41*matrix._13 + _42*matrix._23 + _43*matrix._33 + _44*matrix._43,
-			_41*matrix._14 + _42*matrix._24 + _43*matrix._34 + _44*matrix._44
-		);
+		return Multiply(*this, matrix); // indirect call through the static Multiply function pointer
 	}
 
 	// matrix multiplication/assignment
@@ -196,19 +180,13 @@
 	// blend matrix using only 4x3 subset
 	void Blend(const CMatrix3D& m, float f)
 	{
-		_11 = m._11*f; _21 = m._21*f; _31 = m._31*f;
-		_12 = m._12*f; _22 = m._22*f; _32 = m._32*f;
-		_13 = m._13*f; _23 = m._23*f; _33 = m._33*f;
-		_14 = m._14*f; _24 = m._24*f; _34 = m._34*f;
+		BlendMat(this, m, f); // indirect call through the static BlendMat function pointer
 	}
 
 	// blend matrix using only 4x3 and add onto existing blend
 	void AddBlend(const CMatrix3D& m, float f)
 	{
-		_11 += m._11*f; _21 += m._21*f; _31 += m._31*f;
-		_12 += m._12*f; _22 += m._22*f; _32 += m._32*f;
-		_13 += m._13*f; _23 += m._23*f; _33 += m._33*f;
-		_14 += m._14*f; _24 += m._24*f; _34 += m._34*f;
+		AddBlendMat(this, m, f); // indirect call through the static AddBlendMat function pointer
 	}
 
 	// set this matrix to a rotation matrix for a rotation about X axis of given angle
@@ -322,4 +300,6 @@
 	CVector3D RotateTransposed(const CVector3D& vector) const;
 };
 
+extern void Matrix3DActivateFastImpl(); // points Multiply/BlendMat/AddBlendMat at the SSE versions when available
+
 #endif // INCLUDED_MATRIX3D
Index: source/maths/Matrix3D.cpp
===================================================================
--- source/maths/Matrix3D.cpp
+++ source/maths/Matrix3D.cpp
@@ -1,4 +1,4 @@
-/* Copyright (C) 2019 Wildfire Games.
+/* Copyright (C) 2021 Wildfire Games.
  * This file is part of 0 A.D.
  *
  * 0 A.D. is free software: you can redistribute it and/or modify
@@ -22,10 +22,16 @@
 
 #include "precompiled.h"
 
+#include "lib/sysdep/compiler.h"
 #include "Matrix3D.h"
 #include "Quaternion.h"
 #include "Vector4D.h"
 
+#if COMPILER_HAS_SSE
+# include "lib/sse.h"
+# include <xmmintrin.h>
+#endif
+
 //Sets the identity matrix
 void CMatrix3D::SetIdentity ()
 {
@@ -442,3 +448,154 @@
 	// Negate the return angle to match the SetYRotation convention
 	return -atan2(axis.Z, axis.X);
 }
+
+#if COMPILER_HAS_SSE
+
+// SSE implementation of the 4x4 product source * matrix; installed as
+// CMatrix3D::Multiply by Matrix3DActivateFastImpl(). Each _data2d row of the
+// result is a weighted sum of the four _data2d rows of `source`; the weights
+// are the entries of `matrix` named in the corresponding store below.
+CMatrix3D MultiplicationSSE(const CMatrix3D& source, const CMatrix3D& matrix)
+{
+	CMatrix3D product;
+
+	// Unaligned loads, so no alignment requirement is placed on CMatrix3D.
+	const __m128 src0 = _mm_loadu_ps(source._data2d[0]);
+	const __m128 src1 = _mm_loadu_ps(source._data2d[1]);
+	const __m128 src2 = _mm_loadu_ps(source._data2d[2]);
+	const __m128 src3 = _mm_loadu_ps(source._data2d[3]);
+
+	// Computes w0*src0 + w1*src1 + w2*src2 + w3*src3. The terms are accumulated
+	// in exactly this order so the rounding matches a straight-line version.
+	const auto weightedSum = [&](float w0, float w1, float w2, float w3)
+	{
+		// Broadcast each scalar weight to all four lanes before multiplying.
+		__m128 acc = _mm_mul_ps(_mm_set_ps1(w0), src0);
+		acc = _mm_add_ps(_mm_mul_ps(_mm_set_ps1(w1), src1), acc);
+		acc = _mm_add_ps(_mm_mul_ps(_mm_set_ps1(w2), src2), acc);
+		acc = _mm_add_ps(_mm_mul_ps(_mm_set_ps1(w3), src3), acc);
+		return acc;
+	};
+
+	// Weights _11, _21, _31, _41 produce _data2d[0] of the result.
+	_mm_storeu_ps(
+		product._data2d[0],
+		weightedSum(matrix._11, matrix._21, matrix._31, matrix._41));
+
+	// Weights _12, _22, _32, _42 produce _data2d[1] of the result.
+	_mm_storeu_ps(
+		product._data2d[1],
+		weightedSum(matrix._12, matrix._22, matrix._32, matrix._42));
+
+	// Weights _13, _23, _33, _43 produce _data2d[2] of the result.
+	_mm_storeu_ps(
+		product._data2d[2],
+		weightedSum(matrix._13, matrix._23, matrix._33, matrix._43));
+
+	// Weights _14, _24, _34, _44 produce _data2d[3] of the result.
+	_mm_storeu_ps(
+		product._data2d[3],
+		weightedSum(matrix._14, matrix._24, matrix._34, matrix._44));
+	return product;
+}
+
+// SSE implementation of AddBlend (installed as CMatrix3D::AddBlendMat): adds
+// m * f onto lanes 0-2 of every _data2d row of *s. Lane 3 is copied through
+// bit-for-bit, like the scalar version which never touches those entries; a
+// multiply-by-zero there would let a NaN/Inf in m leak into *s.
+void AddBlendSSE(CMatrix3D* s, const CMatrix3D& m, float f)
+{
+	const __m128 fvec = _mm_set_ps1(f);
+	// All bits set in lane 3 only; built with a compare so plain SSE suffices.
+	const __m128 keep = _mm_cmpneq_ps(_mm_set_ps(1, 0, 0, 0), _mm_setzero_ps());
+
+	for (int i = 0; i < 4; ++i)
+	{
+		const __m128 col = _mm_loadu_ps(s->_data2d[i]);
+		const __m128 sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(m._data2d[i]), fvec), col);
+		// Select: lane 3 from the old value, lanes 0-2 from the blended sum.
+		const __m128 out = _mm_or_ps(_mm_and_ps(keep, col), _mm_andnot_ps(keep, sum));
+		_mm_storeu_ps(s->_data2d[i], out);
+	}
+}
+
+// SSE implementation of Blend (installed as CMatrix3D::BlendMat): overwrites
+// lanes 0-2 of every _data2d row of *s with m * f and keeps lane 3 of *s.
+// The lanes are combined with bit masks instead of multiplying by 0/1: the
+// old contents of *s are being discarded and may be NaN/Inf (e.g. a matrix
+// that was never given a value), and NaN * 0 is NaN rather than 0.
+void BlendSSE(CMatrix3D* s, const CMatrix3D& m, float f)
+{
+	const __m128 fvec = _mm_set_ps1(f);
+	// All bits set in lane 3 only; built with a compare so plain SSE suffices.
+	const __m128 keep = _mm_cmpneq_ps(_mm_set_ps(1, 0, 0, 0), _mm_setzero_ps());
+
+	for (int i = 0; i < 4; ++i)
+	{
+		const __m128 col = _mm_loadu_ps(s->_data2d[i]);
+		const __m128 scaled = _mm_mul_ps(_mm_loadu_ps(m._data2d[i]), fvec);
+		// Select: lane 3 from the old value, lanes 0-2 from the scaled input.
+		const __m128 out = _mm_or_ps(_mm_and_ps(keep, col), _mm_andnot_ps(keep, scaled));
+		_mm_storeu_ps(s->_data2d[i], out);
+	}
+}
+#endif
+
+// Scalar implementation of CMatrix3D::Multiply; used until Matrix3DActivateFastImpl() replaces it.
+CMatrix3D MultiplicationFallback(const CMatrix3D& source, const CMatrix3D& matrix)
+{
+	const CMatrix3D& a = source;
+	const CMatrix3D& b = matrix;
+	return CMatrix3D(
+		a._11 * b._11 + a._12 * b._21 + a._13 * b._31 + a._14 * b._41,
+		a._11 * b._12 + a._12 * b._22 + a._13 * b._32 + a._14 * b._42,
+		a._11 * b._13 + a._12 * b._23 + a._13 * b._33 + a._14 * b._43,
+		a._11 * b._14 + a._12 * b._24 + a._13 * b._34 + a._14 * b._44,
+		a._21 * b._11 + a._22 * b._21 + a._23 * b._31 + a._24 * b._41,
+		a._21 * b._12 + a._22 * b._22 + a._23 * b._32 + a._24 * b._42,
+		a._21 * b._13 + a._22 * b._23 + a._23 * b._33 + a._24 * b._43,
+		a._21 * b._14 + a._22 * b._24 + a._23 * b._34 + a._24 * b._44,
+		a._31 * b._11 + a._32 * b._21 + a._33 * b._31 + a._34 * b._41,
+		a._31 * b._12 + a._32 * b._22 + a._33 * b._32 + a._34 * b._42,
+		a._31 * b._13 + a._32 * b._23 + a._33 * b._33 + a._34 * b._43,
+		a._31 * b._14 + a._32 * b._24 + a._33 * b._34 + a._34 * b._44,
+		a._41 * b._11 + a._42 * b._21 + a._43 * b._31 + a._44 * b._41,
+		a._41 * b._12 + a._42 * b._22 + a._43 * b._32 + a._44 * b._42,
+		a._41 * b._13 + a._42 * b._23 + a._43 * b._33 + a._44 * b._43,
+		a._41 * b._14 + a._42 * b._24 + a._43 * b._34 + a._44 * b._44
+	);
+}
+
+void BlendFallback(CMatrix3D* s, const CMatrix3D& m, float f)
+{
+	// Scalar Blend: sets the _1x, _2x and _3x entries of *s to m * f; the _4x entries are not written.
+	s->_11 = m._11 * f; s->_12 = m._12 * f; s->_13 = m._13 * f; s->_14 = m._14 * f;
+	s->_21 = m._21 * f; s->_22 = m._22 * f; s->_23 = m._23 * f; s->_24 = m._24 * f;
+	s->_31 = m._31 * f; s->_32 = m._32 * f; s->_33 = m._33 * f; s->_34 = m._34 * f;
+}
+
+void AddBlendFallback(CMatrix3D* s, const CMatrix3D& m, float f)
+{
+	// Scalar AddBlend: adds m * f onto the _1x, _2x and _3x entries of *s; the _4x entries are not written.
+	s->_11 += m._11 * f; s->_12 += m._12 * f; s->_13 += m._13 * f; s->_14 += m._14 * f;
+	s->_21 += m._21 * f; s->_22 += m._22 * f; s->_23 += m._23 * f; s->_24 += m._24 * f;
+	s->_31 += m._31 * f; s->_32 += m._32 * f; s->_33 += m._33 * f; s->_34 += m._34 * f;
+}
+
+// Default targets of the CMatrix3D hooks: the scalar fallbacks, until Matrix3DActivateFastImpl() replaces them.
+CMatrix3D(*CMatrix3D::Multiply)(const CMatrix3D& source, const CMatrix3D& matrix) = MultiplicationFallback;
+void(*CMatrix3D::BlendMat)(CMatrix3D* s, const CMatrix3D& m, float f) = BlendFallback;
+void(*CMatrix3D::AddBlendMat)(CMatrix3D* s, const CMatrix3D& m, float f) = AddBlendFallback;
+
+// Installs the SSE hooks if COMPILER_HAS_SSE is set and HostHasSSE() agrees; otherwise changes nothing.
+void Matrix3DActivateFastImpl()
+{
+#if COMPILER_HAS_SSE
+	if (!HostHasSSE())
+		return;
+
+	CMatrix3D::Multiply = MultiplicationSSE;
+	CMatrix3D::BlendMat = BlendSSE;
+	CMatrix3D::AddBlendMat = AddBlendSSE;
+#endif
+}
Index: source/maths/tests/test_Matrix3d.h
===================================================================
--- source/maths/tests/test_Matrix3d.h
+++ source/maths/tests/test_Matrix3d.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011 Wildfire Games.
+/* Copyright (C) 2021 Wildfire Games.
  * This file is part of 0 A.D.
  *
  * 0 A.D. is free software: you can redistribute it and/or modify
@@ -21,10 +21,262 @@
 #include <cmath>
 #include "maths/Matrix3D.h"
 #include "maths/Quaternion.h"
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <vector>
+
+namespace
+{
+	// True when all 16 named entries of m1 and m2 agree within an absolute tolerance of 0.01.
+	bool EqualsWithEpsilon(const CMatrix3D& m1, const CMatrix3D& m2)
+	{
+		const float EPS = 0.01f;
+		const auto same = [EPS](float lhs, float rhs)
+		{
+			return std::abs(lhs - rhs) <= EPS;
+		};
+
+		if (!same(m1._11, m2._11) || !same(m1._12, m2._12) || !same(m1._13, m2._13) || !same(m1._14, m2._14))
+			return false;
+		if (!same(m1._21, m2._21) || !same(m1._22, m2._22) || !same(m1._23, m2._23) || !same(m1._24, m2._24))
+			return false;
+		if (!same(m1._31, m2._31) || !same(m1._32, m2._32) || !same(m1._33, m2._33) || !same(m1._34, m2._34))
+			return false;
+		return
+			same(m1._41, m2._41) && same(m1._42, m2._42) &&
+			same(m1._43, m2._43) && same(m1._44, m2._44);
+	}
+
+	// Difference finish - start, in whatever unit the two timestamps use.
+	double GetDuration(double start, double finish)
+	{
+		const double elapsed = finish - start;
+		return elapsed;
+	}
+}
 
 class TestMatrix : public CxxTest::TestSuite
 {
 public:
+
+	// Times CMatrix3D::operator* with the scalar implementation and again after
+	// Matrix3DActivateFastImpl(), then checks that both runs gave matching products.
+	void test_MatrixMultiplicationPerformance()
+	{
+		const size_t number_of_samples = 200;
+		const size_t number_of_iteration = 1000000;
+
+		std::cout << std::endl;
+		std::cout << "matrix multiplication" << std::endl;
+		std::cout << "number_of_samples = " << number_of_samples << std::endl;
+		std::cout << "number_of_iteration = " << number_of_iteration << std::endl;
+		// Matrix3D.cpp tests this macro by value (#if); #ifdef would also be true for a 0 definition.
+#if COMPILER_HAS_SSE
+		const bool have_sse = true;
+#else
+		const bool have_sse = false;
+#endif
+		std::cout << "have_sse = " << std::boolalpha << have_sse << std::endl;
+
+		std::vector<CMatrix3D> source_mat1(number_of_iteration);
+		std::vector<CMatrix3D> source_mat2(number_of_iteration);
+		std::vector<CMatrix3D> dest_mat1(number_of_iteration);
+		std::vector<CMatrix3D> dest_mat2(number_of_iteration);
+
+		srand(0);
+		const auto randomMatrix = []
+		{
+			CMatrix3D m;
+			for (size_t j = 0; j < 16; ++j)
+				m._data[j] = -1.0f + 2.0f * (rand() / static_cast<float>(RAND_MAX));
+			return m;
+		};
+		for (size_t i = 0; i < number_of_iteration; ++i)
+		{
+			source_mat1[i] = randomMatrix();
+			source_mat2[i] = randomMatrix();
+		}
+
+		// One timed run: number_of_samples passes over all pairs, returning the
+		// summed duration. The inputs are reversed after every pass; as the number
+		// of passes is even, they are back in their initial order afterwards.
+		const auto timeProducts = [&](std::vector<CMatrix3D>& dest)
+		{
+			double total = 0.0;
+			for (size_t sample = 0; sample < number_of_samples; ++sample)
+			{
+				const double start = timer_Time();
+				for (size_t i = 0; i < number_of_iteration; ++i)
+					dest[i] = source_mat1[i] * source_mat2[i];
+				total += GetDuration(start, timer_Time());
+
+				std::reverse(source_mat1.begin(), source_mat1.end());
+				std::reverse(source_mat2.begin(), source_mat2.end());
+			}
+			return total;
+		};
+
+		// The hooks are process-wide and stay switched once any test has activated
+		// them, so select the scalar version explicitly for the baseline. It is
+		// defined in Matrix3D.cpp with external linkage but not declared in the header.
+		CMatrix3D MultiplicationFallback(const CMatrix3D& source, const CMatrix3D& matrix);
+		CMatrix3D::Multiply = MultiplicationFallback;
+		const double duration1 = timeProducts(dest_mat1);
+
+		Matrix3DActivateFastImpl();
+		const double duration2 = timeProducts(dest_mat2);
+
+		for (size_t i = 0; i < number_of_iteration; ++i)
+			TS_ASSERT(EqualsWithEpsilon(dest_mat1[i], dest_mat2[i]));
+
+		// NOTE(review): the unit of timer_Time() is not visible here - confirm the "ms" label.
+		std::cout << "SSE (off): " << duration1 << "ms" << std::endl;
+		std::cout << "SSE (on):  " << duration2 << "ms" << std::endl;
+	}
+
+	// Times CMatrix3D::Blend with the scalar and the activated implementation and compares the results.
+	void test_MatrixBlendPerformance()
+	{
+		const size_t number_of_samples = 200;
+		const size_t number_of_iteration = 1000000;
+
+		std::cout << std::endl;
+		std::cout << "matrix Blend" << std::endl;
+		std::cout << "number_of_samples = " << number_of_samples << std::endl;
+		std::cout << "number_of_iteration = " << number_of_iteration << std::endl;
+		// Matrix3D.cpp tests this macro by value (#if); #ifdef would also be true for a 0 definition.
+#if COMPILER_HAS_SSE
+		const bool have_sse = true;
+#else
+		const bool have_sse = false;
+#endif
+		std::cout << "have_sse = " << std::boolalpha << have_sse << std::endl;
+
+		// Blend() leaves part of the destination unwritten and the comparison below
+		// reads all 16 entries, so give every destination a defined starting value.
+		CMatrix3D identity;
+		identity.SetIdentity();
+		std::vector<CMatrix3D> source_mat(number_of_iteration);
+		std::vector<float> source_float(number_of_iteration);
+		std::vector<CMatrix3D> dest_mat1(number_of_iteration, identity);
+		std::vector<CMatrix3D> dest_mat2(number_of_iteration, identity);
+
+		srand(0);
+		for (size_t i = 0; i < number_of_iteration; ++i)
+		{
+			for (size_t j = 0; j < 16; ++j)
+				source_mat[i]._data[j] = -1.0f + 2.0f * (rand() / static_cast<float>(RAND_MAX));
+			source_float[i] = -1.0f + 2.0f * (rand() / static_cast<float>(RAND_MAX));
+		}
+
+		// One timed run: number_of_samples passes, returning the summed duration. The inputs
+		// are reversed after every pass; an even pass count restores their initial order.
+		const auto timeBlends = [&](std::vector<CMatrix3D>& dest)
+		{
+			double total = 0.0;
+			for (size_t sample = 0; sample < number_of_samples; ++sample)
+			{
+				const double start = timer_Time();
+				for (size_t i = 0; i < number_of_iteration; ++i)
+					dest[i].Blend(source_mat[i], source_float[i]);
+				total += GetDuration(start, timer_Time());
+
+				std::reverse(source_mat.begin(), source_mat.end());
+				std::reverse(source_float.begin(), source_float.end());
+			}
+			return total;
+		};
+
+		// The hooks are process-wide and stay switched once any test has activated
+		// them, so select the scalar version explicitly for the baseline. It is
+		// defined in Matrix3D.cpp with external linkage but not declared in the header.
+		void BlendFallback(CMatrix3D* s, const CMatrix3D& m, float f);
+		CMatrix3D::BlendMat = BlendFallback;
+		const double duration1 = timeBlends(dest_mat1);
+
+		Matrix3DActivateFastImpl();
+		const double duration2 = timeBlends(dest_mat2);
+
+		for (size_t i = 0; i < number_of_iteration; ++i)
+			TS_ASSERT(EqualsWithEpsilon(dest_mat1[i], dest_mat2[i]));
+
+		// NOTE(review): the unit of timer_Time() is not visible here - confirm the "ms" label.
+		std::cout << "SSE (off): " << duration1 << "ms" << std::endl;
+		std::cout << "SSE (on):  " << duration2 << "ms" << std::endl;
+	}
+
+	// Times CMatrix3D::AddBlend with the scalar and the activated implementation and compares the results.
+	void test_MatrixAddBlendPerformance()
+	{
+		const size_t number_of_samples = 200;
+		const size_t number_of_iteration = 1000000;
+
+		std::cout << std::endl;
+		std::cout << "matrix AddBlend" << std::endl;
+		std::cout << "number_of_samples = " << number_of_samples << std::endl;
+		std::cout << "number_of_iteration = " << number_of_iteration << std::endl;
+		// Matrix3D.cpp tests this macro by value (#if); #ifdef would also be true for a 0 definition.
+#if COMPILER_HAS_SSE
+		const bool have_sse = true;
+#else
+		const bool have_sse = false;
+#endif
+		std::cout << "have_sse = " << std::boolalpha << have_sse << std::endl;
+
+		// AddBlend() accumulates onto the destination and the comparison below
+		// reads all 16 entries, so give every destination a defined starting value.
+		CMatrix3D identity;
+		identity.SetIdentity();
+		std::vector<CMatrix3D> source_mat(number_of_iteration);
+		std::vector<float> source_float(number_of_iteration);
+		std::vector<CMatrix3D> dest_mat1(number_of_iteration, identity);
+		std::vector<CMatrix3D> dest_mat2(number_of_iteration, identity);
+
+		srand(0);
+		for (size_t i = 0; i < number_of_iteration; ++i)
+		{
+			for (size_t j = 0; j < 16; ++j)
+				source_mat[i]._data[j] = -1.0f + 2.0f * (rand() / static_cast<float>(RAND_MAX));
+			source_float[i] = -1.0f + 2.0f * (rand() / static_cast<float>(RAND_MAX));
+		}
+
+		// One timed run: number_of_samples passes, returning the summed duration. The inputs
+		// are reversed after every pass; an even pass count restores their initial order.
+		const auto timeAddBlends = [&](std::vector<CMatrix3D>& dest)
+		{
+			double total = 0.0;
+			for (size_t sample = 0; sample < number_of_samples; ++sample)
+			{
+				const double start = timer_Time();
+				for (size_t i = 0; i < number_of_iteration; ++i)
+					dest[i].AddBlend(source_mat[i], source_float[i]);
+				total += GetDuration(start, timer_Time());
+
+				std::reverse(source_mat.begin(), source_mat.end());
+				std::reverse(source_float.begin(), source_float.end());
+			}
+			return total;
+		};
+
+		// The hooks are process-wide and stay switched once any test has activated
+		// them, so select the scalar version explicitly for the baseline. It is
+		// defined in Matrix3D.cpp with external linkage but not declared in the header.
+		void AddBlendFallback(CMatrix3D* s, const CMatrix3D& m, float f);
+		CMatrix3D::AddBlendMat = AddBlendFallback;
+		const double duration1 = timeAddBlends(dest_mat1);
+
+		Matrix3DActivateFastImpl();
+		const double duration2 = timeAddBlends(dest_mat2);
+
+		for (size_t i = 0; i < number_of_iteration; ++i)
+			TS_ASSERT(EqualsWithEpsilon(dest_mat1[i], dest_mat2[i]));
+
+		// NOTE(review): the unit of timer_Time() is not visible here - confirm the "ms" label.
+		std::cout << "SSE (off): " << duration1 << "ms" << std::endl;
+		std::cout << "SSE (on):  " << duration2 << "ms" << std::endl;
+	}
+
 	void test_inverse()
 	{
 		CMatrix3D m;
Index: source/ps/GameSetup/GameSetup.cpp
===================================================================
--- source/ps/GameSetup/GameSetup.cpp
+++ source/ps/GameSetup/GameSetup.cpp
@@ -1,4 +1,4 @@
-/* Copyright (C) 2020 Wildfire Games.
+/* Copyright (C) 2021 Wildfire Games.
  * This file is part of 0 A.D.
  *
  * 0 A.D. is free software: you can redistribute it and/or modify
@@ -40,6 +40,7 @@
 #include "gui/GUIManager.h"
 #include "i18n/L10n.h"
 #include "maths/MathUtil.h"
+#include "maths/Matrix3D.h"
 #include "network/NetServer.h"
 #include "network/NetClient.h"
 #include "network/NetMessage.h"
@@ -613,6 +614,7 @@
 	vp.m_Width = g_xres;
 	vp.m_Height = g_yres;
 	g_Renderer.SetViewport(vp);
+	Matrix3DActivateFastImpl();
 	ModelDefActivateFastImpl();
 	ColorActivateFastImpl();
 	ModelRenderer::Init();