Index: binaries/data/config/default.cfg
===================================================================
--- binaries/data/config/default.cfg
+++ binaries/data/config/default.cfg
@@ -404,6 +404,9 @@
 zoom.in = 5
 zoom.out = 4
 
+[multithreading]
+pathfinder = 0                      ; How many threads to use for pathfinding. Special values: 0 chooses automatically, 1 de-activates threading entirely.
+
 [chat]
 timestamp = true                    ; Show at which time chat messages have been sent
 
Index: binaries/data/mods/public/gui/credits/texts/programming.json
===================================================================
--- binaries/data/mods/public/gui/credits/texts/programming.json
+++ binaries/data/mods/public/gui/credits/texts/programming.json
@@ -124,6 +124,7 @@
 			{"nick": "kabzerek", "name": "Grzegorz Kabza"},
 			{"nick": "Kai", "name": "Kai Chen"},
 			{"name": "Kareem Ergawy"},
+			{"nick": "Kuba386", "name":"Jakub Kośmicki"},
 			{"nick": "kevmo", "name": "Kevin Caffrey"},
 			{"nick": "kezz", "name": "Graeme Kerry"},
 			{"nick": "kingadami", "name": "Adam Winsor"},
Index: binaries/data/mods/public/gui/options/options.json
===================================================================
--- binaries/data/mods/public/gui/options/options.json
+++ binaries/data/mods/public/gui/options/options.json
@@ -81,7 +81,16 @@
 				"label": "Chat Timestamp",
 				"tooltip": "Show time that messages are posted in the lobby, gamesetup and ingame chat.",
 				"config": "chat.timestamp"
+			},
+			{
+				"type": "number",
+				"label": "Number of pathfinder threads",
+				"tooltip": "Number of pathfinder worker threads. Use 0 to choose automatically and 1 to disable threading altogether.",
+				"config": "pathfinder.threads",
+				"min": 0,
+				"max": 64
 			}
+
 		]
 	},
 	{
Index: source/ps/ThreadUtil.h
===================================================================
--- source/ps/ThreadUtil.h
+++ source/ps/ThreadUtil.h
@@ -20,6 +20,10 @@
 
 #include "lib/posix/posix_pthread.h"
 
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+
 #ifdef DEBUG_LOCKS
 
 #define LOCK_MUTEX(_mutex) STMT( \
@@ -96,6 +100,54 @@
 	pthread_mutex_t* m_Mutex;
 };
 
+/*
+ * A Frontier is similar to a Barrier in that it synchronizes n threads.
+ * A frontier has one thread waiting for n other threads to go through the Frontier.
+ */
+class Frontier
+{
+private:
+	std::mutex m_Mutex;
+	std::condition_variable m_ConditionVariable;
+	int m_Expecting;
+	int m_Count;
+public:
+	Frontier() : m_Expecting(0), m_Count(0) {};
+
+	void Setup(int expect)
+	{
+		ENSURE(m_Expecting == 0 && m_Count == 0);
+		std::lock_guard<std::mutex> lock(m_Mutex);
+		m_Expecting = expect;
+		// The frontier is open, call Reset() to close it.
+		m_Count = m_Expecting;
+	}
+
+	void Reset()
+	{
+		m_Count = 0;
+	}
+
+	void Watch()
+	{
+		std::unique_lock<std::mutex> lock(m_Mutex);
+		// If all threads have already gone through the frontier, we can stop watching right away.
+		if (m_Count == m_Expecting)
+			return;
+		m_ConditionVariable.wait(lock, [this] { return m_Count == m_Expecting; });
+	}
+
+	void GoThrough()
+	{
+		// Acquire the lock: we must be sure that the watching thread is either not yet in Watch()
+		// or is fully in the waiting state. Without this mutex lock, we could notify when the watching thread
+		// is in wait() but not yet in the waiting state, thus deadlocking.
+		std::lock_guard<std::mutex> lock(m_Mutex);
+		// Notify the watching thread if we are the last to go through.
+		if (++m_Count == m_Expecting)
+			m_ConditionVariable.notify_one();
+	}
+};
 
 namespace ThreadUtil
 {
@@ -112,6 +164,10 @@
  */
 void SetMainThread();
 
+/**
+ * Returns the number of threads we want for the pathfinder.
+ */
+u32 GetNumberOfPathfindingThreads();
 }
 
 #endif // INCLUDED_THREADUTIL
Index: source/ps/ThreadUtil.cpp
===================================================================
--- source/ps/ThreadUtil.cpp
+++ source/ps/ThreadUtil.cpp
@@ -17,7 +17,11 @@
 
 #include "precompiled.h"
 
+#include <thread>
+
 #include "ThreadUtil.h"
+#include "ConfigDB.h"
+#include "tools/atlas/GameInterface/GameLoop.h"
 
 static bool g_MainThreadSet;
 static pthread_t g_MainThread;
@@ -37,3 +41,22 @@
 	g_MainThread = pthread_self();
 	g_MainThreadSet = true;
 }
+
+u32 ThreadUtil::GetNumberOfPathfindingThreads()
+{
+	// TODO BEFORE COMMIT ID SAY: atlas threading is de-activated because the vertex pathfinder uses the obstruction manager's obstructions.
+	// this can be changed in-betweenturns in Atlas. We should probably mutex it in the pathfinder to make sure it's not changed.
+	if ((g_AtlasGameLoop && g_AtlasGameLoop->running))
+		return 1;
+
+	u32 wantedThreads = 0;
+
+	if (CConfigDB::IsInitialised())
+		CFG_GET_VAL("multithreading.pathfinder", wantedThreads);
+
+	// By default use 2 * (# of cores - 1) cores to benefit from hardware load-balancing as ours is very simple.
+	if (wantedThreads == 0)
+		return (std::thread::hardware_concurrency() - 1) * 2;
+
+	return wantedThreads;
+}
Index: source/simulation2/Simulation2.cpp
===================================================================
--- source/simulation2/Simulation2.cpp
+++ source/simulation2/Simulation2.cpp
@@ -540,8 +540,8 @@
 	CmpPtr<ICmpPathfinder> cmpPathfinder(simContext, SYSTEM_ENTITY);
 	if (cmpPathfinder)
 	{
+		cmpPathfinder->FetchAsyncResultsAndSendMessages();
 		cmpPathfinder->UpdateGrid();
-		cmpPathfinder->FinishAsyncRequests();
 	}
 
 	// Push AI commands onto the queue before we use them
@@ -555,14 +555,17 @@
 
 	// Process newly generated move commands so the UI feels snappy
 	if (cmpPathfinder)
-		cmpPathfinder->ProcessSameTurnMoves();
-
+	{
+		cmpPathfinder->StartProcessingMoves(true);
+		cmpPathfinder->FetchAsyncResultsAndSendMessages();
+	}
 	// Send all the update phases
 	{
 		PROFILE2("Sim - Update");
 		CMessageUpdate msgUpdate(turnLengthFixed);
 		componentManager.BroadcastMessage(msgUpdate);
 	}
+
 	{
 		CMessageUpdate_MotionFormation msgUpdate(turnLengthFixed);
 		componentManager.BroadcastMessage(msgUpdate);
@@ -570,7 +573,10 @@
 
 	// Process move commands for formations (group proxy)
 	if (cmpPathfinder)
-		cmpPathfinder->ProcessSameTurnMoves();
+	{
+		cmpPathfinder->StartProcessingMoves(true);
+		cmpPathfinder->FetchAsyncResultsAndSendMessages();
+	}
 
 	{
 		PROFILE2("Sim - Motion Unit");
@@ -583,12 +589,12 @@
 		componentManager.BroadcastMessage(msgUpdate);
 	}
 
-	// Process moves resulting from group proxy movement (unit needs to catch up or realign) and any others
-	if (cmpPathfinder)
-		cmpPathfinder->ProcessSameTurnMoves();
-
 	// Clean up any entities destroyed during the simulation update
 	componentManager.FlushDestroyedComponents();
+
+	// Process all remaining moves
+	if (cmpPathfinder)
+		cmpPathfinder->StartProcessingMoves();
 }
 
 void CSimulation2Impl::Interpolate(float simFrameLength, float frameOffset, float realFrameLength)
Index: source/simulation2/components/CCmpPathfinder.cpp
===================================================================
--- source/simulation2/components/CCmpPathfinder.cpp
+++ source/simulation2/components/CCmpPathfinder.cpp
@@ -51,10 +51,6 @@
 
 	m_AtlasOverlay = NULL;
 
-	m_SameTurnMovesCount = 0;
-
-	m_VertexPathfinder = std::unique_ptr<VertexPathfinder>(new VertexPathfinder(m_MapSize, m_TerrainOnlyGrid));
-
 	// Register Relax NG validator
 	CXeromyces::AddValidator(g_VFS, "pathfinder", "simulation/data/pathfinder.rng");
 
@@ -64,21 +60,6 @@
 	CParamNode externalParamNode;
 	CParamNode::LoadXML(externalParamNode, L"simulation/data/pathfinder.xml", "pathfinder");
 
-    // Previously all move commands during a turn were
-    // queued up and processed asynchronously at the start
-    // of the next turn.  Now we are processing queued up
-    // events several times duing the turn.  This improves
-    // responsiveness and units move more smoothly especially.
-    // when in formation.  There is still a call at the
-    // beginning of a turn to process all outstanding moves -
-    // this will handle any moves above the MaxSameTurnMoves
-    // threshold.
-    //
-    // TODO - The moves processed at the beginning of the
-    // turn do not count against the maximum moves per turn
-    // currently.  The thinking is that this will eventually
-    // happen in another thread.  Either way this probably
-    // will require some adjustment and rethinking.
 	const CParamNode pathingSettings = externalParamNode.GetChild("Pathfinder");
 	m_MaxSameTurnMoves = (u16)pathingSettings.GetChild("MaxSameTurnMoves").ToInt();
 
@@ -92,10 +73,35 @@
 		m_PassClasses.push_back(PathfinderPassability(mask, it->second));
 		m_PassClassMasks[name] = mask;
 	}
-}
+
+
+	u32 wantedThreads = ThreadUtil::GetNumberOfPathfindingThreads();
+
+	LOGMESSAGE("Initialising %i threads for pathfinding.", wantedThreads);
+
+	// The worker thread will only call std::thread if we actually have > 1 threads, otherwise we're running in the main thread.
+	if (wantedThreads <= 1) // <= 1 as the above computations returns 0 for one core.
+	{
+		m_UseThreading = false;
+		m_Workers.emplace_back(new PathfinderWorker(*this, m_Workers.size()));
+	}
+	else
+	{
+		m_PathfinderFrontier.Setup(wantedThreads);
+		m_UseThreading = true;
+		for (size_t i = 0; i < wantedThreads; ++i)
+			m_Workers.emplace_back(new PathfinderWorker(*this, m_Workers.size()));
+	}
+};
 
 void CCmpPathfinder::Deinit()
 {
+	for (std::unique_ptr<PathfinderWorker>& worker : m_Workers)
+		worker->PrepareToKill();
+	m_PathfinderConditionVariable.notify_all();
+
+	m_Workers.clear();
+
 	SetDebugOverlay(false); // cleans up memory
 	SAFE_DELETE(m_AtlasOverlay);
 
@@ -141,7 +147,6 @@
 	SerializeVector<SerializeLongRequest>()(serialize, "long requests", m_LongPathRequests);
 	SerializeVector<SerializeShortRequest>()(serialize, "short requests", m_ShortPathRequests);
 	serialize.NumberU32_Unbounded("next ticket", m_NextAsyncTicket);
-	serialize.NumberU16_Unbounded("same turn moves count", m_SameTurnMovesCount);
 	serialize.NumberU16_Unbounded("map size", m_MapSize);
 }
 
@@ -176,15 +181,13 @@
 		m_TerrainDirty = true;
 		UpdateGrid();
 		break;
-	case MT_TurnStart:
-		m_SameTurnMovesCount = 0;
-		break;
 	}
 }
 
 void CCmpPathfinder::RenderSubmit(SceneCollector& collector)
 {
-	m_VertexPathfinder->RenderSubmit(collector);
+	for (std::unique_ptr<PathfinderWorker>& worker : m_Workers)
+		worker->m_VertexPathfinder.RenderSubmit(collector);
 	m_LongPathfinder.HierarchicalRenderSubmit(collector);
 }
 
@@ -671,7 +674,96 @@
 
 //////////////////////////////////////////////////////////
 
-// Async path requests:
+// Async pathfinder workers
+
+CCmpPathfinder::PathfinderWorker::PathfinderWorker(const CCmpPathfinder& pathfinder, size_t index)
+	: m_Pathfinder(pathfinder), m_VertexPathfinder(m_Pathfinder.m_MapSize, m_Pathfinder.m_TerrainOnlyGrid)
+{
+	m_Computing = false;
+	m_Kill = false;
+	if (m_Pathfinder.m_UseThreading)
+		m_Thread = std::unique_ptr<std::thread>(new std::thread(&CCmpPathfinder::PathfinderWorker::InitThread, this, index));
+}
+
+CCmpPathfinder::PathfinderWorker::~PathfinderWorker()
+{
+	if (m_Thread)
+	{
+		if (m_Thread->joinable())
+			m_Thread->join();
+	}
+}
+
+void CCmpPathfinder::PathfinderWorker::InitThread(size_t index)
+{
+	g_Profiler2.RegisterCurrentThread("Pathfinder thread " + std::to_string(index));
+	WaitForWork();
+}
+
+template<typename T>
+void CCmpPathfinder::PathfinderWorker::PushRequests(std::vector<T>&, ssize_t)
+{
+	static_assert(sizeof(T) == 0, "Only specializations can be used");
+}
+
+template<> void CCmpPathfinder::PathfinderWorker::PushRequests(std::vector<LongPathRequest>& from, ssize_t amount)
+{
+	m_LongRequests.insert(m_LongRequests.end(), std::make_move_iterator(from.end() - amount), std::make_move_iterator(from.end()));
+}
+
+template<> void CCmpPathfinder::PathfinderWorker::PushRequests(std::vector<ShortPathRequest>& from, ssize_t amount)
+{
+	m_ShortRequests.insert(m_ShortRequests.end(), std::make_move_iterator(from.end() - amount), std::make_move_iterator(from.end()));
+}
+
+void CCmpPathfinder::PathfinderWorker::PrepareToKill()
+{
+	m_Kill = true;
+}
+
+void CCmpPathfinder::PathfinderWorker::WaitForWork()
+{
+	while (true)
+	{
+		{
+			std::unique_lock<std::mutex> lock(m_Pathfinder.m_PathfinderMutex);
+			m_Pathfinder.m_PathfinderConditionVariable.wait(lock, [this] { return m_Computing || m_Kill; });
+		}
+
+		if (m_Kill)
+			return;
+
+		Work();
+
+		// We must be the ones setting our m_Computing to false.
+		ENSURE(m_Computing);
+		m_Computing = false;
+
+		m_Pathfinder.m_PathfinderFrontier.GoThrough();
+	}
+}
+
+void CCmpPathfinder::PathfinderWorker::Work()
+{
+	while (!m_LongRequests.empty())
+	{
+		const LongPathRequest& req = m_LongRequests.back();
+		WaypointPath path;
+		ComputePath(req.x0, req.z0, req.goal, req.passClass, path);
+		m_Results.push_back({req.ticket, req.notify, path});
+
+		m_LongRequests.pop_back();
+	}
+
+	while (!m_ShortRequests.empty())
+	{
+		const ShortPathRequest& req = m_ShortRequests.back();
+		WaypointPath path = ComputeShortPath(req);
+		m_Results.push_back({req.ticket, req.notify, path});
+
+		m_ShortRequests.pop_back();
+	}
+}
 
 u32 CCmpPathfinder::ComputePathAsync(entity_pos_t x0, entity_pos_t z0, const PathGoal& goal, pass_class_t passClass, entity_id_t notify)
 {
@@ -680,111 +772,123 @@
 	return req.ticket;
 }
 
-u32 CCmpPathfinder::ComputeShortPathAsync(entity_pos_t x0, entity_pos_t z0, entity_pos_t clearance, entity_pos_t range, const PathGoal& goal, pass_class_t passClass, bool avoidMovingUnits, entity_id_t group, entity_id_t notify)
+u32 CCmpPathfinder::ComputeShortPathAsync(entity_pos_t x0, entity_pos_t z0, entity_pos_t clearance, entity_pos_t range,
+										  const PathGoal& goal, pass_class_t passClass, bool avoidMovingUnits,
+										  entity_id_t group, entity_id_t notify)
 {
 	ShortPathRequest req = { m_NextAsyncTicket++, x0, z0, clearance, range, goal, passClass, avoidMovingUnits, group, notify };
 	m_ShortPathRequests.push_back(req);
 	return req.ticket;
 }
 
-void CCmpPathfinder::FinishAsyncRequests()
+void CCmpPathfinder::ComputePathImmediate(entity_pos_t x0, entity_pos_t z0, const PathGoal& goal, pass_class_t passClass, WaypointPath& ret) const
 {
-	PROFILE2("Finish Async Requests");
-	// Save the request queue in case it gets modified while iterating
-	std::vector<LongPathRequest> longRequests;
-	m_LongPathRequests.swap(longRequests);
+	m_LongPathfinder.ComputePath(x0, z0, goal, passClass, ret);
+}
 
-	std::vector<ShortPathRequest> shortRequests;
-	m_ShortPathRequests.swap(shortRequests);
+WaypointPath CCmpPathfinder::ComputeShortPathImmediate(const ShortPathRequest& request) const
+{
+	VertexPathfinder pathfinder(m_MapSize, m_TerrainOnlyGrid);
+	return pathfinder.ComputeShortPath(request, CmpPtr<ICmpObstructionManager>(GetSystemEntity()));
+}
 
-	// TODO: we should only compute one path per entity per turn
+void CCmpPathfinder::FetchAsyncResultsAndSendMessages()
+{
+	PROFILE2("FetchAsyncResults");
 
-	// TODO: this computation should be done incrementally, spread
-	// across multiple frames (or even multiple turns)
+	// TODO maybe: a possible improvement here would be to push results from workers whenever they are done, and not when all are done.
 
-	ProcessLongRequests(longRequests);
-	ProcessShortRequests(shortRequests);
-}
+	// Wait until all threads have finished computing.
+	m_PathfinderFrontier.Watch();
 
-void CCmpPathfinder::ProcessLongRequests(const std::vector<LongPathRequest>& longRequests)
-{
-	PROFILE2("Process Long Requests");
-	for (size_t i = 0; i < longRequests.size(); ++i)
+	std::vector<PathResult> results;
+	for (std::unique_ptr<PathfinderWorker>& worker : m_Workers)
 	{
-		const LongPathRequest& req = longRequests[i];
-		WaypointPath path;
-		ComputePath(req.x0, req.z0, req.goal, req.passClass, path);
-		CMessagePathResult msg(req.ticket, path);
-		GetSimContext().GetComponentManager().PostMessage(req.notify, msg);
+		results.insert(results.end(), std::make_move_iterator(worker->m_Results.begin()), std::make_move_iterator(worker->m_Results.end()));
+		worker->m_Results.clear();
 	}
-}
 
-void CCmpPathfinder::ProcessShortRequests(const std::vector<ShortPathRequest>& shortRequests)
-{
-	PROFILE2("Process Short Requests");
-	for (size_t i = 0; i < shortRequests.size(); ++i)
 	{
-		const ShortPathRequest& req = shortRequests[i];
-		WaypointPath path = m_VertexPathfinder->ComputeShortPath(req, CmpPtr<ICmpObstructionManager>(GetSystemEntity()));
-		CMessagePathResult msg(req.ticket, path);
-		GetSimContext().GetComponentManager().PostMessage(req.notify, msg);
+		PROFILE2("PostMessages");
+		for (PathResult& path : results)
+		{
+			CMessagePathResult msg(path.ticket, path.path);
+			GetSimContext().GetComponentManager().PostMessage(path.notify, msg);
+		}
 	}
 }
 
-void CCmpPathfinder::ProcessSameTurnMoves()
+void CCmpPathfinder::StartProcessingMoves(bool useMax)
 {
-	if (!m_LongPathRequests.empty())
-	{
-		// Figure out how many moves we can do this time
-		i32 moveCount = m_MaxSameTurnMoves - m_SameTurnMovesCount;
+	// We will send new path requests to worker threads,
+	// trying to balance the workload somewhat
+	// and then notify them they can start working.
+	// To avoid data races, we can only push jobs when workers are not computing them,
+	// So FetchAsyncResultsAndSendMessages must have been called first.
+
+	std::vector<LongPathRequest> longRequests = PopMovesToProcess(m_LongPathRequests, useMax, m_MaxSameTurnMoves);
+	std::vector<ShortPathRequest> shortRequests = PopMovesToProcess(m_ShortPathRequests, useMax, m_MaxSameTurnMoves - longRequests.size());
 
-		if (moveCount <= 0)
-			return;
+	PushRequestsToWorkers(longRequests);
+	PushRequestsToWorkers(shortRequests);
 
-		// Copy the long request elements we are going to process into a new array
-		std::vector<LongPathRequest> longRequests;
-		if ((i32)m_LongPathRequests.size() <= moveCount)
+	m_PathfinderFrontier.Reset();
+
+	if (m_UseThreading)
+	{
+		for (std::unique_ptr<PathfinderWorker>& worker : m_Workers)
 		{
-			m_LongPathRequests.swap(longRequests);
-			moveCount = (i32)longRequests.size();
+			// Mark as computing to unblock.
+			ENSURE(!worker->m_Computing);
+			worker->m_Computing = true;
 		}
-		else
+		m_PathfinderConditionVariable.notify_all();
+	}
+	else
+		m_Workers.back()->Work();
+}
+
+template <typename T>
+std::vector<T> CCmpPathfinder::PopMovesToProcess(std::vector<T>& requests, bool useMax, size_t maxMoves)
+{
+	std::vector<T> poppedRequests;
+	if (useMax)
+	{
+		size_t amount = std::min(poppedRequests.size(), maxMoves);
+		if (amount > 0)
 		{
-			longRequests.resize(moveCount);
-			copy(m_LongPathRequests.begin(), m_LongPathRequests.begin() + moveCount, longRequests.begin());
-			m_LongPathRequests.erase(m_LongPathRequests.begin(), m_LongPathRequests.begin() + moveCount);
+			poppedRequests.insert(poppedRequests.begin(), std::make_move_iterator(requests.end() - amount), std::make_move_iterator(requests.end()));
+			requests.erase(requests.end() - amount, requests.end());
 		}
-
-		ProcessLongRequests(longRequests);
-
-		m_SameTurnMovesCount = (u16)(m_SameTurnMovesCount + moveCount);
 	}
-
-	if (!m_ShortPathRequests.empty())
+	else
 	{
-		// Figure out how many moves we can do now
-		i32 moveCount = m_MaxSameTurnMoves - m_SameTurnMovesCount;
+		poppedRequests.swap(requests);
+		requests.clear();
+	}
 
-		if (moveCount <= 0)
-			return;
+	return poppedRequests;
+}
 
-		// Copy the short request elements we are going to process into a new array
-		std::vector<ShortPathRequest> shortRequests;
-		if ((i32)m_ShortPathRequests.size() <= moveCount)
-		{
-			m_ShortPathRequests.swap(shortRequests);
-			moveCount = (i32)shortRequests.size();
-		}
-		else
-		{
-			shortRequests.resize(moveCount);
-			copy(m_ShortPathRequests.begin(), m_ShortPathRequests.begin() + moveCount, shortRequests.begin());
-			m_ShortPathRequests.erase(m_ShortPathRequests.begin(), m_ShortPathRequests.begin() + moveCount);
-		}
+template <typename T>
+void CCmpPathfinder::PushRequestsToWorkers(std::vector<T>& from)
+{
+	if (from.empty())
+		return;
 
-		ProcessShortRequests(shortRequests);
+	// Trivial load-balancing, / rounds towards zero so add 1 to ensure we do push all requests.
+	size_t amount = from.size() / m_Workers.size() + 1;
 
-		m_SameTurnMovesCount = (u16)(m_SameTurnMovesCount + moveCount);
+	for (std::unique_ptr<PathfinderWorker>& worker : m_Workers)
+	{
+		// Prevent pushing requests when the worker is computing.
+		// Call FetchAsyncResultsAndSendMessages() before pushing new requests.
+		ENSURE(!worker->m_Computing);
+
+		// Since we are rounding up before, ensure we aren't pushing beyond the end.
+		amount = std::min(amount, from.size());
+		worker->PushRequests(from, amount);
+		from.erase(from.end() - amount, from.end());
 	}
 }
 
Index: source/simulation2/components/CCmpPathfinder_Common.h
===================================================================
--- source/simulation2/components/CCmpPathfinder_Common.h
+++ source/simulation2/components/CCmpPathfinder_Common.h
@@ -39,6 +39,10 @@
 #include "simulation2/helpers/LongPathfinder.h"
 #include "simulation2/helpers/VertexPathfinder.h"
 
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+
 class SceneCollector;
 class AtlasOverlay;
 
@@ -53,6 +57,73 @@
  */
 class CCmpPathfinder : public ICmpPathfinder
 {
+protected:
+
+	class PathfinderWorker
+	{
+		friend CCmpPathfinder;
+	public:
+		PathfinderWorker(const CCmpPathfinder& pathfinder, size_t index);
+
+		~PathfinderWorker();
+
+		void PrepareToKill();
+
+		// Will loop until a conditional_variable notifies us, and call Work().
+		void WaitForWork();
+
+		// Process path requests, checking if we should stop before each new one.
+		// Should be callable both synchronously and asynchronously.
+		void Work();
+
+		void ComputePath(entity_pos_t x0, entity_pos_t z0, const PathGoal& goal, pass_class_t passClass, WaypointPath& ret) const
+		{
+			m_Pathfinder.m_LongPathfinder.ComputePath(x0, z0, goal, passClass, ret);
+		}
+
+		WaypointPath ComputeShortPath(const ShortPathRequest& request) const
+		{
+			return m_VertexPathfinder.ComputeShortPath(request, CmpPtr<ICmpObstructionManager>(m_Pathfinder.GetSystemEntity()));
+		}
+
+
+	private:
+		// Takes care of what needs to be called to initialise the thread before calling WaitForWork().
+		void InitThread(size_t index);
+
+		// Insert requests in m_[Long/Short]Requests depending on from.
+		// This could be removed when we may use if-constexpr in CCmpPathfinder::PushRequestsToWorkers
+		template<typename T>
+		void PushRequests(std::vector<T>& from, ssize_t amount);
+
+		/*
+		 * General state
+		 */
+
+		// Read-only pathfinder, for getting required information (grids,…).
+		const CCmpPathfinder& m_Pathfinder;
+
+		// Stores our results, the main thread will fetch this.
+		std::vector<PathResult> m_Results;
+
+		std::unique_ptr<std::thread> m_Thread;
+
+		/*
+		 * Thread synchronisation
+		 */
+
+		std::atomic<bool> m_Kill;
+		std::atomic<bool> m_Computing;
+
+		std::vector<LongPathRequest> m_LongRequests;
+		std::vector<ShortPathRequest> m_ShortRequests;
+
+		VertexPathfinder m_VertexPathfinder;
+	};
+
+	// Allow the workers to access our private variables
+	friend class PathfinderWorker;
+
 public:
 	static void ClassInit(CComponentManager& componentManager)
 	{
@@ -76,7 +147,7 @@
 	std::vector<LongPathRequest> m_LongPathRequests;
 	std::vector<ShortPathRequest> m_ShortPathRequests;
 	u32 m_NextAsyncTicket; // unique IDs for asynchronous path requests
-	u16 m_SameTurnMovesCount; // current number of same turn moves we have processed this turn
+	u16 m_MaxSameTurnMoves; // How many moves to immediately compute.
 
 	// Lazily-constructed dynamic state (not serialized):
 
@@ -91,13 +162,15 @@
 	GridUpdateInformation m_AIPathfinderDirtinessInformation;
 	bool m_TerrainDirty;
 
-	std::unique_ptr<VertexPathfinder> m_VertexPathfinder;
 	// Interface to the long-range pathfinder.
-	LongPathfinder m_LongPathfinder;
+	mutable LongPathfinder m_LongPathfinder;
 
-	// For responsiveness we will process some moves in the same turn they were generated in
-
-	u16 m_MaxSameTurnMoves; // max number of moves that can be created and processed in the same turn
+	// Worker process pathing requests.
+	std::vector<std::unique_ptr<PathfinderWorker>> m_Workers;
+	bool m_UseThreading = false;
+	mutable std::mutex m_PathfinderMutex;
+	mutable std::condition_variable m_PathfinderConditionVariable;
+	mutable Frontier m_PathfinderFrontier;
 
 	AtlasOverlay* m_AtlasOverlay;
 
@@ -162,19 +235,15 @@
 
 	virtual Grid<u16> ComputeShoreGrid(bool expandOnWater = false);
 
-	virtual void ComputePath(entity_pos_t x0, entity_pos_t z0, const PathGoal& goal, pass_class_t passClass, WaypointPath& ret)
-	{
-		m_LongPathfinder.ComputePath(x0, z0, goal, passClass, ret);
-	}
+	virtual void ComputePathImmediate(entity_pos_t x0, entity_pos_t z0, const PathGoal& goal, pass_class_t passClass, WaypointPath& ret) const;
 
 	virtual u32 ComputePathAsync(entity_pos_t x0, entity_pos_t z0, const PathGoal& goal, pass_class_t passClass, entity_id_t notify);
 
-	virtual WaypointPath ComputeShortPath(const ShortPathRequest& request)
-	{
-		return m_VertexPathfinder->ComputeShortPath(request, CmpPtr<ICmpObstructionManager>(GetSystemEntity()));
-	}
+	virtual WaypointPath ComputeShortPathImmediate(const ShortPathRequest& request) const;
 
-	virtual u32 ComputeShortPathAsync(entity_pos_t x0, entity_pos_t z0, entity_pos_t clearance, entity_pos_t range, const PathGoal& goal, pass_class_t passClass, bool avoidMovingUnits, entity_id_t controller, entity_id_t notify);
+	virtual u32 ComputeShortPathAsync(entity_pos_t x0, entity_pos_t z0, entity_pos_t clearance, entity_pos_t range,
+									  const PathGoal& goal, pass_class_t passClass, bool avoidMovingUnits,
+									  entity_id_t controller, entity_id_t notify);
 
 	virtual void SetDebugPath(entity_pos_t x0, entity_pos_t z0, const PathGoal& goal, pass_class_t passClass)
 	{
@@ -183,7 +252,8 @@
 
 	virtual void SetDebugOverlay(bool enabled)
 	{
-		m_VertexPathfinder->SetDebugOverlay(enabled);
+		for (std::unique_ptr<PathfinderWorker>& worker : m_Workers)
+			worker->m_VertexPathfinder.SetDebugOverlay(enabled);
 		m_LongPathfinder.SetDebugOverlay(enabled);
 	}
 
@@ -207,13 +277,15 @@
 
 	virtual ICmpObstruction::EFoundationCheck CheckBuildingPlacement(const IObstructionTestFilter& filter, entity_pos_t x, entity_pos_t z, entity_pos_t a, entity_pos_t w, entity_pos_t h, entity_id_t id, pass_class_t passClass, bool onlyCenterPoint) const;
 
-	virtual void FinishAsyncRequests();
+	virtual void FetchAsyncResultsAndSendMessages();
 
-	void ProcessLongRequests(const std::vector<LongPathRequest>& longRequests);
+	virtual void StartProcessingMoves(bool useMax);
 
-	void ProcessShortRequests(const std::vector<ShortPathRequest>& shortRequests);
+	template <typename T>
+	std::vector<T> PopMovesToProcess(std::vector<T>& requests, bool useMax = false, size_t maxMoves = 0);
 
-	virtual void ProcessSameTurnMoves();
+	template <typename T>
+	void PushRequestsToWorkers(std::vector<T>& from);
 
 	/**
 	 * Regenerates the grid based on the current obstruction list, if necessary
Index: source/simulation2/components/CCmpRallyPointRenderer.cpp
===================================================================
--- source/simulation2/components/CCmpRallyPointRenderer.cpp
+++ source/simulation2/components/CCmpRallyPointRenderer.cpp
@@ -686,7 +686,7 @@
 		start.X = m_RallyPoints[index-1].X;
 		start.Y = m_RallyPoints[index-1].Y;
 	}
-	cmpPathfinder->ComputePath(start.X, start.Y, goal, cmpPathfinder->GetPassabilityClass(m_LinePassabilityClass), path);
+	cmpPathfinder->ComputePathImmediate(start.X, start.Y, goal, cmpPathfinder->GetPassabilityClass(m_LinePassabilityClass), path);
 
 	// Check if we got a path back; if not we probably have two markers less than one tile apart.
 	if (path.m_Waypoints.size() < 2)
Index: source/simulation2/components/ICmpPathfinder.h
===================================================================
--- source/simulation2/components/ICmpPathfinder.h
+++ source/simulation2/components/ICmpPathfinder.h
@@ -33,6 +33,16 @@
 
 template<typename T> class Grid;
 
+// returned by asynchronous workers, used to send messages in the main thread.
+struct WaypointPath;
+
+struct PathResult
+{
+	u32 ticket;
+	entity_id_t notify;
+	WaypointPath path;
+};
+
 /**
  * Pathfinder algorithms.
  *
@@ -89,41 +99,36 @@
 	virtual Grid<u16> ComputeShoreGrid(bool expandOnWater = false) = 0;
 
 	/**
-	 * Compute a tile-based path from the given point to the goal, and return the set of waypoints.
-	 * The waypoints correspond to the centers of horizontally/vertically adjacent tiles
-	 * along the path.
-	 */
-	virtual void ComputePath(entity_pos_t x0, entity_pos_t z0, const PathGoal& goal, pass_class_t passClass, WaypointPath& ret) = 0;
-
-	/**
-	 * Asynchronous version of ComputePath.
+	 * Request a long path computation, asynchronously.
 	 * The result will be sent as CMessagePathResult to 'notify'.
 	 * Returns a unique non-zero number, which will match the 'ticket' in the result,
 	 * so callers can recognise each individual request they make.
 	 */
 	virtual u32 ComputePathAsync(entity_pos_t x0, entity_pos_t z0, const PathGoal& goal, pass_class_t passClass, entity_id_t notify) = 0;
 
+	/*
+	 * Request a long-path computation immediately
+	 */
+	virtual void ComputePathImmediate(entity_pos_t x0, entity_pos_t z0, const PathGoal& goal, pass_class_t passClass, WaypointPath& ret) const = 0;
+
+	/**
+	 * Request a short path computation, asynchronously.
+	 * The result will be sent as CMessagePathResult to 'notify'.
+	 * Returns a unique non-zero number, which will match the 'ticket' in the result,
+	 * so callers can recognise each individual request they make.
+	 */
+	virtual u32 ComputeShortPathAsync(entity_pos_t x0, entity_pos_t z0, entity_pos_t clearance, entity_pos_t range, const PathGoal& goal, pass_class_t passClass, bool avoidMovingUnits, entity_id_t group, entity_id_t notify) = 0;
+
+	/*
+	 * Request a short-path computation immediately.
+	 */
+	virtual WaypointPath ComputeShortPathImmediate(const ShortPathRequest& request) const = 0;
+
 	/**
 	 * If the debug overlay is enabled, render the path that will computed by ComputePath.
 	 */
 	virtual void SetDebugPath(entity_pos_t x0, entity_pos_t z0, const PathGoal& goal, pass_class_t passClass) = 0;
 
-	/**
-	 * Compute a precise path from the given point to the goal, and return the set of waypoints.
-	 * The path is based on the full set of obstructions that pass the filter, such that
-	 * a unit of clearance 'clearance' will be able to follow the path with no collisions.
-	 * The path is restricted to a box of radius 'range' from the starting point.
-	 */
-	virtual WaypointPath ComputeShortPath(const ShortPathRequest& request) = 0;
-
-	/**
-	 * Asynchronous version of ComputeShortPath (using ControlGroupObstructionFilter).
-	 * The result will be sent as CMessagePathResult to 'notify'.
-	 * Returns a unique non-zero number, which will match the 'ticket' in the result,
-	 * so callers can recognise each individual request they make.
-	 */
-	virtual u32 ComputeShortPathAsync(entity_pos_t x0, entity_pos_t z0, entity_pos_t clearance, entity_pos_t range, const PathGoal& goal, pass_class_t passClass, bool avoidMovingUnits, entity_id_t group, entity_id_t notify) = 0;
-
 	/**
 	 * Check whether the given movement line is valid and doesn't hit any obstructions
 	 * or impassable terrain.
@@ -171,12 +176,12 @@
 	/**
 	 * Finish computing asynchronous path requests and send the CMessagePathResult messages.
 	 */
-	virtual void FinishAsyncRequests() = 0;
+	virtual void FetchAsyncResultsAndSendMessages() = 0;
 
 	/**
-	 * Process moves during the same turn they were created in to improve responsiveness.
+	 * Tell asynchronous pathfinder threads that they can begin computing paths.
 	 */
-	virtual void ProcessSameTurnMoves() = 0;
+	virtual void StartProcessingMoves(bool useMax = false) = 0;
 
 	/**
 	 * Regenerates the grid based on the current obstruction list, if necessary
Index: source/simulation2/components/tests/test_Pathfinder.h
===================================================================
--- source/simulation2/components/tests/test_Pathfinder.h
+++ source/simulation2/components/tests/test_Pathfinder.h
@@ -184,7 +184,7 @@
 			PathGoal goal = { PathGoal::POINT, x1, z1 };
 
 			WaypointPath path;
-			cmp->ComputePath(x0, z0, goal, cmp->GetPassabilityClass("default"), path);
+			cmp->ComputePathImmediate(x0, z0, goal, cmp->GetPassabilityClass("default"), path);
 		}
 
 		t = timer_Time() - t;
@@ -370,7 +370,7 @@
 		PathGoal goal = { PathGoal::POINT, x1, z1 };
 
 		WaypointPath path;
-		cmpPathfinder->ComputePath(x0, z0, goal, cmpPathfinder->GetPassabilityClass("default"), path);
+		cmpPathfinder->ComputePathImmediate(x0, z0, goal, cmpPathfinder->GetPassabilityClass("default"), path);
 
 		u32 debugSteps;
 		double debugTime;
@@ -419,7 +419,7 @@
 		for (int i = 0; i < n; ++i)
 		{
 			WaypointPath path;
-			cmpPathfinder->ComputePath(x0, z0, goal, cmpPathfinder->GetPassabilityClass("default"), path);
+			cmpPathfinder->ComputePathImmediate(x0, z0, goal, cmpPathfinder->GetPassabilityClass("default"), path);
 		}
 		t = timer_Time() - t;
 		debug_printf("### RepeatPath %fms each (%fs total)\n", 1000*t / n, t);
Index: source/simulation2/helpers/LongPathfinder.h
===================================================================
--- source/simulation2/helpers/LongPathfinder.h
+++ source/simulation2/helpers/LongPathfinder.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2017 Wildfire Games.
+/* Copyright (C) 2019 Wildfire Games.
  * This file is part of 0 A.D.
  *
  * 0 A.D. is free software: you can redistribute it and/or modify
@@ -18,6 +18,7 @@
 #ifndef INCLUDED_LONGPATHFINDER
 #define INCLUDED_LONGPATHFINDER
 
+#include <mutex>
 #include "Pathfinding.h"
 #include "HierarchicalPathfinder.h"
 
@@ -261,6 +262,8 @@
 	mutable WaypointPath* m_DebugPath;
 	mutable pass_class_t m_DebugPassClass;
 
+	mutable std::mutex m_DebugMutex;
+
 private:
 	PathCost CalculateHeuristic(int i, int j, int iGoal, int jGoal) const;
 	void ProcessNeighbour(int pi, int pj, int i, int j, PathCost pg, PathfinderState& state) const;
Index: source/simulation2/helpers/LongPathfinder.cpp
===================================================================
--- source/simulation2/helpers/LongPathfinder.cpp
+++ source/simulation2/helpers/LongPathfinder.cpp
@@ -718,6 +718,8 @@
 	PROFILE2_IFSPIKE("ComputePathJPS", 0.0002);
 	PathfinderState state = { 0 };
 
+	ENSURE(!m_UseJPSCache && "Jump Point Cache currently not supported as it must handle threading properly.");
+	/*
 	if (m_UseJPSCache && m_JumpPointCache.find(passClass) == m_JumpPointCache.end())
 	{
 		state.jpc = new JumpPointCache;
@@ -725,7 +727,7 @@
 		debug_printf("PATHFINDER: JPC memory: %d kB\n", (int)state.jpc->GetMemoryUsage() / 1024);
 		m_JumpPointCache[passClass] = shared_ptr<JumpPointCache>(state.jpc);
 	}
-
+	*/
 	// Convert the start coordinates to tile indexes
 	u16 i0, j0;
 	Pathfinding::NearestNavcell(x0, z0, i0, j0, m_GridSize, m_GridSize);
@@ -896,6 +898,7 @@
 	ImprovePathWaypoints(path, passClass, origGoal.maxdist, x0, z0);
 
 	// Save this grid for debug display
+	std::lock_guard<std::mutex> lock(m_DebugMutex);
 	delete m_DebugGrid;
 	m_DebugGrid = state.tiles;
 	m_DebugSteps = state.steps;
@@ -958,6 +961,7 @@
 
 void LongPathfinder::GetDebugDataJPS(u32& steps, double& time, Grid<u8>& grid) const
 {
+	std::lock_guard<std::mutex> lock(m_DebugMutex);
 	steps = m_DebugSteps;
 	time = m_DebugTime;
 
@@ -982,6 +986,7 @@
 
 void LongPathfinder::SetDebugOverlay(bool enabled)
 {
+	std::lock_guard<std::mutex> lock(m_DebugMutex);
 	if (enabled && !m_DebugOverlay)
 		m_DebugOverlay = new LongOverlay(*this);
 	else if (!enabled && m_DebugOverlay)
Index: source/simulation2/helpers/VertexPathfinder.h
===================================================================
--- source/simulation2/helpers/VertexPathfinder.h
+++ source/simulation2/helpers/VertexPathfinder.h
@@ -98,6 +98,7 @@
 
 	std::atomic<bool> m_DebugOverlay;
 	mutable std::vector<SOverlayLine> m_DebugOverlayShortPathLines;
+	mutable std::mutex m_DebugMutex;
 
 	// These vectors are expensive to recreate on every call, so we cache them here.
 	// They are made mutable to allow using them in the otherwise const ComputeShortPath.
Index: source/simulation2/helpers/VertexPathfinder.cpp
===================================================================
--- source/simulation2/helpers/VertexPathfinder.cpp
+++ source/simulation2/helpers/VertexPathfinder.cpp
@@ -828,6 +828,7 @@
 {
 	if (!m_DebugOverlay)
 		return;
+	std::lock_guard<std::mutex> lock(m_DebugMutex);
 
 	m_DebugOverlayShortPathLines.clear();
 
@@ -861,6 +862,7 @@
 {
 	if (!m_DebugOverlay)
 		return;
+	std::lock_guard<std::mutex> lock(m_DebugMutex);
 
 #define PUSH_POINT(p) STMT(xz.push_back(p.X.ToFloat()); xz.push_back(p.Y.ToFloat()))
 	// Render the vertexes as little Pac-Man shapes to indicate quadrant direction
@@ -938,6 +940,7 @@
 		return;
 
 	return; // Disabled by default.
+	std::lock_guard<std::mutex> lock(m_DebugMutex);
 
 	m_DebugOverlayShortPathLines.push_back(SOverlayLine());
 	m_DebugOverlayShortPathLines.back().m_Color = visible ? CColor(0, 1, 0, 0.5) : CColor(1, 0, 0, 0.5);
@@ -957,6 +960,7 @@
 	if (!m_DebugOverlay)
 		return;
 
+	std::lock_guard<std::mutex> lock(m_DebugMutex);
 	for (size_t i = 0; i < m_DebugOverlayShortPathLines.size(); ++i)
 		collector.Submit(&m_DebugOverlayShortPathLines[i]);
 }