diff --git a/src/Features/UnifiedWater/WaterCache.cpp b/src/Features/UnifiedWater/WaterCache.cpp
index 2da8f1cbc6..a2addeaf97 100644
--- a/src/Features/UnifiedWater/WaterCache.cpp
+++ b/src/Features/UnifiedWater/WaterCache.cpp
@@ -2,6 +2,8 @@
 
 #include <BS_thread_pool.hpp>
 
+#include "Utils/WinApi.h"
+
 bool WaterCache::SetCurrentWorldSpace(const RE::TESWorldSpace* worldSpace)
 {
 	if (!worldSpace)
@@ -214,7 +216,8 @@ bool WaterCache::GenerateCaches()
 		}
 	}
 
-	const unsigned hw = std::max(1u, std::thread::hardware_concurrency());
+	// Use P-core logical count on Intel hybrid CPUs; falls back to hardware_concurrency() on non-hybrid.
+	const unsigned hw = Util::GetPerformanceCoreCount();
 	const unsigned threads = std::max(1u, hw > 4 ? hw - 4 : (hw * 3) / 4);
 	async.pool = std::make_unique<BS::thread_pool<>>(threads);
 
diff --git a/src/Menu/AdvancedSettingsRenderer.cpp b/src/Menu/AdvancedSettingsRenderer.cpp
index f985f6f680..85310a0ec4 100644
--- a/src/Menu/AdvancedSettingsRenderer.cpp
+++ b/src/Menu/AdvancedSettingsRenderer.cpp
@@ -126,15 +126,16 @@ void AdvancedSettingsRenderer::RenderLoggingSection()
 	ImGui::SliderInt("Compiler Threads", &shaderCache->compilationThreadCount, 1, static_cast<int32_t>(std::thread::hardware_concurrency()));
 	if (auto _tt = Util::HoverTooltipWrapper()) {
 		ImGui::Text(
-			"Number of threads to use to compile shaders. "
-			"The more threads the faster compilation will finish but may make the system unresponsive. ");
+			"Number of threads used to compile shaders at startup. "
+			"Defaults to all logical cores minus one for OS headroom (E-cores included). "
+			"Higher values finish compilation faster but may make the system less responsive.");
 	}
 	ImGui::SliderInt("Background Compiler Threads", &shaderCache->backgroundCompilationThreadCount, 1, static_cast<int32_t>(std::thread::hardware_concurrency()));
 	if (auto _tt = Util::HoverTooltipWrapper()) {
 		ImGui::Text(
-			"Number of threads to use to compile shaders while playing game. "
-			"This is activated if the startup compilation is skipped. "
-			"The more threads the faster compilation will finish but may make the system unresponsive. ");
+			"Number of threads used to compile shaders during gameplay. "
+			"Defaults to half of performance cores to avoid impacting the render thread. "
+			"Higher values finish compilation faster but may cause stuttering.");
 	}
 
 	ImGui::Columns(2, nullptr, false);
@@ -552,6 +553,103 @@ void AdvancedSettingsRenderer::RenderDeveloperSection()
 	// Statistics section (moved from Advanced/Logging)
 	if (ImGui::TreeNodeEx("Statistics", ImGuiTreeNodeFlags_DefaultOpen)) {
 		ImGui::Text(std::format("Shader Compiler : {}", shaderCache->GetShaderStatsString()).c_str());
+
+		// Derived parallelism metrics are computed lazily on demand and only shown
+		// once compilation has completed to avoid per-frame analysis while compiling.
+		if (!shaderCache->IsCompiling()) {
+			auto parallelism = shaderCache->GetParallelismStats();
+			if (parallelism.has_value()) {
+				const auto& p = parallelism.value();
+				ImGui::Spacing();
+				ImGui::TextDisabled("Parallelism (derived from %zu compiled tasks)", p.sampleCount);
+				if (auto _tt = Util::HoverTooltipWrapper()) {
+					ImGui::Text("Computed lazily from the last completed build.");
+					ImGui::Text("Only evaluated when this Statistics section is open.");
+				}
+				ImGui::Text("Work (W, sum of task wall times): %s", Util::FormatDuration(p.workMs).c_str());
+				if (auto _tt = Util::HoverTooltipWrapper()) {
+					ImGui::Text("Total compile work: sum of all per-shader wall-clock compile times.");
+					ImGui::Text("This is not CPU time; it is accumulated task elapsed time.");
+					ImGui::Text("Equivalent serial time on one worker if overhead stayed the same.");
+				}
+				ImGui::Text("Span (S, longest): %s", Util::FormatDuration(p.spanMs).c_str());
+				if (auto _tt = Util::HoverTooltipWrapper()) {
+					ImGui::Text("Critical-path lower bound, approximated by the single slowest shader.");
+					ImGui::Text("Even infinite cores cannot finish faster than this.");
+				}
+				ImGui::Text("Makespan (T_p): %s", Util::FormatDuration(p.makespanMs).c_str());
+				if (auto _tt = Util::HoverTooltipWrapper()) {
+					ImGui::Text("Observed wall-clock duration for the full shader build.");
+				}
+				ImGui::Text("Queue wait (avg/max): %s / %s",
+					Util::FormatDuration(p.avgQueueWaitMs).c_str(),
+					Util::FormatDuration(p.maxQueueWaitMs).c_str());
+				if (auto _tt = Util::HoverTooltipWrapper()) {
+					ImGui::Text("Time spent waiting in the ready queue before a worker started compilation.");
+					ImGui::Text("Useful for identifying scheduler-induced delay separate from compile cost.");
+				}
+				ImGui::Text("Average parallelism (W/S): %.2fx", p.avgParallelism);
+				if (auto _tt = Util::HoverTooltipWrapper()) {
+					ImGui::Text("Average useful concurrency in this workload.");
+					ImGui::Text("Roughly the worker count where adding more cores gives diminishing returns.");
+				}
+				ImGui::Text("Infinite-core efficiency (S/T_p): %.1f%%", 100.0 * p.infiniteCoreEfficiency);
+				if (auto _tt = Util::HoverTooltipWrapper()) {
+					ImGui::Text("How close runtime is to the infinite-core lower bound.");
+					ImGui::Text("100%% means T_p == S.");
+				}
+				ImGui::Text("Infinite-core gap: %.1f%%", p.infiniteCoreGapPercent);
+				if (auto _tt = Util::HoverTooltipWrapper()) {
+					ImGui::Text("Distance from ideal infinite-core time.");
+					ImGui::Text("Defined as 100 * (1 - S / T_p). Lower is better.");
+				}
+
+				ImGui::Spacing();
+				ImGui::TextDisabled("Infinite-core efficiency");
+				float efficiency = static_cast<float>(std::clamp(p.infiniteCoreEfficiency, 0.0, 1.0));
+				ImGui::ProgressBar(efficiency, ImVec2(-1.0f, 0.0f), std::format("{:.1f}% efficient / {:.1f}% gap", 100.0 * p.infiniteCoreEfficiency, p.infiniteCoreGapPercent).c_str());
+
+				ImGui::Spacing();
+				ImGui::TextDisabled("Relative durations (normalized)");
+				double maxMs = std::max({ p.workMs, p.spanMs, p.makespanMs, 1.0 });
+				auto drawRelativeBar = [maxMs](const char* label, double value) {
+					float ratio = static_cast<float>(std::clamp(value / maxMs, 0.0, 1.0));
+					ImGui::TextUnformatted(label);
+					ImGui::SameLine();
+					ImGui::ProgressBar(ratio, ImVec2(-1.0f, 0.0f), std::format("{} ({:.1f}%)", Util::FormatDuration(value), 100.0 * ratio).c_str());
+				};
+				drawRelativeBar("Span (S)", p.spanMs);
+				drawRelativeBar("Makespan (T_p)", p.makespanMs);
+				drawRelativeBar("Work (W)", p.workMs);
+			}
+		}
+
+		// Top-3 slowest shaders from the last build
+		auto topSlow = shaderCache->GetTopSlowTasks(3);
+		if (!topSlow.empty()) {
+			ImGui::Spacing();
+			ImGui::TextDisabled("Top %zu Slowest Shaders (last build)", topSlow.size());
+			for (size_t i = 0; i < topSlow.size(); ++i) {
+				const auto& rec = topSlow[i];
+				ImGui::Text("#%zu  %s  (weight %d)", i + 1,
+					Util::FormatDuration(rec.elapsedMs).c_str(), rec.priority);
+				ImGui::SameLine();
+				ImGui::TextDisabled("%s", rec.key.c_str());
+				if (ImGui::IsItemHovered()) {
+					if (auto _tt = Util::HoverTooltipWrapper()) {
+						ImGui::Text("%s", rec.key.c_str());
+					}
+				}
+				// Allow copying the full key with a right-click
+				if (ImGui::BeginPopupContextItem(std::format("##slowcopy{}", i).c_str())) {
+					if (ImGui::MenuItem("Copy key")) {
+						ImGui::SetClipboardText(rec.key.c_str());
+					}
+					ImGui::EndPopup();
+				}
+			}
+		}
+
 		ImGui::TreePop();
 	}
 
diff --git a/src/Menu/OverlayRenderer.cpp b/src/Menu/OverlayRenderer.cpp
index c80bf97bed..e7ccaeb5b5 100644
--- a/src/Menu/OverlayRenderer.cpp
+++ b/src/Menu/OverlayRenderer.cpp
@@ -168,6 +168,23 @@ void OverlayRenderer::RenderShaderCompilationStatus(const std::function<const ch
 		}
 		ImGui::TextUnformatted(progressTitle.c_str());
 		ImGui::ProgressBar(percent, ImVec2(0.0f, 0.0f), progressOverlay.c_str());
+		if (state->IsDeveloperMode()) {
+			int32_t threadLimit = shaderCache->backgroundCompilation ? shaderCache->backgroundCompilationThreadCount : shaderCache->compilationThreadCount;
+			int compilationRunning = (int)shaderCache->compilationPool.get_tasks_running();
+			int heavyInFlight = shaderCache->GetHeavyTasksInFlight();
+			int heavyLimit = static_cast<int>(Util::GetPerformanceCoreCount());
+			uint64_t slow = shaderCache->GetSlowTasks();
+			uint64_t verySlow = shaderCache->GetVerySlowTasks();
+			ImGui::Text("Threads: %d / %d limit | Heavy: %d / %d P-cores | %d workers",
+				compilationRunning,
+				threadLimit,
+				heavyInFlight,
+				heavyLimit,
+				(int)shaderCache->compilationPool.get_thread_count());
+			if (slow > 0) {
+				ImGui::Text("Slow shaders: %llu (very slow: %llu)", slow, verySlow);
+			}
+		}
 		if (!shaderCache->backgroundCompilation && shaderCache->menuLoaded) {
 			auto skipShadersText = fmt::format(
 				"Press {} to proceed without completing shader compilation. ",
@@ -180,7 +197,9 @@ void OverlayRenderer::RenderShaderCompilationStatus(const std::function<const ch
 			ImGui::TextColored(themeSettings.StatusPalette.Warning, renderDocInformation.c_str());
 
 		ImGui::End();
-	} else if (failed) {
+	}
+
+	if (failed) {
 		if (!hide) {
 			ImGui::SetNextWindowPos(ImVec2(pos, pos));
 			if (!ImGui::Begin("ShaderCompilationInfo", nullptr, ImGuiWindowFlags_NoTitleBar | ImGuiWindowFlags_AlwaysAutoResize | ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoMove | ImGuiWindowFlags_NoSavedSettings)) {
diff --git a/src/Menu/SettingsTabRenderer.cpp b/src/Menu/SettingsTabRenderer.cpp
index 7e6be77684..90e8c47cc8 100644
--- a/src/Menu/SettingsTabRenderer.cpp
+++ b/src/Menu/SettingsTabRenderer.cpp
@@ -11,6 +11,7 @@
 #include "IconLoader.h"
 #include "Menu.h"
 #include "ShaderCache.h"
+#include "State.h"
 #include "ThemeManager.h"
 #include "Util.h"
 
@@ -230,6 +231,77 @@ void SettingsTabRenderer::RenderShadersTab()
 		if (shaderCache->GetTotalTasks() > 0) {
 			ImGui::Text("Last shader cache build duration: %s",
 				shaderCache->GetShaderStatsString(true, true).c_str());
+
+			// Stacked bar showing compilation breakdown
+			{
+				uint64_t total = shaderCache->GetTotalTasks();
+				uint64_t completed = shaderCache->GetCompletedTasks();
+				uint64_t failed = shaderCache->GetFailedTasks();
+				uint64_t cacheHits = shaderCache->GetCachedHitTasks();
+				uint64_t slow = shaderCache->GetSlowTasks();
+				uint64_t verySlow = shaderCache->GetVerySlowTasks();
+				// Compiled = tasks that actually went through compilation.
+				// Cache hits are separate (returned early without queueing).
+				uint64_t compiled = completed;
+				uint64_t fast = compiled > slow ? compiled - slow : 0;
+				uint64_t medium = slow > verySlow ? slow - verySlow : 0;  // 2-8s
+
+				struct Segment
+				{
+					uint64_t count;
+					ImU32 color;
+					const char* label;
+				};
+				Segment segments[] = {
+					{ cacheHits, IM_COL32(120, 120, 120, 255), "Deduplicated" },
+					{ fast, IM_COL32(80, 180, 80, 255), "Fast (<2s)" },
+					{ medium, IM_COL32(220, 180, 50, 255), "Slow (2-8s)" },
+					{ verySlow, IM_COL32(220, 60, 60, 255), "Very slow (>=8s)" },
+					{ failed, IM_COL32(160, 30, 30, 255), "Failed" },
+				};
+
+				float barHeight = 14.0f * Util::GetUIScale();
+				float barWidth = ImGui::GetContentRegionAvail().x;
+				ImVec2 cursor = ImGui::GetCursorScreenPos();
+				ImDrawList* drawList = ImGui::GetWindowDrawList();
+
+				// Background
+				drawList->AddRectFilled(cursor, ImVec2(cursor.x + barWidth, cursor.y + barHeight), IM_COL32(40, 40, 40, 255));
+
+				// Draw segments
+				float x = cursor.x;
+				for (auto& seg : segments) {
+					if (seg.count == 0 || total == 0)
+						continue;
+					float segWidth = (static_cast<float>(seg.count) / static_cast<float>(total)) * barWidth;
+					if (segWidth < 1.0f)
+						segWidth = 1.0f;
+					drawList->AddRectFilled(ImVec2(x, cursor.y), ImVec2(x + segWidth, cursor.y + barHeight), seg.color);
+					x += segWidth;
+				}
+
+				// Reserve space and handle tooltip
+				ImGui::Dummy(ImVec2(barWidth, barHeight));
+				if (ImGui::IsItemHovered()) {
+					ImGui::BeginTooltip();
+					for (auto& seg : segments) {
+						if (seg.count == 0)
+							continue;
+						float pct = total > 0 ? 100.0f * static_cast<float>(seg.count) / static_cast<float>(total) : 0.0f;
+						ImGui::TextColored(ImGui::ColorConvertU32ToFloat4(seg.color), "%s: %llu (%.1f%%)", seg.label, seg.count, pct);
+					}
+					ImGui::EndTooltip();
+				}
+			}
+
+			auto state = globals::state;
+			if (state->IsDeveloperMode()) {
+				ImGui::Text("Threads: %d compile, %d background, %d pool | P-cores: %d",
+					(int)shaderCache->compilationThreadCount,
+					(int)shaderCache->backgroundCompilationThreadCount,
+					(int)shaderCache->compilationPool.get_thread_count(),
+					(int)Util::GetPerformanceCoreCount());
+			}
 		}
 
 		ImGui::EndTabItem();
diff --git a/src/ShaderCache.cpp b/src/ShaderCache.cpp
index 1b3f3edd3d..9bdbb45e9b 100644
--- a/src/ShaderCache.cpp
+++ b/src/ShaderCache.cpp
@@ -1341,20 +1341,24 @@ namespace SIE
 				return nullptr;
 			}
 
-			// check hashmap
 			auto& cache = ShaderCache::Instance();
-			ID3DBlob* shaderBlob = cache.GetCompletedShader(shaderClass, shader, descriptor);
-
-			if (shaderBlob) {
-				// already compiled before
-				logger::debug("Shader already compiled; using cache: {}", SShaderCache::GetShaderString(shaderClass, shader, descriptor));
+			auto key = SShaderCache::GetShaderString(shaderClass, shader, descriptor, true);
+
+			// Atomically check the shaderMap and either:
+			//  - return the blob if already Completed (cache hit),
+			//  - wait if another thread is compiling (Pending),
+			//  - claim the slot with Pending if nobody started yet.
+			auto [claimResult, cachedBlob] = cache.ClaimCompilation(key);
+			if (claimResult == ShaderCache::ClaimResult::CacheHit) {
 				cache.IncCacheHitTasks();
-				return shaderBlob;
+				return cachedBlob;
 			}
+
 			const auto type = shader.shaderType.get();
 
 			// check diskcache
 			auto diskPath = GetDiskPath(shader.fxpFilename, descriptor, shaderClass);
+			ID3DBlob* shaderBlob = nullptr;
 
 			if (useDiskCache && std::filesystem::exists(diskPath)) {
 				// check build time of cache
@@ -1405,6 +1409,7 @@ namespace SIE
 			auto pathString = Util::WStringToString(path);
 			if (!std::filesystem::exists(path)) {
 				logger::error("Failed to compile {} shader {}::{:X}: {} does not exist", magic_enum::enum_name(shaderClass), magic_enum::enum_name(type), descriptor, pathString);
+				cache.AddCompletedShader(shaderClass, shader, descriptor, nullptr);
 				return nullptr;
 			}
 			logger::debug("Compiling {} {}:{}:{:X} to {}", pathString, magic_enum::enum_name(type), magic_enum::enum_name(shaderClass), descriptor, MergeDefinesString(defines));
@@ -1926,11 +1931,16 @@ namespace SIE
 	{
 		Clear();
 		StopFileWatcher();
+		// Signal management thread to stop dispatching; pool workers observe the same
+		// stop token and will not pick up new tasks after current compilations finish.
+		HANDLE managementHandle = managementJthread.native_handle();
+		managementJthread.request_stop();
+		// Purge unstarted tasks so we only wait for compilations already in flight.
+		compilationPool.purge();
 		if (!compilationPool.wait_for(std::chrono::milliseconds(1000))) {
-			logger::info("Tasks still running despite request to stop; killing thread {}!", GetThreadId(managementThread));
-			WaitForSingleObject(managementThread, 1000);
-			TerminateThread(managementThread, 0);
-			CloseHandle(managementThread);
+			logger::info("Tasks still running despite request to stop; killing management thread {}!", GetThreadId(managementHandle));
+			WaitForSingleObject(managementHandle, 1000);
+			TerminateThread(managementHandle, 0);
 		}
 	}
 
@@ -2102,6 +2112,7 @@ namespace SIE
 			std::unique_lock lockM{ mapMutex };
 			shaderMap.insert_or_assign(key, ShaderCacheResult{ a_blob, status, system_clock::now() });
 		}
+		mapCV.notify_all();  // wake threads waiting on a Pending→Completed/Failed transition
 		const std::wstring path = SIE::SShaderCache::GetShaderPath(
 			shader.shaderType == RE::BSShader::Type::ImageSpace ?
 				static_cast<const RE::BSImagespaceShader&>(shader).originalShaderName :
@@ -2138,6 +2149,53 @@ namespace SIE
 		return a_blob != nullptr;
 	}
 
+	std::pair<ShaderCache::ClaimResult, ID3DBlob*> ShaderCache::ClaimCompilation(const std::string& key)
+	{
+		std::unique_lock lockM{ mapMutex };
+
+		for (;;) {
+			auto it = shaderMap.find(key);
+			if (it != shaderMap.end()) {
+				auto& entry = it->second;
+				if (entry.status == ShaderCompilationTask::Status::Completed) {
+					if (entry.blob) {
+						logger::debug("Shader already compiled; using cache: {}", key);
+						return { ClaimResult::CacheHit, entry.blob };
+					}
+					break;  // Completed with nullptr blob — re-compile
+				}
+				if (entry.status == ShaderCompilationTask::Status::Failed) {
+					break;  // Previous attempt failed — re-compile
+				}
+				// Status is Pending — another thread is compiling this shader.
+				logger::debug("Shader compilation in progress, waiting: {}", key);
+				mapCV.wait(lockM);
+				continue;  // re-check after wakeup
+			}
+			break;  // not in map at all
+		}
+
+		// Claim the slot as Pending before releasing the lock
+		shaderMap.insert_or_assign(key, ShaderCacheResult{ nullptr, ShaderCompilationTask::Status::Pending, system_clock::now() });
+		return { ClaimResult::Claimed, nullptr };
+	}
+
+	void ShaderCache::ResolvePendingFailure(const std::string& key)
+	{
+		bool changed = false;
+		{
+			std::unique_lock lockM{ mapMutex };
+			auto it = shaderMap.find(key);
+			if (it != shaderMap.end() && it->second.status == ShaderCompilationTask::Status::Pending) {
+				it->second = ShaderCacheResult{ nullptr, ShaderCompilationTask::Status::Failed, system_clock::now() };
+				changed = true;
+			}
+		}
+		if (changed) {
+			mapCV.notify_all();
+		}
+	}
+
 	ID3DBlob* ShaderCache::GetCompletedShader(const std::string& a_key)
 	{
 		std::string type = SIE::SShaderCache::GetTypeFromShaderString(a_key);
@@ -2211,7 +2269,8 @@ namespace SIE
 		if (IsCompiling()) {
 			logger::info("Stopping {} remaining shader compilation tasks", compilationSet.totalTasks - compilationSet.completedTasks - compilationSet.failedTasks);
 		}
-		ssource.request_stop();
+		ssource.request_stop();            // signals any legacy stop_token users
+		managementJthread.request_stop();  // stops management thread + in-flight compilations
 		compilationSet.Clear();
 	}
 
@@ -2311,8 +2370,13 @@ namespace SIE
 	ShaderCache::ShaderCache()
 	{
 		dependencyTracker = std::make_unique<ShaderFileDependencyTracker>();
-		logger::debug("ShaderCache initialized with {} compiler threads", (int)compilationThreadCount);
-		compilationPool.detach_task([this, token = ssource.get_token()] { ManageCompilationSet(token); });
+		logger::debug("ShaderCache initialized: {} startup threads, {} background threads, {} pool threads",
+			(int)compilationThreadCount, (int)backgroundCompilationThreadCount, (int)compilationPool.get_thread_count());
+		// Management thread runs on a dedicated jthread, not in the compilation pool,
+		// so it doesn't consume a pool slot that could be used for shader compilation.
+		managementJthread = std::jthread([this](std::stop_token stoken) {
+			ManageCompilationSet(stoken);
+		});
 	}
 
 	bool ShaderCache::UseFileWatcher() const
@@ -2346,7 +2410,12 @@ namespace SIE
 				pathStr += std::format("{}; ", path);
 			}
 			logger::debug("ShaderCache watching for changes in {}", pathStr);
-			compilationPool.detach_task([this] { listener->processQueue(); });
+			// Capture listener by value so the thread does not race with StopFileWatcher()
+			// nulling this->listener before the thread has had a chance to start.
+			auto* capturedListener = listener;
+			capturedListener->fileWatcherThread = std::jthread([capturedListener]() {
+				capturedListener->processQueue();
+			});
 		} else {
 			logger::debug("ShaderCache already enabled");
 		}
@@ -2355,11 +2424,16 @@ namespace SIE
 	void ShaderCache::StopFileWatcher()
 	{
 		logger::info("Stopping FileWatcher");
+		// Set flag first so processQueue()'s loop condition becomes false before we join.
+		useFileWatcher = false;
 		if (fileWatcher) {
 			fileWatcher->removeWatch(watchID);
 			fileWatcher = nullptr;
 		}
 		if (listener) {
+			// ~jthread() calls request_stop() + join(); processQueue() exits when
+			// UseFileWatcher() returns false (set above).
+			delete listener;
 			listener = nullptr;
 		}
 	}
@@ -2546,6 +2620,87 @@ namespace SIE
 		return hideError;
 	}
 
+	int ShaderCache::GetHeavyTasksInFlight()
+	{
+		return static_cast<int>(compilationSet.heavyTasksInFlight.load(std::memory_order_relaxed));
+	}
+
+	uint64_t ShaderCache::GetSlowTasks()
+	{
+		return compilationSet.slowTasks.load(std::memory_order_relaxed);
+	}
+
+	uint64_t ShaderCache::GetVerySlowTasks()
+	{
+		return compilationSet.verySlowTasks.load(std::memory_order_relaxed);
+	}
+
+	std::vector<CompilationSet::SlowTaskRecord> CompilationSet::GetTopSlowTasks(size_t n) const
+	{
+		std::lock_guard lock(slowTasksMutex);
+		// Partial sort to get the N highest without fully sorting the whole vector.
+		std::vector<SlowTaskRecord> result = slowTaskRecords;
+		if (result.size() > n) {
+			std::partial_sort(result.begin(), result.begin() + n, result.end(),
+				[](const SlowTaskRecord& a, const SlowTaskRecord& b) { return a.elapsedMs > b.elapsedMs; });
+			result.resize(n);
+		} else {
+			std::sort(result.begin(), result.end(),
+				[](const SlowTaskRecord& a, const SlowTaskRecord& b) { return a.elapsedMs > b.elapsedMs; });
+		}
+		return result;
+	}
+
+	std::vector<CompilationSet::SlowTaskRecord> ShaderCache::GetTopSlowTasks(size_t n)
+	{
+		return compilationSet.GetTopSlowTasks(n);
+	}
+
+	std::optional<CompilationSet::ParallelismStats> CompilationSet::GetParallelismStats() const
+	{
+		std::vector<SlowTaskRecord> records;
+		{
+			std::lock_guard lock(slowTasksMutex);
+			if (slowTaskRecords.empty()) {
+				return std::nullopt;
+			}
+			records = slowTaskRecords;
+		}
+
+		ParallelismStats stats;
+		stats.sampleCount = records.size();
+		for (const auto& rec : records) {
+			stats.workMs += rec.elapsedMs;
+			stats.spanMs = std::max(stats.spanMs, rec.elapsedMs);
+			stats.avgQueueWaitMs += rec.queueWaitMs;
+			stats.maxQueueWaitMs = std::max(stats.maxQueueWaitMs, rec.queueWaitMs);
+		}
+		stats.avgQueueWaitMs /= static_cast<double>(stats.sampleCount);
+
+		LARGE_INTEGER now;
+		QueryPerformanceCounter(&now);
+		int64_t endTime = completionTime.load(std::memory_order_relaxed);
+		if (endTime == 0) {
+			endTime = now.QuadPart;
+		}
+		stats.makespanMs = static_cast<double>(endTime - lastReset.QuadPart) * 1000.0 / frequency.QuadPart;
+
+		if (stats.spanMs > 0.0) {
+			stats.avgParallelism = stats.workMs / stats.spanMs;
+		}
+		if (stats.makespanMs > 0.0) {
+			stats.infiniteCoreEfficiency = stats.spanMs / stats.makespanMs;
+			stats.infiniteCoreGapPercent = std::max(0.0, 100.0 * (1.0 - stats.infiniteCoreEfficiency));
+		}
+
+		return stats;
+	}
+
+	std::optional<CompilationSet::ParallelismStats> ShaderCache::GetParallelismStats()
+	{
+		return compilationSet.GetParallelismStats();
+	}
+
 	void ShaderCache::ClearShaderMap(RE::BSShader::Type a_type)
 	{
 		std::string_view shaderTypeStr = magic_enum::enum_name(a_type);
@@ -2730,8 +2885,79 @@ namespace SIE
 			return;
 		}
 
-		SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_BELOW_NORMAL);
-		task.Perform();
+		const auto taskKey = task.GetString();
+
+		// Thread priority serves as a signal to Intel Thread Director and
+		// the Windows scheduler for P-core vs E-core placement on hybrid CPUs.
+		// Heavy shaders compile at normal priority (favouring P-cores); light
+		// shaders stay below-normal (allowing E-core placement).  On non-hybrid
+		// CPUs this still gives heavy compiles slightly more scheduler attention.
+		SetThreadPriority(GetCurrentThread(),
+			task.GetPriority() >= SIE::kHeavyPriorityThreshold ? THREAD_PRIORITY_NORMAL : THREAD_PRIORITY_BELOW_NORMAL);
+
+		LARGE_INTEGER start, end, freq;
+		QueryPerformanceFrequency(&freq);
+		QueryPerformanceCounter(&start);
+		const double queueWaitMs = task.GetEnqueuedQpc() > 0 ?
+		                               static_cast<double>(start.QuadPart - task.GetEnqueuedQpc()) * 1000.0 / freq.QuadPart :
+		                               0.0;
+
+		try {
+			task.Perform();
+		} catch (const std::exception& e) {
+			logger::error("Unhandled exception compiling shader task {}: {}", taskKey, e.what());
+			ResolvePendingFailure(taskKey);
+		} catch (...) {
+			logger::error("Unhandled non-standard exception compiling shader task {}", taskKey);
+			ResolvePendingFailure(taskKey);
+		}
+
+		QueryPerformanceCounter(&end);
+		const double elapsedMs = static_cast<double>(end.QuadPart - start.QuadPart) * 1000.0 / freq.QuadPart;
+		const uint64_t remaining = compilationSet.totalTasks - compilationSet.completedTasks.load(std::memory_order_relaxed) - compilationSet.failedTasks.load(std::memory_order_relaxed);
+
+		// Proxy for permutation complexity: descriptor low 32 bits from GetId(); popcount = active defines.
+		// Shader file size provides a secondary signal for source complexity.
+		const auto descriptorComplexity = std::popcount(static_cast<uint32_t>(task.GetId()));
+		uintmax_t sourceBytes = 0;
+		{
+			// GetString() format: "fxpFilename:ShaderClass:defines" — filename is before the first colon.
+			const auto taskStr = task.GetString();
+			const auto sep = taskStr.find(':');
+			if (sep != std::string::npos) {
+				const auto shaderName = taskStr.substr(0, sep);
+				if (auto path = SIE::SShaderCache::GetShaderPath(shaderName); !path.empty()) {
+					std::error_code ec;
+					sourceBytes = std::filesystem::file_size(path, ec);
+				}
+			}
+		}
+
+		// Debug: full per-task record for post-mortem straggler analysis.
+		logger::debug("[ShaderTiming] {:.0f}ms | queue_wait={:.0f}ms | remaining={} | defines={} | src={}B | prio={} | tid={} | {}",
+			elapsedMs, queueWaitMs, remaining, descriptorComplexity, sourceBytes,
+			task.GetPriority(), GetCurrentThreadId(), taskKey);
+
+		constexpr double kSlowMs = 2000.0;
+		constexpr double kVerySlowMs = 8000.0;
+
+		// Record every task for post-mortem analysis and developer UI (top-N display).
+		{
+			std::lock_guard lock(compilationSet.slowTasksMutex);
+			compilationSet.slowTaskRecords.push_back({ taskKey, elapsedMs, queueWaitMs, task.GetPriority(),
+				static_cast<int>(descriptorComplexity), sourceBytes });
+		}
+
+		if (elapsedMs >= kVerySlowMs) {
+			compilationSet.verySlowTasks++;
+			compilationSet.slowTasks++;
+			logger::info("[ShaderTiming] Very slow {:.0f}ms | queue_wait={:.0f}ms | remaining={} | defines={} | src={}B | prio={} | {}",
+				elapsedMs, queueWaitMs, remaining, descriptorComplexity, sourceBytes, task.GetPriority(), taskKey);
+		} else if (elapsedMs >= kSlowMs) {
+			compilationSet.slowTasks++;
+			logger::debug("[ShaderTiming] Slow {:.0f}ms | queue_wait={:.0f}ms | remaining={} | defines={} | src={}B | prio={} | {}",
+				elapsedMs, queueWaitMs, remaining, descriptorComplexity, sourceBytes, task.GetPriority(), taskKey);
+		}
 
 		if (stoken.stop_requested()) {
 			return;
@@ -2744,7 +2970,8 @@ namespace SIE
 		const RE::BSShader& aShader,
 		uint32_t aDescriptor) :
 		shaderClass(aShaderClass),
-		shader(aShader), descriptor(aDescriptor)
+		shader(aShader), descriptor(aDescriptor),
+		cachedPriority(ComputePriority(aShaderClass, aShader, aDescriptor))
 	{}
 
 	void ShaderCompilationTask::Perform() const
@@ -2777,6 +3004,72 @@ namespace SIE
 		return GetId() == other.GetId();
 	}
 
+	int ShaderCompilationTask::ComputePriority(ShaderClass shaderClass, const RE::BSShader& shader, uint32_t descriptor)
+	{
+		int priority = 0;
+		const auto type = shader.shaderType.get();
+
+		// Base priority by shader type — Lighting is consistently the slowest
+		// (123KB source, 12s+ compile), followed by Effect (~31KB, up to 12s).
+		switch (type) {
+		case RE::BSShader::Type::Lighting:
+			priority += 1000;
+			break;
+		case RE::BSShader::Type::Effect:
+			priority += 500;
+			break;
+		case RE::BSShader::Type::Water:
+			priority += 300;
+			break;
+		default:
+			break;
+		}
+
+		// Pixel shaders compile significantly slower than vertex shaders
+		if (shaderClass == ShaderClass::Pixel)
+			priority += 200;
+
+		// More active descriptor bits → more #defines → more code paths for the compiler
+		priority += std::popcount(descriptor) * 30;
+
+		// Known heavy Lighting techniques and flags from straggler analysis
+		if (type == RE::BSShader::Type::Lighting) {
+			const auto technique = static_cast<ShaderCache::LightingShaderTechniques>(0x3F & (descriptor >> 24));
+
+			// LANDSCAPE techniques (MTLand, MTLandLODBlend) are among the heaviest
+			// due to multi-texture blending codegen — regularly 60-130s compile times
+			if (technique == ShaderCache::LightingShaderTechniques::MTLand ||
+				technique == ShaderCache::LightingShaderTechniques::MTLandLODBlend)
+				priority += 500;
+			if (technique == ShaderCache::LightingShaderTechniques::Parallax ||
+				technique == ShaderCache::LightingShaderTechniques::ParallaxOcc)
+				priority += 300;
+			if (technique == ShaderCache::LightingShaderTechniques::Eye)
+				priority += 200;
+			if (technique == ShaderCache::LightingShaderTechniques::MultilayerParallax)
+				priority += 200;
+
+			// TRUE_PBR and ANISO_LIGHTING are the dominant cost drivers,
+			// especially in combination with LANDSCAPE (115-130s observed)
+			if (descriptor & static_cast<uint32_t>(ShaderCache::LightingShaderFlags::TruePbr))
+				priority += 500;
+			if (descriptor & static_cast<uint32_t>(ShaderCache::LightingShaderFlags::AnisoLighting))
+				priority += 300;
+			// Deferred adds extra codegen overhead
+			if (descriptor & static_cast<uint32_t>(ShaderCache::LightingShaderFlags::Deferred))
+				priority += 200;
+
+			// LANDSCAPE + TRUE_PBR combination triggers extreme register pressure
+			// (6x unrolled texture layers * PBR params = 30+ textures, 180s+ compile)
+			if ((technique == ShaderCache::LightingShaderTechniques::MTLand ||
+					technique == ShaderCache::LightingShaderTechniques::MTLandLODBlend) &&
+				(descriptor & static_cast<uint32_t>(ShaderCache::LightingShaderFlags::TruePbr)))
+				priority += 500;
+		}
+
+		return priority;
+	}
+
 	std::optional<ShaderCompilationTask> CompilationSet::WaitTake(std::stop_token stoken)
 	{
 		std::unique_lock lock(compilationMutex);
@@ -2784,8 +3077,9 @@ namespace SIE
 		if (!conditionVariable.wait(
 				lock, stoken,
 				[this, &shaderCache]() { return !availableTasks.empty() &&
-			                                    // check against all tasks in queue to trickle the work. It cannot be the active tasks count because the thread pool itself is maximum.
-			                                    (int)shaderCache->compilationPool.get_tasks_total() <=
+			                                    // Dispatch when pool has room. Use < (not <=) so that after
+			                                    // push_task() the total never exceeds the limit.
+			                                    (int)shaderCache->compilationPool.get_tasks_total() <
 			                                        (!shaderCache->backgroundCompilation ? shaderCache->compilationThreadCount : shaderCache->backgroundCompilationThreadCount); })) {
 			/*Woke up because of a stop request. */
 			return std::nullopt;
@@ -2794,9 +3088,27 @@ namespace SIE
 			QueryPerformanceCounter(&lastReset);
 			lastCalculation = lastReset;
 		}
-		auto node = availableTasks.extract(availableTasks.begin());
-		auto& task = node.value();
-		tasksInProgress.insert(std::move(node));
+
+		// Startup policy: keep dispatching the hardest queued work first.
+		// This preserves the existing priority score while preventing light tasks
+		// from bypassing queued heavy shaders and stretching the tail.
+		auto bestIt = availableTasks.end();
+		if (!availableTasks.empty()) {
+			bestIt = std::prev(availableTasks.end());
+		}
+
+		if (bestIt == availableTasks.end()) {
+			return std::nullopt;
+		}
+
+		ShaderCompilationTask task = *bestIt;
+		availableTasks.erase(bestIt);
+
+		if (task.GetPriority() >= kHeavyPriorityThreshold) {
+			heavyTasksInFlight.fetch_add(1, std::memory_order_relaxed);
+		}
+
+		tasksInProgress.insert(task);
 		return task;
 	}
 
@@ -2806,11 +3118,16 @@ namespace SIE
 		auto inProgressIt = tasksInProgress.find(task);
 		auto processedIt = processedTasks.find(task);
 		if (inProgressIt == tasksInProgress.end() && processedIt == processedTasks.end() && !globals::shaderCache->GetCompletedShader(task)) {
-			auto [availableIt, wasAdded] = availableTasks.insert(task);
+			LARGE_INTEGER now;
+			QueryPerformanceCounter(&now);
+			auto queuedTask = task;
+			queuedTask.SetEnqueuedQpc(now.QuadPart);
+			auto [_, wasAdded] = availableTasks.insert(queuedTask);
 			lock.unlock();
 			if (wasAdded) {
 				conditionVariable.notify_one();
 				totalTasks++;
+				totalPriorityWeight += static_cast<uint64_t>(task.GetPriority()) + 1;
 			}
 		}
 	}
@@ -2836,6 +3153,17 @@ namespace SIE
 				logger::debug("Compiling Task failed: {}", key);
 				failedTasks++;
 			}
+			completedPriorityWeight += static_cast<uint64_t>(task.GetPriority()) + 1;
+
+			// Track heavy task completion for P-core concurrency limiting
+			if (task.GetPriority() >= kHeavyPriorityThreshold) {
+				auto current = heavyTasksInFlight.load(std::memory_order_relaxed);
+				while (current > 0 &&
+					   !heavyTasksInFlight.compare_exchange_weak(current, current - 1,
+						   std::memory_order_relaxed,
+						   std::memory_order_relaxed)) {
+				}
+			}
 
 			// Update timing
 			LARGE_INTEGER now;
@@ -2873,36 +3201,49 @@ namespace SIE
 		completedTasks = 0;
 		failedTasks = 0;
 		cacheHitTasks = 0;
+		slowTasks = 0;
+		verySlowTasks = 0;
+		totalPriorityWeight = 0;
+		completedPriorityWeight = 0;
+		heavyTasksInFlight = 0;
 		QueryPerformanceCounter(&lastReset);
 		QueryPerformanceCounter(&lastCalculation);
 		completionTime = { 0 };  // Reset completion time
 		totalTime = { 0 };
+		{
+			std::lock_guard slowLock(slowTasksMutex);
+			slowTaskRecords.clear();
+		}
 	}
 
 	std::string CompilationSet::GetHumanTime(double a_totalMs)
 	{
-		int milliseconds = static_cast<int>(a_totalMs);
-		int seconds = milliseconds / 1000;
-		int minutes = seconds / 60;
-		seconds %= 60;
-		int hours = minutes / 60;
-		minutes %= 60;
-
-		return fmt::format("{:02}:{:02}:{:02}", hours, minutes, seconds);
+		return Util::FormatDuration(a_totalMs);
 	}
 
 	double CompilationSet::GetEta()
 	{
-		// For ETA calculation, we still use the active compilation time (totalTime)
-		// because it reflects the actual work time, not wall-clock time
-		double totalMs = static_cast<double>(totalTime.QuadPart) * 1000.0 / frequency.QuadPart;
+		// Use wall-clock elapsed time since compilation started
+		LARGE_INTEGER now;
+		QueryPerformanceCounter(&now);
+		int64_t endTime = (completionTime.load(std::memory_order_relaxed) != 0) ? completionTime.load(std::memory_order_relaxed) : now.QuadPart;
+		double elapsedMs = static_cast<double>(endTime - lastReset.QuadPart) * 1000.0 / frequency.QuadPart;
 
-		if (totalMs == 0.0) {
-			return 0.0;  // Avoid division by zero
-		}
-		auto rate = completedTasks / totalMs;
-		auto remaining = totalTasks - completedTasks - failedTasks;
-		return std::max(remaining / rate, 0.0);
+		if (elapsedMs <= 0.0)
+			return 0.0;
+
+		// Priority-weighted ETA: heavy tasks completing early should not inflate
+		// the estimate. We measure progress as a fraction of total priority weight
+		// completed, which accounts for the decreasing cost of remaining tasks.
+		double doneWeight = static_cast<double>(completedPriorityWeight.load(std::memory_order_relaxed));
+		double totalWeight = static_cast<double>(totalPriorityWeight.load(std::memory_order_relaxed));
+
+		if (doneWeight <= 0.0 || totalWeight <= 0.0)
+			return 0.0;
+
+		double fractionDone = doneWeight / totalWeight;
+		double estimatedTotalMs = elapsedMs / fractionDone;
+		return std::max(estimatedTotalMs - elapsedMs, 0.0);
 	}
 
 	std::string CompilationSet::GetStatsString(bool a_timeOnly, bool a_elapsedOnly)
@@ -2927,7 +3268,7 @@ namespace SIE
 			}
 		}
 
-		return fmt::format("{}/{} (successful/total)\tfailed: {}\tcachehits: {}\nElapsed/Estimated Time: {}/{}",
+		return fmt::format("{}/{} (successful/total)\tfailed: {}\tdeduplicated: {}\nElapsed/Estimated Time: {}/{}",
 			(std::uint64_t)completedTasks,
 			(std::uint64_t)totalTasks,
 			(std::uint64_t)failedTasks,
diff --git a/src/ShaderCache.h b/src/ShaderCache.h
index 89bcedf92d..72d4c6cec8 100644
--- a/src/ShaderCache.h
+++ b/src/ShaderCache.h
@@ -4,6 +4,8 @@
 #include <efsw/efsw.hpp>
 #include <vector>
 
+#include "Utils/WinApi.h"
+
 using namespace std::chrono;
 
 namespace ShaderConstants
@@ -218,12 +220,24 @@ namespace SIE
 		size_t GetId() const;
 		std::string GetString() const;
 
+		/// LPT scheduling score: higher = more expensive = should be dispatched first.
+		/// Based on shader type, class, descriptor complexity, and known heavy defines.
+		/// Computed once at construction and cached.
+		int GetPriority() const { return cachedPriority; }
+		void SetEnqueuedQpc(int64_t qpc) { enqueuedQpc = qpc; }
+		int64_t GetEnqueuedQpc() const { return enqueuedQpc; }
+
 		bool operator==(const ShaderCompilationTask& other) const;
 
 	protected:
 		ShaderClass shaderClass;
 		const RE::BSShader& shader;
 		uint32_t descriptor;
+
+	private:
+		static int ComputePriority(ShaderClass shaderClass, const RE::BSShader& shader, uint32_t descriptor);
+		int cachedPriority;
+		int64_t enqueuedQpc = 0;
 	};
 }
 
@@ -236,8 +250,24 @@ struct std::hash<SIE::ShaderCompilationTask>
 	}
 };
 
+struct TaskPriorityLess
+{
+	bool operator()(const SIE::ShaderCompilationTask& a, const SIE::ShaderCompilationTask& b) const
+	{
+		if (a.GetPriority() != b.GetPriority()) {
+			return a.GetPriority() < b.GetPriority();
+		}
+		return a.GetId() < b.GetId();
+	}
+};
+
 namespace SIE
 {
+	/// Threshold above which a shader task is considered "heavy" and benefits
+	/// from P-core placement on hybrid CPUs. Used for thread-priority hints,
+	/// telemetry, and developer-facing diagnostics.
+	constexpr int kHeavyPriorityThreshold = 500;
+
 	class CompilationSet
 	{
 	public:
@@ -259,19 +289,61 @@ namespace SIE
 		void Add(const ShaderCompilationTask& task);
 		void Complete(const ShaderCompilationTask& task);
 		void Clear();
-		std::string GetHumanTime(double a_totalMs);
+		static std::string GetHumanTime(double a_totalMs);
 		double GetEta();
 		std::string GetStatsString(bool a_timeOnly = false, bool a_elapsedOnly = false);
 		std::atomic<uint64_t> completedTasks = 0;
 		std::atomic<uint64_t> totalTasks = 0;
 		std::atomic<uint64_t> failedTasks = 0;
-		std::atomic<uint64_t> cacheHitTasks = 0;  // number of compiles of a previously seen shader combo
+		std::atomic<uint64_t> cacheHitTasks = 0;            // number of compiles of a previously seen shader combo
+		std::atomic<uint64_t> slowTasks = 0;                // shaders taking >= 2s
+		std::atomic<uint64_t> verySlowTasks = 0;            // shaders taking >= 8s
+		std::atomic<uint64_t> totalPriorityWeight = 0;      // sum of (GetPriority()+1) for all queued tasks
+		std::atomic<uint64_t> completedPriorityWeight = 0;  // sum of (GetPriority()+1) for completed/failed tasks
+		std::atomic<uint32_t> heavyTasksInFlight = 0;       // number of dispatched heavy (>= kHeavyPriorityThreshold) tasks still running
 		std::mutex compilationMutex;
 
+		/// Per-task timing record stored for post-mortem analysis and developer UI.
+		struct SlowTaskRecord
+		{
+			std::string key;  // ShaderCompilationTask::GetString() — "fxpFile:Class:defines"
+			double elapsedMs = 0.0;
+			double queueWaitMs = 0.0;
+			int priority = 0;               // estimated compile weight (see ComputePriority)
+			int defineCount = 0;            // popcount of descriptor — active define permutations
+			uintmax_t sourceSizeBytes = 0;  // HLSL source file size at compile time
+		};
+
+		/// On-demand parallelism metrics derived from task timings.
+		struct ParallelismStats
+		{
+			double workMs = 0.0;                  // W = sum of all task times
+			double spanMs = 0.0;                  // S ~= longest single task
+			double makespanMs = 0.0;              // T_p = wall-clock compile duration
+			double avgParallelism = 0.0;          // W / S
+			double infiniteCoreEfficiency = 0.0;  // S / T_p
+			double infiniteCoreGapPercent = 0.0;  // 100 * (1 - S / T_p)
+			double avgQueueWaitMs = 0.0;          // average enqueue -> dispatch delay
+			double maxQueueWaitMs = 0.0;          // worst enqueue -> dispatch delay
+			size_t sampleCount = 0;
+		};
+
+		/// All per-task timing records for this build (appended from multiple threads).
+		/// Protected by slowTasksMutex.
+		std::vector<SlowTaskRecord> slowTaskRecords;
+		mutable std::mutex slowTasksMutex;
+
+		/// Returns a copy of the N records with the highest elapsedMs, sorted descending.
+		std::vector<SlowTaskRecord> GetTopSlowTasks(size_t n = 3) const;
+
+		/// Computes parallelism metrics on demand from collected task timings.
+		std::optional<ParallelismStats> GetParallelismStats() const;
+
 	private:
-		std::unordered_set<ShaderCompilationTask> availableTasks;
-		std::unordered_set<ShaderCompilationTask> tasksInProgress;
-		std::unordered_set<ShaderCompilationTask> processedTasks;  // completed or failed
+		/// Tasks awaiting dispatch, ordered by cached priority and task id.
+		std::set<ShaderCompilationTask, TaskPriorityLess> availableTasks;
+		std::set<ShaderCompilationTask, TaskPriorityLess> tasksInProgress;
+		std::set<ShaderCompilationTask, TaskPriorityLess> processedTasks;  // completed or failed
 		std::condition_variable_any conditionVariable;
 	};
 
@@ -394,6 +466,15 @@ namespace SIE
 		bool Clear(const std::string& a_path);
 
 		bool AddCompletedShader(ShaderClass shaderClass, const RE::BSShader& shader, uint32_t descriptor, ID3DBlob* a_blob);
+
+		enum class ClaimResult
+		{
+			CacheHit,  // Already compiled; use the returned blob
+			Claimed    // Claimed as Pending; caller must compile and call AddCompletedShader
+		};
+		std::pair<ClaimResult, ID3DBlob*> ClaimCompilation(const std::string& key);
+		void ResolvePendingFailure(const std::string& key);
+
 		ID3DBlob* GetCompletedShader(const std::string& a_key);
 		ID3DBlob* GetCompletedShader(const SIE::ShaderCompilationTask& a_task);
 		ID3DBlob* GetCompletedShader(ShaderClass shaderClass, const RE::BSShader& shader, uint32_t descriptor);
@@ -432,6 +513,15 @@ namespace SIE
 		void IterateShaderBlock(bool a_forward = true);
 		bool IsHideErrors();
 
+		// Overlay stats
+		int GetHeavyTasksInFlight();
+		uint64_t GetSlowTasks();
+		uint64_t GetVerySlowTasks();
+
+		/// Returns a copy of the top-N slowest task records from the last build, sorted descending.
+		std::vector<CompilationSet::SlowTaskRecord> GetTopSlowTasks(size_t n = 3);
+		std::optional<CompilationSet::ParallelismStats> GetParallelismStats();
+
 		/**
 		 * @brief Clears all shaders of a specific type from the shader map.
 		 *
@@ -445,9 +535,13 @@ namespace SIE
 
 		ShaderFileDependencyTracker* GetDependencyTracker() { return dependencyTracker.get(); }
 
-		int32_t compilationThreadCount = std::max({ static_cast<int32_t>(std::thread::hardware_concurrency()) - 4, static_cast<int32_t>(std::thread::hardware_concurrency()) * 3 / 4, 1 });
-		int32_t backgroundCompilationThreadCount = std::max(static_cast<int32_t>(std::thread::hardware_concurrency()) / 2, 1);
-		BS::thread_pool<> compilationPool{};
+		// Use all logical cores minus one at startup for OS headroom (E-cores included).
+		// Management and file watcher run on dedicated jthreads, not pool slots.
+		// Background (in-game): half of P-cores only, to avoid starving the render thread.
+		int32_t compilationThreadCount = std::max(static_cast<int32_t>(std::thread::hardware_concurrency()) - 1, 1);
+		int32_t backgroundCompilationThreadCount = std::max(static_cast<int32_t>(Util::GetPerformanceCoreCount()) / 2, 1);
+		BS::thread_pool<> compilationPool{ static_cast<std::size_t>(compilationThreadCount) };
+		std::jthread managementJthread;  // dedicated thread for ManageCompilationSet (not in pool)
 		bool backgroundCompilation = false;
 		bool menuLoaded = false;
 
@@ -717,6 +811,7 @@ namespace SIE
 		CompilationSet compilationSet;
 		ankerl::unordered_dense::map<std::string, ShaderCacheResult> shaderMap{};
 		std::mutex mapMutex;                                                                      // guard for shaderMap
+		std::condition_variable mapCV;                                                            // signalled when a Pending entry transitions to Completed/Failed
 		ankerl::unordered_dense::map<std::string, system_clock::time_point> modifiedShaderMap{};  // hashmap when a shader source file last modified
 		std::mutex modifiedMapMutex;                                                              // guard for modifiedShaderMap
 		ankerl::unordered_dense::map<std::string, std::set<hlslRecord>> hlslToShaderMap{};        // hashmap linking specific hlsl files to shader keys in shaderMap
@@ -756,6 +851,8 @@ namespace SIE
 		void processQueue();
 		void handleFileAction(efsw::WatchID, const std::string& dir, const std::string& filename, efsw::Action action, std::string) override;
 
+		std::jthread fileWatcherThread;  // dedicated thread for processQueue (not in pool)
+
 	private:
 		ShaderFileDependencyTracker* deps;
 		struct fileAction
diff --git a/src/Utils/Format.cpp b/src/Utils/Format.cpp
index 61f1d91195..2b0d88905b 100644
--- a/src/Utils/Format.cpp
+++ b/src/Utils/Format.cpp
@@ -158,6 +158,21 @@ namespace Util
 		}
 	}
 
+	std::string FormatDuration(double ms)
+	{
+		// Validate input: handle negative, NaN, and infinite values
+		if (!std::isfinite(ms) || ms < 0.0) {
+			return "00:00:00";
+		}
+
+		// Use int64_t to avoid overflow on long durations (>596 hours with int)
+		int64_t total_s = static_cast<int64_t>(ms) / 1000;
+		int64_t hours = total_s / 3600;
+		int64_t minutes = (total_s % 3600) / 60;
+		int64_t seconds = total_s % 60;
+		return fmt::format("{:02}:{:02}:{:02}", hours, minutes, seconds);
+	}
+
 	std::string TimeAgoString(std::chrono::steady_clock::time_point last)
 	{
 		using namespace std::chrono;
diff --git a/src/Utils/Format.h b/src/Utils/Format.h
index 5448581c7c..dcee025bce 100644
--- a/src/Utils/Format.h
+++ b/src/Utils/Format.h
@@ -66,6 +66,17 @@ namespace Util
 	 */
 	std::string FormatTimeAgo(std::filesystem::file_time_type fileTime);
 
+	/**
+	 * Formats a duration given in milliseconds as HH:MM:SS.
+	 * Suitable for displaying long-running operation times (e.g. shader compilation).
+	 *
+	 * @param ms Duration in milliseconds. Fractional milliseconds are truncated.
+	 *           Non-finite (NaN/inf) or negative values are clamped to "00:00:00".
+	 *           Durations >= 24 hours display hours without limit (e.g., "125:34:56").
+	 * @return Formatted string like "00:02:35" or "00:00:00" for invalid inputs
+	 */
+	std::string FormatDuration(double ms);
+
 	/**
 	 * Formats a delta value with percentage difference for A/B test comparisons.
 	 * Returns a string like "+0.45 ms (+12.3%)" or "-0.23 ms (-8.1%)".
diff --git a/src/Utils/WinApi.cpp b/src/Utils/WinApi.cpp
index cef816e213..7f3e810015 100644
--- a/src/Utils/WinApi.cpp
+++ b/src/Utils/WinApi.cpp
@@ -27,4 +27,46 @@ namespace Util
 
 		return std::nullopt;
 	}
+
+	uint32_t GetPerformanceCoreCount()
+	{
+		// Cache the result — CPU topology never changes at runtime.
+		// C++11 guarantees thread-safe initialisation of static locals.
+		static const uint32_t cached = []() -> uint32_t {
+			const uint32_t fallback = std::max(1u, std::thread::hardware_concurrency());
+
+			DWORD size = 0;
+			GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &size);
+			if (GetLastError() != ERROR_INSUFFICIENT_BUFFER || size == 0)
+				return fallback;
+
+			std::vector<uint8_t> buf(size);
+			auto* info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buf.data());
+			if (!GetLogicalProcessorInformationEx(RelationProcessorCore, info, &size))
+				return fallback;
+
+			// First pass: find the highest efficiency class present.
+			BYTE maxClass = 0;
+			for (DWORD offset = 0; offset < size;) {
+				auto* entry = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buf.data() + offset);
+				if (entry->Processor.EfficiencyClass > maxClass)
+					maxClass = entry->Processor.EfficiencyClass;
+				offset += entry->Size;
+			}
+
+			// Second pass: count logical processors on those (P-)cores.
+			uint32_t count = 0;
+			for (DWORD offset = 0; offset < size;) {
+				auto* entry = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buf.data() + offset);
+				if (entry->Processor.EfficiencyClass == maxClass) {
+					for (WORD g = 0; g < entry->Processor.GroupCount; ++g)
+						count += static_cast<uint32_t>(std::popcount(entry->Processor.GroupMask[g].Mask));
+				}
+				offset += entry->Size;
+			}
+
+			return count > 0 ? count : fallback;
+		}();
+		return cached;
+	}
 }  // namespace Util
diff --git a/src/Utils/WinApi.h b/src/Utils/WinApi.h
index cf048b3d31..3b21ccd8a0 100644
--- a/src/Utils/WinApi.h
+++ b/src/Utils/WinApi.h
@@ -3,4 +3,10 @@
 namespace Util
 {
 	std::optional<REL::Version> GetDllVersion(const std::wstring& dllPath);
+
+	/// Returns the number of logical processors on the highest-efficiency cores
+	/// (P-cores on Intel hybrid CPUs). On non-hybrid CPUs all cores share the
+	/// same efficiency class, so this returns std::thread::hardware_concurrency().
+	/// Falls back to hardware_concurrency() on any API failure.
+	uint32_t GetPerformanceCoreCount();
 }  // namespace Util