diff --git a/src/Features/UnifiedWater/WaterCache.cpp b/src/Features/UnifiedWater/WaterCache.cpp index 2da8f1cbc6..a2addeaf97 100644 --- a/src/Features/UnifiedWater/WaterCache.cpp +++ b/src/Features/UnifiedWater/WaterCache.cpp @@ -2,6 +2,8 @@ #include +#include "Utils/WinApi.h" + bool WaterCache::SetCurrentWorldSpace(const RE::TESWorldSpace* worldSpace) { if (!worldSpace) @@ -214,7 +216,8 @@ bool WaterCache::GenerateCaches() } } - const unsigned hw = std::max(1u, std::thread::hardware_concurrency()); + // Use P-core logical count on Intel hybrid CPUs; falls back to hardware_concurrency() on non-hybrid. + const unsigned hw = Util::GetPerformanceCoreCount(); const unsigned threads = std::max(1u, hw > 4 ? hw - 4 : (hw * 3) / 4); async.pool = std::make_unique>(threads); diff --git a/src/Menu/AdvancedSettingsRenderer.cpp b/src/Menu/AdvancedSettingsRenderer.cpp index f985f6f680..85310a0ec4 100644 --- a/src/Menu/AdvancedSettingsRenderer.cpp +++ b/src/Menu/AdvancedSettingsRenderer.cpp @@ -126,15 +126,16 @@ void AdvancedSettingsRenderer::RenderLoggingSection() ImGui::SliderInt("Compiler Threads", &shaderCache->compilationThreadCount, 1, static_cast(std::thread::hardware_concurrency())); if (auto _tt = Util::HoverTooltipWrapper()) { ImGui::Text( - "Number of threads to use to compile shaders. " - "The more threads the faster compilation will finish but may make the system unresponsive. "); + "Number of threads used to compile shaders at startup. " + "Defaults to all logical cores minus one for OS headroom (E-cores included). " + "Higher values finish compilation faster but may make the system less responsive."); } ImGui::SliderInt("Background Compiler Threads", &shaderCache->backgroundCompilationThreadCount, 1, static_cast(std::thread::hardware_concurrency())); if (auto _tt = Util::HoverTooltipWrapper()) { ImGui::Text( - "Number of threads to use to compile shaders while playing game. " - "This is activated if the startup compilation is skipped. " - "The more threads the faster compilation will finish but may make the system unresponsive. "); + "Number of threads used to compile shaders during gameplay. " + "Defaults to half of performance cores to avoid impacting the render thread. " + "Higher values finish compilation faster but may cause stuttering."); } ImGui::Columns(2, nullptr, false); @@ -552,6 +553,103 @@ void AdvancedSettingsRenderer::RenderDeveloperSection() // Statistics section (moved from Advanced/Logging) if (ImGui::TreeNodeEx("Statistics", ImGuiTreeNodeFlags_DefaultOpen)) { ImGui::Text(std::format("Shader Compiler : {}", shaderCache->GetShaderStatsString()).c_str()); + + // Derived parallelism metrics are computed lazily on demand and only shown + // once compilation has completed to avoid per-frame analysis while compiling. + if (!shaderCache->IsCompiling()) { + auto parallelism = shaderCache->GetParallelismStats(); + if (parallelism.has_value()) { + const auto& p = parallelism.value(); + ImGui::Spacing(); + ImGui::TextDisabled("Parallelism (derived from %zu compiled tasks)", p.sampleCount); + if (auto _tt = Util::HoverTooltipWrapper()) { + ImGui::Text("Computed lazily from the last completed build."); + ImGui::Text("Only evaluated when this Statistics section is open."); + } + ImGui::Text("Work (W, sum of task wall times): %s", Util::FormatDuration(p.workMs).c_str()); + if (auto _tt = Util::HoverTooltipWrapper()) { + ImGui::Text("Total compile work: sum of all per-shader wall-clock compile times."); + ImGui::Text("This is not CPU time; it is accumulated task elapsed time."); + ImGui::Text("Equivalent serial time on one worker if overhead stayed the same."); + } + ImGui::Text("Span (S, longest): %s", Util::FormatDuration(p.spanMs).c_str()); + if (auto _tt = Util::HoverTooltipWrapper()) { + ImGui::Text("Critical-path lower bound, approximated by the single slowest shader."); + ImGui::Text("Even infinite cores cannot finish faster than this."); + } + ImGui::Text("Makespan (T_p): %s", Util::FormatDuration(p.makespanMs).c_str()); + if (auto _tt = Util::HoverTooltipWrapper()) { + ImGui::Text("Observed wall-clock duration for the full shader build."); + } + ImGui::Text("Queue wait (avg/max): %s / %s", + Util::FormatDuration(p.avgQueueWaitMs).c_str(), + Util::FormatDuration(p.maxQueueWaitMs).c_str()); + if (auto _tt = Util::HoverTooltipWrapper()) { + ImGui::Text("Time spent waiting in the ready queue before a worker started compilation."); + ImGui::Text("Useful for identifying scheduler-induced delay separate from compile cost."); + } + ImGui::Text("Average parallelism (W/S): %.2fx", p.avgParallelism); + if (auto _tt = Util::HoverTooltipWrapper()) { + ImGui::Text("Average useful concurrency in this workload."); + ImGui::Text("Roughly the worker count where adding more cores gives diminishing returns."); + } + ImGui::Text("Infinite-core efficiency (S/T_p): %.1f%%", 100.0 * p.infiniteCoreEfficiency); + if (auto _tt = Util::HoverTooltipWrapper()) { + ImGui::Text("How close runtime is to the infinite-core lower bound."); + ImGui::Text("100%% means T_p == S."); + } + ImGui::Text("Infinite-core gap: %.1f%%", p.infiniteCoreGapPercent); + if (auto _tt = Util::HoverTooltipWrapper()) { + ImGui::Text("Distance from ideal infinite-core time."); + ImGui::Text("Defined as 100 * (1 - S / T_p). Lower is better."); + } + + ImGui::Spacing(); + ImGui::TextDisabled("Infinite-core efficiency"); + float efficiency = static_cast(std::clamp(p.infiniteCoreEfficiency, 0.0, 1.0)); + ImGui::ProgressBar(efficiency, ImVec2(-1.0f, 0.0f), std::format("{:.1f}% efficient / {:.1f}% gap", 100.0 * p.infiniteCoreEfficiency, p.infiniteCoreGapPercent).c_str()); + + ImGui::Spacing(); + ImGui::TextDisabled("Relative durations (normalized)"); + double maxMs = std::max({ p.workMs, p.spanMs, p.makespanMs, 1.0 }); + auto drawRelativeBar = [maxMs](const char* label, double value) { + float ratio = static_cast(std::clamp(value / maxMs, 0.0, 1.0)); + ImGui::TextUnformatted(label); + ImGui::SameLine(); + ImGui::ProgressBar(ratio, ImVec2(-1.0f, 0.0f), std::format("{} ({:.1f}%)", Util::FormatDuration(value), 100.0 * ratio).c_str()); + }; + drawRelativeBar("Span (S)", p.spanMs); + drawRelativeBar("Makespan (T_p)", p.makespanMs); + drawRelativeBar("Work (W)", p.workMs); + } + } + + // Top-3 slowest shaders from the last build + auto topSlow = shaderCache->GetTopSlowTasks(3); + if (!topSlow.empty()) { + ImGui::Spacing(); + ImGui::TextDisabled("Top %zu Slowest Shaders (last build)", topSlow.size()); + for (size_t i = 0; i < topSlow.size(); ++i) { + const auto& rec = topSlow[i]; + ImGui::Text("#%zu %s (weight %d)", i + 1, + Util::FormatDuration(rec.elapsedMs).c_str(), rec.priority); + ImGui::SameLine(); + ImGui::TextDisabled("%s", rec.key.c_str()); + if (ImGui::IsItemHovered()) { + if (auto _tt = Util::HoverTooltipWrapper()) { + ImGui::Text("%s", rec.key.c_str()); + } + } + // Allow copying the full key with a right-click + if (ImGui::BeginPopupContextItem(std::format("##slowcopy{}", i).c_str())) { + if (ImGui::MenuItem("Copy key")) { + ImGui::SetClipboardText(rec.key.c_str()); + } + ImGui::EndPopup(); + } + } + } + ImGui::TreePop(); } diff --git a/src/Menu/OverlayRenderer.cpp b/src/Menu/OverlayRenderer.cpp index c80bf97bed..e7ccaeb5b5 100644 --- a/src/Menu/OverlayRenderer.cpp +++ b/src/Menu/OverlayRenderer.cpp @@ -168,6 +168,23 @@ void OverlayRenderer::RenderShaderCompilationStatus(const std::functionIsDeveloperMode()) { + int32_t threadLimit = shaderCache->backgroundCompilation ? shaderCache->backgroundCompilationThreadCount : shaderCache->compilationThreadCount; + int compilationRunning = (int)shaderCache->compilationPool.get_tasks_running(); + int heavyInFlight = shaderCache->GetHeavyTasksInFlight(); + int heavyLimit = static_cast(Util::GetPerformanceCoreCount()); + uint64_t slow = shaderCache->GetSlowTasks(); + uint64_t verySlow = shaderCache->GetVerySlowTasks(); + ImGui::Text("Threads: %d / %d limit | Heavy: %d / %d P-cores | %d workers", + compilationRunning, + threadLimit, + heavyInFlight, + heavyLimit, + (int)shaderCache->compilationPool.get_thread_count()); + if (slow > 0) { + ImGui::Text("Slow shaders: %llu (very slow: %llu)", slow, verySlow); + } + } if (!shaderCache->backgroundCompilation && shaderCache->menuLoaded) { auto skipShadersText = fmt::format( "Press {} to proceed without completing shader compilation. ", @@ -180,7 +197,9 @@ void OverlayRenderer::RenderShaderCompilationStatus(const std::functionGetTotalTasks() > 0) { ImGui::Text("Last shader cache build duration: %s", shaderCache->GetShaderStatsString(true, true).c_str()); + + // Stacked bar showing compilation breakdown + { + uint64_t total = shaderCache->GetTotalTasks(); + uint64_t completed = shaderCache->GetCompletedTasks(); + uint64_t failed = shaderCache->GetFailedTasks(); + uint64_t cacheHits = shaderCache->GetCachedHitTasks(); + uint64_t slow = shaderCache->GetSlowTasks(); + uint64_t verySlow = shaderCache->GetVerySlowTasks(); + // Compiled = tasks that actually went through compilation. + // Cache hits are separate (returned early without queueing). + uint64_t compiled = completed; + uint64_t fast = compiled > slow ? compiled - slow : 0; + uint64_t medium = slow > verySlow ? slow - verySlow : 0; // 2-8s + + struct Segment + { + uint64_t count; + ImU32 color; + const char* label; + }; + Segment segments[] = { + { cacheHits, IM_COL32(120, 120, 120, 255), "Deduplicated" }, + { fast, IM_COL32(80, 180, 80, 255), "Fast (<2s)" }, + { medium, IM_COL32(220, 180, 50, 255), "Slow (2-8s)" }, + { verySlow, IM_COL32(220, 60, 60, 255), "Very slow (>=8s)" }, + { failed, IM_COL32(160, 30, 30, 255), "Failed" }, + }; + + float barHeight = 14.0f * Util::GetUIScale(); + float barWidth = ImGui::GetContentRegionAvail().x; + ImVec2 cursor = ImGui::GetCursorScreenPos(); + ImDrawList* drawList = ImGui::GetWindowDrawList(); + + // Background + drawList->AddRectFilled(cursor, ImVec2(cursor.x + barWidth, cursor.y + barHeight), IM_COL32(40, 40, 40, 255)); + + // Draw segments + float x = cursor.x; + for (auto& seg : segments) { + if (seg.count == 0 || total == 0) + continue; + float segWidth = (static_cast(seg.count) / static_cast(total)) * barWidth; + if (segWidth < 1.0f) + segWidth = 1.0f; + drawList->AddRectFilled(ImVec2(x, cursor.y), ImVec2(x + segWidth, cursor.y + barHeight), seg.color); + x += segWidth; + } + + // Reserve space and handle tooltip + ImGui::Dummy(ImVec2(barWidth, barHeight)); + if (ImGui::IsItemHovered()) { + ImGui::BeginTooltip(); + for (auto& seg : segments) { + if (seg.count == 0) + continue; + float pct = total > 0 ? 100.0f * static_cast(seg.count) / static_cast(total) : 0.0f; + ImGui::TextColored(ImGui::ColorConvertU32ToFloat4(seg.color), "%s: %llu (%.1f%%)", seg.label, seg.count, pct); + } + ImGui::EndTooltip(); + } + } + + auto state = globals::state; + if (state->IsDeveloperMode()) { + ImGui::Text("Threads: %d compile, %d background, %d pool | P-cores: %d", + (int)shaderCache->compilationThreadCount, + (int)shaderCache->backgroundCompilationThreadCount, + (int)shaderCache->compilationPool.get_thread_count(), + (int)Util::GetPerformanceCoreCount()); + } } ImGui::EndTabItem(); diff --git a/src/ShaderCache.cpp b/src/ShaderCache.cpp index 1b3f3edd3d..9bdbb45e9b 100644 --- a/src/ShaderCache.cpp +++ b/src/ShaderCache.cpp @@ -1341,20 +1341,24 @@ namespace SIE return nullptr; } - // check hashmap auto& cache = ShaderCache::Instance(); - ID3DBlob* shaderBlob = cache.GetCompletedShader(shaderClass, shader, descriptor); - - if (shaderBlob) { - // already compiled before - logger::debug("Shader already compiled; using cache: {}", SShaderCache::GetShaderString(shaderClass, shader, descriptor)); + auto key = SShaderCache::GetShaderString(shaderClass, shader, descriptor, true); + + // Atomically check the shaderMap and either: + // - return the blob if already Completed (cache hit), + // - wait if another thread is compiling (Pending), + // - claim the slot with Pending if nobody started yet. + auto [claimResult, cachedBlob] = cache.ClaimCompilation(key); + if (claimResult == ShaderCache::ClaimResult::CacheHit) { cache.IncCacheHitTasks(); - return shaderBlob; + return cachedBlob; } + const auto type = shader.shaderType.get(); // check diskcache auto diskPath = GetDiskPath(shader.fxpFilename, descriptor, shaderClass); + ID3DBlob* shaderBlob = nullptr; if (useDiskCache && std::filesystem::exists(diskPath)) { // check build time of cache @@ -1405,6 +1409,7 @@ namespace SIE auto pathString = Util::WStringToString(path); if (!std::filesystem::exists(path)) { logger::error("Failed to compile {} shader {}::{:X}: {} does not exist", magic_enum::enum_name(shaderClass), magic_enum::enum_name(type), descriptor, pathString); + cache.AddCompletedShader(shaderClass, shader, descriptor, nullptr); return nullptr; } logger::debug("Compiling {} {}:{}:{:X} to {}", pathString, magic_enum::enum_name(type), magic_enum::enum_name(shaderClass), descriptor, MergeDefinesString(defines)); @@ -1926,11 +1931,16 @@ namespace SIE { Clear(); StopFileWatcher(); + // Signal management thread to stop dispatching; pool workers observe the same + // stop token and will not pick up new tasks after current compilations finish. + HANDLE managementHandle = managementJthread.native_handle(); + managementJthread.request_stop(); + // Purge unstarted tasks so we only wait for compilations already in flight. + compilationPool.purge(); if (!compilationPool.wait_for(std::chrono::milliseconds(1000))) { - logger::info("Tasks still running despite request to stop; killing thread {}!", GetThreadId(managementThread)); - WaitForSingleObject(managementThread, 1000); - TerminateThread(managementThread, 0); - CloseHandle(managementThread); + logger::info("Tasks still running despite request to stop; killing management thread {}!", GetThreadId(managementHandle)); + WaitForSingleObject(managementHandle, 1000); + TerminateThread(managementHandle, 0); } } @@ -2102,6 +2112,7 @@ namespace SIE std::unique_lock lockM{ mapMutex }; shaderMap.insert_or_assign(key, ShaderCacheResult{ a_blob, status, system_clock::now() }); } + mapCV.notify_all(); // wake threads waiting on a Pending→Completed/Failed transition const std::wstring path = SIE::SShaderCache::GetShaderPath( shader.shaderType == RE::BSShader::Type::ImageSpace ? static_cast(shader).originalShaderName : @@ -2138,6 +2149,53 @@ namespace SIE return a_blob != nullptr; } + std::pair ShaderCache::ClaimCompilation(const std::string& key) + { + std::unique_lock lockM{ mapMutex }; + + for (;;) { + auto it = shaderMap.find(key); + if (it != shaderMap.end()) { + auto& entry = it->second; + if (entry.status == ShaderCompilationTask::Status::Completed) { + if (entry.blob) { + logger::debug("Shader already compiled; using cache: {}", key); + return { ClaimResult::CacheHit, entry.blob }; + } + break; // Completed with nullptr blob — re-compile + } + if (entry.status == ShaderCompilationTask::Status::Failed) { + break; // Previous attempt failed — re-compile + } + // Status is Pending — another thread is compiling this shader. + logger::debug("Shader compilation in progress, waiting: {}", key); + mapCV.wait(lockM); + continue; // re-check after wakeup + } + break; // not in map at all + } + + // Claim the slot as Pending before releasing the lock + shaderMap.insert_or_assign(key, ShaderCacheResult{ nullptr, ShaderCompilationTask::Status::Pending, system_clock::now() }); + return { ClaimResult::Claimed, nullptr }; + } + + void ShaderCache::ResolvePendingFailure(const std::string& key) + { + bool changed = false; + { + std::unique_lock lockM{ mapMutex }; + auto it = shaderMap.find(key); + if (it != shaderMap.end() && it->second.status == ShaderCompilationTask::Status::Pending) { + it->second = ShaderCacheResult{ nullptr, ShaderCompilationTask::Status::Failed, system_clock::now() }; + changed = true; + } + } + if (changed) { + mapCV.notify_all(); + } + } + ID3DBlob* ShaderCache::GetCompletedShader(const std::string& a_key) { std::string type = SIE::SShaderCache::GetTypeFromShaderString(a_key); @@ -2211,7 +2269,8 @@ namespace SIE if (IsCompiling()) { logger::info("Stopping {} remaining shader compilation tasks", compilationSet.totalTasks - compilationSet.completedTasks - compilationSet.failedTasks); } - ssource.request_stop(); + ssource.request_stop(); // signals any legacy stop_token users + managementJthread.request_stop(); // stops management thread + in-flight compilations compilationSet.Clear(); } @@ -2311,8 +2370,13 @@ namespace SIE ShaderCache::ShaderCache() { dependencyTracker = std::make_unique(); - logger::debug("ShaderCache initialized with {} compiler threads", (int)compilationThreadCount); - compilationPool.detach_task([this, token = ssource.get_token()] { ManageCompilationSet(token); }); + logger::debug("ShaderCache initialized: {} startup threads, {} background threads, {} pool threads", + (int)compilationThreadCount, (int)backgroundCompilationThreadCount, (int)compilationPool.get_thread_count()); + // Management thread runs on a dedicated jthread, not in the compilation pool, + // so it doesn't consume a pool slot that could be used for shader compilation. + managementJthread = std::jthread([this](std::stop_token stoken) { + ManageCompilationSet(stoken); + }); } bool ShaderCache::UseFileWatcher() const @@ -2346,7 +2410,12 @@ namespace SIE pathStr += std::format("{}; ", path); } logger::debug("ShaderCache watching for changes in {}", pathStr); - compilationPool.detach_task([this] { listener->processQueue(); }); + // Capture listener by value so the thread does not race with StopFileWatcher() + // nulling this->listener before the thread has had a chance to start. + auto* capturedListener = listener; + capturedListener->fileWatcherThread = std::jthread([capturedListener]() { + capturedListener->processQueue(); + }); } else { logger::debug("ShaderCache already enabled"); } @@ -2355,11 +2424,16 @@ namespace SIE void ShaderCache::StopFileWatcher() { logger::info("Stopping FileWatcher"); + // Set flag first so processQueue()'s loop condition becomes false before we join. + useFileWatcher = false; if (fileWatcher) { fileWatcher->removeWatch(watchID); fileWatcher = nullptr; } if (listener) { + // ~jthread() calls request_stop() + join(); processQueue() exits when + // UseFileWatcher() returns false (set above). + delete listener; listener = nullptr; } } @@ -2546,6 +2620,87 @@ namespace SIE return hideError; } + int ShaderCache::GetHeavyTasksInFlight() + { + return static_cast(compilationSet.heavyTasksInFlight.load(std::memory_order_relaxed)); + } + + uint64_t ShaderCache::GetSlowTasks() + { + return compilationSet.slowTasks.load(std::memory_order_relaxed); + } + + uint64_t ShaderCache::GetVerySlowTasks() + { + return compilationSet.verySlowTasks.load(std::memory_order_relaxed); + } + + std::vector CompilationSet::GetTopSlowTasks(size_t n) const + { + std::lock_guard lock(slowTasksMutex); + // Partial sort to get the N highest without fully sorting the whole vector. + std::vector result = slowTaskRecords; + if (result.size() > n) { + std::partial_sort(result.begin(), result.begin() + n, result.end(), + [](const SlowTaskRecord& a, const SlowTaskRecord& b) { return a.elapsedMs > b.elapsedMs; }); + result.resize(n); + } else { + std::sort(result.begin(), result.end(), + [](const SlowTaskRecord& a, const SlowTaskRecord& b) { return a.elapsedMs > b.elapsedMs; }); + } + return result; + } + + std::vector ShaderCache::GetTopSlowTasks(size_t n) + { + return compilationSet.GetTopSlowTasks(n); + } + + std::optional CompilationSet::GetParallelismStats() const + { + std::vector records; + { + std::lock_guard lock(slowTasksMutex); + if (slowTaskRecords.empty()) { + return std::nullopt; + } + records = slowTaskRecords; + } + + ParallelismStats stats; + stats.sampleCount = records.size(); + for (const auto& rec : records) { + stats.workMs += rec.elapsedMs; + stats.spanMs = std::max(stats.spanMs, rec.elapsedMs); + stats.avgQueueWaitMs += rec.queueWaitMs; + stats.maxQueueWaitMs = std::max(stats.maxQueueWaitMs, rec.queueWaitMs); + } + stats.avgQueueWaitMs /= static_cast(stats.sampleCount); + + LARGE_INTEGER now; + QueryPerformanceCounter(&now); + int64_t endTime = completionTime.load(std::memory_order_relaxed); + if (endTime == 0) { + endTime = now.QuadPart; + } + stats.makespanMs = static_cast(endTime - lastReset.QuadPart) * 1000.0 / frequency.QuadPart; + + if (stats.spanMs > 0.0) { + stats.avgParallelism = stats.workMs / stats.spanMs; + } + if (stats.makespanMs > 0.0) { + stats.infiniteCoreEfficiency = stats.spanMs / stats.makespanMs; + stats.infiniteCoreGapPercent = std::max(0.0, 100.0 * (1.0 - stats.infiniteCoreEfficiency)); + } + + return stats; + } + + std::optional ShaderCache::GetParallelismStats() + { + return compilationSet.GetParallelismStats(); + } + void ShaderCache::ClearShaderMap(RE::BSShader::Type a_type) { std::string_view shaderTypeStr = magic_enum::enum_name(a_type); @@ -2730,8 +2885,79 @@ namespace SIE return; } - SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_BELOW_NORMAL); - task.Perform(); + const auto taskKey = task.GetString(); + + // Thread priority serves as a signal to Intel Thread Director and + // the Windows scheduler for P-core vs E-core placement on hybrid CPUs. + // Heavy shaders compile at normal priority (favouring P-cores); light + // shaders stay below-normal (allowing E-core placement). On non-hybrid + // CPUs this still gives heavy compiles slightly more scheduler attention. + SetThreadPriority(GetCurrentThread(), + task.GetPriority() >= SIE::kHeavyPriorityThreshold ? THREAD_PRIORITY_NORMAL : THREAD_PRIORITY_BELOW_NORMAL); + + LARGE_INTEGER start, end, freq; + QueryPerformanceFrequency(&freq); + QueryPerformanceCounter(&start); + const double queueWaitMs = task.GetEnqueuedQpc() > 0 ? + static_cast(start.QuadPart - task.GetEnqueuedQpc()) * 1000.0 / freq.QuadPart : + 0.0; + + try { + task.Perform(); + } catch (const std::exception& e) { + logger::error("Unhandled exception compiling shader task {}: {}", taskKey, e.what()); + ResolvePendingFailure(taskKey); + } catch (...) { + logger::error("Unhandled non-standard exception compiling shader task {}", taskKey); + ResolvePendingFailure(taskKey); + } + + QueryPerformanceCounter(&end); + const double elapsedMs = static_cast(end.QuadPart - start.QuadPart) * 1000.0 / freq.QuadPart; + const uint64_t remaining = compilationSet.totalTasks - compilationSet.completedTasks.load(std::memory_order_relaxed) - compilationSet.failedTasks.load(std::memory_order_relaxed); + + // Proxy for permutation complexity: descriptor low 32 bits from GetId(); popcount = active defines. + // Shader file size provides a secondary signal for source complexity. + const auto descriptorComplexity = std::popcount(static_cast(task.GetId())); + uintmax_t sourceBytes = 0; + { + // GetString() format: "fxpFilename:ShaderClass:defines" — filename is before the first colon. + const auto taskStr = task.GetString(); + const auto sep = taskStr.find(':'); + if (sep != std::string::npos) { + const auto shaderName = taskStr.substr(0, sep); + if (auto path = SIE::SShaderCache::GetShaderPath(shaderName); !path.empty()) { + std::error_code ec; + sourceBytes = std::filesystem::file_size(path, ec); + } + } + } + + // Debug: full per-task record for post-mortem straggler analysis. + logger::debug("[ShaderTiming] {:.0f}ms | queue_wait={:.0f}ms | remaining={} | defines={} | src={}B | prio={} | tid={} | {}", + elapsedMs, queueWaitMs, remaining, descriptorComplexity, sourceBytes, + task.GetPriority(), GetCurrentThreadId(), taskKey); + + constexpr double kSlowMs = 2000.0; + constexpr double kVerySlowMs = 8000.0; + + // Record every task for post-mortem analysis and developer UI (top-N display). + { + std::lock_guard lock(compilationSet.slowTasksMutex); + compilationSet.slowTaskRecords.push_back({ taskKey, elapsedMs, queueWaitMs, task.GetPriority(), + static_cast(descriptorComplexity), sourceBytes }); + } + + if (elapsedMs >= kVerySlowMs) { + compilationSet.verySlowTasks++; + compilationSet.slowTasks++; + logger::info("[ShaderTiming] Very slow {:.0f}ms | queue_wait={:.0f}ms | remaining={} | defines={} | src={}B | prio={} | {}", + elapsedMs, queueWaitMs, remaining, descriptorComplexity, sourceBytes, task.GetPriority(), taskKey); + } else if (elapsedMs >= kSlowMs) { + compilationSet.slowTasks++; + logger::debug("[ShaderTiming] Slow {:.0f}ms | queue_wait={:.0f}ms | remaining={} | defines={} | src={}B | prio={} | {}", + elapsedMs, queueWaitMs, remaining, descriptorComplexity, sourceBytes, task.GetPriority(), taskKey); + } if (stoken.stop_requested()) { return; @@ -2744,7 +2970,8 @@ namespace SIE const RE::BSShader& aShader, uint32_t aDescriptor) : shaderClass(aShaderClass), - shader(aShader), descriptor(aDescriptor) + shader(aShader), descriptor(aDescriptor), + cachedPriority(ComputePriority(aShaderClass, aShader, aDescriptor)) {} void ShaderCompilationTask::Perform() const @@ -2777,6 +3004,72 @@ namespace SIE return GetId() == other.GetId(); } + int ShaderCompilationTask::ComputePriority(ShaderClass shaderClass, const RE::BSShader& shader, uint32_t descriptor) + { + int priority = 0; + const auto type = shader.shaderType.get(); + + // Base priority by shader type — Lighting is consistently the slowest + // (123KB source, 12s+ compile), followed by Effect (~31KB, up to 12s). + switch (type) { + case RE::BSShader::Type::Lighting: + priority += 1000; + break; + case RE::BSShader::Type::Effect: + priority += 500; + break; + case RE::BSShader::Type::Water: + priority += 300; + break; + default: + break; + } + + // Pixel shaders compile significantly slower than vertex shaders + if (shaderClass == ShaderClass::Pixel) + priority += 200; + + // More active descriptor bits → more #defines → more code paths for the compiler + priority += std::popcount(descriptor) * 30; + + // Known heavy Lighting techniques and flags from straggler analysis + if (type == RE::BSShader::Type::Lighting) { + const auto technique = static_cast(0x3F & (descriptor >> 24)); + + // LANDSCAPE techniques (MTLand, MTLandLODBlend) are among the heaviest + // due to multi-texture blending codegen — regularly 60-130s compile times + if (technique == ShaderCache::LightingShaderTechniques::MTLand || + technique == ShaderCache::LightingShaderTechniques::MTLandLODBlend) + priority += 500; + if (technique == ShaderCache::LightingShaderTechniques::Parallax || + technique == ShaderCache::LightingShaderTechniques::ParallaxOcc) + priority += 300; + if (technique == ShaderCache::LightingShaderTechniques::Eye) + priority += 200; + if (technique == ShaderCache::LightingShaderTechniques::MultilayerParallax) + priority += 200; + + // TRUE_PBR and ANISO_LIGHTING are the dominant cost drivers, + // especially in combination with LANDSCAPE (115-130s observed) + if (descriptor & static_cast(ShaderCache::LightingShaderFlags::TruePbr)) + priority += 500; + if (descriptor & static_cast(ShaderCache::LightingShaderFlags::AnisoLighting)) + priority += 300; + // Deferred adds extra codegen overhead + if (descriptor & static_cast(ShaderCache::LightingShaderFlags::Deferred)) + priority += 200; + + // LANDSCAPE + TRUE_PBR combination triggers extreme register pressure + // (6x unrolled texture layers * PBR params = 30+ textures, 180s+ compile) + if ((technique == ShaderCache::LightingShaderTechniques::MTLand || + technique == ShaderCache::LightingShaderTechniques::MTLandLODBlend) && + (descriptor & static_cast(ShaderCache::LightingShaderFlags::TruePbr))) + priority += 500; + } + + return priority; + } + std::optional CompilationSet::WaitTake(std::stop_token stoken) { std::unique_lock lock(compilationMutex); @@ -2784,8 +3077,9 @@ namespace SIE if (!conditionVariable.wait( lock, stoken, [this, &shaderCache]() { return !availableTasks.empty() && - // check against all tasks in queue to trickle the work. It cannot be the active tasks count because the thread pool itself is maximum. - (int)shaderCache->compilationPool.get_tasks_total() <= + // Dispatch when pool has room. Use < (not <=) so that after + // push_task() the total never exceeds the limit. + (int)shaderCache->compilationPool.get_tasks_total() < (!shaderCache->backgroundCompilation ? shaderCache->compilationThreadCount : shaderCache->backgroundCompilationThreadCount); })) { /*Woke up because of a stop request. */ return std::nullopt; @@ -2794,9 +3088,27 @@ namespace SIE QueryPerformanceCounter(&lastReset); lastCalculation = lastReset; } - auto node = availableTasks.extract(availableTasks.begin()); - auto& task = node.value(); - tasksInProgress.insert(std::move(node)); + + // Startup policy: keep dispatching the hardest queued work first. + // This preserves the existing priority score while preventing light tasks + // from bypassing queued heavy shaders and stretching the tail. + auto bestIt = availableTasks.end(); + if (!availableTasks.empty()) { + bestIt = std::prev(availableTasks.end()); + } + + if (bestIt == availableTasks.end()) { + return std::nullopt; + } + + ShaderCompilationTask task = *bestIt; + availableTasks.erase(bestIt); + + if (task.GetPriority() >= kHeavyPriorityThreshold) { + heavyTasksInFlight.fetch_add(1, std::memory_order_relaxed); + } + + tasksInProgress.insert(task); return task; } @@ -2806,11 +3118,16 @@ namespace SIE auto inProgressIt = tasksInProgress.find(task); auto processedIt = processedTasks.find(task); if (inProgressIt == tasksInProgress.end() && processedIt == processedTasks.end() && !globals::shaderCache->GetCompletedShader(task)) { - auto [availableIt, wasAdded] = availableTasks.insert(task); + LARGE_INTEGER now; + QueryPerformanceCounter(&now); + auto queuedTask = task; + queuedTask.SetEnqueuedQpc(now.QuadPart); + auto [_, wasAdded] = availableTasks.insert(queuedTask); lock.unlock(); if (wasAdded) { conditionVariable.notify_one(); totalTasks++; + totalPriorityWeight += static_cast(task.GetPriority()) + 1; } } } @@ -2836,6 +3153,17 @@ namespace SIE logger::debug("Compiling Task failed: {}", key); failedTasks++; } + completedPriorityWeight += static_cast(task.GetPriority()) + 1; + + // Track heavy task completion for P-core concurrency limiting + if (task.GetPriority() >= kHeavyPriorityThreshold) { + auto current = heavyTasksInFlight.load(std::memory_order_relaxed); + while (current > 0 && + !heavyTasksInFlight.compare_exchange_weak(current, current - 1, + std::memory_order_relaxed, + std::memory_order_relaxed)) { + } + } // Update timing LARGE_INTEGER now; @@ -2873,36 +3201,49 @@ namespace SIE completedTasks = 0; failedTasks = 0; cacheHitTasks = 0; + slowTasks = 0; + verySlowTasks = 0; + totalPriorityWeight = 0; + completedPriorityWeight = 0; + heavyTasksInFlight = 0; QueryPerformanceCounter(&lastReset); QueryPerformanceCounter(&lastCalculation); completionTime = { 0 }; // Reset completion time totalTime = { 0 }; + { + std::lock_guard slowLock(slowTasksMutex); + slowTaskRecords.clear(); + } } std::string CompilationSet::GetHumanTime(double a_totalMs) { - int milliseconds = static_cast(a_totalMs); - int seconds = milliseconds / 1000; - int minutes = seconds / 60; - seconds %= 60; - int hours = minutes / 60; - minutes %= 60; - - return fmt::format("{:02}:{:02}:{:02}", hours, minutes, seconds); + return Util::FormatDuration(a_totalMs); } double CompilationSet::GetEta() { - // For ETA calculation, we still use the active compilation time (totalTime) - // because it reflects the actual work time, not wall-clock time - double totalMs = static_cast(totalTime.QuadPart) * 1000.0 / frequency.QuadPart; + // Use wall-clock elapsed time since compilation started + LARGE_INTEGER now; + QueryPerformanceCounter(&now); + int64_t endTime = (completionTime.load(std::memory_order_relaxed) != 0) ? completionTime.load(std::memory_order_relaxed) : now.QuadPart; + double elapsedMs = static_cast(endTime - lastReset.QuadPart) * 1000.0 / frequency.QuadPart; - if (totalMs == 0.0) { - return 0.0; // Avoid division by zero - } - auto rate = completedTasks / totalMs; - auto remaining = totalTasks - completedTasks - failedTasks; - return std::max(remaining / rate, 0.0); + if (elapsedMs <= 0.0) + return 0.0; + + // Priority-weighted ETA: heavy tasks completing early should not inflate + // the estimate. We measure progress as a fraction of total priority weight + // completed, which accounts for the decreasing cost of remaining tasks. + double doneWeight = static_cast(completedPriorityWeight.load(std::memory_order_relaxed)); + double totalWeight = static_cast(totalPriorityWeight.load(std::memory_order_relaxed)); + + if (doneWeight <= 0.0 || totalWeight <= 0.0) + return 0.0; + + double fractionDone = doneWeight / totalWeight; + double estimatedTotalMs = elapsedMs / fractionDone; + return std::max(estimatedTotalMs - elapsedMs, 0.0); } std::string CompilationSet::GetStatsString(bool a_timeOnly, bool a_elapsedOnly) @@ -2927,7 +3268,7 @@ namespace SIE } } - return fmt::format("{}/{} (successful/total)\tfailed: {}\tcachehits: {}\nElapsed/Estimated Time: {}/{}", + return fmt::format("{}/{} (successful/total)\tfailed: {}\tdeduplicated: {}\nElapsed/Estimated Time: {}/{}", (std::uint64_t)completedTasks, (std::uint64_t)totalTasks, (std::uint64_t)failedTasks, diff --git a/src/ShaderCache.h b/src/ShaderCache.h index 89bcedf92d..72d4c6cec8 100644 --- a/src/ShaderCache.h +++ b/src/ShaderCache.h @@ -4,6 +4,8 @@ #include #include +#include "Utils/WinApi.h" + using namespace std::chrono; namespace ShaderConstants @@ -218,12 +220,24 @@ namespace SIE size_t GetId() const; std::string GetString() const; + /// LPT scheduling score: higher = more expensive = should be dispatched first. + /// Based on shader type, class, descriptor complexity, and known heavy defines. + /// Computed once at construction and cached. + int GetPriority() const { return cachedPriority; } + void SetEnqueuedQpc(int64_t qpc) { enqueuedQpc = qpc; } + int64_t GetEnqueuedQpc() const { return enqueuedQpc; } + bool operator==(const ShaderCompilationTask& other) const; protected: ShaderClass shaderClass; const RE::BSShader& shader; uint32_t descriptor; + + private: + static int ComputePriority(ShaderClass shaderClass, const RE::BSShader& shader, uint32_t descriptor); + int cachedPriority; + int64_t enqueuedQpc = 0; }; } @@ -236,8 +250,24 @@ struct std::hash } }; +struct TaskPriorityLess +{ + bool operator()(const SIE::ShaderCompilationTask& a, const SIE::ShaderCompilationTask& b) const + { + if (a.GetPriority() != b.GetPriority()) { + return a.GetPriority() < b.GetPriority(); + } + return a.GetId() < b.GetId(); + } +}; + namespace SIE { + /// Threshold above which a shader task is considered "heavy" and benefits + /// from P-core placement on hybrid CPUs. Used for thread-priority hints, + /// telemetry, and developer-facing diagnostics. + constexpr int kHeavyPriorityThreshold = 500; + class CompilationSet { public: @@ -259,19 +289,61 @@ namespace SIE void Add(const ShaderCompilationTask& task); void Complete(const ShaderCompilationTask& task); void Clear(); - std::string GetHumanTime(double a_totalMs); + static std::string GetHumanTime(double a_totalMs); double GetEta(); std::string GetStatsString(bool a_timeOnly = false, bool a_elapsedOnly = false); std::atomic completedTasks = 0; std::atomic totalTasks = 0; std::atomic failedTasks = 0; - std::atomic cacheHitTasks = 0; // number of compiles of a previously seen shader combo + std::atomic cacheHitTasks = 0; // number of compiles of a previously seen shader combo + std::atomic slowTasks = 0; // shaders taking >= 2s + std::atomic verySlowTasks = 0; // shaders taking >= 8s + std::atomic totalPriorityWeight = 0; // sum of (GetPriority()+1) for all queued tasks + std::atomic completedPriorityWeight = 0; // sum of (GetPriority()+1) for completed/failed tasks + std::atomic heavyTasksInFlight = 0; // number of dispatched heavy (>= kHeavyPriorityThreshold) tasks still running std::mutex compilationMutex; + /// Per-task timing record stored for post-mortem analysis and developer UI. + struct SlowTaskRecord + { + std::string key; // ShaderCompilationTask::GetString() — "fxpFile:Class:defines" + double elapsedMs = 0.0; + double queueWaitMs = 0.0; + int priority = 0; // estimated compile weight (see ComputePriority) + int defineCount = 0; // popcount of descriptor — active define permutations + uintmax_t sourceSizeBytes = 0; // HLSL source file size at compile time + }; + + /// On-demand parallelism metrics derived from task timings. + struct ParallelismStats + { + double workMs = 0.0; // W = sum of all task times + double spanMs = 0.0; // S ~= longest single task + double makespanMs = 0.0; // T_p = wall-clock compile duration + double avgParallelism = 0.0; // W / S + double infiniteCoreEfficiency = 0.0; // S / T_p + double infiniteCoreGapPercent = 0.0; // 100 * (1 - S / T_p) + double avgQueueWaitMs = 0.0; // average enqueue -> dispatch delay + double maxQueueWaitMs = 0.0; // worst enqueue -> dispatch delay + size_t sampleCount = 0; + }; + + /// All per-task timing records for this build (appended from multiple threads). + /// Protected by slowTasksMutex. + std::vector slowTaskRecords; + mutable std::mutex slowTasksMutex; + + /// Returns a copy of the N records with the highest elapsedMs, sorted descending. + std::vector GetTopSlowTasks(size_t n = 3) const; + + /// Computes parallelism metrics on demand from collected task timings. + std::optional GetParallelismStats() const; + private: - std::unordered_set availableTasks; - std::unordered_set tasksInProgress; - std::unordered_set processedTasks; // completed or failed + /// Tasks awaiting dispatch, ordered by cached priority and task id. + std::set availableTasks; + std::set tasksInProgress; + std::set processedTasks; // completed or failed std::condition_variable_any conditionVariable; }; @@ -394,6 +466,15 @@ namespace SIE bool Clear(const std::string& a_path); bool AddCompletedShader(ShaderClass shaderClass, const RE::BSShader& shader, uint32_t descriptor, ID3DBlob* a_blob); + + enum class ClaimResult + { + CacheHit, // Already compiled; use the returned blob + Claimed // Claimed as Pending; caller must compile and call AddCompletedShader + }; + std::pair ClaimCompilation(const std::string& key); + void ResolvePendingFailure(const std::string& key); + ID3DBlob* GetCompletedShader(const std::string& a_key); ID3DBlob* GetCompletedShader(const SIE::ShaderCompilationTask& a_task); ID3DBlob* GetCompletedShader(ShaderClass shaderClass, const RE::BSShader& shader, uint32_t descriptor); @@ -432,6 +513,15 @@ namespace SIE void IterateShaderBlock(bool a_forward = true); bool IsHideErrors(); + // Overlay stats + int GetHeavyTasksInFlight(); + uint64_t GetSlowTasks(); + uint64_t GetVerySlowTasks(); + + /// Returns a copy of the top-N slowest task records from the last build, sorted descending. + std::vector GetTopSlowTasks(size_t n = 3); + std::optional GetParallelismStats(); + /** * @brief Clears all shaders of a specific type from the shader map. * @@ -445,9 +535,13 @@ namespace SIE ShaderFileDependencyTracker* GetDependencyTracker() { return dependencyTracker.get(); } - int32_t compilationThreadCount = std::max({ static_cast(std::thread::hardware_concurrency()) - 4, static_cast(std::thread::hardware_concurrency()) * 3 / 4, 1 }); - int32_t backgroundCompilationThreadCount = std::max(static_cast(std::thread::hardware_concurrency()) / 2, 1); - BS::thread_pool<> compilationPool{}; + // Use all logical cores minus one at startup for OS headroom (E-cores included). + // Management and file watcher run on dedicated jthreads, not pool slots. + // Background (in-game): half of P-cores only, to avoid starving the render thread. + int32_t compilationThreadCount = std::max(static_cast(std::thread::hardware_concurrency()) - 1, 1); + int32_t backgroundCompilationThreadCount = std::max(static_cast(Util::GetPerformanceCoreCount()) / 2, 1); + BS::thread_pool<> compilationPool{ static_cast(compilationThreadCount) }; + std::jthread managementJthread; // dedicated thread for ManageCompilationSet (not in pool) bool backgroundCompilation = false; bool menuLoaded = false; @@ -717,6 +811,7 @@ namespace SIE CompilationSet compilationSet; ankerl::unordered_dense::map shaderMap{}; std::mutex mapMutex; // guard for shaderMap + std::condition_variable mapCV; // signalled when a Pending entry transitions to Completed/Failed ankerl::unordered_dense::map modifiedShaderMap{}; // hashmap when a shader source file last modified std::mutex modifiedMapMutex; // guard for modifiedShaderMap ankerl::unordered_dense::map> hlslToShaderMap{}; // hashmap linking specific hlsl files to shader keys in shaderMap @@ -756,6 +851,8 @@ namespace SIE void processQueue(); void handleFileAction(efsw::WatchID, const std::string& dir, const std::string& filename, efsw::Action action, std::string) override; + std::jthread fileWatcherThread; // dedicated thread for processQueue (not in pool) + private: ShaderFileDependencyTracker* deps; struct fileAction diff --git a/src/Utils/Format.cpp b/src/Utils/Format.cpp index 61f1d91195..2b0d88905b 100644 --- a/src/Utils/Format.cpp +++ b/src/Utils/Format.cpp @@ -158,6 +158,21 @@ namespace Util } } + std::string FormatDuration(double ms) + { + // Validate input: handle negative, NaN, and infinite values + if (!std::isfinite(ms) || ms < 0.0) { + return "00:00:00"; + } + + // Use int64_t to avoid overflow on long durations (>596 hours with int) + int64_t total_s = static_cast(ms) / 1000; + int64_t hours = total_s / 3600; + int64_t minutes = (total_s % 3600) / 60; + int64_t seconds = total_s % 60; + return fmt::format("{:02}:{:02}:{:02}", hours, minutes, seconds); + } + std::string TimeAgoString(std::chrono::steady_clock::time_point last) { using namespace std::chrono; diff --git a/src/Utils/Format.h b/src/Utils/Format.h index 5448581c7c..dcee025bce 100644 --- a/src/Utils/Format.h +++ b/src/Utils/Format.h @@ -66,6 +66,17 @@ namespace Util */ std::string FormatTimeAgo(std::filesystem::file_time_type fileTime); + /** + * Formats a duration given in milliseconds as HH:MM:SS. + * Suitable for displaying long-running operation times (e.g. shader compilation). + * + * @param ms Duration in milliseconds. Fractional milliseconds are truncated. + * Non-finite (NaN/inf) or negative values are clamped to "00:00:00". + * Durations >= 24 hours display hours without limit (e.g., "125:34:56"). + * @return Formatted string like "00:02:35" or "00:00:00" for invalid inputs + */ + std::string FormatDuration(double ms); + /** * Formats a delta value with percentage difference for A/B test comparisons. * Returns a string like "+0.45 ms (+12.3%)" or "-0.23 ms (-8.1%)". diff --git a/src/Utils/WinApi.cpp b/src/Utils/WinApi.cpp index cef816e213..7f3e810015 100644 --- a/src/Utils/WinApi.cpp +++ b/src/Utils/WinApi.cpp @@ -27,4 +27,46 @@ namespace Util return std::nullopt; } + + uint32_t GetPerformanceCoreCount() + { + // Cache the result — CPU topology never changes at runtime. + // C++11 guarantees thread-safe initialisation of static locals. + static const uint32_t cached = []() -> uint32_t { + const uint32_t fallback = std::max(1u, std::thread::hardware_concurrency()); + + DWORD size = 0; + GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &size); + if (GetLastError() != ERROR_INSUFFICIENT_BUFFER || size == 0) + return fallback; + + std::vector buf(size); + auto* info = reinterpret_cast(buf.data()); + if (!GetLogicalProcessorInformationEx(RelationProcessorCore, info, &size)) + return fallback; + + // First pass: find the highest efficiency class present. + BYTE maxClass = 0; + for (DWORD offset = 0; offset < size;) { + auto* entry = reinterpret_cast(buf.data() + offset); + if (entry->Processor.EfficiencyClass > maxClass) + maxClass = entry->Processor.EfficiencyClass; + offset += entry->Size; + } + + // Second pass: count logical processors on those (P-)cores. + uint32_t count = 0; + for (DWORD offset = 0; offset < size;) { + auto* entry = reinterpret_cast(buf.data() + offset); + if (entry->Processor.EfficiencyClass == maxClass) { + for (WORD g = 0; g < entry->Processor.GroupCount; ++g) + count += static_cast(std::popcount(entry->Processor.GroupMask[g].Mask)); + } + offset += entry->Size; + } + + return count > 0 ? count : fallback; + }(); + return cached; + } } // namespace Util diff --git a/src/Utils/WinApi.h b/src/Utils/WinApi.h index cf048b3d31..3b21ccd8a0 100644 --- a/src/Utils/WinApi.h +++ b/src/Utils/WinApi.h @@ -3,4 +3,10 @@ namespace Util { std::optional GetDllVersion(const std::wstring& dllPath); + + /// Returns the number of logical processors on the highest-efficiency cores + /// (P-cores on Intel hybrid CPUs). On non-hybrid CPUs all cores share the + /// same efficiency class, so this returns std::thread::hardware_concurrency(). + /// Falls back to hardware_concurrency() on any API failure. + uint32_t GetPerformanceCoreCount(); } // namespace Util