diff --git a/src/Features/UnifiedWater/WaterCache.cpp b/src/Features/UnifiedWater/WaterCache.cpp index a2addeaf97..2da8f1cbc6 100644 --- a/src/Features/UnifiedWater/WaterCache.cpp +++ b/src/Features/UnifiedWater/WaterCache.cpp @@ -2,8 +2,6 @@ #include -#include "Utils/WinApi.h" - bool WaterCache::SetCurrentWorldSpace(const RE::TESWorldSpace* worldSpace) { if (!worldSpace) @@ -216,8 +214,7 @@ bool WaterCache::GenerateCaches() } } - // Use P-core logical count on Intel hybrid CPUs; falls back to hardware_concurrency() on non-hybrid. - const unsigned hw = Util::GetPerformanceCoreCount(); + const unsigned hw = std::max(1u, std::thread::hardware_concurrency()); const unsigned threads = std::max(1u, hw > 4 ? hw - 4 : (hw * 3) / 4); async.pool = std::make_unique>(threads); diff --git a/src/Menu/AdvancedSettingsRenderer.cpp b/src/Menu/AdvancedSettingsRenderer.cpp index 85310a0ec4..f985f6f680 100644 --- a/src/Menu/AdvancedSettingsRenderer.cpp +++ b/src/Menu/AdvancedSettingsRenderer.cpp @@ -126,16 +126,15 @@ void AdvancedSettingsRenderer::RenderLoggingSection() ImGui::SliderInt("Compiler Threads", &shaderCache->compilationThreadCount, 1, static_cast(std::thread::hardware_concurrency())); if (auto _tt = Util::HoverTooltipWrapper()) { ImGui::Text( - "Number of threads used to compile shaders at startup. " - "Defaults to all logical cores minus one for OS headroom (E-cores included). " - "Higher values finish compilation faster but may make the system less responsive."); + "Number of threads to use to compile shaders. " + "The more threads the faster compilation will finish but may make the system unresponsive. "); } ImGui::SliderInt("Background Compiler Threads", &shaderCache->backgroundCompilationThreadCount, 1, static_cast(std::thread::hardware_concurrency())); if (auto _tt = Util::HoverTooltipWrapper()) { ImGui::Text( - "Number of threads used to compile shaders during gameplay. " - "Defaults to half of performance cores to avoid impacting the render thread. " - "Higher values finish compilation faster but may cause stuttering."); + "Number of threads to use to compile shaders while playing game. " + "This is activated if the startup compilation is skipped. " + "The more threads the faster compilation will finish but may make the system unresponsive. "); } ImGui::Columns(2, nullptr, false); @@ -553,103 +552,6 @@ void AdvancedSettingsRenderer::RenderDeveloperSection() // Statistics section (moved from Advanced/Logging) if (ImGui::TreeNodeEx("Statistics", ImGuiTreeNodeFlags_DefaultOpen)) { ImGui::Text(std::format("Shader Compiler : {}", shaderCache->GetShaderStatsString()).c_str()); - - // Derived parallelism metrics are computed lazily on demand and only shown - // once compilation has completed to avoid per-frame analysis while compiling. - if (!shaderCache->IsCompiling()) { - auto parallelism = shaderCache->GetParallelismStats(); - if (parallelism.has_value()) { - const auto& p = parallelism.value(); - ImGui::Spacing(); - ImGui::TextDisabled("Parallelism (derived from %zu compiled tasks)", p.sampleCount); - if (auto _tt = Util::HoverTooltipWrapper()) { - ImGui::Text("Computed lazily from the last completed build."); - ImGui::Text("Only evaluated when this Statistics section is open."); - } - ImGui::Text("Work (W, sum of task wall times): %s", Util::FormatDuration(p.workMs).c_str()); - if (auto _tt = Util::HoverTooltipWrapper()) { - ImGui::Text("Total compile work: sum of all per-shader wall-clock compile times."); - ImGui::Text("This is not CPU time; it is accumulated task elapsed time."); - ImGui::Text("Equivalent serial time on one worker if overhead stayed the same."); - } - ImGui::Text("Span (S, longest): %s", Util::FormatDuration(p.spanMs).c_str()); - if (auto _tt = Util::HoverTooltipWrapper()) { - ImGui::Text("Critical-path lower bound, approximated by the single slowest shader."); - ImGui::Text("Even infinite cores cannot finish faster than this."); - } - ImGui::Text("Makespan (T_p): %s", Util::FormatDuration(p.makespanMs).c_str()); - if (auto _tt = Util::HoverTooltipWrapper()) { - ImGui::Text("Observed wall-clock duration for the full shader build."); - } - ImGui::Text("Queue wait (avg/max): %s / %s", - Util::FormatDuration(p.avgQueueWaitMs).c_str(), - Util::FormatDuration(p.maxQueueWaitMs).c_str()); - if (auto _tt = Util::HoverTooltipWrapper()) { - ImGui::Text("Time spent waiting in the ready queue before a worker started compilation."); - ImGui::Text("Useful for identifying scheduler-induced delay separate from compile cost."); - } - ImGui::Text("Average parallelism (W/S): %.2fx", p.avgParallelism); - if (auto _tt = Util::HoverTooltipWrapper()) { - ImGui::Text("Average useful concurrency in this workload."); - ImGui::Text("Roughly the worker count where adding more cores gives diminishing returns."); - } - ImGui::Text("Infinite-core efficiency (S/T_p): %.1f%%", 100.0 * p.infiniteCoreEfficiency); - if (auto _tt = Util::HoverTooltipWrapper()) { - ImGui::Text("How close runtime is to the infinite-core lower bound."); - ImGui::Text("100%% means T_p == S."); - } - ImGui::Text("Infinite-core gap: %.1f%%", p.infiniteCoreGapPercent); - if (auto _tt = Util::HoverTooltipWrapper()) { - ImGui::Text("Distance from ideal infinite-core time."); - ImGui::Text("Defined as 100 * (1 - S / T_p). Lower is better."); - } - - ImGui::Spacing(); - ImGui::TextDisabled("Infinite-core efficiency"); - float efficiency = static_cast(std::clamp(p.infiniteCoreEfficiency, 0.0, 1.0)); - ImGui::ProgressBar(efficiency, ImVec2(-1.0f, 0.0f), std::format("{:.1f}% efficient / {:.1f}% gap", 100.0 * p.infiniteCoreEfficiency, p.infiniteCoreGapPercent).c_str()); - - ImGui::Spacing(); - ImGui::TextDisabled("Relative durations (normalized)"); - double maxMs = std::max({ p.workMs, p.spanMs, p.makespanMs, 1.0 }); - auto drawRelativeBar = [maxMs](const char* label, double value) { - float ratio = static_cast(std::clamp(value / maxMs, 0.0, 1.0)); - ImGui::TextUnformatted(label); - ImGui::SameLine(); - ImGui::ProgressBar(ratio, ImVec2(-1.0f, 0.0f), std::format("{} ({:.1f}%)", Util::FormatDuration(value), 100.0 * ratio).c_str()); - }; - drawRelativeBar("Span (S)", p.spanMs); - drawRelativeBar("Makespan (T_p)", p.makespanMs); - drawRelativeBar("Work (W)", p.workMs); - } - } - - // Top-3 slowest shaders from the last build - auto topSlow = shaderCache->GetTopSlowTasks(3); - if (!topSlow.empty()) { - ImGui::Spacing(); - ImGui::TextDisabled("Top %zu Slowest Shaders (last build)", topSlow.size()); - for (size_t i = 0; i < topSlow.size(); ++i) { - const auto& rec = topSlow[i]; - ImGui::Text("#%zu %s (weight %d)", i + 1, - Util::FormatDuration(rec.elapsedMs).c_str(), rec.priority); - ImGui::SameLine(); - ImGui::TextDisabled("%s", rec.key.c_str()); - if (ImGui::IsItemHovered()) { - if (auto _tt = Util::HoverTooltipWrapper()) { - ImGui::Text("%s", rec.key.c_str()); - } - } - // Allow copying the full key with a right-click - if (ImGui::BeginPopupContextItem(std::format("##slowcopy{}", i).c_str())) { - if (ImGui::MenuItem("Copy key")) { - ImGui::SetClipboardText(rec.key.c_str()); - } - ImGui::EndPopup(); - } - } - } - ImGui::TreePop(); } diff --git a/src/Menu/OverlayRenderer.cpp b/src/Menu/OverlayRenderer.cpp index e7ccaeb5b5..c80bf97bed 100644 --- a/src/Menu/OverlayRenderer.cpp +++ b/src/Menu/OverlayRenderer.cpp @@ -168,23 +168,6 @@ void OverlayRenderer::RenderShaderCompilationStatus(const std::functionIsDeveloperMode()) { - int32_t threadLimit = shaderCache->backgroundCompilation ? shaderCache->backgroundCompilationThreadCount : shaderCache->compilationThreadCount; - int compilationRunning = (int)shaderCache->compilationPool.get_tasks_running(); - int heavyInFlight = shaderCache->GetHeavyTasksInFlight(); - int heavyLimit = static_cast(Util::GetPerformanceCoreCount()); - uint64_t slow = shaderCache->GetSlowTasks(); - uint64_t verySlow = shaderCache->GetVerySlowTasks(); - ImGui::Text("Threads: %d / %d limit | Heavy: %d / %d P-cores | %d workers", - compilationRunning, - threadLimit, - heavyInFlight, - heavyLimit, - (int)shaderCache->compilationPool.get_thread_count()); - if (slow > 0) { - ImGui::Text("Slow shaders: %llu (very slow: %llu)", slow, verySlow); - } - } if (!shaderCache->backgroundCompilation && shaderCache->menuLoaded) { auto skipShadersText = fmt::format( "Press {} to proceed without completing shader compilation. ", @@ -197,9 +180,7 @@ void OverlayRenderer::RenderShaderCompilationStatus(const std::functionGetTotalTasks() > 0) { ImGui::Text("Last shader cache build duration: %s", shaderCache->GetShaderStatsString(true, true).c_str()); - - // Stacked bar showing compilation breakdown - { - uint64_t total = shaderCache->GetTotalTasks(); - uint64_t completed = shaderCache->GetCompletedTasks(); - uint64_t failed = shaderCache->GetFailedTasks(); - uint64_t cacheHits = shaderCache->GetCachedHitTasks(); - uint64_t slow = shaderCache->GetSlowTasks(); - uint64_t verySlow = shaderCache->GetVerySlowTasks(); - // Compiled = tasks that actually went through compilation. - // Cache hits are separate (returned early without queueing). - uint64_t compiled = completed; - uint64_t fast = compiled > slow ? compiled - slow : 0; - uint64_t medium = slow > verySlow ? slow - verySlow : 0; // 2-8s - - struct Segment - { - uint64_t count; - ImU32 color; - const char* label; - }; - Segment segments[] = { - { cacheHits, IM_COL32(120, 120, 120, 255), "Deduplicated" }, - { fast, IM_COL32(80, 180, 80, 255), "Fast (<2s)" }, - { medium, IM_COL32(220, 180, 50, 255), "Slow (2-8s)" }, - { verySlow, IM_COL32(220, 60, 60, 255), "Very slow (>=8s)" }, - { failed, IM_COL32(160, 30, 30, 255), "Failed" }, - }; - - float barHeight = 14.0f * Util::GetUIScale(); - float barWidth = ImGui::GetContentRegionAvail().x; - ImVec2 cursor = ImGui::GetCursorScreenPos(); - ImDrawList* drawList = ImGui::GetWindowDrawList(); - - // Background - drawList->AddRectFilled(cursor, ImVec2(cursor.x + barWidth, cursor.y + barHeight), IM_COL32(40, 40, 40, 255)); - - // Draw segments - float x = cursor.x; - for (auto& seg : segments) { - if (seg.count == 0 || total == 0) - continue; - float segWidth = (static_cast(seg.count) / static_cast(total)) * barWidth; - if (segWidth < 1.0f) - segWidth = 1.0f; - drawList->AddRectFilled(ImVec2(x, cursor.y), ImVec2(x + segWidth, cursor.y + barHeight), seg.color); - x += segWidth; - } - - // Reserve space and handle tooltip - ImGui::Dummy(ImVec2(barWidth, barHeight)); - if (ImGui::IsItemHovered()) { - ImGui::BeginTooltip(); - for (auto& seg : segments) { - if (seg.count == 0) - continue; - float pct = total > 0 ? 100.0f * static_cast(seg.count) / static_cast(total) : 0.0f; - ImGui::TextColored(ImGui::ColorConvertU32ToFloat4(seg.color), "%s: %llu (%.1f%%)", seg.label, seg.count, pct); - } - ImGui::EndTooltip(); - } - } - - auto state = globals::state; - if (state->IsDeveloperMode()) { - ImGui::Text("Threads: %d compile, %d background, %d pool | P-cores: %d", - (int)shaderCache->compilationThreadCount, - (int)shaderCache->backgroundCompilationThreadCount, - (int)shaderCache->compilationPool.get_thread_count(), - (int)Util::GetPerformanceCoreCount()); - } } ImGui::EndTabItem(); diff --git a/src/ShaderCache.cpp b/src/ShaderCache.cpp index 9bdbb45e9b..1b3f3edd3d 100644 --- a/src/ShaderCache.cpp +++ b/src/ShaderCache.cpp @@ -1341,24 +1341,20 @@ namespace SIE return nullptr; } + // check hashmap auto& cache = ShaderCache::Instance(); - auto key = SShaderCache::GetShaderString(shaderClass, shader, descriptor, true); - - // Atomically check the shaderMap and either: - // - return the blob if already Completed (cache hit), - // - wait if another thread is compiling (Pending), - // - claim the slot with Pending if nobody started yet. - auto [claimResult, cachedBlob] = cache.ClaimCompilation(key); - if (claimResult == ShaderCache::ClaimResult::CacheHit) { + ID3DBlob* shaderBlob = cache.GetCompletedShader(shaderClass, shader, descriptor); + + if (shaderBlob) { + // already compiled before + logger::debug("Shader already compiled; using cache: {}", SShaderCache::GetShaderString(shaderClass, shader, descriptor)); cache.IncCacheHitTasks(); - return cachedBlob; + return shaderBlob; } - const auto type = shader.shaderType.get(); // check diskcache auto diskPath = GetDiskPath(shader.fxpFilename, descriptor, shaderClass); - ID3DBlob* shaderBlob = nullptr; if (useDiskCache && std::filesystem::exists(diskPath)) { // check build time of cache @@ -1409,7 +1405,6 @@ namespace SIE auto pathString = Util::WStringToString(path); if (!std::filesystem::exists(path)) { logger::error("Failed to compile {} shader {}::{:X}: {} does not exist", magic_enum::enum_name(shaderClass), magic_enum::enum_name(type), descriptor, pathString); - cache.AddCompletedShader(shaderClass, shader, descriptor, nullptr); return nullptr; } logger::debug("Compiling {} {}:{}:{:X} to {}", pathString, magic_enum::enum_name(type), magic_enum::enum_name(shaderClass), descriptor, MergeDefinesString(defines)); @@ -1931,16 +1926,11 @@ namespace SIE { Clear(); StopFileWatcher(); - // Signal management thread to stop dispatching; pool workers observe the same - // stop token and will not pick up new tasks after current compilations finish. - HANDLE managementHandle = managementJthread.native_handle(); - managementJthread.request_stop(); - // Purge unstarted tasks so we only wait for compilations already in flight. - compilationPool.purge(); if (!compilationPool.wait_for(std::chrono::milliseconds(1000))) { - logger::info("Tasks still running despite request to stop; killing management thread {}!", GetThreadId(managementHandle)); - WaitForSingleObject(managementHandle, 1000); - TerminateThread(managementHandle, 0); + logger::info("Tasks still running despite request to stop; killing thread {}!", GetThreadId(managementThread)); + WaitForSingleObject(managementThread, 1000); + TerminateThread(managementThread, 0); + CloseHandle(managementThread); } } @@ -2112,7 +2102,6 @@ namespace SIE std::unique_lock lockM{ mapMutex }; shaderMap.insert_or_assign(key, ShaderCacheResult{ a_blob, status, system_clock::now() }); } - mapCV.notify_all(); // wake threads waiting on a Pending→Completed/Failed transition const std::wstring path = SIE::SShaderCache::GetShaderPath( shader.shaderType == RE::BSShader::Type::ImageSpace ? static_cast(shader).originalShaderName : @@ -2149,53 +2138,6 @@ namespace SIE return a_blob != nullptr; } - std::pair ShaderCache::ClaimCompilation(const std::string& key) - { - std::unique_lock lockM{ mapMutex }; - - for (;;) { - auto it = shaderMap.find(key); - if (it != shaderMap.end()) { - auto& entry = it->second; - if (entry.status == ShaderCompilationTask::Status::Completed) { - if (entry.blob) { - logger::debug("Shader already compiled; using cache: {}", key); - return { ClaimResult::CacheHit, entry.blob }; - } - break; // Completed with nullptr blob — re-compile - } - if (entry.status == ShaderCompilationTask::Status::Failed) { - break; // Previous attempt failed — re-compile - } - // Status is Pending — another thread is compiling this shader. - logger::debug("Shader compilation in progress, waiting: {}", key); - mapCV.wait(lockM); - continue; // re-check after wakeup - } - break; // not in map at all - } - - // Claim the slot as Pending before releasing the lock - shaderMap.insert_or_assign(key, ShaderCacheResult{ nullptr, ShaderCompilationTask::Status::Pending, system_clock::now() }); - return { ClaimResult::Claimed, nullptr }; - } - - void ShaderCache::ResolvePendingFailure(const std::string& key) - { - bool changed = false; - { - std::unique_lock lockM{ mapMutex }; - auto it = shaderMap.find(key); - if (it != shaderMap.end() && it->second.status == ShaderCompilationTask::Status::Pending) { - it->second = ShaderCacheResult{ nullptr, ShaderCompilationTask::Status::Failed, system_clock::now() }; - changed = true; - } - } - if (changed) { - mapCV.notify_all(); - } - } - ID3DBlob* ShaderCache::GetCompletedShader(const std::string& a_key) { std::string type = SIE::SShaderCache::GetTypeFromShaderString(a_key); @@ -2269,8 +2211,7 @@ namespace SIE if (IsCompiling()) { logger::info("Stopping {} remaining shader compilation tasks", compilationSet.totalTasks - compilationSet.completedTasks - compilationSet.failedTasks); } - ssource.request_stop(); // signals any legacy stop_token users - managementJthread.request_stop(); // stops management thread + in-flight compilations + ssource.request_stop(); compilationSet.Clear(); } @@ -2370,13 +2311,8 @@ namespace SIE ShaderCache::ShaderCache() { dependencyTracker = std::make_unique(); - logger::debug("ShaderCache initialized: {} startup threads, {} background threads, {} pool threads", - (int)compilationThreadCount, (int)backgroundCompilationThreadCount, (int)compilationPool.get_thread_count()); - // Management thread runs on a dedicated jthread, not in the compilation pool, - // so it doesn't consume a pool slot that could be used for shader compilation. - managementJthread = std::jthread([this](std::stop_token stoken) { - ManageCompilationSet(stoken); - }); + logger::debug("ShaderCache initialized with {} compiler threads", (int)compilationThreadCount); + compilationPool.detach_task([this, token = ssource.get_token()] { ManageCompilationSet(token); }); } bool ShaderCache::UseFileWatcher() const @@ -2410,12 +2346,7 @@ namespace SIE pathStr += std::format("{}; ", path); } logger::debug("ShaderCache watching for changes in {}", pathStr); - // Capture listener by value so the thread does not race with StopFileWatcher() - // nulling this->listener before the thread has had a chance to start. - auto* capturedListener = listener; - capturedListener->fileWatcherThread = std::jthread([capturedListener]() { - capturedListener->processQueue(); - }); + compilationPool.detach_task([this] { listener->processQueue(); }); } else { logger::debug("ShaderCache already enabled"); } @@ -2424,16 +2355,11 @@ namespace SIE void ShaderCache::StopFileWatcher() { logger::info("Stopping FileWatcher"); - // Set flag first so processQueue()'s loop condition becomes false before we join. - useFileWatcher = false; if (fileWatcher) { fileWatcher->removeWatch(watchID); fileWatcher = nullptr; } if (listener) { - // ~jthread() calls request_stop() + join(); processQueue() exits when - // UseFileWatcher() returns false (set above). - delete listener; listener = nullptr; } } @@ -2620,87 +2546,6 @@ namespace SIE return hideError; } - int ShaderCache::GetHeavyTasksInFlight() - { - return static_cast(compilationSet.heavyTasksInFlight.load(std::memory_order_relaxed)); - } - - uint64_t ShaderCache::GetSlowTasks() - { - return compilationSet.slowTasks.load(std::memory_order_relaxed); - } - - uint64_t ShaderCache::GetVerySlowTasks() - { - return compilationSet.verySlowTasks.load(std::memory_order_relaxed); - } - - std::vector CompilationSet::GetTopSlowTasks(size_t n) const - { - std::lock_guard lock(slowTasksMutex); - // Partial sort to get the N highest without fully sorting the whole vector. - std::vector result = slowTaskRecords; - if (result.size() > n) { - std::partial_sort(result.begin(), result.begin() + n, result.end(), - [](const SlowTaskRecord& a, const SlowTaskRecord& b) { return a.elapsedMs > b.elapsedMs; }); - result.resize(n); - } else { - std::sort(result.begin(), result.end(), - [](const SlowTaskRecord& a, const SlowTaskRecord& b) { return a.elapsedMs > b.elapsedMs; }); - } - return result; - } - - std::vector ShaderCache::GetTopSlowTasks(size_t n) - { - return compilationSet.GetTopSlowTasks(n); - } - - std::optional CompilationSet::GetParallelismStats() const - { - std::vector records; - { - std::lock_guard lock(slowTasksMutex); - if (slowTaskRecords.empty()) { - return std::nullopt; - } - records = slowTaskRecords; - } - - ParallelismStats stats; - stats.sampleCount = records.size(); - for (const auto& rec : records) { - stats.workMs += rec.elapsedMs; - stats.spanMs = std::max(stats.spanMs, rec.elapsedMs); - stats.avgQueueWaitMs += rec.queueWaitMs; - stats.maxQueueWaitMs = std::max(stats.maxQueueWaitMs, rec.queueWaitMs); - } - stats.avgQueueWaitMs /= static_cast(stats.sampleCount); - - LARGE_INTEGER now; - QueryPerformanceCounter(&now); - int64_t endTime = completionTime.load(std::memory_order_relaxed); - if (endTime == 0) { - endTime = now.QuadPart; - } - stats.makespanMs = static_cast(endTime - lastReset.QuadPart) * 1000.0 / frequency.QuadPart; - - if (stats.spanMs > 0.0) { - stats.avgParallelism = stats.workMs / stats.spanMs; - } - if (stats.makespanMs > 0.0) { - stats.infiniteCoreEfficiency = stats.spanMs / stats.makespanMs; - stats.infiniteCoreGapPercent = std::max(0.0, 100.0 * (1.0 - stats.infiniteCoreEfficiency)); - } - - return stats; - } - - std::optional ShaderCache::GetParallelismStats() - { - return compilationSet.GetParallelismStats(); - } - void ShaderCache::ClearShaderMap(RE::BSShader::Type a_type) { std::string_view shaderTypeStr = magic_enum::enum_name(a_type); @@ -2885,79 +2730,8 @@ namespace SIE return; } - const auto taskKey = task.GetString(); - - // Thread priority serves as a signal to Intel Thread Director and - // the Windows scheduler for P-core vs E-core placement on hybrid CPUs. - // Heavy shaders compile at normal priority (favouring P-cores); light - // shaders stay below-normal (allowing E-core placement). On non-hybrid - // CPUs this still gives heavy compiles slightly more scheduler attention. - SetThreadPriority(GetCurrentThread(), - task.GetPriority() >= SIE::kHeavyPriorityThreshold ? THREAD_PRIORITY_NORMAL : THREAD_PRIORITY_BELOW_NORMAL); - - LARGE_INTEGER start, end, freq; - QueryPerformanceFrequency(&freq); - QueryPerformanceCounter(&start); - const double queueWaitMs = task.GetEnqueuedQpc() > 0 ? - static_cast(start.QuadPart - task.GetEnqueuedQpc()) * 1000.0 / freq.QuadPart : - 0.0; - - try { - task.Perform(); - } catch (const std::exception& e) { - logger::error("Unhandled exception compiling shader task {}: {}", taskKey, e.what()); - ResolvePendingFailure(taskKey); - } catch (...) { - logger::error("Unhandled non-standard exception compiling shader task {}", taskKey); - ResolvePendingFailure(taskKey); - } - - QueryPerformanceCounter(&end); - const double elapsedMs = static_cast(end.QuadPart - start.QuadPart) * 1000.0 / freq.QuadPart; - const uint64_t remaining = compilationSet.totalTasks - compilationSet.completedTasks.load(std::memory_order_relaxed) - compilationSet.failedTasks.load(std::memory_order_relaxed); - - // Proxy for permutation complexity: descriptor low 32 bits from GetId(); popcount = active defines. - // Shader file size provides a secondary signal for source complexity. - const auto descriptorComplexity = std::popcount(static_cast(task.GetId())); - uintmax_t sourceBytes = 0; - { - // GetString() format: "fxpFilename:ShaderClass:defines" — filename is before the first colon. - const auto taskStr = task.GetString(); - const auto sep = taskStr.find(':'); - if (sep != std::string::npos) { - const auto shaderName = taskStr.substr(0, sep); - if (auto path = SIE::SShaderCache::GetShaderPath(shaderName); !path.empty()) { - std::error_code ec; - sourceBytes = std::filesystem::file_size(path, ec); - } - } - } - - // Debug: full per-task record for post-mortem straggler analysis. - logger::debug("[ShaderTiming] {:.0f}ms | queue_wait={:.0f}ms | remaining={} | defines={} | src={}B | prio={} | tid={} | {}", - elapsedMs, queueWaitMs, remaining, descriptorComplexity, sourceBytes, - task.GetPriority(), GetCurrentThreadId(), taskKey); - - constexpr double kSlowMs = 2000.0; - constexpr double kVerySlowMs = 8000.0; - - // Record every task for post-mortem analysis and developer UI (top-N display). - { - std::lock_guard lock(compilationSet.slowTasksMutex); - compilationSet.slowTaskRecords.push_back({ taskKey, elapsedMs, queueWaitMs, task.GetPriority(), - static_cast(descriptorComplexity), sourceBytes }); - } - - if (elapsedMs >= kVerySlowMs) { - compilationSet.verySlowTasks++; - compilationSet.slowTasks++; - logger::info("[ShaderTiming] Very slow {:.0f}ms | queue_wait={:.0f}ms | remaining={} | defines={} | src={}B | prio={} | {}", - elapsedMs, queueWaitMs, remaining, descriptorComplexity, sourceBytes, task.GetPriority(), taskKey); - } else if (elapsedMs >= kSlowMs) { - compilationSet.slowTasks++; - logger::debug("[ShaderTiming] Slow {:.0f}ms | queue_wait={:.0f}ms | remaining={} | defines={} | src={}B | prio={} | {}", - elapsedMs, queueWaitMs, remaining, descriptorComplexity, sourceBytes, task.GetPriority(), taskKey); - } + SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_BELOW_NORMAL); + task.Perform(); if (stoken.stop_requested()) { return; @@ -2970,8 +2744,7 @@ namespace SIE const RE::BSShader& aShader, uint32_t aDescriptor) : shaderClass(aShaderClass), - shader(aShader), descriptor(aDescriptor), - cachedPriority(ComputePriority(aShaderClass, aShader, aDescriptor)) + shader(aShader), descriptor(aDescriptor) {} void ShaderCompilationTask::Perform() const @@ -3004,72 +2777,6 @@ namespace SIE return GetId() == other.GetId(); } - int ShaderCompilationTask::ComputePriority(ShaderClass shaderClass, const RE::BSShader& shader, uint32_t descriptor) - { - int priority = 0; - const auto type = shader.shaderType.get(); - - // Base priority by shader type — Lighting is consistently the slowest - // (123KB source, 12s+ compile), followed by Effect (~31KB, up to 12s). - switch (type) { - case RE::BSShader::Type::Lighting: - priority += 1000; - break; - case RE::BSShader::Type::Effect: - priority += 500; - break; - case RE::BSShader::Type::Water: - priority += 300; - break; - default: - break; - } - - // Pixel shaders compile significantly slower than vertex shaders - if (shaderClass == ShaderClass::Pixel) - priority += 200; - - // More active descriptor bits → more #defines → more code paths for the compiler - priority += std::popcount(descriptor) * 30; - - // Known heavy Lighting techniques and flags from straggler analysis - if (type == RE::BSShader::Type::Lighting) { - const auto technique = static_cast(0x3F & (descriptor >> 24)); - - // LANDSCAPE techniques (MTLand, MTLandLODBlend) are among the heaviest - // due to multi-texture blending codegen — regularly 60-130s compile times - if (technique == ShaderCache::LightingShaderTechniques::MTLand || - technique == ShaderCache::LightingShaderTechniques::MTLandLODBlend) - priority += 500; - if (technique == ShaderCache::LightingShaderTechniques::Parallax || - technique == ShaderCache::LightingShaderTechniques::ParallaxOcc) - priority += 300; - if (technique == ShaderCache::LightingShaderTechniques::Eye) - priority += 200; - if (technique == ShaderCache::LightingShaderTechniques::MultilayerParallax) - priority += 200; - - // TRUE_PBR and ANISO_LIGHTING are the dominant cost drivers, - // especially in combination with LANDSCAPE (115-130s observed) - if (descriptor & static_cast(ShaderCache::LightingShaderFlags::TruePbr)) - priority += 500; - if (descriptor & static_cast(ShaderCache::LightingShaderFlags::AnisoLighting)) - priority += 300; - // Deferred adds extra codegen overhead - if (descriptor & static_cast(ShaderCache::LightingShaderFlags::Deferred)) - priority += 200; - - // LANDSCAPE + TRUE_PBR combination triggers extreme register pressure - // (6x unrolled texture layers * PBR params = 30+ textures, 180s+ compile) - if ((technique == ShaderCache::LightingShaderTechniques::MTLand || - technique == ShaderCache::LightingShaderTechniques::MTLandLODBlend) && - (descriptor & static_cast(ShaderCache::LightingShaderFlags::TruePbr))) - priority += 500; - } - - return priority; - } - std::optional CompilationSet::WaitTake(std::stop_token stoken) { std::unique_lock lock(compilationMutex); @@ -3077,9 +2784,8 @@ namespace SIE if (!conditionVariable.wait( lock, stoken, [this, &shaderCache]() { return !availableTasks.empty() && - // Dispatch when pool has room. Use < (not <=) so that after - // push_task() the total never exceeds the limit. - (int)shaderCache->compilationPool.get_tasks_total() < + // check against all tasks in queue to trickle the work. It cannot be the active tasks count because the thread pool itself is maximum. + (int)shaderCache->compilationPool.get_tasks_total() <= (!shaderCache->backgroundCompilation ? shaderCache->compilationThreadCount : shaderCache->backgroundCompilationThreadCount); })) { /*Woke up because of a stop request. */ return std::nullopt; @@ -3088,27 +2794,9 @@ namespace SIE QueryPerformanceCounter(&lastReset); lastCalculation = lastReset; } - - // Startup policy: keep dispatching the hardest queued work first. - // This preserves the existing priority score while preventing light tasks - // from bypassing queued heavy shaders and stretching the tail. - auto bestIt = availableTasks.end(); - if (!availableTasks.empty()) { - bestIt = std::prev(availableTasks.end()); - } - - if (bestIt == availableTasks.end()) { - return std::nullopt; - } - - ShaderCompilationTask task = *bestIt; - availableTasks.erase(bestIt); - - if (task.GetPriority() >= kHeavyPriorityThreshold) { - heavyTasksInFlight.fetch_add(1, std::memory_order_relaxed); - } - - tasksInProgress.insert(task); + auto node = availableTasks.extract(availableTasks.begin()); + auto& task = node.value(); + tasksInProgress.insert(std::move(node)); return task; } @@ -3118,16 +2806,11 @@ namespace SIE auto inProgressIt = tasksInProgress.find(task); auto processedIt = processedTasks.find(task); if (inProgressIt == tasksInProgress.end() && processedIt == processedTasks.end() && !globals::shaderCache->GetCompletedShader(task)) { - LARGE_INTEGER now; - QueryPerformanceCounter(&now); - auto queuedTask = task; - queuedTask.SetEnqueuedQpc(now.QuadPart); - auto [_, wasAdded] = availableTasks.insert(queuedTask); + auto [availableIt, wasAdded] = availableTasks.insert(task); lock.unlock(); if (wasAdded) { conditionVariable.notify_one(); totalTasks++; - totalPriorityWeight += static_cast(task.GetPriority()) + 1; } } } @@ -3153,17 +2836,6 @@ namespace SIE logger::debug("Compiling Task failed: {}", key); failedTasks++; } - completedPriorityWeight += static_cast(task.GetPriority()) + 1; - - // Track heavy task completion for P-core concurrency limiting - if (task.GetPriority() >= kHeavyPriorityThreshold) { - auto current = heavyTasksInFlight.load(std::memory_order_relaxed); - while (current > 0 && - !heavyTasksInFlight.compare_exchange_weak(current, current - 1, - std::memory_order_relaxed, - std::memory_order_relaxed)) { - } - } // Update timing LARGE_INTEGER now; @@ -3201,49 +2873,36 @@ namespace SIE completedTasks = 0; failedTasks = 0; cacheHitTasks = 0; - slowTasks = 0; - verySlowTasks = 0; - totalPriorityWeight = 0; - completedPriorityWeight = 0; - heavyTasksInFlight = 0; QueryPerformanceCounter(&lastReset); QueryPerformanceCounter(&lastCalculation); completionTime = { 0 }; // Reset completion time totalTime = { 0 }; - { - std::lock_guard slowLock(slowTasksMutex); - slowTaskRecords.clear(); - } } std::string CompilationSet::GetHumanTime(double a_totalMs) { - return Util::FormatDuration(a_totalMs); + int milliseconds = static_cast(a_totalMs); + int seconds = milliseconds / 1000; + int minutes = seconds / 60; + seconds %= 60; + int hours = minutes / 60; + minutes %= 60; + + return fmt::format("{:02}:{:02}:{:02}", hours, minutes, seconds); } double CompilationSet::GetEta() { - // Use wall-clock elapsed time since compilation started - LARGE_INTEGER now; - QueryPerformanceCounter(&now); - int64_t endTime = (completionTime.load(std::memory_order_relaxed) != 0) ? completionTime.load(std::memory_order_relaxed) : now.QuadPart; - double elapsedMs = static_cast(endTime - lastReset.QuadPart) * 1000.0 / frequency.QuadPart; - - if (elapsedMs <= 0.0) - return 0.0; + // For ETA calculation, we still use the active compilation time (totalTime) + // because it reflects the actual work time, not wall-clock time + double totalMs = static_cast(totalTime.QuadPart) * 1000.0 / frequency.QuadPart; - // Priority-weighted ETA: heavy tasks completing early should not inflate - // the estimate. We measure progress as a fraction of total priority weight - // completed, which accounts for the decreasing cost of remaining tasks. - double doneWeight = static_cast(completedPriorityWeight.load(std::memory_order_relaxed)); - double totalWeight = static_cast(totalPriorityWeight.load(std::memory_order_relaxed)); - - if (doneWeight <= 0.0 || totalWeight <= 0.0) - return 0.0; - - double fractionDone = doneWeight / totalWeight; - double estimatedTotalMs = elapsedMs / fractionDone; - return std::max(estimatedTotalMs - elapsedMs, 0.0); + if (totalMs == 0.0) { + return 0.0; // Avoid division by zero + } + auto rate = completedTasks / totalMs; + auto remaining = totalTasks - completedTasks - failedTasks; + return std::max(remaining / rate, 0.0); } std::string CompilationSet::GetStatsString(bool a_timeOnly, bool a_elapsedOnly) @@ -3268,7 +2927,7 @@ namespace SIE } } - return fmt::format("{}/{} (successful/total)\tfailed: {}\tdeduplicated: {}\nElapsed/Estimated Time: {}/{}", + return fmt::format("{}/{} (successful/total)\tfailed: {}\tcachehits: {}\nElapsed/Estimated Time: {}/{}", (std::uint64_t)completedTasks, (std::uint64_t)totalTasks, (std::uint64_t)failedTasks, diff --git a/src/ShaderCache.h b/src/ShaderCache.h index 72d4c6cec8..89bcedf92d 100644 --- a/src/ShaderCache.h +++ b/src/ShaderCache.h @@ -4,8 +4,6 @@ #include #include -#include "Utils/WinApi.h" - using namespace std::chrono; namespace ShaderConstants @@ -220,24 +218,12 @@ namespace SIE size_t GetId() const; std::string GetString() const; - /// LPT scheduling score: higher = more expensive = should be dispatched first. - /// Based on shader type, class, descriptor complexity, and known heavy defines. - /// Computed once at construction and cached. - int GetPriority() const { return cachedPriority; } - void SetEnqueuedQpc(int64_t qpc) { enqueuedQpc = qpc; } - int64_t GetEnqueuedQpc() const { return enqueuedQpc; } - bool operator==(const ShaderCompilationTask& other) const; protected: ShaderClass shaderClass; const RE::BSShader& shader; uint32_t descriptor; - - private: - static int ComputePriority(ShaderClass shaderClass, const RE::BSShader& shader, uint32_t descriptor); - int cachedPriority; - int64_t enqueuedQpc = 0; }; } @@ -250,24 +236,8 @@ struct std::hash } }; -struct TaskPriorityLess -{ - bool operator()(const SIE::ShaderCompilationTask& a, const SIE::ShaderCompilationTask& b) const - { - if (a.GetPriority() != b.GetPriority()) { - return a.GetPriority() < b.GetPriority(); - } - return a.GetId() < b.GetId(); - } -}; - namespace SIE { - /// Threshold above which a shader task is considered "heavy" and benefits - /// from P-core placement on hybrid CPUs. Used for thread-priority hints, - /// telemetry, and developer-facing diagnostics. - constexpr int kHeavyPriorityThreshold = 500; - class CompilationSet { public: @@ -289,61 +259,19 @@ namespace SIE void Add(const ShaderCompilationTask& task); void Complete(const ShaderCompilationTask& task); void Clear(); - static std::string GetHumanTime(double a_totalMs); + std::string GetHumanTime(double a_totalMs); double GetEta(); std::string GetStatsString(bool a_timeOnly = false, bool a_elapsedOnly = false); std::atomic completedTasks = 0; std::atomic totalTasks = 0; std::atomic failedTasks = 0; - std::atomic cacheHitTasks = 0; // number of compiles of a previously seen shader combo - std::atomic slowTasks = 0; // shaders taking >= 2s - std::atomic verySlowTasks = 0; // shaders taking >= 8s - std::atomic totalPriorityWeight = 0; // sum of (GetPriority()+1) for all queued tasks - std::atomic completedPriorityWeight = 0; // sum of (GetPriority()+1) for completed/failed tasks - std::atomic heavyTasksInFlight = 0; // number of dispatched heavy (>= kHeavyPriorityThreshold) tasks still running + std::atomic cacheHitTasks = 0; // number of compiles of a previously seen shader combo std::mutex compilationMutex; - /// Per-task timing record stored for post-mortem analysis and developer UI. - struct SlowTaskRecord - { - std::string key; // ShaderCompilationTask::GetString() — "fxpFile:Class:defines" - double elapsedMs = 0.0; - double queueWaitMs = 0.0; - int priority = 0; // estimated compile weight (see ComputePriority) - int defineCount = 0; // popcount of descriptor — active define permutations - uintmax_t sourceSizeBytes = 0; // HLSL source file size at compile time - }; - - /// On-demand parallelism metrics derived from task timings. - struct ParallelismStats - { - double workMs = 0.0; // W = sum of all task times - double spanMs = 0.0; // S ~= longest single task - double makespanMs = 0.0; // T_p = wall-clock compile duration - double avgParallelism = 0.0; // W / S - double infiniteCoreEfficiency = 0.0; // S / T_p - double infiniteCoreGapPercent = 0.0; // 100 * (1 - S / T_p) - double avgQueueWaitMs = 0.0; // average enqueue -> dispatch delay - double maxQueueWaitMs = 0.0; // worst enqueue -> dispatch delay - size_t sampleCount = 0; - }; - - /// All per-task timing records for this build (appended from multiple threads). - /// Protected by slowTasksMutex. - std::vector slowTaskRecords; - mutable std::mutex slowTasksMutex; - - /// Returns a copy of the N records with the highest elapsedMs, sorted descending. - std::vector GetTopSlowTasks(size_t n = 3) const; - - /// Computes parallelism metrics on demand from collected task timings. - std::optional GetParallelismStats() const; - private: - /// Tasks awaiting dispatch, ordered by cached priority and task id. - std::set availableTasks; - std::set tasksInProgress; - std::set processedTasks; // completed or failed + std::unordered_set availableTasks; + std::unordered_set tasksInProgress; + std::unordered_set processedTasks; // completed or failed std::condition_variable_any conditionVariable; }; @@ -466,15 +394,6 @@ namespace SIE bool Clear(const std::string& a_path); bool AddCompletedShader(ShaderClass shaderClass, const RE::BSShader& shader, uint32_t descriptor, ID3DBlob* a_blob); - - enum class ClaimResult - { - CacheHit, // Already compiled; use the returned blob - Claimed // Claimed as Pending; caller must compile and call AddCompletedShader - }; - std::pair ClaimCompilation(const std::string& key); - void ResolvePendingFailure(const std::string& key); - ID3DBlob* GetCompletedShader(const std::string& a_key); ID3DBlob* GetCompletedShader(const SIE::ShaderCompilationTask& a_task); ID3DBlob* GetCompletedShader(ShaderClass shaderClass, const RE::BSShader& shader, uint32_t descriptor); @@ -513,15 +432,6 @@ namespace SIE void IterateShaderBlock(bool a_forward = true); bool IsHideErrors(); - // Overlay stats - int GetHeavyTasksInFlight(); - uint64_t GetSlowTasks(); - uint64_t GetVerySlowTasks(); - - /// Returns a copy of the top-N slowest task records from the last build, sorted descending. - std::vector GetTopSlowTasks(size_t n = 3); - std::optional GetParallelismStats(); - /** * @brief Clears all shaders of a specific type from the shader map. * @@ -535,13 +445,9 @@ namespace SIE ShaderFileDependencyTracker* GetDependencyTracker() { return dependencyTracker.get(); } - // Use all logical cores minus one at startup for OS headroom (E-cores included). - // Management and file watcher run on dedicated jthreads, not pool slots. - // Background (in-game): half of P-cores only, to avoid starving the render thread. - int32_t compilationThreadCount = std::max(static_cast(std::thread::hardware_concurrency()) - 1, 1); - int32_t backgroundCompilationThreadCount = std::max(static_cast(Util::GetPerformanceCoreCount()) / 2, 1); - BS::thread_pool<> compilationPool{ static_cast(compilationThreadCount) }; - std::jthread managementJthread; // dedicated thread for ManageCompilationSet (not in pool) + int32_t compilationThreadCount = std::max({ static_cast(std::thread::hardware_concurrency()) - 4, static_cast(std::thread::hardware_concurrency()) * 3 / 4, 1 }); + int32_t backgroundCompilationThreadCount = std::max(static_cast(std::thread::hardware_concurrency()) / 2, 1); + BS::thread_pool<> compilationPool{}; bool backgroundCompilation = false; bool menuLoaded = false; @@ -811,7 +717,6 @@ namespace SIE CompilationSet compilationSet; ankerl::unordered_dense::map shaderMap{}; std::mutex mapMutex; // guard for shaderMap - std::condition_variable mapCV; // signalled when a Pending entry transitions to Completed/Failed ankerl::unordered_dense::map modifiedShaderMap{}; // hashmap when a shader source file last modified std::mutex modifiedMapMutex; // guard for modifiedShaderMap ankerl::unordered_dense::map> hlslToShaderMap{}; // hashmap linking specific hlsl files to shader keys in shaderMap @@ -851,8 +756,6 @@ namespace SIE void processQueue(); void handleFileAction(efsw::WatchID, const std::string& dir, const std::string& filename, efsw::Action action, std::string) override; - std::jthread fileWatcherThread; // dedicated thread for processQueue (not in pool) - private: ShaderFileDependencyTracker* deps; struct fileAction diff --git a/src/Utils/Format.cpp b/src/Utils/Format.cpp index 2b0d88905b..61f1d91195 100644 --- a/src/Utils/Format.cpp +++ b/src/Utils/Format.cpp @@ -158,21 +158,6 @@ namespace Util } } - std::string FormatDuration(double ms) - { - // Validate input: handle negative, NaN, and infinite values - if (!std::isfinite(ms) || ms < 0.0) { - return "00:00:00"; - } - - // Use int64_t to avoid overflow on long durations (>596 hours with int) - int64_t total_s = static_cast(ms) / 1000; - int64_t hours = total_s / 3600; - int64_t minutes = (total_s % 3600) / 60; - int64_t seconds = total_s % 60; - return fmt::format("{:02}:{:02}:{:02}", hours, minutes, seconds); - } - std::string TimeAgoString(std::chrono::steady_clock::time_point last) { using namespace std::chrono; diff --git a/src/Utils/Format.h b/src/Utils/Format.h index dcee025bce..5448581c7c 100644 --- a/src/Utils/Format.h +++ b/src/Utils/Format.h @@ -66,17 +66,6 @@ namespace Util */ std::string FormatTimeAgo(std::filesystem::file_time_type fileTime); - /** - * Formats a duration given in milliseconds as HH:MM:SS. - * Suitable for displaying long-running operation times (e.g. shader compilation). - * - * @param ms Duration in milliseconds. Fractional milliseconds are truncated. - * Non-finite (NaN/inf) or negative values are clamped to "00:00:00". - * Durations >= 24 hours display hours without limit (e.g., "125:34:56"). - * @return Formatted string like "00:02:35" or "00:00:00" for invalid inputs - */ - std::string FormatDuration(double ms); - /** * Formats a delta value with percentage difference for A/B test comparisons. * Returns a string like "+0.45 ms (+12.3%)" or "-0.23 ms (-8.1%)". diff --git a/src/Utils/WinApi.cpp b/src/Utils/WinApi.cpp index 7f3e810015..cef816e213 100644 --- a/src/Utils/WinApi.cpp +++ b/src/Utils/WinApi.cpp @@ -27,46 +27,4 @@ namespace Util return std::nullopt; } - - uint32_t GetPerformanceCoreCount() - { - // Cache the result — CPU topology never changes at runtime. - // C++11 guarantees thread-safe initialisation of static locals. - static const uint32_t cached = []() -> uint32_t { - const uint32_t fallback = std::max(1u, std::thread::hardware_concurrency()); - - DWORD size = 0; - GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &size); - if (GetLastError() != ERROR_INSUFFICIENT_BUFFER || size == 0) - return fallback; - - std::vector buf(size); - auto* info = reinterpret_cast(buf.data()); - if (!GetLogicalProcessorInformationEx(RelationProcessorCore, info, &size)) - return fallback; - - // First pass: find the highest efficiency class present. - BYTE maxClass = 0; - for (DWORD offset = 0; offset < size;) { - auto* entry = reinterpret_cast(buf.data() + offset); - if (entry->Processor.EfficiencyClass > maxClass) - maxClass = entry->Processor.EfficiencyClass; - offset += entry->Size; - } - - // Second pass: count logical processors on those (P-)cores. - uint32_t count = 0; - for (DWORD offset = 0; offset < size;) { - auto* entry = reinterpret_cast(buf.data() + offset); - if (entry->Processor.EfficiencyClass == maxClass) { - for (WORD g = 0; g < entry->Processor.GroupCount; ++g) - count += static_cast(std::popcount(entry->Processor.GroupMask[g].Mask)); - } - offset += entry->Size; - } - - return count > 0 ? count : fallback; - }(); - return cached; - } } // namespace Util diff --git a/src/Utils/WinApi.h b/src/Utils/WinApi.h index 3b21ccd8a0..cf048b3d31 100644 --- a/src/Utils/WinApi.h +++ b/src/Utils/WinApi.h @@ -3,10 +3,4 @@ namespace Util { std::optional GetDllVersion(const std::wstring& dllPath); - - /// Returns the number of logical processors on the highest-efficiency cores - /// (P-cores on Intel hybrid CPUs). On non-hybrid CPUs all cores share the - /// same efficiency class, so this returns std::thread::hardware_concurrency(). - /// Falls back to hardware_concurrency() on any API failure. - uint32_t GetPerformanceCoreCount(); } // namespace Util