Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions ares/md/mcd/mcd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,15 @@ struct MCD : M68000, Thread {
std::atomic_flag videoFramePrefetchThreadStarted;
std::atomic_flag videoFramePrefetchThreadShutdownRequested;
std::atomic_flag videoFramePrefetchThreadShutdownComplete;
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
// Workaround for bad performance on Windows targets under MSYS2 with libc++ due to https://github.com/llvm/llvm-project/issues/127221
std::mutex videoFramePrefetchMutex;
std::condition_variable notifyVideoFramePrefetchPending;
std::condition_variable notifyVideoFramePrefetchComplete;
std::condition_variable notifyVideoFramePrefetchThreadStarted;
std::condition_variable notifyVideoFramePrefetchThreadShutdownRequested;
std::condition_variable notifyVideoFramePrefetchThreadShutdownComplete;
#endif
const unsigned char* videoFramePrefetchTarget;
std::vector<unsigned char> videoFramePrefetchBuffer;
} ld;
Expand Down
85 changes: 85 additions & 0 deletions ares/md/mcd/megald.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,16 +149,37 @@ auto MCD::LD::load(string location) -> void {
videoFramePrefetchThreadShutdownComplete.clear();
std::thread workerThread(std::bind(std::mem_fn(&MCD::LD::videoFramePrefetchThread), this));
workerThread.detach();
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
{
std::unique_lock lock(videoFramePrefetchMutex);
while (!videoFramePrefetchThreadStarted.test()) {
notifyVideoFramePrefetchThreadStarted.wait(lock);
}
}
#else
videoFramePrefetchThreadStarted.wait(false);
#endif
}

auto MCD::LD::unload() -> void {
// Request the prefetch background thread to terminate, and wait for it to complete.
if (videoFramePrefetchThreadStarted.test()) {
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
{
std::unique_lock lock(videoFramePrefetchMutex);
videoFramePrefetchThreadShutdownRequested.test_and_set();
videoFramePrefetchPending.test_and_set();
notifyVideoFramePrefetchPending.notify_all();
while (!videoFramePrefetchThreadShutdownComplete.test()) {
notifyVideoFramePrefetchThreadShutdownComplete.wait(lock);
}
}
#else
videoFramePrefetchThreadShutdownRequested.test_and_set();
videoFramePrefetchPending.test_and_set();
videoFramePrefetchPending.notify_all();
videoFramePrefetchThreadShutdownComplete.wait(false);
#endif
}

// Close the mmi file
Expand Down Expand Up @@ -2773,11 +2794,26 @@ auto MCD::LD::loadCurrentVideoFrameIntoBuffer() -> void {
// when it starts the new one, as it is our responsibility to clear the prefetch complete state. This means if we
// don't wait for the original prefetch to complete here, it would trigger a race condition for the load of the
// following frame.
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
{
std::unique_lock lock(videoFramePrefetchMutex);
while (videoFramePrefetchPending.test()) {
notifyVideoFramePrefetchPending.wait(lock);
}
if (videoFramePrefetchTarget != nullptr) {
while (!videoFramePrefetchComplete.test()) {
notifyVideoFramePrefetchComplete.wait(lock);
}
videoFramePrefetchComplete.clear();
}
}
#else
videoFramePrefetchPending.wait(true);
if (videoFramePrefetchTarget != nullptr) {
videoFramePrefetchComplete.wait(false);
videoFramePrefetchComplete.clear();
}
#endif

// If the prefetch operation is for the correct frame, we've just waited for it to complete above, so we now swap the
// prefetch buffer with the build frame buffer. Note that this will exchange memory buffer pointers and not copy the
Expand Down Expand Up @@ -2841,28 +2877,61 @@ auto MCD::LD::loadCurrentVideoFrameIntoBuffer() -> void {
return;
}
videoFramePrefetchTarget = videoFrameCompressed;
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
{
std::unique_lock lock(videoFramePrefetchMutex);
videoFramePrefetchPending.test_and_set();
notifyVideoFramePrefetchPending.notify_all();
}
#else
videoFramePrefetchPending.test_and_set();
videoFramePrefetchPending.notify_all();
#endif
}

auto MCD::LD::videoFramePrefetchThread() -> void {
// Trigger a notification that this worker thread has started
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
{
std::unique_lock lock(videoFramePrefetchMutex);
videoFramePrefetchThreadStarted.test_and_set();
notifyVideoFramePrefetchThreadStarted.notify_all();
}
#else
videoFramePrefetchThreadStarted.test_and_set();
videoFramePrefetchThreadStarted.notify_all();
#endif

// Perform prefetch requests as they arrive, and terminate the thread when requested.
while (!videoFramePrefetchThreadShutdownRequested.test()) {
// Wait for a prefetch request to arrive
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
{
std::unique_lock lock(videoFramePrefetchMutex);
while (!videoFramePrefetchPending.test()) {
notifyVideoFramePrefetchPending.wait(lock);
}
}
#else
videoFramePrefetchPending.wait(false);
#endif

// If this thread has been requested to terminate, break out of the prefetch loop.
if (videoFramePrefetchThreadShutdownRequested.test()) {
break;
}

// Trigger a notification that a prefetch request is no longer pending
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
{
std::unique_lock lock(videoFramePrefetchMutex);
videoFramePrefetchPending.clear();
notifyVideoFramePrefetchPending.notify_all();
}
#else
videoFramePrefetchPending.clear();
videoFramePrefetchPending.notify_all();
#endif

// Allocate memory for the prefetch frame buffer if it's currently empty
if (videoFramePrefetchBuffer.empty()) {
Expand All @@ -2874,13 +2943,29 @@ auto MCD::LD::videoFramePrefetchThread() -> void {
qoi2_decode_data(videoFramePrefetchTarget + QON_FRAME_SIZE_SIZE, frameSizeCompressed, &video.videoFrameHeader, nullptr, videoFramePrefetchBuffer.data(), 3);

// Trigger a notification that the prefetch operation is complete
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
{
std::unique_lock lock(videoFramePrefetchMutex);
videoFramePrefetchComplete.test_and_set();
notifyVideoFramePrefetchComplete.notify_all();
}
#else
videoFramePrefetchComplete.test_and_set();
videoFramePrefetchComplete.notify_all();
#endif
}

// Trigger a notification that this worker thread has shut down
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
{
std::unique_lock lock(videoFramePrefetchMutex);
videoFramePrefetchThreadShutdownComplete.test_and_set();
notifyVideoFramePrefetchThreadShutdownComplete.notify_all();
}
#else
videoFramePrefetchThreadShutdownComplete.test_and_set();
videoFramePrefetchThreadShutdownComplete.notify_all();
#endif
}

auto MCD::LD::decodeBiphaseCodeFromScanline(int lineNo) -> u32 {
Expand Down
8 changes: 8 additions & 0 deletions ares/md/md.hpp
Original file line number Diff line number Diff line change
@@ -1,13 +1,21 @@
#pragma once
//started: 2016-07-08

// Select the std::mutex / std::condition_variable fallback in place of std::atomic_flag wait/notify.
// It is enabled automatically for Windows targets whose compiler does not define _MSC_VER, as a
// workaround for bad performance under MSYS2 with libc++; see https://github.com/llvm/llvm-project/issues/127221
// The macro is left untouched if the build has already defined it externally (e.g. via compiler flags).
#if !defined(USE_ATOMIC_FLAG_NOTIFY_FALLBACK) && !defined(_MSC_VER) && defined(_WIN32)
#define USE_ATOMIC_FLAG_NOTIFY_FALLBACK
#endif
Comment on lines +4 to +6
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
#if !defined(USE_ATOMIC_FLAG_NOTIFY_FALLBACK) && !defined(_MSC_VER) && defined(_WIN32)
#define USE_ATOMIC_FLAG_NOTIFY_FALLBACK
#endif
#if !defined(USE_ATOMIC_FLAG_NOTIFY_FALLBACK)
// Work around a slow implementation of atomic flag waits with libc++ on Windows; see https://github.com/llvm/llvm-project/issues/127221
#if !defined(_MSC_VER) && defined(_WIN32)
#define USE_ATOMIC_FLAG_NOTIFY_FALLBACK
// Atomic flag waits are also unavailable below macOS 11.0
#elif defined(__APPLE__) && __MAC_OS_X_VERSION_MIN_REQUIRED < 110000
#define USE_ATOMIC_FLAG_NOTIFY_FALLBACK
#endif
#endif

Copy link
Contributor Author

@RogerSanders RogerSanders Oct 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My only concern with going this route is that I'd like to strip out the entire alternate implementation if/when we cut over to the Microsoft STL, which is being discussed (or if/when the libc++ ticket is resolved). If we tie it to macOS platform support on older versions, support for which has already been dropped, it's not clear to me when the alternate implementation would be cut. Ares could benefit greatly in other areas from better use of modern threading approaches, particularly as we move away from the nall threading implementation, and I feel like having this slower, more error-prone threading method in there muddies the waters. To properly support older macOS versions, we'd have to carry this same dual-system threading model into other parts of the software, or just use the worse version, both of which seem like significant costs to me.

Copy link
Contributor Author

@RogerSanders RogerSanders Oct 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll add here, though, that it's possible to set the "USE_ATOMIC_FLAG_NOTIFY_FALLBACK" define externally, i.e., through a CMake rule or CXXFLAGS, if you do want to do an unsupported build targeting an older macOS release. This would allow you to do that without having to modify the source, at least under this changeset today. Future changes elsewhere in the code would probably cause more problems over time.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see the platform support question as a totally separate concern, myself. I'm not proposing re-lowering the macOS deployment target. I just think if we're adding a workaround like this, we may as well provide complete information in the code about what that workaround enables. I do not think that the mere presence of this definition impacts our goals or criteria for whether something should be removed or an OS dropped, for whatever reason.

One other thing: I am not sure I personally see a time horizon where ares drops support for libc++ on Windows. A large plurality of homebrew and emulator developers target libc++ via MSYS2/MinGW on Windows, and from my experience I do not see that situation changing in any near-term timeframe. Even if LLVM ships an update that improves the underlying implementation of atomic flag waits, the code being introduced here will still be fairly long-lived in ares, I would expect.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll defer to @LukeUsher on what he thinks is the right way to go here. We could adopt the preprocessor change you propose here; I'm not strictly opposed to it. However, shifting the platform/compiler detection logic centrally to CMakeLists might be more appropriate if it's going to be a pattern that's replicated or kept for the longer term, rather than just a temporary workaround.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I honestly have half a mind to just fix it in llvm-project myself and be done with it, but I can't spare the time for a few more weeks.


#include <ares/ares.hpp>
#include <nall/decode/mmi.hpp>
#include <nall/dsp/iir/dc-removal.hpp>
#include <vector>
#include <cmath>
#include <thread>
#include <atomic>
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
#include <mutex>
#include <condition_variable>
#endif
#include <functional>

#include <qon/qon.h>
Expand Down
85 changes: 85 additions & 0 deletions ares/pce/pcd/ldrom2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,16 +149,37 @@ auto PCD::LD::load(string location) -> void {
videoFramePrefetchThreadShutdownComplete.clear();
std::thread workerThread(std::bind(std::mem_fn(&PCD::LD::videoFramePrefetchThread), this));
workerThread.detach();
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
{
std::unique_lock lock(videoFramePrefetchMutex);
while (!videoFramePrefetchThreadStarted.test()) {
notifyVideoFramePrefetchThreadStarted.wait(lock);
}
}
#else
videoFramePrefetchThreadStarted.wait(false);
#endif
}

auto PCD::LD::unload() -> void {
// Request the prefetch background thread to terminate, and wait for it to complete.
if (videoFramePrefetchThreadStarted.test()) {
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
{
std::unique_lock lock(videoFramePrefetchMutex);
videoFramePrefetchThreadShutdownRequested.test_and_set();
videoFramePrefetchPending.test_and_set();
notifyVideoFramePrefetchPending.notify_all();
while (!videoFramePrefetchThreadShutdownComplete.test()) {
notifyVideoFramePrefetchThreadShutdownComplete.wait(lock);
}
}
#else
videoFramePrefetchThreadShutdownRequested.test_and_set();
videoFramePrefetchPending.test_and_set();
videoFramePrefetchPending.notify_all();
videoFramePrefetchThreadShutdownComplete.wait(false);
#endif
}

// Close the mmi file
Expand Down Expand Up @@ -2777,11 +2798,26 @@ auto PCD::LD::loadCurrentVideoFrameIntoBuffer() -> void {
// when it starts the new one, as it is our responsibility to clear the prefetch complete state. This means if we
// don't wait for the original prefetch to complete here, it would trigger a race condition for the load of the
// following frame.
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
{
std::unique_lock lock(videoFramePrefetchMutex);
while (videoFramePrefetchPending.test()) {
notifyVideoFramePrefetchPending.wait(lock);
}
if (videoFramePrefetchTarget != nullptr) {
while (!videoFramePrefetchComplete.test()) {
notifyVideoFramePrefetchComplete.wait(lock);
}
videoFramePrefetchComplete.clear();
}
}
#else
videoFramePrefetchPending.wait(true);
if (videoFramePrefetchTarget != nullptr) {
videoFramePrefetchComplete.wait(false);
videoFramePrefetchComplete.clear();
}
#endif

// If the prefetch operation is for the correct frame, we've just waited for it to complete above, so we now swap the
// prefetch buffer with the build frame buffer. Note that this will exchange memory buffer pointers and not copy the
Expand Down Expand Up @@ -2845,28 +2881,61 @@ auto PCD::LD::loadCurrentVideoFrameIntoBuffer() -> void {
return;
}
videoFramePrefetchTarget = videoFrameCompressed;
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
{
std::unique_lock lock(videoFramePrefetchMutex);
videoFramePrefetchPending.test_and_set();
notifyVideoFramePrefetchPending.notify_all();
}
#else
videoFramePrefetchPending.test_and_set();
videoFramePrefetchPending.notify_all();
#endif
}

auto PCD::LD::videoFramePrefetchThread() -> void {
// Trigger a notification that this worker thread has started
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
{
std::unique_lock lock(videoFramePrefetchMutex);
videoFramePrefetchThreadStarted.test_and_set();
notifyVideoFramePrefetchThreadStarted.notify_all();
}
#else
videoFramePrefetchThreadStarted.test_and_set();
videoFramePrefetchThreadStarted.notify_all();
#endif

// Perform prefetch requests as they arrive, and terminate the thread when requested.
while (!videoFramePrefetchThreadShutdownRequested.test()) {
// Wait for a prefetch request to arrive
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
{
std::unique_lock lock(videoFramePrefetchMutex);
while (!videoFramePrefetchPending.test()) {
notifyVideoFramePrefetchPending.wait(lock);
}
}
#else
videoFramePrefetchPending.wait(false);
#endif

// If this thread has been requested to terminate, break out of the prefetch loop.
if (videoFramePrefetchThreadShutdownRequested.test()) {
break;
}

// Trigger a notification that a prefetch request is no longer pending
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
{
std::unique_lock lock(videoFramePrefetchMutex);
videoFramePrefetchPending.clear();
notifyVideoFramePrefetchPending.notify_all();
}
#else
videoFramePrefetchPending.clear();
videoFramePrefetchPending.notify_all();
#endif

// Allocate memory for the prefetch frame buffer if it's currently empty
if (videoFramePrefetchBuffer.empty()) {
Expand All @@ -2878,13 +2947,29 @@ auto PCD::LD::videoFramePrefetchThread() -> void {
qoi2_decode_data(videoFramePrefetchTarget + QON_FRAME_SIZE_SIZE, frameSizeCompressed, &video.videoFrameHeader, nullptr, videoFramePrefetchBuffer.data(), 3);

// Trigger a notification that the prefetch operation is complete
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
{
std::unique_lock lock(videoFramePrefetchMutex);
videoFramePrefetchComplete.test_and_set();
notifyVideoFramePrefetchComplete.notify_all();
}
#else
videoFramePrefetchComplete.test_and_set();
videoFramePrefetchComplete.notify_all();
#endif
}

// Trigger a notification that this worker thread has shut down
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
{
std::unique_lock lock(videoFramePrefetchMutex);
videoFramePrefetchThreadShutdownComplete.test_and_set();
notifyVideoFramePrefetchThreadShutdownComplete.notify_all();
}
#else
videoFramePrefetchThreadShutdownComplete.test_and_set();
videoFramePrefetchThreadShutdownComplete.notify_all();
#endif
}

auto PCD::LD::decodeBiphaseCodeFromScanline(int lineNo) -> u32 {
Expand Down
9 changes: 9 additions & 0 deletions ares/pce/pcd/pcd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,15 @@ struct PCD : Thread {
std::atomic_flag videoFramePrefetchThreadStarted;
std::atomic_flag videoFramePrefetchThreadShutdownRequested;
std::atomic_flag videoFramePrefetchThreadShutdownComplete;
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
// Workaround for bad performance on Windows targets under MSYS2 with libc++ due to https://github.com/llvm/llvm-project/issues/127221
std::mutex videoFramePrefetchMutex;
std::condition_variable notifyVideoFramePrefetchPending;
std::condition_variable notifyVideoFramePrefetchComplete;
std::condition_variable notifyVideoFramePrefetchThreadStarted;
std::condition_variable notifyVideoFramePrefetchThreadShutdownRequested;
std::condition_variable notifyVideoFramePrefetchThreadShutdownComplete;
#endif
const unsigned char* videoFramePrefetchTarget;
std::vector<unsigned char> videoFramePrefetchBuffer;
} ld;
Expand Down
8 changes: 8 additions & 0 deletions ares/pce/pce.hpp
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
#pragma once
//started: 2017-01-11

// Select the std::mutex / std::condition_variable fallback in place of std::atomic_flag wait/notify.
// It is enabled automatically for Windows targets whose compiler does not define _MSC_VER, as a
// workaround for bad performance under MSYS2 with libc++; see https://github.com/llvm/llvm-project/issues/127221
// The macro is left untouched if the build has already defined it externally (e.g. via compiler flags).
#if !defined(USE_ATOMIC_FLAG_NOTIFY_FALLBACK) && !defined(_MSC_VER) && defined(_WIN32)
#define USE_ATOMIC_FLAG_NOTIFY_FALLBACK
#endif
Comment on lines +4 to +6
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
#if !defined(USE_ATOMIC_FLAG_NOTIFY_FALLBACK) && !defined(_MSC_VER) && defined(_WIN32)
#define USE_ATOMIC_FLAG_NOTIFY_FALLBACK
#endif
#if !defined(USE_ATOMIC_FLAG_NOTIFY_FALLBACK)
// Work around a slow implementation of atomic flag waits with libc++ on Windows; see https://github.com/llvm/llvm-project/issues/127221
#if !defined(_MSC_VER) && defined(_WIN32)
#define USE_ATOMIC_FLAG_NOTIFY_FALLBACK
// Atomic flag waits are also unavailable below macOS 11.0
#elif defined(__APPLE__) && __MAC_OS_X_VERSION_MIN_REQUIRED < 110000
#define USE_ATOMIC_FLAG_NOTIFY_FALLBACK
#endif
#endif


#include <ares/ares.hpp>
#include <nall/decode/mmi.hpp>
#include <vector>
#include <cmath>
#include <thread>
#include <atomic>
#ifdef USE_ATOMIC_FLAG_NOTIFY_FALLBACK
#include <mutex>
#include <condition_variable>
#endif
#include <functional>

#include <qon/qon.h>
Expand Down
Loading