diff --git a/.github/workflows/windows_builds.yml b/.github/workflows/windows_builds.yml
index c7d94b82eaa4..f8776b4c1157 100644
--- a/.github/workflows/windows_builds.yml
+++ b/.github/workflows/windows_builds.yml
@@ -93,6 +93,11 @@ jobs:
             echo "ANGLE_ENABLED=no" >> "$GITHUB_OUTPUT"
           fi
 
+      - name: Download WinRT components
+        shell: sh
+        run: python ./misc/scripts/install_winrt.py
+        continue-on-error: true
+
       - name: Download pre-built AccessKit
         shell: sh
         id: accesskit-sdk
diff --git a/core/os/spin_lock.h b/core/os/spin_lock.h
index 7a5051035373..e8e7833f6f81 100644
--- a/core/os/spin_lock.h
+++ b/core/os/spin_lock.h
@@ -93,7 +93,11 @@ static_assert(std::atomic_bool::is_always_lock_free);
 
 class SpinLock {
 	union {
+#if __cplusplus >= 202002L
+		mutable std::atomic<bool> locked = false;
+#else
 		mutable std::atomic<bool> locked = ATOMIC_VAR_INIT(false);
+#endif
 		char aligner[Thread::CACHE_LINE_BYTES];
 	};
 
diff --git a/misc/scripts/install_winrt.py b/misc/scripts/install_winrt.py
new file mode 100755
index 000000000000..ce69929cb4e9
--- /dev/null
+++ b/misc/scripts/install_winrt.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+
+if __name__ != "__main__":
+    raise SystemExit(f'Utility script "{__file__}" should not be used as a module!')
+
+import os
+import shutil
+import sys
+import urllib.request
+
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../"))
+
+
+# Base Godot dependencies path
+# If cross-compiling (no LOCALAPPDATA), we install in `bin`
+deps_folder = os.getenv("LOCALAPPDATA")
+if deps_folder:
+    deps_folder = os.path.join(deps_folder, "Godot", "build_deps")
+else:
+    deps_folder = os.path.join("bin", "build_deps")
+
+# WinRT
+winrt_version = "72"
+
+# Create dependencies folder. `exist_ok` replaces the separate existence check,
+# so a folder created between the check and the call can no longer raise.
+os.makedirs(deps_folder, exist_ok=True)
+
+winrt_filename = "winrt-headers.zip"
+winrt_archive = os.path.join(deps_folder, winrt_filename)
+winrt_folder = os.path.join(deps_folder, "winrt_mingw")
+
+if os.path.isfile(winrt_archive):
+    os.remove(winrt_archive)
+
+print(f"Downloading WinRT {winrt_filename} ...")
+urllib.request.urlretrieve(
+    f"https://github.com/bruvzg/winrt_mingw/releases/download/{winrt_version}/{winrt_filename}",
+    winrt_archive,
+)
+if os.path.exists(winrt_folder):
+    print(f"Removing existing local WinRT installation in {winrt_folder} ...")
+    shutil.rmtree(winrt_folder)
+print(f"Extracting WinRT {winrt_filename} to {winrt_folder} ...")
+shutil.unpack_archive(winrt_archive, winrt_folder)
+os.remove(winrt_archive)
+
+print("WinRT installed successfully.\n")
diff --git a/platform/windows/SCsub b/platform/windows/SCsub
index 8c24ecf71aac..cd9e9d52831a 100644
--- a/platform/windows/SCsub
+++ b/platform/windows/SCsub
@@ -16,7 +16,6 @@ common_win = [
     "os_windows.cpp",
     "display_server_windows.cpp",
     "key_mapping_windows.cpp",
-    "tts_windows.cpp",
     "windows_terminal_logger.cpp",
     "windows_utils.cpp",
     "native_menu_windows.cpp",
@@ -81,6 +80,30 @@ res_obj = env.RES(res_target, res_file)
 env.Depends(res_obj, "#core/version_generated.gen.h")
 
 env.add_source_files(sources, common_win)
+
+env_winrt = env.Clone()  # Cloned so the flag changes below only apply to the TTS sources added at the end.
+if not env_winrt.msvc:
+    if "-std=gnu++17" in env_winrt["CXXFLAGS"]:
+        env_winrt["CXXFLAGS"].remove("-std=gnu++17")
+    env_winrt.Append(CXXFLAGS=["-std=gnu++20"])  # Swap the language standard to C++20 for these files.
+    if "-fno-exceptions" in env_winrt["CXXFLAGS"]:
+        env_winrt["CXXFLAGS"].remove("-fno-exceptions")
+    env_winrt.Append(CXXFLAGS=["-fexceptions"])  # Exceptions are turned on for these files only.
+else:
+    if "/std:c++17" in env_winrt["CXXFLAGS"]:
+        env_winrt["CXXFLAGS"].remove("/std:c++17")
+    env_winrt.Append(CXXFLAGS=["/std:c++20"])
+    if "_HAS_EXCEPTIONS" in env_winrt["CPPDEFINES"]:  # NOTE(review): only matches a bare string entry, not a (name, value) tuple - confirm how it is defined.
+        env_winrt["CPPDEFINES"].remove("_HAS_EXCEPTIONS")
+    env_winrt.Append(CXXFLAGS=["/EHsc"])
+tts_sources = ["tts_windows.cpp", "tts_driver_sapi.cpp"]  # Always built; the OneCore driver is added conditionally below.
+if env_winrt["winrt_path"] != "" or env_winrt.msvc:
+    if not env_winrt.msvc:
+        env_winrt.Append(CPPPATH=[env["winrt_path"]])  # Non-MSVC builds take the WinRT headers from `winrt_path`.
+    env_winrt.AppendUnique(CPPDEFINES=["WINRT_ENABLED"])
+    tts_sources += ["tts_driver_onecore.cpp"]
+env_winrt.add_source_files(sources, tts_sources)
+
 sources += res_obj
 
 if env["accesskit"] and not env.msvc:
diff --git a/platform/windows/detect.py b/platform/windows/detect.py
index e49bb29067c7..6453c60bb365 100644
--- a/platform/windows/detect.py
+++ b/platform/windows/detect.py
@@ -200,11 +200,18 @@ def get_opts():
             "Path to the AccessKit C SDK",
             os.path.join(deps_folder, "accesskit"),
         ),
+        # OpenGL over Direct3D 11.
         (
             "angle_libs",
             "Path to the ANGLE static libraries",
             os.path.join(deps_folder, "angle"),
         ),
+        # WinRT.
+        (
+            "winrt_path",
+            "Path to the WinRT headers",
+            os.path.join(deps_folder, "winrt_mingw"),
+        ),
         # Direct3D 12 support.
         (
             "mesa_libs",
@@ -418,6 +425,7 @@ def spawn_capture(sh, escape, cmd, args, env):
         "wbemuuid",
         "ntdll",
         "hid",
+        "mincore",
     ]
 
     if env.debug_features:
@@ -811,6 +819,7 @@ def configure_mingw(env: "SConsEnvironment"):
             "wbemuuid",
             "ntdll",
             "hid",
+            "mincore",
         ]
     )
 
diff --git a/platform/windows/tts_driver.h b/platform/windows/tts_driver.h
new file mode 100644
index 000000000000..3cef44619683
--- /dev/null
+++ b/platform/windows/tts_driver.h
@@ -0,0 +1,53 @@
+/**************************************************************************/
+/*  tts_driver.h                                                          */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             GODOT ENGINE                               */
+/*                        https://godotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#pragma once
+
+#include "core/object/object.h"
+
+class TTSDriver : public Object { // Abstract text-to-speech backend interface; every operation is pure virtual.
+	GDSOFTCLASS(TTSDriver, Object);
+
+public:
+	virtual bool is_speaking() const = 0; // Whether the backend is currently speaking an utterance.
+	virtual bool is_paused() const = 0; // Whether speech is currently paused.
+	virtual Array get_voices() const = 0; // Available voices; TTSDriverOneCore and TTSDriverSAPI return Dictionaries with "id", "name" and "language".
+
+	virtual void speak(const String &p_text, const String &p_voice, int p_volume = 50, float p_pitch = 1.f, float p_rate = 1.f, int64_t p_utterance_id = 0, bool p_interrupt = false) = 0; // Requests speech of p_text; p_utterance_id identifies it in posted utterance events.
+	virtual void pause() = 0; // Pauses speech.
+	virtual void resume() = 0; // Resumes paused speech.
+	virtual void stop() = 0; // Stops speech.
+
+	virtual void process_events() = 0; // Lets the backend advance its pending work.
+
+	virtual bool init() = 0; // Returns false when the backend cannot be initialized.
+
+	virtual ~TTSDriver() {}
+};
diff --git a/platform/windows/tts_driver_onecore.cpp b/platform/windows/tts_driver_onecore.cpp
new file mode 100644
index 000000000000..a456af67ddb8
--- /dev/null
+++ b/platform/windows/tts_driver_onecore.cpp
@@ -0,0 +1,267 @@
+/**************************************************************************/
+/*  tts_driver_onecore.cpp                                                */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             GODOT ENGINE                               */
+/*                        https://godotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#include "tts_driver_onecore.h"
+
+#include "core/object/callable_mp.h"
+#include "servers/display/display_server.h"
+
+TTSDriverOneCore *TTSDriverOneCore::singleton = nullptr;
+
+void TTSDriverOneCore::_speech_index_mark(int p_msg_id, int p_index_mark) { // Posts TTS_UTTERANCE_BOUNDARY for utterance p_msg_id at position p_index_mark.
+	DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServerEnums::TTS_UTTERANCE_BOUNDARY, p_msg_id, p_index_mark);
+}
+
+void TTSDriverOneCore::_speech_cancel(int p_msg_id) { // Posts TTS_UTTERANCE_CANCELED for utterance p_msg_id.
+	DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServerEnums::TTS_UTTERANCE_CANCELED, p_msg_id);
+}
+
+void TTSDriverOneCore::_speech_end(int p_msg_id) { // Posts TTS_UTTERANCE_ENDED for utterance p_msg_id.
+	DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServerEnums::TTS_UTTERANCE_ENDED, p_msg_id);
+}
+
+void TTSDriverOneCore::_dispose_current(bool p_silent, bool p_canceled) { // Tears down the active player (if any), resets playback state and, unless p_silent, defers a canceled/ended event.
+	if (media.get() != nullptr) { // Nothing to do when no player exists.
+		for (const TrackData &T : tracks) {
+			T.track.CueEntered(T.token); // Called with the stored token: unregisters the CueEntered handler added in process_events().
+		}
+		tracks.clear();
+		media->MediaFailed(singleton->token_f); // Unregister the MediaFailed handler.
+		media->MediaEnded(singleton->token_e); // Unregister the MediaEnded handler.
+		if (!ApiInformation::IsApiContractPresent(L"Windows.Foundation.UniversalApiContract", 4)) { // Mirrors the branch in process_events() that registers the marker handler.
+			media->PlaybackMediaMarkerReached(singleton->token_s);
+		}
+		media->Close();
+		media.reset();
+
+		if (!p_silent) {
+			if (p_canceled) {
+				callable_mp(this, &TTSDriverOneCore::_speech_cancel).call_deferred(id); // NOTE(review): id is int64_t but the callback takes int - confirm ids fit.
+			} else {
+				callable_mp(this, &TTSDriverOneCore::_speech_end).call_deferred(id);
+			}
+		}
+		id = -1; // Back to the "no utterance" defaults declared in the header.
+		string = Char16String();
+		playing = false;
+		paused = false;
+		offset = 0;
+	}
+}
+
+void TTSDriverOneCore::process_events() { // Starts the next queued utterance when one was requested and nothing is playing or paused.
+	if (update_requested && !paused && queue.size() > 0 && !is_speaking()) {
+		TTSUtterance &message = queue.front()->get(); // Stays in the queue until the pop_front() at the end.
+		_dispose_current(true); // Silent: drop any leftover player without posting an event.
+		playing = true;
+
+		SpeechSynthesizer synth = SpeechSynthesizer();
+
+		if (ApiInformation::IsApiContractPresent(L"Windows.Foundation.UniversalApiContract", 4)) { // Option only used when contract version 4 is present.
+			synth.Options().IncludeWordBoundaryMetadata(true);
+		}
+		if (ApiInformation::IsApiContractPresent(L"Windows.Foundation.UniversalApiContract", 5)) { // Rate, pitch and volume are only set when contract version 5 is present.
+			synth.Options().SpeakingRate(CLAMP(message.rate, 0.5, 6.0));
+			synth.Options().AudioPitch(CLAMP(message.pitch, 0.0, 2.0));
+			synth.Options().AudioVolume(CLAMP((double)message.volume / 100.0, 0.0, 1.0)); // Volume arrives as 0-100 and is passed as 0.0-1.0.
+		}
+
+		winrt::hstring name = winrt::hstring((const wchar_t *)message.voice.utf16().get_data());
+		IVectorView<VoiceInformation> voices = SpeechSynthesizer::AllVoices();
+		for (uint32_t i = 0; i < voices.Size(); i++) { // Select the voice whose Id matches; the synthesizer's current voice is kept if none does.
+			VoiceInformation voice = voices.GetAt(i);
+			if (voice.Id() == name) {
+				synth.Voice(voice);
+				break;
+			}
+		}
+
+		string = message.text.utf16(); // Kept as a member: the boundary handlers below index into it.
+		winrt::hstring text = winrt::hstring((const wchar_t *)string.get_data());
+
+		SpeechSynthesisStream stream = synth.SynthesizeTextToStreamAsync(text).get(); // get() waits for the asynchronous synthesis to complete.
+
+		media = std::make_shared<MediaPlayer>();
+		token_f = media->MediaFailed([=, this](const MediaPlayer &p_sender, const MediaPlayerFailedEventArgs &p_args) { // Token kept so _dispose_current() can unregister.
+			_dispose_current(false, true); // Playback failure is reported as a canceled utterance.
+		});
+		token_e = media->MediaEnded([=, this](const MediaPlayer &p_sender, const IInspectable &p_args) {
+			_dispose_current(false, false); // Normal end of playback is reported as an ended utterance.
+		});
+		if (ApiInformation::IsApiContractPresent(L"Windows.Foundation.UniversalApiContract", 4)) { // Word boundaries via speech cues on timed metadata tracks.
+			MediaPlaybackItem mitem = MediaPlaybackItem(MediaSource::CreateFromStream(stream, stream.ContentType()));
+			media->Source(mitem);
+			MediaPlaybackTimedMetadataTrackList list = mitem.TimedMetadataTracks();
+
+			for (uint32_t i = 0; i < list.Size(); i++) {
+				TimedMetadataTrack track = list.GetAt(i);
+				if (track.TimedMetadataKind() == TimedMetadataKind::Speech) {
+					winrt::event_token token = track.CueEntered([=, this](const TimedMetadataTrack &p_sender, const MediaCueEventArgs &p_args) {
+						SpeechCue sq;
+						p_args.Cue().as(sq);
+						int32_t pos16 = sq.StartPositionInInput().Value(); // Offset into the input text, in UTF-16 code units.
+						int pos = 0;
+						for (int j = 0; j < MIN(pos16, string.length()); j++) { // Convert the UTF-16 offset into a code point count.
+							char16_t c = string[j];
+							if ((c & 0xfffffc00) == 0xd800) { // High surrogate: skip its low surrogate so the pair counts once.
+								j++;
+							}
+							pos++;
+						}
+						callable_mp(singleton, &TTSDriverOneCore::_speech_index_mark).call_deferred(id, pos);
+					});
+					tracks.push_back({ track, token }); // Remembered so the handler can be unregistered later.
+					list.SetPresentationMode(i, TimedMetadataTrackPresentationMode::ApplicationPresented);
+				}
+			}
+		} else { // Contract version 4 absent: word boundaries via playback media markers.
+			media->Source(MediaSource::CreateFromStream(stream, stream.ContentType()));
+			token_s = media->PlaybackMediaMarkerReached([=, this](const MediaPlayer &p_sender, const PlaybackMediaMarkerReachedEventArgs &p_args) {
+				offset += p_args.PlaybackMediaMarker().Text().size() + 1; // NOTE(review): +1 presumably accounts for the separator after each marker text - confirm.
+				int pos = 0;
+				for (int j = 0; j < MIN(offset, string.length()); j++) { // Same UTF-16 offset to code point conversion as the cue handler.
+					char16_t c = string[j];
+					if ((c & 0xfffffc00) == 0xd800) {
+						j++;
+					}
+					pos++;
+				}
+				callable_mp(singleton, &TTSDriverOneCore::_speech_index_mark).call_deferred(id, pos);
+			});
+		}
+		media->AutoPlay(true);
+
+		id = message.id; // Read by the handlers above and by _dispose_current().
+		update_requested = false;
+		paused = false;
+
+		media->Play();
+
+		DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServerEnums::TTS_UTTERANCE_STARTED, message.id);
+		queue.pop_front(); // `message` must not be used past this point.
+	}
+}
+
+bool TTSDriverOneCore::is_speaking() const { // Reports the `playing` flag set in process_events() and cleared in _dispose_current().
+	return playing;
+}
+
+bool TTSDriverOneCore::is_paused() const { // Reports the `paused` flag toggled by pause() and resume().
+	return paused;
+}
+
+Array TTSDriverOneCore::get_voices() const { // Returns one Dictionary ("id", "name", "language") per voice reported by SpeechSynthesizer::AllVoices().
+	Array list;
+
+	IVectorView<VoiceInformation> voices = SpeechSynthesizer::AllVoices();
+	for (uint32_t i = 0; i < voices.Size(); i++) {
+		VoiceInformation voice = voices.GetAt(i);
+		winrt::hstring vname = voice.DisplayName();
+		winrt::hstring vid = voice.Id();
+		winrt::hstring vlang = voice.Language();
+
+		Dictionary voice_d;
+		voice_d["id"] = String::utf16((const char16_t *)vid.c_str(), vid.size()); // Explicit lengths are passed for each UTF-16 conversion.
+		voice_d["name"] = String::utf16((const char16_t *)vname.c_str(), vname.size());
+		voice_d["language"] = String::utf16((const char16_t *)vlang.c_str(), vlang.size()); // Language string exactly as reported by the voice.
+		list.push_back(voice_d);
+	}
+	return list;
+}
+
+void TTSDriverOneCore::speak(const String &p_text, const String &p_voice, int p_volume, float p_pitch, float p_rate, int64_t p_utterance_id, bool p_interrupt) { // Queues an utterance; playback itself starts in process_events().
+	if (p_interrupt) {
+		stop(); // Cancels the current and all queued utterances first.
+	}
+
+	if (p_text.is_empty()) { // Empty text is never queued; it is reported as canceled right away.
+		DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServerEnums::TTS_UTTERANCE_CANCELED, p_utterance_id);
+		return;
+	}
+
+	TTSUtterance message;
+	message.text = p_text;
+	message.voice = p_voice;
+	message.volume = CLAMP(p_volume, 0, 100);
+	message.pitch = CLAMP(p_pitch, 0.f, 2.f);
+	message.rate = CLAMP(p_rate, 0.1f, 10.f); // process_events() clamps again to 0.5-6.0 before applying the rate.
+	message.id = p_utterance_id;
+	queue.push_back(message);
+
+	if (is_paused()) {
+		resume(); // NOTE(review): this path does not set update_requested - confirm the queued utterance is still picked up.
+	} else {
+		update_requested = true;
+	}
+}
+
+void TTSDriverOneCore::pause() { // Pauses the active player; does nothing when idle or already paused.
+	if (!paused && playing) { // NOTE(review): assumes media is non-null whenever playing is true - confirm against the handlers that call _dispose_current().
+		media->Pause();
+		paused = true;
+	}
+}
+
+void TTSDriverOneCore::resume() { // Resumes a paused player; does nothing otherwise.
+	if (paused && playing) { // NOTE(review): assumes media is non-null whenever playing is true - confirm.
+		media->Play();
+		paused = false;
+	}
+}
+
+void TTSDriverOneCore::stop() { // Cancels every queued utterance and the one currently playing.
+	for (TTSUtterance &message : queue) {
+		DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServerEnums::TTS_UTTERANCE_CANCELED, message.id); // Queued items are reported immediately.
+	}
+	queue.clear();
+	_dispose_current(false, true); // The active utterance (if any) is reported through a deferred cancel.
+}
+
+bool TTSDriverOneCore::init() { // Returns true only when the universal API contract is present and at least one voice exists.
+	if (!ApiInformation::IsApiContractPresent(L"Windows.Foundation.UniversalApiContract", 1)) {
+		print_verbose("Text-to-Speech: Cannot initialize OneCore driver, API contract not present!");
+		return false;
+	}
+	if (SpeechSynthesizer::AllVoices().Size() == 0) { // A voice list that is empty makes the driver unusable.
+		print_verbose("Text-to-Speech: Cannot initialize OneCore driver, no voices found!");
+		return false;
+	}
+	print_verbose("Text-to-Speech: OneCore initialized.");
+	return true;
+}
+
+TTSDriverOneCore::TTSDriverOneCore() { // Registers this instance as the singleton used by the event handlers.
+	singleton = this;
+}
+
+TTSDriverOneCore::~TTSDriverOneCore() { // Disposes the active player and clears the singleton.
+	_dispose_current(false, true); // NOTE(review): this defers a cancel callback bound to `this` while it is being destroyed - confirm it is still delivered.
+	singleton = nullptr;
+}
diff --git a/platform/windows/tts_driver_onecore.h b/platform/windows/tts_driver_onecore.h
new file mode 100644
index 000000000000..6cd8f181e231
--- /dev/null
+++ b/platform/windows/tts_driver_onecore.h
@@ -0,0 +1,107 @@
+/**************************************************************************/
+/*  tts_driver_onecore.h                                                  */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             GODOT ENGINE                               */
+/*                        https://godotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#pragma once
+
+#include "tts_driver.h"
+
+GODOT_GCC_WARNING_PUSH
+GODOT_GCC_WARNING_IGNORE("-Wnon-virtual-dtor")
+GODOT_GCC_WARNING_IGNORE("-Wctor-dtor-privacy")
+GODOT_GCC_WARNING_IGNORE("-Wshadow")
+GODOT_GCC_WARNING_IGNORE("-Wstrict-aliasing")
+GODOT_CLANG_WARNING_PUSH
+GODOT_CLANG_WARNING_IGNORE("-Wnon-virtual-dtor")
+
+#include <winrt/Windows.Foundation.Collections.h>
+#include <winrt/Windows.Foundation.Metadata.h>
+#include <winrt/Windows.Media.Core.h>
+#include <winrt/Windows.Media.Playback.h>
+#include <winrt/Windows.Media.SpeechSynthesis.h>
+#include <winrt/Windows.Storage.Streams.h>
+
+GODOT_GCC_WARNING_POP
+GODOT_CLANG_WARNING_POP
+
+using namespace winrt::Windows::Foundation;
+using namespace winrt::Windows::Foundation::Collections;
+using namespace winrt::Windows::Foundation::Metadata;
+using namespace winrt::Windows::Media::Core;
+using namespace winrt::Windows::Media::Playback;
+using namespace winrt::Windows::Media::SpeechSynthesis;
+using namespace winrt::Windows::Storage::Streams;
+
+struct TTSUtterance;
+
+class TTSDriverOneCore : public TTSDriver { // TTSDriver implementation built on the WinRT SpeechSynthesizer and MediaPlayer classes.
+	List<TTSUtterance> queue; // Utterances waiting to be started by process_events().
+
+	bool playing = false; // True from the start of an utterance until _dispose_current() runs.
+	bool paused = false;
+	bool update_requested = false; // Set by speak(); consumed by process_events().
+
+	int64_t id = -1; // Id of the utterance being played; -1 when idle.
+	Char16String string; // UTF-16 text of the utterance being played, used to map boundary offsets.
+	std::shared_ptr<MediaPlayer> media; // Player for the current utterance; empty when idle.
+	struct TrackData { // A speech metadata track together with its CueEntered registration token.
+		TimedMetadataTrack track;
+		winrt::event_token token{};
+	};
+	Vector<TrackData> tracks;
+	winrt::event_token token_s{}; // PlaybackMediaMarkerReached registration.
+	winrt::event_token token_f{}; // MediaFailed registration.
+	winrt::event_token token_e{}; // MediaEnded registration.
+	int64_t offset = 0; // Running text offset accumulated by the marker handler.
+
+	void _dispose_current(bool p_silent = false, bool p_canceled = false);
+
+	void _speech_cancel(int p_msg_id); // NOTE(review): takes int while `id` is int64_t; larger ids would be narrowed - confirm.
+	void _speech_end(int p_msg_id);
+	void _speech_index_mark(int p_msg_id, int p_index_mark);
+
+	static TTSDriverOneCore *singleton; // Set in the constructor and cleared in the destructor.
+
+public:
+	virtual bool is_speaking() const override;
+	virtual bool is_paused() const override;
+	virtual Array get_voices() const override;
+
+	virtual void speak(const String &p_text, const String &p_voice, int p_volume = 50, float p_pitch = 1.f, float p_rate = 1.f, int64_t p_utterance_id = 0, bool p_interrupt = false) override;
+	virtual void pause() override;
+	virtual void resume() override;
+	virtual void stop() override;
+
+	virtual void process_events() override;
+
+	virtual bool init() override;
+
+	TTSDriverOneCore();
+	~TTSDriverOneCore();
+};
diff --git a/platform/windows/tts_driver_sapi.cpp b/platform/windows/tts_driver_sapi.cpp
new file mode 100644
index 000000000000..8e69d5b6865b
--- /dev/null
+++ b/platform/windows/tts_driver_sapi.cpp
@@ -0,0 +1,274 @@
+/**************************************************************************/
+/*  tts_driver_sapi.cpp                                                   */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             GODOT ENGINE                               */
+/*                        https://godotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#include "tts_driver_sapi.h"
+
+#include "core/object/callable_mp.h"
+#include "servers/display/display_server.h"
+
+TTSDriverSAPI *TTSDriverSAPI::singleton = nullptr;
+
+void __stdcall TTSDriverSAPI::speech_event_callback(WPARAM wParam, LPARAM lParam) { // Drains pending SAPI events and forwards them as utterance events; both parameters are unused.
+	SPEVENT event;
+	while (singleton->synth->GetEvents(1, &event, nullptr) == S_OK) { // One event per iteration until none is returned.
+		uint32_t stream_num = (uint32_t)event.ulStreamNum;
+		if (singleton->ids.has(stream_num)) { // Events for streams that are not tracked (e.g. already canceled) are ignored.
+			if (event.eEventId == SPEI_START_INPUT_STREAM) {
+				DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServerEnums::TTS_UTTERANCE_STARTED, singleton->ids[stream_num].id);
+			} else if (event.eEventId == SPEI_END_INPUT_STREAM) {
+				DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServerEnums::TTS_UTTERANCE_ENDED, singleton->ids[stream_num].id);
+				singleton->ids.erase(stream_num);
+				singleton->update_requested = true; // Lets process_events() start the next queued utterance.
+			} else if (event.eEventId == SPEI_WORD_BOUNDARY) {
+				const Char16String &string = singleton->ids[stream_num].string;
+				int pos = 0;
+				for (int i = 0; i < MIN(event.lParam, string.length()); i++) { // Convert the UTF-16 offset in lParam into a code point count.
+					char16_t c = string[i];
+					if ((c & 0xfffffc00) == 0xd800) { // High surrogate: skip its low surrogate so the pair counts once.
+						i++;
+					}
+					pos++;
+				}
+				DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServerEnums::TTS_UTTERANCE_BOUNDARY, singleton->ids[stream_num].id, pos - singleton->ids[stream_num].offset); // Offset removes the injected <pitch> tag length.
+			}
+		}
+	}
+}
+
+// Starts the next queued utterance, if one was requested and the synthesizer is neither paused nor speaking.
+void TTSDriverSAPI::process_events() {
+	if (update_requested && !paused && queue.size() > 0 && !is_speaking()) {
+		TTSUtterance &message = queue.front()->get();
+
+		DWORD flags = SPF_ASYNC | SPF_PURGEBEFORESPEAK | SPF_IS_XML;
+		String pitch_tag = String("<pitch absmiddle=\"") + String::num_int64(message.pitch * 10 - 10, 10) + String("\">");
+		String text = pitch_tag + message.text + String("</pitch>");
+
+		IEnumSpObjectTokens *cpEnum;
+		ISpObjectToken *cpVoiceToken;
+		ULONG ulCount = 0;
+		ULONG stream_number = 0;
+		ISpObjectTokenCategory *cpCategory;
+		HRESULT hr = CoCreateInstance(CLSID_SpObjectTokenCategory, nullptr, CLSCTX_INPROC_SERVER, IID_ISpObjectTokenCategory, (void **)&cpCategory);
+		if (SUCCEEDED(hr)) {
+			hr = cpCategory->SetId(SPCAT_VOICES, false);
+			if (SUCCEEDED(hr)) {
+				hr = cpCategory->EnumTokens(nullptr, nullptr, &cpEnum);
+				if (SUCCEEDED(hr)) {
+					hr = cpEnum->GetCount(&ulCount);
+					// Only touch cpVoiceToken when Next() really returned a token; any result other than S_OK leaves it unset.
+					while (SUCCEEDED(hr) && ulCount-- && cpEnum->Next(1, &cpVoiceToken, nullptr) == S_OK) {
+						wchar_t *w_id = nullptr;
+						const bool voice_matches = SUCCEEDED(cpVoiceToken->GetId(&w_id)) && String::utf16((const char16_t *)w_id) == message.voice;
+						CoTaskMemFree(w_id); // GetId() allocates the string for the caller; freeing nullptr is a no-op.
+						if (voice_matches) {
+							synth->SetVoice(cpVoiceToken);
+							ulCount = 0; // Voice found: end the enumeration after releasing this token.
+						}
+						cpVoiceToken->Release();
+					}
+					cpEnum->Release();
+				}
+			}
+			cpCategory->Release();
+		}
+
+		UTData ut;
+		ut.string = text.utf16();
+		ut.offset = pitch_tag.length(); // Subtract injected <pitch> tag offset.
+		ut.id = message.id;
+
+		synth->SetVolume(message.volume);
+		synth->SetRate(10.f * std::log10(message.rate) / std::log10(3.f));
+		synth->Speak((LPCWSTR)ut.string.get_data(), flags, &stream_number);
+
+		ids[(uint32_t)stream_number] = ut;
+
+		queue.pop_front();
+
+		update_requested = false;
+	}
+}
+
+bool TTSDriverSAPI::is_speaking() const { // True while the synthesizer reports that it is speaking or waiting to speak.
+	ERR_FAIL_NULL_V(synth, false);
+
+	SPVOICESTATUS status;
+	synth->GetStatus(&status, nullptr); // NOTE(review): the HRESULT is not checked; `status` would be uninitialized on failure - confirm.
+	return (status.dwRunningState == SPRS_IS_SPEAKING || status.dwRunningState == 0 /* Waiting To Speak */);
+}
+
+bool TTSDriverSAPI::is_paused() const { // Reports the `paused` flag maintained by pause(), resume() and stop().
+	ERR_FAIL_NULL_V(synth, false);
+	return paused;
+}
+
+// Enumerates the SAPI voice tokens and returns one Dictionary ("id", "name", "language") per usable voice.
+Array TTSDriverSAPI::get_voices() const {
+	Array list;
+	IEnumSpObjectTokens *cpEnum;
+	ISpObjectToken *cpVoiceToken;
+	ISpDataKey *cpDataKeyAttribs;
+	ULONG ulCount = 0;
+	ISpObjectTokenCategory *cpCategory;
+	HRESULT hr = CoCreateInstance(CLSID_SpObjectTokenCategory, nullptr, CLSCTX_INPROC_SERVER, IID_ISpObjectTokenCategory, (void **)&cpCategory);
+	if (SUCCEEDED(hr)) {
+		hr = cpCategory->SetId(SPCAT_VOICES, false);
+		if (SUCCEEDED(hr)) {
+			hr = cpCategory->EnumTokens(nullptr, nullptr, &cpEnum);
+			if (SUCCEEDED(hr)) {
+				hr = cpEnum->GetCount(&ulCount);
+				while (SUCCEEDED(hr) && ulCount-- && cpEnum->Next(1, &cpVoiceToken, nullptr) == S_OK) { // Any result other than S_OK leaves cpVoiceToken unset.
+					HRESULT hr_attr = cpVoiceToken->OpenKey(SPTOKENKEY_ATTRIBUTES, &cpDataKeyAttribs);
+					if (SUCCEEDED(hr_attr)) {
+						wchar_t *w_id = nullptr;
+						wchar_t *w_lang = nullptr;
+						wchar_t *w_name = nullptr;
+						cpVoiceToken->GetId(&w_id);
+						cpDataKeyAttribs->GetStringValue(L"Language", &w_lang);
+						cpDataKeyAttribs->GetStringValue(nullptr, &w_name);
+						if (w_id && w_lang) { // Voices without an id or a Language attribute are skipped instead of dereferencing nullptr.
+							LCID locale = wcstol(w_lang, nullptr, 16);
+							// Both ISO names are documented as at most nine characters including the terminator; zero-init keeps them terminated on failure.
+							constexpr int ISO_CODE_CHARS = 9;
+							wchar_t w_lang_code[ISO_CODE_CHARS] = {};
+							wchar_t w_reg_code[ISO_CODE_CHARS] = {};
+							GetLocaleInfoW(locale, LOCALE_SISO639LANGNAME, w_lang_code, ISO_CODE_CHARS);
+							GetLocaleInfoW(locale, LOCALE_SISO3166CTRYNAME, w_reg_code, ISO_CODE_CHARS);
+							Dictionary voice_d;
+							voice_d["id"] = String::utf16((const char16_t *)w_id);
+							if (w_name) {
+								voice_d["name"] = String::utf16((const char16_t *)w_name);
+							} else {
+								voice_d["name"] = voice_d["id"].operator String().replace("HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Speech\\Voices\\Tokens\\", "");
+							}
+							voice_d["language"] = String::utf16((const char16_t *)w_lang_code) + "_" + String::utf16((const char16_t *)w_reg_code);
+							list.push_back(voice_d);
+						}
+						// GetId() and GetStringValue() allocate for the caller; freeing nullptr is a no-op.
+						CoTaskMemFree(w_id);
+						CoTaskMemFree(w_lang);
+						CoTaskMemFree(w_name);
+						cpDataKeyAttribs->Release();
+					}
+					cpVoiceToken->Release();
+				}
+				cpEnum->Release();
+			}
+		}
+		cpCategory->Release();
+	}
+	return list;
+}
+
+void TTSDriverSAPI::speak(const String &p_text, const String &p_voice, int p_volume, float p_pitch, float p_rate, int64_t p_utterance_id, bool p_interrupt) { // Queues an utterance; it is handed to SAPI in process_events().
+	ERR_FAIL_NULL(synth);
+	if (p_interrupt) {
+		stop(); // Cancels the current and all queued utterances first.
+	}
+
+	if (p_text.is_empty()) { // Empty text is never queued; it is reported as canceled right away.
+		DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServerEnums::TTS_UTTERANCE_CANCELED, p_utterance_id);
+		return;
+	}
+
+	TTSUtterance message;
+	message.text = p_text;
+	message.voice = p_voice;
+	message.volume = CLAMP(p_volume, 0, 100);
+	message.pitch = CLAMP(p_pitch, 0.f, 2.f);
+	message.rate = CLAMP(p_rate, 0.1f, 10.f);
+	message.id = p_utterance_id;
+	queue.push_back(message);
+
+	if (is_paused()) {
+		resume(); // NOTE(review): this path does not set update_requested - confirm the queued utterance is still picked up.
+	} else {
+		update_requested = true;
+	}
+}
+
+void TTSDriverSAPI::pause() { // Pauses the synthesizer unless it is already paused.
+	ERR_FAIL_NULL(synth);
+	if (!paused) {
+		if (synth->Pause() == S_OK) { // The flag only changes when SAPI accepted the pause.
+			paused = true;
+		}
+	}
+}
+
+void TTSDriverSAPI::resume() { // Resumes the synthesizer and clears the paused flag unconditionally.
+	ERR_FAIL_NULL(synth);
+	synth->Resume();
+	paused = false;
+}
+
+void TTSDriverSAPI::stop() { // Cancels the utterance being spoken and every queued one, then purges the synthesizer.
+	ERR_FAIL_NULL(synth);
+
+	SPVOICESTATUS status;
+	synth->GetStatus(&status, nullptr); // NOTE(review): the HRESULT is not checked - confirm `status` is always filled in.
+	uint32_t current_stream = (uint32_t)status.ulCurrentStream;
+	if (ids.has(current_stream)) { // Report the in-flight utterance as canceled and stop tracking it.
+		DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServerEnums::TTS_UTTERANCE_CANCELED, ids[current_stream].id);
+		ids.erase(current_stream);
+	}
+	for (TTSUtterance &message : queue) {
+		DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServerEnums::TTS_UTTERANCE_CANCELED, message.id);
+	}
+	queue.clear();
+	synth->Speak(nullptr, SPF_PURGEBEFORESPEAK, nullptr); // Speaking nothing with the purge flag discards pending speech.
+	synth->Resume();
+	paused = false;
+}
+
+bool TTSDriverSAPI::init() { // Creates the SAPI voice object and subscribes to its events; returns false when creation fails.
+	if (SUCCEEDED(CoCreateInstance(CLSID_SpVoice, nullptr, CLSCTX_ALL, IID_ISpVoice, (void **)&synth))) {
+		ULONGLONG event_mask = SPFEI(SPEI_END_INPUT_STREAM) | SPFEI(SPEI_START_INPUT_STREAM) | SPFEI(SPEI_WORD_BOUNDARY); // The three event kinds handled by speech_event_callback().
+		synth->SetInterest(event_mask, event_mask);
+		synth->SetNotifyCallbackFunction(&speech_event_callback, (WPARAM)(this), 0); // `this` is passed along, but the callback reads `singleton` instead.
+		print_verbose("Text-to-Speech: SAPI initialized.");
+		return true;
+	} else {
+		print_verbose("Text-to-Speech: Cannot initialize SAPI driver!");
+		return false;
+	}
+}
+
+TTSDriverSAPI::TTSDriverSAPI() { // Registers this instance as the singleton used by speech_event_callback().
+	singleton = this;
+}
+
+TTSDriverSAPI::~TTSDriverSAPI() { // Releases the SAPI voice object (if created) and clears the singleton.
+	if (synth) {
+		synth->Release();
+	}
+	singleton = nullptr;
+}
diff --git a/platform/windows/tts_driver_sapi.h b/platform/windows/tts_driver_sapi.h
new file mode 100644
index 000000000000..e40495e345d0
--- /dev/null
+++ b/platform/windows/tts_driver_sapi.h
@@ -0,0 +1,77 @@
+/**************************************************************************/
+/*  tts_driver_sapi.h                                                     */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             GODOT ENGINE                               */
+/*                        https://godotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#pragma once
+
+#include "tts_driver.h"
+
+#include <windows.h>
+
+#include <objbase.h>
+#include <sapi.h>
+#include <winnls.h>
+
+#include <cwchar>
+
+struct TTSUtterance;
+
+class TTSDriverSAPI : public TTSDriver { // TTS driver implemented on top of the SAPI `ISpVoice` COM interface.
+	List<TTSUtterance> queue; // Utterances pushed by speak(); cleared by stop().
+	ISpVoice *synth = nullptr; // SAPI voice; created in init(), released in the destructor.
+	bool paused = false; // Set by pause() on success; cleared by resume() and stop().
+	struct UTData { // Per-stream bookkeeping stored in `ids`.
+		Char16String string; // NOTE(review): presumably the UTF-16 text handed to SAPI, as in the removed TTS_Windows code — confirm.
+		int offset; // NOTE(review): presumably the length of an injected markup prefix, as in the removed TTS_Windows code — confirm.
+		int64_t id; // Utterance ID reported in TTS utterance events.
+	};
+	HashMap<uint32_t, UTData> ids; // Keyed by SAPI stream number (see stop(), which looks up `ulCurrentStream`).
+	bool update_requested = false; // Set by speak() when the driver is not paused.
+
+	static void __stdcall speech_event_callback(WPARAM wParam, LPARAM lParam); // Registered with SAPI via SetNotifyCallbackFunction() in init().
+
+	static TTSDriverSAPI *singleton; // Set in the constructor, reset to nullptr in the destructor.
+
+public:
+	virtual bool is_speaking() const override;
+	virtual bool is_paused() const override;
+	virtual Array get_voices() const override;
+
+	virtual void speak(const String &p_text, const String &p_voice, int p_volume = 50, float p_pitch = 1.f, float p_rate = 1.f, int64_t p_utterance_id = 0, bool p_interrupt = false) override;
+	virtual void pause() override;
+	virtual void resume() override;
+	virtual void stop() override; // Posts TTS_UTTERANCE_CANCELED for the current and queued utterances.
+
+	virtual void process_events() override;
+
+	virtual bool init() override; // Returns false when the SAPI voice cannot be created.
+
+	TTSDriverSAPI();
+	~TTSDriverSAPI();
+};
diff --git a/platform/windows/tts_windows.cpp b/platform/windows/tts_windows.cpp
index 15364a248e1b..6273483ff60d 100644
--- a/platform/windows/tts_windows.cpp
+++ b/platform/windows/tts_windows.cpp
@@ -30,245 +30,92 @@
 
 #include "tts_windows.h"
 
-#include "servers/display/display_server.h"
+#include "tts_driver_sapi.h"
 
-TTS_Windows *TTS_Windows::singleton = nullptr;
-
-void __stdcall TTS_Windows::speech_event_callback(WPARAM wParam, LPARAM lParam) {
-	TTS_Windows *tts = TTS_Windows::get_singleton();
-	SPEVENT event;
-	while (tts->synth->GetEvents(1, &event, nullptr) == S_OK) {
-		uint32_t stream_num = (uint32_t)event.ulStreamNum;
-		if (tts->ids.has(stream_num)) {
-			if (event.eEventId == SPEI_START_INPUT_STREAM) {
-				DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServerEnums::TTS_UTTERANCE_STARTED, tts->ids[stream_num].id);
-			} else if (event.eEventId == SPEI_END_INPUT_STREAM) {
-				DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServerEnums::TTS_UTTERANCE_ENDED, tts->ids[stream_num].id);
-				tts->ids.erase(stream_num);
-				tts->update_requested = true;
-			} else if (event.eEventId == SPEI_WORD_BOUNDARY) {
-				const Char16String &string = tts->ids[stream_num].string;
-				int pos = 0;
-				for (int i = 0; i < MIN(event.lParam, string.length()); i++) {
-					char16_t c = string[i];
-					if ((c & 0xfffffc00) == 0xd800) {
-						i++;
-					}
-					pos++;
-				}
-				DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServerEnums::TTS_UTTERANCE_BOUNDARY, tts->ids[stream_num].id, pos - tts->ids[stream_num].offset);
-			}
-		}
-	}
-}
-
-void TTS_Windows::process_events() {
-	if (update_requested && !paused && queue.size() > 0 && !is_speaking()) {
-		TTSUtterance &message = queue.front()->get();
-
-		String text;
-		DWORD flags = SPF_ASYNC | SPF_PURGEBEFORESPEAK | SPF_IS_XML;
-		String pitch_tag = String("<pitch absmiddle=\"") + String::num_int64(message.pitch * 10 - 10, 10) + String("\">");
-		text = pitch_tag + message.text + String("</pitch>");
-
-		IEnumSpObjectTokens *cpEnum;
-		ISpObjectToken *cpVoiceToken;
-		ULONG ulCount = 0;
-		ULONG stream_number = 0;
-		ISpObjectTokenCategory *cpCategory;
-		HRESULT hr = CoCreateInstance(CLSID_SpObjectTokenCategory, nullptr, CLSCTX_INPROC_SERVER, IID_ISpObjectTokenCategory, (void **)&cpCategory);
-		if (SUCCEEDED(hr)) {
-			hr = cpCategory->SetId(SPCAT_VOICES, false);
-			if (SUCCEEDED(hr)) {
-				hr = cpCategory->EnumTokens(nullptr, nullptr, &cpEnum);
-				if (SUCCEEDED(hr)) {
-					hr = cpEnum->GetCount(&ulCount);
-					while (SUCCEEDED(hr) && ulCount--) {
-						wchar_t *w_id = nullptr;
-						hr = cpEnum->Next(1, &cpVoiceToken, nullptr);
-						cpVoiceToken->GetId(&w_id);
-						if (String::utf16((const char16_t *)w_id) == message.voice) {
-							synth->SetVoice(cpVoiceToken);
-							cpVoiceToken->Release();
-							break;
-						}
-						cpVoiceToken->Release();
-					}
-					cpEnum->Release();
-				}
-			}
-			cpCategory->Release();
-		}
-
-		UTData ut;
-		ut.string = text.utf16();
-		ut.offset = pitch_tag.length(); // Subtract injected <pitch> tag offset.
-		ut.id = message.id;
+#ifdef WINRT_ENABLED
+#include "tts_driver_onecore.h"
+#endif
 
-		synth->SetVolume(message.volume);
-		synth->SetRate(10.f * std::log10(message.rate) / std::log10(3.f));
-		synth->Speak((LPCWSTR)ut.string.get_data(), flags, &stream_number);
-
-		ids[(uint32_t)stream_number] = ut;
-
-		queue.pop_front();
+TTS_Windows *TTS_Windows::singleton = nullptr; // NOTE(review): this hunk removes `singleton = this` / `singleton = nullptr` from the ctor/dtor and adds no new assignment, so get_singleton() appears to always return nullptr — confirm.
 
-		update_requested = false;
-	}
+TTS_Windows *TTS_Windows::get_singleton() { // Accessor for the static instance pointer.
+	return singleton; // Returned as-is; may be null.
 }
 
 bool TTS_Windows::is_speaking() const {
-	ERR_FAIL_NULL_V(synth, false);
-
-	SPVOICESTATUS status;
-	synth->GetStatus(&status, nullptr);
-	return (status.dwRunningState == SPRS_IS_SPEAKING || status.dwRunningState == 0 /* Waiting To Speak */);
+	if (driver) { // `driver` is null when no TTS backend could be initialized in the constructor.
+		return driver->is_speaking();
+	}
+	return false; // Fallback when there is no driver.
 }
 
 bool TTS_Windows::is_paused() const {
-	ERR_FAIL_NULL_V(synth, false);
-	return paused;
+	if (driver) { // Delegate to the active backend, if any.
+		return driver->is_paused();
+	}
+	return false; // Fallback when there is no driver.
 }
 
 Array TTS_Windows::get_voices() const {
-	Array list;
-	IEnumSpObjectTokens *cpEnum;
-	ISpObjectToken *cpVoiceToken;
-	ISpDataKey *cpDataKeyAttribs;
-	ULONG ulCount = 0;
-	ISpObjectTokenCategory *cpCategory;
-	HRESULT hr = CoCreateInstance(CLSID_SpObjectTokenCategory, nullptr, CLSCTX_INPROC_SERVER, IID_ISpObjectTokenCategory, (void **)&cpCategory);
-	if (SUCCEEDED(hr)) {
-		hr = cpCategory->SetId(SPCAT_VOICES, false);
-		if (SUCCEEDED(hr)) {
-			hr = cpCategory->EnumTokens(nullptr, nullptr, &cpEnum);
-			if (SUCCEEDED(hr)) {
-				hr = cpEnum->GetCount(&ulCount);
-				while (SUCCEEDED(hr) && ulCount--) {
-					hr = cpEnum->Next(1, &cpVoiceToken, nullptr);
-					HRESULT hr_attr = cpVoiceToken->OpenKey(SPTOKENKEY_ATTRIBUTES, &cpDataKeyAttribs);
-					if (SUCCEEDED(hr_attr)) {
-						wchar_t *w_id = nullptr;
-						wchar_t *w_lang = nullptr;
-						wchar_t *w_name = nullptr;
-						cpVoiceToken->GetId(&w_id);
-						cpDataKeyAttribs->GetStringValue(L"Language", &w_lang);
-						cpDataKeyAttribs->GetStringValue(nullptr, &w_name);
-						LCID locale = wcstol(w_lang, nullptr, 16);
-
-						int locale_chars = GetLocaleInfoW(locale, LOCALE_SISO639LANGNAME, nullptr, 0);
-						int region_chars = GetLocaleInfoW(locale, LOCALE_SISO3166CTRYNAME, nullptr, 0);
-						wchar_t *w_lang_code = new wchar_t[locale_chars];
-						wchar_t *w_reg_code = new wchar_t[region_chars];
-						GetLocaleInfoW(locale, LOCALE_SISO639LANGNAME, w_lang_code, locale_chars);
-						GetLocaleInfoW(locale, LOCALE_SISO3166CTRYNAME, w_reg_code, region_chars);
-
-						Dictionary voice_d;
-						voice_d["id"] = String::utf16((const char16_t *)w_id);
-						if (w_name) {
-							voice_d["name"] = String::utf16((const char16_t *)w_name);
-						} else {
-							voice_d["name"] = voice_d["id"].operator String().replace("HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Speech\\Voices\\Tokens\\", "");
-						}
-						voice_d["language"] = String::utf16((const char16_t *)w_lang_code) + "_" + String::utf16((const char16_t *)w_reg_code);
-						list.push_back(voice_d);
-
-						delete[] w_lang_code;
-						delete[] w_reg_code;
-
-						cpDataKeyAttribs->Release();
-					}
-					cpVoiceToken->Release();
-				}
-				cpEnum->Release();
-			}
-		}
-		cpCategory->Release();
+	if (driver) { // Voice enumeration is now implemented by the backend driver.
+		return driver->get_voices();
 	}
-	return list;
+	return Array(); // Empty voice list when there is no driver.
 }
 
 void TTS_Windows::speak(const String &p_text, const String &p_voice, int p_volume, float p_pitch, float p_rate, int64_t p_utterance_id, bool p_interrupt) {
-	ERR_FAIL_NULL(synth);
-	if (p_interrupt) {
-		stop();
-	}
-
-	if (p_text.is_empty()) {
-		DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServerEnums::TTS_UTTERANCE_CANCELED, p_utterance_id);
-		return;
-	}
-
-	TTSUtterance message;
-	message.text = p_text;
-	message.voice = p_voice;
-	message.volume = CLAMP(p_volume, 0, 100);
-	message.pitch = CLAMP(p_pitch, 0.f, 2.f);
-	message.rate = CLAMP(p_rate, 0.1f, 10.f);
-	message.id = p_utterance_id;
-	queue.push_back(message);
-
-	if (is_paused()) {
-		resume();
-	} else {
-		update_requested = true;
+	if (driver) { // Without a driver the request is dropped and no utterance event is posted here.
+		driver->speak(p_text, p_voice, p_volume, p_pitch, p_rate, p_utterance_id, p_interrupt); // All arguments are forwarded unchanged.
 	}
 }
 
 void TTS_Windows::pause() {
-	ERR_FAIL_NULL(synth);
-	if (!paused) {
-		if (synth->Pause() == S_OK) {
-			paused = true;
-		}
+	if (driver) { // No-op when there is no driver.
+		driver->pause();
 	}
 }
 
 void TTS_Windows::resume() {
-	ERR_FAIL_NULL(synth);
-	synth->Resume();
-	paused = false;
+	if (driver) { // No-op when there is no driver.
+		driver->resume();
+	}
 }
 
 void TTS_Windows::stop() {
-	ERR_FAIL_NULL(synth);
-
-	SPVOICESTATUS status;
-	synth->GetStatus(&status, nullptr);
-	uint32_t current_stream = (uint32_t)status.ulCurrentStream;
-	if (ids.has(current_stream)) {
-		DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServerEnums::TTS_UTTERANCE_CANCELED, ids[current_stream].id);
-		ids.erase(current_stream);
-	}
-	for (TTSUtterance &message : queue) {
-		DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServerEnums::TTS_UTTERANCE_CANCELED, message.id);
+	if (driver) { // No-op when there is no driver.
+		driver->stop();
 	}
-	queue.clear();
-	synth->Speak(nullptr, SPF_PURGEBEFORESPEAK, nullptr);
-	synth->Resume();
-	paused = false;
 }
 
-TTS_Windows *TTS_Windows::get_singleton() {
-	return singleton;
+void TTS_Windows::process_events() { // Forwards event processing to the active driver.
+	if (driver) { // No-op when there is no driver.
+		driver->process_events();
+	}
 }
 
 TTS_Windows::TTS_Windows() {
-	singleton = this;
-
-	if (SUCCEEDED(CoCreateInstance(CLSID_SpVoice, nullptr, CLSCTX_ALL, IID_ISpVoice, (void **)&synth))) {
-		ULONGLONG event_mask = SPFEI(SPEI_END_INPUT_STREAM) | SPFEI(SPEI_START_INPUT_STREAM) | SPFEI(SPEI_WORD_BOUNDARY);
-		synth->SetInterest(event_mask, event_mask);
-		synth->SetNotifyCallbackFunction(&speech_event_callback, (WPARAM)(this), 0);
-		print_verbose("Text-to-Speech: SAPI initialized.");
-	} else {
-		print_verbose("Text-to-Speech: Cannot initialize ISpVoice!");
+#ifdef WINRT_ENABLED
+	// Try OneCore driver.
+	if (!driver) {
+		driver = memnew(TTSDriverOneCore);
+		if (!driver->init()) {
+			memdelete(driver); // Discard the failed backend so the next one can be tried.
+			driver = nullptr;
+		}
+	}
+#endif
+	// Try SAPI driver.
+	if (!driver) { // Only taken when no earlier driver initialized successfully.
+		driver = memnew(TTSDriverSAPI);
+		if (!driver->init()) {
+			memdelete(driver);
+			driver = nullptr; // No backend left; every method then takes its `driver == nullptr` path.
+		}
 	}
 }
 
 TTS_Windows::~TTS_Windows() {
-	if (synth) {
-		synth->Release();
+	if (driver) { // `driver` is null when every backend failed to initialize.
+		memdelete(driver); // Frees the driver allocated with memnew() in the constructor.
 	}
-	singleton = nullptr;
 }
diff --git a/platform/windows/tts_windows.h b/platform/windows/tts_windows.h
index 5efef4c6e896..a2f7d9140410 100644
--- a/platform/windows/tts_windows.h
+++ b/platform/windows/tts_windows.h
@@ -31,31 +31,14 @@
 #pragma once
 
 #include "core/string/ustring.h"
-#include "core/templates/hash_map.h"
-#include "core/templates/list.h"
 #include "core/variant/array.h"
 
-#include <windows.h>
-
-#include <objbase.h>
-#include <sapi.h>
-#include <winnls.h>
+class TTSDriver;
 
 struct TTSUtterance;
 
 class TTS_Windows {
-	List<TTSUtterance> queue;
-	ISpVoice *synth = nullptr;
-	bool paused = false;
-	struct UTData {
-		Char16String string;
-		int offset;
-		int64_t id;
-	};
-	HashMap<uint32_t, UTData> ids;
-	bool update_requested = false;
-
-	static void __stdcall speech_event_callback(WPARAM wParam, LPARAM lParam);
+	TTSDriver *driver = nullptr;
 
 	static TTS_Windows *singleton;