From 60acd1a977c7eeda2c8d8f8c25d52a8a816ca215 Mon Sep 17 00:00:00 2001 From: michalkulakowski Date: Wed, 3 Sep 2025 11:36:41 +0200 Subject: [PATCH 01/23] Speech pipeline POC --- src/BUILD | 1 + src/http_rest_api_handler.cpp | 2 + .../mediapipegraphdefinition.cpp | 1 + src/speech/BUILD | 53 ++++ src/speech/http_speech_calculator.cc | 273 ++++++++++++++++++ src/speech/speech_calculator.proto | 46 +++ 6 files changed, 376 insertions(+) create mode 100644 src/speech/BUILD create mode 100644 src/speech/http_speech_calculator.cc create mode 100644 src/speech/speech_calculator.proto diff --git a/src/BUILD b/src/BUILD index 7e8d8b04b4..2446bca768 100644 --- a/src/BUILD +++ b/src/BUILD @@ -562,6 +562,7 @@ ovms_cc_library( "//conditions:default": [], "//:not_disable_mediapipe" : [ "//src/image_gen:image_gen_calculator", + "//src/speech:speech_calculator", "//src/image_gen:imagegen_init", "//src/llm:openai_completions_api_handler", "//src/embeddings:embeddingscalculator", diff --git a/src/http_rest_api_handler.cpp b/src/http_rest_api_handler.cpp index 05df6ce7ac..cb9049f191 100644 --- a/src/http_rest_api_handler.cpp +++ b/src/http_rest_api_handler.cpp @@ -495,6 +495,7 @@ static Status createV3HttpPayload( bool isUriBasedRouting = !isApplicationJson && !isMultiPart; // For content types other than "application/json" and "multipart/form-data", we look for model information in the URI if (isMultiPart) { + SPDLOG_ERROR("MULTIPART"); OVMS_PROFILE_SCOPE("multipart parse"); if (!multiPartParser->parse()) { SPDLOG_DEBUG("Failed to parse multipart content type request"); @@ -502,6 +503,7 @@ static Status createV3HttpPayload( } modelName = multiPartParser->getFieldByName("model"); if (modelName.empty()) { + SPDLOG_ERROR("model"); isUriBasedRouting = true; } else { SPDLOG_DEBUG("Model name from deduced from MultiPart field: {}", modelName); diff --git a/src/mediapipe_internal/mediapipegraphdefinition.cpp b/src/mediapipe_internal/mediapipegraphdefinition.cpp index ef4dbeda3b..28cf68acf2 100644 --- a/src/mediapipe_internal/mediapipegraphdefinition.cpp +++ b/src/mediapipe_internal/mediapipegraphdefinition.cpp @@ -61,6 +61,7 @@ const std::string MediapipeGraphDefinition::SCHEDULER_CLASS_NAME{"Mediapipe"}; const std::string MediapipeGraphDefinition::PYTHON_NODE_CALCULATOR_NAME{"PythonExecutorCalculator"}; const std::string MediapipeGraphDefinition::LLM_NODE_CALCULATOR_NAME{"LLMCalculator"}; const std::string MediapipeGraphDefinition::IMAGE_GEN_CALCULATOR_NAME{"ImageGenCalculator"}; +//const std::string MediapipeGraphDefinition::SPEECH_CALCULATOR_NAME{"SpeechCalculator"}; const std::string MediapipeGraphDefinition::EMBEDDINGS_NODE_CALCULATOR_NAME{"EmbeddingsCalculatorOV"}; const std::string MediapipeGraphDefinition::RERANK_NODE_CALCULATOR_NAME{"RerankCalculatorOV"}; diff --git a/src/speech/BUILD b/src/speech/BUILD new file mode 100644 index 0000000000..ffec290669 --- /dev/null +++ b/src/speech/BUILD @@ -0,0 +1,53 @@ +# +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +load("@mediapipe//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library", "mediapipe_proto_library") +load("//:common_settings.bzl", "ovms_cc_library") + +ovms_cc_library( + name = "llm_engine", # in fact this is genai library + srcs = [], + deps = ["@llm_engine//:llm_engine"], + visibility = ["//visibility:public"], + alwayslink = 1, +) + +ovms_cc_library( + name = "speech_calculator", + srcs = ["http_speech_calculator.cc"], + hdrs = ["dr_wav.h"], + deps = [ + "@mediapipe//mediapipe/framework:calculator_framework", + "//src:httppayload", + "//src:libovmslogging", + "speech_calculator_cc_proto", + ]+ select({ + "//conditions:default": ["//third_party:genai", ":llm_engine"], + "//:not_genai_bin" : [":llm_engine"], + }), + visibility = ["//visibility:public"], + alwayslink = 1, +) + +mediapipe_proto_library( + name = "speech_calculator_proto", + srcs = ["speech_calculator.proto"], + visibility = ["//visibility:private"], + deps = [ + "@mediapipe//mediapipe/framework:calculator_options_proto", + "@mediapipe//mediapipe/framework:calculator_proto", + ], +) diff --git a/src/speech/http_speech_calculator.cc b/src/speech/http_speech_calculator.cc new file mode 100644 index 0000000000..07bfad5fda --- /dev/null +++ b/src/speech/http_speech_calculator.cc @@ -0,0 +1,273 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#include + +#pragma warning(push) +#pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 6246 4456 6246) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/port/canonical_errors.h" +#pragma GCC diagnostic pop +#pragma warning(pop) + +#include "../http_payload.hpp" +#include "../logging.hpp" + +#pragma warning(push) +#pragma warning(disable : 6001 4324 6385 6386) +#include "absl/strings/escaping.h" +#include "absl/strings/str_cat.h" +#pragma warning(pop) + +#define DR_WAV_IMPLEMENTATION +#include "dr_wav.h" +#include "openvino/genai/whisper_pipeline.hpp" +#include "openvino/genai/speech_generation/text2speech_pipeline.hpp" + +#ifdef _WIN32 +# include +# include +#endif + +using namespace ovms; + +namespace mediapipe { + +// using SpeechPipelinesMap = std::unordered_map>; + + +const std::string SPEECH_SESSION_SIDE_PACKET_TAG = "SPEECH_NODE_RESOURCES"; + +#define COMMON_SAMPLE_RATE 16000 + +bool is_wav_buffer(const std::string buf) { + // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format + // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html + if (buf.size() < 12 || buf.substr(0, 4) != "RIFF" || buf.substr(8, 4) != "WAVE") { + return false; + } + + uint32_t chunk_size = *reinterpret_cast(buf.data() + 4); + if (chunk_size + 8 != buf.size()) { + return false; + } + + return true; +} + +ov::genai::RawSpeechInput read_wav(const std::string_view& wav_data) { + drwav wav; + +// if (filename == "-") { +// { +// #ifdef _WIN32 +// _setmode(_fileno(stdin), _O_BINARY); +// #endif + +// uint8_t buf[1024]; +// while (true) { +// const size_t n = fread(buf, 1, sizeof(buf), stdin); +// if (n == 0) { +// break; +// } +// wav_data.insert(wav_data.end(), buf, buf + n); +// } +// } + +// OPENVINO_ASSERT(drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr), +// "Failed to open WAV file from stdin"); + +// fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size()); +// } else if (is_wav_buffer(filename)) { +// OPENVINO_ASSERT(drwav_init_memory(&wav, filename.c_str(), filename.size(), nullptr), +// "Failed to open WAV file from fname buffer"); +// } else if (!drwav_init_file(&wav, filename.c_str(), nullptr)) { +// #if defined(WHISPER_FFMPEG) +// OPENVINO_ASSERT(ffmpeg_decode_audio(fname, wav_data) == 0, "Failed to ffmpeg decode") + +// OPENVINO_ASSERT(drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr), +// "Failed to read wav data as wav") +// #else +// throw std::runtime_error("failed to open as WAV file"); +// #endif +// } + OPENVINO_ASSERT(drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr), "Failed to open WAV file from stdin"); + if (wav.channels != 1 && wav.channels != 2) { + drwav_uninit(&wav); + throw std::runtime_error("WAV file must be mono or stereo"); + } + + if (wav.sampleRate != COMMON_SAMPLE_RATE) { + drwav_uninit(&wav); + throw std::runtime_error("WAV file must be " + std::string{COMMON_SAMPLE_RATE / 1000} + " kHz"); + } + + const uint64_t n = + wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size() / (wav.channels * wav.bitsPerSample / 8ul); + + std::vector pcm16; + pcm16.resize(n * wav.channels); + drwav_read_pcm_frames_s16(&wav, n, pcm16.data()); + drwav_uninit(&wav); + + // convert to mono, float + std::vector pcmf32; + pcmf32.resize(n); + if (wav.channels == 1) { + for (uint64_t i = 0; i < n; i++) { + pcmf32[i] = float(pcm16[i]) / 32768.0f; + } + } else { + for (uint64_t i = 0; i < n; i++) { + pcmf32[i] = float(pcm16[2 * i] + pcm16[2 * i + 1]) / 65536.0f; + } + } + + return pcmf32; +} + +std::variant> getFileFromPayload(const ovms::MultiPartParser& parser, const std::string& keyName) { + std::string_view value = parser.getFileContentByFieldName(keyName); + if (value.empty()) { + return std::nullopt; + } + return value; +} + +#define SET_OR_RETURN(TYPE, NAME, RHS) \ + auto NAME##_OPT = RHS; \ + RETURN_IF_HOLDS_STATUS(NAME##_OPT) \ + auto NAME = std::get(NAME##_OPT); + +#define RETURN_IF_HOLDS_STATUS(NAME) \ + if (std::holds_alternative(NAME)) { \ + return std::get(NAME); \ + } + +class SpeechCalculator : public CalculatorBase { + static const std::string INPUT_TAG_NAME; + static const std::string OUTPUT_TAG_NAME; + +public: + static absl::Status GetContract(CalculatorContract* cc) { + RET_CHECK(!cc->Inputs().GetTags().empty()); + RET_CHECK(!cc->Outputs().GetTags().empty()); + cc->Inputs().Tag(INPUT_TAG_NAME).Set(); + // cc->InputSidePackets().Tag(IMAGE_GEN_SESSION_SIDE_PACKET_TAG).Set(); // TODO: template? + cc->Outputs().Tag(OUTPUT_TAG_NAME).Set(); + return absl::OkStatus(); + } + + absl::Status Close(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "SpeechCalculator [Node: {} ] Close", cc->NodeName()); + return absl::OkStatus(); + } + + absl::Status Open(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "SpeechCalculator [Node: {}] Open start", cc->NodeName()); + return absl::OkStatus(); + } + + absl::Status Process(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "SpeechCalculator [Node: {}] Process start", cc->NodeName()); + + // ImageGenerationPipelinesMap pipelinesMap = cc->InputSidePackets().Tag(SPEECH_SESSION_SIDE_PACKET_TAG).Get(); + // auto it = pipelinesMap.find(cc->NodeName()); + // RET_CHECK(it != pipelinesMap.end()) << "Could not find initialized Speech node named: " << cc->NodeName(); + // auto pipe = it->second; + + auto payload = cc->Inputs().Tag(INPUT_TAG_NAME).Get(); + + std::unique_ptr images; // output + std::unique_ptr output; + if (absl::StartsWith(payload.uri, "/v3/audio/transcriptions")) { + if (payload.multipartParser->hasParseError()) + return absl::InvalidArgumentError("Failed to parse multipart data"); + + SET_OR_RETURN(std::optional, file, getFileFromPayload(*payload.multipartParser, "file")); + if(!file.has_value()){ + return absl::InvalidArgumentError(absl::StrCat("File parsing fails")); + } + ov::genai::WhisperPipeline pipeline("/models/audio/transcriptions", "CPU"); + ov::genai::WhisperGenerationConfig config = pipeline.get_generation_config(); + // 'task' and 'language' parameters are supported for multilingual models only + config.language = "<|en|>"; // can switch to <|zh|> for Chinese language + config.task = "transcribe"; + config.return_timestamps = true; + ov::genai::RawSpeechInput raw_speech = read_wav(file.value()); + output = std::make_unique(pipeline.generate(raw_speech)); + } else if(absl::StartsWith(payload.uri, "/v3/audio/speech")){ + if (payload.parsedJson->HasParseError()) + return absl::InvalidArgumentError("Failed to parse JSON"); + + if (!payload.parsedJson->IsObject()) { + return absl::InvalidArgumentError("JSON body must be an object"); + } + auto inputIt = payload.parsedJson->FindMember("input"); + if (inputIt == payload.parsedJson->MemberEnd()) { + return absl::InvalidArgumentError("input field is missing in JSON body"); + } + if (!inputIt->value.IsString()) { + return absl::InvalidArgumentError("input field is not a string"); + } + ov::genai::Text2SpeechPipeline pipeline("/models/audio/speech", "CPU"); + SPDLOG_ERROR("1"); + auto gen_speech = pipeline.generate(inputIt->value.GetString()); + drwav_data_format format; + format.container = drwav_container_riff; + format.format = DR_WAVE_FORMAT_IEEE_FLOAT; + format.channels = 1; + format.sampleRate = 16000; // assume it is always 16 KHz + format.bitsPerSample = gen_speech.speeches[0].get_element_type().bitwidth(); + + drwav wav; + void* ppData; + size_t pDataSize; + OPENVINO_ASSERT(drwav_init_memory_write(&wav, &ppData, &pDataSize, &format, nullptr), + "Failed to initialize WAV writer"); + auto waveform_size = gen_speech.speeches[0].get_size(); + size_t total_samples = waveform_size * format.channels; + auto waveform_ptr = gen_speech.speeches[0].data(); + + drwav_uint64 frames_written = drwav_write_pcm_frames(&wav, total_samples, waveform_ptr); + OPENVINO_ASSERT(frames_written == total_samples, "Failed to write not all frames"); + + SPDLOG_ERROR("SIZE {}", gen_speech.speeches[0].get_size()); + output = std::make_unique(reinterpret_cast(ppData), pDataSize); + //drwav_free(&wav) TODO: ?? + drwav_uninit(&wav); + SPDLOG_ERROR("3"); + }else { + return absl::InvalidArgumentError(absl::StrCat("Unsupported URI: ", payload.uri)); + } + + // auto outputOrStatus = generateJSONResponseFromOvTensor(*images); + // RETURN_IF_HOLDS_STATUS(outputOrStatus); + // auto output = std::move(std::get>(outputOrStatus)); + cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(output.release(), cc->InputTimestamp()); + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "SpeechCalculator [Node: {}] Process end", cc->NodeName()); + + return absl::OkStatus(); + } +}; + +const std::string SpeechCalculator::INPUT_TAG_NAME{"HTTP_REQUEST_PAYLOAD"}; +const std::string SpeechCalculator::OUTPUT_TAG_NAME{"HTTP_RESPONSE_PAYLOAD"}; + +REGISTER_CALCULATOR(SpeechCalculator); + +} // namespace mediapipe diff --git a/src/speech/speech_calculator.proto b/src/speech/speech_calculator.proto new file mode 100644 index 0000000000..b97bbc3298 --- /dev/null +++ b/src/speech/speech_calculator.proto @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +syntax = "proto2"; +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + + +message SpeechCalculatorOptions { + extend mediapipe.CalculatorOptions { + // https://github.com/google/mediapipe/issues/634 have to be unique in app + // no rule to obtain this + optional SpeechCalculatorOptions ext = 116423755; + } + + // fields required for GenAI pipeline initialization + required string models_path = 1; + optional string device = 2; + optional string plugin_config = 3; + + // fields used during inference + optional string max_resolution = 4 [default = "4096x4096"]; + optional string default_resolution = 5; + optional uint64 max_num_images_per_prompt = 6 [default = 10]; + optional uint64 default_num_inference_steps = 7 [default = 50]; + optional uint64 max_num_inference_steps = 8 [default = 100]; + + // static reshape setting, required for NPU, optional for other devices + optional string resolution = 9; + optional int64 num_images_per_prompt = 10; + optional float guidance_scale = 11; +} From af99fc55b21888706b0dd76e50f21ab40254a337 Mon Sep 17 00:00:00 2001 From: michalkulakowski Date: Wed, 3 Sep 2025 11:37:42 +0200 Subject: [PATCH 02/23] fix --- src/http_rest_api_handler.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/http_rest_api_handler.cpp b/src/http_rest_api_handler.cpp index cb9049f191..05df6ce7ac 100644 --- a/src/http_rest_api_handler.cpp +++ b/src/http_rest_api_handler.cpp @@ -495,7 +495,6 @@ static Status createV3HttpPayload( bool isUriBasedRouting = !isApplicationJson && !isMultiPart; // For content types other than "application/json" and "multipart/form-data", we look for model information in the URI if (isMultiPart) { - SPDLOG_ERROR("MULTIPART"); OVMS_PROFILE_SCOPE("multipart parse"); if (!multiPartParser->parse()) { SPDLOG_DEBUG("Failed to parse multipart content type request"); @@ -503,7 +502,6 @@ static Status createV3HttpPayload( } modelName = multiPartParser->getFieldByName("model"); if (modelName.empty()) { - SPDLOG_ERROR("model"); isUriBasedRouting = true; } else { SPDLOG_DEBUG("Model name from deduced from MultiPart field: {}", modelName); From 5e3cb5ba6630d97cc34dad50918b8535e99608ca Mon Sep 17 00:00:00 2001 From: michalkulakowski Date: Wed, 3 Sep 2025 13:48:51 +0200 Subject: [PATCH 03/23] update --- src/speech/http_speech_calculator.cc | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/speech/http_speech_calculator.cc b/src/speech/http_speech_calculator.cc index 07bfad5fda..e8abcd1bfb 100644 --- a/src/speech/http_speech_calculator.cc +++ b/src/speech/http_speech_calculator.cc @@ -105,7 +105,11 @@ ov::genai::RawSpeechInput read_wav(const std::string_view& wav_data) { // throw std::runtime_error("failed to open as WAV file"); // #endif // } - OPENVINO_ASSERT(drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr), "Failed to open WAV file from stdin"); + auto result = drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr); + if(result == false){ + SPDLOG_ERROR("FILE PARSING FAILED {}", result); + throw std::runtime_error("FILE PARSING FAILED"); + } if (wav.channels != 1 && wav.channels != 2) { drwav_uninit(&wav); throw std::runtime_error("WAV file must be mono or stereo"); @@ -208,7 +212,13 @@ class SpeechCalculator : public CalculatorBase { config.language = "<|en|>"; // can switch to <|zh|> for Chinese language config.task = "transcribe"; config.return_timestamps = true; - ov::genai::RawSpeechInput raw_speech = read_wav(file.value()); + ov::genai::RawSpeechInput raw_speech; + try { + raw_speech = read_wav(file.value()); + } catch(std::exception&){ + return absl::InvalidArgumentError("Audio file pasing failed"); + } + output = std::make_unique(pipeline.generate(raw_speech)); } else if(absl::StartsWith(payload.uri, "/v3/audio/speech")){ if (payload.parsedJson->HasParseError()) @@ -225,7 +235,6 @@ class SpeechCalculator : public CalculatorBase { return absl::InvalidArgumentError("input field is not a string"); } ov::genai::Text2SpeechPipeline pipeline("/models/audio/speech", "CPU"); - SPDLOG_ERROR("1"); auto gen_speech = pipeline.generate(inputIt->value.GetString()); drwav_data_format format; format.container = drwav_container_riff; @@ -237,20 +246,18 @@ class SpeechCalculator : public CalculatorBase { drwav wav; void* ppData; size_t pDataSize; - OPENVINO_ASSERT(drwav_init_memory_write(&wav, &ppData, &pDataSize, &format, nullptr), - "Failed to initialize WAV writer"); + auto waveform_size = gen_speech.speeches[0].get_size(); size_t total_samples = waveform_size * format.channels; auto waveform_ptr = gen_speech.speeches[0].data(); - + OPENVINO_ASSERT(drwav_init_memory_write_sequential_pcm_frames(&wav, &ppData, &pDataSize, &format, total_samples, nullptr), + "Failed to initialize WAV writer"); drwav_uint64 frames_written = drwav_write_pcm_frames(&wav, total_samples, waveform_ptr); OPENVINO_ASSERT(frames_written == total_samples, "Failed to write not all frames"); - SPDLOG_ERROR("SIZE {}", gen_speech.speeches[0].get_size()); output = std::make_unique(reinterpret_cast(ppData), pDataSize); - //drwav_free(&wav) TODO: ?? drwav_uninit(&wav); - SPDLOG_ERROR("3"); + drwav_free(ppData, NULL); }else { return absl::InvalidArgumentError(absl::StrCat("Unsupported URI: ", payload.uri)); } From fec1825fa6a655ac821dc260c0a2ff4530d7954c Mon Sep 17 00:00:00 2001 From: michalkulakowski Date: Thu, 4 Sep 2025 10:31:51 +0200 Subject: [PATCH 04/23] fix --- src/speech/http_speech_calculator.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/speech/http_speech_calculator.cc b/src/speech/http_speech_calculator.cc index e8abcd1bfb..b79f0e4df7 100644 --- a/src/speech/http_speech_calculator.cc +++ b/src/speech/http_speech_calculator.cc @@ -218,8 +218,10 @@ class SpeechCalculator : public CalculatorBase { } catch(std::exception&){ return absl::InvalidArgumentError("Audio file pasing failed"); } - - output = std::make_unique(pipeline.generate(raw_speech)); + std::string result = "{\"text\": \""; + result += pipeline.generate(raw_speech); + result.append("\"}"); + output = std::make_unique(result); } else if(absl::StartsWith(payload.uri, "/v3/audio/speech")){ if (payload.parsedJson->HasParseError()) return absl::InvalidArgumentError("Failed to parse JSON"); From 8524d72c9ef1604b898d1fc552dc6ecc35012928 Mon Sep 17 00:00:00 2001 From: michalkulakowski Date: Thu, 4 Sep 2025 13:57:43 +0200 Subject: [PATCH 05/23] fix --- .../mediapipegraphdefinition.cpp | 24 +++++++- .../mediapipegraphdefinition.hpp | 8 ++- .../mediapipegraphexecutor.cpp | 4 +- .../mediapipegraphexecutor.hpp | 4 ++ src/speech/BUILD | 8 +++ src/speech/http_speech_calculator.cc | 20 +++---- src/speech/speech_calculator.proto | 16 ++--- src/speech/speech_servable.hpp | 60 +++++++++++++++++++ 8 files changed, 120 insertions(+), 24 deletions(-) create mode 100644 src/speech/speech_servable.hpp diff --git a/src/mediapipe_internal/mediapipegraphdefinition.cpp b/src/mediapipe_internal/mediapipegraphdefinition.cpp index 28cf68acf2..241c7de283 100644 --- a/src/mediapipe_internal/mediapipegraphdefinition.cpp +++ b/src/mediapipe_internal/mediapipegraphdefinition.cpp @@ -61,7 +61,7 @@ const std::string MediapipeGraphDefinition::SCHEDULER_CLASS_NAME{"Mediapipe"}; const std::string MediapipeGraphDefinition::PYTHON_NODE_CALCULATOR_NAME{"PythonExecutorCalculator"}; const std::string MediapipeGraphDefinition::LLM_NODE_CALCULATOR_NAME{"LLMCalculator"}; const std::string MediapipeGraphDefinition::IMAGE_GEN_CALCULATOR_NAME{"ImageGenCalculator"}; -//const std::string MediapipeGraphDefinition::SPEECH_CALCULATOR_NAME{"SpeechCalculator"}; +const std::string MediapipeGraphDefinition::SPEECH_NODE_CALCULATOR_NAME{"SpeechCalculator"}; const std::string MediapipeGraphDefinition::EMBEDDINGS_NODE_CALCULATOR_NAME{"EmbeddingsCalculatorOV"}; const std::string MediapipeGraphDefinition::RERANK_NODE_CALCULATOR_NAME{"RerankCalculatorOV"}; @@ -555,6 +555,28 @@ Status MediapipeGraphDefinition::initializeNodes() { rerankServableMap.insert(std::pair>(nodeName, std::move(servable))); rerankServablesCleaningGuard.disableCleaning(); } + if (endsWith(config.node(i).calculator(), SPEECH_NODE_CALCULATOR_NAME)) { + auto& speechServableMap = this->sidePacketMaps.speechServableMap; + ResourcesCleaningGuard speechServablesCleaningGuard(speechServableMap); + if (!config.node(i).node_options().size()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Speech node missing options in graph: {}. ", this->name); + return StatusCode::LLM_NODE_MISSING_OPTIONS; + } + if (config.node(i).name().empty()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Speech node name is missing in graph: {}. ", this->name); + return StatusCode::LLM_NODE_MISSING_NAME; + } + std::string nodeName = config.node(i).name(); + if (speechServableMap.find(nodeName) != speechServableMap.end()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Speech node name: {} already used in graph: {}. ", nodeName, this->name); + return StatusCode::LLM_NODE_NAME_ALREADY_EXISTS; + } + mediapipe::SpeechCalculatorOptions nodeOptions; + config.node(i).node_options(0).UnpackTo(&nodeOptions); + std::shared_ptr servable = std::make_shared(nodeOptions.models_path(), nodeOptions.device(), mgconfig.getBasePath(), nodeOptions.mode()); + speechServableMap.insert(std::pair>(nodeName, std::move(servable))); + speechServablesCleaningGuard.disableCleaning(); + } } return StatusCode::OK; } diff --git a/src/mediapipe_internal/mediapipegraphdefinition.hpp b/src/mediapipe_internal/mediapipegraphdefinition.hpp index 1a6e98bfcf..e5d8a700de 100644 --- a/src/mediapipe_internal/mediapipegraphdefinition.hpp +++ b/src/mediapipe_internal/mediapipegraphdefinition.hpp @@ -46,6 +46,7 @@ #include "../sidepacket_servable.hpp" #include "../embeddings/embeddings_servable.hpp" #include "../rerank/rerank_servable.hpp" +#include "../speech/speech_servable.hpp" namespace ovms { class MediapipeGraphDefinitionUnloadGuard; @@ -62,6 +63,7 @@ struct ImageGenerationPipelines; using PythonNodeResourcesMap = std::unordered_map>; using GenAiServableMap = std::unordered_map>; using RerankServableMap = std::unordered_map>; +using SpeechServableMap = std::unordered_map>; using EmbeddingsServableMap = std::unordered_map>; using ImageGenerationPipelinesMap = std::unordered_map>; @@ -71,19 +73,22 @@ struct GraphSidePackets { ImageGenerationPipelinesMap imageGenPipelinesMap; EmbeddingsServableMap embeddingsServableMap; RerankServableMap rerankServableMap; + SpeechServableMap speechServableMap; void clear() { pythonNodeResourcesMap.clear(); genAiServableMap.clear(); imageGenPipelinesMap.clear(); embeddingsServableMap.clear(); rerankServableMap.clear(); + speechServableMap.clear(); } bool empty() { return (pythonNodeResourcesMap.empty() && genAiServableMap.empty() && imageGenPipelinesMap.empty() && embeddingsServableMap.empty() && - rerankServableMap.empty()); + rerankServableMap.empty() && + speechServableMap.empty()); } }; @@ -124,6 +129,7 @@ class MediapipeGraphDefinition { static const std::string IMAGE_GEN_CALCULATOR_NAME; static const std::string EMBEDDINGS_NODE_CALCULATOR_NAME; static const std::string RERANK_NODE_CALCULATOR_NAME; + static const std::string SPEECH_NODE_CALCULATOR_NAME; Status waitForLoaded(std::unique_ptr& unloadGuard, const uint32_t waitForLoadedTimeoutMicroseconds = WAIT_FOR_LOADED_DEFAULT_TIMEOUT_MICROSECONDS); // Pipelines are not versioned and any available definition has constant version equal 1. diff --git a/src/mediapipe_internal/mediapipegraphexecutor.cpp b/src/mediapipe_internal/mediapipegraphexecutor.cpp index aa95bf88ec..5545b13966 100644 --- a/src/mediapipe_internal/mediapipegraphexecutor.cpp +++ b/src/mediapipe_internal/mediapipegraphexecutor.cpp @@ -47,6 +47,7 @@ MediapipeGraphExecutor::MediapipeGraphExecutor( const GenAiServableMap& llmNodeResourcesMap, const EmbeddingsServableMap& embeddingsServableMap, const RerankServableMap& rerankServableMap, + const SpeechServableMap& speechServableMap, PythonBackend* pythonBackend, MediapipeServableMetricReporter* mediapipeServableMetricReporter) : name(name), @@ -56,7 +57,7 @@ MediapipeGraphExecutor::MediapipeGraphExecutor( outputTypes(std::move(outputTypes)), inputNames(std::move(inputNames)), outputNames(std::move(outputNames)), - sidePacketMaps({pythonNodeResourcesMap, llmNodeResourcesMap, {}, embeddingsServableMap, rerankServableMap}), + sidePacketMaps({pythonNodeResourcesMap, llmNodeResourcesMap, {}, embeddingsServableMap, rerankServableMap, speechServableMap}), pythonBackend(pythonBackend), currentStreamTimestamp(STARTING_TIMESTAMP), mediapipeServableMetricReporter(mediapipeServableMetricReporter) {} @@ -88,6 +89,7 @@ const std::string MediapipeGraphExecutor::LLM_SESSION_SIDE_PACKET_TAG = "llm"; const std::string MediapipeGraphExecutor::IMAGE_GEN_SESSION_SIDE_PACKET_TAG = "pipes"; const std::string MediapipeGraphExecutor::EMBEDDINGS_SESSION_SIDE_PACKET_TAG = "embeddings_servable"; const std::string MediapipeGraphExecutor::RERANK_SESSION_SIDE_PACKET_TAG = "rerank_servable"; +const std::string MediapipeGraphExecutor::SPEECH_SESSION_SIDE_PACKET_TAG = "speech_servable"; const ::mediapipe::Timestamp MediapipeGraphExecutor::STARTING_TIMESTAMP = ::mediapipe::Timestamp(0); } // namespace ovms diff --git a/src/mediapipe_internal/mediapipegraphexecutor.hpp b/src/mediapipe_internal/mediapipegraphexecutor.hpp index b2468f5540..f3c6907aaa 100644 --- a/src/mediapipe_internal/mediapipegraphexecutor.hpp +++ b/src/mediapipe_internal/mediapipegraphexecutor.hpp @@ -93,6 +93,7 @@ class MediapipeGraphExecutor { static const std::string IMAGE_GEN_SESSION_SIDE_PACKET_TAG; static const std::string EMBEDDINGS_SESSION_SIDE_PACKET_TAG; static const std::string RERANK_SESSION_SIDE_PACKET_TAG; + static const std::string SPEECH_SESSION_SIDE_PACKET_TAG; static const ::mediapipe::Timestamp STARTING_TIMESTAMP; MediapipeGraphExecutor(const std::string& name, const std::string& version, const ::mediapipe::CalculatorGraphConfig& config, @@ -103,6 +104,7 @@ class MediapipeGraphExecutor { const GenAiServableMap& llmNodeResourcesMap, const EmbeddingsServableMap& embeddingsServableMap, const RerankServableMap& rerankServableMap, + const SpeechServableMap& speechServableMap, PythonBackend* pythonBackend, MediapipeServableMetricReporter* mediapipeServableMetricReporter); MediapipeGraphExecutor(const std::string& name, const std::string& version, const ::mediapipe::CalculatorGraphConfig& config, @@ -151,6 +153,8 @@ class MediapipeGraphExecutor { inputSidePackets[EMBEDDINGS_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.embeddingsServableMap).At(STARTING_TIMESTAMP); inputSidePackets[RERANK_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.rerankServableMap).At(STARTING_TIMESTAMP); + inputSidePackets[SPEECH_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.speechServableMap).At(STARTING_TIMESTAMP); + MP_RETURN_ON_FAIL(graph.StartRun(inputSidePackets), std::string("start MediaPipe graph: ") + this->name, StatusCode::MEDIAPIPE_GRAPH_START_ERROR); ::mediapipe::Packet packet; diff --git a/src/speech/BUILD b/src/speech/BUILD index ffec290669..b072e9fbcd 100644 --- a/src/speech/BUILD +++ b/src/speech/BUILD @@ -17,6 +17,13 @@ load("@mediapipe//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library", "mediapipe_proto_library") load("//:common_settings.bzl", "ovms_cc_library") +ovms_cc_library( + name = "speech_servable", + hdrs = ["speech_servable.hpp"], + visibility = ["//visibility:public"], + alwayslink = 1, +) + ovms_cc_library( name = "llm_engine", # in fact this is genai library srcs = [], @@ -34,6 +41,7 @@ ovms_cc_library( "//src:httppayload", "//src:libovmslogging", "speech_calculator_cc_proto", + ":speech_servable", ]+ select({ "//conditions:default": ["//third_party:genai", ":llm_engine"], "//:not_genai_bin" : [":llm_engine"], diff --git a/src/speech/http_speech_calculator.cc b/src/speech/http_speech_calculator.cc index b79f0e4df7..258318419a 100644 --- a/src/speech/http_speech_calculator.cc +++ b/src/speech/http_speech_calculator.cc @@ -37,6 +37,7 @@ #include "dr_wav.h" #include "openvino/genai/whisper_pipeline.hpp" #include "openvino/genai/speech_generation/text2speech_pipeline.hpp" +#include "speech_servable.hpp" #ifdef _WIN32 # include @@ -171,7 +172,7 @@ class SpeechCalculator : public CalculatorBase { RET_CHECK(!cc->Inputs().GetTags().empty()); RET_CHECK(!cc->Outputs().GetTags().empty()); cc->Inputs().Tag(INPUT_TAG_NAME).Set(); - // cc->InputSidePackets().Tag(IMAGE_GEN_SESSION_SIDE_PACKET_TAG).Set(); // TODO: template? + cc->InputSidePackets().Tag(SPEECH_SESSION_SIDE_PACKET_TAG).Set(); // TODO: template? cc->Outputs().Tag(OUTPUT_TAG_NAME).Set(); return absl::OkStatus(); } @@ -189,10 +190,10 @@ class SpeechCalculator : public CalculatorBase { absl::Status Process(CalculatorContext* cc) final { SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "SpeechCalculator [Node: {}] Process start", cc->NodeName()); - // ImageGenerationPipelinesMap pipelinesMap = cc->InputSidePackets().Tag(SPEECH_SESSION_SIDE_PACKET_TAG).Get(); - // auto it = pipelinesMap.find(cc->NodeName()); - // RET_CHECK(it != pipelinesMap.end()) << "Could not find initialized Speech node named: " << cc->NodeName(); - // auto pipe = it->second; + SpeechServableMap pipelinesMap = cc->InputSidePackets().Tag(SPEECH_SESSION_SIDE_PACKET_TAG).Get(); + auto it = pipelinesMap.find(cc->NodeName()); + RET_CHECK(it != pipelinesMap.end()) << "Could not find initialized Speech node named: " << cc->NodeName(); + auto pipe = it->second; auto payload = cc->Inputs().Tag(INPUT_TAG_NAME).Get(); @@ -206,8 +207,7 @@ class SpeechCalculator : public CalculatorBase { if(!file.has_value()){ return absl::InvalidArgumentError(absl::StrCat("File parsing fails")); } - ov::genai::WhisperPipeline pipeline("/models/audio/transcriptions", "CPU"); - ov::genai::WhisperGenerationConfig config = pipeline.get_generation_config(); + ov::genai::WhisperGenerationConfig config = pipe->whisperPipeline->get_generation_config(); // 'task' and 'language' parameters are supported for multilingual models only config.language = "<|en|>"; // can switch to <|zh|> for Chinese language config.task = "transcribe"; @@ -219,7 +219,7 @@ class SpeechCalculator : public CalculatorBase { return absl::InvalidArgumentError("Audio file pasing failed"); } std::string result = "{\"text\": \""; - result += pipeline.generate(raw_speech); + result += pipe->whisperPipeline->generate(raw_speech); result.append("\"}"); output = std::make_unique(result); } else if(absl::StartsWith(payload.uri, "/v3/audio/speech")){ @@ -236,8 +236,8 @@ class SpeechCalculator : public CalculatorBase { if (!inputIt->value.IsString()) { return absl::InvalidArgumentError("input field is not a string"); } - ov::genai::Text2SpeechPipeline pipeline("/models/audio/speech", "CPU"); - auto gen_speech = pipeline.generate(inputIt->value.GetString()); + + auto gen_speech = pipe->text2SpeechPipeline->generate(inputIt->value.GetString()); drwav_data_format format; format.container = drwav_container_riff; format.format = DR_WAVE_FORMAT_IEEE_FLOAT; diff --git a/src/speech/speech_calculator.proto b/src/speech/speech_calculator.proto index b97bbc3298..80ed29ccd6 100644 --- a/src/speech/speech_calculator.proto +++ b/src/speech/speech_calculator.proto @@ -31,16 +31,10 @@ message SpeechCalculatorOptions { required string models_path = 1; optional string device = 2; optional string plugin_config = 3; + enum Mode { + TEXT_TO_SPEECH = 0; + SPEECH_TO_TEXT = 1; + } - // fields used during inference - optional string max_resolution = 4 [default = "4096x4096"]; - optional string default_resolution = 5; - optional uint64 max_num_images_per_prompt = 6 [default = 10]; - optional uint64 default_num_inference_steps = 7 [default = 50]; - optional uint64 max_num_inference_steps = 8 [default = 100]; - - // static reshape setting, required for NPU, optional for other devices - optional string resolution = 9; - optional int64 num_images_per_prompt = 10; - optional float guidance_scale = 11; + required Mode mode = 4 [default = TEXT_TO_SPEECH]; } diff --git a/src/speech/speech_servable.hpp b/src/speech/speech_servable.hpp new file mode 100644 index 0000000000..47a98ca3df --- /dev/null +++ b/src/speech/speech_servable.hpp @@ -0,0 +1,60 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#pragma once + +#include +#include +#include +#include + +#pragma warning(push) +#pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 4005 4456 6246) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#include "mediapipe/framework/calculator_graph.h" +#pragma GCC diagnostic pop +#pragma warning(pop) + +#include "openvino/genai/whisper_pipeline.hpp" +#include "openvino/genai/speech_generation/text2speech_pipeline.hpp" +#include "src/speech/speech_calculator.pb.h" + + +namespace ovms { + +struct SpeechServable { + std::filesystem::path parsedModelsPath; + std::shared_ptr whisperPipeline; + std::shared_ptr text2SpeechPipeline; + + SpeechServable(const std::string& modelDir, const std::string& targetDevice, const std::string& graphPath, mediapipe::SpeechCalculatorOptions::Mode mode) { + auto fsModelsPath = std::filesystem::path(modelDir); + if (fsModelsPath.is_relative()) { + parsedModelsPath = (std::filesystem::path(graphPath) / fsModelsPath); + } else { + parsedModelsPath = fsModelsPath.string(); + } + if(mode == mediapipe::SpeechCalculatorOptions::TEXT_TO_SPEECH){ + text2SpeechPipeline = std::make_shared(parsedModelsPath.string(), targetDevice); + } + else{ + whisperPipeline = std::make_shared(parsedModelsPath.string(), targetDevice); + } + } +}; + +using SpeechServableMap = std::unordered_map>; +} // namespace ovms From 4ec76f283e3786d8d3099f2b1cf7ad1785191859 Mon Sep 17 00:00:00 2001 From: michalkulakowski Date: Thu, 11 Sep 2025 11:19:32 +0200 Subject: [PATCH 06/23] fix --- src/http_rest_api_handler.cpp | 2 + src/speech/BUILD | 2 +- src/speech/http_speech_calculator.cc | 92 +++++++++++++++++++++++++++- src/version.hpp | 6 +- 4 files changed, 97 insertions(+), 5 deletions(-) diff --git a/src/http_rest_api_handler.cpp b/src/http_rest_api_handler.cpp index 05df6ce7ac..48ed47b155 100644 --- a/src/http_rest_api_handler.cpp +++ b/src/http_rest_api_handler.cpp @@ -506,6 +506,8 @@ static Status createV3HttpPayload( } else { SPDLOG_DEBUG("Model name from deduced from MultiPart field: {}", modelName); } + auto stream = multiPartParser->getFieldByName("stream"); + SPDLOG_ERROR("{}", stream); ensureJsonParserInErrorState(parsedJson); } else if (isApplicationJson) { { diff --git a/src/speech/BUILD b/src/speech/BUILD index b072e9fbcd..680a015cc6 100644 --- a/src/speech/BUILD +++ b/src/speech/BUILD @@ -35,7 +35,7 @@ ovms_cc_library( ovms_cc_library( name = "speech_calculator", srcs = ["http_speech_calculator.cc"], - hdrs = ["dr_wav.h"], + hdrs = ["dr_wav.h", "dr_mp3.h"], deps = [ "@mediapipe//mediapipe/framework:calculator_framework", "//src:httppayload", diff --git a/src/speech/http_speech_calculator.cc b/src/speech/http_speech_calculator.cc index 258318419a..af13d21466 100644 --- a/src/speech/http_speech_calculator.cc +++ b/src/speech/http_speech_calculator.cc @@ -35,6 +35,11 @@ #define DR_WAV_IMPLEMENTATION #include "dr_wav.h" +#define DR_MP3_IMPLEMENTATION +#pragma warning(push) +#pragma warning(disable : 6386 6262) +#include "dr_mp3.h" +#pragma warning(pop) #include "openvino/genai/whisper_pipeline.hpp" #include "openvino/genai/speech_generation/text2speech_pipeline.hpp" #include "speech_servable.hpp" @@ -145,6 +150,84 @@ ov::genai::RawSpeechInput read_wav(const std::string_view& wav_data) { return pcmf32; } +ov::genai::RawSpeechInput read_mp3(const std::string_view& mp3_data) { + drmp3 mp3; + +// if (filename == "-") { +// { +// #ifdef _WIN32 +// _setmode(_fileno(stdin), _O_BINARY); +// #endif + +// uint8_t buf[1024]; +// while (true) { +// const size_t n = fread(buf, 1, sizeof(buf), stdin); +// if (n == 0) { +// break; +// } +// wav_data.insert(wav_data.end(), buf, buf + n); +// } +// } + +// OPENVINO_ASSERT(drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr), +// "Failed to open WAV file from stdin"); + +// fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size()); +// } else if (is_wav_buffer(filename)) { +// OPENVINO_ASSERT(drwav_init_memory(&wav, filename.c_str(), filename.size(), nullptr), +// "Failed to open WAV file from fname buffer"); +// } else if (!drwav_init_file(&wav, filename.c_str(), nullptr)) { +// #if defined(WHISPER_FFMPEG) +// OPENVINO_ASSERT(ffmpeg_decode_audio(fname, wav_data) == 0, "Failed to ffmpeg decode") + +// OPENVINO_ASSERT(drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr), +// "Failed to read wav data as wav") +// #else +// throw std::runtime_error("failed to open as WAV file"); +// #endif +// } + SPDLOG_ERROR("1"); + auto result = drmp3_init_memory(&mp3, mp3_data.data(), mp3_data.size(), nullptr); + if(result == false){ + SPDLOG_ERROR("FILE PARSING FAILED {}", result); + throw std::runtime_error("FILE PARSING FAILED"); + } + SPDLOG_ERROR("2"); + if (mp3.channels != 1 && mp3.channels != 2) { + drmp3_uninit(&mp3); + throw std::runtime_error("WAV file must be mono or stereo"); + } + SPDLOG_ERROR("3 {}", mp3.sampleRate); + mp3.sampleRate = COMMON_SAMPLE_RATE; + // if (mp3.sampleRate != COMMON_SAMPLE_RATE) { + // drmp3_uninit(&mp3); + // throw std::runtime_error("WAV file must be " + std::string{COMMON_SAMPLE_RATE / 1000} + " kHz"); + // } + SPDLOG_ERROR("4"); + const uint64_t n = + mp3_data.empty() ? mp3.totalPCMFrameCount : mp3_data.size() / (mp3.channels * 4); + SPDLOG_ERROR("{}", mp3.totalPCMFrameCount); + std::vector pcm16; + pcm16.resize(n * mp3.channels); + drmp3_read_pcm_frames_s16(&mp3, n, pcm16.data()); + drmp3_uninit(&mp3); + SPDLOG_ERROR("5"); + // convert to mono, float + std::vector pcmf32; + pcmf32.resize(n); + if (mp3.channels == 1) { + for (uint64_t i = 0; i < n; i++) { + pcmf32[i] = float(pcm16[i]) / 32768.0f; + } + } else { + for (uint64_t i = 0; i < n; i++) { + pcmf32[i] = float(pcm16[2 * i] + pcm16[2 * i + 1]) / 65536.0f; + } + } + + return pcmf32; +} + std::variant> getFileFromPayload(const ovms::MultiPartParser& parser, const std::string& keyName) { std::string_view value = parser.getFileContentByFieldName(keyName); if (value.empty()) { @@ -204,6 +287,13 @@ class SpeechCalculator : public CalculatorBase { return absl::InvalidArgumentError("Failed to parse multipart data"); SET_OR_RETURN(std::optional, file, getFileFromPayload(*payload.multipartParser, "file")); + auto stream = getFileFromPayload(*payload.multipartParser, "stream"); + if(!std::holds_alternative(stream)){ + SPDLOG_ERROR("NO VALUE"); + } + else{ + SPDLOG_ERROR("{}", (std::get>(stream)).value()); + } if(!file.has_value()){ return absl::InvalidArgumentError(absl::StrCat("File parsing fails")); } @@ -214,7 +304,7 @@ class SpeechCalculator : public CalculatorBase { config.return_timestamps = true; ov::genai::RawSpeechInput raw_speech; try { - raw_speech = read_wav(file.value()); + raw_speech = read_mp3(file.value()); } catch(std::exception&){ return absl::InvalidArgumentError("Audio file pasing failed"); } diff --git a/src/version.hpp b/src/version.hpp index fbebc03710..a1f181f6cc 100644 --- a/src/version.hpp +++ b/src/version.hpp @@ -16,7 +16,7 @@ #ifndef SRC_VERSION_HPP_ #define SRC_VERSION_HPP_ #define PROJECT_NAME "OpenVINO Model Server" -#define PROJECT_VERSION "REPLACE_PROJECT_VERSION" -#define OPENVINO_NAME "REPLACE_OPENVINO_NAME" -#define BAZEL_BUILD_FLAGS "REPLACE_BAZEL_BUILD_FLAGS" +#define PROJECT_VERSION "2025.3.0.8524d72c" +#define OPENVINO_NAME "2025.3.0.0.dev20250812" +#define BAZEL_BUILD_FLAGS "--config=win_mp_on_py_off" #endif // SRC_VERSION_HPP_" From 51d77efc00be19dc2b0af5f4b1bcdd11bf5bd0b9 Mon Sep 17 00:00:00 2001 From: michalkulakowski Date: Wed, 17 Sep 2025 11:41:48 +0200 Subject: [PATCH 07/23] fix --- demos/common/export_models/README.md | 2 +- src/speech/http_speech_calculator.cc | 140 +++++++++++++++------------ src/speech/speech_servable.hpp | 2 + 3 files changed, 83 insertions(+), 61 deletions(-) diff --git a/demos/common/export_models/README.md b/demos/common/export_models/README.md index 16fd33fb6c..e4ed134772 100644 --- a/demos/common/export_models/README.md +++ b/demos/common/export_models/README.md @@ -98,7 +98,7 @@ options: #### Text Generation CPU Deployment ```console -python export_model.py text_generation --source_model meta-llama/Meta-Llama-3-8B-Instruct --weight-format fp16 --kv_cache_precision u8 --config_file_path models/config_all.json --model_repository_path models +python demos\common\export_models\export_model.py text_generation --source_model meta-llama/Llama-3.2-1B-Instruct --weight-format int4 --kv_cache_precision u8 --config_file_path config.json --model_repository_path audio ``` #### GPU Deployment (Low Concurrency, Limited Memory) diff --git a/src/speech/http_speech_calculator.cc b/src/speech/http_speech_calculator.cc index af13d21466..bd1a0496aa 100644 --- a/src/speech/http_speech_calculator.cc +++ b/src/speech/http_speech_calculator.cc @@ -26,6 +26,8 @@ #include "../http_payload.hpp" #include "../logging.hpp" +#include +#include #pragma warning(push) #pragma warning(disable : 6001 4324 6385 6386) @@ -150,42 +152,47 @@ ov::genai::RawSpeechInput read_wav(const std::string_view& wav_data) { return pcmf32; } -ov::genai::RawSpeechInput read_mp3(const std::string_view& mp3_data) { - drmp3 mp3; +float* resample_audio(const float* input, + size_t input_length, + float input_rate, + float target_rate, + size_t* output_length) { + if (input_rate == target_rate) { + *output_length = input_length; + float* output = (float*)malloc(input_length * sizeof(float)); + if (output) { + memcpy(output, input, input_length * sizeof(float)); + } + return output; + } -// if (filename == "-") { -// { -// #ifdef _WIN32 -// _setmode(_fileno(stdin), _O_BINARY); -// #endif + float ratio = input_rate / target_rate; + *output_length = (size_t)(input_length / ratio); + float* output = (float*)malloc(*output_length * sizeof(float)); -// uint8_t buf[1024]; -// while (true) { -// const size_t n = fread(buf, 1, sizeof(buf), stdin); -// if (n == 0) { -// break; -// } -// wav_data.insert(wav_data.end(), buf, buf + n); -// } -// } + if (!output) { + return NULL; + } -// OPENVINO_ASSERT(drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr), -// "Failed to open WAV file from stdin"); + for (size_t i = 0; i < *output_length; i++) { + float src_idx = i * ratio; + size_t idx0 = (size_t)src_idx; + size_t idx1 = idx0 + 1; -// fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size()); -// } else if (is_wav_buffer(filename)) { -// OPENVINO_ASSERT(drwav_init_memory(&wav, filename.c_str(), filename.size(), nullptr), -// "Failed to open WAV file from fname buffer"); -// } else if (!drwav_init_file(&wav, filename.c_str(), nullptr)) { -// #if defined(WHISPER_FFMPEG) -// OPENVINO_ASSERT(ffmpeg_decode_audio(fname, wav_data) == 0, "Failed to ffmpeg decode") + if (idx1 >= input_length) { + output[i] = input[input_length - 1]; + } else { + float frac = src_idx - idx0; + output[i] = input[idx0] * (1.0f - frac) + input[idx1] * frac; + } + } + + return output; +} + +ov::genai::RawSpeechInput read_mp3(const std::string_view& mp3_data) { + drmp3 mp3; -// OPENVINO_ASSERT(drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr), -// "Failed to read wav data as wav") -// #else -// throw std::runtime_error("failed to open as WAV file"); -// #endif -// } SPDLOG_ERROR("1"); auto result = drmp3_init_memory(&mp3, mp3_data.data(), mp3_data.size(), nullptr); if(result == false){ @@ -198,34 +205,34 @@ ov::genai::RawSpeechInput read_mp3(const std::string_view& mp3_data) { throw std::runtime_error("WAV file must be mono or stereo"); } SPDLOG_ERROR("3 {}", mp3.sampleRate); - mp3.sampleRate = COMMON_SAMPLE_RATE; // if (mp3.sampleRate != COMMON_SAMPLE_RATE) { // drmp3_uninit(&mp3); // throw std::runtime_error("WAV file must be " + std::string{COMMON_SAMPLE_RATE / 1000} + " kHz"); // } SPDLOG_ERROR("4"); - const uint64_t n = - mp3_data.empty() ? mp3.totalPCMFrameCount : mp3_data.size() / (mp3.channels * 4); - SPDLOG_ERROR("{}", mp3.totalPCMFrameCount); - std::vector pcm16; - pcm16.resize(n * mp3.channels); - drmp3_read_pcm_frames_s16(&mp3, n, pcm16.data()); + const uint64_t n = mp3.totalPCMFrameCount; + SPDLOG_ERROR("mp3.totalPCMFrameCount {} : n {}", mp3.totalPCMFrameCount, n); + std::vector pcmf32; + pcmf32.resize(n * mp3.channels); + drmp3_read_pcm_frames_f32(&mp3, n, pcmf32.data()); drmp3_uninit(&mp3); SPDLOG_ERROR("5"); // convert to mono, float - std::vector pcmf32; - pcmf32.resize(n); - if (mp3.channels == 1) { - for (uint64_t i = 0; i < n; i++) { - pcmf32[i] = float(pcm16[i]) / 32768.0f; - } - } else { - for (uint64_t i = 0; i < n; i++) { - pcmf32[i] = float(pcm16[2 * i] + pcm16[2 * i + 1]) / 65536.0f; - } - } - - return pcmf32; + // std::vector pcmf32; + // pcmf32.resize(n); + // if (mp3.channels == 1) { + // for (uint64_t i = 0; i < n; i++) { + // pcmf32[i] = float(pcm16[i]) / 32768.0f; + // } + // } else { + // for (uint64_t i = 0; i < n; i++) { + // pcmf32[i] = float(pcm16[2 * i] + pcm16[2 * i + 1]) / 65536.0f; + // } + // } + size_t output_length; + auto buffer = resample_audio(reinterpret_cast(pcmf32.data()), pcmf32.size(), mp3.sampleRate, 16000, &output_length); + std::vector output(buffer, buffer + output_length); + return output; } std::variant> getFileFromPayload(const ovms::MultiPartParser& parser, const std::string& keyName) { @@ -297,11 +304,11 @@ class SpeechCalculator : public CalculatorBase { if(!file.has_value()){ return absl::InvalidArgumentError(absl::StrCat("File parsing fails")); } - ov::genai::WhisperGenerationConfig config = pipe->whisperPipeline->get_generation_config(); - // 'task' and 'language' parameters are supported for multilingual models only - config.language = "<|en|>"; // can switch to <|zh|> for Chinese language - config.task = "transcribe"; - config.return_timestamps = true; + // ov::genai::WhisperGenerationConfig config = pipe->whisperPipeline->get_generation_config(); + // // 'task' and 'language' parameters are supported for multilingual models only + // config.language = "<|en|>"; // can switch to <|zh|> for Chinese language + // config.task = "transcribe"; + // config.return_timestamps = true; ov::genai::RawSpeechInput raw_speech; try { raw_speech = read_mp3(file.value()); @@ -309,8 +316,10 @@ class SpeechCalculator : public CalculatorBase { return absl::InvalidArgumentError("Audio file pasing failed"); } std::string result = "{\"text\": \""; + std::unique_lock lock(pipe->whisperPipelineMutex); result += pipe->whisperPipeline->generate(raw_speech); result.append("\"}"); + SPDLOG_ERROR("{}",result); output = std::make_unique(result); } else if(absl::StartsWith(payload.uri, "/v3/audio/speech")){ if (payload.parsedJson->HasParseError()) @@ -326,15 +335,24 @@ class SpeechCalculator : public CalculatorBase { if (!inputIt->value.IsString()) { return absl::InvalidArgumentError("input field is not a string"); } - + auto streamIt = payload.parsedJson->FindMember("stream_format"); + if (streamIt != payload.parsedJson->MemberEnd()) { + SPDLOG_ERROR("STREAM: {}", streamIt->value.GetString()); + } + else{ + SPDLOG_ERROR("NO STREAM"); + } + SPDLOG_ERROR("INPUT: {}", inputIt->value.GetString()); + std::unique_lock lock(pipe->text2SpeechPipelineMutex); auto gen_speech = pipe->text2SpeechPipeline->generate(inputIt->value.GetString()); + lock.unlock(); drwav_data_format format; format.container = drwav_container_riff; format.format = DR_WAVE_FORMAT_IEEE_FLOAT; format.channels = 1; format.sampleRate = 16000; // assume it is always 16 KHz format.bitsPerSample = gen_speech.speeches[0].get_element_type().bitwidth(); - + SPDLOG_ERROR("1"); drwav wav; void* ppData; size_t pDataSize; @@ -344,12 +362,14 @@ class SpeechCalculator : public CalculatorBase { auto waveform_ptr = gen_speech.speeches[0].data(); OPENVINO_ASSERT(drwav_init_memory_write_sequential_pcm_frames(&wav, &ppData, &pDataSize, &format, total_samples, nullptr), "Failed to initialize WAV writer"); + SPDLOG_ERROR("2"); drwav_uint64 frames_written = drwav_write_pcm_frames(&wav, total_samples, waveform_ptr); OPENVINO_ASSERT(frames_written == total_samples, "Failed to write not all frames"); - + SPDLOG_ERROR("3"); output = std::make_unique(reinterpret_cast(ppData), pDataSize); drwav_uninit(&wav); - drwav_free(ppData, NULL); + SPDLOG_ERROR("4"); + //drwav_free(ppData, NULL); }else { return absl::InvalidArgumentError(absl::StrCat("Unsupported URI: ", payload.uri)); } diff --git a/src/speech/speech_servable.hpp b/src/speech/speech_servable.hpp index 47a98ca3df..8097552d64 100644 --- a/src/speech/speech_servable.hpp +++ b/src/speech/speech_servable.hpp @@ -39,6 +39,8 @@ struct SpeechServable { std::filesystem::path parsedModelsPath; std::shared_ptr whisperPipeline; std::shared_ptr text2SpeechPipeline; + std::mutex whisperPipelineMutex; + std::mutex text2SpeechPipelineMutex; SpeechServable(const std::string& modelDir, const std::string& targetDevice, const std::string& graphPath, mediapipe::SpeechCalculatorOptions::Mode mode) { auto fsModelsPath = std::filesystem::path(modelDir); From d34991177411d7fa449526fc12c9d26527a6d6ae Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 3 Oct 2025 11:55:54 +0200 Subject: [PATCH 08/23] fix build --- WORKSPACE | 16 ++++++++++++++++ src/speech/BUILD | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/WORKSPACE b/WORKSPACE index bfff7ea463..facf9155e7 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -640,3 +640,19 @@ cc_library( ) """, ) + +new_git_repository( + name = "dr_libs", + remote = "https://github.com/mackron/dr_libs", + commit = "24d738be2349fd4b6fe50eeaa81f5bd586267fd0", + build_file_content = """ +cc_library( + name = "dr", + hdrs = ["dr_flac.h", "dr_mp3.h", "dr_wav.h"], + visibility = ["//visibility:public"], + local_defines = [ + ], +) +""", +) + diff --git a/src/speech/BUILD b/src/speech/BUILD index 680a015cc6..39ad5e14d4 100644 --- a/src/speech/BUILD +++ b/src/speech/BUILD @@ -35,12 +35,12 @@ ovms_cc_library( ovms_cc_library( name = "speech_calculator", srcs = ["http_speech_calculator.cc"], - hdrs = ["dr_wav.h", "dr_mp3.h"], deps = [ "@mediapipe//mediapipe/framework:calculator_framework", "//src:httppayload", "//src:libovmslogging", "speech_calculator_cc_proto", + "@dr_libs//:dr", ":speech_servable", ]+ select({ "//conditions:default": ["//third_party:genai", ":llm_engine"], From c33a736c7f131b427fc88e09b11eb07bce3ebaf1 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 3 Oct 2025 13:26:55 +0200 Subject: [PATCH 09/23] spelling --- src/speech/http_speech_calculator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/speech/http_speech_calculator.cc b/src/speech/http_speech_calculator.cc index bd1a0496aa..1ceab6ebdf 100644 --- a/src/speech/http_speech_calculator.cc +++ b/src/speech/http_speech_calculator.cc @@ -313,7 +313,7 @@ class SpeechCalculator : public CalculatorBase { try { raw_speech = read_mp3(file.value()); } catch(std::exception&){ - return absl::InvalidArgumentError("Audio file pasing failed"); + return absl::InvalidArgumentError("Audio file reading failed"); } std::string result = "{\"text\": \""; std::unique_lock lock(pipe->whisperPipelineMutex); From 4974868b50c08402db0beba85d8a623d6a800f98 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 3 Oct 2025 15:09:27 +0200 Subject: [PATCH 10/23] style --- src/speech/http_speech_calculator.cc | 111 +++++++++++++-------------- src/speech/speech_servable.hpp | 8 +- 2 files changed, 57 insertions(+), 62 deletions(-) diff --git a/src/speech/http_speech_calculator.cc b/src/speech/http_speech_calculator.cc index 1ceab6ebdf..af94099988 100644 --- a/src/speech/http_speech_calculator.cc +++ b/src/speech/http_speech_calculator.cc @@ -47,8 +47,8 @@ #include "speech_servable.hpp" #ifdef _WIN32 -# include -# include +#include +#include #endif using namespace ovms; @@ -57,7 +57,6 @@ namespace mediapipe { // using SpeechPipelinesMap = std::unordered_map>; - const std::string SPEECH_SESSION_SIDE_PACKET_TAG = "SPEECH_NODE_RESOURCES"; #define COMMON_SAMPLE_RATE 16000 @@ -80,41 +79,41 @@ bool is_wav_buffer(const std::string buf) { ov::genai::RawSpeechInput read_wav(const std::string_view& wav_data) { drwav wav; -// if (filename == "-") { -// { -// #ifdef _WIN32 -// _setmode(_fileno(stdin), _O_BINARY); -// #endif - -// uint8_t buf[1024]; -// while (true) { -// const size_t n = fread(buf, 1, sizeof(buf), stdin); -// if (n == 0) { -// break; -// } -// wav_data.insert(wav_data.end(), buf, buf + n); -// } -// } - -// OPENVINO_ASSERT(drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr), -// "Failed to open WAV file from stdin"); - -// fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size()); -// } else if (is_wav_buffer(filename)) { -// OPENVINO_ASSERT(drwav_init_memory(&wav, filename.c_str(), filename.size(), nullptr), -// "Failed to open WAV file from fname buffer"); -// } else if (!drwav_init_file(&wav, filename.c_str(), nullptr)) { -// #if defined(WHISPER_FFMPEG) -// OPENVINO_ASSERT(ffmpeg_decode_audio(fname, wav_data) == 0, "Failed to ffmpeg decode") - -// OPENVINO_ASSERT(drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr), -// "Failed to read wav data as wav") -// #else -// throw std::runtime_error("failed to open as WAV file"); -// #endif -// } - auto result = drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr); - if(result == false){ + // if (filename == "-") { + // { + // #ifdef _WIN32 + // _setmode(_fileno(stdin), _O_BINARY); + // #endif + + // uint8_t buf[1024]; + // while (true) { + // const size_t n = fread(buf, 1, sizeof(buf), stdin); + // if (n == 0) { + // break; + // } + // wav_data.insert(wav_data.end(), buf, buf + n); + // } + // } + + // OPENVINO_ASSERT(drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr), + // "Failed to open WAV file from stdin"); + + // fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size()); + // } else if (is_wav_buffer(filename)) { + // OPENVINO_ASSERT(drwav_init_memory(&wav, filename.c_str(), filename.size(), nullptr), + // "Failed to open WAV file from fname buffer"); + // } else if (!drwav_init_file(&wav, filename.c_str(), nullptr)) { + // #if defined(WHISPER_FFMPEG) + // OPENVINO_ASSERT(ffmpeg_decode_audio(fname, wav_data) == 0, "Failed to ffmpeg decode") + + // OPENVINO_ASSERT(drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr), + // "Failed to read wav data as wav") + // #else + // throw std::runtime_error("failed to open as WAV file"); + // #endif + // } + auto result = drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr); + if (result == false) { SPDLOG_ERROR("FILE PARSING FAILED {}", result); throw std::runtime_error("FILE PARSING FAILED"); } @@ -153,10 +152,10 @@ ov::genai::RawSpeechInput read_wav(const std::string_view& wav_data) { } float* resample_audio(const float* input, - size_t input_length, - float input_rate, - float target_rate, - size_t* output_length) { + size_t input_length, + float input_rate, + float target_rate, + size_t* output_length) { if (input_rate == target_rate) { *output_length = input_length; float* output = (float*)malloc(input_length * sizeof(float)); @@ -194,8 +193,8 @@ ov::genai::RawSpeechInput read_mp3(const std::string_view& mp3_data) { drmp3 mp3; SPDLOG_ERROR("1"); - auto result = drmp3_init_memory(&mp3, mp3_data.data(), mp3_data.size(), nullptr); - if(result == false){ + auto result = drmp3_init_memory(&mp3, mp3_data.data(), mp3_data.size(), nullptr); + if (result == false) { SPDLOG_ERROR("FILE PARSING FAILED {}", result); throw std::runtime_error("FILE PARSING FAILED"); } @@ -211,7 +210,7 @@ ov::genai::RawSpeechInput read_mp3(const std::string_view& mp3_data) { // } SPDLOG_ERROR("4"); const uint64_t n = mp3.totalPCMFrameCount; - SPDLOG_ERROR("mp3.totalPCMFrameCount {} : n {}", mp3.totalPCMFrameCount, n); + SPDLOG_ERROR("mp3.totalPCMFrameCount {} : n {}", mp3.totalPCMFrameCount, n); std::vector pcmf32; pcmf32.resize(n * mp3.channels); drmp3_read_pcm_frames_f32(&mp3, n, pcmf32.data()); @@ -295,13 +294,12 @@ class SpeechCalculator : public CalculatorBase { SET_OR_RETURN(std::optional, file, getFileFromPayload(*payload.multipartParser, "file")); auto stream = getFileFromPayload(*payload.multipartParser, "stream"); - if(!std::holds_alternative(stream)){ + if (!std::holds_alternative(stream)) { SPDLOG_ERROR("NO VALUE"); - } - else{ + } else { SPDLOG_ERROR("{}", (std::get>(stream)).value()); } - if(!file.has_value()){ + if (!file.has_value()) { return absl::InvalidArgumentError(absl::StrCat("File parsing fails")); } // ov::genai::WhisperGenerationConfig config = pipe->whisperPipeline->get_generation_config(); @@ -312,16 +310,16 @@ class SpeechCalculator : public CalculatorBase { ov::genai::RawSpeechInput raw_speech; try { raw_speech = read_mp3(file.value()); - } catch(std::exception&){ + } catch (std::exception&) { return absl::InvalidArgumentError("Audio file reading failed"); } std::string result = "{\"text\": \""; std::unique_lock lock(pipe->whisperPipelineMutex); result += pipe->whisperPipeline->generate(raw_speech); result.append("\"}"); - SPDLOG_ERROR("{}",result); + SPDLOG_ERROR("{}", result); output = std::make_unique(result); - } else if(absl::StartsWith(payload.uri, "/v3/audio/speech")){ + } else if (absl::StartsWith(payload.uri, "/v3/audio/speech")) { if (payload.parsedJson->HasParseError()) return absl::InvalidArgumentError("Failed to parse JSON"); @@ -338,8 +336,7 @@ class SpeechCalculator : public CalculatorBase { auto streamIt = payload.parsedJson->FindMember("stream_format"); if (streamIt != payload.parsedJson->MemberEnd()) { SPDLOG_ERROR("STREAM: {}", streamIt->value.GetString()); - } - else{ + } else { SPDLOG_ERROR("NO STREAM"); } SPDLOG_ERROR("INPUT: {}", inputIt->value.GetString()); @@ -361,16 +358,16 @@ class SpeechCalculator : public CalculatorBase { size_t total_samples = waveform_size * format.channels; auto waveform_ptr = gen_speech.speeches[0].data(); OPENVINO_ASSERT(drwav_init_memory_write_sequential_pcm_frames(&wav, &ppData, &pDataSize, &format, total_samples, nullptr), - "Failed to initialize WAV writer"); + "Failed to initialize WAV writer"); SPDLOG_ERROR("2"); drwav_uint64 frames_written = drwav_write_pcm_frames(&wav, total_samples, waveform_ptr); OPENVINO_ASSERT(frames_written == total_samples, "Failed to write not all frames"); - SPDLOG_ERROR("3"); + SPDLOG_ERROR("3"); output = std::make_unique(reinterpret_cast(ppData), pDataSize); drwav_uninit(&wav); SPDLOG_ERROR("4"); //drwav_free(ppData, NULL); - }else { + } else { return absl::InvalidArgumentError(absl::StrCat("Unsupported URI: ", payload.uri)); } diff --git a/src/speech/speech_servable.hpp b/src/speech/speech_servable.hpp index 8097552d64..853b46ad40 100644 --- a/src/speech/speech_servable.hpp +++ b/src/speech/speech_servable.hpp @@ -32,9 +32,8 @@ #include "openvino/genai/speech_generation/text2speech_pipeline.hpp" #include "src/speech/speech_calculator.pb.h" - namespace ovms { - + struct SpeechServable { std::filesystem::path parsedModelsPath; std::shared_ptr whisperPipeline; @@ -49,10 +48,9 @@ struct SpeechServable { } else { parsedModelsPath = fsModelsPath.string(); } - if(mode == mediapipe::SpeechCalculatorOptions::TEXT_TO_SPEECH){ + if (mode == mediapipe::SpeechCalculatorOptions::TEXT_TO_SPEECH) { text2SpeechPipeline = std::make_shared(parsedModelsPath.string(), targetDevice); - } - else{ + } else { whisperPipeline = std::make_shared(parsedModelsPath.string(), targetDevice); } } From b8ed197de06ef32a67ad19566be0194ef818fb50 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 3 Oct 2025 17:03:34 +0200 Subject: [PATCH 11/23] style --- src/speech/http_speech_calculator.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/speech/http_speech_calculator.cc b/src/speech/http_speech_calculator.cc index af94099988..ce5574fe81 100644 --- a/src/speech/http_speech_calculator.cc +++ b/src/speech/http_speech_calculator.cc @@ -36,11 +36,11 @@ #pragma warning(pop) #define DR_WAV_IMPLEMENTATION -#include "dr_wav.h" +#include #define DR_MP3_IMPLEMENTATION #pragma warning(push) #pragma warning(disable : 6386 6262) -#include "dr_mp3.h" +#include #pragma warning(pop) #include "openvino/genai/whisper_pipeline.hpp" #include "openvino/genai/speech_generation/text2speech_pipeline.hpp" From 44841ddc010aa4ba38977a4530dcc9a9352b4a00 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 3 Oct 2025 17:07:23 +0200 Subject: [PATCH 12/23] demo and export script --- demos/audio/README.md | 11 +++ demos/audio/openai_speech.py | 26 +++++++ demos/common/export_models/export_model.py | 86 +++++++++++++++++++++- 3 files changed, 120 insertions(+), 3 deletions(-) create mode 100644 demos/audio/README.md create mode 100644 demos/audio/openai_speech.py diff --git a/demos/audio/README.md b/demos/audio/README.md new file mode 100644 index 0000000000..9ec37ef616 --- /dev/null +++ b/demos/audio/README.md @@ -0,0 +1,11 @@ +# Audio endpoints + + +## Audio synthesis + +python export_model.py speech --source_model microsoft/speecht5_tts --vocoder microsoft/speecht5_hifigan + + +docker run -p 8000:8000 -it -v $(pwd)/models/:/models openvino/model_server --model_name speecht5_tts --model_path /models/microsoft/speecht5_tts --rest_port 8000 + +curl http://mclx-23.sclab.intel.com/v3/audio/speech -H "Content-Type: application/json" -d "{\"model\": \"speecht5_tts\", \"input\": \"The quick brown fox jumped over the lazy dog.\"}" -o audio.mp3 diff --git a/demos/audio/openai_speech.py b/demos/audio/openai_speech.py new file mode 100644 index 0000000000..c3908b49b0 --- /dev/null +++ b/demos/audio/openai_speech.py @@ -0,0 +1,26 @@ +from pathlib import Path +from openai import OpenAI + +prompt = "Intel Corporation is an American multinational technology company headquartered in Santa Clara, California.[3] Intel designs, manufactures, and sells computer components such as central processing units (CPUs) and related products for business and consumer markets. It was the world's third-largest semiconductor chip manufacturer by revenue in 2024[4] and has been included in the Fortune 500 list of the largest United States corporations by revenue since 2007. It was one of the first companies listed on Nasdaq. Since 2025, it is partially owned by the United States government." +filename = "speech.wav" +url="http://localhost:80/v3" + + +speech_file_path = Path(__file__).parent / "speech.wav" +client = OpenAI(base_url=url, api_key="not_used") + +# with client.audio.speech.with_streaming_response.create( +# model="whisper", +# voice="alloy", +# input=prompt +# ) as response: +# response.stream_to_file(speech_file_path) + +audio_file = open("speech.wav", "rb") +transcript = client.audio.transcriptions.create( + model="whisper", + file=audio_file +) + + +print(transcript) \ No newline at end of file diff --git a/demos/common/export_models/export_model.py b/demos/common/export_models/export_model.py index 2b6e76a210..b296ad20d6 100644 --- a/demos/common/export_models/export_model.py +++ b/demos/common/export_models/export_model.py @@ -92,8 +92,58 @@ def add_common_arguments(parser): parser_image_generation.add_argument('--max_num_images_per_prompt', type=int, default=0, help='Max allowed number of images client is allowed to request for a given prompt', dest='max_num_images_per_prompt') parser_image_generation.add_argument('--default_num_inference_steps', type=int, default=0, help='Default number of inference steps when not specified by client', dest='default_num_inference_steps') parser_image_generation.add_argument('--max_num_inference_steps', type=int, default=0, help='Max allowed number of inference steps client is allowed to request for a given prompt', dest='max_num_inference_steps') + +parser_speech_generation = subparsers.add_parser('speech', help='export model for speech synthesis endpoint') +add_common_arguments(parser_speech_generation) +parser_speech_generation.add_argument('--num_streams', default=0, type=int, help='The number of parallel execution streams to use for the models in the pipeline.', dest='num_streams') +parser_speech_generation.add_argument('--vocoder', type=str, help='The vocoder model to use for speech synthesis. For example microsoft/speecht5_hifigan', dest='vocoder') + +parser_transcription_generation = subparsers.add_parser('transcription', help='export model for speech transcription endpoint') +add_common_arguments(parser_transcription_generation) +parser_transcription_generation.add_argument('--num_streams', default=0, type=int, help='The number of parallel execution streams to use for the models in the pipeline.', dest='num_streams') args = vars(parser.parse_args()) +speech_graph_template = """ +input_stream: "HTTP_REQUEST_PAYLOAD:input" +output_stream: "HTTP_RESPONSE_PAYLOAD:output" +node { + name: "SpeechExecutor" + input_side_packet: "SPEECH_NODE_RESOURCES:speech_servable" + calculator: "SpeechCalculator" + input_stream: "HTTP_REQUEST_PAYLOAD:input" + output_stream: "HTTP_RESPONSE_PAYLOAD:output" + node_options: { + [type.googleapis.com / mediapipe.SpeechCalculatorOptions]: { + models_path: "{{model_path}}", + mode: TEXT_TO_SPEECH, + plugin_config: '{ "NUM_STREAMS": "{{num_streams|default(1, true)}}" }', + device: "{{target_device|default("CPU", true)}}" + } + } +} +""" + +transcription_graph_template = """ +input_stream: "HTTP_REQUEST_PAYLOAD:input" +output_stream: "HTTP_RESPONSE_PAYLOAD:output" +node { + name: "SpeechExecutor" + input_side_packet: "SPEECH_NODE_RESOURCES:speech_servable" + calculator: "SpeechCalculator" + input_stream: "HTTP_REQUEST_PAYLOAD:input" + output_stream: "HTTP_RESPONSE_PAYLOAD:output" + node_options: { + [type.googleapis.com / mediapipe.SpeechCalculatorOptions]: { + models_path: "{{model_path}}", + mode: SPEECH_TO_TEXT, + plugin_config: '{ "NUM_STREAMS": "{{num_streams|default(1, true)}}" }', + device: "{{target_device|default("CPU", true)}}" + } + } +} +""" + + embedding_graph_template = """input_stream: "REQUEST_PAYLOAD:input" output_stream: "RESPONSE_PAYLOAD:output" node { @@ -526,7 +576,7 @@ def export_embeddings_model(model_repository_path, source_model, model_name, pre print("Created subconfig {}".format(os.path.join(model_repository_path, model_name, 'subconfig.json'))) add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path))) -def export_embeddings_model_ov(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path, truncate=True): +def export_embeddings_model_ov(model_repository_path, source_model, model_name, precision, task_parameters): set_max_context_length = "" destination_path = os.path.join(model_repository_path, model_name) print("Exporting embeddings model to ",destination_path) @@ -543,7 +593,32 @@ def export_embeddings_model_ov(model_repository_path, source_model, model_name, with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f: f.write(graph_content) print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt'))) - add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path))) + +def export_speech_model(model_repository_path, source_model, model_name, precision, task_parameters): + destination_path = os.path.join(model_repository_path, model_name) + print("Exporting speech model to ",destination_path) + if not os.path.isdir(destination_path) or args['overwrite_models']: + optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code --model-kwargs \"{{\\\"vocoder\\\": \\\"{}\\\"}}\" {}".format(source_model, precision, task_parameters['vocoder'], destination_path) + if os.system(optimum_command): + raise ValueError("Failed to export speech model", source_model) + gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(speech_graph_template) + graph_content = gtemplate.render(model_path="./", **task_parameters) + with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f: + f.write(graph_content) + print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt'))) + +def export_transcription_model(model_repository_path, source_model, model_name, precision, task_parameters): + destination_path = os.path.join(model_repository_path, model_name) + print("Exporting transcription model to ",destination_path) + if not os.path.isdir(destination_path) or args['overwrite_models']: + optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {}".format(source_model, precision, destination_path) + if os.system(optimum_command): + raise ValueError("Failed to export transcription model", source_model) + gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(transcription_graph_template) + graph_content = gtemplate.render(model_path="./", **task_parameters) + with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f: + f.write(graph_content) + print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt'))) def export_rerank_model_ov(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path, max_doc_length): destination_path = os.path.join(model_repository_path, model_name) @@ -674,7 +749,7 @@ def export_image_generation_model(model_repository_path, source_model, model_nam export_embeddings_model(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, str(args['version']), args['config_file_path'], args['truncate']) elif args['task'] == 'embeddings_ov': - export_embeddings_model_ov(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'], args['truncate']) + export_embeddings_model_ov(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters) elif args['task'] == 'rerank': export_rerank_model(args['model_repository_path'], args['source_model'], args['model_name'] ,args['precision'], template_parameters, str(args['version']), args['config_file_path'], args['max_doc_length']) @@ -682,6 +757,11 @@ def export_image_generation_model(model_repository_path, source_model, model_nam elif args['task'] == 'rerank_ov': export_rerank_model_ov(args['model_repository_path'], args['source_model'], args['model_name'] ,args['precision'], template_parameters, args['config_file_path'], args['max_doc_length']) +elif args['task'] == 'speech': + export_speech_model(args['model_repository_path'], args['source_model'], args['model_name'] ,args['precision'], template_parameters) + +elif args['task'] == 'transcription': + export_transcription_model(args['model_repository_path'], args['source_model'], args['model_name'] ,args['precision'], template_parameters) elif args['task'] == 'image_generation': template_parameters = {k: v for k, v in args.items() if k in [ 'ov_cache_dir', From 4ea4c505dae440f45a3b6eba4025e96b67346a7e Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 3 Oct 2025 17:19:53 +0200 Subject: [PATCH 13/23] style --- src/speech/http_speech_calculator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/speech/http_speech_calculator.cc b/src/speech/http_speech_calculator.cc index ce5574fe81..cf3bf081a1 100644 --- a/src/speech/http_speech_calculator.cc +++ b/src/speech/http_speech_calculator.cc @@ -366,7 +366,7 @@ class SpeechCalculator : public CalculatorBase { output = std::make_unique(reinterpret_cast(ppData), pDataSize); drwav_uninit(&wav); SPDLOG_ERROR("4"); - //drwav_free(ppData, NULL); + // drwav_free(ppData, NULL); } else { return absl::InvalidArgumentError(absl::StrCat("Unsupported URI: ", payload.uri)); } From 9cca35c043955a5baf374a4d0203b8e3c91010bd Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Sat, 4 Oct 2025 01:19:40 +0200 Subject: [PATCH 14/23] fix --- demos/audio/openai_speech.py | 16 ++++++++++++++++ src/speech/http_speech_calculator.cc | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/demos/audio/openai_speech.py b/demos/audio/openai_speech.py index c3908b49b0..1fed6b11d8 100644 --- a/demos/audio/openai_speech.py +++ b/demos/audio/openai_speech.py @@ -1,3 +1,19 @@ +# +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + from pathlib import Path from openai import OpenAI diff --git a/src/speech/http_speech_calculator.cc b/src/speech/http_speech_calculator.cc index cf3bf081a1..01ec5824bb 100644 --- a/src/speech/http_speech_calculator.cc +++ b/src/speech/http_speech_calculator.cc @@ -36,11 +36,11 @@ #pragma warning(pop) #define DR_WAV_IMPLEMENTATION -#include +#include "dr_wav.h" #define DR_MP3_IMPLEMENTATION #pragma warning(push) #pragma warning(disable : 6386 6262) -#include +#include "dr_mp3.h" #pragma warning(pop) #include "openvino/genai/whisper_pipeline.hpp" #include "openvino/genai/speech_generation/text2speech_pipeline.hpp" From cad63229af77ba5fe2611a0978d305fa174e675b Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Sun, 5 Oct 2025 02:00:14 +0200 Subject: [PATCH 15/23] fix --- src/speech/http_speech_calculator.cc | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/src/speech/http_speech_calculator.cc b/src/speech/http_speech_calculator.cc index 01ec5824bb..91f011e646 100644 --- a/src/speech/http_speech_calculator.cc +++ b/src/speech/http_speech_calculator.cc @@ -36,11 +36,11 @@ #pragma warning(pop) #define DR_WAV_IMPLEMENTATION -#include "dr_wav.h" +#include "dr_wav.h" // NOLINT #define DR_MP3_IMPLEMENTATION #pragma warning(push) #pragma warning(disable : 6386 6262) -#include "dr_mp3.h" +#include "dr_mp3.h" // NOLINT #pragma warning(pop) #include "openvino/genai/whisper_pipeline.hpp" #include "openvino/genai/speech_generation/text2speech_pipeline.hpp" @@ -64,12 +64,14 @@ const std::string SPEECH_SESSION_SIDE_PACKET_TAG = "SPEECH_NODE_RESOURCES"; bool is_wav_buffer(const std::string buf) { // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html + SPDLOG_DEBUG("is_wav_buffer: buf {}", buf.substr(0, 12)); if (buf.size() < 12 || buf.substr(0, 4) != "RIFF" || buf.substr(8, 4) != "WAVE") { return false; } uint32_t chunk_size = *reinterpret_cast(buf.data() + 4); - if (chunk_size + 8 != buf.size()) { + SPDLOG_DEBUG("is_wav_buffer: chunk_size {}", chunk_size); + if (chunk_size + 16 != buf.size()) { return false; } @@ -194,10 +196,11 @@ ov::genai::RawSpeechInput read_mp3(const std::string_view& mp3_data) { SPDLOG_ERROR("1"); auto result = drmp3_init_memory(&mp3, mp3_data.data(), mp3_data.size(), nullptr); - if (result == false) { - SPDLOG_ERROR("FILE PARSING FAILED {}", result); - throw std::runtime_error("FILE PARSING FAILED"); - } + SPDLOG_ERROR("drmp3_init_memory RESULT {} size:{}", result, mp3_data.size()); + // if (result == 1) { + // SPDLOG_ERROR("FILE PARSING FAILED {}", result); + // throw std::runtime_error("FILE PARSING FAILED"); + // } SPDLOG_ERROR("2"); if (mp3.channels != 1 && mp3.channels != 2) { drmp3_uninit(&mp3); @@ -309,7 +312,16 @@ class SpeechCalculator : public CalculatorBase { // config.return_timestamps = true; ov::genai::RawSpeechInput raw_speech; try { - raw_speech = read_mp3(file.value()); + SPDLOG_DEBUG(file.value().data()); + if (is_wav_buffer(std::string(file.value()))) { + SPDLOG_DEBUG("WAV FILE"); + raw_speech = read_wav(file.value()); + SPDLOG_DEBUG("WAV FILE SIZE: {}", raw_speech.size()); + } else { + SPDLOG_DEBUG("NOT WAV FILE"); + raw_speech = read_mp3(file.value()); + SPDLOG_DEBUG("MP3 FILE SIZE: {}", raw_speech.size()); + } } catch (std::exception&) { return absl::InvalidArgumentError("Audio file reading failed"); } From fa682d20d8c678ad213da5646098af32620b8346 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Mon, 6 Oct 2025 09:00:30 +0200 Subject: [PATCH 16/23] demo init --- demos/audio/README.md | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/demos/audio/README.md b/demos/audio/README.md index 9ec37ef616..41007bb6cc 100644 --- a/demos/audio/README.md +++ b/demos/audio/README.md @@ -3,9 +3,26 @@ ## Audio synthesis -python export_model.py speech --source_model microsoft/speecht5_tts --vocoder microsoft/speecht5_hifigan +python export_model.py speech --source_model microsoft/speecht5_tts --vocoder microsoft/speecht5_hifigan --weight-format fp16 + +docker run -p 8000:8000 -d -v $(pwd)/models/:/models openvino/model_server --model_name speecht5_tts --model_path /models/microsoft/speecht5_tts --rest_port 8000 + +curl http://localhost/v3/audio/speech -H "Content-Type: application/json" -d "{\"model\": \"speecht5_tts\", \"input\": \"The quick brown fox jumped over the lazy dog.\"}" -o audio.wav + + + + + +## Audio transcription + +python export_model.py transcription --source_model openai/whisper-large-v2 --weight-format fp16 + + +docker run -p 8000:8000 -it -v $(pwd)/models/:/models openvino/model_server --model_name whisper --model_path /models/openai/whisper-large-v2 --rest_port 8000 + + +curl http://localhost/v3/audio/transcriptions -H "Content-Type: multipart/form-data" -F file="@audio.wav" -F model="whisper" + -docker run -p 8000:8000 -it -v $(pwd)/models/:/models openvino/model_server --model_name speecht5_tts --model_path /models/microsoft/speecht5_tts --rest_port 8000 -curl http://mclx-23.sclab.intel.com/v3/audio/speech -H "Content-Type: application/json" -d "{\"model\": \"speecht5_tts\", \"input\": \"The quick brown fox jumped over the lazy dog.\"}" -o audio.mp3 From 867a5d3b6779609b979f0ceff5b11af04e0c294a Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Mon, 6 Oct 2025 12:11:29 +0200 Subject: [PATCH 17/23] fix unit tests --- src/test/mediapipeflow_test.cpp | 2 +- src/test/pythonnode_test.cpp | 2 +- src/test/streaming_test.cpp | 44 ++++++++++++++++----------------- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/test/mediapipeflow_test.cpp b/src/test/mediapipeflow_test.cpp index 196e13b2d4..72bedbf1dc 100644 --- a/src/test/mediapipeflow_test.cpp +++ b/src/test/mediapipeflow_test.cpp @@ -2683,7 +2683,7 @@ class MediapipeSerialization : public ::testing::Test { std::vector inputNames, std::vector outputNames, const PythonNodeResourcesMap& pythonNodeResourcesMap, MediapipeServableMetricReporter* mediapipeServableMetricReporter) : - MediapipeGraphExecutor(name, version, config, inputTypes, outputTypes, inputNames, outputNames, pythonNodeResourcesMap, {}, {}, {}, nullptr, mediapipeServableMetricReporter) {} + MediapipeGraphExecutor(name, version, config, inputTypes, outputTypes, inputNames, outputNames, pythonNodeResourcesMap, {}, {}, {}, {}, nullptr, mediapipeServableMetricReporter) {} }; protected: diff --git a/src/test/pythonnode_test.cpp b/src/test/pythonnode_test.cpp index 5b1c91cbca..6a2b473283 100644 --- a/src/test/pythonnode_test.cpp +++ b/src/test/pythonnode_test.cpp @@ -1005,7 +1005,7 @@ class MockedMediapipeGraphExecutorPy : public ovms::MediapipeGraphExecutor { const PythonNodeResourcesMap& pythonNodeResourcesMap, PythonBackend* pythonBackend, MediapipeServableMetricReporter* mediapipeServableMetricReporter) : - MediapipeGraphExecutor(name, version, config, inputTypes, outputTypes, inputNames, outputNames, pythonNodeResourcesMap, {}, {}, {}, pythonBackend, mediapipeServableMetricReporter) {} + MediapipeGraphExecutor(name, version, config, inputTypes, outputTypes, inputNames, outputNames, pythonNodeResourcesMap, {}, {}, {}, {}, pythonBackend, mediapipeServableMetricReporter) {} }; TEST_F(PythonFlowTest, SerializePyObjectWrapperToKServeResponse) { diff --git a/src/test/streaming_test.cpp b/src/test/streaming_test.cpp index d7c40b2ce6..cca060576f 100644 --- a/src/test/streaming_test.cpp +++ b/src/test/streaming_test.cpp @@ -359,7 +359,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::KFS_REQUEST}}, {{"out", mediapipe_packet_type_enum::KFS_RESPONSE}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // Mock receiving 3 requests and disconnection prepareRequest(this->firstRequest, {{"in", 3.5f}}); @@ -416,7 +416,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // Mock receiving 3 requests and disconnection prepareRequest(this->firstRequest, {{"in", 3.5f}}); // no timestamp specified, server will assign one @@ -559,7 +559,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // Mock receiving 3 requests with manually (client) assigned ascending order of timestamp and disconnection prepareRequest(this->firstRequest, {{"in", 3.5f}}, 3); // first request with timestamp 3 @@ -604,7 +604,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // Mock only 1 request and disconnect immediately prepareRequest(this->firstRequest, {{"in", 3.5f}}); @@ -1243,7 +1243,7 @@ node { {"out3", mediapipe_packet_type_enum::OVTENSOR}}, {"in1", "in2", "in3"}, {"out1", "out2", "out3"}, - {}, {}, {}, {}, nullptr, this->reporter.get()}; + {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; std::promise signalPromise; std::future signalFuture = signalPromise.get_future(); @@ -1295,7 +1295,7 @@ node { {"out3", mediapipe_packet_type_enum::OVTENSOR}}, {"in1", "in2", "in3"}, {"out1", "out2", "out3"}, - {}, {}, {}, {}, nullptr, this->reporter.get()}; + {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; std::promise signalPromise; std::future signalFuture = signalPromise.get_future(); @@ -1330,7 +1330,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; std::promise signalPromise; std::future signalFuture = signalPromise.get_future(); @@ -1364,7 +1364,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"wrong_name"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // cannot install observer due to wrong output name (should never happen due to validation) + {"in"}, {"wrong_name"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // cannot install observer due to wrong output name (should never happen due to validation) EXPECT_CALL(this->stream, Read(_)).Times(0); EXPECT_CALL(this->stream, Write(_, _)).Times(0); @@ -1389,7 +1389,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; prepareRequest(this->firstRequest, {}); EXPECT_CALL(this->stream, Read(_)) @@ -1417,7 +1417,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; std::promise signalPromise; std::future signalFuture = signalPromise.get_future(); @@ -1453,7 +1453,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; prepareRequest(this->firstRequest, {{"in", 3.5f}}); ASSERT_EQ(executor.inferStream(this->firstRequest, this->stream, this->executionContext), StatusCode::MEDIAPIPE_GRAPH_INITIALIZATION_ERROR); @@ -1476,7 +1476,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // Invalid request - missing data in buffer prepareInvalidRequest(this->firstRequest, {"in"}); // no timestamp specified, server will assign one @@ -1511,7 +1511,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; std::promise signalPromise[3]; std::future signalFuture[3] = { @@ -1558,7 +1558,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; prepareRequest(this->firstRequest, {{"in", 3.5f}}, 0); EXPECT_CALL(this->stream, Read(_)) @@ -1586,7 +1586,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; prepareRequest(this->firstRequest, {{"in", 3.5f}}); setRequestTimestamp(this->firstRequest, std::string("not an int")); @@ -1621,7 +1621,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // Timestamps not allowed in stream // Expect continuity of operation and response with error message @@ -1663,7 +1663,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // Allowed in stream for (auto timestamp : std::vector<::mediapipe::Timestamp>{ @@ -1699,7 +1699,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // Mock receiving 3 requests and disconnection prepareRequestWithParam(this->firstRequest, {{"in", 3.5f}}, {"val", 65}); // request with parameter val @@ -1736,7 +1736,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // Mock receiving the invalid request and disconnection // Request with invalid param py (special pythons session side packet) @@ -1765,7 +1765,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; prepareRequest(this->firstRequest, {{"in", 3.5f}}); // missing required request param EXPECT_CALL(this->stream, Read(_)).Times(0); @@ -1791,7 +1791,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // Mock receiving 2 requests and disconnection prepareRequest(this->firstRequest, {{"in", 3.5f}}, std::nullopt, this->name, this->version); // no timestamp specified, server will assign one @@ -1825,7 +1825,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; std::promise signalPromise; std::future signalFuture = signalPromise.get_future(); From b80eef193730d8489e7fa400d78a66fc6fae1629 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Mon, 6 Oct 2025 13:25:57 +0200 Subject: [PATCH 18/23] fix --- src/speech/http_speech_calculator.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/speech/http_speech_calculator.cc b/src/speech/http_speech_calculator.cc index 91f011e646..fd7dd2eb96 100644 --- a/src/speech/http_speech_calculator.cc +++ b/src/speech/http_speech_calculator.cc @@ -36,6 +36,8 @@ #pragma warning(pop) #define DR_WAV_IMPLEMENTATION +#pragma warning(push) +#pragma warning(disable : 2220 4245) #include "dr_wav.h" // NOLINT #define DR_MP3_IMPLEMENTATION #pragma warning(push) @@ -71,7 +73,7 @@ bool is_wav_buffer(const std::string buf) { uint32_t chunk_size = *reinterpret_cast(buf.data() + 4); SPDLOG_DEBUG("is_wav_buffer: chunk_size {}", chunk_size); - if (chunk_size + 16 != buf.size()) { + if (chunk_size + 8 != buf.size()) { return false; } From 477f007bdc6af35c4b9ba2048a4d6ecc7a967dca Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Mon, 6 Oct 2025 14:10:27 +0200 Subject: [PATCH 19/23] windows fix --- src/speech/http_speech_calculator.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/speech/http_speech_calculator.cc b/src/speech/http_speech_calculator.cc index fd7dd2eb96..082638de58 100644 --- a/src/speech/http_speech_calculator.cc +++ b/src/speech/http_speech_calculator.cc @@ -37,7 +37,7 @@ #define DR_WAV_IMPLEMENTATION #pragma warning(push) -#pragma warning(disable : 2220 4245) +#pragma warning(disable : 4245 4220) #include "dr_wav.h" // NOLINT #define DR_MP3_IMPLEMENTATION #pragma warning(push) @@ -193,7 +193,7 @@ float* resample_audio(const float* input, return output; } -ov::genai::RawSpeechInput read_mp3(const std::string_view& mp3_data) { +/* ov::genai::RawSpeechInput read_mp3(const std::string_view& mp3_data) { drmp3 mp3; SPDLOG_ERROR("1"); @@ -237,7 +237,7 @@ ov::genai::RawSpeechInput read_mp3(const std::string_view& mp3_data) { auto buffer = resample_audio(reinterpret_cast(pcmf32.data()), pcmf32.size(), mp3.sampleRate, 16000, &output_length); std::vector output(buffer, buffer + output_length); return output; -} +} */ std::variant> getFileFromPayload(const ovms::MultiPartParser& parser, const std::string& keyName) { std::string_view value = parser.getFileContentByFieldName(keyName); @@ -320,9 +320,9 @@ class SpeechCalculator : public CalculatorBase { raw_speech = read_wav(file.value()); SPDLOG_DEBUG("WAV FILE SIZE: {}", raw_speech.size()); } else { - SPDLOG_DEBUG("NOT WAV FILE"); - raw_speech = read_mp3(file.value()); - SPDLOG_DEBUG("MP3 FILE SIZE: {}", raw_speech.size()); + SPDLOG_DEBUG("NOT WAV FILE. Only WAVE format is supported currently"); + // raw_speech = read_mp3(file.value()); + SPDLOG_DEBUG("FILE SIZE: {}", raw_speech.size()); } } catch (std::exception&) { return absl::InvalidArgumentError("Audio file reading failed"); From 2af243fd32405b221fdde1f0d1f7d3bf35cce213 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Mon, 6 Oct 2025 20:30:23 +0200 Subject: [PATCH 20/23] gpu fix --- src/speech/http_speech_calculator.cc | 9 +++++++-- src/test/mediapipeflow_test.cpp | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/speech/http_speech_calculator.cc b/src/speech/http_speech_calculator.cc index 082638de58..360db1b98c 100644 --- a/src/speech/http_speech_calculator.cc +++ b/src/speech/http_speech_calculator.cc @@ -356,7 +356,7 @@ class SpeechCalculator : public CalculatorBase { SPDLOG_ERROR("INPUT: {}", inputIt->value.GetString()); std::unique_lock lock(pipe->text2SpeechPipelineMutex); auto gen_speech = pipe->text2SpeechPipeline->generate(inputIt->value.GetString()); - lock.unlock(); + drwav_data_format format; format.container = drwav_container_riff; format.format = DR_WAVE_FORMAT_IEEE_FLOAT; @@ -370,7 +370,12 @@ class SpeechCalculator : public CalculatorBase { auto waveform_size = gen_speech.speeches[0].get_size(); size_t total_samples = waveform_size * format.channels; - auto waveform_ptr = gen_speech.speeches[0].data(); + ov::Tensor cpu_tensor(gen_speech.speeches[0].get_element_type(), gen_speech.speeches[0].get_shape()); + // copy results to release inference request + gen_speech.speeches[0].copy_to(cpu_tensor); + lock.unlock(); + + auto waveform_ptr = cpu_tensor.data(); OPENVINO_ASSERT(drwav_init_memory_write_sequential_pcm_frames(&wav, &ppData, &pDataSize, &format, total_samples, nullptr), "Failed to initialize WAV writer"); SPDLOG_ERROR("2"); diff --git a/src/test/mediapipeflow_test.cpp b/src/test/mediapipeflow_test.cpp index 72bedbf1dc..bf987fd811 100644 --- a/src/test/mediapipeflow_test.cpp +++ b/src/test/mediapipeflow_test.cpp @@ -3932,6 +3932,7 @@ TEST(WhitelistRegistered, MediapipeCalculatorsList) { "SerializationCalculator", "SetLandmarkVisibilityCalculator", "SidePacketToStreamCalculator", + "SpeechCalculator", "SplitAffineMatrixVectorCalculator", "SplitClassificationListVectorCalculator", "SplitDetectionVectorCalculator", From f339c43a61b7f02b3f0f5d1e2bc27ad008878376 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Mon, 6 Oct 2025 20:40:06 +0200 Subject: [PATCH 21/23] stype --- src/speech/http_speech_calculator.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/speech/http_speech_calculator.cc b/src/speech/http_speech_calculator.cc index 360db1b98c..fa66ebaed5 100644 --- a/src/speech/http_speech_calculator.cc +++ b/src/speech/http_speech_calculator.cc @@ -367,7 +367,6 @@ class SpeechCalculator : public CalculatorBase { drwav wav; void* ppData; size_t pDataSize; - auto waveform_size = gen_speech.speeches[0].get_size(); size_t total_samples = waveform_size * format.channels; ov::Tensor cpu_tensor(gen_speech.speeches[0].get_element_type(), gen_speech.speeches[0].get_shape()); From 737a98bc04345bd74ae4e33648f12badf5039a50 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Mon, 6 Oct 2025 20:56:31 +0200 Subject: [PATCH 22/23] stype --- src/speech/http_speech_calculator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/speech/http_speech_calculator.cc b/src/speech/http_speech_calculator.cc index fa66ebaed5..79c1b3a960 100644 --- a/src/speech/http_speech_calculator.cc +++ b/src/speech/http_speech_calculator.cc @@ -356,7 +356,7 @@ class SpeechCalculator : public CalculatorBase { SPDLOG_ERROR("INPUT: {}", inputIt->value.GetString()); std::unique_lock lock(pipe->text2SpeechPipelineMutex); auto gen_speech = pipe->text2SpeechPipeline->generate(inputIt->value.GetString()); - + drwav_data_format format; format.container = drwav_container_riff; format.format = DR_WAVE_FORMAT_IEEE_FLOAT; From d6253750617ee66660cfa38dd6996b749023f4ef Mon Sep 17 00:00:00 2001 From: "Trawinski, Dariusz" Date: Mon, 6 Oct 2025 23:26:24 +0200 Subject: [PATCH 23/23] Update README.md --- demos/audio/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/audio/README.md b/demos/audio/README.md index 41007bb6cc..73b9b9231a 100644 --- a/demos/audio/README.md +++ b/demos/audio/README.md @@ -15,10 +15,10 @@ curl http://localhost/v3/audio/speech -H "Content-Type: application/json" -d "{\ ## Audio transcription -python export_model.py transcription --source_model openai/whisper-large-v2 --weight-format fp16 +python export_model.py transcription --source_model openai/whisper-large-v2 --weight-format fp16 --target_device GPU -docker run -p 8000:8000 -it -v $(pwd)/models/:/models openvino/model_server --model_name whisper --model_path /models/openai/whisper-large-v2 --rest_port 8000 +docker run -p 8000:8000 -it --device /dev/dri -u 0 -v $(pwd)/models/:/models openvino/model_server --model_name whisper --model_path /models/openai/whisper-large-v2 --rest_port 8000 curl http://localhost/v3/audio/transcriptions -H "Content-Type: multipart/form-data" -F file="@audio.wav" -F model="whisper"