diff --git a/WORKSPACE b/WORKSPACE index bfff7ea463..facf9155e7 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -640,3 +640,19 @@ cc_library( ) """, ) + +new_git_repository( + name = "dr_libs", + remote = "https://github.com/mackron/dr_libs", + commit = "24d738be2349fd4b6fe50eeaa81f5bd586267fd0", + build_file_content = """ +cc_library( + name = "dr", + hdrs = ["dr_flac.h", "dr_mp3.h", "dr_wav.h"], + visibility = ["//visibility:public"], + local_defines = [ + ], +) +""", +) + diff --git a/demos/audio/README.md b/demos/audio/README.md new file mode 100644 index 0000000000..73b9b9231a --- /dev/null +++ b/demos/audio/README.md @@ -0,0 +1,28 @@ +# Audio endpoints + + +## Audio synthesis + +python export_model.py speech --source_model microsoft/speecht5_tts --vocoder microsoft/speecht5_hifigan --weight-format fp16 + +docker run -p 8000:8000 -d -v $(pwd)/models/:/models openvino/model_server --model_name speecht5_tts --model_path /models/microsoft/speecht5_tts --rest_port 8000 + +curl http://localhost:8000/v3/audio/speech -H "Content-Type: application/json" -d "{\"model\": \"speecht5_tts\", \"input\": \"The quick brown fox jumped over the lazy dog.\"}" -o audio.wav + + + + + +## Audio transcription + +python export_model.py transcription --source_model openai/whisper-large-v2 --weight-format fp16 --target_device GPU + + +docker run -p 8000:8000 -it --device /dev/dri -u 0 -v $(pwd)/models/:/models openvino/model_server --model_name whisper --model_path /models/openai/whisper-large-v2 --rest_port 8000 + + +curl http://localhost:8000/v3/audio/transcriptions -H "Content-Type: multipart/form-data" -F file="@audio.wav" -F model="whisper" + + + + diff --git a/demos/audio/openai_speech.py b/demos/audio/openai_speech.py new file mode 100644 index 0000000000..1fed6b11d8 --- /dev/null +++ b/demos/audio/openai_speech.py @@ -0,0 +1,42 @@ +# +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from pathlib import Path +from openai import OpenAI + +prompt = "Intel Corporation is an American multinational technology company headquartered in Santa Clara, California.[3] Intel designs, manufactures, and sells computer components such as central processing units (CPUs) and related products for business and consumer markets. It was the world's third-largest semiconductor chip manufacturer by revenue in 2024[4] and has been included in the Fortune 500 list of the largest United States corporations by revenue since 2007. It was one of the first companies listed on Nasdaq. Since 2025, it is partially owned by the United States government."
+filename = "speech.wav" +url="http://localhost:80/v3" + + +speech_file_path = Path(__file__).parent / "speech.wav" +client = OpenAI(base_url=url, api_key="not_used") + +# with client.audio.speech.with_streaming_response.create( +# model="whisper", +# voice="alloy", +# input=prompt +# ) as response: +# response.stream_to_file(speech_file_path) + +audio_file = open("speech.wav", "rb") +transcript = client.audio.transcriptions.create( + model="whisper", + file=audio_file +) + + +print(transcript) \ No newline at end of file diff --git a/demos/common/export_models/README.md b/demos/common/export_models/README.md index 8810a52cb5..b5d321b102 100644 --- a/demos/common/export_models/README.md +++ b/demos/common/export_models/README.md @@ -97,7 +97,7 @@ options: #### Text Generation CPU Deployment ```console -python export_model.py text_generation --source_model meta-llama/Meta-Llama-3-8B-Instruct --weight-format fp16 --kv_cache_precision u8 --config_file_path models/config_all.json --model_repository_path models +python demos\common\export_models\export_model.py text_generation --source_model meta-llama/Llama-3.2-1B-Instruct --weight-format int4 --kv_cache_precision u8 --config_file_path config.json --model_repository_path audio ``` #### GPU Deployment (Low Concurrency, Limited Memory) diff --git a/demos/common/export_models/export_model.py b/demos/common/export_models/export_model.py index 2b6e76a210..b296ad20d6 100644 --- a/demos/common/export_models/export_model.py +++ b/demos/common/export_models/export_model.py @@ -92,8 +92,58 @@ def add_common_arguments(parser): parser_image_generation.add_argument('--max_num_images_per_prompt', type=int, default=0, help='Max allowed number of images client is allowed to request for a given prompt', dest='max_num_images_per_prompt') parser_image_generation.add_argument('--default_num_inference_steps', type=int, default=0, help='Default number of inference steps when not specified by client', dest='default_num_inference_steps') parser_image_generation.add_argument('--max_num_inference_steps', type=int, default=0, help='Max allowed number of inference steps client is allowed to request for a given prompt', dest='max_num_inference_steps') + +parser_speech_generation = subparsers.add_parser('speech', help='export model for speech synthesis endpoint') +add_common_arguments(parser_speech_generation) +parser_speech_generation.add_argument('--num_streams', default=0, type=int, help='The number of parallel execution streams to use for the models in the pipeline.', dest='num_streams') +parser_speech_generation.add_argument('--vocoder', type=str, help='The vocoder model to use for speech synthesis. 
For example microsoft/speecht5_hifigan', dest='vocoder') + +parser_transcription_generation = subparsers.add_parser('transcription', help='export model for speech transcription endpoint') +add_common_arguments(parser_transcription_generation) +parser_transcription_generation.add_argument('--num_streams', default=0, type=int, help='The number of parallel execution streams to use for the models in the pipeline.', dest='num_streams') args = vars(parser.parse_args()) +speech_graph_template = """ +input_stream: "HTTP_REQUEST_PAYLOAD:input" +output_stream: "HTTP_RESPONSE_PAYLOAD:output" +node { + name: "SpeechExecutor" + input_side_packet: "SPEECH_NODE_RESOURCES:speech_servable" + calculator: "SpeechCalculator" + input_stream: "HTTP_REQUEST_PAYLOAD:input" + output_stream: "HTTP_RESPONSE_PAYLOAD:output" + node_options: { + [type.googleapis.com / mediapipe.SpeechCalculatorOptions]: { + models_path: "{{model_path}}", + mode: TEXT_TO_SPEECH, + plugin_config: '{ "NUM_STREAMS": "{{num_streams|default(1, true)}}" }', + device: "{{target_device|default("CPU", true)}}" + } + } +} +""" + +transcription_graph_template = """ +input_stream: "HTTP_REQUEST_PAYLOAD:input" +output_stream: "HTTP_RESPONSE_PAYLOAD:output" +node { + name: "SpeechExecutor" + input_side_packet: "SPEECH_NODE_RESOURCES:speech_servable" + calculator: "SpeechCalculator" + input_stream: "HTTP_REQUEST_PAYLOAD:input" + output_stream: "HTTP_RESPONSE_PAYLOAD:output" + node_options: { + [type.googleapis.com / mediapipe.SpeechCalculatorOptions]: { + models_path: "{{model_path}}", + mode: SPEECH_TO_TEXT, + plugin_config: '{ "NUM_STREAMS": "{{num_streams|default(1, true)}}" }', + device: "{{target_device|default("CPU", true)}}" + } + } +} +""" + + embedding_graph_template = """input_stream: "REQUEST_PAYLOAD:input" output_stream: "RESPONSE_PAYLOAD:output" node { @@ -526,7 +576,7 @@ def export_embeddings_model(model_repository_path, source_model, model_name, pre print("Created subconfig {}".format(os.path.join(model_repository_path, model_name, 'subconfig.json'))) add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path))) -def export_embeddings_model_ov(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path, truncate=True): +def export_embeddings_model_ov(model_repository_path, source_model, model_name, precision, task_parameters): set_max_context_length = "" destination_path = os.path.join(model_repository_path, model_name) print("Exporting embeddings model to ",destination_path) @@ -543,7 +593,32 @@ def export_embeddings_model_ov(model_repository_path, source_model, model_name, with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f: f.write(graph_content) print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt'))) - add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path))) + +def export_speech_model(model_repository_path, source_model, model_name, precision, task_parameters): + destination_path = os.path.join(model_repository_path, model_name) + print("Exporting speech model to ",destination_path) + if not os.path.isdir(destination_path) or args['overwrite_models']: + optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code --model-kwargs \"{{\\\"vocoder\\\": \\\"{}\\\"}}\" {}".format(source_model, precision, 
task_parameters['vocoder'], destination_path) + if os.system(optimum_command): + raise ValueError("Failed to export speech model", source_model) + gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(speech_graph_template) + graph_content = gtemplate.render(model_path="./", **task_parameters) + with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f: + f.write(graph_content) + print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt'))) + +def export_transcription_model(model_repository_path, source_model, model_name, precision, task_parameters): + destination_path = os.path.join(model_repository_path, model_name) + print("Exporting transcription model to ",destination_path) + if not os.path.isdir(destination_path) or args['overwrite_models']: + optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {}".format(source_model, precision, destination_path) + if os.system(optimum_command): + raise ValueError("Failed to export transcription model", source_model) + gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(transcription_graph_template) + graph_content = gtemplate.render(model_path="./", **task_parameters) + with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f: + f.write(graph_content) + print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt'))) def export_rerank_model_ov(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path, max_doc_length): destination_path = os.path.join(model_repository_path, model_name) @@ -674,7 +749,7 @@ def export_image_generation_model(model_repository_path, source_model, model_nam export_embeddings_model(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, str(args['version']), args['config_file_path'], args['truncate']) elif args['task'] == 'embeddings_ov': - export_embeddings_model_ov(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'], args['truncate']) + export_embeddings_model_ov(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters) elif args['task'] == 'rerank': export_rerank_model(args['model_repository_path'], args['source_model'], args['model_name'] ,args['precision'], template_parameters, str(args['version']), args['config_file_path'], args['max_doc_length']) @@ -682,6 +757,11 @@ def export_image_generation_model(model_repository_path, source_model, model_nam elif args['task'] == 'rerank_ov': export_rerank_model_ov(args['model_repository_path'], args['source_model'], args['model_name'] ,args['precision'], template_parameters, args['config_file_path'], args['max_doc_length']) +elif args['task'] == 'speech': + export_speech_model(args['model_repository_path'], args['source_model'], args['model_name'] ,args['precision'], template_parameters) + +elif args['task'] == 'transcription': + export_transcription_model(args['model_repository_path'], args['source_model'], args['model_name'] ,args['precision'], template_parameters) elif args['task'] == 'image_generation': template_parameters = {k: v for k, v in args.items() if k in [ 'ov_cache_dir', diff --git a/src/BUILD b/src/BUILD index 2eaebc1137..dad0158011 100644 --- a/src/BUILD +++ b/src/BUILD @@ -562,6 +562,7 @@ ovms_cc_library( "//conditions:default": [], 
"//:not_disable_mediapipe" : [ "//src/image_gen:image_gen_calculator", + "//src/speech:speech_calculator", "//src/image_gen:imagegen_init", "//src/llm:openai_completions_api_handler", "//src/embeddings:embeddingscalculator", diff --git a/src/http_rest_api_handler.cpp b/src/http_rest_api_handler.cpp index b22284c3fb..7033f86100 100644 --- a/src/http_rest_api_handler.cpp +++ b/src/http_rest_api_handler.cpp @@ -506,6 +506,8 @@ static Status createV3HttpPayload( } else { SPDLOG_DEBUG("Model name from deduced from MultiPart field: {}", modelName); } + auto stream = multiPartParser->getFieldByName("stream"); + SPDLOG_ERROR("{}", stream); ensureJsonParserInErrorState(parsedJson); } else if (isApplicationJson) { { diff --git a/src/mediapipe_internal/mediapipegraphdefinition.cpp b/src/mediapipe_internal/mediapipegraphdefinition.cpp index ef4dbeda3b..241c7de283 100644 --- a/src/mediapipe_internal/mediapipegraphdefinition.cpp +++ b/src/mediapipe_internal/mediapipegraphdefinition.cpp @@ -61,6 +61,7 @@ const std::string MediapipeGraphDefinition::SCHEDULER_CLASS_NAME{"Mediapipe"}; const std::string MediapipeGraphDefinition::PYTHON_NODE_CALCULATOR_NAME{"PythonExecutorCalculator"}; const std::string MediapipeGraphDefinition::LLM_NODE_CALCULATOR_NAME{"LLMCalculator"}; const std::string MediapipeGraphDefinition::IMAGE_GEN_CALCULATOR_NAME{"ImageGenCalculator"}; +const std::string MediapipeGraphDefinition::SPEECH_NODE_CALCULATOR_NAME{"SpeechCalculator"}; const std::string MediapipeGraphDefinition::EMBEDDINGS_NODE_CALCULATOR_NAME{"EmbeddingsCalculatorOV"}; const std::string MediapipeGraphDefinition::RERANK_NODE_CALCULATOR_NAME{"RerankCalculatorOV"}; @@ -554,6 +555,28 @@ Status MediapipeGraphDefinition::initializeNodes() { rerankServableMap.insert(std::pair>(nodeName, std::move(servable))); rerankServablesCleaningGuard.disableCleaning(); } + if (endsWith(config.node(i).calculator(), SPEECH_NODE_CALCULATOR_NAME)) { + auto& speechServableMap = this->sidePacketMaps.speechServableMap; + ResourcesCleaningGuard speechServablesCleaningGuard(speechServableMap); + if (!config.node(i).node_options().size()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Speech node missing options in graph: {}. ", this->name); + return StatusCode::LLM_NODE_MISSING_OPTIONS; + } + if (config.node(i).name().empty()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Speech node name is missing in graph: {}. ", this->name); + return StatusCode::LLM_NODE_MISSING_NAME; + } + std::string nodeName = config.node(i).name(); + if (speechServableMap.find(nodeName) != speechServableMap.end()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Speech node name: {} already used in graph: {}. 
", nodeName, this->name); + return StatusCode::LLM_NODE_NAME_ALREADY_EXISTS; + } + mediapipe::SpeechCalculatorOptions nodeOptions; + config.node(i).node_options(0).UnpackTo(&nodeOptions); + std::shared_ptr servable = std::make_shared(nodeOptions.models_path(), nodeOptions.device(), mgconfig.getBasePath(), nodeOptions.mode()); + speechServableMap.insert(std::pair>(nodeName, std::move(servable))); + speechServablesCleaningGuard.disableCleaning(); + } } return StatusCode::OK; } diff --git a/src/mediapipe_internal/mediapipegraphdefinition.hpp b/src/mediapipe_internal/mediapipegraphdefinition.hpp index 1a6e98bfcf..e5d8a700de 100644 --- a/src/mediapipe_internal/mediapipegraphdefinition.hpp +++ b/src/mediapipe_internal/mediapipegraphdefinition.hpp @@ -46,6 +46,7 @@ #include "../sidepacket_servable.hpp" #include "../embeddings/embeddings_servable.hpp" #include "../rerank/rerank_servable.hpp" +#include "../speech/speech_servable.hpp" namespace ovms { class MediapipeGraphDefinitionUnloadGuard; @@ -62,6 +63,7 @@ struct ImageGenerationPipelines; using PythonNodeResourcesMap = std::unordered_map>; using GenAiServableMap = std::unordered_map>; using RerankServableMap = std::unordered_map>; +using SpeechServableMap = std::unordered_map>; using EmbeddingsServableMap = std::unordered_map>; using ImageGenerationPipelinesMap = std::unordered_map>; @@ -71,19 +73,22 @@ struct GraphSidePackets { ImageGenerationPipelinesMap imageGenPipelinesMap; EmbeddingsServableMap embeddingsServableMap; RerankServableMap rerankServableMap; + SpeechServableMap speechServableMap; void clear() { pythonNodeResourcesMap.clear(); genAiServableMap.clear(); imageGenPipelinesMap.clear(); embeddingsServableMap.clear(); rerankServableMap.clear(); + speechServableMap.clear(); } bool empty() { return (pythonNodeResourcesMap.empty() && genAiServableMap.empty() && imageGenPipelinesMap.empty() && embeddingsServableMap.empty() && - rerankServableMap.empty()); + rerankServableMap.empty() && + speechServableMap.empty()); } }; @@ -124,6 +129,7 @@ class MediapipeGraphDefinition { static const std::string IMAGE_GEN_CALCULATOR_NAME; static const std::string EMBEDDINGS_NODE_CALCULATOR_NAME; static const std::string RERANK_NODE_CALCULATOR_NAME; + static const std::string SPEECH_NODE_CALCULATOR_NAME; Status waitForLoaded(std::unique_ptr& unloadGuard, const uint32_t waitForLoadedTimeoutMicroseconds = WAIT_FOR_LOADED_DEFAULT_TIMEOUT_MICROSECONDS); // Pipelines are not versioned and any available definition has constant version equal 1. 
diff --git a/src/mediapipe_internal/mediapipegraphexecutor.cpp b/src/mediapipe_internal/mediapipegraphexecutor.cpp index aa95bf88ec..5545b13966 100644 --- a/src/mediapipe_internal/mediapipegraphexecutor.cpp +++ b/src/mediapipe_internal/mediapipegraphexecutor.cpp @@ -47,6 +47,7 @@ MediapipeGraphExecutor::MediapipeGraphExecutor( const GenAiServableMap& llmNodeResourcesMap, const EmbeddingsServableMap& embeddingsServableMap, const RerankServableMap& rerankServableMap, + const SpeechServableMap& speechServableMap, PythonBackend* pythonBackend, MediapipeServableMetricReporter* mediapipeServableMetricReporter) : name(name), @@ -56,7 +57,7 @@ MediapipeGraphExecutor::MediapipeGraphExecutor( outputTypes(std::move(outputTypes)), inputNames(std::move(inputNames)), outputNames(std::move(outputNames)), - sidePacketMaps({pythonNodeResourcesMap, llmNodeResourcesMap, {}, embeddingsServableMap, rerankServableMap}), + sidePacketMaps({pythonNodeResourcesMap, llmNodeResourcesMap, {}, embeddingsServableMap, rerankServableMap, speechServableMap}), pythonBackend(pythonBackend), currentStreamTimestamp(STARTING_TIMESTAMP), mediapipeServableMetricReporter(mediapipeServableMetricReporter) {} @@ -88,6 +89,7 @@ const std::string MediapipeGraphExecutor::LLM_SESSION_SIDE_PACKET_TAG = "llm"; const std::string MediapipeGraphExecutor::IMAGE_GEN_SESSION_SIDE_PACKET_TAG = "pipes"; const std::string MediapipeGraphExecutor::EMBEDDINGS_SESSION_SIDE_PACKET_TAG = "embeddings_servable"; const std::string MediapipeGraphExecutor::RERANK_SESSION_SIDE_PACKET_TAG = "rerank_servable"; +const std::string MediapipeGraphExecutor::SPEECH_SESSION_SIDE_PACKET_TAG = "speech_servable"; const ::mediapipe::Timestamp MediapipeGraphExecutor::STARTING_TIMESTAMP = ::mediapipe::Timestamp(0); } // namespace ovms diff --git a/src/mediapipe_internal/mediapipegraphexecutor.hpp b/src/mediapipe_internal/mediapipegraphexecutor.hpp index b2468f5540..f3c6907aaa 100644 --- a/src/mediapipe_internal/mediapipegraphexecutor.hpp +++ b/src/mediapipe_internal/mediapipegraphexecutor.hpp @@ -93,6 +93,7 @@ class MediapipeGraphExecutor { static const std::string IMAGE_GEN_SESSION_SIDE_PACKET_TAG; static const std::string EMBEDDINGS_SESSION_SIDE_PACKET_TAG; static const std::string RERANK_SESSION_SIDE_PACKET_TAG; + static const std::string SPEECH_SESSION_SIDE_PACKET_TAG; static const ::mediapipe::Timestamp STARTING_TIMESTAMP; MediapipeGraphExecutor(const std::string& name, const std::string& version, const ::mediapipe::CalculatorGraphConfig& config, @@ -103,6 +104,7 @@ class MediapipeGraphExecutor { const GenAiServableMap& llmNodeResourcesMap, const EmbeddingsServableMap& embeddingsServableMap, const RerankServableMap& rerankServableMap, + const SpeechServableMap& speechServableMap, PythonBackend* pythonBackend, MediapipeServableMetricReporter* mediapipeServableMetricReporter); MediapipeGraphExecutor(const std::string& name, const std::string& version, const ::mediapipe::CalculatorGraphConfig& config, @@ -151,6 +153,8 @@ class MediapipeGraphExecutor { inputSidePackets[EMBEDDINGS_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.embeddingsServableMap).At(STARTING_TIMESTAMP); inputSidePackets[RERANK_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.rerankServableMap).At(STARTING_TIMESTAMP); + inputSidePackets[SPEECH_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.speechServableMap).At(STARTING_TIMESTAMP); + MP_RETURN_ON_FAIL(graph.StartRun(inputSidePackets), std::string("start MediaPipe graph: ") + this->name, 
StatusCode::MEDIAPIPE_GRAPH_START_ERROR); ::mediapipe::Packet packet; diff --git a/src/speech/BUILD b/src/speech/BUILD new file mode 100644 index 0000000000..39ad5e14d4 --- /dev/null +++ b/src/speech/BUILD @@ -0,0 +1,61 @@ +# +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +load("@mediapipe//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library", "mediapipe_proto_library") +load("//:common_settings.bzl", "ovms_cc_library") + +ovms_cc_library( + name = "speech_servable", + hdrs = ["speech_servable.hpp"], + visibility = ["//visibility:public"], + alwayslink = 1, +) + +ovms_cc_library( + name = "llm_engine", # in fact this is genai library + srcs = [], + deps = ["@llm_engine//:llm_engine"], + visibility = ["//visibility:public"], + alwayslink = 1, +) + +ovms_cc_library( + name = "speech_calculator", + srcs = ["http_speech_calculator.cc"], + deps = [ + "@mediapipe//mediapipe/framework:calculator_framework", + "//src:httppayload", + "//src:libovmslogging", + "speech_calculator_cc_proto", + "@dr_libs//:dr", + ":speech_servable", + ]+ select({ + "//conditions:default": ["//third_party:genai", ":llm_engine"], + "//:not_genai_bin" : [":llm_engine"], + }), + visibility = ["//visibility:public"], + alwayslink = 1, +) + +mediapipe_proto_library( + name = "speech_calculator_proto", + srcs = ["speech_calculator.proto"], + visibility = ["//visibility:private"], + deps = [ + "@mediapipe//mediapipe/framework:calculator_options_proto", + "@mediapipe//mediapipe/framework:calculator_proto", + ], +) diff --git a/src/speech/http_speech_calculator.cc b/src/speech/http_speech_calculator.cc new file mode 100644 index 0000000000..79c1b3a960 --- /dev/null +++ b/src/speech/http_speech_calculator.cc @@ -0,0 +1,407 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** +#include + +#pragma warning(push) +#pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 6246 4456 6246) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/port/canonical_errors.h" +#pragma GCC diagnostic pop +#pragma warning(pop) + +#include "../http_payload.hpp" +#include "../logging.hpp" +#include +#include + +#pragma warning(push) +#pragma warning(disable : 6001 4324 6385 6386) +#include "absl/strings/escaping.h" +#include "absl/strings/str_cat.h" +#pragma warning(pop) + +#define DR_WAV_IMPLEMENTATION +#pragma warning(push) +#pragma warning(disable : 4245 4220) +#include "dr_wav.h" // NOLINT +#define DR_MP3_IMPLEMENTATION +#pragma warning(push) +#pragma warning(disable : 6386 6262) +#include "dr_mp3.h" // NOLINT +#pragma warning(pop) +#include "openvino/genai/whisper_pipeline.hpp" +#include "openvino/genai/speech_generation/text2speech_pipeline.hpp" +#include "speech_servable.hpp" + +#ifdef _WIN32 +#include +#include +#endif + +using namespace ovms; + +namespace mediapipe { + +// using SpeechPipelinesMap = std::unordered_map>; + +const std::string SPEECH_SESSION_SIDE_PACKET_TAG = "SPEECH_NODE_RESOURCES"; + +#define COMMON_SAMPLE_RATE 16000 + +bool is_wav_buffer(const std::string buf) { + // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format + // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html + SPDLOG_DEBUG("is_wav_buffer: buf {}", buf.substr(0, 12)); + if (buf.size() < 12 || buf.substr(0, 4) != "RIFF" || buf.substr(8, 4) != "WAVE") { + return false; + } + + uint32_t chunk_size = *reinterpret_cast(buf.data() + 4); + SPDLOG_DEBUG("is_wav_buffer: chunk_size {}", chunk_size); + if (chunk_size + 8 != buf.size()) { + return false; + } + + return true; +} + +ov::genai::RawSpeechInput read_wav(const std::string_view& wav_data) { + drwav wav; + + // if (filename == "-") { + // { + // #ifdef _WIN32 + // _setmode(_fileno(stdin), _O_BINARY); + // #endif + + // uint8_t buf[1024]; + // while (true) { + // const size_t n = fread(buf, 1, sizeof(buf), stdin); + // if (n == 0) { + // break; + // } + // wav_data.insert(wav_data.end(), buf, buf + n); + // } + // } + + // OPENVINO_ASSERT(drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr), + // "Failed to open WAV file from stdin"); + + // fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size()); + // } else if (is_wav_buffer(filename)) { + // OPENVINO_ASSERT(drwav_init_memory(&wav, filename.c_str(), filename.size(), nullptr), + // "Failed to open WAV file from fname buffer"); + // } else if (!drwav_init_file(&wav, filename.c_str(), nullptr)) { + // #if defined(WHISPER_FFMPEG) + // OPENVINO_ASSERT(ffmpeg_decode_audio(fname, wav_data) == 0, "Failed to ffmpeg decode") + + // OPENVINO_ASSERT(drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr), + // "Failed to read wav data as wav") + // #else + // throw std::runtime_error("failed to open as WAV file"); + // #endif + // } + auto result = drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr); + if (result == false) { + SPDLOG_ERROR("FILE PARSING FAILED {}", result); + throw std::runtime_error("FILE PARSING FAILED"); + } + if (wav.channels != 1 && wav.channels != 2) { + drwav_uninit(&wav); + throw std::runtime_error("WAV file must be mono or stereo"); + } + + if (wav.sampleRate != 
COMMON_SAMPLE_RATE) { + drwav_uninit(&wav); + throw std::runtime_error("WAV file must be " + std::string{COMMON_SAMPLE_RATE / 1000} + " kHz"); + } + + const uint64_t n = + wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size() / (wav.channels * wav.bitsPerSample / 8ul); + + std::vector pcm16; + pcm16.resize(n * wav.channels); + drwav_read_pcm_frames_s16(&wav, n, pcm16.data()); + drwav_uninit(&wav); + + // convert to mono, float + std::vector pcmf32; + pcmf32.resize(n); + if (wav.channels == 1) { + for (uint64_t i = 0; i < n; i++) { + pcmf32[i] = float(pcm16[i]) / 32768.0f; + } + } else { + for (uint64_t i = 0; i < n; i++) { + pcmf32[i] = float(pcm16[2 * i] + pcm16[2 * i + 1]) / 65536.0f; + } + } + + return pcmf32; +} + +float* resample_audio(const float* input, + size_t input_length, + float input_rate, + float target_rate, + size_t* output_length) { + if (input_rate == target_rate) { + *output_length = input_length; + float* output = (float*)malloc(input_length * sizeof(float)); + if (output) { + memcpy(output, input, input_length * sizeof(float)); + } + return output; + } + + float ratio = input_rate / target_rate; + *output_length = (size_t)(input_length / ratio); + float* output = (float*)malloc(*output_length * sizeof(float)); + + if (!output) { + return NULL; + } + + for (size_t i = 0; i < *output_length; i++) { + float src_idx = i * ratio; + size_t idx0 = (size_t)src_idx; + size_t idx1 = idx0 + 1; + + if (idx1 >= input_length) { + output[i] = input[input_length - 1]; + } else { + float frac = src_idx - idx0; + output[i] = input[idx0] * (1.0f - frac) + input[idx1] * frac; + } + } + + return output; +} + +/* ov::genai::RawSpeechInput read_mp3(const std::string_view& mp3_data) { + drmp3 mp3; + + SPDLOG_ERROR("1"); + auto result = drmp3_init_memory(&mp3, mp3_data.data(), mp3_data.size(), nullptr); + SPDLOG_ERROR("drmp3_init_memory RESULT {} size:{}", result, mp3_data.size()); + // if (result == 1) { + // SPDLOG_ERROR("FILE PARSING FAILED {}", result); + // throw std::runtime_error("FILE PARSING FAILED"); + // } + SPDLOG_ERROR("2"); + if (mp3.channels != 1 && mp3.channels != 2) { + drmp3_uninit(&mp3); + throw std::runtime_error("WAV file must be mono or stereo"); + } + SPDLOG_ERROR("3 {}", mp3.sampleRate); + // if (mp3.sampleRate != COMMON_SAMPLE_RATE) { + // drmp3_uninit(&mp3); + // throw std::runtime_error("WAV file must be " + std::string{COMMON_SAMPLE_RATE / 1000} + " kHz"); + // } + SPDLOG_ERROR("4"); + const uint64_t n = mp3.totalPCMFrameCount; + SPDLOG_ERROR("mp3.totalPCMFrameCount {} : n {}", mp3.totalPCMFrameCount, n); + std::vector pcmf32; + pcmf32.resize(n * mp3.channels); + drmp3_read_pcm_frames_f32(&mp3, n, pcmf32.data()); + drmp3_uninit(&mp3); + SPDLOG_ERROR("5"); + // convert to mono, float + // std::vector pcmf32; + // pcmf32.resize(n); + // if (mp3.channels == 1) { + // for (uint64_t i = 0; i < n; i++) { + // pcmf32[i] = float(pcm16[i]) / 32768.0f; + // } + // } else { + // for (uint64_t i = 0; i < n; i++) { + // pcmf32[i] = float(pcm16[2 * i] + pcm16[2 * i + 1]) / 65536.0f; + // } + // } + size_t output_length; + auto buffer = resample_audio(reinterpret_cast(pcmf32.data()), pcmf32.size(), mp3.sampleRate, 16000, &output_length); + std::vector output(buffer, buffer + output_length); + return output; +} */ + +std::variant> getFileFromPayload(const ovms::MultiPartParser& parser, const std::string& keyName) { + std::string_view value = parser.getFileContentByFieldName(keyName); + if (value.empty()) { + return std::nullopt; + } + return value; +} + +#define 
SET_OR_RETURN(TYPE, NAME, RHS) \ + auto NAME##_OPT = RHS; \ + RETURN_IF_HOLDS_STATUS(NAME##_OPT) \ + auto NAME = std::get(NAME##_OPT); + +#define RETURN_IF_HOLDS_STATUS(NAME) \ + if (std::holds_alternative(NAME)) { \ + return std::get(NAME); \ + } + +class SpeechCalculator : public CalculatorBase { + static const std::string INPUT_TAG_NAME; + static const std::string OUTPUT_TAG_NAME; + +public: + static absl::Status GetContract(CalculatorContract* cc) { + RET_CHECK(!cc->Inputs().GetTags().empty()); + RET_CHECK(!cc->Outputs().GetTags().empty()); + cc->Inputs().Tag(INPUT_TAG_NAME).Set(); + cc->InputSidePackets().Tag(SPEECH_SESSION_SIDE_PACKET_TAG).Set(); // TODO: template? + cc->Outputs().Tag(OUTPUT_TAG_NAME).Set(); + return absl::OkStatus(); + } + + absl::Status Close(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "SpeechCalculator [Node: {} ] Close", cc->NodeName()); + return absl::OkStatus(); + } + + absl::Status Open(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "SpeechCalculator [Node: {}] Open start", cc->NodeName()); + return absl::OkStatus(); + } + + absl::Status Process(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "SpeechCalculator [Node: {}] Process start", cc->NodeName()); + + SpeechServableMap pipelinesMap = cc->InputSidePackets().Tag(SPEECH_SESSION_SIDE_PACKET_TAG).Get(); + auto it = pipelinesMap.find(cc->NodeName()); + RET_CHECK(it != pipelinesMap.end()) << "Could not find initialized Speech node named: " << cc->NodeName(); + auto pipe = it->second; + + auto payload = cc->Inputs().Tag(INPUT_TAG_NAME).Get(); + + std::unique_ptr images; // output + std::unique_ptr output; + if (absl::StartsWith(payload.uri, "/v3/audio/transcriptions")) { + if (payload.multipartParser->hasParseError()) + return absl::InvalidArgumentError("Failed to parse multipart data"); + + SET_OR_RETURN(std::optional, file, getFileFromPayload(*payload.multipartParser, "file")); + auto stream = getFileFromPayload(*payload.multipartParser, "stream"); + if (!std::holds_alternative(stream)) { + SPDLOG_ERROR("NO VALUE"); + } else { + SPDLOG_ERROR("{}", (std::get>(stream)).value()); + } + if (!file.has_value()) { + return absl::InvalidArgumentError(absl::StrCat("File parsing fails")); + } + // ov::genai::WhisperGenerationConfig config = pipe->whisperPipeline->get_generation_config(); + // // 'task' and 'language' parameters are supported for multilingual models only + // config.language = "<|en|>"; // can switch to <|zh|> for Chinese language + // config.task = "transcribe"; + // config.return_timestamps = true; + ov::genai::RawSpeechInput raw_speech; + try { + SPDLOG_DEBUG(file.value().data()); + if (is_wav_buffer(std::string(file.value()))) { + SPDLOG_DEBUG("WAV FILE"); + raw_speech = read_wav(file.value()); + SPDLOG_DEBUG("WAV FILE SIZE: {}", raw_speech.size()); + } else { + SPDLOG_DEBUG("NOT WAV FILE. 
Only WAVE format is supported currently"); + // raw_speech = read_mp3(file.value()); + SPDLOG_DEBUG("FILE SIZE: {}", raw_speech.size()); + } + } catch (std::exception&) { + return absl::InvalidArgumentError("Audio file reading failed"); + } + std::string result = "{\"text\": \""; + std::unique_lock lock(pipe->whisperPipelineMutex); + result += pipe->whisperPipeline->generate(raw_speech); + result.append("\"}"); + SPDLOG_ERROR("{}", result); + output = std::make_unique(result); + } else if (absl::StartsWith(payload.uri, "/v3/audio/speech")) { + if (payload.parsedJson->HasParseError()) + return absl::InvalidArgumentError("Failed to parse JSON"); + + if (!payload.parsedJson->IsObject()) { + return absl::InvalidArgumentError("JSON body must be an object"); + } + auto inputIt = payload.parsedJson->FindMember("input"); + if (inputIt == payload.parsedJson->MemberEnd()) { + return absl::InvalidArgumentError("input field is missing in JSON body"); + } + if (!inputIt->value.IsString()) { + return absl::InvalidArgumentError("input field is not a string"); + } + auto streamIt = payload.parsedJson->FindMember("stream_format"); + if (streamIt != payload.parsedJson->MemberEnd()) { + SPDLOG_ERROR("STREAM: {}", streamIt->value.GetString()); + } else { + SPDLOG_ERROR("NO STREAM"); + } + SPDLOG_ERROR("INPUT: {}", inputIt->value.GetString()); + std::unique_lock lock(pipe->text2SpeechPipelineMutex); + auto gen_speech = pipe->text2SpeechPipeline->generate(inputIt->value.GetString()); + + drwav_data_format format; + format.container = drwav_container_riff; + format.format = DR_WAVE_FORMAT_IEEE_FLOAT; + format.channels = 1; + format.sampleRate = 16000; // assume it is always 16 KHz + format.bitsPerSample = gen_speech.speeches[0].get_element_type().bitwidth(); + SPDLOG_ERROR("1"); + drwav wav; + void* ppData; + size_t pDataSize; + auto waveform_size = gen_speech.speeches[0].get_size(); + size_t total_samples = waveform_size * format.channels; + ov::Tensor cpu_tensor(gen_speech.speeches[0].get_element_type(), gen_speech.speeches[0].get_shape()); + // copy results to release inference request + gen_speech.speeches[0].copy_to(cpu_tensor); + lock.unlock(); + + auto waveform_ptr = cpu_tensor.data(); + OPENVINO_ASSERT(drwav_init_memory_write_sequential_pcm_frames(&wav, &ppData, &pDataSize, &format, total_samples, nullptr), + "Failed to initialize WAV writer"); + SPDLOG_ERROR("2"); + drwav_uint64 frames_written = drwav_write_pcm_frames(&wav, total_samples, waveform_ptr); + OPENVINO_ASSERT(frames_written == total_samples, "Failed to write not all frames"); + SPDLOG_ERROR("3"); + output = std::make_unique(reinterpret_cast(ppData), pDataSize); + drwav_uninit(&wav); + SPDLOG_ERROR("4"); + // drwav_free(ppData, NULL); + } else { + return absl::InvalidArgumentError(absl::StrCat("Unsupported URI: ", payload.uri)); + } + + // auto outputOrStatus = generateJSONResponseFromOvTensor(*images); + // RETURN_IF_HOLDS_STATUS(outputOrStatus); + // auto output = std::move(std::get>(outputOrStatus)); + cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(output.release(), cc->InputTimestamp()); + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "SpeechCalculator [Node: {}] Process end", cc->NodeName()); + + return absl::OkStatus(); + } +}; + +const std::string SpeechCalculator::INPUT_TAG_NAME{"HTTP_REQUEST_PAYLOAD"}; +const std::string SpeechCalculator::OUTPUT_TAG_NAME{"HTTP_RESPONSE_PAYLOAD"}; + +REGISTER_CALCULATOR(SpeechCalculator); + +} // namespace mediapipe diff --git a/src/speech/speech_calculator.proto b/src/speech/speech_calculator.proto new file 
mode 100644 index 0000000000..80ed29ccd6 --- /dev/null +++ b/src/speech/speech_calculator.proto @@ -0,0 +1,40 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +syntax = "proto2"; +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + + +message SpeechCalculatorOptions { + extend mediapipe.CalculatorOptions { + // https://github.com/google/mediapipe/issues/634 have to be unique in app + // no rule to obtain this + optional SpeechCalculatorOptions ext = 116423755; + } + + // fields required for GenAI pipeline initialization + required string models_path = 1; + optional string device = 2; + optional string plugin_config = 3; + enum Mode { + TEXT_TO_SPEECH = 0; + SPEECH_TO_TEXT = 1; + } + + required Mode mode = 4 [default = TEXT_TO_SPEECH]; +} diff --git a/src/speech/speech_servable.hpp b/src/speech/speech_servable.hpp new file mode 100644 index 0000000000..853b46ad40 --- /dev/null +++ b/src/speech/speech_servable.hpp @@ -0,0 +1,60 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** +#pragma once + +#include +#include +#include +#include + +#pragma warning(push) +#pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 4005 4456 6246) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#include "mediapipe/framework/calculator_graph.h" +#pragma GCC diagnostic pop +#pragma warning(pop) + +#include "openvino/genai/whisper_pipeline.hpp" +#include "openvino/genai/speech_generation/text2speech_pipeline.hpp" +#include "src/speech/speech_calculator.pb.h" + +namespace ovms { + +struct SpeechServable { + std::filesystem::path parsedModelsPath; + std::shared_ptr whisperPipeline; + std::shared_ptr text2SpeechPipeline; + std::mutex whisperPipelineMutex; + std::mutex text2SpeechPipelineMutex; + + SpeechServable(const std::string& modelDir, const std::string& targetDevice, const std::string& graphPath, mediapipe::SpeechCalculatorOptions::Mode mode) { + auto fsModelsPath = std::filesystem::path(modelDir); + if (fsModelsPath.is_relative()) { + parsedModelsPath = (std::filesystem::path(graphPath) / fsModelsPath); + } else { + parsedModelsPath = fsModelsPath.string(); + } + if (mode == mediapipe::SpeechCalculatorOptions::TEXT_TO_SPEECH) { + text2SpeechPipeline = std::make_shared(parsedModelsPath.string(), targetDevice); + } else { + whisperPipeline = std::make_shared(parsedModelsPath.string(), targetDevice); + } + } +}; + +using SpeechServableMap = std::unordered_map>; +} // namespace ovms diff --git a/src/test/mediapipeflow_test.cpp b/src/test/mediapipeflow_test.cpp index 196e13b2d4..bf987fd811 100644 --- a/src/test/mediapipeflow_test.cpp +++ b/src/test/mediapipeflow_test.cpp @@ -2683,7 +2683,7 @@ class MediapipeSerialization : public ::testing::Test { std::vector inputNames, std::vector outputNames, const PythonNodeResourcesMap& pythonNodeResourcesMap, MediapipeServableMetricReporter* mediapipeServableMetricReporter) : - MediapipeGraphExecutor(name, version, config, inputTypes, outputTypes, inputNames, outputNames, pythonNodeResourcesMap, {}, {}, {}, nullptr, mediapipeServableMetricReporter) {} + MediapipeGraphExecutor(name, version, config, inputTypes, outputTypes, inputNames, outputNames, pythonNodeResourcesMap, {}, {}, {}, {}, nullptr, mediapipeServableMetricReporter) {} }; protected: @@ -3932,6 +3932,7 @@ TEST(WhitelistRegistered, MediapipeCalculatorsList) { "SerializationCalculator", "SetLandmarkVisibilityCalculator", "SidePacketToStreamCalculator", + "SpeechCalculator", "SplitAffineMatrixVectorCalculator", "SplitClassificationListVectorCalculator", "SplitDetectionVectorCalculator", diff --git a/src/test/pythonnode_test.cpp b/src/test/pythonnode_test.cpp index 5b1c91cbca..6a2b473283 100644 --- a/src/test/pythonnode_test.cpp +++ b/src/test/pythonnode_test.cpp @@ -1005,7 +1005,7 @@ class MockedMediapipeGraphExecutorPy : public ovms::MediapipeGraphExecutor { const PythonNodeResourcesMap& pythonNodeResourcesMap, PythonBackend* pythonBackend, MediapipeServableMetricReporter* mediapipeServableMetricReporter) : - MediapipeGraphExecutor(name, version, config, inputTypes, outputTypes, inputNames, outputNames, pythonNodeResourcesMap, {}, {}, {}, pythonBackend, mediapipeServableMetricReporter) {} + MediapipeGraphExecutor(name, version, config, inputTypes, outputTypes, inputNames, outputNames, pythonNodeResourcesMap, {}, {}, {}, {}, pythonBackend, mediapipeServableMetricReporter) {} }; TEST_F(PythonFlowTest, SerializePyObjectWrapperToKServeResponse) { diff --git 
a/src/test/streaming_test.cpp b/src/test/streaming_test.cpp index d7c40b2ce6..cca060576f 100644 --- a/src/test/streaming_test.cpp +++ b/src/test/streaming_test.cpp @@ -359,7 +359,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::KFS_REQUEST}}, {{"out", mediapipe_packet_type_enum::KFS_RESPONSE}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // Mock receiving 3 requests and disconnection prepareRequest(this->firstRequest, {{"in", 3.5f}}); @@ -416,7 +416,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // Mock receiving 3 requests and disconnection prepareRequest(this->firstRequest, {{"in", 3.5f}}); // no timestamp specified, server will assign one @@ -559,7 +559,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // Mock receiving 3 requests with manually (client) assigned ascending order of timestamp and disconnection prepareRequest(this->firstRequest, {{"in", 3.5f}}, 3); // first request with timestamp 3 @@ -604,7 +604,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // Mock only 1 request and disconnect immediately prepareRequest(this->firstRequest, {{"in", 3.5f}}); @@ -1243,7 +1243,7 @@ node { {"out3", mediapipe_packet_type_enum::OVTENSOR}}, {"in1", "in2", "in3"}, {"out1", "out2", "out3"}, - {}, {}, {}, {}, nullptr, this->reporter.get()}; + {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; std::promise signalPromise; std::future signalFuture = signalPromise.get_future(); @@ -1295,7 +1295,7 @@ node { {"out3", mediapipe_packet_type_enum::OVTENSOR}}, {"in1", "in2", "in3"}, {"out1", "out2", "out3"}, - {}, {}, {}, {}, nullptr, this->reporter.get()}; + {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; std::promise signalPromise; std::future signalFuture = signalPromise.get_future(); @@ -1330,7 +1330,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; std::promise signalPromise; std::future signalFuture = signalPromise.get_future(); @@ -1364,7 +1364,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"wrong_name"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // cannot install observer due to wrong output name (should never happen due to validation) + {"in"}, {"wrong_name"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // cannot install observer due to wrong output name (should never happen due to validation) EXPECT_CALL(this->stream, Read(_)).Times(0); EXPECT_CALL(this->stream, Write(_, _)).Times(0); @@ -1389,7 +1389,7 @@ node { this->name, 
this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; prepareRequest(this->firstRequest, {}); EXPECT_CALL(this->stream, Read(_)) @@ -1417,7 +1417,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; std::promise signalPromise; std::future signalFuture = signalPromise.get_future(); @@ -1453,7 +1453,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; prepareRequest(this->firstRequest, {{"in", 3.5f}}); ASSERT_EQ(executor.inferStream(this->firstRequest, this->stream, this->executionContext), StatusCode::MEDIAPIPE_GRAPH_INITIALIZATION_ERROR); @@ -1476,7 +1476,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // Invalid request - missing data in buffer prepareInvalidRequest(this->firstRequest, {"in"}); // no timestamp specified, server will assign one @@ -1511,7 +1511,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; std::promise signalPromise[3]; std::future signalFuture[3] = { @@ -1558,7 +1558,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; prepareRequest(this->firstRequest, {{"in", 3.5f}}, 0); EXPECT_CALL(this->stream, Read(_)) @@ -1586,7 +1586,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; prepareRequest(this->firstRequest, {{"in", 3.5f}}); setRequestTimestamp(this->firstRequest, std::string("not an int")); @@ -1621,7 +1621,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // Timestamps not allowed in stream // Expect continuity of operation and response with error message @@ -1663,7 +1663,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // Allowed in stream for (auto 
timestamp : std::vector<::mediapipe::Timestamp>{ @@ -1699,7 +1699,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // Mock receiving 3 requests and disconnection prepareRequestWithParam(this->firstRequest, {{"in", 3.5f}}, {"val", 65}); // request with parameter val @@ -1736,7 +1736,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // Mock receiving the invalid request and disconnection // Request with invalid param py (special pythons session side packet) @@ -1765,7 +1765,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; prepareRequest(this->firstRequest, {{"in", 3.5f}}); // missing required request param EXPECT_CALL(this->stream, Read(_)).Times(0); @@ -1791,7 +1791,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; // Mock receiving 2 requests and disconnection prepareRequest(this->firstRequest, {{"in", 3.5f}}, std::nullopt, this->name, this->version); // no timestamp specified, server will assign one @@ -1825,7 +1825,7 @@ node { this->name, this->version, config, {{"in", mediapipe_packet_type_enum::OVTENSOR}}, {{"out", mediapipe_packet_type_enum::OVTENSOR}}, - {"in"}, {"out"}, {}, {}, {}, {}, nullptr, this->reporter.get()}; + {"in"}, {"out"}, {}, {}, {}, {}, {}, nullptr, this->reporter.get()}; std::promise signalPromise; std::future signalFuture = signalPromise.get_future(); diff --git a/src/version.hpp b/src/version.hpp index fbebc03710..a1f181f6cc 100644 --- a/src/version.hpp +++ b/src/version.hpp @@ -16,7 +16,7 @@ #ifndef SRC_VERSION_HPP_ #define SRC_VERSION_HPP_ #define PROJECT_NAME "OpenVINO Model Server" -#define PROJECT_VERSION "REPLACE_PROJECT_VERSION" -#define OPENVINO_NAME "REPLACE_OPENVINO_NAME" -#define BAZEL_BUILD_FLAGS "REPLACE_BAZEL_BUILD_FLAGS" +#define PROJECT_VERSION "2025.3.0.8524d72c" +#define OPENVINO_NAME "2025.3.0.0.dev20250812" +#define BAZEL_BUILD_FLAGS "--config=win_mp_on_py_off" #endif // SRC_VERSION_HPP_"
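The text-to-speech call in demos/audio/openai_speech.py is left commented out in this patch, so below is a minimal client-side sketch of the equivalent request against the new /v3/audio/speech endpoint. It assumes the SpeechT5 deployment from demos/audio/README.md above (REST port 8000, model name "speecht5_tts"); the `voice` argument is required by the OpenAI client but is not read by the new SpeechCalculator.

```python
from pathlib import Path
from openai import OpenAI

# Assumes the deployment from demos/audio/README.md: OVMS listening on port 8000
# with the exported SpeechT5 model served under the name "speecht5_tts".
client = OpenAI(base_url="http://localhost:8000/v3", api_key="not_used")

speech_file_path = Path(__file__).parent / "speech.wav"
with client.audio.speech.with_streaming_response.create(
    model="speecht5_tts",
    voice="alloy",  # required by the OpenAI client; the SpeechCalculator only reads "input"
    input="The quick brown fox jumped over the lazy dog.",
) as response:
    # The calculator responds with a 16 kHz mono WAV payload, streamed to disk here.
    response.stream_to_file(speech_file_path)
```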