Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions src/llm/servable_initializer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,6 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ
global json
import json
from pathlib import Path
global datetime
import datetime

global contextmanager
from contextlib import contextmanager
Expand All @@ -205,9 +203,6 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ

def raise_exception(message):
raise jinja2.exceptions.TemplateError(message)
# Appears in some of mistral chat templates
def strftime_now(format):
return datetime.datetime.now().strftime(format)
# Following the logic from:
# https://github.com/huggingface/transformers/blob/7188e2e28c6d663284634732564143b820a03f8b/src/transformers/utils/chat_template_utils.py#L398
class AssistantTracker(Extension):
Expand Down Expand Up @@ -272,7 +267,6 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ
jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True, extensions=[AssistantTracker, jinja2.ext.loopcontrols], loader=template_loader)
jinja_env.policies["json.dumps_kwargs"]["ensure_ascii"] = False
jinja_env.globals["raise_exception"] = raise_exception
jinja_env.globals["strftime_now"] = strftime_now
if jinja_file.is_file():
template = jinja_env.get_template("chat_template.jinja")
elif jinja_file_legacy.is_file():
Expand Down
32 changes: 23 additions & 9 deletions src/test/llm/output_parsers/hermes3_output_parser_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,31 @@ const std::string tokenizerPath = getWindowsRepoRootPath() + "\\src\\test\\llm_t
const std::string tokenizerPath = "/ovms/src/test/llm_testing/NousResearch/Hermes-3-Llama-3.1-8B";
#endif

static ov::genai::Tokenizer hermes3Tokenizer(tokenizerPath);
static std::unique_ptr<ov::genai::Tokenizer> hermes3Tokenizer;

// GoogleTest fixture for the "hermes3" output parser.
// Holds two OutputParser instances built from the file-level hermes3Tokenizer:
// one left in its default mode and one with immediate tool parsing enabled.
class Hermes3OutputParserTest : public ::testing::Test {
protected:
// Parser used by tests that exercise the regular (non-immediate) tool parsing path.
std::unique_ptr<OutputParser> outputParserWithRegularToolParsing;
// Parser on which enableImmediateToolParsing() is called in SetUp().
std::unique_ptr<OutputParser> outputParserWithImmediateToolParsing;

// Suite-level hook: constructs the shared tokenizer from tokenizerPath.
// Any exception thrown by the constructor is reported through FAIL()
// with a descriptive message instead of propagating out of the hook.
static void SetUpTestSuite() {
try {
hermes3Tokenizer = std::make_unique<ov::genai::Tokenizer>(tokenizerPath);
} catch (const std::exception& e) {
FAIL() << "Failed to initialize hermes3 tokenizer: " << e.what();
} catch (...) {
FAIL() << "Failed to initialize hermes3 tokenizer due to unknown error.";
}
}

// Suite-level hook: destroys the shared tokenizer created in SetUpTestSuite().
static void TearDownTestSuite() {
hermes3Tokenizer.reset();
}

// Per-test hook: rebuilds both parsers so every test starts from fresh parser state.
void SetUp() override {
// For Hermes3 model there is only tool parser available
// NOTE(review): the next two assignments pass hermes3Tokenizer without dereferencing it
// and are immediately overwritten by the two assignments that follow; they look like the
// removed side of a diff hunk that was left in place - confirm which pair is live.
outputParserWithRegularToolParsing = std::make_unique<OutputParser>(hermes3Tokenizer, "hermes3", "");
outputParserWithImmediateToolParsing = std::make_unique<OutputParser>(hermes3Tokenizer, "hermes3", "");
// Dereference the unique_ptr so OutputParser receives the tokenizer object itself.
// The empty third argument is presumably the (unused) reasoning parser name - TODO confirm.
outputParserWithRegularToolParsing = std::make_unique<OutputParser>(*hermes3Tokenizer, "hermes3", "");
outputParserWithImmediateToolParsing = std::make_unique<OutputParser>(*hermes3Tokenizer, "hermes3", "");
outputParserWithImmediateToolParsing->enableImmediateToolParsing();
}
};
Expand All @@ -59,7 +73,7 @@ TEST_F(Hermes3OutputParserTest, ParseToolCallOutputWithSingleToolCall) {
// Remove opening tag for immediate parsing
input = input.substr(std::string("<tool_call>").length());
}
auto generatedTensor = hermes3Tokenizer.encode(input, ov::genai::add_special_tokens(false)).input_ids;
auto generatedTensor = hermes3Tokenizer->encode(input, ov::genai::add_special_tokens(false)).input_ids;
std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
ParsedOutput parsedOutput = immediateParsing ? outputParserWithImmediateToolParsing->parse(generatedTokens, true) : outputParserWithRegularToolParsing->parse(generatedTokens, true);
EXPECT_EQ(parsedOutput.content, "");
Expand Down Expand Up @@ -88,7 +102,7 @@ TEST_F(Hermes3OutputParserTest, ParseToolCallOutputWithNoToolsInTheRequest) {
// Remove opening tag for immediate parsing
testInput = testInput.substr(std::string("<tool_call>").length());
}
auto generatedTensor = hermes3Tokenizer.encode(testInput, ov::genai::add_special_tokens(false)).input_ids;
auto generatedTensor = hermes3Tokenizer->encode(testInput, ov::genai::add_special_tokens(false)).input_ids;
std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
ParsedOutput parsedOutput = immediateParsing ? outputParserWithImmediateToolParsing->parse(generatedTokens, false) : outputParserWithRegularToolParsing->parse(generatedTokens, false);
EXPECT_EQ(parsedOutput.content, testInput);
Expand All @@ -115,7 +129,7 @@ TEST_F(Hermes3OutputParserTest, ParseToolCallOutputWithThreeToolCalls) {
if (immediateParsing) {
input = input.substr(std::string("<tool_call>").length());
}
auto generatedTensor = hermes3Tokenizer.encode(input, ov::genai::add_special_tokens(false)).input_ids;
auto generatedTensor = hermes3Tokenizer->encode(input, ov::genai::add_special_tokens(false)).input_ids;
std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
ParsedOutput parsedOutput = immediateParsing ? outputParserWithImmediateToolParsing->parse(generatedTokens, true) : outputParserWithRegularToolParsing->parse(generatedTokens, true);
EXPECT_EQ(parsedOutput.content, "");
Expand Down Expand Up @@ -162,7 +176,7 @@ TEST_F(Hermes3OutputParserTest, ParseToolCallOutputWithTwoValidToolCallsAndOneIn
if (immediateParsing) {
input = input.substr(std::string("<tool_call>").length());
}
auto generatedTensor = hermes3Tokenizer.encode(input, ov::genai::add_special_tokens(false)).input_ids;
auto generatedTensor = hermes3Tokenizer->encode(input, ov::genai::add_special_tokens(false)).input_ids;
std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
ParsedOutput parsedOutput = immediateParsing ? outputParserWithImmediateToolParsing->parse(generatedTokens, true) : outputParserWithRegularToolParsing->parse(generatedTokens, true);
EXPECT_EQ(parsedOutput.content, "");
Expand All @@ -188,7 +202,7 @@ TEST_F(Hermes3OutputParserTest, ParseToolCallOutputWithTwoValidToolCallsAndOneIn

TEST_F(Hermes3OutputParserTest, ParseToolCallOutputWithContentAndNoToolCalls) {
std::string input = "This is a regular model response without tool calls.";
auto generatedTensor = hermes3Tokenizer.encode(input, ov::genai::add_special_tokens(false)).input_ids;
auto generatedTensor = hermes3Tokenizer->encode(input, ov::genai::add_special_tokens(false)).input_ids;
std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true);
EXPECT_EQ(parsedOutput.content, "This is a regular model response without tool calls.");
Expand All @@ -204,7 +218,7 @@ TEST_F(Hermes3OutputParserTest, ParseToolCallOutputWithContentAndNoToolCalls) {

TEST_F(Hermes3OutputParserTest, ParseToolCallOutputWithContentAndSingleToolCall) {
std::string input = "This is a content part and next will be a tool call.\n\n<tool_call>{\"name\": \"example_tool\", \"arguments\": {\"arg1\": \"value1\", \"arg2\": 42}}</tool_call>";
auto generatedTensor = hermes3Tokenizer.encode(input, ov::genai::add_special_tokens(false)).input_ids;
auto generatedTensor = hermes3Tokenizer->encode(input, ov::genai::add_special_tokens(false)).input_ids;
std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
// generatedTokens should now contain content followed by bot token ID and then tool call
ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true);
Expand Down
36 changes: 25 additions & 11 deletions src/test/llm/output_parsers/llama3_output_parser_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ const std::string tokenizerPath = getWindowsRepoRootPath() + "\\src\\test\\llm_t
const std::string tokenizerPath = "/ovms/src/test/llm_testing/meta-llama/Llama-3.1-8B-Instruct";
#endif

static ov::genai::Tokenizer llama3Tokenizer(tokenizerPath);
static std::unique_ptr<ov::genai::Tokenizer> llama3Tokenizer;

// ID of <|python_tag|>, the special token that marks the start of tool calls
constexpr int64_t botTokenId = 128010;
Expand All @@ -41,16 +41,30 @@ class Llama3OutputParserTest : public ::testing::Test {
std::unique_ptr<OutputParser> outputParserWithRegularToolParsing;
std::unique_ptr<OutputParser> outputParserWithImmediateToolParsing;

// Suite-level hook: constructs the shared llama3 tokenizer from tokenizerPath.
// Any exception thrown by the constructor is reported through FAIL()
// with a descriptive message instead of propagating out of the hook.
static void SetUpTestSuite() {
try {
llama3Tokenizer = std::make_unique<ov::genai::Tokenizer>(tokenizerPath);
} catch (const std::exception& e) {
FAIL() << "Failed to initialize llama3 tokenizer: " << e.what();
} catch (...) {
FAIL() << "Failed to initialize llama3 tokenizer due to unknown error.";
}
}

// Suite-level hook: destroys the tokenizer held by the file-level llama3Tokenizer pointer.
static void TearDownTestSuite() {
llama3Tokenizer.reset();
}

// Per-test hook: rebuilds both "llama3" parsers so every test starts from fresh parser state.
void SetUp() override {
// NOTE(review): the next two assignments pass llama3Tokenizer without dereferencing it
// and are immediately overwritten by the two assignments that follow; they look like the
// removed side of a diff hunk that was left in place - confirm which pair is live.
outputParserWithRegularToolParsing = std::make_unique<OutputParser>(llama3Tokenizer, "llama3", "");
outputParserWithImmediateToolParsing = std::make_unique<OutputParser>(llama3Tokenizer, "llama3", "");
// Dereference the unique_ptr so OutputParser receives the tokenizer object itself.
outputParserWithRegularToolParsing = std::make_unique<OutputParser>(*llama3Tokenizer, "llama3", "");
outputParserWithImmediateToolParsing = std::make_unique<OutputParser>(*llama3Tokenizer, "llama3", "");
// Only the second parser is switched to immediate tool parsing.
outputParserWithImmediateToolParsing->enableImmediateToolParsing();
}
};

TEST_F(Llama3OutputParserTest, ParseToolCallOutputWithSingleToolCall) {
std::string input = "{\"name\": \"example_tool\", \"parameters\": {\"arg1\": \"value1\", \"arg2\": 42}}";
auto generatedTensor = llama3Tokenizer.encode(input, ov::genai::add_special_tokens(false)).input_ids;
auto generatedTensor = llama3Tokenizer->encode(input, ov::genai::add_special_tokens(false)).input_ids;
std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
generatedTokens.insert(generatedTokens.begin(), botTokenId);
for (bool immediateParsing : {false, true}) {
Expand All @@ -66,7 +80,7 @@ TEST_F(Llama3OutputParserTest, ParseToolCallOutputWithSingleToolCall) {

TEST_F(Llama3OutputParserTest, ParseToolCallOutputNoToolsInTheRequest) {
std::string input = "{\"name\": \"example_tool\", \"parameters\": {\"arg1\": \"value1\", \"arg2\": 42}}";
auto generatedTensor = llama3Tokenizer.encode(input, ov::genai::add_special_tokens(false)).input_ids;
auto generatedTensor = llama3Tokenizer->encode(input, ov::genai::add_special_tokens(false)).input_ids;
std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
for (bool immediateParsing : {false, true}) {
ParsedOutput parsedOutput = immediateParsing ? outputParserWithImmediateToolParsing->parse(generatedTokens, false) : outputParserWithRegularToolParsing->parse(generatedTokens, false);
Expand All @@ -79,7 +93,7 @@ TEST_F(Llama3OutputParserTest, ParseToolCallOutputNoToolsInTheRequest) {
// Tool parser assumes the entire output is a tool call because it starts with "{", but that is not the case here
TEST_F(Llama3OutputParserTest, ParseRegularJsonOutputToolsInTheRequest) {
std::string input = "{\"name\": \"Jane Doe\", \"location\": \"unknown\"}";
auto generatedTensor = llama3Tokenizer.encode(input, ov::genai::add_special_tokens(false)).input_ids;
auto generatedTensor = llama3Tokenizer->encode(input, ov::genai::add_special_tokens(false)).input_ids;
std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
for (bool immediateParsing : {false, true}) {
ParsedOutput parsedOutput = immediateParsing ? outputParserWithImmediateToolParsing->parse(generatedTokens, true) : outputParserWithRegularToolParsing->parse(generatedTokens, true);
Expand All @@ -92,7 +106,7 @@ TEST_F(Llama3OutputParserTest, ParseRegularJsonOutputToolsInTheRequest) {
// Tool parser is available, but there are no tools in the request, so all output should be treated as content
TEST_F(Llama3OutputParserTest, ParseRegularJsonOutputNoToolsInTheRequest) {
std::string input = "{\"name\": \"Jane Doe\", \"location\": \"unknown\"}";
auto generatedTensor = llama3Tokenizer.encode(input, ov::genai::add_special_tokens(false)).input_ids;
auto generatedTensor = llama3Tokenizer->encode(input, ov::genai::add_special_tokens(false)).input_ids;
std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
for (bool immediateParsing : {false, true}) {
ParsedOutput parsedOutput = immediateParsing ? outputParserWithImmediateToolParsing->parse(generatedTokens, false) : outputParserWithRegularToolParsing->parse(generatedTokens, false);
Expand All @@ -105,7 +119,7 @@ TEST_F(Llama3OutputParserTest, ParseToolCallOutputWithThreeToolCalls) {
std::string input = "{\"name\": \"example_tool\", \"parameters\": {\"arg1\": \"value1\", \"arg2\": 42}};"
"{\"name\": \"another_tool\", \"parameters\": {\"param1\": \"data\", \"param2\": true}};"
"{\"name\": \"third_tool\", \"parameters\": {\"key\": \"value\"}}";
auto generatedTensor = llama3Tokenizer.encode(input, ov::genai::add_special_tokens(false)).input_ids;
auto generatedTensor = llama3Tokenizer->encode(input, ov::genai::add_special_tokens(false)).input_ids;
std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
for (bool immediateParsing : {false, true}) {
ParsedOutput parsedOutput = immediateParsing ? outputParserWithImmediateToolParsing->parse(generatedTokens, true) : outputParserWithRegularToolParsing->parse(generatedTokens, true);
Expand All @@ -132,7 +146,7 @@ TEST_F(Llama3OutputParserTest, ParseToolCallOutputWithThreeToolCalls) {

TEST_F(Llama3OutputParserTest, ParseToolCallOutputWithContentAndNoToolCalls) {
std::string input = "This is a regular model response without tool calls.";
auto generatedTensor = llama3Tokenizer.encode(input, ov::genai::add_special_tokens(false)).input_ids;
auto generatedTensor = llama3Tokenizer->encode(input, ov::genai::add_special_tokens(false)).input_ids;
std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
for (bool immediateParsing : {false, true}) {
ParsedOutput parsedOutput = immediateParsing ? outputParserWithImmediateToolParsing->parse(generatedTokens, true) : outputParserWithRegularToolParsing->parse(generatedTokens, true);
Expand All @@ -145,9 +159,9 @@ TEST_F(Llama3OutputParserTest, ParseToolCallOutputWithContentAndNoToolCalls) {
TEST_F(Llama3OutputParserTest, ParseToolCallOutputWithContentAndSingleToolCall) {
std::string content = "This is a content part and next will be a tool call.";
std::string toolCall = "{\"name\": \"example_tool\", \"parameters\": {\"arg1\": \"value1\", \"arg2\": 42}}";
auto generatedContentTensor = llama3Tokenizer.encode(content, ov::genai::add_special_tokens(false)).input_ids;
auto generatedContentTensor = llama3Tokenizer->encode(content, ov::genai::add_special_tokens(false)).input_ids;
std::vector<int64_t> generatedContentTokens(generatedContentTensor.data<int64_t>(), generatedContentTensor.data<int64_t>() + generatedContentTensor.get_size());
auto generatedToolCallTensor = llama3Tokenizer.encode(toolCall, ov::genai::add_special_tokens(false)).input_ids;
auto generatedToolCallTensor = llama3Tokenizer->encode(toolCall, ov::genai::add_special_tokens(false)).input_ids;
std::vector<int64_t> generatedToolCallTokens(generatedToolCallTensor.data<int64_t>(), generatedToolCallTensor.data<int64_t>() + generatedToolCallTensor.get_size());
std::vector<int64_t> generatedTokens;
generatedTokens.insert(generatedTokens.end(), generatedContentTokens.begin(), generatedContentTokens.end());
Expand Down
Loading