|  | 
|  | 1 | +/* | 
|  | 2 | + * Copyright (c) Meta Platforms, Inc. and affiliates. | 
|  | 3 | + * All rights reserved. | 
|  | 4 | + * | 
|  | 5 | + * This source code is licensed under the BSD-style license found in the | 
|  | 6 | + * LICENSE file in the root directory of this source tree. | 
|  | 7 | + */ | 
|  | 8 | + | 
|  | 9 | +#include <gtest/gtest.h> | 
|  | 10 | + | 
|  | 11 | +#include "pytorch/tokenizers/pcre2_regex.h" | 
|  | 12 | +#include "pytorch/tokenizers/re2_regex.h" | 
|  | 13 | +#include "pytorch/tokenizers/regex.h" | 
|  | 14 | + | 
|  | 15 | +using namespace tokenizers; | 
|  | 16 | + | 
|  | 17 | +class RegexTest : public ::testing::Test {}; | 
|  | 18 | + | 
|  | 19 | +// Test basic functionality | 
|  | 20 | +TEST_F(RegexTest, BasicMatching) { | 
|  | 21 | +  auto regex = TK_UNWRAP_THROW(create_regex("\\w+")); | 
|  | 22 | + | 
|  | 23 | +  std::string text = "Hello world"; | 
|  | 24 | +  auto matches = regex->find_all(text); | 
|  | 25 | +  ASSERT_EQ(matches.size(), 2); | 
|  | 26 | +  EXPECT_EQ(matches[0].start, 0); | 
|  | 27 | +  EXPECT_EQ(matches[0].end, 5); | 
|  | 28 | +  EXPECT_EQ( | 
|  | 29 | +      text.substr(matches[0].start, matches[0].end - matches[0].start), | 
|  | 30 | +      "Hello"); | 
|  | 31 | +  EXPECT_EQ(matches[1].start, 6); | 
|  | 32 | +  EXPECT_EQ(matches[1].end, 11); | 
|  | 33 | +  EXPECT_EQ( | 
|  | 34 | +      text.substr(matches[1].start, matches[1].end - matches[1].start), | 
|  | 35 | +      "world"); | 
|  | 36 | +} | 
|  | 37 | + | 
|  | 38 | +// Test pattern that only PCRE2 supports (lookbehind) | 
|  | 39 | +TEST_F(RegexTest, Pcre2Specific) { | 
|  | 40 | +  const std::string pattern = "(?<=@)\\w+"; | 
|  | 41 | + | 
|  | 42 | +  // Verify that the factory function fallsback on a PCRE2 regex | 
|  | 43 | +  auto regex = TK_UNWRAP_THROW(create_regex(pattern)); | 
|  | 44 | +  EXPECT_NE(dynamic_cast<Pcre2Regex*>(regex.get()), nullptr); | 
|  | 45 | + | 
|  | 46 | +  std::string text = "[email protected]" ; | 
|  | 47 | +  auto matches = regex->find_all(text); | 
|  | 48 | +  ASSERT_EQ(matches.size(), 1); | 
|  | 49 | +  EXPECT_EQ(matches[0].start, 5); | 
|  | 50 | +  EXPECT_EQ(matches[0].end, 12); | 
|  | 51 | +  EXPECT_EQ( | 
|  | 52 | +      text.substr(matches[0].start, matches[0].end - matches[0].start), | 
|  | 53 | +      "example"); | 
|  | 54 | +} | 
|  | 55 | + | 
|  | 56 | +// Test complex pattern with negative lookahead that should fall back to PCRE2. | 
|  | 57 | +// This specific pattern is from the Qwen2.5 1.5B pretokenizer. | 
|  | 58 | +// https://huggingface.co/Qwen/Qwen2.5-1.5B/raw/main/tokenizer.json | 
|  | 59 | +TEST_F(RegexTest, ComplexPatternWithNegativeLookahead) { | 
|  | 60 | +  const std::string complex_pattern = | 
|  | 61 | +      "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"; | 
|  | 62 | + | 
|  | 63 | +  // Now verify that the factory function fallsback on a PCRE2 regex | 
|  | 64 | +  auto regex = TK_UNWRAP_THROW(create_regex(complex_pattern)); | 
|  | 65 | +  EXPECT_NE(dynamic_cast<Pcre2Regex*>(regex.get()), nullptr); | 
|  | 66 | + | 
|  | 67 | +  // Test the pattern with some sample text | 
|  | 68 | +  std::string text = "Hello's world\n  test"; | 
|  | 69 | +  auto matches = regex->find_all(text); | 
|  | 70 | + | 
|  | 71 | +  // We expect to match: | 
|  | 72 | +  // 1. "Hello" (word) | 
|  | 73 | +  // 2. "'s" (contraction) | 
|  | 74 | +  // 3. " world" (word with leading space) | 
|  | 75 | +  // 4. "\n" (newline) | 
|  | 76 | +  // 5. " " (whitespace) | 
|  | 77 | +  // 6. " test" (word with leading space) | 
|  | 78 | +  ASSERT_EQ(matches.size(), 6); | 
|  | 79 | + | 
|  | 80 | +  EXPECT_EQ(matches[0].start, 0); | 
|  | 81 | +  EXPECT_EQ(matches[0].end, 5); | 
|  | 82 | +  EXPECT_EQ( | 
|  | 83 | +      text.substr(matches[0].start, matches[0].end - matches[0].start), | 
|  | 84 | +      "Hello"); | 
|  | 85 | +  EXPECT_EQ(matches[1].start, 5); | 
|  | 86 | +  EXPECT_EQ(matches[1].end, 7); | 
|  | 87 | +  EXPECT_EQ( | 
|  | 88 | +      text.substr(matches[1].start, matches[1].end - matches[1].start), "'s"); | 
|  | 89 | +  EXPECT_EQ(matches[2].start, 7); | 
|  | 90 | +  EXPECT_EQ(matches[2].end, 13); | 
|  | 91 | +  EXPECT_EQ( | 
|  | 92 | +      text.substr(matches[2].start, matches[2].end - matches[2].start), | 
|  | 93 | +      " world"); | 
|  | 94 | +  EXPECT_EQ(matches[3].start, 13); | 
|  | 95 | +  EXPECT_EQ(matches[3].end, 14); | 
|  | 96 | +  EXPECT_EQ( | 
|  | 97 | +      text.substr(matches[3].start, matches[3].end - matches[3].start), "\n"); | 
|  | 98 | +  EXPECT_EQ(matches[4].start, 14); | 
|  | 99 | +  EXPECT_EQ(matches[4].end, 15); | 
|  | 100 | +  EXPECT_EQ( | 
|  | 101 | +      text.substr(matches[4].start, matches[4].end - matches[4].start), " "); | 
|  | 102 | +  EXPECT_EQ(matches[5].start, 15); | 
|  | 103 | +  EXPECT_EQ(matches[5].end, 20); | 
|  | 104 | +  EXPECT_EQ( | 
|  | 105 | +      text.substr(matches[5].start, matches[5].end - matches[5].start), | 
|  | 106 | +      " test"); | 
|  | 107 | +} | 
0 commit comments