Fix #286

yhirose · yhirose · commit 6201458f3b68 · 2024-01-26T22:00:18.000-05:00
diff --git a/README.md b/README.md
@@ -347,6 +347,13 @@ START <- 'This month is ' MONTH '.'
 MONTH <- 'Jan' | 'January' | 'Feb' | 'February' | '...'
 ```
 
+Case-insensitive matching is also supported: add the `i` suffix to every literal.
+
+```peg
+START <- 'This month is ' MONTH '.'
+MONTH <- 'Jan'i | 'January'i | 'Feb'i | 'February'i | '...'i
+```
+
 Cut operator
 ------------
 
diff --git a/peglib.h b/peglib.h
@@ -377,14 +377,13 @@ template <typename T> T token_to_number_(std::string_view sv) {
 
 class Trie {
 public:
-  Trie() = default;
-  Trie(const Trie &) = default;
-
-  Trie(const std::vector<std::string> &items) {
+  Trie(const std::vector<std::string> &items, bool ignore_case)
+      : ignore_case_(ignore_case) {
     for (const auto &item : items) {
       for (size_t len = 1; len <= item.size(); len++) {
         auto last = len == item.size();
-        std::string_view sv(item.data(), len);
+        const auto &s = ignore_case ? to_lower(item) : item;
+        std::string_view sv(s.data(), len);
         auto it = dic_.find(sv);
         if (it == dic_.end()) {
           dic_.emplace(sv, Info{last, last});
@@ -402,7 +401,8 @@ class Trie {
     auto done = false;
     size_t len = 1;
     while (!done && len <= text_len) {
-      std::string_view sv(text, len);
+      const auto &s = ignore_case_ ? to_lower(text) : std::string(text);
+      std::string_view sv(s.data(), len);
       auto it = dic_.find(sv);
       if (it == dic_.end()) {
         done = true;
@@ -416,6 +416,13 @@ class Trie {
   }
 
 private:
+  // Returns a lowercased copy of `s`, converted byte by byte with std::tolower.
+  std::string to_lower(std::string s) const {
+    // std::tolower is undefined for negative values, so widen via unsigned char.
+    for (char &c : s) c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
+    return s;
+  }
+
   struct Info {
     bool done;
     bool match;
@@ -424,6 +431,8 @@ class Trie {
   // TODO: Use unordered_map when heterogeneous lookup is supported in C++20
   // std::unordered_map<std::string, Info> dic_;
   std::map<std::string, Info, std::less<>> dic_;
+
+  bool ignore_case_;
 };
 
 /*-----------------------------------------------------------------------------
@@ -1159,7 +1168,8 @@ class NotPredicate : public Ope {
 
 class Dictionary : public Ope, public std::enable_shared_from_this<Dictionary> {
 public:
-  Dictionary(const std::vector<std::string> &v) : trie_(v) {}
+  Dictionary(const std::vector<std::string> &v, bool ignore_case)
+      : trie_(v, ignore_case) {}
 
   size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c,
                     std::any &dt) const override;
@@ -1568,8 +1578,9 @@ inline std::shared_ptr<Ope> npd(const std::shared_ptr<Ope> &ope) {
   return std::make_shared<NotPredicate>(ope);
 }
 
-inline std::shared_ptr<Ope> dic(const std::vector<std::string> &v) {
-  return std::make_shared<Dictionary>(v);
+inline std::shared_ptr<Ope> dic(const std::vector<std::string> &v,
+                                bool ignore_case) {
+  return std::make_shared<Dictionary>(v, ignore_case);
 }
 
 inline std::shared_ptr<Ope> lit(std::string &&s) {
@@ -3335,16 +3346,17 @@ class ParserGenerator {
         seq(g["Suffix"], opt(seq(g["LABEL"], g["Identifier"])));
     g["Suffix"] <= seq(g["Primary"], opt(g["Loop"]));
     g["Loop"] <= cho(g["QUESTION"], g["STAR"], g["PLUS"], g["Repetition"]);
-    g["Primary"] <=
-        cho(seq(g["Ignore"], g["IdentCont"], g["Arguments"],
-                npd(g["LEFTARROW"])),
-            seq(g["Ignore"], g["Identifier"],
-                npd(seq(opt(g["Parameters"]), g["LEFTARROW"]))),
-            seq(g["OPEN"], g["Expression"], g["CLOSE"]),
-            seq(g["BeginTok"], g["Expression"], g["EndTok"]), g["CapScope"],
-            seq(g["BeginCap"], g["Expression"], g["EndCap"]), g["BackRef"],
-            g["LiteralI"], g["Dictionary"], g["Literal"], g["NegatedClassI"],
-            g["NegatedClass"], g["ClassI"], g["Class"], g["DOT"]);
+    g["Primary"] <= cho(seq(g["Ignore"], g["IdentCont"], g["Arguments"],
+                            npd(g["LEFTARROW"])),
+                        seq(g["Ignore"], g["Identifier"],
+                            npd(seq(opt(g["Parameters"]), g["LEFTARROW"]))),
+                        seq(g["OPEN"], g["Expression"], g["CLOSE"]),
+                        seq(g["BeginTok"], g["Expression"], g["EndTok"]),
+                        g["CapScope"],
+                        seq(g["BeginCap"], g["Expression"], g["EndCap"]),
+                        g["BackRef"], g["DictionaryI"], g["LiteralI"],
+                        g["Dictionary"], g["Literal"], g["NegatedClassI"],
+                        g["NegatedClass"], g["ClassI"], g["Class"], g["DOT"]);
 
     g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]);
     g["IdentCont"] <= tok(seq(g["IdentStart"], zom(g["IdentRest"])));
@@ -3358,18 +3370,23 @@ class ParserGenerator {
 
     g["Dictionary"] <= seq(g["LiteralD"], oom(seq(g["PIPE"], g["LiteralD"])));
 
+    g["DictionaryI"] <=
+        seq(g["LiteralID"], oom(seq(g["PIPE"], g["LiteralID"])));
+
     auto lit_ope = cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))),
                            cls("'"), g["Spacing"]),
                        seq(cls("\""), tok(zom(seq(npd(cls("\"")), g["Char"]))),
                            cls("\""), g["Spacing"]));
     g["Literal"] <= lit_ope;
     g["LiteralD"] <= lit_ope;
 
-    g["LiteralI"] <=
+    auto lit_case_ignore_ope =
         cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), lit("'i"),
                 g["Spacing"]),
             seq(cls("\""), tok(zom(seq(npd(cls("\"")), g["Char"]))), lit("\"i"),
                 g["Spacing"]));
+    g["LiteralI"] <= lit_case_ignore_ope;
+    g["LiteralID"] <= lit_case_ignore_ope;
 
     // NOTE: The original Brian Ford's paper uses 'zom' instead of 'oom'.
     g["Class"] <= seq(chr('['), npd(chr('^')),
@@ -3720,7 +3737,11 @@ class ParserGenerator {
 
     g["Dictionary"] = [](const SemanticValues &vs) {
       auto items = vs.transform<std::string>();
-      return dic(items);
+      return dic(items, false);
+    };
+    g["DictionaryI"] = [](const SemanticValues &vs) {
+      auto items = vs.transform<std::string>();
+      return dic(items, true);
     };
 
     g["Literal"] = [](const SemanticValues &vs) {
@@ -3735,6 +3756,10 @@ class ParserGenerator {
       auto &tok = vs.tokens.front();
       return resolve_escape_sequence(tok.data(), tok.size());
     };
+    g["LiteralID"] = [](const SemanticValues &vs) {
+      auto &tok = vs.tokens.front();
+      return resolve_escape_sequence(tok.data(), tok.size());
+    };
 
     g["Class"] = [](const SemanticValues &vs) {
       auto ranges = vs.transform<std::pair<char32_t, char32_t>>();
diff --git a/test/test1.cc b/test/test1.cc
@@ -374,6 +374,28 @@ TEST(GeneralTest, Word_expression_test_Dictionary) {
   EXPECT_TRUE(parser.parse("toa"));
 }
 
+TEST(GeneralTest, Word_expression_case_ignore_test_Dictionary) {
+  parser parser(R"(
+    Identifier  ← < !Keyword [a-z][a-z]* >
+    Keyword     ← 'def'i | 'to'i
+    %whitespace ← [ \t\r\n]*
+    %word       ← [a-z]+
+  )");
+
+  EXPECT_TRUE(parser.parse("toa"));
+}
+
+TEST(GeneralTest, Word_expression_syntax_error_test_Dictionary) {
+  parser parser(R"(
+    Identifier  ← < !Keyword [a-z][a-z]* >
+    Keyword     ← 'def' | 'to'i
+    %whitespace ← [ \t\r\n]*
+    %word       ← [a-z]+
+  )");
+
+  EXPECT_FALSE(parser);
+}
+
 TEST(GeneralTest, Skip_token_test) {
   parser parser("  ROOT  <-  _ ITEM (',' _ ITEM _)* "
                 "  ITEM  <-  ([a-z0-9])+  "