stsewd · stsewd · Dec 29, 2023 · Dec 29, 2023 · Dec 30, 2023 · Dec 30, 2023
diff --git a/README.md b/README.md
@@ -14,12 +14,18 @@ I have chosen to follow popular conventions for the syntax.
 
 ### Comment tags
 
-* Comment tags can contain:
-  - Upper case ascii letters
-  - Numbers (can't start with one)
-  - `-`, `_` (they can't start or end with these characters)
-* Optionally can have an user linked to the tag inside parentheses `()`
-* The name must be followed by `:` and a whitespace
+Comment tags can be:
+
+- Simple tags: A single uppercase word (tag name), optionally followed by `:`.
+  For example: `TODO`, `TODO:`.
+- Annotated tags: A single uppercase word (tag name), followed by an annotation inside parentheses `()`, followed by `:`.
+  For example: `TODO (user):`.
+
+Tag names are composed of:
+
+- Upper case ASCII letters
+- Numbers (can't start with one)
+- `-`, `_` (they can't start or end with these characters)
 
 ### URIs
 
@@ -42,36 +48,42 @@ XXX:    extra white spaces.
 NOTE-BUG (stsewd): tags can be separated by `-`
 NOTE_BUG: or by `_`.
 
-This will be recognized as a URI
-https://github.com/stsewd/
+NOTE is also a valid tag.
+
+This will be recognized as a URL
+https://github.com/stsewd/.
+
+Even if the URL is surrounded by parentheses (https://stsewd.dev)
 ```
 
 ## FAQ
 
-### Can I match a tag that doesn't end in `:`, like `TODO`?
+### All uppercase words are highlighted as tags, why?
+
+**Short answer:**
 
-This grammar doesn't provide a specific token for it,
-but you can match it with this query:
+Use a more specific query to match only the tags you want, for example:
 
 ```scm
-("text" @todo
- (#eq? @todo "TODO"))
+((tag (name) @todo)
+ (#any-of? @todo "TODO" "NOTE" "FIXME"))
 ```
 
-### Can I highlight references to issues, PRs, MRs, like `#10` or `!10`?
+**Long answer:**
 
-This grammar doesn't provide a specific token for it,
-but you can match it with this query:
+This grammar used to recognize tags followed by `:` only, but it's not uncommon to see tags without `:`.
+In order to match those types of tags users had to match against `text` nodes,
+but since every word is a text node, that operation was slow.
+So now the grammar recognizes all uppercase words as tags instead,
+and no longer exposes `text` nodes.
 
-```scm
-("text" @issue
- (#match? @issue "^#[0-9]+$"))
+### Can I highlight references to issues, PRs, MRs, like `#10` or `!10`?
 
-;; NOTE: This matches `!10` and `! 10`.
-("text" @symbol . "text" @issue
- (#eq? @symbol "!")
- (#match? @issue "^[0-9]+$"))
-```
+This grammar doesn't provide a token for it,
+but if you think it should be supported, feel free to open an issue.
+
+In the past, this was possible by matching against `text` nodes,
+but they are no longer exposed, see https://github.com/stsewd/tree-sitter-comment/pull/33.
 
 ### I'm using Neovim and don't see all tags highlighted
 
@@ -90,6 +102,13 @@ or to keep a state for whitespaces (like indentation).
 For these reasons, parsing _languages_ that need to keep a state or falling back to a general token,
 it requires some manual parsing in C.
 
+While it may be possible to write a simple grammar (like this one) in pure JS,
+it would need to make use of the `conflicts` feature or not expose some tokens,
+and resolving conflicts is slow in tree-sitter.
+See https://github.com/stsewd/tree-sitter-comment/pull/33.
+
+If you are able to find a way to write this grammar in pure JS that doesn't make it slow, feel free to open a PR!
+
 ## Projects using this grammar
 
 - [nvim-treesitter](https://github.com/nvim-treesitter/nvim-treesitter)

diff --git a/docs/js/tree-sitter-comment.wasm b/docs/js/tree-sitter-comment.wasm
diff --git a/grammar.js b/grammar.js
@@ -41,33 +41,31 @@ module.exports = grammar({
   name: "comment",
 
   externals: ($) => [
-    $.name,
-    $.invalid_token
+    $._tag_name,
+    $._invalid_token
   ],
 
   rules: {
     source: ($) => repeat(
       choice(
+        // Explicitly end with a "stop" character to help tree-sitter disambiguate from a normal tag.
+        seq(alias($.simple_tag, $.tag), choice($._text, /\s/)),
         $.tag,
-        $._full_uri,
-        alias($._text, "text"),
+        $.uri,
+        $._text,
       ),
     ),
 
+    simple_tag: ($) => alias($._simple_tag_name, $.name),
+    _simple_tag_name: ($) => /[A-Z]([A-Z0-9_-]*[A-Z0-9])?/,
+
     tag: ($) => seq(
-      $.name,
-      optional($._user),
+      alias($._tag_name, $.name),
+      optional($._tag_annotation),
       ":",
     ),
 
-    _user: ($) => seq(
-      "(",
-      alias(/[^()]+/, $.user),
-      ")",
-    ),
-
-    // This token is split into two parts so the end character isn't included in the URI itself.
-    _full_uri: ($) => seq($.uri, choice(alias($._end_char, "text"), /\s/)),
+    _tag_annotation: ($) => seq("(", alias(/[^()]+/, $.annotation), ")"),
 
     // This token needs to be single regex, otherwise a partial match will result in an error.
     uri: ($) => get_uri_regex(),

diff --git a/src/grammar.json b/src/grammar.json
@@ -6,39 +6,79 @@
       "content": {
         "type": "CHOICE",
         "members": [
+          {
+            "type": "SEQ",
+            "members": [
+              {
+                "type": "ALIAS",
+                "content": {
+                  "type": "SYMBOL",
+                  "name": "simple_tag"
+                },
+                "named": true,
+                "value": "tag"
+              },
+              {
+                "type": "CHOICE",
+                "members": [
+                  {
+                    "type": "SYMBOL",
+                    "name": "_text"
+                  },
+                  {
+                    "type": "PATTERN",
+                    "value": "\\s"
+                  }
+                ]
+              }
+            ]
+          },
           {
             "type": "SYMBOL",
             "name": "tag"
           },
           {
             "type": "SYMBOL",
-            "name": "_full_uri"
+            "name": "uri"
           },
           {
-            "type": "ALIAS",
-            "content": {
-              "type": "SYMBOL",
-              "name": "_text"
-            },
-            "named": false,
-            "value": "text"
+            "type": "SYMBOL",
+            "name": "_text"
           }
         ]
       }
     },
+    "simple_tag": {
+      "type": "ALIAS",
+      "content": {
+        "type": "SYMBOL",
+        "name": "_simple_tag_name"
+      },
+      "named": true,
+      "value": "name"
+    },
+    "_simple_tag_name": {
+      "type": "PATTERN",
+      "value": "[A-Z]([A-Z0-9_-]*[A-Z0-9])?"
+    },
     "tag": {
       "type": "SEQ",
       "members": [
         {
-          "type": "SYMBOL",
-          "name": "name"
+          "type": "ALIAS",
+          "content": {
+            "type": "SYMBOL",
+            "name": "_tag_name"
+          },
+          "named": true,
+          "value": "name"
         },
         {
           "type": "CHOICE",
           "members": [
             {
               "type": "SYMBOL",
-              "name": "_user"
+              "name": "_tag_annotation"
             },
             {
               "type": "BLANK"
@@ -51,7 +91,7 @@
         }
       ]
     },
-    "_user": {
+    "_tag_annotation": {
       "type": "SEQ",
       "members": [
         {
@@ -65,41 +105,14 @@
             "value": "[^()]+"
           },
           "named": true,
-          "value": "user"
+          "value": "annotation"
         },
         {
           "type": "STRING",
           "value": ")"
         }
       ]
     },
-    "_full_uri": {
-      "type": "SEQ",
-      "members": [
-        {
-          "type": "SYMBOL",
-          "name": "uri"
-        },
-        {
-          "type": "CHOICE",
-          "members": [
-            {
-              "type": "ALIAS",
-              "content": {
-                "type": "SYMBOL",
-                "name": "_end_char"
-              },
-              "named": false,
-              "value": "text"
-            },
-            {
-              "type": "PATTERN",
-              "value": "\\s"
-            }
-          ]
-        }
-      ]
-    },
     "uri": {
       "type": "PATTERN",
       "value": "https?:\\/\\/([^\\s\\.,:;!\\?\\\\'\"\\}\\]\\)>]|[\\.,:;!\\?\\\\'\"\\}\\]\\)>][a-zA-Z0-9])+"
@@ -267,11 +280,11 @@
   "externals": [
     {
       "type": "SYMBOL",
-      "name": "name"
+      "name": "_tag_name"
     },
     {
       "type": "SYMBOL",
-      "name": "invalid_token"
+      "name": "_invalid_token"
     }
   ],
   "inline": [],

diff --git a/src/node-types.json b/src/node-types.json
@@ -27,21 +27,16 @@
       "required": true,
       "types": [
         {
-          "type": "name",
+          "type": "annotation",
           "named": true
         },
         {
-          "type": "user",
+          "type": "name",
           "named": true
         }
       ]
     }
   },
-  {
-    "type": "text",
-    "named": false,
-    "fields": {}
-  },
   {
     "type": "!",
     "named": false
@@ -111,15 +106,15 @@
     "named": false
   },
   {
-    "type": "name",
+    "type": "annotation",
     "named": true
   },
   {
-    "type": "uri",
+    "type": "name",
     "named": true
   },
   {
-    "type": "user",
+    "type": "uri",
     "named": true
   },
   {