Skip to content

Commit

Permalink
Add WebVTT lexer (#707) (#1032)
Browse files Browse the repository at this point in the history
Implements a WebVTT lexer (#707)
  • Loading branch information
dschuessler authored Dec 29, 2024
1 parent f3ff20b commit 009385f
Show file tree
Hide file tree
Showing 17 changed files with 1,906 additions and 0 deletions.
283 changes: 283 additions & 0 deletions lexers/embedded/webvtt.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,283 @@
<lexer>
<config>
<name>WebVTT</name>
<alias>vtt</alias>
<filename>*.vtt</filename>
<mime_type>text/vtt</mime_type>
</config>
<!--
The WebVTT spec refers to a WebVTT line terminator as either CRLF, CR or LF.
(https://www.w3.org/TR/webvtt1/#webvtt-line-terminator) However, with this
definition it is unclear whether CRLF is one line terminator (CRLF) or two
line terminators (CR and LF).
To work around this ambiguity, only CRLF and LF are considered as line terminators.
To my knowledge only classic Mac OS uses CR as line terminators, so the lexer should
still work for most files.
-->
<rules>
<!-- https://www.w3.org/TR/webvtt1/#webvtt-file-body -->
<state name="root">
<rule pattern="(\AWEBVTT)((?:[ \t][^\r\n]*)?(?:\r?\n){2,})">
<bygroups>
<token type="Keyword" />
<token type="Text" />
</bygroups>
</rule>
<rule pattern="(^REGION)([ \t]*$)">
<bygroups>
<token type="Keyword" />
<token type="Text" />
</bygroups>
<push state="region-settings-list" />
</rule>
<rule
pattern="(^STYLE)([ \t]*$)((?:(?!&#45;&#45;&gt;)[\s\S])*?)((?:\r?\n){2})">
<bygroups>
<token type="Keyword" />
<token type="Text" />
<using lexer="CSS" />
<token type="Text" />
</bygroups>
</rule>
<rule>
<include state="comment" />
</rule>
<rule
pattern="(?=((?![^\r\n]*&#45;&#45;&gt;)[^\r\n]*\r?\n)?(\d{2}:)?(?:[0-5][0-9]):(?:[0-5][0-9])\.\d{3}[ \t]+&#45;&#45;&gt;[ \t]+(\d{2}:)?(?:[0-5][0-9]):(?:[0-5][0-9])\.\d{3})"
>
<push state="cues" />
</rule>
</state>

<!-- https://www.w3.org/TR/webvtt1/#webvtt-region-settings-list -->
<state name="region-settings-list">
<rule pattern="(?: |\t|\r?\n(?!\r?\n))+">
<token type="Text" />
</rule>
<rule pattern="(?:\r?\n){2}">
<token type="Text" />
<pop depth="1" />
</rule>
<rule pattern="(id)(:)(?!&#45;&#45;&gt;)(\S+)">
<bygroups>
<token type="Keyword" />
<token type="Punctuation" />
<token type="Literal" />
</bygroups>
</rule>
<rule pattern="(width)(:)((?:[1-9]?\d|100)(?:\.\d+)?)(%)">
<bygroups>
<token type="Keyword" />
<token type="Punctuation" />
<token type="Literal" />
<token type="KeywordType" />
</bygroups>
</rule>
<rule pattern="(lines)(:)(\d+)">
<bygroups>
<token type="Keyword" />
<token type="Punctuation" />
<token type="Literal" />
</bygroups>
</rule>
<rule
pattern="(regionanchor|viewportanchor)(:)((?:[1-9]?\d|100)(?:\.\d+)?)(%)(,)((?:[1-9]?\d|100)(?:\.\d+)?)(%)">
<bygroups>
<token type="Keyword" />
<token type="Punctuation" />
<token type="Literal" />
<token type="KeywordType" />
<token type="Punctuation" />
<token type="Literal" />
<token type="KeywordType" />
</bygroups>
</rule>
<rule pattern="(scroll)(:)(up)">
<bygroups>
<token type="Keyword" />
<token type="Punctuation" />
<token type="KeywordConstant" />
</bygroups>
</rule>
</state>

<!-- https://www.w3.org/TR/webvtt1/#webvtt-comment-block -->
<state name="comment">
<rule
pattern="^NOTE( |\t|\r?\n)((?!&#45;&#45;&gt;)[\s\S])*?(?:(\r?\n){2}|\Z)">
<token type="Comment" />
</rule>
</state>

<!--
"Zero or more WebVTT cue blocks and WebVTT comment blocks separated from each other by one or more
WebVTT line terminators." (https://www.w3.org/TR/webvtt1/#file-structure)
-->
<state name="cues">
<rule
pattern="(?:((?!&#45;&#45;&gt;)[^\r\n]+)?(\r?\n))?((?:\d{2}:)?(?:[0-5][0-9]):(?:[0-5][0-9])\.\d{3})([ \t]+)(&#45;&#45;&gt;)([ \t]+)((?:\d{2}:)?(?:[0-5][0-9]):(?:[0-5][0-9])\.\d{3})([ \t]*)">
<bygroups>
<token type="Name" />
<token type="Text" />
<token type="LiteralDate" />
<token type="Text" />
<token type="Operator" />
<token type="Text" />
<token type="LiteralDate" />
<token type="Text" />
</bygroups>
<push state="cue-settings-list" />
</rule>
<rule>
<include state="comment" />
</rule>
</state>

<!-- https://www.w3.org/TR/webvtt1/#webvtt-cue-settings-list -->
<state name="cue-settings-list">
<rule pattern="[ \t]+">
<token type="Text" />
</rule>
<rule pattern="(vertical)(:)?(rl|lr)?">
<bygroups>
<token type="Keyword" />
<token type="Punctuation" />
<token type="KeywordConstant" />
</bygroups>
</rule>
<rule
pattern="(line)(:)?(?:(?:((?:[1-9]?\d|100)(?:\.\d+)?)(%)|(-?\d+))(?:(,)(start|center|end))?)?">
<bygroups>
<token type="Keyword" />
<token type="Punctuation" />
<token type="Literal" />
<token type="KeywordType" />
<token type="Literal" />
<token type="Punctuation" />
<token type="KeywordConstant" />
</bygroups>
</rule>
<rule
pattern="(position)(:)?(?:(?:((?:[1-9]?\d|100)(?:\.\d+)?)(%)|(-?\d+))(?:(,)(line-left|center|line-right))?)?">
<bygroups>
<token type="Keyword" />
<token type="Punctuation" />
<token type="Literal" />
<token type="KeywordType" />
<token type="Literal" />
<token type="Punctuation" />
<token type="KeywordConstant" />
</bygroups>
</rule>
<rule pattern="(size)(:)?(?:((?:[1-9]?\d|100)(?:\.\d+)?)(%))?">
<bygroups>
<token type="Keyword" />
<token type="Punctuation" />
<token type="Literal" />
<token type="KeywordType" />
</bygroups>
</rule>
<rule pattern="(align)(:)?(start|center|end|left|right)?">
<bygroups>
<token type="Keyword" />
<token type="Punctuation" />
<token type="KeywordConstant" />
</bygroups>
</rule>
<rule pattern="(region)(:)?((?![^\r\n]*&#45;&#45;&gt;(?=[ \t]+?))[^ \t\r\n]+)?">
<bygroups>
<token type="Keyword" />
<token type="Punctuation" />
<token type="Literal" />
</bygroups>
</rule>
<rule
pattern="(?=\r?\n)">
<push state="cue-payload" />
</rule>
</state>

<!-- https://www.w3.org/TR/webvtt1/#cue-payload -->
<state name="cue-payload">
<rule pattern="(\r?\n){2,}">
<token type="Text" />
<pop depth="2" />
</rule>
<rule pattern="[^&lt;&amp;]+?">
<token type="Text" />
</rule>
<rule pattern="&amp;(#\d+|#x[0-9A-Fa-f]+|[a-zA-Z0-9]+);">
<token type="Text" />
</rule>
<rule pattern="(?=&lt;)">
<token type="Text" />
<push state="cue-span-tag" />
</rule>
</state>
<state name="cue-span-tag">
<rule
pattern="&lt;(?=c|i|b|u|ruby|rt|v|lang|(?:\d{2}:)?(?:[0-5][0-9]):(?:[0-5][0-9])\.\d{3})">
<token type="Punctuation" />
<push state="cue-span-start-tag-name" />
</rule>
<rule pattern="(&lt;/)(c|i|b|u|ruby|rt|v|lang)">
<bygroups>
<token type="Punctuation" />
<token type="NameTag" />
</bygroups>
</rule>
<rule pattern="&gt;">
<token type="Punctuation" />
<pop depth="1" />
</rule>
</state>
<state name="cue-span-start-tag-name">
<rule pattern="(c|i|b|u|ruby|rt)|((?:\d{2}:)?(?:[0-5][0-9]):(?:[0-5][0-9])\.\d{3})">
<bygroups>
<token type="NameTag" />
<token type="LiteralDate" />
</bygroups>
<push state="cue-span-classes-without-annotations" />
</rule>
<rule pattern="v|lang">
<token type="NameTag" />
<push state="cue-span-classes-with-annotations" />
</rule>
</state>
<state name="cue-span-classes-without-annotations">
<rule>
<include state="cue-span-classes" />
</rule>
<rule pattern="(?=&gt;)">
<pop depth="2" />
</rule>
</state>
<state name="cue-span-classes-with-annotations">
<rule>
<include state="cue-span-classes" />
</rule>
<rule pattern="(?=[ \t])">
<push state="cue-span-start-tag-annotations" />
</rule>
</state>
<state name="cue-span-classes">
<rule pattern="(\.)([^ \t\n\r&amp;&lt;&gt;\.]+)">
<bygroups>
<token type="Punctuation" />
<token type="NameTag" />
</bygroups>
</rule>
</state>
<state name="cue-span-start-tag-annotations">
<rule
pattern="[ \t](?:[^\n\r&amp;&gt;]|&amp;(?:#\d+|#x[0-9A-Fa-f]+|[a-zA-Z0-9]+);)+">
<token type="Text" />
</rule>
<rule pattern="(?=&gt;)">
<token type="Text" />
<pop depth="3" />
</rule>
</state>
</rules>
</lexer>
Loading

0 comments on commit 009385f

Please sign in to comment.