From fde0238a0594d4e6532c6b0ef39482f9063974c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Fournier?= Date: Thu, 2 May 2024 17:58:45 +0200 Subject: [PATCH] Add lingua franca language (#993) --- README.md | 1 + languages.json | 9 ++++ src/language/embedding.rs | 36 ++++++++++----- src/language/language_type.rs | 70 +++++++++++++++++++++++++----- src/language/language_type.tera.rs | 25 +++++++++++ src/language/syntax.rs | 35 ++++++++++++++- tests/data/linguafranca.lf | 36 +++++++++++++++ 7 files changed, 188 insertions(+), 24 deletions(-) create mode 100644 tests/data/linguafranca.lf diff --git a/README.md b/README.md index e1b70b0b6..3256b72c3 100644 --- a/README.md +++ b/README.md @@ -438,6 +438,7 @@ Kotlin Lean Less LFE +Lingua Franca LinkerScript Liquid Lisp diff --git a/languages.json b/languages.json index bd399d434..14371adaf 100644 --- a/languages.json +++ b/languages.json @@ -878,6 +878,15 @@ "extensions": ["liquid"], "multi_line_comments": [[""], ["{% comment %}", "{% endcomment %}"]] }, + "LinguaFranca": { + "name": "Lingua Franca", + "line_comment": ["//", "#"], + "important_syntax": ["{="], + "multi_line_comments": [["/*", "*/"]], + "quotes": [["\\\"", "\\\""]], + "nested": true, + "extensions": ["lf"] + }, "LinkerScript": { "name": "LD Script", "multi_line_comments": [["/*", "*/"]], diff --git a/src/language/embedding.rs b/src/language/embedding.rs index 4a649ba14..0fb3fc2fe 100644 --- a/src/language/embedding.rs +++ b/src/language/embedding.rs @@ -19,6 +19,9 @@ pub static END_TEMPLATE: Lazy = Lazy::new(|| Regex::new(r#""#) pub static STARTING_MARKDOWN_REGEX: Lazy = Lazy::new(|| Regex::new(r#"```\S+\s"#).unwrap()); pub static ENDING_MARKDOWN_REGEX: Lazy = Lazy::new(|| Regex::new(r#"```\s?"#).unwrap()); +pub static STARTING_LF_BLOCK_REGEX: Lazy = Lazy::new(|| Regex::new(r#"\{="#).unwrap()); +pub static ENDING_LF_BLOCK_REGEX: Lazy = Lazy::new(|| Regex::new(r#"=}"#).unwrap()); + /// A memory of a regex matched. /// The values provided by `Self::start` and `Self::end` are in the same space as the /// start value supplied to `RegexCache::build` @@ -61,7 +64,8 @@ pub(crate) struct RegexCache<'a> { /// as well as the actual matches pub(crate) enum RegexFamily<'a> { HtmlLike(HtmlLike<'a>), - Markdown(Markdown<'a>), + LinguaFranca(SimpleCapture<'a>), + Markdown(SimpleCapture<'a>), Rust, } @@ -71,10 +75,11 @@ pub(crate) struct HtmlLike<'a> { start_template: Option]>>, } -pub(crate) struct Markdown<'a> { +pub(crate) struct SimpleCapture<'a> { starts: Option]>>, } + impl<'a> HtmlLike<'a> { pub fn start_script_in_range( &'a self, @@ -101,10 +106,22 @@ impl<'a> HtmlLike<'a> { } } -impl<'a> Markdown<'a> { +impl<'a> SimpleCapture<'a> { pub fn starts_in_range(&'a self, start: usize, end: usize) -> Option<&Capture<'a>> { filter_range(self.starts.as_ref()?, start, end).and_then(|mut it| it.next()) } + + fn make_capture(regex: &Regex, lines: &'a [u8], start: usize, end: usize) -> Option> { + let capture = SimpleCapture { + starts: save_captures(regex, lines, start, end), + }; + + if capture.starts.is_some() { + Some(capture) + } else { + None + } + } } fn filter_range<'a>( @@ -139,17 +156,12 @@ impl<'a> RegexCache<'a> { pub(crate) fn build(lang: LanguageType, lines: &'a [u8], start: usize, end: usize) -> Self { let inner = match lang { LanguageType::Markdown | LanguageType::UnrealDeveloperMarkdown => { - let markdown = Markdown { - starts: save_captures(&STARTING_MARKDOWN_REGEX, lines, start, end), - }; - - if markdown.starts.is_some() { - Some(RegexFamily::Markdown(markdown)) - } else { - None - } + SimpleCapture::make_capture(&STARTING_MARKDOWN_REGEX, lines, start, end).map(RegexFamily::Markdown) } LanguageType::Rust => Some(RegexFamily::Rust), + LanguageType::LinguaFranca => { + SimpleCapture::make_capture(&STARTING_LF_BLOCK_REGEX, lines, start, end).map(RegexFamily::LinguaFranca) + }, LanguageType::Html | LanguageType::RubyHtml | LanguageType::Svelte diff --git a/src/language/language_type.rs b/src/language/language_type.rs index 2cb949fb7..a15a6087a 100644 --- a/src/language/language_type.rs +++ b/src/language/language_type.rs @@ -16,6 +16,7 @@ use crate::{ use encoding_rs_io::DecodeReaderBytesBuilder; use grep_searcher::{LineIter, LineStep}; +use once_cell::sync::Lazy; use rayon::prelude::*; use serde::Serialize; @@ -65,13 +66,19 @@ impl LanguageType { pub fn parse_from_slice>(self, text: A, config: &Config) -> CodeStats { let text = text.as_ref(); - if self == LanguageType::Jupyter { + if self == Jupyter { return self .parse_jupyter(text.as_ref(), config) .unwrap_or_else(CodeStats::new); } - let syntax = SyntaxCounter::new(self); + let syntax = { + let mut syntax_mut = SyntaxCounter::new(self); + if self == LinguaFranca { + syntax_mut.lf_embedded_language = self.find_lf_target_language(text); + } + syntax_mut + }; if let Some(end) = syntax .shared @@ -178,6 +185,10 @@ impl LanguageType { // Add all the markdown blobs. *stats.blobs.entry(LanguageType::Markdown).or_default() += blob; } + LanguageContext::LinguaFranca => { + let child_lang = syntax.get_lf_target_language(); + *stats.blobs.entry(child_lang).or_default() += blob; + } LanguageContext::Html { language } => { stats.code += 1; // Add all the markdown blobs. @@ -275,6 +286,28 @@ impl LanguageType { Some(jupyter_stats) } + + /// The embedded language in LF is declared in a construct that looks like this: `target C;`, `target Python`. + /// This is the first thing in the file (although there may be comments before). + fn find_lf_target_language(&self, bytes: &[u8]) -> Option { + use regex::bytes::Regex; + static LF_TARGET_REGEX: Lazy = Lazy::new(|| Regex::new(r#"(?m)\btarget\s+(\w+)\s*($|;|\{)"#).unwrap()); + LF_TARGET_REGEX.captures(bytes) + .and_then(|captures| { + let name = captures.get(1).unwrap().as_bytes(); + if name == b"CCpp" { + // this is a special alias for the C target in LF + Some(C) + } else { + let name_str = &String::from_utf8_lossy(name); + let by_name = LanguageType::from_name(&name_str); + if by_name.is_none() { + trace!("LF target not recognized: {}", name_str); + } + by_name + } + }) + } } #[cfg(test)] @@ -288,22 +321,37 @@ mod tests { assert!(LanguageType::Rust.allows_nested()); } + + fn assert_stats(stats: &CodeStats, blanks: usize, code: usize, comments: usize) { + assert_eq!(stats.blanks, blanks, "expected {} blank lines", blanks); + assert_eq!(stats.code, code, "expected {} code lines", code); + assert_eq!(stats.comments, comments, "expected {} comment lines", comments); + } + #[test] fn jupyter_notebook_has_correct_totals() { let sample_notebook = fs::read_to_string(Path::new("tests").join("data").join("jupyter.ipynb")).unwrap(); - let CodeStats { - blanks, - code, - comments, - .. - } = LanguageType::Jupyter + let stats = LanguageType::Jupyter .parse_jupyter(sample_notebook.as_bytes(), &Config::default()) .unwrap(); - assert_eq!(blanks, 115); - assert_eq!(code, 528); - assert_eq!(comments, 333); + assert_stats(&stats, 115, 528, 333); + } + + #[test] + fn lf_embedded_language_is_counted() { + let file_text = + fs::read_to_string(Path::new("tests").join("data").join("linguafranca.lf")).unwrap(); + + let stats = LinguaFranca + .parse_from_str(file_text, &Config::default()); + + assert_stats(&stats, 9, 11, 8); + + assert_eq!(stats.blobs.len(), 1, "num embedded languages"); + let rust_stats = stats.blobs.get(&Rust).expect("should have a Rust entry"); + assert_stats(rust_stats, 2, 5, 1); } } diff --git a/src/language/language_type.tera.rs b/src/language/language_type.tera.rs index c0774fabf..6384f9bc6 100644 --- a/src/language/language_type.tera.rs +++ b/src/language/language_type.tera.rs @@ -322,6 +322,31 @@ impl LanguageType { } } + /// Get language from its name. + /// + /// ```no_run + /// use tokei::LanguageType; + /// + /// let rust = LanguageType::from_name("Rust"); + /// + /// assert_eq!(rust, Some(LanguageType::Rust)); + /// ``` + #[must_use] + pub fn from_name(name: &str) -> Option { + match name { + {% for key, value in languages -%} + {% if value.name and value.name != key -%} + | "{{value.name}}" + {% endif -%} + | "{{key}}" => Some({{key}}), + {% endfor %} + unknown => { + warn!("Unknown language name: {}", unknown); + None + }, + } + } + /// Get language from its MIME type if available. /// /// ```no_run diff --git a/src/language/syntax.rs b/src/language/syntax.rs index 5e1bc4a39..46141a144 100644 --- a/src/language/syntax.rs +++ b/src/language/syntax.rs @@ -7,9 +7,10 @@ use log::Level::Trace; use once_cell::sync::Lazy; use super::embedding::{ - RegexCache, RegexFamily, ENDING_MARKDOWN_REGEX, END_SCRIPT, END_STYLE, END_TEMPLATE, + RegexCache, RegexFamily, ENDING_MARKDOWN_REGEX, ENDING_LF_BLOCK_REGEX, END_SCRIPT, END_STYLE, END_TEMPLATE }; use crate::{stats::CodeStats, utils::ext::SliceExt, Config, LanguageType}; +use crate::LanguageType::LinguaFranca; /// Tracks the syntax of the language as well as the current state in the file. /// Current has what could be consider three types of mode. @@ -29,6 +30,7 @@ pub(crate) struct SyntaxCounter { pub(crate) quote_is_doc_quote: bool, pub(crate) stack: Vec<&'static str>, pub(crate) quote_is_verbatim: bool, + pub(crate) lf_embedded_language: Option } #[derive(Clone, Debug)] @@ -53,6 +55,7 @@ pub(crate) enum LanguageContext { Html { language: LanguageType, }, + LinguaFranca, Markdown { balanced: bool, language: LanguageType, @@ -133,6 +136,7 @@ impl SyntaxCounter { quote_is_doc_quote: false, quote_is_verbatim: false, stack: Vec::with_capacity(1), + lf_embedded_language: None, quote: None, } } @@ -152,6 +156,12 @@ impl SyntaxCounter { !self.stack.is_empty() } + pub(crate) fn get_lf_target_language(&self) -> LanguageType { + // in case the target declaration was not found, default it to that language + const DEFAULT_LANG: LanguageType = LinguaFranca; + self.lf_embedded_language.unwrap_or(DEFAULT_LANG) + } + #[inline] pub(crate) fn parse_line_comment(&self, window: &[u8]) -> bool { if self.quote.is_some() || !self.stack.is_empty() { @@ -434,6 +444,29 @@ impl SyntaxCounter { doc_block, )) } + RegexFamily::LinguaFranca(lf) => { + let opening_fence = lf.starts_in_range(start, end)?; + let start_of_code = opening_fence.end(); + let closing_fence = ENDING_LF_BLOCK_REGEX.find(&lines[start_of_code..]); + let end_of_code = closing_fence + .map_or_else(|| lines.len(), + |fence| start_of_code + fence.start()); + + let block_contents = &lines[start_of_code..end_of_code]; + trace!( + "LF block: {:?}", + String::from_utf8_lossy(block_contents) + ); + let stats = + self.get_lf_target_language().parse_from_slice(block_contents.trim_first_and_last_line_of_whitespace(), config); + trace!("-> stats: {:?}", stats); + + Some(FileContext::new( + LanguageContext::LinguaFranca, + end_of_code, + stats, + )) + } RegexFamily::HtmlLike(html) => { if let Some(mut captures) = html.start_script_in_range(start, end) { let start_of_code = captures.next().unwrap().end(); diff --git a/tests/data/linguafranca.lf b/tests/data/linguafranca.lf new file mode 100644 index 000000000..e43ee017d --- /dev/null +++ b/tests/data/linguafranca.lf @@ -0,0 +1,36 @@ +// 36 lines 16 code 9 comments 11 blanks + +target Rust; + +// A C style comment +import KeyboardEvents from "KeyboardEvents.lf"; + +/* A block comment */ + # a python like comment + +main reactor Snake(grid_side: usize(32), + food_limit: u32(2)) { + + // counts as 2 lines of Rust code and one blank + preamble {= + use crate::snakes::*; + + use rand::prelude::*; + =} + + /// rust doc comment + keyboard = new KeyboardEvents(); + + // T + state snake: CircularSnake ({= CircularSnake::new(grid_side) =}); + state grid: SnakeGrid ({= SnakeGrid::new(grid_side, &snake) =}); + state food_on_grid: u32(0); + + + // 1 line of rust code + reaction(shutdown) {= + // comment in Rust + + println!("New high score: {}", self.snake.len()); + =} +}