From fde0238a0594d4e6532c6b0ef39482f9063974c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Fournier?= <clem.fournier@proton.me>
Date: Thu, 2 May 2024 17:58:45 +0200
Subject: [PATCH] Add lingua franca language (#993)

---
 README.md                          |  1 +
 languages.json                     |  9 ++++
 src/language/embedding.rs          | 36 ++++++++++-----
 src/language/language_type.rs      | 70 +++++++++++++++++++++++++-----
 src/language/language_type.tera.rs | 25 +++++++++++
 src/language/syntax.rs             | 35 ++++++++++++++-
 tests/data/linguafranca.lf         | 36 +++++++++++++++
 7 files changed, 188 insertions(+), 24 deletions(-)
 create mode 100644 tests/data/linguafranca.lf
diff --git a/README.md b/README.md
index e1b70b0b6..3256b72c3 100644
--- a/README.md
+++ b/README.md
@@ -438,6 +438,7 @@ Kotlin
 Lean
 Less
 LFE
+Lingua Franca
 LinkerScript
 Liquid
 Lisp
diff --git a/languages.json b/languages.json
index bd399d434..14371adaf 100644
--- a/languages.json
+++ b/languages.json
@@ -878,6 +878,15 @@
       "extensions": ["liquid"],
       "multi_line_comments": [["<!--", "-->"], ["{% comment %}", "{% endcomment %}"]]
     },
+    "LinguaFranca": {
+      "name": "Lingua Franca",
+      "line_comment": ["//", "#"],
+      "important_syntax": ["{="],
+      "multi_line_comments": [["/*", "*/"]],
+      "quotes": [["\\\"", "\\\""]],
+      "nested": true,
+      "extensions": ["lf"]
+    },
     "LinkerScript": {
       "name": "LD Script",
       "multi_line_comments": [["/*", "*/"]],
diff --git a/src/language/embedding.rs b/src/language/embedding.rs
index 4a649ba14..0fb3fc2fe 100644
--- a/src/language/embedding.rs
+++ b/src/language/embedding.rs
@@ -19,6 +19,9 @@ pub static END_TEMPLATE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"</template>"#)
 pub static STARTING_MARKDOWN_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r#"```\S+\s"#).unwrap());
 pub static ENDING_MARKDOWN_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r#"```\s?"#).unwrap());
 
+pub static STARTING_LF_BLOCK_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\{="#).unwrap());
+pub static ENDING_LF_BLOCK_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r#"=}"#).unwrap());
+
 /// A memory of a regex matched.
 /// The values provided by `Self::start` and `Self::end` are in the same space as the
 /// start value supplied to `RegexCache::build`
@@ -61,7 +64,8 @@ pub(crate) struct RegexCache<'a> {
 /// as well as the actual matches
 pub(crate) enum RegexFamily<'a> {
     HtmlLike(HtmlLike<'a>),
-    Markdown(Markdown<'a>),
+    LinguaFranca(SimpleCapture<'a>),
+    Markdown(SimpleCapture<'a>),
     Rust,
 }
 
@@ -71,10 +75,11 @@ pub(crate) struct HtmlLike<'a> {
     start_template: Option<Box<[Capture<'a>]>>,
 }
 
-pub(crate) struct Markdown<'a> {
+pub(crate) struct SimpleCapture<'a> {
     starts: Option<Box<[Capture<'a>]>>,
 }
 
+
 impl<'a> HtmlLike<'a> {
     pub fn start_script_in_range(
         &'a self,
@@ -101,10 +106,22 @@ impl<'a> HtmlLike<'a> {
     }
 }
 
-impl<'a> Markdown<'a> {
+impl<'a> SimpleCapture<'a> {
     pub fn starts_in_range(&'a self, start: usize, end: usize) -> Option<&Capture<'a>> {
         filter_range(self.starts.as_ref()?, start, end).and_then(|mut it| it.next())
     }
+
+    fn make_capture(regex: &Regex, lines: &'a [u8], start: usize, end: usize) -> Option<SimpleCapture<'a>> {
+        let capture = SimpleCapture {
+            starts: save_captures(regex, lines, start, end),
+        };
+
+        if capture.starts.is_some() {
+            Some(capture)
+        } else {
+            None
+        }
+    }
 }
 
 fn filter_range<'a>(
@@ -139,17 +156,12 @@ impl<'a> RegexCache<'a> {
     pub(crate) fn build(lang: LanguageType, lines: &'a [u8], start: usize, end: usize) -> Self {
         let inner = match lang {
             LanguageType::Markdown | LanguageType::UnrealDeveloperMarkdown => {
-                let markdown = Markdown {
-                    starts: save_captures(&STARTING_MARKDOWN_REGEX, lines, start, end),
-                };
-
-                if markdown.starts.is_some() {
-                    Some(RegexFamily::Markdown(markdown))
-                } else {
-                    None
-                }
+                SimpleCapture::make_capture(&STARTING_MARKDOWN_REGEX, lines, start, end).map(RegexFamily::Markdown)
             }
             LanguageType::Rust => Some(RegexFamily::Rust),
+            LanguageType::LinguaFranca => {
+                SimpleCapture::make_capture(&STARTING_LF_BLOCK_REGEX, lines, start, end).map(RegexFamily::LinguaFranca)
+            },
             LanguageType::Html
             | LanguageType::RubyHtml
             | LanguageType::Svelte
diff --git a/src/language/language_type.rs b/src/language/language_type.rs
index 2cb949fb7..a15a6087a 100644
--- a/src/language/language_type.rs
+++ b/src/language/language_type.rs
@@ -16,6 +16,7 @@ use crate::{
 
 use encoding_rs_io::DecodeReaderBytesBuilder;
 use grep_searcher::{LineIter, LineStep};
+use once_cell::sync::Lazy;
 use rayon::prelude::*;
 use serde::Serialize;
 
@@ -65,13 +66,19 @@ impl LanguageType {
     pub fn parse_from_slice<A: AsRef<[u8]>>(self, text: A, config: &Config) -> CodeStats {
         let text = text.as_ref();
 
-        if self == LanguageType::Jupyter {
+        if self == Jupyter {
             return self
                 .parse_jupyter(text.as_ref(), config)
                 .unwrap_or_else(CodeStats::new);
         }
 
-        let syntax = SyntaxCounter::new(self);
+        let syntax = {
+            let mut syntax_mut = SyntaxCounter::new(self);
+            if self == LinguaFranca {
+                syntax_mut.lf_embedded_language = self.find_lf_target_language(text);
+            }
+            syntax_mut
+        };
 
         if let Some(end) = syntax
             .shared
@@ -178,6 +185,10 @@ impl LanguageType {
                                 // Add all the markdown blobs.
                                 *stats.blobs.entry(LanguageType::Markdown).or_default() += blob;
                             }
+                            LanguageContext::LinguaFranca => {
+                                let child_lang = syntax.get_lf_target_language();
+                                *stats.blobs.entry(child_lang).or_default() += blob;
+                            }
                             LanguageContext::Html { language } => {
                                 stats.code += 1;
                                 // Add all the markdown blobs.
@@ -275,6 +286,28 @@ impl LanguageType {
 
         Some(jupyter_stats)
     }
+
+    /// The embedded language in LF is declared in a construct that looks like this: `target C;`, `target Python`.
+    /// This is the first thing in the file (although there may be comments before).
+    fn find_lf_target_language(&self, bytes: &[u8]) -> Option<LanguageType> {
+        use regex::bytes::Regex;
+        static LF_TARGET_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r#"(?m)\btarget\s+(\w+)\s*($|;|\{)"#).unwrap());
+        LF_TARGET_REGEX.captures(bytes)
+            .and_then(|captures| {
+                let name = captures.get(1).unwrap().as_bytes();
+                if name == b"CCpp" {
+                    // this is a special alias for the C target in LF
+                    Some(C)
+                } else {
+                    let name_str = &String::from_utf8_lossy(name);
+                    let by_name = LanguageType::from_name(&name_str);
+                    if by_name.is_none() {
+                        trace!("LF target not recognized: {}", name_str);
+                    }
+                    by_name
+                }
+            })
+    }
 }
 
 #[cfg(test)]
@@ -288,22 +321,37 @@ mod tests {
         assert!(LanguageType::Rust.allows_nested());
     }
 
+
+    fn assert_stats(stats: &CodeStats, blanks: usize, code: usize, comments: usize) {
+        assert_eq!(stats.blanks, blanks, "expected {} blank lines", blanks);
+        assert_eq!(stats.code, code, "expected {} code lines", code);
+        assert_eq!(stats.comments, comments, "expected {} comment lines", comments);
+    }
+
     #[test]
     fn jupyter_notebook_has_correct_totals() {
         let sample_notebook =
             fs::read_to_string(Path::new("tests").join("data").join("jupyter.ipynb")).unwrap();
 
-        let CodeStats {
-            blanks,
-            code,
-            comments,
-            ..
-        } = LanguageType::Jupyter
+        let stats = LanguageType::Jupyter
             .parse_jupyter(sample_notebook.as_bytes(), &Config::default())
             .unwrap();
 
-        assert_eq!(blanks, 115);
-        assert_eq!(code, 528);
-        assert_eq!(comments, 333);
+        assert_stats(&stats, 115, 528, 333);
+    }
+
+    #[test]
+    fn lf_embedded_language_is_counted() {
+        let file_text =
+            fs::read_to_string(Path::new("tests").join("data").join("linguafranca.lf")).unwrap();
+
+        let stats = LinguaFranca
+            .parse_from_str(file_text, &Config::default());
+
+        assert_stats(&stats, 9, 11, 8);
+
+        assert_eq!(stats.blobs.len(), 1, "num embedded languages");
+        let rust_stats = stats.blobs.get(&Rust).expect("should have a Rust entry");
+        assert_stats(rust_stats, 2, 5, 1);
     }
 }
diff --git a/src/language/language_type.tera.rs b/src/language/language_type.tera.rs
index c0774fabf..6384f9bc6 100644
--- a/src/language/language_type.tera.rs
+++ b/src/language/language_type.tera.rs
@@ -322,6 +322,31 @@ impl LanguageType {
         }
     }
 
+    /// Get language from its name, matching either the variant identifier or its display name.
+    ///
+    /// ```no_run
+    /// use tokei::LanguageType;
+    ///
+    /// let rust = LanguageType::from_name("Rust");
+    ///
+    /// assert_eq!(rust, Some(LanguageType::Rust));
+    /// ```
+    #[must_use]
+    pub fn from_name(name: &str) -> Option<Self> {
+        match name {
+            {% for key, value in languages -%}
+                {% if value.name and value.name != key -%}
+                    | "{{value.name}}"
+                {% endif -%}
+                    | "{{key}}" => Some({{key}}),
+            {% endfor %}
+            unknown => {
+                warn!("Unknown language name: {}", unknown);
+                None
+            },
+        }
+    }
+
     /// Get language from its MIME type if available.
     ///
     /// ```no_run
diff --git a/src/language/syntax.rs b/src/language/syntax.rs
index 5e1bc4a39..46141a144 100644
--- a/src/language/syntax.rs
+++ b/src/language/syntax.rs
@@ -7,9 +7,10 @@ use log::Level::Trace;
 use once_cell::sync::Lazy;
 
 use super::embedding::{
-    RegexCache, RegexFamily, ENDING_MARKDOWN_REGEX, END_SCRIPT, END_STYLE, END_TEMPLATE,
+    RegexCache, RegexFamily, ENDING_MARKDOWN_REGEX, ENDING_LF_BLOCK_REGEX, END_SCRIPT, END_STYLE, END_TEMPLATE
 };
 use crate::{stats::CodeStats, utils::ext::SliceExt, Config, LanguageType};
+use crate::LanguageType::LinguaFranca;
 
 /// Tracks the syntax of the language as well as the current state in the file.
 /// Current has what could be consider three types of mode.
@@ -29,6 +30,7 @@ pub(crate) struct SyntaxCounter {
     pub(crate) quote_is_doc_quote: bool,
     pub(crate) stack: Vec<&'static str>,
     pub(crate) quote_is_verbatim: bool,
+    pub(crate) lf_embedded_language: Option<LanguageType>
 }
 
 #[derive(Clone, Debug)]
@@ -53,6 +55,7 @@ pub(crate) enum LanguageContext {
     Html {
         language: LanguageType,
     },
+    LinguaFranca,
     Markdown {
         balanced: bool,
         language: LanguageType,
@@ -133,6 +136,7 @@ impl SyntaxCounter {
             quote_is_doc_quote: false,
             quote_is_verbatim: false,
             stack: Vec::with_capacity(1),
+            lf_embedded_language: None,
             quote: None,
         }
     }
@@ -152,6 +156,12 @@ impl SyntaxCounter {
         !self.stack.is_empty()
     }
 
+    pub(crate) fn get_lf_target_language(&self) -> LanguageType {
+        // If no `target` declaration was found, attribute embedded blocks to Lingua Franca itself.
+        const DEFAULT_LANG: LanguageType = LinguaFranca;
+        self.lf_embedded_language.unwrap_or(DEFAULT_LANG)
+    }
+
     #[inline]
     pub(crate) fn parse_line_comment(&self, window: &[u8]) -> bool {
         if self.quote.is_some() || !self.stack.is_empty() {
@@ -434,6 +444,29 @@ impl SyntaxCounter {
                     doc_block,
                 ))
             }
+            RegexFamily::LinguaFranca(lf) => {
+                let opening_fence = lf.starts_in_range(start, end)?;
+                let start_of_code = opening_fence.end();
+                let closing_fence = ENDING_LF_BLOCK_REGEX.find(&lines[start_of_code..]);
+                let end_of_code = closing_fence
+                    .map_or_else(|| lines.len(),
+                                 |fence| start_of_code + fence.start());
+
+                let block_contents = &lines[start_of_code..end_of_code];
+                trace!(
+                    "LF block: {:?}",
+                    String::from_utf8_lossy(block_contents)
+                );
+                let stats =
+                    self.get_lf_target_language().parse_from_slice(block_contents.trim_first_and_last_line_of_whitespace(), config);
+                trace!("-> stats: {:?}", stats);
+
+                Some(FileContext::new(
+                    LanguageContext::LinguaFranca,
+                    end_of_code,
+                    stats,
+                ))
+            }
             RegexFamily::HtmlLike(html) => {
                 if let Some(mut captures) = html.start_script_in_range(start, end) {
                     let start_of_code = captures.next().unwrap().end();
diff --git a/tests/data/linguafranca.lf b/tests/data/linguafranca.lf
new file mode 100644
index 000000000..e43ee017d
--- /dev/null
+++ b/tests/data/linguafranca.lf
@@ -0,0 +1,36 @@
+// 36 lines 16 code 9 comments 11 blanks
+
+target Rust;
+
+// A C style comment
+import KeyboardEvents from "KeyboardEvents.lf";
+
+/* A block comment */
+  # a python like comment
+
+main reactor Snake(grid_side: usize(32),
+                   food_limit: u32(2)) {
+
+    // counts as 2 lines of Rust code and one blank
+    preamble {=
+        use crate::snakes::*;
+
+        use rand::prelude::*;
+    =}
+
+    /// rust doc comment
+    keyboard = new KeyboardEvents();
+
+    // T
+    state snake: CircularSnake ({= CircularSnake::new(grid_side) =});
+    state grid: SnakeGrid ({= SnakeGrid::new(grid_side, &snake) =});
+    state food_on_grid: u32(0);
+
+
+    // 1 line of Rust code, 1 comment and 1 blank
+    reaction(shutdown) {=
+        // comment in Rust
+
+        println!("New high score: {}", self.snake.len());
+    =}
+}