Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add lingua franca language #993

Merged
merged 2 commits into from
May 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,7 @@ KakouneScript
Kotlin
Lean
Less
Lingua Franca
LinkerScript
Liquid
Lisp
Expand Down
9 changes: 9 additions & 0 deletions languages.json
Original file line number Diff line number Diff line change
Expand Up @@ -817,6 +817,15 @@
"extensions": ["liquid"],
"multi_line_comments": [["<!--", "-->"], ["{% comment %}", "{% endcomment %}"]]
},
"LinguaFranca": {
"name": "Lingua Franca",
"line_comment": ["//", "#"],
"important_syntax": ["{="],
"multi_line_comments": [["/*", "*/"]],
"quotes": [["\\\"", "\\\""]],
"nested": true,
"extensions": ["lf"]
},
"LinkerScript": {
"name": "LD Script",
"line_comment": ["//"],
Expand Down
36 changes: 24 additions & 12 deletions src/language/embedding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ pub static END_TEMPLATE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"</template>"#)
pub static STARTING_MARKDOWN_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r#"```\S+\s"#).unwrap());
pub static ENDING_MARKDOWN_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r#"```\s?"#).unwrap());

/// Matches the literal `{=` delimiter that opens an embedded code block in a Lingua Franca file.
pub static STARTING_LF_BLOCK_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\{="#).unwrap());
/// Matches the literal `=}` delimiter that closes an embedded code block in a Lingua Franca file.
pub static ENDING_LF_BLOCK_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r#"=}"#).unwrap());

/// A memory of a regex matched.
/// The values provided by `Self::start` and `Self::end` are in the same space as the
/// start value supplied to `RegexCache::build`
Expand Down Expand Up @@ -61,7 +64,8 @@ pub(crate) struct RegexCache<'a> {
/// as well as the actual matches
pub(crate) enum RegexFamily<'a> {
HtmlLike(HtmlLike<'a>),
Markdown(Markdown<'a>),
LinguaFranca(SimpleCapture<'a>),
Markdown(SimpleCapture<'a>),
Rust,
}

Expand All @@ -71,10 +75,11 @@ pub(crate) struct HtmlLike<'a> {
start_template: Option<Box<[Capture<'a>]>>,
}

pub(crate) struct Markdown<'a> {
pub(crate) struct SimpleCapture<'a> {
starts: Option<Box<[Capture<'a>]>>,
}


impl<'a> HtmlLike<'a> {
pub fn start_script_in_range(
&'a self,
Expand All @@ -101,10 +106,22 @@ impl<'a> HtmlLike<'a> {
}
}

impl<'a> Markdown<'a> {
impl<'a> SimpleCapture<'a> {
/// Returns the first saved capture that `filter_range` yields for the given `start`/`end`,
/// or `None` when no captures were saved or `filter_range` produces nothing.
pub fn starts_in_range(&'a self, start: usize, end: usize) -> Option<&Capture<'a>> {
    let saved = self.starts.as_ref()?;
    let mut in_range = filter_range(saved, start, end)?;
    in_range.next()
}

/// Collects the matches of `regex` through `save_captures` and wraps them in a `SimpleCapture`.
///
/// Returns `None` when `save_captures` returns `None`, so a returned value always has
/// `starts` populated.
fn make_capture(regex: &Regex, lines: &'a [u8], start: usize, end: usize) -> Option<SimpleCapture<'a>> {
    save_captures(regex, lines, start, end).map(|starts| SimpleCapture {
        starts: Some(starts),
    })
}
}

fn filter_range<'a>(
Expand Down Expand Up @@ -139,17 +156,12 @@ impl<'a> RegexCache<'a> {
pub(crate) fn build(lang: LanguageType, lines: &'a [u8], start: usize, end: usize) -> Self {
let inner = match lang {
LanguageType::Markdown | LanguageType::UnrealDeveloperMarkdown => {
let markdown = Markdown {
starts: save_captures(&STARTING_MARKDOWN_REGEX, lines, start, end),
};

if markdown.starts.is_some() {
Some(RegexFamily::Markdown(markdown))
} else {
None
}
SimpleCapture::make_capture(&STARTING_MARKDOWN_REGEX, lines, start, end).map(RegexFamily::Markdown)
}
LanguageType::Rust => Some(RegexFamily::Rust),
LanguageType::LinguaFranca => {
SimpleCapture::make_capture(&STARTING_LF_BLOCK_REGEX, lines, start, end).map(RegexFamily::LinguaFranca)
},
LanguageType::Html
| LanguageType::RubyHtml
| LanguageType::Svelte
Expand Down
70 changes: 59 additions & 11 deletions src/language/language_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ use crate::{

use encoding_rs_io::DecodeReaderBytesBuilder;
use grep_searcher::{LineIter, LineStep};
use once_cell::sync::Lazy;
use rayon::prelude::*;

use self::LanguageType::*;
Expand Down Expand Up @@ -56,13 +57,19 @@ impl LanguageType {
pub fn parse_from_slice<A: AsRef<[u8]>>(self, text: A, config: &Config) -> CodeStats {
let text = text.as_ref();

if self == LanguageType::Jupyter {
if self == Jupyter {
return self
.parse_jupyter(text.as_ref(), config)
.unwrap_or_else(CodeStats::new);
}

let syntax = SyntaxCounter::new(self);
let syntax = {
let mut syntax_mut = SyntaxCounter::new(self);
if self == LinguaFranca {
syntax_mut.lf_embedded_language = self.find_lf_target_language(text);
}
syntax_mut
};

if let Some(end) = syntax
.shared
Expand Down Expand Up @@ -169,6 +176,10 @@ impl LanguageType {
// Add all the markdown blobs.
*stats.blobs.entry(LanguageType::Markdown).or_default() += blob;
}
LanguageContext::LinguaFranca => {
let child_lang = syntax.get_lf_target_language();
*stats.blobs.entry(child_lang).or_default() += blob;
}
LanguageContext::Html { language } => {
stats.code += 1;
// Add all the markdown blobs.
Expand Down Expand Up @@ -266,6 +277,28 @@ impl LanguageType {

Some(jupyter_stats)
}

/// Looks up the target language declared by a Lingua Franca file.
///
/// The declaration looks like `target C;` or `target Python` and is the first construct in
/// the file (comments may precede it). The first match of the target regex in `bytes` is used.
/// Returns `None` when no declaration matches or when the declared name is not a language
/// known to `LanguageType::from_name`.
fn find_lf_target_language(&self, bytes: &[u8]) -> Option<LanguageType> {
    use regex::bytes::Regex;
    static LF_TARGET_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r#"(?m)\btarget\s+(\w+)\s*($|;|\{)"#).unwrap());

    let captures = LF_TARGET_REGEX.captures(bytes)?;
    // Group 1 is not optional in the pattern, so it is present whenever the regex matched.
    let target = captures.get(1).unwrap().as_bytes();

    if target == b"CCpp" {
        // this is a special alias for the C target in LF
        return Some(C);
    }

    let target_name = String::from_utf8_lossy(target);
    let resolved = LanguageType::from_name(&target_name);
    if resolved.is_none() {
        trace!("LF target not recognized: {}", target_name);
    }
    resolved
}
}

#[cfg(test)]
Expand All @@ -279,22 +312,37 @@ mod tests {
assert!(LanguageType::Rust.allows_nested());
}


fn assert_stats(stats: &CodeStats, blanks: usize, code: usize, comments: usize) {
assert_eq!(stats.blanks, blanks, "expected {} blank lines", blanks);
assert_eq!(stats.code, code, "expected {} code lines", code);
assert_eq!(stats.comments, comments, "expected {} comment lines", comments);
}

#[test]
fn jupyter_notebook_has_correct_totals() {
let sample_notebook =
fs::read_to_string(Path::new("tests").join("data").join("jupyter.ipynb")).unwrap();

let CodeStats {
blanks,
code,
comments,
..
} = LanguageType::Jupyter
let stats = LanguageType::Jupyter
.parse_jupyter(sample_notebook.as_bytes(), &Config::default())
.unwrap();

assert_eq!(blanks, 115);
assert_eq!(code, 528);
assert_eq!(comments, 333);
assert_stats(&stats, 115, 528, 333);
}

// Parses the Lingua Franca fixture and checks that the file's own counts and the counts of
// its embedded-language blob are reported separately.
#[test]
fn lf_embedded_language_is_counted() {
let file_text =
fs::read_to_string(Path::new("tests").join("data").join("linguafranca.lf")).unwrap();

let stats = LinguaFranca
.parse_from_str(file_text, &Config::default());

// Counts on the top-level stats (blanks, code, comments); the blob is checked on its own below.
assert_stats(&stats, 9, 11, 8);

// Only one embedded language is expected, and it must be keyed by `Rust`.
assert_eq!(stats.blobs.len(), 1, "num embedded languages");
let rust_stats = stats.blobs.get(&Rust).expect("should have a Rust entry");
// Counts for the embedded Rust blob (blanks, code, comments).
assert_stats(rust_stats, 2, 5, 1);
}
}
25 changes: 25 additions & 0 deletions src/language/language_type.tera.rs
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,31 @@ impl LanguageType {
}
}

/// Get language from its name.
///
/// ```no_run
/// use tokei::LanguageType;
///
/// let rust = LanguageType::from_name("Rust");
///
/// assert_eq!(rust, Some(LanguageType::Rust));
/// ```
#[must_use]
pub fn from_name(name: &str) -> Option<Self> {
match name {
{% for key, value in languages -%}
{% if value.name and value.name != key -%}
| "{{value.name}}"
{% endif -%}
| "{{key}}" => Some({{key}}),
{% endfor %}
unknown => {
warn!("Unknown language name: {}", unknown);
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we should have a warn! here, because people use it as a library, and they may not want to have it warn when getting None can be expected behaviour.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, thanks! I'll change that

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually all other such methods like from_file_extension use a warn! call, so maybe this change should be part of another PR?

None
},
}
}

/// Get language from its MIME type if available.
///
/// ```no_run
Expand Down
35 changes: 34 additions & 1 deletion src/language/syntax.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@ use log::Level::Trace;
use once_cell::sync::Lazy;

use super::embedding::{
RegexCache, RegexFamily, ENDING_MARKDOWN_REGEX, END_SCRIPT, END_STYLE, END_TEMPLATE,
RegexCache, RegexFamily, ENDING_MARKDOWN_REGEX, ENDING_LF_BLOCK_REGEX, END_SCRIPT, END_STYLE, END_TEMPLATE
};
use crate::{stats::CodeStats, utils::ext::SliceExt, Config, LanguageType};
use crate::LanguageType::LinguaFranca;

/// Tracks the syntax of the language as well as the current state in the file.
/// Currently has what could be considered three types of mode.
Expand All @@ -29,6 +30,7 @@ pub(crate) struct SyntaxCounter {
pub(crate) quote_is_doc_quote: bool,
pub(crate) stack: Vec<&'static str>,
pub(crate) quote_is_verbatim: bool,
pub(crate) lf_embedded_language: Option<LanguageType>
}

#[derive(Clone, Debug)]
Expand All @@ -53,6 +55,7 @@ pub(crate) enum LanguageContext {
Html {
language: LanguageType,
},
LinguaFranca,
Markdown {
balanced: bool,
language: LanguageType,
Expand Down Expand Up @@ -129,6 +132,7 @@ impl SyntaxCounter {
quote_is_doc_quote: false,
quote_is_verbatim: false,
stack: Vec::with_capacity(1),
lf_embedded_language: None,
quote: None,
}
}
Expand All @@ -148,6 +152,12 @@ impl SyntaxCounter {
!self.stack.is_empty()
}

/// Returns the language recorded in `lf_embedded_language`.
///
/// When no target language was recorded (the field is `None`), this falls back to
/// `LinguaFranca` itself.
pub(crate) fn get_lf_target_language(&self) -> LanguageType {
    match self.lf_embedded_language {
        Some(target) => target,
        None => LinguaFranca,
    }
}

#[inline]
pub(crate) fn parse_line_comment(&self, window: &[u8]) -> bool {
if self.quote.is_some() || !self.stack.is_empty() {
Expand Down Expand Up @@ -430,6 +440,29 @@ impl SyntaxCounter {
doc_block,
))
}
RegexFamily::LinguaFranca(lf) => {
let opening_fence = lf.starts_in_range(start, end)?;
let start_of_code = opening_fence.end();
let closing_fence = ENDING_LF_BLOCK_REGEX.find(&lines[start_of_code..]);
let end_of_code = closing_fence
.map_or_else(|| lines.len(),
|fence| start_of_code + fence.start());

let block_contents = &lines[start_of_code..end_of_code];
trace!(
"LF block: {:?}",
String::from_utf8_lossy(block_contents)
);
let stats =
self.get_lf_target_language().parse_from_slice(block_contents.trim_first_and_last_line_of_whitespace(), config);
trace!("-> stats: {:?}", stats);

Some(FileContext::new(
LanguageContext::LinguaFranca,
end_of_code,
stats,
))
}
RegexFamily::HtmlLike(html) => {
if let Some(mut captures) = html.start_script_in_range(start, end) {
let start_of_code = captures.next().unwrap().end();
Expand Down
36 changes: 36 additions & 0 deletions tests/data/linguafranca.lf
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// 36 lines 16 code 9 comments 11 blanks

target Rust;

// A C style comment
import KeyboardEvents from "KeyboardEvents.lf";

/* A block comment */
# a python like comment

main reactor Snake(grid_side: usize(32),
food_limit: u32(2)) {

// counts as 2 lines of Rust code and one blank
preamble {=
use crate::snakes::*;

use rand::prelude::*;
=}

/// rust doc comment
keyboard = new KeyboardEvents();

// T
state snake: CircularSnake ({= CircularSnake::new(grid_side) =});
state grid: SnakeGrid ({= SnakeGrid::new(grid_side, &snake) =});
state food_on_grid: u32(0);


// 1 line of rust code
reaction(shutdown) {=
// comment in Rust

println!("New high score: {}", self.snake.len());
=}
}