Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Full parser for license expressions (version II) #29

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,12 @@ repository = "https://github.com/withoutboats/license-exprs"

[workspace]
members = [".", "fetch-license-list-from-spdx"]

[dependencies]
lazy_static = "1.3.0"
regex = "1.1.6"
lalrpop-util = "0.17.0"
failure = "0.1.5"

[build-dependencies]
lalrpop = "0.17.0"
4 changes: 0 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,6 @@

This crate validates [SPDX 2.1 license expressions][SPDX-license-expressions] using short identifiers from the [SPDX 3.1 License List][SPDX-license-list].

## Limitations

Parentheses are [not currently supported][parens].

## License

Licensed under the [Apache License, Version 2.0][Apache-2.0] ([`LICENSE-APACHE`](LICENSE-APACHE)) or the [MIT license][MIT] ([`LICENSE-MIT`](LICENSE-MIT)), at your option.
Expand Down
5 changes: 5 additions & 0 deletions build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
extern crate lalrpop;

/// Build-script entry point.
///
/// Calls `lalrpop::process_root()` and panics if it returns an error, which
/// aborts the build.
fn main() {
    let outcome = lalrpop::process_root();
    outcome.unwrap();
}
139 changes: 139 additions & 0 deletions src/lexer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
use spdx;

/// A single lexical token of a license expression.
///
/// String payloads are borrowed slices of the text being lexed.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Token<'a> {
    /// An identifier that the lexer found in the license list.
    LicenseId(&'a str),
    /// An identifier that the lexer found in the exception list.
    ExceptionId(&'a str),
    /// A `LicenseRef-…` term: the optional `DocumentRef-` name, then the
    /// license reference name (both without their prefixes).
    LicenseRef(Option<&'a str>, &'a str),
    /// The `+` character.
    Plus,
    /// The `(` character.
    OpenParen,
    /// The `)` character.
    CloseParen,
    /// The `WITH` keyword.
    With,
    /// The `AND` keyword.
    And,
    /// The `OR` keyword.
    Or,
}

impl<'a> std::fmt::Display for Token<'a> {
    /// Formats the token exactly as its `Debug` implementation does.
    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        <Self as std::fmt::Debug>::fmt(self, out)
    }
}

impl<'a> Token<'a> {
    /// Number of bytes the token occupies in the source text, including the
    /// `LicenseRef-` / `DocumentRef-` prefixes that are not stored in the payload.
    fn len(&self) -> usize {
        const DOCUMENT_PREFIX: &str = "DocumentRef-";
        const LICENSE_PREFIX: &str = "LicenseRef-";
        const SEPARATOR: &str = ":";

        match *self {
            Token::Plus | Token::OpenParen | Token::CloseParen => 1,
            Token::Or => 2,
            Token::And => 3,
            Token::With => 4,
            Token::LicenseId(text) | Token::ExceptionId(text) => text.len(),
            Token::LicenseRef(document, license) => {
                let document_len = document.map_or(0, |name| {
                    DOCUMENT_PREFIX.len() + name.len() + SEPARATOR.len()
                });
                document_len + LICENSE_PREFIX.len() + license.len()
            }
        }
    }
}

/// Tokenizer over the text of a license expression.
pub struct Lexer<'a> {
    /// The part of the input that has not been consumed yet.
    inner: &'a str,
    /// Number of bytes consumed so far, i.e. where `inner` starts in the
    /// original input.
    offset: usize,
}

impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the beginning of `text`.
    pub fn new(text: &'a str) -> Lexer<'a> {
        Self {
            offset: 0,
            inner: text,
        }
    }
}

impl<'a> Iterator for Lexer<'a> {
    /// `(start, token, end)` on success, where `start..end` is the byte range
    /// of the token in the original input; otherwise the lexing error.
    type Item = Result<(usize, Token<'a>, usize), failure::Error>;

    /// Skips leading whitespace and lexes the next token.
    ///
    /// Returns `None` once the input is exhausted. After an error has been
    /// returned the remaining input is discarded, so the next call returns
    /// `None` instead of reporting the same error again without end.
    fn next(&mut self) -> Option<Self::Item> {
        lazy_static! {
            static ref TEXTTOKEN: regex::Regex = regex::Regex::new(r"^[-a-zA-Z0-9.:]+").unwrap();
            // Both reference patterns are anchored at the end as well, so a term
            // such as `LicenseRef-a:b` is rejected as a whole instead of being
            // lexed as a valid reference followed by stray characters.
            static ref DOCREFLICREF: regex::Regex =
                regex::Regex::new(r"^DocumentRef-([-a-zA-Z0-9.]+):LicenseRef-([-a-zA-Z0-9.]+)$")
                    .unwrap();
            static ref LICREF: regex::Regex =
                regex::Regex::new(r"^LicenseRef-([-a-zA-Z0-9.]+)$").unwrap();
        }

        // Jump over any whitespace, updating `self.inner` and `self.offset` appropriately
        let white_len = self
            .inner
            .find(|c: char| !c.is_whitespace())
            .unwrap_or(self.inner.len());
        self.inner = &self.inner[white_len..];
        self.offset += white_len;

        // Nothing but whitespace was left: end of the token stream.
        let first = self.inner.chars().next()?;

        let lexed = match first {
            '+' => Ok(Token::Plus),
            '(' => Ok(Token::OpenParen),
            ')' => Ok(Token::CloseParen),
            _ => match TEXTTOKEN.find(self.inner) {
                None => Err(format_err!("Unparseable characters found after {}", self.inner)),
                Some(m) => {
                    let term = m.as_str();
                    if term == "WITH" {
                        Ok(Token::With)
                    } else if term == "AND" {
                        Ok(Token::And)
                    } else if term == "OR" {
                        Ok(Token::Or)
                    } else if spdx::LICENSES.binary_search(&term).is_ok() {
                        Ok(Token::LicenseId(term))
                    } else if spdx::EXCEPTIONS.binary_search(&term).is_ok() {
                        Ok(Token::ExceptionId(term))
                    } else if let Some(c) = DOCREFLICREF.captures(term) {
                        // Groups 1 and 2 are mandatory in the pattern, so they
                        // are always present when the regex matched.
                        Ok(Token::LicenseRef(
                            Some(c.get(1).unwrap().as_str()),
                            c.get(2).unwrap().as_str(),
                        ))
                    } else if let Some(c) = LICREF.captures(term) {
                        Ok(Token::LicenseRef(None, c.get(1).unwrap().as_str()))
                    } else {
                        Err(format_err!("Invalid term found: {}", term))
                    }
                }
            },
        };

        match lexed {
            Ok(tok) => {
                let len = tok.len();
                let start = self.offset;
                self.inner = &self.inner[len..];
                self.offset += len;
                Some(Ok((start, tok, start + len)))
            }
            Err(err) => {
                // Consume the rest of the input so the iterator terminates
                // after reporting the error once.
                let rest = self.inner.len();
                self.inner = &self.inner[rest..];
                self.offset += rest;
                Some(Err(err))
            }
        }
    }
}

/// Lexes one input containing every kind of token and checks that the tokens
/// come out in order, followed by the end of the stream.
#[test]
fn lex_all_the_things() {
    let text = "MIT OR + () Apache-2.0 WITH AND LicenseRef-World Classpath-exception-2.0 DocumentRef-Test:LicenseRef-Hello";
    let expected = [
        Token::LicenseId("MIT"),
        Token::Or,
        Token::Plus,
        Token::OpenParen,
        Token::CloseParen,
        Token::LicenseId("Apache-2.0"),
        Token::With,
        Token::And,
        Token::LicenseRef(None, "World"),
        Token::ExceptionId("Classpath-exception-2.0"),
        Token::LicenseRef(Some("Test"), "Hello"),
    ];

    let mut lexer = Lexer::new(text);
    for want in expected.iter() {
        let (_, got, _) = lexer.next().unwrap().unwrap();
        assert_eq!(got, *want);
    }
    assert!(lexer.next().is_none());
}
84 changes: 18 additions & 66 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,76 +1,28 @@
use std::error::Error;
use std::fmt;
mod lexer;
mod parser_types;
mod spdx;

use self::LicenseExpr::*;
#[macro_use]
extern crate lazy_static;
#[macro_use]
extern crate lalrpop_util;
#[macro_use]
extern crate failure;
extern crate regex;

/// One term of a license expression: an identifier or an operator keyword.
#[derive(Debug, Clone, Copy)]
pub enum LicenseExpr<'a> {
    /// A license identifier, kept as the borrowed source text.
    License(&'a str),
    /// A license exception identifier, kept as the borrowed source text.
    Exception(&'a str),
    /// The `AND` operator.
    And,
    /// The `OR` operator.
    Or,
    /// The `WITH` operator.
    With,
}

impl<'a> fmt::Display for LicenseExpr<'a> {
    /// Writes the operator keyword, or the stored identifier text unchanged.
    fn fmt(&self, out: &mut fmt::Formatter) -> Result<(), fmt::Error> {
        let text = match *self {
            License(info) | Exception(info) => info,
            And => "AND",
            Or => "OR",
            With => "WITH",
        };
        out.write_str(text)
    }
}
lalrpop_mod!(pub parser);

/// Reasons a license expression can be rejected.
#[derive(Debug, Clone, Copy)]
pub enum ParseError<'a> {
    /// A word that is neither an operator nor a known identifier.
    UnknownLicenseId(&'a str),
    /// A term that is not allowed at the position where it appeared.
    InvalidStructure(LicenseExpr<'a>),
}

impl<'a> fmt::Display for ParseError<'a> {
    /// Writes `"<description>: <offending term>"`.
    fn fmt(&self, out: &mut fmt::Formatter) -> Result<(), fmt::Error> {
        let summary = self.description();
        match *self {
            ParseError::UnknownLicenseId(word) => write!(out, "{}: {}", summary, word),
            ParseError::InvalidStructure(term) => write!(out, "{}: {}", summary, term),
        }
    }
}
/// Crate-local `Result` alias; the error type defaults to `failure::Error` when omitted.
type Result<T, E = failure::Error> = std::result::Result<T, E>;

// `std::error::Error` implementation for `ParseError`.
impl<'a> Error for ParseError<'a> {
/// Returns a fixed summary string for each error variant; the offending
/// term itself is not included.
fn description(&self) -> &str {
match *self {
ParseError::UnknownLicenseId(_) => "unknown license or other term",
ParseError::InvalidStructure(_) => "invalid license expression",
}
}
/// Parses `license_expr` into a `parser_types::Disjunction`.
///
/// # Errors
///
/// Returns an error if the lexer or the generated parser rejects the input.
pub fn parse_license_expr(license_expr: &str) -> Result<parser_types::Disjunction> {
    let tokens = lexer::Lexer::new(license_expr);
    let parsed = parser::DisjunctionParser::new()
        .parse(tokens)
        // Render any token carried by the parse error to an owned `String`
        // before the error is converted by `?`.
        .map_err(|err| err.map_token(|tok| tok.to_string()))?;
    Ok(parsed)
}

/// Validates `license_expr` word by word.
///
/// Each whitespace-separated word is classified as an operator, a license,
/// or an exception; the resulting sequence is then checked pairwise so that
/// only permitted neighbours follow one another.
///
/// Returns `ParseError::UnknownLicenseId` for an unrecognised word and
/// `ParseError::InvalidStructure` for a term in a disallowed position.
pub fn validate_license_expr(license_expr: &str) -> Result<(), ParseError> {
license_expr.split_whitespace().map(|word| match word {
"AND" => Ok(And),
"OR" => Ok(Or),
"WITH" => Ok(With),
// Trailing `+` characters are stripped for the lookup only; the stored
// word keeps them.
_ if spdx::LICENSES.binary_search(&word.trim_right_matches('+')).is_ok()
=> Ok(License(word)),
_ if spdx::EXCEPTIONS.binary_search(&word).is_ok()
=> Ok(Exception(word)),
_ => Err(ParseError::UnknownLicenseId(word))
// The fold is seeded with `Or`, so the first word must be a license
// (the only term allowed after `Or`).
}).fold(Ok(Or), |prev, word| match (prev, word) {
// The first error encountered is carried through to the end.
(err @ Err(_), _) | (_, err @ Err(_)) => err,
// Permitted (previous, current) pairs; the current term becomes the new state.
(Ok(License(_)), Ok(With))
| (Ok(License(_)), Ok(And))
| (Ok(License(_)), Ok(Or))
| (Ok(Exception(_)), Ok(And))
| (Ok(Exception(_)), Ok(Or))
| (Ok(And), Ok(License(_)))
| (Ok(Or), Ok(License(_)))
| (Ok(With), Ok(Exception(_)))
=> word,
_ => Err(ParseError::InvalidStructure(word.unwrap()))
// NOTE(review): the final state is discarded without being inspected, so an
// expression that ends in an operator (or is empty) is accepted here.
}).and(Ok(()))
/// Checks that `license_expr` parses successfully, discarding the parse tree.
///
/// # Errors
///
/// Returns whatever error `parse_license_expr` reports for the input.
pub fn validate_license_expr(license_expr: &str) -> Result<()> {
    let _tree = parse_license_expr(license_expr)?;
    Ok(())
}

pub fn license_version() -> &'static str {
Expand Down
62 changes: 62 additions & 0 deletions src/parser.lalrpop
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
use lexer;
use parser_types::*;

grammar<'input>;

// One or more conjunctions separated by `OR`.
// Right-recursive: `a OR b OR c` has head `a` and tail `b OR c`.
// Because conjunctions are nested inside disjunctions, `AND` binds tighter than `OR`.
pub Disjunction: Disjunction<'input> = {
<c:Conjunction> => Disjunction { head: c, tail: None },
<c:Conjunction> OR <d:Disjunction> => Disjunction { head: c, tail: Some(Box::new(d)) },
};

// One or more terms separated by `AND`.
// Right-recursive: `a AND b AND c` has head `a` and tail `b AND c`.
pub Conjunction: Conjunction<'input> = {
<t:Term> => Conjunction { head: t, tail: None },
<t:Term> AND <c:Conjunction> => Conjunction { head: t, tail: Some(Box::new(c)) },
};

// A single operand of `AND` / `OR`:
//   * a license id, optionally followed by `+`, optionally followed by
//     `WITH <exception id>`;
//   * a license reference, optionally followed by `WITH <exception id>`
//     (the SPDX grammar allows `WITH` after any simple expression, and a
//     license reference is a simple expression);
//   * a parenthesised disjunction.
//
// The terminals carry the whole lexer token, so each action destructures it
// again; the `else` branches cannot be reached because the `extern` mapping
// only produces the matching variant for each terminal.
pub Term: Term<'input> = {
    <i:LicenseId> <p:"+"?> => {
        if let lexer::Token::LicenseId(id) = i {
            Term::License(License {
                id: LicenseId::SPDX(id),
                exception: None,
                or_later: p.is_some(),
            })
        } else { unreachable!() }
    },
    <i:LicenseId> <p:"+"?> WITH <e:ExceptionId> => {
        if let (lexer::Token::LicenseId(id), lexer::Token::ExceptionId(ex)) = (i, e) {
            Term::License(License {
                id: LicenseId::SPDX(id),
                exception: Some(Exception(ex)),
                or_later: p.is_some(),
            })
        } else { unreachable!() }
    },
    <r:LicenseRef> => {
        if let lexer::Token::LicenseRef(doc, lic) = r {
            Term::License(License {
                id: LicenseId::Other(doc, lic),
                exception: None,
                or_later: false,
            })
        } else { unreachable!() }
    },
    <r:LicenseRef> WITH <e:ExceptionId> => {
        if let (lexer::Token::LicenseRef(doc, lic), lexer::Token::ExceptionId(ex)) = (r, e) {
            Term::License(License {
                id: LicenseId::Other(doc, lic),
                exception: Some(Exception(ex)),
                or_later: false,
            })
        } else { unreachable!() }
    },
    "(" <d:Disjunction> ")" => Term::Bracketed(Box::new(d)),
};

// Binds the grammar to the hand-written lexer instead of a generated one.
extern {
// Token positions are the `usize` offsets yielded by the lexer.
type Location = usize;
// Error type the lexer reports.
type Error = failure::Error;

// Maps each terminal used in the grammar to the `lexer::Token` pattern it matches.
enum lexer::Token<'input> {
LicenseId => lexer::Token::LicenseId(_),
ExceptionId => lexer::Token::ExceptionId(_),
LicenseRef => lexer::Token::LicenseRef(_, _),
"(" => lexer::Token::OpenParen,
")" => lexer::Token::CloseParen,
"+" => lexer::Token::Plus,
OR => lexer::Token::Or,
AND => lexer::Token::And,
WITH => lexer::Token::With,
}
}
Loading