diff --git a/noodles-gtf/CHANGELOG.md b/noodles-gtf/CHANGELOG.md index c3c02b834..5a74e224c 100644 --- a/noodles-gtf/CHANGELOG.md +++ b/noodles-gtf/CHANGELOG.md @@ -8,6 +8,15 @@ [#291]: https://github.com/zaeleus/noodles/issues/291 +### Fixed + + * gtf/record/attributes: Read string values as text between double quotes + ([#299]). + + This allows the entry delimiter (`;`) to be used in string values. + +[#299]: https://github.com/zaeleus/noodles/issues/299 + ## 0.30.0 - 2024-07-14 ### Changed diff --git a/noodles-gtf/src/record/attributes.rs b/noodles-gtf/src/record/attributes.rs index e35c37581..778e979e4 100644 --- a/noodles-gtf/src/record/attributes.rs +++ b/noodles-gtf/src/record/attributes.rs @@ -36,7 +36,7 @@ impl fmt::Display for Attributes { for (i, entry) in self.0.iter().enumerate() { write!(f, "{entry}")?; - f.write_char(entry::TERMINATOR)?; + f.write_char(entry::DELIMITER)?; if i < self.0.len() - 1 { f.write_char(DELIMITER)?; @@ -80,17 +80,19 @@ impl fmt::Display for ParseError { impl FromStr for Attributes { type Err = ParseError; - fn from_str(s: &str) -> Result { + fn from_str(mut s: &str) -> Result { + use self::entry::parse_entry; + if s.is_empty() { return Err(ParseError::Empty); } - let s = s.strip_suffix(entry::TERMINATOR).unwrap_or(s); + let mut entries = Vec::new(); - let entries = s - .split(entry::TERMINATOR) - .map(|t| t.trim().parse().map_err(ParseError::InvalidEntry)) - .collect::>()?; + while !s.is_empty() { + let entry = parse_entry(&mut s).map_err(ParseError::InvalidEntry)?; + entries.push(entry); + } Ok(Self(entries)) } diff --git a/noodles-gtf/src/record/attributes/entry.rs b/noodles-gtf/src/record/attributes/entry.rs index d8722dba1..ec256310d 100644 --- a/noodles-gtf/src/record/attributes/entry.rs +++ b/noodles-gtf/src/record/attributes/entry.rs @@ -3,7 +3,8 @@ use std::{error, fmt, str::FromStr}; const SEPARATOR: char = ' '; -pub(super) const TERMINATOR: char = ';'; +const DOUBLE_QUOTES: char = '"'; +pub(super) const DELIMITER: char = ';'; /// A GTF record attribute entry. #[derive(Clone, Debug, Eq, PartialEq)] @@ -88,27 +89,70 @@ impl fmt::Display for ParseError { impl FromStr for Entry { type Err = ParseError; - fn from_str(s: &str) -> Result { + fn from_str(mut s: &str) -> Result { if s.is_empty() { Err(ParseError::Empty) } else { - parse_entry(s) + parse_entry(&mut s) } } } -fn parse_entry(s: &str) -> Result { - match s.split_once(SEPARATOR) { - Some((k, v)) => { - let value = parse_value(v); - Ok(Entry::new(k, value)) - } - None => Err(ParseError::Invalid), +pub(super) fn parse_entry(s: &mut &str) -> Result { + let key = parse_key(s)?; + let value = parse_value(s)?; + discard_delimiter(s); + Ok(Entry::new(key, value)) +} + +fn parse_key<'a>(s: &mut &'a str) -> Result<&'a str, ParseError> { + let Some(i) = s.find(SEPARATOR) else { + return Err(ParseError::Invalid); + }; + + let (key, rest) = s.split_at(i); + *s = &rest[1..]; + + Ok(key) +} + +fn parse_value<'a>(s: &mut &'a str) -> Result<&'a str, ParseError> { + if let Some(rest) = s.strip_prefix(DOUBLE_QUOTES) { + *s = rest; + parse_string(s) + } else { + parse_raw_value(s) + } +} + +fn parse_string<'a>(s: &mut &'a str) -> Result<&'a str, ParseError> { + if let Some(i) = s.find(DOUBLE_QUOTES) { + let (t, rest) = s.split_at(i); + *s = &rest[1..]; + Ok(t) + } else { + Err(ParseError::Invalid) } } -fn parse_value(s: &str) -> String { - s.trim_matches('"').into() +fn parse_raw_value<'a>(s: &mut &'a str) -> Result<&'a str, ParseError> { + if let Some(i) = s.find(DELIMITER) { + let (t, rest) = s.split_at(i); + *s = rest; + Ok(t) + } else { + Ok(s) + } +} + +fn discard_delimiter(s: &mut &str) { + *s = s.trim_start(); + + if let Some(rest) = s.strip_prefix(DELIMITER) { + *s = rest; + } + + *s = s.trim_start(); } #[cfg(test)] @@ -127,6 +171,10 @@ mod tests { r#"gene_id "g0""#.parse::(), Ok(Entry::new("gene_id", "g0")) ); + assert_eq!( + r#"gene_ids "g0;g1""#.parse::(), + Ok(Entry::new("gene_ids", "g0;g1")) + ); assert_eq!( r#"gene_id """#.parse::(), Ok(Entry::new("gene_id", ""))