(cmap) Add cmap::Subtable::codepoints method.

Closes #20
harfbuzz · Jul 31, 2020 · a0ec717 · a0ec717
1 parent f6bb9bf
commit a0ec717
Show file tree

Hide file tree

Showing 9 changed files with 254 additions and 7 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/)
 and this project adheres to [Semantic Versioning](http://semver.org/).
 
 ## [Unreleased]
+### Added
+- `cmap::Subtable::codepoints`
+
 ### Fixed
 - (cmap) Incorrectly returning glyph ID `0` instead of `None` for format 0
 - (cmap) Possible invalid glyph mapping for format 2

diff --git a/src/tables/cmap/format0.rs b/src/tables/cmap/format0.rs
@@ -20,9 +20,30 @@ pub fn parse(data: &[u8], code_point: u32) -> Option<u16> {
     }
 }
 
+pub fn codepoints(data: &[u8], mut f: impl FnMut(u32)) -> Option<()> {
+    let mut s = Stream::new(data);
+    s.skip::<u16>(); // format
+    s.skip::<u16>(); // length
+    s.skip::<u16>(); // language
+
+    for code_point in 0..256 {
+        // Unlike every other format, this one inspects the glyph id and
+        // skips entries that are zero. Without that check the method would
+        // always call `f` for the whole `0..256` range, which would be
+        // pointless: the array always has 256 entries, even when the face
+        // has fewer glyphs.
+        let glyph_id: u8 = s.read()?;
+        if glyph_id != 0 {
+            f(code_point);
+        }
+    }
+
+    Some(())
+}
+
 #[cfg(test)]
-mod format0_tests {
-    use super::parse;
+mod tests {
+    use super::{parse, codepoints};
 
     #[test]
     fn maps_not_all_256_codepoints() {
@@ -39,5 +60,9 @@ mod format0_tests {
         assert_eq!(parse(&data, 0), None);
         assert_eq!(parse(&data, 0x40), Some(100));
         assert_eq!(parse(&data, 100), None);
+
+        let mut vec = vec![];
+        codepoints(&data, |c| vec.push(c));
+        assert_eq!(vec, [0x40]);
     }
 }
diff --git a/src/tables/cmap/format10.rs b/src/tables/cmap/format10.rs
@@ -15,3 +15,20 @@ pub fn parse(data: &[u8], code_point: u32) -> Option<u16> {
     let idx = code_point.checked_sub(first_code_point)?;
     glyphs.get(idx)
 }
+
+pub fn codepoints(data: &[u8], mut f: impl FnMut(u32)) -> Option<()> {
+    let mut s = Stream::new(data);
+    s.skip::<u16>(); // format
+    s.skip::<u16>(); // reserved
+    s.skip::<u32>(); // length
+    s.skip::<u32>(); // language
+    let first_code_point: u32 = s.read()?;
+    let count: u32 = s.read()?; // only the count is read; the glyph array that follows is never touched
+
+    for i in 0..count {
+        let code_point = first_code_point.checked_add(i)?; // bail out with `None` on u32 overflow; earlier code points were already reported
+        f(code_point);
+    }
+
+    Some(())
+}
diff --git a/src/tables/cmap/format12.rs b/src/tables/cmap/format12.rs
@@ -43,3 +43,20 @@ pub fn parse(data: &[u8], code_point: u32) -> Option<u16> {
 
     None
 }
+
+pub fn codepoints(data: &[u8], mut f: impl FnMut(u32)) -> Option<()> {
+    let mut s = Stream::new(data);
+    s.skip::<u16>(); // format
+    s.skip::<u16>(); // reserved
+    s.skip::<u32>(); // length
+    s.skip::<u32>(); // language
+    let count: u32 = s.read()?; // number of groups
+    let groups = s.read_array32::<SequentialMapGroup>(count)?;
+    for group in groups {
+        for code_point in group.start_char_code..=group.end_char_code { // inclusive on both ends; the group's glyph id is ignored
+            f(code_point);
+        }
+    }
+
+    Some(())
+}
diff --git a/src/tables/cmap/format13.rs b/src/tables/cmap/format13.rs
@@ -21,3 +21,9 @@ pub fn parse(data: &[u8], code_point: u32) -> Option<u16> {
 
     None
 }
+
+pub fn codepoints(data: &[u8], f: impl FnMut(u32)) -> Option<()> {
+    // This table differs from format 12 only in how glyph ids are mapped.
+    // The covered code points are the same, so enumeration is delegated.
+    super::format12::codepoints(data, f)
+}
diff --git a/src/tables/cmap/format2.rs b/src/tables/cmap/format2.rs
@@ -92,10 +92,77 @@ pub fn parse(data: &[u8], code_point: u32) -> Option<u16> {
     u16::try_from((i32::from(glyph) + i32::from(sub_header.id_delta)) % 65536).ok()
 }
 
+pub fn codepoints(data: &[u8], mut f: impl FnMut(u32)) -> Option<()> {
+    let mut s = Stream::new(data);
+    s.skip::<u16>(); // format
+    s.skip::<u16>(); // length
+    s.skip::<u16>(); // language
+    let sub_header_keys = s.read_array16::<u16>(256)?;
+
+    // Each key divided by 8 is a sub-header index; the largest index plus one is the count.
+    let sub_headers_count = sub_header_keys.into_iter().map(|n| n / 8).max()? + 1;
+    let sub_headers = s.read_array16::<SubHeaderRecord>(sub_headers_count)?;
+
+    for first_byte in 0u16..256 {
+        let i = sub_header_keys.get(first_byte)? / 8;
+        let sub_header = sub_headers.get(i)?;
+        let first_code = sub_header.first_code;
+
+        if i == 0 {
+            // Sub-header 0: `first_byte` is a complete single-byte code.
+            let range_end = first_code.checked_add(sub_header.entry_count)?;
+            if first_byte >= first_code && first_byte < range_end {
+                f(u32::from(first_byte));
+            }
+        } else {
+            // Any other sub-header: `first_byte` is the high byte of a two-byte code.
+            let base = first_code.checked_add(first_byte << 8)?;
+            for k in 0..sub_header.entry_count {
+                let code_point = base.checked_add(k)?;
+                f(u32::from(code_point));
+            }
+        }
+    }
+
+    Some(())
+}
+
 #[cfg(test)]
-mod format2_tests {
+mod tests {
     use crate::parser::FromData;
-    use super::parse;
+    use super::{parse, codepoints};
+
+    #[test]
+    fn collect_codepoints() {
+        let mut data = vec![
+            0x00, 0x02, // format: 2
+            0x02, 0x16, // subtable size: 534
+            0x00, 0x00, // language ID: 0
+        ];
+
+        // All keys are zero except the one for high byte 0x28, which is set to 8.
+        data.extend(std::iter::repeat(0x00).take(256 * u16::SIZE));
+        data[6 + 0x28 * u16::SIZE + 1] = 0x08;
+
+        data.extend(&[
+            // First sub header (for single byte mapping)
+            0x00, 0xFE, // first code: 254
+            0x00, 0x02, // entry count: 2
+            0x00, 0x00, // id delta: uninteresting
+            0x00, 0x00, // id range offset: uninteresting
+            // Second sub header (for high byte 0x28)
+            0x00, 0x10, // first code: 0x10, so the first full code is (0x28 << 8) + 0x10 = 10256
+            0x00, 0x03, // entry count: 3
+            0x00, 0x00, // id delta: uninteresting
+            0x00, 0x00, // id range offset: uninteresting
+        ]);
+
+        // Only glyph IDs would follow from here. They are not needed for codepoints.
+
+        let mut vec = vec![];
+        codepoints(&data, |c| vec.push(c));
+        assert_eq!(vec, [10256, 10257, 10258, 254, 255]);
+    }
 
     #[test]
     fn codepoint_at_range_end() {

diff --git a/src/tables/cmap/format4.rs b/src/tables/cmap/format4.rs
@@ -66,9 +66,33 @@ pub fn parse(data: &[u8], code_point: u32) -> Option<u16> {
     None
 }
 
+pub fn codepoints(data: &[u8], mut f: impl FnMut(u32)) -> Option<()> {
+    let mut s = Stream::new(data);
+    s.advance(6); // format + length + language
+    let seg_count_x2: u16 = s.read()?;
+    if seg_count_x2 < 2 { // fewer than one whole segment
+        return None;
+    }
+
+    let seg_count = seg_count_x2 / 2;
+    s.advance(6); // searchRange + entrySelector + rangeShift
+
+    let end_codes = s.read_array16::<u16>(seg_count)?;
+    s.skip::<u16>(); // reservedPad
+    let start_codes = s.read_array16::<u16>(seg_count)?;
+
+    for (start, end) in start_codes.into_iter().zip(end_codes) { // NOTE(review): a 0xFFFF..=0xFFFF segment, if present, is reported as well
+        for code_point in start..=end { // inclusive range; empty when start > end
+            f(u32::from(code_point));
+        }
+    }
+
+    Some(())
+}
+
 #[cfg(test)]
-mod format4_tests {
-    use super::parse;
+mod tests {
+    use super::{parse, codepoints};
 
     #[test]
     fn single_glyph() {
@@ -457,4 +481,29 @@ mod format4_tests {
 
         assert_eq!(parse(data, 0x41), None);
     }
+
+    #[test]
+    fn collect_codepoints() {
+        let data = &[
+            0x00, 0x04, // format: 4
+            0x00, 0x18, // subtable size: 24
+            0x00, 0x00, // language ID: 0
+            0x00, 0x04, // 2 x segCount: 4
+            0x00, 0x02, // search range: 2
+            0x00, 0x00, // entry selector: 0
+            0x00, 0x02, // range shift: 2
+            // End character codes
+            0x00, 0x22, // char code [0]: 34
+            0xFF, 0xFF, // char code [1]: 65535
+            0x00, 0x00, // reserved: 0
+            // Start character codes
+            0x00, 0x1B, // char code [0]: 27
+            0xFF, 0xFD, // char code [1]: 65533
+            // Nothing after the start codes is included: `codepoints` does not read glyph ids.
+        ];
+
+        let mut vec = vec![];
+        codepoints(data, |c| vec.push(c));
+        assert_eq!(vec, [27, 28, 29, 30, 31, 32, 33, 34, 65533, 65534, 65535]);
+    }
 }
diff --git a/src/tables/cmap/format6.rs b/src/tables/cmap/format6.rs
@@ -19,3 +19,19 @@ pub fn parse(data: &[u8], code_point: u32) -> Option<u16> {
     let idx = code_point.checked_sub(first_code_point)?;
     glyphs.get(idx)
 }
+
+pub fn codepoints(data: &[u8], mut f: impl FnMut(u32)) -> Option<()> {
+    let mut s = Stream::new(data);
+    s.skip::<u16>(); // format
+    s.skip::<u16>(); // length
+    s.skip::<u16>(); // language
+    let first_code_point: u16 = s.read()?;
+    let count: u16 = s.read()?; // only the count is read; the glyph array that follows is never touched
+
+    for i in 0..count {
+        let code_point = first_code_point.checked_add(i)?; // bail out with `None` on u16 overflow; earlier code points were already reported
+        f(u32::from(code_point));
+    }
+
+    Some(())
+}
diff --git a/src/tables/cmap/mod.rs b/src/tables/cmap/mod.rs
@@ -122,7 +122,7 @@ impl<'a> Subtable<'a> {
 
     /// Maps a character to a glyph ID.
     ///
-    /// This is a low-level method and unlike `Face::glyph_index` is doesn't
+    /// This is a low-level method and unlike `Face::glyph_index` it doesn't
     /// check that the current encoding is Unicode.
     /// It simply maps a `u32` codepoint number to a glyph ID.
     ///
@@ -180,6 +180,53 @@ impl<'a> Subtable<'a> {
             None
         }
     }
+
+    /// Calls `f` for all codepoints contained in this subtable.
+    ///
+    /// This is a low-level method and it doesn't check that the current
+    /// encoding is Unicode. It simply calls the function `f` for all `u32`
+    /// codepoints that are present in this subtable.
+    ///
+    /// Note that this may list codepoints for which `glyph_index` still returns
+    /// `None` because this method finds all codepoints which were _defined_ in
+    /// this subtable. The subtable may still map them to glyph ID `0`.
+    ///
+    /// Returns without doing anything:
+    /// - when format is `MixedCoverage`, since it's not supported.
+    /// - when format is `UnicodeVariationSequences`, since it's not supported.
+    pub fn codepoints<F: FnMut(u32)>(&self, f: F) {
+        let _ = match self.format { // the per-format `Option<()>` result is deliberately discarded
+            Format::ByteEncodingTable => {
+                format0::codepoints(self.subtable_data, f)
+            }
+            Format::HighByteMappingThroughTable => {
+                format2::codepoints(self.subtable_data, f)
+            },
+            Format::SegmentMappingToDeltaValues => {
+                format4::codepoints(self.subtable_data, f)
+            },
+            Format::TrimmedTableMapping => {
+                format6::codepoints(self.subtable_data, f)
+            },
+            Format::MixedCoverage => {
+                // Unsupported: `f` is never called for this format.
+                None
+            },
+            Format::TrimmedArray => {
+                format10::codepoints(self.subtable_data, f)
+            },
+            Format::SegmentedCoverage => {
+                format12::codepoints(self.subtable_data, f)
+            }
+            Format::ManyToOneRangeMappings => {
+                format13::codepoints(self.subtable_data, f)
+            },
+            Format::UnicodeVariationSequences => {
+                // Unsupported: `f` is never called for this format.
+                None
+            },
+        };
+    }
 }
 
 impl<'a> core::fmt::Debug for Subtable<'a> {