Skip to content

Commit

Permalink
read: add read_bytes_at_until
Browse files Browse the repository at this point in the history
Use this for dyld cache image paths. The main benefit is avoiding
an artificial limit for mapped data.
  • Loading branch information
philipc committed May 26, 2021
1 parent 3c0504b commit df31a2c
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 11 deletions.
14 changes: 3 additions & 11 deletions src/read/macho/dyld_cache.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use core::slice;

use crate::read::{Error, File, ReadError, ReadRef, Result};
use crate::{macho, Architecture, Bytes, Endian, Endianness};
use crate::{macho, Architecture, Endian, Endianness};

/// A parsed representation of the dyld shared cache.
#[derive(Debug)]
Expand Down Expand Up @@ -192,16 +192,8 @@ impl<E: Endian> macho::DyldCacheHeader<E> {
impl<E: Endian> macho::DyldCacheImageInfo<E> {
/// The file system path of this image.
pub fn path<'data, R: ReadRef<'data>>(&self, endian: E, data: R) -> Result<&'data [u8]> {
// The longest path I've seen is 164 bytes long. In theory paths could be longer than 256.
const MAX_PATH_LEN: u64 = 256;

let path_offset = self.path_file_offset.get(endian).into();
let slice_containing_path = data
.read_bytes_at(path_offset, MAX_PATH_LEN)
.read_error("Couldn't read path")?;
Bytes(slice_containing_path)
.read_string()
.read_error("Couldn't read path string (didn't find nul byte within first 256 bytes)")
data.read_bytes_at_until(self.path_file_offset.get(endian).into(), 0)
.read_error("Couldn't read dyld cache image path")
}

/// Find the file offset of the image by looking up its address in the mappings.
Expand Down
52 changes: 52 additions & 0 deletions src/read/read_cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use std::collections::HashMap;
use std::convert::TryInto;
use std::io::{Read, Seek, SeekFrom};
use std::mem;
use std::vec::Vec;

use crate::read::ReadRef;

Expand All @@ -24,6 +25,7 @@ pub struct ReadCache<R: Read + Seek> {
struct ReadCacheInternal<R: Read + Seek> {
read: R,
bufs: HashMap<(u64, u64), Box<[u8]>>,
strings: HashMap<(u64, u8), Box<[u8]>>,
}

impl<R: Read + Seek> ReadCache<R> {
Expand All @@ -33,6 +35,7 @@ impl<R: Read + Seek> ReadCache<R> {
cache: RefCell::new(ReadCacheInternal {
read,
bufs: HashMap::new(),
strings: HashMap::new(),
}),
}
}
Expand Down Expand Up @@ -86,6 +89,44 @@ impl<'a, R: Read + Seek> ReadRef<'a> for &'a ReadCache<R> {
// This is OK because we never mutate or remove entries.
Ok(unsafe { mem::transmute::<&[u8], &[u8]>(buf) })
}

/// Read bytes from `offset` until `delimiter` is found, returning the bytes
/// before it (the delimiter is not included).
///
/// Results are cached, keyed by `(offset, delimiter)`, so repeated lookups
/// return the same slice without touching the underlying reader again.
///
/// Returns `Err(())` on I/O failure, if EOF is reached before the delimiter,
/// or if no delimiter is found within a fixed search limit.
fn read_bytes_at_until(self, offset: u64, delimiter: u8) -> Result<&'a [u8], ()> {
    let cache = &mut *self.cache.borrow_mut();
    let buf = match cache.strings.entry((offset, delimiter)) {
        Entry::Occupied(entry) => entry.into_mut(),
        Entry::Vacant(entry) => {
            // `offset` is already `u64`; no cast needed for SeekFrom::Start.
            cache.read.seek(SeekFrom::Start(offset)).map_err(|_| ())?;
            let mut bytes = Vec::new();
            let mut checked = 0;
            loop {
                // Grow the buffer in 256-byte steps and read into the
                // not-yet-scanned tail.
                bytes.resize(checked + 256, 0);
                let read = cache.read.read(&mut bytes[checked..]).map_err(|_| ())?;
                if read == 0 {
                    // EOF before the delimiter was found.
                    return Err(());
                }
                // Scan only the newly read bytes for the delimiter.
                if let Some(len) = memchr::memchr(delimiter, &bytes[checked..][..read]) {
                    // Drop the delimiter and everything after it, then cache.
                    bytes.truncate(checked + len);
                    break entry.insert(bytes.into_boxed_slice());
                }
                checked += read;
                // Strings should be relatively small.
                // TODO: make this configurable?
                if checked > 4096 {
                    return Err(());
                }
            }
        }
    };
    // Extend the lifetime to that of self.
    // This is OK because we never mutate or remove entries.
    Ok(unsafe { mem::transmute::<&[u8], &[u8]>(buf) })
}
}

/// An implementation of `ReadRef` for a range of data in a stream that
Expand Down Expand Up @@ -127,4 +168,15 @@ impl<'a, R: Read + Seek> ReadRef<'a> for ReadCacheRange<'a, R> {
let r_offset = self.offset.checked_add(offset).ok_or(())?;
self.r.read_bytes_at(r_offset, size)
}

/// Read a `delimiter`-terminated byte string at `offset` within this range.
///
/// Fails if the offset computation overflows, the underlying read fails,
/// or the resulting string does not lie entirely inside the range.
fn read_bytes_at_until(self, offset: u64, delimiter: u8) -> Result<&'a [u8], ()> {
    // Translate the range-relative offset into an offset in the underlying reader.
    let absolute = self.offset.checked_add(offset).ok_or(())?;
    let found = self.r.read_bytes_at_until(absolute, delimiter)?;
    let len: u64 = found.len().try_into().map_err(|_| ())?;
    // Accept the result only if every byte of it lies within this range.
    match offset.checked_add(len) {
        Some(end) if end <= self.size => Ok(found),
        _ => Err(()),
    }
}
}
20 changes: 20 additions & 0 deletions src/read/read_ref.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,14 @@ pub trait ReadRef<'a>: Clone + Copy {
/// Returns an error if offset or size are out of bounds.
fn read_bytes_at(self, offset: u64, size: u64) -> Result<&'a [u8]>;

/// Get a reference to a delimited `u8` slice at the given offset.
///
/// Does not include the delimiter.
///
/// Returns an error if offset is out of bounds or the delimiter is
/// not found.
fn read_bytes_at_until(self, offset: u64, delimiter: u8) -> Result<&'a [u8]>;

/// Get a reference to a `u8` slice at the given offset, and update the offset.
///
/// Returns an error if offset or size are out of bounds.
Expand Down Expand Up @@ -110,4 +118,16 @@ impl<'a> ReadRef<'a> for &'a [u8] {
let size: usize = size.try_into().map_err(|_| ())?;
self.get(offset..).ok_or(())?.get(..size).ok_or(())
}

/// Find `delimiter` in the slice starting at `offset` and return the bytes
/// before it; the delimiter itself is excluded.
fn read_bytes_at_until(self, offset: u64, delimiter: u8) -> Result<&'a [u8]> {
    // Convert the offset and take the tail of the slice; either step can fail.
    let start: usize = offset.try_into().map_err(|_| ())?;
    let tail = self.get(start..).ok_or(())?;
    // A missing delimiter is an error.
    let len = memchr::memchr(delimiter, tail).ok_or(())?;
    // `len` is an index inside `tail`, so this slice can never panic.
    Ok(&tail[..len])
}
}

0 comments on commit df31a2c

Please sign in to comment.