Skip to content

Commit

Permalink
ref: Use watto for ppdb and symcache parsing (#670)
Browse files Browse the repository at this point in the history
This requires a version bump of the symcache format
because the length prefixes of strings change from a fixed
u32 to variable length LEB128.

Co-authored-by: Arpad Borsos <[email protected]>
  • Loading branch information
loewenheim and Swatinem authored Sep 21, 2022
1 parent 2baf70f commit 2411baf
Show file tree
Hide file tree
Showing 17 changed files with 271 additions and 368 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# Changelog

## Unreleased

**Features**:

- The new version 8 of the symcache format saves strings slightly more compactly. Reading of version 7 is still supported. ([#670](https://github.com/getsentry/symbolic/pull/670))

**Internal**:

- Use the `watto` library for de/serialization of symcache and ppdbcache. ([#670](https://github.com/getsentry/symbolic/pull/670))

## 9.1.4

**Fixes**:
Expand Down
4 changes: 3 additions & 1 deletion symbolic-cabi/src/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,9 @@ impl SymbolicErrorCode {
if let Some(error) = error.downcast_ref::<Error>() {
return match error.kind() {
ErrorKind::WrongFormat => SymbolicErrorCode::SymCacheErrorBadFileMagic,
ErrorKind::HeaderTooSmall => SymbolicErrorCode::SymCacheErrorBadFileHeader,
ErrorKind::HeaderTooSmall | ErrorKind::InvalidHeader => {
SymbolicErrorCode::SymCacheErrorBadFileHeader
}
ErrorKind::WrongVersion => SymbolicErrorCode::SymCacheErrorUnsupportedVersion,
ErrorKind::BadDebugFile => SymbolicErrorCode::SymCacheErrorBadDebugFile,
_ => SymbolicErrorCode::SymCacheErrorUnknown,
Expand Down
3 changes: 1 addition & 2 deletions symbolic-ppdb/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ all-features = true

[dependencies]
indexmap = "1.9.1"
leb128 = "0.2.5"
symbolic-common = { version = "9.1.4", path = "../symbolic-common" }
watto = { version = "0.1.0", features = ["writer", "strings"] }
thiserror = "1.0.31"
uuid = "1.0.0"
zerocopy = "0.6.1"
7 changes: 1 addition & 6 deletions symbolic-ppdb/src/cache/lookup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,6 @@ impl<'data> PortablePdbCache<'data> {

/// Resolves a string reference to the pointed-to `&str` data.
fn get_string(&self, offset: u32) -> Option<&'data str> {
let reader = &mut self.string_bytes.get(offset as usize..)?;
let len = leb128::read::unsigned(reader).ok()? as usize;

let bytes = reader.get(..len)?;

std::str::from_utf8(bytes).ok()
watto::StringTable::read(self.string_bytes, offset as usize).ok()
}
}
53 changes: 18 additions & 35 deletions symbolic-ppdb/src/cache/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ pub(crate) mod writer;

use symbolic_common::DebugId;
use thiserror::Error;
use zerocopy::LayoutVerified;
use watto::{align_to, Pod};

const PPDBCACHE_VERSION: u32 = 1;

Expand Down Expand Up @@ -148,10 +148,8 @@ pub struct PortablePdbCache<'data> {
impl<'data> PortablePdbCache<'data> {
/// Parses the given buffer into a `PortablePdbCache`.
pub fn parse(buf: &'data [u8]) -> Result<Self, CacheError> {
let (lv, rest) = LayoutVerified::<_, raw::Header>::new_from_prefix(buf)
.ok_or(CacheErrorKind::InvalidHeader)?;

let header = lv.into_ref();
let (header, rest) =
raw::Header::ref_from_prefix(buf).ok_or(CacheErrorKind::InvalidHeader)?;

if header.magic == raw::PPDBCACHE_MAGIC_FLIPPED {
return Err(CacheErrorKind::WrongEndianness.into());
Expand All @@ -164,34 +162,26 @@ impl<'data> PortablePdbCache<'data> {
return Err(CacheErrorKind::WrongVersion(header.version).into());
}

let rest = align_buf(rest);
let (_, rest) = align_to(rest, 8).ok_or(CacheErrorKind::InvalidFiles)?;

let (lv, rest) = LayoutVerified::<_, [raw::File]>::new_slice_from_prefix(
rest,
header.num_files as usize,
)
.ok_or(CacheErrorKind::InvalidFiles)?;
let (files, rest) = raw::File::slice_from_prefix(rest, header.num_files as usize)
.ok_or(CacheErrorKind::InvalidFiles)?;

let files = lv.into_slice();
let rest = align_buf(rest);
let (_, rest) = align_to(rest, 8).ok_or(CacheErrorKind::InvalidSourceLocations)?;

let (lv, rest) = LayoutVerified::<_, [raw::SourceLocation]>::new_slice_from_prefix(
rest,
header.num_ranges as usize,
)
.ok_or(CacheErrorKind::InvalidSourceLocations)?;
let (source_locations, rest) =
raw::SourceLocation::slice_from_prefix(rest, header.num_ranges as usize)
.ok_or(CacheErrorKind::InvalidSourceLocations)?;

let source_locations = lv.into_slice();
let rest = align_buf(rest);
let (_, rest) = align_to(rest, 8).ok_or(CacheErrorKind::InvalidRanges)?;

let (lv, rest) = LayoutVerified::<_, [raw::Range]>::new_slice_from_prefix(
rest,
header.num_ranges as usize,
)
.ok_or(CacheErrorKind::InvalidRanges)?;
let (ranges, rest) = raw::Range::slice_from_prefix(rest, header.num_ranges as usize)
.ok_or(CacheErrorKind::InvalidRanges)?;

let ranges = lv.into_slice();
let rest = align_buf(rest);
let (_, rest) = align_to(rest, 8).ok_or(CacheErrorKind::UnexpectedStringBytes {
expected: header.string_bytes as usize,
found: 0,
})?;

if rest.len() < header.string_bytes as usize {
return Err(CacheErrorKind::UnexpectedStringBytes {
Expand All @@ -212,9 +202,7 @@ impl<'data> PortablePdbCache<'data> {

/// Returns the [`DebugId`] of this portable PDB cache.
pub fn debug_id(&self) -> DebugId {
let (guid, age) = self.header.pdb_id.split_at(16);
let age = u32::from_ne_bytes(age.try_into().unwrap());
DebugId::from_guid_age(guid, age).unwrap()
self.header.pdb_id
}
}

Expand All @@ -229,8 +217,3 @@ impl<'data> std::fmt::Debug for PortablePdbCache<'data> {
.finish()
}
}

fn align_buf(buf: &[u8]) -> &[u8] {
let offset = buf.as_ptr().align_offset(8);
buf.get(offset..).unwrap_or(&[])
}
40 changes: 34 additions & 6 deletions symbolic-ppdb/src/cache/raw.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use zerocopy::{AsBytes, FromBytes};
use symbolic_common::DebugId;
use watto::Pod;

/// The magic file preamble as individual bytes.
const PPDBCACHE_MAGIC_BYTES: [u8; 4] = *b"PDBC";
Expand All @@ -11,15 +12,15 @@ pub(crate) const PPDBCACHE_MAGIC: u32 = u32::from_le_bytes(PPDBCACHE_MAGIC_BYTES
pub(crate) const PPDBCACHE_MAGIC_FLIPPED: u32 = PPDBCACHE_MAGIC.swap_bytes();

/// The header of a PortablePdbCache file.
#[derive(Debug, Clone, FromBytes, AsBytes)]
#[derive(Debug, Clone)]
#[repr(C)]
pub(crate) struct Header {
/// The file magic representing the file format and endianness.
pub(crate) magic: u32,
/// The PortablePdbCache format version.
pub(crate) version: u32,
/// A byte sequence uniquely representing the debugging metadata blob content.
pub(crate) pdb_id: [u8; 20],
pub(crate) pdb_id: DebugId,
/// The number of files contained in the cache file.
pub(crate) num_files: u32,
/// The number of ranges/source locations contained in the cache file.
Expand All @@ -32,7 +33,7 @@ pub(crate) struct Header {
}

/// A location in a source file, comprising a line and the index of a file.
#[derive(Debug, Clone, Copy, FromBytes, AsBytes)]
#[derive(Debug, Clone, Copy)]
#[repr(C)]
pub(crate) struct SourceLocation {
pub(crate) line: u32,
Expand All @@ -43,19 +44,46 @@ pub(crate) struct SourceLocation {
///
/// Only the starting IL offset is saved; the ending offset is given implicitly by
/// the starting offset of the next range (if any).
#[derive(Debug, Clone, Copy, FromBytes, AsBytes, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(C)]
pub(crate) struct Range {
pub(crate) func_idx: u32,
pub(crate) il_offset: u32,
}

/// Serialized file in the cache.
#[derive(Debug, Clone, Copy, FromBytes, AsBytes, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(C)]
pub(crate) struct File {
/// The file path (reference to a [`String`]).
pub(crate) name_offset: u32,
/// The file's source language.
pub(crate) lang: u32,
}

unsafe impl Pod for Header {}
unsafe impl Pod for SourceLocation {}
unsafe impl Pod for Range {}
unsafe impl Pod for File {}

#[cfg(test)]
mod tests {
use std::mem;

use super::*;

#[test]
fn test_sizeof() {
assert_eq!(mem::size_of::<Header>(), 68);
assert_eq!(mem::align_of::<Header>(), 4);

assert_eq!(mem::size_of::<File>(), 8);
assert_eq!(mem::align_of::<File>(), 4);

assert_eq!(mem::size_of::<SourceLocation>(), 8);
assert_eq!(mem::align_of::<SourceLocation>(), 4);

assert_eq!(mem::size_of::<Range>(), 8);
assert_eq!(mem::align_of::<Range>(), 4);
}
}
102 changes: 19 additions & 83 deletions symbolic-ppdb/src/cache/writer.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
use std::collections::{BTreeMap, HashMap};
use std::collections::BTreeMap;
use std::io::Write;

use indexmap::IndexSet;
use symbolic_common::Language;
use zerocopy::AsBytes;
use symbolic_common::{DebugId, Language};
use watto::{Pod, StringTable};

use super::{raw, CacheError};
use crate::PortablePdb;
Expand All @@ -15,18 +15,15 @@ use crate::PortablePdb;
#[derive(Debug, Default)]
pub struct PortablePdbCacheConverter {
/// A byte sequence uniquely representing the debugging metadata blob content.
pdb_id: [u8; 20],
pdb_id: DebugId,
/// The set of all [`raw::File`]s that have been added to this `Converter`.
files: IndexSet<raw::File>,
/// The concatenation of all strings that have been added to this `Converter`.
string_bytes: Vec<u8>,
/// A map from [`String`]s that have been added to this `Converter` to their offsets in the `string_bytes` field.
strings: HashMap<String, u32>,
/// A map from [`raw::Range`]s to the [`raw::SourceLocation`]s they correspond to.
///
/// Only the starting address of a range is saved, the end address is given implicitly
/// by the start address of the next range.
ranges: BTreeMap<raw::Range, raw::SourceLocation>,
string_table: StringTable,
}

impl PortablePdbCacheConverter {
Expand Down Expand Up @@ -69,11 +66,11 @@ impl PortablePdbCacheConverter {
///
/// This writes the PortablePdbCache binary format into the given [`Write`].
pub fn serialize<W: Write>(self, writer: &mut W) -> std::io::Result<()> {
let mut writer = WriteWrapper::new(writer);
let mut writer = watto::Writer::new(writer);

let num_ranges = self.ranges.len() as u32;
let num_files = self.files.len() as u32;
let string_bytes = self.string_bytes.len() as u32;
let string_bytes = self.string_table.into_bytes();

let header = raw::Header {
magic: raw::PPDBCACHE_MAGIC,
Expand All @@ -83,105 +80,44 @@ impl PortablePdbCacheConverter {

num_files,
num_ranges,
string_bytes,
string_bytes: string_bytes.len() as u32,
_reserved: [0; 16],
};

writer.write(header.as_bytes())?;
writer.align()?;
writer.write_all(header.as_bytes())?;
writer.align_to(8)?;

for file in self.files.into_iter() {
writer.write(file.as_bytes())?;
writer.write_all(file.as_bytes())?;
}
writer.align()?;
writer.align_to(8)?;

for sl in self.ranges.values() {
writer.write(sl.as_bytes())?;
writer.write_all(sl.as_bytes())?;
}
writer.align()?;
writer.align_to(8)?;

for r in self.ranges.keys() {
writer.write(r.as_bytes())?;
writer.write_all(r.as_bytes())?;
}
writer.align()?;
writer.align_to(8)?;

writer.write(&self.string_bytes)?;
writer.write_all(&string_bytes)?;

Ok(())
}

fn set_pdb_id(&mut self, id: [u8; 20]) {
fn set_pdb_id(&mut self, id: DebugId) {
self.pdb_id = id;
}

fn insert_file(&mut self, name: &str, lang: Language) -> u32 {
let name_offset = self.insert_string(name);
let name_offset = self.string_table.insert(name) as u32;
let file = raw::File {
name_offset,
lang: lang as u32,
};

self.files.insert_full(file).0 as u32
}

/// Insert a string into this converter.
///
/// If the string was already present, it is not added again. A newly added string
/// is prefixed by its length in LEB128 encoding. The returned `u32`
/// is the offset into the `string_bytes` field where the string is saved.
fn insert_string(&mut self, s: &str) -> u32 {
let Self {
ref mut strings,
ref mut string_bytes,
..
} = self;
if s.is_empty() {
return u32::MAX;
}
if let Some(&offset) = strings.get(s) {
return offset;
}
let string_offset = string_bytes.len() as u32;
let string_len = s.len() as u64;
leb128::write::unsigned(string_bytes, string_len).unwrap();
string_bytes.extend(s.bytes());

strings.insert(s.to_owned(), string_offset);
string_offset
}
}

struct WriteWrapper<W> {
writer: W,
position: usize,
}

impl<W: Write> WriteWrapper<W> {
fn new(writer: W) -> Self {
Self {
writer,
position: 0,
}
}

fn write(&mut self, data: &[u8]) -> std::io::Result<usize> {
let len = data.len();
self.writer.write_all(data)?;
self.position += len;
Ok(len)
}

fn align(&mut self) -> std::io::Result<usize> {
let buf = &[0u8; 7];
let len = {
let to_align = self.position;
let remainder = to_align % 8;
if remainder == 0 {
remainder
} else {
8 - remainder
}
};
self.write(&buf[0..len])
}
}
Loading

0 comments on commit 2411baf

Please sign in to comment.