Introduce rerun compare to check whether 2 rrd files are functional…

…ly equivalent (#2597) 1. Run some example and save the result to an .rrd file: ``` $ ./examples/python/clock/main.py --steps 50 --save examples/python/clock/out1.rrd ``` 2. Do it again: ``` $ ./examples/python/clock/main.py --steps 50 --save examples/python/clock/out2.rrd ``` 3. Compare these two rrd files: ``` $ cargo r -p rerun-cli --quiet -- compare examples/python/clock/out1.rrd examples/python/clock/out2.rrd $ echo $? 0 ``` 4. Modify the example slightly, and save it to a third rrd file: ``` $ vim ./examples/python/clock/main.py $ git diff - color_m = (int(255 - (scaled_m * 255)), int(scaled_m * 255), 128, 128) + color_m = (int(255 - (scaled_m * 255)), int(scaled_m * 250), 128, 128) $ ./examples/python/clock/main.py --steps 50 --save examples/python/clock/out3.rrd ``` 5. Compare the first and third .rrd files: ``` $ cargo r -p rerun-cli --quiet -- compare examples/python/clock/out1.rrd examples/python/clock/out3.rrd ``` ![image](https://github.com/rerun-io/rerun/assets/2910679/70be3ec3-cf4f-4584-affc-46b72154fc9b) ``` $ echo $? 1 ``` --- ### Checklist * [x] I have read and agree to [Contributor Guide](https://github.com/rerun-io/rerun/blob/main/CONTRIBUTING.md) and the [Code of Conduct](https://github.com/rerun-io/rerun/blob/main/CODE_OF_CONDUCT.md) * [x] I've included a screenshot or gif (if applicable) * [x] I have tested [demo.rerun.io](https://demo.rerun.io/pr/2597) (if applicable) - [PR Build Summary](https://build.rerun.io/pr/2597) - [Docs preview](https://rerun.io/preview/pr%3Acmc%2Fcompare_rrd/docs) - [Examples preview](https://rerun.io/preview/pr%3Acmc%2Fcompare_rrd/examples)
rerun-io · Jul 5, 2023 · caba397 · caba397
1 parent 2569eb7
commit caba397
Show file tree

Hide file tree

Showing 5 changed files with 197 additions and 3 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -6,7 +6,7 @@ members = [
   "docs/code-examples",
   "rerun_py",
   "run_wasm",
-  "tests/rust/*",
+  "tests/rust/test_*",
 ]
 
 [workspace.package]

diff --git a/crates/re_log_types/Cargo.toml b/crates/re_log_types/Cargo.toml
@@ -40,6 +40,7 @@ re_tuid = { workspace = true, features = ["arrow2_convert"] }
 
 # External
 ahash.workspace = true
+anyhow.workspace = true
 arrow2 = { workspace = true, features = [
   "io_ipc",
   "io_print",
@@ -53,6 +54,7 @@ web-time.workspace = true
 nohash-hasher = "0.2"
 num-derive = "0.3"
 num-traits = "0.2"
+similar-asserts = "1.4.2"
 smallvec.workspace = true
 thiserror.workspace = true
 time = { workspace = true, features = ["formatting", "macros"] }

diff --git a/crates/re_log_types/src/data_table.rs b/crates/re_log_types/src/data_table.rs
@@ -1057,3 +1057,102 @@ impl std::fmt::Display for DataTable {
         .fmt(f)
     }
 }
+
+impl DataTable {
+    /// Checks whether two [`DataTable`]s are _similar_, i.e. not equal on a byte-level but
+    /// functionally equivalent.
+    ///
+    /// Rows are grouped per timeline, put into a common `(timepoint, row_id)` order and then
+    /// compared pairwise on everything except their `row_id`. The `log_time` timeline is
+    /// dropped before rows are compared.
+    ///
+    /// Returns `Ok(())` if they match, or an error containing a detailed diff otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Fails if a timeline is present in only one of the tables, if a timeline holds a
+    /// different number of rows in each table, or if any pair of rows differs.
+    pub fn similar(table1: &DataTable, table2: &DataTable) -> anyhow::Result<()> {
+        /// Given a [`DataTable`], returns all of its rows grouped by timeline.
+        ///
+        /// A row that lives on several timelines is duplicated once per timeline, and each copy
+        /// has its timepoint narrowed down to that single timeline.
+        fn compute_rows(table: &DataTable) -> HashMap<Timeline, Vec<DataRow>> {
+            let mut rows_by_timeline: HashMap<Timeline, Vec<DataRow>> = Default::default();
+
+            for row in table.to_rows() {
+                for (timeline, time) in row.timepoint.iter() {
+                    let mut row = row.clone();
+                    row.timepoint = TimePoint::from([(*timeline, *time)]);
+                    rows_by_timeline.entry(*timeline).or_default().push(row);
+                }
+            }
+
+            rows_by_timeline
+        }
+
+        /// Orders rows by timepoint first, then by row ID, without cloning either.
+        fn natural_order(a: &DataRow, b: &DataRow) -> std::cmp::Ordering {
+            a.timepoint
+                .cmp(&b.timepoint)
+                .then_with(|| a.row_id.cmp(&b.row_id))
+        }
+
+        let mut rows_by_timeline1 = compute_rows(table1);
+        let mut rows_by_timeline2 = compute_rows(table2);
+
+        for timeline1 in rows_by_timeline1.keys() {
+            anyhow::ensure!(
+                rows_by_timeline2.contains_key(timeline1),
+                "timeline {timeline1:?} was present in the first rrd file but not in the second",
+            );
+        }
+        for timeline2 in rows_by_timeline2.keys() {
+            anyhow::ensure!(
+                rows_by_timeline1.contains_key(timeline2),
+                "timeline {timeline2:?} was present in the second rrd file but not in the first",
+            );
+        }
+
+        // NOTE: Can't compare `log_time`, by definition.
+        rows_by_timeline1.remove(&Timeline::log_time());
+        rows_by_timeline2.remove(&Timeline::log_time());
+
+        for (timeline, rows1) in &mut rows_by_timeline1 {
+            let rows2 = rows_by_timeline2
+                .get_mut(timeline)
+                .expect("both tables were checked above to hold the same set of timelines");
+
+            // NOTE: We need both sets of rows to follow a common natural order for the comparison
+            // to make sense.
+            rows1.sort_by(natural_order);
+            rows2.sort_by(natural_order);
+
+            anyhow::ensure!(
+                rows1.len() == rows2.len(),
+                "rrd files yielded different number of datastore rows for timeline {timeline:?}: {} vs. {}",
+                rows1.len(),
+                rows2.len()
+            );
+
+            for (ri, (row1, row2)) in rows1.iter().zip(rows2.iter()).enumerate() {
+                // NOTE: Exhaustive destructuring, so that adding a field to `DataRow` forces a
+                // decision on whether it takes part in the comparison. Row IDs are deliberately
+                // left out.
+                let DataRow {
+                    row_id: _,
+                    timepoint: timepoint1,
+                    entity_path: entity_path1,
+                    num_instances: num_instances1,
+                    cells: cells1,
+                } = row1;
+                let DataRow {
+                    row_id: _,
+                    timepoint: timepoint2,
+                    entity_path: entity_path2,
+                    num_instances: num_instances2,
+                    cells: cells2,
+                } = row2;
+
+                anyhow::ensure!(
+                    timepoint1 == timepoint2
+                        && entity_path1 == entity_path2
+                        && num_instances1 == num_instances2
+                        && cells1 == cells2,
+                    "Found discrepancy in row #{ri}:\n{}",
+                    similar_asserts::SimpleDiff::from_str(
+                        &row1.to_string(),
+                        &row2.to_string(),
+                        "row1",
+                        "row2"
+                    )
+                );
+            }
+        }
+
+        Ok(())
+    }
+}
diff --git a/crates/rerun/src/run.rs b/crates/rerun/src/run.rs
@@ -1,3 +1,5 @@
+use std::path::{Path, PathBuf};
+
 use itertools::Itertools;
 use re_log_types::{LogMsg, PythonVersion};
 use re_smart_channel::{Receiver, SmartMessagePayload};
@@ -149,6 +151,19 @@ enum Commands {
     #[cfg(all(feature = "analytics"))]
     #[command(subcommand)]
     Analytics(AnalyticsCommands),
+
+    /// Compares the data between 2 .rrd files, returning a successful shell exit code if they
+    /// match.
+    ///
+    /// This ignores the `log_time` timeline.
+    Compare {
+        path_to_rrd1: String,
+        path_to_rrd2: String,
+
+        /// If specified, dumps both .rrd files as tables.
+        #[clap(long, default_value_t = false)]
+        full_dump: bool,
+    },
 }
 
 #[derive(Debug, Clone, Subcommand)]
@@ -254,8 +269,16 @@ where
         match commands {
             #[cfg(all(feature = "analytics"))]
             Commands::Analytics(analytics) => run_analytics(analytics).map_err(Into::into),
-            #[cfg(not(all(feature = "analytics")))]
-            _ => Ok(()),
+
+            Commands::Compare {
+                path_to_rrd1,
+                path_to_rrd2,
+                full_dump,
+            } => {
+                let path_to_rrd1 = PathBuf::from(path_to_rrd1);
+                let path_to_rrd2 = PathBuf::from(path_to_rrd2);
+                run_compare(&path_to_rrd1, &path_to_rrd2, *full_dump)
+            }
         }
     } else {
         run_impl(build_info, call_source, args).await
@@ -280,6 +303,75 @@ where
     }
 }
 
+/// Checks whether two .rrd files are _similar_, i.e. not equal on a byte-level but
+/// functionally equivalent.
+///
+/// Each file is loaded into a store and flattened into a single `DataTable`; the two tables are
+/// then handed to `DataTable::similar`. When `full_dump` is set, both tables are printed to
+/// stdout before the comparison.
+///
+/// Returns `Ok(())` if they match, or an error containing a detailed diff otherwise.
+///
+/// # Errors
+///
+/// Fails if either file cannot be opened or decoded, if a file does not contain exactly one
+/// data recording, or if the two recordings differ.
+fn run_compare(path_to_rrd1: &Path, path_to_rrd2: &Path, full_dump: bool) -> anyhow::Result<()> {
+    /// Given a path to an rrd file, builds up a `DataStore` and returns its contents as one big
+    /// `DataTable`.
+    ///
+    /// Fails if the rrd file does not contain exactly one data recording.
+    fn compute_uber_table(path_to_rrd: &Path) -> anyhow::Result<re_log_types::DataTable> {
+        use re_data_store::StoreDb;
+        use re_log_types::StoreId;
+
+        let rrd_file = std::fs::File::open(path_to_rrd)
+            .with_context(|| format!("couldn't open rrd file contents at {path_to_rrd:?}"))?;
+
+        let mut stores: std::collections::HashMap<StoreId, StoreDb> = Default::default();
+        let decoder = re_log_encoding::decoder::Decoder::new(rrd_file)?;
+        for msg in decoder {
+            let msg = msg
+                .with_context(|| format!("couldn't decode rrd file contents at {path_to_rrd:?}"))?;
+            stores
+                .entry(msg.store_id().clone())
+                // NOTE: Lazily, so a fresh store is only built the first time an ID shows up
+                // rather than once per message.
+                .or_insert_with(|| StoreDb::new(msg.store_id().clone()))
+                .add(&msg)
+                .with_context(|| format!("couldn't decode rrd file contents at {path_to_rrd:?}"))?;
+        }
+
+        let recordings = stores
+            .values()
+            .filter(|store| store.store_kind() == re_log_types::StoreKind::Recording)
+            .collect_vec();
+
+        let store = match recordings.as_slice() {
+            [] => anyhow::bail!("no data recording found in rrd file at {path_to_rrd:?}"),
+            [store] => *store,
+            _ => anyhow::bail!("more than one data recording found in rrd file at {path_to_rrd:?}"),
+        };
+
+        let table = re_log_types::DataTable::from_rows(re_log_types::TableId::random(), {
+            let mut rows = store
+                .store()
+                .to_data_tables(None)
+                .flat_map(|t| t.to_rows().collect_vec())
+                .collect_vec();
+            // NOTE: So the full dump makes sense, if enabled.
+            rows.sort_by(|a, b| {
+                a.timepoint
+                    .cmp(&b.timepoint)
+                    .then_with(|| a.row_id.cmp(&b.row_id))
+            });
+            rows
+        });
+
+        Ok(table)
+    }
+
+    let table1 = compute_uber_table(path_to_rrd1)?;
+    let table2 = compute_uber_table(path_to_rrd2)?;
+
+    if full_dump {
+        println!("{table1}");
+        println!("{table2}");
+    }
+
+    re_log_types::DataTable::similar(&table1, &table2)
+}
+
 #[cfg(all(feature = "analytics"))]
 fn run_analytics(cmd: &AnalyticsCommands) -> Result<(), re_analytics::cli::CliError> {
     match cmd {