Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix possible hang when using torch.multiprocessing #6271

Merged
merged 7 commits into from
May 13, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions crates/re_sdk/src/recording_stream.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1403,6 +1403,11 @@ impl RecordingStream {
/// If the current sink is in a broken state (e.g. a TCP sink with a broken connection that
/// cannot be repaired), all pending data in its buffers will be dropped.
pub fn set_sink(&self, sink: Box<dyn LogSink>) {
if self.is_forked_child() {
re_log::error_once!("Fork detected during set_sink. cleanup_if_forked() should always be called after forking. This is likely a bug in the SDK.");
return;
}

let f = move |inner: &RecordingStreamInner| {
// NOTE: Internal channels can never be closed outside of the `Drop` impl, all these sends
// are safe.
Expand Down Expand Up @@ -1435,6 +1440,11 @@ impl RecordingStream {
/// This does **not** wait for the flush to propagate (see [`Self::flush_blocking`]).
/// See [`RecordingStream`] docs for ordering semantics and multithreading guarantees.
pub fn flush_async(&self) {
if self.is_forked_child() {
re_log::error_once!("Fork detected during flush_async. cleanup_if_forked() should always be called after forking. This is likely a bug in the SDK.");
return;
}

let f = move |inner: &RecordingStreamInner| {
// NOTE: Internal channels can never be closed outside of the `Drop` impl, all these sends
// are safe.
Expand Down
8 changes: 7 additions & 1 deletion rerun_py/rerun_sdk/rerun/recording_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,13 @@ def to_native(self: RecordingStream | None) -> bindings.PyRecordingStream | None

def __del__(self):  # type: ignore[no-untyped-def]
    native = RecordingStream.to_native(self)
    # TODO(jleibs): This flush is almost certainly redundant (98% sure), but removing it
    # requires more thorough testing.
    # However, it's definitely a problem if we are in a forked child process. The rerun SDK
    # will still detect this case and prevent a hang internally, but will do so with a
    # warning that we should avoid.
    #
    # See: https://github.com/rerun-io/rerun/issues/6223 for context on why this is necessary.
    if native is None or native.is_forked_child():
        return
    bindings.flush(blocking=False, recording=native)


def binary_stream(recording: RecordingStream | None = None) -> BinaryStream:
Expand Down
14 changes: 14 additions & 0 deletions rerun_py/src/python_bridge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,20 @@ fn shutdown(py: Python<'_>) {
#[derive(Clone)]
struct PyRecordingStream(RecordingStream);

#[pymethods]
impl PyRecordingStream {
    /// Determine if this stream is operating in the context of a forked child process.
    ///
    /// This means the stream was created in the parent process. It now exists in the child
    /// process by way of fork, but it is effectively a zombie since its batcher and sink
    /// threads would not have been copied.
    ///
    /// Calling operations such as flush or set_sink will result in an error.
    fn is_forked_child(&self) -> bool {
        self.0.is_forked_child()
    }
}

impl std::ops::Deref for PyRecordingStream {
type Target = RecordingStream;

Expand Down
34 changes: 34 additions & 0 deletions rerun_py/tests/unit/test_multiprocessing_gc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from __future__ import annotations

import gc

import rerun as rr

# If torch is available, use torch.multiprocessing instead of multiprocessing
# since it causes more issues. But, it's annoying to always require it so at
# least for the tests in other contexts, we'll use the standard library version.
try:
    import torch.multiprocessing as multiprocessing
except ImportError:
    # NOTE: the suppression must be spelled `type: ignore[...]` — a bare
    # `ignore[no-redef]` is not recognized by mypy and silences nothing.
    import multiprocessing  # type: ignore[no-redef]


def task() -> None:
    # Run inside the forked child: force a full garbage-collection pass.
    # If recording streams were leaked across the fork, collecting them here
    # can hang — a failure mode we've seen specifically with the
    # `torch.multiprocessing` module.
    gc.collect(2)  # generation 2 == full collection, the default for gc.collect()


def test_multiprocessing_gc() -> None:
    """Regression test: a gc pass in a forked child must not deadlock.

    See https://github.com/rerun-io/rerun/issues/6223 — recording streams
    leaked across a fork used to hang the child during `gc.collect()`.
    """
    rr.init("rerun_example_multiprocessing_gc")

    proc = multiprocessing.Process(
        target=task,
    )
    proc.start()
    # A healthy child finishes almost instantly; one second is a generous bound.
    proc.join(1)
    if proc.is_alive():
        # Terminate so our test doesn't get stuck, then join to reap the
        # killed child so it doesn't linger as a zombie process.
        proc.terminate()
        proc.join()
        # `raise AssertionError` instead of `assert False`: asserts are
        # stripped when Python runs with optimizations (-O), which would
        # silently disable this failure path.
        raise AssertionError("Process deadlocked during gc.collect()")
Loading