Skip to content

Commit

Permalink
Add support for RealityCapture (#1055)
Browse files Browse the repository at this point in the history
* Add support for RealityCapture

* Fix coordinate bug

* Remove transform

* Update focal lengths

* isort
  • Loading branch information
tancik authored Jan 20, 2023
1 parent b1da94c commit af41819
Show file tree
Hide file tree
Showing 3 changed files with 244 additions and 34 deletions.
23 changes: 23 additions & 0 deletions nerfstudio/process_data/process_data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,29 @@ class CameraModel(Enum):
}


def get_image_filenames(directory: Path, max_num_images: int = -1) -> Tuple[List[Path], int]:
    """Returns a list of image filenames in a directory.

    Args:
        directory: Path to the directory.
        max_num_images: The maximum number of images to return. -1 means no limit.
    Returns:
        A tuple of (list of image filenames, number of original image paths).
    """
    allowed_exts = [".jpg", ".jpeg", ".png", ".tif", ".tiff"]
    # "[!.]*" skips hidden files (e.g. .DS_Store); suffixes compared case-insensitively.
    image_paths = sorted(p for p in directory.glob("[!.]*") if p.suffix.lower() in allowed_exts)
    num_orig_images = len(image_paths)

    if max_num_images != -1 and num_orig_images > max_num_images:
        # Sample max_num_images indices approximately evenly across the full range.
        idx = np.round(np.linspace(0, num_orig_images - 1, max_num_images)).astype(int)
    else:
        idx = np.arange(num_orig_images)

    image_filenames = list(np.array(image_paths)[idx])

    return image_filenames, num_orig_images


def get_num_frames_in_video(video: Path) -> int:
"""Returns the number of frames in a video.
Expand Down
119 changes: 119 additions & 0 deletions nerfstudio/process_data/realitycapture_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# Copyright 2022 The Nerfstudio Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Helper utils for processing polycam data into the nerfstudio format."""

import csv
import json
from pathlib import Path
from typing import Dict, List

import numpy as np
from PIL import Image
from rich.console import Console

from nerfstudio.process_data.process_data_utils import CAMERA_MODELS

CONSOLE = Console(width=120)


def realitycapture_to_json(
    image_filename_map: Dict[str, Path],
    csv_filename: Path,
    output_dir: Path,
) -> List[str]:
    """Convert RealityCapture data into a nerfstudio dataset.

    Args:
        image_filename_map: Mapping from original image stem to the copied image path
            (relative to ``output_dir``).
        csv_filename: Path to the csv file containing the camera poses.
        output_dir: Path to the output directory; ``transforms.json`` is written here.
    Returns:
        Summary of the conversion.
    """
    data = {}
    data["camera_model"] = CAMERA_MODELS["perspective"].value
    # Needs to be a string for camera_utils.auto_orient_and_center_poses
    data["orientation_override"] = "none"

    frames = []

    with open(csv_filename, encoding="UTF-8") as file:
        reader = csv.DictReader(file)
        # Transpose rows into column-wise lists keyed by the CSV header names.
        cameras = {}
        for row in reader:
            for column, value in row.items():
                cameras.setdefault(column, []).append(value)

    # Read one image to determine the resolution; assumes all images share it.
    img = np.array(Image.open(output_dir / image_filename_map[cameras["#name"][0].split(".")[0]]))
    height, width, _ = img.shape

    data["h"] = int(height)
    data["w"] = int(width)

    for i, name in enumerate(cameras["#name"]):
        frame = {}
        frame["file_path"] = image_filename_map[name.split(".")[0]].as_posix()
        # RealityCapture exports the focal length in 35mm-equivalent terms (36mm sensor width).
        frame["fl_x"] = float(cameras["f"][i]) * max(width, height) / 36
        frame["fl_y"] = float(cameras["f"][i]) * max(width, height) / 36
        # TODO: Unclear how to get the principal point from RealityCapture, here a guess...
        frame["cx"] = float(cameras["px"][i]) / 36.0 + width / 2.0
        frame["cy"] = float(cameras["py"][i]) / 36.0 + height / 2.0
        # TODO: Not sure if RealityCapture uses this distortion model
        # Cast to float so transforms.json stores numbers, not CSV strings.
        frame["k1"] = float(cameras["k1"][i])
        frame["k2"] = float(cameras["k2"][i])
        frame["k3"] = float(cameras["k3"][i])
        frame["k4"] = float(cameras["k4"][i])
        frame["p1"] = float(cameras["t1"][i])
        frame["p2"] = float(cameras["t2"][i])

        # Transform matrix to nerfstudio format. Please refer to the documentation for coordinate system conventions.
        rot = _get_rotation_matrix(-float(cameras["heading"][i]), float(cameras["pitch"][i]), float(cameras["roll"][i]))

        transform = np.eye(4)
        transform[:3, :3] = rot
        transform[:3, 3] = np.array([float(cameras["x"][i]), float(cameras["y"][i]), float(cameras["alt"][i])])

        frame["transform_matrix"] = transform.tolist()
        frames.append(frame)
    data["frames"] = frames

    with open(output_dir / "transforms.json", "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

    summary = []
    if len(frames) < len(image_filename_map):
        summary.append(f"Missing camera data for {len(image_filename_map) - len(frames)} frames.")
    summary.append(f"Final dataset is {len(frames)} frames.")

    return summary


def _get_rotation_matrix(yaw, pitch, roll):
"""Returns a rotation matrix given euler angles."""

s_yaw = np.sin(np.deg2rad(yaw))
c_yaw = np.cos(np.deg2rad(yaw))
s_pitch = np.sin(np.deg2rad(pitch))
c_pitch = np.cos(np.deg2rad(pitch))
s_roll = np.sin(np.deg2rad(roll))
c_roll = np.cos(np.deg2rad(roll))

rot_x = np.array([[1, 0, 0], [0, c_pitch, -s_pitch], [0, s_pitch, c_pitch]])
rot_y = np.array([[c_roll, 0, s_roll], [0, 1, 0], [-s_roll, 0, c_roll]])
rot_z = np.array([[c_yaw, -s_yaw, 0], [s_yaw, c_yaw, 0], [0, 0, 1]])

return rot_z @ rot_x @ rot_y
136 changes: 102 additions & 34 deletions scripts/process_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
metashape_utils,
polycam_utils,
process_data_utils,
realitycapture_utils,
record3d_utils,
)
from nerfstudio.process_data.process_data_utils import CAMERA_MODELS
Expand Down Expand Up @@ -479,8 +480,8 @@ def main(self) -> None:
summary_log.append(f"Used {num_frames} images out of {num_images} total")
if self.max_dataset_size > 0:
summary_log.append(
"To change the size of the dataset add the argument --max_dataset_size to larger than the "
f"current value ({self.max_dataset_size}), or -1 to use all images."
"To change the size of the dataset add the argument [yellow]--max_dataset_size[/yellow] to "
f"larger than the current value ({self.max_dataset_size}), or -1 to use all images."
)

# Downscale images
Expand Down Expand Up @@ -558,19 +559,10 @@ def main(self) -> None:
raise ValueError(f"Image directory {polycam_image_dir} doesn't exist")

# Copy images to output directory
polycam_image_filenames, num_orig_images = process_data_utils.get_image_filenames(
polycam_image_dir, self.max_dataset_size
)

polycam_image_filenames = []
for f in polycam_image_dir.iterdir():
if f.suffix.lower() in [".jpg", ".jpeg", ".png", ".tif", ".tiff"]:
polycam_image_filenames.append(f)
polycam_image_filenames = sorted(polycam_image_filenames, key=lambda fn: int(fn.stem))
num_images = len(polycam_image_filenames)
idx = np.arange(num_images)
if self.max_dataset_size != -1 and num_images > self.max_dataset_size:
idx = np.round(np.linspace(0, num_images - 1, self.max_dataset_size)).astype(int)

polycam_image_filenames = list(np.array(polycam_image_filenames)[idx])
# Copy images to output directory
copied_image_paths = process_data_utils.copy_images_list(
polycam_image_filenames,
image_dir=image_dir,
Expand All @@ -581,11 +573,11 @@ def main(self) -> None:

copied_image_paths = [Path("images/" + copied_image_path.name) for copied_image_path in copied_image_paths]

if self.max_dataset_size > 0 and num_frames != num_images:
summary_log.append(f"Started with {num_frames} images out of {num_images} total")
if self.max_dataset_size > 0 and num_frames != num_orig_images:
summary_log.append(f"Started with {num_frames} images out of {num_orig_images} total")
summary_log.append(
"To change the size of the dataset add the argument --max_dataset_size to larger than the "
f"current value ({self.max_dataset_size}), or -1 to use all images."
"To change the size of the dataset add the argument [yellow]--max_dataset_size[/yellow] to "
f"larger than the current value ({self.max_dataset_size}), or -1 to use all images."
)
else:
summary_log.append(f"Started with {num_frames} images")
Expand Down Expand Up @@ -657,18 +649,7 @@ def main(self) -> None:
summary_log = []

# Copy images to output directory
image_filenames = []
for f in self.data.iterdir():
if f.suffix.lower() in [".jpg", ".jpeg", ".png", ".tif", ".tiff"]:
image_filenames.append(f)
image_filenames = sorted(image_filenames, key=lambda fn: fn.stem)
num_images = len(image_filenames)
idx = np.arange(num_images)
if self.max_dataset_size != -1 and num_images > self.max_dataset_size:
idx = np.round(np.linspace(0, num_images - 1, self.max_dataset_size)).astype(int)

image_filenames = list(np.array(image_filenames)[idx])
# Copy images to output directory
image_filenames, num_orig_images = process_data_utils.get_image_filenames(self.data, self.max_dataset_size)
copied_image_paths = process_data_utils.copy_images_list(
image_filenames,
image_dir=image_dir,
Expand All @@ -680,11 +661,11 @@ def main(self) -> None:
original_names = [image_path.stem for image_path in image_filenames]
image_filename_map = dict(zip(original_names, copied_image_paths))

if self.max_dataset_size > 0 and num_frames != num_images:
summary_log.append(f"Started with {num_frames} images out of {num_images} total")
if self.max_dataset_size > 0 and num_frames != num_orig_images:
summary_log.append(f"Started with {num_frames} images out of {num_orig_images} total")
summary_log.append(
"To change the size of the dataset add the argument --max_dataset_size to larger than the "
f"current value ({self.max_dataset_size}), or -1 to use all images."
"To change the size of the dataset add the argument [yellow]--max_dataset_size[/yellow] to "
f"larger than the current value ({self.max_dataset_size}), or -1 to use all images."
)
else:
summary_log.append(f"Started with {num_frames} images")
Expand Down Expand Up @@ -712,11 +693,98 @@ def main(self) -> None:
CONSOLE.rule()


@dataclass
class ProcessRealityCapture:
    """Process RealityCapture data into a nerfstudio dataset.

    This script assumes that cameras have been aligned using RealityCapture. After alignment, it is necessary to
    export the camera poses as a `.csv` file.

    This script does the following:

    1. Scales images to a specified size.
    2. Converts RealityCapture poses into the nerfstudio format.
    """

    data: Path
    """Path to a folder of images."""
    csv: Path
    """Path to the RealityCapture cameras CSV file."""
    output_dir: Path
    """Path to the output directory."""
    num_downscales: int = 3
    """Number of times to downscale the images. Downscales by 2 each time. For example a value of 3
    will downscale the images by 2x, 4x, and 8x."""
    max_dataset_size: int = 600
    """Max number of images to train on. If the dataset has more, images will be sampled approximately evenly. If -1,
    use all images."""
    verbose: bool = False
    """If True, print extra logging."""

    def main(self) -> None:
        """Process images into a nerfstudio dataset.

        Raises:
            ValueError: If the CSV file does not have a .csv extension or does not exist.
        """

        if self.csv.suffix != ".csv":
            raise ValueError(f"CSV file {self.csv} must have a .csv extension")
        # Fix: `exists` is a method — without the call the check was always truthy
        # and a missing CSV file was never reported.
        if not self.csv.exists():
            raise ValueError(f"CSV file {self.csv} doesn't exist")

        self.output_dir.mkdir(parents=True, exist_ok=True)
        image_dir = self.output_dir / "images"
        image_dir.mkdir(parents=True, exist_ok=True)

        summary_log = []

        # Copy images to output directory
        image_filenames, num_orig_images = process_data_utils.get_image_filenames(self.data, self.max_dataset_size)
        copied_image_paths = process_data_utils.copy_images_list(
            image_filenames,
            image_dir=image_dir,
            verbose=self.verbose,
        )
        num_frames = len(copied_image_paths)

        # Map original image stems to the copied (output-relative) paths for the pose converter.
        copied_image_paths = [Path("images/" + copied_image_path.name) for copied_image_path in copied_image_paths]
        original_names = [image_path.stem for image_path in image_filenames]
        image_filename_map = dict(zip(original_names, copied_image_paths))

        if self.max_dataset_size > 0 and num_frames != num_orig_images:
            summary_log.append(f"Started with {num_frames} images out of {num_orig_images} total")
            summary_log.append(
                "To change the size of the dataset add the argument [yellow]--max_dataset_size[/yellow] to "
                f"larger than the current value ({self.max_dataset_size}), or -1 to use all images."
            )
        else:
            summary_log.append(f"Started with {num_frames} images")

        # Downscale images
        summary_log.append(process_data_utils.downscale_images(image_dir, self.num_downscales, verbose=self.verbose))

        # Save json
        if num_frames == 0:
            CONSOLE.print("[bold red]No images found, exiting")
            sys.exit(1)
        summary_log.extend(
            realitycapture_utils.realitycapture_to_json(
                image_filename_map=image_filename_map,
                csv_filename=self.csv,
                output_dir=self.output_dir,
            )
        )

        CONSOLE.rule("[bold green]:tada: :tada: :tada: All DONE :tada: :tada: :tada:")

        for summary in summary_log:
            CONSOLE.print(summary, justify="center")
        CONSOLE.rule()


# Union of all processing subcommands; each entry is registered as a named
# tyro subcommand (e.g. `images`, `video`, `realitycapture`).
Commands = Union[
    Annotated[ProcessImages, tyro.conf.subcommand(name="images")],
    Annotated[ProcessVideo, tyro.conf.subcommand(name="video")],
    Annotated[ProcessPolycam, tyro.conf.subcommand(name="polycam")],
    Annotated[ProcessMetashape, tyro.conf.subcommand(name="metashape")],
    Annotated[ProcessRealityCapture, tyro.conf.subcommand(name="realitycapture")],
    Annotated[ProcessInsta360, tyro.conf.subcommand(name="insta360")],
    Annotated[ProcessRecord3D, tyro.conf.subcommand(name="record3d")],
]
Expand Down

0 comments on commit af41819

Please sign in to comment.