diff --git a/nerfstudio/process_data/process_data_utils.py b/nerfstudio/process_data/process_data_utils.py index 853ed8d332..e6e58f1846 100644 --- a/nerfstudio/process_data/process_data_utils.py +++ b/nerfstudio/process_data/process_data_utils.py @@ -45,6 +45,29 @@ class CameraModel(Enum): } +def get_image_filenames(directory: Path, max_num_images: int = -1) -> Tuple[List[Path], int]: + """Returns a list of image filenames in a directory. + + Args: + directory: Path to the directory. + max_num_images: The maximum number of images to return. -1 means no limit. + Returns: + A tuple of (list of image filenames, number of original image paths). + """ + allowed_exts = [".jpg", ".jpeg", ".png", ".tif", ".tiff"] + image_paths = sorted([p for p in directory.glob("[!.]*") if p.suffix.lower() in allowed_exts]) + num_orig_images = len(image_paths) + + if max_num_images != -1 and num_orig_images > max_num_images: + idx = np.round(np.linspace(0, num_orig_images - 1, max_num_images)).astype(int) + else: + idx = np.arange(num_orig_images) + + image_filenames = list(np.array(image_paths)[idx]) + + return image_filenames, num_orig_images + + def get_num_frames_in_video(video: Path) -> int: """Returns the number of frames in a video. diff --git a/nerfstudio/process_data/realitycapture_utils.py b/nerfstudio/process_data/realitycapture_utils.py new file mode 100644 index 0000000000..eaba5aeb0a --- /dev/null +++ b/nerfstudio/process_data/realitycapture_utils.py @@ -0,0 +1,119 @@ +# Copyright 2022 The Nerfstudio Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helper utils for processing RealityCapture data into the nerfstudio format.""" + +import csv +import json +from pathlib import Path +from typing import Dict, List + +import numpy as np +from PIL import Image +from rich.console import Console + +from nerfstudio.process_data.process_data_utils import CAMERA_MODELS + +CONSOLE = Console(width=120) + + +def realitycapture_to_json( + image_filename_map: Dict[str, Path], + csv_filename: Path, + output_dir: Path, +) -> List[str]: + """Convert RealityCapture data into a nerfstudio dataset. + + Args: + image_filename_map: Mapping from original image name (without extension) to the copied image path. + csv_filename: Path to the csv file containing the camera poses. + output_dir: Path to the output directory. + + Returns: + Summary of the conversion. + """ + data = {} + data["camera_model"] = CAMERA_MODELS["perspective"].value + # Needs to be a string for camera_utils.auto_orient_and_center_poses + data["orientation_override"] = "none" + + frames = [] + + with open(csv_filename, encoding="UTF-8") as file: + reader = csv.DictReader(file) + cameras = {} + for row in reader: + for column, value in row.items(): + cameras.setdefault(column, []).append(value) + + img = np.array(Image.open(output_dir / image_filename_map[cameras["#name"][0].split(".")[0]])) + height, width, _ = img.shape + + data["h"] = int(height) + data["w"] = int(width) + + for i, name in enumerate(cameras["#name"]): + frame = {} + frame["file_path"] = image_filename_map[name.split(".")[0]].as_posix() + frame["fl_x"] = float(cameras["f"][i]) * max(width, height) / 36 + frame["fl_y"] = float(cameras["f"][i]) * max(width, height) / 36 + # TODO: Unclear how to get the principal point from RealityCapture, here a guess... 
+ frame["cx"] = float(cameras["px"][i]) / 36.0 + width / 2.0 + frame["cy"] = float(cameras["py"][i]) / 36.0 + height / 2.0 + # TODO: Not sure if RealityCapture uses this distortion model + frame["k1"] = float(cameras["k1"][i]) + frame["k2"] = float(cameras["k2"][i]) + frame["k3"] = float(cameras["k3"][i]) + frame["k4"] = float(cameras["k4"][i]) + frame["p1"] = float(cameras["t1"][i]) + frame["p2"] = float(cameras["t2"][i]) + + # Transform matrix to nerfstudio format. Please refer to the documentation for coordinate system conventions. + rot = _get_rotation_matrix(-float(cameras["heading"][i]), float(cameras["pitch"][i]), float(cameras["roll"][i])) + + transform = np.eye(4) + transform[:3, :3] = rot + transform[:3, 3] = np.array([float(cameras["x"][i]), float(cameras["y"][i]), float(cameras["alt"][i])]) + + frame["transform_matrix"] = transform.tolist() + frames.append(frame) + data["frames"] = frames + + with open(output_dir / "transforms.json", "w", encoding="utf-8") as f: + json.dump(data, f, indent=4) + + summary = [] + if len(frames) < len(image_filename_map): + summary.append(f"Missing camera data for {len(image_filename_map) - len(frames)} frames.") + summary.append(f"Final dataset is {len(frames)} frames.") + + return summary + + +def _get_rotation_matrix(yaw, pitch, roll): + """Returns a rotation matrix given euler angles.""" + + s_yaw = np.sin(np.deg2rad(yaw)) + c_yaw = np.cos(np.deg2rad(yaw)) + s_pitch = np.sin(np.deg2rad(pitch)) + c_pitch = np.cos(np.deg2rad(pitch)) + s_roll = np.sin(np.deg2rad(roll)) + c_roll = np.cos(np.deg2rad(roll)) + + rot_x = np.array([[1, 0, 0], [0, c_pitch, -s_pitch], [0, s_pitch, c_pitch]]) + rot_y = np.array([[c_roll, 0, s_roll], [0, 1, 0], [-s_roll, 0, c_roll]]) + rot_z = np.array([[c_yaw, -s_yaw, 0], [s_yaw, c_yaw, 0], [0, 0, 1]]) + + return rot_z @ rot_x @ rot_y diff --git a/scripts/process_data.py b/scripts/process_data.py index 4837359114..a8551dff3c 100755 --- a/scripts/process_data.py +++ b/scripts/process_data.py @@ -21,6 +21,7 @@ metashape_utils, 
polycam_utils, process_data_utils, + realitycapture_utils, record3d_utils, ) from nerfstudio.process_data.process_data_utils import CAMERA_MODELS @@ -479,8 +480,8 @@ def main(self) -> None: summary_log.append(f"Used {num_frames} images out of {num_images} total") if self.max_dataset_size > 0: summary_log.append( - "To change the size of the dataset add the argument --max_dataset_size to larger than the " - f"current value ({self.max_dataset_size}), or -1 to use all images." + "To change the size of the dataset add the argument [yellow]--max_dataset_size[/yellow] to " + f"larger than the current value ({self.max_dataset_size}), or -1 to use all images." ) # Downscale images @@ -558,19 +559,10 @@ def main(self) -> None: raise ValueError(f"Image directory {polycam_image_dir} doesn't exist") # Copy images to output directory + polycam_image_filenames, num_orig_images = process_data_utils.get_image_filenames( + polycam_image_dir, self.max_dataset_size + ) - polycam_image_filenames = [] - for f in polycam_image_dir.iterdir(): - if f.suffix.lower() in [".jpg", ".jpeg", ".png", ".tif", ".tiff"]: - polycam_image_filenames.append(f) - polycam_image_filenames = sorted(polycam_image_filenames, key=lambda fn: int(fn.stem)) - num_images = len(polycam_image_filenames) - idx = np.arange(num_images) - if self.max_dataset_size != -1 and num_images > self.max_dataset_size: - idx = np.round(np.linspace(0, num_images - 1, self.max_dataset_size)).astype(int) - - polycam_image_filenames = list(np.array(polycam_image_filenames)[idx]) - # Copy images to output directory copied_image_paths = process_data_utils.copy_images_list( polycam_image_filenames, image_dir=image_dir, @@ -581,11 +573,11 @@ def main(self) -> None: copied_image_paths = [Path("images/" + copied_image_path.name) for copied_image_path in copied_image_paths] - if self.max_dataset_size > 0 and num_frames != num_images: - summary_log.append(f"Started with {num_frames} images out of {num_images} total") + if 
self.max_dataset_size > 0 and num_frames != num_orig_images: + summary_log.append(f"Started with {num_frames} images out of {num_orig_images} total") summary_log.append( - "To change the size of the dataset add the argument --max_dataset_size to larger than the " - f"current value ({self.max_dataset_size}), or -1 to use all images." + "To change the size of the dataset add the argument [yellow]--max_dataset_size[/yellow] to " + f"larger than the current value ({self.max_dataset_size}), or -1 to use all images." ) else: summary_log.append(f"Started with {num_frames} images") @@ -657,18 +649,7 @@ def main(self) -> None: summary_log = [] # Copy images to output directory - image_filenames = [] - for f in self.data.iterdir(): - if f.suffix.lower() in [".jpg", ".jpeg", ".png", ".tif", ".tiff"]: - image_filenames.append(f) - image_filenames = sorted(image_filenames, key=lambda fn: fn.stem) - num_images = len(image_filenames) - idx = np.arange(num_images) - if self.max_dataset_size != -1 and num_images > self.max_dataset_size: - idx = np.round(np.linspace(0, num_images - 1, self.max_dataset_size)).astype(int) - - image_filenames = list(np.array(image_filenames)[idx]) - # Copy images to output directory + image_filenames, num_orig_images = process_data_utils.get_image_filenames(self.data, self.max_dataset_size) copied_image_paths = process_data_utils.copy_images_list( image_filenames, image_dir=image_dir, @@ -680,11 +661,11 @@ def main(self) -> None: original_names = [image_path.stem for image_path in image_filenames] image_filename_map = dict(zip(original_names, copied_image_paths)) - if self.max_dataset_size > 0 and num_frames != num_images: - summary_log.append(f"Started with {num_frames} images out of {num_images} total") + if self.max_dataset_size > 0 and num_frames != num_orig_images: + summary_log.append(f"Started with {num_frames} images out of {num_orig_images} total") summary_log.append( - "To change the size of the dataset add the argument --max_dataset_size to 
larger than the " - f"current value ({self.max_dataset_size}), or -1 to use all images." + "To change the size of the dataset add the argument [yellow]--max_dataset_size[/yellow] to " + f"larger than the current value ({self.max_dataset_size}), or -1 to use all images." ) else: summary_log.append(f"Started with {num_frames} images") @@ -712,11 +693,98 @@ def main(self) -> None: CONSOLE.rule() +@dataclass +class ProcessRealityCapture: + """Process RealityCapture data into a nerfstudio dataset. + + This script assumes that cameras have been aligned using RealityCapture. After alignment, it is necessary to + export the camera poses as a `.csv` file. + + This script does the following: + + 1. Scales images to a specified size. + 2. Converts RealityCapture poses into the nerfstudio format. + """ + + data: Path + """Path to a folder of images.""" + csv: Path + """Path to the RealityCapture cameras CSV file.""" + output_dir: Path + """Path to the output directory.""" + num_downscales: int = 3 + """Number of times to downscale the images. Downscales by 2 each time. For example a value of 3 + will downscale the images by 2x, 4x, and 8x.""" + max_dataset_size: int = 600 + """Max number of images to train on. If the dataset has more, images will be sampled approximately evenly. 
If -1, + use all images.""" + verbose: bool = False + """If True, print extra logging.""" + + def main(self) -> None: + """Process images into a nerfstudio dataset.""" + + if self.csv.suffix != ".csv": + raise ValueError(f"CSV file {self.csv} must have a .csv extension") + if not self.csv.exists(): + raise ValueError(f"CSV file {self.csv} doesn't exist") + + self.output_dir.mkdir(parents=True, exist_ok=True) + image_dir = self.output_dir / "images" + image_dir.mkdir(parents=True, exist_ok=True) + + summary_log = [] + + # Copy images to output directory + image_filenames, num_orig_images = process_data_utils.get_image_filenames(self.data, self.max_dataset_size) + copied_image_paths = process_data_utils.copy_images_list( + image_filenames, + image_dir=image_dir, + verbose=self.verbose, + ) + num_frames = len(copied_image_paths) + + copied_image_paths = [Path("images/" + copied_image_path.name) for copied_image_path in copied_image_paths] + original_names = [image_path.stem for image_path in image_filenames] + image_filename_map = dict(zip(original_names, copied_image_paths)) + + if self.max_dataset_size > 0 and num_frames != num_orig_images: + summary_log.append(f"Started with {num_frames} images out of {num_orig_images} total") + summary_log.append( + "To change the size of the dataset add the argument [yellow]--max_dataset_size[/yellow] to " + f"larger than the current value ({self.max_dataset_size}), or -1 to use all images." 
+ ) + else: + summary_log.append(f"Started with {num_frames} images") + + # Downscale images + summary_log.append(process_data_utils.downscale_images(image_dir, self.num_downscales, verbose=self.verbose)) + + # Save json + if num_frames == 0: + CONSOLE.print("[bold red]No images found, exiting") + sys.exit(1) + summary_log.extend( + realitycapture_utils.realitycapture_to_json( + image_filename_map=image_filename_map, + csv_filename=self.csv, + output_dir=self.output_dir, + ) + ) + + CONSOLE.rule("[bold green]:tada: :tada: :tada: All DONE :tada: :tada: :tada:") + + for summary in summary_log: + CONSOLE.print(summary, justify="center") + CONSOLE.rule() + + Commands = Union[ Annotated[ProcessImages, tyro.conf.subcommand(name="images")], Annotated[ProcessVideo, tyro.conf.subcommand(name="video")], Annotated[ProcessPolycam, tyro.conf.subcommand(name="polycam")], Annotated[ProcessMetashape, tyro.conf.subcommand(name="metashape")], + Annotated[ProcessRealityCapture, tyro.conf.subcommand(name="realitycapture")], Annotated[ProcessInsta360, tyro.conf.subcommand(name="insta360")], Annotated[ProcessRecord3D, tyro.conf.subcommand(name="record3d")], ]