Skip to content

Commit

Permalink
Add CameraGroup class (#146)
Browse files Browse the repository at this point in the history
* Convert points to float before cv2.undistortPoints

* Implement CameraGroup.triangulate
and provide default triangulate_dlt_vectorized function

* Test triangulate and cameras list

* Return 3D nan for 2D all nans
- Likely should also filter out points when only a single point is non-nan, but curious...

* Handle points of arbitrary shape

* Pydocstyle

* Add CameraGroup fixture

* Test triangulate shapes
and use fixture

* Add initial CameraGroup.project
loops over Camera.project

* Test CameraGroup.project

* Add CameraGroup from/to dict and load methods

* Add fixture for calibration toml

* Test CameraGroup from/to dict and load

* Docstring clarification and string spelling

* Retain points shape when project

* Test points shape is retained on project

* Add to/from dict methods

* Test to/from dict methods

* Remove Camera aliases
now that standalone triangulation method is implemented

* Add get_camera and videos to RecordingSession

* Formatting

* Preserve datatype on triangulate and project

* Return nan when underdetermined DLT

* Lint

* Test dtype is preserved on triangulate and reproject

* Fix preserve dtype for project
and update docstrings

* Fix check for enough instances to triangulate

* Retain points dtype on project

* Test points dtype is retained on project

* Increase test coverage

* Improve docstrings

* Lint

---------

Co-authored-by: Talmo Pereira <[email protected]>
  • Loading branch information
roomrys and talmo authored Jan 22, 2025
1 parent a73106a commit de9da79
Show file tree
Hide file tree
Showing 6 changed files with 548 additions and 44 deletions.
1 change: 1 addition & 0 deletions sleap_io/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""This module exposes all high level APIs for sleap-io."""

from sleap_io.version import __version__
from sleap_io.model.camera import Camera, CameraGroup
from sleap_io.model.skeleton import Node, Edge, Skeleton, Symmetry
from sleap_io.model.video import Video
from sleap_io.model.instance import (
Expand Down
275 changes: 245 additions & 30 deletions sleap_io/model/camera.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,75 @@

from __future__ import annotations

from collections.abc import Callable

import attrs
import cv2
import numpy as np
import toml
from attrs import define, field

from sleap_io.model.video import Video


def triangulate_dlt_vectorized(
    points: np.ndarray, projection_matrices: np.ndarray
) -> np.ndarray:
    """Triangulate 3D points from multiple camera views using Direct Linear Transform.

    Args:
        points: Array of N 2D points from each camera view M of dtype float64 and shape
            (M, N, 2) where N is the number of points.
        projection_matrices: Array of (3, 4) projection matrices for each camera M of
            shape (M, 3, 4).

    Returns:
        Triangulated 3D points of shape (N, 3) where N is the number of points. A
        point observed by fewer than 2 cameras (i.e. underdetermined DLT) is
        returned as NaN.
    """
    n_cameras, n_points, _ = points.shape

    # Flatten points to shape needed for multiplication
    points_flattened = points.reshape(n_cameras, 2 * n_points, 1, order="C")

    # Create row selector matrix to select correct rows from projection matrix
    row_selector = np.zeros((n_cameras * n_points, 2, 2))
    row_selector[:, 0, 0] = -1  # Negate 1st row of projection matrix for x
    row_selector[:, 1, 1] = -1  # Negate 2nd row of projection matrix for y
    row_selector = row_selector.reshape(n_cameras, 2 * n_points, 2, order="C")

    # Concatenate row selector and points matrices to shape (M, 2N, 3).
    # Each pair of rows is then [-1, 0, x] and [0, -1, y] for one observation.
    left_matrix = np.concatenate((row_selector, points_flattened), axis=2)

    # Get A (stacked in a weird way) of shape (M, 2N, 4): rows are
    # x * P_3 - P_1 and y * P_3 - P_2, the standard DLT constraints.
    a_stacked = np.matmul(left_matrix, projection_matrices)

    # Reorganize A to shape (N, 2M, 4) s.t. each 3D point has A of shape 2M x 4
    a = (
        a_stacked.reshape(n_cameras, n_points, 2, 4)
        .transpose(1, 0, 2, 3)
        .reshape(n_points, 2 * n_cameras, 4)
    )

    # Remove rows with NaNs before SVD which may result in a ragged A (hence for loop)
    points_3d = []
    for a_slice in a:
        # A NaN observation propagates through matmul, so any NaN poisons its
        # entire row of A; a row-wise mask therefore identifies invalid views.
        nan_mask = np.isnan(a_slice)  # 2M x 4
        has_enough_matches = np.all(~nan_mask, axis=1).sum() >= 4  # Need 2 (x, y) pairs

        point_3d = np.full(3, np.nan)
        if has_enough_matches:
            a_no_nan = a_slice[~nan_mask].reshape(-1, 4, order="C")
            _, _, vh = np.linalg.svd(a_no_nan)
            # Null-space vector (last row of V^T), dehomogenized.
            point_3d = vh[-1, :-1] / vh[-1, -1]

        points_3d.append(point_3d)

    # Explicit reshape guarantees the documented (N, 3) shape even when N = 0,
    # where np.array([]) alone would give shape (0,).
    points_3d = np.array(points_3d).reshape(n_points, 3)

    return points_3d


@define
class CameraGroup:
"""A group of cameras used to record a multi-view `RecordingSession`.
Expand All @@ -20,6 +81,190 @@ class CameraGroup:

cameras: list[Camera] = field(factory=list)

def triangulate(
    self,
    points: np.ndarray,
    triangulation_func: Callable[
        [np.ndarray, np.ndarray], np.ndarray
    ] = triangulate_dlt_vectorized,
) -> np.ndarray:
    """Triangulate 2D points from multiple camera views M.

    This function reshapes the input `points` to shape (M, N, 2) where M is the
    number of camera views and N is the number of 2D points to triangulate. The
    points are then undistorted so that we can use the extrinsic matrices of the
    cameras as projection matrices to call `triangulation_func` and triangulate
    the 3D points.

    Args:
        points: Array of 2D points from each camera view of any dtype and shape
            (M, ..., 2) where M is the number of camera views and "..." is any
            number of dimensions (including 0).
        triangulation_func: Function to use for triangulation. The
            triangulation_func should take the following arguments:

            points: Array of undistorted 2D points from each camera view of dtype
                float64 and shape (M, N, 2) where M is the number of cameras and N
                is the number of points.
            projection_matrices: Array of (3, 4) projection matrices for each of the
                M cameras of shape (M, 3, 4) - note that points are undistorted.

            and return the triangulated 3D points of shape (N, 3) where N is the
            number of points. Default is vectorized DLT.

    Raises:
        ValueError: If points are not of shape (M, ..., 2).
        ValueError: If number of cameras M do not match number of cameras in group.
        ValueError: If number of points returned by triangulation function does not
            match number of points in input.

    Returns:
        Triangulated 3D points of same dtype as `points` and shape (..., 3) where
        "..." is any number of dimensions and matches the "..." dimensions of
        `points`.
    """
    # Validate points in. The bare `raise ValueError` statements below are
    # deliberately caught by the `except` so that every failure mode (wrong
    # camera count, wrong last dim, non-array input) is re-raised with the
    # same explanatory message.
    points_shape = points.shape
    try:
        n_cameras = points_shape[0]
        if n_cameras != len(self.cameras):
            raise ValueError
        if 2 != points.shape[-1]:
            raise ValueError
        if len(points_shape) != 3:
            # Collapse any intermediate "..." dims into a single N dimension.
            points = points.reshape(n_cameras, -1, 2)
    except Exception as e:
        raise ValueError(
            "Expected points to be an array of 2D points from each camera view of "
            f"shape (M, ..., 2) where M = {len(self.cameras)} and '...' is any "
            f"number of dimensions, but received shape {points_shape}.\n\n{e}"
        )
    n_points = points.shape[1]

    # Undistort points
    points_dtype = points.dtype
    points = points.astype("float64")  # Ensure float64 for opencv undistort
    for cam_idx, camera in enumerate(self.cameras):
        cam_points = camera.undistort_points(points[cam_idx])
        points[cam_idx] = cam_points

    # Since points are undistorted, use extrinsic matrices as projection matrices
    projection_matrices = np.array(
        [camera.extrinsic_matrix[:3] for camera in self.cameras]
    )

    # Triangulate points using provided function
    points_3d = triangulation_func(points, projection_matrices)
    n_points_returned = points_3d.shape[0]
    if n_points_returned != n_points:
        raise ValueError(
            f"Expected triangulation function to return {n_points} 3D points, but "
            f"received {n_points_returned} 3D points."
        )

    # Restore the caller's "..." dims and cast back to the original dtype.
    points_3d = points_3d.reshape(*points_shape[1:-1], 3).astype(points_dtype)
    return points_3d

def project(self, points: np.ndarray) -> np.ndarray:
    """Project 3D points to 2D using camera group.

    Args:
        points: 3D points to project of any dtype and shape (..., 3) where "..." is
            any number of dimensions (including 0).

    Returns:
        Projected 2D points of same dtype as `points` and shape (M, ..., 2)
        where M is the number of cameras and "..." matches the "..." dimensions of
        `points`.

    Raises:
        ValueError: If points are not of shape (..., 3).
    """
    # Validate points in
    points_shape = points.shape
    try:
        # Check if points are 3D
        if points_shape[-1] != 3:
            raise ValueError
    except Exception as e:
        raise ValueError(
            "Expected points to be an array of 3D points of shape (..., 3) "
            "where '...' is any number of non-zero dimensions, but received shape "
            f"{points_shape}.\n\n{e}"
        )

    # Project 3D points to 2D for each camera
    points_dtype = points.dtype
    points = points.astype(np.float64)  # Ensure float for opencv project
    n_cameras = len(self.cameras)
    # Cast to int: np.prod of an empty shape tuple (a single point of shape
    # (3,)) returns the float 1.0, which np.zeros/reshape reject as a size.
    n_points = int(np.prod(points_shape[:-1]))
    projected_points = np.zeros((n_cameras, n_points, 2))
    for cam_idx, camera in enumerate(self.cameras):
        cam_points = camera.project(points)
        projected_points[cam_idx] = cam_points.reshape(n_points, 2)

    # Restore the caller's "..." dims and dtype.
    return projected_points.reshape(n_cameras, *points_shape[:-1], 2).astype(
        points_dtype
    )

@classmethod
def from_dict(cls, calibration_dict: dict) -> CameraGroup:
    """Create `CameraGroup` from calibration dictionary.

    Args:
        calibration_dict: Dictionary containing calibration information for cameras.

    Returns:
        `CameraGroup` object created from calibration dictionary.
    """
    # Every section except "metadata" describes one camera.
    cameras = [
        Camera.from_dict(camera_dict)
        for section_name, camera_dict in calibration_dict.items()
        if section_name != "metadata"
    ]
    return cls(cameras=cameras)

def to_dict(self) -> dict:
    """Convert `CameraGroup` to dictionary.

    Returns:
        Dictionary containing camera group information with the following keys:
            cam_n: Camera dictionary containing information for camera at index "n"
                with the following keys:
                name: Camera name.
                size: Image size (height, width) of camera in pixels of size (2,)
                    and type int.
                matrix: Intrinsic camera matrix of size (3, 3) and type float64.
                distortions: Radial-tangential distortion coefficients
                    [k_1, k_2, p_1, p_2, k_3] of size (5,) and type float64.
                rotation: Rotation vector in unnormalized axis-angle representation
                    of size (3,) and type float64.
                translation: Translation vector of size (3,) and type float64.
    """
    # Keys are "cam_0", "cam_1", ... in the order cameras appear in the group.
    return {
        f"cam_{cam_idx}": camera.to_dict()
        for cam_idx, camera in enumerate(self.cameras)
    }

@classmethod
def load(cls, filename: str) -> CameraGroup:
    """Load `CameraGroup` from a calibration TOML file.

    Args:
        filename: Path to TOML file to load `CameraGroup` from.

    Returns:
        `CameraGroup` object loaded from the TOML file.
    """
    # `toml.load` parses each [cam_n] section into a sub-dict; `from_dict`
    # skips the [metadata] section.
    calibration_dict = toml.load(filename)
    camera_group = cls.from_dict(calibration_dict)

    return camera_group


@define(eq=False) # Set eq to false to make class hashable
class RecordingSession:
Expand Down Expand Up @@ -374,33 +619,3 @@ def from_dict(cls, camera_dict: dict) -> Camera:
)

return camera

# TODO: Remove this when we implement triangulation without aniposelib
def __getattr__(self, name: str):
    """Get attribute by name.

    Args:
        name: Name of attribute to get.

    Returns:
        Value of attribute, or a zero-argument callable for the supported
        aniposelib-style method aliases (`get_name`, `get_extrinsic_matrix`).

    Raises:
        AttributeError: If attribute does not exist.
    """
    # NOTE(review): `__attrs_attrs__` contains attrs.Attribute objects, not
    # strings, so this membership test likely never matches a string `name`;
    # and if it did, `getattr(self, name)` would re-enter `__getattr__`
    # (Python only calls `__getattr__` after normal lookup fails) — confirm
    # intent before relying on this branch.
    if name in self.__attrs_attrs__:
        return getattr(self, name)

    # The aliases for methods called when triangulate with sleap_anipose
    method_aliases = {
        "get_name": self.name,
        "get_extrinsic_matrix": self.extrinsic_matrix,
    }

    # Wrap the aliased value so callers can invoke it like a method,
    # e.g. `camera.get_name()` returns `camera.name`.
    def return_callable_method_alias():
        return method_aliases[name]

    if name in method_aliases:
        return return_callable_method_alias

    raise AttributeError(f"'Camera' object has no attribute or method '{name}'")
1 change: 1 addition & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
from tests.fixtures.labelstudio import *
from tests.fixtures.videos import *
from tests.fixtures.jabs import *
from tests.fixtures.camera import *
65 changes: 65 additions & 0 deletions tests/data/camera/calibration.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
[cam_0]
name = "back"
size = [ 1280, 1024,]
matrix = [ [ 762.513822135494, 0.0, 639.5,], [ 0.0, 762.513822135494, 511.5,], [ 0.0, 0.0, 1.0,],]
distortions = [ -0.2868458380166852, 0.0, 0.0, 0.0, 0.0,]
rotation = [ 0.3571857188780474, 0.8879473292757126, 1.6832001677006176,]
translation = [ -555.4577842902744, -294.43494957092884, -190.82196458369515,]

[cam_1]
name = "backL"
size = [ 1280, 1024,]
matrix = [ [ 817.6484736604496, 0.0, 639.5,], [ 0.0, 817.6484736604496, 511.5,], [ 0.0, 0.0, 1.0,],]
distortions = [ -0.33100176822475913, 0.0, 0.0, 0.0, 0.0,]
rotation = [ -0.026283027462693898, -0.010379556518966196, -0.027099257463872917,]
translation = [ 9.862823625864475, -26.76921192059893, -434.1389532578435,]

[cam_2]
name = "mid"
size = [ 1280, 1024,]
matrix = [ [ 757.8876636778425, 0.0, 639.5,], [ 0.0, 757.8876636778425, 511.5,], [ 0.0, 0.0, 1.0,],]
distortions = [ -0.28903872836928324, 0.0, 0.0, 0.0, 0.0,]
rotation = [ -0.37194922755270515, -0.4001867844465869, -1.0086626457565362,]
translation = [ 170.36246990443806, -465.67014395255075, -308.4662718040283,]

[cam_3]
name = "midL"
size = [ 1280, 1024,]
matrix = [ [ 865.1641815534884, 0.0, 639.5,], [ 0.0, 865.1641815534884, 511.5,], [ 0.0, 0.0, 1.0,],]
distortions = [ -0.36452920053871213, 0.0, 0.0, 0.0, 0.0,]
rotation = [ 0.05817434916890774, -1.054534889033804, -2.7344311787180726,]
translation = [ 64.84029309392507, -664.6153475753681, -175.7828934255145,]

[cam_4]
name = "side"
size = [ 1280, 1024,]
matrix = [ [ 754.7093944045533, 0.0, 639.5,], [ 0.0, 754.7093944045533, 511.5,], [ 0.0, 0.0, 1.0,],]
distortions = [ -0.2952746154319528, 0.0, 0.0, 0.0, 0.0,]
rotation = [ -0.8648522139486137, -1.1788453690264329, -1.0219549009876152,]
translation = [ 260.9490547859552, -1027.1241968566128, 279.38287656265214,]

[cam_5]
name = "sideL"
size = [ 1280, 1024,]
matrix = [ [ 754.8402306632529, 0.0, 639.5,], [ 0.0, 754.8402306632529, 511.5,], [ 0.0, 0.0, 1.0,],]
distortions = [ -0.28917397473074263, 0.0, 0.0, 0.0, 0.0,]
rotation = [ -0.7375958632452171, -1.5393315718217462, -1.3307245772011242,]
translation = [ 254.07566513275054, -1043.3222053681916, 408.2123665768046,]

[cam_6]
name = "top"
size = [ 1280, 1024,]
matrix = [ [ 958.8409001521982, 0.0, 639.5,], [ 0.0, 958.8409001521982, 511.5,], [ 0.0, 0.0, 1.0,],]
distortions = [ -0.2883038721040841, 0.0, 0.0, 0.0, 0.0,]
rotation = [ 0.14923195260261604, -0.35651045266677683, -1.571999724050101,]
translation = [ 320.58744336418937, -113.31460508230508, -298.24659146410477,]

[cam_7]
name = "topL"
size = [ 1280, 1024,]
matrix = [ [ 860.8004120100271, 0.0, 639.5,], [ 0.0, 860.8004120100271, 511.5,], [ 0.0, 0.0, 1.0,],]
distortions = [ -0.34378860528857375, 0.0, 0.0, 0.0, 0.0,]
rotation = [ 0.19329909189795189, -0.4852167922768701, -1.5625699029758116,]
translation = [ 459.78373522464585, -180.81976306477637, -266.6352363727575,]

[metadata]
Loading

0 comments on commit de9da79

Please sign in to comment.