Commit 97cb9d8

Merge pull request #139 from tryolabs/motion-estimator
Estimate camera motion using the mode of the Optical Flow
2 parents 7dcfe9c + ab32eb8

13 files changed: +846 -15 lines

demos/camera_motion/Dockerfile

+6
```dockerfile
FROM ultralytics/yolov5:v6.2

# Install Norfair
RUN pip install git+https://github.com/tryolabs/norfair.git@master#egg=norfair

WORKDIR /demo/src/
```

demos/camera_motion/README.md

+45
# Moving Camera Demo

In this example, we show how to estimate the camera movement in Norfair.

What's the motivation for estimating camera movement?

- When the camera moves, the apparent movement of the objects can be quite erratic and confuse the tracker; by estimating the camera movement, we can stabilize the objects and improve tracking.
- By estimating the position of objects in a fixed reference frame, we can correctly calculate their trajectories. This helps if you are trying to determine when objects enter a predefined zone in the scene, or if you are trying to draw their trajectories.

Keep in mind that the camera motion estimation relies on a static background: if the scene is too chaotic, with a lot of movement, the estimation will lose accuracy. Nevertheless, even when the estimation is incorrect, it will not hurt the tracking.
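At the API level, the estimator plugs into the tracking loop through `coord_transformations`. Here is a minimal sketch of how the pieces fit together (the `detect` function is a hypothetical stand-in for your detector, and the threshold value is illustrative; the Norfair calls match the demo script further below):

```python
from norfair import Tracker, Video
from norfair.camera_motion import MotionEstimator


def detect(frame):
    # Hypothetical stub: run your detector here and return a list of norfair.Detection
    return []


motion_estimator = MotionEstimator()  # default parameters; see demo.py below for tuning
tracker = Tracker(distance_function="frobenius", distance_threshold=100)

video = Video(input_path="video.mp4")
for frame in video:
    # Estimate how the camera moved since the previous frame
    coord_transformations = motion_estimator.update(frame)
    # The tracker uses the transformations to match objects in a fixed reference frame
    tracked_objects = tracker.update(
        detections=detect(frame), coord_transformations=coord_transformations
    )
    video.show(frame)
```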
## First Example - Translation

This method only works for camera pans and tilts.

![Pan and Tilt](/docs/pan_tilt.png)

In the following video, the tracker on the left loses the person 4 times, while on the right we maintain the tracked object throughout the video:

![camera_stabilization](/docs/camera_stabilization.gif)

> Videos generated using the commands `python demo.py --transformation none --draw-objects --track-boxes --id-size 1.8 --distance-threshold 200 --save video.mp4` and `python demo.py --transformation translation --fixed-camera-scale 2 --draw-objects --track-boxes --id-size 1.8 --distance-threshold 200 --save video.mp4`
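As the commit title says, the translation is estimated as the mode of the optical flow: most sampled points lie on the static background, so the most frequent displacement is the camera's own motion. A conceptual numpy sketch of that idea (the function name and the rounding are illustrative assumptions, not Norfair's exact internals):

```python
import numpy as np


def flow_mode(flow_vectors):
    """Estimate camera pan/tilt as the most frequent (mode) point displacement."""
    rounded = np.round(flow_vectors)  # quantize so near-equal vectors can vote together
    vectors, counts = np.unique(rounded, axis=0, return_counts=True)
    return vectors[np.argmax(counts)]


# Three background points agree on a ~(5, -2) px shift; one point sits on a moving object
flow = np.array([[5.2, -2.1], [4.9, -2.0], [5.0, -1.8], [20.0, 3.0]])
print(flow_mode(flow))  # [ 5. -2.]
```

In the demo this method corresponds to `--transformation translation`, i.e. `TranslationTransformationGetter`.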
## Second Example - Homographies

This method can handle any camera movement: pan, tilt, rotation, traveling in any direction, and zoom.

In the following video, the correct trajectory of the players is drawn even as the camera moves:

![soccer](/docs/soccer.gif)

> Video generated using the command `python demo.py --transformation homography --draw-paths --path-history 150 --distance-threshold 200 --track-boxes --max-points=900 --min-distance=14 --save --model yolov5x --hit-counter-max 3 video.mp4` on a snippet of this [video](https://www.youtube.com/watch?v=CGFgHjeEkbY&t=1200s)
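A homography is the standard way to map background points between consecutive frames. The general recipe, sketched below with OpenCV, is to track sparse features with optical flow and fit the homography robustly; this is a generic illustration under those assumptions, not a copy of Norfair's `HomographyTransformationGetter`:

```python
import cv2
import numpy as np


def estimate_homography(prev_gray, curr_gray, mask=None):
    # Sample corner features, skipping masked (e.g. detected-object) regions
    prev_pts = cv2.goodFeaturesToTrack(
        prev_gray, maxCorners=500, qualityLevel=0.01, minDistance=7, mask=mask
    )
    # Track the features into the current frame with pyramidal Lucas-Kanade flow
    curr_pts, status, _err = cv2.calcOpticalFlowPyrLK(
        prev_gray, curr_gray, prev_pts, None
    )
    prev_pts, curr_pts = prev_pts[status == 1], curr_pts[status == 1]
    # RANSAC down-weights points on moving objects when fitting the 3x3 homography
    homography, _inliers = cv2.findHomography(prev_pts, curr_pts, cv2.RANSAC, 3.0)
    return homography
```

The `mask` argument plays the same role as the demo's detection masking below: pixels inside detected boxes are excluded so moving objects don't bias the estimate.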
## Setup

Build and run the Docker container with `./run_gpu.sh`.

Copy a video to the `src` folder.

Within the container, run with the default parameters:

`python demo.py <video>.mp4`

For additional settings, you may display the instructions using `python demo.py --help`.

demos/camera_motion/requirements.txt

+1
```text
yolov5==6.1.8
```

demos/camera_motion/run_gpu.sh

+8
```bash
#!/usr/bin/env -S bash -e
docker build . -t norfair-camera-motion
docker run -it --rm \
    --gpus all \
    --shm-size=1gb \
    -v `realpath .`:/demo \
    norfair-camera-motion \
    bash
```

demos/camera_motion/src/demo.py

+284
```python
import argparse
from functools import partial

import numpy as np
import torch

from norfair import (
    AbsolutePaths,
    Detection,
    FixedCamera,
    Tracker,
    Video,
    draw_absolute_grid,
    draw_tracked_boxes,
)
from norfair.camera_motion import (
    HomographyTransformationGetter,
    MotionEstimator,
    TranslationTransformationGetter,
)
from norfair.drawing import draw_tracked_objects


def yolo_detections_to_norfair_detections(yolo_detections, track_boxes):
    """Convert YOLOv5 xyxy detections into Norfair Detections."""
    norfair_detections = []
    boxes = []
    detections_as_xyxy = yolo_detections.xyxy[0]
    for detection_as_xyxy in detections_as_xyxy:
        detection_as_xyxy = detection_as_xyxy.cpu().numpy()
        bbox = np.array(
            [
                [detection_as_xyxy[0].item(), detection_as_xyxy[1].item()],
                [detection_as_xyxy[2].item(), detection_as_xyxy[3].item()],
            ]
        )
        boxes.append(bbox)
        if track_boxes:
            points = bbox
            scores = np.array([detection_as_xyxy[4], detection_as_xyxy[4]])
        else:
            points = bbox.mean(axis=0, keepdims=True)
            scores = detection_as_xyxy[[4]]

        norfair_detections.append(
            Detection(points=points, scores=scores, label=detection_as_xyxy[-1].item())
        )

    return norfair_detections, boxes


def run():
    parser = argparse.ArgumentParser(description="Track objects in a video.")
    parser.add_argument("files", type=str, nargs="+", help="Video files to process")
    parser.add_argument(
        "--model",
        type=str,
        default="yolov5n",
        help="YOLO model to use, possible values are yolov5n, yolov5s, yolov5m, yolov5l, yolov5x",
    )
    parser.add_argument(
        "--confidence-threshold",
        type=float,
        help="Confidence threshold of detections",
        default=0.15,
    )
    parser.add_argument(
        "--distance-threshold",
        type=float,
        default=0.8,
        help="Max distance to consider when matching detections and tracked objects",
    )
    parser.add_argument(
        "--initialization-delay",
        type=float,
        default=3,
        help="Min detections needed to start the tracked object",
    )
    parser.add_argument(
        "--track-boxes",
        dest="track_boxes",
        action="store_true",
        help="Pass it to track bounding boxes instead of just the centroids",
    )
    parser.add_argument(
        "--hit-counter-max",
        type=int,
        default=30,
        help="Max iterations the tracked object is kept when there are no detections",
    )
    parser.add_argument(
        "--iou-threshold", type=float, help="IoU threshold for detector", default=0.15
    )
    parser.add_argument(
        "--image-size", type=int, help="Size of the images for detector", default=480
    )
    parser.add_argument(
        "--classes", type=int, nargs="+", default=[0], help="Classes to track"
    )
    parser.add_argument(
        "--transformation",
        default="homography",
        help="Type of transformation, possible values are homography, translation, none",
    )
    parser.add_argument(
        "--max-points",
        type=int,
        default=500,
        help="Max points sampled to calculate camera motion",
    )
    parser.add_argument(
        "--min-distance",
        type=float,
        default=7,
        help="Min distance between points sampled to calculate camera motion",
    )
    parser.add_argument(
        "--no-mask-detections",
        dest="mask_detections",
        action="store_false",
        default=True,
        help="By default we don't sample regions where objects were detected when estimating camera motion. Pass this flag to disable this behavior",
    )
    parser.add_argument(
        "--save",
        dest="save",
        action="store_true",
        help="Pass this flag to save the video instead of showing the frames",
    )
    parser.add_argument(
        "--output-name",
        default=None,
        help="Name of the output file",
    )
    parser.add_argument(
        "--downsample-ratio",
        type=int,
        default=1,
        help="Downsample ratio when showing frames",
    )
    parser.add_argument(
        "--fixed-camera-scale",
        type=float,
        default=0,
        help="Scale of the fixed camera, set to 0 to disable. Note that this only works for translation",
    )
    parser.add_argument(
        "--draw-absolute-grid",
        dest="absolute_grid",
        action="store_true",
        help="Pass this flag to draw the absolute grid for reference",
    )
    parser.add_argument(
        "--draw-objects",
        dest="draw_objects",
        action="store_true",
        help="Pass this flag to draw tracked objects as points, or as boxes if --track-boxes is used.",
    )
    parser.add_argument(
        "--draw-paths",
        dest="draw_paths",
        action="store_true",
        help="Pass this flag to draw the paths of the objects (SLOW)",
    )
    parser.add_argument(
        "--path-history",
        type=int,
        default=20,
        help="Length of the paths",
    )
    parser.add_argument(
        "--id-size",
        type=float,
        default=None,
        help="Size multiplier of the ids when drawing. Thickness will adapt to size",
    )
    parser.add_argument(
        "--draw-flow",
        dest="draw_flow",
        action="store_true",
        help="Pass this flag to draw the optical flow of the selected points",
    )

    args = parser.parse_args()

    model = torch.hub.load("ultralytics/yolov5", args.model)
    model.conf_threshold = 0  # keep all detections; the tracker filters by confidence
    model.iou_threshold = args.iou_threshold
    model.image_size = args.image_size
    model.classes = args.classes

    use_fixed_camera = args.fixed_camera_scale > 0
    tracked_objects = []
    # Process Videos
    for input_path in args.files:
        if args.transformation == "homography":
            transformations_getter = HomographyTransformationGetter()
        elif args.transformation == "translation":
            transformations_getter = TranslationTransformationGetter()
        elif args.transformation == "none":
            transformations_getter = None
        else:
            raise ValueError(f"invalid transformation {args.transformation}")
        if transformations_getter is not None:
            motion_estimator = MotionEstimator(
                max_points=args.max_points,
                min_distance=args.min_distance,
                transformations_getter=transformations_getter,
                draw_flow=args.draw_flow,
            )
        else:
            motion_estimator = None

        if use_fixed_camera:
            fixed_camera = FixedCamera(scale=args.fixed_camera_scale)

        if args.draw_paths:
            path_drawer = AbsolutePaths(max_history=args.path_history, thickness=2)

        video = Video(input_path=input_path)
        show_or_write = (
            video.write
            if args.save
            else partial(video.show, downsample_ratio=args.downsample_ratio)
        )

        tracker = Tracker(
            distance_function="frobenius",
            detection_threshold=args.confidence_threshold,
            distance_threshold=args.distance_threshold,
            initialization_delay=args.initialization_delay,
            hit_counter_max=args.hit_counter_max,
        )
        for frame in video:
            detections = model(frame)
            detections, boxes = yolo_detections_to_norfair_detections(
                detections, args.track_boxes
            )

            mask = None
            if args.mask_detections:
                # create a mask of ones
                mask = np.ones(frame.shape[:2], frame.dtype)
                # set to 0 the regions with detections so they don't bias the motion estimation
                for b in boxes:
                    i = b.astype(int)
                    mask[i[0, 1] : i[1, 1], i[0, 0] : i[1, 0]] = 0
                if args.track_boxes:
                    for obj in tracked_objects:
                        i = obj.estimate.astype(int)
                        mask[i[0, 1] : i[1, 1], i[0, 0] : i[1, 0]] = 0

            if motion_estimator is None:
                coord_transformations = None
            else:
                coord_transformations = motion_estimator.update(frame, mask)

            tracked_objects = tracker.update(
                detections=detections, coord_transformations=coord_transformations
            )

            if args.draw_objects:
                draw_tracked_objects(
                    frame,
                    tracked_objects,
                    id_size=args.id_size,
                    id_thickness=None
                    if args.id_size is None
                    else int(args.id_size * 2),
                )

            if args.absolute_grid:
                draw_absolute_grid(frame, coord_transformations)

            if args.draw_paths:
                frame = path_drawer.draw(
                    frame, tracked_objects, coord_transform=coord_transformations
                )

            if use_fixed_camera:
                frame = fixed_camera.adjust_frame(frame, coord_transformations)

            show_or_write(frame)


if __name__ == "__main__":
    run()
```

docs/camera_stabilization.gif

29.2 MB

docs/pan_tilt.png

120 KB

docs/soccer.gif

30.4 MB
