From b3d63b067243611f532a639ab8a9fbdc27b60aed Mon Sep 17 00:00:00 2001 From: CarolinePascal Date: Mon, 14 Apr 2025 17:54:29 +0200 Subject: [PATCH 1/6] feat(encoding): switching to pyav ffmpeg API to handle encoding and info gathering --- lerobot/common/datasets/video_utils.py | 202 +++++++++++++------------ 1 file changed, 104 insertions(+), 98 deletions(-) diff --git a/lerobot/common/datasets/video_utils.py b/lerobot/common/datasets/video_utils.py index c38d570ddf..7e04d2a9ac 100644 --- a/lerobot/common/datasets/video_utils.py +++ b/lerobot/common/datasets/video_utils.py @@ -13,16 +13,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import glob import importlib -import json import logging -import subprocess import warnings -from collections import OrderedDict from dataclasses import dataclass, field from pathlib import Path from typing import Any, ClassVar +import av import pyarrow as pa import torch import torchvision @@ -252,51 +251,68 @@ def encode_video_frames( g: int | None = 2, crf: int | None = 30, fast_decode: int = 0, - log_level: str | None = "error", + log_level: int | None = av.logging.ERROR, overwrite: bool = False, ) -> None: """More info on ffmpeg arguments tuning on `benchmark/video/README.md`""" video_path = Path(video_path) imgs_dir = Path(imgs_dir) + + if video_path.exists() and not overwrite: + raise FileExistsError( + f"Video file already exists at {video_path}. Use `overwrite=True` to overwrite it." + ) + video_path.parent.mkdir(parents=True, exist_ok=True) - ffmpeg_args = OrderedDict( - [ - ("-f", "image2"), - ("-r", str(fps)), - ("-i", str(imgs_dir / "frame_%06d.png")), - ("-vcodec", vcodec), - ("-pix_fmt", pix_fmt), - ] + # Get input frames + template = "frame_" + ("[0-9]" * 6) + ".png" + input_list = sorted( + glob.glob(str(imgs_dir / template)), key=lambda x: int(x.split("_")[-1].split(".")[0]) ) + # Define video output options + video_options = {"pix_fmt": pix_fmt} + if g is not None: - ffmpeg_args["-g"] = str(g) + video_options["g"] = str(g) if crf is not None: - ffmpeg_args["-crf"] = str(crf) + video_options["crf"] = str(crf) if fast_decode: - key = "-svtav1-params" if vcodec == "libsvtav1" else "-tune" + key = "svtav1-params" if vcodec == "libsvtav1" else "tune" value = f"fast-decode={fast_decode}" if vcodec == "libsvtav1" else "fastdecode" - ffmpeg_args[key] = value + video_options[key] = value + # Set logging level if log_level is not None: - ffmpeg_args["-loglevel"] = str(log_level) - - ffmpeg_args = [item for pair in ffmpeg_args.items() for item in pair] - if overwrite: - ffmpeg_args.append("-y") - - ffmpeg_cmd = ["ffmpeg"] + ffmpeg_args + [str(video_path)] - # redirect stdin to subprocess.DEVNULL to prevent reading random keyboard inputs from terminal - subprocess.run(ffmpeg_cmd, check=True, stdin=subprocess.DEVNULL) + # "While less efficient, it is generally preferable to modify logging with Python’s logging" + logging.getLogger("libav").setLevel(log_level) + + # Create and open output file (overwrite by default) + with av.open(str(video_path), "w", format="mp4") as output: + output_stream = output.add_stream(vcodec, fps, options=video_options) + + # Loop through input frames and encode them + for input in input_list: + input_image = Image.open(input).convert("RGB") + input_frame = av.VideoFrame.from_image(input_image) + packet = output_stream.encode(input_frame) + if packet: + output.mux(packet) + + # Flush the encoder + packet = output_stream.encode() + if packet: + output.mux(packet) + + # Reset logging level + if log_level is not None: + av.logging.restore_default_callback() if not video_path.exists(): - raise OSError( - f"Video encoding did not work. File not found: {video_path}. " - f"Try running the command manually to debug: `{''.join(ffmpeg_cmd)}`" - ) + raise OSError(f"Video encoding did not work. File not found: {video_path}.") @dataclass @@ -332,78 +348,68 @@ def __call__(self): def get_audio_info(video_path: Path | str) -> dict: - ffprobe_audio_cmd = [ - "ffprobe", - "-v", - "error", - "-select_streams", - "a:0", - "-show_entries", - "stream=channels,codec_name,bit_rate,sample_rate,bit_depth,channel_layout,duration", - "-of", - "json", - str(video_path), - ] - result = subprocess.run(ffprobe_audio_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - if result.returncode != 0: - raise RuntimeError(f"Error running ffprobe: {result.stderr}") - - info = json.loads(result.stdout) - audio_stream_info = info["streams"][0] if info.get("streams") else None - if audio_stream_info is None: - return {"has_audio": False} - - # Return the information, defaulting to None if no audio stream is present - return { - "has_audio": True, - "audio.channels": audio_stream_info.get("channels", None), - "audio.codec": audio_stream_info.get("codec_name", None), - "audio.bit_rate": int(audio_stream_info["bit_rate"]) if audio_stream_info.get("bit_rate") else None, - "audio.sample_rate": int(audio_stream_info["sample_rate"]) - if audio_stream_info.get("sample_rate") - else None, - "audio.bit_depth": audio_stream_info.get("bit_depth", None), - "audio.channel_layout": audio_stream_info.get("channel_layout", None), - } + # Set logging level + logging.getLogger("libav").setLevel(av.logging.ERROR) + + # Getting audio stream information + audio_info = {} + with av.open(str(video_path), "r") as audio_file: + try: + audio_stream = audio_file.streams.audio[0] + except IndexError: + # Reset logging level + av.logging.restore_default_callback() + return {"has_audio": False} + + audio_info["audio.channels"] = audio_stream.channels + audio_info["audio.codec"] = audio_stream.codec.canonical_name + audio_info["audio.bit_rate"] = ( + audio_stream.bit_rate + ) # In an ideal loseless case : bit depth x sample rate x channels = bit rate. In an actual compressed case, the bit rate is set according to the compression level : the lower the bit rate, the more compression is applied. + audio_info["audio.sample_rate"] = audio_stream.sample_rate # Number of samples per second + audio_info["audio.bit_depth"] = ( + audio_stream.format.bits + ) # In an ideal loseless case : fixed number of bits per sample. In an actual compressed case : variable number of bits per sample (often reduced to match a given depth rate). + audio_info["audio.channel_layout"] = audio_stream.layout.name + audio_info["has_audio"] = True + + # Reset logging level + av.logging.restore_default_callback() + + return audio_info def get_video_info(video_path: Path | str) -> dict: - ffprobe_video_cmd = [ - "ffprobe", - "-v", - "error", - "-select_streams", - "v:0", - "-show_entries", - "stream=r_frame_rate,width,height,codec_name,nb_frames,duration,pix_fmt", - "-of", - "json", - str(video_path), - ] - result = subprocess.run(ffprobe_video_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - if result.returncode != 0: - raise RuntimeError(f"Error running ffprobe: {result.stderr}") - - info = json.loads(result.stdout) - video_stream_info = info["streams"][0] - - # Calculate fps from r_frame_rate - r_frame_rate = video_stream_info["r_frame_rate"] - num, denom = map(int, r_frame_rate.split("/")) - fps = num / denom - - pixel_channels = get_video_pixel_channels(video_stream_info["pix_fmt"]) - - video_info = { - "video.fps": fps, - "video.height": video_stream_info["height"], - "video.width": video_stream_info["width"], - "video.channels": pixel_channels, - "video.codec": video_stream_info["codec_name"], - "video.pix_fmt": video_stream_info["pix_fmt"], - "video.is_depth_map": False, - **get_audio_info(video_path), - } + # Set logging level + logging.getLogger("libav").setLevel(av.logging.ERROR) + + # Getting video stream information + video_info = {} + with av.open(str(video_path), "r") as video_file: + try: + video_stream = video_file.streams.video[0] + except IndexError: + # Reset logging level + av.logging.restore_default_callback() + return {} + + video_info["video.height"] = video_stream.height + video_info["video.width"] = video_stream.width + video_info["video.codec"] = video_stream.codec.canonical_name + video_info["video.pix_fmt"] = video_stream.pix_fmt + video_info["video.is_depth_map"] = False + + # Calculate fps from r_frame_rate + video_info["video.fps"] = int(video_stream.base_rate) + + pixel_channels = get_video_pixel_channels(video_stream.pix_fmt) + video_info["video.channels"] = pixel_channels + + # Reset logging level + av.logging.restore_default_callback() + + # Adding audio stream information + video_info.update(**get_audio_info(video_path)) return video_info From 3e32cf5e3cc56336f1a65a01dba75dac8163766f Mon Sep 17 00:00:00 2001 From: CarolinePascal Date: Fri, 18 Apr 2025 18:34:33 +0200 Subject: [PATCH 2/6] style: lightweight overwrite implementation, better variable naming and comments formatting, deleting debug artifacts --- lerobot/common/datasets/video_utils.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/lerobot/common/datasets/video_utils.py b/lerobot/common/datasets/video_utils.py index 7e04d2a9ac..37d1bd86f1 100644 --- a/lerobot/common/datasets/video_utils.py +++ b/lerobot/common/datasets/video_utils.py @@ -258,12 +258,7 @@ def encode_video_frames( video_path = Path(video_path) imgs_dir = Path(imgs_dir) - if video_path.exists() and not overwrite: - raise FileExistsError( - f"Video file already exists at {video_path}. Use `overwrite=True` to overwrite it." - ) - - video_path.parent.mkdir(parents=True, exist_ok=True) + video_path.parent.mkdir(parents=True, exist_ok=overwrite) # Get input frames template = "frame_" + ("[0-9]" * 6) + ".png" @@ -291,12 +286,12 @@ def encode_video_frames( logging.getLogger("libav").setLevel(log_level) # Create and open output file (overwrite by default) - with av.open(str(video_path), "w", format="mp4") as output: + with av.open(str(video_path), "w") as output: output_stream = output.add_stream(vcodec, fps, options=video_options) # Loop through input frames and encode them - for input in input_list: - input_image = Image.open(input).convert("RGB") + for input_data in input_list: + input_image = Image.open(input_data).convert("RGB") input_frame = av.VideoFrame.from_image(input_image) packet = output_stream.encode(input_frame) if packet: @@ -363,13 +358,13 @@ def get_audio_info(video_path: Path | str) -> dict: audio_info["audio.channels"] = audio_stream.channels audio_info["audio.codec"] = audio_stream.codec.canonical_name - audio_info["audio.bit_rate"] = ( - audio_stream.bit_rate - ) # In an ideal loseless case : bit depth x sample rate x channels = bit rate. In an actual compressed case, the bit rate is set according to the compression level : the lower the bit rate, the more compression is applied. + # In an ideal loseless case : bit depth x sample rate x channels = bit rate. + # In an actual compressed case, the bit rate is set according to the compression level : the lower the bit rate, the more compression is applied. + audio_info["audio.bit_rate"] = audio_stream.bit_rate audio_info["audio.sample_rate"] = audio_stream.sample_rate # Number of samples per second - audio_info["audio.bit_depth"] = ( - audio_stream.format.bits - ) # In an ideal loseless case : fixed number of bits per sample. In an actual compressed case : variable number of bits per sample (often reduced to match a given depth rate). + # In an ideal loseless case : fixed number of bits per sample. + # In an actual compressed case : variable number of bits per sample (often reduced to match a given depth rate). + audio_info["audio.bit_depth"] = audio_stream.format.bits audio_info["audio.channel_layout"] = audio_stream.layout.name audio_info["has_audio"] = True From 4e0f0b2347d9730092613933e29b50414fb0b2a8 Mon Sep 17 00:00:00 2001 From: CarolinePascal Date: Fri, 18 Apr 2025 19:59:54 +0200 Subject: [PATCH 3/6] fix(encoding): adding output pixel format and frame size as output stream arguments and not codec options --- lerobot/common/datasets/video_utils.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/lerobot/common/datasets/video_utils.py b/lerobot/common/datasets/video_utils.py index 37d1bd86f1..c3cd46a1ab 100644 --- a/lerobot/common/datasets/video_utils.py +++ b/lerobot/common/datasets/video_utils.py @@ -266,8 +266,14 @@ def encode_video_frames( glob.glob(str(imgs_dir / template)), key=lambda x: int(x.split("_")[-1].split(".")[0]) ) - # Define video output options - video_options = {"pix_fmt": pix_fmt} + # Define video output frame size (assuming all input frames are the same size) + if len(input_list) == 0: + raise FileNotFoundError(f"No images found in {imgs_dir}.") + dummy_image = Image.open(input_list[0]) + width, height = dummy_image.size + + # Define video codec options + video_options = {} if g is not None: video_options["g"] = str(g) @@ -288,6 +294,9 @@ def encode_video_frames( # Create and open output file (overwrite by default) with av.open(str(video_path), "w") as output: output_stream = output.add_stream(vcodec, fps, options=video_options) + output_stream.pix_fmt = pix_fmt + output_stream.width = width + output_stream.height = height # Loop through input frames and encode them for input_data in input_list: From 5b783be2fcf5a7a7cfe8058bcdff9c8a875de63d Mon Sep 17 00:00:00 2001 From: CarolinePascal Date: Fri, 18 Apr 2025 20:18:39 +0200 Subject: [PATCH 4/6] fix(incompatibility warning): adding error handling when picking yuv444p pixel format with libsvtav1 --- lerobot/common/datasets/video_utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lerobot/common/datasets/video_utils.py b/lerobot/common/datasets/video_utils.py index c3cd46a1ab..8b96bc5e1d 100644 --- a/lerobot/common/datasets/video_utils.py +++ b/lerobot/common/datasets/video_utils.py @@ -260,6 +260,13 @@ def encode_video_frames( video_path.parent.mkdir(parents=True, exist_ok=overwrite) + # Encoders/pixel formats incompatibility check + if vcodec == "libsvtav1" and pix_fmt == "yuv444p": + logging.warning( + "Incompatible pixel format 'yuv444p' for codec 'libsvtav1', auto-selecting format 'yuv420p'" + ) + pix_fmt = "yuv420p" + # Get input frames template = "frame_" + ("[0-9]" * 6) + ".png" input_list = sorted( From b68ff704de4c20e48a04bc4e8b63e72f3d10e285 Mon Sep 17 00:00:00 2001 From: CarolinePascal Date: Wed, 23 Apr 2025 15:22:44 +0200 Subject: [PATCH 5/6] fix(encoders): adding pyav supported hevc encoder instead of h265 --- benchmarks/video/run_video_benchmark.py | 4 ++-- lerobot/common/datasets/video_utils.py | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/benchmarks/video/run_video_benchmark.py b/benchmarks/video/run_video_benchmark.py index c62578c46e..9d587ee9fd 100644 --- a/benchmarks/video/run_video_benchmark.py +++ b/benchmarks/video/run_video_benchmark.py @@ -416,7 +416,7 @@ def main( "--vcodec", type=str, nargs="*", - default=["libx264", "libx265", "libsvtav1"], + default=["libx264", "hevc", "libsvtav1"], help="Video codecs to be tested", ) parser.add_argument( @@ -446,7 +446,7 @@ def main( # nargs="*", # default=[0, 1], # help="Use the fastdecode tuning option. 0 disables it. " - # "For libx264 and libx265, only 1 is possible. " + # "For libx264 and libx265/hevc, only 1 is possible. " # "For libsvtav1, 1, 2 or 3 are possible values with a higher number meaning a faster decoding optimization", # ) parser.add_argument( diff --git a/lerobot/common/datasets/video_utils.py b/lerobot/common/datasets/video_utils.py index 8b96bc5e1d..375314e985 100644 --- a/lerobot/common/datasets/video_utils.py +++ b/lerobot/common/datasets/video_utils.py @@ -255,15 +255,19 @@ def encode_video_frames( overwrite: bool = False, ) -> None: """More info on ffmpeg arguments tuning on `benchmark/video/README.md`""" + # Check encoder availability + if vcodec not in ["h264", "hevc", "libsvtav1"]: + raise ValueError(f"Unsupported video codec: {vcodec}. Supported codecs are: h264, hevc, libsvtav1.") + video_path = Path(video_path) imgs_dir = Path(imgs_dir) video_path.parent.mkdir(parents=True, exist_ok=overwrite) # Encoders/pixel formats incompatibility check - if vcodec == "libsvtav1" and pix_fmt == "yuv444p": + if (vcodec == "libsvtav1" or vcodec == "hevc") and pix_fmt == "yuv444p": logging.warning( - "Incompatible pixel format 'yuv444p' for codec 'libsvtav1', auto-selecting format 'yuv420p'" + f"Incompatible pixel format 'yuv444p' for codec {vcodec}, auto-selecting format 'yuv420p'" ) pix_fmt = "yuv420p" From bfe05c1031acb489754422c4468446ee2c66fae1 Mon Sep 17 00:00:00 2001 From: CarolinePascal Date: Fri, 25 Apr 2025 11:13:29 +0200 Subject: [PATCH 6/6] fix(dependencies): updating pyav required minimal version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index db3d8e21cf..72047a4fbf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,7 @@ dependencies = [ "omegaconf>=2.3.0", "opencv-python-headless>=4.9.0", "packaging>=24.2", - "av>=12.0.5", + "av>=14.2.0", "pymunk>=6.6.0", "pynput>=1.7.7", "pyzmq>=26.2.1",