From b3d63b067243611f532a639ab8a9fbdc27b60aed Mon Sep 17 00:00:00 2001
From: CarolinePascal <caroline8.pascal@gmail.com>
Date: Mon, 14 Apr 2025 17:54:29 +0200
Subject: [PATCH 1/6] feat(encoding): switching to pyav ffmpeg API to handle
 encoding and info gathering

---
 lerobot/common/datasets/video_utils.py | 202 +++++++++++++------------
 1 file changed, 104 insertions(+), 98 deletions(-)

diff --git a/lerobot/common/datasets/video_utils.py b/lerobot/common/datasets/video_utils.py
index c38d570ddf..7e04d2a9ac 100644
--- a/lerobot/common/datasets/video_utils.py
+++ b/lerobot/common/datasets/video_utils.py
@@ -13,16 +13,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import glob
 import importlib
-import json
 import logging
-import subprocess
 import warnings
-from collections import OrderedDict
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, ClassVar
 
+import av
 import pyarrow as pa
 import torch
 import torchvision
@@ -252,51 +251,68 @@ def encode_video_frames(
     g: int | None = 2,
     crf: int | None = 30,
     fast_decode: int = 0,
-    log_level: str | None = "error",
+    log_level: int | None = av.logging.ERROR,
     overwrite: bool = False,
 ) -> None:
     """More info on ffmpeg arguments tuning on `benchmark/video/README.md`"""
     video_path = Path(video_path)
     imgs_dir = Path(imgs_dir)
+
+    if video_path.exists() and not overwrite:
+        raise FileExistsError(
+            f"Video file already exists at {video_path}. Use `overwrite=True` to overwrite it."
+        )
+
     video_path.parent.mkdir(parents=True, exist_ok=True)
 
-    ffmpeg_args = OrderedDict(
-        [
-            ("-f", "image2"),
-            ("-r", str(fps)),
-            ("-i", str(imgs_dir / "frame_%06d.png")),
-            ("-vcodec", vcodec),
-            ("-pix_fmt", pix_fmt),
-        ]
+    # Get input frames
+    template = "frame_" + ("[0-9]" * 6) + ".png"
+    input_list = sorted(
+        glob.glob(str(imgs_dir / template)), key=lambda x: int(x.split("_")[-1].split(".")[0])
     )
 
+    # Define video output options
+    video_options = {"pix_fmt": pix_fmt}
+
     if g is not None:
-        ffmpeg_args["-g"] = str(g)
+        video_options["g"] = str(g)
 
     if crf is not None:
-        ffmpeg_args["-crf"] = str(crf)
+        video_options["crf"] = str(crf)
 
     if fast_decode:
-        key = "-svtav1-params" if vcodec == "libsvtav1" else "-tune"
+        key = "svtav1-params" if vcodec == "libsvtav1" else "tune"
         value = f"fast-decode={fast_decode}" if vcodec == "libsvtav1" else "fastdecode"
-        ffmpeg_args[key] = value
+        video_options[key] = value
 
+    # Set logging level
     if log_level is not None:
-        ffmpeg_args["-loglevel"] = str(log_level)
-
-    ffmpeg_args = [item for pair in ffmpeg_args.items() for item in pair]
-    if overwrite:
-        ffmpeg_args.append("-y")
-
-    ffmpeg_cmd = ["ffmpeg"] + ffmpeg_args + [str(video_path)]
-    # redirect stdin to subprocess.DEVNULL to prevent reading random keyboard inputs from terminal
-    subprocess.run(ffmpeg_cmd, check=True, stdin=subprocess.DEVNULL)
+        # "While less efficient, it is generally preferable to modify logging with Python’s logging"
+        logging.getLogger("libav").setLevel(log_level)
+
+    # Create and open output file (overwrite by default)
+    with av.open(str(video_path), "w", format="mp4") as output:
+        output_stream = output.add_stream(vcodec, fps, options=video_options)
+
+        # Loop through input frames and encode them
+        for input in input_list:
+            input_image = Image.open(input).convert("RGB")
+            input_frame = av.VideoFrame.from_image(input_image)
+            packet = output_stream.encode(input_frame)
+            if packet:
+                output.mux(packet)
+
+        # Flush the encoder
+        packet = output_stream.encode()
+        if packet:
+            output.mux(packet)
+
+    # Reset logging level
+    if log_level is not None:
+        av.logging.restore_default_callback()
 
     if not video_path.exists():
-        raise OSError(
-            f"Video encoding did not work. File not found: {video_path}. "
-            f"Try running the command manually to debug: `{''.join(ffmpeg_cmd)}`"
-        )
+        raise OSError(f"Video encoding did not work. File not found: {video_path}.")
 
 
 @dataclass
@@ -332,78 +348,68 @@ def __call__(self):
 
 
 def get_audio_info(video_path: Path | str) -> dict:
-    ffprobe_audio_cmd = [
-        "ffprobe",
-        "-v",
-        "error",
-        "-select_streams",
-        "a:0",
-        "-show_entries",
-        "stream=channels,codec_name,bit_rate,sample_rate,bit_depth,channel_layout,duration",
-        "-of",
-        "json",
-        str(video_path),
-    ]
-    result = subprocess.run(ffprobe_audio_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-    if result.returncode != 0:
-        raise RuntimeError(f"Error running ffprobe: {result.stderr}")
-
-    info = json.loads(result.stdout)
-    audio_stream_info = info["streams"][0] if info.get("streams") else None
-    if audio_stream_info is None:
-        return {"has_audio": False}
-
-    # Return the information, defaulting to None if no audio stream is present
-    return {
-        "has_audio": True,
-        "audio.channels": audio_stream_info.get("channels", None),
-        "audio.codec": audio_stream_info.get("codec_name", None),
-        "audio.bit_rate": int(audio_stream_info["bit_rate"]) if audio_stream_info.get("bit_rate") else None,
-        "audio.sample_rate": int(audio_stream_info["sample_rate"])
-        if audio_stream_info.get("sample_rate")
-        else None,
-        "audio.bit_depth": audio_stream_info.get("bit_depth", None),
-        "audio.channel_layout": audio_stream_info.get("channel_layout", None),
-    }
+    # Set logging level
+    logging.getLogger("libav").setLevel(av.logging.ERROR)
+
+    # Getting audio stream information
+    audio_info = {}
+    with av.open(str(video_path), "r") as audio_file:
+        try:
+            audio_stream = audio_file.streams.audio[0]
+        except IndexError:
+            # Reset logging level
+            av.logging.restore_default_callback()
+            return {"has_audio": False}
+
+        audio_info["audio.channels"] = audio_stream.channels
+        audio_info["audio.codec"] = audio_stream.codec.canonical_name
+        audio_info["audio.bit_rate"] = (
+            audio_stream.bit_rate
+        )  # In an ideal loseless case : bit depth x sample rate x channels = bit rate. In an actual compressed case, the bit rate is set according to the compression level : the lower the bit rate, the more compression is applied.
+        audio_info["audio.sample_rate"] = audio_stream.sample_rate  # Number of samples per second
+        audio_info["audio.bit_depth"] = (
+            audio_stream.format.bits
+        )  # In an ideal loseless case : fixed number of bits per sample. In an actual compressed case : variable number of bits per sample (often reduced to match a given depth rate).
+        audio_info["audio.channel_layout"] = audio_stream.layout.name
+        audio_info["has_audio"] = True
+
+    # Reset logging level
+    av.logging.restore_default_callback()
+
+    return audio_info
 
 
 def get_video_info(video_path: Path | str) -> dict:
-    ffprobe_video_cmd = [
-        "ffprobe",
-        "-v",
-        "error",
-        "-select_streams",
-        "v:0",
-        "-show_entries",
-        "stream=r_frame_rate,width,height,codec_name,nb_frames,duration,pix_fmt",
-        "-of",
-        "json",
-        str(video_path),
-    ]
-    result = subprocess.run(ffprobe_video_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-    if result.returncode != 0:
-        raise RuntimeError(f"Error running ffprobe: {result.stderr}")
-
-    info = json.loads(result.stdout)
-    video_stream_info = info["streams"][0]
-
-    # Calculate fps from r_frame_rate
-    r_frame_rate = video_stream_info["r_frame_rate"]
-    num, denom = map(int, r_frame_rate.split("/"))
-    fps = num / denom
-
-    pixel_channels = get_video_pixel_channels(video_stream_info["pix_fmt"])
-
-    video_info = {
-        "video.fps": fps,
-        "video.height": video_stream_info["height"],
-        "video.width": video_stream_info["width"],
-        "video.channels": pixel_channels,
-        "video.codec": video_stream_info["codec_name"],
-        "video.pix_fmt": video_stream_info["pix_fmt"],
-        "video.is_depth_map": False,
-        **get_audio_info(video_path),
-    }
+    # Set logging level
+    logging.getLogger("libav").setLevel(av.logging.ERROR)
+
+    # Getting video stream information
+    video_info = {}
+    with av.open(str(video_path), "r") as video_file:
+        try:
+            video_stream = video_file.streams.video[0]
+        except IndexError:
+            # Reset logging level
+            av.logging.restore_default_callback()
+            return {}
+
+        video_info["video.height"] = video_stream.height
+        video_info["video.width"] = video_stream.width
+        video_info["video.codec"] = video_stream.codec.canonical_name
+        video_info["video.pix_fmt"] = video_stream.pix_fmt
+        video_info["video.is_depth_map"] = False
+
+        # Calculate fps from the stream's base rate (truncated to an integer)
+        video_info["video.fps"] = int(video_stream.base_rate)
+
+        pixel_channels = get_video_pixel_channels(video_stream.pix_fmt)
+        video_info["video.channels"] = pixel_channels
+
+    # Reset logging level
+    av.logging.restore_default_callback()
+
+    # Adding audio stream information
+    video_info.update(**get_audio_info(video_path))
 
     return video_info
 

From 3e32cf5e3cc56336f1a65a01dba75dac8163766f Mon Sep 17 00:00:00 2001
From: CarolinePascal <caroline8.pascal@gmail.com>
Date: Fri, 18 Apr 2025 18:34:33 +0200
Subject: [PATCH 2/6] style: lightweight overwrite implementation, better
 variable naming and comments formatting, deleting debug artifacts

---
 lerobot/common/datasets/video_utils.py | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/lerobot/common/datasets/video_utils.py b/lerobot/common/datasets/video_utils.py
index 7e04d2a9ac..37d1bd86f1 100644
--- a/lerobot/common/datasets/video_utils.py
+++ b/lerobot/common/datasets/video_utils.py
@@ -258,12 +258,7 @@ def encode_video_frames(
     video_path = Path(video_path)
     imgs_dir = Path(imgs_dir)
 
-    if video_path.exists() and not overwrite:
-        raise FileExistsError(
-            f"Video file already exists at {video_path}. Use `overwrite=True` to overwrite it."
-        )
-
-    video_path.parent.mkdir(parents=True, exist_ok=True)
+    video_path.parent.mkdir(parents=True, exist_ok=overwrite)
 
     # Get input frames
     template = "frame_" + ("[0-9]" * 6) + ".png"
@@ -291,12 +286,12 @@ def encode_video_frames(
         logging.getLogger("libav").setLevel(log_level)
 
     # Create and open output file (overwrite by default)
-    with av.open(str(video_path), "w", format="mp4") as output:
+    with av.open(str(video_path), "w") as output:
         output_stream = output.add_stream(vcodec, fps, options=video_options)
 
         # Loop through input frames and encode them
-        for input in input_list:
-            input_image = Image.open(input).convert("RGB")
+        for input_data in input_list:
+            input_image = Image.open(input_data).convert("RGB")
             input_frame = av.VideoFrame.from_image(input_image)
             packet = output_stream.encode(input_frame)
             if packet:
@@ -363,13 +358,13 @@ def get_audio_info(video_path: Path | str) -> dict:
 
         audio_info["audio.channels"] = audio_stream.channels
         audio_info["audio.codec"] = audio_stream.codec.canonical_name
-        audio_info["audio.bit_rate"] = (
-            audio_stream.bit_rate
-        )  # In an ideal loseless case : bit depth x sample rate x channels = bit rate. In an actual compressed case, the bit rate is set according to the compression level : the lower the bit rate, the more compression is applied.
+        # In an ideal lossless case : bit depth x sample rate x channels = bit rate.
+        # In an actual compressed case, the bit rate is set according to the compression level : the lower the bit rate, the more compression is applied.
+        audio_info["audio.bit_rate"] = audio_stream.bit_rate
         audio_info["audio.sample_rate"] = audio_stream.sample_rate  # Number of samples per second
-        audio_info["audio.bit_depth"] = (
-            audio_stream.format.bits
-        )  # In an ideal loseless case : fixed number of bits per sample. In an actual compressed case : variable number of bits per sample (often reduced to match a given depth rate).
+        # In an ideal lossless case : fixed number of bits per sample.
+        # In an actual compressed case : variable number of bits per sample (often reduced to match a given depth rate).
+        audio_info["audio.bit_depth"] = audio_stream.format.bits
         audio_info["audio.channel_layout"] = audio_stream.layout.name
         audio_info["has_audio"] = True
 

From 4e0f0b2347d9730092613933e29b50414fb0b2a8 Mon Sep 17 00:00:00 2001
From: CarolinePascal <caroline8.pascal@gmail.com>
Date: Fri, 18 Apr 2025 19:59:54 +0200
Subject: [PATCH 3/6] fix(encoding): adding output pixel format and frame size
 as output stream arguments and not codec options

---
 lerobot/common/datasets/video_utils.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/lerobot/common/datasets/video_utils.py b/lerobot/common/datasets/video_utils.py
index 37d1bd86f1..c3cd46a1ab 100644
--- a/lerobot/common/datasets/video_utils.py
+++ b/lerobot/common/datasets/video_utils.py
@@ -266,8 +266,14 @@ def encode_video_frames(
         glob.glob(str(imgs_dir / template)), key=lambda x: int(x.split("_")[-1].split(".")[0])
     )
 
-    # Define video output options
-    video_options = {"pix_fmt": pix_fmt}
+    # Define video output frame size (assuming all input frames are the same size)
+    if len(input_list) == 0:
+        raise FileNotFoundError(f"No images found in {imgs_dir}.")
+    dummy_image = Image.open(input_list[0])
+    width, height = dummy_image.size
+
+    # Define video codec options
+    video_options = {}
 
     if g is not None:
         video_options["g"] = str(g)
@@ -288,6 +294,9 @@ def encode_video_frames(
     # Create and open output file (overwrite by default)
     with av.open(str(video_path), "w") as output:
         output_stream = output.add_stream(vcodec, fps, options=video_options)
+        output_stream.pix_fmt = pix_fmt
+        output_stream.width = width
+        output_stream.height = height
 
         # Loop through input frames and encode them
         for input_data in input_list:

From 5b783be2fcf5a7a7cfe8058bcdff9c8a875de63d Mon Sep 17 00:00:00 2001
From: CarolinePascal <caroline8.pascal@gmail.com>
Date: Fri, 18 Apr 2025 20:18:39 +0200
Subject: [PATCH 4/6] fix(incompatibility warning): adding error handling when
 picking yuv444p pixel format with libsvtav1

---
 lerobot/common/datasets/video_utils.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/lerobot/common/datasets/video_utils.py b/lerobot/common/datasets/video_utils.py
index c3cd46a1ab..8b96bc5e1d 100644
--- a/lerobot/common/datasets/video_utils.py
+++ b/lerobot/common/datasets/video_utils.py
@@ -260,6 +260,13 @@ def encode_video_frames(
 
     video_path.parent.mkdir(parents=True, exist_ok=overwrite)
 
+    # Encoders/pixel formats incompatibility check
+    if vcodec == "libsvtav1" and pix_fmt == "yuv444p":
+        logging.warning(
+            "Incompatible pixel format 'yuv444p' for codec 'libsvtav1', auto-selecting format 'yuv420p'"
+        )
+        pix_fmt = "yuv420p"
+
     # Get input frames
     template = "frame_" + ("[0-9]" * 6) + ".png"
     input_list = sorted(

From b68ff704de4c20e48a04bc4e8b63e72f3d10e285 Mon Sep 17 00:00:00 2001
From: CarolinePascal <caroline8.pascal@gmail.com>
Date: Wed, 23 Apr 2025 15:22:44 +0200
Subject: [PATCH 5/6] fix(encoders): adding pyav supported hevc encoder instead
 of h265

---
 benchmarks/video/run_video_benchmark.py | 4 ++--
 lerobot/common/datasets/video_utils.py  | 8 ++++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/benchmarks/video/run_video_benchmark.py b/benchmarks/video/run_video_benchmark.py
index c62578c46e..9d587ee9fd 100644
--- a/benchmarks/video/run_video_benchmark.py
+++ b/benchmarks/video/run_video_benchmark.py
@@ -416,7 +416,7 @@ def main(
         "--vcodec",
         type=str,
         nargs="*",
-        default=["libx264", "libx265", "libsvtav1"],
+        default=["libx264", "hevc", "libsvtav1"],
         help="Video codecs to be tested",
     )
     parser.add_argument(
@@ -446,7 +446,7 @@ def main(
     #     nargs="*",
     #     default=[0, 1],
     #     help="Use the fastdecode tuning option. 0 disables it. "
-    #         "For libx264 and libx265, only 1 is possible. "
+    #         "For libx264 and libx265/hevc, only 1 is possible. "
     #         "For libsvtav1, 1, 2 or 3 are possible values with a higher number meaning a faster decoding optimization",
     # )
     parser.add_argument(
diff --git a/lerobot/common/datasets/video_utils.py b/lerobot/common/datasets/video_utils.py
index 8b96bc5e1d..375314e985 100644
--- a/lerobot/common/datasets/video_utils.py
+++ b/lerobot/common/datasets/video_utils.py
@@ -255,15 +255,19 @@ def encode_video_frames(
     overwrite: bool = False,
 ) -> None:
     """More info on ffmpeg arguments tuning on `benchmark/video/README.md`"""
+    # Check encoder availability
+    if vcodec not in ["h264", "hevc", "libsvtav1"]:
+        raise ValueError(f"Unsupported video codec: {vcodec}. Supported codecs are: h264, hevc, libsvtav1.")
+
     video_path = Path(video_path)
     imgs_dir = Path(imgs_dir)
 
     video_path.parent.mkdir(parents=True, exist_ok=overwrite)
 
     # Encoders/pixel formats incompatibility check
-    if vcodec == "libsvtav1" and pix_fmt == "yuv444p":
+    if (vcodec == "libsvtav1" or vcodec == "hevc") and pix_fmt == "yuv444p":
         logging.warning(
-            "Incompatible pixel format 'yuv444p' for codec 'libsvtav1', auto-selecting format 'yuv420p'"
+            f"Incompatible pixel format 'yuv444p' for codec {vcodec}, auto-selecting format 'yuv420p'"
         )
         pix_fmt = "yuv420p"
 

From bfe05c1031acb489754422c4468446ee2c66fae1 Mon Sep 17 00:00:00 2001
From: CarolinePascal <caroline8.pascal@gmail.com>
Date: Fri, 25 Apr 2025 11:13:29 +0200
Subject: [PATCH 6/6] fix(dependencies): updating pyav required minimal version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index db3d8e21cf..72047a4fbf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,7 +62,7 @@ dependencies = [
     "omegaconf>=2.3.0",
     "opencv-python-headless>=4.9.0",
     "packaging>=24.2",
-    "av>=12.0.5",
+    "av>=14.2.0",
     "pymunk>=6.6.0",
     "pynput>=1.7.7",
     "pyzmq>=26.2.1",