Skip to content

Commit f8239dc

Browse files
committed
Fixing yt captions when encountering inconsistent transcripts
Sometimes the duration is not given; in that case, we default it to 0.
1 parent 20c4dd8 commit f8239dc

File tree

2 files changed

+16
-4
lines changed

2 files changed

+16
-4
lines changed

minet/scrape/soup.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import re
2-
from typing import List, Optional, cast
2+
from typing import List, Optional, cast, overload
33

44
import warnings
55
from contextlib import contextmanager
@@ -94,8 +94,16 @@ def get_outer_html(self) -> str:
9494
def __getitem__(self, name: str) -> str:
9595
return cast(str, super().__getitem__(name))
9696

97-
def get(self, name: str) -> Optional[str]:
98-
return cast(Optional[str], super().get(name))
97+
@overload
98+
def get(self, name: str, default: str = ...) -> str:
99+
...
100+
101+
@overload
102+
def get(self, name: str, default: None = ...) -> Optional[str]:
103+
...
104+
105+
def get(self, name: str, default: Optional[str] = None) -> Optional[str]:
106+
return cast(Optional[str], super().get(name, default))
99107

100108
def get_list(self, name: str) -> List[str]:
101109
value = super().get(name)

minet/youtube/scraper.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,14 @@ def get_video_captions(
101101
captions = []
102102

103103
for item in soup.select("text"):
104+
105+
# NOTE: sometimes duration is absent. I don't really
106+
# know what is the best solution there (merging with
107+
# previous item?). So for now, we default duration to 0.
104108
captions.append(
105109
YouTubeCaptionLine(
106110
float(item["start"]),
107-
float(item["dur"]),
111+
float(item.get("dur", "0")),
108112
unescape(item.get_text().strip()),
109113
)
110114
)

0 commit comments

Comments
 (0)