Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature dump #1

Merged
merged 8 commits into from
Oct 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -157,4 +157,7 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
#.idea/
test.ipynb
.gitignore
tests/samples/text.2.eng.srt
3 changes: 1 addition & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
FROM ubuntu:focal

RUN apt update && apt install software-properties-common -y
RUN add-apt-repository ppa:alex-p/tesseract-ocr5
RUN apt update && apt install software-properties-common -y && add-apt-repository ppa:alex-p/tesseract-ocr5
RUN apt update && apt install mkvtoolnix tesseract-ocr python3-pip ffmpeg libsm6 libxext6 -y

COPY ./requirements.txt requirements.txt
Expand Down
37 changes: 22 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,28 +31,35 @@ services:
## Usage:

```sh
usage: main.py [-h] [--path PATH] [--formats {ass,srt,vtt} [{ass,srt,vtt} ...]] [--languages LANGUAGES [LANGUAGES ...]] [--overwrite] [--disable_bitmap_extraction]
[--postprocess_only] [--postprocessing POSTPROCESSING] [--scan_interval SCAN_INTERVAL] [--log_level LOG_LEVEL] [--log_file LOG_FILE] [--progress_bar {on,off}]
positional arguments:
path path to media file/folder

options:
-h, --help show this help message and exit
--path PATH Path to media file/folder
--formats {ass,srt,vtt} [{ass,srt,vtt} ...]
Subtitles formats to extract to
output subtitles formats
--languages LANGUAGES [LANGUAGES ...]
Select subtitle languages stream to extract from, use 'all' to extract all languages
--overwrite Overwrite existing subtitle file
extract subtitle for selected languages, use 'all' to extract all languages
--unknown_language_as UNKNOWN_LANGUAGE_AS
treat unknown language as
--overwrite overwrite existing subtitle file
--disable_bitmap_extraction
Disable bitmap subtitle extraction via OCR
--postprocess_only Only do conduct post processing
disable extraction for bitmap based subtitle extraction via OCR
--postprocess_only only conduct post processing
--postprocessing POSTPROCESSING
Path to postprocessing config file
path to postprocessing config file
--scan_interval SCAN_INTERVAL
Interval to scan folder in mins (set 0 to disable and exit upon completion)
interval to monitor and scan folder in mins (set 0 to disable and exit upon completion)
--log_level LOG_LEVEL
Setting logging level
--log_file LOG_FILE Path to log file
--progress_bar {on,off}
Enable progress bar

setting logging level
--log_file LOG_FILE path to log file
--disable_progress_bar
disable progress bar
--exclude_videos EXCLUDE_VIDEOS
path to a newline separated file with paths to video files to exclude
--exclude_subtitles EXCLUDE_SUBTITLES
path to a newline separated file with paths to subtitles files to exclude
--exclude_mode {e,e+a}
set file exclusion behavior, e = exclude only, e+a = exclude and append new extracted file
--threads THREADS set number of extraction threads
```
4 changes: 4 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
services:

subtitle-extract:
build: .

subtitle-extract-image:
image: ghcr.io/klementng/subtitle-extract:main
container_name: subtitle-extract
volumes:
Expand Down
28 changes: 14 additions & 14 deletions extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def _run_subprocess(args):
return completed_process

@staticmethod
@cachetools.func.fifo_cache(maxsize=1)
@cachetools.func.fifo_cache(maxsize=10)
def _run_ffprobe(path):
"""
The function `_run_ffprobe` is used to probe a media file and retrieve information about its
Expand Down Expand Up @@ -337,7 +337,7 @@ def extract(self, media_path: str, stream_indexes: list):
a list of file paths.
"""

logger.info(
logger.debug(
f"[BitMapSubtitleExtractor] Processing subtitles for {media_path}, streams: {stream_indexes}")

ffmpeg_args = []
Expand All @@ -351,11 +351,11 @@ def extract(self, media_path: str, stream_indexes: list):
ffmpeg_args.extend(['-map', f"0:{i}", "-c", "copy", sup_f])

if ffmpeg_args != []:
logger.info(
logger.debug(
"[BitMapSubtitleExtractor] Extracting subtitles to .sup format...")
self._run_ffmpeg(media_path, ffmpeg_args)
else:
logger.info(
logger.debug(
"[BitMapSubtitleExtractor] .sup format found. Skipping ffmpeg extraction..")


Expand All @@ -367,12 +367,12 @@ def extract(self, media_path: str, stream_indexes: list):
srt_f = self.format_subtitle_path(media_path, i, 'srt')

if self.is_wanted(media_path, srt_f, i):
logger.info(
logger.debug(
"[BitMapSubtitleExtractor] Performing OCR. Converting .sup to .srt")
self._run_psgrip_ocr(sup_f, srt_f, lang)
filelist.append(srt_f)
else:
logger.info(
logger.debug(
"[BitMapSubtitleExtractor] .srt format found. Skipping OCR...")

# Convert SRT to desired subtitles formats
Expand All @@ -387,12 +387,12 @@ def extract(self, media_path: str, stream_indexes: list):
filelist.append(sub_f)

if len(tmp_filelist) != 0:
logger.info(
logger.debug(
"[BitMapSubtitleExtractor] Converting .srt to desired format")
self._run_ffmpeg(srt_f, tmp_filelist)

else:
logger.info(
logger.debug(
"[BitMapSubtitleExtractor] All desired format found, skipping...")

return filelist
Expand Down Expand Up @@ -423,7 +423,7 @@ def extract(self, media_path: str, stream_indexes: list):
a list of file paths for the extracted subtitles.
"""

logger.info(
logger.debug(
f"[TextSubtitleExtractor] Processing subtitles for {media_path}, streams: {stream_indexes}")

ffmpeg_args = []
Expand All @@ -439,10 +439,10 @@ def extract(self, media_path: str, stream_indexes: list):
filelist.append(f)

if ffmpeg_args != []:
logger.info("[TextSubtitleExtractor] Extracting Subtitles...")
logger.debug("[TextSubtitleExtractor] Extracting Subtitles...")
self._run_ffmpeg(media_path, ffmpeg_args)
else:
logger.info(
logger.debug(
"[TextSubtitleExtractor] All desired format found, skipping...")

return filelist
Expand All @@ -466,7 +466,7 @@ def extract(self, media_path):
Returns:
a list of files that were extracted.
"""
logger.info(f"[SubtitleExtractor] Processing {media_path}")
logger.debug(f"[SubtitleExtractor] Processing {media_path}")

ffprobe_info = self.get_subtitle_info(media_path)

Expand All @@ -490,13 +490,13 @@ def extract(self, media_path):

filelist = []
if len(text_streams) != 0:
logger.info("[SubtitleExtractor] Extracting text based subtitles")
logger.debug("[SubtitleExtractor] Extracting text based subtitles")
extractor = TextSubtitleExtractor.init(self)
filelist1 = extractor.extract(media_path, text_streams)
filelist.extend(filelist1)

if len(bitmap_streams) != 0 and self.extract_bitmap:
logger.info("[SubtitleExtractor] Extracting bitmap based subtitles")
logger.debug("[SubtitleExtractor] Extracting bitmap based subtitles")
extractor = BitmapSubtitleExtractor.init(self)
filelist2 = extractor.extract(media_path, bitmap_streams)
filelist.extend(filelist2)
Expand Down
Loading
Loading