Skip to content

Commit

Permalink
Merge pull request #1 from klementng/dev
Browse files Browse the repository at this point in the history
Feature dump
  • Loading branch information
klementng authored Oct 20, 2023
2 parents 88b63de + 444c423 commit d5e0a8f
Show file tree
Hide file tree
Showing 9 changed files with 233 additions and 181 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -157,4 +157,7 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
#.idea/
test.ipynb
.gitignore
tests/samples/text.2.eng.srt
3 changes: 1 addition & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
FROM ubuntu:focal

RUN apt update && apt install software-properties-common -y
RUN add-apt-repository ppa:alex-p/tesseract-ocr5
RUN apt update && apt install software-properties-common -y && add-apt-repository ppa:alex-p/tesseract-ocr5
RUN apt update && apt install mkvtoolnix tesseract-ocr python3-pip ffmpeg libsm6 libxext6 -y

COPY ./requirements.txt requirements.txt
Expand Down
37 changes: 22 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,28 +31,35 @@ services:
## Usage:
```sh
usage: main.py [-h] [--path PATH] [--formats {ass,srt,vtt} [{ass,srt,vtt} ...]] [--languages LANGUAGES [LANGUAGES ...]] [--overwrite] [--disable_bitmap_extraction]
[--postprocess_only] [--postprocessing POSTPROCESSING] [--scan_interval SCAN_INTERVAL] [--log_level LOG_LEVEL] [--log_file LOG_FILE] [--progress_bar {on,off}]
positional arguments:
path path to media file/folder

options:
-h, --help show this help message and exit
--path PATH Path to media file/folder
--formats {ass,srt,vtt} [{ass,srt,vtt} ...]
Subtitles formats to extract to
output subtitle formats
--languages LANGUAGES [LANGUAGES ...]
Select subtitle languages stream to extract from, use 'all' to extract all languages
--overwrite Overwrite existing subtitle file
extract subtitle for selected languages, use 'all' to extract all languages
--unknown_language_as UNKNOWN_LANGUAGE_AS
treat unknown language as
--overwrite overwrite existing subtitle file
--disable_bitmap_extraction
Disable bitmap subtitle extraction via OCR
--postprocess_only Only do conduct post processing
disable bitmap-based subtitle extraction via OCR
--postprocess_only only conduct post-processing
--postprocessing POSTPROCESSING
Path to postprocessing config file
path to postprocessing config file
--scan_interval SCAN_INTERVAL
Interval to scan folder in mins (set 0 to disable and exit upon completion)
interval in minutes between folder scans (set to 0 to disable monitoring and exit upon completion)
--log_level LOG_LEVEL
Setting logging level
--log_file LOG_FILE Path to log file
--progress_bar {on,off}
Enable progress bar

set logging level
--log_file LOG_FILE path to log file
--disable_progress_bar
disable progress bar
--exclude_videos EXCLUDE_VIDEOS
path to a newline-separated file listing paths of video files to exclude
--exclude_subtitles EXCLUDE_SUBTITLES
path to a newline-separated file listing paths of subtitle files to exclude
--exclude_mode {e,e+a}
set file exclusion behavior, e = exclude only, e+a = exclude and append new extracted file
--threads THREADS set number of extraction threads
```
4 changes: 4 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
services:

subtitle-extract:
build: .

subtitle-extract-image:
image: ghcr.io/klementng/subtitle-extract:main
container_name: subtitle-extract
volumes:
Expand Down
28 changes: 14 additions & 14 deletions extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def _run_subprocess(args):
return completed_process

@staticmethod
@cachetools.func.fifo_cache(maxsize=1)
@cachetools.func.fifo_cache(maxsize=10)
def _run_ffprobe(path):
"""
The function `_run_ffprobe` is used to probe a media file and retrieve information about its
Expand Down Expand Up @@ -337,7 +337,7 @@ def extract(self, media_path: str, stream_indexes: list):
a list of file paths.
"""

logger.info(
logger.debug(
f"[BitMapSubtitleExtractor] Processing subtitles for {media_path}, streams: {stream_indexes}")

ffmpeg_args = []
Expand All @@ -351,11 +351,11 @@ def extract(self, media_path: str, stream_indexes: list):
ffmpeg_args.extend(['-map', f"0:{i}", "-c", "copy", sup_f])

if ffmpeg_args != []:
logger.info(
logger.debug(
"[BitMapSubtitleExtractor] Extracting subtitles to .sup format...")
self._run_ffmpeg(media_path, ffmpeg_args)
else:
logger.info(
logger.debug(
"[BitMapSubtitleExtractor] .sup format found. Skipping ffmpeg extraction..")


Expand All @@ -367,12 +367,12 @@ def extract(self, media_path: str, stream_indexes: list):
srt_f = self.format_subtitle_path(media_path, i, 'srt')

if self.is_wanted(media_path, srt_f, i):
logger.info(
logger.debug(
"[BitMapSubtitleExtractor] Performing OCR. Converting .sup to .srt")
self._run_psgrip_ocr(sup_f, srt_f, lang)
filelist.append(srt_f)
else:
logger.info(
logger.debug(
"[BitMapSubtitleExtractor] .srt format found. Skipping OCR...")

# Convert SRT to desired subtitles formats
Expand All @@ -387,12 +387,12 @@ def extract(self, media_path: str, stream_indexes: list):
filelist.append(sub_f)

if len(tmp_filelist) != 0:
logger.info(
logger.debug(
"[BitMapSubtitleExtractor] Converting .srt to desired format")
self._run_ffmpeg(srt_f, tmp_filelist)

else:
logger.info(
logger.debug(
"[BitMapSubtitleExtractor] All desired format found, skipping...")

return filelist
Expand Down Expand Up @@ -423,7 +423,7 @@ def extract(self, media_path: str, stream_indexes: list):
a list of file paths for the extracted subtitles.
"""

logger.info(
logger.debug(
f"[TextSubtitleExtractor] Processing subtitles for {media_path}, streams: {stream_indexes}")

ffmpeg_args = []
Expand All @@ -439,10 +439,10 @@ def extract(self, media_path: str, stream_indexes: list):
filelist.append(f)

if ffmpeg_args != []:
logger.info("[TextSubtitleExtractor] Extracting Subtitles...")
logger.debug("[TextSubtitleExtractor] Extracting Subtitles...")
self._run_ffmpeg(media_path, ffmpeg_args)
else:
logger.info(
logger.debug(
"[TextSubtitleExtractor] All desired format found, skipping...")

return filelist
Expand All @@ -466,7 +466,7 @@ def extract(self, media_path):
Returns:
a list of files that were extracted.
"""
logger.info(f"[SubtitleExtractor] Processing {media_path}")
logger.debug(f"[SubtitleExtractor] Processing {media_path}")

ffprobe_info = self.get_subtitle_info(media_path)

Expand All @@ -490,13 +490,13 @@ def extract(self, media_path):

filelist = []
if len(text_streams) != 0:
logger.info("[SubtitleExtractor] Extracting text based subtitles")
logger.debug("[SubtitleExtractor] Extracting text based subtitles")
extractor = TextSubtitleExtractor.init(self)
filelist1 = extractor.extract(media_path, text_streams)
filelist.extend(filelist1)

if len(bitmap_streams) != 0 and self.extract_bitmap:
logger.info("[SubtitleExtractor] Extracting bitmap based subtitles")
logger.debug("[SubtitleExtractor] Extracting bitmap based subtitles")
extractor = BitmapSubtitleExtractor.init(self)
filelist2 = extractor.extract(media_path, bitmap_streams)
filelist.extend(filelist2)
Expand Down
Loading

0 comments on commit d5e0a8f

Please sign in to comment.