diff --git a/src/requirements.txt b/src/requirements.txt index 9ffeeaa..6c5e6af 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -4,4 +4,3 @@ pysrt pyvtt chardet pyasstosrt -pydantic diff --git a/src/subtitle_utils/NOTICE.MD b/src/subtitle_utils/NOTICE.MD new file mode 100644 index 0000000..20816e9 --- /dev/null +++ b/src/subtitle_utils/NOTICE.MD @@ -0,0 +1,7 @@ +# NOTICE OF THIS PROJECT + +## MIT License + +The MIT license applies to the files in: + + file: "subtitle_utils/convert/bcc.py" modified from https://github.com/FXTD-ODYSSEY/bilibili-subtile-uploader diff --git a/src/subtitle_utils/__init__.py b/src/subtitle_utils/__init__.py new file mode 100644 index 0000000..5c54462 --- /dev/null +++ b/src/subtitle_utils/__init__.py @@ -0,0 +1,115 @@ +# -*- coding: utf-8 -*- +# @Time : 12/29/22 10:06 PM +# @FileName: convert.py +# @Software: PyCharm +# @Github :sudoskys +import json +from typing import Union, IO, Callable, Any + +from .convert.ass import AssConvert +from .convert.bcc import BccConvert + +FOOTNOTE = None + + +def srt2bcc(content: Union[str, IO] + ) -> str: + result = BccConvert().srt2bcc(content=content, about=FOOTNOTE) + result = json.dumps(result, ensure_ascii=False, indent=None) + return result + + +def vtt2bcc(content: Union[str, IO] + ) -> str: + result = BccConvert().vtt2bcc(content=content, about=FOOTNOTE) + result = json.dumps(result, ensure_ascii=False, indent=None) + return result + + +def ass2bcc(content: Union[str, IO] + ) -> str: + ass_result = AssConvert().ass2srt(content=content) + result = BccConvert().srt2bcc(content=ass_result, about=FOOTNOTE) + result = json.dumps(result, ensure_ascii=False, indent=None) + return result + + +def ass2srt(content: Union[str, IO] + ) -> str: + """ + :param content: + :return: result + """ + result = AssConvert().ass2srt(content=content) + return result + + +def srt2ass(content: Union[str, IO], + *, + header: str = None + ) -> str: + """ + :param content: srt str| IO + :param header: ass 
subtitle style + :return: + """ + result = AssConvert().srt2ass(content=content, header=header) + return result + + +def bcc2srt(content: Union[str, IO], + ) -> str: + result = BccConvert().bcc2srt(content=content) + return result + + +def bcc2ass(content: Union[str, IO] + ) -> str: + bcc_result = BccConvert().bcc2srt(content=content) + result = AssConvert().srt2ass(content=bcc_result) + return result + + +_to_table = { + "2srt": { + "ass": ass2srt, + "bcc": bcc2srt, + }, + "2bcc": { + "vtt": vtt2bcc, + "srt": srt2bcc, + "ass": ass2bcc, + }, + "2ass": { + "srt": srt2ass, + "bcc": bcc2ass, + }, +} + + +def get_method(method: str) -> Callable[..., Any]: + available_method = show_available() + assert method in available_method, f"Not available in {available_method}" + sub_key, key = method.split("2", maxsplit=1) + top_key = f"2{key}" + child = _to_table.get(top_key, None) + assert child, f"{method} NotImplemented for top class" + method_func = child.get(sub_key, None) + assert method_func, f"{method} NotImplemented for sub class" + return method_func + + +def show_available() -> list: + """ + 查询可用方法,返回功能列表 + :return: + """ + _method = [] + for it in _to_table.keys(): + _child = _to_table[it] + if not isinstance(_child, dict): + continue + _from = _child.keys() + for ti in _from: + _method.append(f"{ti}{it}") + return _method diff --git a/src/subtitle_utils/convert/LICENSE b/src/subtitle_utils/convert/LICENSE new file mode 100644 index 0000000..2849113 --- /dev/null +++ b/src/subtitle_utils/convert/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 智伤帝 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to 
the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/subtitle_utils/AssConverter.py b/src/subtitle_utils/convert/ass.py similarity index 57% rename from subtitle_utils/AssConverter.py rename to src/subtitle_utils/convert/ass.py index 1503a48..49c0600 100644 --- a/subtitle_utils/AssConverter.py +++ b/src/subtitle_utils/convert/ass.py @@ -1,19 +1,17 @@ # -*- coding: utf-8 -*- -# @Time : 12/30/22 2:54 PM -# @FileName: AssConverter.py +# @Time : 2024/1/18 下午6:05 +# @Author : sudoskys +# @File : ass.py # @Software: PyCharm -# @Github :sudoskys -import re -from pathlib import Path -from pyasstosrt import Subtitle +import tempfile +from typing import Union, IO -from .utils import SrtParse +from pyasstosrt import Subtitle +from ..parse import SrtParse +from ..schema import Convert -class AssUtils(object): - @staticmethod - def defultHeader() -> str: - return """[V4+ Styles] +ASS_HEADER = """[V4+ Styles] Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding Style: Default,Arial,20,&H00FFFFFF,&HF0000000,&H00000000,&HF0000000,1,0,0,0,100,100,0,0.00,1,1,0,2,30,30,10,134 @@ -21,18 +19,24 @@ def defultHeader() -> str: Format: Layer, Start, End, Style, Actor, MarginL, MarginR, MarginV, Effect, Text """ - def 
ass_content(self, content, header: str) -> str: + +class AssConvert(Convert): + @staticmethod + def srt2ass(content: Union[str, IO], + *, + header: str = None) -> str: """ - 字幕转换 - :param timestamps: 时间轴 - :param subtitles: 字幕 - :param header: 头 - :return: 合成字幕 + Subtitle Converter + :param content: subtitle path or content + :param header: ASS HEADER (Style) + :return: processed subtitle """ - subs = SrtParse().parse(strs=content) + assert isinstance(content, (str, IO)), "content must be str or IO" + subs = SrtParse().parse(content=content) timestamps = [[str(sub.start), str(sub.end)] for sub in subs] subtitles = [sub.text for sub in subs] - header = header if header else AssUtils.defultHeader() + if header is None: + header = ASS_HEADER content = header + '\n' body = { 'dialogue': 'Dialogue: ', @@ -63,21 +67,19 @@ def ass_content(self, content, header: str) -> str: content += '\n' return content - -class AssConvert(object): - - def ass2srt(self, files: str) -> str: - path = Path(files) - sub = Subtitle(path) - dialog = sub.export(output_dialogues=True) - _result = [] - for dialogue in dialog: - _result.append(str(dialogue)) + @staticmethod + def ass2srt(content: Union[str, IO]) -> str: + assert isinstance(content, (str, IO)), "content must be str or IO" + # write to temp file + with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False) as f: + if isinstance(content, str): + f.write(content) + else: + f.write(content.read()) + f.close() + sub = Subtitle(filepath=f.name) + dialog = sub.export(output_dialogues=True) + _result = [] + for dialogue in dialog: + _result.append(str(dialogue)) return "".join(_result) - - def srt2ass(self, strs: str, header: str = "") -> str: - content = AssUtils().ass_content(content=strs, header=header) - return content - -# res = AssConvert().ass2srt(files="../test/sub.ass") -# print(res) diff --git a/subtitle_utils/BccConverter.py b/src/subtitle_utils/convert/bcc.py similarity index 69% rename from 
subtitle_utils/BccConverter.py rename to src/subtitle_utils/convert/bcc.py index c6ec226..f2a59c9 100644 --- a/subtitle_utils/BccConverter.py +++ b/src/subtitle_utils/convert/bcc.py @@ -1,19 +1,15 @@ # -*- coding: utf-8 -*- -# @Time : 12/30/22 2:39 PM -# @FileName: BccConvert.py -# @Software: PyCharm -# @Github :sudoskys +# @Author : sudoskys,智伤帝 +# @File : bcc.py import re -import os -import json -import pyvtt -import pysrt - -from typing import Union from datetime import datetime +from typing import Union, IO + from loguru import logger +from ..parse import VttParser, BccParser, SrtParse +from ..schema import Convert ##### # BCC2SRT @@ -21,51 +17,29 @@ # VTT2BCC ##### +item = { + "from": 0, + "to": 0, + "location": 2, + "content": "", +} -class BccParser(object): - def __init__(self): - pass - - def _parse(self, content: str): - try: - return json.loads(content) - except Exception as e: - raise Exception(e) - def parseFile(self, files): - path = files if files else "" - if not os.path.exists(path): - return - with open(files, "r") as f: - return self._parse(f.read()) +class BccConvert(Convert): - def parseStr(self, files): - strs = files if files else "" - return self._parse(strs) - - -class BccConvert(object): - def __init__(self): - self.item = { - "from": 0, - "to": 0, - "location": 2, - "content": "", - } - - def merge_timeline(self, time_line: list): + @staticmethod + def _merge_timeline(time_line: list): """ 防止时间码重合,压扁时间轴 - :param time_line: - :param additive: 附加字幕 - :return: + :param time_line: 时间轴 + :return: 压扁后的时间轴 """ # 制作爆破点 _time_dot = {} - for item in time_line: - _start = item["from"] - _end = item["to"] - _content = item["content"] + for items in time_line: + _start = items["from"] + _end = items["to"] + _content = items["content"] _uid = _start + _end import uuid uid1 = uuid.uuid1() @@ -75,11 +49,11 @@ def merge_timeline(self, time_line: list): _time_dot[uid2.hex] = {"time": _end, "type": "end", "content": _content, "group": uid} # 查找当前点的字幕。 - def 
sub_title_now(dot: float): + def sub_title_now(dot_: float): sub_title_n = [] rev = False for it in time_line: - if it["from"] <= dot < it["to"]: + if it["from"] <= dot_ < it["to"]: if "字幕" in it["content"] and len(it["content"]) > 7: rev = True sub_title_n.append(it["content"]) @@ -148,7 +122,14 @@ def merge_large(timeline: list): return merge_large(_result) - def process_body(self, subs, about: str = None): + def _process_body(self, subs, about: str = None): + """ + 处理字幕内容 + :param subs: 字幕列表 + :param about: 关于字幕 + :return: 处理后的字幕内容 + """ + _origin = [] if about: _origin.append({ @@ -166,23 +147,21 @@ def process_body(self, subs, about: str = None): } for sub in subs ]) - _fix = self.merge_timeline(_origin) + _fix = self._merge_timeline(_origin) return _fix - def time2str(self, time: float): + @staticmethod + def _time2str(time: float): return datetime.utcfromtimestamp(time).strftime("%H:%M:%S,%f")[:-3] - def srt2bcc(self, files: Union[str], about: str = None): + def srt2bcc(self, content: Union[str, IO], about: str = None): """ srt2bcc 将 srt 转换为 bcc B站字幕格式 + :param content: srt format :return: """ - path = files if files else "" - if os.path.exists(path): - subs = pysrt.open(path=files) - else: - subs = pysrt.from_string(source=files) - body = self.process_body(subs, about=about) + subs = SrtParse().parse(content) + body = self._process_body(subs, about=about) bcc = { "font_size": 0.4, "font_color": "#FFFFFF", @@ -193,17 +172,13 @@ def srt2bcc(self, files: Union[str], about: str = None): } return bcc if subs else {} - def bcc2srt(self, files: Union[str]): + def bcc2srt(self, content: Union[str, IO]): """ bcc2srt 将 bcc 转换为 srt 字幕格式 + :param content: bcc format :return: """ - path = files if files else "" - if os.path.exists(path): - with open(path, "r", encoding="utf-8") as f: - subs = json.load(f) - else: - subs = json.loads(path) + subs = BccParser().parse(content) srt = "" count = 0 for single_str in subs["body"]: @@ -212,16 +187,20 @@ def bcc2srt(self, files: 
Union[str]): from_str = single_str['from'] to_str = single_str['to'] srt += f"{count}\n" - srt += f"{self.time2str(from_str)} --> {self.time2str(to_str)}\n" + srt += f"{self._time2str(from_str)} --> {self._time2str(to_str)}\n" srt += f"{content_str}\n\n" return srt[:-1] if subs else "" - def vtt2bcc(self, files, threshold=0.1, word=True, about: str = None): - path = files if files else "" - if os.path.exists(path): - subs = pyvtt.open(path) - else: - subs = pyvtt.from_string(path) + def vtt2bcc(self, content: Union[str, IO], threshold=0.1, word=True, about: str = None): + """ + vtt2bcc 将 vtt 转换为 bcc B站字幕格式 + :param content: vtt format + :param threshold: 两个字幕之间的间隔时间 + :param word: 是否按照断词模式分隔字幕 + :param about: 关于字幕 + :return: bcc format + """ + subs = VttParser().parse(content) # NOTE 按照 vtt 的断词模式分隔 bcc caption_list = [] if not word: @@ -275,7 +254,8 @@ def vtt2bcc(self, files, threshold=0.1, word=True, about: str = None): } ) start = sec - except: + except Exception as e: + logger.trace(e) final_text = sub.text.split("\n")[-1] if caption_list and caption_list[-1]["content"] == final_text: caption_list[-1].update( @@ -300,7 +280,7 @@ def vtt2bcc(self, files, threshold=0.1, word=True, about: str = None): # NOTE 避免超出视频长度 last = caption_list[-1] last["to"] = last.get("from") + 0.1 - body = self.process_body(caption_list, about=about) + body = self._process_body(caption_list, about=about) bcc = { "font_size": 0.4, "font_color": "#FFFFFF", @@ -310,29 +290,3 @@ def vtt2bcc(self, files, threshold=0.1, word=True, about: str = None): "body": body, } return bcc if subs else {} - - -""" -# 部分原始代码协议:https://github.com/FXTD-ODYSSEY/bilibili-subtile-uploader/blob/main/LICENSE -MIT License - -Copyright (c) 2020 智伤帝 - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, 
publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" diff --git a/src/subtitle_utils/parse.py b/src/subtitle_utils/parse.py new file mode 100644 index 0000000..f15bdb4 --- /dev/null +++ b/src/subtitle_utils/parse.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- +# @Time : 12/31/22 9:53 AM +# @FileName: parse.py +# @Software: PyCharm +# @Github :sudoskys +import json +import os +import tempfile +from abc import ABC +from typing import Union, IO + +import pysrt +import pyvtt +from pysrt import SubRipFile +from pyvtt import WebVTTFile + + +class Parser(ABC): + """ + Base Parser + """ + + def parse(self, content: Union[str, IO]): + raise NotImplementedError + + +class SrtParse(Parser): + + def parse(self, content: Union[str, IO]) -> SubRipFile: + if isinstance(content, str): + return pysrt.from_string(content) + # write to temp file + with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=True) as f: + f.write(content.read()) + f.close() + return pysrt.open(f.name) + + +class BccParser(Parser): + + @staticmethod + def _parse(content: str): + try: + return json.loads(content) + except Exception as e: + raise e + + def parse_file(self, files): + path = files if files else "" + if not os.path.exists(path): + return + with open(files, "r") as f: + 
return self._parse(f.read()) + + def parse_str(self, sentence): + strs = sentence if sentence else "" + return self._parse(strs) + + def parse(self, content: Union[str, IO]) -> dict: + """ + Parse bcc + :param content: str or IO + :return: json + """ + if isinstance(content, str): + return self.parse_str(content) + return self.parse_file(content) + + +class VttParser(Parser): + + def parse(self, content: Union[str, IO]) -> WebVTTFile: + """ + :param content: str or IO + :return: pyvtt.WebVTTFile + """ + if isinstance(content, str): + return pyvtt.from_string(content) + # write to temp file + with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=True) as f: + f.write(content.read()) + f.close() + return pyvtt.open(f.name) diff --git a/src/subtitle_utils/schema.py b/src/subtitle_utils/schema.py new file mode 100644 index 0000000..b82384c --- /dev/null +++ b/src/subtitle_utils/schema.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/1/18 下午4:22 +# @Author : sudoskys +# @File : schema.py +# @Software: PyCharm +from abc import ABC +from enum import Enum + + +class SubtitleType(Enum): + ASS = "ass" + BCC = "bcc" + SRT = "srt" + VTT = "vtt" + + +class Convert(ABC): + pass