Skip to content

Commit

Permalink
Resolve buggy tts save by updating gTTS to v2.2.1
Browse files Browse the repository at this point in the history
Fix jdlorimer#161.
Thanks to @psii for mentioning that the update fixes the
issue.
  • Loading branch information
3ter committed Nov 17, 2020
1 parent b005552 commit d282e99
Show file tree
Hide file tree
Showing 19 changed files with 964 additions and 231 deletions.
12 changes: 11 additions & 1 deletion chinese/lib/gtts/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ def print_languages(ctx, param, value):
"""
if not value or ctx.resilient_parsing:
return

try:
langs = tts_langs()
langs_str_list = sorted("{}: {}".format(k, langs[k]) for k in langs)
Expand Down Expand Up @@ -136,6 +137,14 @@ def set_debug(ctx, param, debug):
show_default=True,
callback=validate_lang,
help="IETF language tag. Language to speak in. List documented tags with --all.")
@click.option(
'-t',
'--tld',
metavar='<tld>',
default='com',
show_default=True,
is_eager=True, # Prioritize <tld> to ensure it gets set before <lang>
help="Top-level domain for the Google host, i.e https://translate.google.<tld>")
@click.option(
'--nocheck',
default=False,
Expand All @@ -159,7 +168,7 @@ def set_debug(ctx, param, debug):
callback=set_debug,
help="Show debug information.")
@click.version_option(version=__version__)
def tts_cli(text, file, output, slow, lang, nocheck):
def tts_cli(text, file, output, slow, tld, lang, nocheck):
""" Read <text> to mp3 format using Google Translate's Text-to-Speech API
(set <text> or --file <file> to - for standard input)
"""
Expand Down Expand Up @@ -189,6 +198,7 @@ def tts_cli(text, file, output, slow, lang, nocheck):
text=text,
lang=lang,
slow=slow,
tld=tld,
lang_check=not nocheck)
tts.write_to_fp(output)
except (ValueError, AssertionError) as e:
Expand Down
137 changes: 78 additions & 59 deletions chinese/lib/gtts/lang.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,8 @@
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import logging
import re

__all__ = ['tts_langs']

URL_BASE = 'http://translate.google.com'
JS_FILE = 'translate_m.js'

# Logger
log = logging.getLogger(__name__)
log.addHandler(logging.NullHandler())
Expand All @@ -18,69 +12,94 @@ def tts_langs():
"""Languages Google Text-to-Speech supports.
Returns:
dict: A dictionnary of the type `{ '<lang>': '<name>'}`
dict: A dictionary of the type `{ '<lang>': '<name>'}`
Where `<lang>` is an IETF language tag such as `en` or `pt-br`,
and `<name>` is the full English name of the language, such as
`English` or `Portuguese (Brazil)`.
Where `<lang>` is an IETF language tag such as `en` or `pt-br`,
and `<name>` is the full English name of the language, such as
`English` or `Portuguese (Brazil)`.
The dictionnary returned combines languages from two origins:
The dictionary returned combines languages from two origins:
- Languages fetched automatically from Google Translate
- Languages fetched from Google Translate
- Languages that are undocumented variations that were observed to work and
present different dialects or accents.
"""
try:
langs = dict()
langs.update(_fetch_langs())
langs.update(_extra_langs())
log.debug("langs: %s", langs)
return langs
except Exception as e:
raise RuntimeError("Unable to get language list: %s" % str(e))

langs = dict()
langs.update(_main_langs())
langs.update(_extra_langs())
log.debug("langs: {}".format(langs))
return langs

def _fetch_langs():
"""Fetch (scrape) languages from Google Translate.

Google Translate loads a JavaScript Array of 'languages codes' that can
be spoken. We intersect this list with all the languages Google Translate
provides to get the ones that support text-to-speech.
def _main_langs():
"""Define the main languages.
Returns:
dict: A dictionnary of languages from Google Translate
dict: A dictionary of the main languages extracted from
Google Translate.
"""
# Load HTML
page = requests.get(URL_BASE)
soup = BeautifulSoup(page.content, 'html.parser')

# JavaScript URL
# The <script src=''> path can change, but not the file.
# Ex: /zyx/abc/20180211/desktop_module_main.js
js_path = soup.find(src=re.compile(JS_FILE))['src']
js_url = "{}/{}".format(URL_BASE, js_path)

# Load JavaScript
js_contents = requests.get(js_url).text

# Approximately extract TTS-enabled language codes
# RegEx pattern search because minified variables can change.
# Extra garbage will be dealt with later as we keep languages only.
# In: "[...]Fv={af:1,ar:1,[...],zh:1,"zh-cn":1,"zh-tw":1}[...]"
# Out: ['is', '12', [...], 'af', 'ar', [...], 'zh', 'zh-cn', 'zh-tw']
pattern = r'[{,\"](\w{2}|\w{2}-\w{2,3})(?=:1|\":1)'
tts_langs = re.findall(pattern, js_contents)

# Build lang. dict. from main page (JavaScript object populating lang. menu)
# Filtering with the TTS-enabled languages
# In: "{code:'auto',name:'Detect language'},{code:'af',name:'Afrikaans'},[...]"
# re.findall: [('auto', 'Detect language'), ('af', 'Afrikaans'), [...]]
# Out: {'af': 'Afrikaans', [...]}
trans_pattern = r"{code:'(?P<lang>.+?[^'])',name:'(?P<name>.+?[^'])'}"
trans_langs = re.findall(trans_pattern, page.text)
return {lang: name for lang, name in trans_langs if lang in tts_langs}
return {
'af': 'Afrikaans',
'ar': 'Arabic',
'bn': 'Bengali',
'bs': 'Bosnian',
'ca': 'Catalan',
'cs': 'Czech',
'cy': 'Welsh',
'da': 'Danish',
'de': 'German',
'el': 'Greek',
'en': 'English',
'eo': 'Esperanto',
'es': 'Spanish',
'et': 'Estonian',
'fi': 'Finnish',
'fr': 'French',
'gu': 'Gujarati',
'hi': 'Hindi',
'hr': 'Croatian',
'hu': 'Hungarian',
'hy': 'Armenian',
'id': 'Indonesian',
'is': 'Icelandic',
'it': 'Italian',
'ja': 'Japanese',
'jw': 'Javanese',
'km': 'Khmer',
'kn': 'Kannada',
'ko': 'Korean',
'la': 'Latin',
'lv': 'Latvian',
'mk': 'Macedonian',
'ml': 'Malayalam',
'mr': 'Marathi',
'my': 'Myanmar (Burmese)',
'ne': 'Nepali',
'nl': 'Dutch',
'no': 'Norwegian',
'pl': 'Polish',
'pt': 'Portuguese',
'ro': 'Romanian',
'ru': 'Russian',
'si': 'Sinhala',
'sk': 'Slovak',
'sq': 'Albanian',
'sr': 'Serbian',
'su': 'Sundanese',
'sv': 'Swedish',
'sw': 'Swahili',
'ta': 'Tamil',
'te': 'Telugu',
'th': 'Thai',
'tl': 'Filipino',
'tr': 'Turkish',
'uk': 'Ukrainian',
'ur': 'Urdu',
'vi': 'Vietnamese',
'zh-CN': 'Chinese'
}


def _extra_langs():
Expand All @@ -89,9 +108,9 @@ def _extra_langs():
Returns:
dict: A dictionary of extra languages manually defined.
Variations of the ones fetched by `_fetch_langs`,
observed to provide different dialects or accents or
just simply accepted by the Google Translate Text-to-Speech API.
Variations of the ones fetched by `_main_langs`,
observed to provide different dialects or accents or
just simply accepted by the Google Translate Text-to-Speech API.
"""
return {
Expand Down
File renamed without changes.
2 changes: 2 additions & 0 deletions chinese/lib/gtts/tests/input_files/test_cli_test_ascii.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Can you make pink a little more pinkish can you make pink a little more pinkish, nor can you make the font bigger?
How much will it cost the website doesn't have the theme i was going for.
5 changes: 5 additions & 0 deletions chinese/lib/gtts/tests/input_files/test_cli_test_utf8.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
这是一个三岁的小孩
在讲述她从一系列照片里看到的东西。
对这个世界, 她也许还有很多要学的东西,
但在一个重要的任务上, 她已经是专家了:
去理解她所看到的东西。
Loading

0 comments on commit d282e99

Please sign in to comment.