Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Updated gTTS
Browse files Browse the repository at this point in the history
psii committed Jan 29, 2021

Verified

This commit was signed with the committer’s verified signature.
sdispater Sébastien Eustace
1 parent 435bbdc commit 2ae3be4
Showing 19 changed files with 964 additions and 231 deletions.
12 changes: 11 additions & 1 deletion chinese/lib/gtts/cli.py
Original file line number Diff line number Diff line change
@@ -88,6 +88,7 @@ def print_languages(ctx, param, value):
"""
if not value or ctx.resilient_parsing:
return

try:
langs = tts_langs()
langs_str_list = sorted("{}: {}".format(k, langs[k]) for k in langs)
@@ -136,6 +137,14 @@ def set_debug(ctx, param, debug):
show_default=True,
callback=validate_lang,
help="IETF language tag. Language to speak in. List documented tags with --all.")
@click.option(
'-t',
'--tld',
metavar='<tld>',
default='com',
show_default=True,
is_eager=True, # Prioritize <tld> to ensure it gets set before <lang>
help="Top-level domain for the Google host, i.e https://translate.google.<tld>")
@click.option(
'--nocheck',
default=False,
@@ -159,7 +168,7 @@ def set_debug(ctx, param, debug):
callback=set_debug,
help="Show debug information.")
@click.version_option(version=__version__)
def tts_cli(text, file, output, slow, lang, nocheck):
def tts_cli(text, file, output, slow, tld, lang, nocheck):
""" Read <text> to mp3 format using Google Translate's Text-to-Speech API
(set <text> or --file <file> to - for standard input)
"""
@@ -189,6 +198,7 @@ def tts_cli(text, file, output, slow, lang, nocheck):
text=text,
lang=lang,
slow=slow,
tld=tld,
lang_check=not nocheck)
tts.write_to_fp(output)
except (ValueError, AssertionError) as e:
137 changes: 78 additions & 59 deletions chinese/lib/gtts/lang.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,8 @@
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import logging
import re

__all__ = ['tts_langs']

URL_BASE = 'http://translate.google.com'
JS_FILE = 'translate_m.js'

# Logger
log = logging.getLogger(__name__)
log.addHandler(logging.NullHandler())
@@ -18,69 +12,94 @@ def tts_langs():
"""Languages Google Text-to-Speech supports.
Returns:
dict: A dictionnary of the type `{ '<lang>': '<name>'}`
dict: A dictionary of the type `{ '<lang>': '<name>'}`
Where `<lang>` is an IETF language tag such as `en` or `pt-br`,
and `<name>` is the full English name of the language, such as
`English` or `Portuguese (Brazil)`.
Where `<lang>` is an IETF language tag such as `en` or `pt-br`,
and `<name>` is the full English name of the language, such as
`English` or `Portuguese (Brazil)`.
The dictionnary returned combines languages from two origins:
The dictionary returned combines languages from two origins:
- Languages fetched automatically from Google Translate
- Languages fetched from Google Translate
- Languages that are undocumented variations that were observed to work and
present different dialects or accents.
"""
try:
langs = dict()
langs.update(_fetch_langs())
langs.update(_extra_langs())
log.debug("langs: %s", langs)
return langs
except Exception as e:
raise RuntimeError("Unable to get language list: %s" % str(e))

langs = dict()
langs.update(_main_langs())
langs.update(_extra_langs())
log.debug("langs: {}".format(langs))
return langs

def _fetch_langs():
"""Fetch (scrape) languages from Google Translate.

Google Translate loads a JavaScript Array of 'languages codes' that can
be spoken. We intersect this list with all the languages Google Translate
provides to get the ones that support text-to-speech.
def _main_langs():
"""Define the main languages.
Returns:
dict: A dictionnary of languages from Google Translate
dict: A dictionary of the main languages extracted from
Google Translate.
"""
# Load HTML
page = requests.get(URL_BASE)
soup = BeautifulSoup(page.content, 'html.parser')

# JavaScript URL
# The <script src=''> path can change, but not the file.
# Ex: /zyx/abc/20180211/desktop_module_main.js
js_path = soup.find(src=re.compile(JS_FILE))['src']
js_url = "{}/{}".format(URL_BASE, js_path)

# Load JavaScript
js_contents = requests.get(js_url).text

# Approximately extract TTS-enabled language codes
# RegEx pattern search because minified variables can change.
# Extra garbage will be dealt with later as we keep languages only.
# In: "[...]Fv={af:1,ar:1,[...],zh:1,"zh-cn":1,"zh-tw":1}[...]"
# Out: ['is', '12', [...], 'af', 'ar', [...], 'zh', 'zh-cn', 'zh-tw']
pattern = r'[{,\"](\w{2}|\w{2}-\w{2,3})(?=:1|\":1)'
tts_langs = re.findall(pattern, js_contents)

# Build lang. dict. from main page (JavaScript object populating lang. menu)
# Filtering with the TTS-enabled languages
# In: "{code:'auto',name:'Detect language'},{code:'af',name:'Afrikaans'},[...]"
# re.findall: [('auto', 'Detect language'), ('af', 'Afrikaans'), [...]]
# Out: {'af': 'Afrikaans', [...]}
trans_pattern = r"{code:'(?P<lang>.+?[^'])',name:'(?P<name>.+?[^'])'}"
trans_langs = re.findall(trans_pattern, page.text)
return {lang: name for lang, name in trans_langs if lang in tts_langs}
return {
'af': 'Afrikaans',
'ar': 'Arabic',
'bn': 'Bengali',
'bs': 'Bosnian',
'ca': 'Catalan',
'cs': 'Czech',
'cy': 'Welsh',
'da': 'Danish',
'de': 'German',
'el': 'Greek',
'en': 'English',
'eo': 'Esperanto',
'es': 'Spanish',
'et': 'Estonian',
'fi': 'Finnish',
'fr': 'French',
'gu': 'Gujarati',
'hi': 'Hindi',
'hr': 'Croatian',
'hu': 'Hungarian',
'hy': 'Armenian',
'id': 'Indonesian',
'is': 'Icelandic',
'it': 'Italian',
'ja': 'Japanese',
'jw': 'Javanese',
'km': 'Khmer',
'kn': 'Kannada',
'ko': 'Korean',
'la': 'Latin',
'lv': 'Latvian',
'mk': 'Macedonian',
'ml': 'Malayalam',
'mr': 'Marathi',
'my': 'Myanmar (Burmese)',
'ne': 'Nepali',
'nl': 'Dutch',
'no': 'Norwegian',
'pl': 'Polish',
'pt': 'Portuguese',
'ro': 'Romanian',
'ru': 'Russian',
'si': 'Sinhala',
'sk': 'Slovak',
'sq': 'Albanian',
'sr': 'Serbian',
'su': 'Sundanese',
'sv': 'Swedish',
'sw': 'Swahili',
'ta': 'Tamil',
'te': 'Telugu',
'th': 'Thai',
'tl': 'Filipino',
'tr': 'Turkish',
'uk': 'Ukrainian',
'ur': 'Urdu',
'vi': 'Vietnamese',
'zh-CN': 'Chinese'
}


def _extra_langs():
@@ -89,9 +108,9 @@ def _extra_langs():
Returns:
dict: A dictionary of extra languages manually defined.
Variations of the ones fetched by `_fetch_langs`,
observed to provide different dialects or accents or
just simply accepted by the Google Translate Text-to-Speech API.
Variations of the ones fetched by `_main_langs`,
observed to provide different dialects or accents or
just simply accepted by the Google Translate Text-to-Speech API.
"""
return {
File renamed without changes.
2 changes: 2 additions & 0 deletions chinese/lib/gtts/tests/input_files/test_cli_test_ascii.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Can you make pink a little more pinkish can you make pink a little more pinkish, nor can you make the font bigger?
How much will it cost the website doesn't have the theme i was going for.
5 changes: 5 additions & 0 deletions chinese/lib/gtts/tests/input_files/test_cli_test_utf8.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
这是一个三岁的小孩
在讲述她从一系列照片里看到的东西。
对这个世界, 她也许还有很多要学的东西,
但在一个重要的任务上, 她已经是专家了:
去理解她所看到的东西。
Loading

0 comments on commit 2ae3be4

Please sign in to comment.