Skip to content
This repository has been archived by the owner on Nov 30, 2023. It is now read-only.

Update gtts to 2.2.1 and gtts-token to 1.1.4 #167

Closed
wants to merge 1 commit into the base branch from the head branch
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion chinese/lib/gtts/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ def print_languages(ctx, param, value):
"""
if not value or ctx.resilient_parsing:
return

try:
langs = tts_langs()
langs_str_list = sorted("{}: {}".format(k, langs[k]) for k in langs)
Expand Down Expand Up @@ -136,6 +137,14 @@ def set_debug(ctx, param, debug):
show_default=True,
callback=validate_lang,
help="IETF language tag. Language to speak in. List documented tags with --all.")
@click.option(
'-t',
'--tld',
metavar='<tld>',
default='com',
show_default=True,
is_eager=True, # Prioritize <tld> to ensure it gets set before <lang>
help="Top-level domain for the Google host, i.e https://translate.google.<tld>")
@click.option(
'--nocheck',
default=False,
Expand All @@ -159,7 +168,7 @@ def set_debug(ctx, param, debug):
callback=set_debug,
help="Show debug information.")
@click.version_option(version=__version__)
def tts_cli(text, file, output, slow, lang, nocheck):
def tts_cli(text, file, output, slow, tld, lang, nocheck):
""" Read <text> to mp3 format using Google Translate's Text-to-Speech API
(set <text> or --file <file> to - for standard input)
"""
Expand Down Expand Up @@ -189,6 +198,7 @@ def tts_cli(text, file, output, slow, lang, nocheck):
text=text,
lang=lang,
slow=slow,
tld=tld,
lang_check=not nocheck)
tts.write_to_fp(output)
except (ValueError, AssertionError) as e:
Expand Down
137 changes: 78 additions & 59 deletions chinese/lib/gtts/lang.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,8 @@
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import logging
import re

__all__ = ['tts_langs']

URL_BASE = 'http://translate.google.com'
JS_FILE = 'translate_m.js'

# Logger
log = logging.getLogger(__name__)
log.addHandler(logging.NullHandler())
Expand All @@ -18,69 +12,94 @@ def tts_langs():
"""Languages Google Text-to-Speech supports.

Returns:
dict: A dictionnary of the type `{ '<lang>': '<name>'}`
dict: A dictionary of the type `{ '<lang>': '<name>'}`

Where `<lang>` is an IETF language tag such as `en` or `pt-br`,
and `<name>` is the full English name of the language, such as
`English` or `Portuguese (Brazil)`.
Where `<lang>` is an IETF language tag such as `en` or `pt-br`,
and `<name>` is the full English name of the language, such as
`English` or `Portuguese (Brazil)`.

The dictionnary returned combines languages from two origins:
The dictionary returned combines languages from two origins:

- Languages fetched automatically from Google Translate
- Languages fetched from Google Translate
- Languages that are undocumented variations that were observed to work and
present different dialects or accents.

"""
try:
langs = dict()
langs.update(_fetch_langs())
langs.update(_extra_langs())
log.debug("langs: %s", langs)
return langs
except Exception as e:
raise RuntimeError("Unable to get language list: %s" % str(e))

langs = dict()
langs.update(_main_langs())
langs.update(_extra_langs())
log.debug("langs: {}".format(langs))
return langs

def _fetch_langs():
"""Fetch (scrape) languages from Google Translate.

Google Translate loads a JavaScript Array of 'languages codes' that can
be spoken. We intersect this list with all the languages Google Translate
provides to get the ones that support text-to-speech.
def _main_langs():
"""Define the main languages.

Returns:
dict: A dictionnary of languages from Google Translate
dict: A dictionnary of the main languages extracted from
Google Translate.

"""
# Load HTML
page = requests.get(URL_BASE)
soup = BeautifulSoup(page.content, 'html.parser')

# JavaScript URL
# The <script src=''> path can change, but not the file.
# Ex: /zyx/abc/20180211/desktop_module_main.js
js_path = soup.find(src=re.compile(JS_FILE))['src']
js_url = "{}/{}".format(URL_BASE, js_path)

# Load JavaScript
js_contents = requests.get(js_url).text

# Approximately extract TTS-enabled language codes
# RegEx pattern search because minified variables can change.
# Extra garbage will be dealt with later as we keep languages only.
# In: "[...]Fv={af:1,ar:1,[...],zh:1,"zh-cn":1,"zh-tw":1}[...]"
# Out: ['is', '12', [...], 'af', 'ar', [...], 'zh', 'zh-cn', 'zh-tw']
pattern = r'[{,\"](\w{2}|\w{2}-\w{2,3})(?=:1|\":1)'
tts_langs = re.findall(pattern, js_contents)

# Build lang. dict. from main page (JavaScript object populating lang. menu)
# Filtering with the TTS-enabled languages
# In: "{code:'auto',name:'Detect language'},{code:'af',name:'Afrikaans'},[...]"
# re.findall: [('auto', 'Detect language'), ('af', 'Afrikaans'), [...]]
# Out: {'af': 'Afrikaans', [...]}
trans_pattern = r"{code:'(?P<lang>.+?[^'])',name:'(?P<name>.+?[^'])'}"
trans_langs = re.findall(trans_pattern, page.text)
return {lang: name for lang, name in trans_langs if lang in tts_langs}
return {
'af': 'Afrikaans',
'ar': 'Arabic',
'bn': 'Bengali',
'bs': 'Bosnian',
'ca': 'Catalan',
'cs': 'Czech',
'cy': 'Welsh',
'da': 'Danish',
'de': 'German',
'el': 'Greek',
'en': 'English',
'eo': 'Esperanto',
'es': 'Spanish',
'et': 'Estonian',
'fi': 'Finnish',
'fr': 'French',
'gu': 'Gujarati',
'hi': 'Hindi',
'hr': 'Croatian',
'hu': 'Hungarian',
'hy': 'Armenian',
'id': 'Indonesian',
'is': 'Icelandic',
'it': 'Italian',
'ja': 'Japanese',
'jw': 'Javanese',
'km': 'Khmer',
'kn': 'Kannada',
'ko': 'Korean',
'la': 'Latin',
'lv': 'Latvian',
'mk': 'Macedonian',
'ml': 'Malayalam',
'mr': 'Marathi',
'my': 'Myanmar (Burmese)',
'ne': 'Nepali',
'nl': 'Dutch',
'no': 'Norwegian',
'pl': 'Polish',
'pt': 'Portuguese',
'ro': 'Romanian',
'ru': 'Russian',
'si': 'Sinhala',
'sk': 'Slovak',
'sq': 'Albanian',
'sr': 'Serbian',
'su': 'Sundanese',
'sv': 'Swedish',
'sw': 'Swahili',
'ta': 'Tamil',
'te': 'Telugu',
'th': 'Thai',
'tl': 'Filipino',
'tr': 'Turkish',
'uk': 'Ukrainian',
'ur': 'Urdu',
'vi': 'Vietnamese',
'zh-CN': 'Chinese'
}


def _extra_langs():
Expand All @@ -89,9 +108,9 @@ def _extra_langs():
Returns:
dict: A dictionnary of extra languages manually defined.

Variations of the ones fetched by `_fetch_langs`,
observed to provide different dialects or accents or
just simply accepted by the Google Translate Text-to-Speech API.
Variations of the ones fetched by `_main_langs`,
observed to provide different dialects or accents or
just simply accepted by the Google Translate Text-to-Speech API.

"""
return {
Expand Down
8 changes: 4 additions & 4 deletions chinese/lib/gtts/tokenizer/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@


class RegexBuilder():
"""Builds regex using arguments passed into a pattern template.
r"""Builds regex using arguments passed into a pattern template.

Builds a regex object for which the pattern is made from an argument
passed into a template. If more than one argument is passed (iterable),
Expand Down Expand Up @@ -71,7 +71,7 @@ def __repr__(self): # pragma: no cover


class PreProcessorRegex():
"""Regex-based substitution text pre-processor.
r"""Regex-based substitution text pre-processor.

Runs a series of regex substitutions (``re.sub``) from each ``regex`` of a
:class:`gtts.tokenizer.core.RegexBuilder` with an extra ``repl``
Expand Down Expand Up @@ -147,7 +147,7 @@ def __repr__(self): # pragma: no cover


class PreProcessorSub():
"""Simple substitution text preprocessor.
r"""Simple substitution text preprocessor.

Performs string-for-string substitution from list a find/replace pairs.
It abstracts :class:`gtts.tokenizer.core.PreProcessorRegex` with a default
Expand Down Expand Up @@ -213,7 +213,7 @@ def __repr__(self): # pragma: no cover


class Tokenizer():
"""An extensible but simple generic rule-based tokenizer.
r"""An extensible but simple generic rule-based tokenizer.

A generic and simple string tokenizer that takes a list of functions
(called `tokenizer cases`) returning ``regex`` objects and joins them by
Expand Down
2 changes: 1 addition & 1 deletion chinese/lib/gtts/tokenizer/symbols.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
'prof', 'sr', 'st']

SUB_PAIRS = [
('M.', 'Monsieur')
('Esq.', 'Esquire')
]

ALL_PUNC = u"?!?!.,¡()[]¿…‥،;:—。,、:\n"
Expand Down
Loading