Skip to content
This repository has been archived by the owner on Nov 30, 2023. It is now read-only.

Update gtts to 2.2.1 and gtts-token to 1.1.4 #167

Closed
wants to merge 1 commit into the base branch from the head branch
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion chinese/lib/gtts/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ def print_languages(ctx, param, value):
"""
if not value or ctx.resilient_parsing:
return

try:
langs = tts_langs()
langs_str_list = sorted("{}: {}".format(k, langs[k]) for k in langs)
Expand Down Expand Up @@ -136,6 +137,14 @@ def set_debug(ctx, param, debug):
show_default=True,
callback=validate_lang,
help="IETF language tag. Language to speak in. List documented tags with --all.")
@click.option(
'-t',
'--tld',
metavar='<tld>',
default='com',
show_default=True,
is_eager=True, # Prioritize <tld> to ensure it gets set before <lang>
help="Top-level domain for the Google host, i.e https://translate.google.<tld>")
@click.option(
'--nocheck',
default=False,
Expand All @@ -159,7 +168,7 @@ def set_debug(ctx, param, debug):
callback=set_debug,
help="Show debug information.")
@click.version_option(version=__version__)
def tts_cli(text, file, output, slow, lang, nocheck):
def tts_cli(text, file, output, slow, tld, lang, nocheck):
""" Read <text> to mp3 format using Google Translate's Text-to-Speech API
(set <text> or --file <file> to - for standard input)
"""
Expand Down Expand Up @@ -189,6 +198,7 @@ def tts_cli(text, file, output, slow, lang, nocheck):
text=text,
lang=lang,
slow=slow,
tld=tld,
lang_check=not nocheck)
tts.write_to_fp(output)
except (ValueError, AssertionError) as e:
Expand Down
137 changes: 78 additions & 59 deletions chinese/lib/gtts/lang.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,8 @@
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import logging
import re

__all__ = ['tts_langs']

URL_BASE = 'http://translate.google.com'
JS_FILE = 'translate_m.js'

# Logger
log = logging.getLogger(__name__)
log.addHandler(logging.NullHandler())
Expand All @@ -18,69 +12,94 @@ def tts_langs():
"""Languages Google Text-to-Speech supports.

Returns:
dict: A dictionnary of the type `{ '<lang>': '<name>'}`
dict: A dictionary of the type `{ '<lang>': '<name>'}`

Where `<lang>` is an IETF language tag such as `en` or `pt-br`,
and `<name>` is the full English name of the language, such as
`English` or `Portuguese (Brazil)`.
Where `<lang>` is an IETF language tag such as `en` or `pt-br`,
and `<name>` is the full English name of the language, such as
`English` or `Portuguese (Brazil)`.

The dictionnary returned combines languages from two origins:
The dictionary returned combines languages from two origins:

- Languages fetched automatically from Google Translate
- Languages fetched from Google Translate
- Languages that are undocumented variations that were observed to work and
present different dialects or accents.

"""
try:
langs = dict()
langs.update(_fetch_langs())
langs.update(_extra_langs())
log.debug("langs: %s", langs)
return langs
except Exception as e:
raise RuntimeError("Unable to get language list: %s" % str(e))

langs = dict()
langs.update(_main_langs())
langs.update(_extra_langs())
log.debug("langs: {}".format(langs))
return langs

def _fetch_langs():
"""Fetch (scrape) languages from Google Translate.

Google Translate loads a JavaScript Array of 'languages codes' that can
be spoken. We intersect this list with all the languages Google Translate
provides to get the ones that support text-to-speech.
def _main_langs():
"""Define the main languages.

Returns:
dict: A dictionnary of languages from Google Translate
dict: A dictionnary of the main languages extracted from
Google Translate.

"""
# Load HTML
page = requests.get(URL_BASE)
soup = BeautifulSoup(page.content, 'html.parser')

# JavaScript URL
# The <script src=''> path can change, but not the file.
# Ex: /zyx/abc/20180211/desktop_module_main.js
js_path = soup.find(src=re.compile(JS_FILE))['src']
js_url = "{}/{}".format(URL_BASE, js_path)

# Load JavaScript
js_contents = requests.get(js_url).text

# Approximately extract TTS-enabled language codes
# RegEx pattern search because minified variables can change.
# Extra garbage will be dealt with later as we keep languages only.
# In: "[...]Fv={af:1,ar:1,[...],zh:1,"zh-cn":1,"zh-tw":1}[...]"
# Out: ['is', '12', [...], 'af', 'ar', [...], 'zh', 'zh-cn', 'zh-tw']
pattern = r'[{,\"](\w{2}|\w{2}-\w{2,3})(?=:1|\":1)'
tts_langs = re.findall(pattern, js_contents)

# Build lang. dict. from main page (JavaScript object populating lang. menu)
# Filtering with the TTS-enabled languages
# In: "{code:'auto',name:'Detect language'},{code:'af',name:'Afrikaans'},[...]"
# re.findall: [('auto', 'Detect language'), ('af', 'Afrikaans'), [...]]
# Out: {'af': 'Afrikaans', [...]}
trans_pattern = r"{code:'(?P<lang>.+?[^'])',name:'(?P<name>.+?[^'])'}"
trans_langs = re.findall(trans_pattern, page.text)
return {lang: name for lang, name in trans_langs if lang in tts_langs}
return {
'af': 'Afrikaans',
'ar': 'Arabic',
'bn': 'Bengali',
'bs': 'Bosnian',
'ca': 'Catalan',
'cs': 'Czech',
'cy': 'Welsh',
'da': 'Danish',
'de': 'German',
'el': 'Greek',
'en': 'English',
'eo': 'Esperanto',
'es': 'Spanish',
'et': 'Estonian',
'fi': 'Finnish',
'fr': 'French',
'gu': 'Gujarati',
'hi': 'Hindi',
'hr': 'Croatian',
'hu': 'Hungarian',
'hy': 'Armenian',
'id': 'Indonesian',
'is': 'Icelandic',
'it': 'Italian',
'ja': 'Japanese',
'jw': 'Javanese',
'km': 'Khmer',
'kn': 'Kannada',
'ko': 'Korean',
'la': 'Latin',
'lv': 'Latvian',
'mk': 'Macedonian',
'ml': 'Malayalam',
'mr': 'Marathi',
'my': 'Myanmar (Burmese)',
'ne': 'Nepali',
'nl': 'Dutch',
'no': 'Norwegian',
'pl': 'Polish',
'pt': 'Portuguese',
'ro': 'Romanian',
'ru': 'Russian',
'si': 'Sinhala',
'sk': 'Slovak',
'sq': 'Albanian',
'sr': 'Serbian',
'su': 'Sundanese',
'sv': 'Swedish',
'sw': 'Swahili',
'ta': 'Tamil',
'te': 'Telugu',
'th': 'Thai',
'tl': 'Filipino',
'tr': 'Turkish',
'uk': 'Ukrainian',
'ur': 'Urdu',
'vi': 'Vietnamese',
'zh-CN': 'Chinese'
}


def _extra_langs():
Expand All @@ -89,9 +108,9 @@ def _extra_langs():
Returns:
dict: A dictionnary of extra languages manually defined.

Variations of the ones fetched by `_fetch_langs`,
observed to provide different dialects or accents or
just simply accepted by the Google Translate Text-to-Speech API.
Variations of the ones fetched by `_main_langs`,
observed to provide different dialects or accents or
just simply accepted by the Google Translate Text-to-Speech API.

"""
return {
Expand Down
8 changes: 4 additions & 4 deletions chinese/lib/gtts/tokenizer/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@


class RegexBuilder():
"""Builds regex using arguments passed into a pattern template.
r"""Builds regex using arguments passed into a pattern template.

Builds a regex object for which the pattern is made from an argument
passed into a template. If more than one argument is passed (iterable),
Expand Down Expand Up @@ -71,7 +71,7 @@ def __repr__(self): # pragma: no cover


class PreProcessorRegex():
"""Regex-based substitution text pre-processor.
r"""Regex-based substitution text pre-processor.

Runs a series of regex substitutions (``re.sub``) from each ``regex`` of a
:class:`gtts.tokenizer.core.RegexBuilder` with an extra ``repl``
Expand Down Expand Up @@ -147,7 +147,7 @@ def __repr__(self): # pragma: no cover


class PreProcessorSub():
"""Simple substitution text preprocessor.
r"""Simple substitution text preprocessor.

Performs string-for-string substitution from list a find/replace pairs.
It abstracts :class:`gtts.tokenizer.core.PreProcessorRegex` with a default
Expand Down Expand Up @@ -213,7 +213,7 @@ def __repr__(self): # pragma: no cover


class Tokenizer():
"""An extensible but simple generic rule-based tokenizer.
r"""An extensible but simple generic rule-based tokenizer.

A generic and simple string tokenizer that takes a list of functions
(called `tokenizer cases`) returning ``regex`` objects and joins them by
Expand Down
2 changes: 1 addition & 1 deletion chinese/lib/gtts/tokenizer/symbols.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
'prof', 'sr', 'st']

SUB_PAIRS = [
('M.', 'Monsieur')
('Esq.', 'Esquire')
]

ALL_PUNC = u"?!?!.,¡()[]¿…‥،;:—。,、:\n"
Expand Down
Loading