Merge from 3.x: PR #3640

Fixes #3631 Fixes #3670
spyder-ide · Nov 23, 2016 · 378aba5 · 378aba5
2 parents 08dc2dd + bb8e516
commit 378aba5
Show file tree

Hide file tree

Showing 11 changed files with 194 additions and 46 deletions.
diff --git a/README.md b/README.md
@@ -156,6 +156,7 @@ a Python version greater than 2.7 (Python 3.2 is not supported anymore).
 * **PyZMQ**: Run introspection services asynchronously.
 * **QtPy** 1.1.0+: Abstraction layer for Python Qt bindings so that Spyder can run on PyQt4
   and PyQt5.
+* **Chardet**: Character encoding auto-detection in Python.
 
 ### Optional dependencies
 

diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml
@@ -32,6 +32,7 @@ requirements:
     - qtawesome
     - qtpy >=1.1.0
     - pyzmq
+    - chardet >=2.0.0
 
 about:
   home: https://github.com/spyder-ide/spyder

diff --git a/continuous_integration/conda-recipes/spyder/meta.yaml b/continuous_integration/conda-recipes/spyder/meta.yaml
@@ -33,6 +33,7 @@ requirements:
     - pylint
     - qtawesome
     - qtpy
+    - chardet >=2.0.0
 
 about:
   home: https://github.com/spyder-ide/spyder

diff --git a/doc/installation.rst b/doc/installation.rst
@@ -194,6 +194,8 @@ The requirements to run Spyder are:
 * `QtPy <https://github.com/spyder-ide/qtpy>`_ >=1.1.0 -- To run Spyder with PyQt4 or
   PyQt5 seamlessly.
 
+* `Chardet <https://github.com/chardet/chardet>`_ >=2.0.0 -- Character encoding auto-detection
+  in Python.
 
 Optional modules
 ~~~~~~~~~~~~~~~~

diff --git a/setup.py b/setup.py
@@ -283,7 +283,8 @@ def run(self):
     'qtawesome',
     'qtpy>=1.1.0',
     'pickleshare',
-    'pyzmq'
+    'pyzmq',
+    'chardet>=2.0.0',
 ]
 
 if 'setuptools' in sys.modules:

diff --git a/spyder/utils/encoding.py b/spyder/utils/encoding.py
@@ -20,6 +20,7 @@
 # Local imports
 from spyder.py3compat import (is_string, to_text_string, is_binary_string,
                               is_unicode)
+from spyder.utils.external.binaryornot.check import is_binary
 
 
 PREFERRED_ENCODING = locale.getpreferredencoding()
@@ -228,36 +229,5 @@ def readlines(filename, encoding='utf-8'):
 def is_text_file(filename):
     """
     Test if the given path is a text-like file.
-    
-    Adapted from: http://stackoverflow.com/a/3002505
-    
-    Original Authors: Trent Mick <[email protected]>
-                      Jorge Orpinel <[email protected]>
     """
-    try:
-        open(filename)
-    except Exception:
-        return False
-    with open(filename, 'rb') as fid:
-        try:
-            CHUNKSIZE = 1024
-            chunk = fid.read(CHUNKSIZE)
-            # check for a UTF BOM
-            for bom in [BOM_UTF8, BOM_UTF16, BOM_UTF32]:
-                if chunk.startswith(bom):
-                    return True
-
-            decoder = getincrementaldecoder('utf-8')()
-            while 1:
-                is_final = len(chunk) < CHUNKSIZE
-                chunk = decoder.decode(chunk, final=is_final)
-                if '\0' in chunk: # found null byte
-                    return False
-                if is_final:
-                    break # done
-                chunk = fid.read(CHUNKSIZE)
-        except UnicodeDecodeError:
-            return False
-        except Exception:
-            pass
-    return True
+    return not is_binary(filename)
diff --git a/spyder/utils/external/__init__.py b/spyder/utils/external/__init__.py
@@ -11,16 +11,3 @@
 External libraries needed for Spyder to work.
 Put here only untouched libraries, else put them in utils.
 """
-
-import os
-
-# Hack to be able to use our own versions of rope and pyflakes,
-# included in our Windows installers
-if os.name == 'nt':
-    import os.path as osp
-    import sys
-    from spyder.config.base import get_module_source_path
-
-    dirname = get_module_source_path(__name__)
-    if osp.isdir(osp.join(dirname, 'rope')):
-        sys.path.insert(0, dirname)
diff --git a/spyder/utils/external/binaryornot/__init__.py b/spyder/utils/external/binaryornot/__init__.py
@@ -0,0 +1,3 @@
+__author__ = 'Audrey Roy'
+__email__ = '[email protected]'
+__version__ = '0.4.0'
diff --git a/spyder/utils/external/binaryornot/check.py b/spyder/utils/external/binaryornot/check.py
@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+
+"""
+binaryornot.check
+-----------------
+
+Main code for checking if a file is binary or text.
+"""
+
+import logging
+
+from spyder.utils.external.binaryornot.helpers import get_starting_chunk, is_binary_string
+
+
+logger = logging.getLogger(__name__)
+
+
+def is_binary(filename):
+    """
+    :param filename: File to check.
+    :returns: True if it's a binary file, otherwise False.
+    """
+    logger.debug('is_binary: %(filename)r', locals())
+
+    # Check if the file extension is in a list of known binary types
+    binary_extensions = ['pyc']
+    for ext in binary_extensions:
+        if filename.endswith(ext):
+            return True
+
+    # Check if the starting chunk is a binary string
+    chunk = get_starting_chunk(filename)
+    return is_binary_string(chunk)
diff --git a/spyder/utils/external/binaryornot/helpers.py b/spyder/utils/external/binaryornot/helpers.py
@@ -0,0 +1,129 @@
+# -*- coding: utf-8 -*-
+
+
+"""
+binaryornot.helpers
+-------------------
+
+Helper utilities used by BinaryOrNot.
+"""
+
+import chardet
+import logging
+
+
+logger = logging.getLogger(__name__)
+
+
+def print_as_hex(s):
+    """
+    Print a string as hex bytes.
+    """
+    print(":".join("{0:x}".format(ord(c)) for c in s))
+
+
+def get_starting_chunk(filename, length=1024):
+    """
+    :param filename: File to open and get the first little chunk of.
+    :param length: Number of bytes to read, default 1024.
+    :returns: Starting chunk of bytes.
+    """
+    # Ensure we open the file in binary mode
+    with open(filename, 'rb') as f:
+        chunk = f.read(length)
+        return chunk
+
+
+_control_chars = b'\n\r\t\f\b'
+if bytes is str:
+    # Python 2 means we need to invoke chr() explicitly
+    _printable_ascii = _control_chars + b''.join(map(chr, range(32, 127)))
+    _printable_high_ascii = b''.join(map(chr, range(127, 256)))
+else:
+    # Python 3 means bytes accepts integer input directly
+    _printable_ascii = _control_chars + bytes(range(32, 127))
+    _printable_high_ascii = bytes(range(127, 256))
+
+
+def is_binary_string(bytes_to_check):
+    """
+    Uses a simplified version of the Perl detection algorithm,
+    based roughly on Eli Bendersky's translation to Python:
+    http://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/
+
+    This is biased slightly more in favour of deeming files as text
+    files than the Perl algorithm, since all ASCII compatible character
+    sets are accepted as text, not just utf-8.
+
+    :param bytes_to_check: A chunk of bytes to check.
+    :returns: True if appears to be a binary, otherwise False.
+    """
+
+    # Empty files are considered text files
+    if not bytes_to_check:
+        return False
+
+    # Now check for a high percentage of ASCII control characters
+    # Binary if control chars are > 30% of the string
+    low_chars = bytes_to_check.translate(None, _printable_ascii)
+    nontext_ratio1 = float(len(low_chars)) / float(len(bytes_to_check))
+    logger.debug('nontext_ratio1: %(nontext_ratio1)r', locals())
+
+    # and check for a low percentage of high ASCII characters:
+    # Binary if high ASCII chars are < 5% of the string
+    # From: https://en.wikipedia.org/wiki/UTF-8
+    # If the bytes are random, the chances of a byte with the high bit set
+    # starting a valid UTF-8 character is only 6.64%. The chances of finding 7
+    # of these without finding an invalid sequence is actually lower than the
+    # chance of the first three bytes randomly being the UTF-8 BOM.
+
+    high_chars = bytes_to_check.translate(None, _printable_high_ascii)
+    nontext_ratio2 = float(len(high_chars)) / float(len(bytes_to_check))
+    logger.debug('nontext_ratio2: %(nontext_ratio2)r', locals())
+
+    is_likely_binary = (
+        (nontext_ratio1 > 0.3 and nontext_ratio2 < 0.05) or
+        (nontext_ratio1 > 0.8 and nontext_ratio2 > 0.8)
+    )
+    logger.debug('is_likely_binary: %(is_likely_binary)r', locals())
+
+    # then check for binary for possible encoding detection with chardet
+    detected_encoding = chardet.detect(bytes_to_check)
+    logger.debug('detected_encoding: %(detected_encoding)r', locals())
+
+    # finally use all the check to decide binary or text
+    decodable_as_unicode = False
+    if (detected_encoding['confidence'] > 0.9 and
+            detected_encoding['encoding'] != 'ascii'):
+        try:
+            try:
+                bytes_to_check.decode(encoding=detected_encoding['encoding'])
+            except TypeError:
+                # happens only on Python 2.6
+                unicode(bytes_to_check, encoding=detected_encoding['encoding'])  # noqa
+            decodable_as_unicode = True
+            logger.debug('success: decodable_as_unicode: '
+                         '%(decodable_as_unicode)r', locals())
+        except LookupError:
+            logger.debug('failure: could not look up encoding %(encoding)s',
+                         detected_encoding)
+        except UnicodeDecodeError:
+            logger.debug('failure: decodable_as_unicode: '
+                         '%(decodable_as_unicode)r', locals())
+
+    logger.debug('failure: decodable_as_unicode: '
+                 '%(decodable_as_unicode)r', locals())
+    if is_likely_binary:
+        if decodable_as_unicode:
+            return False
+        else:
+            return True
+    else:
+        if decodable_as_unicode:
+            return False
+        else:
+            if b'\x00' in bytes_to_check or b'\xff' in bytes_to_check:
+                # Check for NULL bytes last
+                logger.debug('has nulls:' + repr(b'\x00' in bytes_to_check))
+                return True
+        return False
diff --git a/spyder/utils/tests/test_encoding.py b/spyder/utils/tests/test_encoding.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright © Spyder Project Contributors
+# Licensed under the terms of the MIT License
+
+"""Tests for encoding.py"""
+
+import pytest
+
+from spyder.utils.encoding import is_text_file
+
+
+def test_is_text_file(tmpdir):
+    p = tmpdir.mkdir("sub").join("random_text.txt")
+    p.write("Some random text")
+    assert is_text_file(str(p)) == True
+
+
+if __name__ == '__main__':
+    pytest.main()
-Original file line number
+Diff line change
@@ Expand Up / @@ -33,6 +33,7 @@ requirements: @@
         - pylint
         - qtawesome
         - qtpy
+        - chardet >=2.0.0
     about:
       home: https://github.com/spyder-ide/spyder
@@ Expand Down @@