Skip to content

Commit

Permalink
Merge from 3.x: PR #3640
Browse files Browse the repository at this point in the history
Fixes #3631
Fixes #3670
  • Loading branch information
ccordoba12 committed Nov 23, 2016
2 parents 08dc2dd + bb8e516 commit 378aba5
Show file tree
Hide file tree
Showing 11 changed files with 194 additions and 46 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ a Python version greater than 2.7 (Python 3.2 is not supported anymore).
* **PyZMQ**: Run introspection services asynchronously.
* **QtPy** 1.1.0+: Abstraction layer for Python Qt bindings so that Spyder can run on PyQt4
and PyQt5.
* **Chardet**: Character encoding auto-detection in Python.

### Optional dependencies

Expand Down
1 change: 1 addition & 0 deletions conda.recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ requirements:
- qtawesome
- qtpy >=1.1.0
- pyzmq
- chardet >=2.0.0

about:
home: https://github.com/spyder-ide/spyder
Expand Down
1 change: 1 addition & 0 deletions continuous_integration/conda-recipes/spyder/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ requirements:
- pylint
- qtawesome
- qtpy
- chardet >=2.0.0

about:
home: https://github.com/spyder-ide/spyder
Expand Down
2 changes: 2 additions & 0 deletions doc/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,8 @@ The requirements to run Spyder are:
* `QtPy <https://github.com/spyder-ide/qtpy>`_ >=1.1.0 -- To run Spyder with PyQt4 or
PyQt5 seamlessly.

* `Chardet <https://github.com/chardet/chardet>`_ >=2.0.0 -- Character encoding auto-detection
in Python.

Optional modules
~~~~~~~~~~~~~~~~
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,8 @@ def run(self):
'qtawesome',
'qtpy>=1.1.0',
'pickleshare',
'pyzmq'
'pyzmq',
'chardet>=2.0.0',
]

if 'setuptools' in sys.modules:
Expand Down
34 changes: 2 additions & 32 deletions spyder/utils/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# Local imports
from spyder.py3compat import (is_string, to_text_string, is_binary_string,
is_unicode)
from spyder.utils.external.binaryornot.check import is_binary


PREFERRED_ENCODING = locale.getpreferredencoding()
Expand Down Expand Up @@ -228,36 +229,5 @@ def readlines(filename, encoding='utf-8'):
def is_text_file(filename):
    """
    Test if the given path is a text-like file.

    The decision is delegated to ``is_binary``: a file is considered text
    when it is not detected as binary.

    :param filename: Path of the file to check.
    :returns: True if the file looks like text; False if it looks binary
        or if it cannot be opened or read.
    """
    try:
        return not is_binary(filename)
    except (IOError, OSError):
        # The earlier hand-rolled check answered False for paths that
        # could not be opened instead of raising; keep that contract for
        # existing callers.
        return False
13 changes: 0 additions & 13 deletions spyder/utils/external/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,3 @@
External libraries needed for Spyder to work.
Put here only untouched libraries, else put them in utils.
"""

import os

# Hack to be able to use our own versions of rope and pyflakes,
# included in our Windows installers
if os.name == 'nt':
import os.path as osp
import sys
from spyder.config.base import get_module_source_path

dirname = get_module_source_path(__name__)
if osp.isdir(osp.join(dirname, 'rope')):
sys.path.insert(0, dirname)
3 changes: 3 additions & 0 deletions spyder/utils/external/binaryornot/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
__author__ = 'Audrey Roy'
__email__ = '[email protected]'
__version__ = '0.4.0'
33 changes: 33 additions & 0 deletions spyder/utils/external/binaryornot/check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-

"""
binaryornot.check
-----------------
Main code for checking if a file is binary or text.
"""

import logging

from spyder.utils.external.binaryornot.helpers import get_starting_chunk, is_binary_string


logger = logging.getLogger(__name__)


def is_binary(filename):
    """
    Decide whether a file is binary, first by extension and then by content.

    :param filename: File to check.
    :returns: True if it's a binary file, otherwise False.
    """
    logger.debug('is_binary: %r', filename)

    # Known binary types are matched on the whole extension, dot included,
    # so a name that merely ends with the same letters (e.g. ``mypyc``)
    # is still inspected by content instead of being flagged outright.
    binary_extensions = ('.pyc',)
    if filename.endswith(binary_extensions):
        return True

    # Otherwise classify the file from its starting chunk of bytes
    chunk = get_starting_chunk(filename)
    return is_binary_string(chunk)
129 changes: 129 additions & 0 deletions spyder/utils/external/binaryornot/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# -*- coding: utf-8 -*-


"""
binaryornot.helpers
-------------------
Helper utilities used by BinaryOrNot.
"""

import chardet
import logging


logger = logging.getLogger(__name__)


def print_as_hex(s):
    """
    Print the code point of every character in ``s`` as lowercase hex,
    with the values separated by colons.
    """
    hex_values = [format(ord(char), 'x') for char in s]
    print(":".join(hex_values))


def get_starting_chunk(filename, length=1024):
    """
    Read the beginning of a file as raw bytes.

    :param filename: File to open and get the first little chunk of.
    :param length: Number of bytes to read, default 1024.
    :returns: Starting chunk of bytes.
    """
    # Binary mode, so the bytes come back undecoded
    with open(filename, 'rb') as stream:
        return stream.read(length)


# Whitespace/control bytes that are treated like printable text
_control_chars = b'\n\r\t\f\b'
if bytes is not str:
    # Python 3: bytes() accepts an iterable of integers directly
    _printable_ascii = _control_chars + bytes(range(32, 127))
    _printable_high_ascii = bytes(range(127, 256))
else:
    # Python 2: bytes is str, so every code point has to go through chr()
    _printable_ascii = _control_chars + b''.join(
        chr(code) for code in range(32, 127))
    _printable_high_ascii = b''.join(
        chr(code) for code in range(127, 256))


def is_binary_string(bytes_to_check):
    """
    Guess whether a chunk of bytes comes from a binary file.

    Uses a simplified version of the Perl detection algorithm,
    based roughly on Eli Bendersky's translation to Python:
    http://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/
    This is biased slightly more in favour of deeming files as text
    files than the Perl algorithm, since all ASCII compatible character
    sets are accepted as text, not just utf-8.

    :param bytes_to_check: A chunk of bytes to check.
    :returns: True if appears to be a binary, otherwise False.
    """
    # Empty files are considered text files
    if not bytes_to_check:
        return False

    total = float(len(bytes_to_check))

    # Share of bytes that remain once printable ASCII (32-126) and the
    # accepted control characters have been deleted from the chunk
    low_chars = bytes_to_check.translate(None, _printable_ascii)
    nontext_ratio1 = len(low_chars) / total
    logger.debug('nontext_ratio1: %r', nontext_ratio1)

    # Share of bytes that remain once the bytes 127-255 have been deleted.
    # From: https://en.wikipedia.org/wiki/UTF-8
    # If the bytes are random, the chances of a byte with the high bit set
    # starting a valid UTF-8 character is only 6.64%. The chances of finding 7
    # of these without finding an invalid sequence is actually lower than the
    # chance of the first three bytes randomly being the UTF-8 BOM.
    # NOTE(review): the original comment describes this as the percentage
    # of high ASCII characters, but the value computed is the share of the
    # bytes that are NOT in 127-255 -- confirm the thresholds below are
    # intended before changing them.
    high_chars = bytes_to_check.translate(None, _printable_high_ascii)
    nontext_ratio2 = len(high_chars) / total
    logger.debug('nontext_ratio2: %r', nontext_ratio2)

    is_likely_binary = (
        (nontext_ratio1 > 0.3 and nontext_ratio2 < 0.05) or
        (nontext_ratio1 > 0.8 and nontext_ratio2 > 0.8)
    )
    logger.debug('is_likely_binary: %r', is_likely_binary)

    # Ask chardet for a possible encoding of the chunk
    detected_encoding = chardet.detect(bytes_to_check)
    logger.debug('detected_encoding: %r', detected_encoding)

    # The chunk counts as decodable only when chardet is confident about a
    # non-ASCII encoding and the bytes really decode with it
    decodable_as_unicode = False
    if (detected_encoding['confidence'] > 0.9 and
            detected_encoding['encoding'] != 'ascii'):
        try:
            # Passing the encoding positionally works on every supported
            # Python version, so no keyword-argument fallback is needed
            bytes_to_check.decode(detected_encoding['encoding'])
        except LookupError:
            logger.debug('failure: could not look up encoding %(encoding)s',
                         detected_encoding)
        except UnicodeDecodeError:
            logger.debug('failure: decodable_as_unicode: %r',
                         decodable_as_unicode)
        else:
            decodable_as_unicode = True
            logger.debug('success: decodable_as_unicode: %r',
                         decodable_as_unicode)

    # Report the outcome neutrally: this line is reached on success too
    logger.debug('decodable_as_unicode: %r', decodable_as_unicode)

    # A confident, successful decode overrides the byte statistics
    if decodable_as_unicode:
        return False
    if is_likely_binary:
        return True
    # Last resort: NULL or 0xff bytes mark the chunk as binary
    if b'\x00' in bytes_to_check or b'\xff' in bytes_to_check:
        logger.debug('has nulls:%r', b'\x00' in bytes_to_check)
        return True
    return False
20 changes: 20 additions & 0 deletions spyder/utils/tests/test_encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
#
# Copyright © Spyder Project Contributors
# Licensed under the terms of the MIT License

"""Tests for encoding.py"""

import pytest

from spyder.utils.encoding import is_text_file


def test_is_text_file(tmpdir):
    """A freshly written plain-text file must be reported as text."""
    p = tmpdir.mkdir("sub").join("random_text.txt")
    p.write("Some random text")
    assert is_text_file(str(p))


if __name__ == '__main__':
    # Let the module be run directly as a script to execute its tests
    pytest.main()

0 comments on commit 378aba5

Please sign in to comment.