-
-
Notifications
You must be signed in to change notification settings - Fork 1.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing 11 changed files with 194 additions and 46 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,7 @@ | |
# Local imports | ||
from spyder.py3compat import (is_string, to_text_string, is_binary_string, | ||
is_unicode) | ||
from spyder.utils.external.binaryornot.check import is_binary | ||
|
||
|
||
PREFERRED_ENCODING = locale.getpreferredencoding() | ||
|
@@ -228,36 +229,5 @@ def readlines(filename, encoding='utf-8'): | |
def is_text_file(filename):
    """
    Test if the given path is a text-like file.

    The decision is delegated to ``is_binary`` from the bundled
    ``binaryornot`` package; a file is text when it is not binary.

    :param filename: Path of the file to inspect.
    :returns: True if the file looks like text, False if it looks binary
        or cannot be opened.
    """
    try:
        return not is_binary(filename)
    except (IOError, OSError):
        # The earlier implementation reported an unopenable path as
        # "not text"; keep that contract rather than propagating the error.
        return False
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Package metadata: author, contact address and version string.
# NOTE(review): the e-mail value looks like an obfuscation placeholder
# rather than a real address -- confirm against the upstream package.
__author__ = 'Audrey Roy'
__email__ = '[email protected]'
__version__ = '0.4.0'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
""" | ||
binaryornot.check | ||
----------------- | ||
Main code for checking if a file is binary or text. | ||
""" | ||
|
||
import logging | ||
|
||
from spyder.utils.external.binaryornot.helpers import get_starting_chunk, is_binary_string | ||
|
||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def is_binary(filename):
    """
    Decide whether a file is binary.

    :param filename: File to check.
    :returns: True if it's a binary file, otherwise False.
    """
    logger.debug('is_binary: %r', filename)

    # Known binary types are recognised by extension alone. The leading dot
    # matters: without it any name that merely ends in the letters "pyc"
    # would be declared binary without looking at its content.
    binary_extensions = ('.pyc',)
    if filename.endswith(binary_extensions):
        return True

    # Otherwise classify the file by its starting chunk of bytes
    chunk = get_starting_chunk(filename)
    return is_binary_string(chunk)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
|
||
""" | ||
binaryornot.helpers | ||
------------------- | ||
Helper utilities used by BinaryOrNot. | ||
""" | ||
|
||
import chardet | ||
import logging | ||
|
||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def print_as_hex(s):
    """
    Print a string as hex bytes.

    Each element is printed as lowercase hexadecimal (no zero padding),
    with elements separated by colons.

    :param s: Text string or byte string to print.
    """
    # Iterating a byte string yields ints on Python 3 but 1-character
    # strings on Python 2; only call ord() when the element is not
    # already an integer code.
    codes = (c if isinstance(c, int) else ord(c) for c in s)
    print(":".join("{0:x}".format(code) for code in codes))
|
||
|
||
def get_starting_chunk(filename, length=1024):
    """
    Read the leading bytes of a file.

    :param filename: Path of the file whose beginning is read.
    :param length: Maximum number of bytes to read, default 1024.
    :returns: The bytes read from the start of the file.
    """
    # Binary mode so the raw bytes are returned undecoded
    with open(filename, 'rb') as stream:
        return stream.read(length)
|
||
|
||
# Byte sets passed as the "delete" argument of bytes.translate().
_control_chars = b'\n\r\t\f\b'
# bytearray() accepts an iterable of integers on both Python 2 and
# Python 3, so a single construction covers both interpreters.
_printable_ascii = _control_chars + bytes(bytearray(range(32, 127)))
_printable_high_ascii = bytes(bytearray(range(127, 256)))
|
||
|
||
def is_binary_string(bytes_to_check):
    """
    Uses a simplified version of the Perl detection algorithm,
    based roughly on Eli Bendersky's translation to Python:
    http://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/

    This is biased slightly more in favour of deeming files as text
    files than the Perl algorithm, since all ASCII compatible character
    sets are accepted as text, not just utf-8.

    :param bytes_to_check: A chunk of bytes to check.
    :returns: True if appears to be a binary, otherwise False.
    """

    # Empty files are considered text files
    if not bytes_to_check:
        return False

    total = float(len(bytes_to_check))

    # Now check for a high percentage of ASCII control characters
    # Binary if control chars are > 30% of the string
    # (low_chars is what remains once printable ASCII is deleted)
    low_chars = bytes_to_check.translate(None, _printable_ascii)
    nontext_ratio1 = float(len(low_chars)) / total
    logger.debug('nontext_ratio1: %r', nontext_ratio1)

    # and check for a low percentage of high ASCII characters:
    # Binary if high ASCII chars are < 5% of the string
    # From: https://en.wikipedia.org/wiki/UTF-8
    # If the bytes are random, the chances of a byte with the high bit set
    # starting a valid UTF-8 character is only 6.64%. The chances of finding 7
    # of these without finding an invalid sequence is actually lower than the
    # chance of the first three bytes randomly being the UTF-8 BOM.
    # NOTE: translate() *deletes* bytes 127-255 here, so high_chars holds
    # the bytes that remain after that deletion.
    high_chars = bytes_to_check.translate(None, _printable_high_ascii)
    nontext_ratio2 = float(len(high_chars)) / total
    logger.debug('nontext_ratio2: %r', nontext_ratio2)

    is_likely_binary = (
        (nontext_ratio1 > 0.3 and nontext_ratio2 < 0.05) or
        (nontext_ratio1 > 0.8 and nontext_ratio2 > 0.8)
    )
    logger.debug('is_likely_binary: %r', is_likely_binary)

    # then check for binary for possible encoding detection with chardet
    detected_encoding = chardet.detect(bytes_to_check)
    logger.debug('detected_encoding: %r', detected_encoding)

    # finally use all the checks to decide binary or text
    decodable_as_unicode = False
    encoding = detected_encoding['encoding']
    if detected_encoding['confidence'] > 0.9 and encoding != 'ascii':
        try:
            try:
                bytes_to_check.decode(encoding=encoding)
            except TypeError:
                # happens only on Python 2.6
                unicode(bytes_to_check, encoding=encoding)  # noqa
            decodable_as_unicode = True
            logger.debug('success: decodable_as_unicode: %r',
                         decodable_as_unicode)
        except LookupError:
            logger.debug('failure: could not look up encoding %s', encoding)
        except UnicodeDecodeError:
            logger.debug('failure: decodable_as_unicode: %r',
                         decodable_as_unicode)

    # Neutral summary: this line is reached on success as well as failure,
    # so it must not be labelled as a failure.
    logger.debug('decodable_as_unicode: %r', decodable_as_unicode)

    # A confident, successful decode always wins and means "text"
    if decodable_as_unicode:
        return False
    if is_likely_binary:
        return True
    # Check for NULL (and 0xff) bytes last
    if b'\x00' in bytes_to_check or b'\xff' in bytes_to_check:
        logger.debug('has nulls:%r', b'\x00' in bytes_to_check)
        return True
    return False
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright © Spyder Project Contributors | ||
# Licensed under the terms of the MIT License | ||
|
||
"""Tests for encodings.py""" | ||
|
||
import pytest | ||
|
||
from spyder.utils.encoding import is_text_file | ||
|
||
|
||
def test_is_text_file(tmpdir):
    """A file containing only plain text is reported as a text file."""
    p = tmpdir.mkdir("sub").join("random_text.txt")
    p.write("Some random text")
    # Assert truthiness directly instead of comparing against True
    assert is_text_file(str(p))
|
||
|
||
# Allow running this test module directly as a script.
if __name__ == '__main__':
    pytest.main()