Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 5 additions & 7 deletions .github/workflows/build-and-upload-to-pypi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,12 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-11]
os: [ubuntu-latest, macos-11, windows-2019]

steps:
- uses: actions/checkout@v3

- name: Checkout submodules
shell: bash
run: |
git submodule sync --recursive
git submodule update --init --force --recursive --depth=1
with:
submodules: 'src/ext/uchardet'

- name: Set up QEMU
if: runner.os == 'Linux'
Expand All @@ -52,6 +48,8 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
submodules: 'src/ext/uchardet'

- name: Build sdist
run: pipx run build --sdist
Expand Down
2 changes: 0 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,13 @@ environment = {INCLUDE_PATH="/usr/local/include/uchardet", LIBRARY_PATH="/usr/lo
before-build = [
"git submodule sync --recursive",
"git submodule update --init --force --recursive --depth=1",
"test -d {project}/src/ext/uchardet/build || (cd {project}/src/ext/uchardet/ && mkdir build && cd build && cmake .. && make && make install)",
]

[tool.cibuildwheel.macos]
environment = {INCLUDE_PATH="/usr/local/include/uchardet", LIBRARY_PATH="/usr/local/lib/"}
before-build = [
"git submodule sync --recursive",
"git submodule update --init --force --recursive --depth=1",
"test -d {project}/src/ext/uchardet/build || (cd {project}/src/ext/uchardet/ && mkdir build && cd build && cmake -DCMAKE_MACOSX_RPATH=1 -DCMAKE_INSTALL_NAME_DIR=$LIBRARY_PATH -DCMAKE_BUILD_RPATH=$LIBRARY_PATH .. && make && make install)",
]

[tool.cibuildwheel.windows]
Expand Down
123 changes: 66 additions & 57 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,8 @@
# coding: utf-8

import os
import sys
import glob
import codecs
import re
import pkgconfig
from distutils.command.build_ext import build_ext
from distutils import sysconfig

Expand All @@ -17,19 +14,31 @@

from Cython.Build import cythonize

cchardet_dir = os.path.join("src", "cchardet") + os.path.sep

try:
ext_args = pkgconfig.parse('uchardet')
except pkgconfig.PackageNotFoundError:
include_path = os.environ.get('INCLUDE_PATH')
library_path = os.environ.get('LIBRARY_PATH')
join = os.path.join

cchardet_dir = join("src", "cchardet") + os.path.sep
uchardet_dir = join("src", "ext", "uchardet", "src")
uchardet_lang_models_dir = join(uchardet_dir, "LangModels")


def _cpp_sources(directory):
    """Return the ``.cpp`` files directly inside *directory*, sorted by name.

    ``os.listdir`` yields entries in arbitrary order, so the names are sorted
    to keep the list of sources handed to the compiler identical from one
    build to the next.
    """
    return [
        join(directory, name)
        for name in sorted(os.listdir(directory))
        if name.endswith(".cpp")
    ]


# The Cython module plus every uchardet C++ source (core and language models)
# found in the directories above; all of them are passed to the extension as
# its source list.
cchardet_sources = [join("src", "cchardet", "_cchardet.pyx")]
uchardet_sources = _cpp_sources(uchardet_dir)
uchardet_lang_source = _cpp_sources(uchardet_lang_models_dir)
sources = cchardet_sources + uchardet_sources + uchardet_lang_source

# Extra keyword arguments forwarded to the extension definition; both entries
# point at the uchardet source directory.
ext_args = {
    "include_dirs": [uchardet_dir],
    "library_dirs": [uchardet_dir],
}

ext_args = {
'include_dirs': include_path.split(os.pathsep) if include_path else [],
'library_dirs': library_path.split(os.pathsep) if library_path else [],
'libraries': ['uchardet'],
}

# Remove the "-Wstrict-prototypes" compiler option, which isn't valid for C++.
cfg_vars = sysconfig.get_config_vars()
Expand All @@ -40,61 +49,61 @@
# cfg_vars[key] = value.replace("-O2", "-O3")


cchardet_module = Extension(
'cchardet._cchardet',
[
os.path.join('src', 'cchardet', '_cchardet.pyx')
],
language='c++',
**ext_args
)
cchardet_module = Extension("cchardet._cchardet", sources, language="c++", **ext_args)


def read(f):
    """Return the stripped text of file *f*, resolved next to this script.

    *f* is joined onto the directory containing this file, so relative names
    such as ``README.rst`` are found regardless of the current working
    directory. The file is opened with the platform default text encoding.
    """
    path = os.path.join(os.path.dirname(__file__), f)
    # Use a context manager so the handle is closed deterministically instead
    # of being left for the garbage collector.
    with open(path) as fp:
        return fp.read().strip()


with codecs.open(os.path.join(os.path.abspath(os.path.dirname(__file__)), 'src', 'cchardet', 'version.py'), 'r', 'latin1') as fp:
with codecs.open(
os.path.join(
os.path.abspath(os.path.dirname(__file__)), "src", "cchardet", "version.py"
),
"r",
"latin1",
) as fp:
try:
version = re.findall(
r"^__version__ = '([^']+)'\r?$", fp.read(), re.M)[0]
version = re.findall(r"^__version__ = '([^']+)'\r?$", fp.read(), re.M)[0]
except IndexError:
raise RuntimeError('Unable to determine version.')
raise RuntimeError("Unable to determine version.")

setup(
name='faust-cchardet',
author='PyYoshi',
author_email='myoshi321go@gmail.com',
url=r'https://github.com/faust-streaming/cChardet',
description='cChardet is high speed universal character encoding detector.',
long_description='\n\n'.join((read('README.rst'), read('CHANGES.rst'))),
name="faust-cchardet",
author="PyYoshi",
author_email="myoshi321go@gmail.com",
url=r"https://github.com/faust-streaming/cChardet",
description="cChardet is high speed universal character encoding detector.",
long_description="\n\n".join((read("README.rst"), read("CHANGES.rst"))),
version=version,
license='Mozilla Public License',
license="Mozilla Public License",
classifiers=[
'License :: OSI Approved :: Mozilla Public License 1.1 (MPL 1.1)',
'License :: OSI Approved :: GNU General Public License (GPL)',
'License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)',
'Programming Language :: Cython',
'Programming Language :: Python',
'Topic :: Software Development :: Libraries',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
"License :: OSI Approved :: Mozilla Public License 1.1 (MPL 1.1)",
"License :: OSI Approved :: GNU General Public License (GPL)",
"License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)",
"Programming Language :: Cython",
"Programming Language :: Python",
"Topic :: Software Development :: Libraries",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
],
keywords=[
'cython',
'chardet',
'charsetdetect'
keywords=["cython", "chardet", "charsetdetect"],
cmdclass={"build_ext": build_ext},
package_dir={"": "src"},
packages=[
"cchardet",
],
cmdclass={'build_ext': build_ext},
package_dir={'': 'src'},
packages=['cchardet', ],
scripts=['bin/cchardetect'],
ext_modules=cythonize([
cchardet_module,
]),
scripts=["bin/cchardetect"],
ext_modules=cythonize(
[
cchardet_module,
],
cplus=True,
compiler_directives={"language_level": "3"}, # Python 3
),
)
72 changes: 46 additions & 26 deletions src/tests/cchardet_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,46 +4,66 @@

import cchardet
import pytest
import sys

SKIP_LIST = [
os.path.join('src','tests','testdata','ja','utf-16le.txt'),
os.path.join('src','tests','testdata','ja','utf-16be.txt'),
os.path.join('src','tests','testdata','es','iso-8859-15.txt'),
os.path.join('src','tests','testdata','da','iso-8859-1.txt'),
os.path.join('src','tests','testdata','he','iso-8859-8.txt'),
os.path.join("src", "tests", "testdata", "ja", "utf-16le.txt"),
os.path.join("src", "tests", "testdata", "ja", "utf-16be.txt"),
os.path.join("src", "tests", "testdata", "es", "iso-8859-15.txt"),
os.path.join("src", "tests", "testdata", "da", "iso-8859-1.txt"),
os.path.join("src", "tests", "testdata", "he", "iso-8859-8.txt"),
]

if sys.maxsize <= 2**32:
# Fails on i686 only, original cchardet test fails too
SKIP_LIST.append(os.path.join("src", "tests", "testdata", "th", "tis-620.txt"))
SKIP_LIST.append(os.path.join("src", "tests", "testdata", "fi", "iso-8859-1.txt"))
SKIP_LIST.append(os.path.join("src", "tests", "testdata", "ga", "iso-8859-1.txt"))

# Python has no codec to decode these encodings
SKIP_LIST_02 = [
os.path.join('src','tests','testdata','vi','viscii.txt'),
os.path.join('src','tests','testdata','zh','euc-tw.txt'),
os.path.join("src", "tests", "testdata", "vi", "viscii.txt"),
os.path.join("src", "tests", "testdata", "zh", "euc-tw.txt"),
]

SKIP_LIST_02.extend(SKIP_LIST)


def test_ascii():
detected_encoding = cchardet.detect(b'abcdefghijklmnopqrstuvwxyz')
assert 'ascii' == detected_encoding['encoding'].lower()
detected_encoding = cchardet.detect(b"abcdefghijklmnopqrstuvwxyz")
assert "ascii" == detected_encoding["encoding"].lower()


def test_detect():
testfiles = glob.glob(os.path.join('src','tests','testdata','*','*.txt'))
for testfile in testfiles:
if testfile.replace("\\", "/") in SKIP_LIST:
continue
@pytest.mark.parametrize(
"testfile", glob.glob(os.path.join("src", "tests", "testdata", "*", "*.txt"))
)
def test_detect(testfile):
if testfile.replace("\\", "/") in SKIP_LIST:
return

base = os.path.basename(testfile)
expected_charset = os.path.splitext(base)[0]
with open(testfile, 'rb') as f:
msg = f.read()
detected_encoding = cchardet.detect(msg)
assert expected_charset.lower() == detected_encoding['encoding'].lower()
base = os.path.basename(testfile)
expected_charset = os.path.splitext(base)[0]
with open(testfile, "rb") as f:
msg = f.read()
detected_encoding = cchardet.detect(msg)
assert expected_charset.lower() == detected_encoding["encoding"].lower()


@pytest.mark.skipif(platform.system() == 'Windows', reason="FIXME: Cannot find test file on Windows for some reason")
@pytest.mark.skipif(
platform.system() == "Windows",
reason="FIXME: Cannot find test file on Windows for some reason",
)
def test_detector():
detector = cchardet.UniversalDetector()
with open(os.path.join('src','tests','samples','wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt'), 'rb') as f:
with open(
os.path.join(
"src",
"tests",
"samples",
"wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt",
),
"rb",
) as f:
line = f.readline()
while line:
detector.feed(line)
Expand All @@ -52,14 +72,14 @@ def test_detector():
line = f.readline()
detector.close()
detected_encoding = detector.result
assert "shift_jis" == detected_encoding['encoding'].lower()
assert "shift_jis" == detected_encoding["encoding"].lower()


def test_github_issue_20():
"""
https://github.com/PyYoshi/cChardet/issues/20
"""
msg = b'\x8f'
msg = b"\x8f"

cchardet.detect(msg)

Expand All @@ -69,14 +89,14 @@ def test_github_issue_20():


def test_decode():
testfiles = glob.glob(os.path.join('src','tests','testdata','*','*.txt'))
testfiles = glob.glob(os.path.join("src", "tests", "testdata", "*", "*.txt"))
for testfile in testfiles:
if testfile.replace("\\", "/") in SKIP_LIST_02:
continue

base = os.path.basename(testfile)
expected_charset = os.path.splitext(base)[0]
with open(testfile, 'rb') as f:
with open(testfile, "rb") as f:
msg = f.read()
detected_encoding = cchardet.detect(msg)
try:
Expand Down