diff --git a/.github/workflows/build-and-upload-to-pypi.yml b/.github/workflows/build-and-upload-to-pypi.yml index c60226b..d70651f 100644 --- a/.github/workflows/build-and-upload-to-pypi.yml +++ b/.github/workflows/build-and-upload-to-pypi.yml @@ -18,16 +18,12 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, macos-11] + os: [ubuntu-latest, macos-11, windows-2019] steps: - uses: actions/checkout@v3 - - - name: Checkout submodules - shell: bash - run: | - git submodule sync --recursive - git submodule update --init --force --recursive --depth=1 + with: + submodules: 'src/ext/uchardet' - name: Set up QEMU if: runner.os == 'Linux' @@ -52,6 +48,8 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 + with: + submodules: 'src/ext/uchardet' - name: Build sdist run: pipx run build --sdist diff --git a/pyproject.toml b/pyproject.toml index 222f62c..5dc03a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,6 @@ environment = {INCLUDE_PATH="/usr/local/include/uchardet", LIBRARY_PATH="/usr/lo before-build = [ "git submodule sync --recursive", "git submodule update --init --force --recursive --depth=1", - "test -d {project}/src/ext/uchardet/build || (cd {project}/src/ext/uchardet/ && mkdir build && cd build && cmake .. && make && make install)", ] [tool.cibuildwheel.macos] @@ -25,7 +24,6 @@ environment = {INCLUDE_PATH="/usr/local/include/uchardet", LIBRARY_PATH="/usr/lo before-build = [ "git submodule sync --recursive", "git submodule update --init --force --recursive --depth=1", - "test -d {project}/src/ext/uchardet/build || (cd {project}/src/ext/uchardet/ && mkdir build && cd build && cmake -DCMAKE_MACOSX_RPATH=1 -DCMAKE_INSTALL_NAME_DIR=$LIBRARY_PATH -DCMAKE_BUILD_RPATH=$LIBRARY_PATH .. && make && make install)", ] [tool.cibuildwheel.windows] diff --git a/setup.py b/setup.py index 508e9f4..fb192c9 100644 --- a/setup.py +++ b/setup.py @@ -2,11 +2,8 @@ # coding: utf-8 import os -import sys -import glob import codecs import re -import pkgconfig from distutils.command.build_ext import build_ext from distutils import sysconfig @@ -17,19 +14,31 @@ from Cython.Build import cythonize -cchardet_dir = os.path.join("src", "cchardet") + os.path.sep -try: - ext_args = pkgconfig.parse('uchardet') -except pkgconfig.PackageNotFoundError: - include_path = os.environ.get('INCLUDE_PATH') - library_path = os.environ.get('LIBRARY_PATH') +join = os.path.join + +cchardet_dir = join("src", "cchardet") + os.path.sep +uchardet_dir = join("src", "ext", "uchardet", "src") +uchardet_lang_models_dir = join(uchardet_dir, "LangModels") + +cchardet_sources = [join("src", "cchardet", "_cchardet.pyx")] +uchardet_sources = [ + join(uchardet_dir, file) + for file in os.listdir(uchardet_dir) + if file.endswith(".cpp") +] +uchardet_lang_source = [ + join(uchardet_lang_models_dir, file) + for file in os.listdir(uchardet_lang_models_dir) + if file.endswith(".cpp") +] +sources = cchardet_sources + uchardet_sources + uchardet_lang_source + +ext_args = { + "include_dirs": uchardet_dir.split(os.pathsep), + "library_dirs": uchardet_dir.split(os.pathsep), +} - ext_args = { - 'include_dirs': include_path.split(os.pathsep) if include_path else [], - 'library_dirs': library_path.split(os.pathsep) if library_path else [], - 'libraries': ['uchardet'], - } # Remove the "-Wstrict-prototypes" compiler option, which isn't valid for C++. cfg_vars = sysconfig.get_config_vars() @@ -40,61 +49,61 @@ # cfg_vars[key] = value.replace("-O2", "-O3") -cchardet_module = Extension( - 'cchardet._cchardet', - [ - os.path.join('src', 'cchardet', '_cchardet.pyx') - ], - language='c++', - **ext_args -) +cchardet_module = Extension("cchardet._cchardet", sources, language="c++", **ext_args) def read(f): return open(os.path.join(os.path.dirname(__file__), f)).read().strip() -with codecs.open(os.path.join(os.path.abspath(os.path.dirname(__file__)), 'src', 'cchardet', 'version.py'), 'r', 'latin1') as fp: +with codecs.open( + os.path.join( + os.path.abspath(os.path.dirname(__file__)), "src", "cchardet", "version.py" + ), + "r", + "latin1", +) as fp: try: - version = re.findall( - r"^__version__ = '([^']+)'\r?$", fp.read(), re.M)[0] + version = re.findall(r"^__version__ = '([^']+)'\r?$", fp.read(), re.M)[0] except IndexError: - raise RuntimeError('Unable to determine version.') + raise RuntimeError("Unable to determine version.") setup( - name='faust-cchardet', - author='PyYoshi', - author_email='myoshi321go@gmail.com', - url=r'https://github.com/faust-streaming/cChardet', - description='cChardet is high speed universal character encoding detector.', - long_description='\n\n'.join((read('README.rst'), read('CHANGES.rst'))), + name="faust-cchardet", + author="PyYoshi", + author_email="myoshi321go@gmail.com", + url=r"https://github.com/faust-streaming/cChardet", + description="cChardet is high speed universal character encoding detector.", + long_description="\n\n".join((read("README.rst"), read("CHANGES.rst"))), version=version, - license='Mozilla Public License', + license="Mozilla Public License", classifiers=[ - 'License :: OSI Approved :: Mozilla Public License 1.1 (MPL 1.1)', - 'License :: OSI Approved :: GNU General Public License (GPL)', - 'License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)', - 'Programming Language :: Cython', - 'Programming Language :: Python', - 'Topic :: Software Development :: Libraries', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', + "License :: OSI Approved :: Mozilla Public License 1.1 (MPL 1.1)", + "License :: OSI Approved :: GNU General Public License (GPL)", + "License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)", + "Programming Language :: Cython", + "Programming Language :: Python", + "Topic :: Software Development :: Libraries", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", ], - keywords=[ - 'cython', - 'chardet', - 'charsetdetect' + keywords=["cython", "chardet", "charsetdetect"], + cmdclass={"build_ext": build_ext}, + package_dir={"": "src"}, + packages=[ + "cchardet", ], - cmdclass={'build_ext': build_ext}, - package_dir={'': 'src'}, - packages=['cchardet', ], - scripts=['bin/cchardetect'], - ext_modules=cythonize([ - cchardet_module, - ]), + scripts=["bin/cchardetect"], + ext_modules=cythonize( + [ + cchardet_module, + ], + cplus=True, + compiler_directives={"language_level": "3"}, # Python 3 + ), ) diff --git a/src/tests/cchardet_test.py b/src/tests/cchardet_test.py index 99b1223..7a5886e 100644 --- a/src/tests/cchardet_test.py +++ b/src/tests/cchardet_test.py @@ -4,46 +4,66 @@ import cchardet import pytest +import sys SKIP_LIST = [ - os.path.join('src','tests','testdata','ja','utf-16le.txt'), - os.path.join('src','tests','testdata','ja','utf-16be.txt'), - os.path.join('src','tests','testdata','es','iso-8859-15.txt'), - os.path.join('src','tests','testdata','da','iso-8859-1.txt'), - os.path.join('src','tests','testdata','he','iso-8859-8.txt'), + os.path.join("src", "tests", "testdata", "ja", "utf-16le.txt"), + os.path.join("src", "tests", "testdata", "ja", "utf-16be.txt"), + os.path.join("src", "tests", "testdata", "es", "iso-8859-15.txt"), + os.path.join("src", "tests", "testdata", "da", "iso-8859-1.txt"), + os.path.join("src", "tests", "testdata", "he", "iso-8859-8.txt"), ] +if sys.maxsize <= 2**32: + # Fails on i686 only, original cchardet test fails too + SKIP_LIST.append(os.path.join("src", "tests", "testdata", "th", "tis-620.txt")) + SKIP_LIST.append(os.path.join("src", "tests", "testdata", "fi", "iso-8859-1.txt")) + SKIP_LIST.append(os.path.join("src", "tests", "testdata", "ga", "iso-8859-1.txt")) + # Python can't decode encoding SKIP_LIST_02 = [ - os.path.join('src','tests','testdata','vi','viscii.txt'), - os.path.join('src','tests','testdata','zh','euc-tw.txt'), + os.path.join("src", "tests", "testdata", "vi", "viscii.txt"), + os.path.join("src", "tests", "testdata", "zh", "euc-tw.txt"), ] + SKIP_LIST_02.extend(SKIP_LIST) def test_ascii(): - detected_encoding = cchardet.detect(b'abcdefghijklmnopqrstuvwxyz') - assert 'ascii' == detected_encoding['encoding'].lower() + detected_encoding = cchardet.detect(b"abcdefghijklmnopqrstuvwxyz") + assert "ascii" == detected_encoding["encoding"].lower() -def test_detect(): - testfiles = glob.glob(os.path.join('src','tests','testdata','*','*.txt')) - for testfile in testfiles: - if testfile.replace("\\", "/") in SKIP_LIST: - continue +@pytest.mark.parametrize( + "testfile", glob.glob(os.path.join("src", "tests", "testdata", "*", "*.txt")) +) +def test_detect(testfile): + if testfile.replace("\\", "/") in SKIP_LIST: + return - base = os.path.basename(testfile) - expected_charset = os.path.splitext(base)[0] - with open(testfile, 'rb') as f: - msg = f.read() - detected_encoding = cchardet.detect(msg) - assert expected_charset.lower() == detected_encoding['encoding'].lower() + base = os.path.basename(testfile) + expected_charset = os.path.splitext(base)[0] + with open(testfile, "rb") as f: + msg = f.read() + detected_encoding = cchardet.detect(msg) + assert expected_charset.lower() == detected_encoding["encoding"].lower() -@pytest.mark.skipif(platform.system() == 'Windows', reason="FIXME: Cannot find test file on Windows for some reason") +@pytest.mark.skipif( + platform.system() == "Windows", + reason="FIXME: Cannot find test file on Windows for some reason", +) def test_detector(): detector = cchardet.UniversalDetector() - with open(os.path.join('src','tests','samples','wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt'), 'rb') as f: + with open( + os.path.join( + "src", + "tests", + "samples", + "wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt", + ), + "rb", + ) as f: line = f.readline() while line: detector.feed(line) @@ -52,14 +72,14 @@ def test_detector(): line = f.readline() detector.close() detected_encoding = detector.result - assert "shift_jis" == detected_encoding['encoding'].lower() + assert "shift_jis" == detected_encoding["encoding"].lower() def test_github_issue_20(): """ https://github.com/PyYoshi/cChardet/issues/20 """ - msg = b'\x8f' + msg = b"\x8f" cchardet.detect(msg) @@ -69,14 +89,14 @@ def test_github_issue_20(): def test_decode(): - testfiles = glob.glob(os.path.join('src','tests','testdata','*','*.txt')) + testfiles = glob.glob(os.path.join("src", "tests", "testdata", "*", "*.txt")) for testfile in testfiles: if testfile.replace("\\", "/") in SKIP_LIST_02: continue base = os.path.basename(testfile) expected_charset = os.path.splitext(base)[0] - with open(testfile, 'rb') as f: + with open(testfile, "rb") as f: msg = f.read() detected_encoding = cchardet.detect(msg) try: