diff --git a/ci/travis_lint.sh b/ci/travis_lint.sh index 0cbdf1a5cb5..7cdbad4a316 100755 --- a/ci/travis_lint.sh +++ b/ci/travis_lint.sh @@ -43,9 +43,9 @@ if [ "$ARROW_CI_PYTHON_AFFECTED" != "0" ]; then PYTHON_DIR=$TRAVIS_BUILD_DIR/python - flake8 --count $PYTHON_DIR/pyarrow + flake8 --count $PYTHON_DIR # Check Cython files with some checks turned off flake8 --count --config=$PYTHON_DIR/.flake8.cython \ - $PYTHON_DIR/pyarrow + $PYTHON_DIR fi diff --git a/python/benchmarks/__init__.py b/python/benchmarks/__init__.py index 245692337bc..13a83393a91 100644 --- a/python/benchmarks/__init__.py +++ b/python/benchmarks/__init__.py @@ -14,4 +14,3 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - diff --git a/python/benchmarks/array_ops.py b/python/benchmarks/array_ops.py index 70ee7f1e1fc..696b17109f0 100644 --- a/python/benchmarks/array_ops.py +++ b/python/benchmarks/array_ops.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. -import numpy as np import pyarrow as pa diff --git a/python/benchmarks/common.py b/python/benchmarks/common.py index 70cd92492f7..48526a4054e 100644 --- a/python/benchmarks/common.py +++ b/python/benchmarks/common.py @@ -19,7 +19,6 @@ import decimal from functools import partial import itertools -import os import sys import unicodedata @@ -39,7 +38,7 @@ def _multiplicate_sequence(base, target_size): return [base] * q + [base[:r]] -def get_random_bytes(n, *, seed=42): +def get_random_bytes(n, seed=42): """ Generate a random bytes object of size *n*. Note the result might be compressible. @@ -58,7 +57,7 @@ def get_random_bytes(n, *, seed=42): return result -def get_random_ascii(n, *, seed=42): +def get_random_ascii(n, seed=42): """ Get a random ASCII-only unicode string of size *n*. """ @@ -69,7 +68,7 @@ def get_random_ascii(n, *, seed=42): return result -def _random_unicode_letters(n, *, seed=42): +def _random_unicode_letters(n, seed=42): """ Generate a string of random unicode letters (slow). """ @@ -93,7 +92,7 @@ def _get_more_candidates(): _1024_random_unicode_letters = _random_unicode_letters(1024) -def get_random_unicode(n, *, seed=42): +def get_random_unicode(n, seed=42): """ Get a random non-ASCII unicode string of size *n*. """ @@ -179,7 +178,8 @@ def generate_object_list(self, n, none_prob=DEFAULT_NONE_PROB): self.sprinkle_nones(data, none_prob) return data - def _generate_varying_sequences(self, random_factory, n, min_size, max_size, none_prob): + def _generate_varying_sequences(self, random_factory, n, min_size, + max_size, none_prob): """ Generate a list of *n* sequences of varying size between *min_size* and *max_size*, with *none_prob* probability of an entry being None. @@ -207,7 +207,6 @@ def generate_fixed_binary_list(self, n, size, none_prob=DEFAULT_NONE_PROB): return self._generate_varying_sequences(get_random_bytes, n, size, size, none_prob) - def generate_varying_binary_list(self, n, min_size, max_size, none_prob=DEFAULT_NONE_PROB): """ @@ -217,7 +216,6 @@ def generate_varying_binary_list(self, n, min_size, max_size, return self._generate_varying_sequences(get_random_bytes, n, min_size, max_size, none_prob) - def generate_ascii_string_list(self, n, min_size, max_size, none_prob=DEFAULT_NONE_PROB): """ @@ -227,7 +225,6 @@ def generate_ascii_string_list(self, n, min_size, max_size, return self._generate_varying_sequences(get_random_ascii, n, min_size, max_size, none_prob) - def generate_unicode_string_list(self, n, min_size, max_size, none_prob=DEFAULT_NONE_PROB): """ @@ -237,7 +234,6 @@ def generate_unicode_string_list(self, n, min_size, max_size, return self._generate_varying_sequences(get_random_unicode, n, min_size, max_size, none_prob) - def generate_int_list_list(self, n, min_size, max_size, none_prob=DEFAULT_NONE_PROB): """ @@ -263,7 +259,9 @@ def generate_tuple_list(self, n, none_prob=DEFAULT_NONE_PROB): def generate_dict_list(self, n, none_prob=DEFAULT_NONE_PROB): """ Generate a list of dicts with random values. - Each dict has the form `{'u': int value, 'v': float value, 'w': bool value}` + Each dict has the form + + `{'u': int value, 'v': float value, 'w': bool value}` """ ints = self.generate_int_list(n, none_prob=none_prob) floats = self.generate_float_list(n, none_prob=none_prob) diff --git a/python/benchmarks/microbenchmarks.py b/python/benchmarks/microbenchmarks.py index bae5806e141..f8ba383c70b 100644 --- a/python/benchmarks/microbenchmarks.py +++ b/python/benchmarks/microbenchmarks.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. -import pyarrow as pa import pyarrow.benchmark as pb from . import common @@ -44,4 +43,3 @@ def setup(self, type_name): def time_PandasObjectIsNull(self, *args): pb.benchmark_PandasObjectIsNull(self.lst) - diff --git a/python/benchmarks/plasma.py b/python/benchmarks/plasma.py index 8a607a3c623..7cefcdffad2 100644 --- a/python/benchmarks/plasma.py +++ b/python/benchmarks/plasma.py @@ -18,11 +18,8 @@ import numpy as np import timeit -import pyarrow as pa import pyarrow.plasma as plasma -from . import common - class SimplePlasmaThroughput(object): """Benchmark plasma store throughput with a single client.""" @@ -32,7 +29,8 @@ class SimplePlasmaThroughput(object): timer = timeit.default_timer def setup(self, size): - self.plasma_store_ctx = plasma.start_plasma_store(plasma_store_memory=10**9) + self.plasma_store_ctx = plasma.start_plasma_store( + plasma_store_memory=10**9) plasma_store_name, p = self.plasma_store_ctx.__enter__() self.plasma_client = plasma.connect(plasma_store_name, "", 64) @@ -51,7 +49,8 @@ class SimplePlasmaLatency(object): timer = timeit.default_timer def setup(self): - self.plasma_store_ctx = plasma.start_plasma_store(plasma_store_memory=10**9) + self.plasma_store_ctx = plasma.start_plasma_store( + plasma_store_memory=10**9) plasma_store_name, p = self.plasma_store_ctx.__enter__() self.plasma_client = plasma.connect(plasma_store_name, "", 64) diff --git a/python/benchmarks/streaming.py b/python/benchmarks/streaming.py index be7fda42c67..2012141a9e3 100644 --- a/python/benchmarks/streaming.py +++ b/python/benchmarks/streaming.py @@ -26,12 +26,15 @@ def generate_chunks(total_size, nchunks, ncols, dtype=np.dtype('int64')): rowsize = total_size // nchunks // ncols assert rowsize % dtype.itemsize == 0 + + def make_column(col, chunk): + return np.frombuffer(common.get_random_bytes( + rowsize, seed=col + 997 * chunk)).view(dtype) + return [pd.DataFrame({ - 'c' + str(col): np.frombuffer( - common.get_random_bytes(rowsize, seed=col + 997 * chunk)).view(dtype) - for col in range(ncols) - }) - for chunk in range(nchunks)] + 'c' + str(col): make_column(col, chunk) + for col in range(ncols)}) + for chunk in range(nchunks)] class StreamReader(object): @@ -64,4 +67,4 @@ def setup(self, chunk_size): def time_read_to_dataframe(self, *args): reader = pa.RecordBatchStreamReader(self.source) table = reader.read_all() - df = table.to_pandas() + df = table.to_pandas() # noqa diff --git a/python/doc/source/conf.py b/python/doc/source/conf.py index 1e35ef93590..51e60b1a9a5 100644 --- a/python/doc/source/conf.py +++ b/python/doc/source/conf.py @@ -30,6 +30,7 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # +import glob import os import sys @@ -77,7 +78,6 @@ # source_suffix = ['.rst', '.md'] source_suffix = '.rst' -import glob autosummary_generate = glob.glob("*.rst") # The encoding of source files. @@ -187,8 +187,8 @@ # html_logo = None # The name of an image file (relative to this directory) to use as a favicon of -# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. +# the docs. This file should be a Windows icon file (.ico) being 16x16 or +# 32x32 pixels large. # # html_favicon = None diff --git a/python/examples/plasma/sorting/multimerge.pyx b/python/examples/plasma/sorting/multimerge.pyx index 6dd5aaef95c..5e77fdfcc87 100644 --- a/python/examples/plasma/sorting/multimerge.pyx +++ b/python/examples/plasma/sorting/multimerge.pyx @@ -23,9 +23,9 @@ from libc.stdint cimport uintptr_t from libcpp.vector cimport vector from libcpp.pair cimport pair -cimport numpy as np import numpy as np +cimport numpy as np cdef extern from "" namespace "std" nogil: cdef cppclass priority_queue[T]: @@ -44,7 +44,7 @@ def multimerge2d(*arrays): This assumes C style ordering for both input and output arrays. For each input array we have array[i,0] <= array[i+1,0] and for the output array the same will hold. - + Ideally this code would be simpler and also support both C style and Fortran style ordering. """ diff --git a/python/examples/plasma/sorting/sort_df.py b/python/examples/plasma/sorting/sort_df.py index 0805592b9f9..2e4df589ee3 100644 --- a/python/examples/plasma/sorting/sort_df.py +++ b/python/examples/plasma/sorting/sort_df.py @@ -17,7 +17,6 @@ from multiprocessing import Pool import numpy as np -import os import pandas as pd import pyarrow as pa import pyarrow.plasma as plasma diff --git a/python/scripts/test_leak.py b/python/scripts/test_leak.py index 0b12fb5cd25..e3de56b28a1 100644 --- a/python/scripts/test_leak.py +++ b/python/scripts/test_leak.py @@ -57,4 +57,5 @@ def leak2(): gc.collect() + leak2() diff --git a/python/setup.py b/python/setup.py index 7a8118b1fb9..6b1e8b3d4f1 100644 --- a/python/setup.py +++ b/python/setup.py @@ -1,5 +1,4 @@ #!/usr/bin/env python - # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -102,7 +101,8 @@ def run(self): ('with-static-boost', None, 'link boost statically'), ('with-plasma', None, 'build the Plasma extension'), ('with-orc', None, 'build the ORC extension'), - ('generate-coverage', None, 'enable Cython code coverage'), + ('generate-coverage', None, + 'enable Cython code coverage'), ('bundle-boost', None, 'bundle the (shared) Boost libraries'), ('bundle-arrow-cpp', None, @@ -116,7 +116,8 @@ def initialize_options(self): self.cmake_generator = 'Visual Studio 14 2015 Win64' self.extra_cmake_args = os.environ.get('PYARROW_CMAKE_OPTIONS', '') self.build_type = os.environ.get('PYARROW_BUILD_TYPE', 'debug').lower() - self.boost_namespace = os.environ.get('PYARROW_BOOST_NAMESPACE', 'boost') + self.boost_namespace = os.environ.get('PYARROW_BOOST_NAMESPACE', + 'boost') self.cmake_cxxflags = os.environ.get('PYARROW_CXXFLAGS', '') @@ -252,7 +253,8 @@ def _run_cmake(self): print("-- Finished cmake for pyarrow") # Do the build print("-- Running cmake --build for pyarrow") - self.spawn(['cmake', '--build', '.', '--config', self.build_type]) + self.spawn(['cmake', '--build', '.', '--config', + self.build_type]) print("-- Finished cmake --build for pyarrow") if self.inplace: @@ -297,14 +299,16 @@ def _run_cmake(self): shutil.move(pjoin(build_prefix, 'include'), pjoin(build_lib, 'pyarrow')) - # Move the built C-extension to the place expected by the Python build + # Move the built C-extension to the place expected by the Python + # build self._found_names = [] for name in self.CYTHON_MODULE_NAMES: built_path = self.get_ext_built(name) if not os.path.exists(built_path): print(built_path) if self._failure_permitted(name): - print('Cython module {0} failure permitted'.format(name)) + print('Cython module {0} failure permitted' + .format(name)) continue raise RuntimeError('pyarrow C-extension failed to build:', os.path.abspath(built_path)) @@ -337,11 +341,11 @@ def _run_cmake(self): if os.path.exists(self.get_ext_built_api_header(name)): shutil.move(self.get_ext_built_api_header(name), - pjoin(os.path.dirname(ext_path), name + '_api.h')) + pjoin(os.path.dirname(ext_path), + name + '_api.h')) # Move the plasma store if self.with_plasma: - build_py = self.get_finalized_command('build_py') source = os.path.join(self.build_type, "plasma_store") target = os.path.join(build_lib, self._get_build_dir(), @@ -486,7 +490,8 @@ def has_ext_modules(foo): def parse_version(root): from setuptools_scm import version_from_scm import setuptools_scm.git - describe = setuptools_scm.git.DEFAULT_DESCRIBE + " --match 'apache-arrow-[0-9]*'" + describe = (setuptools_scm.git.DEFAULT_DESCRIBE + + " --match 'apache-arrow-[0-9]*'") # Strip catchall from the commandline describe = describe.replace("--match *.*", "") version = setuptools_scm.git.parse(root, describe) @@ -520,7 +525,8 @@ def parse_version(root): 'plasma_store = pyarrow:_plasma_store_entry_point' ] }, - use_scm_version={"root": "..", "relative_to": __file__, "parse": parse_version}, + use_scm_version={"root": "..", "relative_to": __file__, + "parse": parse_version}, setup_requires=['setuptools_scm', 'cython >= 0.27'] + setup_requires, install_requires=install_requires, tests_require=['pytest', 'pandas'], diff --git a/python/testing/parquet_interop.py b/python/testing/parquet_interop.py index ba2eb6fa416..6d41ba4b6a5 100644 --- a/python/testing/parquet_interop.py +++ b/python/testing/parquet_interop.py @@ -16,10 +16,8 @@ # under the License. import os -import pytest import fastparquet -import pandas as pd import pyarrow as pa import pyarrow.parquet as pq import pandas.util.testing as tm