diff --git a/.github/workflows/docs_light.yml b/.github/workflows/docs_light.yml index d5d5fdfc99f..e0537777260 100644 --- a/.github/workflows/docs_light.yml +++ b/.github/workflows/docs_light.yml @@ -21,7 +21,7 @@ on: pull_request: paths: - 'docs/**' - - '.github/workflows/docs.yml' + - '.github/workflows/docs_light.yml' - 'ci/docker/conda.dockerfile' - 'ci/docker/conda-cpp.dockerfile' - 'ci/docker/conda-python.dockerfile' @@ -37,12 +37,12 @@ env: jobs: light: - name: AMD64 Ubuntu 20.04 Sphinx Documentation + name: AMD64 Conda Python 3.9 Sphinx Documentation runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 30 env: - UBUNTU: "20.04" + PYTHON: "3.9" steps: - name: Checkout Arrow uses: actions/checkout@v2 @@ -52,8 +52,8 @@ jobs: uses: actions/cache@v2 with: path: .docker - key: ubuntu-docs-${{ hashFiles('cpp/**') }} - restore-keys: ubuntu-docs- + key: conda-docs-${{ hashFiles('cpp/**') }} + restore-keys: conda-docs- - name: Setup Python uses: actions/setup-python@v2 with: diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index f58d99f60dc..ed1a8cf1feb 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -303,15 +303,22 @@ def lint(ctx, src, fix, iwyu_all, **checks): sys.exit(1) +def _flatten_numpydoc_rules(rules): + flattened = [] + for rule in rules: + flattened.extend(filter(None, rule.split(','))) + return flattened + + @archery.command(short_help="Lint python docstring with NumpyDoc") @click.argument('symbols', nargs=-1) @click.option("--src", metavar="", default=None, callback=validate_arrow_sources, help="Specify Arrow source directory") @click.option("--allow-rule", "-a", multiple=True, - help="Allow only these rules") + help="Allow only these rules (can be comma-separated)") @click.option("--disallow-rule", "-d", multiple=True, - help="Disallow these rules") + help="Disallow these rules (can be comma-separated)") def numpydoc(src, symbols, allow_rule, disallow_rule): """ Pass list of modules or symbols as arguments to restrict the validation. @@ -326,8 +333,9 @@ def numpydoc(src, symbols, allow_rule, disallow_rule): """ disallow_rule = disallow_rule or {'GL01', 'SA01', 'EX01', 'ES01'} try: - results = python_numpydoc(symbols, allow_rules=allow_rule, - disallow_rules=disallow_rule) + results = python_numpydoc( + symbols, allow_rules=_flatten_numpydoc_rules(allow_rule), + disallow_rules=_flatten_numpydoc_rules(disallow_rule)) for result in results: result.ok() except LintValidationException: diff --git a/dev/archery/archery/compat.py b/dev/archery/archery/compat.py index 595a0276264..33ff869668d 100644 --- a/dev/archery/archery/compat.py +++ b/dev/archery/archery/compat.py @@ -51,3 +51,18 @@ def _import_pandas(): sys.modules['pyarrow'] = None import pandas as pd return pd + + +def _get_module(obj, *, default=None): + """ + Try to find the name of the module *obj* is defined on. + """ + try: + return obj.__module__ + except AttributeError: + # Might be a method/property descriptor as generated by Cython, + # look up the enclosing class. + try: + return obj.__objclass__.__module__ + except AttributeError: + return default diff --git a/dev/archery/archery/lang/python.py b/dev/archery/archery/lang/python.py index 6ffa9014430..a89a1cdca17 100644 --- a/dev/archery/archery/lang/python.py +++ b/dev/archery/archery/lang/python.py @@ -15,9 +15,9 @@ # specific language governing permissions and limitations # under the License. 
+from contextlib import contextmanager import inspect import tokenize -from contextlib import contextmanager try: from numpydoc.validate import Docstring, validate @@ -26,6 +26,7 @@ else: have_numpydoc = True +from ..compat import _get_module from ..utils.logger import logger from ..utils.command import Command, capture_stdout, default_bin @@ -118,8 +119,11 @@ def traverse(self, fn, obj, from_package): Parameters ---------- + fn : callable + A function to apply on all traversed objects. obj : Any - from_package : string, default 'pyarrow' + The object to start from. + from_package : string Predicate to only consider objects from this package. """ todo = [obj] @@ -139,10 +143,20 @@ def traverse(self, fn, obj, from_package): continue member = getattr(obj, name) - module = getattr(member, '__module__', None) - if not (module and module.startswith(from_package)): + module = _get_module(member) + if module is None or not module.startswith(from_package): continue - + # Is it a Cython-generated method? If so, try to detect + # whether it only has a implicitly-generated docstring, + # and no user-defined docstring following it. + # The generated docstring would lack description of method + # parameters and therefore fail Numpydoc validation. + if hasattr(member, '__objclass__'): + doc = getattr(member, '__doc__', None) + # The Cython-generated docstring would be a one-liner, + # such as "ReadOptions.equals(self, ReadOptions other)". + if (doc and '\n' not in doc and f'.{name}(' in doc): + continue todo.append(member) @contextmanager @@ -195,7 +209,7 @@ def callback(obj): try: result = validate(obj) except OSError as e: - symbol = f"{obj.__module__}.{obj.__name__}" + symbol = f"{_get_module(obj, default='')}.{obj.__name__}" logger.warning(f"Unable to validate `{symbol}` due to `{e}`") return diff --git a/dev/archery/archery/utils/lint.py b/dev/archery/archery/utils/lint.py index 48131856402..6c01a349a34 100644 --- a/dev/archery/archery/utils/lint.py +++ b/dev/archery/archery/utils/lint.py @@ -23,6 +23,7 @@ import click from .command import Bash, Command, default_bin +from ..compat import _get_module from .cmake import CMake from .git import git from .logger import logger @@ -284,7 +285,7 @@ def python_numpydoc(symbols=None, allow_rules=None, disallow_rules=None): doc = getattr(obj, '__doc__', '') name = getattr(obj, '__name__', '') qualname = getattr(obj, '__qualname__', '') - module = getattr(obj, '__module__', '') + module = _get_module(obj, default='') instance = getattr(obj, '__self__', '') if instance: klass = instance.__class__.__name__ diff --git a/docker-compose.yml b/docker-compose.yml index 387291af996..f1245525fc9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -979,7 +979,7 @@ services: ["/arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && pip install -e /arrow/dev/archery[numpydoc] && - archery numpydoc --allow-rule PR01"] + archery numpydoc --allow-rule PR01,PR10"] conda-python-dask: # Possible $DASK parameters: diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 81b0b801a28..40fc3b4fff0 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -313,6 +313,17 @@ cdef class Function(_Weakrefable): MemoryPool memory_pool=None): """ Call the function on the given arguments. + + Parameters + ---------- + args : iterable + The arguments to pass to the function. Accepted types depend + on the specific function. 
+ options : FunctionOptions, optional + Options instance for executing this function. This should have + the right concrete options type. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. """ cdef: const CFunctionOptions* c_options = NULL @@ -2005,8 +2016,8 @@ cdef class Expression(_Weakrefable): ``|`` (logical or) and ``~`` (logical not). Note: python keywords ``and``, ``or`` and ``not`` cannot be used to combine expressions. - - Check whether the expression is contained in a list of values with - the ``pyarrow.compute.Expression.isin()`` member function. + - Create expression predicates using Expression methods such as + ``pyarrow.compute.Expression.isin()``. Examples -------- @@ -2130,21 +2141,76 @@ cdef class Expression(_Weakrefable): return Expression._call("divide_checked", [self, other]) def is_valid(self): - """Checks whether the expression is not-null (valid)""" + """ + Check whether the expression is not-null (valid). + + This creates a new expression equivalent to calling the + `is_valid` compute function on this expression. + + Returns + ------- + is_valid : Expression + """ return Expression._call("is_valid", [self]) def is_null(self, bint nan_is_null=False): - """Checks whether the expression is null""" + """ + Check whether the expression is null. + + This creates a new expression equivalent to calling the + `is_null` compute function on this expression. + + Parameters + ---------- + nan_is_null : boolean, default False + Whether floating-point NaNs are considered null. + + Returns + ------- + is_null : Expression + """ options = NullOptions(nan_is_null=nan_is_null) return Expression._call("is_null", [self], options) def cast(self, type, bint safe=True): - """Explicitly change the expression's data type""" + """ + Explicitly set or change the expression's data type. + + This creates a new expression equivalent to calling the + `cast` compute function on this expression. + + Parameters + ---------- + type : DataType + Type to cast array to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. + + Returns + ------- + cast : Expression + """ options = CastOptions.safe(ensure_type(type)) return Expression._call("cast", [self], options) def isin(self, values): - """Checks whether the expression is contained in values""" + """ + Check whether the expression is contained in values. + + This creates a new expression equivalent to calling the + `is_in` compute function on this expression. + + Parameters + ---------- + values : Array or iterable + The values to check for. + + Returns + ------- + isin : Expression + A new expression that, when evaluated, checks whether + this expression's value is contained in `values`. + """ if not isinstance(values, Array): values = lib.array(values) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index f3c2220a55d..701af591dd5 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -195,6 +195,11 @@ cdef class Dataset(_Weakrefable): The copy will view the same Fragments. If the new schema is not compatible with the original dataset's schema then an error will be raised. + + Parameters + ---------- + schema : Schema + The new dataset schema. 
""" cdef shared_ptr[CDataset] copy = GetResultValue( self.dataset.ReplaceSchema(pyarrow_unwrap_schema(schema))) @@ -229,48 +234,19 @@ cdef class Dataset(_Weakrefable): yield Fragment.wrap(GetResultValue(move(maybe_fragment))) def scanner(self, **kwargs): - """Builds a scan operation against the dataset. + """ + Build a scan operation against the dataset. Data is not loaded immediately. Instead, this produces a Scanner, which exposes further operations (e.g. loading all data as a table, counting rows). + See the `Scanner.from_dataset` method for further information. + Parameters ---------- - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. - By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 1M - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - use_async : bool, default True - This flag is deprecated and is being kept for this release for - backwards compatibility. It will be removed in the next - release. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. + **kwargs : dict, optional + Arguments for `Scanner.from_dataset`. Returns ------- @@ -298,9 +274,13 @@ cdef class Dataset(_Weakrefable): return Scanner.from_dataset(self, **kwargs) def to_batches(self, **kwargs): - """Read the dataset as materialized record batches. + """ + Read the dataset as materialized record batches. - See scanner method parameters documentation. + Parameters + ---------- + **kwargs : dict, optional + Arguments for `Scanner.from_dataset`. Returns ------- @@ -309,45 +289,65 @@ cdef class Dataset(_Weakrefable): return self.scanner(**kwargs).to_batches() def to_table(self, **kwargs): - """Read the dataset to an arrow table. + """ + Read the dataset to an Arrow table. Note that this method reads all the selected data from the dataset into memory. - See scanner method parameters documentation. + Parameters + ---------- + **kwargs : dict, optional + Arguments for `Scanner.from_dataset`. Returns ------- - Table + table : Table """ return self.scanner(**kwargs).to_table() def take(self, object indices, **kwargs): - """Select rows of data by index. + """ + Select rows of data by index. - See scanner method parameters documentation. 
+ Parameters + ---------- + indices : Array or array-like + indices of rows to select in the dataset. + **kwargs : dict, optional + See scanner() method for full parameter description. Returns ------- - Table + table : Table """ return self.scanner(**kwargs).take(indices) def head(self, int num_rows, **kwargs): - """Load the first N rows of the dataset. + """ + Load the first N rows of the dataset. - See scanner method parameters documentation. + Parameters + ---------- + num_rows : int + The number of rows to load. + **kwargs : dict, optional + See scanner() method for full parameter description. Returns ------- - Table + table : Table """ return self.scanner(**kwargs).head(num_rows) def count_rows(self, **kwargs): - """Count rows matching the scanner filter. + """ + Count rows matching the scanner filter. - See scanner method parameters documentation. + Parameters + ---------- + **kwargs : dict, optional + See scanner() method for full parameter description. Returns ------- @@ -700,7 +700,22 @@ cdef class FileFormat(_Weakrefable): return self.wrapped def inspect(self, file, filesystem=None): - """Infer the schema of a file.""" + """ + Infer the schema of a file. + + Parameters + ---------- + file : file-like object, path-like or str + The file or file path to infer a schema from. + filesystem : Filesystem, optional + If `filesystem` is given, `file` must be a string and specifies + the path of the file to read from the filesystem. + + Returns + ------- + schema : Schema + The schema inferred from the file + """ c_source = _make_file_source(file, filesystem) c_schema = GetResultValue(self.format.Inspect(c_source)) return pyarrow_wrap_schema(move(c_schema)) @@ -708,9 +723,17 @@ cdef class FileFormat(_Weakrefable): def make_fragment(self, file, filesystem=None, Expression partition_expression=None): """ - Make a FileFragment of this FileFormat. The filter may not reference - fields absent from the provided schema. If no schema is provided then - one will be inferred. + Make a FileFragment from a given file. + + Parameters + ---------- + file : file-like object, path-like or str + The file or file path to make a fragment from. + filesystem : Filesystem, optional + If `filesystem` is given, `file` must be a string and specifies + the path of the file to read from the filesystem. + partition_expression : Expression + The filter expression. """ if partition_expression is None: partition_expression = _true @@ -804,7 +827,8 @@ cdef class Fragment(_Weakrefable): return Expression.wrap(self.fragment.partition_expression()) def scanner(self, Schema schema=None, **kwargs): - """Builds a scan operation against the dataset. + """ + Build a scan operation against the fragment. Data is not loaded immediately. Instead, this produces a Scanner, which exposes further operations (e.g. loading all data as a @@ -816,48 +840,25 @@ cdef class Fragment(_Weakrefable): Schema to use for scanning. This is used to unify a Fragment to it's Dataset's schema. If not specified this will use the Fragment's physical schema which might differ for each Fragment. - columns : list of str, default None - The columns to project. This can be a list of column names to - include (order and duplicates will be preserved), or a dictionary - with {new_column_name: expression} values for more advanced - projections. - The columns will be passed down to Datasets and corresponding data - fragments to avoid loading, copying, and deserializing columns - that will not be required further down the compute chain. 
- By default all of the available columns are projected. Raises - an exception if any of the referenced column names does not exist - in the dataset's Schema. - filter : Expression, default None - Scan will return only the rows matching the filter. - If possible the predicate will be pushed down to exploit the - partition information or internal metadata found in the data - source, e.g. Parquet statistics. Otherwise filters the loaded - RecordBatches before yielding them. - batch_size : int, default 1M - The maximum row count for scanned record batches. If scanned - record batches are overflowing memory then this method can be - called to reduce their size. - use_threads : bool, default True - If enabled, then maximum parallelism will be used determined by - the number of available CPU cores. - memory_pool : MemoryPool, default None - For memory allocations, if required. If not specified, uses the - default pool. - fragment_scan_options : FragmentScanOptions, default None - Options specific to a particular scan and fragment type, which - can change between different scans of the same dataset. + **kwargs : dict, optional + Arguments for `Scanner.from_fragment`. Returns ------- scanner : Scanner - """ return Scanner.from_fragment(self, schema=schema, **kwargs) def to_batches(self, Schema schema=None, **kwargs): - """Read the fragment as materialized record batches. + """ + Read the fragment as materialized record batches. - See scanner method parameters documentation. + Parameters + ---------- + schema : Schema, optional + Concrete schema to use for scanning. + **kwargs : dict, optional + Arguments for `Scanner.from_fragment`. Returns ------- @@ -866,12 +867,18 @@ cdef class Fragment(_Weakrefable): return self.scanner(schema=schema, **kwargs).to_batches() def to_table(self, Schema schema=None, **kwargs): - """Convert this Fragment into a Table. + """ + Convert this Fragment into a Table. Use this convenience utility with care. This will serially materialize the Scan result in memory before creating the Table. - See scanner method parameters documentation. + Parameters + ---------- + schema : Schema, optional + Concrete schema to use for scanning. + **kwargs : dict, optional + Arguments for `Scanner.from_fragment`. Returns ------- @@ -880,9 +887,15 @@ cdef class Fragment(_Weakrefable): return self.scanner(schema=schema, **kwargs).to_table() def take(self, object indices, **kwargs): - """Select rows of data by index. + """ + Select rows of data by index. - See scanner method parameters documentation. + Parameters + ---------- + indices : Array or array-like + The indices of rows to select in the dataset. + **kwargs : dict, optional + Arguments for `Scanner.from_fragment`. Returns ------- @@ -891,9 +904,15 @@ cdef class Fragment(_Weakrefable): return self.scanner(**kwargs).take(indices) def head(self, int num_rows, **kwargs): - """Load the first N rows of the fragment. + """ + Load the first N rows of the fragment. - See scanner method parameters documentation. + Parameters + ---------- + num_rows : int + The number of rows to load. + **kwargs : dict, optional + Arguments for `Scanner.from_fragment`. Returns ------- @@ -902,9 +921,13 @@ cdef class Fragment(_Weakrefable): return self.scanner(**kwargs).head(num_rows) def count_rows(self, **kwargs): - """Count rows matching the scanner filter. + """ + Count rows matching the scanner filter. - See scanner method parameters documentation. + Parameters + ---------- + **kwargs : dict, optional + Arguments for `Scanner.from_fragment`.
Returns ------- @@ -2078,30 +2101,45 @@ cdef class Scanner(_Weakrefable): FragmentScanOptions fragment_scan_options=None): """ Create Scanner from Dataset, - refer to Scanner class doc for additional details on Scanner. Parameters ---------- dataset : Dataset Dataset to scan. - columns : list of str or dict, default None - The columns to project. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. filter : Expression, default None Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. batch_size : int, default 1M - The maximum row count for scanned record batches. + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. use_threads : bool, default True If enabled, then maximum parallelism will be used determined by the number of available CPU cores. - use_async : bool, default N/A + use_async : bool, default True This flag is deprecated and is being kept for this release for backwards compatibility. It will be removed in the next release. memory_pool : MemoryPool, default None For memory allocations, if required. If not specified, uses the default pool. - fragment_scan_options : FragmentScanOptions - The fragment scan options. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. """ cdef: shared_ptr[CScanOptions] options = make_shared[CScanOptions]() @@ -2131,32 +2169,47 @@ cdef class Scanner(_Weakrefable): FragmentScanOptions fragment_scan_options=None): """ Create Scanner from Fragment, - refer to Scanner class doc for additional details on Scanner. Parameters ---------- fragment : Fragment fragment to scan. - schema : Schema + schema : Schema, optional The schema of the fragment. - columns : list of str or dict, default None - The columns to project. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. filter : Expression, default None Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. 
Otherwise filters the loaded + RecordBatches before yielding them. batch_size : int, default 1M - The maximum row count for scanned record batches. + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. use_threads : bool, default True If enabled, then maximum parallelism will be used determined by the number of available CPU cores. - use_async : bool, default N/A + use_async : bool, default True This flag is deprecated and is being kept for this release for backwards compatibility. It will be removed in the next release. memory_pool : MemoryPool, default None For memory allocations, if required. If not specified, uses the default pool. - fragment_scan_options : FragmentScanOptions - The fragment scan options. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. """ cdef: shared_ptr[CScanOptions] options = make_shared[CScanOptions]() @@ -2260,7 +2313,8 @@ cdef class Scanner(_Weakrefable): @property def projected_schema(self): - """The materialized schema of the data, accounting for projections. + """ + The materialized schema of the data, accounting for projections. This is the schema of any data returned from the scanner. """ @@ -2268,7 +2322,8 @@ cdef class Scanner(_Weakrefable): self.scanner.options().get().projected_schema) def to_batches(self): - """Consume a Scanner in record batches. + """ + Consume a Scanner in record batches. Returns ------- @@ -2281,7 +2336,8 @@ cdef class Scanner(_Weakrefable): return _iterator(self.scan_batches()) def scan_batches(self): - """Consume a Scanner in record batches with corresponding fragments. + """ + Consume a Scanner in record batches with corresponding fragments. Returns ------- @@ -2294,7 +2350,8 @@ cdef class Scanner(_Weakrefable): return TaggedRecordBatchIterator.wrap(self, move(iterator)) def to_table(self): - """Convert a Scanner into a Table. + """ + Convert a Scanner into a Table. Use this convenience utility with care. This will serially materialize the Scan result in memory before creating the Table. @@ -2311,12 +2368,18 @@ cdef class Scanner(_Weakrefable): return pyarrow_wrap_table(GetResultValue(result)) def take(self, object indices): - """Select rows of data by index. + """ + Select rows of data by index. Will only consume as many batches of the underlying dataset as needed. Otherwise, this is equivalent to ``to_table().take(indices)``. + Parameters + ---------- + indices : Array or array-like + indices of rows to select in the dataset. + Returns ------- Table @@ -2333,7 +2396,13 @@ cdef class Scanner(_Weakrefable): return pyarrow_wrap_table(GetResultValue(result)) def head(self, int num_rows): - """Load the first N rows of the dataset. + """ + Load the first N rows of the dataset. + + Parameters + ---------- + num_rows : int + The number of rows to load. Returns ------- @@ -2345,7 +2414,8 @@ cdef class Scanner(_Weakrefable): return pyarrow_wrap_table(GetResultValue(result)) def count_rows(self): - """Count rows matching the scanner filter. + """ + Count rows matching the scanner filter. 
Returns ------- diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 197bd24cd12..20e7be18bfa 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -408,7 +408,7 @@ cdef class FileSystem(_Weakrefable): Parameters ---------- - paths_or_selector: FileSelector, path-like or list of path-likes + paths_or_selector : FileSelector, path-like or list of path-likes Either a selector object, a path-like object or a list of path-like objects. The selector's base directory will not be part of the results, even if it exists. If it doesn't exist, @@ -455,7 +455,7 @@ cdef class FileSystem(_Weakrefable): ---------- path : str The path of the new directory. - recursive: bool, default True + recursive : bool, default True Create nested directories as well. """ cdef c_string directory = _path_as_bytes(path) @@ -463,7 +463,8 @@ cdef class FileSystem(_Weakrefable): check_status(self.fs.CreateDir(directory, recursive=recursive)) def delete_dir(self, path): - """Delete a directory and its contents, recursively. + """ + Delete a directory and its contents, recursively. Parameters ---------- @@ -475,7 +476,8 @@ cdef class FileSystem(_Weakrefable): check_status(self.fs.DeleteDir(directory)) def delete_dir_contents(self, path, *, bint accept_root_dir=False): - """Delete a directory's contents, recursively. + """ + Delete a directory's contents, recursively. Like delete_dir, but doesn't delete the directory itself. @@ -599,7 +601,7 @@ cdef class FileSystem(_Weakrefable): Parameters ---------- - source : str + path : str The source to open for reading. compression : str optional, default 'detect' The compression algorithm to use for on-the-fly decompression. diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index cabf0093b56..44ca4e1c8dd 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -613,8 +613,15 @@ cdef class FileMetaData(_Weakrefable): def set_file_path(self, path): """ - Modify the file_path field of each ColumnChunk in the - FileMetaData to be a particular value + Set ColumnChunk file paths to the given value. + + This method modifies the ``file_path`` field of each ColumnChunk + in the FileMetaData to be a particular value. + + Parameters + ---------- + path : str + The file path to set on all ColumnChunks. """ cdef: c_string c_path = tobytes(path) @@ -622,7 +629,12 @@ cdef class FileMetaData(_Weakrefable): def append_row_groups(self, FileMetaData other): """ - Append row groups of other FileMetaData object + Append row groups from other FileMetaData object. + + Parameters + ---------- + other : FileMetaData + Other metadata to append row groups from. """ cdef shared_ptr[CFileMetaData] c_metadata @@ -631,7 +643,13 @@ cdef class FileMetaData(_Weakrefable): def write_metadata_file(self, where): """ - Write the metadata object to a metadata-only file + Write the metadata to a metadata-only Parquet file. + + Parameters + ---------- + where : path or file-like object + Where to write the metadata. Should be a writable path on + the local filesystem, or a writable file-like object. """ cdef: shared_ptr[COutputStream] sink @@ -700,7 +718,16 @@ cdef class ParquetSchema(_Weakrefable): def equals(self, ParquetSchema other): """ - Returns True if the Parquet schemas are equal + Return whether the two schemas are equal. + + Parameters + ---------- + other : ParquetSchema + Schema to compare against. 
+ + Returns + ------- + are_equal : bool """ return self.schema.Equals(deref(other.schema)) @@ -733,7 +760,16 @@ cdef class ColumnSchema(_Weakrefable): def equals(self, ColumnSchema other): """ - Returns True if the column schemas are equal + Return whether the two column schemas are equal. + + Parameters + ---------- + other : ColumnSchema + Schema to compare against. + + Returns + ------- + are_equal : bool """ return self.descr.Equals(deref(other.descr)) @@ -1173,17 +1209,17 @@ cdef class ParquetReader(_Weakrefable): def column_name_idx(self, column_name): """ - Find the matching index of a column in the schema. + Find the index of a column by its name. - Parameter - --------- - column_name: str - Name of the column, separation of nesting levels is done via ".". + Parameters + ---------- + column_name : str + Name of the column; separation of nesting levels is done via ".". Returns ------- - column_idx: int - Integer index of the position of the column + column_idx : int + Integer index of the column in the schema. """ cdef: FileMetaData container = self.metadata diff --git a/python/pyarrow/_plasma.pyx b/python/pyarrow/_plasma.pyx index 7aeb9e83851..35d39073634 100644 --- a/python/pyarrow/_plasma.pyx +++ b/python/pyarrow/_plasma.pyx @@ -339,18 +339,23 @@ cdef class PlasmaClient(_Weakrefable): """ Create a new buffer in the PlasmaStore for a particular object ID. - The returned buffer is mutable until seal is called. + The returned buffer is mutable until ``seal()`` is called. Parameters ---------- object_id : ObjectID The object ID used to identify an object. - size : int + data_size : int The size in bytes of the created buffer. metadata : bytes An optional string of bytes encoding whatever metadata the user wishes to encode. + Returns + ------- + buffer : Buffer + A mutable buffer where to write the object data. + Raises ------ PlasmaObjectExists diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index cd39c1c79a7..f86a0b90592 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -696,7 +696,7 @@ cdef class _PandasConvertible(_Weakrefable): pool is not passed. strings_to_categorical : bool, default False Encode string (UTF8) and binary types to pandas.Categorical. - categories: list, default empty + categories : list, default empty List of fields that should be returned as pandas.Categorical. Only applies to table-like data structures. zero_copy_only : bool, default False @@ -809,8 +809,31 @@ cdef class Array(_PandasConvertible): """ Compare contents of this array against another one. - Return string containing the result of arrow::Diff comparing contents - of this array against the other array. + Return a string containing the result of diffing this array + (on the left side) against the other array (on the right side). + + Parameters + ---------- + other : Array + The other array to compare this array with. + + Returns + ------- + diff : str + A human-readable printout of the differences. + + Examples + -------- + >>> left = pa.array(["one", "two", "three"]) + >>> right = pa.array(["two", None, "two-and-a-half", "three"]) + >>> print(left.diff(right)) + + @@ -0, +0 @@ + -"one" + @@ -2, +1 @@ + +null + +"two-and-a-half" + """ cdef c_string result with nogil: @@ -821,7 +844,18 @@ cdef class Array(_PandasConvertible): """ Cast array values to another data type - See pyarrow.compute.cast for usage + See :func:`pyarrow.compute.cast` for usage. + + Parameters + ---------- + target_type : DataType + Type to cast array to. 
+ safe : boolean, default True + Whether to check for conversion errors such as overflow. + + Returns + ------- + cast : Array """ return _pc().cast(self, target_type, safe=safe) @@ -849,6 +883,18 @@ cdef class Array(_PandasConvertible): def sum(self, **kwargs): """ Sum the values in a numerical array. + + See :func:`pyarrow.compute.sum` for full usage. + + Parameters + ---------- + **kwargs : dict, optional + Options to pass to :func:`pyarrow.compute.sum`. + + Returns + ------- + sum : Scalar + A scalar containing the sum value. """ options = _pc().ScalarAggregateOptions(**kwargs) return _pc().call_function('sum', [self], options) @@ -856,12 +902,29 @@ cdef class Array(_PandasConvertible): def unique(self): """ Compute distinct elements in array. + + Returns + ------- + unique : Array + An array of the same data type, with deduplicated elements. """ return _pc().call_function('unique', [self]) def dictionary_encode(self, null_encoding='mask'): """ Compute dictionary-encoded representation of array. + + See :func:`pyarrow.compute.dictionary_encode` for full usage. + + Parameters + ---------- + null_encoding + How to handle null entries. + + Returns + ------- + encoded : DictionaryArray + A dictionary-encoded version of this array. """ options = _pc().DictionaryEncodeOptions(null_encoding) return _pc().call_function('dictionary_encode', [self], options) @@ -1132,7 +1195,17 @@ cdef class Array(_PandasConvertible): def fill_null(self, fill_value): """ - See pyarrow.compute.fill_null for usage. + See :func:`pyarrow.compute.fill_null` for usage. + + Parameters + ---------- + fill_value + The replacement value for null entries. + + Returns + ------- + result : Array + A new array with nulls replaced by the given value. """ return _pc().fill_null(self, fill_value) @@ -1192,7 +1265,19 @@ cdef class Array(_PandasConvertible): def take(self, object indices): """ - Select values from an array. See pyarrow.compute.take for full usage. + Select values from an array. + + See :func:`pyarrow.compute.take` for full usage. + + Parameters + ---------- + indices : Array or array-like + The indices in the array whose values will be returned. + + Returns + ------- + taken : Array + An array with the same datatype, containing the taken values. """ return _pc().take(self, indices) @@ -1204,7 +1289,22 @@ cdef class Array(_PandasConvertible): def filter(self, Array mask, *, null_selection_behavior='drop'): """ - Select values from an array. See pyarrow.compute.filter for full usage. + Select values from an array. + + See :func:`pyarrow.compute.filter` for full usage. + + Parameters + ---------- + mask : Array or array-like + The boolean mask to filter the array with. + null_selection_behavior + How nulls in the mask should be handled. + + Returns + ------- + filtered : Array + An array of the same type, with only the elements selected by + the boolean mask. """ return _pc().filter(self, mask, null_selection_behavior=null_selection_behavior) @@ -1213,7 +1313,23 @@ cdef class Array(_PandasConvertible): """ Find the first index of a value. - See pyarrow.compute.index for full usage. + See :func:`pyarrow.compute.index` for full usage. + + Parameters + ---------- + value : Scalar or object + The value to look for in the array. + start : int, optional + The start index where to look for `value`. + end : int, optional + The end index where to look for `value`. + memory_pool : MemoryPool, optional + A memory pool for potential memory allocations. 
+ + Returns + ------- + index : Int64Scalar + The index of the value in the array (-1 if not found). """ return _pc().index(self, value, start, end, memory_pool=memory_pool) @@ -1307,7 +1423,7 @@ cdef class Array(_PandasConvertible): Parameters ---------- - full: bool, default False + full : bool, default False If True, run expensive checks, otherwise cheap checks only. Raises @@ -2041,6 +2157,16 @@ cdef class UnionArray(Array): and null count adjusted. For dense unions, the returned array is unchanged. + + Parameters + ---------- + pos : int + The physical index of the union child field (not its type code). + + Returns + ------- + field : Array + The given child field. """ cdef shared_ptr[CArray] result result = ( self.ap).field(pos) @@ -2556,6 +2682,13 @@ cdef class ExtensionArray(Array): """ Convert extension array to a numpy ndarray. + This method simply delegates to the underlying storage array. + + Parameters + ---------- + **kwargs : dict, optional + See `Array.to_numpy` for parameter description. + See Also -------- Array.to_numpy diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 0fcbd9a5219..c6bf11b2dc4 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -433,7 +433,13 @@ cdef class NativeFile(_Weakrefable): def read1(self, nbytes=None): """Read and return up to n bytes. - Alias for read, needed to match the IOBase interface.""" + Alias for read, needed to match the BufferedIOBase interface. + + Parameters + ---------- + nbytes : int + The maximum number of bytes to read. + """ return self.read(nbytes=None) def readall(self): @@ -444,12 +450,13 @@ cdef class NativeFile(_Weakrefable): Read into the supplied buffer Parameters - ----------- - b: any python object supporting buffer interface + ---------- + b : buffer-like object + A writable buffer object (such as a bytearray). Returns - -------- - int + ------- + written : int number of bytes written """ @@ -476,19 +483,22 @@ cdef class NativeFile(_Weakrefable): If size is specified, read at most size bytes. Line terminator is always b"\\n". - """ + Parameters + ---------- + size : int + maximum number of bytes read + """ raise UnsupportedOperation() def readlines(self, hint=None): """NOT IMPLEMENTED. Read lines of the file Parameters - ----------- - - hint: int maximum number of bytes read until we stop + ---------- + hint : int + maximum number of bytes read until we stop """ - raise UnsupportedOperation() def __iter__(self): @@ -536,8 +546,17 @@ cdef class NativeFile(_Weakrefable): def download(self, stream_or_path, buffer_size=None): """ - Read file completely to local path (rather than reading completely into - memory). First seeks to the beginning of the file. + Read this file completely to a local path or destination stream. + + This method first seeks to the beginning of the file. + + Parameters + ---------- + stream_or_path : str or file-like object + If a string, a local file path to write to; otherwise, + should be a writable stream. + buffer_size : int, optional + The buffer size to use for data transfers. """ cdef: int64_t bytes_read = 0 @@ -624,7 +643,14 @@ cdef class NativeFile(_Weakrefable): def upload(self, stream, buffer_size=None): """ - Pipe file-like object to file + Write from a source stream to this file. + + Parameters + ---------- + stream : file-like object + Source stream to pipe to this file. + buffer_size : int, optional + The buffer size to use for data transfers. 
""" write_queue = Queue(50) self._assert_writable() @@ -1257,6 +1283,10 @@ cdef class BufferReader(NativeFile): cdef: Buffer buffer + # XXX Needed to make numpydoc happy + def __init__(self, obj): + pass + def __cinit__(self, object obj): self.buffer = as_buffer(obj) self.set_random_access_file(shared_ptr[CRandomAccessFile]( diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index 101f9ac96c9..7c3ecb22c0c 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -429,7 +429,7 @@ cdef class _CRecordBatchWriter(_Weakrefable): check_status(self.writer.get() .WriteRecordBatch(deref(batch.batch))) - def write_table(self, Table table, max_chunksize=None, **kwargs): + def write_table(self, Table table, max_chunksize=None): """ Write Table to stream in (contiguous) RecordBatch objects. @@ -444,13 +444,6 @@ cdef class _CRecordBatchWriter(_Weakrefable): # max_chunksize must be > 0 to have any impact int64_t c_max_chunksize = -1 - if 'chunksize' in kwargs: - max_chunksize = kwargs['chunksize'] - msg = ('The parameter chunksize is deprecated for the write_table ' - 'methods as of 0.15, please use parameter ' - 'max_chunksize instead') - warnings.warn(msg, FutureWarning) - if max_chunksize is not None: c_max_chunksize = max_chunksize @@ -771,9 +764,24 @@ cdef class _RecordBatchFileReader(_Weakrefable): @property def num_record_batches(self): + """ + The number of record batches in the IPC file. + """ return self.reader.get().num_record_batches() def get_batch(self, int i): + """ + Read the record batch with the given index. + + Parameters + ---------- + i : int + The index of the record batch in the IPC file. + + Returns + ------- + batch : RecordBatch + """ cdef shared_ptr[CRecordBatch] batch if i < 0 or i >= self.num_record_batches: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 748a6a1684f..de33d999eb2 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -70,6 +70,15 @@ cdef class Scalar(_Weakrefable): def cast(self, object target_type): """ Attempt a safe cast to target data type. + + Parameters + ---------- + target_type : DataType or string coercible to DataType + The type to cast the scalar to. + + Returns + ------- + scalar : A Scalar of the given target data type. """ cdef: DataType type = ensure_type(target_type) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 37fd6baeb71..f93c20f49d4 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -122,7 +122,7 @@ cdef class ChunkedArray(_PandasConvertible): Parameters ---------- - full: bool, default False + full : bool, default False If True, run expensive checks, otherwise cheap checks only. Raises @@ -253,7 +253,17 @@ cdef class ChunkedArray(_PandasConvertible): def fill_null(self, fill_value): """ - See pyarrow.compute.fill_null docstring for usage. + See :func:`pyarrow.compute.fill_null` for usage. + + Parameters + ---------- + fill_value + The replacement value for null entries. + + Returns + ------- + result : Array or ChunkedArray + A new array with nulls replaced by the given value. """ return _pc().fill_null(self, fill_value) @@ -335,18 +345,36 @@ cdef class ChunkedArray(_PandasConvertible): """ Cast array values to another data type - See pyarrow.compute.cast for usage + See :func:`pyarrow.compute.cast` for usage. + + Parameters + ---------- + target_type : DataType + Type to cast array to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. 
+ + Returns + ------- + cast : Array or ChunkedArray """ return _pc().cast(self, target_type, safe=safe) def dictionary_encode(self, null_encoding='mask'): """ - Compute dictionary-encoded representation of array + Compute dictionary-encoded representation of array. + + See :func:`pyarrow.compute.dictionary_encode` for full usage. + + Parameters + ---------- + null_encoding + How to handle null entries. Returns ------- - pyarrow.ChunkedArray - Same chunking as the input, all chunks share a common dictionary. + encoded : ChunkedArray + A dictionary-encoded version of this array. """ options = _pc().DictionaryEncodeOptions(null_encoding) return _pc().call_function('dictionary_encode', [self], options) @@ -440,8 +468,22 @@ cdef class ChunkedArray(_PandasConvertible): def filter(self, mask, object null_selection_behavior="drop"): """ - Select values from a chunked array. See pyarrow.compute.filter for full - usage. + Select values from the chunked array. + + See :func:`pyarrow.compute.filter` for full usage. + + Parameters + ---------- + mask : Array or array-like + The boolean mask to filter the chunked array with. + null_selection_behavior + How nulls in the mask should be handled. + + Returns + ------- + filtered : Array or ChunkedArray + An array of the same type, with only the elements selected by + the boolean mask. """ return _pc().filter(self, mask, null_selection_behavior) @@ -449,21 +491,48 @@ cdef class ChunkedArray(_PandasConvertible): """ Find the first index of a value. - See pyarrow.compute.index for full usage. + See :func:`pyarrow.compute.index` for full usage. + + Parameters + ---------- + value : Scalar or object + The value to look for in the array. + start : int, optional + The start index where to look for `value`. + end : int, optional + The end index where to look for `value`. + memory_pool : MemoryPool, optional + A memory pool for potential memory allocations. + + Returns + ------- + index : Int64Scalar + The index of the value in the array (-1 if not found). """ return _pc().index(self, value, start, end, memory_pool=memory_pool) def take(self, object indices): """ - Select values from a chunked array. See pyarrow.compute.take for full - usage. + Select values from the chunked array. + + See :func:`pyarrow.compute.take` for full usage. + + Parameters + ---------- + indices : Array or array-like + The indices in the array whose values will be returned. + + Returns + ------- + taken : Array or ChunkedArray + An array with the same datatype, containing the taken values. """ return _pc().take(self, indices) def drop_null(self): """ Remove missing values from a chunked array. - See pyarrow.compute.drop_null for full description. + See :func:`pyarrow.compute.drop_null` for full description. """ return _pc().drop_null(self) @@ -795,7 +864,7 @@ cdef class RecordBatch(_PandasConvertible): Parameters ---------- - full: bool, default False + full : bool, default False If True, run expensive checks, otherwise cheap checks only. Raises @@ -1069,8 +1138,22 @@ cdef class RecordBatch(_PandasConvertible): def filter(self, mask, object null_selection_behavior="drop"): """ - Select record from a record batch. See pyarrow.compute.filter for full - usage. + Select rows from the record batch. + + See :func:`pyarrow.compute.filter` for full usage. + + Parameters + ---------- + mask : Array or array-like + The boolean mask to filter the record batch with. + null_selection_behavior + How nulls in the mask should be handled. 
+ + Returns + ------- + filtered : RecordBatch + A record batch of the same schema, with only the rows selected + by the boolean mask. """ return _pc().filter(self, mask, null_selection_behavior) @@ -1104,15 +1187,26 @@ cdef class RecordBatch(_PandasConvertible): def take(self, object indices): """ - Select records from a RecordBatch. See pyarrow.compute.take for full - usage. + Select rows from the record batch. + + See :func:`pyarrow.compute.take` for full usage. + + Parameters + ---------- + indices : Array or array-like + The indices in the record batch whose rows will be returned. + + Returns + ------- + taken : RecordBatch + A record batch with the same schema, containing the taken rows. """ return _pc().take(self, indices) def drop_null(self): """ Remove missing values from a RecordBatch. - See pyarrow.compute.drop_null for full usage. + See :func:`pyarrow.compute.drop_null` for full usage. """ return _pc().drop_null(self) @@ -1443,7 +1537,7 @@ cdef class Table(_PandasConvertible): Parameters ---------- - full: bool, default False + full : bool, default False If True, run expensive checks, otherwise cheap checks only. Raises @@ -1513,15 +1607,40 @@ cdef class Table(_PandasConvertible): def filter(self, mask, object null_selection_behavior="drop"): """ - Select records from a Table. See :func:`pyarrow.compute.filter` for - full usage. + Select rows from the table. + + See :func:`pyarrow.compute.filter` for full usage. + + Parameters + ---------- + mask : Array or array-like + The boolean mask to filter the table with. + null_selection_behavior + How nulls in the mask should be handled. + + Returns + ------- + filtered : Table + A table of the same schema, with only the rows selected + by the boolean mask. """ return _pc().filter(self, mask, null_selection_behavior) def take(self, object indices): """ - Select records from a Table. See :func:`pyarrow.compute.take` for full - usage. + Select rows from the table. + + See :func:`pyarrow.compute.take` for full usage. + + Parameters + ---------- + indices : Array or array-like + The indices in the table whose rows will be returned. + + Returns + ------- + taken : Table + A table with the same schema, containing the taken rows. """ return _pc().take(self, indices) @@ -1966,9 +2085,12 @@ cdef class Table(_PandasConvertible): return pyarrow_wrap_table(c_table) - def to_batches(self, max_chunksize=None, **kwargs): + def to_batches(self, max_chunksize=None): """ - Convert Table to list of (contiguous) RecordBatch objects. + Convert Table to a list of RecordBatch objects. + + Note that this method is zero-copy, it merely exposes the same data + under a different API. Parameters ---------- @@ -1988,13 +2110,6 @@ cdef class Table(_PandasConvertible): reader.reset(new TableBatchReader(deref(self.table))) - if 'chunksize' in kwargs: - max_chunksize = kwargs['chunksize'] - msg = ('The parameter chunksize is deprecated for ' - 'pyarrow.Table.to_batches as of 0.15, please use ' - 'the parameter max_chunksize instead') - warnings.warn(msg, FutureWarning) - if max_chunksize is not None: c_max_chunksize = max_chunksize reader.get().set_chunksize(c_max_chunksize) @@ -2012,7 +2127,17 @@ cdef class Table(_PandasConvertible): def to_reader(self, max_chunksize=None): """ - Convert a Table to RecordBatchReader + Convert the Table to a RecordBatchReader. + + Note that this method is zero-copy, it merely exposes the same data + under a different API. + + Parameters + ---------- + max_chunksize : int, default None + Maximum size for RecordBatch chunks. 
Individual chunks may be + smaller depending on the chunk layout of individual columns. + Returns ------- RecordBatchReader diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi index 42fd4474155..0fc7937f810 100644 --- a/python/pyarrow/tensor.pxi +++ b/python/pyarrow/tensor.pxi @@ -71,7 +71,12 @@ strides: {0.strides}""".format(self) def equals(self, Tensor other): """ - Return true if the tensors contains exactly equal data + Return true if the tensors contains exactly equal data. + + Parameters + ---------- + other : Tensor + The other tensor to compare for equality. """ return self.tp.Equals(deref(other.tp)) @@ -364,6 +369,11 @@ shape: {0.shape}""".format(self) def equals(self, SparseCOOTensor other): """ Return true if sparse tensors contains exactly equal data. + + Parameters + ---------- + other : SparseCOOTensor + The other tensor to compare for equality. """ return self.stp.Equals(deref(other.stp)) @@ -590,6 +600,11 @@ shape: {0.shape}""".format(self) def equals(self, SparseCSRMatrix other): """ Return true if sparse tensors contains exactly equal data. + + Parameters + ---------- + other : SparseCSRMatrix + The other tensor to compare for equality. """ return self.stp.Equals(deref(other.stp)) @@ -800,6 +815,11 @@ shape: {0.shape}""".format(self) def equals(self, SparseCSCMatrix other): """ Return true if sparse tensors contains exactly equal data + + Parameters + ---------- + other : SparseCSCMatrix + The other tensor to compare for equality. """ return self.stp.Equals(deref(other.stp)) @@ -987,6 +1007,11 @@ shape: {0.shape}""".format(self) def equals(self, SparseCSFTensor other): """ Return true if sparse tensors contains exactly equal data + + Parameters + ---------- + other : SparseCSFTensor + The other tensor to compare for equality. """ return self.stp.Equals(deref(other.stp)) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 32d70887aab..569a4b61a00 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -429,7 +429,23 @@ cdef class StructType(DataType): cdef Field field_by_name(self, name): """ - Return a child field by its name rather than its index. + Return a child field by its name. + + Parameters + ---------- + name : str + The name of the field to look up. + + Returns + ------- + field : Field + The child field with the given name. + + Raises + ------ + KeyError + If the name isn't found, or if several fields have the given + name. """ cdef vector[shared_ptr[CField]] fields @@ -445,14 +461,34 @@ cdef class StructType(DataType): def get_field_index(self, name): """ - Return index of field with given unique name. Returns -1 if not found - or if duplicated + Return index of the unique field with the given name. + + Parameters + ---------- + name : str + The name of the field to look up. + + Returns + ------- + index : int + The index of the field with the given name; -1 if the + name isn't found or there are several fields with the given + name. """ return self.struct_type.GetFieldIndex(tobytes(name)) def get_all_field_indices(self, name): """ - Return sorted list of indices for fields with the given name + Return sorted list of indices for the fields with the given name. + + Parameters + ---------- + name : str + The name of the field to look up. 
+ + Returns + ------- + indices : List[int] """ return self.struct_type.GetAllFieldIndices(tobytes(name)) @@ -1522,7 +1558,17 @@ cdef class Schema(_Weakrefable): raise TypeError("Index must either be string or integer") def _field(self, int i): - """Select a field by its numeric index.""" + """ + Select a field by its numeric index. + + Parameters + ---------- + i : int + + Returns + ------- + pyarrow.Field + """ cdef int index = _normalize_index(i, self.schema.num_fields()) return pyarrow_wrap_field(self.schema.field(index)) @@ -1532,7 +1578,7 @@ cdef class Schema(_Weakrefable): Parameters ---------- - name: str + name : str Returns ------- @@ -1557,14 +1603,34 @@ cdef class Schema(_Weakrefable): def get_field_index(self, name): """ - Return index of field with given unique name. Returns -1 if not found - or if duplicated + Return index of the unique field with the given name. + + Parameters + ---------- + name : str + The name of the field to look up. + + Returns + ------- + index : int + The index of the field with the given name; -1 if the + name isn't found or there are several fields with the given + name. """ return self.schema.GetFieldIndex(tobytes(name)) def get_all_field_indices(self, name): """ - Return sorted list of indices for fields with the given name + Return sorted list of indices for the fields with the given name. + + Parameters + ---------- + name : str + The name of the field to look up. + + Returns + ------- + indices : List[int] """ return self.schema.GetAllFieldIndices(tobytes(name)) @@ -1577,7 +1643,7 @@ cdef class Schema(_Weakrefable): Parameters ---------- - field: Field + field : Field Returns ------- @@ -1592,8 +1658,8 @@ cdef class Schema(_Weakrefable): Parameters ---------- - i: int - field: Field + i : int + field : Field Returns ------- @@ -1616,7 +1682,7 @@ cdef class Schema(_Weakrefable): Parameters ---------- - i: int + i : int Returns ------- @@ -1635,8 +1701,8 @@ cdef class Schema(_Weakrefable): Parameters ---------- - i: int - field: Field + i : int + field : Field Returns -------
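
For reference, the comma-separated rule handling added to `archery numpydoc` in this patch can be exercised on its own. Below is a minimal sketch using the `_flatten_numpydoc_rules` helper introduced in `dev/archery/archery/cli.py`; the rule codes shown are illustrative values only, not a recommended configuration.

```python
def _flatten_numpydoc_rules(rules):
    # Split each repeated --allow-rule/--disallow-rule value on commas
    # and drop empty entries, yielding one flat list of rule codes.
    flattened = []
    for rule in rules:
        flattened.extend(filter(None, rule.split(',')))
    return flattened


# Equivalent to passing `-a PR01,PR10 -a GL01` on the command line:
print(_flatten_numpydoc_rules(["PR01,PR10", "GL01"]))
# ['PR01', 'PR10', 'GL01']
```

This mirrors the docker-compose.yml change above, where `archery numpydoc --allow-rule PR01,PR10` now expands to the two rules `PR01` and `PR10`.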