
Merge pull request #362 from capitalone/develop
Release v0.15.0
fdosani authored Dec 17, 2024
2 parents d6f5893 + f00d0bd commit 6b6affc
Showing 19 changed files with 251 additions and 159 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test-package.yml
@@ -134,7 +134,7 @@ jobs:
- name: Install datacompy
run: |
python -m pip install --upgrade pip
- python -m pip install .[tests,duckdb,polars,dask,ray]
+ python -m pip install .[tests,fugue]
- name: Test with pytest
run: |
python -m pytest tests/ --ignore=tests/test_snowflake.py
6 changes: 2 additions & 4 deletions README.md
@@ -31,9 +31,7 @@ If you would like to use Spark or any other backends please make sure you instal

```shell
pip install datacompy[spark]
- pip install datacompy[dask]
- pip install datacompy[duckdb]
- pip install datacompy[ray]
+ pip install datacompy[fugue]
pip install datacompy[snowflake]

```
@@ -45,7 +43,7 @@ Pandas on Spark implementation. The original ``SparkCompare`` implementation dif
from all the other native implementations. To align the API better, and keep behaviour
consistent we are deprecating the original ``SparkCompare`` into a new module ``LegacySparkCompare``

- Subsequently in ``v0.13.0`` a PySaprk DataFrame class has been introduced (``SparkSQLCompare``)
+ Subsequently in ``v0.13.0`` a PySpark DataFrame class has been introduced (``SparkSQLCompare``)
which accepts ``pyspark.sql.DataFrame`` and should provide better performance. With this version
the Pandas on Spark implementation has been renamed to ``SparkPandasCompare`` and all the spark
logic is now under the ``spark`` submodule.
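
To make the README change above concrete, here is a minimal usage sketch of the newer ``SparkSQLCompare`` class. The example DataFrames and the ``join_columns`` value are illustrative assumptions, not part of this commit, and it presumes ``datacompy[spark]`` is installed and that the class is exported at the package top level as the README describes:

```python
# Hedged sketch: assumes datacompy[spark] is installed and SparkSQLCompare
# is importable from the top-level package.
from pyspark.sql import SparkSession

import datacompy

spark = SparkSession.builder.getOrCreate()

# Illustrative data, not from this commit.
df1 = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
df2 = spark.createDataFrame([(1, "a"), (2, "c")], ["id", "value"])

# SparkSQLCompare accepts pyspark.sql.DataFrame objects directly.
compare = datacompy.SparkSQLCompare(spark, df1, df2, join_columns="id")
print(compare.report())
```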
2 changes: 1 addition & 1 deletion datacompy/__init__.py
@@ -18,7 +18,7 @@
Then extended to carry that functionality over to Spark Dataframes.
"""

__version__ = "0.14.4"
__version__ = "0.15.0"

import platform
from warnings import warn
7 changes: 4 additions & 3 deletions datacompy/base.py
@@ -21,13 +21,14 @@
two dataframes.
"""

- import logging
from abc import ABC, abstractmethod
from typing import Any, Optional

from ordered_set import OrderedSet

- LOG = logging.getLogger(__name__)
+ from datacompy.logger import INFO, get_logger
+
+ LOG = get_logger(__name__, INFO)


class BaseCompare(ABC):
@@ -118,7 +119,7 @@ def all_rows_overlap(self) -> bool:

@abstractmethod
def count_matching_rows(self) -> int:
"""Count the number of matchin grows."""
"""Count the number of matching rows."""
pass

@abstractmethod
4 changes: 2 additions & 2 deletions datacompy/core.py
@@ -21,7 +21,6 @@
two dataframes.
"""

- import logging
import os
from typing import Any, Dict, List, Optional, Union, cast

@@ -30,8 +29,9 @@
from ordered_set import OrderedSet

from datacompy.base import BaseCompare, temp_column_name
+ from datacompy.logger import INFO, get_logger

- LOG = logging.getLogger(__name__)
+ LOG = get_logger(__name__, INFO)


class Compare(BaseCompare):
46 changes: 27 additions & 19 deletions datacompy/fugue.py
@@ -15,25 +15,33 @@

"""Compare two DataFrames that are supported by Fugue."""

- import logging
import pickle
from collections import defaultdict
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast

- import fugue.api as fa
import pandas as pd
- import pyarrow as pa
- from fugue import AnyDataFrame
from ordered_set import OrderedSet
- from triad import Schema

from datacompy.core import Compare, render
+ from datacompy.logger import INFO, get_logger

- LOG = logging.getLogger(__name__)
+ LOG = get_logger(__name__, INFO)
HASH_COL = "__datacompy__hash__"


- def unq_columns(df1: AnyDataFrame, df2: AnyDataFrame) -> OrderedSet[str]:
+ try:
+     import fugue.api as fa
+     import pyarrow as pa
+     from fugue import AnyDataFrame
+     from triad import Schema
+ except ImportError:
+     LOG.warning(
+         "Please note that you are missing the optional dependency: fugue. "
+         "If you need to use this functionality it must be installed."
+     )
+
+
+ def unq_columns(df1: "AnyDataFrame", df2: "AnyDataFrame") -> OrderedSet[str]:
"""Get columns that are unique to df1.
Parameters
@@ -54,7 +62,7 @@ def unq_columns(df1: AnyDataFrame, df2: AnyDataFrame) -> OrderedSet[str]:
return cast(OrderedSet[str], OrderedSet(col1) - OrderedSet(col2))


- def intersect_columns(df1: AnyDataFrame, df2: AnyDataFrame) -> OrderedSet[str]:
+ def intersect_columns(df1: "AnyDataFrame", df2: "AnyDataFrame") -> OrderedSet[str]:
"""Get columns that are shared between the two dataframes.
Parameters
@@ -75,7 +83,7 @@ def intersect_columns(df1: AnyDataFrame, df2: AnyDataFrame) -> OrderedSet[str]:
return OrderedSet(col1) & OrderedSet(col2)


- def all_columns_match(df1: AnyDataFrame, df2: AnyDataFrame) -> bool:
+ def all_columns_match(df1: "AnyDataFrame", df2: "AnyDataFrame") -> bool:
"""Whether the columns all match in the dataframes.
Parameters
@@ -95,8 +103,8 @@ def all_columns_match(df1: AnyDataFrame, df2: AnyDataFrame) -> bool:


def is_match(
-     df1: AnyDataFrame,
-     df2: AnyDataFrame,
+     df1: "AnyDataFrame",
+     df2: "AnyDataFrame",
join_columns: Union[str, List[str]],
abs_tol: float = 0,
rel_tol: float = 0,
@@ -194,8 +202,8 @@ def is_match(


def all_rows_overlap(
-     df1: AnyDataFrame,
-     df2: AnyDataFrame,
+     df1: "AnyDataFrame",
+     df2: "AnyDataFrame",
join_columns: Union[str, List[str]],
abs_tol: float = 0,
rel_tol: float = 0,
@@ -290,8 +298,8 @@ def all_rows_overlap(


def count_matching_rows(
-     df1: AnyDataFrame,
-     df2: AnyDataFrame,
+     df1: "AnyDataFrame",
+     df2: "AnyDataFrame",
join_columns: Union[str, List[str]],
abs_tol: float = 0,
rel_tol: float = 0,
@@ -385,8 +393,8 @@ def count_matching_rows(


def report(
-     df1: AnyDataFrame,
-     df2: AnyDataFrame,
+     df1: "AnyDataFrame",
+     df2: "AnyDataFrame",
join_columns: Union[str, List[str]],
abs_tol: float = 0,
rel_tol: float = 0,
@@ -638,8 +646,8 @@ def _any(col: str) -> int:


def _distributed_compare(
-     df1: AnyDataFrame,
-     df2: AnyDataFrame,
+     df1: "AnyDataFrame",
+     df2: "AnyDataFrame",
join_columns: Union[str, List[str]],
return_obj_func: Callable[[Compare], Any],
abs_tol: float = 0,
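The signature changes in this file only quote the ``AnyDataFrame`` annotations so the module can be imported when Fugue is absent; the public API is unchanged. A minimal sketch of calling these helpers, assuming ``datacompy[fugue]`` is installed and using made-up pandas DataFrames (one of the backends Fugue accepts):

```python
# Sketch under the assumption that the fugue extra is installed;
# the data below is illustrative only.
import pandas as pd

from datacompy.fugue import intersect_columns, is_match

df1 = pd.DataFrame({"id": [1, 2], "value": [1.0, 2.0]})
df2 = pd.DataFrame({"id": [1, 2], "value": [1.0, 2.000001]})

print(intersect_columns(df1, df2))  # columns shared by both frames
print(is_match(df1, df2, join_columns="id", abs_tol=1e-3))  # True within tolerance
```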
61 changes: 61 additions & 0 deletions datacompy/logger.py
@@ -0,0 +1,61 @@
# SPDX-Copyright: Copyright (c) Capital One Services, LLC
# SPDX-License-Identifier: Apache-2.0
# Copyright 2024 Capital One Services, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Logging Module.
Module which sets up the basic logging infrustrcuture for the application.
"""

import logging
import sys

# logger formatting
BRIEF_FORMAT = "%(levelname)s %(asctime)s - %(name)s: %(message)s"
VERBOSE_FORMAT = (
"%(levelname)s|%(asctime)s|%(name)s|%(filename)s|"
"%(funcName)s|%(lineno)d: %(message)s"
)
FORMAT_TO_USE = VERBOSE_FORMAT

# logger levels
DEBUG = logging.DEBUG
INFO = logging.INFO
WARN = logging.WARNING
ERROR = logging.ERROR
CRITICAL = logging.CRITICAL


def get_logger(name=None, log_level=logging.DEBUG):
"""Set the basic logging features for the application.
Parameters
----------
name : str, optional
The name of the logger. Defaults to ``None``
log_level : int, optional
The logging level. Defaults to ``logging.DEBUG``

Returns
-------
logging.Logger
Returns a Logger object configured with the passed-in parameters.
Please see the following for more details:
https://docs.python.org/2/library/logging.html
"""
logging.basicConfig(format=FORMAT_TO_USE, stream=sys.stdout, level=log_level)
logging.captureWarnings(True)
logger = logging.getLogger(name)
return logger
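
The rest of the commit switches each module to this logger. A short sketch of the usage pattern, matching what ``base.py`` and ``core.py`` do above (the log message itself is illustrative):

```python
# Same pattern this commit applies in base.py, core.py, fugue.py and polars.py.
from datacompy.logger import INFO, get_logger

LOG = get_logger(__name__, INFO)
LOG.info("Starting comparison")  # illustrative message
```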
12 changes: 4 additions & 8 deletions datacompy/polars.py
@@ -21,23 +21,19 @@
two dataframes.
"""

- import logging
import os
from copy import deepcopy
from typing import Any, Dict, List, Optional, Union, cast

import numpy as np
+ import polars as pl
from ordered_set import OrderedSet
+ from polars.exceptions import ComputeError, InvalidOperationError

from datacompy.base import BaseCompare, temp_column_name
+ from datacompy.logger import INFO, get_logger

- try:
-     import polars as pl
-     from polars.exceptions import ComputeError, InvalidOperationError
- except ImportError:
-     pass  # Let non-Polars people at least enjoy the loveliness of the pandas datacompy functionality

- LOG = logging.getLogger(__name__)
+ LOG = get_logger(__name__, INFO)

STRING_TYPE = ["String", "Utf8"]
DATE_TYPE = ["Date", "Datetime"]
