
Merge pull request #362 from capitalone/develop
Release v0.15.0
fdosani authored Dec 17, 2024
2 parents d6f5893 + f00d0bd commit 6b6affc
Showing 19 changed files with 251 additions and 159 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test-package.yml
@@ -134,7 +134,7 @@ jobs:
- name: Install datacompy
run: |
python -m pip install --upgrade pip
- python -m pip install .[tests,duckdb,polars,dask,ray]
+ python -m pip install .[tests,fugue]
- name: Test with pytest
run: |
python -m pytest tests/ --ignore=tests/test_snowflake.py
6 changes: 2 additions & 4 deletions README.md
@@ -31,9 +31,7 @@ If you would like to use Spark or any other backends please make sure you instal

```shell
pip install datacompy[spark]
- pip install datacompy[dask]
- pip install datacompy[duckdb]
- pip install datacompy[ray]
+ pip install datacompy[fugue]
pip install datacompy[snowflake]

```
@@ -45,7 +43,7 @@ Pandas on Spark implementation. The original ``SparkCompare`` implementation dif
from all the other native implementations. To align the API better, and keep behaviour
consistent we are deprecating the original ``SparkCompare`` into a new module ``LegacySparkCompare``

- Subsequently in ``v0.13.0`` a PySaprk DataFrame class has been introduced (``SparkSQLCompare``)
+ Subsequently in ``v0.13.0`` a PySpark DataFrame class has been introduced (``SparkSQLCompare``)
which accepts ``pyspark.sql.DataFrame`` and should provide better performance. With this version
the Pandas on Spark implementation has been renamed to ``SparkPandasCompare`` and all the spark
logic is now under the ``spark`` submodule.
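
To make the README change above concrete, here is a minimal usage sketch of the newer ``SparkSQLCompare`` class. The example DataFrames and the ``join_columns`` value are illustrative assumptions, not part of this commit, and it presumes ``datacompy[spark]`` is installed and that the class is exported at the package top level as the README describes:

```python
# Hedged sketch: assumes datacompy[spark] is installed and SparkSQLCompare
# is importable from the top-level package.
from pyspark.sql import SparkSession

import datacompy

spark = SparkSession.builder.getOrCreate()

# Illustrative data, not from this commit.
df1 = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
df2 = spark.createDataFrame([(1, "a"), (2, "c")], ["id", "value"])

# SparkSQLCompare accepts pyspark.sql.DataFrame objects directly.
compare = datacompy.SparkSQLCompare(spark, df1, df2, join_columns="id")
print(compare.report())
```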
2 changes: 1 addition & 1 deletion datacompy/__init__.py
@@ -18,7 +18,7 @@
Then extended to carry that functionality over to Spark Dataframes.
"""

__version__ = "0.14.4"
__version__ = "0.15.0"

import platform
from warnings import warn
7 changes: 4 additions & 3 deletions datacompy/base.py
@@ -21,13 +21,14 @@
two dataframes.
"""

- import logging
from abc import ABC, abstractmethod
from typing import Any, Optional

from ordered_set import OrderedSet

- LOG = logging.getLogger(__name__)
+ from datacompy.logger import INFO, get_logger
+
+ LOG = get_logger(__name__, INFO)


class BaseCompare(ABC):
@@ -118,7 +119,7 @@ def all_rows_overlap(self) -> bool:

@abstractmethod
def count_matching_rows(self) -> int:
"""Count the number of matchin grows."""
"""Count the number of matching rows."""
pass

@abstractmethod
4 changes: 2 additions & 2 deletions datacompy/core.py
@@ -21,7 +21,6 @@
two dataframes.
"""

- import logging
import os
from typing import Any, Dict, List, Optional, Union, cast

@@ -30,8 +29,9 @@
from ordered_set import OrderedSet

from datacompy.base import BaseCompare, temp_column_name
+ from datacompy.logger import INFO, get_logger

- LOG = logging.getLogger(__name__)
+ LOG = get_logger(__name__, INFO)


class Compare(BaseCompare):
46 changes: 27 additions & 19 deletions datacompy/fugue.py
@@ -15,25 +15,33 @@

"""Compare two DataFrames that are supported by Fugue."""

- import logging
import pickle
from collections import defaultdict
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast

- import fugue.api as fa
import pandas as pd
- import pyarrow as pa
- from fugue import AnyDataFrame
from ordered_set import OrderedSet
- from triad import Schema

from datacompy.core import Compare, render
+ from datacompy.logger import INFO, get_logger

- LOG = logging.getLogger(__name__)
+ LOG = get_logger(__name__, INFO)
HASH_COL = "__datacompy__hash__"


- def unq_columns(df1: AnyDataFrame, df2: AnyDataFrame) -> OrderedSet[str]:
+ try:
+     import fugue.api as fa
+     import pyarrow as pa
+     from fugue import AnyDataFrame
+     from triad import Schema
+ except ImportError:
+     LOG.warning(
+         "Please note that you are missing the optional dependency: fugue. "
+         "If you need to use this functionality it must be installed."
+     )
+
+
+ def unq_columns(df1: "AnyDataFrame", df2: "AnyDataFrame") -> OrderedSet[str]:
"""Get columns that are unique to df1.
Parameters
@@ -54,7 +62,7 @@ def unq_columns(df1: AnyDataFrame, df2: AnyDataFrame) -> OrderedSet[str]:
return cast(OrderedSet[str], OrderedSet(col1) - OrderedSet(col2))


- def intersect_columns(df1: AnyDataFrame, df2: AnyDataFrame) -> OrderedSet[str]:
+ def intersect_columns(df1: "AnyDataFrame", df2: "AnyDataFrame") -> OrderedSet[str]:
"""Get columns that are shared between the two dataframes.
Parameters
@@ -75,7 +83,7 @@ def intersect_columns(df1: AnyDataFrame, df2: AnyDataFrame) -> OrderedSet[str]:
return OrderedSet(col1) & OrderedSet(col2)


- def all_columns_match(df1: AnyDataFrame, df2: AnyDataFrame) -> bool:
+ def all_columns_match(df1: "AnyDataFrame", df2: "AnyDataFrame") -> bool:
"""Whether the columns all match in the dataframes.
Parameters
@@ -95,8 +103,8 @@ def all_columns_match(df1: AnyDataFrame, df2: AnyDataFrame) -> bool:


def is_match(
-     df1: AnyDataFrame,
-     df2: AnyDataFrame,
+     df1: "AnyDataFrame",
+     df2: "AnyDataFrame",
join_columns: Union[str, List[str]],
abs_tol: float = 0,
rel_tol: float = 0,
@@ -194,8 +202,8 @@ def is_match(


def all_rows_overlap(
-     df1: AnyDataFrame,
-     df2: AnyDataFrame,
+     df1: "AnyDataFrame",
+     df2: "AnyDataFrame",
join_columns: Union[str, List[str]],
abs_tol: float = 0,
rel_tol: float = 0,
@@ -290,8 +298,8 @@ def all_rows_overlap(


def count_matching_rows(
-     df1: AnyDataFrame,
-     df2: AnyDataFrame,
+     df1: "AnyDataFrame",
+     df2: "AnyDataFrame",
join_columns: Union[str, List[str]],
abs_tol: float = 0,
rel_tol: float = 0,
@@ -385,8 +393,8 @@ def count_matching_rows(


def report(
-     df1: AnyDataFrame,
-     df2: AnyDataFrame,
+     df1: "AnyDataFrame",
+     df2: "AnyDataFrame",
join_columns: Union[str, List[str]],
abs_tol: float = 0,
rel_tol: float = 0,
@@ -638,8 +646,8 @@ def _any(col: str) -> int:


def _distributed_compare(
-     df1: AnyDataFrame,
-     df2: AnyDataFrame,
+     df1: "AnyDataFrame",
+     df2: "AnyDataFrame",
join_columns: Union[str, List[str]],
return_obj_func: Callable[[Compare], Any],
abs_tol: float = 0,
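The signature changes in this file only quote the ``AnyDataFrame`` annotations so the module can be imported when Fugue is absent; the public API is unchanged. A minimal sketch of calling these helpers, assuming ``datacompy[fugue]`` is installed and using made-up pandas DataFrames (one of the backends Fugue accepts):

```python
# Sketch under the assumption that the fugue extra is installed;
# the data below is illustrative only.
import pandas as pd

from datacompy.fugue import intersect_columns, is_match

df1 = pd.DataFrame({"id": [1, 2], "value": [1.0, 2.0]})
df2 = pd.DataFrame({"id": [1, 2], "value": [1.0, 2.000001]})

print(intersect_columns(df1, df2))  # columns shared by both frames
print(is_match(df1, df2, join_columns="id", abs_tol=1e-3))  # True within tolerance
```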
61 changes: 61 additions & 0 deletions datacompy/logger.py
@@ -0,0 +1,61 @@
# SPDX-Copyright: Copyright (c) Capital One Services, LLC
# SPDX-License-Identifier: Apache-2.0
# Copyright 2024 Capital One Services, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Logging Module.
Module which sets up the basic logging infrustrcuture for the application.
"""

import logging
import sys

# logger formatting
BRIEF_FORMAT = "%(levelname)s %(asctime)s - %(name)s: %(message)s"
VERBOSE_FORMAT = (
"%(levelname)s|%(asctime)s|%(name)s|%(filename)s|"
"%(funcName)s|%(lineno)d: %(message)s"
)
FORMAT_TO_USE = VERBOSE_FORMAT

# logger levels
DEBUG = logging.DEBUG
INFO = logging.INFO
WARN = logging.WARNING
ERROR = logging.ERROR
CRITICAL = logging.CRITICAL


def get_logger(name=None, log_level=logging.DEBUG):
"""Set the basic logging features for the application.
Parameters
----------
name : str, optional
The name of the logger. Defaults to ``None``
log_level : int, optional
The logging level. Defaults to ``logging.DEBUG``

Returns
-------
logging.Logger
Returns a Logger object configured with the passed-in parameters.
Please see the following for more details:
https://docs.python.org/2/library/logging.html
"""
logging.basicConfig(format=FORMAT_TO_USE, stream=sys.stdout, level=log_level)
logging.captureWarnings(True)
logger = logging.getLogger(name)
return logger
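
The rest of the commit switches each module to this logger. A short sketch of the usage pattern, matching what ``base.py`` and ``core.py`` do above (the log message itself is illustrative):

```python
# Same pattern this commit applies in base.py, core.py, fugue.py and polars.py.
from datacompy.logger import INFO, get_logger

LOG = get_logger(__name__, INFO)
LOG.info("Starting comparison")  # illustrative message
```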
12 changes: 4 additions & 8 deletions datacompy/polars.py
@@ -21,23 +21,19 @@
two dataframes.
"""

- import logging
import os
from copy import deepcopy
from typing import Any, Dict, List, Optional, Union, cast

import numpy as np
+ import polars as pl
from ordered_set import OrderedSet
+ from polars.exceptions import ComputeError, InvalidOperationError

from datacompy.base import BaseCompare, temp_column_name
+ from datacompy.logger import INFO, get_logger

- try:
-     import polars as pl
-     from polars.exceptions import ComputeError, InvalidOperationError
- except ImportError:
-     pass  # Let non-Polars people at least enjoy the loveliness of the pandas datacompy functionality

- LOG = logging.getLogger(__name__)
+ LOG = get_logger(__name__, INFO)

STRING_TYPE = ["String", "Utf8"]
DATE_TYPE = ["Date", "Datetime"]
