From 7b6ac454417c0ba3f43d0a9813c4429df689c2df Mon Sep 17 00:00:00 2001
From: TheFrok
Date: Fri, 14 Feb 2020 12:10:55 +0200
Subject: [PATCH 1/9] Added a function to detect problematic converters, and
 converts the dependent column before pruning

---
 partridge/gtfs.py | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/partridge/gtfs.py b/partridge/gtfs.py
index 62b91c5..eed3f39 100644
--- a/partridge/gtfs.py
+++ b/partridge/gtfs.py
@@ -1,6 +1,7 @@
 import os
 from threading import RLock
 from typing import Dict, Optional, Union
+from warnings import warn

 import networkx as nx
 import numpy as np
@@ -26,6 +27,7 @@ def __init__(
         config: Optional[nx.DiGraph] = None,
     ):
         self._config: nx.DiGraph = default_config() if config is None else config
+        self._validate_dependencies_conversion()
         self._view: View = {} if view is None else view
         self._cache: Dict[str, pd.DataFrame] = {}
         self._pathmap: Dict[str, str] = {}
@@ -147,10 +149,35 @@ def _prune(self, filename: str, df: pd.DataFrame) -> pd.DataFrame:
             depcol = deps[depfile]

             # If applicable, prune this dataframe by the other
             if col in df.columns and depcol in depdf.columns:
-                df = df[df[col].isin(depdf[depcol])]
+                # Check converters
+                converter = self._config.nodes.get(filename, {}).get("converters", {}).get(col)
+                col_series = converter(df[col]) if converter else df[col]
+                df = df[col_series.isin(depdf[depcol])]

         return df

+    def _validate_dependencies_conversion(self):
+        def check_column_pair(column_pair: dict) -> bool:
+            assert len(column_pair) == 2
+            convert_funcs = []
+            for filename, colname in column_pair.items():
+                converter = self._config.nodes.get(filename, {}).get("converters", {}).get(colname)
+                convert_funcs.append(converter)
+            if convert_funcs[0] != convert_funcs[1]:
+                return False
+            return True
+
+        for file_a, file_b, data in self._config.edges(data=True):
+            deps = data.get("dependencies")
+            if not deps:
+                continue
+            for column_pair in deps:
+                if check_column_pair(column_pair):
+                    continue
+                warn(f"Converter Mismatch: column {file_a}.{column_pair[file_a]} "
+                     f"is dependant on column {file_b}.{column_pair[file_b]} "
+                     f"but converted with different functions, which might cause merging problems.")
+
     def _convert_types(self, filename: str, df: pd.DataFrame) -> None:
         """
         Apply type conversions

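The failure mode this first patch guards against is easiest to see outside partridge. Below is a minimal, standalone pandas sketch (the column values are invented for illustration): when only one side of a file dependency has been converted, `isin` finds no matches and pruning silently empties the dependent table, which is why `_prune` now converts the column first.

    import pandas as pd

    routes = pd.DataFrame({"agency_id": ["1", "2"]})   # raw strings as read from CSV
    agency = pd.DataFrame({"agency_id": [1, 2]})       # already converted to numbers

    # Without converting first, nothing matches and every row is pruned away.
    assert routes[routes["agency_id"].isin(agency["agency_id"])].empty

    # Converting with the same function before comparing restores the match,
    # which is what the patched _prune does when a converter is configured.
    converted = pd.to_numeric(routes["agency_id"])
    assert len(routes[converted.isin(agency["agency_id"])]) == 2
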
From 57b095c7ec843e322cdae7496398c20988566709 Mon Sep 17 00:00:00 2001
From: TheFrok
Date: Fri, 14 Feb 2020 12:40:29 +0200
Subject: [PATCH 2/9] Added a test and fixed the warning message

---
 partridge/gtfs.py  |  4 ++--
 tests/test_feed.py | 17 +++++++++++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/partridge/gtfs.py b/partridge/gtfs.py
index eed3f39..3ae255f 100644
--- a/partridge/gtfs.py
+++ b/partridge/gtfs.py
@@ -174,8 +174,8 @@ def check_column_pair(column_pair: dict) -> bool:
             for column_pair in deps:
                 if check_column_pair(column_pair):
                     continue
-                warn(f"Converter Mismatch: column {file_a}.{column_pair[file_a]} "
-                     f"is dependant on column {file_b}.{column_pair[file_b]} "
+                warn(f"Converters Mismatch: column `{column_pair[file_a]}` in {file_a} "
+                     f"is dependant on column `{column_pair[file_b]}` in {file_b} "
                      f"but converted with different functions, which might cause merging problems.")

     def _convert_types(self, filename: str, df: pd.DataFrame) -> None:
diff --git a/tests/test_feed.py b/tests/test_feed.py
index a69a01f..487af43 100644
--- a/tests/test_feed.py
+++ b/tests/test_feed.py
@@ -1,4 +1,6 @@
 import datetime
+import pandas as pd
+
 import pytest

 import partridge as ptg
@@ -225,3 +227,18 @@ def test_filtered_columns(path):

     assert set(feed_full.trips.columns) == set(feed_view.trips.columns)
     assert set(feed_full.trips.columns) == set(feed_null.trips.columns)
+
+
+@pytest.mark.parametrize(
+    "path", [fixture("amazon-2017-08-06")]
+)
+def test_converted_id_column(path):
+    conf = default_config()
+    conf.nodes['routes.txt']['converters']['agency_id'] = pd.to_numeric
+    with pytest.warns(UserWarning, match="Converters Mismatch:"):
+        ptg.load_feed(path, config=conf)
+    conf.nodes['agency.txt']['converters'] = {}
+    conf.nodes['agency.txt']['converters']['agency_id'] = pd.to_numeric
+    feed = ptg.load_feed(path, config=conf)
+    assert len(feed.routes) > 0
+

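From a user's point of view, the way to act on the new warning is to register the same converter on both ends of the dependency. A rough sketch, assuming a local GTFS directory at a placeholder path; the config keys mirror the test above, and `default_config` is assumed to be importable from `partridge.config`:

    import pandas as pd
    import partridge as ptg
    from partridge.config import default_config

    conf = default_config()
    conf.nodes["routes.txt"]["converters"]["agency_id"] = pd.to_numeric
    # agency.txt may not carry a converters dict by default, so create it if needed.
    conf.nodes["agency.txt"].setdefault("converters", {})["agency_id"] = pd.to_numeric

    feed = ptg.load_feed("path/to/gtfs", config=conf)  # no Converters Mismatch warning expected
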
From 410e1c6c9c09a1ca112c222dee65ac433cc529d4 Mon Sep 17 00:00:00 2001
From: TheFrok
Date: Fri, 14 Feb 2020 17:06:28 +0200
Subject: [PATCH 3/9] Black compatible and documentation improvements

---
 partridge/gtfs.py  | 34 +++++++++++++++++++++-------------
 tests/test_feed.py | 11 ++++-------
 2 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/partridge/gtfs.py b/partridge/gtfs.py
index 3ae255f..612aa09 100644
--- a/partridge/gtfs.py
+++ b/partridge/gtfs.py
@@ -27,6 +27,7 @@ def __init__(
         config: Optional[nx.DiGraph] = None,
     ):
         self._config: nx.DiGraph = default_config() if config is None else config
+        # Validate the configuration and raise warning if needed
         self._validate_dependencies_conversion()
         self._view: View = {} if view is None else view
         self._cache: Dict[str, pd.DataFrame] = {}
         self._pathmap: Dict[str, str] = {}
@@ -149,34 +150,41 @@ def _prune(self, filename: str, df: pd.DataFrame) -> pd.DataFrame:
             depcol = deps[depfile]

             # If applicable, prune this dataframe by the other
             if col in df.columns and depcol in depdf.columns:
-                # Check converters
-                converter = self._config.nodes.get(filename, {}).get("converters", {}).get(col)
+                converter = self._get_convert_function(filename, col)
+                # Convert the column before pruning since depdf is already converted
                 col_series = converter(df[col]) if converter else df[col]
                 df = df[col_series.isin(depdf[depcol])]

         return df

+    def _get_convert_function(self, filename, colname):
+        return self._config.nodes.get(filename, {}).get("converters", {}).get(colname)
+
     def _validate_dependencies_conversion(self):
+        """Validate that dependent columns in different files
+        has the same convert function if one exist.
+        """
+
         def check_column_pair(column_pair: dict) -> bool:
             assert len(column_pair) == 2
-            convert_funcs = []
-            for filename, colname in column_pair.items():
-                converter = self._config.nodes.get(filename, {}).get("converters", {}).get(colname)
-                convert_funcs.append(converter)
+            convert_funcs = [
+                self._get_convert_function(filename, colname)
+                for filename, colname in column_pair.items()
+            ]
             if convert_funcs[0] != convert_funcs[1]:
                 return False
             return True

         for file_a, file_b, data in self._config.edges(data=True):
-            deps = data.get("dependencies")
-            if not deps:
-                continue
-            for column_pair in deps:
+            dependencies = data.get("dependencies", [])
+            for column_pair in dependencies:
                 if check_column_pair(column_pair):
                     continue
-                warn(f"Converters Mismatch: column `{column_pair[file_a]}` in {file_a} "
-                     f"is dependant on column `{column_pair[file_b]}` in {file_b} "
-                     f"but converted with different functions, which might cause merging problems.")
+                warn(
+                    f"Converters Mismatch: column `{column_pair[file_a]}` in {file_a} "
+                    f"is dependant on column `{column_pair[file_b]}` in {file_b} "
+                    f"but converted with different functions, which might cause merging problems."
+                )

     def _convert_types(self, filename: str, df: pd.DataFrame) -> None:
         """
diff --git a/tests/test_feed.py b/tests/test_feed.py
index 487af43..7181910 100644
--- a/tests/test_feed.py
+++ b/tests/test_feed.py
@@ -229,16 +229,13 @@ def test_filtered_columns(path):
     assert set(feed_full.trips.columns) == set(feed_null.trips.columns)


-@pytest.mark.parametrize(
-    "path", [fixture("amazon-2017-08-06")]
-)
+@pytest.mark.parametrize("path", [fixture("amazon-2017-08-06")])
 def test_converted_id_column(path):
     conf = default_config()
-    conf.nodes['routes.txt']['converters']['agency_id'] = pd.to_numeric
+    conf.nodes["routes.txt"]["converters"]["agency_id"] = pd.to_numeric
     with pytest.warns(UserWarning, match="Converters Mismatch:"):
         ptg.load_feed(path, config=conf)
-    conf.nodes['agency.txt']['converters'] = {}
-    conf.nodes['agency.txt']['converters']['agency_id'] = pd.to_numeric
+    conf.nodes["agency.txt"]["converters"] = {}
+    conf.nodes["agency.txt"]["converters"]["agency_id"] = pd.to_numeric
     feed = ptg.load_feed(path, config=conf)
     assert len(feed.routes) > 0
-

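For readers who have not looked at the config graph: `_validate_dependencies_conversion` walks edges whose `dependencies` attribute is a list of two-entry dicts mapping each file name to the column that joins them. The toy graph below only illustrates that shape; it is not a dump of the real `default_config()`.

    import networkx as nx

    config = nx.DiGraph()
    config.add_edge(
        "routes.txt",
        "agency.txt",
        dependencies=[{"routes.txt": "agency_id", "agency.txt": "agency_id"}],
    )

    # Same traversal pattern as the validator: every pair names one column per file.
    for file_a, file_b, data in config.edges(data=True):
        for column_pair in data.get("dependencies", []):
            print(f"{file_a}.{column_pair[file_a]} <-> {file_b}.{column_pair[file_b]}")
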
From be5f2659d72d9ed10cdd3f7c61758d4270a7280d Mon Sep 17 00:00:00 2001
From: TheFrok
Date: Sun, 16 Feb 2020 16:46:07 +0200
Subject: [PATCH 4/9] left the converters attribute in the dummy feed in
 `_load_feed` since after converting one column for `trips.txt` for example
 comparing other columns to that would be impossible without access to their
 convert configuration

---
 partridge/gtfs.py    | 4 +++-
 partridge/readers.py | 2 +-
 tests/test_feed.py   | 9 ++++++---
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/partridge/gtfs.py b/partridge/gtfs.py
index 612aa09..4627094 100644
--- a/partridge/gtfs.py
+++ b/partridge/gtfs.py
@@ -98,7 +98,7 @@ def _read_csv(self, filename: str) -> pd.DataFrame:
             # DataFrame containing any required columns.
             return empty_df(columns)

-        # If the file isn't in the zip, return an empty DataFrame.
+        # Read file encoding
         with open(path, "rb") as f:
             encoding = detect_encoding(f)

@@ -158,6 +158,8 @@ def _prune(self, filename: str, df: pd.DataFrame) -> pd.DataFrame:
         return df

     def _get_convert_function(self, filename, colname):
+        """return the convert function from the config
+        for a specific file and column"""
         return self._config.nodes.get(filename, {}).get("converters", {}).get(colname)

     def _validate_dependencies_conversion(self):
diff --git a/partridge/readers.py b/partridge/readers.py
index 1fdcb4e..600a1ab 100644
--- a/partridge/readers.py
+++ b/partridge/readers.py
@@ -105,7 +105,7 @@ def finalize() -> None:

 def _load_feed(path: str, view: View, config: nx.DiGraph) -> Feed:
     """Multi-file feed filtering"""
-    config_ = remove_node_attributes(config, ["converters", "transformations"])
+    config_ = remove_node_attributes(config, ["transformations"])
     feed_ = Feed(path, view={}, config=config_)
     for filename, column_filters in view.items():
         config_ = reroot_graph(config_, filename)
diff --git a/tests/test_feed.py b/tests/test_feed.py
index 7181910..bbd8dd8 100644
--- a/tests/test_feed.py
+++ b/tests/test_feed.py
@@ -232,10 +232,13 @@ def test_filtered_columns(path):
 @pytest.mark.parametrize("path", [fixture("amazon-2017-08-06")])
 def test_converted_id_column(path):
     conf = default_config()
-    conf.nodes["routes.txt"]["converters"]["agency_id"] = pd.to_numeric
+    conf.nodes["routes.txt"]["converters"]["route_id"] = pd.to_numeric
     with pytest.warns(UserWarning, match="Converters Mismatch:"):
         ptg.load_feed(path, config=conf)
-    conf.nodes["agency.txt"]["converters"] = {}
-    conf.nodes["agency.txt"]["converters"]["agency_id"] = pd.to_numeric
+    conf.nodes["trips.txt"]["converters"]["route_id"] = pd.to_numeric
+    # Just to prevent another warning
+    conf.nodes["fare_rules.txt"]["converters"] = {}
+    conf.nodes["fare_rules.txt"]["converters"]["route_id"] = pd.to_numeric
     feed = ptg.load_feed(path, config=conf)
+    assert len(feed.trips) > 0
     assert len(feed.routes) > 0

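To make the commit message concrete, this is roughly the chain `_load_feed` builds after this patch, spelled out with a placeholder path and an example view. The intermediate feeds now keep their `converters` (only `transformations` are stripped), so `_prune` inside the inner feeds can convert a column with the same function as the dependency it is compared against.

    from partridge.config import default_config, reroot_graph
    from partridge.gtfs import Feed
    from partridge.utilities import remove_node_attributes

    path = "path/to/gtfs"                         # placeholder directory
    view = {"trips.txt": {"route_id": "1"}}       # example view
    config = default_config()

    config_ = remove_node_attributes(config, ["transformations"])
    feed_ = Feed(path, view={}, config=config_)   # reads the CSV files
    for filename, column_filters in view.items():
        config_ = reroot_graph(config_, filename)
        feed_ = Feed(feed_, view={filename: column_filters}, config=config_)  # filters and prunes
    feed = Feed(feed_, config=config)             # final feed handed back to the caller
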
From 9186b3f786c8f5f488f8bcf352e4681be3cf9225 Mon Sep 17 00:00:00 2001
From: TheFrok
Date: Sun, 16 Feb 2020 17:13:03 +0200
Subject: [PATCH 5/9] deleted `config_` var in `_load_feed` and replaced it
 with a `is_dummy` attribute in the Feed class that prevent transforms and
 dtype_conversion

---
 partridge/gtfs.py    | 7 +++++--
 partridge/readers.py | 7 +++----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/partridge/gtfs.py b/partridge/gtfs.py
index 4627094..14b4e57 100644
--- a/partridge/gtfs.py
+++ b/partridge/gtfs.py
@@ -25,10 +25,12 @@ def __init__(
         source: Union[str, "Feed"],
         view: Optional[View] = None,
         config: Optional[nx.DiGraph] = None,
+        is_dummy: Optional[bool] = False,
     ):
         self._config: nx.DiGraph = default_config() if config is None else config
         # Validate the configuration and raise warning if needed
         self._validate_dependencies_conversion()
+        self._is_dummy = is_dummy
         self._view: View = {} if view is None else view
         self._cache: Dict[str, pd.DataFrame] = {}
         self._pathmap: Dict[str, str] = {}
@@ -51,9 +53,10 @@ def get(self, filename: str) -> pd.DataFrame:
             df = self._read(filename)
             df = self._filter(filename, df)
             df = self._prune(filename, df)
-            self._convert_types(filename, df)
             df = df.reset_index(drop=True)
-            df = self._transform(filename, df)
+            if not self._is_dummy:
+                self._convert_types(filename, df)
+                df = self._transform(filename, df)
             self.set(filename, df)
         return self._cache[filename]

diff --git a/partridge/readers.py b/partridge/readers.py
index 600a1ab..99affc9 100644
--- a/partridge/readers.py
+++ b/partridge/readers.py
@@ -105,12 +105,11 @@ def finalize() -> None:

 def _load_feed(path: str, view: View, config: nx.DiGraph) -> Feed:
     """Multi-file feed filtering"""
-    config_ = remove_node_attributes(config, ["transformations"])
-    feed_ = Feed(path, view={}, config=config_)
+    feed_ = Feed(path, view={}, config=config, is_dummy=True)
     for filename, column_filters in view.items():
-        config_ = reroot_graph(config_, filename)
+        config_ = reroot_graph(config, filename)
         view_ = {filename: column_filters}
-        feed_ = Feed(feed_, view=view_, config=config_)
+        feed_ = Feed(feed_, view=view_, config=config_, is_dummy=True)
     return Feed(feed_, config=config)

From a23b27cc134ab1320ee06aeb0c41fc076af28ca4 Mon Sep 17 00:00:00 2001
From: TheFrok
Date: Tue, 18 Feb 2020 10:01:58 +0200
Subject: [PATCH 6/9] replaced `dummy` parameter with auto calculated `proxy`
 attribute

---
 partridge/gtfs.py    | 16 +++++++++-------
 partridge/readers.py |  4 ++--
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/partridge/gtfs.py b/partridge/gtfs.py
index 14b4e57..c105acf 100644
--- a/partridge/gtfs.py
+++ b/partridge/gtfs.py
@@ -25,12 +25,10 @@ def __init__(
         source: Union[str, "Feed"],
         view: Optional[View] = None,
         config: Optional[nx.DiGraph] = None,
-        is_dummy: Optional[bool] = False,
     ):
         self._config: nx.DiGraph = default_config() if config is None else config
         # Validate the configuration and raise warning if needed
         self._validate_dependencies_conversion()
-        self._is_dummy = is_dummy
         self._view: View = {} if view is None else view
         self._cache: Dict[str, pd.DataFrame] = {}
         self._pathmap: Dict[str, str] = {}
@@ -39,9 +37,11 @@ def __init__(
         self._locks: Dict[str, RLock] = {}
         if isinstance(source, self.__class__):
             self._read = source.get
+            self._proxy_feed = bool(self._view)
         elif isinstance(source, str) and os.path.isdir(source):
             self._read = self._read_csv
             self._bootstrap(source)
+            self._proxy_feed = True
         else:
             raise ValueError("Invalid source")

@@ -51,10 +51,13 @@ def get(self, filename: str) -> pd.DataFrame:
         df = self._cache.get(filename)
         if df is None:
             df = self._read(filename)
-            df = self._filter(filename, df)
-            df = self._prune(filename, df)
-            df = df.reset_index(drop=True)
-            if not self._is_dummy:
+            if self._proxy_feed:
+                # files feed responsible for file access
+                df = self._filter(filename, df)
+                df = self._prune(filename, df)
+                df = df.reset_index(drop=True)
+            else:
+                # proxy feed responsible for data conversion
                 self._convert_types(filename, df)
                 df = self._transform(filename, df)
             self.set(filename, df)
         return self._cache[filename]
@@ -127,7 +130,6 @@ def _filter(self, filename: str, df: pd.DataFrame) -> pd.DataFrame:
             # If applicable, filter this dataframe by the given set of values
             if col in df.columns:
                 df = df[df[col].isin(setwrap(values))]
-
         return df

     def _prune(self, filename: str, df: pd.DataFrame) -> pd.DataFrame:
diff --git a/partridge/readers.py b/partridge/readers.py
index 99affc9..66c1427 100644
--- a/partridge/readers.py
+++ b/partridge/readers.py
@@ -105,11 +105,11 @@ def finalize() -> None:

 def _load_feed(path: str, view: View, config: nx.DiGraph) -> Feed:
     """Multi-file feed filtering"""
-    feed_ = Feed(path, view={}, config=config, is_dummy=True)
+    feed_ = Feed(path, view={}, config=config)
     for filename, column_filters in view.items():
         config_ = reroot_graph(config, filename)
         view_ = {filename: column_filters}
-        feed_ = Feed(feed_, view=view_, config=config_, is_dummy=True)
+        feed_ = Feed(feed_, view=view_, config=config_)
     return Feed(feed_, config=config)

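After these two patches, the split of responsibilities can be summarised as follows (a sketch with a placeholder path; normally `ptg.load_feed` builds this chain for you). Per the diff, a directory-backed feed, or a feed-backed feed with a non-empty view, becomes a proxy that only reads, filters and prunes, while a feed-backed feed with an empty view performs the dtype conversion and transformations.

    from partridge.config import default_config
    from partridge.gtfs import Feed

    inner = Feed("path/to/gtfs", config=default_config())  # directory source -> proxy: read/filter/prune
    outer = Feed(inner, config=default_config())           # empty view -> converts dtypes, runs transformations

    trips = outer.get("trips.txt")  # fully converted DataFrame
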
From 2a138bc3e1c200874f49bd13553ee98e0b8092fc Mon Sep 17 00:00:00 2001
From: TheFrok
Date: Tue, 18 Feb 2020 10:09:44 +0200
Subject: [PATCH 7/9] moved config method to run only once

---
 partridge/gtfs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/partridge/gtfs.py b/partridge/gtfs.py
index c105acf..16ed0f5 100644
--- a/partridge/gtfs.py
+++ b/partridge/gtfs.py
@@ -27,8 +27,6 @@ def __init__(
         config: Optional[nx.DiGraph] = None,
     ):
         self._config: nx.DiGraph = default_config() if config is None else config
-        # Validate the configuration and raise warning if needed
-        self._validate_dependencies_conversion()
         self._view: View = {} if view is None else view
         self._cache: Dict[str, pd.DataFrame] = {}
         self._pathmap: Dict[str, str] = {}
@@ -42,6 +40,8 @@ def __init__(
             self._read = self._read_csv
             self._bootstrap(source)
             self._proxy_feed = True
+            # Validate the configuration and raise warning if needed
+            self._validate_dependencies_conversion()
         else:
             raise ValueError("Invalid source")

From 705053d79e1fe3d4f620ed70f3dbf16b1def0c5f Mon Sep 17 00:00:00 2001
From: TheFrok
Date: Tue, 18 Feb 2020 14:27:46 +0200
Subject: [PATCH 8/9] removed unused import

---
 partridge/readers.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/partridge/readers.py b/partridge/readers.py
index 66c1427..e573301 100644
--- a/partridge/readers.py
+++ b/partridge/readers.py
@@ -1,20 +1,18 @@
-from collections import defaultdict
 import datetime
 import os
 import shutil
 import tempfile
-from typing import DefaultDict, Dict, FrozenSet, Optional, Set, Tuple
 import weakref
+from collections import defaultdict
+from typing import DefaultDict, Dict, FrozenSet, Optional, Set, Tuple

-from isoweek import Week
 import networkx as nx
+from isoweek import Week

 from .config import default_config, geo_config, empty_config, reroot_graph
 from .gtfs import Feed
 from .parsers import vparse_date
 from .types import View
-from .utilities import remove_node_attributes
-

 DAY_NAMES = (
     "monday",

From 56bae87b6b9862e8954058680dc0e2c178aa5959 Mon Sep 17 00:00:00 2001
From: TheFrok
Date: Fri, 6 Mar 2020 17:06:53 +0200
Subject: [PATCH 9/9] fixed import order

---
 partridge/readers.py | 6 +++---
 tests/test_feed.py   | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/partridge/readers.py b/partridge/readers.py
index e573301..54bf924 100644
--- a/partridge/readers.py
+++ b/partridge/readers.py
@@ -1,13 +1,13 @@
+from collections import defaultdict
 import datetime
 import os
 import shutil
 import tempfile
-import weakref
-from collections import defaultdict
 from typing import DefaultDict, Dict, FrozenSet, Optional, Set, Tuple
+import weakref

-import networkx as nx
 from isoweek import Week
+import networkx as nx

 from .config import default_config, geo_config, empty_config, reroot_graph
 from .gtfs import Feed
diff --git a/tests/test_feed.py b/tests/test_feed.py
index bbd8dd8..a0b06fd 100644
--- a/tests/test_feed.py
+++ b/tests/test_feed.py
@@ -1,6 +1,6 @@
 import datetime
-import pandas as pd

+import pandas as pd
 import pytest

 import partridge as ptg

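Taken together, the series allows a configuration like the one below, mirroring the final version of the test (the feed path is a placeholder): the shared `route_id` column is converted with the same function in every file that participates in a dependency, so no `Converters Mismatch` warning is emitted and pruning still matches rows.

    import pandas as pd
    import partridge as ptg
    from partridge.config import default_config

    conf = default_config()
    conf.nodes["routes.txt"]["converters"]["route_id"] = pd.to_numeric
    conf.nodes["trips.txt"]["converters"]["route_id"] = pd.to_numeric
    # fare_rules.txt carries no converters dict by default in the test, so create one.
    conf.nodes["fare_rules.txt"].setdefault("converters", {})["route_id"] = pd.to_numeric

    feed = ptg.load_feed("path/to/gtfs", config=conf)
    assert len(feed.trips) > 0
    assert len(feed.routes) > 0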