-
Notifications
You must be signed in to change notification settings - Fork 22
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Id column converter #63
base: master
Are you sure you want to change the base?
Changes from all commits
7b6ac45
57b095c
410e1c6
be5f265
9186b3f
a23b27c
2a138bc
705053d
56bae87
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
import os | ||
from threading import RLock | ||
from typing import Dict, Optional, Union | ||
from warnings import warn | ||
|
||
import networkx as nx | ||
import numpy as np | ||
|
@@ -34,9 +35,13 @@ def __init__( | |
self._locks: Dict[str, RLock] = {} | ||
if isinstance(source, self.__class__): | ||
self._read = source.get | ||
self._proxy_feed = bool(self._view) | ||
elif isinstance(source, str) and os.path.isdir(source): | ||
self._read = self._read_csv | ||
self._bootstrap(source) | ||
self._proxy_feed = True | ||
# Validate the configuration and raise warning if needed | ||
self._validate_dependencies_conversion() | ||
else: | ||
raise ValueError("Invalid source") | ||
|
||
|
@@ -46,11 +51,15 @@ def get(self, filename: str) -> pd.DataFrame: | |
df = self._cache.get(filename) | ||
if df is None: | ||
df = self._read(filename) | ||
df = self._filter(filename, df) | ||
df = self._prune(filename, df) | ||
self._convert_types(filename, df) | ||
df = df.reset_index(drop=True) | ||
df = self._transform(filename, df) | ||
if self._proxy_feed: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm not sure the choice should be to filter+prune OR convert+transform. Can you tell me a bit about how you are thinking about this behavior? I will need to think through the logic. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As I see it, for each table you filter you create a feed, and each feed is the source of the next one. Except for the last layer, the feeds (the proxy feeds) are only responsible for filtering the table according to the filter and the already-filtered tables (pruning). That's why, for those proxy feeds, you only need to filter and prune; before my change you did that by removing the transform and convert data from the configuration. Tell me if I missed something. |
||
# files feed responsible for file access | ||
df = self._filter(filename, df) | ||
df = self._prune(filename, df) | ||
df = df.reset_index(drop=True) | ||
else: | ||
# proxy feed responsible for data conversion | ||
self._convert_types(filename, df) | ||
df = self._transform(filename, df) | ||
self.set(filename, df) | ||
return self._cache[filename] | ||
|
||
|
@@ -95,7 +104,7 @@ def _read_csv(self, filename: str) -> pd.DataFrame: | |
# DataFrame containing any required columns. | ||
return empty_df(columns) | ||
|
||
# If the file isn't in the zip, return an empty DataFrame. | ||
# Read file encoding | ||
with open(path, "rb") as f: | ||
encoding = detect_encoding(f) | ||
|
||
|
@@ -121,7 +130,6 @@ def _filter(self, filename: str, df: pd.DataFrame) -> pd.DataFrame: | |
# If applicable, filter this dataframe by the given set of values | ||
if col in df.columns: | ||
df = df[df[col].isin(setwrap(values))] | ||
|
||
return df | ||
|
||
def _prune(self, filename: str, df: pd.DataFrame) -> pd.DataFrame: | ||
|
@@ -147,10 +155,44 @@ def _prune(self, filename: str, df: pd.DataFrame) -> pd.DataFrame: | |
depcol = deps[depfile] | ||
# If applicable, prune this dataframe by the other | ||
if col in df.columns and depcol in depdf.columns: | ||
df = df[df[col].isin(depdf[depcol])] | ||
converter = self._get_convert_function(filename, col) | ||
# Convert the column before pruning since depdf is already converted | ||
col_series = converter(df[col]) if converter else df[col] | ||
df = df[col_series.isin(depdf[depcol])] | ||
|
||
return df | ||
|
||
def _get_convert_function(self, filename, colname): | ||
"""return the convert function from the config | ||
for a specific file and column""" | ||
return self._config.nodes.get(filename, {}).get("converters", {}).get(colname) | ||
|
||
def _validate_dependencies_conversion(self): | ||
"""Validate that dependent columns in different files | ||
has the same convert function if one exist. | ||
""" | ||
|
||
def check_column_pair(column_pair: dict) -> bool: | ||
assert len(column_pair) == 2 | ||
convert_funcs = [ | ||
self._get_convert_function(filename, colname) | ||
for filename, colname in column_pair.items() | ||
] | ||
if convert_funcs[0] != convert_funcs[1]: | ||
return False | ||
return True | ||
|
||
for file_a, file_b, data in self._config.edges(data=True): | ||
dependencies = data.get("dependencies", []) | ||
for column_pair in dependencies: | ||
if check_column_pair(column_pair): | ||
continue | ||
warn( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why produce a warning here as opposed to raising an exception? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I thought that the mismatch might be intentional, for example int8 and int16, or something like that. |
||
f"Converters Mismatch: column `{column_pair[file_a]}` in {file_a} " | ||
f"is dependant on column `{column_pair[file_b]}` in {file_b} " | ||
f"but converted with different functions, which might cause merging problems." | ||
) | ||
|
||
def _convert_types(self, filename: str, df: pd.DataFrame) -> None: | ||
""" | ||
Apply type conversions | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would prefer the value of
_proxy_feed
not to depend on whether the feed is initialized from a path or another feed object. Is that possible? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it could probably just be
bool(self.view)
outside of the if block. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I tried that and it didn't work.
Would you prefer passing proxy as a parameter to feed.__init__?