added static typing to data_utils.py #662

Merged
merged 28 commits on Oct 18, 2022
Changes from 18 commits

Commits (28)
84b8fc4
added static typing to data_utils.py
Sanketh7 Sep 20, 2022
757076b
changed re.Pattern type annotation to typing.Pattern
Sanketh7 Sep 26, 2022
8a652e7
removed logging import
Sanketh7 Sep 26, 2022
23bb6ac
changed castings to if statement
Sanketh7 Sep 27, 2022
bc44f0d
fixed formatting with black
Sanketh7 Sep 27, 2022
cc1b2d0
Merge branch 'main' into data_utils_typing_new
micdavis Sep 28, 2022
ff82975
Merge branch 'main' into data_utils_typing_new
Sanketh7 Oct 3, 2022
9bd01a4
changed isinstance to cast
Sanketh7 Oct 3, 2022
46a6712
updated read_csv_df() type signature to include None's
Sanketh7 Oct 3, 2022
99ea3e1
changed isinstance to cast in read_json_df
Sanketh7 Oct 4, 2022
48c192a
removed is_buf_wrapped in favor of isinstance
Sanketh7 Oct 5, 2022
e4d46f5
Merge branch 'main' into data_utils_typing_new
taylorfturner Oct 6, 2022
30381bb
added deleted comment
Sanketh7 Oct 6, 2022
d12d987
added is_buf_wrapped back
Sanketh7 Oct 6, 2022
efe5fde
Merge branch 'main' into data_utils_typing_new
taylorfturner Oct 7, 2022
cdadc48
Merge branch 'main' into data_utils_typing_new
taylorfturner Oct 7, 2022
8f390c3
Merge branch 'main' into data_utils_typing_new
taylorfturner Oct 11, 2022
796dd83
Merge branch 'main' into data_utils_typing_new
taylorfturner Oct 12, 2022
1654351
Merge branch 'main' into data_utils_typing_new
taylorfturner Oct 17, 2022
5ca9305
Merge branch 'main' into data_utils_typing_new
taylorfturner Oct 17, 2022
7d5b8fb
added JSONType
Sanketh7 Oct 18, 2022
7fe1704
fixed formatting
Sanketh7 Oct 18, 2022
e0cf544
added JSONType to read_json()
Sanketh7 Oct 18, 2022
849263b
updated unicode_to_str docstring
Sanketh7 Oct 18, 2022
9485c9d
moved JSONType to _typing.py
Sanketh7 Oct 18, 2022
bb9807f
Merge branch 'main' into data_utils_typing_new
taylorfturner Oct 18, 2022
8de0cef
changed JSONType to be nonrecursive and removed cast on key
Sanketh7 Oct 18, 2022
351462b
fix docstring
taylorfturner Oct 18, 2022
109 changes: 78 additions & 31 deletions dataprofiler/data_readers/data_utils.py
@@ -4,7 +4,19 @@
import urllib
from builtins import next
from collections import OrderedDict
from io import BytesIO, TextIOWrapper
from io import BytesIO, StringIO, TextIOWrapper
from typing import (
Any,
Dict,
Generator,
Iterator,
List,
Optional,
Pattern,
Tuple,
Union,
cast,
)

import dateutil
import pandas as pd
@@ -18,7 +30,7 @@
logger = dp_logging.get_child_logger(__name__)


def data_generator(data_list):
def data_generator(data_list: List[str]) -> Generator[str, None, None]:
"""
Take a list and return a generator on the list.

@@ -31,7 +43,9 @@ def data_generator(data_list):
yield item


def generator_on_file(file_object):
def generator_on_file(
file_object: Union[StringIO, BytesIO]
) -> Generator[Union[str, bytes], None, None]:
JGSweets marked this conversation as resolved.
"""
Take a file and return a generator that returns lines.

@@ -49,7 +63,7 @@ def generator_on_file(file_object):
file_object.close()


def convert_int_to_string(x):
def convert_int_to_string(x: int) -> str:
"""
Convert the given input to string.

@@ -69,7 +83,9 @@ def convert_int_to_string(x):
return str(x)


def unicode_to_str(data, ignore_dicts=False):
def unicode_to_str(
data: Union[str, List, Dict], ignore_dicts: bool = False
) -> Union[str, List, Dict]:
JGSweets marked this conversation as resolved.
"""
Convert data to string representation if it is a unicode string.

@@ -99,7 +115,11 @@ def unicode_to_str(data, ignore_dicts=False):
return data


def json_to_dataframe(json_lines, selected_columns=None, read_in_string=False):
def json_to_dataframe(
json_lines: List[Dict],
JGSweets marked this conversation as resolved.
selected_columns: Optional[List[str]] = None,
read_in_string: bool = False,
) -> Tuple[pd.DataFrame, pd.Series]:
JGSweets marked this conversation as resolved.
"""
Take list of json objects and return dataframe representing json list.

@@ -137,7 +157,11 @@ def json_to_dataframe(json_lines, selected_columns=None, read_in_string=False):
return df, original_df_dtypes


def read_json_df(data_generator, selected_columns=None, read_in_string=False):
def read_json_df(
data_generator: Generator,
selected_columns: Optional[List[str]] = None,
read_in_string: bool = False,
) -> Tuple[Iterator[pd.DataFrame], pd.Series]:
JGSweets marked this conversation as resolved.
"""
Return an iterator that returns a chunk of data as dataframe in each call.

@@ -163,7 +187,7 @@ def read_json_df(data_generator, selected_columns=None, read_in_string=False):
each call as well as original dtypes of the dataframe columns.
:rtype: tuple(Iterator(pd.DataFrame), pd.Series(dtypes))
"""
lines = list()
lines: List[Dict] = list()
JGSweets marked this conversation as resolved.
k = 0
while True:
try:
@@ -180,7 +204,7 @@ def read_json_df(data_generator, selected_columns=None, read_in_string=False):
),
ignore_dicts=True,
)
lines.append(obj)
lines.append(cast(Dict, obj))
Contributor: did we try overloading unicode_to_str? would that fix this so we don't have to cast?

Contributor Author: I did some experimentation and added this:

@overload
def unicode_to_str(data: Dict, ignore_dicts: bool) -> Dict: ...
@overload
def unicode_to_str(data: str, ignore_dicts: bool) -> str: ...
@overload
def unicode_to_str(data: List, ignore_dicts: bool) -> List: ...
def unicode_to_str(
    data: Union[str, List, Dict], ignore_dicts: bool = False
) -> Union[str, List, Dict]:
# function body here

The issue I'm getting now is when we call this

json.loads(raw_line, object_hook=unicode_to_str, object_pairs_hook=OrderedDict)

I get

Argument "object_hook" to "loads" has incompatible type overloaded function; expected "Optional[Callable[[Dict[Any, Any]], Any]]"

from mypy. My guess is that we can solve this by casting before we pass in the function to object_hook but that would somewhat defeat the purpose of overloading in the first place.

Contributor: @Sanketh7 I think you don't need to specify:

def unicode_to_str(
    data: Union[str, List, Dict], ignore_dicts: bool = False
) -> Union[str, List, Dict]:

b/c of the three other overloads. I assume the defaults would still need to be assigned though

Contributor Author: Even with that removal, I'm still getting the same issue. I think it has to do with the function taking on 3 possible types while object_hook is only compatible with one of those types.
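
To make the constraint concrete, a minimal standalone sketch (not the repository's code) of the one signature json.loads will accept for object_hook:

import json
from typing import Any, Dict

# typeshed types object_hook as Optional[Callable[[Dict[Any, Any]], Any]]:
# a single dict-accepting callable. An overloaded str/list/dict function
# does not satisfy that, which is the mypy error quoted above.
def dict_hook(obj: Dict[Any, Any]) -> Any:
    # Toy hook: tag each decoded JSON object so the call is observable.
    return {"decoded": obj}

print(json.loads('{"a": 1}', object_hook=dict_hook))
# prints: {'decoded': {'a': 1}}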

Contributor Author: This issue no longer applies due to the new way of handling json types.
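
The later commits ("added JSONType", "moved JSONType to _typing.py", "changed JSONType to be nonrecursive and removed cast on key") point at the shape of the fix. A hedged sketch of what a non-recursive alias could look like; the actual definition in _typing.py may differ:

from typing import Any, Dict, List, Union

# Hypothetical non-recursive JSON alias: nested values are Any, which
# sidesteps mypy's limited support for recursive type aliases.
JSONType = Union[str, int, float, bool, None, List[Any], Dict[str, Any]]

def unicode_to_str_sketch(data: JSONType, ignore_dicts: bool = False) -> JSONType:
    """Toy stand-in: one JSONType annotation instead of three overloads."""
    if isinstance(data, list):
        return [unicode_to_str_sketch(item, ignore_dicts=True) for item in data]
    if isinstance(data, dict) and not ignore_dicts:
        # JSON object keys are already str, so no cast on the key is needed.
        return {k: unicode_to_str_sketch(v, ignore_dicts=True) for k, v in data.items()}
    return data

print(unicode_to_str_sketch({"a": ["x", 1]}))  # {'a': ['x', 1]}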

except ValueError:
pass
# To ignore malformatted lines.
@@ -190,7 +214,11 @@ def read_json_df(data_generator, selected_columns=None, read_in_string=False):
return json_to_dataframe(lines, selected_columns, read_in_string)


def read_json(data_generator, selected_columns=None, read_in_string=False):
def read_json(
data_generator: Generator,
selected_columns: Optional[List[str]] = None,
read_in_string: bool = False,
) -> List[Dict]:
"""
Return the lines of a json.

@@ -232,7 +260,7 @@ def read_json(data_generator, selected_columns=None, read_in_string=False):
),
ignore_dicts=True,
)
lines.append(obj)
lines.append(cast(Dict, obj)) # should always be Dict
Contributor: Is this comment needed? Isn't that what the type cast is doing?

Contributor: that's fair ... not really needed per se. I'm fine leaving it just for clarity in the code

Contributor Author: I added it because cast is inherently unsafe but it turns out obj will always be a Dict here.

Contributor: Is this because of the OrderedDict?

Contributor: What's our guarantee that it is a dict?

Contributor: I've determined this could be a list; however, not a string.

Contributor: The code that could result in this would be:

data = dp.Data(StringIO("[1]\n[2]"))
data.head()

Currently, there's an error because the reader doesn't support this. However, it should.

Contributor: should be resolved in PR #691

Contributor Author: There's no longer a need for casting as of my latest commit.
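
On the "cast is inherently unsafe" point above, a small self-contained demonstration (illustrative, not repository code) that typing.cast performs no runtime check, which is why an isinstance guard is the only real runtime protection:

import json
from typing import Dict, cast

# json.loads may return a dict, list, str, number, bool, or None.
obj = json.loads("[1, 2, 3]")

# cast() only informs the type checker; at runtime it returns obj unchanged,
# so a wrong cast silently yields a mistyped value instead of an error.
wrongly_typed = cast(Dict, obj)
print(type(wrongly_typed))  # <class 'list'>: nothing was enforced

# isinstance actually checks and narrows the type at runtime.
if isinstance(obj, dict):
    print("dict")
else:
    print("not a dict:", obj)  # this branch runs for the input above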

except ValueError:
pass
# To ignore malformatted lines.
@@ -243,13 +271,13 @@


def read_csv_df(
file_path,
delimiter,
header,
selected_columns=[],
read_in_string=False,
encoding="utf-8",
):
file_path: Union[str, BytesIO, TextIOWrapper],
Contributor: similar to below, if we overload, allowing each different input, does that remove the need to cast?

Contributor Author: I don't think so, because as far as I can tell, overloading doesn't let us have multiple implementations. It just lets us specify relationships between input types and return types.

Contributor (taylorfturner, Oct 12, 2022): In certain conditions we would still need to cast, I think, because the code doesn't know that under this condition the type should be this. So that's where cast() comes into play under specific if conditions... at least that's how I'm reading it.

LGTM

delimiter: Optional[str],
header: Optional[int],
selected_columns: List[str] = [],
read_in_string: bool = False,
encoding: Optional[str] = "utf-8",
) -> pd.DataFrame:
"""
Read a CSV file in chunks and return a dataframe in the form of an iterator.

@@ -267,7 +295,7 @@
:return: Iterator
:rtype: pd.DataFrame
"""
args = {
args: Dict[str, Any] = {
"delimiter": delimiter,
"header": header,
"iterator": True,
@@ -299,13 +327,18 @@

# if the buffer was wrapped, detach it before returning
if is_buf_wrapped:
file_path = cast(TextIOWrapper, file_path)
file_path.detach()
fo.close()

return data
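
The is_buf_wrapped branch above is a concrete instance of the pattern described in the review thread: under a specific condition the code knows the concrete type, and cast() communicates that to mypy. A self-contained sketch of the same narrowing pattern; the function and names are illustrative, not the repository's:

from io import BytesIO, TextIOWrapper
from typing import Union, cast

def read_wrapped(buf: Union[BytesIO, TextIOWrapper], was_wrapped: bool) -> bytes:
    """Illustrative only: narrow a Union with cast() inside a known branch."""
    if was_wrapped:
        # Our own bookkeeping says buf is a TextIOWrapper here, but mypy
        # cannot infer that from a bool flag, so cast() narrows it.
        inner = cast(TextIOWrapper, buf).detach()  # underlying binary buffer
        return inner.read()
    return cast(BytesIO, buf).read()

raw = BytesIO(b"a,b\n1,2\n")
print(read_wrapped(TextIOWrapper(raw, encoding="utf-8"), was_wrapped=True))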


def read_parquet_df(file_path, selected_columns=None, read_in_string=False):
def read_parquet_df(
file_path: str,
selected_columns: Optional[List[str]] = None,
read_in_string: bool = False,
) -> Tuple[pd.DataFrame, pd.Series]:
JGSweets marked this conversation as resolved.
"""
Return an iterator that returns one row group each time.

@@ -349,7 +382,9 @@ def read_parquet_df(file_path, selected_columns=None, read_in_string=False):
return data, original_df_dtypes


def read_text_as_list_of_strs(file_path, encoding=None):
def read_text_as_list_of_strs(
file_path: str, encoding: Optional[str] = None
) -> List[str]:
"""
Return list of strings relative to the chunk size.

@@ -367,7 +402,9 @@ def read_text_as_list_of_strs(file_path, encoding=None):
return data


def detect_file_encoding(file_path, buffer_size=1024, max_lines=20):
def detect_file_encoding(
file_path: str, buffer_size: int = 1024, max_lines: int = 20
) -> str:
"""
Determine encoding of files within initial `max_lines` of length `buffer_size`.

@@ -456,7 +493,7 @@ def _decode_is_valid(encoding):
return encoding.lower()


def detect_cell_type(cell):
def detect_cell_type(cell: str) -> str:
"""
Detect the cell type (int, float, etc).

@@ -488,7 +525,7 @@ def detect_cell_type(cell):
return cell_type


def get_delimiter_regex(delimiter=",", quotechar=","):
def get_delimiter_regex(delimiter: str = ",", quotechar: str = ",") -> Pattern[str]:
"""
Build regex for delimiter checks.

@@ -518,7 +555,12 @@ def get_delimiter_regex(delimiter=",", quotechar=","):
return re.compile(delimiter_regex + quotechar_regex)


def find_nth_loc(string=None, search_query=None, n=0, ignore_consecutive=True):
def find_nth_loc(
string: Optional[str] = None,
search_query: Optional[str] = None,
n: int = 0,
ignore_consecutive: bool = True,
) -> Tuple[int, int]:
"""
Search string via search_query and return nth index in which query occurs.

@@ -565,8 +607,12 @@ def find_nth_loc(string=None, search_query=None, n=0, ignore_consecutive=True):


def load_as_str_from_file(
file_path, file_encoding=None, max_lines=10, max_bytes=65536, chunk_size_bytes=1024
):
file_path: str,
file_encoding: Optional[str] = None,
max_lines: int = 10,
max_bytes: int = 65536,
chunk_size_bytes: int = 1024,
) -> str:
"""
Load data from a csv file up to a specific line OR byte_size.

@@ -602,7 +648,7 @@ def load_as_str_from_file(

# Return either the last index of sample_lines OR
# the index of the newline char that matches remaining_lines
search_query_value = "\n"
search_query_value: Union[str, bytes] = "\n"
if isinstance(sample_lines, bytes):
search_query_value = b"\n"
Contributor: Does it make more sense to cast to string immediately? On line 653 could we do search_query_value = cast(str, b"\n")? Then we don't need line 660, and search_query_value can just be statically typed as str.

Contributor: I see what you are getting at... wouldn't say it's a high priority. We / I can fix in a follow-up PR though.

Contributor: Instead of casting, could find_nth_loc just accept bytes?

Contributor Author: We could make find_nth_loc accept bytes, but then we'd have to deal with the cases where the search string is of type str and the search query is of type bytes (and vice versa). One way I found to deal with this is to use the type AnyStr in find_nth_loc, which basically creates a generic type that can only be str or bytes. However, the issue I run into is that, the way the code is currently set up, we would have to pass a Union[str, bytes] to find_nth_loc, but you can't pass that in for AnyStr (see python/mypy#1533).
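
A minimal sketch (assumed, not repository code) of the AnyStr approach and the failure mode described: AnyStr binds to exactly one of str or bytes per call, so an argument typed Union[str, bytes] cannot satisfy it without isinstance narrowing first:

from typing import AnyStr, Union

def find_nth(string: AnyStr, query: AnyStr, n: int) -> int:
    """Generic over str or bytes, but both arguments must bind the same type."""
    idx = -1
    for _ in range(n):
        idx = string.find(query, idx + 1)
        if idx == -1:
            break
    return idx

print(find_nth("a\nb\nc", "\n", 2))    # 3, with AnyStr bound to str
print(find_nth(b"a\nb\nc", b"\n", 2))  # 3, with AnyStr bound to bytes

value: Union[str, bytes] = "\n"
# find_nth("a\nb", value, 1)  # mypy error: a Union cannot bind AnyStr;
# the caller must narrow value with isinstance first (python/mypy#1533).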


@@ -611,7 +657,8 @@ def load_as_str_from_file(
while start_loc < len_sample_lines - 1 and total_occurrences < max_lines:
loc, occurrence = find_nth_loc(
sample_lines[start_loc:],
search_query=search_query_value,
search_query=cast(str, search_query_value),
JGSweets marked this conversation as resolved.
# TODO: make sure find_nth_loc() works with search_query as bytes
n=remaining_lines,
)

@@ -629,7 +676,7 @@
return data_as_str


def is_valid_url(url_as_string):
def is_valid_url(url_as_string: Any) -> bool:
"""
Determine whether a given string is a valid URL.

@@ -646,7 +693,7 @@ def is_valid_url(url_as_string):
return all([result.scheme, result.netloc])


def url_to_bytes(url_as_string, options):
def url_to_bytes(url_as_string: str, options: Dict) -> BytesIO:
"""
Read in a URL and convert it to a byte stream.
