From 84b8fc405484277bd9437d0c4d2ce7cadb5d9c42 Mon Sep 17 00:00:00 2001 From: Sanketh Varamballi Date: Tue, 20 Sep 2022 17:35:01 -0400 Subject: [PATCH 01/18] added static typing to data_utils.py --- dataprofiler/data_readers/data_utils.py | 63 +++++++++++++------------ 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index 6e7abeec0..1708b072c 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -1,10 +1,12 @@ """Contains functions for data readers.""" import json +from logging import Logger import re +from typing import Any, Dict, Generator, Iterator, List, Optional, Tuple, Union, cast import urllib from builtins import next from collections import OrderedDict -from io import BytesIO, TextIOWrapper +from io import BytesIO, StringIO, TextIOWrapper import dateutil import pandas as pd @@ -15,10 +17,10 @@ from .. import dp_logging from .filepath_or_buffer import FileOrBufferHandler, is_stream_buffer # NOQA -logger = dp_logging.get_child_logger(__name__) +logger: Logger = dp_logging.get_child_logger(__name__) -def data_generator(data_list): +def data_generator(data_list: List[str]) -> Generator[str, None, None]: """ Take a list and return a generator on the list. @@ -31,7 +33,7 @@ def data_generator(data_list): yield item -def generator_on_file(file_object): +def generator_on_file(file_object: Union[StringIO, BytesIO]) -> Generator[Union[str, bytes], None, None]: """ Take a file and return a generator that returns lines. @@ -49,7 +51,7 @@ def generator_on_file(file_object): file_object.close() -def convert_int_to_string(x): +def convert_int_to_string(x: int) -> str: """ Convert the given input to string. @@ -69,7 +71,7 @@ def convert_int_to_string(x): return str(x) -def unicode_to_str(data, ignore_dicts=False): +def unicode_to_str(data: Union[str, List, Dict], ignore_dicts: bool=False) -> Union[str, List, Dict]: """ Convert data to string representation if it is a unicode string. @@ -99,7 +101,7 @@ def unicode_to_str(data, ignore_dicts=False): return data -def json_to_dataframe(json_lines, selected_columns=None, read_in_string=False): +def json_to_dataframe(json_lines: List[Dict], selected_columns: Optional[List[str]]=None, read_in_string: bool=False) -> Tuple[pd.DataFrame, pd.Series]: """ Take list of json objects and return dataframe representing json list. @@ -137,7 +139,7 @@ def json_to_dataframe(json_lines, selected_columns=None, read_in_string=False): return df, original_df_dtypes -def read_json_df(data_generator, selected_columns=None, read_in_string=False): +def read_json_df(data_generator: Generator, selected_columns: Optional[List[str]]=None, read_in_string: bool=False) -> Tuple[Iterator[pd.DataFrame], pd.Series]: """ Return an iterator that returns a chunk of data as dataframe in each call. @@ -187,10 +189,10 @@ def read_json_df(data_generator, selected_columns=None, read_in_string=False): k += 1 if not lines and k: raise ValueError("No JSON data could be read from these data.") - return json_to_dataframe(lines, selected_columns, read_in_string) + return json_to_dataframe(cast(List[Dict], lines), selected_columns, read_in_string) -def read_json(data_generator, selected_columns=None, read_in_string=False): +def read_json(data_generator: Generator, selected_columns: Optional[List[str]]=None, read_in_string: bool=False) -> List[Dict]: """ Return the lines of a json. 
@@ -239,17 +241,17 @@ def read_json(data_generator, selected_columns=None, read_in_string=False): k += 1 if not lines and k: raise ValueError("No JSON data could be read from these data.") - return lines + return cast(List[Dict], lines) def read_csv_df( - file_path, - delimiter, - header, - selected_columns=[], - read_in_string=False, - encoding="utf-8", -): + file_path: Union[str, BytesIO, TextIOWrapper], + delimiter: str, + header: int, + selected_columns: List[str]=[], + read_in_string: bool=False, + encoding: str="utf-8", +) -> pd.DataFrame: """ Read a CSV file in chunks and return dataframe in form of iterator. @@ -299,13 +301,14 @@ def read_csv_df( # if the buffer was wrapped, detach it before returning if is_buf_wrapped: + assert isinstance(file_path, TextIOWrapper) file_path.detach() fo.close() return data -def read_parquet_df(file_path, selected_columns=None, read_in_string=False): +def read_parquet_df(file_path: str, selected_columns: Optional[List[str]]=None, read_in_string: bool=False) -> Tuple[pd.DataFrame, pd.Series]: """ Return an iterator that returns one row group each time. @@ -349,7 +352,7 @@ def read_parquet_df(file_path, selected_columns=None, read_in_string=False): return data, original_df_dtypes -def read_text_as_list_of_strs(file_path, encoding=None): +def read_text_as_list_of_strs(file_path: str, encoding: Optional[str]=None) -> List[str]: """ Return list of strings relative to the chunk size. @@ -367,7 +370,7 @@ def read_text_as_list_of_strs(file_path, encoding=None): return data -def detect_file_encoding(file_path, buffer_size=1024, max_lines=20): +def detect_file_encoding(file_path: str, buffer_size: int=1024, max_lines: int=20) -> str: """ Determine encoding of files within initial `max_lines` of length `buffer_size`. @@ -456,7 +459,7 @@ def _decode_is_valid(encoding): return encoding.lower() -def detect_cell_type(cell): +def detect_cell_type(cell: str) -> str: """ Detect the cell type (int, float, etc). @@ -488,7 +491,7 @@ def detect_cell_type(cell): return cell_type -def get_delimiter_regex(delimiter=",", quotechar=","): +def get_delimiter_regex(delimiter: str=",", quotechar: str=",") -> re.Pattern[str]: """ Build regex for delimiter checks. @@ -518,7 +521,7 @@ def get_delimiter_regex(delimiter=",", quotechar=","): return re.compile(delimiter_regex + quotechar_regex) -def find_nth_loc(string=None, search_query=None, n=0, ignore_consecutive=True): +def find_nth_loc(string: Optional[str]=None, search_query: Optional[str]=None, n: int=0, ignore_consecutive: bool=True) -> Tuple[int, int]: """ Search string via search_query and return nth index in which query occurs. @@ -565,8 +568,8 @@ def find_nth_loc(string=None, search_query=None, n=0, ignore_consecutive=True): def load_as_str_from_file( - file_path, file_encoding=None, max_lines=10, max_bytes=65536, chunk_size_bytes=1024 -): + file_path: str, file_encoding: Optional[str]=None, max_lines: int=10, max_bytes: int=65536, chunk_size_bytes: int=1024 +) -> str: """ Load data from a csv file up to a specific line OR byte_size. 
@@ -602,7 +605,7 @@ def load_as_str_from_file( # Return either the last index of sample_lines OR # the index of the newline char that matches remaining_lines - search_query_value = "\n" + search_query_value: Union[str, bytes] = "\n" if isinstance(sample_lines, bytes): search_query_value = b"\n" @@ -611,7 +614,7 @@ def load_as_str_from_file( while start_loc < len_sample_lines - 1 and total_occurrences < max_lines: loc, occurrence = find_nth_loc( sample_lines[start_loc:], - search_query=search_query_value, + search_query=cast(str, search_query_value), # TODO: make sure find_nth_loc() works with search_query as bytes n=remaining_lines, ) @@ -629,7 +632,7 @@ def load_as_str_from_file( return data_as_str -def is_valid_url(url_as_string): +def is_valid_url(url_as_string: Any) -> bool: """ Determine whether a given string is a valid URL. @@ -646,7 +649,7 @@ def is_valid_url(url_as_string): return all([result.scheme, result.netloc]) -def url_to_bytes(url_as_string, options): +def url_to_bytes(url_as_string: str, options: Dict) -> BytesIO: """ Read in URL and converts it to a byte stream. From 757076b008bb4d17d75e9a6250228fd61cfbcd7e Mon Sep 17 00:00:00 2001 From: Sanketh Varamballi Date: Mon, 26 Sep 2022 10:07:08 -0400 Subject: [PATCH 02/18] changed re.Pattern type annotation to typing.Pattern --- dataprofiler/data_readers/data_utils.py | 80 +++++++++++++++++++------ 1 file changed, 63 insertions(+), 17 deletions(-) diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index 1708b072c..c359575c1 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -1,12 +1,23 @@ """Contains functions for data readers.""" import json -from logging import Logger import re -from typing import Any, Dict, Generator, Iterator, List, Optional, Tuple, Union, cast import urllib from builtins import next from collections import OrderedDict from io import BytesIO, StringIO, TextIOWrapper +from logging import Logger +from typing import ( + Any, + Dict, + Generator, + Iterator, + List, + Optional, + Pattern, + Tuple, + Union, + cast, +) import dateutil import pandas as pd @@ -33,7 +44,9 @@ def data_generator(data_list: List[str]) -> Generator[str, None, None]: yield item -def generator_on_file(file_object: Union[StringIO, BytesIO]) -> Generator[Union[str, bytes], None, None]: +def generator_on_file( + file_object: Union[StringIO, BytesIO] +) -> Generator[Union[str, bytes], None, None]: """ Take a file and return a generator that returns lines. @@ -71,7 +84,9 @@ def convert_int_to_string(x: int) -> str: return str(x) -def unicode_to_str(data: Union[str, List, Dict], ignore_dicts: bool=False) -> Union[str, List, Dict]: +def unicode_to_str( + data: Union[str, List, Dict], ignore_dicts: bool = False +) -> Union[str, List, Dict]: """ Convert data to string representation if it is a unicode string. @@ -101,7 +116,11 @@ def unicode_to_str(data: Union[str, List, Dict], ignore_dicts: bool=False) -> Un return data -def json_to_dataframe(json_lines: List[Dict], selected_columns: Optional[List[str]]=None, read_in_string: bool=False) -> Tuple[pd.DataFrame, pd.Series]: +def json_to_dataframe( + json_lines: List[Dict], + selected_columns: Optional[List[str]] = None, + read_in_string: bool = False, +) -> Tuple[pd.DataFrame, pd.Series]: """ Take list of json objects and return dataframe representing json list. 
@@ -139,7 +158,11 @@ def json_to_dataframe(json_lines: List[Dict], selected_columns: Optional[List[st return df, original_df_dtypes -def read_json_df(data_generator: Generator, selected_columns: Optional[List[str]]=None, read_in_string: bool=False) -> Tuple[Iterator[pd.DataFrame], pd.Series]: +def read_json_df( + data_generator: Generator, + selected_columns: Optional[List[str]] = None, + read_in_string: bool = False, +) -> Tuple[Iterator[pd.DataFrame], pd.Series]: """ Return an iterator that returns a chunk of data as dataframe in each call. @@ -192,7 +215,11 @@ def read_json_df(data_generator: Generator, selected_columns: Optional[List[str] return json_to_dataframe(cast(List[Dict], lines), selected_columns, read_in_string) -def read_json(data_generator: Generator, selected_columns: Optional[List[str]]=None, read_in_string: bool=False) -> List[Dict]: +def read_json( + data_generator: Generator, + selected_columns: Optional[List[str]] = None, + read_in_string: bool = False, +) -> List[Dict]: """ Return the lines of a json. @@ -248,9 +275,9 @@ def read_csv_df( file_path: Union[str, BytesIO, TextIOWrapper], delimiter: str, header: int, - selected_columns: List[str]=[], - read_in_string: bool=False, - encoding: str="utf-8", + selected_columns: List[str] = [], + read_in_string: bool = False, + encoding: str = "utf-8", ) -> pd.DataFrame: """ Read a CSV file in chunks and return dataframe in form of iterator. @@ -308,7 +335,11 @@ def read_csv_df( return data -def read_parquet_df(file_path: str, selected_columns: Optional[List[str]]=None, read_in_string: bool=False) -> Tuple[pd.DataFrame, pd.Series]: +def read_parquet_df( + file_path: str, + selected_columns: Optional[List[str]] = None, + read_in_string: bool = False, +) -> Tuple[pd.DataFrame, pd.Series]: """ Return an iterator that returns one row group each time. @@ -352,7 +383,9 @@ def read_parquet_df(file_path: str, selected_columns: Optional[List[str]]=None, return data, original_df_dtypes -def read_text_as_list_of_strs(file_path: str, encoding: Optional[str]=None) -> List[str]: +def read_text_as_list_of_strs( + file_path: str, encoding: Optional[str] = None +) -> List[str]: """ Return list of strings relative to the chunk size. @@ -370,7 +403,9 @@ def read_text_as_list_of_strs(file_path: str, encoding: Optional[str]=None) -> L return data -def detect_file_encoding(file_path: str, buffer_size: int=1024, max_lines: int=20) -> str: +def detect_file_encoding( + file_path: str, buffer_size: int = 1024, max_lines: int = 20 +) -> str: """ Determine encoding of files within initial `max_lines` of length `buffer_size`. @@ -491,7 +526,7 @@ def detect_cell_type(cell: str) -> str: return cell_type -def get_delimiter_regex(delimiter: str=",", quotechar: str=",") -> re.Pattern[str]: +def get_delimiter_regex(delimiter: str = ",", quotechar: str = ",") -> Pattern[str]: """ Build regex for delimiter checks. @@ -521,7 +556,12 @@ def get_delimiter_regex(delimiter: str=",", quotechar: str=",") -> re.Pattern[st return re.compile(delimiter_regex + quotechar_regex) -def find_nth_loc(string: Optional[str]=None, search_query: Optional[str]=None, n: int=0, ignore_consecutive: bool=True) -> Tuple[int, int]: +def find_nth_loc( + string: Optional[str] = None, + search_query: Optional[str] = None, + n: int = 0, + ignore_consecutive: bool = True, +) -> Tuple[int, int]: """ Search string via search_query and return nth index in which query occurs. 
@@ -568,7 +608,11 @@ def find_nth_loc(string: Optional[str]=None, search_query: Optional[str]=None, n
 
 
 def load_as_str_from_file(
-    file_path: str, file_encoding: Optional[str]=None, max_lines: int=10, max_bytes: int=65536, chunk_size_bytes: int=1024
+    file_path: str,
+    file_encoding: Optional[str] = None,
+    max_lines: int = 10,
+    max_bytes: int = 65536,
+    chunk_size_bytes: int = 1024,
 ) -> str:
     """
     Load data from a csv file up to a specific line OR byte_size.
@@ -614,7 +658,9 @@ def load_as_str_from_file(
     while start_loc < len_sample_lines - 1 and total_occurrences < max_lines:
         loc, occurrence = find_nth_loc(
             sample_lines[start_loc:],
-            search_query=cast(str, search_query_value),  # TODO: make sure find_nth_loc() works with search_query as bytes
+            search_query=cast(
+                str, search_query_value
+            ),  # TODO: make sure find_nth_loc() works with search_query as bytes
             n=remaining_lines,
         )
 
From 8a652e7d0f32a5f63cdb9a9dfa131c81daf7e235 Mon Sep 17 00:00:00 2001
From: Sanketh Varamballi
Date: Mon, 26 Sep 2022 10:15:57 -0400
Subject: [PATCH 03/18] removed logging import

---
 dataprofiler/data_readers/data_utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py
index c359575c1..3cecef95b 100644
--- a/dataprofiler/data_readers/data_utils.py
+++ b/dataprofiler/data_readers/data_utils.py
@@ -5,7 +5,6 @@
 from builtins import next
 from collections import OrderedDict
 from io import BytesIO, StringIO, TextIOWrapper
-from logging import Logger
 from typing import (
     Any,
     Dict,
@@ -28,7 +27,7 @@
 from .. import dp_logging
 from .filepath_or_buffer import FileOrBufferHandler, is_stream_buffer  # NOQA
 
-logger: Logger = dp_logging.get_child_logger(__name__)
+logger = dp_logging.get_child_logger(__name__)
 
 
From 23bb6aca223764c7fbe27b02ecff03a059799de1 Mon Sep 17 00:00:00 2001
From: Sanketh Varamballi
Date: Tue, 27 Sep 2022 18:15:20 -0400
Subject: [PATCH 04/18] changed casts to if statements

---
 dataprofiler/data_readers/data_utils.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py
index 3cecef95b..2a1fdbabf 100644
--- a/dataprofiler/data_readers/data_utils.py
+++ b/dataprofiler/data_readers/data_utils.py
@@ -187,7 +187,7 @@ def read_json_df(
     each call as well as original dtypes of the dataframe columns.
     :rtype: tuple(Iterator(pd.DataFrame), pd.Series(dtypes))
     """
-    lines = list()
+    lines: List[Dict] = list()
     k = 0
     while True:
         try:
@@ -204,14 +204,15 @@ def read_json_df(
             ),
             ignore_dicts=True,
         )
-        lines.append(obj)
+        if isinstance(obj, dict): # should always pass but needed for mypy
+            lines.append(obj)
         except ValueError:
             pass  # To ignore malformatted lines.
         k += 1
     if not lines and k:
         raise ValueError("No JSON data could be read from these data.")
-    return json_to_dataframe(cast(List[Dict], lines), selected_columns, read_in_string)
+    return json_to_dataframe(lines, selected_columns, read_in_string)
 
 
 def read_json(
@@ -260,14 +261,15 @@ def read_json(
             ),
             ignore_dicts=True,
         )
-        lines.append(obj)
+        if isinstance(obj, dict): # should always pass but needed for mypy
+            lines.append(obj)
         except ValueError:
             pass  # To ignore malformatted lines.
         k += 1
     if not lines and k:
         raise ValueError("No JSON data could be read from these data.")
-    return cast(List[Dict], lines)
+    return lines
 
 
 def read_csv_df(

From bc44f0d99b74aa5d79952503ac3da2c10d738e92 Mon Sep 17 00:00:00 2001
From: Sanketh Varamballi
Date: Tue, 27 Sep 2022 18:18:23 -0400
Subject: [PATCH 05/18] fixed formatting with black

---
 dataprofiler/data_readers/data_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py
index 2a1fdbabf..42c965d83 100644
--- a/dataprofiler/data_readers/data_utils.py
+++ b/dataprofiler/data_readers/data_utils.py
@@ -204,8 +204,8 @@ def read_json_df(
             ),
             ignore_dicts=True,
         )
-        if isinstance(obj, dict): # should always pass but needed for mypy
-            lines.append(obj)
+        if isinstance(obj, dict):  # should always pass but needed for mypy
+            lines.append(obj)
         except ValueError:
             pass  # To ignore malformatted lines.
@@ -261,8 +261,8 @@ def read_json(
             ),
             ignore_dicts=True,
         )
-        if isinstance(obj, dict): # should always pass but needed for mypy
-            lines.append(obj)
+        if isinstance(obj, dict):  # should always pass but needed for mypy
+            lines.append(obj)
         except ValueError:
             pass  # To ignore malformatted lines.

From 9bd01a40cd1d38bfecb0b070d08cbdf8884c4f4a Mon Sep 17 00:00:00 2001
From: Sanketh Varamballi
Date: Mon, 3 Oct 2022 10:52:57 -0400
Subject: [PATCH 06/18] changed isinstance to cast

---
 dataprofiler/data_readers/data_utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py
index 42c965d83..a3e8bade0 100644
--- a/dataprofiler/data_readers/data_utils.py
+++ b/dataprofiler/data_readers/data_utils.py
@@ -261,8 +261,7 @@ def read_json(
             ),
             ignore_dicts=True,
         )
-        if isinstance(obj, dict):  # should always pass but needed for mypy
-            lines.append(obj)
+        lines.append(cast(Dict, obj)) # should always be Dict
         except ValueError:
             pass  # To ignore malformatted lines.

From 46a6712cee66d1b08fe874ff7783fca8df6afbaa Mon Sep 17 00:00:00 2001
From: Sanketh Varamballi
Date: Mon, 3 Oct 2022 10:54:34 -0400
Subject: [PATCH 07/18] updated read_csv_df() type signature to allow None values

---
 dataprofiler/data_readers/data_utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py
index a3e8bade0..df9c09c56 100644
--- a/dataprofiler/data_readers/data_utils.py
+++ b/dataprofiler/data_readers/data_utils.py
@@ -273,11 +273,11 @@ def read_json(
 
 def read_csv_df(
     file_path: Union[str, BytesIO, TextIOWrapper],
-    delimiter: str,
-    header: int,
+    delimiter: Optional[str],
+    header: Optional[int],
     selected_columns: List[str] = [],
     read_in_string: bool = False,
-    encoding: str = "utf-8",
+    encoding: Optional[str] = "utf-8",
 ) -> pd.DataFrame:
     """
     Read a CSV file in chunks and return dataframe in form of iterator.
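
Patches 04 through 08 alternate between two ways of convincing mypy that obj can be appended to a List[Dict]. A minimal standalone sketch of the difference (not part of the patch series; the function names and the JSONLike alias are invented for illustration):

from typing import Dict, List, Union, cast

JSONLike = Union[str, int, float, bool, None, List, Dict]  # illustrative alias


def append_with_isinstance(obj: JSONLike, lines: List[Dict]) -> None:
    # Runtime narrowing: inside the branch mypy knows obj is a dict,
    # so the append type-checks, but non-dict values are silently dropped.
    if isinstance(obj, dict):
        lines.append(obj)


def append_with_cast(obj: JSONLike, lines: List[Dict]) -> None:
    # cast() is a no-op at runtime and only asserts the type to mypy;
    # there is no per-item check, so a wrong value would fail later.
    lines.append(cast(Dict, obj))

Patches 06 and 08 settle on cast(), trading the runtime guard away in the parsing loop; PATCH 12 later sidesteps the question in read_json_df entirely by widening lines to List[JSONType] so no narrowing is needed.
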
From 99ea3e186e1bf948eb7f5273af7838c366963c8f Mon Sep 17 00:00:00 2001
From: Sanketh Varamballi
Date: Tue, 4 Oct 2022 10:30:59 -0400
Subject: [PATCH 08/18] changed isinstance to cast in read_json_df

---
 dataprofiler/data_readers/data_utils.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py
index df9c09c56..5b7bb1c09 100644
--- a/dataprofiler/data_readers/data_utils.py
+++ b/dataprofiler/data_readers/data_utils.py
@@ -204,8 +204,7 @@ def read_json_df(
             ),
             ignore_dicts=True,
         )
-        if isinstance(obj, dict):  # should always pass but needed for mypy
-            lines.append(obj)
+        lines.append(cast(Dict, obj))
         except ValueError:
             pass  # To ignore malformatted lines.
@@ -261,7 +260,7 @@ def read_json(
             ),
             ignore_dicts=True,
         )
-        lines.append(cast(Dict, obj)) # should always be Dict
+        lines.append(cast(Dict, obj))  # should always be Dict
         except ValueError:
             pass  # To ignore malformatted lines.
@@ -658,9 +657,8 @@ def load_as_str_from_file(
     while start_loc < len_sample_lines - 1 and total_occurrences < max_lines:
         loc, occurrence = find_nth_loc(
             sample_lines[start_loc:],
-            search_query=cast(
-                str, search_query_value
-            ),  # TODO: make sure find_nth_loc() works with search_query as bytes
+            search_query=cast(str, search_query_value),
+            # TODO: make sure find_nth_loc() works with search_query as bytes
             n=remaining_lines,
         )

From 48c192a06220cc13e6a61bff4628bdc486f6d7c3 Mon Sep 17 00:00:00 2001
From: Sanketh Varamballi
Date: Wed, 5 Oct 2022 09:37:21 -0400
Subject: [PATCH 09/18] removed is_buf_wrapped in favor of isinstance

---
 dataprofiler/data_readers/data_utils.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py
index 5b7bb1c09..55ef70f03 100644
--- a/dataprofiler/data_readers/data_utils.py
+++ b/dataprofiler/data_readers/data_utils.py
@@ -295,7 +295,7 @@ def read_csv_df(
     :return: Iterator
     :rtype: pd.DataFrame
     """
-    args = {
+    args: Dict[str, Any] = {
         "delimiter": delimiter,
         "header": header,
         "iterator": True,
@@ -314,20 +314,16 @@ def read_csv_df(
     if len(selected_columns) > 0:
         args["usecols"] = selected_columns
 
-    # account for py3.6 requirement for pandas, can remove if >= py3.7
-    is_buf_wrapped = False
     if isinstance(file_path, BytesIO):
         # a BytesIO stream has to be wrapped in order to properly be detached
         # in 3.6 this avoids read_csv wrapping the stream and closing too early
         file_path = TextIOWrapper(file_path, encoding=encoding)
-        is_buf_wrapped = True
 
     fo = pd.read_csv(file_path, **args)
     data = fo.read()
 
     # if the buffer was wrapped, detach it before returning
-    if is_buf_wrapped:
-        assert isinstance(file_path, TextIOWrapper)
+    if isinstance(file_path,
TextIOWrapper): file_path.detach() From d12d9870142044c414721deca58295cef7600841 Mon Sep 17 00:00:00 2001 From: Sanketh Varamballi Date: Thu, 6 Oct 2022 19:10:26 -0400 Subject: [PATCH 11/18] added is_buf_wrapped back --- dataprofiler/data_readers/data_utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index ff258b4bc..56763f88d 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -314,17 +314,20 @@ def read_csv_df( if len(selected_columns) > 0: args["usecols"] = selected_columns + # account for py3.6 requirement for pandas, can remove if >= py3.7 + is_buf_wrapped = False if isinstance(file_path, BytesIO): # a BytesIO stream has to be wrapped in order to properly be detached # in 3.6 this avoids read_csv wrapping the stream and closing too early file_path = TextIOWrapper(file_path, encoding=encoding) + is_buf_wrapped = True fo = pd.read_csv(file_path, **args) data = fo.read() - # account for py3.6 requirement for pandas, can remove if >= py3.7 # if the buffer was wrapped, detach it before returning - if isinstance(file_path, TextIOWrapper): + if is_buf_wrapped: + file_path = cast(TextIOWrapper, file_path) file_path.detach() fo.close() From 7d5b8fb4f120504a8d84ea0da8a907d66798711b Mon Sep 17 00:00:00 2001 From: Sanketh Varamballi Date: Tue, 18 Oct 2022 10:39:09 -0400 Subject: [PATCH 12/18] added JSONType --- dataprofiler/data_readers/data_utils.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index 56763f88d..bd6185985 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -29,6 +29,7 @@ logger = dp_logging.get_child_logger(__name__) +JSONType = Union[str, int, float, bool, None, List['JSONType'], Dict[str, 'JSONType']] def data_generator(data_list: List[str]) -> Generator[str, None, None]: """ @@ -84,8 +85,8 @@ def convert_int_to_string(x: int) -> str: def unicode_to_str( - data: Union[str, List, Dict], ignore_dicts: bool = False -) -> Union[str, List, Dict]: + data: JSONType, ignore_dicts: bool = False +) -> JSONType: """ Convert data to string representation if it is a unicode string. @@ -106,7 +107,7 @@ def unicode_to_str( # if data is a dictionary if isinstance(data, dict) and not ignore_dicts: return { - unicode_to_str(key, ignore_dicts=True): unicode_to_str( + cast(str, unicode_to_str(key, ignore_dicts=True)): unicode_to_str( value, ignore_dicts=True ) for key, value in data.items() @@ -116,7 +117,7 @@ def unicode_to_str( def json_to_dataframe( - json_lines: List[Dict], + json_lines: List[JSONType], selected_columns: Optional[List[str]] = None, read_in_string: bool = False, ) -> Tuple[pd.DataFrame, pd.Series]: @@ -187,7 +188,7 @@ def read_json_df( each call as well as original dtypes of the dataframe columns. :rtype: typle(Iterator(pd.DataFrame), pd.Series(dtypes) """ - lines: List[Dict] = list() + lines: List[JSONType] = list() k = 0 while True: try: @@ -204,7 +205,7 @@ def read_json_df( ), ignore_dicts=True, ) - lines.append(cast(Dict, obj)) + lines.append(obj) except ValueError: pass # To ignore malformatted lines. 
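
PATCH 12's JSONType is a recursive alias: it names every value json.loads can return, at any nesting depth. Below is a standalone illustration (not from the diffs) of what the recursive form covers; the mypy releases current when this PR was written did not support recursive type aliases, which is why PATCH 17 later flattens the alias:

import json
from typing import Dict, List, Union

# Same shape as the alias added in PATCH 12; the quoted forward references
# are the recursive part that the mypy of the time could not resolve.
JSONType = Union[str, int, float, bool, None, List["JSONType"], Dict[str, "JSONType"]]

samples: List[JSONType] = [
    json.loads("null"),                 # None
    json.loads("true"),                 # bool
    json.loads("3"),                    # int
    json.loads("3.5"),                  # float
    json.loads('"text"'),               # str
    json.loads('[1, "a", [2, 3]]'),     # nested list
    json.loads('{"k": {"n": [1.5]}}'),  # nested dict
]
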
From 7fe1704578b8eafc3b4bc4ee4d4b8c350e046c28 Mon Sep 17 00:00:00 2001 From: Sanketh Varamballi Date: Tue, 18 Oct 2022 10:43:53 -0400 Subject: [PATCH 13/18] fixed formatting --- dataprofiler/data_readers/data_utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index bd6185985..86ae7561a 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -29,7 +29,8 @@ logger = dp_logging.get_child_logger(__name__) -JSONType = Union[str, int, float, bool, None, List['JSONType'], Dict[str, 'JSONType']] +JSONType = Union[str, int, float, bool, None, List["JSONType"], Dict[str, "JSONType"]] + def data_generator(data_list: List[str]) -> Generator[str, None, None]: """ @@ -84,9 +85,7 @@ def convert_int_to_string(x: int) -> str: return str(x) -def unicode_to_str( - data: JSONType, ignore_dicts: bool = False -) -> JSONType: +def unicode_to_str(data: JSONType, ignore_dicts: bool = False) -> JSONType: """ Convert data to string representation if it is a unicode string. From e0cf544d08bc302cb3db88d7b057473f9bec7ba4 Mon Sep 17 00:00:00 2001 From: Sanketh Varamballi Date: Tue, 18 Oct 2022 10:48:47 -0400 Subject: [PATCH 14/18] added JSONType to read_json() --- dataprofiler/data_readers/data_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index 86ae7561a..ad5327c69 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -218,7 +218,7 @@ def read_json( data_generator: Generator, selected_columns: Optional[List[str]] = None, read_in_string: bool = False, -) -> List[Dict]: +) -> List[JSONType]: """ Return the lines of a json. @@ -243,7 +243,7 @@ def read_json( :return: returns the lines of a json file :rtype: list(dict) """ - lines = list() + lines: List[JSONType] = list() k = 0 while True: try: @@ -260,7 +260,7 @@ def read_json( ), ignore_dicts=True, ) - lines.append(cast(Dict, obj)) # should always be Dict + lines.append(obj) except ValueError: pass # To ignore malformatted lines. From 849263b6690f2ebd8d46124878accf73f07b323f Mon Sep 17 00:00:00 2001 From: Sanketh Varamballi Date: Tue, 18 Oct 2022 10:54:16 -0400 Subject: [PATCH 15/18] updated unicode_to_str docstring --- dataprofiler/data_readers/data_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index ad5327c69..abbb7ca8d 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -90,7 +90,7 @@ def unicode_to_str(data: JSONType, ignore_dicts: bool = False) -> JSONType: Convert data to string representation if it is a unicode string. 
:param data: input data - :type data: str + :type data: JSONType :param ignore_dicts: if set, ignore the dictionary type processing :type ignore_dicts: boolean :return: string representation of data From 9485c9d335bedb957738615977700bd572453b4a Mon Sep 17 00:00:00 2001 From: Sanketh Varamballi Date: Tue, 18 Oct 2022 11:00:40 -0400 Subject: [PATCH 16/18] moved JSONType to _typing.py --- dataprofiler/_typing.py | 3 ++- dataprofiler/data_readers/data_utils.py | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dataprofiler/_typing.py b/dataprofiler/_typing.py index 8356e5ab3..bce1364f1 100644 --- a/dataprofiler/_typing.py +++ b/dataprofiler/_typing.py @@ -1,7 +1,8 @@ """Contains typing aliases.""" -from typing import Union +from typing import Dict, List, Union import numpy as np import pandas as pd DataArray = Union[pd.DataFrame, pd.Series, np.ndarray] +JSONType = Union[str, int, float, bool, None, List["JSONType"], Dict[str, "JSONType"]] diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index abbb7ca8d..b39a5d6e2 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -25,12 +25,11 @@ from chardet.universaldetector import UniversalDetector from .. import dp_logging +from .._typing import JSONType from .filepath_or_buffer import FileOrBufferHandler, is_stream_buffer # NOQA logger = dp_logging.get_child_logger(__name__) -JSONType = Union[str, int, float, bool, None, List["JSONType"], Dict[str, "JSONType"]] - def data_generator(data_list: List[str]) -> Generator[str, None, None]: """ From 8de0ceff3ff0503ae4422de6c2b804700e34f484 Mon Sep 17 00:00:00 2001 From: Sanketh Varamballi Date: Tue, 18 Oct 2022 12:13:19 -0400 Subject: [PATCH 17/18] changed JSONType to be nonrecursive and removed cast on key --- dataprofiler/_typing.py | 2 +- dataprofiler/data_readers/data_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dataprofiler/_typing.py b/dataprofiler/_typing.py index bce1364f1..200bdd9d3 100644 --- a/dataprofiler/_typing.py +++ b/dataprofiler/_typing.py @@ -5,4 +5,4 @@ import pandas as pd DataArray = Union[pd.DataFrame, pd.Series, np.ndarray] -JSONType = Union[str, int, float, bool, None, List["JSONType"], Dict[str, "JSONType"]] +JSONType = Union[str, int, float, bool, None, List, Dict] diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index b39a5d6e2..ac6a387e4 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -105,7 +105,7 @@ def unicode_to_str(data: JSONType, ignore_dicts: bool = False) -> JSONType: # if data is a dictionary if isinstance(data, dict) and not ignore_dicts: return { - cast(str, unicode_to_str(key, ignore_dicts=True)): unicode_to_str( + unicode_to_str(key, ignore_dicts=True): unicode_to_str( value, ignore_dicts=True ) for key, value in data.items() From 351462b42875f41bd56add369917c7ccf06035a2 Mon Sep 17 00:00:00 2001 From: Taylor Turner Date: Tue, 18 Oct 2022 12:54:44 -0400 Subject: [PATCH 18/18] fix docstring --- dataprofiler/data_readers/data_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index ac6a387e4..6eca35367 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -123,7 +123,7 @@ def json_to_dataframe( Take list of json objects and return dataframe representing json list. 
:param json_lines: list of json objects - :type json_lines: list(dict) + :type json_lines: list(JSONType) :param selected_columns: a list of keys to be processed :type selected_columns: list(str) :param read_in_string: if True, all the values in dataframe will be
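
The series ends with JSONType living in dataprofiler/_typing.py as a flat, non-recursive union. A minimal standalone sketch (toy function and names, not the library's actual code) of why the flat form type-checks without the cast that PATCH 17 removes from the dictionary-key expression:

from typing import Dict, List, Union

# Final form of the alias after PATCH 17 (dataprofiler/_typing.py):
JSONType = Union[str, int, float, bool, None, List, Dict]


def normalize(data: JSONType) -> JSONType:
    # Toy analogue of unicode_to_str: with bare List and Dict in the
    # union, the dict-comprehension key needs no cast (cf. PATCH 17).
    if isinstance(data, list):
        return [normalize(item) for item in data]
    if isinstance(data, dict):
        return {normalize(key): normalize(value) for key, value in data.items()}
    return data

The flat alias trades precision for practicality: List and Dict are unparameterized, so element types go unchecked, but the alias no longer refers to itself and the mypy of the time accepts it.
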