@@ -477,7 +477,7 @@ class PossiblePrecisionLoss(Warning):
 
 
 precision_loss_doc = """
-Column converted from %s to %s, and some data are outside of the lossless
+Column converted from {0} to {1}, and some data are outside of the lossless
 conversion range. This may result in a loss of precision in the saved data.
 """
 
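
The placeholder change matters because the message is filled in with str.format at the call sites; with %-style markers the arguments are silently ignored and the literal %s text reaches the user. A minimal standalone illustration (strings shortened, not taken from the patch):

    old_doc = "Column converted from %s to %s"
    new_doc = "Column converted from {0} to {1}"

    # str.format substitutes {}-style fields only; %-style markers pass through untouched.
    print(old_doc.format("uint64", "float64"))  # Column converted from %s to %s
    print(new_doc.format("uint64", "float64"))  # Column converted from uint64 to float64
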
@@ -551,7 +551,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
     object in a DataFrame.
     """
     ws = ""
-    # original, if small, if large
+    # original, if small, if large
     conversion_data = (
         (np.bool_, np.int8, np.int8),
         (np.uint8, np.int8, np.int16),
@@ -571,7 +571,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
                 dtype = c_data[1]
             else:
                 dtype = c_data[2]
-            if c_data[2] == np.float64:  # Warn if necessary
+            if c_data[2] == np.int64:  # Warn if necessary
                 if data[col].max() >= 2 ** 53:
                     ws = precision_loss_doc.format("uint64", "float64")
 
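
The 2 ** 53 threshold in the context above is the point at which float64 stops representing every integer exactly, which is why larger values trigger the precision-loss warning. A quick check:

    # 2**53 is the first integer float64 cannot tell apart from its successor.
    print(float(2 ** 53) == float(2 ** 53 + 1))  # True
    print(float(2 ** 53 - 1) == float(2 ** 53))  # False
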
@@ -635,12 +635,12 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
         self.value_labels = list(zip(np.arange(len(categories)), categories))
         self.value_labels.sort(key=lambda x: x[0])
         self.text_len = 0
-        self.off: List[int] = []
-        self.val: List[int] = []
         self.txt: List[bytes] = []
         self.n = 0
 
         # Compute lengths and setup lists of offsets and labels
+        offsets: List[int] = []
+        values: List[int] = []
         for vl in self.value_labels:
             category = vl[1]
             if not isinstance(category, str):
@@ -650,9 +650,9 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
                     ValueLabelTypeMismatch,
                 )
             category = category.encode(encoding)
-            self.off.append(self.text_len)
+            offsets.append(self.text_len)
             self.text_len += len(category) + 1  # +1 for the padding
-            self.val.append(vl[0])
+            values.append(vl[0])
             self.txt.append(category)
             self.n += 1
 
@@ -663,8 +663,8 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
             )
 
         # Ensure int32
-        self.off = np.array(self.off, dtype=np.int32)
-        self.val = np.array(self.val, dtype=np.int32)
+        self.off = np.array(offsets, dtype=np.int32)
+        self.val = np.array(values, dtype=np.int32)
 
         # Total length
         self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len
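
Taken together, the three hunks above accumulate the offsets and label codes in local offsets / values lists, so self.off and self.val are only ever assigned as int32 arrays. A standalone sketch of the same bookkeeping for a hypothetical label set (the category names are illustrative):

    import numpy as np

    categories = ["low", "medium", "high"]  # hypothetical value labels

    offsets, values, txt, text_len = [], [], [], 0
    for code, category in enumerate(categories):
        encoded = category.encode("latin-1")
        offsets.append(text_len)
        text_len += len(encoded) + 1  # +1 for the padding byte, as above
        values.append(code)
        txt.append(encoded)

    off = np.array(offsets, dtype=np.int32)
    val = np.array(values, dtype=np.int32)
    n = len(categories)

    # Same total-length formula as self.len above: len + n + offsets + values + text.
    print(off, val, 4 + 4 + 4 * n + 4 * n + text_len)  # [ 0  4 11] [0 1 2] 48
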
@@ -876,23 +876,23 @@ def __init__(self):
         # with a label, but the underlying variable is -127 to 100
         # we're going to drop the label and cast to int
         self.DTYPE_MAP = dict(
-            list(zip(range(1, 245), ["a" + str(i) for i in range(1, 245)]))
+            list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)]))
             + [
-                (251, np.int8),
-                (252, np.int16),
-                (253, np.int32),
-                (254, np.float32),
-                (255, np.float64),
+                (251, np.dtype(np.int8)),
+                (252, np.dtype(np.int16)),
+                (253, np.dtype(np.int32)),
+                (254, np.dtype(np.float32)),
+                (255, np.dtype(np.float64)),
             ]
         )
         self.DTYPE_MAP_XML = dict(
             [
-                (32768, np.uint8),  # Keys to GSO
-                (65526, np.float64),
-                (65527, np.float32),
-                (65528, np.int32),
-                (65529, np.int16),
-                (65530, np.int8),
+                (32768, np.dtype(np.uint8)),  # Keys to GSO
+                (65526, np.dtype(np.float64)),
+                (65527, np.dtype(np.float32)),
+                (65528, np.dtype(np.int32)),
+                (65529, np.dtype(np.int16)),
+                (65530, np.dtype(np.int8)),
             ]
         )
         self.TYPE_MAP = list(range(251)) + list("bhlfd")
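
Wrapping every entry in np.dtype means a lookup in DTYPE_MAP or DTYPE_MAP_XML always returns a proper dtype instance rather than a scalar type class or a bare "aN" string, so callers can rely uniformly on attributes such as itemsize. A standalone comparison:

    import numpy as np

    d = np.dtype(np.int8)
    print(d == np.int8)                         # True: they compare equal...
    print(isinstance(np.int8, np.dtype))        # False: ...but the class is not a dtype
    print(isinstance(d, np.dtype), d.itemsize)  # True 1
    print(np.dtype("a5").itemsize)              # 5: "a5" becomes a 5-byte fixed-width dtype
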
@@ -1050,9 +1050,10 @@ def __init__(
         self._order_categoricals = order_categoricals
         self._encoding = ""
         self._chunksize = chunksize
-        if self._chunksize is not None and (
-            not isinstance(chunksize, int) or chunksize <= 0
-        ):
+        self._using_iterator = False
+        if self._chunksize is None:
+            self._chunksize = 1
+        elif not isinstance(chunksize, int) or chunksize <= 0:
             raise ValueError("chunksize must be a positive integer when set.")
 
         # State variables for the file
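
With this change an unspecified chunksize is normalised to 1 inside the reader, and the new _using_iterator flag (set in __next__ below) records whether the reader is actually being iterated; invalid values still raise. A small sketch of the same validation in isolation (normalize_chunksize is a hypothetical helper, not part of the patch):

    from typing import Optional

    def normalize_chunksize(chunksize: Optional[int]) -> int:
        # Mirrors the __init__ logic above: default to 1, reject non-positive values.
        if chunksize is None:
            return 1
        if not isinstance(chunksize, int) or chunksize <= 0:
            raise ValueError("chunksize must be a positive integer when set.")
        return chunksize

    print(normalize_chunksize(None), normalize_chunksize(500))  # 1 500
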
@@ -1062,7 +1063,7 @@ def __init__(
         self._column_selector_set = False
         self._value_labels_read = False
         self._data_read = False
-        self._dtype = None
+        self._dtype: Optional[np.dtype] = None
         self._lines_read = 0
 
         self._native_byteorder = _set_endianness(sys.byteorder)
@@ -1195,7 +1196,7 @@ def _read_new_header(self) -> None:
     # Get data type information, works for versions 117-119.
     def _get_dtypes(
         self, seek_vartypes: int
-    ) -> Tuple[List[Union[int, str]], List[Union[int, np.dtype]]]:
+    ) -> Tuple[List[Union[int, str]], List[Union[str, np.dtype]]]:
 
         self.path_or_buf.seek(seek_vartypes)
         raw_typlist = [
@@ -1519,11 +1520,8 @@ def _read_strls(self) -> None:
             self.GSO[str(v_o)] = decoded_va
 
     def __next__(self) -> DataFrame:
-        if self._chunksize is None:
-            raise ValueError(
-                "chunksize must be set to a positive integer to use as an iterator."
-            )
-        return self.read(nrows=self._chunksize or 1)
+        self._using_iterator = True
+        return self.read(nrows=self._chunksize)
 
     def get_chunk(self, size: Optional[int] = None) -> DataFrame:
         """
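
In practice __next__ is reached by iterating the reader, for example via pd.read_stata with a chunksize; after this change iteration simply reads self._chunksize rows per step (one row if no chunksize was given) instead of raising. A typical usage sketch (the file name is illustrative):

    import pandas as pd

    with pd.read_stata("data.dta", chunksize=10_000) as reader:
        for chunk in reader:       # each step calls __next__ above
            print(chunk.shape)     # placeholder for real per-chunk processing
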
@@ -1692,11 +1690,15 @@ def any_startswith(x: str) -> bool:
             convert = False
             for col in data:
                 dtype = data[col].dtype
-                if dtype in (np.float16, np.float32):
-                    dtype = np.float64
+                if dtype in (np.dtype(np.float16), np.dtype(np.float32)):
+                    dtype = np.dtype(np.float64)
                     convert = True
-                elif dtype in (np.int8, np.int16, np.int32):
-                    dtype = np.int64
+                elif dtype in (
+                    np.dtype(np.int8),
+                    np.dtype(np.int16),
+                    np.dtype(np.int32),
+                ):
+                    dtype = np.dtype(np.int64)
                     convert = True
                 retyped_data.append((col, data[col].astype(dtype)))
             if convert:
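
This hunk only swaps scalar type classes for equivalent np.dtype instances; the behaviour, widening narrow floats to float64 and narrow ints to int64 when preserve_dtypes=False, is unchanged. A standalone sketch of that upcast on a toy frame:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "a": np.array([1, 2], dtype=np.int8),
        "b": np.array([1.5, 2.5], dtype=np.float32),
    })

    # Same widening the reader applies when preserve_dtypes=False.
    upcast = {np.dtype(t): np.dtype(np.int64) for t in (np.int8, np.int16, np.int32)}
    upcast.update({np.dtype(t): np.dtype(np.float64) for t in (np.float16, np.float32)})
    widened = df.astype({c: upcast.get(df[c].dtype, df[c].dtype) for c in df})
    print(widened.dtypes)  # a: int64, b: float64
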
@@ -1807,14 +1809,14 @@ def _do_convert_categoricals(
             keys = np.array(list(vl.keys()))
             column = data[col]
             key_matches = column.isin(keys)
-            if self._chunksize is not None and key_matches.all():
-                initial_categories = keys
+            if self._using_iterator and key_matches.all():
+                initial_categories: Optional[np.ndarray] = keys
                 # If all categories are in the keys and we are iterating,
                 # use the same keys for all chunks. If some are missing
                 # value labels, then we will fall back to the categories
                 # varying across chunks.
             else:
-                if self._chunksize is not None:
+                if self._using_iterator:
                     # warn is using an iterator
                     warnings.warn(
                         categorical_conversion_warning, CategoricalConversionWarning
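
Keying the warning on _using_iterator rather than _chunksize means a reader that was merely constructed with a chunksize but read in one go no longer warns. When the warning does fire (some values lack labels while iterating), the caller can still re-align categories across chunks afterwards; a hedged sketch of that workaround, with a hypothetical file name and column name:

    import pandas as pd
    from pandas.api.types import union_categoricals

    chunks = []
    with pd.read_stata("data.dta", chunksize=10_000) as reader:
        chunks.extend(reader)

    # Give one categorical column ("grade" is illustrative) the same categories everywhere.
    union = union_categoricals([c["grade"] for c in chunks])
    for c in chunks:
        c["grade"] = pd.Categorical(c["grade"], categories=union.categories)
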
@@ -2010,7 +2012,7 @@ def _convert_datetime_to_stata_type(fmt: str) -> np.dtype:
         "ty",
         "%ty",
     ]:
-        return np.float64  # Stata expects doubles for SIFs
+        return np.dtype(np.float64)  # Stata expects doubles for SIFs
     else:
         raise NotImplementedError(f"Format {fmt} not implemented")
 