pandas-dev · mroeschke · Jul 23, 2024 · Jun 11, 2024 · Jul 22, 2024 · Jul 22, 2024
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -49,6 +49,7 @@ Other enhancements
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
+- Support reading Stata 102-format (Stata 1) dta files (PR 58978)
 - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
 
 .. ---------------------------------------------------------------------------

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -91,9 +91,9 @@
 
 _version_error = (
     "Version of given Stata file is {version}. pandas supports importing "
-    "versions 103, 104, 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), "
-    "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16),"
-    "and 119 (Stata 15/16, over 32,767 variables)."
+    "versions 102, 103, 104, 105, 108, 110 (Stata 7), 111 (Stata 7SE),  "
+    "113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), "
+    "118 (Stata 14/15/16), and 119 (Stata 15/16, over 32,767 variables)."
 )
 
 _statafile_processing_params1 = """\
@@ -1352,8 +1352,10 @@ def _get_variable_labels(self) -> list[str]:
     def _get_nobs(self) -> int:
         if self._format_version >= 118:
             return self._read_uint64()
-        else:
+        elif self._format_version >= 103:
             return self._read_uint32()
+        else:
+            return self._read_uint16()
 
     def _get_data_label(self) -> str:
         if self._format_version >= 118:
@@ -1393,9 +1395,24 @@ def _get_seek_variable_labels(self) -> int:
 
     def _read_old_header(self, first_char: bytes) -> None:
         self._format_version = int(first_char[0])
-        if self._format_version not in [103, 104, 105, 108, 110, 111, 113, 114, 115]:
+        if self._format_version not in [
+            102,
+            103,
+            104,
+            105,
+            108,
+            110,
+            111,
+            113,
+            114,
+            115,
+        ]:
             raise ValueError(_version_error.format(version=self._format_version))
         self._set_encoding()
+        # Note 102 format will have a zero in this header position, so support
+        # relies on little-endian being set whenever this value isn't one,
+        # even though for later releases strictly speaking the value should
+        # be either one or two to be valid
         self._byteorder = ">" if self._read_int8() == 0x1 else "<"
         self._filetype = self._read_int8()
         self._path_or_buf.read(1)  # unused

diff --git a/pandas/tests/io/data/stata/stata-compat-102.dta b/pandas/tests/io/data/stata/stata-compat-102.dta
diff --git a/pandas/tests/io/data/stata/stata4_102.dta b/pandas/tests/io/data/stata/stata4_102.dta
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -267,7 +267,7 @@ def test_read_dta4(self, version, datapath):
         # stata doesn't save .category metadata
         tm.assert_frame_equal(parsed, expected)
 
-    @pytest.mark.parametrize("version", [103, 104, 105, 108])
+    @pytest.mark.parametrize("version", [102, 103, 104, 105, 108])
     def test_readold_dta4(self, version, datapath):
         # This test is the same as test_read_dta4 above except that the columns
         # had to be renamed to match the restrictions in older file format
@@ -2058,6 +2058,20 @@ def test_backward_compat_nodateconversion(version, datapath):
     tm.assert_frame_equal(old_dta, expected, check_dtype=False)
 
 
+@pytest.mark.parametrize("version", [102])
+def test_backward_compat_nostring(version, datapath):
+    # The Stata data format prior to 105 did not support a date format
+    # so read the raw values for comparison
+    data_base = datapath("io", "data", "stata")
+    ref = os.path.join(data_base, "stata-compat-118.dta")
+    old = os.path.join(data_base, f"stata-compat-{version}.dta")
+    expected = read_stata(ref, convert_dates=False)
+    # The Stata data format prior to 103 did not support string data
+    expected = expected.drop(columns=["s10"])
+    old_dta = read_stata(old, convert_dates=False)
+    tm.assert_frame_equal(old_dta, expected, check_dtype=False)
+
+
 @pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114, 118])
 def test_bigendian(version, datapath):
     ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta")
@@ -2067,6 +2081,7 @@ def test_bigendian(version, datapath):
     tm.assert_frame_equal(big_dta, expected)
 
 
+# Note: 102 format does not support big-endian byte order
 @pytest.mark.parametrize("version", [103, 104])
 def test_bigendian_nodateconversion(version, datapath):
     # The Stata data format prior to 105 did not support a date format