Skip to content
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ Other enhancements
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
- Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`)
- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)

.. ---------------------------------------------------------------------------
Expand Down
27 changes: 22 additions & 5 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,9 @@

_version_error = (
"Version of given Stata file is {version}. pandas supports importing "
"versions 103, 104, 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), "
"114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16),"
"and 119 (Stata 15/16, over 32,767 variables)."
"versions 102, 103, 104, 105, 108, 110 (Stata 7), 111 (Stata 7SE), "
"113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), "
"118 (Stata 14/15/16), and 119 (Stata 15/16, over 32,767 variables)."
)

_statafile_processing_params1 = """\
Expand Down Expand Up @@ -1352,8 +1352,10 @@ def _get_variable_labels(self) -> list[str]:
def _get_nobs(self) -> int:
if self._format_version >= 118:
return self._read_uint64()
else:
elif self._format_version >= 103:
return self._read_uint32()
else:
return self._read_uint16()

def _get_data_label(self) -> str:
if self._format_version >= 118:
Expand Down Expand Up @@ -1393,9 +1395,24 @@ def _get_seek_variable_labels(self) -> int:

def _read_old_header(self, first_char: bytes) -> None:
self._format_version = int(first_char[0])
if self._format_version not in [103, 104, 105, 108, 110, 111, 113, 114, 115]:
if self._format_version not in [
102,
103,
104,
105,
108,
110,
111,
113,
114,
115,
]:
raise ValueError(_version_error.format(version=self._format_version))
self._set_encoding()
# Note: the 102 format has a zero in this header position, so support
# relies on little-endian being assumed whenever this value isn't one,
# even though, strictly speaking, for later releases the value should
# be either one or two to be valid.
self._byteorder = ">" if self._read_int8() == 0x1 else "<"
self._filetype = self._read_int8()
self._path_or_buf.read(1) # unused
Expand Down
Binary file added pandas/tests/io/data/stata/stata-compat-102.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata4_102.dta
Binary file not shown.
17 changes: 16 additions & 1 deletion pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def test_read_dta4(self, version, datapath):
# stata doesn't save .category metadata
tm.assert_frame_equal(parsed, expected)

@pytest.mark.parametrize("version", [103, 104, 105, 108])
@pytest.mark.parametrize("version", [102, 103, 104, 105, 108])
def test_readold_dta4(self, version, datapath):
# This test is the same as test_read_dta4 above except that the columns
# had to be renamed to match the restrictions in older file format
Expand Down Expand Up @@ -2058,6 +2058,20 @@ def test_backward_compat_nodateconversion(version, datapath):
tm.assert_frame_equal(old_dta, expected, check_dtype=False)


@pytest.mark.parametrize("version", [102])
def test_backward_compat_nostring(version, datapath):
# The Stata data format prior to 105 did not support a date format
# so read the raw values for comparison
data_base = datapath("io", "data", "stata")
ref = os.path.join(data_base, "stata-compat-118.dta")
old = os.path.join(data_base, f"stata-compat-{version}.dta")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you make these 2 datapath as well?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have now made this change. Note that the equivalent tests for other format versions don't use datapath for these either, but I haven't changed them, to keep this pull request restricted to just 102 format related changes.

expected = read_stata(ref, convert_dates=False)
# The Stata data format prior to 103 did not support string data
expected = expected.drop(columns=["s10"])
old_dta = read_stata(old, convert_dates=False)
tm.assert_frame_equal(old_dta, expected, check_dtype=False)


@pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114, 118])
def test_bigendian(version, datapath):
ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta")
Expand All @@ -2067,6 +2081,7 @@ def test_bigendian(version, datapath):
tm.assert_frame_equal(big_dta, expected)


# Note: 102 format does not support big-endian byte order
@pytest.mark.parametrize("version", [103, 104])
def test_bigendian_nodateconversion(version, datapath):
# The Stata data format prior to 105 did not support a date format
Expand Down