From 796ce59da2231be5e4b424070cc3b84f1c4d2fb7 Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Thu, 25 Jul 2024 11:50:39 +0100 Subject: [PATCH 1/3] BUG: Integer values at the top end of the supported range incorrectly interpreted as missing for format versions 111 and prior --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/stata.py | 33 +++++-- pandas/tests/io/data/stata/stata1_108.dta | Bin 0 -> 703 bytes pandas/tests/io/data/stata/stata1_110.dta | Bin 0 -> 945 bytes pandas/tests/io/data/stata/stata1_111.dta | Bin 0 -> 945 bytes pandas/tests/io/data/stata/stata1_113.dta | Bin 0 -> 945 bytes pandas/tests/io/data/stata/stata1_115.dta | Bin 0 -> 1130 bytes pandas/tests/io/data/stata/stata1_118.dta | Bin 0 -> 3774 bytes pandas/tests/io/data/stata/stata1_119.dta | Bin 0 -> 3788 bytes pandas/tests/io/data/stata/stata8_108.dta | Bin 0 -> 703 bytes pandas/tests/io/data/stata/stata8_110.dta | Bin 0 -> 945 bytes pandas/tests/io/data/stata/stata8_111.dta | Bin 0 -> 945 bytes .../data/stata/stata_int_validranges_102.dta | Bin 0 -> 238 bytes .../data/stata/stata_int_validranges_103.dta | Bin 0 -> 240 bytes .../data/stata/stata_int_validranges_104.dta | Bin 0 -> 238 bytes .../data/stata/stata_int_validranges_105.dta | Bin 0 -> 274 bytes .../data/stata/stata_int_validranges_108.dta | Bin 0 -> 470 bytes .../data/stata/stata_int_validranges_110.dta | Bin 0 -> 616 bytes .../data/stata/stata_int_validranges_111.dta | Bin 0 -> 616 bytes .../data/stata/stata_int_validranges_113.dta | Bin 0 -> 616 bytes .../data/stata/stata_int_validranges_114.dta | Bin 0 -> 727 bytes .../data/stata/stata_int_validranges_115.dta | Bin 0 -> 727 bytes .../data/stata/stata_int_validranges_117.dta | Bin 0 -> 1174 bytes .../data/stata/stata_int_validranges_118.dta | Bin 0 -> 2499 bytes .../data/stata/stata_int_validranges_119.dta | Bin 0 -> 2509 bytes pandas/tests/io/test_stata.py | 83 +++++++++++++++++- 26 files changed, 109 insertions(+), 8 deletions(-) create mode 100644 pandas/tests/io/data/stata/stata1_108.dta create mode 100644 pandas/tests/io/data/stata/stata1_110.dta create mode 100644 pandas/tests/io/data/stata/stata1_111.dta create mode 100644 pandas/tests/io/data/stata/stata1_113.dta create mode 100644 pandas/tests/io/data/stata/stata1_115.dta create mode 100644 pandas/tests/io/data/stata/stata1_118.dta create mode 100644 pandas/tests/io/data/stata/stata1_119.dta create mode 100644 pandas/tests/io/data/stata/stata8_108.dta create mode 100644 pandas/tests/io/data/stata/stata8_110.dta create mode 100644 pandas/tests/io/data/stata/stata8_111.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_102.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_103.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_104.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_105.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_108.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_110.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_111.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_113.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_114.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_115.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_117.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_118.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_119.dta diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 5d89613bd3d4f..e71220102cbb4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -583,6 +583,7 @@ I/O - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) +- Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`) Period ^^^^^^ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index dd92b1bbfdba0..6d8c6d239fce6 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -983,6 +983,19 @@ def __init__(self) -> None: np.float64(struct.unpack(" DataFra replacements = {} for i in range(len(data.columns)): fmt = self._typlist[i] - if fmt not in self.VALID_RANGE: - continue + if self._format_version <= 111: + if fmt not in self.OLD_VALID_RANGE: + continue - fmt = cast(str, fmt) # only strs in VALID_RANGE - nmin, nmax = self.VALID_RANGE[fmt] + fmt = cast(str, fmt) # only strs in OLD_VALID_RANGE + nmin, nmax = self.OLD_VALID_RANGE[fmt] + else: + if fmt not in self.VALID_RANGE: + continue + + fmt = cast(str, fmt) # only strs in VALID_RANGE + nmin, nmax = self.VALID_RANGE[fmt] series = data.iloc[:, i] # appreciably faster to do this with ndarray instead of Series @@ -1827,7 +1847,10 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra umissing, umissing_loc = np.unique(series[missing], return_inverse=True) replacement = Series(series, dtype=object) for j, um in enumerate(umissing): - missing_value = StataMissingValue(um) + if self._format_version <= 111: + missing_value = StataMissingValue(self.MISSING_VALUES[fmt]) + else: + missing_value = StataMissingValue(um) loc = missing_loc[umissing_loc == j] replacement.iloc[loc] = missing_value diff --git a/pandas/tests/io/data/stata/stata1_108.dta b/pandas/tests/io/data/stata/stata1_108.dta new file mode 100644 index 0000000000000000000000000000000000000000..6c948b44905899da630faf406fe6b41ac683e434 GIT binary patch literal 703 zcmc~{Vq{=tU}T^HFf>tcEKOH1GB7k&Ff_3;v1CX~Ny^Mgi_gt0E(Y3{0%Ih>7@06e z4vc~9ELBTAgLJ5Xs-Xd#qiO+XGN7;xjSv#Z292_ZYzWjtT>hZG{(t@d|NrX&c9|dv literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata1_110.dta b/pandas/tests/io/data/stata/stata1_110.dta new file mode 100644 index 0000000000000000000000000000000000000000..c9e2ca72dbd4e4ac7fea459d0125fec04824c841 GIT binary patch literal 945 zcmc~}Vq{=tU}T^HFf>tcEKOH1GB7k&Ff_3;v1CX~Ny^Mg%gIkHiOt%lmyX8*REMcr>KUX%g;Wg<;2c#8IFkW|ZD@p$Ks9QV XHyQ#YhCn?;{e$}Y|Mma>|E~uCWKkzd literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata1_111.dta b/pandas/tests/io/data/stata/stata1_111.dta new file mode 100644 index 0000000000000000000000000000000000000000..21370d302745819ee36866fcc9c7d2a13e210d15 GIT binary patch literal 945 zcmd02Vq{=tU}T^HFf>tcEKOH1GB7k&Ff_3;v1Iu7|M#E2X*v0cCGokL#l<-6Ovx`z z%1OneBB`5sDDsj|G)nK|Nr#>`hO}r literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata1_113.dta b/pandas/tests/io/data/stata/stata1_113.dta new file mode 100644 index 0000000000000000000000000000000000000000..6fcf55f0406e99492a718976080d00f98ec99dd6 GIT binary patch literal 945 zcmXS9Vq{=tU}T^HFf>tcEKOH1GB7k&Ff_3;v1Iu7|M#E2X*v0cCGokL#l<-6Ovx`z z%1OneBB`5sDDtO`n3M(|Nr#>05&P_ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata1_115.dta b/pandas/tests/io/data/stata/stata1_115.dta new file mode 100644 index 0000000000000000000000000000000000000000..2e5258da49c3c90f7ade257ec2318fb63d0ab0a8 GIT binary patch literal 1130 zcmXSBVq{=tU}T^HFf>tcEKOH1GB7k&Ff_3;v1Iu7|M#E2X*v0cCGokL#l<-6Ovx`z z%1OneBB`Pbf{X8rk;Ux4TeS} cT7ln-qq3tRKp+I_A@UFEQ=is9{r|rn0D!D1@c;k- literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata1_118.dta b/pandas/tests/io/data/stata/stata1_118.dta new file mode 100644 index 0000000000000000000000000000000000000000..26d7beccb745c58247d09744af47435900ea0d5d GIT binary patch literal 3774 zcmeHKJx;?g6gEN?8xkX1m#(-e9a{8c34vIOx^RHVZQM4J(^R&L)F~S`VC4`TfQuj@ zmR^Cs#l*4Ol(Iz?kx(+^%k%T|=X`ngi$N-?s2?awV8Up?1OdS~qmmKN>i}LEFjy!S z3)A8H<_d6bLXXrC;2wcK*+cdM+$X^4fN_FSpM3%qMl9uvX5`3g*U#tUdaLQ3*1h&w zy9+q?0$xoeqO)`Wdp$Tjb|jY$U-qp-JH$0>KOb1UIJ7u&c<1om;iJPRhky+$+)@$J z0cXBi%$Owa`b~f7PyL|*Z@Em@DNSFEjCf3yAB9p{%0Isc@!U+gycw9ZD3Knaa9ZjI zK#0fFaj6Pq`DybJWGoa9rCKi{laYYe%`l2ZM3p3s?n!f8BwZU`vsh%KQ!EY@5A<5a zC;_izN|!pHGe6tUe5-L0(W;nAft@N~*8Fy2<`pbUSNUTP*eQE8SfxOvK&3#X!2efZ zWH;S2Q-7@nd F>o+7w&29hy literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata1_119.dta b/pandas/tests/io/data/stata/stata1_119.dta new file mode 100644 index 0000000000000000000000000000000000000000..284daa78bf6db8ba8e3d357cd425d5bd58b73fbe GIT binary patch literal 3788 zcmeHKy-ve05Do&w1QIh#m#(-e9Vj|1v2-ayLOej^Hf|fqX)4=A>Xet@0g!kR9)O4F z(t#J?ZwYqnHdPQ(6{$o*@sQ7F-}x?I&b}CgB$9Z(Bp61N_>^Pdlu;rmcHK6hr2xH| zq!HH&*LNL2nFw9H?>G*iE`}Stqq@2Qy1~Fmk20(U(*qfVRCx?1_}FdLuBO9Uv*DiA z+}1@)`O0PiErSWkL@i+2qs0^Jq+@ZuYbFl&3|{JZRC_%zcza|}T70nhWbxVJi$y@% zE8G(vkRGF+oK2{}@5x*8m3$^23837RRvi=dYIwjRBE2vW!npkNryosq$mK<#6Qe}; zjuVVa{Q!=`qVcd)2GaaA`3NH7k_AGpHc>t!0j*SE5b=;mf$Qzl#xSqAiClLZ#VqUR z#q6BohHkT%OF&DS)V60+>ZNC!XZYTPq)Mk^V5fdTy(@zs@6*PqysFpudXUL;8jD4hWzIn#y~ N*NT}Q&579s_8Sim%=Q2P literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata8_108.dta b/pandas/tests/io/data/stata/stata8_108.dta new file mode 100644 index 0000000000000000000000000000000000000000..962f7f4331fb34331f381d9c62d336cc30d11a90 GIT binary patch literal 703 zcmc~{Vq{=tU}T^HFf>|%B3`vwpYHGY|{NFg5}) y(sJ?>!Avm2%mgH+#snn5K-E&uAe~G$RNE*ultZBYfBpae|LcKX0AV0_P!9le5FZQx literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata8_110.dta b/pandas/tests/io/data/stata/stata8_110.dta new file mode 100644 index 0000000000000000000000000000000000000000..a7fe9a3b7e639c4fba3d4746e7f0844e8c185070 GIT binary patch literal 945 zcmc~}Vq{=tU}T^HFf>|%B3`vh#v|NsBj19gHh5Im>{ E0J3Wz4gdfE literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata8_111.dta b/pandas/tests/io/data/stata/stata8_111.dta new file mode 100644 index 0000000000000000000000000000000000000000..cb96ac0e0f5d3700cf104804fc31858539cd9376 GIT binary patch literal 945 zcmd02Vq{=tU}T^HFf>|%B48Q;U{r5jJuf!rApLIY5hGzIw02LS; z;Zu;7lb?uJ30Q%d2|hE_m|$*HwbV06hjK_|!^{|^M?-*&5UBrO|NsC0dZ11a27(9m E07!`;{{R30 literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata_int_validranges_102.dta b/pandas/tests/io/data/stata/stata_int_validranges_102.dta new file mode 100644 index 0000000000000000000000000000000000000000..69de2e2f7f91d930ef6796a70410dfad29b2f991 GIT binary patch literal 238 zcmYdeU}RusU}EshD@jdHEmFwI%*`w*R?sjsFj7d%FUn0U(PRLD%*>pm%92zFJFf&x sGvwswr9)UyQq@Aw04M^Z42{5C6eV=!H2i0109x2k$MCQI-~a#h0M@V@4FCWD literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata_int_validranges_103.dta b/pandas/tests/io/data/stata/stata_int_validranges_103.dta new file mode 100644 index 0000000000000000000000000000000000000000..71f03873808e228f3ed7ba48d5194d2a19713d7d GIT binary patch literal 240 zcmYdiVq{=uU}9ik@XRYoO;0US$jQvjEGbsdFf=e$NXswEO)Sv_$}%uyX67VSmZU=1 wc_m<)AtygC9m0Z=sup?%KoJ;aXawe>D4{E-;XgwI(87i~hJW?{{{OEB0NX4Z5dZ)H literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata_int_validranges_104.dta b/pandas/tests/io/data/stata/stata_int_validranges_104.dta new file mode 100644 index 0000000000000000000000000000000000000000..f6dff2a6b42d9aff6e6974078cca14d1575defed GIT binary patch literal 238 zcmc~`Vq{=uU}9ik@XRYoO;0US$jQvjEGbsdFf=exNXswEO)Sv_$}%t{W#%MRmZXB& vnRz8(njt4YFCD^ylByPZ20#%QWoQKEq9~y&r-7jXXkSC!zxsdw|JMTmpXnM6 literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata_int_validranges_105.dta b/pandas/tests/io/data/stata/stata_int_validranges_105.dta new file mode 100644 index 0000000000000000000000000000000000000000..d0a7ad0f01d16a37c331897ccbe4a79bd6bc3d17 GIT binary patch literal 274 zcmc~~Vq{=uU}9ik@XRYoO;0US$jQvjEGbsdFf=e#NXswEO)Sv_%3=danK?<7C8=Pk t%)AnalAQd!bQHy^7J3FS5frwe5kdk*2faBB3=Kdl8|wbm|NH;H9sra<8V>*f literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata_int_validranges_108.dta b/pandas/tests/io/data/stata/stata_int_validranges_108.dta new file mode 100644 index 0000000000000000000000000000000000000000..47b715bce21efa225230a841b0ce84994462dcfb GIT binary patch literal 470 zcmc~{Vq{=uU}9ik@XRYoO;0US$jQvjEGbsdFf_1GNXswEO)Sv_%2EX+W#%MRmZXBs u%FHW)n3JVIuRKFeeqK6$6;N4K3q1pvN))!C5ki6}lSUPh5dsYi4M1HDb^q%B{r_JN0NXPf A5C8xG literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata_int_validranges_111.dta b/pandas/tests/io/data/stata/stata_int_validranges_111.dta new file mode 100644 index 0000000000000000000000000000000000000000..07052d824f13229f45844e63fe408a780775ef67 GIT binary patch literal 616 zcmd02Vq{=uU}9ik@XRYoO;0US$jQvjEGbsdFf=q&NXswEO)Sv_%2EaV{_{7fvLqF+ zS($kyc;y*#^7GR1tANU?TIdL6pNs6_F7Fjf@S93=9n^PwJoi|6dOP D-$EOT literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata_int_validranges_115.dta b/pandas/tests/io/data/stata/stata_int_validranges_115.dta new file mode 100644 index 0000000000000000000000000000000000000000..80e1dc8670b38cdd5cf0ae1e58a2139e475dd75b GIT binary patch literal 727 zcmXSBVq{=uU}9ik@XRYoO;0US$jQvjEGbsdFf=q(NXswEO)Sv_%2EaV{_{7fvLqF+ zS($kyc;y*#^7GR1tANU?TId-Nr;=<9hDIb>L6pNs6_F7Fjf@S93=9n^PwJoi|6dOP DQD&}dH&zue$OwqK^ty(oEVNUB%Y8$P$Y?K!gl)-*dl=5*if!q!TrN6 zuquQOwi>`X2wmK3G;FtvAbF1@9^Pe!2Bab>Qekv-Vwa4ba+Pv(3e4Al5h+;}=?LxA zDgd_7lX5#0+DE{nrtjPY{0#UNaO#xUcF^(|?R&fq_!uw^_!jUzATY-YFI)-UlN6ir zh&28dKgD+etd@3V8MnJY=ZYs<^A(v{GZNA=&juSSN_Ni%I#)&tZI<=`Tc*JKtnv8j z?0oZ*HNNd9v@f=!0Jackt&`6bmq#J?yQ-<#Md&=N(^mhw6r-1(?n1ynU32AR2NR$_ xt2a$%lUY$rfR#zmMoBj(;EL-mTMMhFXby%#yFLH_ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata_int_validranges_118.dta b/pandas/tests/io/data/stata/stata_int_validranges_118.dta new file mode 100644 index 0000000000000000000000000000000000000000..4bbd823bff63ef38509cf1511ffe2787ef60c2a5 GIT binary patch literal 2499 zcmeHJy-ve05N@d$cm|eA2niNUm^z$pM1n#W9w2fX(^_(p%07@fwamN$Prw^6G4Up- z4*@K}j@_mzTbBqa9`fbRKX?A_Y#$in#N&iG!bI>yXap^k;6`8+^?-SRY}4doAC94Gvr#RopL$_<_7`ER9IKi zDcYqf0ocl^l-sV*UIQMr{m*s4&tSd?_}=mT<9&~}2Og&Zp98)Gd<*yy5SX(Hk6cSW zR3fI?R2cjzzKgE{*p}K2%edVII%>S3n!Yllnk)yiD$mBXdQe8kM*~c*Nl8QN^Z~Y3 zfzLyO$Gx-jom}31Vsx>aNnk6dZOW+!5tm0O_Pec>^~Shkz~$a{SgJdu43=5vVnbQO zK*PYlWMHEL2!|R*6R!D)gtpI(92o@mN0Zoc3!mBxTe2*Y<$PHbb6}+s3~A-&1l+hq KoOVmB>b76MbG>8$ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata_int_validranges_119.dta b/pandas/tests/io/data/stata/stata_int_validranges_119.dta new file mode 100644 index 0000000000000000000000000000000000000000..6bd9bbde1d22d5d560540682de2df35bc79df337 GIT binary patch literal 2509 zcmeHJu};G<5KTc0d;?3DO0Zzc1k&k7B-E-417qYiM;pmWifxd}(()1f0TX}0#Kd3p zBY-8?vC~xTmL*F(FEnM+P8iIAYZyz3e9oHFpTzm7!mXT>+9v_dKe1Gil#p9dDcaNVQzdZu8R^ge; zn2!a8dNC)3fBA3zlLOmP+i58`yFlw4XQ-iXoa%-wd$S@-r;RcwqT`|gDwA5oN^jDq zWCmpIVR4i~x9f6`EjH!>n}fSysqT^@SZXgP z8)MrVZavU?ppFN&Dgl3}VKU>APc%vM?8Fm=!1gqeDZKEtvofX2Dq1hsdA>7_-w}{p5h*bsm51<~r$p8QV literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 6d6f222fc0660..fb7182fdefb32 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -120,9 +120,11 @@ def test_read_index_col_none(self, version, temp_file): expected["a"] = expected["a"].astype(np.int32) tm.assert_frame_equal(read_df, expected, check_index_type=True) - @pytest.mark.parametrize("file", ["stata1_114", "stata1_117"]) - def test_read_dta1(self, file, datapath): - file = datapath("io", "data", "stata", f"{file}.dta") + # Note this test starts at format version 108 as the missing code for double + # was different prior to this (see GH 58149) and would therefore fail + @pytest.mark.parametrize("version", [108, 110, 111, 113, 114, 115, 117, 118, 119]) + def test_read_dta1(self, version, datapath): + file = datapath("io", "data", "stata", f"stata1_{version}.dta") parsed = self.read_dta(file) # Pandas uses np.nan as missing value. @@ -136,6 +138,18 @@ def test_read_dta1(self, file, datapath): # the casting doesn't fail so need to match stata here expected["float_miss"] = expected["float_miss"].astype(np.float32) + # Column names too long for older Stata formats + if version <= 108: + expected = expected.rename( + columns={ + "float_miss": "f_miss", + "double_miss": "d_miss", + "byte_miss": "b_miss", + "int_miss": "i_miss", + "long_miss": "l_miss", + } + ) + tm.assert_frame_equal(parsed, expected) def test_read_dta2(self, datapath): @@ -920,6 +934,23 @@ def test_missing_value_conversion(self, file, datapath): ) tm.assert_frame_equal(parsed, expected) + # Note this test starts at format version 108 as the missing code for double + # was different prior to this (see GH 58149) and would therefore fail + @pytest.mark.parametrize("file", ["stata8_108", "stata8_110", "stata8_111"]) + def test_missing_value_conversion_compat(self, file, datapath): + columns = ["int8_", "int16_", "int32_", "float32_", "float64_"] + smv = StataMissingValue(101) + keys = sorted(smv.MISSING_VALUES.keys()) + data = [] + row = [StataMissingValue(keys[j * 27]) for j in range(5)] + data.append(row) + expected = DataFrame(data, columns=columns) + + parsed = read_stata( + datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True + ) + tm.assert_frame_equal(parsed, expected) + def test_big_dates(self, datapath, temp_file): yr = [1960, 2000, 9999, 100, 2262, 1677] mo = [1, 1, 12, 1, 4, 9] @@ -2035,6 +2066,52 @@ def test_read_write_ea_dtypes(self, dtype_backend, temp_file, tmp_path): tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) + @pytest.mark.parametrize("version", [113, 114, 115, 117, 118, 119]) + def test_read_data_int_validranges(self, version, datapath): + expected = DataFrame( + { + "byte": np.array([-127, 100], dtype=np.int8), + "int": np.array([-32767, 32740], dtype=np.int16), + "long": np.array([-2147483647, 2147483620], dtype=np.int32), + } + ) + + parsed = read_stata( + datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta") + ) + tm.assert_frame_equal(parsed, expected) + + @pytest.mark.parametrize("version", [104, 105, 108, 110, 111]) + def test_read_data_int_validranges_compat(self, version, datapath): + expected = DataFrame( + { + "byte": np.array([-128, 126], dtype=np.int8), + "int": np.array([-32768, 32766], dtype=np.int16), + "long": np.array([-2147483648, 2147483646], dtype=np.int32), + } + ) + + parsed = read_stata( + datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta") + ) + tm.assert_frame_equal(parsed, expected) + + # The byte type was not supported prior to the 104 format + @pytest.mark.parametrize("version", [102, 103]) + def test_read_data_int_validranges_compat_nobyte(self, version, datapath): + expected = DataFrame( + { + "byte": np.array([-128, 126], dtype=np.int16), + "int": np.array([-32768, 32766], dtype=np.int16), + "long": np.array([-2147483648, 2147483646], dtype=np.int32), + } + ) + + parsed = read_stata( + datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta") + ) + tm.assert_frame_equal(parsed, expected) + @pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114]) def test_backward_compat(version, datapath): From 485e7ca2658893fdcc1311f8e9a874f43a0c7f37 Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Thu, 25 Jul 2024 13:04:39 +0100 Subject: [PATCH 2/3] StataMissingValue expects value passed in to be of float type, so cast to this --- pandas/io/stata.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 6d8c6d239fce6..2ab67c3559912 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1848,7 +1848,9 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra replacement = Series(series, dtype=object) for j, um in enumerate(umissing): if self._format_version <= 111: - missing_value = StataMissingValue(self.MISSING_VALUES[fmt]) + missing_value = StataMissingValue( + float(self.MISSING_VALUES[fmt]) + ) else: missing_value = StataMissingValue(um) From ed30c2d886ed909fd1e479d152eac9bf9af30379 Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Thu, 25 Jul 2024 13:57:59 +0100 Subject: [PATCH 3/3] Add type hint to StataParser.MISSING_VALUES to avoid mypy error when constructing StataMissingValue from value --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 2ab67c3559912..03c15d0ab07bb 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1007,7 +1007,7 @@ def __init__(self) -> None: # These missing values are the generic '.' in Stata, and are used # to replace nans - self.MISSING_VALUES = { + self.MISSING_VALUES: dict[str, int | np.float32 | np.float64] = { "b": 101, "h": 32741, "l": 2147483621,