3
3
import inspect
4
4
from io import BufferedIOBase , BytesIO , RawIOBase
5
5
import os
6
+ from pathlib import Path
6
7
from textwrap import fill
7
- from typing import Any , Dict , Mapping , Union , cast
8
+ from typing import Any , Dict , Mapping , Union , cast , BinaryIO
8
9
import warnings
10
+ from zipfile import ZipFile
9
11
10
12
from pandas ._config import config
11
13
@@ -888,32 +890,73 @@ def close(self):
888
890
return content
889
891
890
892
891
- def _is_ods_stream (stream : Union [BufferedIOBase , RawIOBase ] ) -> bool :
893
def _peek(
    stream: Union[BufferedIOBase, RawIOBase, BinaryIO], size: int = 20
) -> bytes:
    """
    Return the specified number of bytes from the start of the stream
    and seek back to the start of the stream afterwards.

    Parameters
    ----------
    stream : Union[BufferedIOBase, RawIOBase, BinaryIO]
        Seekable binary stream to inspect. Its current position is
        discarded: the stream is always left positioned at offset 0.
    size : int, default 20
        Maximum number of bytes to read from the start of the stream.

    Returns
    -------
    content : bytes
        The bytes found. May be shorter than ``size`` (or empty) when the
        stream holds less data.
    """
    stream.seek(0)
    # A raw stream in non-blocking mode may hand back None instead of bytes
    # when no data is available; normalise that to an empty result so callers
    # can always rely on getting a bytes object.
    content = stream.read(size) or b""
    stream.seek(0)
    return content
911
+
912
+
913
# Magic numbers expected at the very start of a spreadsheet container.
# Eight-byte header of an OLE2 compound document; content starting with it
# is routed to the "xlrd" engine.
_XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
# Four-byte ZIP local file header; zip-based workbooks are told apart
# afterwards by the member names inside the archive.
_ZIP_SIGNATURE = b"PK\x03\x04"
# Number of leading bytes needed to test against any of the signatures above.
_PEEK_SIZE = max(len(_XLS_SIGNATURE), len(_ZIP_SIGNATURE))
916
+
917
+
918
def _engine_from_content(
    stream: Union[BufferedIOBase, RawIOBase, BinaryIO]
) -> Union[str, None]:
    """
    Use the content of a stream to try and figure out which engine to use.

    It uses magic values inside the stream and, for zip containers, the
    names of the archive members.

    Parameters
    ----------
    stream : Union[BufferedIOBase, RawIOBase, BinaryIO]
        Seekable IO stream with data which might contain spreadsheet data.
        The stream is left positioned at its start.

    Returns
    -------
    engine : Optional[str]
        The name of the engine if it can be confidently inferred, otherwise
        None.
    """
    peek = _peek(stream, _PEEK_SIZE)

    if peek.startswith(_XLS_SIGNATURE):
        return "xlrd"

    if not peek.startswith(_ZIP_SIGNATURE):
        return None

    try:
        # ZipFile does not close a file object it was handed, so leaving the
        # ``with`` block only releases the archive's own bookkeeping.
        with ZipFile(stream) as zf:
            # Workaround for some third party files that use backslashes as
            # path separators and/or unexpected letter case: compare on
            # normalised, lower-cased member names.
            component_names = {
                name.replace("\\", "/").lower() for name in zf.namelist()
            }
    finally:
        # Rewind even when the archive directory could not be read, so the
        # caller always gets the stream back at its start.
        stream.seek(0)

    # Checked from highest to lowest precedence: when an archive contains
    # several of these markers, "odf" wins over "pyxlsb", which wins over
    # "openpyxl".
    if "content.xml" in component_names:
        return "odf"
    if "xl/workbook.bin" in component_names:
        return "pyxlsb"
    if "xl/workbook.xml" in component_names:
        return "openpyxl"
    return None
917
960
918
961
919
962
class ExcelFile :
@@ -970,21 +1013,39 @@ class ExcelFile:
970
1013
"pyxlsb" : PyxlsbReader ,
971
1014
}
972
1015
1016
+ _ext_to_engine : Mapping [str , str ] = {
1017
+ ".ods" : "odf" ,
1018
+ ".xls" : "xlrd" ,
1019
+ ".xlsx" : "openpyxl" ,
1020
+ }
1021
+
973
1022
def __init__ (
974
1023
self , path_or_buffer , engine = None , storage_options : StorageOptions = None
975
1024
):
976
1025
if engine is None :
977
- # Determine ext and use odf for ods stream/file
1026
+
1027
+ ext = peek = None
1028
+
1029
+ if isinstance (path_or_buffer , bytes ):
1030
+ path_or_buffer = BytesIO (path_or_buffer )
1031
+
978
1032
if isinstance (path_or_buffer , (BufferedIOBase , RawIOBase )):
979
- ext = None
980
- if _is_ods_stream (path_or_buffer ):
981
- engine = "odf"
982
- else :
1033
+ engine = _engine_from_content ( path_or_buffer )
1034
+ peek = _peek (path_or_buffer )
1035
+
1036
+ elif isinstance ( path_or_buffer , ( str , os . PathLike )) :
983
1037
ext = os .path .splitext (str (path_or_buffer ))[- 1 ]
984
- if ext == ".ods" :
985
- engine = "odf"
1038
+ handles = get_handle (
1039
+ stringify_path (path_or_buffer ),
1040
+ "rb" ,
1041
+ storage_options = storage_options ,
1042
+ is_text = False ,
1043
+ )
1044
+ with handles :
1045
+ engine = _engine_from_content (handles .handle )
1046
+ peek = _peek (handles .handle )
986
1047
987
- if (
1048
+ elif (
988
1049
import_optional_dependency (
989
1050
"xlrd" , raise_on_missing = False , on_version = "ignore"
990
1051
)
@@ -995,38 +1056,16 @@ def __init__(
995
1056
if isinstance (path_or_buffer , Book ):
996
1057
engine = "xlrd"
997
1058
998
- # GH 35029 - Prefer openpyxl except for xls files
1059
+ # Couldn't tell for definite, so guess based on extension:
999
1060
if engine is None :
1000
- if ext is None or isinstance (path_or_buffer , bytes ) or ext == ".xls" :
1001
- engine = "xlrd"
1002
- elif (
1003
- import_optional_dependency (
1004
- "openpyxl" , raise_on_missing = False , on_version = "ignore"
1005
- )
1006
- is not None
1007
- ):
1008
- engine = "openpyxl"
1009
- else :
1010
- caller = inspect .stack ()[1 ]
1011
- if (
1012
- caller .filename .endswith ("pandas/io/excel/_base.py" )
1013
- and caller .function == "read_excel"
1014
- ):
1015
- stacklevel = 4
1016
- else :
1017
- stacklevel = 2
1018
- warnings .warn (
1019
- "The xlrd engine is no longer maintained and is not "
1020
- "supported when using pandas with python >= 3.9. However, "
1021
- "the engine xlrd will continue to be allowed for the "
1022
- "indefinite future. Beginning with pandas 1.2.0, the "
1023
- "openpyxl engine will be used if it is installed and the "
1024
- "engine argument is not specified. Either install openpyxl "
1025
- "or specify engine='xlrd' to silence this warning." ,
1026
- FutureWarning ,
1027
- stacklevel = stacklevel ,
1028
- )
1029
- engine = "xlrd"
1061
+ engine = self ._ext_to_engine .get (ext )
1062
+
1063
+ if engine is None :
1064
+ raise ValueError (
1065
+ f"Could not find engine for { path_or_buffer !r} , content was "
1066
+ f"{ peek !r} "
1067
+ )
1068
+
1030
1069
if engine not in self ._engines :
1031
1070
raise ValueError (f"Unknown engine: { engine } " )
1032
1071
0 commit comments