Skip to content

Commit 77e1238

Browse files
committed
Re-work engine guessing in ExcelFile / read_excel.
This uses more reliable content introspection that results in better engine selection. It also removes situations where xlrd gets handed a file it no longer supports.
1 parent 122d502 commit 77e1238

File tree

3 files changed

+116
-57
lines changed

3 files changed

+116
-57
lines changed

pandas/io/excel/_base.py

+94-55
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@
33
import inspect
44
from io import BufferedIOBase, BytesIO, RawIOBase
55
import os
6+
from pathlib import Path
67
from textwrap import fill
7-
from typing import Any, Dict, Mapping, Union, cast
8+
from typing import Any, Dict, Mapping, Union, cast, BinaryIO
89
import warnings
10+
from zipfile import ZipFile
911

1012
from pandas._config import config
1113

@@ -888,32 +890,73 @@ def close(self):
888890
return content
889891

890892

891-
def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool:
893+
def _peek(stream: Union[BufferedIOBase, RawIOBase, BinaryIO], size: int = 20) -> bytes:
892894
"""
893-
Check if the stream is an OpenDocument Spreadsheet (.ods) file
894-
895-
It uses magic values inside the stream
895+
Return the specified number of bytes from the start of the stream
896+
and seek back to the start of the stream afterwards.
896897
897898
Parameters
898899
----------
899900
stream : Union[BufferedIOBase, RawIOBase]
900-
IO stream with data which might be an ODS file
901901
902902
Returns
903903
-------
904-
is_ods : bool
905-
Boolean indication that this is indeed an ODS file or not
904+
content : bytes
905+
The bytes found.
906906
"""
907907
stream.seek(0)
908-
is_ods = False
909-
if stream.read(4) == b"PK\003\004":
910-
stream.seek(30)
911-
is_ods = (
912-
stream.read(54) == b"mimetype"
913-
b"application/vnd.oasis.opendocument.spreadsheet"
914-
)
908+
content = stream.read(size)
915909
stream.seek(0)
916-
return is_ods
910+
return content
911+
912+
913+
# Magic bytes at the start of an OLE2 compound document, the container
# used by legacy .xls workbooks.
_XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"

# Local-file-header magic at the start of a zip archive; the zip-based
# spreadsheet formats are told apart later by their member names.
_ZIP_SIGNATURE = b"PK\x03\x04"

# Number of leading bytes to read so that the longest signature fits.
_PEEK_SIZE = max(map(len, (_XLS_SIGNATURE, _ZIP_SIGNATURE)))
916+
917+
918+
def _engine_from_content(
    stream: Union[BufferedIOBase, RawIOBase, BinaryIO]
) -> Union[str, None]:
    """
    Use the content of a stream to try and figure out which engine to use.

    It uses magic values inside the stream. For zip containers the member
    names are inspected to tell the spreadsheet formats apart. The stream
    is rewound to its start before returning.

    Parameters
    ----------
    stream : Union[BufferedIOBase, RawIOBase, BinaryIO]
        IO stream with data which might contain spreadsheet data.

    Returns
    -------
    engine : Optional[str]
        The name of the engine if it can be confidently inferred,
        otherwise None.
    """
    # Local import: only ZipFile is imported at module level.
    from zipfile import BadZipFile

    peek = _peek(stream, _PEEK_SIZE)

    if peek.startswith(_XLS_SIGNATURE):
        return "xlrd"

    if not peek.startswith(_ZIP_SIGNATURE):
        return None

    try:
        # The context manager releases the archive object; it does not
        # close a file object that was passed in by the caller.
        with ZipFile(stream) as zf:
            # Workaround for some third party files that use backslashes
            # or non-standard casing in member names: normalise both
            # before comparing against the expected component names.
            component_names = {
                name.replace("\\", "/").lower() for name in zf.namelist()
            }
    except BadZipFile:
        # Starts with the zip signature but is not a readable archive,
        # so nothing can be confidently inferred from the content.
        return None
    finally:
        # Always leave the stream at the start for whoever reads it next.
        stream.seek(0)

    # Precedence when an archive contains more than one marker:
    # odf, then pyxlsb, then openpyxl.
    if "content.xml" in component_names:
        return "odf"
    if "xl/workbook.bin" in component_names:
        return "pyxlsb"
    if "xl/workbook.xml" in component_names:
        return "openpyxl"

    return None
917960

918961

919962
class ExcelFile:
@@ -970,21 +1013,39 @@ class ExcelFile:
9701013
"pyxlsb": PyxlsbReader,
9711014
}
9721015

1016+
_ext_to_engine: Mapping[str, str] = {
1017+
".ods": "odf",
1018+
".xls": "xlrd",
1019+
".xlsx": "openpyxl",
1020+
}
1021+
9731022
def __init__(
9741023
self, path_or_buffer, engine=None, storage_options: StorageOptions = None
9751024
):
9761025
if engine is None:
977-
# Determine ext and use odf for ods stream/file
1026+
1027+
ext = peek = None
1028+
1029+
if isinstance(path_or_buffer, bytes):
1030+
path_or_buffer = BytesIO(path_or_buffer)
1031+
9781032
if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)):
979-
ext = None
980-
if _is_ods_stream(path_or_buffer):
981-
engine = "odf"
982-
else:
1033+
engine = _engine_from_content(path_or_buffer)
1034+
peek = _peek(path_or_buffer)
1035+
1036+
elif isinstance(path_or_buffer, (str, os.PathLike)):
9831037
ext = os.path.splitext(str(path_or_buffer))[-1]
984-
if ext == ".ods":
985-
engine = "odf"
1038+
handles = get_handle(
1039+
stringify_path(path_or_buffer),
1040+
"rb",
1041+
storage_options=storage_options,
1042+
is_text=False,
1043+
)
1044+
with handles:
1045+
engine = _engine_from_content(handles.handle)
1046+
peek = _peek(handles.handle)
9861047

987-
if (
1048+
elif (
9881049
import_optional_dependency(
9891050
"xlrd", raise_on_missing=False, on_version="ignore"
9901051
)
@@ -995,38 +1056,16 @@ def __init__(
9951056
if isinstance(path_or_buffer, Book):
9961057
engine = "xlrd"
9971058

998-
# GH 35029 - Prefer openpyxl except for xls files
1059+
# Couldn't tell for definite, so guess based on extension:
9991060
if engine is None:
1000-
if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls":
1001-
engine = "xlrd"
1002-
elif (
1003-
import_optional_dependency(
1004-
"openpyxl", raise_on_missing=False, on_version="ignore"
1005-
)
1006-
is not None
1007-
):
1008-
engine = "openpyxl"
1009-
else:
1010-
caller = inspect.stack()[1]
1011-
if (
1012-
caller.filename.endswith("pandas/io/excel/_base.py")
1013-
and caller.function == "read_excel"
1014-
):
1015-
stacklevel = 4
1016-
else:
1017-
stacklevel = 2
1018-
warnings.warn(
1019-
"The xlrd engine is no longer maintained and is not "
1020-
"supported when using pandas with python >= 3.9. However, "
1021-
"the engine xlrd will continue to be allowed for the "
1022-
"indefinite future. Beginning with pandas 1.2.0, the "
1023-
"openpyxl engine will be used if it is installed and the "
1024-
"engine argument is not specified. Either install openpyxl "
1025-
"or specify engine='xlrd' to silence this warning.",
1026-
FutureWarning,
1027-
stacklevel=stacklevel,
1028-
)
1029-
engine = "xlrd"
1061+
engine = self._ext_to_engine.get(ext)
1062+
1063+
if engine is None:
1064+
raise ValueError(
1065+
f"Could not find engine for {path_or_buffer!r}, content was "
1066+
f"{peek!r}"
1067+
)
1068+
10301069
if engine not in self._engines:
10311070
raise ValueError(f"Unknown engine: {engine}")
10321071

pandas/tests/io/excel/test_readers.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -508,10 +508,10 @@ def test_reader_spaces(self, read_ext):
508508
def test_read_excel_ods_nested_xml(self, read_ext, basename, expected):
509509
# see gh-35802
510510
engine = pd.read_excel.keywords["engine"]
511-
if engine != "odf":
511+
if engine not in ("odf", None):
512512
pytest.skip(f"Skipped for engine: {engine}")
513513

514-
actual = pd.read_excel(basename + read_ext)
514+
actual = pd.read_excel(basename + ".ods")
515515
tm.assert_frame_equal(actual, expected)
516516

517517
def test_reading_all_sheets(self, read_ext):
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Tests that don't need or don't work with the autouse fixtures in test_readers.py
2+
import pytest
3+
4+
import pandas as pd
5+
6+
7+
def test_unreadable_bytes():
    # Raw bytes matching no known spreadsheet signature must be rejected
    # with an error that echoes the unrecognised content.
    expected_msg = r"Could not find engine for .+, content was b'rubbish'"
    with pytest.raises(ValueError, match=expected_msg):
        pd.read_excel(b"rubbish")
12+
13+
14+
def test_unreadable_file(tmp_path):
    # A file with no extension and unrecognisable content gives neither
    # the content sniffing nor the extension fallback anything to go on.
    junk_file = tmp_path / "bad"
    junk_file.write_bytes(b"rubbish")

    expected_msg = r"Could not find engine for .+, content was b'rubbish'"
    with pytest.raises(ValueError, match=expected_msg):
        pd.read_excel(junk_file)

0 commit comments

Comments
 (0)