diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b8524a302f4c9..9aa85bea10da8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -213,3 +213,10 @@ repos: |\#\ type:\s?ignore(?!\[) language: pygrep types: [python] + - id: use-pd_array-in-core + name: Import pandas.array as pd_array in core + language: python + entry: python scripts/use_pd_array_in_core.py + files: ^pandas/core/ + exclude: ^pandas/core/api\.py$ + types: [python] diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 1eda06dbbb1c4..14c77ec2fdf8f 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3023,7 +3023,7 @@ def _str_extract_noexpand(arr, pat, flags=0): """ from pandas import ( DataFrame, - array, + array as pd_array, ) regex = re.compile(pat, flags=flags) @@ -3034,7 +3034,7 @@ def _str_extract_noexpand(arr, pat, flags=0): result = np.array([groups_or_na(val)[0] for val in arr], dtype=object) name = _get_single_group_name(regex) # not dispatching, so we have to reconstruct here. 
# -----------------------------------------------------------------------------
# Patch context that shared this flattened line (indentation inferred).
#
# Tail of the second pandas/core/strings/accessor.py hunk:
#
#   -        result = array(result, dtype=result_dtype)
#   +        result = pd_array(result, dtype=result_dtype)
#        else:
#            if isinstance(arr, ABCIndex):
#                raise ValueError("only one regex group is supported with Index")
#
# The module below is the new file scripts/tests/test_use_pd_array_in_core.py
# (new file mode 100644, index 0000000000000..9c66199a82846, @@ -0,0 +1,26 @@).
# -----------------------------------------------------------------------------
import pytest

from scripts.use_pd_array_in_core import use_pd_array

BAD_FILE_0 = "import pandas as pd\npd.array"
BAD_FILE_1 = "\nfrom pandas import array"
GOOD_FILE_0 = "from pandas import array as pd_array"
GOOD_FILE_1 = "from pandas.core.construction import array as pd_array"
PATH = "t.py"


@pytest.mark.parametrize("content", [BAD_FILE_0, BAD_FILE_1])
def test_inconsistent_usage(content, capsys):
    """An offending snippet aborts the check and is reported at line 2, col 0."""
    with pytest.raises(SystemExit):
        use_pd_array(content, PATH)

    # Both bad snippets place the offending node at the start of line 2, so a
    # single expected report covers every parametrized case.
    captured = capsys.readouterr()
    wanted = (
        "t.py:2:0: Don't use pd.array in core, import array as pd_array instead\n"
    )
    assert wanted == captured.out


@pytest.mark.parametrize("content", [GOOD_FILE_0, GOOD_FILE_1])
def test_consistent_usage(content):
    """Imports already aliased as ``pd_array`` pass without raising."""
    use_pd_array(content, PATH)


# -----------------------------------------------------------------------------
# Patch context that followed on the same flattened line: header and first
# docstring lines of the new file scripts/use_pd_array_in_core.py
# (new file mode 100644, index 0000000000000..531084683bdb1, @@ -0,0 +1,77 @@):
#
#   +"""
#   +Check that pandas/core imports pandas.array as pd_array.
#   +
#   +This makes it easier to grep for usage of pandas array.
# -----------------------------------------------------------------------------
"""
Check that pandas/core imports pandas.array as pd_array.

This makes it easier to grep for usage of pandas array.

This is meant to be run as a pre-commit hook - to run it manually, you can do:

    pre-commit run use-pd_array-in-core --all-files

"""

import argparse
import ast
import sys
from typing import (
    Optional,
    Sequence,
    Union,
)

ERROR_MESSAGE = (
    "{path}:{lineno}:{col_offset}: "
    "Don't use pd.array in core, import array as pd_array instead\n"
)


class Visitor(ast.NodeVisitor):
    """
    AST visitor that rejects un-aliased uses of pandas' ``array``.

    Two spellings are flagged: ``from pandas[.<submodule>] import array``
    without the ``as pd_array`` alias, and attribute access written as
    ``pd.array``. The first offending node encountered is reported on stdout
    using ``ERROR_MESSAGE`` and the process exits with status 1.

    Parameters
    ----------
    path : str
        Path inserted into the report; the visitor never opens it.
    """

    def __init__(self, path: str) -> None:
        self.path = path

    def _report_and_exit(self, node: Union[ast.ImportFrom, ast.Attribute]) -> None:
        """Write the report for ``node`` to stdout and exit with status 1."""
        msg = ERROR_MESSAGE.format(
            path=self.path, lineno=node.lineno, col_offset=node.col_offset
        )
        sys.stdout.write(msg)
        sys.exit(1)

    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
        """Flag ``from pandas... import array`` unless aliased as ``pd_array``."""
        module = node.module
        # Accept only the ``pandas`` package itself or a dotted submodule of
        # it. A bare ``startswith("pandas")`` would also match unrelated
        # packages whose names merely begin with "pandas" (e.g. ``pandasql``).
        from_pandas = module is not None and (
            module == "pandas" or module.startswith("pandas.")
        )
        if from_pandas and any(
            alias.name == "array" and alias.asname != "pd_array"
            for alias in node.names
        ):
            self._report_and_exit(node)
        super().generic_visit(node)

    def visit_Attribute(self, node: ast.Attribute) -> None:
        """Flag attribute access spelled exactly ``pd.array``."""
        if (
            isinstance(node.value, ast.Name)
            and node.value.id == "pd"
            and node.attr == "array"
        ):
            self._report_and_exit(node)
        super().generic_visit(node)


def use_pd_array(content: str, path: str) -> None:
    """
    Check one file's source for un-aliased use of pandas' ``array``.

    Parameters
    ----------
    content : str
        Python source code to check.
    path : str
        Path reported in messages; it is not read from disk here.

    Raises
    ------
    SystemExit
        With status 1, after writing a report to stdout, at the first
        violation found.
    SyntaxError
        If ``content`` is not valid Python.
    """
    # Passing ``filename`` makes a SyntaxError name ``path`` instead of the
    # default "<unknown>".
    tree = ast.parse(content, filename=path)
    Visitor(path).visit(tree)


def main(argv: Optional[Sequence[str]] = None) -> None:
    """
    Command-line entry point: check every path given as an argument.

    Parameters
    ----------
    argv : sequence of str, optional
        Arguments to parse; ``None`` lets argparse fall back to ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("paths", nargs="*")
    args = parser.parse_args(argv)

    for path in args.paths:
        with open(path, encoding="utf-8") as fd:
            content = fd.read()
        use_pd_array(content, path)


if __name__ == "__main__":
    main()


# -----------------------------------------------------------------------------
# Patch context that followed on the same flattened line (setup.cfg hunk,
# indentation inferred):
#
#   diff --git a/setup.cfg b/setup.cfg
#   index 7165fc2275dc0..2aaafa0391531 100644
#   --- a/setup.cfg
#   +++ b/setup.cfg
#   @@ -140,6 +140,7 @@ omit =
#        pandas/_typing.py
#        pandas/_version.py
#    plugins = Cython.Coverage
#   +source = pandas
#
#    [coverage:report]
#    ignore_errors = False
# -----------------------------------------------------------------------------