Stop using Python's eval() for -m and -k

bluetech · bluetech · commit 49b2fab40a56 · 2020-04-25T16:13:57.000+03:00
Previously, the expressions given to the `-m` and `-k` options were evaluated with `eval`. This causes a few issues: - Python keywords cannot be used. - Constants like numbers, None, True, False are not handled correctly. - Various syntax like numeric operators and `X if Y else Z` is supported unintentionally. - `eval()` is somewhat dangerous for arbitrary input. - Can fail in many ways so requires `except Exception`. The format we want to support is quite simple, so change to a custom parser. This fixes the issues above, and gives us full control of the format, so can be documented comprehensively and even be extended in the future if we wish. Fixes #1141. Fixes #3573. Fixes #5881. Fixes #6822. Fixes #7112.
diff --git a/changelog/TBD.breaking.rst b/changelog/TBD.breaking.rst
@@ -0,0 +1,4 @@
+Expressions given to the ``-m`` and ``-k`` options are not longer evaluated using Python's ``eval()``.
+
+The format supports ``or``, ``and``, ``not``, parenthesis and general identifiers to match against.
+Python constants, keywords or operators are no longer evaluated differently.
diff --git a/doc/en/example/markers.rst b/doc/en/example/markers.rst
@@ -141,14 +141,14 @@ Or select multiple nodes:
 Using ``-k expr`` to select tests based on their name
 -------------------------------------------------------
 
-.. versionadded: 2.0/2.3.4
+.. versionadded:: 2.0/2.3.4
 
 You can use the ``-k`` command line option to specify an expression
 which implements a substring match on the test names instead of the
 exact match on markers that ``-m`` provides.  This makes it easy to
 select tests based on their names:
 
-.. versionadded: 5.4
+.. versionchanged:: 5.4
 
 The expression matching is now case-insensitive.
 
@@ -198,20 +198,8 @@ Or to select "http" and "quick" tests:
 
     ===================== 2 passed, 2 deselected in 0.12s ======================
 
-.. note::
-
-    If you are using expressions such as ``"X and Y"`` then both ``X`` and ``Y``
-    need to be simple non-keyword names. For example, ``"pass"`` or ``"from"``
-    will result in SyntaxErrors because ``"-k"`` evaluates the expression using
-    Python's `eval`_ function.
-
-.. _`eval`: https://docs.python.org/3.6/library/functions.html#eval
-
+You can use ``and``, ``or``, ``not`` and parentheses.
 
-    However, if the ``"-k"`` argument is a simple string, no such restrictions
-    apply. Also ``"-k 'not STRING'"`` has no restrictions.  You can also
-    specify numbers like ``"-k 1.3"`` to match tests which are parametrized
-    with the float ``"1.3"``.
 
 Registering markers
 -------------------------------------
diff --git a/src/_pytest/mark/legacy.py b/src/_pytest/mark/legacy.py
@@ -2,7 +2,12 @@
 this is a place where we put datastructures used by legacy apis
 we hope to remove
 """
-import keyword
+import enum
+import re
+from typing import Callable
+from typing import Iterator
+from typing import Optional
+from typing import Sequence
 from typing import Set
 
 import attr
@@ -11,35 +16,174 @@
 from _pytest.config import UsageError
 
 if TYPE_CHECKING:
+    from typing import NoReturn
+
     from _pytest.nodes import Item  # noqa: F401 (used in type string)
 
 
+# The grammar for match expressions is:
+#
+# expr:     and_expr ('or' and_expr)*
+# and_expr: not_expr ('and' not_expr)*
+# not_expr: 'not' not_expr | '(' expr ')' | ident
+# ident:    (\w|\+|-|\.|\[|\])+
+
+
+class TokenType(enum.Enum):
+    LPAREN = "left parenthesis"
+    RPAREN = "right parenthesis"
+    OR = "or"
+    AND = "and"
+    NOT = "not"
+    IDENT = "identifier"
+    EOF = "end of input"
+
+
+@attr.s(frozen=True, slots=True)
+class Token:
+    type = attr.ib(type=TokenType)
+    value = attr.ib(type=str)
+    column = attr.ib(type=int)
+
+
+class ParseError(Exception):
+    def __init__(self, column: int, message: str) -> None:
+        self.column = column
+        self.message = message
+
+    def __str__(self) -> str:
+        return "at column {}: {}".format(self.column, self.message)
+
+
+class Scanner:
+    __slots__ = ("tokens", "current")
+
+    def __init__(self, input: str) -> None:
+        self.tokens = self.lex(input)
+        self.current = next(self.tokens)
+
+    def lex(self, input: str) -> Iterator[Token]:
+        pos = 0
+        while pos < len(input):
+            if input[pos] in (" ", "\t"):
+                pos += 1
+            elif input[pos] == "(":
+                yield Token(TokenType.LPAREN, "(", pos + 1)
+                pos += 1
+            elif input[pos] == ")":
+                yield Token(TokenType.RPAREN, ")", pos + 1)
+                pos += 1
+            else:
+                match = re.match(r"(:?\w|\+|-|\.|\[|\])+", input[pos:])
+                if match:
+                    value = match.group(0)
+                    if value == "and":
+                        yield Token(TokenType.AND, value, pos + 1)
+                    elif value == "or":
+                        yield Token(TokenType.OR, value, pos + 1)
+                    elif value == "not":
+                        yield Token(TokenType.NOT, value, pos + 1)
+                    else:
+                        yield Token(TokenType.IDENT, value, pos + 1)
+                    pos += len(value)
+                else:
+                    raise ParseError(
+                        pos + 1, 'unexpected character "{}"'.format(input[pos]),
+                    )
+        yield Token(TokenType.EOF, "", pos + 1)
+
+    def accept(self, type: TokenType, *, reject: bool = False) -> Optional[Token]:
+        if self.current.type is type:
+            token = self.current
+            if token.type is not TokenType.EOF:
+                self.current = next(self.tokens)
+            return token
+        if reject:
+            self.reject((type,))
+        return None
+
+    def reject(self, expected: Sequence[TokenType]) -> "NoReturn":
+        raise ParseError(
+            self.current.column,
+            "expected {}; got {}".format(
+                " OR ".join(type.value for type in expected), self.current.type.value,
+            ),
+        )
+
+
+def expr(s: Scanner, matcher: Callable[[str], bool]) -> bool:
+    ret = and_expr(s, matcher)
+    while s.accept(TokenType.OR):
+        rhs = and_expr(s, matcher)
+        ret = ret or rhs
+    return ret
+
+
+def and_expr(s: Scanner, matcher: Callable[[str], bool]) -> bool:
+    ret = not_expr(s, matcher)
+    while s.accept(TokenType.AND):
+        rhs = not_expr(s, matcher)
+        ret = ret and rhs
+    return ret
+
+
+def not_expr(s: Scanner, matcher: Callable[[str], bool]) -> bool:
+    if s.accept(TokenType.NOT):
+        return not not_expr(s, matcher)
+    if s.accept(TokenType.LPAREN):
+        ret = expr(s, matcher)
+        s.accept(TokenType.RPAREN, reject=True)
+        return ret
+    ident = s.accept(TokenType.IDENT)
+    if ident:
+        return matcher(ident.value)
+    s.reject((TokenType.NOT, TokenType.LPAREN, TokenType.IDENT))
+
+
+def evaluate(input: str, matcher: Callable[[str], bool]) -> bool:
+    """Evaluate a match expression as used by -k and -m.
+
+    :param input: The input expression - one line.
+    :param matcher: Given an identifier, should return whether it matches or not.
+
+    Returns whether the entire expression matches or not.
+    """
+    s = Scanner(input)
+    ret = expr(s, matcher)
+    s.accept(TokenType.EOF, reject=True)
+    return ret
+
+
+# The actual matchers:
+
+
 @attr.s
-class MarkMapping:
-    """Provides a local mapping for markers where item access
-    resolves to True if the marker is present. """
+class MarkMatcher:
+    """A matcher for markers which are present."""
 
     own_mark_names = attr.ib()
 
     @classmethod
-    def from_item(cls, item):
+    def from_item(cls, item) -> "MarkMatcher":
         mark_names = {mark.name for mark in item.iter_markers()}
         return cls(mark_names)
 
-    def __getitem__(self, name):
+    def __call__(self, name: str) -> bool:
         return name in self.own_mark_names
 
 
 @attr.s
-class KeywordMapping:
-    """Provides a local mapping for keywords.
-    Given a list of names, map any substring of one of these names to True.
+class KeywordMatcher:
+    """A matcher for keywords.
+
+    Given a list of names, matches any substring of one of these names. The
+    string inclusion check is case-insensitive.
     """
 
     _names = attr.ib(type=Set[str])
 
     @classmethod
-    def from_item(cls, item: "Item") -> "KeywordMapping":
+    def from_item(cls, item: "Item") -> "KeywordMatcher":
         mapped_names = set()
 
         # Add the names of the current item and any parent items
@@ -62,12 +206,7 @@ def from_item(cls, item: "Item") -> "KeywordMapping":
 
         return cls(mapped_names)
 
-    def __getitem__(self, subname: str) -> bool:
-        """Return whether subname is included within stored names.
-
-        The string inclusion check is case-insensitive.
-
-        """
+    def __call__(self, subname: str) -> bool:
         subname = subname.lower()
         names = (name.lower() for name in self._names)
 
@@ -77,18 +216,17 @@ def __getitem__(self, subname: str) -> bool:
         return False
 
 
-python_keywords_allowed_list = ["or", "and", "not"]
-
-
-def matchmark(colitem, markexpr):
+def matchmark(colitem, markexpr: str) -> bool:
     """Tries to match on any marker names, attached to the given colitem."""
     try:
-        return eval(markexpr, {}, MarkMapping.from_item(colitem))
-    except Exception:
-        raise UsageError("Wrong expression passed to '-m': {}".format(markexpr))
+        return evaluate(markexpr, MarkMatcher.from_item(colitem))
+    except ParseError as e:
+        raise UsageError(
+            "Wrong expression passed to '-m': {}: {}".format(markexpr, e)
+        ) from None
 
 
-def matchkeyword(colitem, keywordexpr):
+def matchkeyword(colitem, keywordexpr: str) -> bool:
     """Tries to match given keyword expression to given collector item.
 
     Will match on the name of colitem, including the names of its parents.
@@ -97,20 +235,9 @@ def matchkeyword(colitem, keywordexpr):
     Additionally, matches on names in the 'extra_keyword_matches' set of
     any item, as well as names directly assigned to test functions.
     """
-    mapping = KeywordMapping.from_item(colitem)
-    if " " not in keywordexpr:
-        # special case to allow for simple "-k pass" and "-k 1.3"
-        return mapping[keywordexpr]
-    elif keywordexpr.startswith("not ") and " " not in keywordexpr[4:]:
-        return not mapping[keywordexpr[4:]]
-    for kwd in keywordexpr.split():
-        if keyword.iskeyword(kwd) and kwd not in python_keywords_allowed_list:
-            raise UsageError(
-                "Python keyword '{}' not accepted in expressions passed to '-k'".format(
-                    kwd
-                )
-            )
     try:
-        return eval(keywordexpr, {}, mapping)
-    except Exception:
-        raise UsageError("Wrong expression passed to '-k': {}".format(keywordexpr))
+        return evaluate(keywordexpr, KeywordMatcher.from_item(colitem))
+    except ParseError as e:
+        raise UsageError(
+            "Wrong expression passed to '-k': {}: {}".format(keywordexpr, e)
+        ) from None
diff --git a/testing/test_mark.py b/testing/test_mark.py