Stop using Python's eval() for -m and -k

bluetech · bluetech · commit a718ad636304 · 2020-05-01T12:59:06.000+03:00
Previously, the expressions given to the `-m` and `-k` options were
evaluated with `eval`. This causes a few issues:

- Python keywords cannot be used.

- Constants like numbers, None, True, False are not handled correctly.

- Various syntax like numeric operators and `X if Y else Z` is supported
  unintentionally.

- `eval()` is somewhat dangerous for arbitrary input.

- Can fail in many ways so requires `except Exception`.

The format we want to support is quite simple, so change to a custom
parser. This fixes the issues above, and gives us full control of the
format, so can be documented comprehensively and even be extended in the
future if we wish.
diff --git a/changelog/7122.breaking.rst b/changelog/7122.breaking.rst
@@ -0,0 +1,3 @@
+Expressions given to the ``-m`` and ``-k`` options are no longer evaluated using Python's ``eval()``.
+The format supports ``or``, ``and``, ``not``, parenthesis and general identifiers to match against.
+Python constants, keywords or other operators are no longer evaluated differently.
diff --git a/doc/en/example/markers.rst b/doc/en/example/markers.rst
@@ -141,14 +141,14 @@ Or select multiple nodes:
 Using ``-k expr`` to select tests based on their name
 -------------------------------------------------------
 
-.. versionadded: 2.0/2.3.4
+.. versionadded:: 2.0/2.3.4
 
 You can use the ``-k`` command line option to specify an expression
 which implements a substring match on the test names instead of the
 exact match on markers that ``-m`` provides.  This makes it easy to
 select tests based on their names:
 
-.. versionadded: 5.4
+.. versionchanged:: 5.4
 
 The expression matching is now case-insensitive.
 
@@ -198,20 +198,8 @@ Or to select "http" and "quick" tests:
 
     ===================== 2 passed, 2 deselected in 0.12s ======================
 
-.. note::
-
-    If you are using expressions such as ``"X and Y"`` then both ``X`` and ``Y``
-    need to be simple non-keyword names. For example, ``"pass"`` or ``"from"``
-    will result in SyntaxErrors because ``"-k"`` evaluates the expression using
-    Python's `eval`_ function.
-
-.. _`eval`: https://docs.python.org/3.6/library/functions.html#eval
-
+You can use ``and``, ``or``, ``not`` and parentheses.
 
-    However, if the ``"-k"`` argument is a simple string, no such restrictions
-    apply. Also ``"-k 'not STRING'"`` has no restrictions.  You can also
-    specify numbers like ``"-k 1.3"`` to match tests which are parametrized
-    with the float ``"1.3"``.
 
 Registering markers
 -------------------------------------
diff --git a/src/_pytest/mark/expression.py b/src/_pytest/mark/expression.py
@@ -0,0 +1,173 @@
+r"""
+Evaluate match expressions, as used by `-k` and `-m`.
+
+The grammar is:
+
+expression: expr? EOF
+expr:       and_expr ('or' and_expr)*
+and_expr:   not_expr ('and' not_expr)*
+not_expr:   'not' not_expr | '(' expr ')' | ident
+ident:      (\w|:|\+|-|\.|\[|\])+
+
+The semantics are:
+
+- Empty expression evaluates to False.
+- ident evaluates to True of False according to a provided matcher function.
+- or/and/not evaluate according to the usual boolean semantics.
+"""
+import enum
+import re
+from typing import Callable
+from typing import Iterator
+from typing import Optional
+from typing import Sequence
+
+import attr
+
+from _pytest.compat import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from typing import NoReturn
+
+
+__all__ = [
+    "evaluate",
+    "ParseError",
+]
+
+
+class TokenType(enum.Enum):
+    LPAREN = "left parenthesis"
+    RPAREN = "right parenthesis"
+    OR = "or"
+    AND = "and"
+    NOT = "not"
+    IDENT = "identifier"
+    EOF = "end of input"
+
+
+@attr.s(frozen=True, slots=True)
+class Token:
+    type = attr.ib(type=TokenType)
+    value = attr.ib(type=str)
+    pos = attr.ib(type=int)
+
+
+class ParseError(Exception):
+    """The expression contains invalid syntax.
+
+    :param column: The column in the line where the error occurred (1-based).
+    :param message: A description of the error.
+    """
+
+    def __init__(self, column: int, message: str) -> None:
+        self.column = column
+        self.message = message
+
+    def __str__(self) -> str:
+        return "at column {}: {}".format(self.column, self.message)
+
+
+class Scanner:
+    __slots__ = ("tokens", "current")
+
+    def __init__(self, input: str) -> None:
+        self.tokens = self.lex(input)
+        self.current = next(self.tokens)
+
+    def lex(self, input: str) -> Iterator[Token]:
+        pos = 0
+        while pos < len(input):
+            if input[pos] in (" ", "\t"):
+                pos += 1
+            elif input[pos] == "(":
+                yield Token(TokenType.LPAREN, "(", pos)
+                pos += 1
+            elif input[pos] == ")":
+                yield Token(TokenType.RPAREN, ")", pos)
+                pos += 1
+            else:
+                match = re.match(r"(:?\w|:|\+|-|\.|\[|\])+", input[pos:])
+                if match:
+                    value = match.group(0)
+                    if value == "or":
+                        yield Token(TokenType.OR, value, pos)
+                    elif value == "and":
+                        yield Token(TokenType.AND, value, pos)
+                    elif value == "not":
+                        yield Token(TokenType.NOT, value, pos)
+                    else:
+                        yield Token(TokenType.IDENT, value, pos)
+                    pos += len(value)
+                else:
+                    raise ParseError(
+                        pos + 1, 'unexpected character "{}"'.format(input[pos]),
+                    )
+        yield Token(TokenType.EOF, "", pos)
+
+    def accept(self, type: TokenType, *, reject: bool = False) -> Optional[Token]:
+        if self.current.type is type:
+            token = self.current
+            if token.type is not TokenType.EOF:
+                self.current = next(self.tokens)
+            return token
+        if reject:
+            self.reject((type,))
+        return None
+
+    def reject(self, expected: Sequence[TokenType]) -> "NoReturn":
+        raise ParseError(
+            self.current.pos + 1,
+            "expected {}; got {}".format(
+                " OR ".join(type.value for type in expected), self.current.type.value,
+            ),
+        )
+
+
+def expression(s: Scanner, matcher: Callable[[str], bool]) -> bool:
+    if s.accept(TokenType.EOF):
+        return False
+    ret = expr(s, matcher)
+    s.accept(TokenType.EOF, reject=True)
+    return ret
+
+
+def expr(s: Scanner, matcher: Callable[[str], bool]) -> bool:
+    ret = and_expr(s, matcher)
+    while s.accept(TokenType.OR):
+        rhs = and_expr(s, matcher)
+        ret = ret or rhs
+    return ret
+
+
+def and_expr(s: Scanner, matcher: Callable[[str], bool]) -> bool:
+    ret = not_expr(s, matcher)
+    while s.accept(TokenType.AND):
+        rhs = not_expr(s, matcher)
+        ret = ret and rhs
+    return ret
+
+
+def not_expr(s: Scanner, matcher: Callable[[str], bool]) -> bool:
+    if s.accept(TokenType.NOT):
+        return not not_expr(s, matcher)
+    if s.accept(TokenType.LPAREN):
+        ret = expr(s, matcher)
+        s.accept(TokenType.RPAREN, reject=True)
+        return ret
+    ident = s.accept(TokenType.IDENT)
+    if ident:
+        return matcher(ident.value)
+    s.reject((TokenType.NOT, TokenType.LPAREN, TokenType.IDENT))
+
+
+def evaluate(input: str, matcher: Callable[[str], bool]) -> bool:
+    """Evaluate a match expression as used by -k and -m.
+
+    :param input: The input expression - one line.
+    :param matcher: Given an identifier, should return whether it matches or not.
+                    Should be prepared to handle arbitrary strings as input.
+
+    Returns whether the entire expression matches or not.
+    """
+    return expression(Scanner(input), matcher)
diff --git a/src/_pytest/mark/legacy.py b/src/_pytest/mark/legacy.py
@@ -2,44 +2,46 @@
 this is a place where we put datastructures used by legacy apis
 we hope to remove
 """
-import keyword
 from typing import Set
 
 import attr
 
 from _pytest.compat import TYPE_CHECKING
 from _pytest.config import UsageError
+from _pytest.mark.expression import evaluate
+from _pytest.mark.expression import ParseError
 
 if TYPE_CHECKING:
     from _pytest.nodes import Item  # noqa: F401 (used in type string)
 
 
 @attr.s
-class MarkMapping:
-    """Provides a local mapping for markers where item access
-    resolves to True if the marker is present. """
+class MarkMatcher:
+    """A matcher for markers which are present."""
 
     own_mark_names = attr.ib()
 
     @classmethod
-    def from_item(cls, item):
+    def from_item(cls, item) -> "MarkMatcher":
         mark_names = {mark.name for mark in item.iter_markers()}
         return cls(mark_names)
 
-    def __getitem__(self, name):
+    def __call__(self, name: str) -> bool:
         return name in self.own_mark_names
 
 
 @attr.s
-class KeywordMapping:
-    """Provides a local mapping for keywords.
-    Given a list of names, map any substring of one of these names to True.
+class KeywordMatcher:
+    """A matcher for keywords.
+
+    Given a list of names, matches any substring of one of these names. The
+    string inclusion check is case-insensitive.
     """
 
     _names = attr.ib(type=Set[str])
 
     @classmethod
-    def from_item(cls, item: "Item") -> "KeywordMapping":
+    def from_item(cls, item: "Item") -> "KeywordMatcher":
         mapped_names = set()
 
         # Add the names of the current item and any parent items
@@ -62,12 +64,7 @@ def from_item(cls, item: "Item") -> "KeywordMapping":
 
         return cls(mapped_names)
 
-    def __getitem__(self, subname: str) -> bool:
-        """Return whether subname is included within stored names.
-
-        The string inclusion check is case-insensitive.
-
-        """
+    def __call__(self, subname: str) -> bool:
         subname = subname.lower()
         names = (name.lower() for name in self._names)
 
@@ -77,18 +74,17 @@ def __getitem__(self, subname: str) -> bool:
         return False
 
 
-python_keywords_allowed_list = ["or", "and", "not"]
-
-
-def matchmark(colitem, markexpr):
+def matchmark(colitem, markexpr: str) -> bool:
     """Tries to match on any marker names, attached to the given colitem."""
     try:
-        return eval(markexpr, {}, MarkMapping.from_item(colitem))
-    except Exception:
-        raise UsageError("Wrong expression passed to '-m': {}".format(markexpr))
+        return evaluate(markexpr, MarkMatcher.from_item(colitem))
+    except ParseError as e:
+        raise UsageError(
+            "Wrong expression passed to '-m': {}: {}".format(markexpr, e)
+        ) from None
 
 
-def matchkeyword(colitem, keywordexpr):
+def matchkeyword(colitem, keywordexpr: str) -> bool:
     """Tries to match given keyword expression to given collector item.
 
     Will match on the name of colitem, including the names of its parents.
@@ -97,20 +93,9 @@ def matchkeyword(colitem, keywordexpr):
     Additionally, matches on names in the 'extra_keyword_matches' set of
     any item, as well as names directly assigned to test functions.
     """
-    mapping = KeywordMapping.from_item(colitem)
-    if " " not in keywordexpr:
-        # special case to allow for simple "-k pass" and "-k 1.3"
-        return mapping[keywordexpr]
-    elif keywordexpr.startswith("not ") and " " not in keywordexpr[4:]:
-        return not mapping[keywordexpr[4:]]
-    for kwd in keywordexpr.split():
-        if keyword.iskeyword(kwd) and kwd not in python_keywords_allowed_list:
-            raise UsageError(
-                "Python keyword '{}' not accepted in expressions passed to '-k'".format(
-                    kwd
-                )
-            )
     try:
-        return eval(keywordexpr, {}, mapping)
-    except Exception:
-        raise UsageError("Wrong expression passed to '-k': {}".format(keywordexpr))
+        return evaluate(keywordexpr, KeywordMatcher.from_item(colitem))
+    except ParseError as e:
+        raise UsageError(
+            "Wrong expression passed to '-k': {}: {}".format(keywordexpr, e)
+        ) from None
diff --git a/testing/test_mark.py b/testing/test_mark.py
diff --git a/testing/test_mark_expression.py b/testing/test_mark_expression.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	+Expressions given to the ``-m`` and ``-k`` options are no longer evaluated using Python's ``eval()``.
	`2`	+The format supports ``or``, ``and``, ``not``, parenthesis and general identifiers to match against.
	`3`	`+Python constants, keywords or other operators are no longer evaluated differently.`