Skip to content

Commit 65c8920

Browse files
authored
Handle invalid escapes in string literals (#34)
* Fix invalid escapes in string literals * Handle escape sequences in non-name string literals
1 parent fd696ae commit 65c8920

File tree

8 files changed

+81
-35
lines changed

8 files changed

+81
-35
lines changed

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,16 @@
11
# Python JSONPath Change Log
22

3+
## Version 0.10.0 (unreleased)
4+
5+
**Breaking Changes**
6+
7+
- The JSONPath lexer now yields distinct tokens for single and double quoted string literals. This is so the parser can do a better job of detecting invalid escape sequences.
8+
- Changed the canonical representation of a JSONPath string literal to use double quotes instead of single quotes.
9+
10+
**Fixes**
11+
12+
- We no longer silently ignore invalid escape sequences in JSONPath string literals. For example, `$['\"']` used to be OK; it now raises a `JSONPathSyntaxError`.
13+
314
## Version 0.9.0
415

516
**Breaking Changes**

docs/syntax.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,6 @@ And this is a list of areas where we deviate from the [IETF JSONPath draft](http
193193
- The root token (default `$`) is optional.
194194
- Paths starting with a dot (`.`) are OK. `.thing` is the same as `$.thing`, as is `thing`, `$[thing]` and `$["thing"]`.
195195
- The built-in `match()` and `search()` filter functions use Python's standard library `re` module, which, at least, doesn't support Unicode properties. We might add an implementation of `match()` and `search()` using the third party [regex](https://pypi.org/project/regex/) package in the future.
196-
- We silently ignore unnecessary escaping when parsing some quoted selectors. The standard treats this as an "invalid selector".
197196

198197
And this is a list of features that are uncommon or unique to Python JSONPath.
199198

jsonpath/filter.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,9 @@ class StringLiteral(Literal[str]):
188188

189189
__slots__ = ()
190190

191+
def __str__(self) -> str:
192+
return json.dumps(self.value)
193+
191194

192195
class IntegerLiteral(Literal[int]):
193196
"""An integer literal."""

jsonpath/lex.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@
6060
from .token import TOKEN_SLICE_START
6161
from .token import TOKEN_SLICE_STEP
6262
from .token import TOKEN_SLICE_STOP
63-
from .token import TOKEN_STRING
6463
from .token import TOKEN_TRUE
6564
from .token import TOKEN_UNDEFINED
6665
from .token import TOKEN_UNION
@@ -256,13 +255,13 @@ def tokenize(self, path: str) -> Iterator[Token]: # noqa PLR0912
256255
)
257256
elif kind == TOKEN_DOUBLE_QUOTE_STRING:
258257
yield _token(
259-
kind=TOKEN_STRING,
258+
kind=TOKEN_DOUBLE_QUOTE_STRING,
260259
value=match.group("G_DQUOTE"),
261260
index=match.start("G_DQUOTE"),
262261
)
263262
elif kind == TOKEN_SINGLE_QUOTE_STRING:
264263
yield _token(
265-
kind=TOKEN_STRING,
264+
kind=TOKEN_SINGLE_QUOTE_STRING,
266265
value=match.group("G_SQUOTE"),
267266
index=match.start("G_SQUOTE"),
268267
)

jsonpath/parse.py

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""The default JSONPath parser."""
22
from __future__ import annotations
33

4-
import codecs
4+
import json
55
import re
66
from typing import TYPE_CHECKING
77
from typing import Callable
@@ -45,6 +45,7 @@
4545
from .token import TOKEN_COMMA
4646
from .token import TOKEN_CONTAINS
4747
from .token import TOKEN_DDOT
48+
from .token import TOKEN_DOUBLE_QUOTE_STRING
4849
from .token import TOKEN_EOF
4950
from .token import TOKEN_EQ
5051
from .token import TOKEN_FALSE
@@ -81,14 +82,15 @@
8182
from .token import TOKEN_ROOT
8283
from .token import TOKEN_RPAREN
8384
from .token import TOKEN_SELF
85+
from .token import TOKEN_SINGLE_QUOTE_STRING
8486
from .token import TOKEN_SLICE_START
8587
from .token import TOKEN_SLICE_STEP
8688
from .token import TOKEN_SLICE_STOP
87-
from .token import TOKEN_STRING
8889
from .token import TOKEN_TRUE
8990
from .token import TOKEN_UNDEFINED
9091
from .token import TOKEN_UNION
9192
from .token import TOKEN_WILD
93+
from .token import Token
9294

9395
if TYPE_CHECKING:
9496
from .env import JSONPathEnvironment
@@ -212,7 +214,8 @@ def __init__(self, *, env: JSONPathEnvironment) -> None:
212214
TOKEN_ROOT: self.parse_root_path,
213215
TOKEN_SELF: self.parse_self_path,
214216
TOKEN_FILTER_CONTEXT: self.parse_filter_context_path,
215-
TOKEN_STRING: self.parse_string_literal,
217+
TOKEN_DOUBLE_QUOTE_STRING: self.parse_string_literal,
218+
TOKEN_SINGLE_QUOTE_STRING: self.parse_string_literal,
216219
TOKEN_TRUE: self.parse_boolean,
217220
TOKEN_UNDEFINED: self.parse_undefined,
218221
TOKEN_FUNCTION: self.parse_function_extension,
@@ -225,7 +228,8 @@ def __init__(self, *, env: JSONPathEnvironment) -> None:
225228
TOKEN_NIL: self.parse_nil,
226229
TOKEN_NONE: self.parse_nil,
227230
TOKEN_NULL: self.parse_nil,
228-
TOKEN_STRING: self.parse_string_literal,
231+
TOKEN_DOUBLE_QUOTE_STRING: self.parse_string_literal,
232+
TOKEN_SINGLE_QUOTE_STRING: self.parse_string_literal,
229233
TOKEN_TRUE: self.parse_boolean,
230234
}
231235

@@ -239,7 +243,8 @@ def __init__(self, *, env: JSONPathEnvironment) -> None:
239243
TOKEN_NIL: self.parse_nil,
240244
TOKEN_NONE: self.parse_nil,
241245
TOKEN_NULL: self.parse_nil,
242-
TOKEN_STRING: self.parse_string_literal,
246+
TOKEN_SINGLE_QUOTE_STRING: self.parse_string_literal,
247+
TOKEN_DOUBLE_QUOTE_STRING: self.parse_string_literal,
243248
TOKEN_TRUE: self.parse_boolean,
244249
TOKEN_ROOT: self.parse_root_path,
245250
TOKEN_SELF: self.parse_self_path,
@@ -384,29 +389,21 @@ def parse_selector_list(self, stream: TokenStream) -> ListSelector: # noqa: PLR
384389
token=stream.current,
385390
)
386391
)
387-
elif stream.current.kind == TOKEN_STRING:
392+
elif stream.current.kind in (
393+
TOKEN_DOUBLE_QUOTE_STRING,
394+
TOKEN_SINGLE_QUOTE_STRING,
395+
):
388396
if self.RE_INVALID_NAME_SELECTOR.search(stream.current.value):
389397
raise JSONPathSyntaxError(
390398
f"invalid name selector {stream.current.value!r}",
391399
token=stream.current,
392400
)
393401

394-
if self.env.unicode_escape:
395-
name = (
396-
codecs.decode(
397-
stream.current.value.replace("\\/", "/"), "unicode-escape"
398-
)
399-
.encode("utf-16", "surrogatepass")
400-
.decode("utf-16")
401-
)
402-
else:
403-
name = stream.current.value
404-
405402
list_items.append(
406403
PropertySelector(
407404
env=self.env,
408405
token=stream.current,
409-
name=name,
406+
name=self._decode_string_literal(stream.current),
410407
),
411408
)
412409
elif stream.current.kind == TOKEN_SLICE_START:
@@ -454,7 +451,7 @@ def parse_undefined(self, _: TokenStream) -> FilterExpression:
454451
return UNDEFINED_LITERAL
455452

456453
def parse_string_literal(self, stream: TokenStream) -> FilterExpression:
457-
return StringLiteral(value=stream.current.value)
454+
return StringLiteral(value=self._decode_string_literal(stream.current))
458455

459456
def parse_integer_literal(self, stream: TokenStream) -> FilterExpression:
460457
return IntegerLiteral(value=int(stream.current.value))
@@ -611,3 +608,18 @@ def parse_filter_selector(
611608
left = self.parse_infix_expression(stream, left)
612609

613610
return left
611+
612+
def _decode_string_literal(self, token: Token) -> str:
613+
if self.env.unicode_escape:
614+
if token.kind == TOKEN_SINGLE_QUOTE_STRING:
615+
value = token.value.replace('"', '\\"').replace("\\'", "'")
616+
else:
617+
value = token.value
618+
try:
619+
rv = json.loads(f'"{value}"')
620+
assert isinstance(rv, str)
621+
return rv
622+
except json.JSONDecodeError as err:
623+
raise JSONPathSyntaxError(str(err).split(":")[1], token=token) from None
624+
625+
return token.value

tests/test_compliance.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,6 @@ class Case:
3737
"functions, match, filter, match function, unicode char class negated, uppercase": "\\P not supported", # noqa: E501
3838
"functions, search, filter, search function, unicode char class, uppercase": "\\p not supported", # noqa: E501
3939
"functions, search, filter, search function, unicode char class negated, uppercase": "\\P not supported", # noqa: E501
40-
"name selector, double quotes, invalid escaped single quote": "ignore",
41-
"name selector, double quotes, incomplete escape": "ignore",
42-
"name selector, single quotes, invalid escaped double quote": "ignore",
43-
"name selector, single quotes, incomplete escape": "ignore",
4440
"filter, non-singular query in comparison, slice": "TODO",
4541
"filter, non-singular query in comparison, all children": "TODO",
4642
"filter, non-singular query in comparison, descendants": "TODO",

tests/test_lex.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from jsonpath.token import TOKEN_BARE_PROPERTY
1111
from jsonpath.token import TOKEN_COMMA
1212
from jsonpath.token import TOKEN_DDOT
13+
from jsonpath.token import TOKEN_DOUBLE_QUOTE_STRING
1314
from jsonpath.token import TOKEN_EQ
1415
from jsonpath.token import TOKEN_FALSE
1516
from jsonpath.token import TOKEN_FILTER_END
@@ -35,10 +36,10 @@
3536
from jsonpath.token import TOKEN_ROOT
3637
from jsonpath.token import TOKEN_RPAREN
3738
from jsonpath.token import TOKEN_SELF
39+
from jsonpath.token import TOKEN_SINGLE_QUOTE_STRING
3840
from jsonpath.token import TOKEN_SLICE_START
3941
from jsonpath.token import TOKEN_SLICE_STEP
4042
from jsonpath.token import TOKEN_SLICE_STOP
41-
from jsonpath.token import TOKEN_STRING
4243
from jsonpath.token import TOKEN_TRUE
4344
from jsonpath.token import TOKEN_UNION
4445
from jsonpath.token import TOKEN_WILD
@@ -84,7 +85,9 @@ class Case:
8485
want=[
8586
Token(kind=TOKEN_ROOT, value="$", index=0, path='$["some"]'),
8687
Token(kind=TOKEN_LIST_START, value="[", index=1, path='$["some"]'),
87-
Token(kind=TOKEN_STRING, value="some", index=3, path='$["some"]'),
88+
Token(
89+
kind=TOKEN_DOUBLE_QUOTE_STRING, value="some", index=3, path='$["some"]'
90+
),
8891
Token(kind=TOKEN_RBRACKET, value="]", index=8, path='$["some"]'),
8992
],
9093
),
@@ -94,7 +97,9 @@ class Case:
9497
want=[
9598
Token(kind=TOKEN_ROOT, value="$", index=0, path="$['some']"),
9699
Token(kind=TOKEN_LIST_START, value="[", index=1, path="$['some']"),
97-
Token(kind=TOKEN_STRING, value="some", index=3, path="$['some']"),
100+
Token(
101+
kind=TOKEN_SINGLE_QUOTE_STRING, value="some", index=3, path="$['some']"
102+
),
98103
Token(kind=TOKEN_RBRACKET, value="]", index=8, path="$['some']"),
99104
],
100105
),
@@ -754,7 +759,10 @@ class Case:
754759
kind=TOKEN_COMMA, value=",", index=16, path="[?(@.thing in [1, '1'])]"
755760
),
756761
Token(
757-
kind=TOKEN_STRING, value="1", index=19, path="[?(@.thing in [1, '1'])]"
762+
kind=TOKEN_SINGLE_QUOTE_STRING,
763+
value="1",
764+
index=19,
765+
path="[?(@.thing in [1, '1'])]",
758766
),
759767
Token(
760768
kind=TOKEN_RBRACKET,
@@ -1010,10 +1018,18 @@ class Case:
10101018
want=[
10111019
Token(kind=TOKEN_ROOT, value="$", index=0, path="$['some', 'thing']"),
10121020
Token(kind=TOKEN_LIST_START, value="[", index=1, path="$['some', 'thing']"),
1013-
Token(kind=TOKEN_STRING, value="some", index=3, path="$['some', 'thing']"),
1021+
Token(
1022+
kind=TOKEN_SINGLE_QUOTE_STRING,
1023+
value="some",
1024+
index=3,
1025+
path="$['some', 'thing']",
1026+
),
10141027
Token(kind=TOKEN_COMMA, value=",", index=8, path="$['some', 'thing']"),
10151028
Token(
1016-
kind=TOKEN_STRING, value="thing", index=11, path="$['some', 'thing']"
1029+
kind=TOKEN_SINGLE_QUOTE_STRING,
1030+
value="thing",
1031+
index=11,
1032+
path="$['some', 'thing']",
10171033
),
10181034
Token(kind=TOKEN_RBRACKET, value="]", index=17, path="$['some', 'thing']"),
10191035
],

tests/test_parse.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ class Case:
108108
Case(
109109
description="filter with list membership test",
110110
path="$.some[?(@.thing in ['foo', 'bar', 42])]",
111-
want="$['some'][?(@['thing'] in ['foo', 'bar', 42])]",
111+
want="$['some'][?(@['thing'] in [\"foo\", \"bar\", 42])]",
112112
),
113113
Case(
114114
description="filter with boolean literals",
@@ -143,7 +143,7 @@ class Case:
143143
Case(
144144
description="filter with string literal",
145145
path="$.some[?(@.thing == 'foo')]",
146-
want="$['some'][?(@['thing'] == 'foo')]",
146+
want="$['some'][?(@['thing'] == \"foo\")]",
147147
),
148148
Case(
149149
description="filter with integer literal",
@@ -170,6 +170,16 @@ class Case:
170170
path="$.some.~",
171171
want="$['some'][~]",
172172
),
173+
Case(
174+
description="comparison to single quoted string literal with escape",
175+
path="$[[email protected] == 'ba\\'r']",
176+
want="$[?(@['foo'] == \"ba'r\")]",
177+
),
178+
Case(
179+
description="comparison to double quoted string literal with escape",
180+
path='$[[email protected] == "ba\\"r"]',
181+
want='$[?(@[\'foo\'] == "ba\\"r")]',
182+
),
173183
]
174184

175185

0 commit comments

Comments
 (0)