Skip to content

Commit 65c8920

Browse files
authored
Handle invalid escapes in string literals (#34)
* Fix invalid escapes in string literals * Handle escape sequences in non-name string literals
1 parent fd696ae commit 65c8920

File tree

8 files changed

+81
-35
lines changed

8 files changed

+81
-35
lines changed

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,16 @@
11
# Python JSONPath Change Log
22

3+
## Version 0.10.0 (unreleased)
4+
5+
**Breaking Changes**
6+
7+
- The JSONPath lexer now yields distinct tokens for single and double quoted string literals. This is so the parser can do a better job of detecting invalid escape sequences.
8+
- Changed the canonical representation of a JSONPath string literal to use double quotes instead of single quotes.
9+
10+
**Fixes**
11+
12+
- We no longer silently ignore invalid escape sequences in JSONPath string literals. For example, `$['\"']` used to be OK; it now raises a `JSONPathSyntaxError`.
13+
314
## Version 0.9.0
415

516
**Breaking Changes**

docs/syntax.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,6 @@ And this is a list of areas where we deviate from the [IETF JSONPath draft](http
193193
- The root token (default `$`) is optional.
194194
- Paths starting with a dot (`.`) are OK. `.thing` is the same as `$.thing`, as is `thing`, `$[thing]` and `$["thing"]`.
195195
- The built-in `match()` and `search()` filter functions use Python's standard library `re` module, which, at least, doesn't support Unicode properties. We might add an implementation of `match()` and `search()` using the third party [regex](https://pypi.org/project/regex/) package in the future.
196-
- We silently ignore unnecessary escaping when parsing some quoted selectors. The standard treats this as an "invalid selector".
197196

198197
And this is a list of features that are uncommon or unique to Python JSONPath.
199198

jsonpath/filter.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,9 @@ class StringLiteral(Literal[str]):
188188

189189
__slots__ = ()
190190

191+
def __str__(self) -> str:
192+
return json.dumps(self.value)
193+
191194

192195
class IntegerLiteral(Literal[int]):
193196
"""An integer literal."""

jsonpath/lex.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@
6060
from .token import TOKEN_SLICE_START
6161
from .token import TOKEN_SLICE_STEP
6262
from .token import TOKEN_SLICE_STOP
63-
from .token import TOKEN_STRING
6463
from .token import TOKEN_TRUE
6564
from .token import TOKEN_UNDEFINED
6665
from .token import TOKEN_UNION
@@ -256,13 +255,13 @@ def tokenize(self, path: str) -> Iterator[Token]: # noqa PLR0912
256255
)
257256
elif kind == TOKEN_DOUBLE_QUOTE_STRING:
258257
yield _token(
259-
kind=TOKEN_STRING,
258+
kind=TOKEN_DOUBLE_QUOTE_STRING,
260259
value=match.group("G_DQUOTE"),
261260
index=match.start("G_DQUOTE"),
262261
)
263262
elif kind == TOKEN_SINGLE_QUOTE_STRING:
264263
yield _token(
265-
kind=TOKEN_STRING,
264+
kind=TOKEN_SINGLE_QUOTE_STRING,
266265
value=match.group("G_SQUOTE"),
267266
index=match.start("G_SQUOTE"),
268267
)

jsonpath/parse.py

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""The default JSONPath parser."""
22
from __future__ import annotations
33

4-
import codecs
4+
import json
55
import re
66
from typing import TYPE_CHECKING
77
from typing import Callable
@@ -45,6 +45,7 @@
4545
from .token import TOKEN_COMMA
4646
from .token import TOKEN_CONTAINS
4747
from .token import TOKEN_DDOT
48+
from .token import TOKEN_DOUBLE_QUOTE_STRING
4849
from .token import TOKEN_EOF
4950
from .token import TOKEN_EQ
5051
from .token import TOKEN_FALSE
@@ -81,14 +82,15 @@
8182
from .token import TOKEN_ROOT
8283
from .token import TOKEN_RPAREN
8384
from .token import TOKEN_SELF
85+
from .token import TOKEN_SINGLE_QUOTE_STRING
8486
from .token import TOKEN_SLICE_START
8587
from .token import TOKEN_SLICE_STEP
8688
from .token import TOKEN_SLICE_STOP
87-
from .token import TOKEN_STRING
8889
from .token import TOKEN_TRUE
8990
from .token import TOKEN_UNDEFINED
9091
from .token import TOKEN_UNION
9192
from .token import TOKEN_WILD
93+
from .token import Token
9294

9395
if TYPE_CHECKING:
9496
from .env import JSONPathEnvironment
@@ -212,7 +214,8 @@ def __init__(self, *, env: JSONPathEnvironment) -> None:
212214
TOKEN_ROOT: self.parse_root_path,
213215
TOKEN_SELF: self.parse_self_path,
214216
TOKEN_FILTER_CONTEXT: self.parse_filter_context_path,
215-
TOKEN_STRING: self.parse_string_literal,
217+
TOKEN_DOUBLE_QUOTE_STRING: self.parse_string_literal,
218+
TOKEN_SINGLE_QUOTE_STRING: self.parse_string_literal,
216219
TOKEN_TRUE: self.parse_boolean,
217220
TOKEN_UNDEFINED: self.parse_undefined,
218221
TOKEN_FUNCTION: self.parse_function_extension,
@@ -225,7 +228,8 @@ def __init__(self, *, env: JSONPathEnvironment) -> None:
225228
TOKEN_NIL: self.parse_nil,
226229
TOKEN_NONE: self.parse_nil,
227230
TOKEN_NULL: self.parse_nil,
228-
TOKEN_STRING: self.parse_string_literal,
231+
TOKEN_DOUBLE_QUOTE_STRING: self.parse_string_literal,
232+
TOKEN_SINGLE_QUOTE_STRING: self.parse_string_literal,
229233
TOKEN_TRUE: self.parse_boolean,
230234
}
231235

@@ -239,7 +243,8 @@ def __init__(self, *, env: JSONPathEnvironment) -> None:
239243
TOKEN_NIL: self.parse_nil,
240244
TOKEN_NONE: self.parse_nil,
241245
TOKEN_NULL: self.parse_nil,
242-
TOKEN_STRING: self.parse_string_literal,
246+
TOKEN_SINGLE_QUOTE_STRING: self.parse_string_literal,
247+
TOKEN_DOUBLE_QUOTE_STRING: self.parse_string_literal,
243248
TOKEN_TRUE: self.parse_boolean,
244249
TOKEN_ROOT: self.parse_root_path,
245250
TOKEN_SELF: self.parse_self_path,
@@ -384,29 +389,21 @@ def parse_selector_list(self, stream: TokenStream) -> ListSelector: # noqa: PLR
384389
token=stream.current,
385390
)
386391
)
387-
elif stream.current.kind == TOKEN_STRING:
392+
elif stream.current.kind in (
393+
TOKEN_DOUBLE_QUOTE_STRING,
394+
TOKEN_SINGLE_QUOTE_STRING,
395+
):
388396
if self.RE_INVALID_NAME_SELECTOR.search(stream.current.value):
389397
raise JSONPathSyntaxError(
390398
f"invalid name selector {stream.current.value!r}",
391399
token=stream.current,
392400
)
393401

394-
if self.env.unicode_escape:
395-
name = (
396-
codecs.decode(
397-
stream.current.value.replace("\\/", "/"), "unicode-escape"
398-
)
399-
.encode("utf-16", "surrogatepass")
400-
.decode("utf-16")
401-
)
402-
else:
403-
name = stream.current.value
404-
405402
list_items.append(
406403
PropertySelector(
407404
env=self.env,
408405
token=stream.current,
409-
name=name,
406+
name=self._decode_string_literal(stream.current),
410407
),
411408
)
412409
elif stream.current.kind == TOKEN_SLICE_START:
@@ -454,7 +451,7 @@ def parse_undefined(self, _: TokenStream) -> FilterExpression:
454451
return UNDEFINED_LITERAL
455452

456453
def parse_string_literal(self, stream: TokenStream) -> FilterExpression:
457-
return StringLiteral(value=stream.current.value)
454+
return StringLiteral(value=self._decode_string_literal(stream.current))
458455

459456
def parse_integer_literal(self, stream: TokenStream) -> FilterExpression:
460457
return IntegerLiteral(value=int(stream.current.value))
@@ -611,3 +608,18 @@ def parse_filter_selector(
611608
left = self.parse_infix_expression(stream, left)
612609

613610
return left
611+
612+
def _decode_string_literal(self, token: Token) -> str:
613+
if self.env.unicode_escape:
614+
if token.kind == TOKEN_SINGLE_QUOTE_STRING:
615+
value = token.value.replace('"', '\\"').replace("\\'", "'")
616+
else:
617+
value = token.value
618+
try:
619+
rv = json.loads(f'"{value}"')
620+
assert isinstance(rv, str)
621+
return rv
622+
except json.JSONDecodeError as err:
623+
raise JSONPathSyntaxError(str(err).split(":")[1], token=token) from None
624+
625+
return token.value

tests/test_compliance.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,6 @@ class Case:
3737
"functions, match, filter, match function, unicode char class negated, uppercase": "\\P not supported", # noqa: E501
3838
"functions, search, filter, search function, unicode char class, uppercase": "\\p not supported", # noqa: E501
3939
"functions, search, filter, search function, unicode char class negated, uppercase": "\\P not supported", # noqa: E501
40-
"name selector, double quotes, invalid escaped single quote": "ignore",
41-
"name selector, double quotes, incomplete escape": "ignore",
42-
"name selector, single quotes, invalid escaped double quote": "ignore",
43-
"name selector, single quotes, incomplete escape": "ignore",
4440
"filter, non-singular query in comparison, slice": "TODO",
4541
"filter, non-singular query in comparison, all children": "TODO",
4642
"filter, non-singular query in comparison, descendants": "TODO",

tests/test_lex.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from jsonpath.token import TOKEN_BARE_PROPERTY
1111
from jsonpath.token import TOKEN_COMMA
1212
from jsonpath.token import TOKEN_DDOT
13+
from jsonpath.token import TOKEN_DOUBLE_QUOTE_STRING
1314
from jsonpath.token import TOKEN_EQ
1415
from jsonpath.token import TOKEN_FALSE
1516
from jsonpath.token import TOKEN_FILTER_END
@@ -35,10 +36,10 @@
3536
from jsonpath.token import TOKEN_ROOT
3637
from jsonpath.token import TOKEN_RPAREN
3738
from jsonpath.token import TOKEN_SELF
39+
from jsonpath.token import TOKEN_SINGLE_QUOTE_STRING
3840
from jsonpath.token import TOKEN_SLICE_START
3941
from jsonpath.token import TOKEN_SLICE_STEP
4042
from jsonpath.token import TOKEN_SLICE_STOP
41-
from jsonpath.token import TOKEN_STRING
4243
from jsonpath.token import TOKEN_TRUE
4344
from jsonpath.token import TOKEN_UNION
4445
from jsonpath.token import TOKEN_WILD
@@ -84,7 +85,9 @@ class Case:
8485
want=[
8586
Token(kind=TOKEN_ROOT, value="$", index=0, path='$["some"]'),
8687
Token(kind=TOKEN_LIST_START, value="[", index=1, path='$["some"]'),
87-
Token(kind=TOKEN_STRING, value="some", index=3, path='$["some"]'),
88+
Token(
89+
kind=TOKEN_DOUBLE_QUOTE_STRING, value="some", index=3, path='$["some"]'
90+
),
8891
Token(kind=TOKEN_RBRACKET, value="]", index=8, path='$["some"]'),
8992
],
9093
),
@@ -94,7 +97,9 @@ class Case:
9497
want=[
9598
Token(kind=TOKEN_ROOT, value="$", index=0, path="$['some']"),
9699
Token(kind=TOKEN_LIST_START, value="[", index=1, path="$['some']"),
97-
Token(kind=TOKEN_STRING, value="some", index=3, path="$['some']"),
100+
Token(
101+
kind=TOKEN_SINGLE_QUOTE_STRING, value="some", index=3, path="$['some']"
102+
),
98103
Token(kind=TOKEN_RBRACKET, value="]", index=8, path="$['some']"),
99104
],
100105
),
@@ -754,7 +759,10 @@ class Case:
754759
kind=TOKEN_COMMA, value=",", index=16, path="[?(@.thing in [1, '1'])]"
755760
),
756761
Token(
757-
kind=TOKEN_STRING, value="1", index=19, path="[?(@.thing in [1, '1'])]"
762+
kind=TOKEN_SINGLE_QUOTE_STRING,
763+
value="1",
764+
index=19,
765+
path="[?(@.thing in [1, '1'])]",
758766
),
759767
Token(
760768
kind=TOKEN_RBRACKET,
@@ -1010,10 +1018,18 @@ class Case:
10101018
want=[
10111019
Token(kind=TOKEN_ROOT, value="$", index=0, path="$['some', 'thing']"),
10121020
Token(kind=TOKEN_LIST_START, value="[", index=1, path="$['some', 'thing']"),
1013-
Token(kind=TOKEN_STRING, value="some", index=3, path="$['some', 'thing']"),
1021+
Token(
1022+
kind=TOKEN_SINGLE_QUOTE_STRING,
1023+
value="some",
1024+
index=3,
1025+
path="$['some', 'thing']",
1026+
),
10141027
Token(kind=TOKEN_COMMA, value=",", index=8, path="$['some', 'thing']"),
10151028
Token(
1016-
kind=TOKEN_STRING, value="thing", index=11, path="$['some', 'thing']"
1029+
kind=TOKEN_SINGLE_QUOTE_STRING,
1030+
value="thing",
1031+
index=11,
1032+
path="$['some', 'thing']",
10171033
),
10181034
Token(kind=TOKEN_RBRACKET, value="]", index=17, path="$['some', 'thing']"),
10191035
],

tests/test_parse.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ class Case:
108108
Case(
109109
description="filter with list membership test",
110110
path="$.some[?(@.thing in ['foo', 'bar', 42])]",
111-
want="$['some'][?(@['thing'] in ['foo', 'bar', 42])]",
111+
want="$['some'][?(@['thing'] in [\"foo\", \"bar\", 42])]",
112112
),
113113
Case(
114114
description="filter with boolean literals",
@@ -143,7 +143,7 @@ class Case:
143143
Case(
144144
description="filter with string literal",
145145
path="$.some[?(@.thing == 'foo')]",
146-
want="$['some'][?(@['thing'] == 'foo')]",
146+
want="$['some'][?(@['thing'] == \"foo\")]",
147147
),
148148
Case(
149149
description="filter with integer literal",
@@ -170,6 +170,16 @@ class Case:
170170
path="$.some.~",
171171
want="$['some'][~]",
172172
),
173+
Case(
174+
description="comparison to single quoted string literal with escape",
175+
path="$[[email protected] == 'ba\\'r']",
176+
want="$[?(@['foo'] == \"ba'r\")]",
177+
),
178+
Case(
179+
description="comparison to double quoted string literal with escape",
180+
path='$[[email protected] == "ba\\"r"]',
181+
want='$[?(@[\'foo\'] == "ba\\"r")]',
182+
),
173183
]
174184

175185

0 commit comments

Comments
 (0)