Skip to content

Commit c93329b

Browse files
Issue #11489: JSON decoder now accepts lone surrogates.
1 parent f45bbb6 commit c93329b

4 files changed

Lines changed: 73 additions & 41 deletions

File tree

‎Lib/json/decoder.py‎

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,16 @@ def errmsg(msg, doc, pos, end=None):
6666
'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t',
6767
}
6868

69+
def _decode_uXXXX(s, pos):
70+
esc = s[pos + 1:pos + 5]
71+
if len(esc) == 4 and esc[1] not in 'xX':
72+
try:
73+
return int(esc, 16)
74+
except ValueError:
75+
pass
76+
msg = "Invalid \\uXXXX escape"
77+
raise ValueError(errmsg(msg, s, pos))
78+
6979
def py_scanstring(s, end, strict=True,
7080
_b=BACKSLASH, _m=STRINGCHUNK.match):
7181
"""Scan the string s for a JSON string. End is the index of the
@@ -115,25 +125,14 @@ def py_scanstring(s, end, strict=True,
115125
raise ValueError(errmsg(msg, s, end))
116126
end += 1
117127
else:
118-
esc = s[end + 1:end + 5]
119-
next_end = end + 5
120-
if len(esc) != 4:
121-
msg = "Invalid \\uXXXX escape"
122-
raise ValueError(errmsg(msg, s, end))
123-
uni = int(esc, 16)
124-
if 0xd800 <= uni <= 0xdbff:
125-
msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
126-
if not s[end + 5:end + 7] == '\\u':
127-
raise ValueError(errmsg(msg, s, end))
128-
esc2 = s[end + 7:end + 11]
129-
if len(esc2) != 4:
130-
raise ValueError(errmsg(msg, s, end))
131-
uni2 = int(esc2, 16)
132-
uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
133-
next_end += 6
128+
uni = _decode_uXXXX(s, end)
129+
end += 5
130+
if 0xd800 <= uni <= 0xdbff and s[end:end + 2] == '\\u':
131+
uni2 = _decode_uXXXX(s, end + 1)
132+
if 0xdc00 <= uni2 <= 0xdfff:
133+
uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
134+
end += 6
134135
char = chr(uni)
135-
136-
end = next_end
137136
_append(char)
138137
return ''.join(chunks), end
139138

‎Lib/test/test_json/test_scanstring.py‎

Lines changed: 47 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,6 @@
55
class TestScanstring:
66
def test_scanstring(self):
77
scanstring = self.json.decoder.scanstring
8-
self.assertEqual(
9-
scanstring('"z\\ud834\\udd20x"', 1, True),
10-
('z\U0001d120x', 16))
11-
128
self.assertEqual(
139
scanstring('"z\U0001d120x"', 1, True),
1410
('z\U0001d120x', 5))
@@ -89,6 +85,53 @@ def test_scanstring(self):
8985
scanstring('["Bad value", truth]', 2, True),
9086
('Bad value', 12))
9187

88+
def test_surrogates(self):
    """Surrogates, escaped or literal, are decoded whether paired or lone.

    Every input is scanned from index 1 (just past the opening quote) in
    strict mode; the result must be the expected text together with an
    end index equal to the full length of the input.
    """
    scan = self.json.decoder.scanstring
    cases = [
        # escaped high surrogate followed by a non-surrogate escape
        ('"z\\ud834\\u0079x"', 'z\ud834yx'),
        # escaped high + escaped low: joined into one code point
        ('"z\\ud834\\udd20x"', 'z\U0001d120x'),
        # lone escaped high surrogate followed by a valid escaped pair
        ('"z\\ud834\\ud834\\udd20x"', 'z\ud834\U0001d120x'),
        # lone escaped high surrogate
        ('"z\\ud834x"', 'z\ud834x'),
        # escaped high surrogate followed by a literal low surrogate
        ('"z\\ud834\udd20x12345"', 'z\ud834\udd20x12345'),
        # lone escaped low surrogate
        ('"z\\udd20x"', 'z\udd20x'),
        # literal high + literal low surrogate
        ('"z\ud834\udd20x"', 'z\ud834\udd20x'),
        # literal high surrogate followed by an escaped low surrogate
        ('"z\ud834\\udd20x"', 'z\ud834\udd20x'),
        # lone literal high surrogate
        ('"z\ud834x"', 'z\ud834x'),
    ]
    for given, expect in cases:
        self.assertEqual(scan(given, 1, True), (expect, len(given)))
103+
104+
def test_bad_escapes(self):
    """Truncated or malformed backslash escapes must raise ValueError.

    Each document is scanned from index 1 in strict mode; the offending
    document is attached to the assertion as its failure message.
    """
    scan = self.json.decoder.scanstring
    malformed = (
        # a single escape that is cut short or contains a non-hex character
        '"\\"',
        '"\\x"',
        '"\\u"',
        '"\\u0"',
        '"\\u01"',
        '"\\u012"',
        '"\\uz012"',
        '"\\u0z12"',
        '"\\u01z2"',
        '"\\u012z"',
        '"\\u0x12"',
        '"\\u0X12"',
        # a high surrogate followed by a broken second escape
        '"\\ud834\\"',
        '"\\ud834\\u"',
        '"\\ud834\\ud"',
        '"\\ud834\\udd"',
        '"\\ud834\\udd2"',
        '"\\ud834\\uzdd2"',
        '"\\ud834\\udzd2"',
        '"\\ud834\\uddz2"',
        '"\\ud834\\udd2z"',
        '"\\ud834\\u0x20"',
        '"\\ud834\\u0X20"',
    )
    for text in malformed:
        with self.assertRaises(ValueError, msg=text):
            scan(text, 1, True)
134+
92135
def test_overflow(self):
93136
with self.assertRaises(OverflowError):
94137
self.json.decoder.scanstring(b"xxx", sys.maxsize+1)

‎Misc/NEWS‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ Core and Builtins
1616
Library
1717
-------
1818

19+
- Issue #11489: JSON decoder now accepts lone surrogates.
20+
1921
- Issue #19545: Avoid chained exceptions while passing stray % to
2022
time.strptime(). Initial patch by Claudiu Popa.
2123

‎Modules/_json.c‎

Lines changed: 7 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -433,17 +433,10 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
433433
}
434434
}
435435
/* Surrogate pair */
436-
if ((c & 0xfc00) == 0xd800) {
436+
if (Py_UNICODE_IS_HIGH_SURROGATE(c) && end + 6 < len &&
437+
PyUnicode_READ(kind, buf, next++) == '\\' &&
438+
PyUnicode_READ(kind, buf, next++) == 'u') {
437439
Py_UCS4 c2 = 0;
438-
if (end + 6 >= len) {
439-
raise_errmsg("Unpaired high surrogate", pystr, end - 5);
440-
goto bail;
441-
}
442-
if (PyUnicode_READ(kind, buf, next++) != '\\' ||
443-
PyUnicode_READ(kind, buf, next++) != 'u') {
444-
raise_errmsg("Unpaired high surrogate", pystr, end - 5);
445-
goto bail;
446-
}
447440
end += 6;
448441
/* Decode 4 hex digits */
449442
for (; next < end; next++) {
@@ -464,15 +457,10 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
464457
goto bail;
465458
}
466459
}
467-
if ((c2 & 0xfc00) != 0xdc00) {
468-
raise_errmsg("Unpaired high surrogate", pystr, end - 5);
469-
goto bail;
470-
}
471-
c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
472-
}
473-
else if ((c & 0xfc00) == 0xdc00) {
474-
raise_errmsg("Unpaired low surrogate", pystr, end - 5);
475-
goto bail;
460+
if (Py_UNICODE_IS_LOW_SURROGATE(c2))
461+
c = Py_UNICODE_JOIN_SURROGATES(c, c2);
462+
else
463+
end -= 6;
476464
}
477465
}
478466
APPEND_OLD_CHUNK

0 commit comments

Comments
 (0)