mirror of
https://github.com/ziglang/zig.git
synced 2024-11-14 16:13:24 +00:00
Merge pull request #20973 from ziglang/fix-json-utf8
std.json: fix partial strings getting dropped when multi-byte codepoints span input buffers
This commit is contained in:
commit
7a7421c749
@ -897,7 +897,7 @@ pub const Scanner = struct {
|
||||
},
|
||||
.number_post_dot => {
|
||||
if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
|
||||
switch (try self.expectByte()) {
|
||||
switch (self.input[self.cursor]) {
|
||||
'0'...'9' => {
|
||||
self.cursor += 1;
|
||||
self.state = .number_frac;
|
||||
@ -1032,7 +1032,8 @@ pub const Scanner = struct {
|
||||
return error.BufferUnderrun;
|
||||
},
|
||||
.string_backslash => {
|
||||
switch (try self.expectByte()) {
|
||||
if (self.cursor >= self.input.len) return self.endOfBufferInString();
|
||||
switch (self.input[self.cursor]) {
|
||||
'"', '\\', '/' => {
|
||||
// Since these characters now represent themselves literally,
|
||||
// we can simply begin the next plaintext slice here.
|
||||
@ -1080,7 +1081,8 @@ pub const Scanner = struct {
|
||||
}
|
||||
},
|
||||
.string_backslash_u => {
|
||||
const c = try self.expectByte();
|
||||
if (self.cursor >= self.input.len) return self.endOfBufferInString();
|
||||
const c = self.input[self.cursor];
|
||||
switch (c) {
|
||||
'0'...'9' => {
|
||||
self.utf16_code_units[0] = @as(u16, c - '0') << 12;
|
||||
@ -1098,7 +1100,8 @@ pub const Scanner = struct {
|
||||
continue :state_loop;
|
||||
},
|
||||
.string_backslash_u_1 => {
|
||||
const c = try self.expectByte();
|
||||
if (self.cursor >= self.input.len) return self.endOfBufferInString();
|
||||
const c = self.input[self.cursor];
|
||||
switch (c) {
|
||||
'0'...'9' => {
|
||||
self.utf16_code_units[0] |= @as(u16, c - '0') << 8;
|
||||
@ -1116,7 +1119,8 @@ pub const Scanner = struct {
|
||||
continue :state_loop;
|
||||
},
|
||||
.string_backslash_u_2 => {
|
||||
const c = try self.expectByte();
|
||||
if (self.cursor >= self.input.len) return self.endOfBufferInString();
|
||||
const c = self.input[self.cursor];
|
||||
switch (c) {
|
||||
'0'...'9' => {
|
||||
self.utf16_code_units[0] |= @as(u16, c - '0') << 4;
|
||||
@ -1134,7 +1138,8 @@ pub const Scanner = struct {
|
||||
continue :state_loop;
|
||||
},
|
||||
.string_backslash_u_3 => {
|
||||
const c = try self.expectByte();
|
||||
if (self.cursor >= self.input.len) return self.endOfBufferInString();
|
||||
const c = self.input[self.cursor];
|
||||
switch (c) {
|
||||
'0'...'9' => {
|
||||
self.utf16_code_units[0] |= c - '0';
|
||||
@ -1160,7 +1165,8 @@ pub const Scanner = struct {
|
||||
}
|
||||
},
|
||||
.string_surrogate_half => {
|
||||
switch (try self.expectByte()) {
|
||||
if (self.cursor >= self.input.len) return self.endOfBufferInString();
|
||||
switch (self.input[self.cursor]) {
|
||||
'\\' => {
|
||||
self.cursor += 1;
|
||||
self.state = .string_surrogate_half_backslash;
|
||||
@ -1170,7 +1176,8 @@ pub const Scanner = struct {
|
||||
}
|
||||
},
|
||||
.string_surrogate_half_backslash => {
|
||||
switch (try self.expectByte()) {
|
||||
if (self.cursor >= self.input.len) return self.endOfBufferInString();
|
||||
switch (self.input[self.cursor]) {
|
||||
'u' => {
|
||||
self.cursor += 1;
|
||||
self.state = .string_surrogate_half_backslash_u;
|
||||
@ -1180,7 +1187,8 @@ pub const Scanner = struct {
|
||||
}
|
||||
},
|
||||
.string_surrogate_half_backslash_u => {
|
||||
switch (try self.expectByte()) {
|
||||
if (self.cursor >= self.input.len) return self.endOfBufferInString();
|
||||
switch (self.input[self.cursor]) {
|
||||
'D', 'd' => {
|
||||
self.cursor += 1;
|
||||
self.utf16_code_units[1] = 0xD << 12;
|
||||
@ -1191,7 +1199,8 @@ pub const Scanner = struct {
|
||||
}
|
||||
},
|
||||
.string_surrogate_half_backslash_u_1 => {
|
||||
const c = try self.expectByte();
|
||||
if (self.cursor >= self.input.len) return self.endOfBufferInString();
|
||||
const c = self.input[self.cursor];
|
||||
switch (c) {
|
||||
'C'...'F' => {
|
||||
self.cursor += 1;
|
||||
@ -1209,7 +1218,8 @@ pub const Scanner = struct {
|
||||
}
|
||||
},
|
||||
.string_surrogate_half_backslash_u_2 => {
|
||||
const c = try self.expectByte();
|
||||
if (self.cursor >= self.input.len) return self.endOfBufferInString();
|
||||
const c = self.input[self.cursor];
|
||||
switch (c) {
|
||||
'0'...'9' => {
|
||||
self.cursor += 1;
|
||||
@ -1233,7 +1243,8 @@ pub const Scanner = struct {
|
||||
}
|
||||
},
|
||||
.string_surrogate_half_backslash_u_3 => {
|
||||
const c = try self.expectByte();
|
||||
if (self.cursor >= self.input.len) return self.endOfBufferInString();
|
||||
const c = self.input[self.cursor];
|
||||
switch (c) {
|
||||
'0'...'9' => {
|
||||
self.utf16_code_units[1] |= c - '0';
|
||||
@ -1254,7 +1265,8 @@ pub const Scanner = struct {
|
||||
},
|
||||
|
||||
.string_utf8_last_byte => {
|
||||
switch (try self.expectByte()) {
|
||||
if (self.cursor >= self.input.len) return self.endOfBufferInString();
|
||||
switch (self.input[self.cursor]) {
|
||||
0x80...0xBF => {
|
||||
self.cursor += 1;
|
||||
self.state = .string;
|
||||
@ -1264,7 +1276,8 @@ pub const Scanner = struct {
|
||||
}
|
||||
},
|
||||
.string_utf8_second_to_last_byte => {
|
||||
switch (try self.expectByte()) {
|
||||
if (self.cursor >= self.input.len) return self.endOfBufferInString();
|
||||
switch (self.input[self.cursor]) {
|
||||
0x80...0xBF => {
|
||||
self.cursor += 1;
|
||||
self.state = .string_utf8_last_byte;
|
||||
@ -1274,7 +1287,8 @@ pub const Scanner = struct {
|
||||
}
|
||||
},
|
||||
.string_utf8_second_to_last_byte_guard_against_overlong => {
|
||||
switch (try self.expectByte()) {
|
||||
if (self.cursor >= self.input.len) return self.endOfBufferInString();
|
||||
switch (self.input[self.cursor]) {
|
||||
0xA0...0xBF => {
|
||||
self.cursor += 1;
|
||||
self.state = .string_utf8_last_byte;
|
||||
@ -1284,7 +1298,8 @@ pub const Scanner = struct {
|
||||
}
|
||||
},
|
||||
.string_utf8_second_to_last_byte_guard_against_surrogate_half => {
|
||||
switch (try self.expectByte()) {
|
||||
if (self.cursor >= self.input.len) return self.endOfBufferInString();
|
||||
switch (self.input[self.cursor]) {
|
||||
0x80...0x9F => {
|
||||
self.cursor += 1;
|
||||
self.state = .string_utf8_last_byte;
|
||||
@ -1294,7 +1309,8 @@ pub const Scanner = struct {
|
||||
}
|
||||
},
|
||||
.string_utf8_third_to_last_byte => {
|
||||
switch (try self.expectByte()) {
|
||||
if (self.cursor >= self.input.len) return self.endOfBufferInString();
|
||||
switch (self.input[self.cursor]) {
|
||||
0x80...0xBF => {
|
||||
self.cursor += 1;
|
||||
self.state = .string_utf8_second_to_last_byte;
|
||||
@ -1304,7 +1320,8 @@ pub const Scanner = struct {
|
||||
}
|
||||
},
|
||||
.string_utf8_third_to_last_byte_guard_against_overlong => {
|
||||
switch (try self.expectByte()) {
|
||||
if (self.cursor >= self.input.len) return self.endOfBufferInString();
|
||||
switch (self.input[self.cursor]) {
|
||||
0x90...0xBF => {
|
||||
self.cursor += 1;
|
||||
self.state = .string_utf8_second_to_last_byte;
|
||||
@ -1314,7 +1331,8 @@ pub const Scanner = struct {
|
||||
}
|
||||
},
|
||||
.string_utf8_third_to_last_byte_guard_against_too_large => {
|
||||
switch (try self.expectByte()) {
|
||||
if (self.cursor >= self.input.len) return self.endOfBufferInString();
|
||||
switch (self.input[self.cursor]) {
|
||||
0x80...0x8F => {
|
||||
self.cursor += 1;
|
||||
self.state = .string_utf8_second_to_last_byte;
|
||||
@ -1666,6 +1684,17 @@ pub const Scanner = struct {
|
||||
self.value_start = self.cursor;
|
||||
return slice;
|
||||
}
|
||||
/// Returns the bytes of `self.input` from `self.value_start` up to
/// `trailing_negative_offset` bytes before `self.cursor`, then moves
/// `self.value_start` up to the cursor.
///
/// Returns an empty slice, leaving `self.value_start` untouched, when no
/// bytes lie in that range.
fn takeValueSliceMinusTrailingOffset(self: *@This(), trailing_negative_offset: usize) []const u8 {
    // Detect an escape sequence that started before the current input buffer.
    // The comparison is phrased as an addition on the left-hand side, instead
    // of subtracting from the cursor, so the unsigned arithmetic cannot
    // underflow; it guarantees the slice bounds computed below are valid.
    const first_excluded = self.value_start + trailing_negative_offset;
    if (first_excluded >= self.cursor) return "";

    const start = self.value_start;
    const end = self.cursor - trailing_negative_offset;

    // For a non-zero trailing_negative_offset this assignment is irrelevant:
    // value_start is always set again while emitting the
    // .partial_string_escaped_* token.
    self.value_start = self.cursor;
    return self.input[start..end];
}
|
||||
|
||||
fn endOfBufferInNumber(self: *@This(), allow_end: bool) !Token {
|
||||
const slice = self.takeValueSlice();
|
||||
@ -1678,6 +1707,39 @@ pub const Scanner = struct {
|
||||
return Token{ .partial_number = slice };
|
||||
}
|
||||
|
||||
/// Called when the cursor reaches the end of the input buffer while the
/// scanner is in one of the string states.
///
/// Returns `error.UnexpectedEndOfInput` if `self.is_end_of_input` is set.
/// Returns `error.BufferUnderrun` if there are no string bytes to hand out.
/// Otherwise returns a `.partial_string` token holding the pending bytes.
fn endOfBufferInString(self: *@This()) !Token {
    if (self.is_end_of_input) return error.UnexpectedEndOfInput;

    // How many bytes immediately before the cursor must be left out of the
    // partial string.
    const trailing_offset: usize = switch (self.state) {
        // An escape sequence is in progress: keep it out of the partial string.
        .string_backslash => 1,
        .string_backslash_u => 2,
        .string_backslash_u_1 => 3,
        .string_backslash_u_2 => 4,
        .string_backslash_u_3 => 5,
        .string_surrogate_half => 6,
        .string_surrogate_half_backslash => 7,
        .string_surrogate_half_backslash_u => 8,
        .string_surrogate_half_backslash_u_1 => 9,
        .string_surrogate_half_backslash_u_2 => 10,
        .string_surrogate_half_backslash_u_3 => 11,

        // Plain string content, including a UTF-8 sequence that is still
        // being read: hand out everything up to the cursor.
        .string,
        .string_utf8_last_byte,
        .string_utf8_second_to_last_byte,
        .string_utf8_second_to_last_byte_guard_against_overlong,
        .string_utf8_second_to_last_byte_guard_against_surrogate_half,
        .string_utf8_third_to_last_byte,
        .string_utf8_third_to_last_byte_guard_against_overlong,
        .string_utf8_third_to_last_byte_guard_against_too_large,
        => 0,

        // This function must only be reached from the states listed above.
        else => unreachable,
    };

    const pending = self.takeValueSliceMinusTrailingOffset(trailing_offset);
    if (pending.len == 0) return error.BufferUnderrun;
    return Token{ .partial_string = pending };
}
|
||||
|
||||
fn partialStringCodepoint(code_point: u21) Token {
|
||||
var buf: [4]u8 = undefined;
|
||||
switch (std.unicode.utf8Encode(code_point, &buf) catch unreachable) {
|
||||
|
@ -310,10 +310,44 @@ fn expectEqualTokens(expected_token: Token, actual_token: Token) !void {
|
||||
.number => |expected_value| {
|
||||
try std.testing.expectEqualStrings(expected_value, actual_token.number);
|
||||
},
|
||||
.allocated_number => |expected_value| {
|
||||
try std.testing.expectEqualStrings(expected_value, actual_token.allocated_number);
|
||||
},
|
||||
.partial_number => |expected_value| {
|
||||
try std.testing.expectEqualStrings(expected_value, actual_token.partial_number);
|
||||
},
|
||||
|
||||
.string => |expected_value| {
|
||||
try std.testing.expectEqualStrings(expected_value, actual_token.string);
|
||||
},
|
||||
else => {},
|
||||
.allocated_string => |expected_value| {
|
||||
try std.testing.expectEqualStrings(expected_value, actual_token.allocated_string);
|
||||
},
|
||||
.partial_string => |expected_value| {
|
||||
try std.testing.expectEqualStrings(expected_value, actual_token.partial_string);
|
||||
},
|
||||
.partial_string_escaped_1 => |expected_value| {
|
||||
try std.testing.expectEqualStrings(&expected_value, &actual_token.partial_string_escaped_1);
|
||||
},
|
||||
.partial_string_escaped_2 => |expected_value| {
|
||||
try std.testing.expectEqualStrings(&expected_value, &actual_token.partial_string_escaped_2);
|
||||
},
|
||||
.partial_string_escaped_3 => |expected_value| {
|
||||
try std.testing.expectEqualStrings(&expected_value, &actual_token.partial_string_escaped_3);
|
||||
},
|
||||
.partial_string_escaped_4 => |expected_value| {
|
||||
try std.testing.expectEqualStrings(&expected_value, &actual_token.partial_string_escaped_4);
|
||||
},
|
||||
|
||||
.object_begin,
|
||||
.object_end,
|
||||
.array_begin,
|
||||
.array_end,
|
||||
.true,
|
||||
.false,
|
||||
.null,
|
||||
.end_of_document,
|
||||
=> {},
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user