Merge pull request #20973 from ziglang/fix-json-utf8

std.json: fix partial strings getting dropped when multi-byte codepoints span input buffers
2024-11-14 16:13:24 +00:00 · 2024-08-07 18:37:49 -07:00 · 2024-08-07 18:37:49 -07:00 · 7a7421c749
commit 7a7421c749
parent 4381bac792 a805454dea
2 changed files with 116 additions and 20 deletions
--- a/lib/std/json/scanner.zig
+++ b/lib/std/json/scanner.zig
@ -897,7 +897,7 @@ pub const Scanner = struct {
                },
                .number_post_dot => {
                    if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
-                    switch (try self.expectByte()) {
+                    switch (self.input[self.cursor]) {
                        '0'...'9' => {
                            self.cursor += 1;
                            self.state = .number_frac;
@ -1032,7 +1032,8 @@ pub const Scanner = struct {
                    return error.BufferUnderrun;
                },
                .string_backslash => {
-                    switch (try self.expectByte()) {
+                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
+                    switch (self.input[self.cursor]) {
                        '"', '\\', '/' => {
                            // Since these characters now represent themselves literally,
                            // we can simply begin the next plaintext slice here.
@ -1080,7 +1081,8 @@ pub const Scanner = struct {
                    }
                },
                .string_backslash_u => {
-                    const c = try self.expectByte();
+                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
+                    const c = self.input[self.cursor];
                    switch (c) {
                        '0'...'9' => {
                            self.utf16_code_units[0] = @as(u16, c - '0') << 12;
@ -1098,7 +1100,8 @@ pub const Scanner = struct {
                    continue :state_loop;
                },
                .string_backslash_u_1 => {
-                    const c = try self.expectByte();
+                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
+                    const c = self.input[self.cursor];
                    switch (c) {
                        '0'...'9' => {
                            self.utf16_code_units[0] |= @as(u16, c - '0') << 8;
@ -1116,7 +1119,8 @@ pub const Scanner = struct {
                    continue :state_loop;
                },
                .string_backslash_u_2 => {
-                    const c = try self.expectByte();
+                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
+                    const c = self.input[self.cursor];
                    switch (c) {
                        '0'...'9' => {
                            self.utf16_code_units[0] |= @as(u16, c - '0') << 4;
@ -1134,7 +1138,8 @@ pub const Scanner = struct {
                    continue :state_loop;
                },
                .string_backslash_u_3 => {
-                    const c = try self.expectByte();
+                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
+                    const c = self.input[self.cursor];
                    switch (c) {
                        '0'...'9' => {
                            self.utf16_code_units[0] |= c - '0';
@ -1160,7 +1165,8 @@ pub const Scanner = struct {
                    }
                },
                .string_surrogate_half => {
-                    switch (try self.expectByte()) {
+                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
+                    switch (self.input[self.cursor]) {
                        '\\' => {
                            self.cursor += 1;
                            self.state = .string_surrogate_half_backslash;
@ -1170,7 +1176,8 @@ pub const Scanner = struct {
                    }
                },
                .string_surrogate_half_backslash => {
-                    switch (try self.expectByte()) {
+                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
+                    switch (self.input[self.cursor]) {
                        'u' => {
                            self.cursor += 1;
                            self.state = .string_surrogate_half_backslash_u;
@ -1180,7 +1187,8 @@ pub const Scanner = struct {
                    }
                },
                .string_surrogate_half_backslash_u => {
-                    switch (try self.expectByte()) {
+                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
+                    switch (self.input[self.cursor]) {
                        'D', 'd' => {
                            self.cursor += 1;
                            self.utf16_code_units[1] = 0xD << 12;
@ -1191,7 +1199,8 @@ pub const Scanner = struct {
                    }
                },
                .string_surrogate_half_backslash_u_1 => {
-                    const c = try self.expectByte();
+                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
+                    const c = self.input[self.cursor];
                    switch (c) {
                        'C'...'F' => {
                            self.cursor += 1;
@ -1209,7 +1218,8 @@ pub const Scanner = struct {
                    }
                },
                .string_surrogate_half_backslash_u_2 => {
-                    const c = try self.expectByte();
+                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
+                    const c = self.input[self.cursor];
                    switch (c) {
                        '0'...'9' => {
                            self.cursor += 1;
@ -1233,7 +1243,8 @@ pub const Scanner = struct {
                    }
                },
                .string_surrogate_half_backslash_u_3 => {
-                    const c = try self.expectByte();
+                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
+                    const c = self.input[self.cursor];
                    switch (c) {
                        '0'...'9' => {
                            self.utf16_code_units[1] |= c - '0';
@ -1254,7 +1265,8 @@ pub const Scanner = struct {
                },

                .string_utf8_last_byte => {
-                    switch (try self.expectByte()) {
+                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
+                    switch (self.input[self.cursor]) {
                        0x80...0xBF => {
                            self.cursor += 1;
                            self.state = .string;
@ -1264,7 +1276,8 @@ pub const Scanner = struct {
                    }
                },
                .string_utf8_second_to_last_byte => {
-                    switch (try self.expectByte()) {
+                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
+                    switch (self.input[self.cursor]) {
                        0x80...0xBF => {
                            self.cursor += 1;
                            self.state = .string_utf8_last_byte;
@ -1274,7 +1287,8 @@ pub const Scanner = struct {
                    }
                },
                .string_utf8_second_to_last_byte_guard_against_overlong => {
-                    switch (try self.expectByte()) {
+                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
+                    switch (self.input[self.cursor]) {
                        0xA0...0xBF => {
                            self.cursor += 1;
                            self.state = .string_utf8_last_byte;
@ -1284,7 +1298,8 @@ pub const Scanner = struct {
                    }
                },
                .string_utf8_second_to_last_byte_guard_against_surrogate_half => {
-                    switch (try self.expectByte()) {
+                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
+                    switch (self.input[self.cursor]) {
                        0x80...0x9F => {
                            self.cursor += 1;
                            self.state = .string_utf8_last_byte;
@ -1294,7 +1309,8 @@ pub const Scanner = struct {
                    }
                },
                .string_utf8_third_to_last_byte => {
-                    switch (try self.expectByte()) {
+                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
+                    switch (self.input[self.cursor]) {
                        0x80...0xBF => {
                            self.cursor += 1;
                            self.state = .string_utf8_second_to_last_byte;
@ -1304,7 +1320,8 @@ pub const Scanner = struct {
                    }
                },
                .string_utf8_third_to_last_byte_guard_against_overlong => {
-                    switch (try self.expectByte()) {
+                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
+                    switch (self.input[self.cursor]) {
                        0x90...0xBF => {
                            self.cursor += 1;
                            self.state = .string_utf8_second_to_last_byte;
@ -1314,7 +1331,8 @@ pub const Scanner = struct {
                    }
                },
                .string_utf8_third_to_last_byte_guard_against_too_large => {
-                    switch (try self.expectByte()) {
+                    if (self.cursor >= self.input.len) return self.endOfBufferInString();
+                    switch (self.input[self.cursor]) {
                        0x80...0x8F => {
                            self.cursor += 1;
                            self.state = .string_utf8_second_to_last_byte;
@ -1666,6 +1684,17 @@ pub const Scanner = struct {
        self.value_start = self.cursor;
        return slice;
    }
+    /// Returns the pending value bytes `input[value_start .. cursor - trailing_negative_offset]`
+    /// and then moves `value_start` up to `cursor`.
+    ///
+    /// `trailing_negative_offset` is the number of bytes immediately before `cursor`
+    /// that must be left out of the returned slice.
+    ///
+    /// Returns `""` without touching `value_start` when
+    /// `cursor <= value_start + trailing_negative_offset`, which is the case when the
+    /// escape sequence started before the current input buffer.
+    fn takeValueSliceMinusTrailingOffset(self: *@This(), trailing_negative_offset: usize) []const u8 {
+        // Compare using addition on the right-hand side rather than subtracting from
+        // `cursor`: the operands are unsigned, so the subtraction below is only
+        // performed once it is known not to underflow and the slice bounds are valid.
+        if (self.cursor > self.value_start + trailing_negative_offset) {
+            const end = self.cursor - trailing_negative_offset;
+            const pending = self.input[self.value_start..end];
+            // For a non-zero offset this assignment is not significant: `value_start`
+            // is assigned again when the `.partial_string_escaped_*` token is emitted.
+            self.value_start = self.cursor;
+            return pending;
+        }
+        return "";
+    }

    fn endOfBufferInNumber(self: *@This(), allow_end: bool) !Token {
        const slice = self.takeValueSlice();
@ -1678,6 +1707,39 @@ pub const Scanner = struct {
        return Token{ .partial_number = slice };
    }

+    /// Produces the result for a string state that found `cursor` at the end of `input`.
+    ///
+    /// Returns `error.UnexpectedEndOfInput` when `is_end_of_input` is set.
+    /// Otherwise returns a `.partial_string` token with the value bytes seen so far,
+    /// leaving out any escape sequence that is still incomplete, or
+    /// `error.BufferUnderrun` when there are no such bytes to hand out.
+    ///
+    /// `state` must be one of the string states listed in the switch below;
+    /// any other state reaches `unreachable`.
+    fn endOfBufferInString(self: *@This()) !Token {
+        if (self.is_end_of_input) return error.UnexpectedEndOfInput;
+        // How many bytes just before the cursor belong to an unfinished escape
+        // sequence and therefore must not be part of the partial string.
+        const unfinished_escape_len: usize = switch (self.state) {
+            // Plain text and partially read UTF-8 sequences:
+            // hand out everything up to the cursor.
+            .string,
+            .string_utf8_last_byte,
+            .string_utf8_second_to_last_byte,
+            .string_utf8_second_to_last_byte_guard_against_overlong,
+            .string_utf8_second_to_last_byte_guard_against_surrogate_half,
+            .string_utf8_third_to_last_byte,
+            .string_utf8_third_to_last_byte_guard_against_overlong,
+            .string_utf8_third_to_last_byte_guard_against_too_large,
+            => 0,
+
+            // Escape sequences: one more byte is held back for each
+            // successive state of the escape.
+            .string_backslash => 1,
+            .string_backslash_u => 2,
+            .string_backslash_u_1 => 3,
+            .string_backslash_u_2 => 4,
+            .string_backslash_u_3 => 5,
+            .string_surrogate_half => 6,
+            .string_surrogate_half_backslash => 7,
+            .string_surrogate_half_backslash_u => 8,
+            .string_surrogate_half_backslash_u_1 => 9,
+            .string_surrogate_half_backslash_u_2 => 10,
+            .string_surrogate_half_backslash_u_3 => 11,
+
+            else => unreachable,
+        };
+        const pending = self.takeValueSliceMinusTrailingOffset(unfinished_escape_len);
+        // Nothing to hand out yet: report that more input is needed instead of
+        // returning an empty partial string.
+        if (pending.len > 0) return Token{ .partial_string = pending };
+        return error.BufferUnderrun;
+    }
+
    fn partialStringCodepoint(code_point: u21) Token {
        var buf: [4]u8 = undefined;
        switch (std.unicode.utf8Encode(code_point, &buf) catch unreachable) {
--- a/lib/std/json/scanner_test.zig
+++ b/lib/std/json/scanner_test.zig
@ -310,10 +310,44 @@ fn expectEqualTokens(expected_token: Token, actual_token: Token) !void {
        .number => |expected_value| {
            try std.testing.expectEqualStrings(expected_value, actual_token.number);
        },
+        .allocated_number => |expected_value| {
+            try std.testing.expectEqualStrings(expected_value, actual_token.allocated_number);
+        },
+        .partial_number => |expected_value| {
+            try std.testing.expectEqualStrings(expected_value, actual_token.partial_number);
+        },
+
        .string => |expected_value| {
            try std.testing.expectEqualStrings(expected_value, actual_token.string);
        },
-        else => {},
+        .allocated_string => |expected_value| {
+            try std.testing.expectEqualStrings(expected_value, actual_token.allocated_string);
+        },
+        .partial_string => |expected_value| {
+            try std.testing.expectEqualStrings(expected_value, actual_token.partial_string);
+        },
+        .partial_string_escaped_1 => |expected_value| {
+            try std.testing.expectEqualStrings(&expected_value, &actual_token.partial_string_escaped_1);
+        },
+        .partial_string_escaped_2 => |expected_value| {
+            try std.testing.expectEqualStrings(&expected_value, &actual_token.partial_string_escaped_2);
+        },
+        .partial_string_escaped_3 => |expected_value| {
+            try std.testing.expectEqualStrings(&expected_value, &actual_token.partial_string_escaped_3);
+        },
+        .partial_string_escaped_4 => |expected_value| {
+            try std.testing.expectEqualStrings(&expected_value, &actual_token.partial_string_escaped_4);
+        },
+
+        .object_begin,
+        .object_end,
+        .array_begin,
+        .array_end,
+        .true,
+        .false,
+        .null,
+        .end_of_document,
+        => {},
    }
 }