More precise error message for unencodable \u escapes

The surrogate code points U+D800 to U+DFFF are valid code points but are not Unicode scalar values. This commit makes the error message more accurately reflect what is actually allowed in `\u` escape sequences.

From https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf:

> D71 High-surrogate code point: A Unicode code point in the range U+D800 to U+DBFF.
> D73 Low-surrogate code point: A Unicode code point in the range U+DC00 to U+DFFF.
>
> 3.9 Unicode Encoding Forms
> D76 Unicode scalar value: Any Unicode code point except high-surrogate and low-surrogate code points.

Related: #20270
This commit is contained in:
Ryan Liptak 2024-06-11 22:13:22 -07:00 committed by Andrew Kelley
parent 44f4abf380
commit 0cef727e59
3 changed files with 4 additions and 4 deletions

View File

@@ -728,12 +728,12 @@
</tr>
<tr>
<th scope="row"><code>\u{NNNNNN}</code></th>
<td>hexadecimal Unicode code point UTF-8 encoded (1 or more digits)</td>
<td>hexadecimal Unicode scalar value UTF-8 encoded (1 or more digits)</td>
</tr>
</tbody>
</table>
</div>
<p>Note that the maximum valid Unicode point is {#syntax#}0x10ffff{#endsyntax#}.</p>
<p>Note that the maximum valid Unicode scalar value is {#syntax#}0x10ffff{#endsyntax#}.</p>
{#header_close#}
{#header_open|Multiline String Literals#}
<p>

View File

@@ -11306,7 +11306,7 @@ fn failWithStrLitError(astgen: *AstGen, err: std.zig.string_literal.Error, token
return astgen.failOff(
token,
offset + @as(u32, @intCast(bad_index)),
"unicode escape does not correspond to a valid codepoint",
"unicode escape does not correspond to a valid unicode scalar value",
.{},
);
},

View File

@@ -522,7 +522,7 @@ const Parse = struct {
try p.appendErrorOff(
token,
offset + @as(u32, @intCast(bad_index)),
"unicode escape does not correspond to a valid codepoint",
"unicode escape does not correspond to a valid unicode scalar value",
.{},
);
},