zig/lib/std/unicode.zig
Jacob Young fe93332ba2 x86_64: implement enough to pass unicode tests
* implement vector comparison
 * implement reduce for bool vectors
 * fix `@memcpy` bug
 * enable passing std tests
2023-10-23 22:42:18 -04:00

1112 lines
40 KiB
Zig
Raw Blame History

const std = @import("./std.zig");
const assert = std.debug.assert;
const testing = std.testing;
const mem = std.mem;
/// Use this to replace an unknown, unrecognized, or unrepresentable character.
///
/// See also: https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character
pub const replacement_character: u21 = 0xFFFD;
/// Computes the number of bytes needed to encode codepoint `c` as UTF-8.
/// Returns 1-4, or `error.CodepointTooLarge` when `c` is 0x110000 or above.
pub fn utf8CodepointSequenceLength(c: u21) !u3 {
    return switch (c) {
        0x0000...0x007F => 1,
        0x0080...0x07FF => 2,
        0x0800...0xFFFF => 3,
        0x1_0000...0x10_FFFF => 4,
        else => error.CodepointTooLarge,
    };
}
/// Classifies `first_byte` as the start byte of a UTF-8 sequence and returns
/// the total sequence length (1-4) that it announces.
/// Bytes that cannot start a sequence yield `error.Utf8InvalidStartByte`.
pub fn utf8ByteSequenceLength(first_byte: u8) !u3 {
    // The switch is optimized much better than a "smart" approach using @clz
    return switch (first_byte) {
        0x00...0x7F => 1, // 0xxxxxxx
        0xC0...0xDF => 2, // 110xxxxx
        0xE0...0xEF => 3, // 1110xxxx
        0xF0...0xF7 => 4, // 11110xxx
        else => error.Utf8InvalidStartByte,
    };
}
/// Writes the UTF-8 encoding of codepoint `c` to the start of `out`.
/// Asserts that `out.len >= utf8CodepointSequenceLength(c)`.
/// Errors: `error.CodepointTooLarge` when `c` is above the encodable range,
/// `error.Utf8CannotEncodeSurrogateHalf` when `c` is in 0xd800...0xdfff.
/// Returns: the number of bytes written to `out`.
pub fn utf8Encode(c: u21, out: []u8) !u3 {
    const length = try utf8CodepointSequenceLength(c);
    assert(out.len >= length);

    // Single byte: the codepoint is its own encoding.
    if (length == 1) {
        out[0] = @as(u8, @intCast(c));
        return length;
    }
    // Surrogate halves all live in the 3-byte range; reject before writing anything.
    if (length == 3 and c >= 0xd800 and c <= 0xdfff) {
        return error.Utf8CannotEncodeSurrogateHalf;
    }

    const lead_marker: u8 = switch (length) {
        2 => 0b11000000,
        3 => 0b11100000,
        4 => 0b11110000,
        else => unreachable,
    };

    // Fill the continuation bytes from last to first, consuming 6 bits each.
    var rest = c;
    var idx: usize = length;
    while (idx > 1) {
        idx -= 1;
        out[idx] = 0b10000000 | @as(u8, @intCast(rest & 0b111111));
        rest >>= 6;
    }
    // Whatever bits remain belong in the lead byte.
    out[0] = lead_marker | @as(u8, @intCast(rest));
    return length;
}
/// Union of every error the fixed-length decoders can produce.
const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error;
/// Decodes the single UTF-8 encoded codepoint held in `bytes`.
/// `bytes.len` must be equal to `utf8ByteSequenceLength(bytes[0]) catch unreachable`;
/// any length outside 1-4 is unreachable.
/// When the length is known at comptime, utf8Decode2, utf8Decode3 or
/// utf8Decode4 can be called directly instead.
pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u21 {
    switch (bytes.len) {
        1 => return bytes[0],
        2 => return utf8Decode2(bytes),
        3 => return utf8Decode3(bytes),
        4 => return utf8Decode4(bytes),
        else => unreachable,
    }
}
const Utf8Decode2Error = error{
    Utf8ExpectedContinuation,
    Utf8OverlongEncoding,
};
/// Decodes a 2-byte UTF-8 sequence.
/// Asserts that `bytes.len == 2` and that `bytes[0]` has the `110xxxxx` form.
/// Returns `error.Utf8ExpectedContinuation` when the second byte is not a
/// continuation byte and `error.Utf8OverlongEncoding` when the decoded value
/// is below 0x80.
pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 {
    assert(bytes.len == 2);
    assert(bytes[0] & 0b11100000 == 0b11000000);

    const tail = bytes[1];
    if (tail & 0xC0 != 0x80) return error.Utf8ExpectedContinuation;

    const codepoint: u21 = (@as(u21, bytes[0] & 0x1F) << 6) | (tail & 0x3F);
    if (codepoint < 0x80) return error.Utf8OverlongEncoding;
    return codepoint;
}
const Utf8Decode3Error = error{
    Utf8ExpectedContinuation,
    Utf8OverlongEncoding,
    Utf8EncodesSurrogateHalf,
};
/// Decodes a 3-byte UTF-8 sequence.
/// Asserts that `bytes.len == 3` and that `bytes[0]` has the `1110xxxx` form.
/// Returns `error.Utf8ExpectedContinuation` for a bad continuation byte,
/// `error.Utf8OverlongEncoding` when the value is below 0x800, and
/// `error.Utf8EncodesSurrogateHalf` when it falls in 0xd800...0xdfff.
pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u21 {
    assert(bytes.len == 3);
    assert(bytes[0] & 0b11110000 == 0b11100000);

    var codepoint: u21 = bytes[0] & 0x0F;
    // Each continuation byte contributes its low 6 bits.
    for (bytes[1..]) |tail| {
        if (tail & 0xC0 != 0x80) return error.Utf8ExpectedContinuation;
        codepoint = (codepoint << 6) | (tail & 0x3F);
    }

    if (codepoint < 0x800) return error.Utf8OverlongEncoding;
    if (codepoint >= 0xd800 and codepoint <= 0xdfff) return error.Utf8EncodesSurrogateHalf;
    return codepoint;
}
const Utf8Decode4Error = error{
    Utf8ExpectedContinuation,
    Utf8OverlongEncoding,
    Utf8CodepointTooLarge,
};
/// Decodes a 4-byte UTF-8 sequence.
/// Asserts that `bytes.len == 4` and that `bytes[0]` has the `11110xxx` form.
/// Returns `error.Utf8ExpectedContinuation` for a bad continuation byte,
/// `error.Utf8OverlongEncoding` when the value is below 0x10000, and
/// `error.Utf8CodepointTooLarge` when it is above 0x10FFFF.
pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 {
    assert(bytes.len == 4);
    assert(bytes[0] & 0b11111000 == 0b11110000);

    var codepoint: u21 = bytes[0] & 0x07;
    // Each continuation byte contributes its low 6 bits.
    for (bytes[1..]) |tail| {
        if (tail & 0xC0 != 0x80) return error.Utf8ExpectedContinuation;
        codepoint = (codepoint << 6) | (tail & 0x3F);
    }

    if (codepoint < 0x10000) return error.Utf8OverlongEncoding;
    if (codepoint > 0x10FFFF) return error.Utf8CodepointTooLarge;
    return codepoint;
}
/// Reports whether `value` can be encoded in UTF-8: it must not be a
/// surrogate (0xD800...0xDFFF) and must not exceed 0x10FFFF.
pub fn utf8ValidCodepoint(value: u21) bool {
    const is_surrogate = value >= 0xD800 and value <= 0xDFFF;
    if (is_surrogate) return false;
    return value <= 0x10FFFF;
}
/// Counts the unicode codepoints in the UTF-8 string `s`.
/// Multi-byte sequences are decoded, so decoding errors are propagated;
/// `error.TruncatedInput` is returned when the last sequence runs past the
/// end of `s`.
pub fn utf8CountCodepoints(s: []const u8) !usize {
    const word_size = @sizeOf(usize);
    // A word with 0x80 in every byte: any overlap means a non-ASCII byte is present.
    const high_bits = 0x80 * (std.math.maxInt(usize) / 0xff);

    var count: usize = 0;
    var pos: usize = 0;
    while (pos < s.len) {
        // Fast path for ASCII sequences: consume a whole word at a time.
        while (pos + word_size <= s.len) {
            const word = mem.readIntNative(usize, s[pos..][0..word_size]);
            if (word & high_bits != 0) break;
            count += word_size;
            pos += word_size;
        }
        if (pos >= s.len) break;

        // Slow path: handle exactly one codepoint, then retry the fast path.
        const seq_len = try utf8ByteSequenceLength(s[pos]);
        if (pos + seq_len > s.len) return error.TruncatedInput;
        // ASCII needs no validation; everything else is decoded to validate it.
        if (seq_len != 1) _ = try utf8Decode(s[pos..][0..seq_len]);
        pos += seq_len;
        count += 1;
    }
    return count;
}
/// Returns true if the input consists entirely of UTF-8 codepoints.
///
/// Leading ASCII is skipped a vector-sized chunk at a time; the rest is
/// checked sequence by sequence using a lookup table indexed by the first
/// byte of each sequence.
pub fn utf8ValidateSlice(input: []const u8) bool {
    var remaining = input;

    const chunk_len = std.simd.suggestVectorSize(u8) orelse 1;
    const Chunk = @Vector(chunk_len, u8);

    // Fast path. Check for and skip ASCII characters at the start of the input.
    while (remaining.len >= chunk_len) {
        const chunk: Chunk = remaining[0..chunk_len].*;
        const mask: Chunk = @splat(0x80);
        if (@reduce(.Or, chunk & mask == mask)) {
            // found a non ASCII byte
            break;
        }
        remaining = remaining[chunk_len..];
    }

    // default lowest and highest continuation byte
    const lo_cb = 0b10000000;
    const hi_cb = 0b10111111;

    const min_non_ascii_codepoint = 0x80;

    // The first nibble is used to identify the continuation byte range to
    // accept. The second nibble is the size.
    const xx = 0xF1; // invalid: size 1
    const as = 0xF0; // ASCII: size 1
    const s1 = 0x02; // accept 0, size 2
    const s2 = 0x13; // accept 1, size 3
    const s3 = 0x03; // accept 0, size 3
    const s4 = 0x23; // accept 2, size 3
    const s5 = 0x34; // accept 3, size 4
    const s6 = 0x04; // accept 0, size 4
    const s7 = 0x44; // accept 4, size 4

    // Information about the first byte in a UTF-8 sequence.
    // Entries 0x00-0x7F are ASCII, 0x80-0xBF are invalid as a first byte, and
    // the four rows below cover 0xC0-0xFF.
    const first = comptime ([_]u8{as} ** 128) ++ ([_]u8{xx} ** 64) ++ [_]u8{
        xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
        s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
        s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3,
        s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
    };

    // The length never changes while scanning, so it is a constant.
    const n = remaining.len;
    var i: usize = 0;
    while (i < n) {
        const first_byte = remaining[i];
        if (first_byte < min_non_ascii_codepoint) {
            i += 1;
            continue;
        }

        const info = first[first_byte];
        if (info == xx) {
            return false; // Illegal starter byte.
        }

        const size = info & 7;
        if (i + size > n) {
            return false; // Short or invalid.
        }

        // Figure out the acceptable low and high continuation bytes, starting
        // with our defaults. Only the byte right after the first byte uses a
        // narrowed range; later bytes are checked against the defaults.
        var accept_lo: u8 = lo_cb;
        var accept_hi: u8 = hi_cb;

        switch (info >> 4) {
            0 => {},
            1 => accept_lo = 0xA0,
            2 => accept_hi = 0x9F,
            3 => accept_lo = 0x90,
            4 => accept_hi = 0x8F,
            else => unreachable,
        }

        const c1 = remaining[i + 1];
        if (c1 < accept_lo or accept_hi < c1) {
            return false;
        }

        switch (size) {
            2 => i += 2,
            3 => {
                const c2 = remaining[i + 2];
                if (c2 < lo_cb or hi_cb < c2) {
                    return false;
                }
                i += 3;
            },
            4 => {
                const c2 = remaining[i + 2];
                if (c2 < lo_cb or hi_cb < c2) {
                    return false;
                }
                const c3 = remaining[i + 3];
                if (c3 < lo_cb or hi_cb < c3) {
                    return false;
                }
                i += 4;
            },
            else => unreachable,
        }
    }

    return true;
}
/// A view over a byte slice that is treated as UTF-8 and can hand out an
/// iterator over its code points.
///
/// ```
/// var utf8 = (try std.unicode.Utf8View.init("hi there")).iterator();
/// while (utf8.nextCodepointSlice()) |codepoint| {
///     std.debug.print("got codepoint {}\n", .{codepoint});
/// }
/// ```
pub const Utf8View = struct {
    bytes: []const u8,

    /// Validates `s` and wraps it; returns `error.InvalidUtf8` when
    /// `utf8ValidateSlice` rejects it.
    pub fn init(s: []const u8) !Utf8View {
        if (utf8ValidateSlice(s)) return initUnchecked(s);
        return error.InvalidUtf8;
    }

    /// Wraps `s` without running any validation.
    pub fn initUnchecked(s: []const u8) Utf8View {
        return .{ .bytes = s };
    }

    /// Like `init`, but evaluated at comptime: invalid input is a compile error.
    /// TODO: https://github.com/ziglang/zig/issues/425
    pub fn initComptime(comptime s: []const u8) Utf8View {
        if (comptime init(s)) |view| {
            return view;
        } else |err| switch (err) {
            error.InvalidUtf8 => @compileError("invalid utf8"),
        }
    }

    /// Creates an iterator positioned at the first byte of the view.
    pub fn iterator(s: Utf8View) Utf8Iterator {
        return .{ .bytes = s.bytes, .i = 0 };
    }
};
/// Walks a byte slice one UTF-8 codepoint at a time.
/// Invalid start bytes and decoding errors are treated as unreachable here.
pub const Utf8Iterator = struct {
    bytes: []const u8,
    i: usize,

    /// Returns the bytes of the next codepoint and advances past them, or
    /// null once the end of `bytes` has been reached.
    pub fn nextCodepointSlice(it: *Utf8Iterator) ?[]const u8 {
        if (it.i >= it.bytes.len) return null;

        const start = it.i;
        const cp_len = utf8ByteSequenceLength(it.bytes[start]) catch unreachable;
        it.i = start + cp_len;
        return it.bytes[start..it.i];
    }

    /// Returns the next decoded codepoint and advances, or null at the end.
    pub fn nextCodepoint(it: *Utf8Iterator) ?u21 {
        if (it.nextCodepointSlice()) |slice| {
            return utf8Decode(slice) catch unreachable;
        }
        return null;
    }

    /// Look ahead at the next n codepoints without advancing the iterator.
    /// If fewer than n codepoints are available, then return the remainder of the string.
    pub fn peek(it: *Utf8Iterator, n: usize) []const u8 {
        const start = it.i;
        // Restore the position no matter which return path is taken.
        defer it.i = start;

        var wanted = n;
        while (wanted > 0) : (wanted -= 1) {
            if (it.nextCodepointSlice() == null) return it.bytes[start..];
        }
        // `it.i` now sits just past the n-th codepoint.
        return it.bytes[start..it.i];
    }
};
/// Reports whether code unit `c` lies in the range 0xD800...0xDBFF.
pub fn utf16IsHighSurrogate(c: u16) bool {
    return switch (c) {
        0xd800...0xdbff => true,
        else => false,
    };
}
/// Reports whether code unit `c` lies in the range 0xDC00...0xDFFF.
pub fn utf16IsLowSurrogate(c: u16) bool {
    return switch (c) {
        0xdc00...0xdfff => true,
        else => false,
    };
}
/// Computes the number of UTF-16 code units needed to represent codepoint `c`:
/// 1 up to 0xFFFF, 2 up to 0x10FFFF, `error.CodepointTooLarge` beyond that.
pub fn utf16CodepointSequenceLength(c: u21) !u2 {
    return switch (c) {
        0x0000...0xFFFF => 1,
        0x1_0000...0x10_FFFF => 2,
        else => error.CodepointTooLarge,
    };
}
test utf16CodepointSequenceLength {
    // Codepoints that fit in a single code unit.
    for ([_]u21{ 'a', 0xFFFF }) |cp| {
        try testing.expectEqual(@as(u2, 1), try utf16CodepointSequenceLength(cp));
    }
    // Codepoints that need a surrogate pair.
    for ([_]u21{ 0x10000, 0x10FFFF }) |cp| {
        try testing.expectEqual(@as(u2, 2), try utf16CodepointSequenceLength(cp));
    }
    // First value past the encodable range.
    try testing.expectError(error.CodepointTooLarge, utf16CodepointSequenceLength(0x110000));
}
/// Inspects the first code unit of a UTF-16 sequence and returns how many code
/// units (1 or 2) the whole sequence occupies.
/// A high surrogate announces a pair; a low surrogate cannot start a sequence
/// and yields `error.Utf16InvalidStartCodeUnit`.
pub fn utf16CodeUnitSequenceLength(first_code_unit: u16) !u2 {
    return switch (first_code_unit) {
        0xd800...0xdbff => 2, // high surrogate
        0xdc00...0xdfff => error.Utf16InvalidStartCodeUnit, // low surrogate
        else => 1,
    };
}
test utf16CodeUnitSequenceLength {
    // Ordinary code units stand alone.
    for ([_]u16{ 'a', 0xFFFF }) |unit| {
        try testing.expectEqual(@as(u2, 1), try utf16CodeUnitSequenceLength(unit));
    }
    // A high surrogate starts a two-unit sequence.
    const pair_len = try utf16CodeUnitSequenceLength(0xDBFF);
    try testing.expectEqual(@as(u2, 2), pair_len);
    // A low surrogate is not a valid start.
    try testing.expectError(error.Utf16InvalidStartCodeUnit, utf16CodeUnitSequenceLength(0xDFFF));
}
/// Combines a UTF-16 surrogate pair into the codepoint it encodes.
/// Asserts that `surrogate_pair.len >= 2` and that the first code unit is a high surrogate.
/// Returns `error.ExpectedSecondSurrogateHalf` when the second code unit is
/// not a low surrogate.
pub fn utf16DecodeSurrogatePair(surrogate_pair: []const u16) !u21 {
    assert(surrogate_pair.len >= 2);
    const lead = surrogate_pair[0];
    assert(utf16IsHighSurrogate(lead));

    const trail = surrogate_pair[1];
    if (!utf16IsLowSurrogate(trail)) return error.ExpectedSecondSurrogateHalf;

    // Each half carries 10 payload bits; the pair is offset by 0x10000.
    const upper_bits: u21 = lead & 0x03ff;
    const lower_bits: u21 = trail & 0x03ff;
    return 0x10000 + ((upper_bits << 10) | lower_bits);
}
/// Iterates the codepoints of a little-endian UTF-16 string, reading it as
/// raw bytes two at a time.
pub const Utf16LeIterator = struct {
    bytes: []const u8,
    i: usize,

    /// Creates an iterator over `s`, positioned at its first code unit.
    pub fn init(s: []const u16) Utf16LeIterator {
        return .{ .bytes = mem.sliceAsBytes(s), .i = 0 };
    }

    /// Returns the next codepoint, or null at the end of the input.
    /// Errors: `error.DanglingSurrogateHalf` when a high surrogate is the
    /// last code unit, `error.UnexpectedSecondSurrogateHalf` when a low
    /// surrogate appears first, and `error.ExpectedSecondSurrogateHalf` when a
    /// high surrogate is not followed by a low one.
    pub fn nextCodepoint(it: *Utf16LeIterator) !?u21 {
        assert(it.i <= it.bytes.len);
        if (it.i == it.bytes.len) return null;

        const first = mem.readIntLittle(u16, it.bytes[it.i..][0..2]);
        it.i += 2;

        if (utf16IsLowSurrogate(first)) return error.UnexpectedSecondSurrogateHalf;
        if (!utf16IsHighSurrogate(first)) return first;

        // surrogate pair
        if (it.i >= it.bytes.len) return error.DanglingSurrogateHalf;
        const second = mem.readIntLittle(u16, it.bytes[it.i..][0..2]);
        const pair = [2]u16{ first, second };
        const codepoint = try utf16DecodeSurrogatePair(&pair);
        // Only step past the second unit once the pair decoded successfully.
        it.i += 2;
        return codepoint;
    }
};
/// Counts the unicode codepoints in the UTF-16LE string `utf16le`.
/// Any error reported by `Utf16LeIterator.nextCodepoint` is propagated.
pub fn utf16CountCodepoints(utf16le: []const u16) !usize {
    var it = Utf16LeIterator.init(utf16le);
    var count: usize = 0;
    while (true) {
        _ = (try it.nextCodepoint()) orelse return count;
        count += 1;
    }
}
/// Checks `utf16CountCodepoints` against literals of known codepoint count.
fn testUtf16CountCodepoints() !void {
    // Each entry is { utf8 literal, expected number of codepoints }.
    const cases = .{
        .{ "a", 1 },
        .{ "abcdefghij", 10 },
        .{ "äåéëþüúíóö", 10 },
        .{ "こんにちは", 5 },
    };
    // The literal conversion needs a comptime string, hence `inline for`.
    inline for (cases) |case| {
        const count = try utf16CountCodepoints(utf8ToUtf16LeStringLiteral(case[0]));
        try testing.expectEqual(@as(usize, case[1]), count);
    }
}
test "utf16 count codepoints" {
    // Run the same checks at runtime and then at comptime.
    try testUtf16CountCodepoints();
    try comptime testUtf16CountCodepoints();
}
test "utf8 encode" {
    // Run the same checks at comptime and then at runtime.
    try comptime testUtf8Encode();
    try testUtf8Encode();
}
/// Round-trips sample codepoints of each encoded length through
/// `utf8Decode` and `utf8Encode` and checks the bytes that were written.
fn testUtf8Encode() !void {
    // A few taken from wikipedia a few taken elsewhere
    var array: [4]u8 = undefined;
    // U+20AC (euro sign), three bytes: E2 82 AC. Written as an escape so the
    // literal cannot be lost to an encoding mishap.
    try testing.expect((try utf8Encode(try utf8Decode("\u{20AC}"), array[0..])) == 3);
    try testing.expect(array[0] == 0b11100010);
    try testing.expect(array[1] == 0b10000010);
    try testing.expect(array[2] == 0b10101100);

    try testing.expect((try utf8Encode(try utf8Decode("$"), array[0..])) == 1);
    try testing.expect(array[0] == 0b00100100);

    try testing.expect((try utf8Encode(try utf8Decode("¢"), array[0..])) == 2);
    try testing.expect(array[0] == 0b11000010);
    try testing.expect(array[1] == 0b10100010);

    try testing.expect((try utf8Encode(try utf8Decode("𐍈"), array[0..])) == 4);
    try testing.expect(array[0] == 0b11110000);
    try testing.expect(array[1] == 0b10010000);
    try testing.expect(array[2] == 0b10001101);
    try testing.expect(array[3] == 0b10001000);
}
test "utf8 encode error" {
    // Run the same checks at comptime and then at runtime.
    try comptime testUtf8EncodeError();
    try testUtf8EncodeError();
}
/// Checks that `utf8Encode` rejects surrogate halves and out-of-range values.
fn testUtf8EncodeError() !void {
    var array: [4]u8 = undefined;
    // Both ends of the surrogate range.
    for ([_]u21{ 0xd800, 0xdfff }) |surrogate| {
        try testErrorEncode(surrogate, array[0..], error.Utf8CannotEncodeSurrogateHalf);
    }
    // First value past the valid range, and the largest u21.
    for ([_]u21{ 0x110000, 0x1fffff }) |too_large| {
        try testErrorEncode(too_large, array[0..], error.CodepointTooLarge);
    }
}
/// Expects `utf8Encode(codePoint, array)` to fail with `expectedErr`.
fn testErrorEncode(codePoint: u21, array: []u8, expectedErr: anyerror) !void {
    const outcome = utf8Encode(codePoint, array);
    try testing.expectError(expectedErr, outcome);
}
test "utf8 iterator on ascii" {
    // Run the same checks at comptime and then at runtime.
    try comptime testUtf8IteratorOnAscii();
    try testUtf8IteratorOnAscii();
}
/// Iterates "abc" both by slice and by decoded codepoint.
fn testUtf8IteratorOnAscii() !void {
    const text = "abc";
    const view = Utf8View.initComptime(text);

    // By slice: every codepoint is exactly one byte equal to the source byte.
    var by_slice = view.iterator();
    for (text) |byte| {
        const slice = by_slice.nextCodepointSlice().?;
        try testing.expect(slice.len == 1 and slice[0] == byte);
    }
    try testing.expect(by_slice.nextCodepointSlice() == null);

    // By codepoint: the decoded value equals the source byte.
    var by_codepoint = view.iterator();
    for (text) |byte| {
        try testing.expect(by_codepoint.nextCodepoint().? == byte);
    }
    try testing.expect(by_codepoint.nextCodepoint() == null);
}
test "utf8 view bad" {
    // Run the same checks at comptime and then at runtime.
    try comptime testUtf8ViewBad();
    try testUtf8ViewBad();
}
/// Checks that `Utf8View.init` rejects input containing a stray 0xAD byte.
fn testUtf8ViewBad() !void {
    // Compile-time error.
    // const s3 = Utf8View.initComptime("\xfe\xf2");
    const bad_input = "hel\xadlo";
    const outcome = Utf8View.init(bad_input);
    try testing.expectError(error.InvalidUtf8, outcome);
}
test "utf8 view ok" {
    // Run the same checks at comptime and then at runtime.
    try comptime testUtf8ViewOk();
    try testUtf8ViewOk();
}
/// Iterates a three-codepoint string both by slice and by decoded codepoint.
fn testUtf8ViewOk() !void {
    const s = Utf8View.initComptime("東京市");

    // The expected slices are spelled with escapes (U+6771, U+4EAC, U+5E02,
    // the same values checked below) so the literals survive re-encoding.
    var it1 = s.iterator();
    try testing.expect(std.mem.eql(u8, "\u{6771}", it1.nextCodepointSlice().?));
    try testing.expect(std.mem.eql(u8, "\u{4eac}", it1.nextCodepointSlice().?));
    try testing.expect(std.mem.eql(u8, "\u{5e02}", it1.nextCodepointSlice().?));
    try testing.expect(it1.nextCodepointSlice() == null);

    var it2 = s.iterator();
    try testing.expect(it2.nextCodepoint().? == 0x6771);
    try testing.expect(it2.nextCodepoint().? == 0x4eac);
    try testing.expect(it2.nextCodepoint().? == 0x5e02);
    try testing.expect(it2.nextCodepoint() == null);
}
test "validate slice" {
    try comptime testValidateSlice();
    try testValidateSlice();

    // We skip a variable (based on recommended vector size) chunks of
    // ASCII characters. Let's make sure we're chunking correctly.
    const str = [_]u8{'a'} ** 550 ++ "\xc0";
    var start: usize = 0;
    while (start < str.len - 3) : (start += 1) {
        try testing.expect(!utf8ValidateSlice(str[start..]));
    }
}
/// Checks `utf8ValidateSlice` against lists of accepted and rejected inputs.
fn testValidateSlice() !void {
    const accepted = [_][]const u8{
        "abc",
        "abc\xdf\xbf",
        "",
        "a",
        "abc",
        "Ж",
        "ЖЖ",
        "брэд-ЛГТМ",
        "☺☻☹",
        "a\u{fffdb}",
        "\xf4\x8f\xbf\xbf",
        "abc\xdf\xbf",
    };
    for (accepted) |input| {
        try testing.expect(utf8ValidateSlice(input));
    }

    const rejected = [_][]const u8{
        "abc\xc0",
        "abc\xc0abc",
        "aa\xe2",
        "\x42\xfa",
        "\x42\xfa\x43",
        "abc\xc0",
        "abc\xc0abc",
        "\xf4\x90\x80\x80",
        "\xf7\xbf\xbf\xbf",
        "\xfb\xbf\xbf\xbf\xbf",
        "\xc0\x80",
        "\xed\xa0\x80",
        "\xed\xbf\xbf",
    };
    for (rejected) |input| {
        try testing.expect(!utf8ValidateSlice(input));
    }
}
test "valid utf8" {
    // Run the same checks at comptime and then at runtime.
    try comptime testValidUtf8();
    try testValidUtf8();
}
/// Decodes boundary encodings of every length and checks the resulting codepoint.
fn testValidUtf8() !void {
    const Case = struct { bytes: []const u8, codepoint: u21 };
    const cases = [_]Case{
        .{ .bytes = "\x00", .codepoint = 0x0 },
        .{ .bytes = "\x20", .codepoint = 0x20 },
        .{ .bytes = "\x7f", .codepoint = 0x7f },
        .{ .bytes = "\xc2\x80", .codepoint = 0x80 },
        .{ .bytes = "\xdf\xbf", .codepoint = 0x7ff },
        .{ .bytes = "\xe0\xa0\x80", .codepoint = 0x800 },
        .{ .bytes = "\xe1\x80\x80", .codepoint = 0x1000 },
        .{ .bytes = "\xef\xbf\xbf", .codepoint = 0xffff },
        .{ .bytes = "\xf0\x90\x80\x80", .codepoint = 0x10000 },
        .{ .bytes = "\xf1\x80\x80\x80", .codepoint = 0x40000 },
        .{ .bytes = "\xf3\xbf\xbf\xbf", .codepoint = 0xfffff },
        .{ .bytes = "\xf4\x8f\xbf\xbf", .codepoint = 0x10ffff },
    };
    for (cases) |case| {
        try testValid(case.bytes, case.codepoint);
    }
}
test "invalid utf8 continuation bytes" {
    // Run the same checks at comptime and then at runtime.
    try comptime testInvalidUtf8ContinuationBytes();
    try testInvalidUtf8ContinuationBytes();
}
/// Feeds malformed or truncated sequences to the decoder and checks which
/// error each one produces.
fn testInvalidUtf8ContinuationBytes() !void {
    const Case = struct { bytes: []const u8, err: anyerror };
    const cases = [_]Case{
        // unexpected continuation
        .{ .bytes = "\x80", .err = error.Utf8InvalidStartByte },
        .{ .bytes = "\xbf", .err = error.Utf8InvalidStartByte },
        // too many leading 1's
        .{ .bytes = "\xf8", .err = error.Utf8InvalidStartByte },
        .{ .bytes = "\xff", .err = error.Utf8InvalidStartByte },
        // expected continuation for 2 byte sequences
        .{ .bytes = "\xc2", .err = error.UnexpectedEof },
        .{ .bytes = "\xc2\x00", .err = error.Utf8ExpectedContinuation },
        .{ .bytes = "\xc2\xc0", .err = error.Utf8ExpectedContinuation },
        // expected continuation for 3 byte sequences
        .{ .bytes = "\xe0", .err = error.UnexpectedEof },
        .{ .bytes = "\xe0\x00", .err = error.UnexpectedEof },
        .{ .bytes = "\xe0\xc0", .err = error.UnexpectedEof },
        .{ .bytes = "\xe0\xa0", .err = error.UnexpectedEof },
        .{ .bytes = "\xe0\xa0\x00", .err = error.Utf8ExpectedContinuation },
        .{ .bytes = "\xe0\xa0\xc0", .err = error.Utf8ExpectedContinuation },
        // expected continuation for 4 byte sequences
        .{ .bytes = "\xf0", .err = error.UnexpectedEof },
        .{ .bytes = "\xf0\x00", .err = error.UnexpectedEof },
        .{ .bytes = "\xf0\xc0", .err = error.UnexpectedEof },
        .{ .bytes = "\xf0\x90\x00", .err = error.UnexpectedEof },
        .{ .bytes = "\xf0\x90\xc0", .err = error.UnexpectedEof },
        .{ .bytes = "\xf0\x90\x80\x00", .err = error.Utf8ExpectedContinuation },
        .{ .bytes = "\xf0\x90\x80\xc0", .err = error.Utf8ExpectedContinuation },
    };
    for (cases) |case| {
        try testError(case.bytes, case.err);
    }
}
test "overlong utf8 codepoint" {
    // Run the same checks at comptime and then at runtime.
    try comptime testOverlongUtf8Codepoint();
    try testOverlongUtf8Codepoint();
}
/// Checks that encodings longer than necessary are rejected, using the
/// smallest and largest overlong value for each sequence length.
fn testOverlongUtf8Codepoint() !void {
    const overlong = [_][]const u8{
        "\xc0\x80",
        "\xc1\xbf",
        "\xe0\x80\x80",
        "\xe0\x9f\xbf",
        "\xf0\x80\x80\x80",
        "\xf0\x8f\xbf\xbf",
    };
    for (overlong) |bytes| {
        try testError(bytes, error.Utf8OverlongEncoding);
    }
}
test "misc invalid utf8" {
    // Run the same checks at comptime and then at runtime.
    try comptime testMiscInvalidUtf8();
    try testMiscInvalidUtf8();
}
/// Checks out-of-range codepoints and the edges of the surrogate range.
fn testMiscInvalidUtf8() !void {
    // codepoint out of bounds
    for ([_][]const u8{ "\xf4\x90\x80\x80", "\xf7\xbf\xbf\xbf" }) |bytes| {
        try testError(bytes, error.Utf8CodepointTooLarge);
    }

    // surrogate halves: both ends of the range are rejected...
    for ([_][]const u8{ "\xed\xa0\x80", "\xed\xbf\xbf" }) |bytes| {
        try testError(bytes, error.Utf8EncodesSurrogateHalf);
    }
    // ...while the values right next to the range decode fine.
    try testValid("\xed\x9f\xbf", 0xd7ff);
    try testValid("\xee\x80\x80", 0xe000);
}
test "utf8 iterator peeking" {
    // Run the same checks at comptime and then at runtime.
    try comptime testUtf8Peeking();
    try testUtf8Peeking();
}
/// Checks that `peek` returns up to n upcoming codepoints without moving the
/// iterator, and the rest of the string when fewer than n remain.
fn testUtf8Peeking() !void {
    const s = Utf8View.initComptime("noël");
    var it = s.iterator();

    try testing.expect(std.mem.eql(u8, "n", it.nextCodepointSlice().?));

    try testing.expect(std.mem.eql(u8, "o", it.peek(1)));
    // Two codepoints ahead of "n" in "noël".
    try testing.expect(std.mem.eql(u8, "oë", it.peek(2)));
    try testing.expect(std.mem.eql(u8, "oël", it.peek(3)));
    try testing.expect(std.mem.eql(u8, "oël", it.peek(4)));
    try testing.expect(std.mem.eql(u8, "oël", it.peek(10)));

    // Peeking must not have advanced the iterator.
    try testing.expect(std.mem.eql(u8, "o", it.nextCodepointSlice().?));
    try testing.expect(std.mem.eql(u8, "ë", it.nextCodepointSlice().?));
    try testing.expect(std.mem.eql(u8, "l", it.nextCodepointSlice().?));
    try testing.expect(it.nextCodepointSlice() == null);

    try testing.expect(std.mem.eql(u8, &[_]u8{}, it.peek(1)));
}
/// Expects decoding `bytes` to fail with `expected_err`.
fn testError(bytes: []const u8, expected_err: anyerror) !void {
    const outcome = testDecode(bytes);
    try testing.expectError(expected_err, outcome);
}
/// Expects decoding `bytes` to succeed and produce `expected_codepoint`.
fn testValid(bytes: []const u8, expected_codepoint: u21) !void {
    const decoded = testDecode(bytes) catch unreachable;
    try testing.expect(decoded == expected_codepoint);
}
/// Decodes `bytes` as exactly one codepoint: reports `error.UnexpectedEof`
/// when fewer bytes are present than the first byte announces, and fails the
/// test when more are present.
fn testDecode(bytes: []const u8) !u21 {
    const length = try utf8ByteSequenceLength(bytes[0]);
    if (length > bytes.len) return error.UnexpectedEof;
    try testing.expect(length == bytes.len);
    return utf8Decode(bytes);
}
/// Converts the UTF-16LE string `utf16le` to a newly allocated UTF-8 slice.
/// Errors from `Utf16LeIterator.nextCodepoint` and from allocation are propagated.
/// Caller must free returned memory.
pub fn utf16leToUtf8Alloc(allocator: mem.Allocator, utf16le: []const u16) ![]u8 {
    // optimistically guess that it will all be ascii.
    var result = try std.ArrayList(u8).initCapacity(allocator, utf16le.len);
    errdefer result.deinit();

    var it = Utf16LeIterator.init(utf16le);
    while (try it.nextCodepoint()) |codepoint| {
        const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable;
        // Grow by exactly one encoded codepoint and encode into the new tail.
        const write_at = result.items.len;
        try result.resize(write_at + utf8_len);
        const written = utf8Encode(codepoint, result.items[write_at..]) catch unreachable;
        assert(written == utf8_len);
    }
    return result.toOwnedSlice();
}
/// Converts the UTF-16LE string `utf16le` to a newly allocated,
/// zero-terminated UTF-8 slice.
/// Errors from `Utf16LeIterator.nextCodepoint` and from allocation are propagated.
/// Caller must free returned memory.
pub fn utf16leToUtf8AllocZ(allocator: mem.Allocator, utf16le: []const u16) ![:0]u8 {
    // optimistically guess that it will all be ascii (and allocate space for the null terminator)
    var result = try std.ArrayList(u8).initCapacity(allocator, utf16le.len + 1);
    errdefer result.deinit();

    var it = Utf16LeIterator.init(utf16le);
    while (try it.nextCodepoint()) |codepoint| {
        const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable;
        // Grow by exactly one encoded codepoint and encode into the new tail.
        const write_at = result.items.len;
        try result.resize(write_at + utf8_len);
        const written = utf8Encode(codepoint, result.items[write_at..]) catch unreachable;
        assert(written == utf8_len);
    }
    return result.toOwnedSliceSentinel(0);
}
/// Converts the UTF-16LE string `utf16le` to UTF-8, writing into `utf8`.
/// Asserts that the output buffer is big enough.
/// Returns end byte index into utf8.
pub fn utf16leToUtf8(utf8: []u8, utf16le: []const u16) !usize {
    var it = Utf16LeIterator.init(utf16le);
    var written: usize = 0;
    while (true) {
        const codepoint = (try it.nextCodepoint()) orelse break;
        const encoded_len = try utf8Encode(codepoint, utf8[written..]);
        written += encoded_len;
    }
    return written;
}
test "utf16leToUtf8" {
    const Case = struct { units: [2]u16, expected: []const u8 };
    const cases = [_]Case{
        .{ .units = .{ 'A', 'a' }, .expected = "Aa" },
        .{ .units = .{ 0x80, 0xffff }, .expected = "\xc2\x80" ++ "\xef\xbf\xbf" },
        // the values just outside the surrogate half range
        .{ .units = .{ 0xd7ff, 0xe000 }, .expected = "\xed\x9f\xbf" ++ "\xee\x80\x80" },
        // smallest surrogate pair
        .{ .units = .{ 0xd800, 0xdc00 }, .expected = "\xf0\x90\x80\x80" },
        // largest surrogate pair
        .{ .units = .{ 0xdbff, 0xdfff }, .expected = "\xf4\x8f\xbf\xbf" },
        .{ .units = .{ 0xdbff, 0xdc00 }, .expected = "\xf4\x8f\xb0\x80" },
    };

    var utf16le: [2]u16 = undefined;
    for (cases) |case| {
        // Store each code unit in little-endian byte order.
        for (&utf16le, case.units) |*dest, unit| {
            dest.* = mem.nativeToLittle(u16, unit);
        }
        const utf8 = try utf16leToUtf8Alloc(std.testing.allocator, &utf16le);
        defer std.testing.allocator.free(utf8);
        try testing.expect(mem.eql(u8, utf8, case.expected));
    }

    // Two low surrogates in a row: the first one is already an error.
    for (&utf16le) |*dest| {
        dest.* = mem.nativeToLittle(u16, 0xdcdc);
    }
    const result = utf16leToUtf8Alloc(std.testing.allocator, &utf16le);
    try std.testing.expectError(error.UnexpectedSecondSurrogateHalf, result);
}
/// Converts the UTF-8 string `utf8` to a newly allocated, zero-terminated
/// UTF-16LE slice.
/// Returns `error.InvalidUtf8` when `Utf8View.init` rejects the input;
/// allocation errors are propagated.
/// The result is allocated with `allocator`.
pub fn utf8ToUtf16LeWithNull(allocator: mem.Allocator, utf8: []const u8) ![:0]u16 {
    // optimistically guess that it will not require surrogate pairs
    var result = try std.ArrayList(u16).initCapacity(allocator, utf8.len + 1);
    errdefer result.deinit();

    var it = (try Utf8View.init(utf8)).iterator();
    while (it.nextCodepoint()) |codepoint| {
        if (codepoint >= 0x10000) {
            // Outside the BMP: split into a high/low surrogate pair.
            const high = @as(u16, @intCast((codepoint - 0x10000) >> 10)) + 0xD800;
            const low = @as(u16, @intCast(codepoint & 0x3FF)) + 0xDC00;
            const pair = [2]u16{
                mem.nativeToLittle(u16, high),
                mem.nativeToLittle(u16, low),
            };
            try result.appendSlice(pair[0..]);
        } else {
            const unit = @as(u16, @intCast(codepoint));
            try result.append(mem.nativeToLittle(u16, unit));
        }
    }
    return result.toOwnedSliceSentinel(0);
}
/// Converts the UTF-8 string `utf8` to UTF-16LE code units written to `utf16le`.
/// Returns index of next character. If exact fit, returned index equals output slice length.
/// Assumes there is enough space for the output.
/// Returns `error.InvalidUtf8` when `utf8` has an invalid start byte, a
/// sequence that fails to decode, or a final sequence cut short by the end
/// of the input.
pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize {
    var dest_i: usize = 0;
    var src_i: usize = 0;
    while (src_i < utf8.len) {
        const n = utf8ByteSequenceLength(utf8[src_i]) catch return error.InvalidUtf8;
        const next_src_i = src_i + n;
        // A truncated trailing sequence would otherwise slice past the end of `utf8`.
        if (next_src_i > utf8.len) return error.InvalidUtf8;
        const codepoint = utf8Decode(utf8[src_i..next_src_i]) catch return error.InvalidUtf8;
        if (codepoint < 0x10000) {
            const short = @as(u16, @intCast(codepoint));
            utf16le[dest_i] = mem.nativeToLittle(u16, short);
            dest_i += 1;
        } else {
            // Outside the BMP: emit a high/low surrogate pair.
            const high = @as(u16, @intCast((codepoint - 0x10000) >> 10)) + 0xD800;
            const low = @as(u16, @intCast(codepoint & 0x3FF)) + 0xDC00;
            utf16le[dest_i] = mem.nativeToLittle(u16, high);
            utf16le[dest_i + 1] = mem.nativeToLittle(u16, low);
            dest_i += 2;
        }
        src_i = next_src_i;
    }
    return dest_i;
}
test "utf8ToUtf16Le" {
    var utf16le: [2]u16 = [_]u16{0} ** 2;

    // Inputs that encode to a single surrogate pair, with the expected
    // little-endian bytes.
    const Case = struct { utf8: []const u8, le_bytes: []const u8 };
    const cases = [_]Case{
        .{ .utf8 = "𐐷", .le_bytes = "\x01\xd8\x37\xdc" },
        .{ .utf8 = "\u{10FFFF}", .le_bytes = "\xff\xdb\xff\xdf" },
    };
    for (cases) |case| {
        const length = try utf8ToUtf16Le(utf16le[0..], case.utf8);
        try testing.expectEqual(@as(usize, 2), length);
        try testing.expectEqualSlices(u8, case.le_bytes, mem.sliceAsBytes(utf16le[0..]));
    }

    // One past the largest codepoint is rejected.
    const result = utf8ToUtf16Le(utf16le[0..], "\xf4\x90\x80\x80");
    try testing.expectError(error.InvalidUtf8, result);
}
test "utf8ToUtf16LeWithNull" {
    // Inputs that encode to a single surrogate pair, with the expected
    // little-endian bytes.
    const Case = struct { utf8: []const u8, le_bytes: []const u8 };
    const cases = [_]Case{
        .{ .utf8 = "𐐷", .le_bytes = "\x01\xd8\x37\xdc" },
        .{ .utf8 = "\u{10FFFF}", .le_bytes = "\xff\xdb\xff\xdf" },
    };
    for (cases) |case| {
        const utf16 = try utf8ToUtf16LeWithNull(testing.allocator, case.utf8);
        defer testing.allocator.free(utf16);
        try testing.expectEqualSlices(u8, case.le_bytes, mem.sliceAsBytes(utf16[0..]));
        // The sentinel sits right after the two code units.
        try testing.expect(utf16[2] == 0);
    }

    // One past the largest codepoint is rejected.
    const result = utf8ToUtf16LeWithNull(testing.allocator, "\xf4\x90\x80\x80");
    try testing.expectError(error.InvalidUtf8, result);
}
/// Converts a UTF-8 string literal into a UTF-16LE string literal.
/// The conversion runs entirely at comptime; input that fails to convert is
/// reported as a compile error naming the failure.
pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16LeLen(utf8) catch unreachable:0]u16 {
    return comptime blk: {
        // `@compileError` takes a string, so the error is turned into its name first.
        const len: usize = calcUtf16LeLen(utf8) catch |err| @compileError(@errorName(err));
        var utf16le: [len:0]u16 = [_:0]u16{0} ** len;
        const utf16le_len = utf8ToUtf16Le(&utf16le, utf8[0..]) catch |err| @compileError(@errorName(err));
        assert(len == utf16le_len);
        break :blk &utf16le;
    };
}
const CalcUtf16LeLenError = Utf8DecodeError || error{Utf8InvalidStartByte};
/// Returns length in UTF-16 of UTF-8 slice as length of []u16.
/// Length in []u8 is 2*len16.
/// Start-byte and decoding errors are propagated. A final sequence that is
/// cut short by the end of `utf8` is reported as
/// `error.Utf8ExpectedContinuation`.
pub fn calcUtf16LeLen(utf8: []const u8) CalcUtf16LeLenError!usize {
    var src_i: usize = 0;
    var dest_len: usize = 0;
    while (src_i < utf8.len) {
        const n = try utf8ByteSequenceLength(utf8[src_i]);
        const next_src_i = src_i + n;
        // A truncated trailing sequence would otherwise slice past the end of
        // `utf8`; the missing bytes are continuation bytes that never arrived.
        if (next_src_i > utf8.len) return error.Utf8ExpectedContinuation;
        const codepoint = try utf8Decode(utf8[src_i..next_src_i]);
        // Codepoints outside the BMP take a surrogate pair.
        if (codepoint < 0x10000) {
            dest_len += 1;
        } else {
            dest_len += 2;
        }
        src_i = next_src_i;
    }
    return dest_len;
}
/// Checks `calcUtf16LeLen` against strings of known UTF-16 length.
fn testCalcUtf16LeLen() !void {
    const Case = struct { utf8: []const u8, len16: usize };
    const cases = [_]Case{
        .{ .utf8 = "a", .len16 = 1 },
        .{ .utf8 = "abcdefghij", .len16 = 10 },
        .{ .utf8 = "äåéëþüúíóö", .len16 = 10 },
        .{ .utf8 = "こんにちは", .len16 = 5 },
    };
    for (cases) |case| {
        try testing.expectEqual(case.len16, try calcUtf16LeLen(case.utf8));
    }
}
test "calculate utf16 string length of given utf8 string in u16" {
    // Run the same checks at runtime and then at comptime.
    try testCalcUtf16LeLen();
    try comptime testCalcUtf16LeLen();
}
/// Writes the `utf16le` string to `writer` as UTF-8.
///
/// Code units are decoded with `Utf16LeIterator`. Whenever the iterator reports
/// an error, or a codepoint cannot be encoded, `replacement_character` is
/// written in its place. `fmt` and `options` are ignored. Errors from `writer`
/// are returned to the caller.
fn formatUtf16le(
    utf16le: []const u16,
    comptime fmt: []const u8,
    options: std.fmt.FormatOptions,
    writer: anytype,
) !void {
    _ = fmt;
    _ = options;

    // Staging buffer; the size is arbitrary.
    var chunk: [300]u8 = undefined;
    var used: usize = 0;

    var units = Utf16LeIterator.init(utf16le);
    while (true) {
        const maybe_codepoint: ?u21 = units.nextCodepoint() catch replacement_character;
        const codepoint = maybe_codepoint orelse break;

        const dest = chunk[used..];
        const written = utf8Encode(codepoint, dest) catch
            (utf8Encode(replacement_character, dest) catch unreachable);
        used += written;

        // Flush early enough that at least 4 bytes (the longest UTF-8 sequence)
        // are always free for the next codepoint.
        if (used + 3 >= chunk.len) {
            try writer.writeAll(chunk[0..used]);
            used = 0;
        }
    }

    // Emit whatever is left in the staging buffer.
    try writer.writeAll(chunk[0..used]);
}
/// Wraps `utf16le` in a `std.fmt.Formatter` that prints it through
/// `formatUtf16le`. The slice is stored as-is in the formatter's `data` field.
pub fn fmtUtf16le(utf16le: []const u16) std.fmt.Formatter(formatUtf16le) {
    const Utf16LeFormatter = std.fmt.Formatter(formatUtf16le);
    return Utf16LeFormatter{ .data = utf16le };
}
test "fmtUtf16le" {
    const expectFmt = std.testing.expectFmt;

    // Well-formed input round-trips back to the original UTF-8.
    try expectFmt("", "{}", .{fmtUtf16le(utf8ToUtf16LeStringLiteral(""))});
    try expectFmt("foo", "{}", .{fmtUtf16le(utf8ToUtf16LeStringLiteral("foo"))});
    try expectFmt("𐐷", "{}", .{fmtUtf16le(utf8ToUtf16LeStringLiteral("𐐷"))});

    // Single code units around the surrogate range 0xD800...0xDFFF, given as
    // little-endian bytes. Expected strings use `\u{...}` escapes so they do not
    // depend on how tools render the raw characters.

    // Last codepoint below the surrogate range is printed unchanged.
    try expectFmt("\u{d7ff}", "{}", .{fmtUtf16le(&[_]u16{std.mem.readIntNative(u16, "\xff\xd7")})});
    // Unpaired surrogates are printed as the replacement character U+FFFD.
    try expectFmt("\u{fffd}", "{}", .{fmtUtf16le(&[_]u16{std.mem.readIntNative(u16, "\x00\xd8")})});
    try expectFmt("\u{fffd}", "{}", .{fmtUtf16le(&[_]u16{std.mem.readIntNative(u16, "\xff\xdb")})});
    try expectFmt("\u{fffd}", "{}", .{fmtUtf16le(&[_]u16{std.mem.readIntNative(u16, "\x00\xdc")})});
    try expectFmt("\u{fffd}", "{}", .{fmtUtf16le(&[_]u16{std.mem.readIntNative(u16, "\xff\xdf")})});
    // First codepoint above the surrogate range is printed unchanged.
    try expectFmt("\u{e000}", "{}", .{fmtUtf16le(&[_]u16{std.mem.readIntNative(u16, "\x00\xe0")})});
}
test "utf8ToUtf16LeStringLiteral" {
    // Each case converts a literal, compares the code units with the expected
    // little-endian values, and checks the 0 sentinel right after them.

    // One ASCII character.
    {
        const literal = utf8ToUtf16LeStringLiteral("A");
        const expected = [_]u16{mem.nativeToLittle(u16, 0x41)};
        try testing.expectEqualSlices(u16, &expected, literal);
        try testing.expect(literal[1] == 0);
    }
    // U+10437 is encoded as a surrogate pair.
    {
        const literal = utf8ToUtf16LeStringLiteral("𐐷");
        const expected = [_]u16{
            mem.nativeToLittle(u16, 0xD801),
            mem.nativeToLittle(u16, 0xDC37),
        };
        try testing.expectEqualSlices(u16, &expected, literal);
        try testing.expect(literal[2] == 0);
    }
    // U+02FF.
    {
        const literal = utf8ToUtf16LeStringLiteral("\u{02FF}");
        const expected = [_]u16{mem.nativeToLittle(u16, 0x02FF)};
        try testing.expectEqualSlices(u16, &expected, literal);
        try testing.expect(literal[1] == 0);
    }
    // U+07FF.
    {
        const literal = utf8ToUtf16LeStringLiteral("\u{7FF}");
        const expected = [_]u16{mem.nativeToLittle(u16, 0x7FF)};
        try testing.expectEqualSlices(u16, &expected, literal);
        try testing.expect(literal[1] == 0);
    }
    // U+0801.
    {
        const literal = utf8ToUtf16LeStringLiteral("\u{801}");
        const expected = [_]u16{mem.nativeToLittle(u16, 0x801)};
        try testing.expectEqualSlices(u16, &expected, literal);
        try testing.expect(literal[1] == 0);
    }
    // U+10FFFF, the largest codepoint, is encoded as a surrogate pair.
    {
        const literal = utf8ToUtf16LeStringLiteral("\u{10FFFF}");
        const expected = [_]u16{
            mem.nativeToLittle(u16, 0xDBFF),
            mem.nativeToLittle(u16, 0xDFFF),
        };
        try testing.expectEqualSlices(u16, &expected, literal);
        try testing.expect(literal[2] == 0);
    }
}
/// Checks `utf8CountCodepoints` against a table of sample strings.
/// Kept as a plain function so the test below can run it both at runtime and
/// at comptime.
fn testUtf8CountCodepoints() !void {
    const Case = struct { text: []const u8, count: usize };
    const cases = [_]Case{
        .{ .text = "abcdefghij", .count = 10 },
        .{ .text = "äåéëþüúíóö", .count = 10 },
        .{ .text = "こんにちは", .count = 5 },
    };
    for (cases) |case| {
        try testing.expectEqual(case.count, try utf8CountCodepoints(case.text));
    }
    // NOTE(review): this surrogate-half check is disabled; the reason is not
    // visible here.
    // testing.expectError(error.Utf8EncodesSurrogateHalf, utf8CountCodepoints("\xED\xA0\x80"));
}

test "utf8 count codepoints" {
    // Runtime evaluation first, then the same checks at compile time.
    try testUtf8CountCodepoints();
    try comptime testUtf8CountCodepoints();
}
/// Checks `utf8ValidCodepoint` on values it must accept and values it must
/// reject. Kept as a plain function so the test below can run it both at
/// runtime and at comptime.
fn testUtf8ValidCodepoint() !void {
    const accepted = [_]u21{ 'e', 'ë', 'は', 0xe000, 0x10ffff };
    for (accepted) |codepoint| {
        try testing.expect(utf8ValidCodepoint(codepoint));
    }

    // Both ends of the surrogate range, and the first value past 0x10ffff.
    const rejected = [_]u21{ 0xd800, 0xdfff, 0x110000 };
    for (rejected) |codepoint| {
        try testing.expect(!utf8ValidCodepoint(codepoint));
    }
}

test "utf8 valid codepoint" {
    // Runtime evaluation first, then the same checks at compile time.
    try testUtf8ValidCodepoint();
    try comptime testUtf8ValidCodepoint();
}