Skip to content

Commit 8457439

Browse files
authored
Merge pull request #17797 from squeek502/utf16-ascii-fast-path
std.unicode: Add ASCII fast path to UTF-16 <-> UTF-8 conversion functions
2 parents a09ba45 + 13c8ec9 commit 8457439

File tree

1 file changed

+128
-9
lines changed

1 file changed

+128
-9
lines changed

lib/std/unicode.zig

Lines changed: 128 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@ const std = @import("./std.zig");
22
const assert = std.debug.assert;
33
const testing = std.testing;
44
const mem = std.mem;
5+
const builtin = @import("builtin");
56

67
/// Use this to replace an unknown, unrecognized, or unrepresentable character.
78
///
@@ -756,8 +757,34 @@ pub fn utf16leToUtf8Alloc(allocator: mem.Allocator, utf16le: []const u16) ![]u8
756757
// optimistically guess that it will all be ascii.
757758
var result = try std.ArrayList(u8).initCapacity(allocator, utf16le.len);
758759
errdefer result.deinit();
759-
var out_index: usize = 0;
760-
var it = Utf16LeIterator.init(utf16le);
760+
761+
var remaining = utf16le;
762+
if (builtin.zig_backend != .stage2_x86_64) {
763+
const chunk_len = std.simd.suggestVectorSize(u16) orelse 1;
764+
const Chunk = @Vector(chunk_len, u16);
765+
766+
// Fast path. Check for and encode ASCII characters at the start of the input.
767+
while (remaining.len >= chunk_len) {
768+
const chunk: Chunk = remaining[0..chunk_len].*;
769+
const mask: Chunk = @splat(std.mem.nativeToLittle(u16, 0x7F));
770+
if (@reduce(.Or, chunk | mask != mask)) {
771+
// found a non ASCII code unit
772+
break;
773+
}
774+
const chunk_byte_len = chunk_len * 2;
775+
const chunk_bytes: @Vector(chunk_byte_len, u8) = (std.mem.sliceAsBytes(remaining)[0..chunk_byte_len]).*;
776+
const deinterlaced_bytes = std.simd.deinterlace(2, chunk_bytes);
777+
const ascii_bytes: [chunk_len]u8 = deinterlaced_bytes[0];
778+
// We allocated enough space to encode every UTF-16 code unit
779+
// as ASCII, so if the entire string is ASCII then we are
780+
// guaranteed to have enough space allocated
781+
result.appendSliceAssumeCapacity(&ascii_bytes);
782+
remaining = remaining[chunk_len..];
783+
}
784+
}
785+
786+
var out_index: usize = result.items.len;
787+
var it = Utf16LeIterator.init(remaining);
761788
while (try it.nextCodepoint()) |codepoint| {
762789
const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable;
763790
try result.resize(result.items.len + utf8_len);
@@ -773,8 +800,34 @@ pub fn utf16leToUtf8AllocZ(allocator: mem.Allocator, utf16le: []const u16) ![:0]
773800
// optimistically guess that it will all be ascii (and allocate space for the null terminator)
774801
var result = try std.ArrayList(u8).initCapacity(allocator, utf16le.len + 1);
775802
errdefer result.deinit();
776-
var out_index: usize = 0;
777-
var it = Utf16LeIterator.init(utf16le);
803+
804+
var remaining = utf16le;
805+
if (builtin.zig_backend != .stage2_x86_64) {
806+
const chunk_len = std.simd.suggestVectorSize(u16) orelse 1;
807+
const Chunk = @Vector(chunk_len, u16);
808+
809+
// Fast path. Check for and encode ASCII characters at the start of the input.
810+
while (remaining.len >= chunk_len) {
811+
const chunk: Chunk = remaining[0..chunk_len].*;
812+
const mask: Chunk = @splat(std.mem.nativeToLittle(u16, 0x7F));
813+
if (@reduce(.Or, chunk | mask != mask)) {
814+
// found a non ASCII code unit
815+
break;
816+
}
817+
const chunk_byte_len = chunk_len * 2;
818+
const chunk_bytes: @Vector(chunk_byte_len, u8) = (std.mem.sliceAsBytes(remaining)[0..chunk_byte_len]).*;
819+
const deinterlaced_bytes = std.simd.deinterlace(2, chunk_bytes);
820+
const ascii_bytes: [chunk_len]u8 = deinterlaced_bytes[0];
821+
// We allocated enough space to encode every UTF-16 code unit
822+
// as ASCII, so if the entire string is ASCII then we are
823+
// guaranteed to have enough space allocated
824+
result.appendSliceAssumeCapacity(&ascii_bytes);
825+
remaining = remaining[chunk_len..];
826+
}
827+
}
828+
829+
var out_index = result.items.len;
830+
var it = Utf16LeIterator.init(remaining);
778831
while (try it.nextCodepoint()) |codepoint| {
779832
const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable;
780833
try result.resize(result.items.len + utf8_len);
@@ -788,7 +841,31 @@ pub fn utf16leToUtf8AllocZ(allocator: mem.Allocator, utf16le: []const u16) ![:0]
788841
/// Returns end byte index into utf8.
789842
pub fn utf16leToUtf8(utf8: []u8, utf16le: []const u16) !usize {
790843
var end_index: usize = 0;
791-
var it = Utf16LeIterator.init(utf16le);
844+
845+
var remaining = utf16le;
846+
if (builtin.zig_backend != .stage2_x86_64) {
847+
const chunk_len = std.simd.suggestVectorSize(u16) orelse 1;
848+
const Chunk = @Vector(chunk_len, u16);
849+
850+
// Fast path. Check for and encode ASCII characters at the start of the input.
851+
while (remaining.len >= chunk_len) {
852+
const chunk: Chunk = remaining[0..chunk_len].*;
853+
const mask: Chunk = @splat(std.mem.nativeToLittle(u16, 0x7F));
854+
if (@reduce(.Or, chunk | mask != mask)) {
855+
// found a non ASCII code unit
856+
break;
857+
}
858+
const chunk_byte_len = chunk_len * 2;
859+
const chunk_bytes: @Vector(chunk_byte_len, u8) = (std.mem.sliceAsBytes(remaining)[0..chunk_byte_len]).*;
860+
const deinterlaced_bytes = std.simd.deinterlace(2, chunk_bytes);
861+
const ascii_bytes: [chunk_len]u8 = deinterlaced_bytes[0];
862+
@memcpy(utf8[end_index .. end_index + chunk_len], &ascii_bytes);
863+
end_index += chunk_len;
864+
remaining = remaining[chunk_len..];
865+
}
866+
}
867+
868+
var it = Utf16LeIterator.init(remaining);
792869
while (try it.nextCodepoint()) |codepoint| {
793870
end_index += try utf8Encode(codepoint, utf8[end_index..]);
794871
}
@@ -863,7 +940,27 @@ pub fn utf8ToUtf16LeWithNull(allocator: mem.Allocator, utf8: []const u8) ![:0]u1
863940
var result = try std.ArrayList(u16).initCapacity(allocator, utf8.len + 1);
864941
errdefer result.deinit();
865942

866-
const view = try Utf8View.init(utf8);
943+
var remaining = utf8;
944+
if (builtin.zig_backend != .stage2_x86_64) {
945+
const chunk_len = std.simd.suggestVectorSize(u8) orelse 1;
946+
const Chunk = @Vector(chunk_len, u8);
947+
948+
// Fast path. Check for and encode ASCII characters at the start of the input.
949+
while (remaining.len >= chunk_len) {
950+
const chunk: Chunk = remaining[0..chunk_len].*;
951+
const mask: Chunk = @splat(0x80);
952+
if (@reduce(.Or, chunk & mask == mask)) {
953+
// found a non ASCII code unit
954+
break;
955+
}
956+
const zeroes: Chunk = @splat(0);
957+
const utf16_chunk: [chunk_len * 2]u8 align(@alignOf(u16)) = std.simd.interlace(.{ chunk, zeroes });
958+
result.appendSliceAssumeCapacity(std.mem.bytesAsSlice(u16, &utf16_chunk));
959+
remaining = remaining[chunk_len..];
960+
}
961+
}
962+
963+
const view = try Utf8View.init(remaining);
867964
var it = view.iterator();
868965
while (it.nextCodepoint()) |codepoint| {
869966
if (codepoint < 0x10000) {
@@ -886,11 +983,33 @@ pub fn utf8ToUtf16LeWithNull(allocator: mem.Allocator, utf8: []const u8) ![:0]u1
886983
/// Assumes there is enough space for the output.
887984
pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize {
888985
var dest_i: usize = 0;
986+
987+
var remaining = utf8;
988+
if (builtin.zig_backend != .stage2_x86_64) {
989+
const chunk_len = std.simd.suggestVectorSize(u8) orelse 1;
990+
const Chunk = @Vector(chunk_len, u8);
991+
992+
// Fast path. Check for and encode ASCII characters at the start of the input.
993+
while (remaining.len >= chunk_len) {
994+
const chunk: Chunk = remaining[0..chunk_len].*;
995+
const mask: Chunk = @splat(0x80);
996+
if (@reduce(.Or, chunk & mask == mask)) {
997+
// found a non ASCII code unit
998+
break;
999+
}
1000+
const zeroes: Chunk = @splat(0);
1001+
const utf16_bytes: [chunk_len * 2]u8 align(@alignOf(u16)) = std.simd.interlace(.{ chunk, zeroes });
1002+
@memcpy(utf16le[dest_i..][0..chunk_len], std.mem.bytesAsSlice(u16, &utf16_bytes));
1003+
dest_i += chunk_len;
1004+
remaining = remaining[chunk_len..];
1005+
}
1006+
}
1007+
8891008
var src_i: usize = 0;
890-
while (src_i < utf8.len) {
891-
const n = utf8ByteSequenceLength(utf8[src_i]) catch return error.InvalidUtf8;
1009+
while (src_i < remaining.len) {
1010+
const n = utf8ByteSequenceLength(remaining[src_i]) catch return error.InvalidUtf8;
8921011
const next_src_i = src_i + n;
893-
const codepoint = utf8Decode(utf8[src_i..next_src_i]) catch return error.InvalidUtf8;
1012+
const codepoint = utf8Decode(remaining[src_i..next_src_i]) catch return error.InvalidUtf8;
8941013
if (codepoint < 0x10000) {
8951014
const short = @as(u16, @intCast(codepoint));
8961015
utf16le[dest_i] = mem.nativeToLittle(u16, short);

0 commit comments

Comments
 (0)