zig/lib/std/tar.zig
2024-01-13 19:37:33 -07:00

731 lines
27 KiB
Zig
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/// Tar archive is single ordinary file which can contain many files (or
/// directories, symlinks, ...). It's build by series of blocks each size of 512
/// bytes. First block of each entry is header which defines type, name, size
/// permissions and other attributes. Header is followed by series of blocks of
/// file content, if any that entry has content. Content is padded to the block
/// size, so next header always starts at block boundary.
///
/// This simple format is extended by GNU and POSIX pax extensions to support
/// file names longer than 256 bytes and additional attributes.
///
/// This is not comprehensive tar parser. Here we are only file types needed to
/// support Zig package manager; normal file, directory, symbolic link. And
/// subset of attributes: name, size, permissions.
///
/// GNU tar reference: https://www.gnu.org/software/tar/manual/html_node/Standard.html
/// pax reference: https://pubs.opengroup.org/onlinepubs/9699919799/utilities/pax.html#tag_20_92_13
///
const std = @import("std.zig");
const assert = std.debug.assert;
pub const Options = struct {
/// Number of directory levels to skip when extracting files.
strip_components: u32 = 0,
/// How to handle the "mode" property of files from within the tar file.
mode_mode: ModeMode = .executable_bit_only,
/// Prevents creation of empty directories.
exclude_empty_directories: bool = false,
/// Provide this to receive detailed error messages.
/// When this is provided, some errors which would otherwise be returned immediately
/// will instead be added to this structure. The API user must check the errors
/// in diagnostics to know whether the operation succeeded or failed.
diagnostics: ?*Diagnostics = null,
pub const ModeMode = enum {
/// The mode from the tar file is completely ignored. Files are created
/// with the default mode when creating files.
ignore,
/// The mode from the tar file is inspected for the owner executable bit
/// only. This bit is copied to the group and other executable bits.
/// Other bits of the mode are left as the default when creating files.
executable_bit_only,
};
pub const Diagnostics = struct {
allocator: std.mem.Allocator,
errors: std.ArrayListUnmanaged(Error) = .{},
pub const Error = union(enum) {
unable_to_create_sym_link: struct {
code: anyerror,
file_name: []const u8,
link_name: []const u8,
},
unable_to_create_file: struct {
code: anyerror,
file_name: []const u8,
},
unsupported_file_type: struct {
file_name: []const u8,
file_type: Header.Kind,
},
};
pub fn deinit(d: *Diagnostics) void {
for (d.errors.items) |item| {
switch (item) {
.unable_to_create_sym_link => |info| {
d.allocator.free(info.file_name);
d.allocator.free(info.link_name);
},
.unable_to_create_file => |info| {
d.allocator.free(info.file_name);
},
.unsupported_file_type => |info| {
d.allocator.free(info.file_name);
},
}
}
d.errors.deinit(d.allocator);
d.* = undefined;
}
};
};
pub const Header = struct {
const SIZE = 512;
const MAX_NAME_SIZE = 100 + 1 + 155; // name(100) + separator(1) + prefix(155)
const LINK_NAME_SIZE = 100;
bytes: *const [SIZE]u8,
pub const Kind = enum(u8) {
normal_alias = 0,
normal = '0',
hard_link = '1',
symbolic_link = '2',
character_special = '3',
block_special = '4',
directory = '5',
fifo = '6',
contiguous = '7',
global_extended_header = 'g',
extended_header = 'x',
// Types 'L' and 'K' are used by the GNU format for a meta file
// used to store the path or link name for the next file.
gnu_long_name = 'L',
gnu_long_link = 'K',
_,
};
/// Includes prefix concatenated, if any.
/// TODO: check against "../" and other nefarious things
pub fn fullName(header: Header, buffer: *[MAX_NAME_SIZE]u8) ![]const u8 {
const n = name(header);
const p = prefix(header);
if (!is_ustar(header) or p.len == 0) {
@memcpy(buffer[0..n.len], n);
return buffer[0..n.len];
}
@memcpy(buffer[0..p.len], p);
buffer[p.len] = '/';
@memcpy(buffer[p.len + 1 ..][0..n.len], n);
return buffer[0 .. p.len + 1 + n.len];
}
pub fn linkName(header: Header, buffer: *[LINK_NAME_SIZE]u8) []const u8 {
const link_name = header.str(157, 100);
if (link_name.len == 0) {
return buffer[0..0];
}
const buf = buffer[0..link_name.len];
@memcpy(buf, link_name);
return buf;
}
pub fn name(header: Header) []const u8 {
return header.str(0, 100);
}
pub fn mode(header: Header) !u32 {
return @intCast(try header.numeric(100, 8));
}
pub fn size(header: Header) !u64 {
return header.numeric(124, 12);
}
pub fn chksum(header: Header) !u64 {
return header.octal(148, 8);
}
pub fn is_ustar(header: Header) bool {
const magic = header.bytes[257..][0..6];
return std.mem.eql(u8, magic[0..5], "ustar") and (magic[5] == 0 or magic[5] == ' ');
}
pub fn prefix(header: Header) []const u8 {
return header.str(345, 155);
}
pub fn kind(header: Header) Kind {
const result: Kind = @enumFromInt(header.bytes[156]);
if (result == .normal_alias) return .normal;
return result;
}
fn str(header: Header, start: usize, len: usize) []const u8 {
return nullStr(header.bytes[start .. start + len]);
}
fn numeric(header: Header, start: usize, len: usize) !u64 {
const raw = header.bytes[start..][0..len];
// If the leading byte is 0xff (255), all the bytes of the field
// (including the leading byte) are concatenated in big-endian order,
// with the result being a negative number expressed in twos
// complement form.
if (raw[0] == 0xff) return error.TarNumericValueNegative;
// If the leading byte is 0x80 (128), the non-leading bytes of the
// field are concatenated in big-endian order.
if (raw[0] == 0x80) {
if (raw[1] + raw[2] + raw[3] != 0) return error.TarNumericValueTooBig;
return std.mem.readInt(u64, raw[4..12], .big);
}
return try header.octal(start, len);
}
fn octal(header: Header, start: usize, len: usize) !u64 {
const raw = header.bytes[start..][0..len];
// Zero-filled octal number in ASCII. Each numeric field of width w
// contains w minus 1 digits, and a null
const ltrimmed = std.mem.trimLeft(u8, raw, "0 ");
const rtrimmed = std.mem.trimRight(u8, ltrimmed, " \x00");
if (rtrimmed.len == 0) return 0;
return std.fmt.parseInt(u64, rtrimmed, 8) catch return error.TarHeader;
}
// Sum of all bytes in the header block. The chksum field is treated as if
// it were filled with spaces (ASCII 32).
fn computeChksum(header: Header) u64 {
var sum: u64 = 0;
for (header.bytes, 0..) |b, i| {
if (148 <= i and i < 156) continue; // skip chksum field bytes
sum += b;
}
// Treating chksum bytes as spaces. 256 = 8 * 32, 8 spaces.
return if (sum > 0) sum + 256 else 0;
}
// Checks calculated chksum with value of chksum field.
// Returns error or valid chksum value.
// Zero value indicates empty block.
pub fn checkChksum(header: Header) !u64 {
const field = try header.chksum();
const computed = header.computeChksum();
if (field != computed) return error.TarHeaderChksum;
return field;
}
};
// Breaks string on first null character.
fn nullStr(str: []const u8) []const u8 {
for (str, 0..) |c, i| {
if (c == 0) return str[0..i];
}
return str;
}
pub fn tarReader(reader: anytype, diagnostics: ?*Options.Diagnostics) TarReader(@TypeOf(reader)) {
return .{
.reader = reader,
.diagnostics = diagnostics,
};
}
fn TarReader(comptime ReaderType: type) type {
return struct {
reader: ReaderType,
diagnostics: ?*Options.Diagnostics,
// buffers for heeader and file attributes
header_buffer: [Header.SIZE]u8 = undefined,
file_name_buffer: [std.fs.MAX_PATH_BYTES]u8 = undefined,
link_name_buffer: [std.fs.MAX_PATH_BYTES]u8 = undefined,
// bytes of padding to the end of the block
padding: usize = 0,
// current tar file
file: File = undefined,
pub const File = struct {
name: []const u8, // name of file, symlink or directory
link_name: []const u8, // target name of symlink
size: u64, // size of the file in bytes
mode: u32,
kind: Header.Kind,
reader: ReaderType,
// Writes file content to writer.
pub fn write(self: File, writer: anytype) !void {
var buffer: [4096]u8 = undefined;
var n: u64 = 0;
while (n < self.size) {
const buf = buffer[0..@min(buffer.len, self.size - n)];
try self.reader.readNoEof(buf);
try writer.writeAll(buf);
n += buf.len;
}
}
// Skips file content. Advances reader.
pub fn skip(self: File) !void {
try self.reader.skipBytes(self.size, .{});
}
};
const Self = @This();
fn readHeader(self: *Self) !?Header {
if (self.padding > 0) {
try self.reader.skipBytes(self.padding, .{});
}
const n = try self.reader.readAll(&self.header_buffer);
if (n == 0) return null;
if (n < Header.SIZE) return error.UnexpectedEndOfStream;
const header = Header{ .bytes = self.header_buffer[0..Header.SIZE] };
if (try header.checkChksum() == 0) return null;
return header;
}
inline fn readString(self: *Self, size: usize, buffer: []u8) ![]const u8 {
assert(buffer.len >= size);
const buf = buffer[0..size];
try self.reader.readNoEof(buf);
return nullStr(buf);
}
inline fn initFile(self: *Self) void {
self.file = File{
.name = self.file_name_buffer[0..0],
.link_name = self.link_name_buffer[0..0],
.size = 0,
.kind = .normal,
.mode = 0,
.reader = self.reader,
};
}
// Number of padding bytes in the last file block.
inline fn blockPadding(size: u64) usize {
const block_rounded = std.mem.alignForward(u64, size, Header.SIZE); // size rounded to te block boundary
return @intCast(block_rounded - size);
}
/// Iterates through the tar archive as if it is a series of files.
/// Internally, the tar format often uses entries (header with optional
/// content) to add meta data that describes the next file. These
/// entries should not normally be visible to the outside. As such, this
/// loop iterates through one or more entries until it collects a all
/// file attributes.
pub fn next(self: *Self) !?File {
self.initFile();
while (try self.readHeader()) |header| {
const kind = header.kind();
const size: u64 = try header.size();
self.padding = blockPadding(size);
switch (kind) {
// File types to retrun upstream
.directory, .normal, .symbolic_link => {
self.file.kind = kind;
self.file.mode = try header.mode();
// set file attributes if not already set by prefix/extended headers
if (self.file.size == 0) {
self.file.size = size;
}
if (self.file.link_name.len == 0) {
self.file.link_name = header.linkName(self.link_name_buffer[0..Header.LINK_NAME_SIZE]);
}
if (self.file.name.len == 0) {
self.file.name = try header.fullName(self.file_name_buffer[0..Header.MAX_NAME_SIZE]);
}
self.padding = blockPadding(self.file.size);
return self.file;
},
// Prefix header types
.gnu_long_name => {
self.file.name = try self.readString(@intCast(size), &self.file_name_buffer);
},
.gnu_long_link => {
self.file.link_name = try self.readString(@intCast(size), &self.link_name_buffer);
},
.extended_header => {
// Use just attributes from last extended header.
self.initFile();
var rdr = paxReader(self.reader, @intCast(size));
while (try rdr.next()) |attr| {
switch (attr.kind) {
.path => {
self.file.name = try attr.value(&self.file_name_buffer);
},
.linkpath => {
self.file.link_name = try attr.value(&self.link_name_buffer);
},
.size => {
var buf: [64]u8 = undefined;
self.file.size = try std.fmt.parseInt(u64, try attr.value(&buf), 10);
},
}
}
},
// Ignored header type
.global_extended_header => {
self.reader.skipBytes(size, .{}) catch return error.TarHeadersTooBig;
},
// All other are unsupported header types
else => {
const d = self.diagnostics orelse return error.TarUnsupportedHeader;
try d.errors.append(d.allocator, .{ .unsupported_file_type = .{
.file_name = try d.allocator.dupe(u8, header.name()),
.file_type = kind,
} });
},
}
}
return null;
}
};
}
// Pax attributes reader.
// Size is length of pax extended header in reader.
fn paxReader(reader: anytype, size: usize) PaxReader(@TypeOf(reader)) {
return PaxReader(@TypeOf(reader)){
.reader = reader,
.size = size,
};
}
const PaxAttributeKind = enum {
path,
linkpath,
size,
};
fn PaxReader(comptime ReaderType: type) type {
return struct {
size: usize, // cumulative size of all pax attributes
reader: ReaderType,
// scratch buffer used for reading attribute length and keyword
scratch: [128]u8 = undefined,
const Self = @This();
const Attribute = struct {
kind: PaxAttributeKind,
len: usize, // length of the attribute value
reader: ReaderType, // reader positioned at value start
// Copies pax attribute value into destination buffer.
// Must be called with destination buffer of size at least Attribute.len.
pub fn value(self: Attribute, dst: []u8) ![]const u8 {
assert(self.len <= dst.len);
const buf = dst[0..self.len];
const n = try self.reader.readAll(buf);
if (n < self.len) return error.UnexpectedEndOfStream;
try validateAttributeEnding(self.reader);
if (hasNull(buf)) return error.PaxNullInValue;
return buf;
}
};
// Iterates over pax attributes. Returns known only known attributes.
// Caller has to call value in Attribute, to advance reader across value.
pub fn next(self: *Self) !?Attribute {
// Pax extended header consists of one or more attributes, each constructed as follows:
// "%d %s=%s\n", <length>, <keyword>, <value>
while (self.size > 0) {
const length_buf = try self.readUntil(' ');
const length = try std.fmt.parseInt(usize, length_buf, 10); // record length in bytes
const keyword = try self.readUntil('=');
if (hasNull(keyword)) return error.PaxNullInKeyword;
// calculate value_len
const value_start = length_buf.len + keyword.len + 2; // 2 separators
if (length < value_start + 1 or self.size < length) return error.UnexpectedEndOfStream;
const value_len = length - value_start - 1; // \n separator at end
self.size -= length;
const kind: PaxAttributeKind = if (eql(keyword, "path"))
.path
else if (eql(keyword, "linkpath"))
.linkpath
else if (eql(keyword, "size"))
.size
else {
try self.reader.skipBytes(value_len, .{});
try validateAttributeEnding(self.reader);
continue;
};
return Attribute{
.kind = kind,
.len = value_len,
.reader = self.reader,
};
}
return null;
}
inline fn readUntil(self: *Self, delimiter: u8) ![]const u8 {
var fbs = std.io.fixedBufferStream(&self.scratch);
try self.reader.streamUntilDelimiter(fbs.writer(), delimiter, null);
return fbs.getWritten();
}
inline fn eql(a: []const u8, b: []const u8) bool {
return std.mem.eql(u8, a, b);
}
inline fn hasNull(str: []const u8) bool {
return (std.mem.indexOfScalar(u8, str, 0)) != null;
}
// Checks that each record ends with new line.
inline fn validateAttributeEnding(reader: ReaderType) !void {
if (try reader.readByte() != '\n') return error.PaxInvalidAttributeEnd;
}
};
}
pub fn pipeToFileSystem(dir: std.fs.Dir, reader: anytype, options: Options) !void {
switch (options.mode_mode) {
.ignore => {},
.executable_bit_only => {
// This code does not look at the mode bits yet. To implement this feature,
// the implementation must be adjusted to look at the mode, and check the
// user executable bit, then call fchmod on newly created files when
// the executable bit is supposed to be set.
// It also needs to properly deal with ACLs on Windows.
@panic("TODO: unimplemented: tar ModeMode.executable_bit_only");
},
}
var iter = tarReader(reader, options.diagnostics);
while (try iter.next()) |file| {
switch (file.kind) {
.directory => {
const file_name = try stripComponents(file.name, options.strip_components);
if (file_name.len != 0 and !options.exclude_empty_directories) {
try dir.makePath(file_name);
}
},
.normal => {
if (file.size == 0 and file.name.len == 0) return;
const file_name = try stripComponents(file.name, options.strip_components);
const fs_file = dir.createFile(file_name, .{}) catch |err| switch (err) {
error.FileNotFound => again: {
const code = code: {
if (std.fs.path.dirname(file_name)) |dir_name| {
dir.makePath(dir_name) catch |code| break :code code;
break :again dir.createFile(file_name, .{}) catch |code| {
break :code code;
};
}
break :code err;
};
const d = options.diagnostics orelse return error.UnableToCreateFile;
try d.errors.append(d.allocator, .{ .unable_to_create_file = .{
.code = code,
.file_name = try d.allocator.dupe(u8, file_name),
} });
break :again null;
},
else => |e| return e,
};
defer if (fs_file) |f| f.close();
if (fs_file) |f| {
try file.write(f);
} else {
try file.skip();
}
},
.symbolic_link => {
// The file system path of the symbolic link.
const file_name = try stripComponents(file.name, options.strip_components);
// The data inside the symbolic link.
const link_name = file.link_name;
dir.symLink(link_name, file_name, .{}) catch |err| again: {
const code = code: {
if (err == error.FileNotFound) {
if (std.fs.path.dirname(file_name)) |dir_name| {
dir.makePath(dir_name) catch |code| break :code code;
break :again dir.symLink(link_name, file_name, .{}) catch |code| {
break :code code;
};
}
}
break :code err;
};
const d = options.diagnostics orelse return error.UnableToCreateSymLink;
try d.errors.append(d.allocator, .{ .unable_to_create_sym_link = .{
.code = code,
.file_name = try d.allocator.dupe(u8, file_name),
.link_name = try d.allocator.dupe(u8, link_name),
} });
};
},
else => unreachable,
}
}
}
fn stripComponents(path: []const u8, count: u32) ![]const u8 {
var i: usize = 0;
var c = count;
while (c > 0) : (c -= 1) {
if (std.mem.indexOfScalarPos(u8, path, i, '/')) |pos| {
i = pos + 1;
} else {
return error.TarComponentsOutsideStrippedPrefix;
}
}
return path[i..];
}
test "tar stripComponents" {
const expectEqualStrings = std.testing.expectEqualStrings;
try expectEqualStrings("a/b/c", try stripComponents("a/b/c", 0));
try expectEqualStrings("b/c", try stripComponents("a/b/c", 1));
try expectEqualStrings("c", try stripComponents("a/b/c", 2));
}
test "tar PaxReader" {
const Attr = struct {
kind: PaxAttributeKind,
value: []const u8 = undefined,
err: ?anyerror = null,
};
const cases = [_]struct {
data: []const u8,
attrs: []const Attr,
err: ?anyerror = null,
}{
.{ // valid but unknown keys
.data =
\\30 mtime=1350244992.023960108
\\6 k=1
\\13 key1=val1
\\10 a=name
\\9 a=name
\\
,
.attrs = &[_]Attr{},
},
.{ // mix of known and unknown keys
.data =
\\6 k=1
\\13 path=name
\\17 linkpath=link
\\13 key1=val1
\\12 size=123
\\13 key2=val2
\\
,
.attrs = &[_]Attr{
.{ .kind = .path, .value = "name" },
.{ .kind = .linkpath, .value = "link" },
.{ .kind = .size, .value = "123" },
},
},
.{ // too short size of the second key-value pair
.data =
\\13 path=name
\\10 linkpath=value
\\
,
.attrs = &[_]Attr{
.{ .kind = .path, .value = "name" },
},
.err = error.UnexpectedEndOfStream,
},
.{ // too long size of the second key-value pair
.data =
\\13 path=name
\\6 k=1
\\19 linkpath=value
\\
,
.attrs = &[_]Attr{
.{ .kind = .path, .value = "name" },
},
.err = error.UnexpectedEndOfStream,
},
.{ // too long size of the second key-value pair
.data =
\\13 path=name
\\19 linkpath=value
\\6 k=1
\\
,
.attrs = &[_]Attr{
.{ .kind = .path, .value = "name" },
.{ .kind = .linkpath, .err = error.PaxInvalidAttributeEnd },
},
},
.{ // null in keyword is not valid
.data = "13 path=name\n" ++ "7 k\x00b=1\n",
.attrs = &[_]Attr{
.{ .kind = .path, .value = "name" },
},
.err = error.PaxNullInKeyword,
},
.{ // null in value is not valid
.data = "23 path=name\x00with null\n",
.attrs = &[_]Attr{
.{ .kind = .path, .err = error.PaxNullInValue },
},
},
.{ // 1000 characters path
.data = "1011 path=" ++ "0123456789" ** 100 ++ "\n",
.attrs = &[_]Attr{
.{ .kind = .path, .value = "0123456789" ** 100 },
},
},
};
var buffer: [1024]u8 = undefined;
outer: for (cases) |case| {
var stream = std.io.fixedBufferStream(case.data);
var rdr = paxReader(stream.reader(), case.data.len);
var i: usize = 0;
while (rdr.next() catch |err| {
if (case.err) |e| {
try std.testing.expectEqual(e, err);
continue;
}
return err;
}) |attr| : (i += 1) {
const exp = case.attrs[i];
try std.testing.expectEqual(exp.kind, attr.kind);
const value = attr.value(&buffer) catch |err| {
if (exp.err) |e| {
try std.testing.expectEqual(e, err);
break :outer;
}
return err;
};
try std.testing.expectEqualStrings(exp.value, value);
}
try std.testing.expectEqual(case.attrs.len, i);
try std.testing.expect(case.err == null);
}
}
test {
_ = @import("tar/test.zig");
}