macho: calculate UUID excluding stabs and part of contributing strtab

Jakub Konka 2022-12-15 15:10:35 +01:00
parent 09dee74414
commit 660270b7a9
5 changed files with 102 additions and 74 deletions
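In effect, the UUID is now computed over just two contributing regions of the output file: [0, start of the symbol stabs in the symtab) and [end of the stabs, offset of the first stab string in the strtab). The stab entries and their strings are skipped, and everything past the strtab (the code signature) was never part of the UUID calculation to begin with. Below is a minimal, single-threaded sketch of the scheme, using hypothetical Region and calcUuid names; the actual implementation added to zld.zig further down hashes the chunks in parallel:

const std = @import("std");
const Md5 = std.crypto.hash.Md5;

/// A half-open byte range [start, end) of the output file that contributes
/// to the UUID (hypothetical type for this sketch).
const Region = struct { start: u64, end: u64 };

/// Minimal, single-threaded sketch: MD5 every 0x4000-byte chunk of each
/// contributing region, MD5 the concatenated chunk digests (the LLD-style
/// hash-of-hashes), then force the RFC 4122 version/variant bits.
fn calcUuid(file: std.fs.File, regions: []const Region, out: *[Md5.digest_length]u8) !void {
    const chunk_size: u64 = 0x4000;
    var digest_hasher = Md5.init(.{});
    var buffer: [0x4000]u8 = undefined;
    for (regions) |region| {
        var pos = region.start;
        while (pos < region.end) : (pos += chunk_size) {
            const size = @intCast(usize, std.math.min(chunk_size, region.end - pos));
            const amt = try file.preadAll(buffer[0..size], pos);
            if (amt != size) return error.InputOutput;
            var chunk_digest: [Md5.digest_length]u8 = undefined;
            Md5.hash(buffer[0..size], &chunk_digest, .{});
            digest_hasher.update(&chunk_digest);
        }
    }
    digest_hasher.final(out);
    // Same conformance step as conformUuid in the diff below.
    out[6] = (out[6] & 0x0F) | (3 << 4);
    out[8] = (out[8] & 0x3F) | 0x80;
}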

@@ -594,7 +594,6 @@ set(ZIG_STAGE2_SOURCES
"${CMAKE_SOURCE_DIR}/src/link/MachO/hasher.zig"
"${CMAKE_SOURCE_DIR}/src/link/MachO/load_commands.zig"
"${CMAKE_SOURCE_DIR}/src/link/MachO/thunks.zig"
"${CMAKE_SOURCE_DIR}/src/link/MachO/uuid.zig"
"${CMAKE_SOURCE_DIR}/src/link/MachO/zld.zig"
"${CMAKE_SOURCE_DIR}/src/link/Plan9.zig"
"${CMAKE_SOURCE_DIR}/src/link/Plan9/aout.zig"

@@ -39,6 +39,7 @@ const Object = @import("MachO/Object.zig");
const LibStub = @import("tapi.zig").LibStub;
const Liveness = @import("../Liveness.zig");
const LlvmObject = @import("../codegen/llvm.zig").Object;
const Md5 = std.crypto.hash.Md5;
const Module = @import("../Module.zig");
const Relocation = @import("MachO/Relocation.zig");
const StringTable = @import("strtab.zig").StringTable;
@@ -598,6 +599,8 @@ pub fn flushModule(self: *MachO, comp: *Compilation, prog_node: *std.Progress.No
if (self.cold_start) {
std.crypto.random.bytes(&self.uuid_cmd.uuid);
Md5.hash(&self.uuid_cmd.uuid, &self.uuid_cmd.uuid, .{});
conformUuid(&self.uuid_cmd.uuid);
}
try lc_writer.writeStruct(self.uuid_cmd);
@@ -662,6 +665,11 @@ pub fn flushModule(self: *MachO, comp: *Compilation, prog_node: *std.Progress.No
self.cold_start = false;
}
inline fn conformUuid(out: *[Md5.digest_length]u8) void {
// LC_UUID uuids should conform to RFC 4122 UUID version 4 & UUID version 5 formats
out[6] = (out[6] & 0x0F) | (3 << 4);
out[8] = (out[8] & 0x3F) | 0x80;
}
pub fn resolveLibSystem(
arena: Allocator,

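The conformance step touches only two bytes of the MD5 digest: the high nibble of byte 6 is forced to 0x3 and the top two bits of byte 8 to 0b10, which is enough for the result to parse as an RFC 4122 UUID. A small sanity-check sketch (not part of the diff, mirroring the two statements above):

const std = @import("std");

test "conformUuid bit layout" {
    var uuid = [_]u8{0xFF} ** 16;
    // Same two statements as conformUuid.
    uuid[6] = (uuid[6] & 0x0F) | (3 << 4);
    uuid[8] = (uuid[8] & 0x3F) | 0x80;
    try std.testing.expectEqual(@as(u8, 0x3F), uuid[6]); // version nibble forced to 0x3
    try std.testing.expectEqual(@as(u8, 0xBF), uuid[8]); // variant bits forced to 0b10
}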
@@ -13,6 +13,7 @@ pub fn ParallelHasher(comptime Hasher: type) type {
return struct {
pub fn hash(self: @This(), gpa: Allocator, pool: *ThreadPool, file: fs.File, out: [][hash_size]u8, opts: struct {
chunk_size: u16 = 0x4000,
file_pos: u64 = 0,
max_file_size: ?u64 = null,
}) !void {
_ = self;
@@ -38,7 +39,14 @@ pub fn ParallelHasher(comptime Hasher: type) type {
const fstart = i * opts.chunk_size;
const fsize = if (fstart + opts.chunk_size > file_size) file_size - fstart else opts.chunk_size;
wg.start();
try pool.spawn(worker, .{ file, fstart, buffer[fstart..][0..fsize], &out[i], &results[i], &wg });
try pool.spawn(worker, .{
file,
fstart + opts.file_pos,
buffer[fstart..][0..fsize],
&out[i],
&results[i],
&wg,
});
}
}
for (results) |result| _ = try result;

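With the new file_pos field, a caller can hash an arbitrary window of the file instead of always starting at offset 0. A hedged usage sketch follows; hashRange is a hypothetical helper and the ThreadPool import path is an assumption:

const std = @import("std");
const Md5 = std.crypto.hash.Md5;
const ThreadPool = @import("../../ThreadPool.zig"); // assumed path within src/link/MachO
const ParallelHasher = @import("hasher.zig").ParallelHasher;

/// Hash the byte range [start, end) of `file` in 0x4000-byte chunks,
/// writing one MD5 digest per chunk into `out`.
fn hashRange(
    gpa: std.mem.Allocator,
    pool: *ThreadPool,
    file: std.fs.File,
    out: [][Md5.digest_length]u8,
    start: u64,
    end: u64,
) !void {
    var hasher = ParallelHasher(Md5){};
    try hasher.hash(gpa, pool, file, out, .{
        .chunk_size = 0x4000,
        .file_pos = start,
        .max_file_size = end - start,
    });
}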
@@ -1,69 +0,0 @@
const std = @import("std");
const fs = std.fs;
const mem = std.mem;
const Allocator = mem.Allocator;
const Compilation = @import("../../Compilation.zig");
const Md5 = std.crypto.hash.Md5;
const Hasher = @import("hasher.zig").ParallelHasher;
/// Somewhat random chunk size for MD5 hash calculation.
pub const chunk_size = 0x4000;
/// Calculates Md5 hash of the file contents.
/// Hash is calculated in a streaming manner which may be slow.
pub fn calcUuidStreaming(file: fs.File, file_size: u64, out: *[Md5.digest_length]u8) !void {
const total_num_chunks = mem.alignForward(file_size, chunk_size) / chunk_size;
var hasher = Md5.init(.{});
var buffer: [chunk_size]u8 = undefined;
var i: usize = 0;
while (i < total_num_chunks) : (i += 1) {
const start = i * chunk_size;
const size = if (start + chunk_size > file_size)
file_size - start
else
chunk_size;
const amt = try file.preadAll(&buffer, start);
if (amt != size) return error.InputOutput;
hasher.update(buffer[0..size]);
}
hasher.final(out);
conform(out);
}
/// Calculates Md5 hash of each chunk in parallel and then hashes all Md5 hashes to produce
/// the final digest.
/// While this is NOT a correct MD5 hash of the contents, this methodology is used by LLVM/LLD
/// and we will use it too as it seems accepted by Apple OSes.
pub fn calcUuidParallel(comp: *const Compilation, file: fs.File, file_size: u64, out: *[Md5.digest_length]u8) !void {
const total_hashes = mem.alignForward(file_size, chunk_size) / chunk_size;
const hashes = try comp.gpa.alloc([Md5.digest_length]u8, total_hashes);
defer comp.gpa.free(hashes);
var hasher = Hasher(Md5){};
try hasher.hash(comp.gpa, comp.thread_pool, file, hashes, .{
.chunk_size = chunk_size,
.max_file_size = file_size,
});
const final_buffer = try comp.gpa.alloc(u8, total_hashes * Md5.digest_length);
defer comp.gpa.free(final_buffer);
for (hashes) |hash, i| {
mem.copy(u8, final_buffer[i * Md5.digest_length ..][0..Md5.digest_length], &hash);
}
Md5.hash(final_buffer, out, .{});
conform(out);
}
inline fn conform(out: *[Md5.digest_length]u8) void {
// LC_UUID uuids should conform to RFC 4122 UUID version 4 & UUID version 5 formats
out[6] = (out[6] & 0x0F) | (3 << 4);
out[8] = (out[8] & 0x3F) | 0x80;
}

@@ -16,7 +16,6 @@ const link = @import("../../link.zig");
const load_commands = @import("load_commands.zig");
const thunks = @import("thunks.zig");
const trace = @import("../../tracy.zig").trace;
const uuid = @import("uuid.zig");
const Allocator = mem.Allocator;
const Archive = @import("Archive.zig");
@@ -26,7 +25,9 @@ const CodeSignature = @import("CodeSignature.zig");
const Compilation = @import("../../Compilation.zig");
const DwarfInfo = @import("DwarfInfo.zig");
const Dylib = @import("Dylib.zig");
const Hasher = @import("hasher.zig").ParallelHasher;
const MachO = @import("../MachO.zig");
const Md5 = std.crypto.hash.Md5;
const LibStub = @import("../tapi.zig").LibStub;
const Object = @import("Object.zig");
const StringTable = @import("../strtab.zig").StringTable;
@@ -2680,17 +2681,98 @@ pub const Zld = struct {
// In Debug we don't really care about reproducibility, so put in a random value
// and be done with it.
std.crypto.random.bytes(&self.uuid_cmd.uuid);
Md5.hash(&self.uuid_cmd.uuid, &self.uuid_cmd.uuid, .{});
conformUuid(&self.uuid_cmd.uuid);
},
else => {
const seg = self.getLinkeditSegmentPtr();
const file_size = seg.fileoff + seg.filesize;
try uuid.calcUuidParallel(comp, self.file, file_size, &self.uuid_cmd.uuid);
const max_file_size = @intCast(u32, seg.fileoff + seg.filesize);
var hashes = std.ArrayList([Md5.digest_length]u8).init(self.gpa);
defer hashes.deinit();
if (!self.options.strip) {
// First exclusion region will comprise all symbol stabs.
const nlocals = self.dysymtab_cmd.nlocalsym;
const locals_buf = try self.gpa.alloc(u8, nlocals * @sizeOf(macho.nlist_64));
defer self.gpa.free(locals_buf);
const amt = try self.file.preadAll(locals_buf, self.symtab_cmd.symoff);
if (amt != locals_buf.len) return error.InputOutput;
const locals = @ptrCast([*]macho.nlist_64, @alignCast(@alignOf(macho.nlist_64), locals_buf))[0..nlocals];
const istab: usize = for (locals) |local, i| {
if (local.stab()) break i;
} else locals.len;
const nstabs = locals.len - istab;
// Next, a subsection of the strtab.
// We do not care about anything succeeding strtab as it is the code signature data which is
// not part of the UUID calculation anyway.
const stab_stroff = locals[istab].n_strx;
const first_cut = FileSubsection{
.start = 0,
.end = @intCast(u32, self.symtab_cmd.symoff + istab * @sizeOf(macho.nlist_64)),
};
const second_cut = FileSubsection{
.start = first_cut.end + @intCast(u32, nstabs * @sizeOf(macho.nlist_64)),
.end = self.symtab_cmd.stroff + stab_stroff,
};
for (&[_]FileSubsection{ first_cut, second_cut }) |cut| {
try self.calcUuidHashes(comp, cut, &hashes);
}
} else {
try self.calcUuidHashes(comp, .{ .start = 0, .end = max_file_size }, &hashes);
}
const final_buffer = try self.gpa.alloc(u8, hashes.items.len * Md5.digest_length);
defer self.gpa.free(final_buffer);
for (hashes.items) |hash, i| {
mem.copy(u8, final_buffer[i * Md5.digest_length ..][0..Md5.digest_length], &hash);
}
Md5.hash(final_buffer, &self.uuid_cmd.uuid, .{});
conformUuid(&self.uuid_cmd.uuid);
},
}
const in_file = @sizeOf(macho.mach_header_64) + offset + @sizeOf(macho.load_command);
try self.file.pwriteAll(&self.uuid_cmd.uuid, in_file);
}
inline fn conformUuid(out: *[Md5.digest_length]u8) void {
// LC_UUID uuids should conform to RFC 4122 UUID version 4 & UUID version 5 formats
out[6] = (out[6] & 0x0F) | (3 << 4);
out[8] = (out[8] & 0x3F) | 0x80;
}
const FileSubsection = struct {
start: u32,
end: u32,
};
fn calcUuidHashes(
self: *Zld,
comp: *const Compilation,
cut: FileSubsection,
hashes: *std.ArrayList([Md5.digest_length]u8),
) !void {
const chunk_size = 0x4000;
const total_hashes = mem.alignForward(cut.end - cut.start, chunk_size) / chunk_size;
// Append total_hashes new slots and hash only into those, so digests from a
// previous cut are preserved.
const first_hash = hashes.items.len;
try hashes.resize(first_hash + total_hashes);
var hasher = Hasher(Md5){};
try hasher.hash(self.gpa, comp.thread_pool, self.file, hashes.items[first_hash..], .{
.chunk_size = chunk_size,
.file_pos = cut.start,
.max_file_size = cut.end - cut.start,
});
}
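As a worked example with hypothetical numbers: if symtab_cmd.symoff = 0x8000, dysymtab_cmd.nlocalsym = 100 with the first 60 locals being regular symbols and the last 40 being stabs (istab = 60), symtab_cmd.stroff = 0xA000, and the first stab string sits at n_strx = 0x150, then, since @sizeOf(macho.nlist_64) is 16 bytes, the two contributing regions are [0x0, 0x8000 + 60 * 16) = [0x0, 0x83C0) and [0x8000 + 100 * 16, 0xA000 + 0x150) = [0x8640, 0xA150). Each region is chunked and hashed independently, and the concatenated per-chunk digests are hashed once more to produce the UUID.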
fn writeCodeSignaturePadding(self: *Zld, code_sig: *CodeSignature) !void {
const seg = self.getLinkeditSegmentPtr();
// Code signature data has to be 16-bytes aligned for Apple tools to recognize the file