const std = @import("std");
const Allocator = std.mem.Allocator;

/// The kinds of atomic helper this tool generates.
///
/// `main` embeds the tag name in every emitted function name
/// (`__aarch64_<op><bytes>_<ordering>`), and `writeFunction` selects the
/// assembly generator from the tag.
const AtomicOp = enum {
    /// Assembly comes from `generateCas`.
    cas,
    /// Assembly comes from `generateSwp`.
    swp,
    /// Assembly comes from `generateLd`; its fallback loop combines values with `add`.
    ldadd,
    /// Assembly comes from `generateLd`; its fallback loop combines values with `bic`.
    ldclr,
    /// Assembly comes from `generateLd`; its fallback loop combines values with `eor`.
    ldeor,
    /// Assembly comes from `generateLd`; its fallback loop combines values with `orr`.
    ldset,
};

/// Prints the generated Zig source for the AArch64 outline atomics to stdout.
///
/// The output is made of three parts, in this order:
///   1. a fixed preamble (imports plus the `__aarch64_have_lse_atomics` variable),
///   2. one naked function per supported (size, ordering, operation) combination,
///   3. a `comptime` block holding one `@export` per generated function.
///
/// All temporary strings live in an arena that is released when `main` returns.
pub fn main() !void {
    var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena_state.deinit();
    const arena = arena_state.allocator();

    var out_buffer: [2000]u8 = undefined;
    var out_file_writer = std.fs.File.stdout().writerStreaming(&out_buffer);
    const w = &out_file_writer.interface;

    try w.writeAll(
        \\//! This file is generated by tools/gen_outline_atomics.zig.
        \\const builtin = @import("builtin");
        \\const std = @import("std");
        \\const common = @import("common.zig");
        \\const always_has_lse = builtin.cpu.has(.aarch64, .lse);
        \\
        \\/// This default is overridden at runtime after inspecting CPU properties.
        \\/// It is intentionally not exported in order to make the machine code that
        \\/// uses it a statically predicted direct branch rather than using the PLT,
        \\/// which ARM is concerned would have too much overhead.
        \\var __aarch64_have_lse_atomics: u8 = @intFromBool(always_has_lse);
        \\
        \\
    );

    // The `@export` statements are accumulated while the functions are being
    // streamed out, then written as a single `comptime` block at the end.
    var exports = std.array_list.Managed(u8).init(arena);
    try exports.appendSlice("\ncomptime {\n");

    const sizes = [_]N{ .one, .two, .four, .eight, .sixteen };
    const orderings = [_]Ordering{ .relax, .acq, .rel, .acq_rel };
    const ops = [_]AtomicOp{ .cas, .swp, .ldadd, .ldclr, .ldeor, .ldset };

    for (sizes) |size| {
        for (orderings) |ordering| {
            for (ops) |op| {
                // For the 16-byte size only the compare-and-swap helper is generated.
                const wanted = size != .sixteen or op == .cas;
                if (!wanted) continue;

                const symbol = try std.fmt.allocPrint(arena, "__aarch64_{s}{d}_{s}", .{
                    @tagName(op), size.toBytes(), @tagName(ordering),
                });
                try writeFunction(arena, w, symbol, op, size, ordering);

                const export_line = try std.fmt.allocPrint(
                    arena,
                    "    @export(&{s}, .{{ .name = \"{s}\", .linkage = common.linkage, .visibility = common.visibility }});\n",
                    .{ symbol, symbol },
                );
                try exports.appendSlice(export_line);
            }
        }
    }

    try w.writeAll(exports.items);
    try w.writeAll("}\n");
    try w.flush();
}

/// Writes the full Zig definition of one outline atomic function to `w`.
///
/// The assembly text for `op`, `n` and `order` is generated first. It is then
/// wrapped in a naked, 16-byte aligned function called `name` whose body is a
/// single `asm volatile` expression; each assembly line becomes one line of a
/// Zig multiline string literal. The emitted asm passes
/// `__aarch64_have_lse_atomics` in register w16.
///
/// `w` only needs to provide `writeAll`. Temporary strings are allocated from
/// `arena` and are not freed here.
fn writeFunction(
    arena: Allocator,
    w: anytype,
    name: []const u8,
    op: AtomicOp,
    n: N,
    order: Ordering,
) !void {
    const asm_text = switch (op) {
        .cas => try generateCas(arena, n, order),
        .swp => try generateSwp(arena, n, order),
        .ldadd => try generateLd(arena, n, order, .ldadd),
        .ldclr => try generateLd(arena, n, order, .ldclr),
        .ldeor => try generateLd(arena, n, order, .ldeor),
        .ldset => try generateLd(arena, n, order, .ldset),
    };

    // Function header up to and including the opening of the asm expression.
    const prologue = try std.fmt.allocPrint(arena,
        \\fn {s}() align(16) callconv(.naked) void {{
        \\    @setRuntimeSafety(false);
        \\    asm volatile (
        \\
    , .{name});
    try w.writeAll(prologue);

    // Eight spaces of indentation followed by the two backslashes that start
    // a line of a Zig multiline string literal.
    const line_prefix = "        \\\\";
    var asm_lines = std.mem.splitScalar(u8, asm_text, '\n');
    while (asm_lines.next()) |asm_line| {
        try w.writeAll(line_prefix);
        try w.writeAll(asm_line);
        try w.writeAll("\n");
    }

    // NOTE(review): the clobber list below is emitted as string literals;
    // confirm this matches the asm syntax accepted by the Zig version that
    // compiles the generated file (or that the output is run through zig fmt).
    try w.writeAll(
        \\        :
        \\        : [__aarch64_have_lse_atomics] "{w16}" (__aarch64_have_lse_atomics),
        \\        : "w15", "w16", "w17", "memory"
        \\    );
        \\    unreachable;
        \\}
        \\
    );
}

/// Operand size of a generated atomic helper; the tag value is the size in bytes.
const N = enum(u8) {
    one = 1,
    two = 2,
    four = 4,
    eight = 8,
    sixteen = 16,

    /// Size-dependent text fragments substituted into the assembly templates.
    const Defines = struct {
        /// Suffix appended to the load/store-exclusive mnemonics ("b", "h" or nothing).
        s: []const u8,
        /// Instruction that copies the expected value into the scratch
        /// register at the start of the compare-and-swap fallback.
        uxt: []const u8,
        /// Hex constant added to the base `.inst` encoding for this size.
        b: []const u8,
    };

    /// Returns the template fragments for this size.
    fn defines(n: N) Defines {
        return switch (n) {
            .one => .{ .s = "b", .uxt = "uxtb", .b = "0x00000000" },
            .two => .{ .s = "h", .uxt = "uxth", .b = "0x40000000" },
            .four => .{ .s = "", .uxt = "mov", .b = "0x80000000" },
            .eight => .{ .s = "", .uxt = "mov", .b = "0xc0000000" },
            .sixteen => .{ .s = "", .uxt = "mov", .b = "0x00000000" },
        };
    }

    /// Returns the register-name prefix for operands of this size:
    /// "w" below eight bytes, "x" otherwise.
    fn register(n: N) []const u8 {
        return switch (n) {
            .one, .two, .four => "w",
            .eight, .sixteen => "x",
        };
    }

    /// Size in bytes.
    fn toBytes(n: N) u8 {
        return @intFromEnum(n);
    }

    /// Size in bits (eight times the byte size).
    fn toBits(n: N) u8 {
        return 8 * n.toBytes();
    }
};

/// Memory ordering variant of a generated atomic helper. `main` uses the tag
/// name as the trailing part of each emitted function name.
const Ordering = enum {
    relax,
    acq,
    rel,
    acq_rel,

    /// Ordering-dependent text fragments substituted into the assembly templates.
    const Defines = struct {
        /// Underscore followed by the tag name.
        suff: []const u8,
        /// Inserted after `ld` in load-exclusive mnemonics ("a" or nothing).
        a: []const u8,
        /// Inserted after `st` in store-exclusive mnemonics ("l" or nothing).
        l: []const u8,
        /// Hex constant added to the `.inst` encodings built by `generateCas`.
        m: []const u8,
        /// Hex constant added to the `.inst` encodings built by `generateSwp`
        /// and `generateLd`.
        n: []const u8,
    };

    /// Returns the template fragments for this ordering.
    fn defines(self: @This()) Defines {
        return switch (self) {
            .relax => .{
                .suff = "_relax",
                .a = "",
                .l = "",
                .m = "0x000000",
                .n = "0x000000",
            },
            .acq => .{
                .suff = "_acq",
                .a = "a",
                .l = "",
                .m = "0x400000",
                .n = "0x800000",
            },
            .rel => .{
                .suff = "_rel",
                .a = "",
                .l = "l",
                .m = "0x008000",
                .n = "0x400000",
            },
            .acq_rel => .{
                .suff = "_acq_rel",
                .a = "a",
                .l = "l",
                .m = "0x408000",
                .n = "0xc00000",
            },
        };
    }
};

/// The load-and-operate variants accepted by `generateLd`; the tags mirror
/// the `ld*` tags of `AtomicOp`.
const LdName = enum { ldadd, ldclr, ldeor, ldset };

/// Builds the assembly text of a compare-and-swap helper for size `n` and
/// ordering `order`.
///
/// Every variant starts with `cbz w16, 8f`: when w16 is zero the
/// `.inst`-encoded instruction is skipped and a load/store-exclusive retry
/// loop at label 8 is used instead. The 16-byte size uses a register-pair
/// loop (`ld*xp` / `st*xp`); all smaller sizes use a single-register loop
/// (`ld*xr` / `st*xr`).
///
/// The returned text is allocated from `arena`.
fn generateCas(arena: Allocator, n: N, order: Ordering) ![]const u8 {
    const ord = order.defines();

    // The 16-byte size is the only one that needs the pair form.
    if (n == .sixteen) {
        const pair_inst = try std.fmt.allocPrint(arena, ".inst 0x48207c82 + {s}", .{ord.m});
        const load_pair = try std.fmt.allocPrint(arena, "ld{s}xp", .{ord.a});
        const store_pair = try std.fmt.allocPrint(arena, "st{s}xp", .{ord.l});

        return std.fmt.allocPrint(arena,
            \\        cbz     w16, 8f
            \\        {[casp]s}
            \\        ret
            \\8:
            \\        mov    x16, x0
            \\        mov    x17, x1
            \\0:
            \\        {[ldxp]s}   x0, x1, [x4]
            \\        cmp    x0, x16
            \\        ccmp   x1, x17, #0, eq
            \\        bne    1f
            \\        {[stxp]s}   w15, x2, x3, [x4]
            \\        cbnz   w15, 0b
            \\1:
            \\        ret
        , .{
            .casp = pair_inst,
            .ldxp = load_pair,
            .stxp = store_pair,
        });
    }

    const size = n.defines();
    const single_inst = try std.fmt.allocPrint(arena, ".inst 0x08a07c41 + {s} + {s}", .{ size.b, ord.m });
    const load_excl = try std.fmt.allocPrint(arena, "ld{s}xr{s}", .{ ord.a, size.s });
    const store_excl = try std.fmt.allocPrint(arena, "st{s}xr{s}", .{ ord.l, size.s });

    return std.fmt.allocPrint(arena,
        \\        cbz     w16, 8f
        \\        {[cas]s}
        \\        ret
        \\8:
        \\        {[uxt]s}    {[reg]s}16, {[reg]s}0
        \\0:
        \\        {[ldxr]s}   {[reg]s}0, [x2]
        \\        cmp    {[reg]s}0, {[reg]s}16
        \\        bne    1f
        \\        {[stxr]s}   w17, {[reg]s}1, [x2]
        \\        cbnz   w17, 0b
        \\1:
        \\        ret
    , .{
        .cas = single_inst,
        .uxt = size.uxt,
        .ldxr = load_excl,
        .stxr = store_excl,
        .reg = n.register(),
    });
}

/// Builds the assembly text of a swap helper for size `n` and ordering `order`.
///
/// The text starts with `cbz w16, 8f`: when w16 is zero the `.inst`-encoded
/// instruction is skipped and the retry loop at label 8 runs instead. That
/// loop load-exclusives the current value from `[x1]` into register 0 and
/// store-exclusives the saved incoming value back, repeating until the store
/// succeeds. The `1:` label is emitted but nothing in this template branches
/// to it.
///
/// The returned text is allocated from `arena`.
fn generateSwp(arena: Allocator, n: N, order: Ordering) ![]const u8 {
    const size = n.defines();
    const ord = order.defines();

    return std.fmt.allocPrint(arena,
        \\        cbz     w16, 8f
        \\        .inst 0x38208020 + {[b]s} + {[n]s}
        \\        ret
        \\8:
        \\        mov    {[reg]s}16, {[reg]s}0
        \\0:
        \\        ld{[a]s}xr{[s]s}   {[reg]s}0, [x1]
        \\        st{[l]s}xr{[s]s}   w17, {[reg]s}16, [x1]
        \\        cbnz   w17, 0b
        \\1:
        \\        ret
    , .{
        .reg = n.register(),
        .s = size.s,
        .b = size.b,
        .a = ord.a,
        .l = ord.l,
        .n = ord.n,
    });
}

/// Builds the assembly text of a load-and-operate helper (`ld`) for size `n`
/// and ordering `order`.
///
/// The text starts with `cbz w16, 8f`: when w16 is zero the `.inst`-encoded
/// instruction is skipped and the retry loop at label 8 runs instead. That
/// loop load-exclusives the current value from `[x1]` into register 0,
/// combines it with the saved incoming value using the ALU instruction that
/// corresponds to `ld`, and store-exclusives the result, repeating until the
/// store succeeds. The `1:` label is emitted but nothing in this template
/// branches to it.
///
/// The returned text is allocated from `arena`.
fn generateLd(arena: Allocator, n: N, order: Ordering, ld: LdName) ![]const u8 {
    const Variant = struct {
        /// ALU instruction used in the fallback loop.
        mnemonic: []const u8,
        /// Hex constant added to the base `.inst` encoding for this operation.
        encoding_offset: []const u8,
    };
    const variant: Variant = switch (ld) {
        .ldadd => .{ .mnemonic = "add", .encoding_offset = "0x0000" },
        .ldclr => .{ .mnemonic = "bic", .encoding_offset = "0x1000" },
        .ldeor => .{ .mnemonic = "eor", .encoding_offset = "0x2000" },
        .ldset => .{ .mnemonic = "orr", .encoding_offset = "0x3000" },
    };

    const size = n.defines();
    const ord = order.defines();

    return std.fmt.allocPrint(arena,
        \\        cbz     w16, 8f
        \\        .inst 0x38200020 + {[op_n]s} + {[b]s} + {[n]s}
        \\        ret
        \\8:
        \\        mov    {[reg]s}16, {[reg]s}0
        \\0:
        \\        ld{[a]s}xr{[s]s}   {[reg]s}0, [x1]
        \\        {[op]s}     {[reg]s}17, {[reg]s}0, {[reg]s}16
        \\        st{[l]s}xr{[s]s}   w15, {[reg]s}17, [x1]
        \\        cbnz   w15, 0b
        \\1:
        \\        ret
    , .{
        .op = variant.mnemonic,
        .op_n = variant.encoding_offset,
        .reg = n.register(),
        .s = size.s,
        .b = size.b,
        .a = ord.a,
        .l = ord.l,
        .n = ord.n,
    });
}
