10#include "unreachable.h"
11#include "siphash_rng.h"
/* Scheduling target: instructions are placed into cycles 0..TARGET_CYCLE-1
 * (generation stops accepting instructions scheduled at or past this cycle). */
14#define TARGET_CYCLE 192
/* A generated program is accepted only if it hits these exact totals;
 * note the final check compares ctx.latency against REQUIREMENT_LATENCY - 1. */
17#define REQUIREMENT_SIZE 512
18#define REQUIREMENT_MUL_COUNT 192
19#define REQUIREMENT_LATENCY 195
/* Register index avoided as the ADD_RS destination and special-cased as a
 * source (see select_destination/select_source). Index 5 maps to x86 r13,
 * which presumably needs a displacement byte in SIB addressing — confirm. */
22#define REGISTER_NEEDS_DISPLACEMENT 5
/* Port-occupancy map length: a few slack cycles past the target so the last
 * instructions can still spill over. */
24#define PORT_MAP_SIZE (TARGET_CYCLE + 4)
/* Branch condition: a mask with exactly LOG2_BRANCH_PROB bits set, i.e. the
 * branch is taken with probability 2^-LOG2_BRANCH_PROB on random input. */
27#define LOG2_BRANCH_PROB 4
/* Sentinel immediate_mask value identifying a branch template. */
28#define BRANCH_MASK 0x80000000
/* Debug tracing; a no-op unless TRACE is enabled. */
31#define TRACE_PRINT(...) do { if (TRACE) printf(__VA_ARGS__); } while (false)
/* NOTE(review): classic double-evaluation macro; all visible uses pass
 * side-effect-free expressions, so this is safe as used here. */
33#define MAX(a,b) ((a) > (b) ? (a) : (b))
/* True for any multiplication instruction (wide or 64-bit). Relies on the
 * instr_type enum ordering placing all multiply types at or before
 * INSTR_MUL_R. */
36static inline bool is_mul(instr_type type) {
37 return type <= INSTR_MUL_R;
40#ifdef HASHX_PROGRAM_STATS
/* True only for the wide (128-bit-result) multiplies UMULH/SMULH; relies on
 * them being ordered strictly before INSTR_MUL_R in the enum. Stats-only. */
42static inline bool is_wide_mul(instr_type type) {
43 return type < INSTR_MUL_R;
/* Bitmask of CPU execution ports a micro-op may issue to (modeling Intel
 * ports P0/P1/P5). Combined values mean "any of these ports". */
48typedef enum execution_port {
53 PORT_P01 = PORT_P0 | PORT_P1,
54 PORT_P05 = PORT_P0 | PORT_P5,
55 PORT_P015 = PORT_P0 | PORT_P1 | PORT_P5
/* Trace-output names, indexed directly by the port bitmask value. */
58static const char* execution_port_names[] = {
59 "PORT_NONE",
"PORT_P0",
"PORT_P1",
"PORT_P01",
"PORT_P5",
"PORT_P05",
"PORT_P15",
"PORT_P015"
/* Mask ANDed onto the random immediate; 0 means the template takes no
 * immediate, BRANCH_MASK marks a branch template (see instr_from_template). */
69 uint32_t immediate_mask;
/* Port occupancy map: ports[cycle][port] holds the uop occupying that port
 * in that cycle, or PORT_NONE (0) if free. Column order is P0, P1, P5. */
99 execution_port ports[PORT_MAP_SIZE][NUM_PORTS];
/* Template: UMULH r,r — unsigned 64x64->128 multiply, high half.
 * Not distinct_dst: the destination may equal the source. */
103 .type = INSTR_UMULH_R,
110 .group = INSTR_UMULH_R,
111 .imm_can_be_0 =
false,
112 .distinct_dst =
false,
/* Template: SMULH r,r — signed 64x64->128 multiply, high half. */
119 .type = INSTR_SMULH_R,
126 .group = INSTR_SMULH_R,
127 .imm_can_be_0 =
false,
128 .distinct_dst =
false,
/* Template: MUL r,r — 64-bit multiply (low half). */
136 .x86_asm =
"imul r,r",
142 .group = INSTR_MUL_R,
143 .imm_can_be_0 =
false,
144 .distinct_dst =
true,
/* Template: SUB r,r — grouped with ADD_RS so the two don't repeat
 * back-to-back on the same operands. */
152 .x86_asm =
"sub r,r",
158 .group = INSTR_ADD_RS,
159 .imm_can_be_0 =
false,
160 .distinct_dst =
true,
/* Template: XOR r,r. */
168 .x86_asm =
"xor r,r",
174 .group = INSTR_XOR_R,
175 .imm_can_be_0 =
false,
176 .distinct_dst =
true,
/* Template: ADD_RS — add register shifted (emitted as LEA with scale).
 * imm_can_be_0 is true because the shift amount may legitimately be 0. */
183 .type = INSTR_ADD_RS,
184 .x86_asm =
"lea r,r+r*s",
190 .group = INSTR_ADD_RS,
191 .imm_can_be_0 =
true,
192 .distinct_dst =
true,
/* Template: ROR r,imm — rotate count masked to 0..63. */
200 .x86_asm =
"ror r,i",
205 .immediate_mask = 63,
206 .group = INSTR_ROR_C,
207 .imm_can_be_0 =
false,
208 .distinct_dst =
true,
/* Template: ADD r,imm32 — full 32-bit random immediate. */
216 .x86_asm =
"add r,i",
221 .immediate_mask = UINT32_MAX,
222 .group = INSTR_ADD_C,
223 .imm_can_be_0 =
false,
224 .distinct_dst =
true,
/* Template: XOR r,imm32. */
232 .x86_asm =
"xor r,i",
237 .immediate_mask = UINT32_MAX,
238 .group = INSTR_XOR_C,
239 .imm_can_be_0 =
false,
240 .distinct_dst =
true,
/* Template: TARGET — marks a branch target (emitted as cmovz). */
248 .type = INSTR_TARGET,
249 .x86_asm =
"cmovz esi, edi",
255 .group = INSTR_TARGET,
256 .imm_can_be_0 =
false,
257 .distinct_dst =
true,
/* Template: BRANCH — conditional jump; immediate_mask == BRANCH_MASK makes
 * instr_from_template generate the branch-condition mask instead. */
264 .type = INSTR_BRANCH,
265 .x86_asm =
"jz target",
270 .immediate_mask = BRANCH_MASK,
271 .group = INSTR_BRANCH,
272 .imm_can_be_0 =
false,
273 .distinct_dst =
true,
/* Program-layout slots: each sub-cycle position draws its template from one
 * of these lookup tables (multiply / target / branch / wide-multiply /
 * general). NOTE(review): the surrounding initializer braces are between the
 * visible lines of this extract. */
300 .templates = &mul_lookup,
307 .templates = &target_lookup,
314 .templates = &branch_lookup,
321 .templates = wide_mul_lookup,
328 .templates = instr_lookup,
/* Pick a template for the current sub-cycle from the 36-entry repeating
 * program layout. mask0 selects the normal template subset; on a retry
 * (attempt > 0) the wider mask1 is used instead. A zero mask0 means the slot
 * has exactly one template. */
374 const program_item* item = program_layout[ctx->sub_cycle % 36];
377 int index = item->mask0 ? hashx_siphash_rng_u8(&ctx->gen) & (attempt > 0 ? item->mask1 : item->mask0) : 0;
378 tpl = item->templates[index];
379 }
/* Re-draw while the slot forbids duplicates and we picked the same
 * instruction group as the previous instruction. */
while (!item->duplicates && tpl->group == last_instr);
/* Build a random 32-bit mask with exactly LOG2_BRANCH_PROB distinct bits
 * set: draw a random bit position and only accept it if not already set. */
386 while (popcnt < LOG2_BRANCH_PROB) {
387 int bit = hashx_siphash_rng_u8(gen) % 32;
388 uint32_t bitmask = 1U << bit;
389 if (!(mask & bitmask)) {
/* Instantiate a concrete instruction from a template: fill in the opcode,
 * draw the immediate (if any), and initialize the operand parameter. */
398 instr->opcode = tpl->type;
399 if (tpl->immediate_mask) {
/* Branch templates get the special LOG2_BRANCH_PROB-bit condition mask. */
400 if (tpl->immediate_mask == BRANCH_MASK) {
401 instr->imm32 = branch_mask(gen);
404 instr->imm32 = hashx_siphash_rng_u32(gen) & tpl->immediate_mask;
405 }
/* Re-draw while a zero immediate is not allowed for this template. */
while (instr->imm32 == 0 && !tpl->imm_can_be_0);
/* No source operand: op_par is either a sentinel (distinct_dst) or a fresh
 * random value, so the "same op, same operand" destination filter in
 * select_destination still behaves sensibly. */
407 if (!tpl->op_par_src) {
408 if (tpl->distinct_dst) {
409 instr->op_par = UINT32_MAX;
412 instr->op_par = hashx_siphash_rng_u32(gen);
/* Uniformly pick one register out of available_regs[0..regs_count) and store
 * it in *reg_out. NOTE(review): the empty-set guard (returning false when
 * regs_count == 0), the declaration of `index`, the single-candidate else
 * branch and the final `return true` fall between the visible lines of this
 * extract — confirm against the full source. */
423static bool select_register(
int available_regs[8],
int regs_count,
siphash_rng* gen,
int* reg_out) {
429 if (regs_count > 1) {
430 index = hashx_siphash_rng_u32(gen) % regs_count;
435 *reg_out = available_regs[index];
/* Collect the set of registers eligible as the destination, then pick one at
 * random. The filter is computed branchlessly with &/| on bools. */
440 int available_regs[8];
452 for (
int i = 0; i < 8; ++i) {
/* Register value must be ready by the instruction's scheduled cycle. */
453 bool available = ctx->registers[i].latency <= cycle;
/* distinct_dst templates may not write their own source register. */
454 available &= ((!tpl->distinct_dst) | (i != instr->src));
/* Unless multiply chaining is enabled (retry mode), don't feed a multiply
 * into a register last written by a multiply. */
455 available &= (ctx->chain_mul | (tpl->group != INSTR_MUL_R) | (ctx->registers[i].last_op != INSTR_MUL_R));
/* Don't repeat the same operation group with the same operand parameter on
 * the same register (would be a trivially simplifiable sequence). */
456 available &= ((ctx->registers[i].last_op != tpl->group) | (ctx->registers[i].last_op_par != instr->op_par));
/* ADD_RS cannot target the displacement-requiring register. */
457 available &= ((instr->opcode != INSTR_ADD_RS) | (i != REGISTER_NEEDS_DISPLACEMENT));
/* Branchless append: slot is overwritten next iteration if not available. */
458 available_regs[regs_count] = available ? i : 0;
459 regs_count += available;
461 return select_register(available_regs, regs_count, &ctx->gen, &instr->dst);
/* Collect registers whose value is ready by `cycle` and pick one as the
 * source operand; op_par records the choice for the duplicate filter. */
465 int available_regs[8];
468 for (
int i = 0; i < 8; ++i) {
469 if (ctx->registers[i].latency <= cycle)
470 available_regs[regs_count++] = i;
/* Special case: with exactly two candidates for ADD_RS, prefer the
 * displacement-requiring register as the source (it is banned as the
 * destination, so using it here keeps the other register writable). */
473 if (regs_count == 2 && instr->opcode == INSTR_ADD_RS) {
474 if (available_regs[0] == REGISTER_NEEDS_DISPLACEMENT || available_regs[1] == REGISTER_NEEDS_DISPLACEMENT) {
475 instr->op_par = instr->src = REGISTER_NEEDS_DISPLACEMENT;
479 if (select_register(available_regs, regs_count, &ctx->gen, &instr->src)) {
481 instr->op_par = instr->src;
/* Find the earliest cycle >= `cycle` with a free execution port matching the
 * uop's port mask, preferring P5, then P0, then P1 (P5 first because it is
 * the scarcest match). Returns the chosen cycle; the port-array writes and
 * trace prints only happen when `commit` is true. NOTE(review): the
 * `if (commit)` guards and the `return cycle;` statements fall between the
 * visible lines of this extract — confirm against the full source. */
487static int schedule_uop(execution_port uop,
generator_ctx* ctx,
int cycle,
bool commit) {
490 for (; cycle < PORT_MAP_SIZE; ++cycle) {
/* ports[cycle][2] is the P5 slot. */
491 if ((uop & PORT_P5) && !ctx->ports[cycle][2]) {
493 ctx->ports[cycle][2] = uop;
495 TRACE_PRINT(
"%s scheduled to port P5 at cycle %i (commit = %i)\n", execution_port_names[uop], cycle, commit);
/* ports[cycle][0] is the P0 slot. */
498 if ((uop & PORT_P0) && !ctx->ports[cycle][0]) {
500 ctx->ports[cycle][0] = uop;
502 TRACE_PRINT(
"%s scheduled to port P0 at cycle %i (commit = %i)\n", execution_port_names[uop], cycle, commit);
/* ports[cycle][1] is the P1 slot. */
505 if ((uop & PORT_P1) != 0 && !ctx->ports[cycle][1]) {
507 ctx->ports[cycle][1] = uop;
509 TRACE_PRINT(
"%s scheduled to port P1 at cycle %i (commit = %i)\n", execution_port_names[uop], cycle, commit);
/* Schedule a whole instruction. Single-uop instructions delegate directly to
 * schedule_uop; two-uop instructions must have both uops land in the SAME
 * cycle, so each candidate cycle is dry-run first and committed only if both
 * uops fit there. */
517 if (tpl->uop2 == PORT_NONE) {
519 return schedule_uop(tpl->uop1, ctx, ctx->cycle, commit);
523 for (
int cycle = ctx->cycle; cycle < PORT_MAP_SIZE; ++cycle) {
/* Dry runs: probe where each uop would go if issued at `cycle`. */
525 int cycle1 = schedule_uop(tpl->uop1, ctx, cycle,
false);
526 int cycle2 = schedule_uop(tpl->uop2, ctx, cycle,
false);
/* Both uops fit in the same cycle: commit them for real. */
528 if (cycle1 >= 0 && cycle1 == cycle2) {
530 schedule_uop(tpl->uop1, ctx, cycle,
true);
531 schedule_uop(tpl->uop2, ctx, cycle,
true);
/* Debug dump: print the ready-cycle (latency) of each of the 8 program
 * registers. */
542 for (
int i = 0; i < 8; ++i) {
543 printf(
" R%i = %i\n", i, ctx->registers[i].latency);
/* Program generation: seed the SipHash RNG from the key, then repeatedly
 * pick a template, instantiate it, schedule it onto execution ports, and
 * select source/destination registers, retrying on stalls. At the end the
 * program is accepted only if it matches the exact size / multiply-count /
 * latency requirements. */
556 hashx_siphash_rng_init(&ctx.gen, key);
557#ifdef HASHX_RNG_CALLBACK
558 ctx.gen.callback = program->rng_callback;
559 ctx.gen.callback_user_data = program->rng_callback_user_data;
/* All registers start ready at cycle 0 with no operation history. */
561 for (
int i = 0; i < 8; ++i) {
562 ctx.registers[i].last_op = -1;
563 ctx.registers[i].latency = 0;
564 ctx.registers[i].last_op_par = (uint32_t)-1;
566 program->code_size = 0;
569 instr_type last_instr = -1;
570#ifdef HASHX_PROGRAM_STATS
571 program->x86_size = 0;
/* Main generation loop: one iteration per emitted (or retried) instruction. */
574 while (program->code_size < HASHX_PROGRAM_MAX_SIZE) {
575 instruction* instr = &program->code[program->code_size];
576 TRACE_PRINT(
"CYCLE: %i/%i\n", ctx.sub_cycle, ctx.cycle);
/* Template selection honors the layout schedule and the no-duplicate rule. */
579 const instr_template* tpl = select_template(&ctx, last_instr, attempt);
580 last_instr = tpl->group;
582 TRACE_PRINT(
"Template: %s\n", tpl->x86_asm);
584 instr_from_template(tpl, &ctx.gen, instr);
/* Dry-run scheduling to learn the issue cycle before picking operands. */
587 int scheduleCycle = schedule_instr(tpl, &ctx,
false);
588 if (scheduleCycle < 0) {
589 TRACE_PRINT(
"Unable to map operation '%s' to execution port (cycle %i)\n", tpl->x86_asm, ctx.cycle);
/* On a retry, relax the multiply-chaining restriction in
 * select_destination so a destination can still be found. */
594 ctx.chain_mul = attempt > 0;
/* Source selection; stall and retry (up to MAX_RETRIES) if no register is
 * ready by the scheduled cycle. */
598 if (!select_source(tpl, instr, &ctx, scheduleCycle)) {
599 TRACE_PRINT(
"; src STALL (attempt %i)\n", attempt);
600 if (attempt++ < MAX_RETRIES) {
604 printf(
"; select_source FAILED at cycle %i\n", ctx.cycle);
605 print_registers(&ctx);
/* 3 sub-cycles per cycle: re-derive the cycle after advancing. */
609 ctx.cycle = ctx.sub_cycle / 3;
613 TRACE_PRINT(
"; src = r%i\n", instr->src);
/* Destination selection with the same stall/retry handling. */
618 if (!select_destination(tpl, instr, &ctx, scheduleCycle)) {
619 TRACE_PRINT(
"; dst STALL (attempt %i)\n", attempt);
620 if (attempt++ < MAX_RETRIES) {
624 printf(
"; select_destination FAILED at cycle %i\n", ctx.cycle);
625 print_registers(&ctx);
629 ctx.cycle = ctx.sub_cycle / 3;
633 TRACE_PRINT(
"; dst = r%i\n", instr->dst);
/* Operands chosen: commit the instruction's uops onto the ports for real. */
638 scheduleCycle = schedule_instr(tpl, &ctx,
true);
640 if (scheduleCycle < 0) {
641 TRACE_PRINT(
"Unable to map operation '%s' to execution port (cycle %i)\n", tpl->x86_asm, ctx.cycle);
/* Stop once scheduling spills past the target cycle window. */
646 if (scheduleCycle >= TARGET_CYCLE) {
/* Retire: the destination register becomes ready again after the
 * template's latency; record history for the duplicate filters. */
652 int retireCycle = scheduleCycle + tpl->latency;
653 ri->latency = retireCycle;
654 ri->last_op = tpl->group;
655 ri->last_op_par = instr->op_par;
656 ctx.latency =
MAX(retireCycle, ctx.latency);
657 TRACE_PRINT(
"; RETIRED at cycle %i\n", retireCycle);
660 program->code_size++;
661#ifdef HASHX_PROGRAM_STATS
662 program->x86_size += tpl->x86_size;
665 ctx.mul_count += is_mul(instr->opcode);
/* Two-uop instructions consume an extra sub-cycle slot. */
668 ctx.sub_cycle += (tpl->uop2 != PORT_NONE);
669 ctx.cycle = ctx.sub_cycle / 3;
/* --- optional statistics: ASIC/CPU latency model and counters --- */
672#ifdef HASHX_PROGRAM_STATS
673 memset(program->asic_latencies, 0,
sizeof(program->asic_latencies));
675 program->counter = ctx.gen.counter;
676 program->wide_mul_count = 0;
677 program->mul_count = ctx.mul_count;
/* ASIC latency model: every instruction costs 1 cycle after its inputs. */
681 for (
size_t i = 0; i < program->code_size; ++i) {
685 int last_dst = program->asic_latencies[instr->dst] + 1;
686 int lat_src = instr->dst != instr->src ? program->asic_latencies[instr->src] + 1 : 0;
687 program->asic_latencies[instr->dst] =
MAX(last_dst, lat_src);
688 program->wide_mul_count += is_wide_mul(instr->opcode);
/* Overall latency = the slowest register in either model. */
691 program->asic_latency = 0;
692 program->cpu_latency = 0;
693 for (
int i = 0; i < 8; ++i) {
694 program->asic_latency =
MAX(program->asic_latency, program->asic_latencies[i]);
695 program->cpu_latencies[i] = ctx.registers[i].latency;
696 program->cpu_latency =
MAX(program->cpu_latency, program->cpu_latencies[i]);
699 program->ipc = program->code_size / (double)program->cpu_latency;
700 program->branch_count = 0;
701 memset(program->branches, 0,
sizeof(program->branches));
/* Trace: ASCII map of port occupancy per cycle. */
704 printf(
"; ALU port utilization:\n");
705 printf(
"; (* = in use, _ = idle)\n");
706 for (
int i = 0; i < PORT_MAP_SIZE; ++i) {
708 for (
int j = 0; j < NUM_PORTS; ++j) {
709 printf(
"%c", (ctx.ports[i][j] ?
'*' :
'_'));
/* Acceptance: the program must match all requirements exactly (this keeps
 * every generated program uniform in size, multiply count and latency). */
719 (program->code_size == REQUIREMENT_SIZE) &&
720 (ctx.mul_count == REQUIREMENT_MUL_COUNT) &&
721 (ctx.latency == REQUIREMENT_LATENCY - 1);
/* Maps program register index 0..7 to the x86-64 registers r8..r15 used by
 * the generated assembly. */
724static const char* x86_reg_map[] = {
"r8",
"r9",
"r10",
"r11",
"r12",
"r13",
"r14",
"r15" };
/* Print the program as human-readable x86-64 assembly, one instruction per
 * line, dispatching on the opcode. */
728 for (
size_t i = 0; i < program->code_size; ++i) {
730 switch (instr->opcode)
733 printf(
"sub %s, %s\n", x86_reg_map[instr->dst], x86_reg_map[instr->src]);
736 printf(
"xor %s, %s\n", x86_reg_map[instr->dst], x86_reg_map[instr->src]);
/* ADD_RS: add with shifted source, emitted as LEA; imm32 is the shift, so
 * 1 << imm32 is the LEA scale factor. */
739 printf(
"lea %s, [%s+%s*%u]\n", x86_reg_map[instr->dst], x86_reg_map[instr->dst], x86_reg_map[instr->src], 1 << instr->imm32);
742 printf(
"imul %s, %s\n", x86_reg_map[instr->dst], x86_reg_map[instr->src]);
745 printf(
"ror %s, %u\n", x86_reg_map[instr->dst], instr->imm32);
748 printf(
"add %s, %i\n", x86_reg_map[instr->dst], instr->imm32);
751 printf(
"xor %s, %i\n", x86_reg_map[instr->dst], instr->imm32);
/* Wide unsigned multiply: one-operand MUL leaves the high half in rdx. */
754 printf(
"mov rax, %s\n", x86_reg_map[instr->dst]);
755 printf(
"mul %s\n", x86_reg_map[instr->src]);
756 printf(
"mov %s, rdx\n", x86_reg_map[instr->dst]);
/* Wide signed multiply: one-operand IMUL, high half from rdx. */
759 printf(
"mov rax, %s\n", x86_reg_map[instr->dst]);
760 printf(
"imul %s\n", x86_reg_map[instr->src]);
761 printf(
"mov %s, rdx\n", x86_reg_map[instr->dst]);
/* Branch target marker: label plus the cmovz from the TARGET template. */
764 printf(
"test edi, edi\n");
765 printf(
"target_%i: cmovz esi, edi\n", (
int)i);
/* Branch: test the condition mask and jump back to the recorded target. */
769 printf(
"or edx, esi\n");
770 printf(
"test edx, %i\n", instr->imm32);
771 printf(
"jz target_%i\n", (
int)target);