/* program.c — from the Tor 0.4.9.2-alpha-dev source tree (HashX program
 * generator); doxygen page-navigation chrome removed from this extract. */
1/* Copyright (c) 2020 tevador <tevador@gmail.com> */
2/* See LICENSE for licensing information */
3
4#include <stdbool.h>
5#include <stdio.h>
6#include <stdlib.h>
7#include <string.h>
8
9#include "program.h"
10#include "unreachable.h"
11#include "siphash_rng.h"
12
/* instructions are generated until this CPU cycle */
#define TARGET_CYCLE 192

/* requirements for the program to be acceptable */
#define REQUIREMENT_SIZE 512
#define REQUIREMENT_MUL_COUNT 192
#define REQUIREMENT_LATENCY 195

/* R5 (x86 = r13) register cannot be used as the destination of INSTR_ADD_RS */
#define REGISTER_NEEDS_DISPLACEMENT 5

/* port occupancy map length: a few cycles of slack past TARGET_CYCLE */
#define PORT_MAP_SIZE (TARGET_CYCLE + 4)
/* number of modeled integer execution ports (P0, P1, P5) */
#define NUM_PORTS 3
/* how many times register selection is retried before stalling a cycle */
#define MAX_RETRIES 1
/* number of bits set in a branch condition mask (see branch_mask) */
#define LOG2_BRANCH_PROB 4
/* sentinel immediate_mask value that marks a branch template */
#define BRANCH_MASK 0x80000000

/* set to true for verbose generator tracing on stdout */
#define TRACE false
#define TRACE_PRINT(...) do { if (TRACE) printf(__VA_ARGS__); } while (false)

/* NOTE: evaluates both arguments more than once; only use with
 * side-effect-free expressions */
#define MAX(a,b) ((a) > (b) ? (a) : (b))
35/* If the instruction is a multiplication. */
36static inline bool is_mul(instr_type type) {
37 return type <= INSTR_MUL_R;
38}
39
#ifdef HASHX_PROGRAM_STATS
/* If the instruction is a 64x64->128 bit multiplication. */
static inline bool is_wide_mul(instr_type type) {
    return (type < INSTR_MUL_R) ? true : false;
}
#endif
46
/* Ivy Bridge integer execution ports: P0, P1, P5 */
typedef enum execution_port {
    PORT_NONE = 0,
    PORT_P0 = 1, /* single-bit flags so that port sets can be OR-ed */
    PORT_P1 = 2,
    PORT_P5 = 4,
    PORT_P01 = PORT_P0 | PORT_P1,
    PORT_P05 = PORT_P0 | PORT_P5,
    PORT_P015 = PORT_P0 | PORT_P1 | PORT_P5
} execution_port;
57
/* Printable names indexed by the execution_port bitmask value
 * (index 6 = P1|P5 = "PORT_P15"); used only by TRACE_PRINT. */
static const char* execution_port_names[] = {
    "PORT_NONE", "PORT_P0", "PORT_P1", "PORT_P01", "PORT_P5", "PORT_P05", "PORT_P15", "PORT_P015"
};
61
62typedef struct instr_template {
63 instr_type type; /* instruction type */
64 const char* x86_asm; /* x86 assembly */
65 int x86_size; /* x86 code size */
66 int latency; /* latency in cycles */
67 execution_port uop1; /* ex. ports for the 1st uop */
68 execution_port uop2; /* ex. ports for the 2nd uop */
69 uint32_t immediate_mask; /* mask for imm32 */
70 instr_type group; /* instruction group */
71 bool imm_can_be_0; /* if imm32 can be zero */
72 bool distinct_dst; /* if dst and src must be distinct */
73 bool op_par_src; /* operation parameter is equal to src */
74 bool has_src; /* if the instruction has a source operand */
75 bool has_dst; /* if the instr. has a destination operand */
77
78typedef struct register_info {
79 int latency; /* cycle when the register value will be ready */
80 instr_type last_op; /* last op applied to the register */
81 uint32_t last_op_par; /* parameter of the last op (~0 = constant) */
83
84typedef struct program_item {
85 const instr_template** templates;
86 uint32_t mask0;
87 uint32_t mask1;
88 bool duplicates;
90
91typedef struct generator_ctx {
92 int cycle;
93 int sub_cycle;
94 int mul_count;
95 bool chain_mul;
96 int latency;
97 siphash_rng gen;
98 register_info registers[8];
99 execution_port ports[PORT_MAP_SIZE][NUM_PORTS];
101
/* UMULH_R: dst = high 64 bits of dst * src, unsigned ("mul" + rdx move,
 * see hashx_program_asm_x86); 2 uOPs on P1 and P5. */
static const instr_template tpl_umulh_r = {
    .type = INSTR_UMULH_R,
    .x86_asm = "mul r",
    .x86_size = 9, /* mov, mul, mov */
    .latency = 4,
    .uop1 = PORT_P1,
    .uop2 = PORT_P5,
    .immediate_mask = 0,
    .group = INSTR_UMULH_R,
    .imm_can_be_0 = false,
    .distinct_dst = false,
    .op_par_src = false,
    .has_src = true,
    .has_dst = true,
};
117
/* SMULH_R: signed variant of UMULH_R ("imul" one-operand form). */
static const instr_template tpl_smulh_r = {
    .type = INSTR_SMULH_R,
    .x86_asm = "imul r",
    .x86_size = 9, /* mov, mul, mov */
    .latency = 4,
    .uop1 = PORT_P1,
    .uop2 = PORT_P5,
    .immediate_mask = 0,
    .group = INSTR_SMULH_R,
    .imm_can_be_0 = false,
    .distinct_dst = false,
    .op_par_src = false,
    .has_src = true,
    .has_dst = true,
};
133
/* MUL_R: dst *= src (two-operand imul); single uOP on P1, 3-cycle latency. */
static const instr_template tpl_mul_r = {
    .type = INSTR_MUL_R,
    .x86_asm = "imul r,r",
    .x86_size = 4,
    .latency = 3,
    .uop1 = PORT_P1,
    .uop2 = PORT_NONE,
    .immediate_mask = 0,
    .group = INSTR_MUL_R,
    .imm_can_be_0 = false,
    .distinct_dst = true,
    .op_par_src = true,
    .has_src = true,
    .has_dst = true,
};
149
/* SUB_R: dst -= src; shares the ADD_RS group so the two are treated as one
 * family by the duplicate-avoidance rules. */
static const instr_template tpl_sub_r = {
    .type = INSTR_SUB_R,
    .x86_asm = "sub r,r",
    .x86_size = 3,
    .latency = 1,
    .uop1 = PORT_P015,
    .uop2 = PORT_NONE,
    .immediate_mask = 0,
    .group = INSTR_ADD_RS,
    .imm_can_be_0 = false,
    .distinct_dst = true,
    .op_par_src = true,
    .has_src = true,
    .has_dst = true,
};
165
/* XOR_R: dst ^= src; any ALU port. */
static const instr_template tpl_xor_r = {
    .type = INSTR_XOR_R,
    .x86_asm = "xor r,r",
    .x86_size = 3,
    .latency = 1,
    .uop1 = PORT_P015,
    .uop2 = PORT_NONE,
    .immediate_mask = 0,
    .group = INSTR_XOR_R,
    .imm_can_be_0 = false,
    .distinct_dst = true,
    .op_par_src = true,
    .has_src = true,
    .has_dst = true,
};
181
/* ADD_RS: dst += src << imm via lea; imm is masked to 0..3 and r5 cannot be
 * the destination (REGISTER_NEEDS_DISPLACEMENT). */
static const instr_template tpl_add_rs = {
    .type = INSTR_ADD_RS,
    .x86_asm = "lea r,r+r*s",
    .x86_size = 4,
    .latency = 1,
    .uop1 = PORT_P01,
    .uop2 = PORT_NONE,
    .immediate_mask = 3,
    .group = INSTR_ADD_RS,
    .imm_can_be_0 = true,
    .distinct_dst = true,
    .op_par_src = true,
    .has_src = true,
    .has_dst = true,
};
197
/* ROR_C: rotate right by a constant 1..63 (imm of 0 is rejected). */
static const instr_template tpl_ror_c = {
    .type = INSTR_ROR_C,
    .x86_asm = "ror r,i",
    .x86_size = 4,
    .latency = 1,
    .uop1 = PORT_P05,
    .uop2 = PORT_NONE,
    .immediate_mask = 63,
    .group = INSTR_ROR_C,
    .imm_can_be_0 = false,
    .distinct_dst = true,
    .op_par_src = false,
    .has_src = false,
    .has_dst = true,
};
213
/* ADD_C: add a nonzero 32-bit constant. */
static const instr_template tpl_add_c = {
    .type = INSTR_ADD_C,
    .x86_asm = "add r,i",
    .x86_size = 7,
    .latency = 1,
    .uop1 = PORT_P015,
    .uop2 = PORT_NONE,
    .immediate_mask = UINT32_MAX,
    .group = INSTR_ADD_C,
    .imm_can_be_0 = false,
    .distinct_dst = true,
    .op_par_src = false,
    .has_src = false,
    .has_dst = true,
};
229
/* XOR_C: xor with a nonzero 32-bit constant. */
static const instr_template tpl_xor_c = {
    .type = INSTR_XOR_C,
    .x86_asm = "xor r,i",
    .x86_size = 7,
    .latency = 1,
    .uop1 = PORT_P015,
    .uop2 = PORT_NONE,
    .immediate_mask = UINT32_MAX,
    .group = INSTR_XOR_C,
    .imm_can_be_0 = false,
    .distinct_dst = true,
    .op_par_src = false,
    .has_src = false,
    .has_dst = true,
};
245
246
/* TARGET: marks a branch target (emitted as a test + cmovz pair); has no
 * register operands of its own. */
static const instr_template tpl_target = {
    .type = INSTR_TARGET,
    .x86_asm = "cmovz esi, edi",
    .x86_size = 5, /* test, cmovz */
    .latency = 1,
    .uop1 = PORT_P015,
    .uop2 = PORT_P015,
    .immediate_mask = 0,
    .group = INSTR_TARGET,
    .imm_can_be_0 = false,
    .distinct_dst = true,
    .op_par_src = false,
    .has_src = false,
    .has_dst = false,
};
262
/* BRANCH: conditional jump back to the most recent TARGET; BRANCH_MASK in
 * immediate_mask makes instr_from_template draw imm32 from branch_mask(). */
static const instr_template tpl_branch = {
    .type = INSTR_BRANCH,
    .x86_asm = "jz target",
    .x86_size = 10, /* or, test, jz */
    .latency = 1,
    .uop1 = PORT_P015,
    .uop2 = PORT_P015,
    .immediate_mask = BRANCH_MASK,
    .group = INSTR_BRANCH,
    .imm_can_be_0 = false,
    .distinct_dst = true,
    .op_par_src = false,
    .has_src = false,
    .has_dst = false,
};
278
/* Candidate templates for "any" slots, indexed by a 3-bit RNG value
 * (item_any.mask0 = 7). add_c and xor_c appear twice, doubling their
 * selection weight; the retry mask (3) restricts choice to indices 0-3,
 * which are the templates without a source register. */
static const instr_template* instr_lookup[] = {
    &tpl_ror_c,
    &tpl_xor_c,
    &tpl_add_c,
    &tpl_add_c,
    &tpl_sub_r,
    &tpl_xor_r,
    &tpl_xor_c,
    &tpl_add_rs,
};
289
/* Wide-multiply slot: one random bit picks signed or unsigned high-mul. */
static const instr_template* wide_mul_lookup[] = {
    &tpl_smulh_r,
    &tpl_umulh_r
};
294
/* Single-template "lookup tables" for the fixed layout slots; program_item
 * expects a pointer-to-array, hence the indirection. */
static const instr_template* mul_lookup = &tpl_mul_r;
static const instr_template* target_lookup = &tpl_target;
static const instr_template* branch_lookup = &tpl_branch;
298
/* Slot that always emits MUL_R (mask0 = 0: index 0 taken, no RNG consumed). */
static const program_item item_mul = {
    .templates = &mul_lookup,
    .mask0 = 0,
    .mask1 = 0,
    .duplicates = true
};
305
/* Slot that always emits the branch-target marker. */
static const program_item item_target = {
    .templates = &target_lookup,
    .mask0 = 0,
    .mask1 = 0,
    .duplicates = true
};
312
/* Slot that always emits a branch. */
static const program_item item_branch = {
    .templates = &branch_lookup,
    .mask0 = 0,
    .mask1 = 0,
    .duplicates = true
};
319
/* Slot that emits SMULH_R or UMULH_R, chosen by one random bit. */
static const program_item item_wide_mul = {
    .templates = wide_mul_lookup,
    .mask0 = 1,
    .mask1 = 1,
    .duplicates = true
};
326
/* Slot drawing from the full 8-entry instr_lookup; consecutive instructions
 * of the same group are rejected (duplicates = false). */
static const program_item item_any = {
    .templates = instr_lookup,
    .mask0 = 7,
    .mask1 = 3, /* instructions that don't need a src register */
    .duplicates = false
};
333
/* Repeating 36-slot instruction schedule, indexed by sub_cycle % 36
 * (see select_template). Per period: 10 MUL_R, 2 wide MULs, 1 branch
 * target, 1 branch, and 22 "any" slots. */
static const program_item* program_layout[] = {
    &item_mul,
    &item_target,
    &item_any,
    &item_mul,
    &item_any,
    &item_any,
    &item_mul,
    &item_any,
    &item_any,
    &item_mul,
    &item_any,
    &item_any,
    &item_wide_mul,
    &item_any,
    &item_any,
    &item_mul,
    &item_any,
    &item_any,
    &item_mul,
    &item_branch,
    &item_any,
    &item_mul,
    &item_any,
    &item_any,
    &item_wide_mul,
    &item_any,
    &item_any,
    &item_mul,
    &item_any,
    &item_any,
    &item_mul,
    &item_any,
    &item_any,
    &item_mul,
    &item_any,
    &item_any,
};
372
373static const instr_template* select_template(generator_ctx* ctx, instr_type last_instr, int attempt) {
374 const program_item* item = program_layout[ctx->sub_cycle % 36];
375 const instr_template* tpl;
376 do {
377 int index = item->mask0 ? hashx_siphash_rng_u8(&ctx->gen) & (attempt > 0 ? item->mask1 : item->mask0) : 0;
378 tpl = item->templates[index];
379 } while (!item->duplicates && tpl->group == last_instr);
380 return tpl;
381}
382
383static uint32_t branch_mask(siphash_rng* gen) {
384 uint32_t mask = 0;
385 int popcnt = 0;
386 while (popcnt < LOG2_BRANCH_PROB) {
387 int bit = hashx_siphash_rng_u8(gen) % 32;
388 uint32_t bitmask = 1U << bit;
389 if (!(mask & bitmask)) {
390 mask |= bitmask;
391 popcnt++;
392 }
393 }
394 return mask;
395}
396
/* Populate *instr from the template, drawing random fields from gen.
 * Only fields the template declares are written; src/dst get the -1
 * sentinel when absent. */
static void instr_from_template(const instr_template* tpl, siphash_rng* gen, instruction* instr) {
    instr->opcode = tpl->type;
    if (tpl->immediate_mask) {
        if (tpl->immediate_mask == BRANCH_MASK) {
            /* branches get a mask with LOG2_BRANCH_PROB bits set */
            instr->imm32 = branch_mask(gen);
        }
        else do {
            /* re-draw until nonzero unless the template permits 0 */
            instr->imm32 = hashx_siphash_rng_u32(gen) & tpl->immediate_mask;
        } while (instr->imm32 == 0 && !tpl->imm_can_be_0);
    }
    if (!tpl->op_par_src) {
        if (tpl->distinct_dst) {
            /* UINT32_MAX marks a constant operand (see register_info) */
            instr->op_par = UINT32_MAX;
        }
        else {
            instr->op_par = hashx_siphash_rng_u32(gen);
        }
    }
    if (!tpl->has_src) {
        instr->src = -1;
    }
    if (!tpl->has_dst) {
        instr->dst = -1;
    }
}
422
423static bool select_register(int available_regs[8], int regs_count, siphash_rng* gen, int* reg_out) {
424 if (regs_count == 0)
425 return false;
426
427 int index;
428
429 if (regs_count > 1) {
430 index = hashx_siphash_rng_u32(gen) % regs_count;
431 }
432 else {
433 index = 0;
434 }
435 *reg_out = available_regs[index];
436 return true;
437}
438
/* Pick a destination register that is ready at `cycle` and satisfies the
 * anti-optimization constraints below. Returns false if none qualifies. */
static bool select_destination(const instr_template* tpl, instruction* instr, generator_ctx* ctx, int cycle) {
    int available_regs[8];
    int regs_count = 0;
    /* Conditions for the destination register:
    // * value must be ready at the required cycle
    // * cannot be the same as the source register unless the instruction allows it
    //   - this avoids optimizable instructions such as "xor r, r" or "sub r, r"
    // * register cannot be multiplied twice in a row unless chain_mul is true
    //   - this avoids accumulation of trailing zeroes in registers due to excessive multiplication
    //   - allowChainedMul is set to true if an attempt to find source/destination registers failed (this is quite rare, but prevents a catastrophic failure of the generator)
    // * either the last instruction applied to the register or its source must be different than this instruction
    //   - this avoids optimizable instruction sequences such as "xor r1, r2; xor r1, r2" or "ror r, C1; ror r, C2" or "add r, C1; add r, C2"
    // * register r5 cannot be the destination of the IADD_RS instruction (limitation of the x86 lea instruction) */
    for (int i = 0; i < 8; ++i) {
        /* branch-free evaluation: each condition folds into `available`
         * with bitwise &/| on bool operands */
        bool available = ctx->registers[i].latency <= cycle;
        available &= ((!tpl->distinct_dst) | (i != instr->src));
        available &= (ctx->chain_mul | (tpl->group != INSTR_MUL_R) | (ctx->registers[i].last_op != INSTR_MUL_R));
        available &= ((ctx->registers[i].last_op != tpl->group) | (ctx->registers[i].last_op_par != instr->op_par));
        available &= ((instr->opcode != INSTR_ADD_RS) | (i != REGISTER_NEEDS_DISPLACEMENT));
        /* branch-free append: slot is overwritten when not available,
         * since regs_count only advances on availability */
        available_regs[regs_count] = available ? i : 0;
        regs_count += available;
    }
    return select_register(available_regs, regs_count, &ctx->gen, &instr->dst);
}
463
/* Pick a source register ready at `cycle`; also copies it into op_par when
 * the template takes its operation parameter from the source. Returns false
 * if no register is ready. */
static bool select_source(const instr_template* tpl, instruction* instr, generator_ctx* ctx, int cycle) {
    int available_regs[8];
    int regs_count = 0;
    /* all registers that are ready at the cycle */
    for (int i = 0; i < 8; ++i) {
        if (ctx->registers[i].latency <= cycle)
            available_regs[regs_count++] = i;
    }
    /* if there are only 2 available registers for ADD_RS and one of them is r5, select it as the source because it cannot be the destination */
    if (regs_count == 2 && instr->opcode == INSTR_ADD_RS) {
        if (available_regs[0] == REGISTER_NEEDS_DISPLACEMENT || available_regs[1] == REGISTER_NEEDS_DISPLACEMENT) {
            instr->op_par = instr->src = REGISTER_NEEDS_DISPLACEMENT;
            return true;
        }
    }
    if (select_register(available_regs, regs_count, &ctx->gen, &instr->src)) {
        if (tpl->op_par_src)
            instr->op_par = instr->src;
        return true;
    }
    return false;
}
486
/* Find the earliest cycle >= `cycle` with a port that can take `uop`.
 * Returns that cycle, or -1 if none exists within PORT_MAP_SIZE. With
 * commit == false this is a pure query; with true the chosen port slot is
 * marked occupied. Port map columns: [0] = P0, [1] = P1, [2] = P5. */
static int schedule_uop(execution_port uop, generator_ctx* ctx, int cycle, bool commit) {
    /* The scheduling here is done optimistically by checking port availability in order P5 -> P0 -> P1 to not overload
       port P1 (multiplication) by instructions that can go to any port. */
    for (; cycle < PORT_MAP_SIZE; ++cycle) {
        if ((uop & PORT_P5) && !ctx->ports[cycle][2]) {
            if (commit) {
                ctx->ports[cycle][2] = uop;
            }
            TRACE_PRINT("%s scheduled to port P5 at cycle %i (commit = %i)\n", execution_port_names[uop], cycle, commit);
            return cycle;
        }
        if ((uop & PORT_P0) && !ctx->ports[cycle][0]) {
            if (commit) {
                ctx->ports[cycle][0] = uop;
            }
            TRACE_PRINT("%s scheduled to port P0 at cycle %i (commit = %i)\n", execution_port_names[uop], cycle, commit);
            return cycle;
        }
        if ((uop & PORT_P1) != 0 && !ctx->ports[cycle][1]) {
            if (commit) {
                ctx->ports[cycle][1] = uop;
            }
            TRACE_PRINT("%s scheduled to port P1 at cycle %i (commit = %i)\n", execution_port_names[uop], cycle, commit);
            return cycle;
        }
    }
    return -1;
}
515
/* Schedule all uOPs of the template starting at ctx->cycle. Single-uOP
 * instructions delegate to schedule_uop; two-uOP instructions are placed
 * only on a cycle where both uOPs fit simultaneously. Returns the chosen
 * execution cycle or -1 on failure. */
static int schedule_instr(const instr_template* tpl, generator_ctx* ctx, bool commit) {
    if (tpl->uop2 == PORT_NONE) {
        /* this instruction has only one uOP */
        return schedule_uop(tpl->uop1, ctx, ctx->cycle, commit);
    }
    else {
        /* instructions with 2 uOPs are scheduled conservatively by requiring both uOPs to execute in the same cycle */
        for (int cycle = ctx->cycle; cycle < PORT_MAP_SIZE; ++cycle) {

            /* probe both uOPs without committing first */
            int cycle1 = schedule_uop(tpl->uop1, ctx, cycle, false);
            int cycle2 = schedule_uop(tpl->uop2, ctx, cycle, false);

            if (cycle1 >= 0 && cycle1 == cycle2) {
                if (commit) {
                    schedule_uop(tpl->uop1, ctx, cycle, true);
                    schedule_uop(tpl->uop2, ctx, cycle, true);
                }
                return cycle1;
            }
        }
    }

    return -1;
}
540
541static void print_registers(const generator_ctx* ctx) {
542 for (int i = 0; i < 8; ++i) {
543 printf(" R%i = %i\n", i, ctx->registers[i].latency);
544 }
545}
546
/* Generate a program from the given siphash key, modeling Ivy Bridge port
 * contention so that the result has uniform complexity. Returns true when
 * the program meets all REQUIREMENT_* constraints; on false the caller is
 * expected to retry with a different key (rejection happens for fewer than
 * 1 seed in 10000, per the comment at the end). */
bool hashx_program_generate(const siphash_state* key, hashx_program* program) {
    generator_ctx ctx = {
        .cycle = 0,
        .sub_cycle = 0, /* 3 sub-cycles = 1 cycle */
        .mul_count = 0,
        .chain_mul = false,
        .latency = 0,
        .ports = {{ 0 }}
    };
    hashx_siphash_rng_init(&ctx.gen, key);
#ifdef HASHX_RNG_CALLBACK
    ctx.gen.callback = program->rng_callback;
    ctx.gen.callback_user_data = program->rng_callback_user_data;
#endif
    for (int i = 0; i < 8; ++i) {
        ctx.registers[i].last_op = -1;
        ctx.registers[i].latency = 0;
        ctx.registers[i].last_op_par = (uint32_t)-1;
    }
    program->code_size = 0;

    int attempt = 0;
    /* group of the previously emitted template (duplicate avoidance) */
    instr_type last_instr = -1;
#ifdef HASHX_PROGRAM_STATS
    program->x86_size = 0;
#endif

    /* emit instructions until the size cap, the target cycle, or a port-map
     * exhaustion is reached */
    while (program->code_size < HASHX_PROGRAM_MAX_SIZE) {
        instruction* instr = &program->code[program->code_size];
        TRACE_PRINT("CYCLE: %i/%i\n", ctx.sub_cycle, ctx.cycle);

        /* select an instruction template */
        const instr_template* tpl = select_template(&ctx, last_instr, attempt);
        last_instr = tpl->group;

        TRACE_PRINT("Template: %s\n", tpl->x86_asm);

        instr_from_template(tpl, &ctx.gen, instr);

        /* calculate the earliest cycle when this instruction (all of its uOPs) can be scheduled for execution */
        int scheduleCycle = schedule_instr(tpl, &ctx, false);
        if (scheduleCycle < 0) {
            TRACE_PRINT("Unable to map operation '%s' to execution port (cycle %i)\n", tpl->x86_asm, ctx.cycle);
            /* __debugbreak(); */
            break;
        }

        /* relax the no-chained-multiply rule only after a failed attempt */
        ctx.chain_mul = attempt > 0;

        /* find a source register (if applicable) that will be ready when this instruction executes */
        if (tpl->has_src) {
            if (!select_source(tpl, instr, &ctx, scheduleCycle)) {
                TRACE_PRINT("; src STALL (attempt %i)\n", attempt);
                if (attempt++ < MAX_RETRIES) {
                    continue;
                }
                if (TRACE) {
                    printf("; select_source FAILED at cycle %i\n", ctx.cycle);
                    print_registers(&ctx);
                    /* __debugbreak(); */
                }
                /* give up on this slot: stall a full cycle and start over */
                ctx.sub_cycle += 3;
                ctx.cycle = ctx.sub_cycle / 3;
                attempt = 0;
                continue;
            }
            TRACE_PRINT("; src = r%i\n", instr->src);
        }

        /* find a destination register that will be ready when this instruction executes */
        if (tpl->has_dst) {
            if (!select_destination(tpl, instr, &ctx, scheduleCycle)) {
                TRACE_PRINT("; dst STALL (attempt %i)\n", attempt);
                if (attempt++ < MAX_RETRIES) {
                    continue;
                }
                if (TRACE) {
                    printf("; select_destination FAILED at cycle %i\n", ctx.cycle);
                    print_registers(&ctx);
                    /* __debugbreak(); */
                }
                ctx.sub_cycle += 3;
                ctx.cycle = ctx.sub_cycle / 3;
                attempt = 0;
                continue;
            }
            TRACE_PRINT("; dst = r%i\n", instr->dst);
        }
        attempt = 0;

        /* recalculate when the instruction can be scheduled for execution based on operand availability */
        scheduleCycle = schedule_instr(tpl, &ctx, true);

        if (scheduleCycle < 0) {
            TRACE_PRINT("Unable to map operation '%s' to execution port (cycle %i)\n", tpl->x86_asm, ctx.cycle);
            break;
        }

        /* terminating condition */
        if (scheduleCycle >= TARGET_CYCLE) {
            break;
        }

        if (tpl->has_dst) {
            register_info* ri = &ctx.registers[instr->dst];
            int retireCycle = scheduleCycle + tpl->latency;
            ri->latency = retireCycle;
            ri->last_op = tpl->group;
            ri->last_op_par = instr->op_par;
            ctx.latency = MAX(retireCycle, ctx.latency);
            TRACE_PRINT("; RETIRED at cycle %i\n", retireCycle);
        }

        program->code_size++;
#ifdef HASHX_PROGRAM_STATS
        program->x86_size += tpl->x86_size;
#endif

        ctx.mul_count += is_mul(instr->opcode);

        /* two-uOP instructions consume an extra sub-cycle of decode */
        ++ctx.sub_cycle;
        ctx.sub_cycle += (tpl->uop2 != PORT_NONE);
        ctx.cycle = ctx.sub_cycle / 3;
    }

#ifdef HASHX_PROGRAM_STATS
    memset(program->asic_latencies, 0, sizeof(program->asic_latencies));

    program->counter = ctx.gen.counter;
    program->wide_mul_count = 0;
    program->mul_count = ctx.mul_count;

    /* Calculate ASIC latency:
       Assumes 1 cycle latency for all operations and unlimited parallelization. */
    for (size_t i = 0; i < program->code_size; ++i) {
        instruction* instr = &program->code[i];
        if (instr->dst < 0)
            continue;
        int last_dst = program->asic_latencies[instr->dst] + 1;
        int lat_src = instr->dst != instr->src ? program->asic_latencies[instr->src] + 1 : 0;
        program->asic_latencies[instr->dst] = MAX(last_dst, lat_src);
        program->wide_mul_count += is_wide_mul(instr->opcode);
    }

    program->asic_latency = 0;
    program->cpu_latency = 0;
    for (int i = 0; i < 8; ++i) {
        program->asic_latency = MAX(program->asic_latency, program->asic_latencies[i]);
        program->cpu_latencies[i] = ctx.registers[i].latency;
        program->cpu_latency = MAX(program->cpu_latency, program->cpu_latencies[i]);
    }

    program->ipc = program->code_size / (double)program->cpu_latency;
    program->branch_count = 0;
    memset(program->branches, 0, sizeof(program->branches));

    if (TRACE) {
        printf("; ALU port utilization:\n");
        printf("; (* = in use, _ = idle)\n");
        for (int i = 0; i < PORT_MAP_SIZE; ++i) {
            printf("; %3i ", i);
            for (int j = 0; j < NUM_PORTS; ++j) {
                printf("%c", (ctx.ports[i][j] ? '*' : '_'));
            }
            printf("\n");
        }
    }
#endif

    /* reject programs that don't meet the uniform complexity requirements */
    /* this happens in less than 1 seed out of 10000 */
    return
        (program->code_size == REQUIREMENT_SIZE) &&
        (ctx.mul_count == REQUIREMENT_MUL_COUNT) &&
        (ctx.latency == REQUIREMENT_LATENCY - 1); /* cycles are numbered from 0 */
}
723
/* Maps the 8 program registers r0..r7 to x86-64 registers r8..r15. */
static const char* x86_reg_map[] = { "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" };
725
/* Print the program as x86-64 assembly on stdout (debug/inspection aid). */
void hashx_program_asm_x86(const hashx_program* program) {
    /* index of the most recent TARGET instruction; BRANCH jumps back to it */
    size_t target = 0;
    for (size_t i = 0; i < program->code_size; ++i) {
        const instruction* instr = &program->code[i];
        switch (instr->opcode)
        {
        case INSTR_SUB_R:
            printf("sub %s, %s\n", x86_reg_map[instr->dst], x86_reg_map[instr->src]);
            break;
        case INSTR_XOR_R:
            printf("xor %s, %s\n", x86_reg_map[instr->dst], x86_reg_map[instr->src]);
            break;
        case INSTR_ADD_RS:
            /* imm32 was masked to 0..3, so the scale is 1, 2, 4 or 8 */
            printf("lea %s, [%s+%s*%u]\n", x86_reg_map[instr->dst], x86_reg_map[instr->dst], x86_reg_map[instr->src], 1 << instr->imm32);
            break;
        case INSTR_MUL_R:
            printf("imul %s, %s\n", x86_reg_map[instr->dst], x86_reg_map[instr->src]);
            break;
        case INSTR_ROR_C:
            printf("ror %s, %u\n", x86_reg_map[instr->dst], instr->imm32);
            break;
        case INSTR_ADD_C:
            /* %i: the constant is displayed as a signed value */
            printf("add %s, %i\n", x86_reg_map[instr->dst], instr->imm32);
            break;
        case INSTR_XOR_C:
            printf("xor %s, %i\n", x86_reg_map[instr->dst], instr->imm32);
            break;
        case INSTR_UMULH_R:
            /* one-operand mul leaves the high half in rdx */
            printf("mov rax, %s\n", x86_reg_map[instr->dst]);
            printf("mul %s\n", x86_reg_map[instr->src]);
            printf("mov %s, rdx\n", x86_reg_map[instr->dst]);
            break;
        case INSTR_SMULH_R:
            printf("mov rax, %s\n", x86_reg_map[instr->dst]);
            printf("imul %s\n", x86_reg_map[instr->src]);
            printf("mov %s, rdx\n", x86_reg_map[instr->dst]);
            break;
        case INSTR_TARGET:
            printf("test edi, edi\n");
            printf("target_%i: cmovz esi, edi\n", (int)i);
            target = i;
            break;
        case INSTR_BRANCH:
            printf("or edx, esi\n");
            printf("test edx, %i\n", instr->imm32);
            printf("jz target_%i\n", (int)target);
            break;
        default:
            UNREACHABLE;
        }
    }
}
/* (doxygen cross-reference residue removed here: a bodiless "#define MAX(a, b)"
 * stub pointing at cmp.h:22; the MAX macro used in this file is the one
 * defined near the top.) */