hashx/scheduler.rs
//! Scheduling model for program generation
//!
//! HashX uses a simple scheduling model inspired by the Intel Ivy Bridge
//! microarchitecture to choose registers that should be available and
//! avoid stalls.

use crate::program::{Instruction, Opcode};
use crate::register::{RegisterId, NUM_REGISTERS};

/// Scheduling information for each opcode
mod model {
    use crate::program::Opcode;

    /// Number of simulated cycles we run before stopping program generation
    pub(super) const TARGET_CYCLES: usize = 192;

    /// Total number of cycles to store schedule data for
    pub(super) const SCHEDULE_SIZE: usize = TARGET_CYCLES + MAX_LATENCY;

    /// Number of execution ports in our simulated microarchitecture
    pub(super) const NUM_EXECUTION_PORTS: usize = 3;

    /// Identify one or more CPU execution ports
    ///
    /// They are inspired by the Intel Ivy Bridge design and the original HashX
    /// implementation names them P5, P0, and P1.
    ///
    /// We can mostly ignore the names and treat this as an array of three
    /// ports specified in the order that HashX tries to allocate them. The
    /// original HashX implementation explains this allocation order as an
    /// optimistic strategy which assumes P1 will typically be reserved for
    /// multiplication.
    #[derive(Debug, Clone, Copy, Eq, PartialEq)]
    pub(super) struct ExecPorts(pub(super) u8);

    /// Port P5 (first choice) only
    const P5: ExecPorts = ExecPorts(1 << 0);
    /// Port P0 (second choice) only
    const P0: ExecPorts = ExecPorts(1 << 1);
    /// Port P1 (third choice) only
    const P1: ExecPorts = ExecPorts(1 << 2);
    /// Either port P0 or P1
    const P01: ExecPorts = ExecPorts(P0.0 | P1.0);
    /// Either port P0 or P5
    const P05: ExecPorts = ExecPorts(P0.0 | P5.0);
    /// Any of the three ports
    const P015: ExecPorts = ExecPorts(P0.0 | P1.0 | P5.0);
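
    // For example, `P01.0 == 0b110`: a micro-op tagged `P01` may issue on
    // either P0 (bit 1) or P1 (bit 2), whichever the scheduler finds free
    // first when it scans ports in bit order in `micro_plan()` below.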

    /// Maximum value of [`instruction_latency_cycles()`] for any Opcode
    const MAX_LATENCY: usize = 4;

    /// Latency for each operation, in cycles
    #[inline(always)]
    pub(super) fn instruction_latency_cycles(op: Opcode) -> usize {
        match op {
            Opcode::AddConst => 1,
            Opcode::AddShift => 1,
            Opcode::Branch => 1,
            Opcode::Rotate => 1,
            Opcode::Sub => 1,
            Opcode::Target => 1,
            Opcode::Xor => 1,
            Opcode::XorConst => 1,
            Opcode::Mul => 3,
            Opcode::SMulH => 4,
            Opcode::UMulH => 4,
        }
    }

    /// Break an instruction down into one or two micro-operation port sets.
    #[inline(always)]
    pub(super) fn micro_operations(op: Opcode) -> (ExecPorts, Option<ExecPorts>) {
        match op {
            Opcode::AddConst => (P015, None),
            Opcode::Sub => (P015, None),
            Opcode::Xor => (P015, None),
            Opcode::XorConst => (P015, None),
            Opcode::Mul => (P1, None),
            Opcode::AddShift => (P01, None),
            Opcode::Rotate => (P05, None),
            Opcode::SMulH => (P1, Some(P5)),
            Opcode::UMulH => (P1, Some(P5)),
            Opcode::Branch => (P015, Some(P015)),
            Opcode::Target => (P015, Some(P015)),
        }
    }

    /// Each instruction advances the earliest possible issuing cycle by one
    /// sub-cycle per micro-op.
    #[inline(always)]
    pub(super) fn instruction_sub_cycle_count(op: Opcode) -> usize {
        match micro_operations(op) {
            (_, None) => 1,
            (_, Some(_)) => 2,
        }
    }
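
    // Example: `Opcode::UMulH` decodes into two micro-ops, so it consumes
    // two of the three decode sub-cycles available in each simulated cycle
    // (see `SubCycle::PER_CYCLE`), while single-op `Opcode::Xor` consumes one.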
}

/// One single execution port
///
/// Stored as a plain index so it's easy to use for array lookups.
/// It's helpful for this to be a compact data type.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
struct ExecPortIndex {
    /// Equivalent to a bit position in [`model::ExecPorts`].
    index: u8,
}

/// Overall state for the simulated execution schedule
#[derive(Debug, Default, Clone)]
pub(crate) struct Scheduler {
    /// Current timestamp in the schedule, in sub-cycles
    ///
    /// Sub-cycles advance as code is generated, modeling the time taken to
    /// fetch and decode instructions.
    sub_cycle: SubCycle,

    /// Current timestamp in the schedule, in cycles
    ///
    /// Derived from `sub_cycle`.
    cycle: Cycle,

    /// State for scheduling execution by fitting micro-ops into execution ports
    exec: ExecSchedule,

    /// State for scheduling register access by keeping track of data latency
    data: DataSchedule,
}

impl Scheduler {
    /// Create a new empty execution schedule at cycle 0
    #[inline(always)]
    pub(crate) fn new() -> Self {
        Default::default()
    }

    /// Stall for one cycle.
    ///
    /// Used when register allocation fails.
    /// Returns `Ok` if we had enough time, or `Err` if we ran out.
    #[inline(always)]
    pub(crate) fn stall(&mut self) -> Result<(), ()> {
        self.advance(SubCycle::PER_CYCLE as usize)
    }

    /// Return the current instruction fetch/decode timestamp in sub-cycles.
    #[inline(always)]
    pub(crate) fn instruction_stream_sub_cycle(&self) -> SubCycle {
        self.sub_cycle
    }

    /// Advance forward in time by some number of sub-cycles.
    ///
    /// Fails with `Err(())` if the new timestamp would reach
    /// `Cycle::target()`, the point at which we stop scheduling new
    /// instructions. On failure, the schedule is left unchanged.
    #[inline(always)]
    fn advance(&mut self, n: usize) -> Result<(), ()> {
        let sub_cycle = self.sub_cycle.add_usize(n)?;
        let cycle = sub_cycle.cycle();
        if cycle < Cycle::target() {
            self.sub_cycle = sub_cycle;
            self.cycle = cycle;
            Ok(())
        } else {
            Err(())
        }
    }

    /// Advance time forward by the modeled duration of the instruction fetch
    /// and decode, in sub-cycles.
    ///
    /// Returns `Ok` if we still have enough time to schedule more, or `Err`
    /// if the schedule would be full.
    #[inline(always)]
    pub(crate) fn advance_instruction_stream(&mut self, op: Opcode) -> Result<(), ()> {
        self.advance(model::instruction_sub_cycle_count(op))
    }

    /// Calculate a timing plan describing the cycle and execution units
    /// on which a particular opcode could run, at the earliest.
    #[inline(always)]
    pub(crate) fn instruction_plan(&self, op: Opcode) -> Result<InstructionPlan, ()> {
        self.exec.instruction_plan(self.cycle, op)
    }

    /// Commit to using a plan returned by [`Self::instruction_plan()`],
    /// for a particular concrete [`Instruction`] instance.
    ///
    /// Marks as busy each execution unit cycle in the plan, and updates the
    /// latency for the [`Instruction`]'s destination register if it has one.
    #[inline(always)]
    pub(crate) fn commit_instruction_plan(&mut self, plan: &InstructionPlan, inst: &Instruction) {
        self.exec.mark_instruction_busy(plan);
        if let Some(dst) = inst.destination() {
            self.data
                .plan_register_write(dst, plan.cycle_retired(inst.opcode()));
        }
    }

    /// Look up if a register will be available at or before the indicated cycle.
    #[inline(always)]
    pub(crate) fn register_available(&self, reg: RegisterId, cycle: Cycle) -> bool {
        self.data.register_available(reg, cycle)
    }

    /// Return the overall data latency.
    ///
    /// This is the [`Cycle`] at which we expect every register
    /// to reach its final simulated state.
    #[inline(always)]
    pub(crate) fn overall_latency(&self) -> Cycle {
        self.data.overall_latency()
    }
}
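
// Typical flow for one generated instruction, as implied by the methods
// above: query `instruction_plan()` for the earliest feasible timing, check
// the operands with `register_available()`, then on success call
// `commit_instruction_plan()` followed by `advance_instruction_stream()`;
// when register allocation fails, `stall()` for a cycle and retry.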

/// Cycle timestamp
///
/// Measured from the beginning of the program, assuming no branches taken.
/// It's useful to be able to store these compactly in register latency arrays.
#[derive(Debug, Default, Clone, Copy, Ord, PartialOrd, Eq, PartialEq)]
pub(crate) struct Cycle(u8);

impl Cycle {
    /// HashX stops generating code once the issue cycle reaches this target.
    #[inline(always)]
    fn target() -> Self {
        Cycle(
            model::TARGET_CYCLES
                .try_into()
                .expect("Cycle type wide enough for target count"),
        )
    }

    /// Cast this Cycle count to a usize losslessly.
    #[inline(always)]
    pub(crate) fn as_usize(&self) -> usize {
        self.0.into()
    }

    /// Add an integer number of cycles, returning `Err(())` if we reach the end.
    #[inline(always)]
    fn add_usize(&self, n: usize) -> Result<Self, ()> {
        let result = self.as_usize() + n;
        if result < model::SCHEDULE_SIZE {
            Ok(Cycle(
                result
                    .try_into()
                    .expect("Cycle type wide enough for full schedule size"),
            ))
        } else {
            Err(())
        }
    }
}

/// Sub-cycle timestamp
///
/// Timer for instruction decode, at a finer resolution than the Cycle
/// we use for keeping schedule records. Doesn't need to be compact.
#[derive(Debug, Default, Clone, Copy, Ord, PartialOrd, Eq, PartialEq)]
pub(crate) struct SubCycle(u16);

impl SubCycle {
    /// Number of sub-cycles per cycle
    const PER_CYCLE: u16 = 3;

    /// Maximum value of a sub-cycle counter, based on the schedule size
    const MAX: Self = SubCycle(model::SCHEDULE_SIZE as u16 * Self::PER_CYCLE - 1);

    /// Cast this sub-cycle count into a usize losslessly.
    #[inline(always)]
    pub(crate) fn as_usize(&self) -> usize {
        self.0.into()
    }

    /// Convert this sub-cycle into a full Cycle timestamp.
    #[inline(always)]
    fn cycle(&self) -> Cycle {
        Cycle(
            (self.0 / Self::PER_CYCLE)
                .try_into()
                .expect("Cycle type wide enough for conversion from SubCycle"),
        )
    }

    /// Advance by an integer number of sub-cycles.
    ///
    /// Returns the new advanced [`SubCycle`], or `Err(())`
    /// if we reach the end of the schedule.
    #[inline(always)]
    fn add_usize(&self, n: usize) -> Result<Self, ()> {
        let result = self.as_usize() + n;
        if result < Self::MAX.0.into() {
            Ok(SubCycle(result.try_into().expect(
                "SubCycle type wide enough for full schedule size",
            )))
        } else {
            Err(())
        }
    }
}

/// Busy tracking for one CPU execution port
#[derive(Debug, Default, Clone)]
struct PortSchedule {
    /// Bit array indicating when this port is busy, indexed by Cycle
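    /// (sized to hold `model::SCHEDULE_SIZE` bits, rounded up to whole words)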
    busy: SimpleBitArray<
        { (usize::BITS as usize + model::SCHEDULE_SIZE - 1) / (usize::BITS as usize) },
    >,
}

/// Latency tracking for all relevant CPU registers
#[derive(Debug, Default, Clone)]
struct DataSchedule {
    /// Cycle count at which this register is available, indexed by register ID
    latencies: [Cycle; NUM_REGISTERS],
}

impl DataSchedule {
    /// Plan to finish a register write at the indicated cycle
    #[inline(always)]
    fn plan_register_write(&mut self, dst: RegisterId, cycle: Cycle) {
        self.latencies[dst.as_usize()] = cycle;
    }

    /// Look up if a register will be available at or before the indicated cycle.
    #[inline(always)]
    fn register_available(&self, reg: RegisterId, cycle: Cycle) -> bool {
        self.latencies[reg.as_usize()] <= cycle
    }

    /// Return the overall latency, the [`Cycle`] at which we expect
    /// every register to reach its latest simulated state.
    #[inline(always)]
    fn overall_latency(&self) -> Cycle {
        match self.latencies.iter().max() {
            Some(cycle) => *cycle,
            None => Default::default(),
        }
    }
}

/// Execution schedule for all ports
///
/// This is a scoreboard that keeps track of which CPU units are busy in which
/// cycles, so we can estimate a timestamp at which each instruction will write
/// its result.
#[derive(Debug, Default, Clone)]
struct ExecSchedule {
    /// Execution schedule (busy flags) for each port
    ports: [PortSchedule; model::NUM_EXECUTION_PORTS],
}

impl ExecSchedule {
    /// Calculate the next cycle at which we could schedule one micro-op.
    ///
    /// HashX always searches execution ports in the same order, and it will
    /// look ahead up to the entire length of the schedule before failing.
    #[inline(always)]
    fn micro_plan(&self, begin: Cycle, ports: model::ExecPorts) -> Result<MicroOpPlan, ()> {
        let mut cycle = begin;
        loop {
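            // Scan ports in fixed allocation order: bit 0 (P5, first choice)
            // through bit 2 (P1, last choice).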
            for index in 0..(model::NUM_EXECUTION_PORTS as u8) {
                if (ports.0 & (1 << index)) != 0
                    && !self.ports[index as usize].busy.get(cycle.as_usize())
                {
                    return Ok(MicroOpPlan {
                        cycle,
                        port: ExecPortIndex { index },
                    });
                }
            }
            cycle = cycle.add_usize(1)?;
        }
    }

    /// Mark the schedule busy according to a previously calculated plan.
    #[inline(always)]
    fn mark_micro_busy(&mut self, plan: MicroOpPlan) {
        self.ports[plan.port.index as usize]
            .busy
            .set(plan.cycle.as_usize(), true);
    }

    /// Calculate a timing plan describing the cycle and execution units
    /// we could use for scheduling one entire instruction.
    ///
    /// This takes place after the [`Opcode`] has been chosen but before
    /// a full [`Instruction`] can be assembled.
    #[inline(always)]
    fn instruction_plan(&self, begin: Cycle, op: Opcode) -> Result<InstructionPlan, ()> {
        match model::micro_operations(op) {
            // Single-op instructions
            (single_port, None) => {
                InstructionPlan::from_micro_plans(self.micro_plan(begin, single_port)?, None)
            }

            // HashX schedules two-op instructions by searching forward
            // until we find the first cycle where both ports are
            // simultaneously available.
            (first_port, Some(second_port)) => {
                let mut cycle = begin;
                loop {
                    if let (Ok(first_plan), Ok(second_plan)) = (
                        self.micro_plan(cycle, first_port),
                        self.micro_plan(cycle, second_port),
                    ) {
                        if let Ok(joint_plan) =
                            InstructionPlan::from_micro_plans(first_plan, Some(second_plan))
                        {
                            return Ok(joint_plan);
                        }
                    }
                    cycle = cycle.add_usize(1)?;
                }
            }
        }
    }

    /// Mark each micro-op in an InstructionPlan as busy in the schedule.
    #[inline(always)]
    fn mark_instruction_busy(&mut self, plan: &InstructionPlan) {
        let (first, second) = plan.as_micro_plans();
        self.mark_micro_busy(first);
        if let Some(second) = second {
            self.mark_micro_busy(second);
        }
    }
}

/// Detailed execution schedule for one micro-operation
///
/// Includes the [`Cycle`] it begins on, and the actual
/// execution port it was assigned.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
struct MicroOpPlan {
    /// The Cycle this operation begins on
    cycle: Cycle,
    /// Index of the execution port it runs on
    port: ExecPortIndex,
}

/// Detailed execution schedule for one instruction
///
/// This is defined as either one or two micro-operations
/// scheduled on the same cycle.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub(crate) struct InstructionPlan {
    /// The Cycle this whole instruction begins on
    cycle: Cycle,
    /// First execution port, always present
    first_port: ExecPortIndex,
    /// Optional second execution port ID, for two-uop instructions
    second_port: Option<ExecPortIndex>,
}

impl InstructionPlan {
    /// Get the Cycle this whole instruction begins on.
    #[inline(always)]
    pub(crate) fn cycle_issued(&self) -> Cycle {
        self.cycle
    }

    /// Calculate the cycle by which this instruction is complete.
    #[inline(always)]
    pub(crate) fn cycle_retired(&self, op: Opcode) -> Cycle {
        self.cycle
            .add_usize(model::instruction_latency_cycles(op))
            .expect("instruction retired prior to end of schedule")
    }

    /// Convert this `InstructionPlan` back to one or two [`MicroOpPlan`]s.
    #[inline(always)]
    fn as_micro_plans(&self) -> (MicroOpPlan, Option<MicroOpPlan>) {
        (
            MicroOpPlan {
                cycle: self.cycle,
                port: self.first_port,
            },
            self.second_port.map(|port| MicroOpPlan {
                cycle: self.cycle,
                port,
            }),
        )
    }

    /// Bundle two [`MicroOpPlan`]s into an [`InstructionPlan`],
    /// if they are on matching cycles.
    ///
    /// Returns `Err(())` if the combination is not possible.
    #[inline(always)]
    fn from_micro_plans(first_op: MicroOpPlan, second_op: Option<MicroOpPlan>) -> Result<Self, ()> {
        let second_port = match second_op {
            None => None,
            Some(second_op) => {
                if first_op.cycle == second_op.cycle {
                    Some(second_op.port)
                } else {
                    return Err(());
                }
            }
        };
        Ok(Self {
            cycle: first_op.cycle,
            first_port: first_op.port,
            second_port,
        })
    }
}

/// Simple packed bit array implementation
///
/// This could use the `bitvec` crate if we cared to depend on it, but the
/// functionality we need is tiny enough that it's simpler to do it here.
#[derive(Debug, Clone, Eq, PartialEq)]
struct SimpleBitArray<const N: usize> {
    /// Array of words to use as a bit vector, in LSB-first order
    inner: [usize; N],
}

impl<const N: usize> Default for SimpleBitArray<N> {
    #[inline(always)]
    fn default() -> Self {
        Self {
            inner: [0_usize; N],
        }
    }
}

impl<const N: usize> SimpleBitArray<N> {
    /// Set or clear one bit in the array.
    ///
    /// Panics if the index is out of range.
    #[inline(always)]
    fn set(&mut self, index: usize, value: bool) {
        let word_size = usize::BITS as usize;
        let word_index = index / word_size;
        let bit_mask = 1 << (index % word_size);
        if value {
            self.inner[word_index] |= bit_mask;
        } else {
            self.inner[word_index] &= !bit_mask;
        }
    }

    /// Get one bit from the array.
    ///
    /// Panics if the index is out of range.
    #[inline(always)]
    fn get(&self, index: usize) -> bool {
        let word_size = usize::BITS as usize;
        let word_index = index / word_size;
        let bit_mask = 1 << (index % word_size);
        0 != (self.inner[word_index] & bit_mask)
    }
}
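
// Minimal sanity tests for the scheduling types defined above. These are
// illustrative sketches, exercising only items local to this file plus
// `Opcode::Mul`; they assume the standard `cargo test` harness.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::program::Opcode;

    /// `SimpleBitArray` stores and clears individual bits.
    #[test]
    fn bit_array_set_get() {
        let mut bits: SimpleBitArray<4> = Default::default();
        assert!(!bits.get(100));
        bits.set(100, true);
        assert!(bits.get(100));
        bits.set(100, false);
        assert!(!bits.get(100));
    }

    /// Three decode sub-cycles fit in each simulated cycle,
    /// so sub-cycle 7 falls in cycle 2.
    #[test]
    fn sub_cycle_conversion() {
        let sub = SubCycle::default().add_usize(7).unwrap();
        assert_eq!(sub.as_usize(), 7);
        assert_eq!(sub.cycle().as_usize(), 2);
    }

    /// A one-cycle stall advances the decode clock by a full cycle.
    #[test]
    fn stall_advances_full_cycle() {
        let mut sched = Scheduler::new();
        assert_eq!(sched.instruction_stream_sub_cycle().as_usize(), 0);
        sched.stall().unwrap();
        assert_eq!(
            sched.instruction_stream_sub_cycle().as_usize(),
            SubCycle::PER_CYCLE as usize
        );
    }

    /// `Mul` may only issue on one port; after committing one plan,
    /// the next `Mul` is pushed to the following cycle.
    #[test]
    fn mul_port_contention() {
        let mut exec = ExecSchedule::default();
        let first = exec.instruction_plan(Cycle::default(), Opcode::Mul).unwrap();
        assert_eq!(first.cycle_issued().as_usize(), 0);
        exec.mark_instruction_busy(&first);
        let second = exec.instruction_plan(Cycle::default(), Opcode::Mul).unwrap();
        assert_eq!(second.cycle_issued().as_usize(), 1);
    }
}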