hashx/scheduler.rs

//! Scheduling model for program generation
//!
//! HashX uses a simple scheduling model inspired by the Intel Ivy Bridge
//! microarchitecture to choose registers that should be available and
//! avoid stalls.

use crate::program::{Instruction, Opcode};
use crate::register::{RegisterId, NUM_REGISTERS};

/// Scheduling information for each opcode
mod model {
    use crate::program::Opcode;

    /// Number of simulated cycles we run before stopping program generation
    pub(super) const TARGET_CYCLES: usize = 192;

    /// Total number of cycles to store schedule data for
    pub(super) const SCHEDULE_SIZE: usize = TARGET_CYCLES + MAX_LATENCY;

    /// Number of execution ports in our simulated microarchitecture
    pub(super) const NUM_EXECUTION_PORTS: usize = 3;

    /// Identify one or more CPU execution ports
    ///
    /// They are inspired by the Intel Ivy Bridge design and the original HashX
    /// implementation names them P5, P0, and P1.
    ///
    /// We can mostly ignore the names and treat this as an array of three
    /// ports specified in the order that HashX tries to allocate them. The
    /// original HashX implementation explains this allocation order as an
    /// optimistic strategy which assumes P1 will typically be reserved for
    /// multiplication.
    #[derive(Debug, Clone, Copy, Eq, PartialEq)]
    pub(super) struct ExecPorts(pub(super) u8);

    /// Port P5 (first choice) only
    const P5: ExecPorts = ExecPorts(1 << 0);
    /// Port P0 (second choice) only
    const P0: ExecPorts = ExecPorts(1 << 1);
    /// Port P1 (third choice) only
    const P1: ExecPorts = ExecPorts(1 << 2);
    /// Either port P0 or P1
    const P01: ExecPorts = ExecPorts(P0.0 | P1.0);
    /// Either port P0 or P5
    const P05: ExecPorts = ExecPorts(P0.0 | P5.0);
    /// Any of the three ports
    const P015: ExecPorts = ExecPorts(P0.0 | P1.0 | P5.0);

    /// Maximum value of [`instruction_latency_cycles()`] for any Opcode
    const MAX_LATENCY: usize = 4;

    /// Latency for each operation, in cycles
    #[inline(always)]
    pub(super) fn instruction_latency_cycles(op: Opcode) -> usize {
        match op {
            Opcode::AddConst => 1,
            Opcode::AddShift => 1,
            Opcode::Branch => 1,
            Opcode::Rotate => 1,
            Opcode::Sub => 1,
            Opcode::Target => 1,
            Opcode::Xor => 1,
            Opcode::XorConst => 1,
            Opcode::Mul => 3,
            Opcode::SMulH => 4,
            Opcode::UMulH => 4,
        }
    }

    /// Break an instruction down into one or two micro-operation port sets.
    #[inline(always)]
    pub(super) fn micro_operations(op: Opcode) -> (ExecPorts, Option<ExecPorts>) {
        match op {
            Opcode::AddConst => (P015, None),
            Opcode::Sub => (P015, None),
            Opcode::Xor => (P015, None),
            Opcode::XorConst => (P015, None),
            Opcode::Mul => (P1, None),
            Opcode::AddShift => (P01, None),
            Opcode::Rotate => (P05, None),
            Opcode::SMulH => (P1, Some(P5)),
            Opcode::UMulH => (P1, Some(P5)),
            Opcode::Branch => (P015, Some(P015)),
            Opcode::Target => (P015, Some(P015)),
        }
    }

    /// Each instruction advances the earliest possible issuing cycle by one
    /// sub-cycle per micro-op.
    #[inline(always)]
    pub(super) fn instruction_sub_cycle_count(op: Opcode) -> usize {
        match micro_operations(op) {
            (_, None) => 1,
            (_, Some(_)) => 2,
        }
    }
}
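
// A worked example of the `model` tables above (illustrative comment only):
// `Opcode::UMulH` splits into two micro-ops, one restricted to P1 and one to
// P5, so it consumes 2 decode sub-cycles and its result is ready 4 cycles
// after it issues. By contrast, `Opcode::Xor` is a single micro-op that any
// of the three ports can accept, with a 1-cycle latency.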

/// One single execution port
///
/// Formatted to use easily as an array index.
/// It's helpful for this to be a compact data type.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
struct ExecPortIndex {
    /// Equivalent to a bit position in [`model::ExecPorts`].
    index: u8,
}

/// Overall state for the simulated execution schedule
#[derive(Debug, Default, Clone)]
pub(crate) struct Scheduler {
    /// Current timestamp in the schedule, in sub-cycles
    ///
    /// Sub-cycles advance as code is generated, modeling the time taken to
    /// fetch and decode instructions.
    sub_cycle: SubCycle,

    /// Current timestamp in the schedule, in cycles
    ///
    /// Derived from sub_cycle.
    cycle: Cycle,

    /// State for scheduling execution by fitting micro-ops into execution ports
    exec: ExecSchedule,

    /// State for scheduling register access by keeping track of data latency
    data: DataSchedule,
}

impl Scheduler {
    /// Create a new empty execution schedule at cycle 0
    #[inline(always)]
    pub(crate) fn new() -> Self {
        Default::default()
    }

    /// Stall for one cycle.
    ///
    /// Used when register allocation fails.
    /// Returns `Ok` if we had enough time, or `Err` if we ran out.
    #[inline(always)]
    pub(crate) fn stall(&mut self) -> Result<(), ()> {
        self.advance(SubCycle::PER_CYCLE as usize)
    }

    /// Return the current instruction fetch/decode timestamp in sub-cycles.
    #[inline(always)]
    pub(crate) fn instruction_stream_sub_cycle(&self) -> SubCycle {
        self.sub_cycle
    }

    /// Advance forward in time by some number of sub-cycles.
    ///
    /// Stops just before reaching `Cycle::target()`, where we stop
    /// scheduling new instructions.
    #[inline(always)]
    fn advance(&mut self, n: usize) -> Result<(), ()> {
        let sub_cycle = self.sub_cycle.add_usize(n)?;
        let cycle = sub_cycle.cycle();
        if cycle < Cycle::target() {
            self.sub_cycle = sub_cycle;
            self.cycle = cycle;
            Ok(())
        } else {
            Err(())
        }
    }

    /// Advance time forward by the modeled duration of the instruction fetch
    /// and decode, in sub-cycles.
    ///
    /// Returns Ok if we still have enough time to schedule more, or Err if the
    /// schedule would be full.
    #[inline(always)]
    pub(crate) fn advance_instruction_stream(&mut self, op: Opcode) -> Result<(), ()> {
        self.advance(model::instruction_sub_cycle_count(op))
    }

    /// Calculate a timing plan describing the cycle and execution units
    /// on which a particular opcode could run, at the earliest.
    #[inline(always)]
    pub(crate) fn instruction_plan(&self, op: Opcode) -> Result<InstructionPlan, ()> {
        self.exec.instruction_plan(self.cycle, op)
    }

    /// Commit to using a plan returned by [`Self::instruction_plan()`],
    /// for a particular concrete [`Instruction`] instance.
    ///
    /// Marks as busy each execution unit cycle in the plan, and updates the
    /// latency for the [`Instruction`]'s destination register if it has one.
    #[inline(always)]
    pub(crate) fn commit_instruction_plan(&mut self, plan: &InstructionPlan, inst: &Instruction) {
        self.exec.mark_instruction_busy(plan);
        if let Some(dst) = inst.destination() {
            self.data
                .plan_register_write(dst, plan.cycle_retired(inst.opcode()));
        }
    }

    /// Look up if a register will be available at or before the indicated cycle.
    #[inline(always)]
    pub(crate) fn register_available(&self, reg: RegisterId, cycle: Cycle) -> bool {
        self.data.register_available(reg, cycle)
    }

    /// Return the overall data latency.
    ///
    /// This is the Cycle at which we expect every register
    /// to reach its final simulated state.
    #[inline(always)]
    pub(crate) fn overall_latency(&self) -> Cycle {
        self.data.overall_latency()
    }
}
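
// Sketch of how a caller is expected to drive [`Scheduler`] during program
// generation. This is illustrative only: the real driver lives in the
// program-generation code, and `choose_opcode` / `choose_instruction` below
// are hypothetical placeholders, not items in this crate.
//
//     let mut sched = Scheduler::new();
//     loop {
//         let op = choose_opcode();
//         if sched.advance_instruction_stream(op).is_err() {
//             break; // out of decode bandwidth; the program is long enough
//         }
//         let Ok(plan) = sched.instruction_plan(op) else { break };
//         // Pick registers that pass `register_available()` at the planned
//         // cycle, calling `stall()` if allocation fails, then commit:
//         let inst = choose_instruction(op, &plan);
//         sched.commit_instruction_plan(&plan, &inst);
//     }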

/// Cycle timestamp
///
/// Measured from the beginning of the program, assuming no branches taken.
/// It's useful to be able to store these compactly in register latency arrays.
#[derive(Debug, Default, Clone, Copy, Ord, PartialOrd, Eq, PartialEq)]
pub(crate) struct Cycle(u8);

impl Cycle {
    /// HashX stops generating code once the issue cycle reaches this target.
    #[inline(always)]
    fn target() -> Self {
        Cycle(
            model::TARGET_CYCLES
                .try_into()
                .expect("Cycle type wide enough for target count"),
        )
    }

    /// Cast this Cycle count to a usize losslessly.
    #[inline(always)]
    pub(crate) fn as_usize(&self) -> usize {
        self.0.into()
    }

    /// Add an integer number of cycles, returning Err(()) if we reach the end.
    #[inline(always)]
    fn add_usize(&self, n: usize) -> Result<Self, ()> {
        let result = self.as_usize() + n;
        if result < model::SCHEDULE_SIZE {
            Ok(Cycle(
                result
                    .try_into()
                    .expect("Cycle type wide enough for full schedule size"),
            ))
        } else {
            Err(())
        }
    }
}

/// Sub-cycle timestamp
///
/// Timer for instruction decode, at a finer resolution than the Cycle
/// we use for keeping schedule records. Doesn't need to be compact.
#[derive(Debug, Default, Clone, Copy, Ord, PartialOrd, Eq, PartialEq)]
pub(crate) struct SubCycle(u16);

impl SubCycle {
    /// Number of sub-cycles per cycle
    const PER_CYCLE: u16 = 3;

    /// Maximum value of a sub-cycle counter, based on the schedule size
    const MAX: Self = SubCycle(model::SCHEDULE_SIZE as u16 * Self::PER_CYCLE - 1);

    /// Cast this sub-cycle count into a usize losslessly.
    #[inline(always)]
    pub(crate) fn as_usize(&self) -> usize {
        self.0.into()
    }

    /// Convert this sub-cycle into a full Cycle timestamp.
    #[inline(always)]
    fn cycle(&self) -> Cycle {
        Cycle(
            (self.0 / Self::PER_CYCLE)
                .try_into()
                .expect("Cycle type wide enough for conversion from SubCycle"),
        )
    }

    /// Advance by an integer number of sub-cycles.
    ///
    /// Returns the new advanced [`SubCycle`], or `Err(())`
    /// if we reach the end of the schedule.
    #[inline(always)]
    fn add_usize(&self, n: usize) -> Result<Self, ()> {
        let result = self.as_usize() + n;
        if result < Self::MAX.0.into() {
            Ok(SubCycle(result.try_into().expect(
                "SubCycle type wide enough for full schedule size",
            )))
        } else {
            Err(())
        }
    }
}

/// Busy tracking for one CPU execution port
#[derive(Debug, Default, Clone)]
struct PortSchedule {
    /// Bit array indicating when this port is busy, indexed by Cycle
    busy: SimpleBitArray<
        { (usize::BITS as usize + model::SCHEDULE_SIZE - 1) / (usize::BITS as usize) },
    >,
}

/// Latency tracking for all relevant CPU registers
#[derive(Debug, Default, Clone)]
struct DataSchedule {
    /// Cycle count at which this register is available, indexed by register ID
    latencies: [Cycle; NUM_REGISTERS],
}

impl DataSchedule {
    /// Plan to finish a register write at the indicated cycle
    #[inline(always)]
    fn plan_register_write(&mut self, dst: RegisterId, cycle: Cycle) {
        self.latencies[dst.as_usize()] = cycle;
    }

    /// Look up if a register will be available at or before the indicated cycle.
    #[inline(always)]
    fn register_available(&self, reg: RegisterId, cycle: Cycle) -> bool {
        self.latencies[reg.as_usize()] <= cycle
    }

    /// Return the overall latency, the [`Cycle`] at which we expect
    /// every register to reach its final simulated state.
    #[inline(always)]
    fn overall_latency(&self) -> Cycle {
        match self.latencies.iter().max() {
            Some(cycle) => *cycle,
            None => Default::default(),
        }
    }
}

/// Execution schedule for all ports
///
/// This is a scoreboard that keeps track of which CPU units are busy in which
/// cycles, so we can estimate a timestamp at which each instruction will write
/// its result.
#[derive(Debug, Default, Clone)]
struct ExecSchedule {
    /// Execution schedule (busy flags) for each port
    ports: [PortSchedule; model::NUM_EXECUTION_PORTS],
}

impl ExecSchedule {
    /// Calculate the next cycle at which we could schedule one micro-op.
    ///
    /// HashX always searches execution ports in the same order, and it will
    /// look ahead up to the entire length of the schedule before failing.
    #[inline(always)]
    fn micro_plan(&self, begin: Cycle, ports: model::ExecPorts) -> Result<MicroOpPlan, ()> {
        let mut cycle = begin;
        loop {
            for index in 0..(model::NUM_EXECUTION_PORTS as u8) {
                if (ports.0 & (1 << index)) != 0
                    && !self.ports[index as usize].busy.get(cycle.as_usize())
                {
                    return Ok(MicroOpPlan {
                        cycle,
                        port: ExecPortIndex { index },
                    });
                }
            }
            cycle = cycle.add_usize(1)?;
        }
    }

    /// Mark the schedule busy according to a previously calculated plan.
    #[inline(always)]
    fn mark_micro_busy(&mut self, plan: MicroOpPlan) {
        self.ports[plan.port.index as usize]
            .busy
            .set(plan.cycle.as_usize(), true);
    }

    /// Calculate a timing plan describing the cycle and execution units
    /// we could use for scheduling one entire instruction.
    ///
    /// This takes place after the [`Opcode`] has been chosen but before
    /// a full [`Instruction`] can be assembled.
    #[inline(always)]
    fn instruction_plan(&self, begin: Cycle, op: Opcode) -> Result<InstructionPlan, ()> {
        match model::micro_operations(op) {
            // Single-op instructions
            (single_port, None) => {
                InstructionPlan::from_micro_plans(self.micro_plan(begin, single_port)?, None)
            }

            // HashX schedules two-op instructions by searching forward
            // until we find the first cycle where both ports are
            // simultaneously available.
            (first_port, Some(second_port)) => {
                let mut cycle = begin;
                loop {
                    if let (Ok(first_plan), Ok(second_plan)) = (
                        self.micro_plan(cycle, first_port),
                        self.micro_plan(cycle, second_port),
                    ) {
                        if let Ok(joint_plan) =
                            InstructionPlan::from_micro_plans(first_plan, Some(second_plan))
                        {
                            return Ok(joint_plan);
                        }
                    }
                    cycle = cycle.add_usize(1)?;
                }
            }
        }
    }

    /// Mark each micro-op in an InstructionPlan as busy in the schedule.
    #[inline(always)]
    fn mark_instruction_busy(&mut self, plan: &InstructionPlan) {
        let (first, second) = plan.as_micro_plans();
        self.mark_micro_busy(first);
        if let Some(second) = second {
            self.mark_micro_busy(second);
        }
    }
}

/// Detailed execution schedule for one micro-operation
///
/// Includes the [`Cycle`] it begins on, and the actual
/// execution port it was assigned.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
struct MicroOpPlan {
    /// The Cycle this operation begins on
    cycle: Cycle,
    /// Index of the execution port it runs on
    port: ExecPortIndex,
}

/// Detailed execution schedule for one instruction
///
/// This is defined as either one or two micro-operations
/// scheduled on the same cycle.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub(crate) struct InstructionPlan {
    /// The Cycle this whole instruction begins on
    cycle: Cycle,
    /// First execution port, always present
    first_port: ExecPortIndex,
    /// Optional second execution port ID, for two-uop instructions
    second_port: Option<ExecPortIndex>,
}

impl InstructionPlan {
    /// Get the Cycle this whole instruction begins on.
    #[inline(always)]
    pub(crate) fn cycle_issued(&self) -> Cycle {
        self.cycle
    }

    /// Calculate the cycle this instruction is complete by.
    #[inline(always)]
    pub(crate) fn cycle_retired(&self, op: Opcode) -> Cycle {
        self.cycle
            .add_usize(model::instruction_latency_cycles(op))
            .expect("instruction retired prior to end of schedule")
    }

    /// Convert this `InstructionPlan` back to one or two [`MicroOpPlan`]s.
    #[inline(always)]
    fn as_micro_plans(&self) -> (MicroOpPlan, Option<MicroOpPlan>) {
        (
            MicroOpPlan {
                cycle: self.cycle,
                port: self.first_port,
            },
            self.second_port.map(|port| MicroOpPlan {
                cycle: self.cycle,
                port,
            }),
        )
    }

    /// Bundle two [`MicroOpPlan`]s into an [`InstructionPlan`],
    /// if they are on matching cycles.
    ///
    /// Returns `Err(())` if the combination is not possible.
    #[inline(always)]
    fn from_micro_plans(first_op: MicroOpPlan, second_op: Option<MicroOpPlan>) -> Result<Self, ()> {
        let second_port = match second_op {
            None => None,
            Some(second_op) => {
                if first_op.cycle == second_op.cycle {
                    Some(second_op.port)
                } else {
                    return Err(());
                }
            }
        };
        Ok(Self {
            cycle: first_op.cycle,
            first_port: first_op.port,
            second_port,
        })
    }
}

/// Simple packed bit array implementation
///
/// This could use the `bitvec` crate if we cared to depend on it, but the
/// functionality we need is so tiny let's keep this simple.
#[derive(Debug, Clone, Eq, PartialEq)]
struct SimpleBitArray<const N: usize> {
    /// Array of words to use as a bit vector, in LSB-first order
    inner: [usize; N],
}

impl<const N: usize> Default for SimpleBitArray<N> {
    #[inline(always)]
    fn default() -> Self {
        Self {
            inner: [0_usize; N],
        }
    }
}

impl<const N: usize> SimpleBitArray<N> {
    /// Set or clear one bit in the array.
    ///
    /// Panics if the index is out of range.
    #[inline(always)]
    fn set(&mut self, index: usize, value: bool) {
        let word_size = usize::BITS as usize;
        let word_index = index / word_size;
        let bit_mask = 1 << (index % word_size);
        if value {
            self.inner[word_index] |= bit_mask;
        } else {
            self.inner[word_index] &= !bit_mask;
        }
    }

    /// Get one bit from the array.
    ///
    /// Panics if the index is out of range.
    #[inline(always)]
    fn get(&self, index: usize) -> bool {
        let word_size = usize::BITS as usize;
        let word_index = index / word_size;
        let bit_mask = 1 << (index % word_size);
        0 != (self.inner[word_index] & bit_mask)
    }
}
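
// A few sanity checks sketched from the definitions in this file. They are
// not part of the original module; they only exercise items defined here
// (no `Instruction` or `RegisterId` values are constructed), and the expected
// numbers are derived from the constants in the `model` module above.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn bit_array_set_and_get() {
        // SimpleBitArray stores bits LSB-first across `usize` words.
        let mut bits = SimpleBitArray::<4>::default();
        assert!(!bits.get(5));
        bits.set(5, true);
        assert!(bits.get(5));
        assert!(!bits.get(4));
        bits.set(5, false);
        assert!(!bits.get(5));
    }

    #[test]
    fn stall_until_schedule_full() {
        // Each stall() advances one full cycle; generation stops once the
        // issue cycle would reach TARGET_CYCLES.
        let mut sched = Scheduler::new();
        let mut stalls = 0;
        while sched.stall().is_ok() {
            stalls += 1;
        }
        assert_eq!(stalls, model::TARGET_CYCLES - 1);
    }

    #[test]
    fn multiply_port_contention() {
        // Mul runs only on port P1, so a second multiply can't issue in the
        // same cycle once the first has claimed it.
        let mut exec = ExecSchedule::default();
        let first = exec.instruction_plan(Cycle::default(), Opcode::Mul).unwrap();
        exec.mark_instruction_busy(&first);
        let second = exec.instruction_plan(Cycle::default(), Opcode::Mul).unwrap();
        assert_eq!(first.cycle_issued().as_usize(), 0);
        assert_eq!(second.cycle_issued().as_usize(), 1);
        // Retirement follows the modeled 3-cycle multiply latency.
        assert_eq!(first.cycle_retired(Opcode::Mul).as_usize(), 3);
    }
}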