//! Scheduling model for program generation
//!
//! HashX uses a simple scheduling model inspired by the Intel Ivy Bridge
//! microarchitecture to choose registers that should be available and
//! avoid stalls.

use crate::program::{Instruction, Opcode};
use crate::register::{RegisterId, NUM_REGISTERS};

            
10
/// Scheduling information for each opcode
11
mod model {
12
    use crate::program::Opcode;
13

            
14
    /// Number of simulated cycles we run before stopping program generation
15
    pub(super) const TARGET_CYCLES: usize = 192;
16

            
17
    /// Total number of cycles to store schedule data for
18
    pub(super) const SCHEDULE_SIZE: usize = TARGET_CYCLES + MAX_LATENCY;
19

            
20
    /// Number of execution ports in our simulated microarchitecture
21
    pub(super) const NUM_EXECUTION_PORTS: usize = 3;
22

            
23
    /// Identify one or more CPU execution ports
24
    ///
25
    /// They are inspired by the Intel Ivy Bridge design and the original HashX
26
    /// implementation names them P5, P0, and P1.
27
    ///
28
    /// We can mostly ignore the names and treat this as an array of three
29
    /// ports specified in the order that HashX tries to allocate them. The
30
    /// original HashX implementation explains this allocation order as an
31
    /// optimistic strategy which assumes P1 will typically be reserved for
32
    /// multiplication.
33
    #[derive(Debug, Clone, Copy, Eq, PartialEq)]
34
    pub(super) struct ExecPorts(pub(super) u8);
35

            
36
    /// Port P5 (first choice) only
37
    const P5: ExecPorts = ExecPorts(1 << 0);
38
    /// Port P0 (second choice) only
39
    const P0: ExecPorts = ExecPorts(1 << 1);
40
    /// Port P1 (third choice) only
41
    const P1: ExecPorts = ExecPorts(1 << 2);
42
    /// Either port P0 or P1
43
    const P01: ExecPorts = ExecPorts(P0.0 | P1.0);
44
    /// Either port P0 or P5
45
    const P05: ExecPorts = ExecPorts(P0.0 | P5.0);
46
    /// Any of the three ports
47
    const P015: ExecPorts = ExecPorts(P0.0 | P1.0 | P5.0);
48

            
49
    /// Maximum value of [`instruction_latency_cycles()`] for any Opcode
50
    const MAX_LATENCY: usize = 4;
51

            
52
    /// Latency for each operation, in cycles
53
    #[inline(always)]
54
4426890
    pub(super) fn instruction_latency_cycles(op: Opcode) -> usize {
55
4426890
        match op {
56
641550
            Opcode::AddConst => 1,
57
317980
            Opcode::AddShift => 1,
58
            Opcode::Branch => 1,
59
360815
            Opcode::Rotate => 1,
60
328770
            Opcode::Sub => 1,
61
            Opcode::Target => 1,
62
357435
            Opcode::Xor => 1,
63
649350
            Opcode::XorConst => 1,
64
1475630
            Opcode::Mul => 3,
65
147355
            Opcode::SMulH => 4,
66
148005
            Opcode::UMulH => 4,
67
        }
68
4426890
    }
69

            
70
    /// Break an instruction down into one or two micro-operation port sets.
71
    #[inline(always)]
72
9449570
    pub(super) fn micro_operations(op: Opcode) -> (ExecPorts, Option<ExecPorts>) {
73
9449570
        match op {
74
1283230
            Opcode::AddConst => (P015, None),
75
657540
            Opcode::Sub => (P015, None),
76
714870
            Opcode::Xor => (P015, None),
77
1298765
            Opcode::XorConst => (P015, None),
78
2955745
            Opcode::Mul => (P1, None),
79
636350
            Opcode::AddShift => (P01, None),
80
721630
            Opcode::Rotate => (P05, None),
81
294710
            Opcode::SMulH => (P1, Some(P5)),
82
296010
            Opcode::UMulH => (P1, Some(P5)),
83
295360
            Opcode::Branch => (P015, Some(P015)),
84
295360
            Opcode::Target => (P015, Some(P015)),
85
        }
86
9449570
    }
87

            
88
    /// Each instruction advances the earliest possible issuing cycle by one
89
    /// sub-cycle per micro-op.
90
    #[inline(always)]
91
4722250
    pub(super) fn instruction_sub_cycle_count(op: Opcode) -> usize {
92
4722250
        match micro_operations(op) {
93
4131530
            (_, None) => 1,
94
590720
            (_, Some(_)) => 2,
95
        }
96
4722250
    }
97
}
98

            
99
/// One single execution port
///
/// Formatted to use easily as an array index.
/// It's helpful for this to be a compact data type.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
struct ExecPortIndex {
    /// Equivalent to a bit position in [`model::ExecPorts`].
    index: u8,
}
108

            
109
/// Overall state for the simulated execution schedule
110
#[derive(Debug, Default, Clone)]
111
pub(crate) struct Scheduler {
112
    /// Current timestamp in the schedule, in sub-cycles
113
    ///
114
    /// Sub-cycles advance as code is generated, modeling the time taken to
115
    /// fetch and decode instructions.
116
    sub_cycle: SubCycle,
117

            
118
    /// Current timestamp in the schedule, in cycles
119
    ///
120
    /// Derived from sub_cycle.
121
    cycle: Cycle,
122

            
123
    /// State for scheduling execution by fitting micro-ops into execution ports
124
    exec: ExecSchedule,
125

            
126
    /// State for scheduling register access by keeping track of data latency
127
    data: DataSchedule,
128
}
129

            
130
impl Scheduler {
131
    /// Create a new empty execution schedule at cycle 0
132
    #[inline(always)]
133
9230
    pub(crate) fn new() -> Self {
134
9230
        Default::default()
135
9230
    }
136

            
137
    /// Stall for one cycle.
138
    ///
139
    /// Used when register allocation fails.
140
    /// Returns `Ok` if we had enough time, or `Err` if we ran out.
141
    #[inline(always)]
142
1170
    pub(crate) fn stall(&mut self) -> Result<(), ()> {
143
1170
        self.advance(SubCycle::PER_CYCLE as usize)
144
1170
    }
145

            
146
    /// Return the current instruction fetch/decode timestamp in sub-cycles.
147
    #[inline(always)]
148
5054335
    pub(crate) fn instruction_stream_sub_cycle(&self) -> SubCycle {
149
5054335
        self.sub_cycle
150
5054335
    }
151

            
152
    /// Advance forward in time by some number of sub-cycles.
153
    ///
154
    /// Stops just before reaching `Cycle::target()`, where we stop
155
    /// scheduling new instructions.
156
    #[inline(always)]
157
4723420
    fn advance(&mut self, n: usize) -> Result<(), ()> {
158
4723420
        let sub_cycle = self.sub_cycle.add_usize(n)?;
159
4723420
        let cycle = sub_cycle.cycle();
160
4723420
        if cycle < Cycle::target() {
161
4714190
            self.sub_cycle = sub_cycle;
162
4714190
            self.cycle = cycle;
163
4714190
            Ok(())
164
        } else {
165
9230
            Err(())
166
        }
167
4723420
    }
168

            
169
    /// Advance time forward by the modeled duration of the instruction fetch
170
    /// and decode, in sub-cycles.
171
    ///
172
    /// Returns Ok if we still have enough time to schedule more, or Err if the
173
    /// schedule would be full.
174
    #[inline(always)]
175
4722250
    pub(crate) fn advance_instruction_stream(&mut self, op: Opcode) -> Result<(), ()> {
176
4722250
        self.advance(model::instruction_sub_cycle_count(op))
177
4722250
    }
178

            
179
    /// Calculate a timing plan describing the cycle and execution units
180
    /// on which a particular opcode could run, at the earliest.
181
    #[inline(always)]
182
4727320
    pub(crate) fn instruction_plan(&self, op: Opcode) -> Result<InstructionPlan, ()> {
183
4727320
        self.exec.instruction_plan(self.cycle, op)
184
4727320
    }
185

            
186
    /// Commit to using a plan returned by [`Self::instruction_plan()`],
187
    /// for a particular concrete [`Instruction`] instance.
188
    ///
189
    /// Marks as busy each execution unit cycle in the plan, and updates the
190
    /// latency for the [`Instruction`]'s destination register if it has one.
191
    #[inline(always)]
192
4722250
    pub(crate) fn commit_instruction_plan(&mut self, plan: &InstructionPlan, inst: &Instruction) {
193
4722250
        self.exec.mark_instruction_busy(plan);
194
4722250
        if let Some(dst) = inst.destination() {
195
4426890
            self.data
196
4426890
                .plan_register_write(dst, plan.cycle_retired(inst.opcode()));
197
4426890
        }
198
4722250
    }
199

            
200
    /// Look up if a register will be available at or before the indicated cycle.
201
    #[inline(always)]
202
57696080
    pub(crate) fn register_available(&self, reg: RegisterId, cycle: Cycle) -> bool {
203
57696080
        self.data.register_available(reg, cycle)
204
57696080
    }
205

            
206
    /// Return the overall data latency.
207
    ///
208
    /// This is the Cycle at which we expect every register
209
    /// to reach its final simulated state.
210
    #[inline(always)]
211
8060
    pub(crate) fn overall_latency(&self) -> Cycle {
212
8060
        self.data.overall_latency()
213
8060
    }
214
}
215

            
216
/// Cycle timestamp
217
///
218
/// Measured from the beginning of the program, assuming no branches taken.
219
/// It's useful to be able to store these compactly in register latency arrays.
220
#[derive(Debug, Default, Clone, Copy, Ord, PartialOrd, Eq, PartialEq)]
221
pub(crate) struct Cycle(u8);
222

            
223
impl Cycle {
224
    /// HashX stops generating code once the issue cycle reaches this target.
225
    #[inline(always)]
226
4723420
    fn target() -> Self {
227
4723420
        Cycle(
228
4723420
            model::TARGET_CYCLES
229
4723420
                .try_into()
230
4723420
                .expect("Cycle type wide enough for target count"),
231
4723420
        )
232
4723420
    }
233

            
234
    /// Cast this Cycle count to a usize losslessly.
235
    #[inline(always)]
236
16223805
    pub(crate) fn as_usize(&self) -> usize {
237
16223805
        self.0.into()
238
16223805
    }
239

            
240
    /// Add an integer number of cycles, returning Err(()) if we reach the end.
241
    #[inline(always)]
242
4426890
    fn add_usize(&self, n: usize) -> Result<Self, ()> {
243
4426890
        let result = self.as_usize() + n;
244
4426890
        if result < model::SCHEDULE_SIZE {
245
4426890
            Ok(Cycle(
246
4426890
                result
247
4426890
                    .try_into()
248
4426890
                    .expect("Cycle type wide enough for full schedule size"),
249
4426890
            ))
250
        } else {
251
            Err(())
252
        }
253
4426890
    }
254
}
255

            
256
/// Sub-cycle timestamp
257
///
258
/// Timer for instruction decode, at a finer resolution than the Cycle
259
/// we use for keeping schedule records. Doesn't need to be compact.
260
#[derive(Debug, Default, Clone, Copy, Ord, PartialOrd, Eq, PartialEq)]
261
pub(crate) struct SubCycle(u16);
262

            
263
impl SubCycle {
264
    /// Number of sub-cycles per cycle
265
    const PER_CYCLE: u16 = 3;
266

            
267
    /// Maximum value of a sub-cycle counter, based on the schedule size
268
    const MAX: Self = SubCycle(model::SCHEDULE_SIZE as u16 * Self::PER_CYCLE - 1);
269

            
270
    /// Cast this sub-cycle count into a usize losslessly.
271
    #[inline(always)]
272
9777755
    pub(crate) fn as_usize(&self) -> usize {
273
9777755
        self.0.into()
274
9777755
    }
275

            
276
    /// Convert this sub-cycle into a full Cycle timestamp.
277
    #[inline(always)]
278
4723420
    fn cycle(&self) -> Cycle {
279
4723420
        Cycle(
280
4723420
            (self.0 / Self::PER_CYCLE)
281
4723420
                .try_into()
282
4723420
                .expect("Cycle type wide enough for conversion from SubCycle"),
283
4723420
        )
284
4723420
    }
285

            
286
    /// Advance by an integer number of sub-cycles.
287
    ///
288
    /// Returns the new advanced [`SubCycle`], or `Err(())`
289
    /// if we reach the end of the schedule.
290
    #[inline(always)]
291
4723420
    fn add_usize(&self, n: usize) -> Result<Self, ()> {
292
4723420
        let result = self.as_usize() + n;
293
4723420
        if result < Self::MAX.0.into() {
294
4723420
            Ok(SubCycle(result.try_into().expect(
295
4723420
                "SubCycle type wide enough for full schedule size",
296
4723420
            )))
297
        } else {
298
            Err(())
299
        }
300
4723420
    }
301
}
302

            
303
/// Busy tracking for one CPU execution port
304
#[derive(Debug, Default, Clone)]
305
struct PortSchedule {
306
    /// Bit array indicating when this port is busy, indexed by Cycle
307
    busy: SimpleBitArray<
308
        { (usize::BITS as usize + model::SCHEDULE_SIZE - 1) / (usize::BITS as usize) },
309
    >,
310
}
311

            
312
/// Latency tracking for all relevant CPU registers
313
#[derive(Debug, Default, Clone)]
314
struct DataSchedule {
315
    /// Cycle count at which this register is available, indexed by register ID
316
    latencies: [Cycle; NUM_REGISTERS],
317
}
318

            
319
impl DataSchedule {
320
    /// Plan to finish a register write at the indicated cycle
321
    #[inline(always)]
322
4426890
    fn plan_register_write(&mut self, dst: RegisterId, cycle: Cycle) {
323
4426890
        self.latencies[dst.as_usize()] = cycle;
324
4426890
    }
325

            
326
    /// Look up if a register will be available at or before the indicated cycle.
327
    #[inline(always)]
328
57696080
    fn register_available(&self, reg: RegisterId, cycle: Cycle) -> bool {
329
57696080
        self.latencies[reg.as_usize()] <= cycle
330
57696080
    }
331

            
332
    /// Return the overall latency, the [`Cycle`] at which we expect
333
    /// every register to reach its latest simulated state.
334
    #[inline(always)]
335
8060
    fn overall_latency(&self) -> Cycle {
336
8060
        match self.latencies.iter().max() {
337
8060
            Some(cycle) => *cycle,
338
            None => Default::default(),
339
        }
340
8060
    }
341
}
342

            
343
/// Execution schedule for all ports
344
///
345
/// This is a scoreboard that keeps track of which CPU units are busy in which
346
/// cycles, so we can estimate a timestamp at which each instruction will write
347
/// its result.
348
#[derive(Debug, Default, Clone)]
349
struct ExecSchedule {
350
    /// Execution schedule (busy flags) for each port
351
    ports: [PortSchedule; model::NUM_EXECUTION_PORTS],
352
}
353

            
354
impl ExecSchedule {
355
    /// Calculate the next cycle at which we could schedule one micro-op.
356
    ///
357
    /// HashX always searches execution ports in the same order, and it will
358
    /// look ahead up to the entire length of the schedule before failing.
359
    #[inline(always)]
360
5318040
    fn micro_plan(&self, begin: Cycle, ports: model::ExecPorts) -> Result<MicroOpPlan, ()> {
361
5318040
        let mut cycle = begin;
362
        loop {
363
10345205
            for index in 0..(model::NUM_EXECUTION_PORTS as u8) {
364
10345205
                if (ports.0 & (1 << index)) != 0
365
6475885
                    && !self.ports[index as usize].busy.get(cycle.as_usize())
366
                {
367
5318040
                    return Ok(MicroOpPlan {
368
5318040
                        cycle,
369
5318040
                        port: ExecPortIndex { index },
370
5318040
                    });
371
5027165
                }
372
            }
373
            cycle = cycle.add_usize(1)?;
374
        }
375
5318040
    }
376

            
377
    /// Mark the schedule busy according to a previously calculated plan.
378
    #[inline(always)]
379
5312970
    fn mark_micro_busy(&mut self, plan: MicroOpPlan) {
380
5312970
        self.ports[plan.port.index as usize]
381
5312970
            .busy
382
5312970
            .set(plan.cycle.as_usize(), true);
383
5312970
    }
384

            
385
    /// Calculate a timing plan describing the cycle and execution units
386
    /// we could use for scheduling one entire instruction.
387
    ///
388
    /// This takes place after the [`Opcode`] has been chosen but before
389
    /// a full [`Instruction`] can be assembled.
390
    #[inline(always)]
391
4727320
    fn instruction_plan(&self, begin: Cycle, op: Opcode) -> Result<InstructionPlan, ()> {
392
4727320
        match model::micro_operations(op) {
393
            // Single-op instructions
394
4136600
            (single_port, None) => {
395
4136600
                InstructionPlan::from_micro_plans(self.micro_plan(begin, single_port)?, None)
396
            }
397

            
398
            // HashX schedules two-op instructions by searching forward
399
            // until we find the first cycle where both ports are
400
            // simultaneously available.
401
590720
            (first_port, Some(second_port)) => {
402
590720
                let mut cycle = begin;
403
                loop {
404
590720
                    if let (Ok(first_plan), Ok(second_plan)) = (
405
590720
                        self.micro_plan(cycle, first_port),
406
590720
                        self.micro_plan(cycle, second_port),
407
                    ) {
408
590720
                        if let Ok(joint_plan) =
409
590720
                            InstructionPlan::from_micro_plans(first_plan, Some(second_plan))
410
                        {
411
590720
                            return Ok(joint_plan);
412
                        }
413
                    }
414
                    cycle = cycle.add_usize(1)?;
415
                }
416
            }
417
        }
418
4727320
    }
419

            
420
    /// Mark each micro-op in an InstructionPlan as busy in the schedule.
421
    #[inline(always)]
422
4722250
    fn mark_instruction_busy(&mut self, plan: &InstructionPlan) {
423
4722250
        let (first, second) = plan.as_micro_plans();
424
4722250
        self.mark_micro_busy(first);
425
4722250
        if let Some(second) = second {
426
590720
            self.mark_micro_busy(second);
427
4131530
        }
428
4722250
    }
429
}
430

            
431
/// Detailed execution schedule for one micro-operation
432
///
433
/// Includes the [`Cycle`] it begins on, and the actual
434
/// execution port it was assigned.
435
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
436
struct MicroOpPlan {
437
    /// The Cycle this operation begins on
438
    cycle: Cycle,
439
    /// Index of the execution port it runs on
440
    port: ExecPortIndex,
441
}
442

            
443
/// Detailed execution schedule for one instruction
444
///
445
/// This is defined as either one or two micro-operations
446
/// scheduled on the same cycle.
447
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
448
pub(crate) struct InstructionPlan {
449
    /// The Cycle this whole instruction begins on
450
    cycle: Cycle,
451
    /// First execution port, always present
452
    first_port: ExecPortIndex,
453
    /// Optional second execution port ID, for two-uop instructions
454
    second_port: Option<ExecPortIndex>,
455
}
456

            
457
impl InstructionPlan {
458
    /// Get the Cycle this whole instruction begins on.
459
    #[inline(always)]
460
57696080
    pub(crate) fn cycle_issued(&self) -> Cycle {
461
57696080
        self.cycle
462
57696080
    }
463

            
464
    /// Calculate the cycle this instruction is complete by.
465
    #[inline(always)]
466
4426890
    pub(crate) fn cycle_retired(&self, op: Opcode) -> Cycle {
467
4426890
        self.cycle
468
4426890
            .add_usize(model::instruction_latency_cycles(op))
469
4426890
            .expect("instruction retired prior to end of schedule")
470
4426890
    }
471

            
472
    /// Convert this `InstructionPlan` back to one or two [`MicroOpPlan`]s.
473
    #[inline(always)]
474
4722250
    fn as_micro_plans(&self) -> (MicroOpPlan, Option<MicroOpPlan>) {
475
4722250
        (
476
4722250
            MicroOpPlan {
477
4722250
                cycle: self.cycle,
478
4722250
                port: self.first_port,
479
4722250
            },
480
4731338
            self.second_port.map(|port| MicroOpPlan {
481
590720
                cycle: self.cycle,
482
590720
                port,
483
4731338
            }),
484
4722250
        )
485
4722250
    }
486

            
487
    /// Bundle two [`MicroOpPlan`]s into an [`InstructionPlan`],
488
    /// if they are on matching cycles.
489
    ///
490
    /// Returns `Err(())` if the combination is not possible.
491
    #[inline(always)]
492
4727320
    fn from_micro_plans(first_op: MicroOpPlan, second_op: Option<MicroOpPlan>) -> Result<Self, ()> {
493
4727320
        let second_port = match second_op {
494
4136600
            None => None,
495
590720
            Some(second_op) => {
496
590720
                if first_op.cycle == second_op.cycle {
497
590720
                    Some(second_op.port)
498
                } else {
499
                    return Err(());
500
                }
501
            }
502
        };
503
4727320
        Ok(Self {
504
4727320
            cycle: first_op.cycle,
505
4727320
            first_port: first_op.port,
506
4727320
            second_port,
507
4727320
        })
508
4727320
    }
509
}
510

            
511
/// Simple packed bit array implementation
///
/// This could use the `bitvec` crate if we cared to depend on it, but the
/// functionality we need is so tiny let's keep this simple.
#[derive(Debug, Clone, Eq, PartialEq)]
struct SimpleBitArray<const N: usize> {
    /// Array of words to use as a bit vector, in LSB-first order
    inner: [usize; N],
}

impl<const N: usize> Default for SimpleBitArray<N> {
    #[inline(always)]
    fn default() -> Self {
        // All bits start cleared.
        Self { inner: [0_usize; N] }
    }
}

impl<const N: usize> SimpleBitArray<N> {
    /// Set or clear one bit in the array.
    ///
    /// Panics if the index is out of range.
    #[inline(always)]
    fn set(&mut self, index: usize, value: bool) {
        let bits_per_word = usize::BITS as usize;
        let mask = 1 << (index % bits_per_word);
        let word = &mut self.inner[index / bits_per_word];
        if value {
            *word |= mask;
        } else {
            *word &= !mask;
        }
    }

    /// Get one bit from the array.
    ///
    /// Panics if the index is out of range.
    #[inline(always)]
    fn get(&self, index: usize) -> bool {
        let bits_per_word = usize::BITS as usize;
        (self.inner[index / bits_per_word] >> (index % bits_per_word)) & 1 != 0
    }
}