tor_netdoc/parse/
parser.rs

1//! Based on a set of rules, validate a token stream and collect the
2//! tokens by type.
3//!
//! See the "rules" module for definitions of keyword types and
//! per-keyword rules.
6//!
7//! The key types in this module are SectionRules, which explains how to
8//! validate and partition a stream of Item, and Section, which contains
9//! a validated set of Item, ready to be interpreted.
10//!
11//! # Example
12//!
13//! (This is an internal API, so see the routerdesc.rs source for an
14//! example of use.)
15
16use crate::parse::keyword::Keyword;
17use crate::parse::rules::*;
18use crate::parse::tokenize::*;
19use crate::{NetdocErrorKind as EK, Result};
20
21use educe::Educe;
22
23/// Describe the rules for one section of a document.
24///
25/// The rules are represented as a mapping from token index to
26/// rules::TokenFmt.
27#[derive(Clone)]
28pub(crate) struct SectionRules<T: Keyword> {
29    /// A set of rules for decoding a series of tokens into a Section
30    /// object.  Each element of this array corresponds to the
31    /// token with the corresponding index values.
32    ///
33    /// When an array element is None, the corresponding keyword is
34    /// not allowed in this kind section.  Otherwise, the array
35    /// element is a TokenFmt describing how many of the corresponding
36    /// token may appear, and what they need to look like.
37    rules: Vec<Option<TokenFmt<T>>>,
38}
39
40/// The entry or entries for a particular keyword within a document.
41#[derive(Clone, Educe)]
42#[educe(Default)]
43struct TokVal<'a, K: Keyword>(Vec<Item<'a, K>>);
44
45impl<'a, K: Keyword> TokVal<'a, K> {
46    /// Return the number of Items for this value.
47    fn none() -> Self {
48        Default::default()
49    }
50    /// Return the number of Items for this value.
51    fn count(&self) -> usize {
52        self.0.len()
53    }
54    /// Return the first Item for this value, or None if there wasn't one.
55    fn first(&self) -> Option<&Item<'a, K>> {
56        self.0.first()
57    }
58    /// Return the Item for this value, if there is exactly one.
59    fn singleton(&self) -> Option<&Item<'a, K>> {
60        match &*self.0 {
61            [x] => Some(x),
62            _ => None,
63        }
64    }
65    /// Return all the Items for this value, as a slice.
66    fn as_slice(&self) -> &[Item<'a, K>] {
67        &self.0
68    }
69    /// Return the last Item for this value, if any.
70    fn last(&self) -> Option<&Item<'a, K>> {
71        self.0.last()
72    }
73}
74
75/// A Section is the result of sorting a document's entries by keyword.
76///
77/// TODO: I'd rather have this be pub(crate), but I haven't figured out
78/// how to make that work.
79pub struct Section<'a, T: Keyword> {
80    /// Map from Keyword index to TokVal
81    v: Vec<TokVal<'a, T>>,
82    /// The keyword that appeared first in this section.  This will
83    /// be set if `v` is nonempty.
84    first: Option<T>,
85    /// The keyword that appeared last in this section.  This will
86    /// be set if `v` is nonempty.
87    last: Option<T>,
88}
89
90impl<'a, T: Keyword> Section<'a, T> {
91    /// Make a new empty Section.
92    fn new() -> Self {
93        let n = T::n_vals();
94        let mut v = Vec::with_capacity(n);
95        v.resize(n, TokVal::none());
96        Section {
97            v,
98            first: None,
99            last: None,
100        }
101    }
102    /// Helper: return the tokval for some Keyword.
103    fn tokval(&self, t: T) -> &TokVal<'a, T> {
104        let idx = t.idx();
105        &self.v[idx]
106    }
107    /// Return all the Items for some Keyword, as a slice.
108    pub(crate) fn slice(&self, t: T) -> &[Item<'a, T>] {
109        self.tokval(t).as_slice()
110    }
111    /// Return a single Item for some Keyword, if there is exactly one.
112    pub(crate) fn get(&self, t: T) -> Option<&Item<'a, T>> {
113        self.tokval(t).singleton()
114    }
115    /// Return a single Item for some Keyword, giving an error if there
116    /// is not exactly one.
117    ///
118    /// It is usually a mistake to use this function on a Keyword that is
119    /// not required.
120    pub(crate) fn required(&self, t: T) -> Result<&Item<'a, T>> {
121        self.get(t)
122            .ok_or_else(|| EK::MissingToken.with_msg(t.to_str()))
123    }
124    /// Return a proxy MaybeItem object for some keyword.
125    //
126    /// A MaybeItem is used to represent an object that might or might
127    /// not be there.
128    pub(crate) fn maybe<'b>(&'b self, t: T) -> MaybeItem<'b, 'a, T> {
129        MaybeItem::from_option(self.get(t))
130    }
131    /// Return the first item that was accepted for this section, or None
132    /// if no items were accepted for this section.
133    pub(crate) fn first_item(&self) -> Option<&Item<'a, T>> {
134        match self.first {
135            None => None,
136            Some(t) => self.tokval(t).first(),
137        }
138    }
139    /// Return the last item that was accepted for this section, or None
140    /// if no items were accepted for this section.
141    pub(crate) fn last_item(&self) -> Option<&Item<'a, T>> {
142        match self.last {
143            None => None,
144            Some(t) => self.tokval(t).last(),
145        }
146    }
147    /// Insert an `item`.
148    ///
149    /// The `item` must have parsed Keyword `t`.
150    fn add_tok(&mut self, t: T, item: Item<'a, T>) {
151        let idx = Keyword::idx(t);
152        if idx >= self.v.len() {
153            self.v.resize(idx + 1, TokVal::none());
154        }
155        self.v[idx].0.push(item);
156        if self.first.is_none() {
157            self.first = Some(t);
158        }
159        self.last = Some(t);
160    }
161}
162
163/// A builder for a set of section rules.
164#[derive(Clone)]
165pub(crate) struct SectionRulesBuilder<T: Keyword> {
166    /// Have we been told, explicitly, to reject unrecognized tokens?
167    strict: bool,
168    /// The rules we're building.
169    rules: SectionRules<T>,
170}
171
172impl<T: Keyword> SectionRulesBuilder<T> {
173    /// Add a rule to this SectionRulesBuilder, based on a TokenFmtBuilder.
174    ///
175    /// Requires that no rule yet exists for the provided keyword.
176    pub(crate) fn add(&mut self, t: TokenFmtBuilder<T>) {
177        let rule: TokenFmt<_> = t.into();
178        let idx = rule.kwd().idx();
179        assert!(self.rules.rules[idx].is_none());
180        self.rules.rules[idx] = Some(rule);
181    }
182
183    /// Explicitly reject any unrecognized tokens.
184    ///
185    /// To avoid errors, you must either explicitly reject unrecognized tokens,
186    /// or you must define how they are handled.
187    pub(crate) fn reject_unrecognized(&mut self) {
188        self.strict = true;
189    }
190
191    /// Construct the SectionRules from this builder.
192    ///
193    /// # Panics
194    ///
195    /// Panics if you did not specify the behavior for unrecognized tokens,
196    /// using either `reject_unrecognized` or `add(UNRECOGNIZED.rule()...)`
197    pub(crate) fn build(self) -> SectionRules<T> {
198        let unrecognized_idx = T::unrecognized().idx();
199        assert!(
200            self.strict || self.rules.rules[unrecognized_idx].is_some(),
201            "BUG: Section has to handle UNRECOGNIZED tokens explicitly."
202        );
203        self.rules
204    }
205}
206
207impl<T: Keyword> SectionRules<T> {
208    /// Create a new builder for a SectionRules with no rules.
209    ///
210    /// By default, no Keyword is allowed by this SectionRules.
211    pub(crate) fn builder() -> SectionRulesBuilder<T> {
212        let n = T::n_vals();
213        let mut rules = Vec::with_capacity(n);
214        rules.resize(n, None);
215        SectionRulesBuilder {
216            strict: false,
217            rules: SectionRules { rules },
218        }
219    }
220
221    /// Parse a stream of tokens into a Section object without (fully)
222    /// verifying them.
223    ///
224    /// Some errors are detected early, but others only show up later
225    /// when we validate more carefully.
226    fn parse_unverified<'a, I>(&self, tokens: I, section: &mut Section<'a, T>) -> Result<()>
227    where
228        I: Iterator<Item = Result<Item<'a, T>>>,
229    {
230        for item in tokens {
231            let item = item?;
232
233            let tok = item.kwd();
234            let tok_idx = tok.idx();
235            if let Some(rule) = &self.rules[tok_idx] {
236                // we want this token.
237                assert!(rule.kwd() == tok);
238                section.add_tok(tok, item);
239                rule.check_multiplicity(section.v[tok_idx].as_slice())?;
240            } else {
241                // We don't have a rule for this token.
242                return Err(EK::UnexpectedToken
243                    .with_msg(tok.to_str())
244                    .at_pos(item.pos()));
245            }
246        }
247        Ok(())
248    }
249
250    /// Check whether the tokens in a section we've parsed conform to
251    /// these rules.
252    fn validate(&self, s: &Section<'_, T>) -> Result<()> {
253        // These vectors are both generated from T::n_vals().
254        assert_eq!(s.v.len(), self.rules.len());
255
256        // Iterate over every item, and make sure it matches the
257        // corresponding rule.
258        for (rule, t) in self.rules.iter().zip(s.v.iter()) {
259            match rule {
260                None => {
261                    // We aren't supposed to have any of these.
262                    if t.count() > 0 {
263                        unreachable!(
264                            "This item should have been rejected earlier, in parse_unverified()"
265                        );
266                    }
267                }
268                Some(rule) => {
269                    // We're allowed to have this. Is the number right?
270                    rule.check_multiplicity(t.as_slice())?;
271                    // The number is right. Check each individual item.
272                    for item in t.as_slice() {
273                        rule.check_item(item)?;
274                    }
275                }
276            }
277        }
278
279        Ok(())
280    }
281
282    /// Check all the base64-encoded objects on a given keyword.
283    ///
284    /// We use this to validate objects on unrecognized items, since
285    /// otherwise nothing would check that they are well-formed.
286    fn validate_objects(&self, s: &Section<'_, T>, kwd: T) -> Result<()> {
287        for item in s.slice(kwd).iter() {
288            let _ = item.obj_raw()?;
289        }
290        Ok(())
291    }
292
293    /// Parse a stream of tokens into a validated section.
294    pub(crate) fn parse<'a, I>(&self, tokens: I) -> Result<Section<'a, T>>
295    where
296        I: Iterator<Item = Result<Item<'a, T>>>,
297    {
298        let mut section = Section::new();
299        self.parse_unverified(tokens, &mut section)?;
300        self.validate(&section)?;
301        self.validate_objects(&section, T::unrecognized())?;
302        self.validate_objects(&section, T::ann_unrecognized())?;
303        Ok(section)
304    }
305}
306
307#[cfg(test)]
308mod test {
309    // @@ begin test lint list maintained by maint/add_warning @@
310    #![allow(clippy::bool_assert_comparison)]
311    #![allow(clippy::clone_on_copy)]
312    #![allow(clippy::dbg_macro)]
313    #![allow(clippy::mixed_attributes_style)]
314    #![allow(clippy::print_stderr)]
315    #![allow(clippy::print_stdout)]
316    #![allow(clippy::single_char_pattern)]
317    #![allow(clippy::unwrap_used)]
318    #![allow(clippy::unchecked_duration_subtraction)]
319    #![allow(clippy::useless_vec)]
320    #![allow(clippy::needless_pass_by_value)]
321    //! <!-- @@ end test lint list maintained by maint/add_warning @@ -->
322    use super::SectionRules;
323    use crate::parse::keyword::Keyword;
324    use crate::parse::macros::test::Fruit;
325    use crate::parse::tokenize::{Item, NetDocReader};
326    use crate::{Error, NetdocErrorKind as EK, Result};
327    use once_cell::sync::Lazy;
328
329    /// Rules for parsing a set of router annotations.
330    static FRUIT_SALAD: Lazy<SectionRules<Fruit>> = Lazy::new(|| {
331        use Fruit::*;
332        let mut rules = SectionRules::builder();
333        rules.add(ANN_TASTY.rule().required().args(1..=1));
334        rules.add(ORANGE.rule().args(1..));
335        rules.add(STONEFRUIT.rule().may_repeat());
336        rules.add(GUAVA.rule().obj_optional());
337        rules.add(LEMON.rule().no_args().obj_required());
338        rules.reject_unrecognized();
339        rules.build()
340    });
341
342    #[test]
343    fn parse_section() -> Result<()> {
344        use Fruit::*;
345        let s = "\
346@tasty yes
347orange soda
348cherry cobbler
349cherry pie
350plum compote
351guava fresh from 7 trees
352-----BEGIN GUAVA MANIFESTO-----
353VGhlIGd1YXZhIGVtb2ppIGlzIG5vdCBjdXJyZW50bHkgc3VwcG9ydGVkIGluI
354HVuaWNvZGUgMTMuMC4gTGV0J3MgZmlnaHQgYWdhaW5zdCBhbnRpLWd1YXZhIG
355JpYXMu
356-----END GUAVA MANIFESTO-----
357lemon
358-----BEGIN LEMON-----
3598J+Niw==
360-----END LEMON-----
361";
362        let r: NetDocReader<'_, Fruit> = NetDocReader::new(s).unwrap();
363        let sec = FRUIT_SALAD.parse(r).unwrap();
364
365        assert_eq!(sec.required(ANN_TASTY)?.arg(0), Some("yes"));
366
367        assert!(sec.get(ORANGE).is_some());
368        assert_eq!(sec.get(ORANGE).unwrap().args_as_str(), "soda");
369
370        let stonefruit_slice = sec.slice(STONEFRUIT);
371        assert_eq!(stonefruit_slice.len(), 3);
372        let kwds: Vec<&str> = stonefruit_slice.iter().map(Item::kwd_str).collect();
373        assert_eq!(kwds, &["cherry", "cherry", "plum"]);
374
375        assert_eq!(sec.maybe(GUAVA).args_as_str(), Some("fresh from 7 trees"));
376        assert_eq!(sec.maybe(GUAVA).parse_arg::<u32>(2).unwrap(), Some(7));
377        assert!(sec.maybe(GUAVA).parse_arg::<u32>(1).is_err());
378
379        // Try the `obj` accessor.
380        assert_eq!(sec.get(GUAVA).unwrap().obj("GUAVA MANIFESTO").unwrap(),
381                   &b"The guava emoji is not currently supported in unicode 13.0. Let's fight against anti-guava bias."[..]);
382        assert!(matches!(
383            sec.get(ORANGE)
384                .unwrap()
385                .obj("ORANGE MANIFESTO")
386                .unwrap_err()
387                .netdoc_error_kind(),
388            EK::MissingObject // orange you glad there isn't a manifesto?
389        ));
390
391        // Try `maybe_item` a bit.
392        let maybe_banana = sec.maybe(BANANA);
393        assert!(maybe_banana.parse_arg::<u32>(3).unwrap().is_none()); // yes! we have none.
394        let maybe_guava = sec.maybe(GUAVA);
395        assert_eq!(maybe_guava.parse_arg::<u32>(2).unwrap(), Some(7));
396
397        assert_eq!(
398            sec.get(ANN_TASTY).unwrap() as *const Item<'_, _>,
399            sec.first_item().unwrap() as *const Item<'_, _>
400        );
401
402        assert_eq!(
403            sec.get(LEMON).unwrap() as *const Item<'_, _>,
404            sec.last_item().unwrap() as *const Item<'_, _>
405        );
406
407        Ok(())
408    }
409
410    #[test]
411    fn rejected() {
412        use crate::Pos;
413        fn check(s: &str, e: &Error) {
414            let r: NetDocReader<'_, Fruit> = NetDocReader::new(s).unwrap();
415            let res = FRUIT_SALAD.parse(r);
416            assert!(res.is_err());
417            assert_eq!(&res.err().unwrap().within(s), e);
418        }
419
420        // unrecognized tokens aren't allowed here
421        check(
422            "orange foo\nfoobar x\n@tasty yes\n",
423            &EK::UnexpectedToken
424                .with_msg("<unrecognized>")
425                .at_pos(Pos::from_line(2, 1)),
426        );
427
428        // Only one orange per customer.
429        check(
430            "@tasty yes\norange foo\norange bar\n",
431            &EK::DuplicateToken
432                .with_msg("orange")
433                .at_pos(Pos::from_line(3, 1)),
434        );
435
436        // There needs to be a declaration of tastiness.
437        check("orange foo\n", &EK::MissingToken.with_msg("@tasty"));
438
439        // You can't have an orange without an argument.
440        check(
441            "@tasty nope\norange\n",
442            &EK::TooFewArguments
443                .with_msg("orange")
444                .at_pos(Pos::from_line(2, 1)),
445        );
446        // You can't have an more than one argument on "tasty".
447        check(
448            "@tasty yup indeed\norange normal\n",
449            &EK::TooManyArguments
450                .with_msg("@tasty")
451                .at_pos(Pos::from_line(1, 1)),
452        );
453
454        // Every lemon needs an object
455        check(
456            "@tasty yes\nlemon\norange no\n",
457            &EK::MissingObject
458                .with_msg("lemon")
459                .at_pos(Pos::from_line(2, 1)),
460        );
461
462        // oranges don't take an object.
463        check(
464            "@tasty yes\norange no\n-----BEGIN ORANGE-----\naaa\n-----END ORANGE-----\n",
465            &EK::UnexpectedObject
466                .with_msg("orange")
467                .at_pos(Pos::from_line(2, 1)),
468        );
469    }
470}