tor_netdoc/parse/
tokenize.rs

1//! Break a string into a set of directory-object Items.
2//!
3//! This module defines Item, which represents a basic entry in a
4//! directory document, and NetDocReader, which is used to break a
5//! string into Items.
6
7use crate::parse::keyword::Keyword;
8use crate::types::misc::FromBytes;
9use crate::util::PeekableIterator;
10use crate::{Error, NetdocErrorKind as EK, Pos, Result};
11use base64ct::{Base64, Encoding};
12use itertools::Itertools;
13use std::cell::{Ref, RefCell};
14use std::iter::Peekable;
15use std::str::FromStr;
16use tor_error::internal;
17
/// Useful constants for netdoc object syntax
pub(crate) mod object {
    /// indicates the start of an object (note the trailing space before the tag)
    pub(crate) const BEGIN_STR: &str = "-----BEGIN ";
    /// indicates the end of an object (note the trailing space before the tag)
    pub(crate) const END_STR: &str = "-----END ";
    /// indicates the end of a begin or end tag.
    pub(crate) const TAG_END: &str = "-----";
    /// Maximum PEM base64 line length (not enforced during parsing)
    #[cfg(feature = "hs-service")]
    pub(crate) const BASE64_PEM_MAX_LINE: usize = 64;
}
30
/// Return true iff a given character is "space" according to the rules
/// of dir-spec.txt
///
/// (Only ASCII space and tab count; newlines are line terminators, not
/// argument separators.)
pub(crate) fn is_sp(c: char) -> bool {
    matches!(c, ' ' | '\t')
}
36/// Check that all the characters in `s` are valid base64.
37///
38/// This is not a perfect check for base64ness -- it is mainly meant
39/// to help us recover after unterminated base64.
40fn b64check(s: &str) -> Result<()> {
41    for b in s.bytes() {
42        match b {
43            b'=' => (),
44            b'a'..=b'z' => (),
45            b'A'..=b'Z' => (),
46            b'0'..=b'9' => (),
47            b'/' | b'+' => (),
48            _ => {
49                return Err(EK::BadObjectBase64.at_pos(Pos::at(s)));
50            }
51        };
52    }
53    Ok(())
54}
55
/// A tagged object that is part of a directory Item.
///
/// This represents a single blob within a pair of "-----BEGIN
/// FOO-----" and "-----END FOO-----".  The data is not guaranteed to
/// be actual base64 when this object is created: doing so would
/// require either that we parse the base64 twice, or that we allocate
/// a buffer to hold the data before it's needed.
///
/// (All fields are borrowed slices of the document being parsed, so this
/// type is cheap to `Copy`.)
#[derive(Clone, Copy, Debug)]
pub(crate) struct Object<'a> {
    /// Reference to the "tag" string (the 'foo') in the BEGIN line.
    tag: &'a str,
    /// Reference to the allegedly base64-encoded data.  This may or
    /// may not actually be base64 at this point.
    data: &'a str,
    /// Reference to the END line for this object.  This doesn't
    /// need to be parsed, but it's used to find where this object
    /// ends.
    endline: &'a str,
}
75
/// A single part of a directory object.
///
/// Each Item -- called an "entry" in dir-spec.txt -- has a keyword, a
/// (possibly empty) set of arguments, and an optional object.
///
/// This is a zero-copy implementation that points to slices within a
/// containing string.
#[derive(Clone, Debug)]
pub(crate) struct Item<'a, K: Keyword> {
    /// The keyword that determines the type of this item.
    kwd: K,
    /// A reference to the actual string that defines the keyword for
    /// this item.
    kwd_str: &'a str,
    /// Reference to the arguments that appear in the same line after the
    /// keyword.  Does not include the terminating newline or the
    /// space that separates the keyword for its arguments.
    args: &'a str,
    /// The arguments, split by whitespace.  This vector is constructed
    /// as needed, using interior mutability.  (See `args_as_vec`.)
    split_args: RefCell<Option<Vec<&'a str>>>,
    /// If present, a base-64-encoded object that appeared at the end
    /// of this item.
    object: Option<Object<'a>>,
}
101
/// A cursor into a string that returns Items one by one.
///
/// (This type isn't used directly, but is returned wrapped in a Peekable.)
#[derive(Debug)]
struct NetDocReaderBase<'a, K: Keyword> {
    /// The string we're parsing.
    s: &'a str,
    /// Our position within the string, as a byte offset into `s`.
    off: usize,
    /// Tells Rust it's okay that we are parameterizing on K.
    _k: std::marker::PhantomData<K>,
}
114
impl<'a, K: Keyword> NetDocReaderBase<'a, K> {
    /// Create a new NetDocReader to split a string into tokens.
    ///
    /// Rejects `s` up front if it violates the netdoc UTF-8 rules
    /// (leading BOM marker or embedded NUL bytes).
    fn new(s: &'a str) -> Result<Self> {
        Ok(NetDocReaderBase {
            s: validate_utf_8_rules(s)?,
            off: 0,
            _k: std::marker::PhantomData,
        })
    }
    /// Return the current Pos within the string.
    fn pos(&self, pos: usize) -> Pos {
        Pos::from_offset(self.s, pos)
    }
    /// Skip forward by n bytes.
    ///
    /// (Note that standard caveats with byte-oriented processing of
    /// UTF-8 strings apply.)
    fn advance(&mut self, n: usize) -> Result<()> {
        if n > self.remaining() {
            // Advancing past the end is a programming error, not a
            // document error; report it as an internal bug.
            return Err(
                Error::from(internal!("tried to advance past end of document"))
                    .at_pos(Pos::from_offset(self.s, self.off)),
            );
        }
        self.off += n;
        Ok(())
    }
    /// Return the remaining number of bytes in this reader.
    fn remaining(&self) -> usize {
        self.s.len() - self.off
    }

    /// Return true if the next characters in this reader are `s`
    fn starts_with(&self, s: &str) -> bool {
        self.s[self.off..].starts_with(s)
    }
    /// Try to extract a NL-terminated line from this reader.  Always
    /// remove data if the reader is nonempty.
    fn line(&mut self) -> Result<&'a str> {
        let remainder = &self.s[self.off..];
        if let Some(nl_pos) = remainder.find('\n') {
            self.advance(nl_pos + 1)?;
            // Return the line without its terminating newline.
            let line = &remainder[..nl_pos];

            // TODO: we should probably detect \r and do something about it.
            // Just ignoring it isn't the right answer, though.
            Ok(line)
        } else {
            // No newline: consume everything that's left (so that we
            // still make progress), then report the truncated line.
            self.advance(remainder.len())?; // drain everything.
            Err(EK::TruncatedLine.at_pos(self.pos(self.s.len())))
        }
    }

    /// Try to extract a line that begins with a keyword from this reader.
    ///
    /// Returns a (kwd, args) tuple on success.
    fn kwdline(&mut self) -> Result<(&'a str, &'a str)> {
        let pos = self.off;
        let line = self.line()?;
        if line.is_empty() {
            return Err(EK::EmptyLine.at_pos(self.pos(pos)));
        }
        // Strip an initial "opt " prefix if present; a keyword that
        // followed "opt" is not allowed to be an annotation.
        let (line, anno_ok) = if let Some(rem) = line.strip_prefix("opt ") {
            (rem, false)
        } else {
            (line, true)
        };
        // Split off the keyword from the (optional) argument string.
        let mut parts_iter = line.splitn(2, [' ', '\t']);
        let kwd = match parts_iter.next() {
            Some(k) => k,
            // This case seems like it can't happen: split always returns
            // something, apparently.
            None => return Err(EK::MissingKeyword.at_pos(self.pos(pos))),
        };
        if !keyword_ok(kwd, anno_ok) {
            return Err(EK::BadKeyword.at_pos(self.pos(pos)));
        }
        // TODO(nickm): dir-spec does not yet allow unicode in the arguments, but we're
        // assuming that proposal 285 is accepted.
        let args = match parts_iter.next() {
            Some(a) => a,
            // take a zero-length slice, so it will be within the string.
            None => &kwd[kwd.len()..],
        };
        Ok((kwd, args))
    }

    /// Try to extract an Object beginning wrapped within BEGIN/END tags.
    ///
    /// Returns Ok(Some(Object(...))) on success if an object is
    /// found, Ok(None) if no object is found, and Err only if a
    /// corrupt object is found.
    fn object(&mut self) -> Result<Option<Object<'a>>> {
        use object::*;

        let pos = self.off;
        if !self.starts_with(BEGIN_STR) {
            return Ok(None);
        }
        let line = self.line()?;
        if !line.ends_with(TAG_END) {
            return Err(EK::BadObjectBeginTag.at_pos(self.pos(pos)));
        }
        let tag = &line[BEGIN_STR.len()..(line.len() - TAG_END.len())];
        if !tag_keywords_ok(tag) {
            return Err(EK::BadObjectBeginTag.at_pos(self.pos(pos)));
        }
        let datapos = self.off;
        // Consume lines until we reach one that starts with "-----END ".
        let (endlinepos, endline) = loop {
            let p = self.off;
            let line = self.line()?;
            if line.starts_with(END_STR) {
                break (p, line);
            }
            // Exit if this line isn't plausible base64.  Otherwise,
            // an unterminated base64 block could potentially
            // "consume" all the rest of the string, which would stop
            // us from recovering.
            b64check(line).map_err(|e| e.within(self.s))?;
        };
        let data = &self.s[datapos..endlinepos];
        if !endline.ends_with(TAG_END) {
            return Err(EK::BadObjectEndTag.at_pos(self.pos(endlinepos)));
        }
        let endtag = &endline[END_STR.len()..(endline.len() - TAG_END.len())];
        // The END tag must match the BEGIN tag exactly.
        if endtag != tag {
            return Err(EK::BadObjectMismatchedTag.at_pos(self.pos(endlinepos)));
        }
        Ok(Some(Object { tag, data, endline }))
    }

    /// Read the next Item from this NetDocReaderBase.
    ///
    /// If successful, returns Ok(Some(Item)), or Ok(None) if exhausted.
    /// Returns Err on failure.
    ///
    /// Always consumes at least one line if possible; always ends on a
    /// line boundary if one exists.
    fn item(&mut self) -> Result<Option<Item<'a, K>>> {
        if self.remaining() == 0 {
            return Ok(None);
        }
        let (kwd_str, args) = self.kwdline()?;
        // An Item may be followed immediately by a BEGIN/END object.
        let object = self.object()?;
        // Argument splitting is deferred until somebody asks for it.
        let split_args = RefCell::new(None);
        let kwd = K::from_str(kwd_str);
        Ok(Some(Item {
            kwd,
            kwd_str,
            args,
            split_args,
            object,
        }))
    }
}
270
/// Return true iff 's' is a valid keyword or annotation.
///
/// (Only allow annotations if `anno_ok` is true.)
fn keyword_ok(mut s: &str, anno_ok: bool) -> bool {
    /// Helper: return true if this character can appear in keywords.
    fn kwd_char_ok(c: char) -> bool {
        matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '-')
    }

    if s.is_empty() {
        return false;
    }
    // Annotations are keywords with a leading '@'; strip it before
    // validating the rest, but only where annotations are permitted.
    if anno_ok {
        if let Some(stripped) = s.strip_prefix('@') {
            s = stripped;
        }
    }
    // Keywords may contain '-', but may not begin with one.
    if s.starts_with('-') {
        return false;
    }
    s.chars().all(kwd_char_ok)
}
291
292/// Return true iff 's' is a valid keywords string for a BEGIN/END tag.
293pub(crate) fn tag_keywords_ok(s: &str) -> bool {
294    s.split(' ').all(|w| keyword_ok(w, false))
295}
296
297/// When used as an Iterator, returns a sequence of `Result<Item>`.
298impl<'a, K: Keyword> Iterator for NetDocReaderBase<'a, K> {
299    type Item = Result<Item<'a, K>>;
300    fn next(&mut self) -> Option<Self::Item> {
301        self.item().transpose()
302    }
303}
304
305/// Helper: as base64::decode(), but allows newlines in the middle of the
306/// encoded object.
307fn base64_decode_multiline(s: &str) -> std::result::Result<Vec<u8>, base64ct::Error> {
308    // base64 module hates whitespace.
309    let mut s = s.to_string();
310    s.retain(|ch| ch != '\n');
311    let v = Base64::decode_vec(&s)?;
312    Ok(v)
313}
314
impl<'a, K: Keyword> Item<'a, K> {
    /// Return the parsed keyword part of this item.
    pub(crate) fn kwd(&self) -> K {
        self.kwd
    }
    /// Return the keyword part of this item, as a string.
    pub(crate) fn kwd_str(&self) -> &'a str {
        self.kwd_str
    }
    /// Return true if the keyword for this item is in 'ks'.
    pub(crate) fn has_kwd_in(&self, ks: &[K]) -> bool {
        ks.contains(&self.kwd)
    }
    /// Return the arguments of this item, as a single string.
    pub(crate) fn args_as_str(&self) -> &'a str {
        self.args
    }
    /// Return the arguments of this item as a vector.
    fn args_as_vec(&self) -> Ref<'_, Vec<&'a str>> {
        // We're using an interior mutability pattern here to lazily
        // construct the vector.
        if self.split_args.borrow().is_none() {
            self.split_args.replace(Some(self.args().collect()));
        }
        Ref::map(self.split_args.borrow(), |opt| match opt {
            Some(v) => v,
            // Unreachable: `split_args` was populated just above.
            None => panic!(),
        })
    }
    /// Return an iterator over the arguments of this item.
    ///
    /// (Arguments are separated by spaces and/or tabs; empty strings
    /// are never yielded.)
    pub(crate) fn args(&self) -> impl Iterator<Item = &'a str> {
        self.args.split(is_sp).filter(|s| !s.is_empty())
    }
    /// Return the nth argument of this item, if there is one.
    pub(crate) fn arg(&self, idx: usize) -> Option<&'a str> {
        self.args_as_vec().get(idx).copied()
    }
    /// Return the nth argument of this item, or an error if it isn't there.
    pub(crate) fn required_arg(&self, idx: usize) -> Result<&'a str> {
        self.arg(idx)
            .ok_or_else(|| EK::MissingArgument.at_pos(Pos::at(self.args)))
    }
    /// Try to parse the nth argument (if it exists) into some type
    /// that supports FromStr.
    ///
    /// Returns Ok(None) if the argument doesn't exist.
    pub(crate) fn parse_optional_arg<V: FromStr>(&self, idx: usize) -> Result<Option<V>>
    where
        Error: From<V::Err>,
    {
        match self.arg(idx) {
            None => Ok(None),
            Some(s) => match s.parse() {
                Ok(r) => Ok(Some(r)),
                Err(e) => {
                    // Attach the argument's position unless the error
                    // already carries one.
                    let e: Error = e.into();
                    Err(e.or_at_pos(Pos::at(s)))
                }
            },
        }
    }
    /// Try to parse the nth argument (if it exists) into some type
    /// that supports FromStr.
    ///
    /// Return an error if the argument doesn't exist.
    pub(crate) fn parse_arg<V: FromStr>(&self, idx: usize) -> Result<V>
    where
        Error: From<V::Err>,
    {
        match self.parse_optional_arg(idx) {
            Ok(Some(v)) => Ok(v),
            Ok(None) => Err(EK::MissingArgument.at_pos(self.arg_pos(idx))),
            Err(e) => Err(e),
        }
    }
    /// Return the number of arguments for this Item
    pub(crate) fn n_args(&self) -> usize {
        self.args().count()
    }
    /// Return true iff this Item has an associated object.
    pub(crate) fn has_obj(&self) -> bool {
        self.object.is_some()
    }
    /// Return the tag of this item's associated object, if it has one.
    pub(crate) fn obj_tag(&self) -> Option<&'a str> {
        self.object.map(|o| o.tag)
    }
    /// Try to decode the base64 contents of this Item's associated object.
    ///
    /// On success, return the object's tag and decoded contents.
    pub(crate) fn obj_raw(&self) -> Result<Option<(&'a str, Vec<u8>)>> {
        match self.object {
            None => Ok(None),
            Some(obj) => {
                // The data was only checked for plausible base64 at
                // tokenizing time; real decoding happens here.
                let decoded = base64_decode_multiline(obj.data)
                    .map_err(|_| EK::BadObjectBase64.at_pos(Pos::at(obj.data)))?;
                Ok(Some((obj.tag, decoded)))
            }
        }
    }
    /// Try to decode the base64 contents of this Item's associated object,
    /// and make sure that its tag matches 'want_tag'.
    pub(crate) fn obj(&self, want_tag: &str) -> Result<Vec<u8>> {
        match self.obj_raw()? {
            None => Err(EK::MissingObject
                .with_msg(self.kwd.to_str())
                .at_pos(self.end_pos())),
            Some((tag, decoded)) => {
                if tag != want_tag {
                    Err(EK::WrongObject.at_pos(Pos::at(tag)))
                } else {
                    Ok(decoded)
                }
            }
        }
    }
    /// Try to decode the base64 contents of this item's associated object
    /// as a given type that implements FromBytes.
    pub(crate) fn parse_obj<V: FromBytes>(&self, want_tag: &str) -> Result<V> {
        let bytes = self.obj(want_tag)?;
        // Unwrap may be safe because above `.obj()` should return an Error if
        // wanted tag was not present
        #[allow(clippy::unwrap_used)]
        let p = Pos::at(self.object.unwrap().data);
        V::from_vec(bytes, p).map_err(|e| e.at_pos(p))
    }
    /// Return the position of this item.
    ///
    /// This position won't be useful unless it is later contextualized
    /// with the containing string.
    pub(crate) fn pos(&self) -> Pos {
        Pos::at(self.kwd_str)
    }
    /// Return the position of this Item in a string.
    ///
    /// Returns None if this item doesn't actually belong to the string.
    pub(crate) fn offset_in(&self, s: &str) -> Option<usize> {
        crate::util::str::str_offset(s, self.kwd_str)
    }
    /// Return the position of the n'th argument of this item.
    ///
    /// If this item does not have a n'th argument, return the
    /// position of the end of the final argument.
    pub(crate) fn arg_pos(&self, n: usize) -> Pos {
        let args = self.args_as_vec();
        if n < args.len() {
            Pos::at(args[n])
        } else {
            self.last_arg_end_pos()
        }
    }
    /// Return the position at the end of the last argument.  (This will
    /// point to a newline.)
    fn last_arg_end_pos(&self) -> Pos {
        Pos::at_end_of(self.args)
    }
    /// Return the position of the end of this object. (This will point to a
    /// newline.)
    pub(crate) fn end_pos(&self) -> Pos {
        match self.object {
            // With an object, the item ends at its "-----END" line.
            Some(o) => Pos::at_end_of(o.endline),
            None => self.last_arg_end_pos(),
        }
    }
    /// If this item occurs within s, return the byte offset
    /// immediately after the end of this item.
    pub(crate) fn offset_after(&self, s: &str) -> Option<usize> {
        // end_pos() points at the final newline, so one past it is the
        // first byte after the item.
        self.end_pos().offset_within(s).map(|nl_pos| nl_pos + 1)
    }

    /// Return the text of this item, if it originated within `str`,
    /// from the start of its keyword up to and including its final newline.
    #[allow(dead_code)] // unused when hsdesc not enabled.
    pub(crate) fn text_within<'b>(&self, s: &'b str) -> Option<&'b str> {
        let start = self.pos().offset_within(s)?;
        let end = self.end_pos().offset_within(s)?;
        // Inclusive range: `end` is the index of the newline itself.
        s.get(start..=end)
    }
}
494
/// Represents an Item that might not be present, whose arguments we
/// want to inspect.  If the Item is there, this acts like a proxy to the
/// item; otherwise, it treats the item as having no arguments.
///
/// (`'a` is the lifetime of the borrowed Item; `'b` is the lifetime of
/// the document text that the Item itself borrows from.)
pub(crate) struct MaybeItem<'a, 'b, K: Keyword>(Option<&'a Item<'b, K>>);
499
500// All methods here are as for Item.
501impl<'a, 'b, K: Keyword> MaybeItem<'a, 'b, K> {
502    /// Return the position of this item, if it has one.
503    fn pos(&self) -> Pos {
504        match self.0 {
505            Some(item) => item.pos(),
506            None => Pos::None,
507        }
508    }
509    /// Construct a MaybeItem from an Option reference to an item.
510    pub(crate) fn from_option(opt: Option<&'a Item<'b, K>>) -> Self {
511        MaybeItem(opt)
512    }
513
514    /// If this item is present, parse its argument at position `idx`.
515    /// Treat the absence or malformedness of the argument as an error,
516    /// but treat the absence of this item as acceptable.
517    #[cfg(any(test, feature = "routerdesc"))]
518    pub(crate) fn parse_arg<V: FromStr>(&self, idx: usize) -> Result<Option<V>>
519    where
520        Error: From<V::Err>,
521    {
522        match self.0 {
523            Some(item) => match item.parse_arg(idx) {
524                Ok(v) => Ok(Some(v)),
525                Err(e) => Err(e.or_at_pos(self.pos())),
526            },
527            None => Ok(None),
528        }
529    }
530    /// If this item is present, return its arguments as a single string.
531    pub(crate) fn args_as_str(&self) -> Option<&str> {
532        self.0.map(|item| item.args_as_str())
533    }
534    /// If this item is present, parse all of its arguments as a
535    /// single string.
536    pub(crate) fn parse_args_as_str<V: FromStr>(&self) -> Result<Option<V>>
537    where
538        Error: From<V::Err>,
539    {
540        match self.0 {
541            Some(item) => match item.args_as_str().parse::<V>() {
542                Ok(v) => Ok(Some(v)),
543                Err(e) => {
544                    let e: Error = e.into();
545                    Err(e.or_at_pos(self.pos()))
546                }
547            },
548            None => Ok(None),
549        }
550    }
551}
552
/// Extension trait for `Result<Item>` -- makes it convenient to implement
/// PauseAt predicates
pub(crate) trait ItemResult<K: Keyword> {
    /// Return true if this is an ok result with an annotation.
    fn is_ok_with_annotation(&self) -> bool;
    /// Return true if this is an ok result with a non-annotation.
    fn is_ok_with_non_annotation(&self) -> bool;
    /// Return true if this is an ok result with the keyword 'k'
    fn is_ok_with_kwd(&self, k: K) -> bool {
        // Default implementation: delegate to the one-element-slice case.
        self.is_ok_with_kwd_in(&[k])
    }
    /// Return true if this is an ok result with a keyword in the slice 'ks'
    fn is_ok_with_kwd_in(&self, ks: &[K]) -> bool;
    /// Return true if this is an ok result with a keyword not in the slice 'ks'
    fn is_ok_with_kwd_not_in(&self, ks: &[K]) -> bool;
    /// Return true if this is an empty-line error.
    fn is_empty_line(&self) -> bool;
}
571
572impl<'a, K: Keyword> ItemResult<K> for Result<Item<'a, K>> {
573    fn is_ok_with_annotation(&self) -> bool {
574        match self {
575            Ok(item) => item.kwd().is_annotation(),
576            Err(_) => false,
577        }
578    }
579    fn is_ok_with_non_annotation(&self) -> bool {
580        match self {
581            Ok(item) => !item.kwd().is_annotation(),
582            Err(_) => false,
583        }
584    }
585    fn is_ok_with_kwd_in(&self, ks: &[K]) -> bool {
586        match self {
587            Ok(item) => item.has_kwd_in(ks),
588            Err(_) => false,
589        }
590    }
591    fn is_ok_with_kwd_not_in(&self, ks: &[K]) -> bool {
592        match self {
593            Ok(item) => !item.has_kwd_in(ks),
594            Err(_) => false,
595        }
596    }
597    fn is_empty_line(&self) -> bool {
598        matches!(
599            self,
600            Err(e) if e.netdoc_error_kind() == crate::err::NetdocErrorKind::EmptyLine
601        )
602    }
603}
604
/// A peekable cursor into a string that returns Items one by one.
///
/// This is an [`Iterator`], yielding [`Item`]s.
#[derive(Debug)]
pub(crate) struct NetDocReader<'a, K: Keyword> {
    // TODO: I wish there were some way around having this string
    // reference, since we already need one inside NetDocReaderBase.
    /// The underlying string being parsed.
    s: &'a str,
    /// A stream of tokens being parsed by this NetDocReader.
    tokens: Peekable<NetDocReaderBase<'a, K>>,
}
617
impl<'a, K: Keyword> NetDocReader<'a, K> {
    /// Construct a new NetDocReader to read tokens from `s`.
    ///
    /// Fails if `s` violates the netdoc UTF-8 rules (see
    /// `NetDocReaderBase::new`).
    pub(crate) fn new(s: &'a str) -> Result<Self> {
        Ok(NetDocReader {
            s,
            tokens: NetDocReaderBase::new(s)?.peekable(),
        })
    }
    /// Return a reference to the string used for this NetDocReader.
    pub(crate) fn str(&self) -> &'a str {
        self.s
    }
    /// Return a wrapper around the peekable iterator in this
    /// NetDocReader that reads tokens until it reaches an element where
    /// 'f' is true.
    ///
    /// (The element that matches `f` is not consumed; it remains
    /// available via `peek`/`next` afterwards.)
    pub(crate) fn pause_at<'f, 'r, F>(
        &mut self,
        mut f: F,
    ) -> itertools::PeekingTakeWhile<'_, Self, impl FnMut(&Result<Item<'a, K>>) -> bool + 'f>
    where
        'f: 'r,
        F: FnMut(&Result<Item<'a, K>>) -> bool + 'f,
        K: 'f,
    {
        // peeking_take_while yields while its predicate holds, so we
        // negate `f` in order to *stop* when `f` becomes true.
        self.peeking_take_while(move |i| !f(i))
    }

    /// Return true if there are no more items in this NetDocReader.
    // The implementation sadly needs to mutate the inner state, even if it's not *semantically*
    // mutated..  We don't want inner mutability just to placate clippy for an internal API.
    #[allow(clippy::wrong_self_convention)]
    #[allow(dead_code)] // TODO perhaps we should remove this ?
    pub(crate) fn is_exhausted(&mut self) -> bool {
        self.peek().is_none()
    }

    /// Give an error if there are remaining tokens in this NetDocReader.
    pub(crate) fn should_be_exhausted(&mut self) -> Result<()> {
        match self.peek() {
            None => Ok(()),
            Some(Ok(t)) => Err(EK::UnexpectedToken
                .with_msg(t.kwd().to_str())
                .at_pos(t.pos())),
            Some(Err(e)) => Err(e.clone()),
        }
    }

    /// Give an error if there are remaining tokens in this NetDocReader.
    ///
    /// Like [`should_be_exhausted`](Self::should_be_exhausted),
    /// but permit empty lines at the end of the document.
    #[cfg(feature = "routerdesc")]
    pub(crate) fn should_be_exhausted_but_for_empty_lines(&mut self) -> Result<()> {
        // NOTE: this alias shadows the `K: Keyword` type parameter
        // within this function body.
        use crate::err::NetdocErrorKind as K;
        while let Some(Err(e)) = self.peek() {
            if e.netdoc_error_kind() == K::EmptyLine {
                // Consume the empty-line error and keep looking.
                let _ignore = self.next();
            } else {
                break;
            }
        }
        self.should_be_exhausted()
    }

    /// Return the position from which the underlying reader is about to take
    /// the next token.  Use to make sure that the reader is progressing.
    pub(crate) fn pos(&mut self) -> Pos {
        match self.tokens.peek() {
            Some(Ok(tok)) => tok.pos(),
            Some(Err(e)) => e.pos(),
            None => Pos::at_end_of(self.s),
        }
    }
}
692
impl<'a, K: Keyword> Iterator for NetDocReader<'a, K> {
    type Item = Result<Item<'a, K>>;
    // Delegate iteration to the inner peekable token stream.
    fn next(&mut self) -> Option<Self::Item> {
        self.tokens.next()
    }
}
699
impl<'a, K: Keyword> PeekableIterator for NetDocReader<'a, K> {
    // Delegate peeking to the inner Peekable token stream.
    fn peek(&mut self) -> Option<&Self::Item> {
        self.tokens.peek()
    }
}
705
706impl<'a, K: Keyword> itertools::PeekingNext for NetDocReader<'a, K> {
707    fn peeking_next<F>(&mut self, f: F) -> Option<Self::Item>
708    where
709        F: FnOnce(&Self::Item) -> bool,
710    {
711        if f(self.peek()?) {
712            self.next()
713        } else {
714            None
715        }
716    }
717}
718
719/// Check additional UTF-8 rules that the netdoc metaformat imposes on
720/// our documents.
721//
722// NOTE: We might decide in the future to loosen our rules here
723// for parsers that handle concatenated documents:
724// we might want to reject only those documents that contain NULs.
725// But with luck that will never be necessary.
726fn validate_utf_8_rules(s: &str) -> Result<&str> {
727    // No BOM, or mangled BOM, is allowed.
728    let first_char = s.chars().next();
729    if [Some('\u{feff}'), Some('\u{fffe}')].contains(&first_char) {
730        return Err(EK::BomMarkerFound.at_pos(Pos::at(s)));
731    }
732    // No NUL bytes are allowed.
733    if let Some(nul_pos) = memchr::memchr(0, s.as_bytes()) {
734        return Err(EK::NulFound.at_pos(Pos::from_byte(nul_pos)));
735    }
736    Ok(s)
737}
738
739#[cfg(test)]
740mod test {
741    // @@ begin test lint list maintained by maint/add_warning @@
742    #![allow(clippy::bool_assert_comparison)]
743    #![allow(clippy::clone_on_copy)]
744    #![allow(clippy::dbg_macro)]
745    #![allow(clippy::mixed_attributes_style)]
746    #![allow(clippy::print_stderr)]
747    #![allow(clippy::print_stdout)]
748    #![allow(clippy::single_char_pattern)]
749    #![allow(clippy::unwrap_used)]
750    #![allow(clippy::unchecked_duration_subtraction)]
751    #![allow(clippy::useless_vec)]
752    #![allow(clippy::needless_pass_by_value)]
753    //! <!-- @@ end test lint list maintained by maint/add_warning @@ -->
754    #![allow(clippy::cognitive_complexity)]
755    use super::*;
756    use crate::parse::macros::test::Fruit;
757    use crate::{NetdocErrorKind as EK, Pos, Result};
758
759    #[test]
760    fn read_simple() {
761        use Fruit::*;
762
763        let s = "\
764@tasty very much so
765opt apple 77
766banana 60
767cherry 6
768-----BEGIN CHERRY SYNOPSIS-----
7698J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
770-----END CHERRY SYNOPSIS-----
771plum hello there
772";
773        let mut r: NetDocReader<'_, Fruit> = NetDocReader::new(s).unwrap();
774
775        assert_eq!(r.str(), s);
776        assert!(r.should_be_exhausted().is_err()); // it's not exhausted.
777
778        let toks: Result<Vec<_>> = r.by_ref().collect();
779        assert!(r.should_be_exhausted().is_ok());
780
781        let toks = toks.unwrap();
782        assert_eq!(toks.len(), 5);
783        assert_eq!(toks[0].kwd(), ANN_TASTY);
784        assert_eq!(toks[0].n_args(), 3);
785        assert_eq!(toks[0].args_as_str(), "very much so");
786        assert_eq!(toks[0].arg(1), Some("much"));
787        {
788            let a: Vec<_> = toks[0].args().collect();
789            assert_eq!(a, vec!["very", "much", "so"]);
790        }
791        assert!(toks[0].parse_arg::<usize>(0).is_err());
792        assert!(toks[0].parse_arg::<usize>(10).is_err());
793        assert!(!toks[0].has_obj());
794        assert_eq!(toks[0].obj_tag(), None);
795
796        assert_eq!(toks[2].pos().within(s), Pos::from_line(3, 1));
797        assert_eq!(toks[2].arg_pos(0).within(s), Pos::from_line(3, 8));
798        assert_eq!(toks[2].last_arg_end_pos().within(s), Pos::from_line(3, 10));
799        assert_eq!(toks[2].end_pos().within(s), Pos::from_line(3, 10));
800
801        assert_eq!(toks[3].kwd(), STONEFRUIT);
802        assert_eq!(toks[3].kwd_str(), "cherry"); // not cherry/plum!
803        assert_eq!(toks[3].n_args(), 1);
804        assert_eq!(toks[3].required_arg(0), Ok("6"));
805        assert_eq!(toks[3].parse_arg::<usize>(0), Ok(6));
806        assert_eq!(toks[3].parse_optional_arg::<usize>(0), Ok(Some(6)));
807        assert_eq!(toks[3].parse_optional_arg::<usize>(3), Ok(None));
808        assert!(toks[3].has_obj());
809        assert_eq!(toks[3].obj_tag(), Some("CHERRY SYNOPSIS"));
810        assert_eq!(
811            &toks[3].obj("CHERRY SYNOPSIS").unwrap()[..],
812            "🍒🍒🍒🍒🍒🍒".as_bytes()
813        );
814        assert!(toks[3].obj("PLUOT SYNOPSIS").is_err());
815        // this "end-pos" value is questionable!
816        assert_eq!(toks[3].end_pos().within(s), Pos::from_line(7, 30));
817    }
818
    #[test]
    fn test_badtoks() {
        use Fruit::*;

        // A document exercising many of the tokenizer's error paths.
        // The comments on the assertions below refer to these input lines
        // by their 1-based line number within `s`.
        let s = "\
-foobar 9090
apple 3.14159
$hello
unrecognized 127.0.0.1 foo
plum
-----BEGIN WHATEVER-----
8J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
-----END SOMETHING ELSE-----
orange
orange
-----BEGIN WHATEVER-----
not! base64!
-----END WHATEVER-----
guava paste
opt @annotation
orange
-----BEGIN LOBSTER
8J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
-----END SOMETHING ELSE-----
orange
-----BEGIN !!!!!!-----
8J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
-----END !!!!!!-----
cherry
-----BEGIN CHERRY SYNOPSIS-----
8J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
-----END CHERRY SYNOPSIS

truncated line";

        let r: NetDocReader<'_, Fruit> = NetDocReader::new(s).unwrap();
        let toks: Vec<_> = r.collect();

        // Line 1: a keyword may not begin with '-'.
        assert!(toks[0].is_err());
        assert_eq!(
            toks[0].as_ref().err().unwrap(),
            &EK::BadKeyword.at_pos(Pos::from_line(1, 1))
        );

        // Line 2: a valid recognized item; also check the `is_ok_with_*`
        // predicates on an Ok token.
        assert!(toks[1].is_ok());
        assert!(toks[1].is_ok_with_non_annotation());
        assert!(!toks[1].is_ok_with_annotation());
        assert!(toks[1].is_ok_with_kwd_in(&[APPLE, ORANGE]));
        assert!(toks[1].is_ok_with_kwd_not_in(&[ORANGE, UNRECOGNIZED]));
        let t = toks[1].as_ref().unwrap();
        assert_eq!(t.kwd(), APPLE);
        assert_eq!(t.arg(0), Some("3.14159"));

        // Line 3: '$' is not a valid keyword character; all the
        // `is_ok_with_*` predicates are false on an Err token.
        assert!(toks[2].is_err());
        assert!(!toks[2].is_ok_with_non_annotation());
        assert!(!toks[2].is_ok_with_annotation());
        assert!(!toks[2].is_ok_with_kwd_in(&[APPLE, ORANGE]));
        assert!(!toks[2].is_ok_with_kwd_not_in(&[ORANGE, UNRECOGNIZED]));
        assert_eq!(
            toks[2].as_ref().err().unwrap(),
            &EK::BadKeyword.at_pos(Pos::from_line(3, 1))
        );

        // Line 4: an unrecognized (but well-formed) keyword still tokenizes.
        assert!(toks[3].is_ok());
        let t = toks[3].as_ref().unwrap();
        assert_eq!(t.kwd(), UNRECOGNIZED);
        assert_eq!(t.arg(1), Some("foo"));

        // Lines 5-8: an object whose END tag (line 8) doesn't match its
        // BEGIN tag.
        assert!(toks[4].is_err());
        assert_eq!(
            toks[4].as_ref().err().unwrap(),
            &EK::BadObjectMismatchedTag.at_pos(Pos::from_line(8, 1))
        );

        // Line 9: a plain valid item with no arguments.
        assert!(toks[5].is_ok());
        let t = toks[5].as_ref().unwrap();
        assert_eq!(t.kwd(), ORANGE);
        assert_eq!(t.args_as_str(), "");

        // This blob counts as two errors: a bad base64 blob, and
        // then an end line.
        // Lines 10-13: the non-base64 body on line 12 is one error, and the
        // orphaned END line (13) is then re-read as a (bad) keyword line.
        assert!(toks[6].is_err());
        assert_eq!(
            toks[6].as_ref().err().unwrap(),
            &EK::BadObjectBase64.at_pos(Pos::from_line(12, 1))
        );

        assert!(toks[7].is_err());
        assert_eq!(
            toks[7].as_ref().err().unwrap(),
            &EK::BadKeyword.at_pos(Pos::from_line(13, 1))
        );

        // Line 14: valid.
        assert!(toks[8].is_ok());
        let t = toks[8].as_ref().unwrap();
        assert_eq!(t.kwd(), GUAVA);

        // this is an error because you can't use opt with annotations.
        // (Line 15.)
        assert!(toks[9].is_err());
        assert_eq!(
            toks[9].as_ref().err().unwrap(),
            &EK::BadKeyword.at_pos(Pos::from_line(15, 1))
        );

        // this looks like a few errors.
        // Lines 16-19: the BEGIN line (17) lacks its closing "-----", so the
        // object is rejected, and lines 18 and 19 are re-read as (bad)
        // keyword lines.
        assert!(toks[10].is_err());
        assert_eq!(
            toks[10].as_ref().err().unwrap(),
            &EK::BadObjectBeginTag.at_pos(Pos::from_line(17, 1))
        );
        assert!(toks[11].is_err());
        assert_eq!(
            toks[11].as_ref().err().unwrap(),
            &EK::BadKeyword.at_pos(Pos::from_line(18, 1))
        );
        assert!(toks[12].is_err());
        assert_eq!(
            toks[12].as_ref().err().unwrap(),
            &EK::BadKeyword.at_pos(Pos::from_line(19, 1))
        );

        // so does this.
        // Lines 20-23: "!!!!!!" is not a valid tag, with the same cascade of
        // follow-on keyword errors for lines 22 and 23.
        assert!(toks[13].is_err());
        assert_eq!(
            toks[13].as_ref().err().unwrap(),
            &EK::BadObjectBeginTag.at_pos(Pos::from_line(21, 1))
        );
        assert!(toks[14].is_err());
        assert_eq!(
            toks[14].as_ref().err().unwrap(),
            &EK::BadKeyword.at_pos(Pos::from_line(22, 1))
        );
        assert!(toks[15].is_err());
        assert_eq!(
            toks[15].as_ref().err().unwrap(),
            &EK::BadKeyword.at_pos(Pos::from_line(23, 1))
        );

        // not this.
        // Lines 24-27: only one error here — the END line (27) is missing its
        // trailing "-----".
        assert!(toks[16].is_err());
        assert_eq!(
            toks[16].as_ref().err().unwrap(),
            &EK::BadObjectEndTag.at_pos(Pos::from_line(27, 1))
        );

        // Line 28: blank lines are not allowed.
        assert!(toks[17].is_err());
        assert_eq!(
            toks[17].as_ref().err().unwrap(),
            &EK::EmptyLine.at_pos(Pos::from_line(28, 1))
        );

        // Line 29: the final line has no terminating newline.
        assert!(toks[18].is_err());
        assert_eq!(
            toks[18].as_ref().err().unwrap(),
            &EK::TruncatedLine.at_pos(Pos::from_line(29, 15))
        );
    }
976
977    #[test]
978    fn test_leading_space_forbidden() {
979        // We need to make sure that items with a leading space aren't accepted:
980        // the spec forbids it, and it can provide a vector for inflating the size
981        // of downloaded hsdescs (see prop360).
982
983        // Try a simple item with a space at the front.
984        let s = "    guava space\n";
985        let r: NetDocReader<'_, Fruit> = NetDocReader::new(s).unwrap();
986        let toks: Vec<_> = r.collect();
987
988        // No space allowed at the start of a line.
989        assert_eq!(
990            toks[0].as_ref().err().unwrap(),
991            &EK::BadKeyword.at_pos(Pos::from_line(1, 1))
992        );
993
994        // Try an item with an object, inserting space at the start of each ine in turn.
995        let s = "cherry
996-----BEGIN WHATEVER-----
9978J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
998-----END WHATEVER-----
999";
1000
1001        let orig_lines = s
1002            .split_terminator('\n')
1003            .map(str::to_string)
1004            .collect::<Vec<_>>();
1005        assert_eq!(orig_lines.len(), 4);
1006        let expected_kinds = [
1007            EK::BadKeyword,
1008            EK::BadKeyword,
1009            EK::BadObjectBase64,
1010            EK::BadObjectBase64,
1011        ];
1012        for pos in 0..orig_lines.len() {
1013            let mut lines = orig_lines.clone();
1014            lines[pos] = format!(" {}", lines[pos]);
1015            let joined = format!("{}\n", lines.join("\n"));
1016
1017            let r: NetDocReader<'_, Fruit> = NetDocReader::new(&joined).unwrap();
1018            let toks: Result<Vec<_>> = r.collect();
1019            assert_eq!(toks.unwrap_err().netdoc_error_kind(), expected_kinds[pos]);
1020        }
1021    }
1022
1023    #[test]
1024    fn test_validate_strings() {
1025        use validate_utf_8_rules as v;
1026        assert_eq!(v(""), Ok(""));
1027        assert_eq!(v("hello world"), Ok("hello world"));
1028        // We don't have to test a lot more valid cases, since this function is called before
1029        // parsing any string.
1030
1031        for s in ["\u{feff}", "\u{feff}hello world", "\u{fffe}hello world"] {
1032            let e = v(s).unwrap_err();
1033            assert_eq!(e.netdoc_error_kind(), EK::BomMarkerFound);
1034            assert_eq!(e.pos().offset_within(s), Some(0));
1035        }
1036
1037        for s in [
1038            "\0hello world",
1039            "\0",
1040            "\0\0\0",
1041            "hello\0world",
1042            "hello world\0",
1043        ] {
1044            let e = v(s).unwrap_err();
1045            assert_eq!(e.netdoc_error_kind(), EK::NulFound);
1046            let nul_pos = e.pos().offset_within(s).unwrap();
1047            assert_eq!(s.as_bytes()[nul_pos], 0);
1048        }
1049    }
1050
1051    fn single_fruit(s: &str) -> Item<'_, Fruit> {
1052        NetDocReader::<Fruit>::new(s)
1053            .unwrap()
1054            .next()
1055            .unwrap()
1056            .unwrap()
1057    }
1058
1059    #[test]
1060    fn end_of_item() {
1061        let s = "guava friends 123   \n";
1062        let item = single_fruit(s);
1063        assert_eq!(
1064            item.end_pos().within(s),
1065            Pos::from_byte(s.find('\n').unwrap()).within(s)
1066        );
1067
1068        let s = "cherry
1069-----BEGIN WHATEVER-----
10708J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
1071-----END WHATEVER-----\n";
1072        let item = single_fruit(s);
1073        dbg!(&item);
1074        assert_eq!(
1075            item.end_pos().within(s),
1076            Pos::from_byte(s.rfind('\n').unwrap()).within(s)
1077        );
1078    }
1079}