//! Break a string into a set of directory-object Items.
//!
//! This module defines Item, which represents a basic entry in a
//! directory document, and NetDocReader, which is used to break a
//! string into Items.

            
7
use crate::parse::keyword::Keyword;
8
use crate::types::misc::FromBytes;
9
use crate::util::PeekableIterator;
10
use crate::{Error, NetdocErrorKind as EK, Pos, Result};
11
use base64ct::{Base64, Encoding};
12
use itertools::Itertools;
13
use std::cell::{Ref, RefCell};
14
use std::iter::Peekable;
15
use std::str::FromStr;
16
use tor_error::internal;
17

            
18
/// Useful constants for netdoc object syntax
pub(crate) mod object {
    /// Prefix that opens an object's BEGIN line; the tag follows it.
    pub(crate) const BEGIN_STR: &str = "-----BEGIN ";
    /// Prefix that opens an object's END line; the tag follows it.
    pub(crate) const END_STR: &str = "-----END ";
    /// Suffix that closes both a BEGIN line and an END line.
    pub(crate) const TAG_END: &str = "-----";
    /// Maximum PEM base64 line length (not enforced during parsing)
    #[cfg(feature = "hs-service")]
    pub(crate) const BASE64_PEM_MAX_LINE: usize = 64;
}
30

            
31
/// Return true iff a given character is "space" according to the rules
/// of dir-spec.txt
///
/// Only the ASCII space and the horizontal tab qualify; newlines and all
/// other whitespace characters do not.
pub(crate) fn is_sp(c: char) -> bool {
    matches!(c, ' ' | '\t')
}
36
/// Check that all the characters in `s` are valid base64.
37
///
38
/// This is not a perfect check for base64ness -- it is mainly meant
39
/// to help us recover after unterminated base64.
40
93454
fn b64check(s: &str) -> Result<()> {
41
5694802
    for b in s.bytes() {
42
5694802
        match b {
43
16736
            b'=' => (),
44
2293083
            b'a'..=b'z' => (),
45
2380207
            b'A'..=b'Z' => (),
46
851488
            b'0'..=b'9' => (),
47
153282
            b'/' | b'+' => (),
48
            _ => {
49
6
                return Err(EK::BadObjectBase64.at_pos(Pos::at(s)));
50
            }
51
        };
52
    }
53
93448
    Ok(())
54
93454
}
55

            
56
/// A tagged object that is part of a directory Item.
///
/// This represents a single blob within a pair of "-----BEGIN
/// FOO-----" and "-----END FOO-----".  The data is not guaranteed to
/// be actual base64 when this object is created: doing so would
/// require either that we parse the base64 twice, or that we allocate
/// a buffer to hold the data before it's needed.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Object<'a> {
    /// Reference to the "tag" string (the 'foo') in the BEGIN line.
    tag: &'a str,
    /// Reference to the allegedly base64-encoded data.  This may or
    /// may not actually be base64 at this point.
    data: &'a str,
    /// Reference to the END line for this object.  This doesn't
    /// need to be parsed, but it's used to find where this object
    /// ends.
    endline: &'a str,
}
75

            
76
/// A single part of a directory object.
77
///
78
/// Each Item -- called an "entry" in dir-spec.txt -- has a keyword, a
79
/// (possibly empty) set of arguments, and an optional object.
80
///
81
/// This is a zero-copy implementation that points to slices within a
82
/// containing string.
83
#[derive(Clone, Debug)]
84
pub(crate) struct Item<'a, K: Keyword> {
85
    /// The keyword that determines the type of this item.
86
    kwd: K,
87
    /// A reference to the actual string that defines the keyword for
88
    /// this item.
89
    kwd_str: &'a str,
90
    /// Reference to the arguments that appear in the same line after the
91
    /// keyword.  Does not include the terminating newline or the
92
    /// space that separates the keyword for its arguments.
93
    args: &'a str,
94
    /// The arguments, split by whitespace.  This vector is constructed
95
    /// as needed, using interior mutability.
96
    split_args: RefCell<Option<Vec<&'a str>>>,
97
    /// If present, a base-64-encoded object that appeared at the end
98
    /// of this item.
99
    object: Option<Object<'a>>,
100
}
101

            
102
/// A cursor into a string that returns Items one by one.
103
///
104
/// (This type isn't used directly, but is returned wrapped in a Peekable.)
105
#[derive(Debug)]
106
struct NetDocReaderBase<'a, K: Keyword> {
107
    /// The string we're parsing.
108
    s: &'a str,
109
    /// Our position within the string.
110
    off: usize,
111
    /// Tells Rust it's okay that we are parameterizing on K.
112
    _k: std::marker::PhantomData<K>,
113
}
114

            
115
impl<'a, K: Keyword> NetDocReaderBase<'a, K> {
116
    /// Create a new NetDocReader to split a string into tokens.
117
2997
    fn new(s: &'a str) -> Result<Self> {
118
2997
        Ok(NetDocReaderBase {
119
2997
            s: validate_utf_8_rules(s)?,
120
            off: 0,
121
2997
            _k: std::marker::PhantomData,
122
        })
123
2997
    }
124
    /// Return the current Pos within the string.
125
144
    fn pos(&self, pos: usize) -> Pos {
126
144
        Pos::from_offset(self.s, pos)
127
144
    }
128
    /// Skip forward by n bytes.
129
    ///
130
    /// (Note that standard caveats with byte-oriented processing of
131
    /// UTF-8 strings apply.)
132
169300
    fn advance(&mut self, n: usize) -> Result<()> {
133
169300
        if n > self.remaining() {
134
            return Err(
135
                Error::from(internal!("tried to advance past end of document"))
136
                    .at_pos(Pos::from_offset(self.s, self.off)),
137
            );
138
169300
        }
139
169300
        self.off += n;
140
169300
        Ok(())
141
169300
    }
142
    /// Return the remaining number of bytes in this reader.
143
224172
    fn remaining(&self) -> usize {
144
224172
        self.s.len() - self.off
145
224172
    }
146

            
147
    /// Return true if the next characters in this reader are `s`
148
51824
    fn starts_with(&self, s: &str) -> bool {
149
51824
        self.s[self.off..].starts_with(s)
150
51824
    }
151
    /// Try to extract a NL-terminated line from this reader.  Always
152
    /// remove data if the reader is nonempty.
153
169300
    fn line(&mut self) -> Result<&'a str> {
154
169300
        let remainder = &self.s[self.off..];
155
169300
        if let Some(nl_pos) = remainder.find('\n') {
156
169202
            self.advance(nl_pos + 1)?;
157
169202
            let line = &remainder[..nl_pos];
158
169202

            
159
169202
            // TODO: we should probably detect \r and do something about it.
160
169202
            // Just ignoring it isn't the right answer, though.
161
169202
            Ok(line)
162
        } else {
163
98
            self.advance(remainder.len())?; // drain everything.
164
98
            Err(EK::TruncatedLine.at_pos(self.pos(self.s.len())))
165
        }
166
169300
    }
167

            
168
    /// Try to extract a line that begins with a keyword from this reader.
169
    ///
170
    /// Returns a (kwd, args) tuple on success.
171
51960
    fn kwdline(&mut self) -> Result<(&'a str, &'a str)> {
172
51960
        let pos = self.off;
173
51960
        let line = self.line()?;
174
51862
        if line.is_empty() {
175
14
            return Err(EK::EmptyLine.at_pos(self.pos(pos)));
176
51848
        }
177
51848
        let (line, anno_ok) = if let Some(rem) = line.strip_prefix("opt ") {
178
4
            (rem, false)
179
        } else {
180
51844
            (line, true)
181
        };
182
51848
        let mut parts_iter = line.splitn(2, [' ', '\t']);
183
51848
        let kwd = match parts_iter.next() {
184
51848
            Some(k) => k,
185
            // This case seems like it can't happen: split always returns
186
            // something, apparently.
187
            None => return Err(EK::MissingKeyword.at_pos(self.pos(pos))),
188
        };
189
51848
        if !keyword_ok(kwd, anno_ok) {
190
24
            return Err(EK::BadKeyword.at_pos(self.pos(pos)));
191
51824
        }
192
        // TODO(nickm): dir-spec does not yet allow unicode in the arguments, but we're
193
        // assuming that proposal 285 is accepted.
194
51824
        let args = match parts_iter.next() {
195
37047
            Some(a) => a,
196
            // take a zero-length slice, so it will be within the string.
197
14777
            None => &kwd[kwd.len()..],
198
        };
199
51824
        Ok((kwd, args))
200
51960
    }
201

            
202
    /// Try to extract an Object beginning wrapped within BEGIN/END tags.
203
    ///
204
    /// Returns Ok(Some(Object(...))) on success if an object is
205
    /// found, Ok(None) if no object is found, and Err only if a
206
    /// corrupt object is found.
207
51824
    fn object(&mut self) -> Result<Option<Object<'a>>> {
208
        use object::*;
209

            
210
51824
        let pos = self.off;
211
51824
        if !self.starts_with(BEGIN_STR) {
212
37041
            return Ok(None);
213
14783
        }
214
14783
        let line = self.line()?;
215
14783
        if !line.ends_with(TAG_END) {
216
2
            return Err(EK::BadObjectBeginTag.at_pos(self.pos(pos)));
217
14781
        }
218
14781
        let tag = &line[BEGIN_STR.len()..(line.len() - TAG_END.len())];
219
14781
        if !tag_keywords_ok(tag) {
220
2
            return Err(EK::BadObjectBeginTag.at_pos(self.pos(pos)));
221
14779
        }
222
14779
        let datapos = self.off;
223
14773
        let (endlinepos, endline) = loop {
224
102557
            let p = self.off;
225
102557
            let line = self.line()?;
226
102557
            if line.starts_with(END_STR) {
227
14773
                break (p, line);
228
87784
            }
229
87784
            // Exit if this line isn't plausible base64.  Otherwise,
230
87784
            // an unterminated base64 block could potentially
231
87784
            // "consume" all the rest of the string, which would stop
232
87784
            // us from recovering.
233
87784
            b64check(line).map_err(|e| e.within(self.s))?;
234
        };
235
14773
        let data = &self.s[datapos..endlinepos];
236
14773
        if !endline.ends_with(TAG_END) {
237
2
            return Err(EK::BadObjectEndTag.at_pos(self.pos(endlinepos)));
238
14771
        }
239
14771
        let endtag = &endline[END_STR.len()..(endline.len() - TAG_END.len())];
240
14771
        if endtag != tag {
241
2
            return Err(EK::BadObjectMismatchedTag.at_pos(self.pos(endlinepos)));
242
14769
        }
243
14769
        Ok(Some(Object { tag, data, endline }))
244
51824
    }
245

            
246
    /// Read the next Item from this NetDocReaderBase.
247
    ///
248
    /// If successful, returns Ok(Some(Item)), or Ok(None) if exhausted.
249
    /// Returns Err on failure.
250
    ///
251
    /// Always consumes at least one line if possible; always ends on a
252
    /// line boundary if one exists.
253
54872
    fn item(&mut self) -> Result<Option<Item<'a, K>>> {
254
54872
        if self.remaining() == 0 {
255
2912
            return Ok(None);
256
51960
        }
257
51960
        let (kwd_str, args) = self.kwdline()?;
258
51824
        let object = self.object()?;
259
51810
        let split_args = RefCell::new(None);
260
51810
        let kwd = K::from_str(kwd_str);
261
51810
        Ok(Some(Item {
262
51810
            kwd,
263
51810
            kwd_str,
264
51810
            args,
265
51810
            split_args,
266
51810
            object,
267
51810
        }))
268
54872
    }
269
}
270

            
271
/// Return true iff 's' is a valid keyword or annotation.
///
/// A keyword is a nonempty run of ASCII letters, ASCII digits and `-`
/// that does not begin with `-`.  An annotation is a keyword preceded
/// by a single `@`.
///
/// (Only allow annotations if `anno_ok` is true.)
fn keyword_ok(s: &str, anno_ok: bool) -> bool {
    /// Helper: return true if this character can appear in keywords.
    fn kwd_char_ok(c: char) -> bool {
        matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '-')
    }

    // Peel off the annotation marker when annotations are permitted.  When
    // they are not, a leading '@' stays in place and fails the character
    // check below.
    let body = match s.strip_prefix('@') {
        Some(rest) if anno_ok => rest,
        _ => s,
    };
    // Emptiness is tested *after* removing the '@', so that a bare "@"
    // (an annotation marker with no keyword behind it) is rejected too.
    !body.is_empty() && !body.starts_with('-') && body.chars().all(kwd_char_ok)
}
291

            
292
/// Return true iff 's' is a valid keywords string for a BEGIN/END tag.
293
15802
pub(crate) fn tag_keywords_ok(s: &str) -> bool {
294
31519
    s.split(' ').all(|w| keyword_ok(w, false))
295
15802
}
296

            
297
/// When used as an Iterator, returns a sequence of `Result<Item>`.
298
impl<'a, K: Keyword> Iterator for NetDocReaderBase<'a, K> {
299
    type Item = Result<Item<'a, K>>;
300
54872
    fn next(&mut self) -> Option<Self::Item> {
301
54872
        self.item().transpose()
302
54872
    }
303
}
304

            
305
/// Helper: as base64::decode(), but allows newlines in the middle of the
306
/// encoded object.
307
15656
pub(crate) fn base64_decode_multiline(s: &str) -> std::result::Result<Vec<u8>, base64ct::Error> {
308
15656
    // base64 module hates whitespace.
309
15656
    let mut s = s.to_string();
310
5785270
    s.retain(|ch| ch != '\n');
311
15656
    let v = Base64::decode_vec(&s)?;
312
15656
    Ok(v)
313
15656
}
314

            
315
impl<'a, K: Keyword> Item<'a, K> {
316
    /// Return the parsed keyword part of this item.
317
126402
    pub(crate) fn kwd(&self) -> K {
318
126402
        self.kwd
319
126402
    }
320
    /// Return the keyword part of this item, as a string.
321
2404
    pub(crate) fn kwd_str(&self) -> &'a str {
322
2404
        self.kwd_str
323
2404
    }
324
    /// Return true if the keyword for this item is in 'ks'.
325
48950
    pub(crate) fn has_kwd_in(&self, ks: &[K]) -> bool {
326
48950
        ks.contains(&self.kwd)
327
48950
    }
328
    /// Return the arguments of this item, as a single string.
329
20739
    pub(crate) fn args_as_str(&self) -> &'a str {
330
20739
        self.args
331
20739
    }
332
    /// Return the arguments of this item as a vector.
333
57833
    fn args_as_vec(&self) -> Ref<'_, Vec<&'a str>> {
334
57833
        // We're using an interior mutability pattern here to lazily
335
57833
        // construct the vector.
336
57833
        if self.split_args.borrow().is_none() {
337
27569
            self.split_args.replace(Some(self.args().collect()));
338
30264
        }
339
57833
        Ref::map(self.split_args.borrow(), |opt| match opt {
340
57833
            Some(v) => v,
341
            None => panic!(),
342
57833
        })
343
57833
    }
344
    /// Return an iterator over the arguments of this item.
345
101742
    pub(crate) fn args(&self) -> impl Iterator<Item = &'a str> + use<'a, K> {
346
293444
        self.args.split(is_sp).filter(|s| !s.is_empty())
347
101742
    }
348
    /// Return the nth argument of this item, if there is one.
349
57825
    pub(crate) fn arg(&self, idx: usize) -> Option<&'a str> {
350
57825
        self.args_as_vec().get(idx).copied()
351
57825
    }
352
    /// Return the nth argument of this item, or an error if it isn't there.
353
20626
    pub(crate) fn required_arg(&self, idx: usize) -> Result<&'a str> {
354
20626
        self.arg(idx)
355
20626
            .ok_or_else(|| EK::MissingArgument.at_pos(Pos::at(self.args)))
356
20626
    }
357
    /// Try to parse the nth argument (if it exists) into some type
358
    /// that supports FromStr.
359
    ///
360
    /// Returns Ok(None) if the argument doesn't exist.
361
35005
    pub(crate) fn parse_optional_arg<V: FromStr>(&self, idx: usize) -> Result<Option<V>>
362
35005
    where
363
35005
        Error: From<V::Err>,
364
35005
    {
365
35005
        match self.arg(idx) {
366
6
            None => Ok(None),
367
34999
            Some(s) => match s.parse() {
368
34995
                Ok(r) => Ok(Some(r)),
369
4
                Err(e) => {
370
4
                    let e: Error = e.into();
371
4
                    Err(e.or_at_pos(Pos::at(s)))
372
                }
373
            },
374
        }
375
35005
    }
376
    /// Try to parse the nth argument (if it exists) into some type
377
    /// that supports FromStr.
378
    ///
379
    /// Return an error if the argument doesn't exist.
380
34997
    pub(crate) fn parse_arg<V: FromStr>(&self, idx: usize) -> Result<V>
381
34997
    where
382
34997
        Error: From<V::Err>,
383
34997
    {
384
34997
        match self.parse_optional_arg(idx) {
385
34991
            Ok(Some(v)) => Ok(v),
386
2
            Ok(None) => Err(EK::MissingArgument.at_pos(self.arg_pos(idx))),
387
4
            Err(e) => Err(e),
388
        }
389
34997
    }
390
    /// Return the number of arguments for this Item
391
72152
    pub(crate) fn n_args(&self) -> usize {
392
72152
        self.args().count()
393
72152
    }
394
    /// Return true iff this Item has an associated object.
395
52070
    pub(crate) fn has_obj(&self) -> bool {
396
52070
        self.object.is_some()
397
52070
    }
398
    /// Return the tag of this item's associated object, if it has one.
399
169
    pub(crate) fn obj_tag(&self) -> Option<&'a str> {
400
169
        self.object.map(|o| o.tag)
401
169
    }
402
    /// Try to decode the base64 contents of this Item's associated object.
403
    ///
404
    /// On success, return the object's tag and decoded contents.
405
17732
    pub(crate) fn obj_raw(&self) -> Result<Option<(&'a str, Vec<u8>)>> {
406
17732
        match self.object {
407
2142
            None => Ok(None),
408
15590
            Some(obj) => {
409
15590
                let decoded = base64_decode_multiline(obj.data)
410
15590
                    .map_err(|_| EK::BadObjectBase64.at_pos(Pos::at(obj.data)))?;
411
15590
                Ok(Some((obj.tag, decoded)))
412
            }
413
        }
414
17732
    }
415
    /// Try to decode the base64 contents of this Item's associated object,
416
    /// and make sure that its tag matches 'want_tag'.
417
15592
    pub(crate) fn obj(&self, want_tag: &str) -> Result<Vec<u8>> {
418
15592
        match self.obj_raw()? {
419
2
            None => Err(EK::MissingObject
420
2
                .with_msg(self.kwd.to_str())
421
2
                .at_pos(self.end_pos())),
422
15590
            Some((tag, decoded)) => {
423
15590
                if tag != want_tag {
424
4
                    Err(EK::WrongObject.at_pos(Pos::at(tag)))
425
                } else {
426
15586
                    Ok(decoded)
427
                }
428
            }
429
        }
430
15592
    }
431
    /// Try to decode the base64 contents of this item's associated object
432
    /// as a given type that implements FromBytes.
433
9961
    pub(crate) fn parse_obj<V: FromBytes>(&self, want_tag: &str) -> Result<V> {
434
9961
        let bytes = self.obj(want_tag)?;
435
        // Unwrap may be safe because above `.obj()` should return an Error if
436
        // wanted tag was not present
437
        #[allow(clippy::unwrap_used)]
438
9961
        let p = Pos::at(self.object.unwrap().data);
439
9961
        V::from_vec(bytes, p).map_err(|e| e.at_pos(p))
440
9961
    }
441
    /// Return the position of this item.
442
    ///
443
    /// This position won't be useful unless it is later contextualized
444
    /// with the containing string.
445
2937
    pub(crate) fn pos(&self) -> Pos {
446
2937
        Pos::at(self.kwd_str)
447
2937
    }
448
    /// Return the position of this Item in a string.
449
    ///
450
    /// Returns None if this item doesn't actually belong to the string.
451
8372
    pub(crate) fn offset_in(&self, s: &str) -> Option<usize> {
452
8372
        crate::util::str::str_offset(s, self.kwd_str)
453
8372
    }
454
    /// Return the position of the n'th argument of this item.
455
    ///
456
    /// If this item does not have a n'th argument, return the
457
    /// position of the end of the final argument.
458
8
    pub(crate) fn arg_pos(&self, n: usize) -> Pos {
459
8
        let args = self.args_as_vec();
460
8
        if n < args.len() {
461
6
            Pos::at(args[n])
462
        } else {
463
2
            self.last_arg_end_pos()
464
        }
465
8
    }
466
    /// Return the position at the end of the last argument.  (This will
467
    /// point to a newline.)
468
569
    fn last_arg_end_pos(&self) -> Pos {
469
569
        Pos::at_end_of(self.args)
470
569
    }
471
    /// Return the position of the end of this object. (This will point to a
472
    /// newline.)
473
740
    pub(crate) fn end_pos(&self) -> Pos {
474
740
        match self.object {
475
175
            Some(o) => Pos::at_end_of(o.endline),
476
565
            None => self.last_arg_end_pos(),
477
        }
478
740
    }
479
    /// If this item occurs within s, return the byte offset
480
    /// immediately after the end of this item.
481
394
    pub(crate) fn offset_after(&self, s: &str) -> Option<usize> {
482
394
        self.end_pos().offset_within(s).map(|nl_pos| nl_pos + 1)
483
394
    }
484

            
485
    /// Return the text of this item, if it originated within `str`,
486
    /// from the start of its keyword up to and including its final newline.
487
    #[allow(dead_code)] // unused when hsdesc not enabled.
488
165
    pub(crate) fn text_within<'b>(&self, s: &'b str) -> Option<&'b str> {
489
165
        let start = self.pos().offset_within(s)?;
490
165
        let end = self.end_pos().offset_within(s)?;
491
165
        s.get(start..=end)
492
165
    }
493
}
494

            
495
/// Represents an Item that might not be present, whose arguments we
496
/// want to inspect.  If the Item is there, this acts like a proxy to the
497
/// item; otherwise, it treats the item as having no arguments.
498
pub(crate) struct MaybeItem<'a, 'b, K: Keyword>(Option<&'a Item<'b, K>>);
499

            
500
// All methods here are as for Item.
501
impl<'a, 'b, K: Keyword> MaybeItem<'a, 'b, K> {
502
    /// Return the position of this item, if it has one.
503
4
    fn pos(&self) -> Pos {
504
4
        match self.0 {
505
4
            Some(item) => item.pos(),
506
            None => Pos::None,
507
        }
508
4
    }
509
    /// Construct a MaybeItem from an Option reference to an item.
510
10963
    pub(crate) fn from_option(opt: Option<&'a Item<'b, K>>) -> Self {
511
10963
        MaybeItem(opt)
512
10963
    }
513

            
514
    /// If this item is present, parse its argument at position `idx`.
515
    /// Treat the absence or malformedness of the argument as an error,
516
    /// but treat the absence of this item as acceptable.
517
    #[cfg(any(test, feature = "routerdesc"))]
518
2000
    pub(crate) fn parse_arg<V: FromStr>(&self, idx: usize) -> Result<Option<V>>
519
2000
    where
520
2000
        Error: From<V::Err>,
521
2000
    {
522
2000
        match self.0 {
523
1998
            Some(item) => match item.parse_arg(idx) {
524
1996
                Ok(v) => Ok(Some(v)),
525
2
                Err(e) => Err(e.or_at_pos(self.pos())),
526
            },
527
2
            None => Ok(None),
528
        }
529
2000
    }
530
    /// If this item is present, return its arguments as a single string.
531
3624
    pub(crate) fn args_as_str(&self) -> Option<&str> {
532
3624
        self.0.map(|item| item.args_as_str())
533
3624
    }
534
    /// If this item is present, parse all of its arguments as a
535
    /// single string.
536
5339
    pub(crate) fn parse_args_as_str<V: FromStr>(&self) -> Result<Option<V>>
537
5339
    where
538
5339
        Error: From<V::Err>,
539
5339
    {
540
5339
        match self.0 {
541
2259
            Some(item) => match item.args_as_str().parse::<V>() {
542
2257
                Ok(v) => Ok(Some(v)),
543
2
                Err(e) => {
544
2
                    let e: Error = e.into();
545
2
                    Err(e.or_at_pos(self.pos()))
546
                }
547
            },
548
3080
            None => Ok(None),
549
        }
550
5339
    }
551
}
552

            
553
/// Extension trait for `Result<Item>` -- makes it convenient to implement
554
/// PauseAt predicates
555
pub(crate) trait ItemResult<K: Keyword> {
556
    /// Return true if this is an ok result with an annotation.
557
    fn is_ok_with_annotation(&self) -> bool;
558
    /// Return true if this is an ok result with a non-annotation.
559
    fn is_ok_with_non_annotation(&self) -> bool;
560
    /// Return true if this is an ok result with the keyword 'k'
561
8014
    fn is_ok_with_kwd(&self, k: K) -> bool {
562
8014
        self.is_ok_with_kwd_in(&[k])
563
8014
    }
564
    /// Return true if this is an ok result with a keyword in the slice 'ks'
565
    fn is_ok_with_kwd_in(&self, ks: &[K]) -> bool;
566
    /// Return true if this is an ok result with a keyword not in the slice 'ks'
567
    fn is_ok_with_kwd_not_in(&self, ks: &[K]) -> bool;
568
    /// Return true if this is an empty-line error.
569
    fn is_empty_line(&self) -> bool;
570
}
571

            
572
impl<'a, K: Keyword> ItemResult<K> for Result<Item<'a, K>> {
573
4026
    fn is_ok_with_annotation(&self) -> bool {
574
4026
        match self {
575
4012
            Ok(item) => item.kwd().is_annotation(),
576
14
            Err(_) => false,
577
        }
578
4026
    }
579
42
    fn is_ok_with_non_annotation(&self) -> bool {
580
42
        match self {
581
38
            Ok(item) => !item.kwd().is_annotation(),
582
4
            Err(_) => false,
583
        }
584
42
    }
585
42946
    fn is_ok_with_kwd_in(&self, ks: &[K]) -> bool {
586
42946
        match self {
587
42930
            Ok(item) => item.has_kwd_in(ks),
588
16
            Err(_) => false,
589
        }
590
42946
    }
591
6116
    fn is_ok_with_kwd_not_in(&self, ks: &[K]) -> bool {
592
6116
        match self {
593
6020
            Ok(item) => !item.has_kwd_in(ks),
594
96
            Err(_) => false,
595
        }
596
6116
    }
597
4020
    fn is_empty_line(&self) -> bool {
598
12
        matches!(
599
12
            self,
600
12
            Err(e) if e.netdoc_error_kind() == crate::err::NetdocErrorKind::EmptyLine
601
        )
602
4020
    }
603
}
604

            
605
/// A peekable cursor into a string that returns Items one by one.
606
///
607
/// This is an [`Iterator`], yielding [`Item`]s.
608
#[derive(Debug)]
609
pub(crate) struct NetDocReader<'a, K: Keyword> {
610
    // TODO: I wish there were some way around having this string
611
    // reference, since we already need one inside NetDocReaderBase.
612
    /// The underlying string being parsed.
613
    s: &'a str,
614
    /// A stream of tokens being parsed by this NetDocReader.
615
    tokens: Peekable<NetDocReaderBase<'a, K>>,
616
}
617

            
618
impl<'a, K: Keyword> NetDocReader<'a, K> {
619
    /// Construct a new NetDocReader to read tokens from `s`.
620
2997
    pub(crate) fn new(s: &'a str) -> Result<Self> {
621
2997
        Ok(NetDocReader {
622
2997
            s,
623
2997
            tokens: NetDocReaderBase::new(s)?.peekable(),
624
        })
625
2997
    }
626
    /// Return a reference to the string used for this NetDocReader.
627
2765
    pub(crate) fn str(&self) -> &'a str {
628
2765
        self.s
629
2765
    }
630
    /// Return a wrapper around the peekable iterator in this
631
    /// NetDocReader that reads tokens until it reaches an element where
632
    /// 'f' is true.
633
6814
    pub(crate) fn pause_at<'f, 'r, F>(
634
6814
        &mut self,
635
6814
        mut f: F,
636
6814
    ) -> itertools::PeekingTakeWhile<
637
6814
        '_,
638
6814
        Self,
639
6814
        impl FnMut(&Result<Item<'a, K>>) -> bool + 'f + use<'a, 'f, F, K>,
640
6814
    >
641
6814
    where
642
6814
        'f: 'r,
643
6814
        F: FnMut(&Result<Item<'a, K>>) -> bool + 'f,
644
6814
        K: 'f,
645
6814
    {
646
47839
        self.peeking_take_while(move |i| !f(i))
647
6814
    }
648

            
649
    /// Return true if there are no more items in this NetDocReader.
650
    // The implementation sadly needs to mutate the inner state, even if it's not *semantically*
651
    // mutated..  We don't want inner mutability just to placate clippy for an internal API.
652
    #[allow(clippy::wrong_self_convention)]
653
    #[allow(dead_code)] // TODO perhaps we should remove this ?
654
    pub(crate) fn is_exhausted(&mut self) -> bool {
655
        self.peek().is_none()
656
    }
657

            
658
    /// Give an error if there are remaining tokens in this NetDocReader.
659
2096
    pub(crate) fn should_be_exhausted(&mut self) -> Result<()> {
660
2096
        match self.peek() {
661
2094
            None => Ok(()),
662
2
            Some(Ok(t)) => Err(EK::UnexpectedToken
663
2
                .with_msg(t.kwd().to_str())
664
2
                .at_pos(t.pos())),
665
            Some(Err(e)) => Err(e.clone()),
666
        }
667
2096
    }
668

            
669
    /// Give an error if there are remaining tokens in this NetDocReader.
670
    ///
671
    /// Like [`should_be_exhausted`](Self::should_be_exhausted),
672
    /// but permit empty lines at the end of the document.
673
    #[cfg(feature = "routerdesc")]
674
1980
    pub(crate) fn should_be_exhausted_but_for_empty_lines(&mut self) -> Result<()> {
675
        use crate::err::NetdocErrorKind as K;
676
1982
        while let Some(Err(e)) = self.peek() {
677
2
            if e.netdoc_error_kind() == K::EmptyLine {
678
2
                let _ignore = self.next();
679
2
            } else {
680
                break;
681
            }
682
        }
683
1980
        self.should_be_exhausted()
684
1980
    }
685

            
686
    /// Return the position from which the underlying reader is about to take
687
    /// the next token.  Use to make sure that the reader is progressing.
688
503
    pub(crate) fn pos(&mut self) -> Pos {
689
503
        match self.tokens.peek() {
690
497
            Some(Ok(tok)) => tok.pos(),
691
2
            Some(Err(e)) => e.pos(),
692
4
            None => Pos::at_end_of(self.s),
693
        }
694
503
    }
695
}
696

            
697
impl<'a, K: Keyword> Iterator for NetDocReader<'a, K> {
698
    type Item = Result<Item<'a, K>>;
699
52359
    fn next(&mut self) -> Option<Self::Item> {
700
52359
        self.tokens.next()
701
52359
    }
702
}
703

            
704
// Peeking is delegated to the wrapped peekable token stream.
impl<'a, K: Keyword> PeekableIterator for NetDocReader<'a, K> {
    fn peek(&mut self) -> Option<&Self::Item> {
        self.tokens.peek()
    }
}
709

            
710
impl<'a, K: Keyword> itertools::PeekingNext for NetDocReader<'a, K> {
    fn peeking_next<F>(&mut self, f: F) -> Option<Self::Item>
    where
        F: FnOnce(&Self::Item) -> bool,
    {
        // Nothing left to look at: return None without calling `f`.
        let accepted = f(self.peek()?);
        // Only consume the element when the predicate accepted it.
        if accepted { self.next() } else { None }
    }
}
718

            
719
/// Check additional UTF-8 rules that the netdoc metaformat imposes on
720
/// our documents.
721
//
722
// NOTE: We might decide in the future to loosen our rules here
723
// for parsers that handle concatenated documents:
724
// we might want to reject only those documents that contain NULs.
725
// But with luck that will never be necessary.
726
3377
fn validate_utf_8_rules(s: &str) -> Result<&str> {
727
3377
    // No BOM, or mangled BOM, is allowed.
728
3377
    let first_char = s.chars().next();
729
3377
    if [Some('\u{feff}'), Some('\u{fffe}')].contains(&first_char) {
730
6
        return Err(EK::BomMarkerFound.at_pos(Pos::at(s)));
731
3371
    }
732
    // No NUL bytes are allowed.
733
3371
    if let Some(nul_pos) = memchr::memchr(0, s.as_bytes()) {
734
10
        return Err(EK::NulFound.at_pos(Pos::from_byte(nul_pos)));
735
3361
    }
736
3361
    Ok(s)
737
3377
}
738

            
739
#[cfg(test)]
740
mod test {
741
    // @@ begin test lint list maintained by maint/add_warning @@
742
    #![allow(clippy::bool_assert_comparison)]
743
    #![allow(clippy::clone_on_copy)]
744
    #![allow(clippy::dbg_macro)]
745
    #![allow(clippy::mixed_attributes_style)]
746
    #![allow(clippy::print_stderr)]
747
    #![allow(clippy::print_stdout)]
748
    #![allow(clippy::single_char_pattern)]
749
    #![allow(clippy::unwrap_used)]
750
    #![allow(clippy::unchecked_duration_subtraction)]
751
    #![allow(clippy::useless_vec)]
752
    #![allow(clippy::needless_pass_by_value)]
753
    //! <!-- @@ end test lint list maintained by maint/add_warning @@ -->
754
    #![allow(clippy::cognitive_complexity)]
755
    use super::*;
756
    use crate::parse::macros::test::Fruit;
757
    use crate::{NetdocErrorKind as EK, Pos, Result};
758

            
759
    /// Tokenize a small well-formed document and check the accessors
    /// (keyword, arguments, positions, object) of the resulting items.
    #[test]
    fn read_simple() {
        use Fruit::*;

        let s = "\
@tasty very much so
opt apple 77
banana 60
cherry 6
-----BEGIN CHERRY SYNOPSIS-----
8J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
-----END CHERRY SYNOPSIS-----
plum hello there
";
        let mut r: NetDocReader<'_, Fruit> = NetDocReader::new(s).unwrap();

        assert_eq!(r.str(), s);
        assert!(r.should_be_exhausted().is_err()); // it's not exhausted.

        let toks: Result<Vec<_>> = r.by_ref().collect();
        assert!(r.should_be_exhausted().is_ok());

        let toks = toks.unwrap();
        assert_eq!(toks.len(), 5);

        // Item 0: the `@tasty` annotation and its three arguments.
        assert_eq!(toks[0].kwd(), ANN_TASTY);
        assert_eq!(toks[0].n_args(), 3);
        assert_eq!(toks[0].args_as_str(), "very much so");
        assert_eq!(toks[0].arg(1), Some("much"));
        {
            let a: Vec<_> = toks[0].args().collect();
            assert_eq!(a, vec!["very", "much", "so"]);
        }
        // Argument 0 is not a number, and argument 10 does not exist.
        assert!(toks[0].parse_arg::<usize>(0).is_err());
        assert!(toks[0].parse_arg::<usize>(10).is_err());
        assert!(!toks[0].has_obj());
        assert_eq!(toks[0].obj_tag(), None);

        // Item 2 is `banana 60`, on line 3 of the document.
        assert_eq!(toks[2].pos().within(s), Pos::from_line(3, 1));
        assert_eq!(toks[2].arg_pos(0).within(s), Pos::from_line(3, 8));
        assert_eq!(toks[2].last_arg_end_pos().within(s), Pos::from_line(3, 10));
        assert_eq!(toks[2].end_pos().within(s), Pos::from_line(3, 10));

        // Item 3 is `cherry 6`, which carries an object.
        assert_eq!(toks[3].kwd(), STONEFRUIT);
        assert_eq!(toks[3].kwd_str(), "cherry"); // not cherry/plum!
        assert_eq!(toks[3].n_args(), 1);
        assert_eq!(toks[3].required_arg(0), Ok("6"));
        assert_eq!(toks[3].parse_arg::<usize>(0), Ok(6));
        assert_eq!(toks[3].parse_optional_arg::<usize>(0), Ok(Some(6)));
        assert_eq!(toks[3].parse_optional_arg::<usize>(3), Ok(None));
        assert!(toks[3].has_obj());
        assert_eq!(toks[3].obj_tag(), Some("CHERRY SYNOPSIS"));
        assert_eq!(
            &toks[3].obj("CHERRY SYNOPSIS").unwrap()[..],
            "🍒🍒🍒🍒🍒🍒".as_bytes()
        );
        // Asking for the object under the wrong tag is an error.
        assert!(toks[3].obj("PLUOT SYNOPSIS").is_err());
        // this "end-pos" value is questionable!
        assert_eq!(toks[3].end_pos().within(s), Pos::from_line(7, 30));
    }
818

            
819
    /// Tokenize a document full of malformed entries, and check that each
    /// problem is reported with the expected error kind and position,
    /// while the well-formed lines in between still parse.
    #[test]
    fn test_badtoks() {
        use Fruit::*;

        // NOTE: the assertions below refer to this text by line number,
        // so lines must not be added or removed (including the blank
        // line 28 and the missing final newline).
        let s = "\
-foobar 9090
apple 3.14159
$hello
unrecognized 127.0.0.1 foo
plum
-----BEGIN WHATEVER-----
8J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
-----END SOMETHING ELSE-----
orange
orange
-----BEGIN WHATEVER-----
not! base64!
-----END WHATEVER-----
guava paste
opt @annotation
orange
-----BEGIN LOBSTER
8J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
-----END SOMETHING ELSE-----
orange
-----BEGIN !!!!!!-----
8J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
-----END !!!!!!-----
cherry
-----BEGIN CHERRY SYNOPSIS-----
8J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
-----END CHERRY SYNOPSIS

truncated line";

        let r: NetDocReader<'_, Fruit> = NetDocReader::new(s).unwrap();
        let toks: Vec<_> = r.collect();

        // Line 1 (`-foobar 9090`) is rejected as a bad keyword.
        assert!(toks[0].is_err());
        assert_eq!(
            toks[0].as_ref().err().unwrap(),
            &EK::BadKeyword.at_pos(Pos::from_line(1, 1))
        );

        // Line 2 (`apple 3.14159`) is fine; exercise the `is_ok_with_*`
        // helpers on a successful result.
        assert!(toks[1].is_ok());
        assert!(toks[1].is_ok_with_non_annotation());
        assert!(!toks[1].is_ok_with_annotation());
        assert!(toks[1].is_ok_with_kwd_in(&[APPLE, ORANGE]));
        assert!(toks[1].is_ok_with_kwd_not_in(&[ORANGE, UNRECOGNIZED]));
        let t = toks[1].as_ref().unwrap();
        assert_eq!(t.kwd(), APPLE);
        assert_eq!(t.arg(0), Some("3.14159"));

        // Line 3 (`$hello`) is an error, so every `is_ok_with_*` helper
        // reports false.
        assert!(toks[2].is_err());
        assert!(!toks[2].is_ok_with_non_annotation());
        assert!(!toks[2].is_ok_with_annotation());
        assert!(!toks[2].is_ok_with_kwd_in(&[APPLE, ORANGE]));
        assert!(!toks[2].is_ok_with_kwd_not_in(&[ORANGE, UNRECOGNIZED]));
        assert_eq!(
            toks[2].as_ref().err().unwrap(),
            &EK::BadKeyword.at_pos(Pos::from_line(3, 1))
        );

        // Line 4 has a keyword that `Fruit` doesn't know about.
        assert!(toks[3].is_ok());
        let t = toks[3].as_ref().unwrap();
        assert_eq!(t.kwd(), UNRECOGNIZED);
        assert_eq!(t.arg(1), Some("foo"));

        // The object after `plum` has mismatched BEGIN and END tags;
        // the error is reported at the END line.
        assert!(toks[4].is_err());
        assert_eq!(
            toks[4].as_ref().err().unwrap(),
            &EK::BadObjectMismatchedTag.at_pos(Pos::from_line(8, 1))
        );

        // A bare `orange` with no arguments.
        assert!(toks[5].is_ok());
        let t = toks[5].as_ref().unwrap();
        assert_eq!(t.kwd(), ORANGE);
        assert_eq!(t.args_as_str(), "");

        // This blob counts as two errors: a bad base64 blob, and
        // then an end line.
        assert!(toks[6].is_err());
        assert_eq!(
            toks[6].as_ref().err().unwrap(),
            &EK::BadObjectBase64.at_pos(Pos::from_line(12, 1))
        );

        assert!(toks[7].is_err());
        assert_eq!(
            toks[7].as_ref().err().unwrap(),
            &EK::BadKeyword.at_pos(Pos::from_line(13, 1))
        );

        assert!(toks[8].is_ok());
        let t = toks[8].as_ref().unwrap();
        assert_eq!(t.kwd(), GUAVA);

        // this is an error because you can't use opt with annotations.
        assert!(toks[9].is_err());
        assert_eq!(
            toks[9].as_ref().err().unwrap(),
            &EK::BadKeyword.at_pos(Pos::from_line(15, 1))
        );

        // this looks like a few errors.
        // (The BEGIN line at line 17 lacks its closing dashes.)
        assert!(toks[10].is_err());
        assert_eq!(
            toks[10].as_ref().err().unwrap(),
            &EK::BadObjectBeginTag.at_pos(Pos::from_line(17, 1))
        );
        assert!(toks[11].is_err());
        assert_eq!(
            toks[11].as_ref().err().unwrap(),
            &EK::BadKeyword.at_pos(Pos::from_line(18, 1))
        );
        assert!(toks[12].is_err());
        assert_eq!(
            toks[12].as_ref().err().unwrap(),
            &EK::BadKeyword.at_pos(Pos::from_line(19, 1))
        );

        // so does this.
        // (The tag `!!!!!!` on line 21 is rejected.)
        assert!(toks[13].is_err());
        assert_eq!(
            toks[13].as_ref().err().unwrap(),
            &EK::BadObjectBeginTag.at_pos(Pos::from_line(21, 1))
        );
        assert!(toks[14].is_err());
        assert_eq!(
            toks[14].as_ref().err().unwrap(),
            &EK::BadKeyword.at_pos(Pos::from_line(22, 1))
        );
        assert!(toks[15].is_err());
        assert_eq!(
            toks[15].as_ref().err().unwrap(),
            &EK::BadKeyword.at_pos(Pos::from_line(23, 1))
        );

        // not this.
        // (The END line at line 27 lacks its closing dashes; that is
        // reported as a single error.)
        assert!(toks[16].is_err());
        assert_eq!(
            toks[16].as_ref().err().unwrap(),
            &EK::BadObjectEndTag.at_pos(Pos::from_line(27, 1))
        );

        // Line 28 is blank.
        assert!(toks[17].is_err());
        assert_eq!(
            toks[17].as_ref().err().unwrap(),
            &EK::EmptyLine.at_pos(Pos::from_line(28, 1))
        );

        // Line 29 has no terminating newline.
        assert!(toks[18].is_err());
        assert_eq!(
            toks[18].as_ref().err().unwrap(),
            &EK::TruncatedLine.at_pos(Pos::from_line(29, 15))
        );
    }
976

            
977
    /// Check that a line beginning with a space is always rejected,
    /// whether it is a keyword line or part of an object.
    #[test]
    fn test_leading_space_forbidden() {
        // We need to make sure that items with a leading space aren't accepted:
        // the spec forbids it, and it can provide a vector for inflating the size
        // of downloaded hsdescs (see prop360).

        // Try a simple item with a space at the front.
        let s = "    guava space\n";
        let r: NetDocReader<'_, Fruit> = NetDocReader::new(s).unwrap();
        let toks: Vec<_> = r.collect();

        // No space allowed at the start of a line.
        assert_eq!(
            toks[0].as_ref().err().unwrap(),
            &EK::BadKeyword.at_pos(Pos::from_line(1, 1))
        );

        // Try an item with an object, inserting space at the start of each line in turn.
        let s = "cherry
-----BEGIN WHATEVER-----
8J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
-----END WHATEVER-----
";
        let orig_lines = s
            .split_terminator('\n')
            .map(str::to_string)
            .collect::<Vec<_>>();
        assert_eq!(orig_lines.len(), 4);
        // The first error we expect when line `i` is the one given a
        // leading space.
        let expected_kinds = [
            EK::BadKeyword,
            EK::BadKeyword,
            EK::BadObjectBase64,
            EK::BadObjectBase64,
        ];
        assert_eq!(expected_kinds.len(), orig_lines.len());
        for (pos, expected) in expected_kinds.iter().enumerate() {
            let mut lines = orig_lines.clone();
            lines[pos] = format!(" {}", lines[pos]);
            let joined = format!("{}\n", lines.join("\n"));
            let r: NetDocReader<'_, Fruit> = NetDocReader::new(&joined).unwrap();
            // Collecting into a `Result` stops at the first error.
            let toks: Result<Vec<_>> = r.collect();
            assert_eq!(
                toks.unwrap_err().netdoc_error_kind(),
                *expected,
                "leading space inserted on line index {pos}"
            );
        }
    }
    /// Exercise `validate_utf_8_rules` on accepted inputs, on inputs that
    /// start with a (possibly mangled) BOM, and on inputs containing NULs.
    #[test]
    fn test_validate_strings() {
        // Accepted strings are handed back unchanged.
        //
        // We don't have to test a lot more valid cases, since this function is called before
        // parsing any string.
        assert_eq!(validate_utf_8_rules(""), Ok(""));
        assert_eq!(validate_utf_8_rules("hello world"), Ok("hello world"));

        // A leading BOM is reported at offset 0.
        let bom_docs = ["\u{feff}", "\u{feff}hello world", "\u{fffe}hello world"];
        for doc in bom_docs.iter().copied() {
            let err = validate_utf_8_rules(doc).unwrap_err();
            assert_eq!(err.netdoc_error_kind(), EK::BomMarkerFound);
            assert_eq!(err.pos().offset_within(doc), Some(0));
        }

        // A NUL anywhere is reported, and the reported offset really
        // does point at a NUL byte.
        let nul_docs = [
            "\0hello world",
            "\0",
            "\0\0\0",
            "hello\0world",
            "hello world\0",
        ];
        for doc in nul_docs.iter().copied() {
            let err = validate_utf_8_rules(doc).unwrap_err();
            assert_eq!(err.netdoc_error_kind(), EK::NulFound);
            let offset = err.pos().offset_within(doc).unwrap();
            assert_eq!(doc.as_bytes()[offset], 0);
        }
    }
    fn single_fruit(s: &str) -> Item<'_, Fruit> {
        NetDocReader::<Fruit>::new(s)
            .unwrap()
            .next()
            .unwrap()
            .unwrap()
    }
    /// Check that `end_pos()` of an item is the position of the newline
    /// that terminates it, with and without an attached object.
    #[test]
    fn end_of_item() {
        // Keyword line with trailing spaces: the end is the (only) newline.
        let s = "guava friends 123   \n";
        let item = single_fruit(s);
        let actual = item.end_pos().within(s);
        let newline_at = s.find('\n').unwrap();
        assert_eq!(actual, Pos::from_byte(newline_at).within(s));

        // Item with an object: the end is the last newline, after the
        // END line.
        let s = "cherry
-----BEGIN WHATEVER-----
8J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
-----END WHATEVER-----\n";
        let item = single_fruit(s);
        dbg!(&item);
        let actual = item.end_pos().within(s);
        let newline_at = s.rfind('\n').unwrap();
        assert_eq!(actual, Pos::from_byte(newline_at).within(s));
    }
}