1
//! Macros to make it easier to work with rate-limited logs
2
//!
3

            
4
/// Log a rate-limited failure message.
5
///
6
/// This macro looks at a single `Result<T,E>`,
7
/// and keeps track of how many times the `Ok` and `Err` branches are seen.
8
/// After a delay, it reports via [`tracing::event!`]
9
/// how many errors it has seen since its last report.
10
/// (It only reports an absence of errors
11
/// when such an absence has followed
12
/// an error report.)
13
///
14
/// ## A simple example
15
///
16
/// ```
17
/// # use std::num::ParseIntError;
18
/// # let s = "1234";
19
/// # let source = "the cache";
20
/// use tor_log_ratelim::log_ratelim;
21
///  let r: Result<u8, ParseIntError> = s.parse();
22
///
23
/// log_ratelim!(
24
///   // The activity we were doing.
25
///   "Parsing a value from {}", source;
26
///   // The result we got.
27
///   r;
28
/// );
29
/// ```
30
///
31
/// This invocation could report errors like
32
/// ```text
33
/// WARN: Parsing a value from the cache: error (occurred 9/12 times in the last 5 minutes): number too large to fit in target type
34
/// ```
35
///
36
/// After a while without errors, it might log:
37
/// ```text
38
/// WARN: Parsing a value from the cache: now working (occurred 0/100 times in th last hour)
39
/// ```
40
///
41
/// ## Important concept: Activities
42
///
43
/// Every invocation of `log_ratelim!` defines a _set_ of rate limits
44
/// with respect to a collection of **activities**.  
45
/// Each separate **activity** value gets its own rate limit.
46
/// This lets you have separate rate limits for different operations,
47
/// such as connecting to different parties,
48
/// or invoking different programs.
49
///
50
/// Typical activities might be
51
/// `"Connecting to port {}", p`
52
/// or
53
/// `"Trying to start program {}", p`
54
///
55
/// (These activities should be described using a verb ending with "-ing",
56
/// to make the output nice.)
57
///
58
/// ## Requirements on the error type.
59
///
60
/// The error type for each `Result` passed to this macro must implement:
61
///  * [`Clone`]
62
///  * [`Send`]
63
///  * [`Error`](std::error::Error)
64
///
65
/// ## Reports are representative
66
///
67
/// No matter how many failures are seen between log messages,
68
/// `log_ratelim!` only records and reports
69
/// one error for each time it logs.
70
///
71
/// Its current behavior is to record and report
72
/// the _first_ error for each logged interval,
73
/// and discard the others.  
74
/// This can lead to confusing results if the error is not representative.
75
///
76
/// ## Advanced syntax
77
///
78
/// The `log_ratelim` macro can record additional information for its
79
/// representative error report,
80
/// and can log information on successes as well.
81
///
82
/// A full invocation of `log_ratelim!` looks like this:
83
///
84
/// ```
85
/// # use std::num::ParseIntError;
86
/// # let s = "1234";
87
/// # let source = "the cache";
88
/// # let more_information = |_| "";
89
/// use tor_log_ratelim::log_ratelim;
90
/// let r: Result<u8, ParseIntError> = s.parse();
91
/// log_ratelim!(
92
///   "Parsing a value from {}", source;
93
///   r;
94
///   Err(x) => WARN, "The problem was {}", more_information(x);
95
///   Ok(v) => TRACE, "Parsed {} successfully", v;
96
/// );
97
/// ```
98
///
99
/// Here the clause starting with `Err(x)`
100
/// tells the logger to include a message along with the error report,
101
/// and we explicitly specifies the level at which
102
/// to log our failure reports.
103
///
104
/// Note that the error itself is **always** reported;
105
/// there is no need to say
106
/// `Err(e) => WARN, "{}", e`.
107
/// In fact, doing so will create a redundant report of
108
/// the error.
109
//
110
// TODO: I don't think it makes sense to have an Ok() logger.
111
// Instead, we could just say
112
//    log_ratelim!("Parsing a value from {}", source; r;);
113
//    trace!("parsed value from {}: {:?}", source, r);
114
// This is probably better, since it logs a trace for every occurrence.
115
//
116
/// The clause starting with `Ok(v)` tells the logger what to do on success:
117
/// each individual success causes a _non-rate-limited_
118
/// message at TRACE level.
119
///
120
/// The `Ok() ...` clause
121
/// and the `Err() ...` clause
122
/// are both optional.
123
///
124
/// Within the Err() clause,
125
/// the format string and its arguments
126
/// are optional.
127
//
128
// TODO performance notes:
129
//
130
// There are many opportunities for possibly making this code go faster:
131
//  - Optimize the case where there is only one activity.
132
//  - Use a non-string key to distinguish activities, to avoid formatting
133
//    the activity string needlessly.
134
//  - Limit copies (of activity and of error).
135
//  - Use Event and Metadata from the tracing crate to defer formatting
136
//  - Check Metadata early for the case where we don't want to report the
137
//    event at all.
138
//
139
// Let's not pursue any of these until we know that this code actually
140
// shows up in a critical path.
141
#[macro_export]
142
macro_rules! log_ratelim {
143
  // ====
144
  // Actual implementation for rate-limited logging.
145
  // ====
146

            
147
  // Nobody invokes this syntax directly; it's used as a common body for the
148
  // various other syntaxes.
149
  {
150
    @impl activity_format: ( $act_fmt:literal $(, $act_arg:expr)* ) ;
151
          result: ($result:expr ) ;
152
          on_error: (Err($err_pat:pat), $err_level:ident $(, $err_fmt:literal $(, $err_arg:expr)* )? );
153
          $( on_ok: (Ok($ok_pat:pat), $ok_level:ident, $ok_fmt:literal $(, $ok_arg:expr)*  ); )?
154
  } => {
155
    #[allow(clippy::redundant_closure_call)]
156
154
    (||{
157
    use $crate::macro_prelude::*;
158
154
    let Some(runtime) = rt_support() else {
159
      // Nobody has called `install_runtime()`: we should just log whatever
160
      // happened and not worry about the rate-limiting.
161
154
      match &$result {
162
        #[allow(clippy::redundant_pattern)]
163
        Err(ref the_error @ $err_pat) => {
164
          tracing::event!(
165
            tracing::Level::$err_level,
166
            concat!($act_fmt, $(": ", $err_fmt, )? ": {}"),
167
            $($act_arg,)*
168
            $( $($err_arg, )* )?
169
            the_error.report()
170
          );
171
        }
172
        $(Ok($ok_pat) => {
173
          tracing::event!(
174
            tracing::Level::$ok_level,
175
            $ok_fmt
176
            $(, $ok_arg)*
177
          );
178
        })?
179
        #[allow(unreachable_patterns)]
180
154
        Ok(_) => {}
181
      }
182
154
      return;
183
    };
184

            
185
    /// An implementation of Loggable for this log message.
186
    //
187
    // We use a separate implementation here so that the tracing metadata will get
188
    // constructed correctly.  If we called tracing::event! from a location in
189
    // `tor-log-ratelim`, all the messages would appear to originate from there.
190
    //
191
    // (TODO: We could use tracing::Metadata explicitly, perhaps? That might be hard.)
192
    struct Lg(LogState);
193
    impl Loggable for Lg {
194
        fn flush(&mut self, summarizing: std::time::Duration) -> Activity {
195
            let activity = self.0.activity();
196
            match activity {
197
               Activity::Active => {
198
                  tracing::event!(
199
                      tracing::Level::$err_level,
200
                      "{}",
201
                      self.0.display_problem(summarizing)
202
                  );
203
               }
204
               Activity::Dormant => {
205
                  tracing::event!(
206
                      // Using err_level here is in some respects confusing:
207
                      // if the _presence_ of the problem is (say) a WARN,
208
                      // why should its newfound absence also be a WARN?
209
                      //
210
                      // We have had to decide which is worse:
211
                      // that a user only watching WARNs
212
                      // might not see a problem has gone away,
213
                      // or that a non-problem would be reported
214
                      // at an excessive severity.
215
                      // We went with the latter.
216
                      tracing::Level::$err_level,
217
                      "{}",
218
                      self.0.display_recovery(summarizing)
219
                  );
220
               }
221
            }
222
            self.0.reset();
223
            activity
224
        }
225
    }
226

            
227
    /// A lazy map from activity keys to weak RateLim handles.
228
    //
229
    // The strong reference for each RateLim is held by a task that flushes
230
    // the logger as appropriate, and drops the strong reference once it's
231
    // quiescent.
232
    static LOGGERS: Lazy<Mutex<WeakValueHashMap<String, Weak<RateLim<Lg>>>>> =
233
      Lazy::new(|| Mutex::new(WeakValueHashMap::new()));
234

            
235
    // We assign a separate rate limit for each activity.
236
    // For now, this is string-ly typed.
237
    let activity = format!($act_fmt $(, $act_arg)*);
238
    let key = activity.clone();
239

            
240
    match &$result {
241
      #[allow(clippy::redundant_pattern)]
242
      Err(ref the_error @ $err_pat) => {
243
        // The operation failed.
244
        //
245
        // 1) Create a rate-limited logger for this activity if one  did not
246
        //    already exist.
247
        let logger = LOGGERS
248
          .lock()
249
          .expect("poisoned lock")
250
          .entry(key)
251
          .or_insert_with(|| RateLim::new(Lg(LogState::new(activity))));
252
        // 2) Note failure in the activity with note_fail().
253
        logger.event(runtime, |lg| lg.0.note_fail(||
254
          // 2b) If this is the first time that this activity failed since the
255
          //     last flush, record the formatted err_msg, and a Clone of the error.
256
          (
257
            $crate::log_ratelim!{@first_nonempty
258
              { $( Some(format!($err_fmt $(, $err_arg)* )) )? }
259
              { None }
260
            },
261
            Some(Box::new(the_error.clone()))
262
          )
263
        ));
264
      }
265
      Ok($crate::log_ratelim!{@first_nonempty { $($ok_pat)? } {_} }) => {
266
        // The operation succeeded.
267
        //
268
        // 1) If this activity is tracked, call note_ok() on it.
269
        if let Some(logger) = LOGGERS
270
          .lock()
271
          .expect("poisoned lock")
272
          .get(&key) {
273
            logger.nonevent(|lg| lg.0.note_ok());
274
          }
275
        // 2) If we have a per-success item to log, log it.
276
        $(
277
        tracing::event!(tracing::Level::$ok_level, $ok_fmt $(, $ok_arg )* );
278
        )?
279
      }
280
    }
281
  })()
282
  };
283

            
284
  // ======
285
  // Exposed, documented syntax.
286
  // ======
287

            
288
  // Regular invocation with an Err(_) case.
289
  {
290
    $act_fmt:literal $(, $act_arg:expr )* $(,)? ;
291
    $result:expr ;
292
    Err($err_pat:pat) => $err_level:ident $(, $err_fmt:literal $(, $err_arg:expr)* )? $(,)?
293
    $(; Ok($ok_pat:pat) => $ok_level:ident, $ok_fmt:literal $(, $ok_arg:expr )* $(,)?  )?
294
    $(;)?
295
  } => {
296
    $crate::log_ratelim!{
297
      @impl
298
        activity_format: ( $act_fmt $(, $act_arg)* );
299
        result: ($result);
300
        on_error: (Err($err_pat), $err_level $(, $err_fmt $(, $err_arg)* )? );
301
        $( on_ok: ( Ok($ok_pat), $ok_level, $ok_fmt $(, $ok_arg)* ); )?
302
    }
303
  };
304
  // Regular invocation with no Err(_) case.
305
  {
306
    $act_fmt:literal $(, $act_arg:expr )* $(,)? ;
307
    $result:expr
308
    $(; Ok($ok_pat:pat) => $ok_level:ident, $ok_fmt:literal $(, $ok_arg:expr )* $(,)? )?
309
    $(;)?
310
  } => {
311
    $crate::log_ratelim!{
312
      @impl
313
        activity_format: ( $act_fmt $(, $act_arg)* );
314
        result: ($result);
315
        on_error: (Err(_), WARN);
316
        $( on_ok: ( Ok($ok_pat), $ok_level, $ok_fmt $(, $ok_arg)* ); )?
317
    }
318
  };
319

            
320
  // Expand to the first of two bodies that has at least one token in it.
321
  { @first_nonempty { $($a:tt)+ } { $($b:tt)* }} => { $($a)+ };
322
  { @first_nonempty { } { $($b:tt)* } } => { $($b)+ };
323
}
324

            
325
#[cfg(test)]
326
mod test_syntax {
327
    #![allow(dead_code)]
328

            
329
    #[derive(Clone, Debug, thiserror::Error)]
330
    enum MyErr {
331
        #[error("it didn't work")]
332
        DidntWork,
333
    }
334
    impl MyErr {
335
        fn badness(&self) -> u8 {
336
            3
337
        }
338
    }
339

            
340
    /// This doesn't actually run or test anything; it just makes sure that all
341
    /// the different syntaxes work.
342
    fn various_syntaxes(friend: &str, r: &Result<u32, MyErr>) {
343
        log_ratelim!(
344
          "saying hi to {}", friend;
345
          r;
346
        );
347

            
348
        log_ratelim!(
349
          "saying hi to {}", friend;
350
          r;
351
          Err(_) => WARN;
352
        );
353

            
354
        log_ratelim!(
355
          "saying hi to {}", friend;
356
          r;
357
          Err(e) => WARN, "badness={}", e.badness();
358
        );
359

            
360
        log_ratelim!(
361
          "saying hi to {}", friend;
362
          r;
363
          Ok(v) => TRACE, "nothing bad happened; v={}", v;
364
        );
365

            
366
        log_ratelim!(
367
          "saying hi to {}", friend;
368
          r;
369
          Ok(v) => TRACE, "nothing bad happened; v={}", v;
370
        );
371

            
372
        log_ratelim!(
373
          "saying hi to {}", friend;
374
          r;
375
          Err(e) => WARN, "badness={}", e.badness();
376
          Ok(v) => TRACE, "nothing bad happened; v={}", v;
377
        );
378
    }
379
}