Tor  0.4.8.0-alpha-dev
congestion_control_common.c
Go to the documentation of this file.
1 /* Copyright (c) 2021, The Tor Project, Inc. */
2 /* See LICENSE for licensing information */
3 
4 /**
5  * \file congestion_control_common.c
6  * \brief Common code used by all congestion control algorithms.
7  */
8 
9 #define TOR_CONGESTION_CONTROL_COMMON_PRIVATE
10 
11 #include "core/or/or.h"
12 
14 #include "core/or/circuitlist.h"
15 #include "core/or/crypt_path.h"
16 #include "core/or/or_circuit_st.h"
18 #include "core/or/channel.h"
20 #include "core/or/sendme.h"
26 #include "core/or/trace_probes_cc.h"
27 #include "lib/time/compat_time.h"
29 #include "app/config/config.h"
30 
31 #include "trunnel/congestion_control.h"
32 #include "trunnel/extension.h"
33 
34 /* Consensus parameter defaults.
35  *
36  * More details for each of the parameters can be found in proposal 324,
37  * section 6.5 including tuning notes. */
38 #define SENDME_INC_DFLT (TLS_RECORD_MAX_CELLS)
39 #define CIRCWINDOW_INIT (4*SENDME_INC_DFLT)
40 
41 #define CC_ALG_DFLT (CC_ALG_SENDME)
42 #define CC_ALG_DFLT_ALWAYS (CC_ALG_VEGAS)
43 
44 #define CWND_INC_DFLT (TLS_RECORD_MAX_CELLS)
45 #define CWND_INC_PCT_SS_DFLT (100)
46 #define CWND_INC_RATE_DFLT (1)
47 
48 #define CWND_MIN_DFLT (2*SENDME_INC_DFLT)
49 #define CWND_MAX_DFLT (INT32_MAX)
50 
51 #define BWE_SENDME_MIN_DFLT (5)
52 
53 #define N_EWMA_CWND_PCT_DFLT (50)
54 #define N_EWMA_MAX_DFLT (10)
55 #define N_EWMA_SS_DFLT (2)
56 
57 #define RTT_RESET_PCT_DFLT (100)
58 
59 /* BDP algorithms for each congestion control algorithms use the piecewise
60  * estimattor. See section 3.1.4 of proposal 324. */
61 #define WESTWOOD_BDP_ALG BDP_ALG_PIECEWISE
62 #define VEGAS_BDP_MIX_ALG BDP_ALG_PIECEWISE
63 #define NOLA_BDP_ALG BDP_ALG_PIECEWISE
64 
65 /* Indicate OR connection buffer limitations used to stop or start accepting
66  * cells in its outbuf.
67  *
68  * These watermarks are historical to tor in a sense that they've been used
69  * almost from the genesis point. And were likely defined to fit the bounds of
70  * TLS records of 16KB which would be around 32 cells.
71  *
72  * These are defaults of the consensus parameter "orconn_high" and "orconn_low"
73  * values. */
74 #define OR_CONN_HIGHWATER_DFLT (32*1024)
75 #define OR_CONN_LOWWATER_DFLT (16*1024)
76 
77 /* Low and high values of circuit cell queue sizes. They are used to tell when
78  * to start or stop reading on the streams attached on the circuit.
79  *
80  * These are defaults of the consensus parameters "cellq_high" and "cellq_low".
81  */
82 #define CELL_QUEUE_LOW_DFLT (10)
83 #define CELL_QUEUE_HIGH_DFLT (256)
84 
86  uint64_t);
88  const circuit_t *,
89  const crypt_path_t *,
90  uint64_t, uint64_t);
91 /* For unit tests */
93 
94 /* Number of times the RTT value was reset. For MetricsPort. */
95 static uint64_t num_rtt_reset;
96 
97 /* Number of times the clock was stalled. For MetricsPort. */
98 static uint64_t num_clock_stalls;
99 
100 /* Consensus parameters cached. The non static ones are extern. */
101 static uint32_t cwnd_max = CWND_MAX_DFLT;
102 int32_t cell_queue_high = CELL_QUEUE_HIGH_DFLT;
103 int32_t cell_queue_low = CELL_QUEUE_LOW_DFLT;
104 uint32_t or_conn_highwater = OR_CONN_HIGHWATER_DFLT;
105 uint32_t or_conn_lowwater = OR_CONN_LOWWATER_DFLT;
106 uint8_t cc_sendme_inc = SENDME_INC_DFLT;
107 static cc_alg_t cc_alg = CC_ALG_DFLT;
108 
109 /**
110  * Number of cwnd worth of sendme acks to smooth RTT and BDP with,
111  * using N_EWMA */
112 static uint8_t n_ewma_cwnd_pct;
113 
114 /**
115  * Maximum number N for the N-count EWMA averaging of RTT and BDP.
116  */
117 static uint8_t n_ewma_max;
118 
119 /**
120  * Maximum number N for the N-count EWMA averaging of RTT in Slow Start.
121  */
122 static uint8_t n_ewma_ss;
123 
124 /**
125  * Minimum number of sendmes before we begin BDP estimates
126  */
127 static uint8_t bwe_sendme_min;
128 
129 /**
130  * Percentage of the current RTT to use when resetting the minimum RTT
131  * for a circuit. (RTT is reset when the cwnd hits cwnd_min).
132  */
133 static uint8_t rtt_reset_pct;
134 
135 /** Metric to count the number of congestion control circuits **/
137 
138 /** Return the number of RTT reset that have been done. */
139 uint64_t
141 {
142  return num_rtt_reset;
143 }
144 
145 /** Return the number of clock stalls that have been done. */
146 uint64_t
148 {
149  return num_clock_stalls;
150 }
151 
152 /**
153  * Update global congestion control related consensus parameter values,
154  * every consensus update.
155  */
156 void
158 {
159 #define CELL_QUEUE_HIGH_MIN (1)
160 #define CELL_QUEUE_HIGH_MAX (1000)
161  cell_queue_high = networkstatus_get_param(ns, "cellq_high",
162  CELL_QUEUE_HIGH_DFLT,
163  CELL_QUEUE_HIGH_MIN,
164  CELL_QUEUE_HIGH_MAX);
165 
166 #define CELL_QUEUE_LOW_MIN (1)
167 #define CELL_QUEUE_LOW_MAX (1000)
168  cell_queue_low = networkstatus_get_param(ns, "cellq_low",
169  CELL_QUEUE_LOW_DFLT,
170  CELL_QUEUE_LOW_MIN,
171  CELL_QUEUE_LOW_MAX);
172 
173 #define OR_CONN_HIGHWATER_MIN (CELL_PAYLOAD_SIZE)
174 #define OR_CONN_HIGHWATER_MAX (INT32_MAX)
175  or_conn_highwater =
176  networkstatus_get_param(ns, "orconn_high",
177  OR_CONN_HIGHWATER_DFLT,
178  OR_CONN_HIGHWATER_MIN,
179  OR_CONN_HIGHWATER_MAX);
180 
181 #define OR_CONN_LOWWATER_MIN (CELL_PAYLOAD_SIZE)
182 #define OR_CONN_LOWWATER_MAX (INT32_MAX)
183  or_conn_lowwater =
184  networkstatus_get_param(ns, "orconn_low",
185  OR_CONN_LOWWATER_DFLT,
186  OR_CONN_LOWWATER_MIN,
187  OR_CONN_LOWWATER_MAX);
188 
189 #define CWND_MAX_MIN 500
190 #define CWND_MAX_MAX (INT32_MAX)
191  cwnd_max =
192  networkstatus_get_param(NULL, "cc_cwnd_max",
193  CWND_MAX_DFLT,
194  CWND_MAX_MIN,
195  CWND_MAX_MAX);
196 
197 #define RTT_RESET_PCT_MIN (0)
198 #define RTT_RESET_PCT_MAX (100)
199  rtt_reset_pct =
200  networkstatus_get_param(NULL, "cc_rtt_reset_pct",
201  RTT_RESET_PCT_DFLT,
202  RTT_RESET_PCT_MIN,
203  RTT_RESET_PCT_MAX);
204 
205 #define SENDME_INC_MIN 1
206 #define SENDME_INC_MAX (255)
207  cc_sendme_inc =
208  networkstatus_get_param(NULL, "cc_sendme_inc",
209  SENDME_INC_DFLT,
210  SENDME_INC_MIN,
211  SENDME_INC_MAX);
212 
213 #define CC_ALG_MIN 0
214 #define CC_ALG_MAX (NUM_CC_ALGS-1)
215  cc_alg =
216  networkstatus_get_param(NULL, "cc_alg",
217  CC_ALG_DFLT,
218  CC_ALG_MIN,
219  CC_ALG_MAX);
220 
221 #define BWE_SENDME_MIN_MIN 2
222 #define BWE_SENDME_MIN_MAX (20)
224  networkstatus_get_param(NULL, "cc_bwe_min",
225  BWE_SENDME_MIN_DFLT,
226  BWE_SENDME_MIN_MIN,
227  BWE_SENDME_MIN_MAX);
228 
229 #define N_EWMA_CWND_PCT_MIN 1
230 #define N_EWMA_CWND_PCT_MAX (255)
232  networkstatus_get_param(NULL, "cc_ewma_cwnd_pct",
233  N_EWMA_CWND_PCT_DFLT,
234  N_EWMA_CWND_PCT_MIN,
235  N_EWMA_CWND_PCT_MAX);
236 
237 #define N_EWMA_MAX_MIN 2
238 #define N_EWMA_MAX_MAX (INT32_MAX)
239  n_ewma_max =
240  networkstatus_get_param(NULL, "cc_ewma_max",
241  N_EWMA_MAX_DFLT,
242  N_EWMA_MAX_MIN,
243  N_EWMA_MAX_MAX);
244 
245 #define N_EWMA_SS_MIN 2
246 #define N_EWMA_SS_MAX (INT32_MAX)
247  n_ewma_ss =
248  networkstatus_get_param(NULL, "cc_ewma_ss",
249  N_EWMA_SS_DFLT,
250  N_EWMA_SS_MIN,
251  N_EWMA_SS_MAX);
252 }
253 
254 /**
255  * Set congestion control parameters on a circuit's congestion
256  * control object based on values from the consensus.
257  *
258  * cc_alg is the negotiated congestion control algorithm.
259  *
260  * sendme_inc is the number of packaged cells that a sendme cell
261  * acks. This parameter will come from circuit negotiation.
262  */
263 static void
265  const circuit_params_t *params,
266  cc_path_t path)
267 {
268  const or_options_t *opts = get_options();
269  cc->sendme_inc = params->sendme_inc_cells;
270 
271 #define CWND_INIT_MIN SENDME_INC_DFLT
272 #define CWND_INIT_MAX (10000)
273  cc->cwnd =
274  networkstatus_get_param(NULL, "cc_cwnd_init",
275  CIRCWINDOW_INIT,
276  CWND_INIT_MIN,
277  CWND_INIT_MAX);
278 
279 #define CWND_INC_PCT_SS_MIN 1
280 #define CWND_INC_PCT_SS_MAX (500)
281  cc->cwnd_inc_pct_ss =
282  networkstatus_get_param(NULL, "cc_cwnd_inc_pct_ss",
283  CWND_INC_PCT_SS_DFLT,
284  CWND_INC_PCT_SS_MIN,
285  CWND_INC_PCT_SS_MAX);
286 
287 #define CWND_INC_MIN 1
288 #define CWND_INC_MAX (1000)
289  cc->cwnd_inc =
290  networkstatus_get_param(NULL, "cc_cwnd_inc",
291  CWND_INC_DFLT,
292  CWND_INC_MIN,
293  CWND_INC_MAX);
294 
295 #define CWND_INC_RATE_MIN 1
296 #define CWND_INC_RATE_MAX (250)
297  cc->cwnd_inc_rate =
298  networkstatus_get_param(NULL, "cc_cwnd_inc_rate",
299  CWND_INC_RATE_DFLT,
300  CWND_INC_RATE_MIN,
301  CWND_INC_RATE_MAX);
302 
303 #define CWND_MIN_MIN SENDME_INC_DFLT
304 #define CWND_MIN_MAX (1000)
305  cc->cwnd_min =
306  networkstatus_get_param(NULL, "cc_cwnd_min",
307  CWND_MIN_DFLT,
308  CWND_MIN_MIN,
309  CWND_MIN_MAX);
310 
311  /* If the consensus says to use OG sendme, but torrc has
312  * always-enabled, use the default "always" alg (vegas),
313  * else use cached conensus alg. */
314  if (cc_alg == CC_ALG_SENDME && opts->AlwaysCongestionControl) {
315  cc->cc_alg = CC_ALG_DFLT_ALWAYS;
316  } else {
317  cc->cc_alg = cc_alg;
318  }
319 
320  bdp_alg_t default_bdp_alg = 0;
321 
322  switch (cc->cc_alg) {
323  case CC_ALG_WESTWOOD:
324  default_bdp_alg = WESTWOOD_BDP_ALG;
325  break;
326  case CC_ALG_VEGAS:
327  default_bdp_alg = VEGAS_BDP_MIX_ALG;
328  break;
329  case CC_ALG_NOLA:
330  default_bdp_alg = NOLA_BDP_ALG;
331  break;
332  case CC_ALG_SENDME:
333  default:
335  return; // No alg-specific params
336  }
337 
338  cc->bdp_alg =
339  networkstatus_get_param(NULL, "cc_bdp_alg",
340  default_bdp_alg,
341  0,
342  NUM_BDP_ALGS-1);
343 
344  /* Algorithm-specific parameters */
345  if (cc->cc_alg == CC_ALG_WESTWOOD) {
347  } else if (cc->cc_alg == CC_ALG_VEGAS) {
349  } else if (cc->cc_alg == CC_ALG_NOLA) {
351  }
352 }
353 
354 /** Returns true if congestion control is enabled in the most recent
355  * consensus, or if __AlwaysCongestionControl is set to true.
356  *
357  * Note that this function (and many many other functions) should not
358  * be called from the CPU worker threads when handling congestion
359  * control negotiation. Relevant values are marshaled into the
360  * `circuit_params_t` struct, in order to be used in worker threads
361  * without touching global state. Use those values in CPU worker
362  * threads, instead of calling this function.
363  *
364  * The danger is still present, in your time, as it was in ours.
365  */
366 bool
368 {
369  const or_options_t *opts = NULL;
370 
371  tor_assert_nonfatal_once(in_main_thread());
372 
373  opts = get_options();
374 
375  /* If the user has set "__AlwaysCongesttionControl",
376  * then always try to negotiate congestion control, regardless
377  * of consensus param. This is to be used for testing and sbws.
378  *
379  * Note that we do *not* allow disabling congestion control
380  * if the consensus says to use it, as this is bad for queueing
381  * and fairness. */
382  if (opts->AlwaysCongestionControl)
383  return 1;
384 
385  return cc_alg != CC_ALG_SENDME;
386 }
387 
388 /**
389  * For unit tests only: set the cached consensus cc alg to
390  * specified value.
391  */
392 void
394 {
395  cc_alg = CC_ALG_VEGAS;
396 }
397 
398 /**
399  * Allocate and initialize fields in congestion control object.
400  *
401  * cc_alg is the negotiated congestion control algorithm.
402  *
403  * sendme_inc is the number of packaged cells that a sendme cell
404  * acks. This parameter will come from circuit negotiation.
405  */
406 static void
408  const circuit_params_t *params,
409  cc_path_t path)
410 {
413 
414  cc->in_slow_start = 1;
415  congestion_control_init_params(cc, params, path);
416 
418 }
419 
420 /** Allocate and initialize a new congestion control object */
423 {
424  congestion_control_t *cc = tor_malloc_zero(sizeof(congestion_control_t));
425 
426  congestion_control_init(cc, params, path);
427 
429 
430  return cc;
431 }
432 
433 /**
434  * Free a congestion control object and its associated state.
435  */
436 void
438 {
439  if (!cc)
440  return;
441 
442  SMARTLIST_FOREACH(cc->sendme_pending_timestamps, uint64_t *, t, tor_free(t));
443  SMARTLIST_FOREACH(cc->sendme_arrival_timestamps, uint64_t *, t, tor_free(t));
444  smartlist_free(cc->sendme_pending_timestamps);
445  smartlist_free(cc->sendme_arrival_timestamps);
446 
447  tor_free(cc);
448 }
449 
450 /**
451  * Enqueue a u64 timestamp to the end of a queue of timestamps.
452  */
453 static inline void
454 enqueue_timestamp(smartlist_t *timestamps_u64, uint64_t timestamp_usec)
455 {
456  uint64_t *timestamp_ptr = tor_malloc(sizeof(uint64_t));
457  *timestamp_ptr = timestamp_usec;
458 
459  smartlist_add(timestamps_u64, timestamp_ptr);
460 }
461 
462 /**
463  * Peek at the head of a smartlist queue of u64 timestamps.
464  */
465 static inline uint64_t
466 peek_timestamp(const smartlist_t *timestamps_u64_usecs)
467 {
468  uint64_t *timestamp_ptr = smartlist_get(timestamps_u64_usecs, 0);
469 
470  if (BUG(!timestamp_ptr)) {
471  log_err(LD_CIRC, "Congestion control timestamp list became empty!");
472  return 0;
473  }
474 
475  return *timestamp_ptr;
476 }
477 
478 /**
479  * Dequeue a u64 monotime usec timestamp from the front of a
480  * smartlist of pointers to 64.
481  */
482 static inline uint64_t
483 dequeue_timestamp(smartlist_t *timestamps_u64_usecs)
484 {
485  uint64_t *timestamp_ptr = smartlist_get(timestamps_u64_usecs, 0);
486  uint64_t timestamp_u64;
487 
488  if (BUG(!timestamp_ptr)) {
489  log_err(LD_CIRC, "Congestion control timestamp list became empty!");
490  return 0;
491  }
492 
493  timestamp_u64 = *timestamp_ptr;
494  smartlist_del_keeporder(timestamps_u64_usecs, 0);
495  tor_free(timestamp_ptr);
496 
497  return timestamp_u64;
498 }
499 
500 /**
501  * Returns the number N of N-count EWMA, for averaging RTT and BDP over
502  * N SENDME acks.
503  *
504  * This N is bracketed between a divisor of the number of acks in a CWND
505  * and a max value. It is always at least 2.
506  */
507 static inline uint64_t
509 {
510  uint64_t ewma_cnt = 0;
511 
512  if (cc->in_slow_start) {
513  /* In slow-start, we check the Vegas condition every sendme,
514  * so much lower ewma counts are needed. */
515  ewma_cnt = n_ewma_ss;
516  } else {
517  /* After slow-start, we check the Vegas condition only once per
518  * CWND, so it is better to average over longer periods. */
519  ewma_cnt = MIN(CWND_UPDATE_RATE(cc)*n_ewma_cwnd_pct/100,
520  n_ewma_max);
521  }
522  ewma_cnt = MAX(ewma_cnt, 2);
523  return ewma_cnt;
524 }
525 
526 /**
527  * Get a package window from either old sendme logic, or congestion control.
528  *
529  * A package window is how many cells you can still send.
530  */
531 int
533  const crypt_path_t *cpath)
534 {
535  int package_window;
537 
538  tor_assert(circ);
539 
540  if (cpath) {
541  package_window = cpath->package_window;
542  cc = cpath->ccontrol;
543  } else {
544  package_window = circ->package_window;
545  cc = circ->ccontrol;
546  }
547 
548  if (!cc) {
549  return package_window;
550  } else {
551  /* Inflight can be above cwnd if cwnd was just reduced */
552  if (cc->inflight > cc->cwnd)
553  return 0;
554  /* In the extremely unlikely event that cwnd-inflight is larger than
555  * INT32_MAX, just return that cap, so old code doesn't explode. */
556  else if (cc->cwnd - cc->inflight > INT32_MAX)
557  return INT32_MAX;
558  else
559  return (int)(cc->cwnd - cc->inflight);
560  }
561 }
562 
563 /**
564  * Returns the number of cells that are acked by every sendme.
565  */
566 int
567 sendme_get_inc_count(const circuit_t *circ, const crypt_path_t *layer_hint)
568 {
569  int sendme_inc = CIRCWINDOW_INCREMENT;
570  congestion_control_t *cc = NULL;
571 
572  if (layer_hint) {
573  cc = layer_hint->ccontrol;
574  } else {
575  cc = circ->ccontrol;
576  }
577 
578  if (cc) {
579  sendme_inc = cc->sendme_inc;
580  }
581 
582  return sendme_inc;
583 }
584 
585 /** Return true iff the next cell we send will result in the other endpoint
586  * sending a SENDME.
587  *
588  * We are able to know that because the package or inflight window value minus
589  * one cell (the possible SENDME cell) should be a multiple of the
590  * cells-per-sendme increment value (set via consensus parameter, negotiated
591  * for the circuit, and passed in as sendme_inc).
592  *
593  * This function is used when recording a cell digest and this is done quite
594  * low in the stack when decrypting or encrypting a cell. The window is only
595  * updated once the cell is actually put in the outbuf.
596  */
597 bool
599  const crypt_path_t *layer_hint)
600 {
602  int window;
603 
604  tor_assert(circ);
605 
606  if (layer_hint) {
607  window = layer_hint->package_window;
608  cc = layer_hint->ccontrol;
609  } else {
610  window = circ->package_window;
611  cc = circ->ccontrol;
612  }
613 
614  /* If we are using congestion control and the alg is not
615  * old-school 'fixed', then use cc->inflight to determine
616  * when sendmes will be sent */
617  if (cc) {
618  if (!cc->inflight)
619  return false;
620 
621  /* This check must be +1 because this function is called *before*
622  * inflight is incremented for the sent cell */
623  if ((cc->inflight+1) % cc->sendme_inc != 0)
624  return false;
625 
626  return true;
627  }
628 
629  /* At the start of the window, no SENDME will be expected. */
630  if (window == CIRCWINDOW_START) {
631  return false;
632  }
633 
634  /* Are we at the limit of the increment and if not, we don't expect next
635  * cell is a SENDME.
636  *
637  * We test against the window minus 1 because when we are looking if the
638  * next cell is a SENDME, the window (either package or deliver) hasn't been
639  * decremented just yet so when this is called, we are currently processing
640  * the "window - 1" cell.
641  */
642  if (((window - 1) % CIRCWINDOW_INCREMENT) != 0) {
643  return false;
644  }
645 
646  /* Next cell is expected to be a SENDME. */
647  return true;
648 }
649 
650 /**
651  * Call-in to tell congestion control code that this circuit sent a cell.
652  *
653  * This updates the 'inflight' counter, and if this is a cell that will
654  * cause the other end to send a SENDME, record the current time in a list
655  * of pending timestamps, so that we can later compute the circuit RTT when
656  * the SENDME comes back. */
657 void
659  const circuit_t *circ,
660  const crypt_path_t *cpath)
661 {
662  tor_assert(circ);
663  tor_assert(cc);
664 
665  /* Is this the last cell before a SENDME? The idea is that if the
666  * package_window reaches a multiple of the increment, after this cell, we
667  * should expect a SENDME. Note that this function must be called *before*
668  * we account for the sent cell. */
669  if (!circuit_sent_cell_for_sendme(circ, cpath)) {
670  cc->inflight++;
671  return;
672  }
673 
674  cc->inflight++;
675 
676  /* Record this cell time for RTT computation when SENDME arrives */
679 }
680 
681 /**
682  * Returns true if any edge connections are active.
683  *
684  * We need to know this so that we can stop computing BDP if the
685  * edges are not sending on the circuit.
686  */
687 static int
689  const crypt_path_t *layer_hint)
690 {
691  const edge_connection_t *streams;
692 
693  if (CIRCUIT_IS_ORIGIN(circ)) {
694  streams = CONST_TO_ORIGIN_CIRCUIT(circ)->p_streams;
695  } else {
696  streams = CONST_TO_OR_CIRCUIT(circ)->n_streams;
697  }
698 
699  /* Check linked list of streams */
700  for (const edge_connection_t *conn = streams; conn != NULL;
701  conn = conn->next_stream) {
702  if (conn->base_.marked_for_close)
703  continue;
704 
705  if (!layer_hint || conn->cpath_layer == layer_hint) {
706  if (connection_get_inbuf_len(TO_CONN(conn)) > 0) {
707  log_info(LD_CIRC, "CC: More in edge inbuf...");
708  return 1;
709  }
710 
711  /* If we did not reach EOF on this read, there's more */
712  if (!TO_CONN(conn)->inbuf_reached_eof) {
713  log_info(LD_CIRC, "CC: More on edge conn...");
714  return 1;
715  }
716 
717  if (TO_CONN(conn)->linked_conn) {
718  if (connection_get_inbuf_len(TO_CONN(conn)->linked_conn) > 0) {
719  log_info(LD_CIRC, "CC: More in linked inbuf...");
720  return 1;
721  }
722 
723  /* If there is a linked conn, and *it* did not each EOF,
724  * there's more */
725  if (!TO_CONN(conn)->linked_conn->inbuf_reached_eof) {
726  log_info(LD_CIRC, "CC: More on linked conn...");
727  return 1;
728  }
729  }
730  }
731  }
732 
733  return 0;
734 }
735 
736 /**
737  * Upon receipt of a SENDME, pop the oldest timestamp off the timestamp
738  * list, and use this to update RTT.
739  *
740  * Returns true if circuit estimates were successfully updated, false
741  * otherwise.
742  */
743 bool
745  const circuit_t *circ,
746  const crypt_path_t *layer_hint)
747 {
748  uint64_t now_usec = monotime_absolute_usec();
749 
750  /* Update RTT first, then BDP. BDP needs fresh RTT */
751  uint64_t curr_rtt_usec = congestion_control_update_circuit_rtt(cc, now_usec);
752  return congestion_control_update_circuit_bdp(cc, circ, layer_hint, now_usec,
753  curr_rtt_usec);
754 }
755 
756 /**
757  * Returns true if we have enough time data to use heuristics
758  * to compare RTT to a baseline.
759  */
760 static bool
762 {
763  /* If we have exited slow start and also have an EWMA RTT, we
764  * should have processed at least a cwnd worth of RTTs */
765  if (!cc->in_slow_start && cc->ewma_rtt_usec) {
766  return true;
767  }
768 
769  /* If we managed to get enough acks to estimate a SENDME BDP, then
770  * we have enough to estimate clock jumps relative to a baseline,
771  * too. (This is at least 'cc_bwe_min' acks). */
772  if (cc->bdp[BDP_ALG_SENDME_RATE]) {
773  return true;
774  }
775 
776  /* Not enough data to estimate clock jumps */
777  return false;
778 }
779 
780 static bool is_monotime_clock_broken = false;
781 
782 /**
783  * Returns true if the monotime delta is 0, or is significantly
784  * different than the previous delta. Either case indicates
785  * that the monotime time source stalled or jumped.
786  *
787  * Also caches the clock state in the is_monotime_clock_broken flag,
788  * so we can also provide a is_monotime_clock_reliable() function,
789  * used by flow control rate timing.
790  */
791 static bool
793  uint64_t old_delta, uint64_t new_delta)
794 {
795 #define DELTA_DISCREPENCY_RATIO_MAX 5000
796  /* If we have a 0 new_delta, that is definitely a monotime stall */
797  if (new_delta == 0) {
798  static ratelim_t stall_info_limit = RATELIM_INIT(60);
799  log_fn_ratelim(&stall_info_limit, LOG_INFO, LD_CIRC,
800  "Congestion control cannot measure RTT due to monotime stall.");
801 
802  is_monotime_clock_broken = true;
803  return true;
804  }
805 
806  /*
807  * For the heuristic cases, we need at least a few timestamps,
808  * to average out any previous partial stalls or jumps. So until
809  * that point, let's just assume its OK.
810  */
812  return false;
813  }
814 
815  /* If old_delta is significantly larger than new_delta, then
816  * this means that the monotime clock could have recently
817  * stopped moving forward. However, use the cache for this
818  * value, because it may also be caused by network activity,
819  * or by a previous clock jump that was not detected.
820  *
821  * So if we have not gotten a 0-delta recently, we will
822  * still allow this new low RTT, but just yell about it. */
823  if (old_delta > new_delta * DELTA_DISCREPENCY_RATIO_MAX) {
824  static ratelim_t dec_notice_limit = RATELIM_INIT(300);
825  log_fn_ratelim(&dec_notice_limit, LOG_NOTICE, LD_CIRC,
826  "Sudden decrease in circuit RTT (%"PRIu64" vs %"PRIu64
827  "), likely due to clock jump.",
828  new_delta/1000, old_delta/1000);
829 
830  return is_monotime_clock_broken;
831  }
832 
833  /* If new_delta is significantly larger than old_delta, then
834  * this means that the monotime clock suddenly jumped forward.
835  * However, do not cache this value, because it may also be caused
836  * by network activity.
837  */
838  if (new_delta > old_delta * DELTA_DISCREPENCY_RATIO_MAX) {
839  static ratelim_t dec_notice_limit = RATELIM_INIT(300);
840  log_fn_ratelim(&dec_notice_limit, LOG_PROTOCOL_WARN, LD_CIRC,
841  "Sudden increase in circuit RTT (%"PRIu64" vs %"PRIu64
842  "), likely due to clock jump or suspended remote endpoint.",
843  new_delta/1000, old_delta/1000);
844 
845  return true;
846  }
847 
848  /* All good! Update cached status, too */
849  is_monotime_clock_broken = false;
850 
851  return false;
852 }
853 
854 /**
855  * Is the monotime clock stalled according to any circuits?
856  */
857 bool
859 {
860  return !is_monotime_clock_broken;
861 }
862 
863 /**
864  * Called when we get a SENDME. Updates circuit RTT by pulling off a
865  * timestamp of when we sent the CIRCWINDOW_INCREMENT-th cell from
866  * the queue of such timestamps, and comparing that to current time.
867  *
868  * Also updates min, max, and EWMA of RTT.
869  *
870  * Returns the current circuit RTT in usecs, or 0 if it could not be
871  * measured (due to clock jump, stall, etc).
872  */
873 static uint64_t
875  uint64_t now_usec)
876 {
877  uint64_t rtt, ewma_cnt;
878  uint64_t sent_at_timestamp;
879 
880  tor_assert(cc);
881 
882  /* Get the time that we sent the cell that resulted in the other
883  * end sending this sendme. Use this to calculate RTT */
884  sent_at_timestamp = dequeue_timestamp(cc->sendme_pending_timestamps);
885 
886  rtt = now_usec - sent_at_timestamp;
887 
888  /* Do not update RTT at all if it looks fishy */
889  if (time_delta_stalled_or_jumped(cc, cc->ewma_rtt_usec, rtt)) {
890  num_clock_stalls++; /* Accounting */
891  return 0;
892  }
893 
894  ewma_cnt = n_ewma_count(cc);
895 
896  cc->ewma_rtt_usec = n_count_ewma(rtt, cc->ewma_rtt_usec, ewma_cnt);
897 
898  if (rtt > cc->max_rtt_usec) {
899  cc->max_rtt_usec = rtt;
900  }
901 
902  if (cc->min_rtt_usec == 0) {
903  // If we do not have a min_rtt yet, use current ewma
904  cc->min_rtt_usec = cc->ewma_rtt_usec;
905  } else if (cc->cwnd == cc->cwnd_min && !cc->in_slow_start) {
906  // Raise min rtt if cwnd hit cwnd_min. This gets us out of a wedge state
907  // if we hit cwnd_min due to an abnormally low rtt.
908  uint64_t new_rtt = percent_max_mix(cc->ewma_rtt_usec, cc->min_rtt_usec,
909  rtt_reset_pct);
910 
911  static ratelim_t rtt_notice_limit = RATELIM_INIT(300);
912  log_fn_ratelim(&rtt_notice_limit, LOG_NOTICE, LD_CIRC,
913  "Resetting circ RTT from %"PRIu64" to %"PRIu64" due to low cwnd",
914  cc->min_rtt_usec/1000, new_rtt/1000);
915 
916  cc->min_rtt_usec = new_rtt;
917  num_rtt_reset++; /* Accounting */
918  } else if (cc->ewma_rtt_usec < cc->min_rtt_usec) {
919  // Using the EWMA for min instead of current RTT helps average out
920  // effects from other conns
921  cc->min_rtt_usec = cc->ewma_rtt_usec;
922  }
923 
924  return rtt;
925 }
926 
927 /**
928  * Called when we get a SENDME. Updates the bandwidth-delay-product (BDP)
929  * estimates of a circuit. Several methods of computing BDP are used,
930  * depending on scenario. While some congestion control algorithms only
931  * use one of these methods, we update them all because it's quick and easy.
932  *
933  * - now_usec is the current monotime in usecs.
934  * - curr_rtt_usec is the current circuit RTT in usecs. It may be 0 if no
935  * RTT could bemeasured.
936  *
937  * Returns true if we were able to update BDP, false otherwise.
938  */
939 static bool
941  const circuit_t *circ,
942  const crypt_path_t *layer_hint,
943  uint64_t now_usec,
944  uint64_t curr_rtt_usec)
945 {
946  int chan_q = 0;
947  unsigned int blocked_on_chan = 0;
948  uint64_t timestamp_usec;
949  uint64_t sendme_rate_bdp = 0;
950 
951  tor_assert(cc);
952 
953  if (CIRCUIT_IS_ORIGIN(circ)) {
954  /* origin circs use n_chan */
955  chan_q = circ->n_chan_cells.n;
956  blocked_on_chan = circ->streams_blocked_on_n_chan;
957  } else {
958  /* Both onion services and exits use or_circuit and p_chan */
959  chan_q = CONST_TO_OR_CIRCUIT(circ)->p_chan_cells.n;
960  blocked_on_chan = circ->streams_blocked_on_p_chan;
961  }
962 
963  /* If we have no EWMA RTT, it is because monotime has been stalled
964  * or messed up the entire time so far. Set our BDP estimates directly
965  * to current cwnd */
966  if (!cc->ewma_rtt_usec) {
967  uint64_t cwnd = cc->cwnd;
968 
969  tor_assert_nonfatal(cc->cwnd <= cwnd_max);
970 
971  /* If the channel is blocked, keep subtracting off the chan_q
972  * until we hit the min cwnd. */
973  if (blocked_on_chan) {
974  /* Cast is fine because we're less than int32 */
975  if (chan_q >= (int64_t)cwnd) {
976  log_notice(LD_CIRC,
977  "Clock stall with large chanq: %d %"PRIu64, chan_q, cwnd);
978  cwnd = cc->cwnd_min;
979  } else {
980  cwnd = MAX(cwnd - chan_q, cc->cwnd_min);
981  }
982  cc->blocked_chan = 1;
983  } else {
984  cc->blocked_chan = 0;
985  }
986 
987  cc->bdp[BDP_ALG_CWND_RTT] = cwnd;
988  cc->bdp[BDP_ALG_INFLIGHT_RTT] = cwnd;
989  cc->bdp[BDP_ALG_SENDME_RATE] = cwnd;
990  cc->bdp[BDP_ALG_PIECEWISE] = cwnd;
991 
992  static ratelim_t dec_notice_limit = RATELIM_INIT(300);
993  log_fn_ratelim(&dec_notice_limit, LOG_NOTICE, LD_CIRC,
994  "Our clock has been stalled for the entire lifetime of a circuit. "
995  "Performance may be sub-optimal.");
996 
997  return blocked_on_chan;
998  }
999 
1000  /* Congestion window based BDP will respond to changes in RTT only, and is
1001  * relative to cwnd growth. It is useful for correcting for BDP
1002  * overestimation, but if BDP is higher than the current cwnd, it will
1003  * underestimate it.
1004  *
1005  * We multiply here first to avoid precision issues from min_RTT being
1006  * close to ewma RTT. Since all fields are u64, there is plenty of
1007  * room here to multiply first.
1008  */
1009  cc->bdp[BDP_ALG_CWND_RTT] = cc->cwnd*cc->min_rtt_usec/cc->ewma_rtt_usec;
1010 
1011  /*
1012  * If we have no pending streams, we do not have enough data to fill
1013  * the BDP, so preserve our old estimates but do not make any more.
1014  */
1015  if (!blocked_on_chan && !circuit_has_active_streams(circ, layer_hint)) {
1016  log_info(LD_CIRC,
1017  "CC: Streams drained. Spare package window: %"PRIu64
1018  ", no BDP update", cc->cwnd - cc->inflight);
1019 
1020  /* Clear SENDME timestamps; they will be wrong with intermittent data */
1021  SMARTLIST_FOREACH(cc->sendme_arrival_timestamps, uint64_t *, t,
1022  tor_free(t));
1024  } else if (curr_rtt_usec && is_monotime_clock_reliable()) {
1025  /* Sendme-based BDP will quickly measure BDP in much less than
1026  * a cwnd worth of data when in use (in 2-10 SENDMEs).
1027  *
1028  * But if the link goes idle, it will be vastly lower than true BDP. Hence
1029  * we only compute it if we have either pending stream data, or streams
1030  * are still blocked on the channel queued data.
1031  *
1032  * We also do not compute it if we do not have a current RTT passed in,
1033  * because that means that monotime is currently stalled or just jumped.
1034  */
1036 
1037  if (smartlist_len(cc->sendme_arrival_timestamps) >= bwe_sendme_min) {
1038  /* If we have more sendmes than fit in a cwnd, trim the list.
1039  * Those are not acurrately measuring throughput, if cwnd is
1040  * currently smaller than BDP */
1041  while (smartlist_len(cc->sendme_arrival_timestamps) >
1042  bwe_sendme_min &&
1043  (uint64_t)smartlist_len(cc->sendme_arrival_timestamps) >
1044  n_ewma_count(cc)) {
1046  }
1047  int sendme_cnt = smartlist_len(cc->sendme_arrival_timestamps);
1048 
1049  /* Calculate SENDME_BWE_COUNT pure average */
1050  timestamp_usec = peek_timestamp(cc->sendme_arrival_timestamps);
1051  uint64_t delta = now_usec - timestamp_usec;
1052 
1053  /* In Shadow, the time delta between acks can be 0 if there is no
1054  * network activity between them. Only update BDP if the delta is
1055  * non-zero. */
1056  if (delta > 0) {
1057  /* The acked data is in sendme_cnt-1 chunks, because we are counting
1058  * the data that is processed by the other endpoint *between* all of
1059  * these sendmes. There's one less gap between the sendmes than the
1060  * number of sendmes. */
1061  uint64_t cells = (sendme_cnt-1)*cc->sendme_inc;
1062 
1063  /* The bandwidth estimate is cells/delta, which when multiplied
1064  * by min RTT obtains the BDP. However, we multiply first to
1065  * avoid precision issues with the RTT being close to delta in size. */
1066  sendme_rate_bdp = cells*cc->min_rtt_usec/delta;
1067 
1068  /* Calculate BDP_EWMA_COUNT N-EWMA */
1069  cc->bdp[BDP_ALG_SENDME_RATE] =
1070  n_count_ewma(sendme_rate_bdp, cc->bdp[BDP_ALG_SENDME_RATE],
1071  n_ewma_count(cc));
1072  }
1073  }
1074 
1075  /* In-flight BDP will cause the cwnd to drift down when underutilized.
1076  * It is most useful when the local OR conn is blocked, so we only
1077  * compute it if we're utilized. */
1078  cc->bdp[BDP_ALG_INFLIGHT_RTT] =
1079  (cc->inflight - chan_q)*cc->min_rtt_usec/
1080  MAX(cc->ewma_rtt_usec, curr_rtt_usec);
1081  } else {
1082  /* We can still update inflight with just an EWMA RTT, but only
1083  * if there is data flowing */
1084  cc->bdp[BDP_ALG_INFLIGHT_RTT] =
1085  (cc->inflight - chan_q)*cc->min_rtt_usec/cc->ewma_rtt_usec;
1086  }
1087 
1088  /* The orconn is blocked; use smaller of inflight vs SENDME */
1089  if (blocked_on_chan) {
1090  log_info(LD_CIRC, "CC: Streams blocked on circ channel. Chanq: %d",
1091  chan_q);
1092 
1093  /* A blocked channel is an immediate congestion signal, but it still
1094  * happens only once per cwnd */
1095  if (!cc->blocked_chan) {
1096  cc->next_cc_event = 0;
1097  cc->blocked_chan = 1;
1098  }
1099 
1100  if (cc->bdp[BDP_ALG_SENDME_RATE]) {
1101  cc->bdp[BDP_ALG_PIECEWISE] = MIN(cc->bdp[BDP_ALG_INFLIGHT_RTT],
1102  cc->bdp[BDP_ALG_SENDME_RATE]);
1103  } else {
1104  cc->bdp[BDP_ALG_PIECEWISE] = cc->bdp[BDP_ALG_INFLIGHT_RTT];
1105  }
1106  } else {
1107  /* If we were previously blocked, emit a new congestion event
1108  * now that we are unblocked, to re-evaluate cwnd */
1109  if (cc->blocked_chan) {
1110  cc->blocked_chan = 0;
1111  cc->next_cc_event = 0;
1112  log_info(LD_CIRC, "CC: Streams un-blocked on circ channel. Chanq: %d",
1113  chan_q);
1114  }
1115 
1116  cc->bdp[BDP_ALG_PIECEWISE] = MAX(cc->bdp[BDP_ALG_SENDME_RATE],
1117  cc->bdp[BDP_ALG_CWND_RTT]);
1118  }
1119 
1120  /* We can end up with no piecewise value if we didn't have either
1121  * a SENDME estimate or enough data for an inflight estimate.
1122  * It also happens on the very first sendme, since we need two
1123  * to get a BDP. In these cases, use the cwnd method. */
1124  if (!cc->bdp[BDP_ALG_PIECEWISE]) {
1125  cc->bdp[BDP_ALG_PIECEWISE] = cc->bdp[BDP_ALG_CWND_RTT];
1126  log_info(LD_CIRC, "CC: No piecewise BDP. Using %"PRIu64,
1127  cc->bdp[BDP_ALG_PIECEWISE]);
1128  }
1129 
1130  if (cc->next_cc_event == 0) {
1131  if (CIRCUIT_IS_ORIGIN(circ)) {
1132  log_info(LD_CIRC,
1133  "CC: Circuit %d "
1134  "SENDME RTT: %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64", "
1135  "BDP estimates: "
1136  "%"PRIu64", "
1137  "%"PRIu64", "
1138  "%"PRIu64", "
1139  "%"PRIu64", "
1140  "%"PRIu64". ",
1141  CONST_TO_ORIGIN_CIRCUIT(circ)->global_identifier,
1142  cc->min_rtt_usec/1000,
1143  curr_rtt_usec/1000,
1144  cc->ewma_rtt_usec/1000,
1145  cc->max_rtt_usec/1000,
1146  cc->bdp[BDP_ALG_INFLIGHT_RTT],
1147  cc->bdp[BDP_ALG_CWND_RTT],
1148  sendme_rate_bdp,
1149  cc->bdp[BDP_ALG_SENDME_RATE],
1150  cc->bdp[BDP_ALG_PIECEWISE]
1151  );
1152  } else {
1153  log_info(LD_CIRC,
1154  "CC: Circuit %"PRIu64":%d "
1155  "SENDME RTT: %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64", "
1156  "%"PRIu64", "
1157  "%"PRIu64", "
1158  "%"PRIu64", "
1159  "%"PRIu64", "
1160  "%"PRIu64". ",
1161  CONST_TO_OR_CIRCUIT(circ)->p_chan->global_identifier,
1162  CONST_TO_OR_CIRCUIT(circ)->p_circ_id,
1163  cc->min_rtt_usec/1000,
1164  curr_rtt_usec/1000,
1165  cc->ewma_rtt_usec/1000,
1166  cc->max_rtt_usec/1000,
1167  cc->bdp[BDP_ALG_INFLIGHT_RTT],
1168  cc->bdp[BDP_ALG_CWND_RTT],
1169  sendme_rate_bdp,
1170  cc->bdp[BDP_ALG_SENDME_RATE],
1171  cc->bdp[BDP_ALG_PIECEWISE]
1172  );
1173  }
1174  }
1175 
1176  /* We updated BDP this round if either we had a blocked channel, or
1177  * the curr_rtt_usec was not 0. */
1178  bool ret = (blocked_on_chan || curr_rtt_usec != 0);
1179  if (ret) {
1180  tor_trace(TR_SUBSYS(cc), TR_EV(bdp_update), circ, cc, curr_rtt_usec,
1181  sendme_rate_bdp);
1182  }
1183  return ret;
1184 }
1185 
1186 /**
1187  * Dispatch the sendme to the appropriate congestion control algorithm.
1188  */
1189 int
1191  const circuit_t *circ,
1192  const crypt_path_t *layer_hint)
1193 {
1194  int ret = -END_CIRC_REASON_INTERNAL;
1195  switch (cc->cc_alg) {
1196  case CC_ALG_WESTWOOD:
1197  ret = congestion_control_westwood_process_sendme(cc, circ, layer_hint);
1198  break;
1199 
1200  case CC_ALG_VEGAS:
1201  ret = congestion_control_vegas_process_sendme(cc, circ, layer_hint);
1202  break;
1203 
1204  case CC_ALG_NOLA:
1205  ret = congestion_control_nola_process_sendme(cc, circ, layer_hint);
1206  break;
1207 
1208  case CC_ALG_SENDME:
1209  default:
1210  tor_assert(0);
1211  }
1212 
1213  if (cc->cwnd > cwnd_max) {
1214  static ratelim_t cwnd_limit = RATELIM_INIT(60);
1215  log_fn_ratelim(&cwnd_limit, LOG_NOTICE, LD_CIRC,
1216  "Congestion control cwnd %"PRIu64" exceeds max %d, clamping.",
1217  cc->cwnd, cwnd_max);
1218  cc->cwnd = cwnd_max;
1219  }
1220 
1221  return ret;
1222 }
1223 
1224 /**
1225  * Build an extension field request to negotiate congestion control.
1226  *
1227  * If congestion control is enabled, field TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST
1228  * is created in msg_out. It is a single 0-length field that signifies that we
1229  * want to use congestion control. The length of msg_out is provided via
1230  * msg_len_out.
1231  *
1232  * If congestion control is not enabled, a payload with 0 extensions is created
1233  * and returned.
1234  *
1235  * If there is a failure building the request, -1 is returned, else 0.
1236  *
1237  * *msg_out must be freed if the return value is 0.
1238  */
1239 int
1240 congestion_control_build_ext_request(uint8_t **msg_out, size_t *msg_len_out)
1241 {
1242  uint8_t *request = NULL;
1243  trn_extension_t *ext = NULL;
1244  trn_extension_field_t *field = NULL;
1245 
1246  ext = trn_extension_new();
1247 
1248  /* With congestion control enabled, add the request, else it is an empty
1249  * request in the payload. */
1250 
1252  /* Build the extension field that will hold the CC field. */
1253  field = trn_extension_field_new();
1254  trn_extension_field_set_field_type(field,
1255  TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST);
1256 
1257  /* No payload indicating a request to use congestion control. */
1258  trn_extension_field_set_field_len(field, 0);
1259 
1260  /* Build final extension. */
1261  trn_extension_add_fields(ext, field);
1262  trn_extension_set_num(ext, 1);
1263  }
1264 
1265  /* Encode extension. */
1266  ssize_t ret = trn_extension_encoded_len(ext);
1267  if (BUG(ret < 0)) {
1268  goto err;
1269  }
1270  size_t request_len = ret;
1271  request = tor_malloc_zero(request_len);
1272  ret = trn_extension_encode(request, request_len, ext);
1273  if (BUG(ret < 0)) {
1274  tor_free(request);
1275  goto err;
1276  }
1277  *msg_out = request;
1278  *msg_len_out = request_len;
1279 
1280  /* Free everything, we've encoded the request now. */
1281  ret = 0;
1282 
1283  err:
1284  trn_extension_free(ext);
1285  return (int)ret;
1286 }
1287 
1288 /**
1289  * Parse a congestion control ntorv3 request payload for extensions.
1290  *
1291  * On parsing failure, -1 is returned.
1292  *
1293  * If congestion control request is present, return 1. If it is not present,
1294  * return 0.
1295  *
1296  * WARNING: Called from CPU worker! Must not access any global state.
1297  */
1298 int
1299 congestion_control_parse_ext_request(const uint8_t *msg, const size_t msg_len)
1300 {
1301  ssize_t ret = 0;
1302  trn_extension_t *ext = NULL;
1303  size_t num_fields = 0;
1304 
1305  /* Parse extension from payload. */
1306  ret = trn_extension_parse(&ext, msg, msg_len);
1307  if (ret < 0) {
1308  goto end;
1309  }
1310 
1311  /* No extension implies no support for congestion control. In this case, we
1312  * simply return 0 to indicate CC is disabled. */
1313  if ((num_fields = trn_extension_get_num(ext)) == 0) {
1314  ret = 0;
1315  goto end;
1316  }
1317 
1318  /* Go over all fields. If any field is TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST,
1319  * then congestion control is enabled. Ignore unknown fields. */
1320  for (size_t f = 0; f < num_fields; f++) {
1321  const trn_extension_field_t *field = trn_extension_get_fields(ext, f);
1322  if (field == NULL) {
1323  ret = -1;
1324  goto end;
1325  }
1326 
1327  /* For congestion control to be enabled, we only need the field type. */
1328  if (trn_extension_field_get_field_type(field) ==
1329  TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST) {
1330  ret = 1;
1331  break;
1332  }
1333  }
1334 
1335  end:
1336  trn_extension_free(ext);
1337  return (int)ret;
1338 }
1339 
1340 /**
1341  * Given our observed parameters for circuits and congestion control,
1342  * as well as the parameters for the resulting circuit, build a response
1343  * payload using extension fields into *msg_out, with length specified in
1344  * *msg_out_len.
1345  *
1346  * If congestion control will be enabled, the extension field for
1347  * TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE will contain the sendme_inc value.
1348  *
1349  * If congestion control won't be enabled, an extension payload with 0
1350  * fields will be created.
1351  *
1352  * Return 0 if an extension payload was created in *msg_out, and -1 on
1353  * error.
1354  *
1355  * *msg_out must be freed if the return value is 0.
1356  *
1357  * WARNING: Called from CPU worker! Must not access any global state.
1358  */
1359 int
1361  const circuit_params_t *circ_params,
1362  uint8_t **msg_out, size_t *msg_len_out)
1363 {
1364  ssize_t ret;
1365  uint8_t *request = NULL;
1366  trn_extension_t *ext = NULL;
1367  trn_extension_field_t *field = NULL;
1368  trn_extension_field_cc_t *cc_field = NULL;
1369 
1370  tor_assert(our_params);
1371  tor_assert(circ_params);
1372  tor_assert(msg_out);
1373  tor_assert(msg_len_out);
1374 
1375  ext = trn_extension_new();
1376 
1377  if (circ_params->cc_enabled) {
1378  /* Build the extension field that will hold the CC field. */
1379  field = trn_extension_field_new();
1380  trn_extension_field_set_field_type(field,
1381  TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE);
1382 
1383  /* Build the congestion control field response. */
1384  cc_field = trn_extension_field_cc_new();
1385  trn_extension_field_cc_set_sendme_inc(cc_field,
1386  our_params->sendme_inc_cells);
1387 
1388  ret = trn_extension_field_cc_encoded_len(cc_field);
1389  if (BUG(ret <= 0)) {
1390  trn_extension_field_free(field);
1391  goto err;
1392  }
1393  size_t field_len = ret;
1394  trn_extension_field_set_field_len(field, field_len);
1395  trn_extension_field_setlen_field(field, field_len);
1396 
1397  uint8_t *field_array = trn_extension_field_getarray_field(field);
1398  ret = trn_extension_field_cc_encode(field_array,
1399  trn_extension_field_getlen_field(field), cc_field);
1400  if (BUG(ret <= 0)) {
1401  trn_extension_field_free(field);
1402  goto err;
1403  }
1404 
1405  /* Build final extension. */
1406  trn_extension_add_fields(ext, field);
1407  trn_extension_set_num(ext, 1);
1408  }
1409 
1410  /* Encode extension. */
1411  ret = trn_extension_encoded_len(ext);
1412  if (BUG(ret < 0)) {
1413  goto err;
1414  }
1415  size_t request_len = ret;
1416  request = tor_malloc_zero(request_len);
1417  ret = trn_extension_encode(request, request_len, ext);
1418  if (BUG(ret < 0)) {
1419  tor_free(request);
1420  goto err;
1421  }
1422  *msg_out = request;
1423  *msg_len_out = request_len;
1424 
1425  /* We've just encoded the extension, clean everything. */
1426  ret = 0;
1427 
1428  err:
1429  trn_extension_free(ext);
1430  trn_extension_field_cc_free(cc_field);
1431  return (int)ret;
1432 }
1433 
1434 /** Return true iff the given sendme increment is within the acceptable
1435  * margins. */
1436 bool
1438 {
1439  /* We will only accept this response (and this circuit) if sendme_inc
1440  * is within a factor of 2 of our consensus value. We should not need
1441  * to change cc_sendme_inc much, and if we do, we can spread out those
1442  * changes over smaller increments once every 4 hours. Exits that
1443  * violate this range should just not be used. */
1444 #define MAX_SENDME_INC_NEGOTIATE_FACTOR 2
1445 
1446  if (sendme_inc == 0)
1447  return false;
1448 
1449  if (sendme_inc >
1450  MAX_SENDME_INC_NEGOTIATE_FACTOR * congestion_control_sendme_inc() ||
1451  sendme_inc <
1452  congestion_control_sendme_inc() / MAX_SENDME_INC_NEGOTIATE_FACTOR) {
1453  return false;
1454  }
1455  return true;
1456 }
1457 
1458 /** Return 1 if CC is enabled which also will set the SENDME increment into our
1459  * params_out. Return 0 if CC is disabled. Else, return -1 on error. */
1460 int
1462  const size_t msg_len,
1463  circuit_params_t *params_out)
1464 {
1465  ssize_t ret = 0;
1466  size_t num_fields = 0;
1467  trn_extension_t *ext = NULL;
1468  trn_extension_field_cc_t *cc_field = NULL;
1469 
1470  /* We will only accept this response (and this circuit) if sendme_inc
1471  * is within a factor of 2 of our consensus value. We should not need
1472  * to change cc_sendme_inc much, and if we do, we can spread out those
1473  * changes over smaller increments once every 4 hours. Exits that
1474  * violate this range should just not be used. */
1475 #define MAX_SENDME_INC_NEGOTIATE_FACTOR 2
1476 
1477  /* Parse extension from payload. */
1478  ret = trn_extension_parse(&ext, msg, msg_len);
1479  if (ret < 0) {
1480  goto end;
1481  }
1482 
1483  if ((num_fields = trn_extension_get_num(ext)) == 0) {
1484  ret = 0;
1485  goto end;
1486  }
1487 
1488  /* Go over all fields. If any field is TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE,
1489  * then congestion control is enabled. Ignore unknown fields. */
1490  for (size_t f = 0; f < num_fields; f++) {
1491  const trn_extension_field_t *field = trn_extension_get_fields(ext, f);
1492  if (field == NULL) {
1493  ret = -1;
1494  goto end;
1495  }
1496 
1497  /* Only examine TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE; ignore other fields */
1498  if (trn_extension_field_get_field_type(field) ==
1499  TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE) {
1500 
1501  /* Parse the field into the congestion control field. */
1502  ret = trn_extension_field_cc_parse(&cc_field,
1503  trn_extension_field_getconstarray_field(field),
1504  trn_extension_field_getlen_field(field));
1505  if (ret < 0) {
1506  goto end;
1507  }
1508 
1509  uint8_t sendme_inc_cells =
1510  trn_extension_field_cc_get_sendme_inc(cc_field);
1511  if (!congestion_control_validate_sendme_increment(sendme_inc_cells)) {
1512  ret = -1;
1513  goto end;
1514  }
1515 
1516  /* All good. Get value and break */
1517  params_out->sendme_inc_cells = sendme_inc_cells;
1518  ret = 1;
1519  break;
1520  }
1521  }
1522 
1523  end:
1524  trn_extension_free(ext);
1525  trn_extension_field_cc_free(cc_field);
1526 
1527  return (int)ret;
1528 }
1529 
1530 /**
1531  * Returns a formatted string of fields containing congestion
1532  * control information, for the CIRC_BW control port event.
1533  *
1534  * An origin circuit can have a ccontrol object directly on it,
1535  * if it is an onion service, or onion client. Exit-bound clients
1536  * will have the ccontrol on the cpath associated with their exit
1537  * (the last one in the cpath list).
1538  *
1539  * WARNING: This function does not support leaky-pipe topology. It
1540  * is to be used for control port information only.
1541  */
1542 char *
1544 {
1545  const congestion_control_t *ccontrol = NULL;
1546  char *ret = NULL;
1547  int len;
1548 
1549  if (TO_CIRCUIT(circ)->ccontrol) {
1550  ccontrol = TO_CIRCUIT(circ)->ccontrol;
1551  } else if (circ->cpath && circ->cpath->prev->ccontrol) {
1552  /* Get ccontrol for last hop (exit) if it exists */
1553  ccontrol = circ->cpath->prev->ccontrol;
1554  }
1555 
1556  if (!ccontrol)
1557  return NULL;
1558 
1559  len = tor_asprintf(&ret,
1560  " SS=%d CWND=%"PRIu64" RTT=%"PRIu64" MIN_RTT=%"PRIu64,
1561  ccontrol->in_slow_start, ccontrol->cwnd,
1562  ccontrol->ewma_rtt_usec/1000,
1563  ccontrol->min_rtt_usec/1000);
1564  if (len < 0) {
1565  log_warn(LD_BUG, "Unable to format event for controller.");
1566  return NULL;
1567  }
1568 
1569  return ret;
1570 }
Header file for channel.c.
Header file for circuitlist.c.
#define CIRCUIT_IS_ORIGIN(c)
Definition: circuitlist.h:147
#define MAX(a, b)
Definition: cmp.h:22
int in_main_thread(void)
uint64_t monotime_absolute_usec(void)
Definition: compat_time.c:804
Functions and types for monotonic times.
const or_options_t * get_options(void)
Definition: config.c:926
Header file for config.c.
bool congestion_control_validate_sendme_increment(uint8_t sendme_inc)
static uint64_t n_ewma_count(const congestion_control_t *cc)
char * congestion_control_get_control_port_fields(const origin_circuit_t *circ)
int sendme_get_inc_count(const circuit_t *circ, const crypt_path_t *layer_hint)
static uint64_t dequeue_timestamp(smartlist_t *timestamps_u64_usecs)
static uint8_t rtt_reset_pct
static uint64_t peek_timestamp(const smartlist_t *timestamps_u64_usecs)
static uint8_t n_ewma_max
static uint64_t congestion_control_update_circuit_rtt(congestion_control_t *, uint64_t)
void congestion_control_free_(congestion_control_t *cc)
static void congestion_control_init(congestion_control_t *cc, const circuit_params_t *params, cc_path_t path)
int congestion_control_dispatch_cc_alg(congestion_control_t *cc, const circuit_t *circ, const crypt_path_t *layer_hint)
bool circuit_sent_cell_for_sendme(const circuit_t *circ, const crypt_path_t *layer_hint)
uint64_t congestion_control_get_num_clock_stalls(void)
static bool time_delta_should_use_heuristics(const congestion_control_t *cc)
static bool time_delta_stalled_or_jumped(const congestion_control_t *cc, uint64_t old_delta, uint64_t new_delta)
uint64_t congestion_control_get_num_rtt_reset(void)
void congestion_control_note_cell_sent(congestion_control_t *cc, const circuit_t *circ, const crypt_path_t *cpath)
int congestion_control_build_ext_response(const circuit_params_t *our_params, const circuit_params_t *circ_params, uint8_t **msg_out, size_t *msg_len_out)
static int circuit_has_active_streams(const circuit_t *circ, const crypt_path_t *layer_hint)
static bool congestion_control_update_circuit_bdp(congestion_control_t *, const circuit_t *, const crypt_path_t *, uint64_t, uint64_t)
int congestion_control_parse_ext_request(const uint8_t *msg, const size_t msg_len)
void congestion_control_set_cc_enabled(void)
int congestion_control_build_ext_request(uint8_t **msg_out, size_t *msg_len_out)
static void congestion_control_init_params(congestion_control_t *cc, const circuit_params_t *params, cc_path_t path)
int congestion_control_parse_ext_response(const uint8_t *msg, const size_t msg_len, circuit_params_t *params_out)
static uint8_t bwe_sendme_min
congestion_control_t * congestion_control_new(const circuit_params_t *params, cc_path_t path)
bool congestion_control_enabled(void)
bool congestion_control_update_circuit_estimates(congestion_control_t *cc, const circuit_t *circ, const crypt_path_t *layer_hint)
uint64_t cc_stats_circs_created
bool is_monotime_clock_reliable(void)
static uint8_t n_ewma_ss
int congestion_control_get_package_window(const circuit_t *circ, const crypt_path_t *cpath)
static void enqueue_timestamp(smartlist_t *timestamps_u64, uint64_t timestamp_usec)
static uint8_t n_ewma_cwnd_pct
void congestion_control_new_consensus_params(const networkstatus_t *ns)
Public APIs for congestion control.
static uint64_t n_count_ewma(uint64_t curr, uint64_t prev, uint64_t N)
static uint64_t percent_max_mix(uint64_t a, uint64_t b, uint8_t pct_max)
static uint8_t congestion_control_sendme_inc(void)
int congestion_control_nola_process_sendme(congestion_control_t *cc, const circuit_t *circ, const crypt_path_t *layer_hint)
void congestion_control_nola_set_params(congestion_control_t *cc)
Private-ish APIs for the TOR_NOLA congestion control algorithm.
Structure definitions for congestion control.
static uint64_t CWND_UPDATE_RATE(const struct congestion_control_t *cc)
#define NUM_BDP_ALGS
@ CC_ALG_VEGAS
@ CC_ALG_NOLA
@ CC_ALG_SENDME
@ CC_ALG_WESTWOOD
void congestion_control_vegas_set_params(congestion_control_t *cc, cc_path_t path)
int congestion_control_vegas_process_sendme(congestion_control_t *cc, const circuit_t *circ, const crypt_path_t *layer_hint)
Private-ish APIs for the TOR_VEGAS congestion control algorithm.
int congestion_control_westwood_process_sendme(congestion_control_t *cc, const circuit_t *circ, const crypt_path_t *layer_hint)
void congestion_control_westwood_set_params(congestion_control_t *cc)
Private-ish APIs for the TOR_WESTWOOD congestion control algorithm.
Header file for connection.c.
Header file for crypt_path.c.
#define TR_SUBSYS(name)
Definition: events.h:45
#define log_fn_ratelim(ratelim, severity, domain, args,...)
Definition: log.h:288
#define LD_BUG
Definition: log.h:86
#define LOG_NOTICE
Definition: log.h:50
#define LD_CIRC
Definition: log.h:82
#define LOG_INFO
Definition: log.h:45
#define tor_free(p)
Definition: malloc.h:56
int32_t networkstatus_get_param(const networkstatus_t *ns, const char *param_name, int32_t default_val, int32_t min_val, int32_t max_val)
Header file for networkstatus.c.
Header file for onion_crypto.c.
Master header file for Tor-specific functionality.
#define TO_CIRCUIT(x)
Definition: or.h:836
#define CIRCWINDOW_START
Definition: or.h:385
#define TO_CONN(c)
Definition: or.h:603
#define CIRCWINDOW_INCREMENT
Definition: or.h:389
Origin circuit structure.
int tor_asprintf(char **strp, const char *fmt,...)
Definition: printf.c:75
Header file for sendme.c.
smartlist_t * smartlist_new(void)
void smartlist_add(smartlist_t *sl, void *element)
void smartlist_clear(smartlist_t *sl)
void smartlist_del_keeporder(smartlist_t *sl, int idx)
#define SMARTLIST_FOREACH(sl, type, var, cmd)
uint8_t sendme_inc_cells
Definition: onion_crypto.h:36
unsigned int streams_blocked_on_n_chan
Definition: circuit_st.h:92
cell_queue_t n_chan_cells
Definition: circuit_st.h:82
int package_window
Definition: circuit_st.h:117
unsigned int streams_blocked_on_p_chan
Definition: circuit_st.h:95
struct congestion_control_t * ccontrol
Definition: circuit_st.h:250
smartlist_t * sendme_arrival_timestamps
smartlist_t * sendme_pending_timestamps
struct crypt_path_t * prev
Definition: crypt_path_st.h:80
struct congestion_control_t * ccontrol
Definition: crypt_path_st.h:89
struct edge_connection_t * next_stream
cell_queue_t p_chan_cells
Definition: or_circuit_st.h:35
edge_connection_t * n_streams
Definition: or_circuit_st.h:39
int AlwaysCongestionControl
edge_connection_t * p_streams
crypt_path_t * cpath
#define tor_assert(expr)
Definition: util_bug.h:102
#define tor_fragile_assert()
Definition: util_bug.h:270