Tor 0.4.9.0-alpha-dev
util_string.c
Go to the documentation of this file.
1/* Copyright (c) 2003-2004, Roger Dingledine
2 * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson.
3 * Copyright (c) 2007-2021, The Tor Project, Inc. */
4/* See LICENSE for licensing information */
5
6/**
7 * \file util_string.c
8 * \brief Non-standard string functions used throughout Tor.
9 **/
10
13#include "lib/err/torerr.h"
14#include "lib/ctime/di_ops.h"
16
17#include <string.h>
18#include <stdlib.h>
19
20/** Given <b>hlen</b> bytes at <b>haystack</b> and <b>nlen</b> bytes at
21 * <b>needle</b>, return a pointer to the first occurrence of the needle
22 * within the haystack, or NULL if there is no such occurrence.
23 *
24 * This function is <em>not</em> timing-safe.
25 *
26 * Requires that <b>nlen</b> be greater than zero.
27 */
28const void *
29tor_memmem(const void *_haystack, size_t hlen,
30 const void *_needle, size_t nlen)
31{
32#if defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2)
33 raw_assert(nlen);
34 if (nlen > hlen)
35 return NULL;
36 return memmem(_haystack, hlen, _needle, nlen);
37#else
38 /* This isn't as fast as the GLIBC implementation, but it doesn't need to
39 * be. */
40 const char *p, *last_possible_start;
41 const char *haystack = (const char*)_haystack;
42 const char *needle = (const char*)_needle;
43 char first;
44 raw_assert(nlen);
45
46 if (nlen > hlen)
47 return NULL;
48
49 p = haystack;
50 /* Last position at which the needle could start. */
51 last_possible_start = haystack + hlen - nlen;
52 first = *(const char*)needle;
53 while ((p = memchr(p, first, last_possible_start + 1 - p))) {
54 if (fast_memeq(p, needle, nlen))
55 return p;
56 if (++p > last_possible_start) {
57 /* This comparison shouldn't be necessary, since if p was previously
58 * equal to last_possible_start, the next memchr call would be
59 * "memchr(p, first, 0)", which will return NULL. But it clarifies the
60 * logic. */
61 return NULL;
62 }
63 }
64 return NULL;
65#endif /* defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2) */
66}
67
68const void *
69tor_memstr(const void *haystack, size_t hlen, const char *needle)
70{
71 return tor_memmem(haystack, hlen, needle, strlen(needle));
72}
73
74/** Return true iff the 'len' bytes at 'mem' are all zero. */
75int
76fast_mem_is_zero(const char *mem, size_t len)
77{
78 static const char ZERO[] = {
79 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
80 };
81 while (len >= sizeof(ZERO)) {
82 /* It's safe to use fast_memcmp here, since the very worst thing an
83 * attacker could learn is how many initial bytes of a secret were zero */
84 if (fast_memcmp(mem, ZERO, sizeof(ZERO)))
85 return 0;
86 len -= sizeof(ZERO);
87 mem += sizeof(ZERO);
88 }
89 /* Deal with leftover bytes. */
90 if (len)
91 return fast_memeq(mem, ZERO, len);
92
93 return 1;
94}
95
96/** Return true iff the DIGEST_LEN bytes in digest are all zero. */
97int
98tor_digest_is_zero(const char *digest)
99{
100 return safe_mem_is_zero(digest, DIGEST_LEN);
101}
102
103/** Return true iff the DIGEST256_LEN bytes in digest are all zero. */
104int
105tor_digest256_is_zero(const char *digest)
106{
107 return safe_mem_is_zero(digest, DIGEST256_LEN);
108}
109
110/** Remove from the string <b>s</b> every character which appears in
111 * <b>strip</b>. */
112void
113tor_strstrip(char *s, const char *strip)
114{
115 char *readp = s;
116 while (*readp) {
117 if (strchr(strip, *readp)) {
118 ++readp;
119 } else {
120 *s++ = *readp++;
121 }
122 }
123 *s = '\0';
124}
125
126/** Convert all alphabetic characters in the nul-terminated string <b>s</b> to
127 * lowercase. */
128void
130{
131 while (*s) {
132 *s = TOR_TOLOWER(*s);
133 ++s;
134 }
135}
136
137/** Convert all alphabetic characters in the nul-terminated string <b>s</b> to
138 * lowercase. */
139void
141{
142 while (*s) {
143 *s = TOR_TOUPPER(*s);
144 ++s;
145 }
146}
147
148/** Replaces <b>old</b> with <b>replacement</b> in <b>s</b> */
149void
150tor_strreplacechar(char *s, char find, char replacement)
151{
152 for (s = strchr(s, find); s; s = strchr(s + 1, find)) {
153 *s = replacement;
154 }
155}
156
157/** Return 1 if every character in <b>s</b> is printable, else return 0.
158 */
159int
160tor_strisprint(const char *s)
161{
162 while (*s) {
163 if (!TOR_ISPRINT(*s))
164 return 0;
165 s++;
166 }
167 return 1;
168}
169
170/** Return 1 if no character in <b>s</b> is uppercase, else return 0.
171 */
172int
173tor_strisnonupper(const char *s)
174{
175 while (*s) {
176 if (TOR_ISUPPER(*s))
177 return 0;
178 s++;
179 }
180 return 1;
181}
182
183/** Return true iff every character in <b>s</b> is whitespace space; else
184 * return false. */
185int
186tor_strisspace(const char *s)
187{
188 while (*s) {
189 if (!TOR_ISSPACE(*s))
190 return 0;
191 s++;
192 }
193 return 1;
194}
195
196/** As strcmp, except that either string may be NULL. The NULL string is
197 * considered to be before any non-NULL string. */
198int
199strcmp_opt(const char *s1, const char *s2)
200{
201 if (!s1) {
202 if (!s2)
203 return 0;
204 else
205 return -1;
206 } else if (!s2) {
207 return 1;
208 } else {
209 return strcmp(s1, s2);
210 }
211}
212
213/** Compares the first strlen(s2) characters of s1 with s2. Returns as for
214 * strcmp.
215 */
216int
217strcmpstart(const char *s1, const char *s2)
218{
219 size_t n = strlen(s2);
220 return strncmp(s1, s2, n);
221}
222
223/** Compares the first strlen(s2) characters of s1 with s2. Returns as for
224 * strcasecmp.
225 */
226int
227strcasecmpstart(const char *s1, const char *s2)
228{
229 size_t n = strlen(s2);
230 return strncasecmp(s1, s2, n);
231}
232
233/** Compare the value of the string <b>prefix</b> with the start of the
234 * <b>memlen</b>-byte memory chunk at <b>mem</b>. Return as for strcmp.
235 *
236 * [As fast_memcmp(mem, prefix, strlen(prefix)) but returns -1 if memlen is
237 * less than strlen(prefix).]
238 */
239int
240fast_memcmpstart(const void *mem, size_t memlen,
241 const char *prefix)
242{
243 size_t plen = strlen(prefix);
244 if (memlen < plen)
245 return -1;
246 return fast_memcmp(mem, prefix, plen);
247}
248
249/** Compares the last strlen(s2) characters of s1 with s2. Returns as for
250 * strcmp.
251 */
252int
253strcmpend(const char *s1, const char *s2)
254{
255 size_t n1 = strlen(s1), n2 = strlen(s2);
256 if (n2>n1)
257 return strcmp(s1,s2);
258 else
259 return strncmp(s1+(n1-n2), s2, n2);
260}
261
262/** Compares the last strlen(s2) characters of s1 with s2. Returns as for
263 * strcasecmp.
264 */
265int
266strcasecmpend(const char *s1, const char *s2)
267{
268 size_t n1 = strlen(s1), n2 = strlen(s2);
269 if (n2>n1) /* then they can't be the same; figure out which is bigger */
270 return strcasecmp(s1,s2);
271 else
272 return strncasecmp(s1+(n1-n2), s2, n2);
273}
274
275/** Return a pointer to the first char of s that is not whitespace and
276 * not a comment, or to the terminating NUL if no such character exists.
277 */
278const char *
279eat_whitespace(const char *s)
280{
281 raw_assert(s);
282
283 while (1) {
284 switch (*s) {
285 case '\0':
286 default:
287 return s;
288 case ' ':
289 case '\t':
290 case '\n':
291 case '\r':
292 ++s;
293 break;
294 case '#':
295 ++s;
296 while (*s && *s != '\n')
297 ++s;
298 }
299 }
300}
301
302/** Return a pointer to the first char of s that is not whitespace and
303 * not a comment, or to the terminating NUL if no such character exists.
304 */
305const char *
306eat_whitespace_eos(const char *s, const char *eos)
307{
308 raw_assert(s);
309 raw_assert(eos && s <= eos);
310
311 while (s < eos) {
312 switch (*s) {
313 case '\0':
314 default:
315 return s;
316 case ' ':
317 case '\t':
318 case '\n':
319 case '\r':
320 ++s;
321 break;
322 case '#':
323 ++s;
324 while (s < eos && *s && *s != '\n')
325 ++s;
326 }
327 }
328 return s;
329}
330
331/** Return a pointer to the first char of s that is not a space or a tab
332 * or a \\r, or to the terminating NUL if no such character exists. */
333const char *
335{
336 while (*s == ' ' || *s == '\t' || *s == '\r')
337 ++s;
338 return s;
339}
340
341/** As eat_whitespace_no_nl, but stop at <b>eos</b> whether we have
342 * found a non-whitespace character or not. */
343const char *
344eat_whitespace_eos_no_nl(const char *s, const char *eos)
345{
346 while (s < eos && (*s == ' ' || *s == '\t' || *s == '\r'))
347 ++s;
348 return s;
349}
350
351/** Return a pointer to the first char of s that is whitespace or <b>#</b>,
352 * or to the terminating NUL if no such character exists.
353 */
354const char *
355find_whitespace(const char *s)
356{
357 /* tor_assert(s); */
358 while (1) {
359 switch (*s)
360 {
361 case '\0':
362 case '#':
363 case ' ':
364 case '\r':
365 case '\n':
366 case '\t':
367 return s;
368 default:
369 ++s;
370 }
371 }
372}
373
374/** As find_whitespace, but stop at <b>eos</b> whether we have found a
375 * whitespace or not. */
376const char *
377find_whitespace_eos(const char *s, const char *eos)
378{
379 /* tor_assert(s); */
380 while (s < eos) {
381 switch (*s)
382 {
383 case '\0':
384 case '#':
385 case ' ':
386 case '\r':
387 case '\n':
388 case '\t':
389 return s;
390 default:
391 ++s;
392 }
393 }
394 return s;
395}
396
397/** Return the first occurrence of <b>needle</b> in <b>haystack</b> that
398 * occurs at the start of a line (that is, at the beginning of <b>haystack</b>
399 * or immediately after a newline). Return NULL if no such string is found.
400 */
401const char *
402find_str_at_start_of_line(const char *haystack, const char *needle)
403{
404 size_t needle_len = strlen(needle);
405
406 do {
407 if (!strncmp(haystack, needle, needle_len))
408 return haystack;
409
410 haystack = strchr(haystack, '\n');
411 if (!haystack)
412 return NULL;
413 else
414 ++haystack;
415 } while (*haystack);
416
417 return NULL;
418}
419
420/** Returns true if <b>string</b> could be a C identifier.
421 A C identifier must begin with a letter or an underscore and the
422 rest of its characters can be letters, numbers or underscores. No
423 length limit is imposed. */
424int
425string_is_C_identifier(const char *string)
426{
427 size_t iter;
428 size_t length = strlen(string);
429 if (!length)
430 return 0;
431
432 for (iter = 0; iter < length ; iter++) {
433 if (iter == 0) {
434 if (!(TOR_ISALPHA(string[iter]) ||
435 string[iter] == '_'))
436 return 0;
437 } else {
438 if (!(TOR_ISALPHA(string[iter]) ||
439 TOR_ISDIGIT(string[iter]) ||
440 string[iter] == '_'))
441 return 0;
442 }
443 }
444
445 return 1;
446}
447
448/** A byte with the top <b>x</b> bits set. */
449#define TOP_BITS(x) ((uint8_t)(0xFF << (8 - (x))))
450/** A byte with the lowest <b>x</b> bits set. */
451#define LOW_BITS(x) ((uint8_t)(0xFF >> (8 - (x))))
452
453/** Given the leading byte <b>b</b>, return the total number of bytes in the
454 * UTF-8 character. Returns 0 if it's an invalid leading byte.
455 */
456static uint8_t
458{
459 if ((TOP_BITS(1) & b) == 0x00)
460 return 1; // a 1-byte UTF-8 char, aka ASCII
461 if ((TOP_BITS(3) & b) == TOP_BITS(2))
462 return 2; // a 2-byte UTF-8 char
463 if ((TOP_BITS(4) & b) == TOP_BITS(3))
464 return 3; // a 3-byte UTF-8 char
465 if ((TOP_BITS(5) & b) == TOP_BITS(4))
466 return 4; // a 4-byte UTF-8 char
467
468 // Invalid: either the top 2 bits are 10, or the top 5 bits are 11111.
469 return 0;
470}
471
472/** Returns true iff <b>b</b> is a UTF-8 continuation byte. */
473static bool
475{
476 uint8_t top2bits = b & TOP_BITS(2);
477 return top2bits == TOP_BITS(1);
478}
479
480/** Returns true iff the <b>len</b> bytes in <b>c</b> are a valid UTF-8
481 * character.
482 */
483static bool
484validate_char(const uint8_t *c, uint8_t len)
485{
486 if (len == 1)
487 return true; // already validated this is an ASCII char earlier.
488
489 uint8_t mask = LOW_BITS(7 - len); // bitmask for the leading byte.
490 uint32_t codepoint = c[0] & mask;
491
492 mask = LOW_BITS(6); // bitmask for continuation bytes.
493 for (uint8_t i = 1; i < len; i++) {
494 if (!is_continuation_byte(c[i]))
495 return false;
496 codepoint <<= 6;
497 codepoint |= (c[i] & mask);
498 }
499
500 if (len == 2 && codepoint <= 0x7f)
501 return false; // Invalid, overly long encoding, should have fit in 1 byte.
502
503 if (len == 3 && codepoint <= 0x7ff)
504 return false; // Invalid, overly long encoding, should have fit in 2 bytes.
505
506 if (len == 4 && codepoint <= 0xffff)
507 return false; // Invalid, overly long encoding, should have fit in 3 bytes.
508
509 if (codepoint >= 0xd800 && codepoint <= 0xdfff)
510 return false; // Invalid, reserved for UTF-16 surrogate pairs.
511
512 return codepoint <= 0x10ffff; // Check if within maximum.
513}
514
515/** Returns true iff the first <b>len</b> bytes in <b>str</b> are a
516 valid UTF-8 string. */
517int
518string_is_utf8(const char *str, size_t len)
519{
520 // If str is NULL, don't try to read it
521 if (!str) {
522 // We could test for this case, but the low-level logs would produce
523 // confusing test output.
524 // LCOV_EXCL_START
525 if (len) {
526 // Use the low-level logging function, so that the log module can
527 // validate UTF-8 (if needed in future code)
529 "BUG: string_is_utf8() called with NULL str but non-zero len.");
530 // Since it's a bug, we should probably reject this string
531 return false;
532 }
533 // LCOV_EXCL_STOP
534 return true;
535 }
536
537 for (size_t i = 0; i < len;) {
538 uint8_t num_bytes = bytes_in_char(str[i]);
539 if (num_bytes == 0) // Invalid leading byte found.
540 return false;
541
542 size_t next_char = i + num_bytes;
543 if (next_char > len)
544 return false;
545
546 // Validate the continuation bytes in this multi-byte character,
547 // and advance to the next character in the string.
548 if (!validate_char((const uint8_t*)&str[i], num_bytes))
549 return false;
550 i = next_char;
551 }
552 return true;
553}
554
555/** As string_is_utf8(), but returns false if the string begins with a UTF-8
556 * byte order mark (BOM).
557 */
558int
559string_is_utf8_no_bom(const char *str, size_t len)
560{
561 if (str && len >= 3 && (!strcmpstart(str, "\uFEFF") ||
562 !strcmpstart(str, "\uFFFE"))) {
563 return false;
564 }
565 return string_is_utf8(str, len);
566}
Locale-independent character-type inspection (header)
int safe_mem_is_zero(const void *mem, size_t sz)
Definition: di_ops.c:224
Headers for di_ops.c.
#define fast_memeq(a, b, c)
Definition: di_ops.h:35
#define fast_memcmp(a, b, c)
Definition: di_ops.h:28
Definitions for common sizes of cryptographic digests.
#define DIGEST_LEN
Definition: digest_sizes.h:20
#define DIGEST256_LEN
Definition: digest_sizes.h:23
void tor_log_err_sigsafe(const char *m,...)
Definition: torerr.c:70
Headers for torerr.c.
int tor_strisspace(const char *s)
Definition: util_string.c:186
const char * find_whitespace_eos(const char *s, const char *eos)
Definition: util_string.c:377
int strcasecmpstart(const char *s1, const char *s2)
Definition: util_string.c:227
void tor_strlower(char *s)
Definition: util_string.c:129
int strcmpstart(const char *s1, const char *s2)
Definition: util_string.c:217
int strcmpend(const char *s1, const char *s2)
Definition: util_string.c:253
static uint8_t bytes_in_char(uint8_t b)
Definition: util_string.c:457
const char * find_whitespace(const char *s)
Definition: util_string.c:355
void tor_strupper(char *s)
Definition: util_string.c:140
const char * eat_whitespace_eos(const char *s, const char *eos)
Definition: util_string.c:306
int fast_memcmpstart(const void *mem, size_t memlen, const char *prefix)
Definition: util_string.c:240
int strcasecmpend(const char *s1, const char *s2)
Definition: util_string.c:266
int tor_strisprint(const char *s)
Definition: util_string.c:160
int strcmp_opt(const char *s1, const char *s2)
Definition: util_string.c:199
int string_is_utf8_no_bom(const char *str, size_t len)
Definition: util_string.c:559
int tor_digest256_is_zero(const char *digest)
Definition: util_string.c:105
int fast_mem_is_zero(const char *mem, size_t len)
Definition: util_string.c:76
static bool validate_char(const uint8_t *c, uint8_t len)
Definition: util_string.c:484
int string_is_C_identifier(const char *string)
Definition: util_string.c:425
void tor_strstrip(char *s, const char *strip)
Definition: util_string.c:113
const char * find_str_at_start_of_line(const char *haystack, const char *needle)
Definition: util_string.c:402
const char * eat_whitespace(const char *s)
Definition: util_string.c:279
const char * eat_whitespace_eos_no_nl(const char *s, const char *eos)
Definition: util_string.c:344
const char * eat_whitespace_no_nl(const char *s)
Definition: util_string.c:334
int tor_strisnonupper(const char *s)
Definition: util_string.c:173
const void * tor_memmem(const void *_haystack, size_t hlen, const void *_needle, size_t nlen)
Definition: util_string.c:29
int string_is_utf8(const char *str, size_t len)
Definition: util_string.c:518
int tor_digest_is_zero(const char *digest)
Definition: util_string.c:98
void tor_strreplacechar(char *s, char find, char replacement)
Definition: util_string.c:150
static bool is_continuation_byte(uint8_t b)
Definition: util_string.c:474
#define LOW_BITS(x)
Definition: util_string.c:451
#define TOP_BITS(x)
Definition: util_string.c:449
Header for util_string.c.