1
//! A crate for performing GeoIP lookups using the Tor GeoIP database.
2

            
3
// @@ begin lint list maintained by maint/add_warning @@
4
#![allow(renamed_and_removed_lints)] // @@REMOVE_WHEN(ci_arti_stable)
5
#![allow(unknown_lints)] // @@REMOVE_WHEN(ci_arti_nightly)
6
#![warn(missing_docs)]
7
#![warn(noop_method_call)]
8
#![warn(unreachable_pub)]
9
#![warn(clippy::all)]
10
#![deny(clippy::await_holding_lock)]
11
#![deny(clippy::cargo_common_metadata)]
12
#![deny(clippy::cast_lossless)]
13
#![deny(clippy::checked_conversions)]
14
#![warn(clippy::cognitive_complexity)]
15
#![deny(clippy::debug_assert_with_mut_call)]
16
#![deny(clippy::exhaustive_enums)]
17
#![deny(clippy::exhaustive_structs)]
18
#![deny(clippy::expl_impl_clone_on_copy)]
19
#![deny(clippy::fallible_impl_from)]
20
#![deny(clippy::implicit_clone)]
21
#![deny(clippy::large_stack_arrays)]
22
#![warn(clippy::manual_ok_or)]
23
#![deny(clippy::missing_docs_in_private_items)]
24
#![warn(clippy::needless_borrow)]
25
#![warn(clippy::needless_pass_by_value)]
26
#![warn(clippy::option_option)]
27
#![deny(clippy::print_stderr)]
28
#![deny(clippy::print_stdout)]
29
#![warn(clippy::rc_buffer)]
30
#![deny(clippy::ref_option_ref)]
31
#![warn(clippy::semicolon_if_nothing_returned)]
32
#![warn(clippy::trait_duplication_in_bounds)]
33
#![deny(clippy::unchecked_duration_subtraction)]
34
#![deny(clippy::unnecessary_wraps)]
35
#![warn(clippy::unseparated_literal_suffix)]
36
#![deny(clippy::unwrap_used)]
37
#![deny(clippy::mod_module_files)]
38
#![allow(clippy::let_unit_value)] // This can reasonably be done for explicitness
39
#![allow(clippy::uninlined_format_args)]
40
#![allow(clippy::significant_drop_in_scrutinee)] // arti/-/merge_requests/588/#note_2812945
41
#![allow(clippy::result_large_err)] // temporary workaround for arti#587
42
#![allow(clippy::needless_raw_string_hashes)] // complained-about code is fine, often best
43
#![allow(clippy::needless_lifetimes)] // See arti#1765
44
//! <!-- @@ end lint list maintained by maint/add_warning @@ -->
45

            
46
// TODO #1645 (either remove this, or decide to have it everywhere)
47
#![cfg_attr(not(all(feature = "full")), allow(unused))]
48

            
49
pub use crate::err::Error;
50
use rangemap::RangeInclusiveMap;
51
use std::fmt::{Debug, Display, Formatter};
52
use std::net::{IpAddr, Ipv6Addr};
53
use std::num::{NonZeroU32, NonZeroU8, TryFromIntError};
54
use std::str::FromStr;
55
use std::sync::{Arc, OnceLock};
56

            
57
mod err;
58

            
59
/// Text of the latest geoip v4 database, embedded into the binary at compile time.
///
/// FIXME(eta): This does use a few megabytes of binary size, which is less than ideal.
///             It would be better to parse it at compile time or something.
#[cfg(feature = "embedded-db")]
static EMBEDDED_DB_V4: &str = include_str!("../data/geoip");

/// Text of the latest geoip v6 database, embedded into the binary at compile time.
#[cfg(feature = "embedded-db")]
static EMBEDDED_DB_V6: &str = include_str!("../data/geoip6");

/// The embedded database in parsed form.
///
/// Starts out empty; it holds one shared `Arc<GeoipDb>` once it has been set.
#[cfg(feature = "embedded-db")]
static EMBEDDED_DB_PARSED: OnceLock<Arc<GeoipDb>> = OnceLock::new();
73

            
74
/// A country code made of exactly two characters.
///
/// Values of this type stand for a purported "ISO 3166-1 alpha-2" country
/// code: for example "IT" (Italy) or "UY" (Uruguay).
///
/// The sentinel `??`, which we use to mean "country unknown", is never a
/// valid `CountryCode`; use [`OptionCc`] when you need to represent it. Apart
/// from that exclusion, no attempt is made to verify that the code names a
/// real country: the only guarantee is that it is a pair of printing ASCII
/// characters.
///
/// The geoip databases shipped with Arti contain real countries only. The
/// pseudo-countries `A1` through `An` ("anonymous proxies") are left out,
/// because including them would place nearly all Tor relays into one of those
/// countries.
#[derive(Copy, Clone, Eq, PartialEq)]
pub struct CountryCode {
    /// The two printable ASCII characters of the code, stored uppercase.
    ///
    /// `??` never appears here, since it is not a country; `OptionCc` is the
    /// type to use if that value must be representable.
    ///
    /// The bytes are kept as `NonZeroU8` so that an `Option<CountryCode>`
    /// still fits in 2 bytes, which helps with alignment and storage.
    inner: [NonZeroU8; 2],
}
99

            
100
impl CountryCode {
101
    /// Make a new `CountryCode`.
102
14070228
    fn new(cc_orig: &str) -> Result<Self, Error> {
103
        /// Try to convert an array of 2 bytes into an array of 2 nonzero bytes.
104
        #[inline]
105
14041266
        fn try_cvt_to_nz(inp: [u8; 2]) -> Result<[NonZeroU8; 2], TryFromIntError> {
106
14041266
            // I have confirmed that the asm here is reasonably efficient.
107
14041266
            Ok([inp[0].try_into()?, inp[1].try_into()?])
108
14041266
        }
109

            
110
14070228
        let cc = cc_orig.to_ascii_uppercase();
111

            
112
14070228
        let cc: [u8; 2] = cc
113
14070228
            .as_bytes()
114
14070228
            .try_into()
115
14070231
            .map_err(|_| Error::BadCountryCode(cc))?;
116

            
117
28726711
        if !cc.iter().all(|b| b.is_ascii() && !b.is_ascii_control()) {
118
6
            return Err(Error::BadCountryCode(cc_orig.to_owned()));
119
14070216
        }
120
14070216

            
121
14070216
        if &cc == b"??" {
122
28950
            return Err(Error::NowhereNotSupported);
123
14041266
        }
124
14041266

            
125
14041266
        Ok(Self {
126
14041266
            inner: try_cvt_to_nz(cc).map_err(|_| Error::BadCountryCode(cc_orig.to_owned()))?,
127
        })
128
14070228
    }
129

            
130
    /// Get the actual country code.
131
    ///
132
    /// This just calls `.as_ref()`.
133
    pub fn get(&self) -> &str {
134
        self.as_ref()
135
    }
136
}
137

            
138
impl Display for CountryCode {
139
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
140
        write!(f, "{}", self.as_ref())
141
    }
142
}
143

            
144
impl Debug for CountryCode {
145
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
146
        write!(f, "CountryCode(\"{}\")", self.as_ref())
147
    }
148
}
149

            
150
impl AsRef<str> for CountryCode {
151
100
    fn as_ref(&self) -> &str {
152
        /// Convert a reference to an array of 2 nonzero bytes to a reference to
153
        /// an array of 2 bytes.
154
        #[inline]
155
100
        fn cvt_ref(inp: &[NonZeroU8; 2]) -> &[u8; 2] {
156
100
            // SAFETY: Every NonZeroU8 has a layout and bit validity that is
157
100
            // also a valid u8.  The layout of arrays is also guaranteed.
158
100
            //
159
100
            // (We don't use try_into here because we need to return a str that
160
100
            // points to a reference to self.)
161
100
            let ptr = inp.as_ptr() as *const u8;
162
100
            let slice = unsafe { std::slice::from_raw_parts(ptr, inp.len()) };
163
100
            slice
164
100
                .try_into()
165
100
                .expect("the resulting slice should have the correct length!")
166
100
        }
167

            
168
        // This shouldn't ever panic, since we shouldn't feed non-utf8 country
169
        // codes in.
170
        //
171
        // In theory we could use from_utf8_unchecked, but that's probably not
172
        // needed.
173
100
        std::str::from_utf8(cvt_ref(&self.inner)).expect("invalid country code in CountryCode")
174
100
    }
175
}
176

            
177
impl FromStr for CountryCode {
178
    type Err = Error;
179

            
180
32
    fn from_str(s: &str) -> Result<Self, Self::Err> {
181
32
        CountryCode::new(s)
182
32
    }
183
}
184

            
185
/// Wrapper for an `Option<`[`CountryCode`]`>` that encodes `None` as `??`.
186
///
187
/// Used so that we can implement foreign traits.
188
#[derive(
189
    Copy, Clone, Debug, Eq, PartialEq, derive_more::Into, derive_more::From, derive_more::AsRef,
190
)]
191
#[allow(clippy::exhaustive_structs)]
192
pub struct OptionCc(pub Option<CountryCode>);
193

            
194
impl FromStr for OptionCc {
195
    type Err = Error;
196

            
197
14070196
    fn from_str(s: &str) -> Result<Self, Self::Err> {
198
14070196
        match CountryCode::new(s) {
199
28948
            Err(Error::NowhereNotSupported) => Ok(None.into()),
200
            Err(e) => Err(e),
201
14041248
            Ok(cc) => Ok(Some(cc).into()),
202
        }
203
14070196
    }
204
}
205

            
206
impl Display for OptionCc {
207
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
208
        match self.0 {
209
            Some(cc) => write!(f, "{}", cc),
210
            None => write!(f, "??"),
211
        }
212
    }
213
}
214

            
215
/// A country code / ASN definition.
216
///
217
/// Type lifted from `geoip-db-tool` in the C-tor source.
218
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
219
struct NetDefn {
220
    /// The country code.
221
    ///
222
    /// We translate the value "??" into None.
223
    cc: Option<CountryCode>,
224
    /// The ASN, if we have one. We translate the value "0" into None.
225
    asn: Option<NonZeroU32>,
226
}
227

            
228
impl NetDefn {
229
    /// Make a new `NetDefn`.
230
14070192
    fn new(cc: &str, asn: Option<u32>) -> Result<Self, Error> {
231
14070192
        let asn = NonZeroU32::new(asn.unwrap_or(0));
232
14070192
        let cc = cc.parse::<OptionCc>()?.into();
233
14070192

            
234
14070192
        Ok(Self { cc, asn })
235
14070192
    }
236

            
237
    /// Return the country code.
238
240
    fn country_code(&self) -> Option<&CountryCode> {
239
240
        self.cc.as_ref()
240
240
    }
241

            
242
    /// Return the ASN, if there is one.
243
    fn asn(&self) -> Option<u32> {
244
        self.asn.as_ref().map(|x| x.get())
245
    }
246
}
247

            
248
/// A database of IP addresses to country codes.
249
#[derive(Clone, Eq, PartialEq, Debug)]
250
pub struct GeoipDb {
251
    /// The IPv4 subset of the database, with v4 addresses stored as 32-bit integers.
252
    map_v4: RangeInclusiveMap<u32, NetDefn>,
253
    /// The IPv6 subset of the database, with v6 addresses stored as 128-bit integers.
254
    map_v6: RangeInclusiveMap<u128, NetDefn>,
255
}
256

            
257
impl GeoipDb {
258
    /// Make a new `GeoipDb` using a compiled-in copy of the GeoIP database.
259
    ///
260
    /// The returned instance of the database is shared with `Arc` across all invocations of this
261
    /// function in the same program.
262
    #[cfg(feature = "embedded-db")]
263
140
    pub fn new_embedded() -> Arc<Self> {
264
142
        Arc::clone(EMBEDDED_DB_PARSED.get_or_init(|| {
265
48
            Arc::new(
266
48
                // It's reasonable to assume the one we embedded is fine -- we'll test it in CI, etc.
267
48
                Self::new_from_legacy_format(EMBEDDED_DB_V4, EMBEDDED_DB_V6)
268
48
                    .expect("failed to parse embedded geoip database"),
269
48
            )
270
142
        }))
271
140
    }
272

            
273
    /// Make a new `GeoipDb` using provided copies of the v4 and v6 database, in Tor legacy format.
274
96
    pub fn new_from_legacy_format(db_v4: &str, db_v6: &str) -> Result<Self, Error> {
275
96
        let mut ret = GeoipDb {
276
96
            map_v4: Default::default(),
277
96
            map_v6: Default::default(),
278
96
        };
279

            
280
8079654
        for line in db_v4.lines() {
281
8079654
            if line.starts_with('#') {
282
816
                continue;
283
8078838
            }
284
8078838
            let line = line.trim();
285
8078838
            if line.is_empty() {
286
4
                continue;
287
8078834
            }
288
8078834
            let mut split = line.split(',');
289
8078834
            let from = split
290
8078834
                .next()
291
8078834
                .ok_or(Error::BadFormat("empty line somehow?"))?
292
8078834
                .parse::<u32>()?;
293
8078834
            let to = split
294
8078834
                .next()
295
8078834
                .ok_or(Error::BadFormat("line with insufficient commas"))?
296
8078834
                .parse::<u32>()?;
297
8078834
            let cc = split
298
8078834
                .next()
299
8078834
                .ok_or(Error::BadFormat("line with insufficient commas"))?;
300
8078834
            let asn = split.next().map(|x| x.parse::<u32>()).transpose()?;
301

            
302
8078834
            let defn = NetDefn::new(cc, asn)?;
303

            
304
8078834
            ret.map_v4.insert(from..=to, defn);
305
        }
306

            
307
        // This is slightly copypasta, but probably less readable to merge into one thing.
308
5992270
        for line in db_v6.lines() {
309
5992270
            if line.starts_with('#') {
310
816
                continue;
311
5991454
            }
312
5991454
            let line = line.trim();
313
5991454
            if line.is_empty() {
314
96
                continue;
315
5991358
            }
316
5991358
            let mut split = line.split(',');
317
5991358
            let from = split
318
5991358
                .next()
319
5991358
                .ok_or(Error::BadFormat("empty line somehow?"))?
320
5991358
                .parse::<Ipv6Addr>()?;
321
5991358
            let to = split
322
5991358
                .next()
323
5991358
                .ok_or(Error::BadFormat("line with insufficient commas"))?
324
5991358
                .parse::<Ipv6Addr>()?;
325
5991358
            let cc = split
326
5991358
                .next()
327
5991358
                .ok_or(Error::BadFormat("line with insufficient commas"))?;
328
5991358
            let asn = split.next().map(|x| x.parse::<u32>()).transpose()?;
329

            
330
5991358
            let defn = NetDefn::new(cc, asn)?;
331

            
332
5991358
            ret.map_v6.insert(from.into()..=to.into(), defn);
333
        }
334

            
335
96
        Ok(ret)
336
96
    }
337

            
338
    /// Get the `NetDefn` for an IP address.
339
3004
    fn lookup_defn(&self, ip: IpAddr) -> Option<&NetDefn> {
340
3004
        match ip {
341
2490
            IpAddr::V4(v4) => self.map_v4.get(&v4.into()),
342
514
            IpAddr::V6(v6) => self.map_v6.get(&v6.into()),
343
        }
344
3004
    }
345

            
346
    /// Get a 2-letter country code for the given IP address, if this data is available.
347
3004
    pub fn lookup_country_code(&self, ip: IpAddr) -> Option<&CountryCode> {
348
3014
        self.lookup_defn(ip).and_then(|x| x.country_code())
349
3004
    }
350

            
351
    /// Determine a 2-letter country code for a host with multiple IP addresses.
352
    ///
353
    /// This looks up all of the IP addresses with `lookup_country_code`. If the lookups
354
    /// return different countries, `None` is returned. IP addresses that fail to resolve
355
    /// into a country are ignored if some of the other addresses do resolve successfully.
356
710
    pub fn lookup_country_code_multi<I>(&self, ips: I) -> Option<&CountryCode>
357
710
    where
358
710
        I: IntoIterator<Item = IpAddr>,
359
710
    {
360
710
        let mut ret = None;
361

            
362
1698
        for ip in ips {
363
990
            if let Some(cc) = self.lookup_country_code(ip) {
364
                // If we already have a return value and it's different, then return None;
365
                // a server can't be in two different countries.
366
10
                if ret.is_some() && ret != Some(cc) {
367
2
                    return None;
368
8
                }
369
8

            
370
8
                ret = Some(cc);
371
980
            }
372
        }
373

            
374
708
        ret
375
710
    }
376

            
377
    /// Return the ASN the IP address is in, if this data is available.
378
    pub fn lookup_asn(&self, ip: IpAddr) -> Option<u32> {
379
        self.lookup_defn(ip)?.asn()
380
    }
381
}
382

            
383
/// A (representation of a) host on the network which may have a known country code.
384
pub trait HasCountryCode {
385
    /// Return the country code in which this server is most likely located.
386
    ///
387
    /// This is usually implemented by simple GeoIP lookup on the addresses provided by `HasAddrs`.
388
    /// It follows that the server might not actually be in the returned country, but this is a
389
    /// halfway decent estimate for what other servers might guess the server's location to be
390
    /// (and thus useful for e.g. getting around simple geo-blocks, or having webpages return
391
    /// the correct localised versions).
392
    ///
393
    /// Returning `None` signifies that no country code information is available. (Conflicting
394
    /// GeoIP lookup results might also cause `None` to be returned.)
395
    fn country_code(&self) -> Option<CountryCode>;
396
}
397

            
398
#[cfg(test)]
mod test {
    // @@ begin test lint list maintained by maint/add_warning @@
    #![allow(clippy::bool_assert_comparison)]
    #![allow(clippy::clone_on_copy)]
    #![allow(clippy::dbg_macro)]
    #![allow(clippy::mixed_attributes_style)]
    #![allow(clippy::print_stderr)]
    #![allow(clippy::print_stdout)]
    #![allow(clippy::single_char_pattern)]
    #![allow(clippy::unwrap_used)]
    #![allow(clippy::unchecked_duration_subtraction)]
    #![allow(clippy::useless_vec)]
    #![allow(clippy::needless_pass_by_value)]
    //! <!-- @@ end test lint list maintained by maint/add_warning @@ -->

    use super::*;
    use std::net::Ipv4Addr;

    // NOTE(eta): this test takes a whole 1.6 seconds in *non-release* mode
    #[test]
    #[cfg(feature = "embedded-db")]
    fn embedded_db() {
        let db = GeoipDb::new_embedded();

        assert_eq!(
            db.lookup_country_code(Ipv4Addr::new(8, 8, 8, 8).into())
                .map(|x| x.as_ref()),
            Some("US")
        );

        assert_eq!(
            db.lookup_country_code("2001:4860:4860::8888".parse().unwrap())
                .map(|x| x.as_ref()),
            Some("US")
        );
    }

    #[test]
    fn basic_lookups() {
        let src_v4 = r#"
        16909056,16909311,GB
        "#;
        let src_v6 = r#"
        fe80::,fe81::,US
        dead:beef::,dead:ffff::,??
        "#;
        let db = GeoipDb::new_from_legacy_format(src_v4, src_v6).unwrap();

        // Inside the single v4 range.
        assert_eq!(
            db.lookup_country_code(Ipv4Addr::new(1, 2, 3, 4).into())
                .map(|x| x.as_ref()),
            Some("GB")
        );

        // Outside every v4 range.
        assert_eq!(
            db.lookup_country_code(Ipv4Addr::new(1, 1, 1, 1).into()),
            None
        );

        // Inside the first v6 range.
        assert_eq!(
            db.lookup_country_code("fe80::dead:beef".parse().unwrap())
                .map(|x| x.as_ref()),
            Some("US")
        );

        // Just past the end of the first v6 range.
        assert_eq!(
            db.lookup_country_code("fe81::dead:beef".parse().unwrap()),
            None
        );
        // Inside a range whose country is "??", which means "no country".
        assert_eq!(
            db.lookup_country_code("dead:beef::1".parse().unwrap()),
            None
        );
    }

    #[test]
    fn cc_parse() -> Result<(), Error> {
        // real countries.
        assert_eq!(CountryCode::from_str("us")?, CountryCode::from_str("US")?);
        // Check the stored text itself: comparing a parse against an
        // identical parse could never fail.
        assert_eq!(CountryCode::from_str("UY")?.as_ref(), "UY");

        // not real as of this writing, but still representable.
        assert_eq!(CountryCode::from_str("A7")?, CountryCode::from_str("a7")?);
        assert_eq!(CountryCode::from_str("xz")?.as_ref(), "XZ");

        // Can't convert to two bytes.
        assert!(matches!(
            CountryCode::from_str("z"),
            Err(Error::BadCountryCode(_))
        ));
        assert!(matches!(
            CountryCode::from_str("🐻‍❄️"),
            Err(Error::BadCountryCode(_))
        ));
        assert!(matches!(
            CountryCode::from_str("Sheboygan"),
            Err(Error::BadCountryCode(_))
        ));

        // Can convert to two bytes, but still not printable ascii
        assert!(matches!(
            CountryCode::from_str("\r\n"),
            Err(Error::BadCountryCode(_))
        ));
        assert!(matches!(
            CountryCode::from_str("\0\0"),
            Err(Error::BadCountryCode(_))
        ));
        assert!(matches!(
            CountryCode::from_str("¡"),
            Err(Error::BadCountryCode(_))
        ));

        // Not a country.
        assert!(matches!(
            CountryCode::from_str("??"),
            Err(Error::NowhereNotSupported)
        ));

        Ok(())
    }

    #[test]
    fn opt_cc_parse() -> Result<(), Error> {
        assert_eq!(
            CountryCode::from_str("br")?,
            OptionCc::from_str("BR")?.0.unwrap()
        );
        assert!(OptionCc::from_str("??")?.0.is_none());

        Ok(())
    }
}