tor_geoip/
lib.rs

1//! A crate for performing GeoIP lookups using the Tor GeoIP database.
2
3// @@ begin lint list maintained by maint/add_warning @@
4#![allow(renamed_and_removed_lints)] // @@REMOVE_WHEN(ci_arti_stable)
5#![allow(unknown_lints)] // @@REMOVE_WHEN(ci_arti_nightly)
6#![warn(missing_docs)]
7#![warn(noop_method_call)]
8#![warn(unreachable_pub)]
9#![warn(clippy::all)]
10#![deny(clippy::await_holding_lock)]
11#![deny(clippy::cargo_common_metadata)]
12#![deny(clippy::cast_lossless)]
13#![deny(clippy::checked_conversions)]
14#![warn(clippy::cognitive_complexity)]
15#![deny(clippy::debug_assert_with_mut_call)]
16#![deny(clippy::exhaustive_enums)]
17#![deny(clippy::exhaustive_structs)]
18#![deny(clippy::expl_impl_clone_on_copy)]
19#![deny(clippy::fallible_impl_from)]
20#![deny(clippy::implicit_clone)]
21#![deny(clippy::large_stack_arrays)]
22#![warn(clippy::manual_ok_or)]
23#![deny(clippy::missing_docs_in_private_items)]
24#![warn(clippy::needless_borrow)]
25#![warn(clippy::needless_pass_by_value)]
26#![warn(clippy::option_option)]
27#![deny(clippy::print_stderr)]
28#![deny(clippy::print_stdout)]
29#![warn(clippy::rc_buffer)]
30#![deny(clippy::ref_option_ref)]
31#![warn(clippy::semicolon_if_nothing_returned)]
32#![warn(clippy::trait_duplication_in_bounds)]
33#![deny(clippy::unchecked_duration_subtraction)]
34#![deny(clippy::unnecessary_wraps)]
35#![warn(clippy::unseparated_literal_suffix)]
36#![deny(clippy::unwrap_used)]
37#![deny(clippy::mod_module_files)]
38#![allow(clippy::let_unit_value)] // This can reasonably be done for explicitness
39#![allow(clippy::uninlined_format_args)]
40#![allow(clippy::significant_drop_in_scrutinee)] // arti/-/merge_requests/588/#note_2812945
41#![allow(clippy::result_large_err)] // temporary workaround for arti#587
42#![allow(clippy::needless_raw_string_hashes)] // complained-about code is fine, often best
43#![allow(clippy::needless_lifetimes)] // See arti#1765
44//! <!-- @@ end lint list maintained by maint/add_warning @@ -->
45
46// TODO #1645 (either remove this, or decide to have it everywhere)
47#![cfg_attr(not(all(feature = "full")), allow(unused))]
48
49pub use crate::err::Error;
50use rangemap::RangeInclusiveMap;
51use std::fmt::{Debug, Display, Formatter};
52use std::net::{IpAddr, Ipv6Addr};
53use std::num::{NonZeroU32, NonZeroU8, TryFromIntError};
54use std::str::FromStr;
55use std::sync::{Arc, OnceLock};
56
57mod err;
58
/// An embedded copy of the latest geoip v4 database at the time of compilation.
///
/// [`GeoipDb::new_embedded`] hands this text to
/// [`GeoipDb::new_from_legacy_format`] as its `db_v4` argument.
///
/// FIXME(eta): This does use a few megabytes of binary size, which is less than ideal.
///             It would be better to parse it at compile time or something.
#[cfg(feature = "embedded-db")]
static EMBEDDED_DB_V4: &str = include_str!("../data/geoip");
65
/// An embedded copy of the latest geoip v6 database at the time of compilation.
///
/// [`GeoipDb::new_embedded`] hands this text to
/// [`GeoipDb::new_from_legacy_format`] as its `db_v6` argument.
#[cfg(feature = "embedded-db")]
static EMBEDDED_DB_V6: &str = include_str!("../data/geoip6");
69
/// A parsed copy of the embedded database.
///
/// Filled in by the first call to [`GeoipDb::new_embedded`]; every call
/// returns a clone of the `Arc` stored here.
#[cfg(feature = "embedded-db")]
static EMBEDDED_DB_PARSED: OnceLock<Arc<GeoipDb>> = OnceLock::new();
73
/// A two-letter country code.
///
/// Specifically, this type represents a purported "ISO 3166-1 alpha-2" country
/// code, such as "IT" for Italy or "UY" for Uruguay.
///
/// Codes are normalized to ASCII uppercase when they are parsed, so `"us"` and
/// `"US"` produce equal values.
///
/// It does not include the sentinel value `??` that we use to represent
/// "country unknown"; if you need that, use [`OptionCc`]. Other than that, we
/// do not check whether the country code represents a real country: we only
/// ensure that it is a pair of printing ASCII characters.
///
/// Note that the geoip databases included with Arti will only include real
/// countries; we do not include the pseudo-countries `A1` through `An` for
/// "anonymous proxies", since doing so would mean putting nearly all Tor relays
/// into one of those countries.
#[derive(Copy, Clone, Eq, PartialEq)]
pub struct CountryCode {
    /// The underlying value (two printable ASCII characters, stored uppercase).
    ///
    /// The special value `??` is excluded, since it is not a country; use
    /// `OptionCc` instead if you need to represent that.
    ///
    /// We store these as `NonZeroU8` so that an `Option<CountryCode>` only has to
    /// take 2 bytes. This helps with alignment and storage.
    inner: [NonZeroU8; 2],
}
99
100impl CountryCode {
101    /// Make a new `CountryCode`.
102    fn new(cc_orig: &str) -> Result<Self, Error> {
103        /// Try to convert an array of 2 bytes into an array of 2 nonzero bytes.
104        #[inline]
105        fn try_cvt_to_nz(inp: [u8; 2]) -> Result<[NonZeroU8; 2], TryFromIntError> {
106            // I have confirmed that the asm here is reasonably efficient.
107            Ok([inp[0].try_into()?, inp[1].try_into()?])
108        }
109
110        let cc = cc_orig.to_ascii_uppercase();
111
112        let cc: [u8; 2] = cc
113            .as_bytes()
114            .try_into()
115            .map_err(|_| Error::BadCountryCode(cc))?;
116
117        if !cc.iter().all(|b| b.is_ascii() && !b.is_ascii_control()) {
118            return Err(Error::BadCountryCode(cc_orig.to_owned()));
119        }
120
121        if &cc == b"??" {
122            return Err(Error::NowhereNotSupported);
123        }
124
125        Ok(Self {
126            inner: try_cvt_to_nz(cc).map_err(|_| Error::BadCountryCode(cc_orig.to_owned()))?,
127        })
128    }
129
130    /// Get the actual country code.
131    ///
132    /// This just calls `.as_ref()`.
133    pub fn get(&self) -> &str {
134        self.as_ref()
135    }
136}
137
138impl Display for CountryCode {
139    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
140        write!(f, "{}", self.as_ref())
141    }
142}
143
144impl Debug for CountryCode {
145    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
146        write!(f, "CountryCode(\"{}\")", self.as_ref())
147    }
148}
149
150impl AsRef<str> for CountryCode {
151    fn as_ref(&self) -> &str {
152        /// Convert a reference to an array of 2 nonzero bytes to a reference to
153        /// an array of 2 bytes.
154        #[inline]
155        fn cvt_ref(inp: &[NonZeroU8; 2]) -> &[u8; 2] {
156            // SAFETY: Every NonZeroU8 has a layout and bit validity that is
157            // also a valid u8.  The layout of arrays is also guaranteed.
158            //
159            // (We don't use try_into here because we need to return a str that
160            // points to a reference to self.)
161            let ptr = inp.as_ptr() as *const u8;
162            let slice = unsafe { std::slice::from_raw_parts(ptr, inp.len()) };
163            slice
164                .try_into()
165                .expect("the resulting slice should have the correct length!")
166        }
167
168        // This shouldn't ever panic, since we shouldn't feed non-utf8 country
169        // codes in.
170        //
171        // In theory we could use from_utf8_unchecked, but that's probably not
172        // needed.
173        std::str::from_utf8(cvt_ref(&self.inner)).expect("invalid country code in CountryCode")
174    }
175}
176
177impl FromStr for CountryCode {
178    type Err = Error;
179
180    fn from_str(s: &str) -> Result<Self, Self::Err> {
181        CountryCode::new(s)
182    }
183}
184
/// Wrapper for an `Option<`[`CountryCode`]`>` that encodes `None` as `??`.
///
/// Parsing the string `??` yields `OptionCc(None)`, and formatting
/// `OptionCc(None)` with `Display` writes `??`; any other value is handled as
/// a [`CountryCode`].
///
/// Used so that we can implement foreign traits.
#[derive(
    Copy, Clone, Debug, Eq, PartialEq, derive_more::Into, derive_more::From, derive_more::AsRef,
)]
#[allow(clippy::exhaustive_structs)]
pub struct OptionCc(pub Option<CountryCode>);
193
194impl FromStr for OptionCc {
195    type Err = Error;
196
197    fn from_str(s: &str) -> Result<Self, Self::Err> {
198        match CountryCode::new(s) {
199            Err(Error::NowhereNotSupported) => Ok(None.into()),
200            Err(e) => Err(e),
201            Ok(cc) => Ok(Some(cc).into()),
202        }
203    }
204}
205
206impl Display for OptionCc {
207    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
208        match self.0 {
209            Some(cc) => write!(f, "{}", cc),
210            None => write!(f, "??"),
211        }
212    }
213}
214
/// A country code / ASN definition.
///
/// One of these is stored for each address range in a [`GeoipDb`].
///
/// Type lifted from `geoip-db-tool` in the C-tor source.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
struct NetDefn {
    /// The country code.
    ///
    /// We translate the value "??" into None.
    cc: Option<CountryCode>,
    /// The ASN, if we have one. We translate the value "0" into None.
    asn: Option<NonZeroU32>,
}
227
228impl NetDefn {
229    /// Make a new `NetDefn`.
230    fn new(cc: &str, asn: Option<u32>) -> Result<Self, Error> {
231        let asn = NonZeroU32::new(asn.unwrap_or(0));
232        let cc = cc.parse::<OptionCc>()?.into();
233
234        Ok(Self { cc, asn })
235    }
236
237    /// Return the country code.
238    fn country_code(&self) -> Option<&CountryCode> {
239        self.cc.as_ref()
240    }
241
242    /// Return the ASN, if there is one.
243    fn asn(&self) -> Option<u32> {
244        self.asn.as_ref().map(|x| x.get())
245    }
246}
247
/// A database of IP addresses to country codes.
///
/// Each entry maps an inclusive range of addresses to a country code and,
/// when the database provides one, an ASN.
#[derive(Clone, Eq, PartialEq, Debug)]
pub struct GeoipDb {
    /// The IPv4 subset of the database, with v4 addresses stored as 32-bit integers.
    map_v4: RangeInclusiveMap<u32, NetDefn>,
    /// The IPv6 subset of the database, with v6 addresses stored as 128-bit integers.
    map_v6: RangeInclusiveMap<u128, NetDefn>,
}
256
257impl GeoipDb {
258    /// Make a new `GeoipDb` using a compiled-in copy of the GeoIP database.
259    ///
260    /// The returned instance of the database is shared with `Arc` across all invocations of this
261    /// function in the same program.
262    #[cfg(feature = "embedded-db")]
263    pub fn new_embedded() -> Arc<Self> {
264        Arc::clone(EMBEDDED_DB_PARSED.get_or_init(|| {
265            Arc::new(
266                // It's reasonable to assume the one we embedded is fine -- we'll test it in CI, etc.
267                Self::new_from_legacy_format(EMBEDDED_DB_V4, EMBEDDED_DB_V6)
268                    .expect("failed to parse embedded geoip database"),
269            )
270        }))
271    }
272
273    /// Make a new `GeoipDb` using provided copies of the v4 and v6 database, in Tor legacy format.
274    pub fn new_from_legacy_format(db_v4: &str, db_v6: &str) -> Result<Self, Error> {
275        let mut ret = GeoipDb {
276            map_v4: Default::default(),
277            map_v6: Default::default(),
278        };
279
280        for line in db_v4.lines() {
281            if line.starts_with('#') {
282                continue;
283            }
284            let line = line.trim();
285            if line.is_empty() {
286                continue;
287            }
288            let mut split = line.split(',');
289            let from = split
290                .next()
291                .ok_or(Error::BadFormat("empty line somehow?"))?
292                .parse::<u32>()?;
293            let to = split
294                .next()
295                .ok_or(Error::BadFormat("line with insufficient commas"))?
296                .parse::<u32>()?;
297            let cc = split
298                .next()
299                .ok_or(Error::BadFormat("line with insufficient commas"))?;
300            let asn = split.next().map(|x| x.parse::<u32>()).transpose()?;
301
302            let defn = NetDefn::new(cc, asn)?;
303
304            ret.map_v4.insert(from..=to, defn);
305        }
306
307        // This is slightly copypasta, but probably less readable to merge into one thing.
308        for line in db_v6.lines() {
309            if line.starts_with('#') {
310                continue;
311            }
312            let line = line.trim();
313            if line.is_empty() {
314                continue;
315            }
316            let mut split = line.split(',');
317            let from = split
318                .next()
319                .ok_or(Error::BadFormat("empty line somehow?"))?
320                .parse::<Ipv6Addr>()?;
321            let to = split
322                .next()
323                .ok_or(Error::BadFormat("line with insufficient commas"))?
324                .parse::<Ipv6Addr>()?;
325            let cc = split
326                .next()
327                .ok_or(Error::BadFormat("line with insufficient commas"))?;
328            let asn = split.next().map(|x| x.parse::<u32>()).transpose()?;
329
330            let defn = NetDefn::new(cc, asn)?;
331
332            ret.map_v6.insert(from.into()..=to.into(), defn);
333        }
334
335        Ok(ret)
336    }
337
338    /// Get the `NetDefn` for an IP address.
339    fn lookup_defn(&self, ip: IpAddr) -> Option<&NetDefn> {
340        match ip {
341            IpAddr::V4(v4) => self.map_v4.get(&v4.into()),
342            IpAddr::V6(v6) => self.map_v6.get(&v6.into()),
343        }
344    }
345
346    /// Get a 2-letter country code for the given IP address, if this data is available.
347    pub fn lookup_country_code(&self, ip: IpAddr) -> Option<&CountryCode> {
348        self.lookup_defn(ip).and_then(|x| x.country_code())
349    }
350
351    /// Determine a 2-letter country code for a host with multiple IP addresses.
352    ///
353    /// This looks up all of the IP addresses with `lookup_country_code`. If the lookups
354    /// return different countries, `None` is returned. IP addresses that fail to resolve
355    /// into a country are ignored if some of the other addresses do resolve successfully.
356    pub fn lookup_country_code_multi<I>(&self, ips: I) -> Option<&CountryCode>
357    where
358        I: IntoIterator<Item = IpAddr>,
359    {
360        let mut ret = None;
361
362        for ip in ips {
363            if let Some(cc) = self.lookup_country_code(ip) {
364                // If we already have a return value and it's different, then return None;
365                // a server can't be in two different countries.
366                if ret.is_some() && ret != Some(cc) {
367                    return None;
368                }
369
370                ret = Some(cc);
371            }
372        }
373
374        ret
375    }
376
377    /// Return the ASN the IP address is in, if this data is available.
378    pub fn lookup_asn(&self, ip: IpAddr) -> Option<u32> {
379        self.lookup_defn(ip)?.asn()
380    }
381}
382
/// A (representation of a) host on the network which may have a known country code.
pub trait HasCountryCode {
    /// Return the country code in which this server is most likely located.
    ///
    /// This is usually implemented by simple GeoIP lookup on the addresses provided by `HasAddrs`.
    /// It follows that the server might not actually be in the returned country, but this is a
    /// halfway decent estimate for what other servers might guess the server's location to be
    /// (and thus useful for e.g. getting around simple geo-blocks, or having webpages return
    /// the correct localised versions).
    ///
    /// Returning `None` signifies that no country code information is available. (Conflicting
    /// GeoIP lookup results might also cause `None` to be returned.)
    ///
    /// The code is returned by value; [`CountryCode`] is `Copy`.
    fn country_code(&self) -> Option<CountryCode>;
}
397
#[cfg(test)]
mod test {
    // @@ begin test lint list maintained by maint/add_warning @@
    #![allow(clippy::bool_assert_comparison)]
    #![allow(clippy::clone_on_copy)]
    #![allow(clippy::dbg_macro)]
    #![allow(clippy::mixed_attributes_style)]
    #![allow(clippy::print_stderr)]
    #![allow(clippy::print_stdout)]
    #![allow(clippy::single_char_pattern)]
    #![allow(clippy::unwrap_used)]
    #![allow(clippy::unchecked_duration_subtraction)]
    #![allow(clippy::useless_vec)]
    #![allow(clippy::needless_pass_by_value)]
    //! <!-- @@ end test lint list maintained by maint/add_warning @@ -->

    use super::*;
    use std::net::Ipv4Addr;

    // NOTE(eta): this test takes a whole 1.6 seconds in *non-release* mode
    #[test]
    #[cfg(feature = "embedded-db")]
    fn embedded_db() {
        let db = GeoipDb::new_embedded();

        assert_eq!(
            db.lookup_country_code(Ipv4Addr::new(8, 8, 8, 8).into())
                .map(|x| x.as_ref()),
            Some("US")
        );

        assert_eq!(
            db.lookup_country_code("2001:4860:4860::8888".parse().unwrap())
                .map(|x| x.as_ref()),
            Some("US")
        );
    }

    #[test]
    fn basic_lookups() {
        let src_v4 = r#"
        16909056,16909311,GB
        "#;
        let src_v6 = r#"
        fe80::,fe81::,US
        dead:beef::,dead:ffff::,??
        "#;
        let db = GeoipDb::new_from_legacy_format(src_v4, src_v6).unwrap();

        assert_eq!(
            db.lookup_country_code(Ipv4Addr::new(1, 2, 3, 4).into())
                .map(|x| x.as_ref()),
            Some("GB")
        );

        assert_eq!(
            db.lookup_country_code(Ipv4Addr::new(1, 1, 1, 1).into()),
            None
        );

        assert_eq!(
            db.lookup_country_code("fe80::dead:beef".parse().unwrap())
                .map(|x| x.as_ref()),
            Some("US")
        );

        assert_eq!(
            db.lookup_country_code("fe81::dead:beef".parse().unwrap()),
            None
        );
        assert_eq!(
            db.lookup_country_code("dead:beef::1".parse().unwrap()),
            None
        );
    }

    /// ASN lookups and multi-address country lookups.
    #[test]
    fn multi_and_asn_lookups() {
        // 1.2.3.0 - 1.2.3.255 is GB with an ASN; 2.0.0.0 - 2.0.0.255 is US without one.
        let src_v4 = "16909056,16909311,GB,64512\n33554432,33554687,US\n";
        let db = GeoipDb::new_from_legacy_format(src_v4, "").unwrap();

        let gb_a: IpAddr = Ipv4Addr::new(1, 2, 3, 4).into();
        let gb_b: IpAddr = Ipv4Addr::new(1, 2, 3, 5).into();
        let us: IpAddr = Ipv4Addr::new(2, 0, 0, 1).into();
        let unknown: IpAddr = Ipv4Addr::new(9, 9, 9, 9).into();

        assert_eq!(db.lookup_asn(gb_a), Some(64512));
        assert_eq!(db.lookup_asn(us), None);
        assert_eq!(db.lookup_asn(unknown), None);

        // Agreeing addresses give that country.
        assert_eq!(
            db.lookup_country_code_multi(vec![gb_a, gb_b])
                .map(|cc| cc.get()),
            Some("GB")
        );
        // Unresolvable addresses are ignored when another one resolves.
        assert_eq!(
            db.lookup_country_code_multi(vec![unknown, gb_a])
                .map(|cc| cc.get()),
            Some("GB")
        );
        // Conflicting countries give nothing.
        assert_eq!(db.lookup_country_code_multi(vec![gb_a, us]), None);
        // So does an empty list.
        assert_eq!(db.lookup_country_code_multi(Vec::<IpAddr>::new()), None);
    }

    #[test]
    fn cc_parse() -> Result<(), Error> {
        // real countries.
        assert_eq!(CountryCode::from_str("us")?, CountryCode::from_str("US")?);
        assert_eq!(CountryCode::from_str("UY")?.get(), "UY");

        // not real as of this writing, but still representable.
        assert_eq!(CountryCode::from_str("A7")?, CountryCode::from_str("a7")?);
        assert_eq!(CountryCode::from_str("xz")?.get(), "XZ");

        // Can't convert to two bytes.
        assert!(matches!(
            CountryCode::from_str("z"),
            Err(Error::BadCountryCode(_))
        ));
        assert!(matches!(
            CountryCode::from_str("🐻‍❄️"),
            Err(Error::BadCountryCode(_))
        ));
        assert!(matches!(
            CountryCode::from_str("Sheboygan"),
            Err(Error::BadCountryCode(_))
        ));

        // Can convert to two bytes, but still not printable ascii
        assert!(matches!(
            CountryCode::from_str("\r\n"),
            Err(Error::BadCountryCode(_))
        ));
        assert!(matches!(
            CountryCode::from_str("\0\0"),
            Err(Error::BadCountryCode(_))
        ));
        assert!(matches!(
            CountryCode::from_str("¡"),
            Err(Error::BadCountryCode(_))
        ));

        // Not a country.
        assert!(matches!(
            CountryCode::from_str("??"),
            Err(Error::NowhereNotSupported)
        ));

        Ok(())
    }

    #[test]
    fn opt_cc_parse() -> Result<(), Error> {
        assert_eq!(
            CountryCode::from_str("br")?,
            OptionCc::from_str("BR")?.0.unwrap()
        );
        assert!(OptionCc::from_str("??")?.0.is_none());

        Ok(())
    }
}