use core::{ fmt::Debug, panic::{RefUnwindSafe, UnwindSafe}, }; use alloc::sync::Arc; use crate::packed::{ext::Pointer, pattern::Patterns, teddy::generic::Match}; /// A builder for constructing a Teddy matcher. /// /// The builder primarily permits fine grained configuration of the Teddy /// matcher. Most options are made only available for testing/benchmarking /// purposes. In reality, options are automatically determined by the nature /// and number of patterns given to the builder. #[derive(Clone, Debug)] pub(crate) struct Builder { /// When none, this is automatically determined. Otherwise, `false` means /// slim Teddy is used (8 buckets) and `true` means fat Teddy is used /// (16 buckets). Fat Teddy requires AVX2, so if that CPU feature isn't /// available and Fat Teddy was requested, no matcher will be built. only_fat: Option, /// When none, this is automatically determined. Otherwise, `false` means /// that 128-bit vectors will be used (up to SSSE3 instructions) where as /// `true` means that 256-bit vectors will be used. As with `fat`, if /// 256-bit vectors are requested and they aren't available, then a /// searcher will not be built. only_256bit: Option, /// When true (the default), the number of patterns will be used as a /// heuristic for refusing construction of a Teddy searcher. The point here /// is that too many patterns can overwhelm Teddy. But this can be disabled /// in cases where the caller knows better. heuristic_pattern_limits: bool, } impl Default for Builder { fn default() -> Builder { Builder::new() } } impl Builder { /// Create a new builder for configuring a Teddy matcher. pub(crate) fn new() -> Builder { Builder { only_fat: None, only_256bit: None, heuristic_pattern_limits: true, } } /// Build a matcher for the set of patterns given. If a matcher could not /// be built, then `None` is returned. /// /// Generally, a matcher isn't built if the necessary CPU features aren't /// available, an unsupported target or if the searcher is believed to be /// slower than standard techniques (i.e., if there are too many literals). pub(crate) fn build(&self, patterns: Arc) -> Option { self.build_imp(patterns) } /// Require the use of Fat (true) or Slim (false) Teddy. Fat Teddy uses /// 16 buckets where as Slim Teddy uses 8 buckets. More buckets are useful /// for a larger set of literals. /// /// `None` is the default, which results in an automatic selection based /// on the number of literals and available CPU features. pub(crate) fn only_fat(&mut self, yes: Option) -> &mut Builder { self.only_fat = yes; self } /// Request the use of 256-bit vectors (true) or 128-bit vectors (false). /// Generally, a larger vector size is better since it either permits /// matching more patterns or matching more bytes in the haystack at once. /// /// `None` is the default, which results in an automatic selection based on /// the number of literals and available CPU features. pub(crate) fn only_256bit(&mut self, yes: Option) -> &mut Builder { self.only_256bit = yes; self } /// Request that heuristic limitations on the number of patterns be /// employed. This useful to disable for benchmarking where one wants to /// explore how Teddy performs on large number of patterns even if the /// heuristics would otherwise refuse construction. /// /// This is enabled by default. pub(crate) fn heuristic_pattern_limits( &mut self, yes: bool, ) -> &mut Builder { self.heuristic_pattern_limits = yes; self } fn build_imp(&self, patterns: Arc) -> Option { let patlimit = self.heuristic_pattern_limits; // There's no particular reason why we limit ourselves to little endian // here, but it seems likely that some parts of Teddy as they are // currently written (e.g., the uses of `trailing_zeros`) are likely // wrong on non-little-endian targets. Such things are likely easy to // fix, but at the time of writing (2023/09/18), I actually do not know // how to test this code on a big-endian target. So for now, we're // conservative and just bail out. if !cfg!(target_endian = "little") { debug!("skipping Teddy because target isn't little endian"); return None; } // Too many patterns will overwhelm Teddy and likely lead to slow // downs, typically in the verification step. if patlimit && patterns.len() > 64 { debug!("skipping Teddy because of too many patterns"); return None; } #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] { use self::x86_64::{FatAVX2, SlimAVX2, SlimSSSE3}; let mask_len = core::cmp::min(4, patterns.minimum_len()); let beefy = patterns.len() > 32; let has_avx2 = self::x86_64::is_available_avx2(); let has_ssse3 = has_avx2 || self::x86_64::is_available_ssse3(); let use_avx2 = if self.only_256bit == Some(true) { if !has_avx2 { debug!( "skipping Teddy because avx2 was demanded but unavailable" ); return None; } true } else if self.only_256bit == Some(false) { if !has_ssse3 { debug!( "skipping Teddy because ssse3 was demanded but unavailable" ); return None; } false } else if !has_ssse3 && !has_avx2 { debug!( "skipping Teddy because ssse3 and avx2 are unavailable" ); return None; } else { has_avx2 }; let fat = match self.only_fat { None => use_avx2 && beefy, Some(false) => false, Some(true) if !use_avx2 => { debug!( "skipping Teddy because fat was demanded, but fat \ Teddy requires avx2 which is unavailable" ); return None; } Some(true) => true, }; // Just like for aarch64, it's possible that too many patterns will // overhwelm Teddy. Unlike aarch64 though, we have Fat teddy which // helps things scale a bit more by spreading patterns over more // buckets. // // These thresholds were determined by looking at the measurements // for the rust/aho-corasick/packed/leftmost-first and // rust/aho-corasick/dfa/leftmost-first engines on the `teddy/` // benchmarks. if patlimit && mask_len == 1 && patterns.len() > 16 { debug!( "skipping Teddy (mask len: 1) because there are \ too many patterns", ); return None; } match (mask_len, use_avx2, fat) { (1, false, _) => { debug!("Teddy choice: 128-bit slim, 1 byte"); SlimSSSE3::<1>::new(&patterns) } (1, true, false) => { debug!("Teddy choice: 256-bit slim, 1 byte"); SlimAVX2::<1>::new(&patterns) } (1, true, true) => { debug!("Teddy choice: 256-bit fat, 1 byte"); FatAVX2::<1>::new(&patterns) } (2, false, _) => { debug!("Teddy choice: 128-bit slim, 2 bytes"); SlimSSSE3::<2>::new(&patterns) } (2, true, false) => { debug!("Teddy choice: 256-bit slim, 2 bytes"); SlimAVX2::<2>::new(&patterns) } (2, true, true) => { debug!("Teddy choice: 256-bit fat, 2 bytes"); FatAVX2::<2>::new(&patterns) } (3, false, _) => { debug!("Teddy choice: 128-bit slim, 3 bytes"); SlimSSSE3::<3>::new(&patterns) } (3, true, false) => { debug!("Teddy choice: 256-bit slim, 3 bytes"); SlimAVX2::<3>::new(&patterns) } (3, true, true) => { debug!("Teddy choice: 256-bit fat, 3 bytes"); FatAVX2::<3>::new(&patterns) } (4, false, _) => { debug!("Teddy choice: 128-bit slim, 4 bytes"); SlimSSSE3::<4>::new(&patterns) } (4, true, false) => { debug!("Teddy choice: 256-bit slim, 4 bytes"); SlimAVX2::<4>::new(&patterns) } (4, true, true) => { debug!("Teddy choice: 256-bit fat, 4 bytes"); FatAVX2::<4>::new(&patterns) } _ => { debug!("no supported Teddy configuration found"); None } } } #[cfg(all( target_arch = "aarch64", target_feature = "neon", target_endian = "little" ))] { use self::aarch64::SlimNeon; let mask_len = core::cmp::min(4, patterns.minimum_len()); if self.only_256bit == Some(true) { debug!( "skipping Teddy because 256-bits were demanded \ but unavailable" ); return None; } if self.only_fat == Some(true) { debug!( "skipping Teddy because fat was demanded but unavailable" ); } // Since we don't have Fat teddy in aarch64 (I think we'd want at // least 256-bit vectors for that), we need to be careful not to // allow too many patterns as it might overwhelm Teddy. Generally // speaking, as the mask length goes up, the more patterns we can // handle because the mask length results in fewer candidates // generated. // // These thresholds were determined by looking at the measurements // for the rust/aho-corasick/packed/leftmost-first and // rust/aho-corasick/dfa/leftmost-first engines on the `teddy/` // benchmarks. match mask_len { 1 => { if patlimit && patterns.len() > 16 { debug!( "skipping Teddy (mask len: 1) because there are \ too many patterns", ); } debug!("Teddy choice: 128-bit slim, 1 byte"); SlimNeon::<1>::new(&patterns) } 2 => { if patlimit && patterns.len() > 32 { debug!( "skipping Teddy (mask len: 2) because there are \ too many patterns", ); } debug!("Teddy choice: 128-bit slim, 2 bytes"); SlimNeon::<2>::new(&patterns) } 3 => { if patlimit && patterns.len() > 48 { debug!( "skipping Teddy (mask len: 3) because there are \ too many patterns", ); } debug!("Teddy choice: 128-bit slim, 3 bytes"); SlimNeon::<3>::new(&patterns) } 4 => { debug!("Teddy choice: 128-bit slim, 4 bytes"); SlimNeon::<4>::new(&patterns) } _ => { debug!("no supported Teddy configuration found"); None } } } #[cfg(not(any( all(target_arch = "x86_64", target_feature = "sse2"), all( target_arch = "aarch64", target_feature = "neon", target_endian = "little" ) )))] { None } } } /// A searcher that dispatches to one of several possible Teddy variants. #[derive(Clone, Debug)] pub(crate) struct Searcher { /// The Teddy variant we use. We use dynamic dispatch under the theory that /// it results in better codegen then a enum, although this is a specious /// claim. /// /// This `Searcher` is essentially a wrapper for a `SearcherT` trait /// object. We just make `memory_usage` and `minimum_len` available without /// going through dynamic dispatch. imp: Arc, /// Total heap memory used by the Teddy variant. memory_usage: usize, /// The minimum haystack length this searcher can handle. It is intended /// for callers to use some other search routine (such as Rabin-Karp) in /// cases where the haystack (or remainer of the haystack) is too short. minimum_len: usize, } impl Searcher { /// Look for the leftmost occurrence of any pattern in this search in the /// given haystack starting at the given position. /// /// # Panics /// /// This panics when `haystack[at..].len()` is less than the minimum length /// for this haystack. #[inline(always)] pub(crate) fn find( &self, haystack: &[u8], at: usize, ) -> Option { // SAFETY: The Teddy implementations all require a minimum haystack // length, and this is required for safety. Therefore, we assert it // here in order to make this method sound. assert!(haystack[at..].len() >= self.minimum_len); let hayptr = haystack.as_ptr(); // SAFETY: Construction of the searcher guarantees that we are able // to run it in the current environment (i.e., we won't get an AVX2 // searcher on a x86-64 CPU without AVX2 support). Also, the pointers // are valid as they are derived directly from a borrowed slice. let teddym = unsafe { self.imp.find(hayptr.add(at), hayptr.add(haystack.len()))? }; let start = teddym.start().as_usize().wrapping_sub(hayptr.as_usize()); let end = teddym.end().as_usize().wrapping_sub(hayptr.as_usize()); let span = crate::Span { start, end }; // OK because we won't permit the construction of a searcher that // could report a pattern ID bigger than what can fit in the crate-wide // PatternID type. let pid = crate::PatternID::new_unchecked(teddym.pattern().as_usize()); let m = crate::Match::new(pid, span); Some(m) } /// Returns the approximate total amount of heap used by this type, in /// units of bytes. #[inline(always)] pub(crate) fn memory_usage(&self) -> usize { self.memory_usage } /// Returns the minimum length, in bytes, that a haystack must be in order /// to use it with this searcher. #[inline(always)] pub(crate) fn minimum_len(&self) -> usize { self.minimum_len } } /// A trait that provides dynamic dispatch over the different possible Teddy /// variants on the same algorithm. /// /// On `x86_64` for example, it isn't known until runtime which of 12 possible /// variants will be used. One might use one of the four slim 128-bit vector /// variants, or one of the four 256-bit vector variants or even one of the /// four fat 256-bit vector variants. /// /// Since this choice is generally made when the Teddy searcher is constructed /// and this choice is based on the patterns given and what the current CPU /// supports, it follows that there must be some kind of indirection at search /// time that "selects" the variant chosen at build time. /// /// There are a few different ways to go about this. One approach is to use an /// enum. It works fine, but in my experiments, this generally results in worse /// codegen. Another approach, which is what we use here, is dynamic dispatch /// via a trait object. We basically implement this trait for each possible /// variant, select the variant we want at build time and convert it to a /// trait object for use at search time. /// /// Another approach is to use function pointers and stick each of the possible /// variants into a union. This is essentially isomorphic to the dynamic /// dispatch approach, but doesn't require any allocations. Since this crate /// requires `alloc`, there's no real reason (AFAIK) to go down this path. (The /// `memchr` crate does this.) trait SearcherT: Debug + Send + Sync + UnwindSafe + RefUnwindSafe + 'static { /// Execute a search on the given haystack (identified by `start` and `end` /// raw pointers). /// /// # Safety /// /// Essentially, the `start` and `end` pointers must be valid and point /// to a haystack one can read. As long as you derive them from, for /// example, a `&[u8]`, they should automatically satisfy all of the safety /// obligations: /// /// * Both `start` and `end` must be valid for reads. /// * Both `start` and `end` must point to an initialized value. /// * Both `start` and `end` must point to the same allocated object and /// must either be in bounds or at most one byte past the end of the /// allocated object. /// * Both `start` and `end` must be _derived from_ a pointer to the same /// object. /// * The distance between `start` and `end` must not overflow `isize`. /// * The distance being in bounds must not rely on "wrapping around" the /// address space. /// * It must be the case that `start <= end`. /// * `end - start` must be greater than the minimum length for this /// searcher. /// /// Also, it is expected that implementations of this trait will tag this /// method with a `target_feature` attribute. Callers must ensure that /// they are executing this method in an environment where that attribute /// is valid. unsafe fn find(&self, start: *const u8, end: *const u8) -> Option; } #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] mod x86_64 { use core::arch::x86_64::{__m128i, __m256i}; use alloc::sync::Arc; use crate::packed::{ ext::Pointer, pattern::Patterns, teddy::generic::{self, Match}, }; use super::{Searcher, SearcherT}; #[derive(Clone, Debug)] pub(super) struct SlimSSSE3 { slim128: generic::Slim<__m128i, BYTES>, } // Defines SlimSSSE3 wrapper functions for 1, 2, 3 and 4 bytes. macro_rules! slim_ssse3 { ($len:expr) => { impl SlimSSSE3<$len> { /// Creates a new searcher using "slim" Teddy with 128-bit /// vectors. If SSSE3 is not available in the current /// environment, then this returns `None`. pub(super) fn new( patterns: &Arc, ) -> Option { if !is_available_ssse3() { return None; } Some(unsafe { SlimSSSE3::<$len>::new_unchecked(patterns) }) } /// Creates a new searcher using "slim" Teddy with 256-bit /// vectors without checking whether SSSE3 is available or not. /// /// # Safety /// /// Callers must ensure that SSSE3 is available in the current /// environment. #[target_feature(enable = "ssse3")] unsafe fn new_unchecked(patterns: &Arc) -> Searcher { let slim128 = generic::Slim::<__m128i, $len>::new( Arc::clone(patterns), ); let memory_usage = slim128.memory_usage(); let minimum_len = slim128.minimum_len(); let imp = Arc::new(SlimSSSE3 { slim128 }); Searcher { imp, memory_usage, minimum_len } } } impl SearcherT for SlimSSSE3<$len> { #[target_feature(enable = "ssse3")] #[inline] unsafe fn find( &self, start: *const u8, end: *const u8, ) -> Option { // SAFETY: All obligations except for `target_feature` are // passed to the caller. Our use of `target_feature` is // safe because construction of this type requires that the // requisite target features are available. self.slim128.find(start, end) } } }; } slim_ssse3!(1); slim_ssse3!(2); slim_ssse3!(3); slim_ssse3!(4); #[derive(Clone, Debug)] pub(super) struct SlimAVX2 { slim128: generic::Slim<__m128i, BYTES>, slim256: generic::Slim<__m256i, BYTES>, } // Defines SlimAVX2 wrapper functions for 1, 2, 3 and 4 bytes. macro_rules! slim_avx2 { ($len:expr) => { impl SlimAVX2<$len> { /// Creates a new searcher using "slim" Teddy with 256-bit /// vectors. If AVX2 is not available in the current /// environment, then this returns `None`. pub(super) fn new( patterns: &Arc, ) -> Option { if !is_available_avx2() { return None; } Some(unsafe { SlimAVX2::<$len>::new_unchecked(patterns) }) } /// Creates a new searcher using "slim" Teddy with 256-bit /// vectors without checking whether AVX2 is available or not. /// /// # Safety /// /// Callers must ensure that AVX2 is available in the current /// environment. #[target_feature(enable = "avx2")] unsafe fn new_unchecked(patterns: &Arc) -> Searcher { let slim128 = generic::Slim::<__m128i, $len>::new( Arc::clone(&patterns), ); let slim256 = generic::Slim::<__m256i, $len>::new( Arc::clone(&patterns), ); let memory_usage = slim128.memory_usage() + slim256.memory_usage(); let minimum_len = slim128.minimum_len(); let imp = Arc::new(SlimAVX2 { slim128, slim256 }); Searcher { imp, memory_usage, minimum_len } } } impl SearcherT for SlimAVX2<$len> { #[target_feature(enable = "avx2")] #[inline] unsafe fn find( &self, start: *const u8, end: *const u8, ) -> Option { // SAFETY: All obligations except for `target_feature` are // passed to the caller. Our use of `target_feature` is // safe because construction of this type requires that the // requisite target features are available. let len = end.distance(start); if len < self.slim256.minimum_len() { self.slim128.find(start, end) } else { self.slim256.find(start, end) } } } }; } slim_avx2!(1); slim_avx2!(2); slim_avx2!(3); slim_avx2!(4); #[derive(Clone, Debug)] pub(super) struct FatAVX2 { fat256: generic::Fat<__m256i, BYTES>, } // Defines SlimAVX2 wrapper functions for 1, 2, 3 and 4 bytes. macro_rules! fat_avx2 { ($len:expr) => { impl FatAVX2<$len> { /// Creates a new searcher using "slim" Teddy with 256-bit /// vectors. If AVX2 is not available in the current /// environment, then this returns `None`. pub(super) fn new( patterns: &Arc, ) -> Option { if !is_available_avx2() { return None; } Some(unsafe { FatAVX2::<$len>::new_unchecked(patterns) }) } /// Creates a new searcher using "slim" Teddy with 256-bit /// vectors without checking whether AVX2 is available or not. /// /// # Safety /// /// Callers must ensure that AVX2 is available in the current /// environment. #[target_feature(enable = "avx2")] unsafe fn new_unchecked(patterns: &Arc) -> Searcher { let fat256 = generic::Fat::<__m256i, $len>::new( Arc::clone(&patterns), ); let memory_usage = fat256.memory_usage(); let minimum_len = fat256.minimum_len(); let imp = Arc::new(FatAVX2 { fat256 }); Searcher { imp, memory_usage, minimum_len } } } impl SearcherT for FatAVX2<$len> { #[target_feature(enable = "avx2")] #[inline] unsafe fn find( &self, start: *const u8, end: *const u8, ) -> Option { // SAFETY: All obligations except for `target_feature` are // passed to the caller. Our use of `target_feature` is // safe because construction of this type requires that the // requisite target features are available. self.fat256.find(start, end) } } }; } fat_avx2!(1); fat_avx2!(2); fat_avx2!(3); fat_avx2!(4); #[inline] pub(super) fn is_available_ssse3() -> bool { #[cfg(not(target_feature = "sse2"))] { false } #[cfg(target_feature = "sse2")] { #[cfg(target_feature = "ssse3")] { true } #[cfg(not(target_feature = "ssse3"))] { #[cfg(feature = "std")] { std::is_x86_feature_detected!("ssse3") } #[cfg(not(feature = "std"))] { false } } } } #[inline] pub(super) fn is_available_avx2() -> bool { #[cfg(not(target_feature = "sse2"))] { false } #[cfg(target_feature = "sse2")] { #[cfg(target_feature = "avx2")] { true } #[cfg(not(target_feature = "avx2"))] { #[cfg(feature = "std")] { std::is_x86_feature_detected!("avx2") } #[cfg(not(feature = "std"))] { false } } } } } #[cfg(all( target_arch = "aarch64", target_feature = "neon", target_endian = "little" ))] mod aarch64 { use core::arch::aarch64::uint8x16_t; use alloc::sync::Arc; use crate::packed::{ pattern::Patterns, teddy::generic::{self, Match}, }; use super::{Searcher, SearcherT}; #[derive(Clone, Debug)] pub(super) struct SlimNeon { slim128: generic::Slim, } // Defines SlimSSSE3 wrapper functions for 1, 2, 3 and 4 bytes. macro_rules! slim_neon { ($len:expr) => { impl SlimNeon<$len> { /// Creates a new searcher using "slim" Teddy with 128-bit /// vectors. If SSSE3 is not available in the current /// environment, then this returns `None`. pub(super) fn new( patterns: &Arc, ) -> Option { Some(unsafe { SlimNeon::<$len>::new_unchecked(patterns) }) } /// Creates a new searcher using "slim" Teddy with 256-bit /// vectors without checking whether SSSE3 is available or not. /// /// # Safety /// /// Callers must ensure that SSSE3 is available in the current /// environment. #[target_feature(enable = "neon")] unsafe fn new_unchecked(patterns: &Arc) -> Searcher { let slim128 = generic::Slim::::new( Arc::clone(patterns), ); let memory_usage = slim128.memory_usage(); let minimum_len = slim128.minimum_len(); let imp = Arc::new(SlimNeon { slim128 }); Searcher { imp, memory_usage, minimum_len } } } impl SearcherT for SlimNeon<$len> { #[target_feature(enable = "neon")] #[inline] unsafe fn find( &self, start: *const u8, end: *const u8, ) -> Option { // SAFETY: All obligations except for `target_feature` are // passed to the caller. Our use of `target_feature` is // safe because construction of this type requires that the // requisite target features are available. self.slim128.find(start, end) } } }; } slim_neon!(1); slim_neon!(2); slim_neon!(3); slim_neon!(4); }