memchr/arch/all/packedpair/mod.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359
/*!
Provides an architecture independent implementation of the "packed pair"
algorithm.
The "packed pair" algorithm is based on the [generic SIMD] algorithm. The main
difference is that it (by default) uses a background distribution of byte
frequencies to heuristically select the pair of bytes to search for. Note that
this module provides an architecture independent version that doesn't do as
good of a job keeping the search for candidates inside a SIMD hot path. It
however can be good enough in many circumstances.
[generic SIMD]: http://0x80.pl/articles/simd-strfind.html#first-and-last
*/
use crate::memchr;
mod default_rank;
/// An architecture independent "packed pair" finder.
///
/// This finder picks two bytes that it believes have high predictive power for
/// indicating an overall match of a needle. At search time, it reports offsets
/// where the needle could match based on whether the pair of bytes it chose
/// match.
///
/// This is architecture independent because it utilizes `memchr` to find the
/// occurrence of one of the bytes in the pair, and then checks whether the
/// second byte matches. If it does, in the case of [`Finder::find_prefilter`],
/// the location at which the needle could match is returned.
///
/// It is generally preferred to use architecture specific routines for a
/// "packed pair" prefilter, but this can be a useful fallback when the
/// architecture independent routines are unavailable.
#[derive(Clone, Copy, Debug)]
pub struct Finder {
pair: Pair,
byte1: u8,
byte2: u8,
}
impl Finder {
/// Create a new prefilter that reports possible locations where the given
/// needle matches.
#[inline]
pub fn new(needle: &[u8]) -> Option<Finder> {
Finder::with_pair(needle, Pair::new(needle)?)
}
/// Create a new prefilter using the pair given.
///
/// If the prefilter could not be constructed, then `None` is returned.
///
/// This constructor permits callers to control precisely which pair of
/// bytes is used as a predicate.
#[inline]
pub fn with_pair(needle: &[u8], pair: Pair) -> Option<Finder> {
let byte1 = needle[usize::from(pair.index1())];
let byte2 = needle[usize::from(pair.index2())];
// Currently this can never fail so we could just return a Finder,
// but it's conceivable this could change.
Some(Finder { pair, byte1, byte2 })
}
/// Run this finder on the given haystack as a prefilter.
///
/// If a candidate match is found, then an offset where the needle *could*
/// begin in the haystack is returned.
#[inline]
pub fn find_prefilter(&self, haystack: &[u8]) -> Option<usize> {
let mut i = 0;
let index1 = usize::from(self.pair.index1());
let index2 = usize::from(self.pair.index2());
loop {
// Use a fast vectorized implementation to skip to the next
// occurrence of the rarest byte (heuristically chosen) in the
// needle.
i += memchr(self.byte1, &haystack[i..])?;
let found = i;
i += 1;
// If we can't align our first byte match with the haystack, then a
// match is impossible.
let aligned1 = match found.checked_sub(index1) {
None => continue,
Some(aligned1) => aligned1,
};
// Now align the second byte match with the haystack. A mismatch
// means that a match is impossible.
let aligned2 = match aligned1.checked_add(index2) {
None => continue,
Some(aligned_index2) => aligned_index2,
};
if haystack.get(aligned2).map_or(true, |&b| b != self.byte2) {
continue;
}
// We've done what we can. There might be a match here.
return Some(aligned1);
}
}
/// Returns the pair of offsets (into the needle) used to check as a
/// predicate before confirming whether a needle exists at a particular
/// position.
#[inline]
pub fn pair(&self) -> &Pair {
&self.pair
}
}
/// A pair of byte offsets into a needle to use as a predicate.
///
/// This pair is used as a predicate to quickly filter out positions in a
/// haystack in which a needle cannot match. In some cases, this pair can even
/// be used in vector algorithms such that the vector algorithm only switches
/// over to scalar code once this pair has been found.
///
/// A pair of offsets can be used in both substring search implementations and
/// in prefilters. The former will report matches of a needle in a haystack
/// where as the latter will only report possible matches of a needle.
///
/// The offsets are limited each to a maximum of 255 to keep memory usage low.
/// Moreover, it's rarely advantageous to create a predicate using offsets
/// greater than 255 anyway.
///
/// The only guarantee enforced on the pair of offsets is that they are not
/// equivalent. It is not necessarily the case that `index1 < index2` for
/// example. By convention, `index1` corresponds to the byte in the needle
/// that is believed to be most the predictive. Note also that because of the
/// requirement that the indices be both valid for the needle used to build
/// the pair and not equal, it follows that a pair can only be constructed for
/// needles with length at least 2.
#[derive(Clone, Copy, Debug)]
pub struct Pair {
index1: u8,
index2: u8,
}
impl Pair {
/// Create a new pair of offsets from the given needle.
///
/// If a pair could not be created (for example, if the needle is too
/// short), then `None` is returned.
///
/// This chooses the pair in the needle that is believed to be as
/// predictive of an overall match of the needle as possible.
#[inline]
pub fn new(needle: &[u8]) -> Option<Pair> {
Pair::with_ranker(needle, DefaultFrequencyRank)
}
/// Create a new pair of offsets from the given needle and ranker.
///
/// This permits the caller to choose a background frequency distribution
/// with which bytes are selected. The idea is to select a pair of bytes
/// that is believed to strongly predict a match in the haystack. This
/// usually means selecting bytes that occur rarely in a haystack.
///
/// If a pair could not be created (for example, if the needle is too
/// short), then `None` is returned.
#[inline]
pub fn with_ranker<R: HeuristicFrequencyRank>(
needle: &[u8],
ranker: R,
) -> Option<Pair> {
if needle.len() <= 1 {
return None;
}
// Find the rarest two bytes. We make them distinct indices by
// construction. (The actual byte value may be the same in degenerate
// cases, but that's OK.)
let (mut rare1, mut index1) = (needle[0], 0);
let (mut rare2, mut index2) = (needle[1], 1);
if ranker.rank(rare2) < ranker.rank(rare1) {
core::mem::swap(&mut rare1, &mut rare2);
core::mem::swap(&mut index1, &mut index2);
}
let max = usize::from(core::u8::MAX);
for (i, &b) in needle.iter().enumerate().take(max).skip(2) {
if ranker.rank(b) < ranker.rank(rare1) {
rare2 = rare1;
index2 = index1;
rare1 = b;
index1 = u8::try_from(i).unwrap();
} else if b != rare1 && ranker.rank(b) < ranker.rank(rare2) {
rare2 = b;
index2 = u8::try_from(i).unwrap();
}
}
// While not strictly required for how a Pair is normally used, we
// really don't want these to be equivalent. If they were, it would
// reduce the effectiveness of candidate searching using these rare
// bytes by increasing the rate of false positives.
assert_ne!(index1, index2);
Some(Pair { index1, index2 })
}
/// Create a new pair using the offsets given for the needle given.
///
/// This bypasses any sort of heuristic process for choosing the offsets
/// and permits the caller to choose the offsets themselves.
///
/// Indices are limited to valid `u8` values so that a `Pair` uses less
/// memory. It is not possible to create a `Pair` with offsets bigger than
/// `u8::MAX`. It's likely that such a thing is not needed, but if it is,
/// it's suggested to build your own bespoke algorithm because you're
/// likely working on a very niche case. (File an issue if this suggestion
/// does not make sense to you.)
///
/// If a pair could not be created (for example, if the needle is too
/// short), then `None` is returned.
#[inline]
pub fn with_indices(
needle: &[u8],
index1: u8,
index2: u8,
) -> Option<Pair> {
// While not strictly required for how a Pair is normally used, we
// really don't want these to be equivalent. If they were, it would
// reduce the effectiveness of candidate searching using these rare
// bytes by increasing the rate of false positives.
if index1 == index2 {
return None;
}
// Similarly, invalid indices means the Pair is invalid too.
if usize::from(index1) >= needle.len() {
return None;
}
if usize::from(index2) >= needle.len() {
return None;
}
Some(Pair { index1, index2 })
}
/// Returns the first offset of the pair.
#[inline]
pub fn index1(&self) -> u8 {
self.index1
}
/// Returns the second offset of the pair.
#[inline]
pub fn index2(&self) -> u8 {
self.index2
}
}
/// This trait allows the user to customize the heuristic used to determine the
/// relative frequency of a given byte in the dataset being searched.
///
/// The use of this trait can have a dramatic impact on performance depending
/// on the type of data being searched. The details of why are explained in the
/// docs of [`crate::memmem::Prefilter`]. To summarize, the core algorithm uses
/// a prefilter to quickly identify candidate matches that are later verified
/// more slowly. This prefilter is implemented in terms of trying to find
/// `rare` bytes at specific offsets that will occur less frequently in the
/// dataset. While the concept of a `rare` byte is similar for most datasets,
/// there are some specific datasets (like binary executables) that have
/// dramatically different byte distributions. For these datasets customizing
/// the byte frequency heuristic can have a massive impact on performance, and
/// might even need to be done at runtime.
///
/// The default implementation of `HeuristicFrequencyRank` reads from the
/// static frequency table defined in `src/memmem/byte_frequencies.rs`. This
/// is optimal for most inputs, so if you are unsure of the impact of using a
/// custom `HeuristicFrequencyRank` you should probably just use the default.
///
/// # Example
///
/// ```
/// use memchr::{
/// arch::all::packedpair::HeuristicFrequencyRank,
/// memmem::FinderBuilder,
/// };
///
/// /// A byte-frequency table that is good for scanning binary executables.
/// struct Binary;
///
/// impl HeuristicFrequencyRank for Binary {
/// fn rank(&self, byte: u8) -> u8 {
/// const TABLE: [u8; 256] = [
/// 255, 128, 61, 43, 50, 41, 27, 28, 57, 15, 21, 13, 24, 17, 17,
/// 89, 58, 16, 11, 7, 14, 23, 7, 6, 24, 9, 6, 5, 9, 4, 7, 16,
/// 68, 11, 9, 6, 88, 7, 4, 4, 23, 9, 4, 8, 8, 5, 10, 4, 30, 11,
/// 9, 24, 11, 5, 5, 5, 19, 11, 6, 17, 9, 9, 6, 8,
/// 48, 58, 11, 14, 53, 40, 9, 9, 254, 35, 3, 6, 52, 23, 6, 6, 27,
/// 4, 7, 11, 14, 13, 10, 11, 11, 5, 2, 10, 16, 12, 6, 19,
/// 19, 20, 5, 14, 16, 31, 19, 7, 14, 20, 4, 4, 19, 8, 18, 20, 24,
/// 1, 25, 19, 58, 29, 10, 5, 15, 20, 2, 2, 9, 4, 3, 5,
/// 51, 11, 4, 53, 23, 39, 6, 4, 13, 81, 4, 186, 5, 67, 3, 2, 15,
/// 0, 0, 1, 3, 2, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0,
/// 12, 2, 1, 1, 3, 1, 1, 1, 6, 1, 2, 1, 3, 1, 1, 2, 9, 1, 1, 0,
/// 2, 2, 4, 4, 11, 6, 7, 3, 6, 9, 4, 5,
/// 46, 18, 8, 18, 17, 3, 8, 20, 16, 10, 3, 7, 175, 4, 6, 7, 13,
/// 3, 7, 3, 3, 1, 3, 3, 10, 3, 1, 5, 2, 0, 1, 2,
/// 16, 3, 5, 1, 6, 1, 1, 2, 58, 20, 3, 14, 12, 2, 1, 3, 16, 3, 5,
/// 8, 3, 1, 8, 6, 17, 6, 5, 3, 8, 6, 13, 175,
/// ];
/// TABLE[byte as usize]
/// }
/// }
/// // Create a new finder with the custom heuristic.
/// let finder = FinderBuilder::new()
/// .build_forward_with_ranker(Binary, b"\x00\x00\xdd\xdd");
/// // Find needle with custom heuristic.
/// assert!(finder.find(b"\x00\x00\x00\xdd\xdd").is_some());
/// ```
pub trait HeuristicFrequencyRank {
/// Return the heuristic frequency rank of the given byte. A lower rank
/// means the byte is believed to occur less frequently in the haystack.
///
/// Some uses of this heuristic may treat arbitrary absolute rank values as
/// significant. For example, an implementation detail in this crate may
/// determine that heuristic prefilters are inappropriate if every byte in
/// the needle has a "high" rank.
fn rank(&self, byte: u8) -> u8;
}
/// The default byte frequency heuristic that is good for most haystacks.
pub(crate) struct DefaultFrequencyRank;
impl HeuristicFrequencyRank for DefaultFrequencyRank {
fn rank(&self, byte: u8) -> u8 {
self::default_rank::RANK[usize::from(byte)]
}
}
/// This permits passing any implementation of `HeuristicFrequencyRank` as a
/// borrowed version of itself.
impl<'a, R> HeuristicFrequencyRank for &'a R
where
R: HeuristicFrequencyRank,
{
fn rank(&self, byte: u8) -> u8 {
(**self).rank(byte)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn forward_packedpair() {
fn find(
haystack: &[u8],
needle: &[u8],
_index1: u8,
_index2: u8,
) -> Option<Option<usize>> {
// We ignore the index positions requested since it winds up making
// this test too slow overall.
let f = Finder::new(needle)?;
Some(f.find_prefilter(haystack))
}
crate::tests::packedpair::Runner::new().fwd(find).run()
}
}