memchr/arch/all/
twoway.rs

Help
1/*!
2An implementation of the [Two-Way substring search algorithm][two-way].
3
4[`Finder`] can be built for forward searches, while [`FinderRev`] can be built
5for reverse searches.
6
7Two-Way makes for a nice general purpose substring search algorithm because of
8its time and space complexity properties. It also performs well in practice.
9Namely, with `m = len(needle)` and `n = len(haystack)`, Two-Way takes `O(m)`
10time to create a finder, `O(1)` space and `O(n)` search time. In other words,
11the preprocessing step is quick, doesn't require any heap memory and the worst
12case search time is guaranteed to be linear in the haystack regardless of the
13size of the needle.
14
While vector algorithms will usually beat Two-Way handily, vector algorithms
also usually have pathological or edge cases that are better handled by Two-Way.
Moreover, not all targets support vector algorithms or implementations for them
simply may not exist yet.
19
20Two-Way can be found in the `memmem` implementations in at least [GNU libc] and
21[musl].
22
23[two-way]: https://en.wikipedia.org/wiki/Two-way_string-matching_algorithm
24[GNU libc]: https://www.gnu.org/software/libc/
25[musl]: https://www.musl-libc.org/
26*/
27
28use core::cmp;
29
30use crate::{
31    arch::all::{is_prefix, is_suffix},
32    memmem::Pre,
33};
34
/// A forward substring searcher that uses the Two-Way algorithm.
///
/// A `Finder` holds only the state computed from the needle by
/// [`Finder::new`]; it does not store the needle itself. The same needle must
/// therefore be passed again to [`Finder::find`] on every search.
#[derive(Clone, Copy, Debug)]
pub struct Finder(TwoWay);
38
/// A reverse substring searcher that uses the Two-Way algorithm.
///
/// A `FinderRev` holds only the state computed from the needle by
/// [`FinderRev::new`]; it does not store the needle itself. The same needle
/// must therefore be passed again to [`FinderRev::rfind`] on every search.
#[derive(Clone, Copy, Debug)]
pub struct FinderRev(TwoWay);
42
/// An implementation of the TwoWay substring search algorithm.
///
/// This searcher supports forward and reverse search, although not
/// simultaneously. It runs in `O(n + m)` time and `O(1)` space, where
/// `m ~ len(needle)` and `n ~ len(haystack)`.
///
/// The implementation here roughly matches that which was developed by
/// Crochemore and Perrin in their 1991 paper "Two-way string-matching." The
/// changes in this implementation are 1) the use of zero-based indices, 2) a
/// heuristic skip table based on the last byte (borrowed from Rust's standard
/// library) and 3) the addition of heuristics for a fast skip loop. For (3),
/// callers can pass any kind of prefilter they want, but usually it's one
/// based on a heuristic that uses an approximate background frequency of bytes
/// to choose rare bytes to quickly look for candidate match positions. Note
/// though that currently, this prefilter functionality is not exposed directly
/// in the public API. (File an issue if you want it and provide a use case
/// please.)
///
/// The heuristic for fast skipping is automatically shut off if it's
/// detected to be ineffective at search time. Generally, this only occurs in
/// pathological cases. But this is generally necessary in order to preserve
/// an `O(n + m)` time bound.
///
/// The code below is fairly complex and not obviously correct at all. It's
/// likely necessary to read the Two-Way paper cited above in order to fully
/// grok this code. The essence of it is:
///
/// 1. Do something to detect a "critical" position in the needle.
/// 2. For the current position in the haystack, look if `needle[critical..]`
/// matches at that position.
/// 3. If so, look if `needle[..critical]` matches.
/// 4. If a mismatch occurs, shift the search by some amount based on the
/// critical position and a pre-computed shift.
///
/// This type is wrapped in the forward and reverse finders that expose
/// consistent forward or reverse APIs.
#[derive(Clone, Copy, Debug)]
struct TwoWay {
    /// A small bitset used as a quick prefilter (in addition to any prefilter
    /// given by the caller). Namely, bit `i` is set if and only if
    /// `b % 64 == i` for some byte `b` in the needle.
    ///
    /// When used as a prefilter, if the last byte at the current candidate
    /// position is NOT in this set, then we can skip that entire candidate
    /// position (the length of the needle). This is essentially the shift
    /// trick found in Boyer-Moore, but only applied to bytes that don't appear
    /// in the needle.
    ///
    /// N.B. This trick was inspired by something similar in std's
    /// implementation of Two-Way.
    byteset: ApproximateByteSet,
    /// A critical position in needle. Specifically, this position corresponds
    /// to the beginning of either the minimal or maximal suffix in needle.
    /// (N.B. See `SuffixKind` below for why "minimal" isn't quite the correct
    /// word here.)
    ///
    /// This is the position at which every search begins. Namely, search
    /// starts by scanning text to the right of this position, and only if
    /// there's a match does the text to the left of this position get scanned.
    critical_pos: usize,
    /// The amount we shift by in the Two-Way search algorithm. This
    /// corresponds to the "small period" and "large period" cases.
    shift: Shift,
}
107
impl Finder {
    /// Create a searcher that finds occurrences of the given `needle`.
    ///
    /// Preprocessing builds the needle's approximate byte set, picks a
    /// critical position from the needle's minimal and maximal suffixes and
    /// derives the shift used by the search loops.
    ///
    /// An empty `needle` results in a match at every position in a haystack,
    /// including at `haystack.len()`.
    #[inline]
    pub fn new(needle: &[u8]) -> Finder {
        let byteset = ApproximateByteSet::new(needle);
        let min_suffix = Suffix::forward(needle, SuffixKind::Minimal);
        let max_suffix = Suffix::forward(needle, SuffixKind::Maximal);
        // Keep whichever suffix starts further to the right. Its start is the
        // critical position and its period is the lower bound on the needle's
        // period that `Shift::forward` expects.
        let (period_lower_bound, critical_pos) =
            if min_suffix.pos > max_suffix.pos {
                (min_suffix.period, min_suffix.pos)
            } else {
                (max_suffix.period, max_suffix.pos)
            };
        let shift = Shift::forward(needle, period_lower_bound, critical_pos);
        Finder(TwoWay { byteset, critical_pos, shift })
    }

    /// Returns the first occurrence of `needle` in the given `haystack`, or
    /// `None` if no such occurrence could be found.
    ///
    /// The `needle` given must be the same as the `needle` provided to
    /// [`Finder::new`].
    ///
    /// An empty `needle` results in a match at every position in a haystack,
    /// including at `haystack.len()`.
    #[inline]
    pub fn find(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
        // No prefilter: run the plain Two-Way search.
        self.find_with_prefilter(None, haystack, needle)
    }

    /// This is like [`Finder::find`], but it accepts a prefilter for
    /// accelerating searches.
    ///
    /// The search routine is selected by the `Shift` variant that was
    /// computed when this finder was built.
    ///
    /// Currently this is not exposed in the public API because, at the time
    /// of writing, I didn't want to spend time thinking about how to expose
    /// the prefilter infrastructure (if at all). If you have a compelling use
    /// case for exposing this routine, please create an issue. Do *not* open
    /// a PR that just exposes `Pre` and friends. Exporting this routine will
    /// require API design.
    #[inline(always)]
    pub(crate) fn find_with_prefilter(
        &self,
        pre: Option<Pre<'_>>,
        haystack: &[u8],
        needle: &[u8],
    ) -> Option<usize> {
        match self.0.shift {
            Shift::Small { period } => {
                self.find_small_imp(pre, haystack, needle, period)
            }
            Shift::Large { shift } => {
                self.find_large_imp(pre, haystack, needle, shift)
            }
        }
    }

    // Each of the two search implementations below can be accelerated by a
    // prefilter, but it is not always enabled. To avoid its overhead when
    // it's disabled, we explicitly inline each search implementation based on
    // whether a prefilter will be used or not. The decision on which to use
    // is made in the parent meta searcher.

    /// Forward search used when preprocessing produced `Shift::Small`;
    /// `period` is the value carried by that variant.
    ///
    /// Returns the starting offset of a match in `haystack`, or `None` if
    /// the search runs off the end of `haystack` (or the prefilter reports
    /// no further candidates).
    #[inline(always)]
    fn find_small_imp(
        &self,
        mut pre: Option<Pre<'_>>,
        haystack: &[u8],
        needle: &[u8],
        period: usize,
    ) -> Option<usize> {
        // Start of the current candidate window in `haystack`.
        let mut pos = 0;
        // Lower bound used by both scans below. It is non-zero only right
        // after a period-sized advance and is reset to 0 on every other kind
        // of advance.
        let mut shift = 0;
        // An empty needle has no last byte and matches immediately at 0.
        let last_byte_pos = match needle.len().checked_sub(1) {
            None => return Some(pos),
            Some(last_byte) => last_byte,
        };
        // Keep going for as long as a whole needle still fits at `pos`.
        while pos + needle.len() <= haystack.len() {
            let mut i = cmp::max(self.0.critical_pos, shift);
            if let Some(pre) = pre.as_mut() {
                // The prefilter is consulted only while it reports itself as
                // effective.
                if pre.is_effective() {
                    // A `None` from the prefilter ends the search via `?`.
                    pos += pre.find(&haystack[pos..])?;
                    // The window just moved by a prefilter-chosen amount, so
                    // the lower bound from a previous period advance is
                    // dropped and scanning restarts at the critical position.
                    shift = 0;
                    i = self.0.critical_pos;
                    if pos + needle.len() > haystack.len() {
                        return None;
                    }
                }
            }
            // If the haystack byte aligned with the needle's last byte is
            // not in the byte set, skip the whole window.
            if !self.0.byteset.contains(haystack[pos + last_byte_pos]) {
                pos += needle.len();
                shift = 0;
                continue;
            }
            // Scan rightwards from `i` until a mismatch or the needle's end.
            while i < needle.len() && needle[i] == haystack[pos + i] {
                i += 1;
            }
            if i < needle.len() {
                // Mismatch to the right of the critical position: advance
                // just past the mismatching byte's offset from it.
                pos += i - self.0.critical_pos + 1;
                shift = 0;
            } else {
                // The right part matched. Scan leftwards from the critical
                // position, stopping once `j` reaches `shift`.
                let mut j = self.0.critical_pos;
                while j > shift && needle[j] == haystack[pos + j] {
                    j -= 1;
                }
                // The loop above never compares index `shift` itself, so it
                // is checked here before reporting a match.
                if j <= shift && needle[shift] == haystack[pos + shift] {
                    return Some(pos);
                }
                // Advance by one period and record the new lower bound for
                // the next iteration's scans.
                pos += period;
                shift = needle.len() - period;
            }
        }
        None
    }

    /// Forward search used when preprocessing produced `Shift::Large`;
    /// `shift` is the value carried by that variant.
    ///
    /// Returns the starting offset of a match in `haystack`, or `None` if
    /// the search runs off the end of `haystack` (or the prefilter reports
    /// no further candidates).
    #[inline(always)]
    fn find_large_imp(
        &self,
        mut pre: Option<Pre<'_>>,
        haystack: &[u8],
        needle: &[u8],
        shift: usize,
    ) -> Option<usize> {
        // Start of the current candidate window in `haystack`.
        let mut pos = 0;
        // An empty needle has no last byte and matches immediately at 0.
        let last_byte_pos = match needle.len().checked_sub(1) {
            None => return Some(pos),
            Some(last_byte) => last_byte,
        };
        // Keep going for as long as a whole needle still fits at `pos`.
        'outer: while pos + needle.len() <= haystack.len() {
            if let Some(pre) = pre.as_mut() {
                // The prefilter is consulted only while it reports itself as
                // effective.
                if pre.is_effective() {
                    // A `None` from the prefilter ends the search via `?`.
                    pos += pre.find(&haystack[pos..])?;
                    if pos + needle.len() > haystack.len() {
                        return None;
                    }
                }
            }

            // If the haystack byte aligned with the needle's last byte is
            // not in the byte set, skip the whole window.
            if !self.0.byteset.contains(haystack[pos + last_byte_pos]) {
                pos += needle.len();
                continue;
            }
            // Scan rightwards from the critical position.
            let mut i = self.0.critical_pos;
            while i < needle.len() && needle[i] == haystack[pos + i] {
                i += 1;
            }
            if i < needle.len() {
                // Mismatch to the right of the critical position: advance
                // just past the mismatching byte's offset from it.
                pos += i - self.0.critical_pos + 1;
            } else {
                // The right part matched. Compare the left part from the
                // critical position back down to index 0; any mismatch
                // advances by the precomputed `shift`.
                for j in (0..self.0.critical_pos).rev() {
                    if needle[j] != haystack[pos + j] {
                        pos += shift;
                        continue 'outer;
                    }
                }
                return Some(pos);
            }
        }
        None
    }
}
271
272impl FinderRev {
273    /// Create a searcher that finds occurrences of the given `needle`.
274    ///
275    /// An empty `needle` results in a match at every position in a haystack,
276    /// including at `haystack.len()`.
277    #[inline]
278    pub fn new(needle: &[u8]) -> FinderRev {
279        let byteset = ApproximateByteSet::new(needle);
280        let min_suffix = Suffix::reverse(needle, SuffixKind::Minimal);
281        let max_suffix = Suffix::reverse(needle, SuffixKind::Maximal);
282        let (period_lower_bound, critical_pos) =
283            if min_suffix.pos < max_suffix.pos {
284                (min_suffix.period, min_suffix.pos)
285            } else {
286                (max_suffix.period, max_suffix.pos)
287            };
288        let shift = Shift::reverse(needle, period_lower_bound, critical_pos);
289        FinderRev(TwoWay { byteset, critical_pos, shift })
290    }
291
292    /// Returns the last occurrence of `needle` in the given `haystack`, or
293    /// `None` if no such occurrence could be found.
294    ///
295    /// The `needle` given must be the same as the `needle` provided to
296    /// [`FinderRev::new`].
297    ///
298    /// An empty `needle` results in a match at every position in a haystack,
299    /// including at `haystack.len()`.
300    #[inline]
301    pub fn rfind(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
302        // For the reverse case, we don't use a prefilter. It's plausible that
303        // perhaps we should, but it's a lot of additional code to do it, and
304        // it's not clear that it's actually worth it. If you have a really
305        // compelling use case for this, please file an issue.
306        match self.0.shift {
307            Shift::Small { period } => {
308                self.rfind_small_imp(haystack, needle, period)
309            }
310            Shift::Large { shift } => {
311                self.rfind_large_imp(haystack, needle, shift)
312            }
313        }
314    }
315
316    #[inline(always)]
317    fn rfind_small_imp(
318        &self,
319        haystack: &[u8],
320        needle: &[u8],
321        period: usize,
322    ) -> Option<usize> {
323        let nlen = needle.len();
324        let mut pos = haystack.len();
325        let mut shift = nlen;
326        let first_byte = match needle.get(0) {
327            None => return Some(pos),
328            Some(&first_byte) => first_byte,
329        };
330        while pos >= nlen {
331            if !self.0.byteset.contains(haystack[pos - nlen]) {
332                pos -= nlen;
333                shift = nlen;
334                continue;
335            }
336            let mut i = cmp::min(self.0.critical_pos, shift);
337            while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
338                i -= 1;
339            }
340            if i > 0 || first_byte != haystack[pos - nlen] {
341                pos -= self.0.critical_pos - i + 1;
342                shift = nlen;
343            } else {
344                let mut j = self.0.critical_pos;
345                while j < shift && needle[j] == haystack[pos - nlen + j] {
346                    j += 1;
347                }
348                if j >= shift {
349                    return Some(pos - nlen);
350                }
351                pos -= period;
352                shift = period;
353            }
354        }
355        None
356    }
357
358    #[inline(always)]
359    fn rfind_large_imp(
360        &self,
361        haystack: &[u8],
362        needle: &[u8],
363        shift: usize,
364    ) -> Option<usize> {
365        let nlen = needle.len();
366        let mut pos = haystack.len();
367        let first_byte = match needle.get(0) {
368            None => return Some(pos),
369            Some(&first_byte) => first_byte,
370        };
371        while pos >= nlen {
372            if !self.0.byteset.contains(haystack[pos - nlen]) {
373                pos -= nlen;
374                continue;
375            }
376            let mut i = self.0.critical_pos;
377            while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
378                i -= 1;
379            }
380            if i > 0 || first_byte != haystack[pos - nlen] {
381                pos -= self.0.critical_pos - i + 1;
382            } else {
383                let mut j = self.0.critical_pos;
384                while j < nlen && needle[j] == haystack[pos - nlen + j] {
385                    j += 1;
386                }
387                if j == nlen {
388                    return Some(pos - nlen);
389                }
390                pos -= shift;
391            }
392        }
393        None
394    }
395}
396
/// A representation of the amount we're allowed to shift by during Two-Way
/// search.
///
/// When computing a critical factorization of the needle, we find the position
/// of the critical factorization by finding the needle's maximal (or minimal)
/// suffix, along with the period of that suffix. It turns out that the period
/// of that suffix is a lower bound on the period of the needle itself.
///
/// This lower bound is equivalent to the actual period of the needle in
/// some cases. To describe that case, we denote the needle as `x` where
/// `x = uv` and `v` is the lexicographic maximal suffix of `x`. The lower
/// bound given here is always the period of `v`, which is `<= period(x)`. The
/// case where `period(v) == period(x)` occurs when `len(u) < (len(x) / 2)` and
/// where `u` is a suffix of `v[0..period(v)]`.
///
/// This case is important because the search algorithm for when the
/// periods are equivalent is slightly different than the search algorithm
/// for when the periods are not equivalent. In particular, when they aren't
/// equivalent, we know that the period of the needle is no less than half its
/// length. In this case, we shift by an amount less than or equal to the
/// period of the needle (determined by the maximum length of the components
/// of the critical factorization of `x`, i.e., `max(len(u), len(v))`).
///
/// The above two cases are represented by the variants below. Each entails
/// a different instantiation of the Two-Way search algorithm.
///
/// N.B. If we could find a way to compute the exact period in all cases,
/// then we could collapse this case analysis and simplify the algorithm. The
/// Two-Way paper suggests this is possible, but more reading is required to
/// grok why the authors didn't pursue that path.
#[derive(Clone, Copy, Debug)]
enum Shift {
    /// The period lower bound passed the checks in `Shift::forward` or
    /// `Shift::reverse`, so the search advances by `period`.
    Small { period: usize },
    /// The fallback case: the search advances by `shift`, which is
    /// `max(len(u), len(v))` for the critical factorization `x = uv`.
    Large { shift: usize },
}
432
433impl Shift {
434    /// Compute the shift for a given needle in the forward direction.
435    ///
436    /// This requires a lower bound on the period and a critical position.
437    /// These can be computed by extracting both the minimal and maximal
438    /// lexicographic suffixes, and choosing the right-most starting position.
439    /// The lower bound on the period is then the period of the chosen suffix.
440    fn forward(
441        needle: &[u8],
442        period_lower_bound: usize,
443        critical_pos: usize,
444    ) -> Shift {
445        let large = cmp::max(critical_pos, needle.len() - critical_pos);
446        if critical_pos * 2 >= needle.len() {
447            return Shift::Large { shift: large };
448        }
449
450        let (u, v) = needle.split_at(critical_pos);
451        if !is_suffix(&v[..period_lower_bound], u) {
452            return Shift::Large { shift: large };
453        }
454        Shift::Small { period: period_lower_bound }
455    }
456
457    /// Compute the shift for a given needle in the reverse direction.
458    ///
459    /// This requires a lower bound on the period and a critical position.
460    /// These can be computed by extracting both the minimal and maximal
461    /// lexicographic suffixes, and choosing the left-most starting position.
462    /// The lower bound on the period is then the period of the chosen suffix.
463    fn reverse(
464        needle: &[u8],
465        period_lower_bound: usize,
466        critical_pos: usize,
467    ) -> Shift {
468        let large = cmp::max(critical_pos, needle.len() - critical_pos);
469        if (needle.len() - critical_pos) * 2 >= needle.len() {
470            return Shift::Large { shift: large };
471        }
472
473        let (v, u) = needle.split_at(critical_pos);
474        if !is_prefix(&v[v.len() - period_lower_bound..], u) {
475            return Shift::Large { shift: large };
476        }
477        Shift::Small { period: period_lower_bound }
478    }
479}
480
/// A suffix extracted from a needle along with its period.
#[derive(Debug)]
struct Suffix {
    /// The starting position of this suffix.
    ///
    /// For a forward suffix this is an inclusive starting position, so the
    /// suffix itself is `&bytes[pos..]`. For a reverse suffix this is an
    /// exclusive ending position, so the suffix itself is `&bytes[..pos]`.
    pos: usize,
    /// The period of this suffix.
    ///
    /// Note that this is NOT necessarily the period of the string from which
    /// this suffix comes. (It is always less than or equal to the period of
    /// the original string.)
    period: usize,
}

impl Suffix {
    /// Find the maximal (or minimal, depending on `kind`) suffix of `needle`
    /// scanning left to right, together with that suffix's period.
    ///
    /// For an empty needle this returns `pos == 0` and `period == 1`.
    fn forward(needle: &[u8], kind: SuffixKind) -> Suffix {
        // Start and period of the best suffix found so far.
        let mut best = 0usize;
        let mut period = 1usize;
        // Start of the suffix currently being weighed against `best`.
        let mut cand = 1usize;
        // How many leading bytes of `best` and `cand` are already known to
        // agree. Comparison continues at this offset into both suffixes.
        let mut matched = 0usize;

        while let Some(&cand_byte) = needle.get(cand + matched) {
            let best_byte = needle[best + matched];
            match kind.cmp(best_byte, cand_byte) {
                // The candidate wins: it becomes the new best suffix and the
                // comparison restarts with the suffix right after it.
                SuffixOrdering::Accept => {
                    best = cand;
                    period = 1;
                    cand += 1;
                    matched = 0;
                }
                // The candidate loses: jump past everything compared so far.
                // The period is then the distance from `best` to the new
                // candidate.
                SuffixOrdering::Skip => {
                    cand += matched + 1;
                    matched = 0;
                    period = cand - best;
                }
                // Bytes agree and a whole period has been matched: move the
                // candidate forward by one period.
                SuffixOrdering::Push if matched + 1 == period => {
                    cand += period;
                    matched = 0;
                }
                // Bytes agree: no decision yet, look at the next byte.
                SuffixOrdering::Push => matched += 1,
            }
        }
        Suffix { pos: best, period }
    }

    /// Find the maximal (or minimal, depending on `kind`) reverse suffix of
    /// `needle` scanning right to left, together with its period.
    ///
    /// This mirrors `forward`, with positions acting as exclusive ends rather
    /// than inclusive starts. For an empty needle this returns `pos == 0` and
    /// `period == 1`.
    fn reverse(needle: &[u8], kind: SuffixKind) -> Suffix {
        // End and period of the best reverse suffix found so far.
        let mut best = needle.len();
        let mut period = 1usize;
        // With fewer than two bytes there is no second suffix to compare.
        if needle.len() <= 1 {
            return Suffix { pos: best, period };
        }
        // End of the reverse suffix currently being weighed against `best`.
        let mut cand = needle.len() - 1;
        // How many trailing bytes of `best` and `cand` are already known to
        // agree.
        let mut matched = 0usize;

        while matched < cand {
            let best_byte = needle[best - matched - 1];
            let cand_byte = needle[cand - matched - 1];
            match kind.cmp(best_byte, cand_byte) {
                SuffixOrdering::Accept => {
                    best = cand;
                    period = 1;
                    cand -= 1;
                    matched = 0;
                }
                SuffixOrdering::Skip => {
                    cand -= matched + 1;
                    matched = 0;
                    period = best - cand;
                }
                SuffixOrdering::Push if matched + 1 == period => {
                    cand -= period;
                    matched = 0;
                }
                SuffixOrdering::Push => matched += 1,
            }
        }
        Suffix { pos: best, period }
    }
}

/// The kind of suffix to extract.
#[derive(Clone, Copy, Debug)]
enum SuffixKind {
    /// Extract the smallest lexicographic suffix from a string.
    ///
    /// Technically, this doesn't actually pick the smallest lexicographic
    /// suffix. e.g., Given the choice between `a` and `aa`, this will choose
    /// the latter over the former, even though `a < aa`. The reasoning for
    /// this isn't clear from the paper, but it still smells like a minimal
    /// suffix.
    Minimal,
    /// Extract the largest lexicographic suffix from a string.
    ///
    /// Unlike `Minimal`, this really does pick the maximum suffix. e.g., Given
    /// the choice between `z` and `zz`, this will choose the latter over the
    /// former.
    Maximal,
}

/// The result of comparing corresponding bytes between two suffixes.
#[derive(Clone, Copy, Debug)]
enum SuffixOrdering {
    /// The candidate byte shows that the candidate suffix is better than the
    /// current maximal (or minimal) suffix, so the candidate should replace
    /// it.
    Accept,
    /// The candidate byte rules the candidate suffix out, so it should be
    /// dropped and the next candidate considered.
    Skip,
    /// No decision can be made yet because the corresponding bytes are
    /// equal; the next pair of bytes should be compared.
    Push,
}

impl SuffixKind {
    /// Compare one byte of the current suffix with the corresponding byte of
    /// the candidate suffix and say what to do with the candidate.
    ///
    /// Equal bytes always give `Push`. For `Minimal`, a smaller candidate
    /// byte gives `Accept` and a larger one gives `Skip`; for `Maximal` it is
    /// the other way around.
    fn cmp(self, current: u8, candidate: u8) -> SuffixOrdering {
        use core::cmp::Ordering::{Equal, Greater, Less};

        match (self, candidate.cmp(&current)) {
            (_, Equal) => SuffixOrdering::Push,
            (SuffixKind::Minimal, Less) | (SuffixKind::Maximal, Greater) => {
                SuffixOrdering::Accept
            }
            (SuffixKind::Minimal, Greater) | (SuffixKind::Maximal, Less) => {
                SuffixOrdering::Skip
            }
        }
    }
}
643
/// A 64-bit approximation of the set of bytes that occur in a needle.
///
/// Every byte is folded onto one of 64 bits by taking it modulo 64, and that
/// bit is set if and only if some needle byte folds onto it. If a haystack
/// byte is NOT in this set, it cannot be in the needle either, so the search
/// may advance in the haystack by `needle.len()` bytes.
#[derive(Clone, Copy, Debug)]
struct ApproximateByteSet(u64);

impl ApproximateByteSet {
    /// Create a new set from the given needle.
    fn new(needle: &[u8]) -> ApproximateByteSet {
        let bits = needle
            .iter()
            .fold(0u64, |acc, &byte| acc | (1u64 << (byte % 64)));
        ApproximateByteSet(bits)
    }

    /// Return true if and only if the given byte might be in this set. This
    /// may return a false positive (different bytes can share a bit), but
    /// will never return a false negative.
    #[inline(always)]
    fn contains(&self, byte: u8) -> bool {
        let mask = 1u64 << (byte % 64);
        self.0 & mask != 0
    }
}
670
#[cfg(test)]
mod tests {
    use alloc::vec::Vec;

    use super::*;

    /// Convenience wrapper for computing the suffix as a byte string.
    ///
    /// Returns the forward suffix `&needle[pos..]` together with its period.
    fn get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
        let s = Suffix::forward(needle, kind);
        (&needle[s.pos..], s.period)
    }

    /// Convenience wrapper for computing the reverse suffix as a byte string.
    ///
    /// Returns the reverse suffix `&needle[..pos]` together with its period.
    fn get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
        let s = Suffix::reverse(needle, kind);
        (&needle[..s.pos], s.period)
    }

    /// Return all of the non-empty suffixes in the given byte string.
    fn suffixes(bytes: &[u8]) -> Vec<&[u8]> {
        (0..bytes.len()).map(|i| &bytes[i..]).collect()
    }

    /// Return the lexicographically maximal suffix of the given byte string.
    ///
    /// Panics if `needle` is empty, since there is then no suffix to pop.
    fn naive_maximal_suffix_forward(needle: &[u8]) -> &[u8] {
        let mut sufs = suffixes(needle);
        // After sorting, the maximal suffix is the last element.
        sufs.sort();
        sufs.pop().unwrap()
    }

    /// Return the lexicographically maximal suffix of the reverse of the given
    /// byte string.
    ///
    /// The result is reversed back before it is returned, so it reads as a
    /// prefix of `needle`.
    fn naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8> {
        let mut reversed = needle.to_vec();
        reversed.reverse();
        let mut got = naive_maximal_suffix_forward(&reversed).to_vec();
        got.reverse();
        got
    }

    // These macros are defined elsewhere in the crate. NOTE(review):
    // presumably they expand to quickcheck properties that compare the given
    // closure against a reference substring search -- confirm at the macro
    // definitions.
    define_substring_forward_quickcheck!(|h, n| Some(
        Finder::new(n).find(h, n)
    ));
    define_substring_reverse_quickcheck!(|h, n| Some(
        FinderRev::new(n).rfind(h, n)
    ));

    /// Run the crate's shared substring test runner against the forward
    /// finder.
    #[test]
    fn forward() {
        crate::tests::substring::Runner::new()
            .fwd(|h, n| Some(Finder::new(n).find(h, n)))
            .run();
    }

    /// Run the crate's shared substring test runner against the reverse
    /// finder.
    #[test]
    fn reverse() {
        crate::tests::substring::Runner::new()
            .rev(|h, n| Some(FinderRev::new(n).rfind(h, n)))
            .run();
    }

    /// Check hand-picked needles against their expected forward minimal and
    /// maximal suffixes and periods.
    #[test]
    fn suffix_forward() {
        // Arguments are: needle, expected suffix, expected period.
        macro_rules! assert_suffix_min {
            ($given:expr, $expected:expr, $period:expr) => {
                let (got_suffix, got_period) =
                    get_suffix_forward($given.as_bytes(), SuffixKind::Minimal);
                let got_suffix = core::str::from_utf8(got_suffix).unwrap();
                assert_eq!(($expected, $period), (got_suffix, got_period));
            };
        }

        // Arguments are: needle, expected suffix, expected period.
        macro_rules! assert_suffix_max {
            ($given:expr, $expected:expr, $period:expr) => {
                let (got_suffix, got_period) =
                    get_suffix_forward($given.as_bytes(), SuffixKind::Maximal);
                let got_suffix = core::str::from_utf8(got_suffix).unwrap();
                assert_eq!(($expected, $period), (got_suffix, got_period));
            };
        }

        assert_suffix_min!("a", "a", 1);
        assert_suffix_max!("a", "a", 1);

        assert_suffix_min!("ab", "ab", 2);
        assert_suffix_max!("ab", "b", 1);

        assert_suffix_min!("ba", "a", 1);
        assert_suffix_max!("ba", "ba", 2);

        assert_suffix_min!("abc", "abc", 3);
        assert_suffix_max!("abc", "c", 1);

        assert_suffix_min!("acb", "acb", 3);
        assert_suffix_max!("acb", "cb", 2);

        assert_suffix_min!("cba", "a", 1);
        assert_suffix_max!("cba", "cba", 3);

        assert_suffix_min!("abcabc", "abcabc", 3);
        assert_suffix_max!("abcabc", "cabc", 3);

        assert_suffix_min!("abcabcabc", "abcabcabc", 3);
        assert_suffix_max!("abcabcabc", "cabcabc", 3);

        assert_suffix_min!("abczz", "abczz", 5);
        assert_suffix_max!("abczz", "zz", 1);

        assert_suffix_min!("zzabc", "abc", 3);
        assert_suffix_max!("zzabc", "zzabc", 5);

        assert_suffix_min!("aaa", "aaa", 1);
        assert_suffix_max!("aaa", "aaa", 1);

        assert_suffix_min!("foobar", "ar", 2);
        assert_suffix_max!("foobar", "r", 1);
    }

    /// Check hand-picked needles against their expected reverse minimal and
    /// maximal suffixes and periods.
    #[test]
    fn suffix_reverse() {
        // Arguments are: needle, expected reverse suffix, expected period.
        macro_rules! assert_suffix_min {
            ($given:expr, $expected:expr, $period:expr) => {
                let (got_suffix, got_period) =
                    get_suffix_reverse($given.as_bytes(), SuffixKind::Minimal);
                let got_suffix = core::str::from_utf8(got_suffix).unwrap();
                assert_eq!(($expected, $period), (got_suffix, got_period));
            };
        }

        // Arguments are: needle, expected reverse suffix, expected period.
        macro_rules! assert_suffix_max {
            ($given:expr, $expected:expr, $period:expr) => {
                let (got_suffix, got_period) =
                    get_suffix_reverse($given.as_bytes(), SuffixKind::Maximal);
                let got_suffix = core::str::from_utf8(got_suffix).unwrap();
                assert_eq!(($expected, $period), (got_suffix, got_period));
            };
        }

        assert_suffix_min!("a", "a", 1);
        assert_suffix_max!("a", "a", 1);

        assert_suffix_min!("ab", "a", 1);
        assert_suffix_max!("ab", "ab", 2);

        assert_suffix_min!("ba", "ba", 2);
        assert_suffix_max!("ba", "b", 1);

        assert_suffix_min!("abc", "a", 1);
        assert_suffix_max!("abc", "abc", 3);

        assert_suffix_min!("acb", "a", 1);
        assert_suffix_max!("acb", "ac", 2);

        assert_suffix_min!("cba", "cba", 3);
        assert_suffix_max!("cba", "c", 1);

        assert_suffix_min!("abcabc", "abca", 3);
        assert_suffix_max!("abcabc", "abcabc", 3);

        assert_suffix_min!("abcabcabc", "abcabca", 3);
        assert_suffix_max!("abcabcabc", "abcabcabc", 3);

        assert_suffix_min!("abczz", "a", 1);
        assert_suffix_max!("abczz", "abczz", 5);

        assert_suffix_min!("zzabc", "zza", 3);
        assert_suffix_max!("zzabc", "zz", 1);

        assert_suffix_min!("aaa", "aaa", 1);
        assert_suffix_max!("aaa", "aaa", 1);
    }

    // Property tests comparing the maximal suffix computation against the
    // naive sort-based reference above. These are compiled out under Miri.
    #[cfg(not(miri))]
    quickcheck::quickcheck! {
        fn qc_suffix_forward_maximal(bytes: Vec<u8>) -> bool {
            // The naive reference panics on empty input, so skip it.
            if bytes.is_empty() {
                return true;
            }

            let (got, _) = get_suffix_forward(&bytes, SuffixKind::Maximal);
            let expected = naive_maximal_suffix_forward(&bytes);
            got == expected
        }

        fn qc_suffix_reverse_maximal(bytes: Vec<u8>) -> bool {
            // The naive reference panics on empty input, so skip it.
            if bytes.is_empty() {
                return true;
            }

            let (got, _) = get_suffix_reverse(&bytes, SuffixKind::Maximal);
            let expected = naive_maximal_suffix_reverse(&bytes);
            expected == got
        }
    }

    // This is a regression test caught by quickcheck that exercised a bug in
    // the reverse small period handling. The bug was that we were using 'if j
    // == shift' to determine if a match occurred, but the correct guard is 'if
    // j >= shift', which matches the corresponding guard in the forward impl.
    #[test]
    fn regression_rev_small_period() {
        let rfind = |h, n| FinderRev::new(n).rfind(h, n);
        let haystack = "ababaz";
        let needle = "abab";
        assert_eq!(Some(0), rfind(haystack.as_bytes(), needle.as_bytes()));
    }
}
memchr/arch/all/twoway.rs

memchr/arch/all/
twoway.rs