aho_corasick/
dfa.rs

1/*!
2Provides direct access to a DFA implementation of Aho-Corasick.
3
4This is a low-level API that generally only needs to be used in niche
5circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
6instead of a DFA directly. Using an `DFA` directly is typically only necessary
7when one needs access to the [`Automaton`] trait implementation.
8*/
9
10use alloc::{vec, vec::Vec};
11
12use crate::{
13    automaton::Automaton,
14    nfa::noncontiguous,
15    util::{
16        alphabet::ByteClasses,
17        error::{BuildError, MatchError},
18        int::{Usize, U32},
19        prefilter::Prefilter,
20        primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
21        search::{Anchored, MatchKind, StartKind},
22        special::Special,
23    },
24};
25
26/// A DFA implementation of Aho-Corasick.
27///
28/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
29/// this type directly. Using a `DFA` directly is typically only necessary when
30/// one needs access to the [`Automaton`] trait implementation.
31///
32/// This DFA can only be built by first constructing a [`noncontiguous::NFA`].
33/// Both [`DFA::new`] and [`Builder::build`] do this for you automatically, but
34/// [`Builder::build_from_noncontiguous`] permits doing it explicitly.
35///
36/// A DFA provides the best possible search performance (in this crate) via two
37/// mechanisms:
38///
39/// * All states use a dense representation for their transitions.
40/// * All failure transitions are pre-computed such that they are never
41/// explicitly handled at search time.
42///
43/// These two facts combined mean that every state transition is performed
44/// using a constant number of instructions. However, this comes at
45/// great cost. The memory usage of a DFA can be quite exorbitant.
46/// It is potentially multiple orders of magnitude greater than a
47/// [`contiguous::NFA`](crate::nfa::contiguous::NFA) for example. In exchange,
48/// a DFA will typically have better search speed than a `contiguous::NFA`, but
49/// not by orders of magnitude.
50///
51/// Unless you have a small number of patterns or memory usage is not a concern
52/// and search performance is critical, a DFA is usually not the best choice.
53///
54/// Moreover, unlike the NFAs in this crate, it is costly for a DFA to
55/// support for anchored and unanchored search configurations. Namely,
56/// since failure transitions are pre-computed, supporting both anchored
57/// and unanchored searches requires a duplication of the transition table,
58/// making the memory usage of such a DFA ever bigger. (The NFAs in this crate
59/// unconditionally support both anchored and unanchored searches because there
60/// is essentially no added cost for doing so.) It is for this reason that
61/// a DFA's support for anchored and unanchored searches can be configured
62/// via [`Builder::start_kind`]. By default, a DFA only supports unanchored
63/// searches.
64///
65/// # Example
66///
67/// This example shows how to build an `DFA` directly and use it to execute
68/// [`Automaton::try_find`]:
69///
70/// ```
71/// use aho_corasick::{
72///     automaton::Automaton,
73///     dfa::DFA,
74///     Input, Match,
75/// };
76///
77/// let patterns = &["b", "abc", "abcd"];
78/// let haystack = "abcd";
79///
80/// let nfa = DFA::new(patterns).unwrap();
81/// assert_eq!(
82///     Some(Match::must(0, 1..2)),
83///     nfa.try_find(&Input::new(haystack))?,
84/// );
85/// # Ok::<(), Box<dyn std::error::Error>>(())
86/// ```
87///
88/// It is also possible to implement your own version of `try_find`. See the
89/// [`Automaton`] documentation for an example.
90#[derive(Clone)]
91pub struct DFA {
92    /// The DFA transition table. IDs in this table are pre-multiplied. So
93    /// instead of the IDs being 0, 1, 2, 3, ..., they are 0*stride, 1*stride,
94    /// 2*stride, 3*stride, ...
95    trans: Vec<StateID>,
96    /// The matches for every match state in this DFA. This is first indexed by
97    /// state index (so that's `sid >> stride2`) and then by order in which the
98    /// matches are meant to occur.
99    matches: Vec<Vec<PatternID>>,
100    /// The amount of heap memory used, in bytes, by the inner Vecs of
101    /// 'matches'.
102    matches_memory_usage: usize,
103    /// The length of each pattern. This is used to compute the start offset
104    /// of a match.
105    pattern_lens: Vec<SmallIndex>,
106    /// A prefilter for accelerating searches, if one exists.
107    prefilter: Option<Prefilter>,
108    /// The match semantics built into this DFA.
109    match_kind: MatchKind,
110    /// The total number of states in this DFA.
111    state_len: usize,
112    /// The alphabet size, or total number of equivalence classes, for this
113    /// DFA. Note that the actual number of transitions in each state is
114    /// stride=2^stride2, where stride is the smallest power of 2 greater than
115    /// or equal to alphabet_len. We do things this way so that we can use
116    /// bitshifting to go from a state ID to an index into 'matches'.
117    alphabet_len: usize,
118    /// The exponent with a base 2, such that stride=2^stride2. Given a state
119    /// index 'i', its state identifier is 'i << stride2'. Given a state
120    /// identifier 'sid', its state index is 'sid >> stride2'.
121    stride2: usize,
122    /// The equivalence classes for this DFA. All transitions are defined on
123    /// equivalence classes and not on the 256 distinct byte values.
124    byte_classes: ByteClasses,
125    /// The length of the shortest pattern in this automaton.
126    min_pattern_len: usize,
127    /// The length of the longest pattern in this automaton.
128    max_pattern_len: usize,
129    /// The information required to deduce which states are "special" in this
130    /// DFA.
131    special: Special,
132}
133
134impl DFA {
135    /// Create a new Aho-Corasick DFA using the default configuration.
136    ///
137    /// Use a [`Builder`] if you want to change the configuration.
138    pub fn new<I, P>(patterns: I) -> Result<DFA, BuildError>
139    where
140        I: IntoIterator<Item = P>,
141        P: AsRef<[u8]>,
142    {
143        DFA::builder().build(patterns)
144    }
145
146    /// A convenience method for returning a new Aho-Corasick DFA builder.
147    ///
148    /// This usually permits one to just import the `DFA` type.
149    pub fn builder() -> Builder {
150        Builder::new()
151    }
152}
153
154impl DFA {
155    /// A sentinel state ID indicating that a search should stop once it has
156    /// entered this state. When a search stops, it returns a match if one has
157    /// been found, otherwise no match. A DFA always has an actual dead state
158    /// at this ID.
159    ///
160    /// N.B. DFAs, unlike NFAs, do not have any notion of a FAIL state.
161    /// Namely, the whole point of a DFA is that the FAIL state is completely
162    /// compiled away. That is, DFA construction involves pre-computing the
163    /// failure transitions everywhere, such that failure transitions are no
164    /// longer used at search time. This, combined with its uniformly dense
165    /// representation, are the two most important factors in why it's faster
166    /// than the NFAs in this crate.
167    const DEAD: StateID = StateID::new_unchecked(0);
168
169    /// Adds the given pattern IDs as matches to the given state and also
170    /// records the added memory usage.
171    fn set_matches(
172        &mut self,
173        sid: StateID,
174        pids: impl Iterator<Item = PatternID>,
175    ) {
176        let index = (sid.as_usize() >> self.stride2).checked_sub(2).unwrap();
177        let mut at_least_one = false;
178        for pid in pids {
179            self.matches[index].push(pid);
180            self.matches_memory_usage += PatternID::SIZE;
181            at_least_one = true;
182        }
183        assert!(at_least_one, "match state must have non-empty pids");
184    }
185}
186
187// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
188// returns a valid state ID given a valid state ID. We otherwise claim that
189// all other methods are correct as well.
190unsafe impl Automaton for DFA {
191    #[inline(always)]
192    fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
193        // Either of the start state IDs can be DEAD, in which case, support
194        // for that type of search is not provided by this DFA. Which start
195        // state IDs are inactive depends on the 'StartKind' configuration at
196        // DFA construction time.
197        match anchored {
198            Anchored::No => {
199                let start = self.special.start_unanchored_id;
200                if start == DFA::DEAD {
201                    Err(MatchError::invalid_input_unanchored())
202                } else {
203                    Ok(start)
204                }
205            }
206            Anchored::Yes => {
207                let start = self.special.start_anchored_id;
208                if start == DFA::DEAD {
209                    Err(MatchError::invalid_input_anchored())
210                } else {
211                    Ok(start)
212                }
213            }
214        }
215    }
216
217    #[inline(always)]
218    fn next_state(
219        &self,
220        _anchored: Anchored,
221        sid: StateID,
222        byte: u8,
223    ) -> StateID {
224        let class = self.byte_classes.get(byte);
225        self.trans[(sid.as_u32() + u32::from(class)).as_usize()]
226    }
227
228    #[inline(always)]
229    fn is_special(&self, sid: StateID) -> bool {
230        sid <= self.special.max_special_id
231    }
232
233    #[inline(always)]
234    fn is_dead(&self, sid: StateID) -> bool {
235        sid == DFA::DEAD
236    }
237
238    #[inline(always)]
239    fn is_match(&self, sid: StateID) -> bool {
240        !self.is_dead(sid) && sid <= self.special.max_match_id
241    }
242
243    #[inline(always)]
244    fn is_start(&self, sid: StateID) -> bool {
245        sid == self.special.start_unanchored_id
246            || sid == self.special.start_anchored_id
247    }
248
249    #[inline(always)]
250    fn match_kind(&self) -> MatchKind {
251        self.match_kind
252    }
253
254    #[inline(always)]
255    fn patterns_len(&self) -> usize {
256        self.pattern_lens.len()
257    }
258
259    #[inline(always)]
260    fn pattern_len(&self, pid: PatternID) -> usize {
261        self.pattern_lens[pid].as_usize()
262    }
263
264    #[inline(always)]
265    fn min_pattern_len(&self) -> usize {
266        self.min_pattern_len
267    }
268
269    #[inline(always)]
270    fn max_pattern_len(&self) -> usize {
271        self.max_pattern_len
272    }
273
274    #[inline(always)]
275    fn match_len(&self, sid: StateID) -> usize {
276        debug_assert!(self.is_match(sid));
277        let offset = (sid.as_usize() >> self.stride2) - 2;
278        self.matches[offset].len()
279    }
280
281    #[inline(always)]
282    fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
283        debug_assert!(self.is_match(sid));
284        let offset = (sid.as_usize() >> self.stride2) - 2;
285        self.matches[offset][index]
286    }
287
288    #[inline(always)]
289    fn memory_usage(&self) -> usize {
290        use core::mem::size_of;
291
292        (self.trans.len() * size_of::<u32>())
293            + (self.matches.len() * size_of::<Vec<PatternID>>())
294            + self.matches_memory_usage
295            + (self.pattern_lens.len() * size_of::<SmallIndex>())
296            + self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
297    }
298
299    #[inline(always)]
300    fn prefilter(&self) -> Option<&Prefilter> {
301        self.prefilter.as_ref()
302    }
303}
304
305impl core::fmt::Debug for DFA {
306    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
307        use crate::{
308            automaton::{fmt_state_indicator, sparse_transitions},
309            util::debug::DebugByte,
310        };
311
312        writeln!(f, "dfa::DFA(")?;
313        for index in 0..self.state_len {
314            let sid = StateID::new_unchecked(index << self.stride2);
315            // While we do currently include the FAIL state in the transition
316            // table (to simplify construction), it is never actually used. It
317            // poses problems with the code below because it gets treated as
318            // a match state incidentally when it is, of course, not. So we
319            // special case it. The fail state is always the first state after
320            // the dead state.
321            //
322            // If the construction is changed to remove the fail state (it
323            // probably should be), then this special case should be updated.
324            if index == 1 {
325                writeln!(f, "F {:06}:", sid.as_usize())?;
326                continue;
327            }
328            fmt_state_indicator(f, self, sid)?;
329            write!(f, "{:06}: ", sid.as_usize())?;
330
331            let it = (0..self.byte_classes.alphabet_len()).map(|class| {
332                (class.as_u8(), self.trans[sid.as_usize() + class])
333            });
334            for (i, (start, end, next)) in sparse_transitions(it).enumerate() {
335                if i > 0 {
336                    write!(f, ", ")?;
337                }
338                if start == end {
339                    write!(
340                        f,
341                        "{:?} => {:?}",
342                        DebugByte(start),
343                        next.as_usize()
344                    )?;
345                } else {
346                    write!(
347                        f,
348                        "{:?}-{:?} => {:?}",
349                        DebugByte(start),
350                        DebugByte(end),
351                        next.as_usize()
352                    )?;
353                }
354            }
355            write!(f, "\n")?;
356            if self.is_match(sid) {
357                write!(f, " matches: ")?;
358                for i in 0..self.match_len(sid) {
359                    if i > 0 {
360                        write!(f, ", ")?;
361                    }
362                    let pid = self.match_pattern(sid, i);
363                    write!(f, "{}", pid.as_usize())?;
364                }
365                write!(f, "\n")?;
366            }
367        }
368        writeln!(f, "match kind: {:?}", self.match_kind)?;
369        writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
370        writeln!(f, "state length: {:?}", self.state_len)?;
371        writeln!(f, "pattern length: {:?}", self.patterns_len())?;
372        writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
373        writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
374        writeln!(f, "alphabet length: {:?}", self.alphabet_len)?;
375        writeln!(f, "stride: {:?}", 1 << self.stride2)?;
376        writeln!(f, "byte classes: {:?}", self.byte_classes)?;
377        writeln!(f, "memory usage: {:?}", self.memory_usage())?;
378        writeln!(f, ")")?;
379        Ok(())
380    }
381}
382
383/// A builder for configuring an Aho-Corasick DFA.
384///
385/// This builder has a subset of the options available to a
386/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
387/// their behavior is identical.
388#[derive(Clone, Debug)]
389pub struct Builder {
390    noncontiguous: noncontiguous::Builder,
391    start_kind: StartKind,
392    byte_classes: bool,
393}
394
395impl Default for Builder {
396    fn default() -> Builder {
397        Builder {
398            noncontiguous: noncontiguous::Builder::new(),
399            start_kind: StartKind::Unanchored,
400            byte_classes: true,
401        }
402    }
403}
404
405impl Builder {
406    /// Create a new builder for configuring an Aho-Corasick DFA.
407    pub fn new() -> Builder {
408        Builder::default()
409    }
410
411    /// Build an Aho-Corasick DFA from the given iterator of patterns.
412    ///
413    /// A builder may be reused to create more DFAs.
414    pub fn build<I, P>(&self, patterns: I) -> Result<DFA, BuildError>
415    where
416        I: IntoIterator<Item = P>,
417        P: AsRef<[u8]>,
418    {
419        let nnfa = self.noncontiguous.build(patterns)?;
420        self.build_from_noncontiguous(&nnfa)
421    }
422
423    /// Build an Aho-Corasick DFA from the given noncontiguous NFA.
424    ///
425    /// Note that when this method is used, only the `start_kind` and
426    /// `byte_classes` settings on this builder are respected. The other
427    /// settings only apply to the initial construction of the Aho-Corasick
428    /// automaton. Since using this method requires that initial construction
429    /// has already completed, all settings impacting only initial construction
430    /// are no longer relevant.
431    pub fn build_from_noncontiguous(
432        &self,
433        nnfa: &noncontiguous::NFA,
434    ) -> Result<DFA, BuildError> {
435        debug!("building DFA");
436        let byte_classes = if self.byte_classes {
437            nnfa.byte_classes().clone()
438        } else {
439            ByteClasses::singletons()
440        };
441        let state_len = match self.start_kind {
442            StartKind::Unanchored | StartKind::Anchored => nnfa.states().len(),
443            StartKind::Both => {
444                // These unwraps are OK because we know that the number of
445                // NFA states is < StateID::LIMIT which is in turn less than
446                // i32::MAX. Thus, there is always room to multiply by 2.
447                // Finally, the number of states is always at least 4 in the
448                // NFA (DEAD, FAIL, START-UNANCHORED, START-ANCHORED), so the
449                // subtraction of 4 is okay.
450                //
451                // Note that we subtract 4 because the "anchored" part of
452                // the DFA duplicates the unanchored part (without failure
453                // transitions), but reuses the DEAD, FAIL and START states.
454                nnfa.states()
455                    .len()
456                    .checked_mul(2)
457                    .unwrap()
458                    .checked_sub(4)
459                    .unwrap()
460            }
461        };
462        let trans_len =
463            match state_len.checked_shl(byte_classes.stride2().as_u32()) {
464                Some(trans_len) => trans_len,
465                None => {
466                    return Err(BuildError::state_id_overflow(
467                        StateID::MAX.as_u64(),
468                        usize::MAX.as_u64(),
469                    ))
470                }
471            };
472        StateID::new(trans_len.checked_sub(byte_classes.stride()).unwrap())
473            .map_err(|e| {
474                BuildError::state_id_overflow(
475                    StateID::MAX.as_u64(),
476                    e.attempted(),
477                )
478            })?;
479        let num_match_states = match self.start_kind {
480            StartKind::Unanchored | StartKind::Anchored => {
481                nnfa.special().max_match_id.as_usize().checked_sub(1).unwrap()
482            }
483            StartKind::Both => nnfa
484                .special()
485                .max_match_id
486                .as_usize()
487                .checked_sub(1)
488                .unwrap()
489                .checked_mul(2)
490                .unwrap(),
491        };
492        let mut dfa = DFA {
493            trans: vec![DFA::DEAD; trans_len],
494            matches: vec![vec![]; num_match_states],
495            matches_memory_usage: 0,
496            pattern_lens: nnfa.pattern_lens_raw().to_vec(),
497            prefilter: nnfa.prefilter().map(|p| p.clone()),
498            match_kind: nnfa.match_kind(),
499            state_len,
500            alphabet_len: byte_classes.alphabet_len(),
501            stride2: byte_classes.stride2(),
502            byte_classes,
503            min_pattern_len: nnfa.min_pattern_len(),
504            max_pattern_len: nnfa.max_pattern_len(),
505            // The special state IDs are set later.
506            special: Special::zero(),
507        };
508        match self.start_kind {
509            StartKind::Both => {
510                self.finish_build_both_starts(nnfa, &mut dfa);
511            }
512            StartKind::Unanchored => {
513                self.finish_build_one_start(Anchored::No, nnfa, &mut dfa);
514            }
515            StartKind::Anchored => {
516                self.finish_build_one_start(Anchored::Yes, nnfa, &mut dfa)
517            }
518        }
519        debug!(
520            "DFA built, <states: {:?}, size: {:?}, \
521             alphabet len: {:?}, stride: {:?}>",
522            dfa.state_len,
523            dfa.memory_usage(),
524            dfa.byte_classes.alphabet_len(),
525            dfa.byte_classes.stride(),
526        );
527        // The vectors can grow ~twice as big during construction because a
528        // Vec amortizes growth. But here, let's shrink things back down to
529        // what we actually need since we're never going to add more to it.
530        dfa.trans.shrink_to_fit();
531        dfa.pattern_lens.shrink_to_fit();
532        dfa.matches.shrink_to_fit();
533        // TODO: We might also want to shrink each Vec inside of `dfa.matches`,
534        // or even better, convert it to one contiguous allocation. But I think
535        // I went with nested allocs for good reason (can't remember), so this
536        // may be tricky to do. I decided not to shrink them here because it
537        // might require a fair bit of work to do. It's unclear whether it's
538        // worth it.
539        Ok(dfa)
540    }
541
542    /// Finishes building a DFA for either unanchored or anchored searches,
543    /// but NOT both.
544    fn finish_build_one_start(
545        &self,
546        anchored: Anchored,
547        nnfa: &noncontiguous::NFA,
548        dfa: &mut DFA,
549    ) {
550        // This function always succeeds because we check above that all of the
551        // states in the NFA can be mapped to DFA state IDs.
552        let stride2 = dfa.stride2;
553        let old2new = |oldsid: StateID| {
554            StateID::new_unchecked(oldsid.as_usize() << stride2)
555        };
556        for (oldsid, state) in nnfa.states().iter().with_state_ids() {
557            let newsid = old2new(oldsid);
558            if state.is_match() {
559                dfa.set_matches(newsid, nnfa.iter_matches(oldsid));
560            }
561            sparse_iter(
562                nnfa,
563                oldsid,
564                &dfa.byte_classes,
565                |byte, class, mut oldnextsid| {
566                    if oldnextsid == noncontiguous::NFA::FAIL {
567                        if anchored.is_anchored() {
568                            oldnextsid = noncontiguous::NFA::DEAD;
569                        } else if state.fail() == noncontiguous::NFA::DEAD {
570                            // This is a special case that avoids following
571                            // DEAD transitions in a non-contiguous NFA.
572                            // Following these transitions is pretty slow
573                            // because the non-contiguous NFA will always use
574                            // a sparse representation for it (because the
575                            // DEAD state is usually treated as a sentinel).
576                            // The *vast* majority of failure states are DEAD
577                            // states, so this winds up being pretty slow if
578                            // we go through the non-contiguous NFA state
579                            // transition logic. Instead, just do it ourselves.
580                            oldnextsid = noncontiguous::NFA::DEAD;
581                        } else {
582                            oldnextsid = nnfa.next_state(
583                                Anchored::No,
584                                state.fail(),
585                                byte,
586                            );
587                        }
588                    }
589                    dfa.trans[newsid.as_usize() + usize::from(class)] =
590                        old2new(oldnextsid);
591                },
592            );
593        }
594        // Now that we've remapped all the IDs in our states, all that's left
595        // is remapping the special state IDs.
596        let old = nnfa.special();
597        let new = &mut dfa.special;
598        new.max_special_id = old2new(old.max_special_id);
599        new.max_match_id = old2new(old.max_match_id);
600        if anchored.is_anchored() {
601            new.start_unanchored_id = DFA::DEAD;
602            new.start_anchored_id = old2new(old.start_anchored_id);
603        } else {
604            new.start_unanchored_id = old2new(old.start_unanchored_id);
605            new.start_anchored_id = DFA::DEAD;
606        }
607    }
608
609    /// Finishes building a DFA that supports BOTH unanchored and anchored
610    /// searches. It works by inter-leaving unanchored states with anchored
611    /// states in the same transition table. This way, we avoid needing to
612    /// re-shuffle states afterward to ensure that our states still look like
613    /// DEAD, MATCH, ..., START-UNANCHORED, START-ANCHORED, NON-MATCH, ...
614    ///
615    /// Honestly this is pretty inscrutable... Simplifications are most
616    /// welcome.
617    fn finish_build_both_starts(
618        &self,
619        nnfa: &noncontiguous::NFA,
620        dfa: &mut DFA,
621    ) {
622        let stride2 = dfa.stride2;
623        let stride = 1 << stride2;
624        let mut remap_unanchored = vec![DFA::DEAD; nnfa.states().len()];
625        let mut remap_anchored = vec![DFA::DEAD; nnfa.states().len()];
626        let mut is_anchored = vec![false; dfa.state_len];
627        let mut newsid = DFA::DEAD;
628        let next_dfa_id =
629            |sid: StateID| StateID::new_unchecked(sid.as_usize() + stride);
630        for (oldsid, state) in nnfa.states().iter().with_state_ids() {
631            if oldsid == noncontiguous::NFA::DEAD
632                || oldsid == noncontiguous::NFA::FAIL
633            {
634                remap_unanchored[oldsid] = newsid;
635                remap_anchored[oldsid] = newsid;
636                newsid = next_dfa_id(newsid);
637            } else if oldsid == nnfa.special().start_unanchored_id
638                || oldsid == nnfa.special().start_anchored_id
639            {
640                if oldsid == nnfa.special().start_unanchored_id {
641                    remap_unanchored[oldsid] = newsid;
642                    remap_anchored[oldsid] = DFA::DEAD;
643                } else {
644                    remap_unanchored[oldsid] = DFA::DEAD;
645                    remap_anchored[oldsid] = newsid;
646                    is_anchored[newsid.as_usize() >> stride2] = true;
647                }
648                if state.is_match() {
649                    dfa.set_matches(newsid, nnfa.iter_matches(oldsid));
650                }
651                sparse_iter(
652                    nnfa,
653                    oldsid,
654                    &dfa.byte_classes,
655                    |_, class, oldnextsid| {
656                        let class = usize::from(class);
657                        if oldnextsid == noncontiguous::NFA::FAIL {
658                            dfa.trans[newsid.as_usize() + class] = DFA::DEAD;
659                        } else {
660                            dfa.trans[newsid.as_usize() + class] = oldnextsid;
661                        }
662                    },
663                );
664                newsid = next_dfa_id(newsid);
665            } else {
666                let unewsid = newsid;
667                newsid = next_dfa_id(newsid);
668                let anewsid = newsid;
669                newsid = next_dfa_id(newsid);
670
671                remap_unanchored[oldsid] = unewsid;
672                remap_anchored[oldsid] = anewsid;
673                is_anchored[anewsid.as_usize() >> stride2] = true;
674                if state.is_match() {
675                    dfa.set_matches(unewsid, nnfa.iter_matches(oldsid));
676                    dfa.set_matches(anewsid, nnfa.iter_matches(oldsid));
677                }
678                sparse_iter(
679                    nnfa,
680                    oldsid,
681                    &dfa.byte_classes,
682                    |byte, class, oldnextsid| {
683                        let class = usize::from(class);
684                        if oldnextsid == noncontiguous::NFA::FAIL {
685                            let oldnextsid =
686                                if state.fail() == noncontiguous::NFA::DEAD {
687                                    noncontiguous::NFA::DEAD
688                                } else {
689                                    nnfa.next_state(
690                                        Anchored::No,
691                                        state.fail(),
692                                        byte,
693                                    )
694                                };
695                            dfa.trans[unewsid.as_usize() + class] = oldnextsid;
696                        } else {
697                            dfa.trans[unewsid.as_usize() + class] = oldnextsid;
698                            dfa.trans[anewsid.as_usize() + class] = oldnextsid;
699                        }
700                    },
701                );
702            }
703        }
704        for i in 0..dfa.state_len {
705            let sid = i << stride2;
706            if is_anchored[i] {
707                for next in dfa.trans[sid..][..stride].iter_mut() {
708                    *next = remap_anchored[*next];
709                }
710            } else {
711                for next in dfa.trans[sid..][..stride].iter_mut() {
712                    *next = remap_unanchored[*next];
713                }
714            }
715        }
716        // Now that we've remapped all the IDs in our states, all that's left
717        // is remapping the special state IDs.
718        let old = nnfa.special();
719        let new = &mut dfa.special;
720        new.max_special_id = remap_anchored[old.max_special_id];
721        new.max_match_id = remap_anchored[old.max_match_id];
722        new.start_unanchored_id = remap_unanchored[old.start_unanchored_id];
723        new.start_anchored_id = remap_anchored[old.start_anchored_id];
724    }
725
726    /// Set the desired match semantics.
727    ///
728    /// This only applies when using [`Builder::build`] and not
729    /// [`Builder::build_from_noncontiguous`].
730    ///
731    /// See
732    /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
733    /// for more documentation and examples.
734    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
735        self.noncontiguous.match_kind(kind);
736        self
737    }
738
739    /// Enable ASCII-aware case insensitive matching.
740    ///
741    /// This only applies when using [`Builder::build`] and not
742    /// [`Builder::build_from_noncontiguous`].
743    ///
744    /// See
745    /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
746    /// for more documentation and examples.
747    pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
748        self.noncontiguous.ascii_case_insensitive(yes);
749        self
750    }
751
752    /// Enable heuristic prefilter optimizations.
753    ///
754    /// This only applies when using [`Builder::build`] and not
755    /// [`Builder::build_from_noncontiguous`].
756    ///
757    /// See
758    /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
759    /// for more documentation and examples.
760    pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
761        self.noncontiguous.prefilter(yes);
762        self
763    }
764
765    /// Sets the starting state configuration for the automaton.
766    ///
767    /// See
768    /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind)
769    /// for more documentation and examples.
770    pub fn start_kind(&mut self, kind: StartKind) -> &mut Builder {
771        self.start_kind = kind;
772        self
773    }
774
775    /// A debug setting for whether to attempt to shrink the size of the
776    /// automaton's alphabet or not.
777    ///
778    /// This should never be enabled unless you're debugging an automaton.
779    /// Namely, disabling byte classes makes transitions easier to reason
780    /// about, since they use the actual bytes instead of equivalence classes.
781    /// Disabling this confers no performance benefit at search time.
782    ///
783    /// See
784    /// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes)
785    /// for more documentation and examples.
786    pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
787        self.byte_classes = yes;
788        self
789    }
790}
791
792/// Iterate over all possible equivalence class transitions in this state.
793/// The closure is called for all transitions with a distinct equivalence
794/// class, even those not explicitly represented in this sparse state. For
795/// any implicitly defined transitions, the given closure is called with
796/// the fail state ID.
797///
798/// The closure is guaranteed to be called precisely
799/// `byte_classes.alphabet_len()` times, once for every possible class in
800/// ascending order.
801fn sparse_iter<F: FnMut(u8, u8, StateID)>(
802    nnfa: &noncontiguous::NFA,
803    oldsid: StateID,
804    classes: &ByteClasses,
805    mut f: F,
806) {
807    let mut prev_class = None;
808    let mut byte = 0usize;
809    for t in nnfa.iter_trans(oldsid) {
810        while byte < usize::from(t.byte()) {
811            let rep = byte.as_u8();
812            let class = classes.get(rep);
813            byte += 1;
814            if prev_class != Some(class) {
815                f(rep, class, noncontiguous::NFA::FAIL);
816                prev_class = Some(class);
817            }
818        }
819        let rep = t.byte();
820        let class = classes.get(rep);
821        byte += 1;
822        if prev_class != Some(class) {
823            f(rep, class, t.next());
824            prev_class = Some(class);
825        }
826    }
827    for b in byte..=255 {
828        let rep = b.as_u8();
829        let class = classes.get(rep);
830        if prev_class != Some(class) {
831            f(rep, class, noncontiguous::NFA::FAIL);
832            prev_class = Some(class);
833        }
834    }
835}
aho_corasick/dfa.rs

aho_corasick/
dfa.rs