271 lines
8.2 KiB
Rust
271 lines
8.2 KiB
Rust
|
|
/*!
|
||
|
|
Provides implementations of `fst::Automaton` for Aho-Corasick automata.
|
||
|
|
|
||
|
|
This works by providing two wrapper types, [`Anchored`] and [`Unanchored`].
|
||
|
|
The former executes an anchored search on an FST while the latter executes
|
||
|
|
an unanchored search. Building these wrappers is fallible and will fail if
|
||
|
|
the underlying Aho-Corasick automaton does not support the type of search it
|
||
|
|
represents.
|
||
|
|
*/
|
||
|
|
|
||
|
|
use crate::{
|
||
|
|
automaton::{Automaton, StateID},
|
||
|
|
Anchored as AcAnchored, Input, MatchError,
|
||
|
|
};
|
||
|
|
|
||
|
|
/// Represents an unanchored Aho-Corasick search of a finite state transducer.
|
||
|
|
///
|
||
|
|
/// Wrapping an Aho-Corasick automaton in `Unanchored` will fail if the
|
||
|
|
/// underlying automaton does not support unanchored searches.
|
||
|
|
///
|
||
|
|
/// # Example
|
||
|
|
///
|
||
|
|
/// This shows how to build an FST of keys and then run an unanchored search on
|
||
|
|
/// those keys using an Aho-Corasick automaton.
|
||
|
|
///
|
||
|
|
/// ```
|
||
|
|
/// use aho_corasick::{nfa::contiguous::NFA, transducer::Unanchored};
|
||
|
|
/// use fst::{Automaton, IntoStreamer, Set, Streamer};
|
||
|
|
///
|
||
|
|
/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap();
|
||
|
|
/// let nfa = NFA::new(&["bcd", "x"]).unwrap();
|
||
|
|
/// // NFAs always support both unanchored and anchored searches.
|
||
|
|
/// let searcher = Unanchored::new(&nfa).unwrap();
|
||
|
|
///
|
||
|
|
/// let mut stream = set.search(searcher).into_stream();
|
||
|
|
/// let mut results = vec![];
|
||
|
|
/// while let Some(key) = stream.next() {
|
||
|
|
/// results.push(std::str::from_utf8(key).unwrap().to_string());
|
||
|
|
/// }
|
||
|
|
/// assert_eq!(vec!["abcd", "bcd", "xyz"], results);
|
||
|
|
/// ```
|
||
|
|
#[derive(Clone, Debug)]
|
||
|
|
pub struct Unanchored<A>(A);
|
||
|
|
|
||
|
|
impl<A: Automaton> Unanchored<A> {
|
||
|
|
/// Create a new `Unanchored` implementation of the `fst::Automaton` trait.
|
||
|
|
///
|
||
|
|
/// If the given Aho-Corasick automaton does not support unanchored
|
||
|
|
/// searches, then this returns an error.
|
||
|
|
pub fn new(aut: A) -> Result<Unanchored<A>, MatchError> {
|
||
|
|
let input = Input::new("").anchored(AcAnchored::No);
|
||
|
|
let _ = aut.start_state(&input)?;
|
||
|
|
Ok(Unanchored(aut))
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Returns a borrow to the underlying automaton.
|
||
|
|
pub fn as_ref(&self) -> &A {
|
||
|
|
&self.0
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Unwrap this value and return the inner automaton.
|
||
|
|
pub fn into_inner(self) -> A {
|
||
|
|
self.0
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
impl<A: Automaton> fst::Automaton for Unanchored<A> {
|
||
|
|
type State = StateID;
|
||
|
|
|
||
|
|
#[inline]
|
||
|
|
fn start(&self) -> StateID {
|
||
|
|
let input = Input::new("").anchored(AcAnchored::No);
|
||
|
|
self.0.start_state(&input).expect("support for unanchored searches")
|
||
|
|
}
|
||
|
|
|
||
|
|
#[inline]
|
||
|
|
fn is_match(&self, state: &StateID) -> bool {
|
||
|
|
self.0.is_match(*state)
|
||
|
|
}
|
||
|
|
|
||
|
|
#[inline]
|
||
|
|
fn accept(&self, state: &StateID, byte: u8) -> StateID {
|
||
|
|
if fst::Automaton::is_match(self, state) {
|
||
|
|
return *state;
|
||
|
|
}
|
||
|
|
self.0.next_state(AcAnchored::No, *state, byte)
|
||
|
|
}
|
||
|
|
|
||
|
|
#[inline]
|
||
|
|
fn can_match(&self, state: &StateID) -> bool {
|
||
|
|
!self.0.is_dead(*state)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Represents an anchored Aho-Corasick search of a finite state transducer.
|
||
|
|
///
|
||
|
|
/// Wrapping an Aho-Corasick automaton in `Unanchored` will fail if the
|
||
|
|
/// underlying automaton does not support unanchored searches.
|
||
|
|
///
|
||
|
|
/// # Example
|
||
|
|
///
|
||
|
|
/// This shows how to build an FST of keys and then run an anchored search on
|
||
|
|
/// those keys using an Aho-Corasick automaton.
|
||
|
|
///
|
||
|
|
/// ```
|
||
|
|
/// use aho_corasick::{nfa::contiguous::NFA, transducer::Anchored};
|
||
|
|
/// use fst::{Automaton, IntoStreamer, Set, Streamer};
|
||
|
|
///
|
||
|
|
/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap();
|
||
|
|
/// let nfa = NFA::new(&["bcd", "x"]).unwrap();
|
||
|
|
/// // NFAs always support both unanchored and anchored searches.
|
||
|
|
/// let searcher = Anchored::new(&nfa).unwrap();
|
||
|
|
///
|
||
|
|
/// let mut stream = set.search(searcher).into_stream();
|
||
|
|
/// let mut results = vec![];
|
||
|
|
/// while let Some(key) = stream.next() {
|
||
|
|
/// results.push(std::str::from_utf8(key).unwrap().to_string());
|
||
|
|
/// }
|
||
|
|
/// assert_eq!(vec!["bcd", "xyz"], results);
|
||
|
|
/// ```
|
||
|
|
///
|
||
|
|
/// This is like the example above, except we use an Aho-Corasick DFA, which
|
||
|
|
/// requires explicitly configuring it to support anchored searches. (NFAs
|
||
|
|
/// unconditionally support both unanchored and anchored searches.)
|
||
|
|
///
|
||
|
|
/// ```
|
||
|
|
/// use aho_corasick::{dfa::DFA, transducer::Anchored, StartKind};
|
||
|
|
/// use fst::{Automaton, IntoStreamer, Set, Streamer};
|
||
|
|
///
|
||
|
|
/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap();
|
||
|
|
/// let dfa = DFA::builder()
|
||
|
|
/// .start_kind(StartKind::Anchored)
|
||
|
|
/// .build(&["bcd", "x"])
|
||
|
|
/// .unwrap();
|
||
|
|
/// // We've explicitly configured our DFA to support anchored searches.
|
||
|
|
/// let searcher = Anchored::new(&dfa).unwrap();
|
||
|
|
///
|
||
|
|
/// let mut stream = set.search(searcher).into_stream();
|
||
|
|
/// let mut results = vec![];
|
||
|
|
/// while let Some(key) = stream.next() {
|
||
|
|
/// results.push(std::str::from_utf8(key).unwrap().to_string());
|
||
|
|
/// }
|
||
|
|
/// assert_eq!(vec!["bcd", "xyz"], results);
|
||
|
|
/// ```
|
||
|
|
#[derive(Clone, Debug)]
|
||
|
|
pub struct Anchored<A>(A);
|
||
|
|
|
||
|
|
impl<A: Automaton> Anchored<A> {
|
||
|
|
/// Create a new `Anchored` implementation of the `fst::Automaton` trait.
|
||
|
|
///
|
||
|
|
/// If the given Aho-Corasick automaton does not support anchored searches,
|
||
|
|
/// then this returns an error.
|
||
|
|
pub fn new(aut: A) -> Result<Anchored<A>, MatchError> {
|
||
|
|
let input = Input::new("").anchored(AcAnchored::Yes);
|
||
|
|
let _ = aut.start_state(&input)?;
|
||
|
|
Ok(Anchored(aut))
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Returns a borrow to the underlying automaton.
|
||
|
|
pub fn as_ref(&self) -> &A {
|
||
|
|
&self.0
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Unwrap this value and return the inner automaton.
|
||
|
|
pub fn into_inner(self) -> A {
|
||
|
|
self.0
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
impl<A: Automaton> fst::Automaton for Anchored<A> {
|
||
|
|
type State = StateID;
|
||
|
|
|
||
|
|
#[inline]
|
||
|
|
fn start(&self) -> StateID {
|
||
|
|
let input = Input::new("").anchored(AcAnchored::Yes);
|
||
|
|
self.0.start_state(&input).expect("support for unanchored searches")
|
||
|
|
}
|
||
|
|
|
||
|
|
#[inline]
|
||
|
|
fn is_match(&self, state: &StateID) -> bool {
|
||
|
|
self.0.is_match(*state)
|
||
|
|
}
|
||
|
|
|
||
|
|
#[inline]
|
||
|
|
fn accept(&self, state: &StateID, byte: u8) -> StateID {
|
||
|
|
if fst::Automaton::is_match(self, state) {
|
||
|
|
return *state;
|
||
|
|
}
|
||
|
|
self.0.next_state(AcAnchored::Yes, *state, byte)
|
||
|
|
}
|
||
|
|
|
||
|
|
#[inline]
|
||
|
|
fn can_match(&self, state: &StateID) -> bool {
|
||
|
|
!self.0.is_dead(*state)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
#[cfg(test)]
|
||
|
|
mod tests {
|
||
|
|
use alloc::{string::String, vec, vec::Vec};
|
||
|
|
|
||
|
|
use fst::{Automaton, IntoStreamer, Set, Streamer};
|
||
|
|
|
||
|
|
use crate::{
|
||
|
|
dfa::DFA,
|
||
|
|
nfa::{contiguous, noncontiguous},
|
||
|
|
StartKind,
|
||
|
|
};
|
||
|
|
|
||
|
|
use super::*;
|
||
|
|
|
||
|
|
fn search<A: Automaton, D: AsRef<[u8]>>(
|
||
|
|
set: &Set<D>,
|
||
|
|
aut: A,
|
||
|
|
) -> Vec<String> {
|
||
|
|
let mut stream = set.search(aut).into_stream();
|
||
|
|
let mut results = vec![];
|
||
|
|
while let Some(key) = stream.next() {
|
||
|
|
results.push(String::from(core::str::from_utf8(key).unwrap()));
|
||
|
|
}
|
||
|
|
results
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn unanchored() {
|
||
|
|
let set =
|
||
|
|
Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
|
||
|
|
.unwrap();
|
||
|
|
let patterns = vec!["baz", "bax"];
|
||
|
|
let expected = vec!["baz", "xbax"];
|
||
|
|
|
||
|
|
let aut = Unanchored(noncontiguous::NFA::new(&patterns).unwrap());
|
||
|
|
let got = search(&set, &aut);
|
||
|
|
assert_eq!(got, expected);
|
||
|
|
|
||
|
|
let aut = Unanchored(contiguous::NFA::new(&patterns).unwrap());
|
||
|
|
let got = search(&set, &aut);
|
||
|
|
assert_eq!(got, expected);
|
||
|
|
|
||
|
|
let aut = Unanchored(DFA::new(&patterns).unwrap());
|
||
|
|
let got = search(&set, &aut);
|
||
|
|
assert_eq!(got, expected);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn anchored() {
|
||
|
|
let set =
|
||
|
|
Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
|
||
|
|
.unwrap();
|
||
|
|
let patterns = vec!["baz", "bax"];
|
||
|
|
let expected = vec!["baz"];
|
||
|
|
|
||
|
|
let aut = Anchored(noncontiguous::NFA::new(&patterns).unwrap());
|
||
|
|
let got = search(&set, &aut);
|
||
|
|
assert_eq!(got, expected);
|
||
|
|
|
||
|
|
let aut = Anchored(contiguous::NFA::new(&patterns).unwrap());
|
||
|
|
let got = search(&set, &aut);
|
||
|
|
assert_eq!(got, expected);
|
||
|
|
|
||
|
|
let aut = Anchored(
|
||
|
|
DFA::builder()
|
||
|
|
.start_kind(StartKind::Anchored)
|
||
|
|
.build(&patterns)
|
||
|
|
.unwrap(),
|
||
|
|
);
|
||
|
|
let got = search(&set, &aut);
|
||
|
|
assert_eq!(got, expected);
|
||
|
|
}
|
||
|
|
}
|