use std::borrow::Borrow;
use std::collections::HashMap;
use std::error;
use std::fmt;
use std::io;
use std::result;
use super::{TrieSetSlice, CHUNK_SIZE};
const CHUNKS: usize = 0x110000 / CHUNK_SIZE;
pub type Result<T> = result::Result<T, Error>;
#[derive(Clone, Debug)]
pub enum Error {
InvalidCodepoint(u32),
GaveUp,
}
impl error::Error for Error {}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match *self {
Error::InvalidCodepoint(cp) => write!(
f,
"could not construct trie set containing an \
invalid Unicode codepoint: 0x{:X}",
cp
),
Error::GaveUp => {
write!(f, "could not compress codepoint set into a trie")
}
}
}
}
impl From<Error> for io::Error {
fn from(err: Error) -> io::Error {
io::Error::new(io::ErrorKind::Other, err)
}
}
#[derive(Clone)]
pub struct TrieSetOwned {
tree1_level1: Vec<u64>,
tree2_level1: Vec<u8>,
tree2_level2: Vec<u64>,
tree3_level1: Vec<u8>,
tree3_level2: Vec<u8>,
tree3_level3: Vec<u64>,
}
impl fmt::Debug for TrieSetOwned {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "TrieSetOwned(...)")
}
}
impl TrieSetOwned {
fn new(all: &[bool]) -> Result<TrieSetOwned> {
let mut bitvectors = Vec::with_capacity(CHUNKS);
for i in 0..CHUNKS {
let mut bitvector = 0u64;
for j in 0..CHUNK_SIZE {
if all[i * CHUNK_SIZE + j] {
bitvector |= 1 << j;
}
}
bitvectors.push(bitvector);
}
let tree1_level1 =
bitvectors.iter().cloned().take(0x800 / CHUNK_SIZE).collect();
let (mut tree2_level1, mut tree2_level2) = compress_postfix_leaves(
&bitvectors[0x800 / CHUNK_SIZE..0x10000 / CHUNK_SIZE],
)?;
if tree2_level2.len() == 1 && tree2_level2[0] == 0 {
tree2_level1.clear();
tree2_level2.clear();
}
let (mid, mut tree3_level3) = compress_postfix_leaves(
&bitvectors[0x10000 / CHUNK_SIZE..0x110000 / CHUNK_SIZE],
)?;
let (mut tree3_level1, mut tree3_level2) =
compress_postfix_mid(&mid, 64)?;
if tree3_level3.len() == 1 && tree3_level3[0] == 0 {
tree3_level1.clear();
tree3_level2.clear();
tree3_level3.clear();
}
Ok(TrieSetOwned {
tree1_level1,
tree2_level1,
tree2_level2,
tree3_level1,
tree3_level2,
tree3_level3,
})
}
pub fn from_scalars<I, C>(scalars: I) -> Result<TrieSetOwned>
where
I: IntoIterator<Item = C>,
C: Borrow<char>,
{
let mut all = vec![false; 0x110000];
for s in scalars {
all[*s.borrow() as usize] = true;
}
TrieSetOwned::new(&all)
}
pub fn from_codepoints<I, C>(codepoints: I) -> Result<TrieSetOwned>
where
I: IntoIterator<Item = C>,
C: Borrow<u32>,
{
let mut all = vec![false; 0x110000];
for cp in codepoints {
let cp = *cp.borrow();
if cp > 0x10FFFF {
return Err(Error::InvalidCodepoint(cp));
}
all[cp as usize] = true;
}
TrieSetOwned::new(&all)
}
#[inline(always)]
pub fn as_slice(&self) -> TrieSetSlice<'_> {
TrieSetSlice {
tree1_level1: &self.tree1_level1,
tree2_level1: &self.tree2_level1,
tree2_level2: &self.tree2_level2,
tree3_level1: &self.tree3_level1,
tree3_level2: &self.tree3_level2,
tree3_level3: &self.tree3_level3,
}
}
pub fn contains_char(&self, c: char) -> bool {
self.as_slice().contains_char(c)
}
pub fn contains_u32(&self, cp: u32) -> bool {
self.as_slice().contains_u32(cp)
}
}
fn compress_postfix_leaves(chunks: &[u64]) -> Result<(Vec<u8>, Vec<u64>)> {
let mut root = vec![];
let mut children = vec![];
let mut bychild = HashMap::new();
for &chunk in chunks {
if !bychild.contains_key(&chunk) {
let start = bychild.len();
if start > ::std::u8::MAX as usize {
return Err(Error::GaveUp);
}
bychild.insert(chunk, start as u8);
children.push(chunk);
}
root.push(bychild[&chunk]);
}
Ok((root, children))
}
fn compress_postfix_mid(
chunks: &[u8],
chunk_size: usize,
) -> Result<(Vec<u8>, Vec<u8>)> {
let mut root = vec![];
let mut children = vec![];
let mut bychild = HashMap::new();
for i in 0..(chunks.len() / chunk_size) {
let chunk = &chunks[i * chunk_size..(i + 1) * chunk_size];
if !bychild.contains_key(chunk) {
let start = bychild.len();
if start > ::std::u8::MAX as usize {
return Err(Error::GaveUp);
}
bychild.insert(chunk, start as u8);
children.extend(chunk);
}
root.push(bychild[chunk]);
}
Ok((root, children))
}
#[cfg(test)]
mod tests {
use super::TrieSetOwned;
use crate::general_category;
use std::collections::HashSet;
fn mk(scalars: &[char]) -> TrieSetOwned {
TrieSetOwned::from_scalars(scalars).unwrap()
}
fn ranges_to_set(ranges: &[(u32, u32)]) -> Vec<u32> {
let mut set = vec![];
for &(start, end) in ranges {
for cp in start..end + 1 {
set.push(cp);
}
}
set
}
#[test]
fn set1() {
let set = mk(&['a']);
assert!(set.contains_char('a'));
assert!(!set.contains_char('b'));
assert!(!set.contains_char('β'));
assert!(!set.contains_char('☃'));
assert!(!set.contains_char('😼'));
}
#[test]
fn set_combined() {
let set = mk(&['a', 'b', 'β', '☃', '😼']);
assert!(set.contains_char('a'));
assert!(set.contains_char('b'));
assert!(set.contains_char('β'));
assert!(set.contains_char('☃'));
assert!(set.contains_char('😼'));
assert!(!set.contains_char('c'));
assert!(!set.contains_char('θ'));
assert!(!set.contains_char('⛇'));
assert!(!set.contains_char('🐲'));
}
macro_rules! category_test {
($name:ident, $ranges:ident) => {
#[test]
fn $name() {
let set = ranges_to_set(general_category::$ranges);
let hashset: HashSet<u32> = set.iter().cloned().collect();
let trie = TrieSetOwned::from_codepoints(&set).unwrap();
for cp in 0..0x110000 {
assert!(trie.contains_u32(cp) == hashset.contains(&cp));
}
assert!(!trie.contains_u32(0x110000));
assert!(!hashset.contains(&0x110000));
}
};
}
category_test!(gencat_cased_letter, CASED_LETTER);
category_test!(gencat_close_punctuation, CLOSE_PUNCTUATION);
category_test!(gencat_connector_punctuation, CONNECTOR_PUNCTUATION);
category_test!(gencat_control, CONTROL);
category_test!(gencat_currency_symbol, CURRENCY_SYMBOL);
category_test!(gencat_dash_punctuation, DASH_PUNCTUATION);
category_test!(gencat_decimal_number, DECIMAL_NUMBER);
category_test!(gencat_enclosing_mark, ENCLOSING_MARK);
category_test!(gencat_final_punctuation, FINAL_PUNCTUATION);
category_test!(gencat_format, FORMAT);
category_test!(gencat_initial_punctuation, INITIAL_PUNCTUATION);
category_test!(gencat_letter, LETTER);
category_test!(gencat_letter_number, LETTER_NUMBER);
category_test!(gencat_line_separator, LINE_SEPARATOR);
category_test!(gencat_lowercase_letter, LOWERCASE_LETTER);
category_test!(gencat_math_symbol, MATH_SYMBOL);
category_test!(gencat_mark, MARK);
category_test!(gencat_modifier_letter, MODIFIER_LETTER);
category_test!(gencat_modifier_symbol, MODIFIER_SYMBOL);
category_test!(gencat_nonspacing_mark, NONSPACING_MARK);
category_test!(gencat_number, NUMBER);
category_test!(gencat_open_punctuation, OPEN_PUNCTUATION);
category_test!(gencat_other, OTHER);
category_test!(gencat_other_letter, OTHER_LETTER);
category_test!(gencat_other_number, OTHER_NUMBER);
category_test!(gencat_other_punctuation, OTHER_PUNCTUATION);
category_test!(gencat_other_symbol, OTHER_SYMBOL);
category_test!(gencat_paragraph_separator, PARAGRAPH_SEPARATOR);
category_test!(gencat_private_use, PRIVATE_USE);
category_test!(gencat_punctuation, PUNCTUATION);
category_test!(gencat_separator, SEPARATOR);
category_test!(gencat_space_separator, SPACE_SEPARATOR);
category_test!(gencat_spacing_mark, SPACING_MARK);
category_test!(gencat_surrogate, SURROGATE);
category_test!(gencat_symbol, SYMBOL);
category_test!(gencat_titlecase_letter, TITLECASE_LETTER);
category_test!(gencat_unassigned, UNASSIGNED);
category_test!(gencat_uppercase_letter, UPPERCASE_LETTER);
}