litrs/
escape.rs

1use crate::{ParseError, err::{perr, ParseErrorKind::*}, parse::{hex_digit_value, check_suffix}};
2
3
4/// Must start with `\`. Returns the unscaped value as `E` and the number of
5/// input bytes the escape is long.
6///
7/// `unicode` and `byte_escapes` specify which types of escapes are
8/// supported. [Quote escapes] are always unescaped, [Unicode escapes] only if
9/// `unicode` is true. If `byte_escapes` is false, [ASCII escapes] are
10/// used, if it's true, [Byte escapes] are (the only difference being that the
11/// latter supports \xHH escapes > 0x7f).
12///
13/// [Quote escapes]: https://doc.rust-lang.org/reference/tokens.html#quote-escapes
14/// [Unicode escapes]: https://doc.rust-lang.org/reference/tokens.html#unicode-escapes
15/// [Ascii escapes]: https://doc.rust-lang.org/reference/tokens.html#ascii-escapes
16/// [Byte escapes]: https://doc.rust-lang.org/reference/tokens.html#byte-escapes
17pub(crate) fn unescape<E: Escapee>(
18    input: &str,
19    offset: usize,
20    unicode: bool,
21    byte_escapes: bool,
22) -> Result<(E, usize), ParseError> {
23    let first = input.as_bytes().get(1)
24        .ok_or(perr(offset, UnterminatedEscape))?;
25    let out = match first {
26        // Quote escapes
27        b'\'' => (E::from_byte(b'\''), 2),
28        b'"' => (E::from_byte(b'"'), 2),
29
30        // Ascii escapes
31        b'n' => (E::from_byte(b'\n'), 2),
32        b'r' => (E::from_byte(b'\r'), 2),
33        b't' => (E::from_byte(b'\t'), 2),
34        b'\\' => (E::from_byte(b'\\'), 2),
35        b'0' => (E::from_byte(b'\0'), 2),
36        b'x' => {
37            let hex_string = input.get(2..4)
38                .ok_or(perr(offset..offset + input.len(), UnterminatedEscape))?
39                .as_bytes();
40            let first = hex_digit_value(hex_string[0])
41                .ok_or(perr(offset..offset + 4, InvalidXEscape))?;
42            let second = hex_digit_value(hex_string[1])
43                .ok_or(perr(offset..offset + 4, InvalidXEscape))?;
44            let value = second + 16 * first;
45
46            if !byte_escapes && value > 0x7F {
47                return Err(perr(offset..offset + 4, NonAsciiXEscape));
48            }
49
50            (E::from_byte(value), 4)
51        },
52
53        // Unicode escape
54        b'u' => {
55            if !unicode {
56                return Err(perr(offset..offset + 2, UnicodeEscapeInByteLiteral));
57            }
58
59            if input.as_bytes().get(2) != Some(&b'{') {
60                return Err(perr(offset..offset + 2, UnicodeEscapeWithoutBrace));
61            }
62
63            let closing_pos = input.bytes().position(|b| b == b'}')
64                .ok_or(perr(offset..offset + input.len(), UnterminatedUnicodeEscape))?;
65
66            let inner = &input[3..closing_pos];
67            if inner.as_bytes().first() == Some(&b'_') {
68                return Err(perr(4, InvalidStartOfUnicodeEscape));
69            }
70
71            let mut v: u32 = 0;
72            let mut digit_count = 0;
73            for (i, b) in inner.bytes().enumerate() {
74                if b == b'_'{
75                    continue;
76                }
77
78                let digit = hex_digit_value(b)
79                    .ok_or(perr(offset + 3 + i, NonHexDigitInUnicodeEscape))?;
80
81                if digit_count == 6 {
82                    return Err(perr(offset + 3 + i, TooManyDigitInUnicodeEscape));
83                }
84                digit_count += 1;
85                v = 16 * v + digit as u32;
86            }
87
88            let c = std::char::from_u32(v)
89                .ok_or(perr(offset..offset + closing_pos + 1, InvalidUnicodeEscapeChar))?;
90
91            (E::from_char(c), closing_pos + 1)
92        }
93
94        _ => return Err(perr(offset..offset + 2, UnknownEscape)),
95    };
96
97    Ok(out)
98}
99
100pub(crate) trait Escapee: Sized {
101    type Container: EscapeeContainer<Self>;
102    fn from_byte(b: u8) -> Self;
103    fn from_char(c: char) -> Self;
104}
105
106impl Escapee for u8 {
107    type Container = Vec<u8>;
108    fn from_byte(b: u8) -> Self {
109        b
110    }
111    fn from_char(_: char) -> Self {
112        panic!("bug: `<u8 as Escapee>::from_char` was called");
113    }
114}
115
116impl Escapee for char {
117    type Container = String;
118    fn from_byte(b: u8) -> Self {
119        b.into()
120    }
121    fn from_char(c: char) -> Self {
122        c
123    }
124}
125
126pub(crate) trait EscapeeContainer<E: Escapee> {
127    fn new() -> Self;
128    fn is_empty(&self) -> bool;
129    fn push(&mut self, v: E);
130    fn push_str(&mut self, s: &str);
131}
132
133impl EscapeeContainer<u8> for Vec<u8> {
134    fn new() -> Self { Self::new() }
135    fn is_empty(&self) -> bool { self.is_empty() }
136    fn push(&mut self, v: u8) { self.push(v); }
137    fn push_str(&mut self, s: &str) { self.extend_from_slice(s.as_bytes()); }
138}
139
140impl EscapeeContainer<char> for String {
141    fn new() -> Self { Self::new() }
142    fn is_empty(&self) -> bool { self.is_empty() }
143    fn push(&mut self, v: char) { self.push(v); }
144    fn push_str(&mut self, s: &str) { self.push_str(s); }
145}
146
147
/// Returns `true` for bytes that are skipped after the start of a string
/// continue (an unescaped backslash followed by `\n`): space, tab and
/// newline.
fn is_string_continue_skipable_whitespace(b: u8) -> bool {
    matches!(b, b' ' | b'\t' | b'\n')
}
153
154/// Unescapes a whole string or byte string.
155#[inline(never)]
156pub(crate) fn unescape_string<E: Escapee>(
157    input: &str,
158    offset: usize,
159    unicode: bool,
160    byte_escapes: bool,
161) -> Result<(Option<E::Container>, usize), ParseError> {
162    let mut closing_quote_pos = None;
163    let mut i = offset;
164    let mut end_last_escape = offset;
165    let mut value = <E::Container>::new();
166    while i < input.len() {
167        match input.as_bytes()[i] {
168            // Handle "string continue".
169            b'\\' if input.as_bytes().get(i + 1) == Some(&b'\n') => {
170                value.push_str(&input[end_last_escape..i]);
171
172                // Find the first non-whitespace character.
173                let end_escape = input[i + 2..].bytes()
174                    .position(|b| !is_string_continue_skipable_whitespace(b))
175                    .ok_or(perr(None, UnterminatedString))?;
176
177                i += 2 + end_escape;
178                end_last_escape = i;
179            }
180            b'\\' => {
181                let rest = &input[i..input.len() - 1];
182                let (c, len) = unescape::<E>(rest, i, unicode, byte_escapes)?;
183                value.push_str(&input[end_last_escape..i]);
184                value.push(c);
185                i += len;
186                end_last_escape = i;
187            }
188            b'\r' => return Err(perr(i, CarriageReturn)),
189            b'"' => {
190                closing_quote_pos = Some(i);
191                break;
192            },
193            b if !unicode && !b.is_ascii() => return Err(perr(i, NonAsciiInByteLiteral)),
194            _ => i += 1,
195        }
196    }
197
198    let closing_quote_pos = closing_quote_pos.ok_or(perr(None, UnterminatedString))?;
199
200    let start_suffix = closing_quote_pos + 1;
201    let suffix = &input[start_suffix..];
202    check_suffix(suffix).map_err(|kind| perr(start_suffix, kind))?;
203
204    // `value` is only empty if there was no escape in the input string
205    // (with the special case of the input being empty). This means the
206    // string value basically equals the input, so we store `None`.
207    let value = if value.is_empty() {
208        None
209    } else {
210        // There was an escape in the string, so we need to push the
211        // remaining unescaped part of the string still.
212        value.push_str(&input[end_last_escape..closing_quote_pos]);
213        Some(value)
214    };
215
216    Ok((value, start_suffix))
217}
218
219/// Reads and checks a raw (byte) string literal. Returns the number of hashes
220/// and the index when the suffix starts.
221#[inline(never)]
222pub(crate) fn scan_raw_string<E: Escapee>(
223    input: &str,
224    offset: usize,
225    unicode: bool,
226) -> Result<(u32, usize), ParseError> {
227    // Raw string literal
228    let num_hashes = input[offset..].bytes().position(|b| b != b'#')
229        .ok_or(perr(None, InvalidLiteral))?;
230
231    if input.as_bytes().get(offset + num_hashes) != Some(&b'"') {
232        return Err(perr(None, InvalidLiteral));
233    }
234    let start_inner = offset + num_hashes + 1;
235    let hashes = &input[offset..num_hashes + offset];
236
237    let mut closing_quote_pos = None;
238    let mut i = start_inner;
239    while i < input.len() {
240        let b = input.as_bytes()[i];
241        if b == b'"' && input[i + 1..].starts_with(hashes) {
242            closing_quote_pos = Some(i);
243            break;
244        }
245
246        // CR are just always disallowed in all (raw) strings. Rust performs
247        // a normalization of CR LF to just LF in a pass prior to lexing. But
248        // in lexing, it's disallowed.
249        if b == b'\r' {
250            return Err(perr(i, CarriageReturn));
251        }
252
253        if !unicode {
254            if !b.is_ascii() {
255                return Err(perr(i, NonAsciiInByteLiteral));
256            }
257        }
258
259        i += 1;
260    }
261
262    let closing_quote_pos = closing_quote_pos.ok_or(perr(None, UnterminatedRawString))?;
263
264    let start_suffix = closing_quote_pos + num_hashes + 1;
265    let suffix = &input[start_suffix..];
266    check_suffix(suffix).map_err(|kind| perr(start_suffix, kind))?;
267
268    Ok((num_hashes as u32, start_suffix))
269}