xml/reader/parser/
inside_doctype.rs

1use std::fmt::Write;
2
3use crate::common::{is_name_char, is_name_start_char, is_whitespace_char};
4use crate::reader::error::SyntaxError;
5use crate::reader::lexer::Token;
6
7use super::{DoctypeSubstate, PullParser, QuoteToken, Result, State};
8
9impl PullParser {
10    pub fn inside_doctype(&mut self, t: Token, substate: DoctypeSubstate) -> Option<Result> {
11        if let Some(ref mut doctype) = self.data.doctype {
12            write!(doctype, "{t}").ok()?;
13            if doctype.len() > self.config.max_data_length {
14                return Some(self.error(SyntaxError::ExceededConfiguredLimit));
15            }
16        }
17
18        match substate {
19            DoctypeSubstate::Outside => match t {
20                Token::TagEnd => self.into_state_continue(State::OutsideTag),
21                Token::MarkupDeclarationStart => {
22                    self.buf.clear();
23                    self.into_state_continue(State::InsideDoctype(DoctypeSubstate::InsideName))
24                },
25                Token::Character('%') => {
26                    self.data.ref_data.clear();
27                    self.data.ref_data.push('%');
28                    self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceInDtd))
29                },
30                Token::CommentStart => {
31                    self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Comment))
32                },
33                Token::SingleQuote | Token::DoubleQuote => {
34                    // just discard string literals
35                    self.data.quote = super::QuoteToken::from_token(t);
36                    self.into_state_continue(State::InsideDoctype(DoctypeSubstate::String))
37                },
38                Token::CDataEnd | Token::CDataStart => Some(self.error(SyntaxError::UnexpectedToken(t))),
39                // TODO: parse SYSTEM, and [
40                _ => None,
41            },
42            DoctypeSubstate::String => match t {
43                Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => None,
44                Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => None,
45                Token::SingleQuote | Token::DoubleQuote => {
46                    self.data.quote = None;
47                    self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
48                },
49                _ => None,
50            },
51            DoctypeSubstate::Comment => match t {
52                Token::CommentEnd => {
53                    self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
54                },
55                _ => None,
56            },
57            DoctypeSubstate::InsideName => match t {
58                Token::Character(c @ 'A'..='Z') => {
59                    self.buf.push(c);
60                    None
61                },
62                Token::Character(c) if is_whitespace_char(c) => {
63                    let buf = self.take_buf();
64                    match buf.as_str() {
65                        "ENTITY" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityName)),
66                        "NOTATION" | "ELEMENT" | "ATTLIST" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)),
67                        _ => Some(self.error(SyntaxError::UnknownMarkupDeclaration(buf.into()))),
68                    }
69                },
70                _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
71            },
72            DoctypeSubstate::BeforeEntityName => {
73                self.data.name.clear();
74                match t {
75                    Token::Character(c) if is_whitespace_char(c) => None,
76                    Token::Character('%') => { // % is for PEDecl
77                        self.data.name.push('%');
78                        self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinitionStart))
79                    },
80                    Token::Character(c) if is_name_start_char(c) => {
81                        if self.data.name.len() > self.config.max_name_length {
82                            return Some(self.error(SyntaxError::ExceededConfiguredLimit));
83                        }
84                        self.data.name.push(c);
85                        self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityName))
86                    },
87                    _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
88                }
89            },
90            DoctypeSubstate::EntityName => match t {
91                Token::Character(c) if is_whitespace_char(c) => {
92                    self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue))
93                },
94                Token::Character(c) if is_name_char(c) => {
95                    if self.data.name.len() > self.config.max_name_length {
96                        return Some(self.error(SyntaxError::ExceededConfiguredLimit));
97                    }
98                    self.data.name.push(c);
99                    None
100                },
101                _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
102            },
103            DoctypeSubstate::BeforeEntityValue => {
104                self.buf.clear();
105                match t {
106                    Token::Character(c) if is_whitespace_char(c) => None,
107                    // SYSTEM/PUBLIC not supported
108                    Token::Character('S' | 'P') => {
109                        let name = self.data.take_name();
110                        self.entities.entry(name).or_default(); // Dummy value, but at least the name is recognized
111
112                        self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration))
113                    },
114                    Token::SingleQuote | Token::DoubleQuote => {
115                        self.data.quote = super::QuoteToken::from_token(t);
116                        self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue))
117                    },
118                    _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
119                }
120            },
121            DoctypeSubstate::EntityValue => match t {
122                Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => { self.buf.push('\''); None },
123                Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { self.buf.push('"'); None },
124                Token::SingleQuote | Token::DoubleQuote => {
125                    self.data.quote = None;
126                    let name = self.data.take_name();
127                    let val = self.take_buf();
128                    self.entities.entry(name).or_insert(val); // First wins
129                    self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)) // FIXME
130                },
131                Token::ReferenceStart | Token::Character('&') => {
132                    self.data.ref_data.clear();
133                    self.into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReferenceStart))
134                },
135                Token::Character('%') => {
136                    self.data.ref_data.clear();
137                    self.data.ref_data.push('%'); // include literal % in the name to distinguish from regular entities
138                    self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceInValue))
139                },
140                Token::Character(c) if !self.is_valid_xml_char(c) => {
141                    Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
142                },
143                Token::Character(c) => {
144                    self.buf.push(c);
145                    None
146                },
147                _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
148            },
149            DoctypeSubstate::PEReferenceDefinitionStart => match t {
150                Token::Character(c) if is_whitespace_char(c) => None,
151                Token::Character(c) if is_name_start_char(c) => {
152                    debug_assert_eq!(self.data.name, "%");
153                    self.data.name.push(c);
154                    self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinition))
155                },
156                _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
157            },
158            DoctypeSubstate::PEReferenceDefinition => match t {
159                Token::Character(c) if is_name_char(c) => {
160                    if self.data.name.len() > self.config.max_name_length {
161                        return Some(self.error(SyntaxError::ExceededConfiguredLimit));
162                    }
163                    self.data.name.push(c);
164                    None
165                },
166                Token::Character(c) if is_whitespace_char(c) => {
167                    self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue))
168                },
169                _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
170            },
171            DoctypeSubstate::PEReferenceInDtd => match t {
172                Token::Character(c) if is_name_char(c) => {
173                    self.data.ref_data.push(c);
174                    None
175                },
176                Token::ReferenceEnd | Token::Character(';') => {
177                    let name = self.data.take_ref_data();
178                    match self.entities.get(&name) {
179                        Some(ent) => {
180                            if let Err(e) = self.lexer.reparse(ent) {
181                                return Some(Err(e));
182                            }
183                            self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
184                        },
185                        None => Some(self.error(SyntaxError::UndefinedEntity(name.into()))),
186                    }
187                },
188                _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
189            },
190            DoctypeSubstate::PEReferenceInValue => match t {
191                Token::Character(c) if is_name_char(c) => {
192                    self.data.ref_data.push(c);
193                    None
194                },
195                Token::ReferenceEnd | Token::Character(';') => {
196                    let name = self.data.take_ref_data();
197                    match self.entities.get(&name) {
198                        Some(ent) => {
199                            self.buf.push_str(ent);
200                            self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue))
201                        },
202                        None => Some(self.error(SyntaxError::UndefinedEntity(name.into()))),
203                    }
204                },
205                _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
206            },
207            DoctypeSubstate::NumericReferenceStart => match t {
208                Token::Character('#') => {
209                    self.into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReference))
210                },
211                Token::Character(c) if !self.is_valid_xml_char(c) => {
212                    Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
213                },
214                Token::Character(c) => {
215                    self.buf.push('&');
216                    self.buf.push(c);
217                    // named entities are not expanded inside doctype
218                    self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue))
219                },
220                _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
221            },
222            DoctypeSubstate::NumericReference => match t {
223                Token::ReferenceEnd | Token::Character(';') => {
224                    let r = self.data.take_ref_data();
225                    // https://www.w3.org/TR/xml/#sec-entexpand
226                    match self.numeric_reference_from_str(&r) {
227                        Ok(c) => {
228                            self.buf.push(c);
229                            self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue))
230                        },
231                        Err(e) => Some(self.error(e)),
232                    }
233                },
234                Token::Character(c) if !self.is_valid_xml_char(c) => {
235                    Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
236                },
237                Token::Character(c) => {
238                    self.data.ref_data.push(c);
239                    None
240                },
241                _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
242            },
243            DoctypeSubstate::SkipDeclaration => match t {
244                Token::TagEnd => {
245                    self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
246                },
247                _ => None,
248            },
249        }
250    }
251}