xml/reader/parser/
outside_tag.rs

1use crate::common::is_whitespace_char;
2use crate::reader::error::SyntaxError;
3use crate::reader::events::XmlEvent;
4use crate::reader::lexer::Token;
5
6use super::{
7    ClosingTagSubstate, DoctypeSubstate, Encountered, OpeningTagSubstate,
8    ProcessingInstructionSubstate, PullParser, Result, State,
9};
10
11impl PullParser {
12    pub fn outside_tag(&mut self, t: Token) -> Option<Result> {
13        match t {
14            Token::Character(c) => {
15                if is_whitespace_char(c) {
16                    // skip whitespace outside of the root element
17                    if (self.config.c.trim_whitespace && self.buf.is_empty()) ||
18                        (self.depth() == 0 && self.config.c.ignore_root_level_whitespace) {
19                            return None;
20                    }
21                } else {
22                    self.inside_whitespace = false;
23                    if self.depth() == 0 {
24                        return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t)));
25                    }
26                }
27
28                if !self.is_valid_xml_char_not_restricted(c) {
29                    return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)));
30                }
31
32                if self.buf.is_empty() {
33                    self.push_pos();
34                } else if self.buf.len() > self.config.max_data_length {
35                    return Some(self.error(SyntaxError::ExceededConfiguredLimit));
36                }
37                self.buf.push(c);
38                None
39            },
40
41            Token::CommentEnd | Token::TagEnd | Token::EqualsSign |
42            Token::DoubleQuote | Token::SingleQuote |
43            Token::ProcessingInstructionEnd | Token::EmptyTagEnd => {
44                if self.depth() == 0 {
45                    return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t)));
46                }
47                self.inside_whitespace = false;
48
49                if let Some(s) = t.as_static_str() {
50                    if self.buf.is_empty() {
51                        self.push_pos();
52                    } else if self.buf.len() > self.config.max_data_length {
53                        return Some(self.error(SyntaxError::ExceededConfiguredLimit));
54                    }
55
56                    self.buf.push_str(s);
57                }
58                None
59            },
60
61            Token::ReferenceStart if self.depth() > 0 => {
62                self.state_after_reference = State::OutsideTag;
63                self.into_state_continue(State::InsideReference)
64            },
65
66            Token::ReferenceEnd if self.depth() > 0 => { // Semi-colon in a text outside an entity
67                self.inside_whitespace = false;
68                if self.buf.len() > self.config.max_data_length {
69                    return Some(self.error(SyntaxError::ExceededConfiguredLimit));
70                }
71                Token::ReferenceEnd.push_to_string(&mut self.buf);
72                None
73            },
74
75            Token::CommentStart if self.config.c.coalesce_characters && self.config.c.ignore_comments => {
76                let next_event = self.set_encountered(Encountered::Comment);
77                // We need to switch the lexer into a comment mode inside comments
78                self.into_state(State::InsideComment, next_event)
79            }
80
81            Token::CDataStart if self.depth() > 0 && self.config.c.coalesce_characters && self.config.c.cdata_to_characters => {
82                if self.buf.is_empty() {
83                    self.push_pos(); // CDataEnd will pop pos if the buffer remains empty
84                }
85                // if coalescing chars, continue without event
86                self.into_state_continue(State::InsideCData)
87            },
88
89            _ => {
90                // Encountered some markup event, flush the buffer as characters
91                // or a whitespace
92                let mut next_event = if self.buf_has_data() {
93                    let buf = self.take_buf();
94                    if self.inside_whitespace && self.config.c.trim_whitespace {
95                        // there will be no event emitted for this, but start of buffering has pushed a pos
96                        self.next_pos();
97                        None
98                    } else if self.inside_whitespace && !self.config.c.whitespace_to_characters {
99                        debug_assert!(buf.chars().all(|ch| ch.is_whitespace()), "ws={buf:?}");
100                        Some(Ok(XmlEvent::Whitespace(buf)))
101                    } else if self.config.c.trim_whitespace {
102                        Some(Ok(XmlEvent::Characters(buf.trim_matches(is_whitespace_char).into())))
103                    } else {
104                        Some(Ok(XmlEvent::Characters(buf)))
105                    }
106                } else { None };
107                self.inside_whitespace = true;  // Reset inside_whitespace flag
108
109                // pos is popped whenever an event is emitted, so pushes must happen only if there will be an event to balance it
110                // and ignored comments don't pop
111                if t != Token::CommentStart || !self.config.c.ignore_comments {
112                    self.push_pos();
113                }
114                match t {
115                    Token::OpeningTagStart if self.depth() > 0 || self.encountered < Encountered::Element || self.config.allow_multiple_root_elements => {
116                        if let Some(e) = self.set_encountered(Encountered::Element) {
117                            next_event = Some(e);
118                        }
119                        self.nst.push_empty();
120                        self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event)
121                    },
122
123                    Token::ClosingTagStart if self.depth() > 0 =>
124                        self.into_state(State::InsideClosingTag(ClosingTagSubstate::CTInsideName), next_event),
125
126                    Token::CommentStart => {
127                        if let Some(e) = self.set_encountered(Encountered::Comment) {
128                            next_event = Some(e);
129                        }
130                        // We need to switch the lexer into a comment mode inside comments
131                        self.into_state(State::InsideComment, next_event)
132                    },
133
134                    Token::DoctypeStart if self.encountered < Encountered::Doctype => {
135                        if let Some(e) = self.set_encountered(Encountered::Doctype) {
136                            next_event = Some(e);
137                        }
138                        self.data.doctype = Some(Token::DoctypeStart.to_string());
139
140                        // We don't have a doctype event so skip this position
141                        // FIXME: update when we have a doctype event
142                        self.next_pos();
143                        self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event)
144                    },
145
146                    Token::ProcessingInstructionStart =>
147                        self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event),
148
149                    Token::CDataStart if self.depth() > 0 => {
150                        self.into_state(State::InsideCData, next_event)
151                    },
152
153                    _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
154                }
155            },
156        }
157    }
158
159    pub fn document_start(&mut self, t: Token) -> Option<Result> {
160        debug_assert!(self.encountered < Encountered::Declaration);
161
162        match t {
163            Token::Character(c) => {
164                let next_event = self.set_encountered(Encountered::AnyChars);
165
166                if !is_whitespace_char(c) {
167                    return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t)));
168                }
169                self.inside_whitespace = true;
170
171                // skip whitespace outside of the root element
172                if (self.config.c.trim_whitespace && self.buf.is_empty()) ||
173                    (self.depth() == 0 && self.config.c.ignore_root_level_whitespace) {
174                        return self.into_state(State::OutsideTag, next_event);
175                }
176
177                self.push_pos();
178                self.buf.push(c);
179                self.into_state(State::OutsideTag, next_event)
180            },
181
182            Token::CommentStart => {
183                let next_event = self.set_encountered(Encountered::Comment);
184                self.into_state(State::InsideComment, next_event)
185            },
186
187            Token::OpeningTagStart => {
188                let next_event = self.set_encountered(Encountered::Element);
189                self.nst.push_empty();
190                self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event)
191            },
192
193            Token::DoctypeStart => {
194                let next_event = self.set_encountered(Encountered::Doctype);
195                self.data.doctype = Some(Token::DoctypeStart.to_string());
196
197                // We don't have a doctype event so skip this position
198                // FIXME: update when we have a doctype event
199                self.next_pos();
200                self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event)
201            },
202
203            Token::ProcessingInstructionStart => {
204                self.push_pos();
205                self.into_state_continue(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName))
206            },
207
208            _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
209        }
210    }
211}