1use crate::common::{is_name_char, is_whitespace_char, is_xml10_char, is_xml11_char, Position, TextPosition};
6use crate::reader::error::SyntaxError;
7use crate::reader::{Error, ErrorKind};
8use crate::util::{CharReader, Encoding};
9use std::collections::VecDeque;
10use std::io::Read;
11use std::{fmt, result};
12
13use super::ParserConfig2;
14
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub(crate) enum Token {
    /// `<?`
    ProcessingInstructionStart,
    /// `?>`
    ProcessingInstructionEnd,
    /// `<!DOCTYPE`
    DoctypeStart,
    /// `<`
    OpeningTagStart,
    /// `</`
    ClosingTagStart,
    /// `>`
    TagEnd,
    /// `/>`
    EmptyTagEnd,
    /// `<!--`
    CommentStart,
    /// `-->`
    CommentEnd,
    /// Any single character that is not part of the markup above.
    Character(char),
    /// `=`
    EqualsSign,
    /// `'`
    SingleQuote,
    /// `"`
    DoubleQuote,
    /// `<![CDATA[`
    CDataStart,
    /// `]]>`
    CDataEnd,
    /// `&`
    ReferenceStart,
    /// `;`
    ReferenceEnd,
    /// `<!` followed by a declaration keyword (only emitted inside a DOCTYPE)
    MarkupDeclarationStart,
    /// End of the input stream.
    Eof,
}
58
59impl fmt::Display for Token {
60 #[cold]
61 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
62 match *self {
63 Token::Character(c) => c.fmt(f),
64 other => match other {
65 Token::OpeningTagStart => "<",
66 Token::ProcessingInstructionStart => "<?",
67 Token::DoctypeStart => "<!DOCTYPE",
68 Token::ClosingTagStart => "</",
69 Token::CommentStart => "<!--",
70 Token::CDataStart => "<![CDATA[",
71 Token::TagEnd => ">",
72 Token::EmptyTagEnd => "/>",
73 Token::ProcessingInstructionEnd => "?>",
74 Token::CommentEnd => "-->",
75 Token::CDataEnd => "]]>",
76 Token::ReferenceStart => "&",
77 Token::ReferenceEnd => ";",
78 Token::EqualsSign => "=",
79 Token::SingleQuote => "'",
80 Token::DoubleQuote => "\"",
81 Token::MarkupDeclarationStart => "<!",
82 Token::Eof | Token::Character(_) => {
83 debug_assert!(false);
84 ""
85 },
86 }.fmt(f),
87 }
88 }
89}
90
impl Token {
    /// Returns the fixed spelling of this token, or `None` for the tokens
    /// without one (`Character` and `Eof`).
    pub const fn as_static_str(self) -> Option<&'static str> {
        match self {
            Self::OpeningTagStart => Some("<"),
            Self::ProcessingInstructionStart => Some("<?"),
            Self::DoctypeStart => Some("<!DOCTYPE"),
            Self::ClosingTagStart => Some("</"),
            Self::CommentStart => Some("<!--"),
            Self::CDataStart => Some("<![CDATA["),
            Self::TagEnd => Some(">"),
            Self::EmptyTagEnd => Some("/>"),
            Self::ProcessingInstructionEnd => Some("?>"),
            Self::CommentEnd => Some("-->"),
            Self::CDataEnd => Some("]]>"),
            Self::ReferenceStart => Some("&"),
            Self::ReferenceEnd => Some(";"),
            Self::EqualsSign => Some("="),
            Self::SingleQuote => Some("'"),
            Self::DoubleQuote => Some("\""),
            _ => None
        }
    }

    /// Appends the textual form of this token to `target`.
    ///
    /// `Character` pushes the character itself; `Eof` contributes nothing.
    pub fn push_to_string(self, target: &mut String) {
        match self {
            Self::Character(c) => {
                // The lexer should only ever produce characters that are
                // valid in XML 1.0 or XML 1.1.
                debug_assert!(is_xml10_char(c) || is_xml11_char(c));
                target.push(c);
            },
            _ => if let Some(s) = self.as_static_str() {
                target.push_str(s);
            }
        }
    }
}
127
/// The lexer's internal state machine states.
#[derive(Copy, Clone)]
enum State {
    /// Default state
    Normal,
    /// Entered on `<`
    TagStarted,
    /// Entered on `<!`
    CommentOrCDataOrDoctypeStarted,
    /// Entered on `<!-`
    CommentStarted,
    /// Entered on `<!D`, tracks progress up to `<!DOCTYP`
    DoctypeStarted(DoctypeStartedSubstate),
    /// Inside `<!ELEMENT`/`<!ATTLIST`/... within the internal DTD subset
    InsideMarkupDeclaration,
    /// Entered on `<!DOCTYPE`
    InsideDoctype,
    /// Entered on `<![`, tracks progress up to `<![CDATA`
    CDataStarted(CDataStartedSubstate),
    /// Entered on `?` inside a processing instruction
    ProcessingInstructionClosing,
    /// Entered on `/` (possible `/>`)
    EmptyTagClosing,
    /// Entered on `-` inside a comment, tracks progress up to `--`
    CommentClosing(ClosingSubstate),
    /// Entered on `]` inside CDATA, tracks progress up to `]]`
    CDataClosing(ClosingSubstate),
    /// Entered on `]` outside CDATA, tracks progress up to `]]`
    InvalidCDataClosing(ClosingSubstate),
    /// After `<!--`
    InsideComment,
    /// After `<![CDATA[`
    InsideCdata,
    /// After `<?`
    InsideProcessingInstruction,
    /// After `"` or `'` inside a markup declaration
    InsideMarkupDeclarationQuotedString(QuoteStyle),
}
165
/// Which quote character opened the current quoted string.
#[derive(Copy, Clone, Eq, PartialEq)]
enum QuoteStyle {
    Single, Double
}
170
/// Progress through a two-character closing sequence (`--` or `]]`).
#[derive(Copy, Clone)]
enum ClosingSubstate {
    First, Second
}
175
/// Progress through recognising the `DOCTYPE` keyword after `<!`.
#[derive(Copy, Clone)]
#[allow(clippy::upper_case_acronyms)]
enum DoctypeStartedSubstate {
    D, DO, DOC, DOCT, DOCTY, DOCTYP
}
181
/// Progress through recognising the `CDATA[` keyword; `E` is the initial
/// substate entered right after `<![`.
#[derive(Copy, Clone)]
#[allow(clippy::upper_case_acronyms)]
enum CDataStartedSubstate {
    E, C, CD, CDA, CDAT, CDATA
}
187
/// Convenience alias: most lexer methods yield an optional token or a lexing error.
pub(crate) type Result<T = Option<Token>, E = Error> = result::Result<T, E>;
190
// Drives keyword-recognition substates: for each intermediate substate, the
// expected character advances to the next substate; any other character is an
// error reported with the literal prefix (`$chunk`) seen so far. The final
// substate runs `$e` when its expected character arrives.
macro_rules! dispatch_on_enum_state(
    ($_self:ident, $s:expr, $c:expr, $is:expr,
     $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+;
     $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => (
        match $s {
            $(
            $st => match $c {
                $stc => Ok($_self.move_to($is($next_st))),
                _ => $_self.handle_error($chunk, $c)
            },
            )+
            $end_st => match $c {
                $end_c => $e,
                _ => $_self.handle_error($end_chunk, $c)
            }
        }
    )
);
211
/// A pull-based lexer for XML documents; `next_token` reads the next lexeme
/// from an `std::io::Read` source.
pub(crate) struct Lexer {
    /// Current state-machine state.
    st: State,
    /// Decodes raw bytes into `char`s according to the current encoding.
    reader: CharReader,
    /// Position where the token currently being lexed started.
    pos: TextPosition,
    /// Position of the read head (the next character to be consumed).
    head_pos: TextPosition,
    /// Characters that were unread or injected by `reparse`; consumed before
    /// any new input is read.
    char_queue: VecDeque<char>,
    /// State to fall back to after a nested construct ends
    /// (`Normal`, or `InsideDoctype` within a DTD).
    normal_state: State,
    /// True while in the middle of lexing a token (keeps `pos` stable).
    inside_token: bool,
    /// Set once `Eof` has been reported, so it is reported only once.
    eof_handled: bool,
    /// Current depth of nested entity re-parsing (see `reparse`).
    reparse_depth: u8,
    // Test-only switch: report invalid lexemes as characters instead of errors.
    #[cfg(test)]
    skip_errors: bool,

    /// Entity-expansion limits copied from the parser configuration.
    max_entity_expansion_depth: u8,
    max_entity_expansion_length: usize,
}
238
impl Position for Lexer {
    /// Returns the position of the start of the token last returned by `next_token`.
    #[inline]
    fn position(&self) -> TextPosition { self.pos }
}
244
245impl Lexer {
246 pub(crate) fn new(config: &ParserConfig2) -> Self {
248 Self {
249 reader: CharReader::new(),
250 pos: TextPosition::new(),
251 head_pos: TextPosition::new(),
252 char_queue: VecDeque::with_capacity(4), st: State::Normal,
254 normal_state: State::Normal,
255 inside_token: false,
256 eof_handled: false,
257 reparse_depth: 0,
258 #[cfg(test)]
259 skip_errors: false,
260
261 max_entity_expansion_depth: config.max_entity_expansion_depth,
262 max_entity_expansion_length: config.max_entity_expansion_length,
263 }
264 }
265
    /// Returns the character encoding currently used to decode input.
    pub(crate) fn encoding(&self) -> Encoding {
        self.reader.encoding
    }
269
    /// Overrides the character encoding used to decode further input
    /// (e.g. once an XML declaration has announced one).
    pub(crate) fn set_encoding(&mut self, encoding: Encoding) {
        self.reader.encoding = encoding;
    }
273
    /// Test-only: makes `handle_error` replay invalid input as plain characters.
    #[cfg(test)] fn disable_errors(&mut self) { self.skip_errors = true; }
277
    /// Clears the end-of-stream flag so lexing can resume after `Eof` was reported.
    #[inline]
    pub fn reset_eof_handled(&mut self) { self.eof_handled = false; }
281
    /// Reads the next token from the stream.
    ///
    /// Returns `Ok(Token::Eof)` at the end of input (only once, unless
    /// `reset_eof_handled` is called), `Err` on a syntax error, otherwise the
    /// next token. Afterwards `self.position()` reports where the returned
    /// token started.
    pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result<Token> {
        // Report EOF only once.
        if self.eof_handled {
            return Ok(Token::Eof);
        }

        // Remember where the token being lexed starts.
        if !self.inside_token {
            self.pos = self.head_pos;
            self.inside_token = true;
        }

        // First drain characters that were unread or injected by `reparse`;
        // these do not advance `head_pos`.
        while let Some(c) = self.char_queue.pop_front() {
            if let Some(t) = self.dispatch_char(c)? {
                self.inside_token = false;
                return Ok(t);
            }
        }
        // The queue is empty, so any previous entity expansion is fully consumed.
        self.reparse_depth = 0;
        while let Some(c) = self.reader.next_char_from(b)? {
            if c == '\n' {
                self.head_pos.new_line();
            } else {
                self.head_pos.advance(1);
            }

            if let Some(t) = self.dispatch_char(c)? {
                self.inside_token = false;
                return Ok(t);
            }
        }

        self.end_of_stream()
    }
326
    /// Decides what to report once the underlying reader is exhausted, based
    /// on the state the state machine was left in.
    #[inline(never)]
    fn end_of_stream(&mut self) -> Result<Token> {
        self.eof_handled = true;
        self.pos = self.head_pos;
        match self.st {
            // EOF inside a CDATA section gets its own error.
            State::InsideCdata | State::CDataClosing(_) => Err(self.error(SyntaxError::UnclosedCdata)),
            // EOF inside any other unfinished construct.
            State::TagStarted | State::CommentOrCDataOrDoctypeStarted |
            State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) |
            State::CommentClosing(ClosingSubstate::Second) |
            State::InsideComment | State::InsideMarkupDeclaration |
            State::InsideProcessingInstruction | State::ProcessingInstructionClosing |
            State::InsideDoctype | State::InsideMarkupDeclarationQuotedString(_) =>
                Err(self.error(SyntaxError::UnexpectedEof)),
            // A pending '/', '-' or ']' turned out not to start a longer
            // lexeme; flush it as a plain character.
            State::EmptyTagClosing =>
                Ok(Token::Character('/')),
            State::CommentClosing(ClosingSubstate::First) =>
                Ok(Token::Character('-')),
            State::InvalidCDataClosing(ClosingSubstate::First) =>
                Ok(Token::Character(']')),
            State::InvalidCDataClosing(ClosingSubstate::Second) => {
                // Two pending ']': emit one now, queue the other, and keep
                // the stream "open" so the second is delivered next.
                self.eof_handled = false;
                Ok(self.move_to_with_unread(State::Normal, &[']'], Token::Character(']')))
            },
            State::Normal => Ok(Token::Eof),
        }
    }
354
    /// Builds a syntax error carrying the lexer's current token position.
    #[cold]
    #[allow(clippy::needless_pass_by_value)]
    fn error(&self, e: SyntaxError) -> Error {
        Error {
            pos: self.position(),
            kind: ErrorKind::Syntax(e.to_cow()),
        }
    }
363
    /// Routes a single character to the handler for the current state.
    ///
    /// Returns `Ok(None)` when the character was consumed without completing
    /// a token, `Ok(Some(..))` when a token is ready.
    #[inline(never)]
    fn dispatch_char(&mut self, c: char) -> Result {
        match self.st {
            State::Normal => Ok(self.normal(c)),
            State::TagStarted => self.tag_opened(c),
            State::EmptyTagClosing => Ok(Some(self.empty_element_closing(c))),
            State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c),
            State::InsideCdata => Ok(self.inside_cdata(c)),
            State::CDataStarted(s) => self.cdata_started(c, s),
            State::InsideComment => Ok(self.inside_comment_state(c)),
            State::CommentStarted => self.comment_started(c),
            State::InsideProcessingInstruction => Ok(self.inside_processing_instruction(c)),
            State::ProcessingInstructionClosing => Ok(Some(self.processing_instruction_closing(c))),
            State::CommentClosing(s) => self.comment_closing(c, s),
            State::CDataClosing(s) => Ok(self.cdata_closing(c, s)),
            State::InsideDoctype => Ok(self.inside_doctype(c)),
            State::DoctypeStarted(s) => self.doctype_started(c, s),
            State::InvalidCDataClosing(s) => Ok(self.invalid_cdata_closing(c, s)),
            State::InsideMarkupDeclaration => self.markup_declaration(c),
            State::InsideMarkupDeclarationQuotedString(q) => Ok(Some(self.markup_declaration_string(c, q))),
        }
    }
386
    /// Switches to `st` without emitting a token.
    #[inline]
    fn move_to(&mut self, st: State) -> Option<Token> {
        self.st = st;
        None
    }
392
    /// Switches to `st` and emits `token`.
    #[inline]
    fn move_to_with(&mut self, st: State, token: Token) -> Token {
        self.st = st;
        token
    }
398
    /// Switches to `st`, also making it the new fallback "normal" state
    /// (used when entering and leaving a DOCTYPE), and emits `token`.
    #[inline]
    fn move_to_and_reset_normal(&mut self, st: State, token: Token) -> Token {
        self.normal_state = st;
        self.st = st;
        token
    }
405
406 fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Token {
407 for c in cs.iter().rev().copied() {
408 self.char_queue.push_front(c);
409 }
410 self.move_to_with(st, token)
411 }
412
    /// Injects `markup` (e.g. an entity's replacement text) so that it is
    /// lexed before any further input.
    ///
    /// Enforces the configured expansion depth and length limits to guard
    /// against entity-expansion attacks; returns `EntityTooBig` when exceeded.
    pub(crate) fn reparse(&mut self, markup: &str) -> Result<()> {
        if markup.is_empty() {
            return Ok(());
        }

        // Depth is reset in `next_token` once the queue has fully drained.
        self.reparse_depth += 1;
        if self.reparse_depth > self.max_entity_expansion_depth || self.char_queue.len() > self.max_entity_expansion_length {
            return Err(self.error(SyntaxError::EntityTooBig));
        }

        self.eof_handled = false;
        self.char_queue.reserve(markup.len());
        // Push in reverse so the markup is popped in its original order.
        for c in markup.chars().rev() {
            self.char_queue.push_front(c);
        }

        Ok(())
    }
431
    /// Reports an unexpected character `c` seen after the literal prefix `chunk`.
    ///
    /// In tests with `skip_errors` set, the prefix and `c` are replayed as
    /// plain characters instead of failing.
    fn handle_error(&mut self, chunk: &'static str, c: char) -> Result {
        debug_assert!(!chunk.is_empty());

        #[cfg(test)]
        if self.skip_errors {
            let mut chars = chunk.chars();
            let first = chars.next().unwrap_or('\0');
            self.char_queue.extend(chars);
            self.char_queue.push_back(c);
            return Ok(Some(self.move_to_with(State::Normal, Token::Character(first))));
        }
        Err(self.error(SyntaxError::UnexpectedTokenBefore(chunk, c)))
    }
445
446 fn normal(&mut self, c: char) -> Option<Token> {
448 match c {
449 '<' => self.move_to(State::TagStarted),
450 '>' => Some(Token::TagEnd),
451 '/' => self.move_to(State::EmptyTagClosing),
452 '=' => Some(Token::EqualsSign),
453 '"' => Some(Token::DoubleQuote),
454 '\'' => Some(Token::SingleQuote),
455 ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::First)),
456 '&' => Some(Token::ReferenceStart),
457 ';' => Some(Token::ReferenceEnd),
458 _ => Some(Token::Character(c))
459 }
460 }
461
462 fn inside_cdata(&mut self, c: char) -> Option<Token> {
463 match c {
464 ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)),
465 _ => Some(Token::Character(c)),
466 }
467 }
468
469 fn inside_processing_instruction(&mut self, c: char) -> Option<Token> {
470 match c {
472 '?' => self.move_to(State::ProcessingInstructionClosing),
473 '<' => Some(Token::OpeningTagStart),
474 '>' => Some(Token::TagEnd),
475 '/' => Some(Token::ClosingTagStart),
476 '=' => Some(Token::EqualsSign),
477 '"' => Some(Token::DoubleQuote),
478 '\'' => Some(Token::SingleQuote),
479 '&' => Some(Token::ReferenceStart),
480 ';' => Some(Token::ReferenceEnd),
481 _ => Some(Token::Character(c))
482 }
483 }
484
485 fn inside_comment_state(&mut self, c: char) -> Option<Token> {
486 match c {
487 '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)),
488 _ => Some(Token::Character(c)),
489 }
490 }
491
492 fn tag_opened(&mut self, c: char) -> Result {
494 match c {
495 '?' => Ok(Some(self.move_to_with(State::InsideProcessingInstruction, Token::ProcessingInstructionStart))),
496 '/' => Ok(Some(self.move_to_with(self.normal_state, Token::ClosingTagStart))),
497 '!' => Ok(self.move_to(State::CommentOrCDataOrDoctypeStarted)),
498 _ if is_whitespace_char(c) => Ok(Some(self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart))),
499 _ if is_name_char(c) => Ok(Some(self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart))),
500 _ => self.handle_error("<", c)
501 }
502 }
503
    /// Encountered '<!': a comment, CDATA section, DOCTYPE, or (inside a DTD)
    /// a markup declaration such as `<!ELEMENT`, `<!ATTLIST`, `<!ENTITY` or
    /// `<!NOTATION`.
    fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result {
        match c {
            '-' => Ok(self.move_to(State::CommentStarted)),
            '[' => Ok(self.move_to(State::CDataStarted(CDataStartedSubstate::E))),
            'D' => Ok(self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D))),
            // Markup declarations are only recognised inside a DOCTYPE.
            'E' | 'A' | 'N' if matches!(self.normal_state, State::InsideDoctype) => {
                Ok(Some(self.move_to_with_unread(State::InsideMarkupDeclaration, &[c], Token::MarkupDeclarationStart)))
            },
            _ => self.handle_error("<!", c),
        }
    }
516
517 fn comment_started(&mut self, c: char) -> Result {
519 match c {
520 '-' => Ok(Some(self.move_to_with(State::InsideComment, Token::CommentStart))),
521 _ => self.handle_error("<!-", c),
522 }
523 }
524
    /// Encountered '<![': walk through the `CDATA[` keyword one character at
    /// a time via the dispatch macro.
    fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result {
        use self::CDataStartedSubstate::{C, CD, CDA, CDAT, CDATA, E};
        dispatch_on_enum_state!(self, s, c, State::CDataStarted,
            E ; 'C' ; C ; "<![",
            C ; 'D' ; CD ; "<![C",
            CD ; 'A' ; CDA ; "<![CD",
            CDA ; 'T' ; CDAT ; "<![CDA",
            CDAT ; 'A' ; CDATA ; "<![CDAT";
            CDATA ; '[' ; "<![CDATA" ; Ok(Some(self.move_to_with(State::InsideCdata, Token::CDataStart)))
        )
    }
537
    /// Inside a markup declaration (`<!ELEMENT ...` etc.) within the internal
    /// DTD subset.
    fn markup_declaration(&mut self, c: char) -> Result {
        match c {
            // '<' may not appear inside a declaration.
            '<' => self.handle_error("<!", c),
            '>' => Ok(Some(self.move_to_with(self.normal_state, Token::TagEnd))),
            '&' => Ok(Some(Token::ReferenceStart)),
            ';' => Ok(Some(Token::ReferenceEnd)),
            // Quotes switch into the quoted-string state so that '>' etc.
            // inside the literal are treated as plain characters.
            '"' => Ok(Some(self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Double), Token::DoubleQuote))),
            '\'' => Ok(Some(self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Single), Token::SingleQuote))),
            _ => Ok(Some(Token::Character(c))),
        }
    }
550
551 fn markup_declaration_string(&mut self, c: char, q: QuoteStyle) -> Token {
552 match c {
553 '"' if q == QuoteStyle::Double => self.move_to_with(State::InsideMarkupDeclaration, Token::DoubleQuote),
554 '\'' if q == QuoteStyle::Single => self.move_to_with(State::InsideMarkupDeclaration, Token::SingleQuote),
555 _ => Token::Character(c),
556 }
557 }
558
    /// Encountered '<!D': walk through the `OCTYPE` keyword one character at
    /// a time via the dispatch macro.
    fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result {
        use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP};
        dispatch_on_enum_state!(self, s, c, State::DoctypeStarted,
            D ; 'O' ; DO ; "<!D",
            DO ; 'C' ; DOC ; "<!DO",
            DOC ; 'T' ; DOCT ; "<!DOC",
            DOCT ; 'Y' ; DOCTY ; "<!DOCT",
            DOCTY ; 'P' ; DOCTYP ; "<!DOCTY";
            DOCTYP ; 'E' ; "<!DOCTYP" ; Ok(Some(self.move_to_and_reset_normal(State::InsideDoctype, Token::DoctypeStart)))
        )
    }
571
572 fn inside_doctype(&mut self, c: char) -> Option<Token> {
574 match c {
575 '>' => Some(self.move_to_and_reset_normal(State::Normal, Token::TagEnd)),
576 '<' => self.move_to(State::TagStarted),
577 '&' => Some(Token::ReferenceStart),
578 ';' => Some(Token::ReferenceEnd),
579 '"' => Some(Token::DoubleQuote),
580 '\'' => Some(Token::SingleQuote),
581 _ => Some(Token::Character(c)),
582 }
583 }
584
585 fn processing_instruction_closing(&mut self, c: char) -> Token {
587 match c {
588 '>' => self.move_to_with(self.normal_state, Token::ProcessingInstructionEnd),
589 _ => self.move_to_with_unread(State::InsideProcessingInstruction, &[c], Token::Character('?')),
590 }
591 }
592
593 fn empty_element_closing(&mut self, c: char) -> Token {
595 match c {
596 '>' => self.move_to_with(self.normal_state, Token::EmptyTagEnd),
597 _ => self.move_to_with_unread(self.normal_state, &[c], Token::Character('/')),
598 }
599 }
600
    /// Encountered '-' inside a comment: may be the start of the `-->` terminator.
    fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                '-' => Ok(self.move_to(State::CommentClosing(ClosingSubstate::Second))),
                // Lone '-': emit it, then re-dispatch `c` inside the comment.
                _ => Ok(Some(self.move_to_with_unread(State::InsideComment, &[c], Token::Character('-')))),
            },
            ClosingSubstate::Second => match c {
                '>' => Ok(Some(self.move_to_with(self.normal_state, Token::CommentEnd))),
                // `--` not followed by '>' is not allowed inside a comment.
                _ => self.handle_error("--", c),
            },
        }
    }
615
    /// Encountered ']' inside CDATA: may be the start of the `]]>` terminator.
    fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Option<Token> {
        match s {
            ClosingSubstate::First => match c {
                ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)),
                // Lone ']': emit it, then re-dispatch `c` inside the CDATA.
                _ => Some(self.move_to_with_unread(State::InsideCdata, &[c], Token::Character(']'))),
            },
            ClosingSubstate::Second => match c {
                '>' => Some(self.move_to_with(State::Normal, Token::CDataEnd)),
                // `]]` without '>': emit the first ']', replay the second and `c`.
                _ => Some(self.move_to_with_unread(State::InsideCdata, &[']', c], Token::Character(']'))),
            },
        }
    }
629
    /// Encountered ']' in regular content: tracks a possible `]]>` sequence,
    /// which is not allowed outside CDATA sections.
    fn invalid_cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Option<Token> {
        match s {
            ClosingSubstate::First => match c {
                ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::Second)),
                _ => Some(self.move_to_with_unread(State::Normal, &[c], Token::Character(']'))),
            },
            ClosingSubstate::Second => match c {
                // NOTE(review): `]]>` outside CDATA is still lexed as `CDataEnd`;
                // presumably rejected downstream by the parser — TODO confirm.
                '>' => Some(self.move_to_with(self.normal_state, Token::CDataEnd)),
                _ => Some(self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']'))),
            },
        }
    }
643}
644
#[cfg(test)]
mod tests {
    use crate::{common::Position, reader::ParserConfig2};
    use std::io::{BufReader, Cursor};

    use super::{Lexer, Token};

    /// Asserts that the lexer yields exactly the given tokens, in order.
    macro_rules! assert_oks(
        (for $lex:ident and $buf:ident ; $($e:expr)+) => ({
            $(
                assert_eq!(Ok($e), $lex.next_token(&mut $buf));
            )+
        })
    );

    /// Asserts that the next lexer call fails at the given row and column.
    /// `$s` documents the expected message; only the position is checked.
    macro_rules! assert_err(
        (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({
            let err = $lex.next_token(&mut $buf);
            assert!(err.is_err());
            let err = err.unwrap_err();
            assert_eq!($r as u64, err.position().row);
            assert_eq!($c as u64, err.position().column);
        })
    );

    /// Asserts that the lexer reports end of stream.
    macro_rules! assert_none(
        (for $lex:ident and $buf:ident) => (
            assert_eq!(Ok(Token::Eof), $lex.next_token(&mut $buf))
        )
    );

    /// Builds a default-configured lexer and a buffered reader over `s`.
    fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) {
        (Lexer::new(&ParserConfig2::default()), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
    }

    #[test]
    fn tricky_pi() {
        let (mut lex, mut buf) = make_lex_and_buf(r"<?x<!-- &??><x>");

        assert_oks!(for lex and buf ;
            Token::ProcessingInstructionStart
            Token::Character('x')
            Token::OpeningTagStart
            Token::Character('!')
            Token::Character('-')
            Token::Character('-')
            Token::Character(' ')
            Token::ReferenceStart
            Token::Character('?')
            Token::ProcessingInstructionEnd
            Token::OpeningTagStart
            Token::Character('x')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn reparser() {
        let (mut lex, mut buf) = make_lex_and_buf(r"&a;");

        assert_oks!(for lex and buf ;
            Token::ReferenceStart
            Token::Character('a')
            Token::ReferenceEnd
        );
        lex.reparse("<hi/>").unwrap();
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('h')
            Token::Character('i')
            Token::EmptyTagEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn simple_lexer_test() {
        // NOTE(review): the input literal had lost the tab after 'd' and the
        // trailing `&nbsp;` that the expected token stream below asserts
        // (Character('\t') and ReferenceStart..ReferenceEnd); restored here
        // using an escaped (non-raw) string literal.
        let (mut lex, mut buf) = make_lex_and_buf(
            "<a p='q'> x<b z=\"y\">d\t</b></a><p/> <?nm ?> <!-- a c --> &nbsp;"
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::Character(' ')
            Token::Character('p')
            Token::EqualsSign
            Token::SingleQuote
            Token::Character('q')
            Token::SingleQuote
            Token::TagEnd
            Token::Character(' ')
            Token::Character('x')
            Token::OpeningTagStart
            Token::Character('b')
            Token::Character(' ')
            Token::Character('z')
            Token::EqualsSign
            Token::DoubleQuote
            Token::Character('y')
            Token::DoubleQuote
            Token::TagEnd
            Token::Character('d')
            Token::Character('\t')
            Token::ClosingTagStart
            Token::Character('b')
            Token::TagEnd
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
            Token::OpeningTagStart
            Token::Character('p')
            Token::EmptyTagEnd
            Token::Character(' ')
            Token::ProcessingInstructionStart
            Token::Character('n')
            Token::Character('m')
            Token::Character(' ')
            Token::ProcessingInstructionEnd
            Token::Character(' ')
            Token::CommentStart
            Token::Character(' ')
            Token::Character('a')
            Token::Character(' ')
            Token::Character('c')
            Token::Character(' ')
            Token::CommentEnd
            Token::Character(' ')
            Token::ReferenceStart
            Token::Character('n')
            Token::Character('b')
            Token::Character('s')
            Token::Character('p')
            Token::ReferenceEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn special_chars_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r"?x!+ // -| ]z]]"
        );

        assert_oks!(for lex and buf ;
            Token::Character('?')
            Token::Character('x')
            Token::Character('!')
            Token::Character('+')
            Token::Character(' ')
            Token::Character('/')
            Token::Character('/')
            Token::Character(' ')
            Token::Character('-')
            Token::Character('|')
            Token::Character(' ')
            Token::Character(']')
            Token::Character('z')
            Token::Character(']')
            Token::Character(']')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn cdata_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r"<a><![CDATA[x y ?]]> </a>"
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::CDataStart
            Token::Character('x')
            Token::Character(' ')
            Token::Character('y')
            Token::Character(' ')
            Token::Character('?')
            Token::CDataEnd
            Token::Character(' ')
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn cdata_closers_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r"<![CDATA[] > ]> ]]><!---->]]<a>"
        );

        assert_oks!(for lex and buf ;
            Token::CDataStart
            Token::Character(']')
            Token::Character(' ')
            Token::Character('>')
            Token::Character(' ')
            Token::Character(']')
            Token::Character('>')
            Token::Character(' ')
            Token::CDataEnd
            Token::CommentStart
            Token::CommentEnd
            Token::Character(']')
            Token::Character(']')
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn doctype_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r"<a><!DOCTYPE ab xx z> "
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::Character(' ')
            Token::Character('a')
            Token::Character('b')
            Token::Character(' ')
            Token::Character('x')
            Token::Character('x')
            Token::Character(' ')
            Token::Character('z')
            Token::TagEnd
            Token::Character(' ')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn tricky_comments() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r"<a><!-- C ->--></a>"
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::CommentStart
            Token::Character(' ')
            Token::Character('C')
            Token::Character(' ')
            Token::Character('-')
            Token::Character('>')
            Token::CommentEnd
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn doctype_with_internal_subset_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!DOCTYPE ab[<!ELEMENT ba ">>>"> ]> "#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::Character(' ')
            Token::Character('a')
            Token::Character('b')
            Token::Character('[')
            Token::MarkupDeclarationStart
            Token::Character('E')
            Token::Character('L')
            Token::Character('E')
            Token::Character('M')
            Token::Character('E')
            Token::Character('N')
            Token::Character('T')
            Token::Character(' ')
            Token::Character('b')
            Token::Character('a')
            Token::Character(' ')
            Token::DoubleQuote
            Token::Character('>')
            Token::Character('>')
            Token::Character('>')
            Token::DoubleQuote
            Token::TagEnd
            Token::Character(' ')
            Token::Character(']')
            Token::TagEnd
            Token::Character(' ')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn doctype_internal_pi_comment() {
        let (mut lex, mut buf) = make_lex_and_buf(
            "<!DOCTYPE a [\n<!ELEMENT l ANY> <!-- <?non?>--> <?pi > ?> \n]>"
        );
        assert_oks!(for lex and buf ;
            Token::DoctypeStart
            Token::Character(' ')
            Token::Character('a')
            Token::Character(' ')
            Token::Character('[')
            Token::Character('\n')
            Token::MarkupDeclarationStart
            Token::Character('E')
            Token::Character('L')
            Token::Character('E')
            Token::Character('M')
            Token::Character('E')
            Token::Character('N')
            Token::Character('T')
            Token::Character(' ')
            Token::Character('l')
            Token::Character(' ')
            Token::Character('A')
            Token::Character('N')
            Token::Character('Y')
            Token::TagEnd
            Token::Character(' ')
            Token::CommentStart
            Token::Character(' ')
            Token::Character('<')
            Token::Character('?')
            Token::Character('n')
            Token::Character('o')
            Token::Character('n')
            Token::Character('?')
            Token::Character('>')
            Token::CommentEnd
            Token::Character(' ')
            Token::ProcessingInstructionStart
            Token::Character('p')
            Token::Character('i')
            Token::Character(' ')
            Token::TagEnd
            Token::Character(' ')
            Token::ProcessingInstructionEnd
            Token::Character(' ')
            Token::Character('\n')
            Token::Character(']')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn end_of_stream_handling_ok() {
        macro_rules! eof_check(
            ($data:expr ; $token:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_oks!(for lex and buf ; $token);
                assert_none!(for lex and buf);
            })
        );
        eof_check!("?" ; Token::Character('?'));
        eof_check!("/" ; Token::Character('/'));
        eof_check!("-" ; Token::Character('-'));
        // NOTE(review): the three identical "]" checks below look like they
        // were meant to also cover "]]" and "]]]" — TODO confirm intent.
        eof_check!("]" ; Token::Character(']'));
        eof_check!("]" ; Token::Character(']'));
        eof_check!("]" ; Token::Character(']'));
    }

    #[test]
    fn end_of_stream_handling_error() {
        macro_rules! eof_check(
            ($data:expr; $r:expr, $c:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream");
                assert_none!(for lex and buf);
            })
        );
        eof_check!("<" ; 0, 1);
        eof_check!("<!" ; 0, 2);
        eof_check!("<!-" ; 0, 3);
        eof_check!("<![" ; 0, 3);
        eof_check!("<![C" ; 0, 4);
        eof_check!("<![CD" ; 0, 5);
        eof_check!("<![CDA" ; 0, 6);
        eof_check!("<![CDAT" ; 0, 7);
        eof_check!("<![CDATA" ; 0, 8);
    }

    #[test]
    fn error_in_comment_or_cdata_prefix() {
        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!' before 'x'"
        );

        // With errors disabled the invalid prefix is replayed as characters.
        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Character('<')
            Token::Character('!')
            Token::Character('x')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn error_in_comment_started() {
        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!-' before '\t'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Character('<')
            Token::Character('!')
            Token::Character('-')
            Token::Character('\t')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn error_in_comment_two_dashes_not_at_end() {
        // Force the lexer into comment state to hit the `--` rule directly.
        let (mut lex, mut buf) = make_lex_and_buf("--x");
        lex.st = super::State::InsideComment;
        assert_err!(for lex and buf expect row 0; 0,
            "Unexpected token '--' before 'x'"
        );

        // Outside a comment, "--x" is just three characters.
        let (mut lex, mut buf) = make_lex_and_buf("--x");
        assert_oks!(for lex and buf ;
            Token::Character('-')
            Token::Character('-')
            Token::Character('x')
        );
    }

    /// Checks that `$data` errors at `$r`/`$c` normally, and that with errors
    /// disabled it is replayed as the characters of `$chunk` followed by `$app`.
    macro_rules! check_case(
        ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({
            let (mut lex, mut buf) = make_lex_and_buf($data);
            assert_err!(for lex and buf expect row $r ; $c, $s);

            let (mut lex, mut buf) = make_lex_and_buf($data);
            lex.disable_errors();
            for c in $chunk.chars() {
                assert_eq!(Ok(Token::Character(c)), lex.next_token(&mut buf));
            }
            assert_oks!(for lex and buf ;
                Token::Character($app)
            );
            assert_none!(for lex and buf);
        })
    );

    #[test]
    fn token_size() {
        assert_eq!(4, std::mem::size_of::<Token>());
        assert_eq!(2, std::mem::size_of::<super::State>());
    }

    #[test]
    fn error_in_cdata_started() {
        check_case!("<![", '['; "<![[" ; 0, 0, "Unexpected token '<![' before '['");
        check_case!("<![C", '['; "<![C[" ; 0, 0, "Unexpected token '<![C' before '['");
        check_case!("<![CD", '['; "<![CD[" ; 0, 0, "Unexpected token '<![CD' before '['");
        check_case!("<![CDA", '['; "<![CDA[" ; 0, 0, "Unexpected token '<![CDA' before '['");
        check_case!("<![CDAT", '['; "<![CDAT[" ; 0, 0, "Unexpected token '<![CDAT' before '['");
        check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'");
    }

    #[test]
    fn error_in_doctype_started() {
        check_case!("<!D", 'a'; "<!Da" ; 0, 0, "Unexpected token '<!D' before 'a'");
        check_case!("<!DO", 'b'; "<!DOb" ; 0, 0, "Unexpected token '<!DO' before 'b'");
        check_case!("<!DOC", 'c'; "<!DOCc" ; 0, 0, "Unexpected token '<!DOC' before 'c'");
        check_case!("<!DOCT", 'd'; "<!DOCTd" ; 0, 0, "Unexpected token '<!DOCT' before 'd'");
        check_case!("<!DOCTY", 'e'; "<!DOCTYe" ; 0, 0, "Unexpected token '<!DOCTY' before 'e'");
        check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'");
    }

    #[test]
    fn issue_98_cdata_ending_with_right_bracket() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r"<![CDATA[Foo [Bar]]]>"
        );

        assert_oks!(for lex and buf ;
            Token::CDataStart
            Token::Character('F')
            Token::Character('o')
            Token::Character('o')
            Token::Character(' ')
            Token::Character('[')
            Token::Character('B')
            Token::Character('a')
            Token::Character('r')
            Token::Character(']')
            Token::CDataEnd
        );
        assert_none!(for lex and buf);
    }
}