source: code/trunk/vendor/golang.org/x/net/html/parse.go@ 145

Last change on this file since 145 was 145, checked in by Izuru Yakumo, 22 months ago

Updated the Makefile and vendored dependencies

Signed-off-by: Izuru Yakumo <yakumo.izuru@…>

File size: 58.7 KB
Line 
1// Copyright 2010 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package html
6
7import (
8 "errors"
9 "fmt"
10 "io"
11 "strings"
12
13 a "golang.org/x/net/html/atom"
14)
15
16// A parser implements the HTML5 parsing algorithm:
17// https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
18type parser struct {
19 // tokenizer provides the tokens for the parser.
20 tokenizer *Tokenizer
21 // tok is the most recently read token.
22 tok Token
23 // Self-closing tags like <hr/> are treated as start tags, except that
24 // hasSelfClosingToken is set while they are being processed.
25 hasSelfClosingToken bool
26 // doc is the document root element.
27 doc *Node
28 // The stack of open elements (section 12.2.4.2) and active formatting
29 // elements (section 12.2.4.3).
30 oe, afe nodeStack
31 // Element pointers (section 12.2.4.4).
32 head, form *Node
33 // Other parsing state flags (section 12.2.4.5).
34 scripting, framesetOK bool
35 // The stack of template insertion modes
36 templateStack insertionModeStack
37 // im is the current insertion mode.
38 im insertionMode
39 // originalIM is the insertion mode to go back to after completing a text
40 // or inTableText insertion mode.
41 originalIM insertionMode
42 // fosterParenting is whether new elements should be inserted according to
43 // the foster parenting rules (section 12.2.6.1).
44 fosterParenting bool
45 // quirks is whether the parser is operating in "quirks mode."
46 quirks bool
47 // fragment is whether the parser is parsing an HTML fragment.
48 fragment bool
49 // context is the context element when parsing an HTML fragment
50 // (section 12.4).
51 context *Node
52}
53
54func (p *parser) top() *Node {
55 if n := p.oe.top(); n != nil {
56 return n
57 }
58 return p.doc
59}
60
61// Stop tags for use in popUntil. These come from section 12.2.4.2.
62var (
63 defaultScopeStopTags = map[string][]a.Atom{
64 "": {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template},
65 "math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext},
66 "svg": {a.Desc, a.ForeignObject, a.Title},
67 }
68)
69
// scope selects which set of stop tags is applied when searching or clearing
// the stack of open elements.
type scope int

const (
	// defaultScope uses only the stop tags in defaultScopeStopTags.
	defaultScope scope = iota
	// listItemScope additionally stops at ol and ul.
	listItemScope
	// buttonScope additionally stops at button.
	buttonScope
	// tableScope stops at html, table and template.
	tableScope
	// tableRowScope is used by clearStackToContext; it stops at html, tr and
	// template.
	tableRowScope
	// tableBodyScope is used by clearStackToContext; it stops at html, tbody,
	// tfoot, thead and template.
	tableBodyScope
	// selectScope stops at every element except optgroup and option.
	selectScope
)
81
82// popUntil pops the stack of open elements at the highest element whose tag
83// is in matchTags, provided there is no higher element in the scope's stop
84// tags (as defined in section 12.2.4.2). It returns whether or not there was
85// such an element. If there was not, popUntil leaves the stack unchanged.
86//
87// For example, the set of stop tags for table scope is: "html", "table". If
88// the stack was:
89// ["html", "body", "font", "table", "b", "i", "u"]
90// then popUntil(tableScope, "font") would return false, but
91// popUntil(tableScope, "i") would return true and the stack would become:
92// ["html", "body", "font", "table", "b"]
93//
94// If an element's tag is in both the stop tags and matchTags, then the stack
95// will be popped and the function returns true (provided, of course, there was
96// no higher element in the stack that was also in the stop tags). For example,
97// popUntil(tableScope, "table") returns true and leaves:
98// ["html", "body", "font"]
99func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool {
100 if i := p.indexOfElementInScope(s, matchTags...); i != -1 {
101 p.oe = p.oe[:i]
102 return true
103 }
104 return false
105}
106
107// indexOfElementInScope returns the index in p.oe of the highest element whose
108// tag is in matchTags that is in scope. If no matching element is in scope, it
109// returns -1.
110func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int {
111 for i := len(p.oe) - 1; i >= 0; i-- {
112 tagAtom := p.oe[i].DataAtom
113 if p.oe[i].Namespace == "" {
114 for _, t := range matchTags {
115 if t == tagAtom {
116 return i
117 }
118 }
119 switch s {
120 case defaultScope:
121 // No-op.
122 case listItemScope:
123 if tagAtom == a.Ol || tagAtom == a.Ul {
124 return -1
125 }
126 case buttonScope:
127 if tagAtom == a.Button {
128 return -1
129 }
130 case tableScope:
131 if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {
132 return -1
133 }
134 case selectScope:
135 if tagAtom != a.Optgroup && tagAtom != a.Option {
136 return -1
137 }
138 default:
139 panic("unreachable")
140 }
141 }
142 switch s {
143 case defaultScope, listItemScope, buttonScope:
144 for _, t := range defaultScopeStopTags[p.oe[i].Namespace] {
145 if t == tagAtom {
146 return -1
147 }
148 }
149 }
150 }
151 return -1
152}
153
154// elementInScope is like popUntil, except that it doesn't modify the stack of
155// open elements.
156func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool {
157 return p.indexOfElementInScope(s, matchTags...) != -1
158}
159
160// clearStackToContext pops elements off the stack of open elements until a
161// scope-defined element is found.
162func (p *parser) clearStackToContext(s scope) {
163 for i := len(p.oe) - 1; i >= 0; i-- {
164 tagAtom := p.oe[i].DataAtom
165 switch s {
166 case tableScope:
167 if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {
168 p.oe = p.oe[:i+1]
169 return
170 }
171 case tableRowScope:
172 if tagAtom == a.Html || tagAtom == a.Tr || tagAtom == a.Template {
173 p.oe = p.oe[:i+1]
174 return
175 }
176 case tableBodyScope:
177 if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead || tagAtom == a.Template {
178 p.oe = p.oe[:i+1]
179 return
180 }
181 default:
182 panic("unreachable")
183 }
184 }
185}
186
187// parseGenericRawTextElement implements the generic raw text element parsing
188// algorithm defined in 12.2.6.2.
189// https://html.spec.whatwg.org/multipage/parsing.html#parsing-elements-that-contain-only-text
190// TODO: Since both RAWTEXT and RCDATA states are treated as tokenizer's part
191// officially, need to make tokenizer consider both states.
192func (p *parser) parseGenericRawTextElement() {
193 p.addElement()
194 p.originalIM = p.im
195 p.im = textIM
196}
197
198// generateImpliedEndTags pops nodes off the stack of open elements as long as
199// the top node has a tag name of dd, dt, li, optgroup, option, p, rb, rp, rt or rtc.
200// If exceptions are specified, nodes with that name will not be popped off.
201func (p *parser) generateImpliedEndTags(exceptions ...string) {
202 var i int
203loop:
204 for i = len(p.oe) - 1; i >= 0; i-- {
205 n := p.oe[i]
206 if n.Type != ElementNode {
207 break
208 }
209 switch n.DataAtom {
210 case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc:
211 for _, except := range exceptions {
212 if n.Data == except {
213 break loop
214 }
215 }
216 continue
217 }
218 break
219 }
220
221 p.oe = p.oe[:i+1]
222}
223
224// addChild adds a child node n to the top element, and pushes n onto the stack
225// of open elements if it is an element node.
226func (p *parser) addChild(n *Node) {
227 if p.shouldFosterParent() {
228 p.fosterParent(n)
229 } else {
230 p.top().AppendChild(n)
231 }
232
233 if n.Type == ElementNode {
234 p.oe = append(p.oe, n)
235 }
236}
237
238// shouldFosterParent returns whether the next node to be added should be
239// foster parented.
240func (p *parser) shouldFosterParent() bool {
241 if p.fosterParenting {
242 switch p.top().DataAtom {
243 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
244 return true
245 }
246 }
247 return false
248}
249
250// fosterParent adds a child node according to the foster parenting rules.
251// Section 12.2.6.1, "foster parenting".
252func (p *parser) fosterParent(n *Node) {
253 var table, parent, prev, template *Node
254 var i int
255 for i = len(p.oe) - 1; i >= 0; i-- {
256 if p.oe[i].DataAtom == a.Table {
257 table = p.oe[i]
258 break
259 }
260 }
261
262 var j int
263 for j = len(p.oe) - 1; j >= 0; j-- {
264 if p.oe[j].DataAtom == a.Template {
265 template = p.oe[j]
266 break
267 }
268 }
269
270 if template != nil && (table == nil || j > i) {
271 template.AppendChild(n)
272 return
273 }
274
275 if table == nil {
276 // The foster parent is the html element.
277 parent = p.oe[0]
278 } else {
279 parent = table.Parent
280 }
281 if parent == nil {
282 parent = p.oe[i-1]
283 }
284
285 if table != nil {
286 prev = table.PrevSibling
287 } else {
288 prev = parent.LastChild
289 }
290 if prev != nil && prev.Type == TextNode && n.Type == TextNode {
291 prev.Data += n.Data
292 return
293 }
294
295 parent.InsertBefore(n, table)
296}
297
298// addText adds text to the preceding node if it is a text node, or else it
299// calls addChild with a new text node.
300func (p *parser) addText(text string) {
301 if text == "" {
302 return
303 }
304
305 if p.shouldFosterParent() {
306 p.fosterParent(&Node{
307 Type: TextNode,
308 Data: text,
309 })
310 return
311 }
312
313 t := p.top()
314 if n := t.LastChild; n != nil && n.Type == TextNode {
315 n.Data += text
316 return
317 }
318 p.addChild(&Node{
319 Type: TextNode,
320 Data: text,
321 })
322}
323
324// addElement adds a child element based on the current token.
325func (p *parser) addElement() {
326 p.addChild(&Node{
327 Type: ElementNode,
328 DataAtom: p.tok.DataAtom,
329 Data: p.tok.Data,
330 Attr: p.tok.Attr,
331 })
332}
333
334// Section 12.2.4.3.
335func (p *parser) addFormattingElement() {
336 tagAtom, attr := p.tok.DataAtom, p.tok.Attr
337 p.addElement()
338
339 // Implement the Noah's Ark clause, but with three per family instead of two.
340 identicalElements := 0
341findIdenticalElements:
342 for i := len(p.afe) - 1; i >= 0; i-- {
343 n := p.afe[i]
344 if n.Type == scopeMarkerNode {
345 break
346 }
347 if n.Type != ElementNode {
348 continue
349 }
350 if n.Namespace != "" {
351 continue
352 }
353 if n.DataAtom != tagAtom {
354 continue
355 }
356 if len(n.Attr) != len(attr) {
357 continue
358 }
359 compareAttributes:
360 for _, t0 := range n.Attr {
361 for _, t1 := range attr {
362 if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val {
363 // Found a match for this attribute, continue with the next attribute.
364 continue compareAttributes
365 }
366 }
367 // If we get here, there is no attribute that matches a.
368 // Therefore the element is not identical to the new one.
369 continue findIdenticalElements
370 }
371
372 identicalElements++
373 if identicalElements >= 3 {
374 p.afe.remove(n)
375 }
376 }
377
378 p.afe = append(p.afe, p.top())
379}
380
381// Section 12.2.4.3.
382func (p *parser) clearActiveFormattingElements() {
383 for {
384 if n := p.afe.pop(); len(p.afe) == 0 || n.Type == scopeMarkerNode {
385 return
386 }
387 }
388}
389
390// Section 12.2.4.3.
391func (p *parser) reconstructActiveFormattingElements() {
392 n := p.afe.top()
393 if n == nil {
394 return
395 }
396 if n.Type == scopeMarkerNode || p.oe.index(n) != -1 {
397 return
398 }
399 i := len(p.afe) - 1
400 for n.Type != scopeMarkerNode && p.oe.index(n) == -1 {
401 if i == 0 {
402 i = -1
403 break
404 }
405 i--
406 n = p.afe[i]
407 }
408 for {
409 i++
410 clone := p.afe[i].clone()
411 p.addChild(clone)
412 p.afe[i] = clone
413 if i == len(p.afe)-1 {
414 break
415 }
416 }
417}
418
419// Section 12.2.5.
420func (p *parser) acknowledgeSelfClosingTag() {
421 p.hasSelfClosingToken = false
422}
423
424// An insertion mode (section 12.2.4.1) is the state transition function from
425// a particular state in the HTML5 parser's state machine. It updates the
426// parser's fields depending on parser.tok (where ErrorToken means EOF).
427// It returns whether the token was consumed.
428type insertionMode func(*parser) bool
429
430// setOriginalIM sets the insertion mode to return to after completing a text or
431// inTableText insertion mode.
432// Section 12.2.4.1, "using the rules for".
433func (p *parser) setOriginalIM() {
434 if p.originalIM != nil {
435 panic("html: bad parser state: originalIM was set twice")
436 }
437 p.originalIM = p.im
438}
439
440// Section 12.2.4.1, "reset the insertion mode".
441func (p *parser) resetInsertionMode() {
442 for i := len(p.oe) - 1; i >= 0; i-- {
443 n := p.oe[i]
444 last := i == 0
445 if last && p.context != nil {
446 n = p.context
447 }
448
449 switch n.DataAtom {
450 case a.Select:
451 if !last {
452 for ancestor, first := n, p.oe[0]; ancestor != first; {
453 ancestor = p.oe[p.oe.index(ancestor)-1]
454 switch ancestor.DataAtom {
455 case a.Template:
456 p.im = inSelectIM
457 return
458 case a.Table:
459 p.im = inSelectInTableIM
460 return
461 }
462 }
463 }
464 p.im = inSelectIM
465 case a.Td, a.Th:
466 // TODO: remove this divergence from the HTML5 spec.
467 //
468 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
469 p.im = inCellIM
470 case a.Tr:
471 p.im = inRowIM
472 case a.Tbody, a.Thead, a.Tfoot:
473 p.im = inTableBodyIM
474 case a.Caption:
475 p.im = inCaptionIM
476 case a.Colgroup:
477 p.im = inColumnGroupIM
478 case a.Table:
479 p.im = inTableIM
480 case a.Template:
481 // TODO: remove this divergence from the HTML5 spec.
482 if n.Namespace != "" {
483 continue
484 }
485 p.im = p.templateStack.top()
486 case a.Head:
487 // TODO: remove this divergence from the HTML5 spec.
488 //
489 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
490 p.im = inHeadIM
491 case a.Body:
492 p.im = inBodyIM
493 case a.Frameset:
494 p.im = inFramesetIM
495 case a.Html:
496 if p.head == nil {
497 p.im = beforeHeadIM
498 } else {
499 p.im = afterHeadIM
500 }
501 default:
502 if last {
503 p.im = inBodyIM
504 return
505 }
506 continue
507 }
508 return
509 }
510}
511
512const whitespace = " \t\r\n\f"
513
514// Section 12.2.6.4.1.
515func initialIM(p *parser) bool {
516 switch p.tok.Type {
517 case TextToken:
518 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
519 if len(p.tok.Data) == 0 {
520 // It was all whitespace, so ignore it.
521 return true
522 }
523 case CommentToken:
524 p.doc.AppendChild(&Node{
525 Type: CommentNode,
526 Data: p.tok.Data,
527 })
528 return true
529 case DoctypeToken:
530 n, quirks := parseDoctype(p.tok.Data)
531 p.doc.AppendChild(n)
532 p.quirks = quirks
533 p.im = beforeHTMLIM
534 return true
535 }
536 p.quirks = true
537 p.im = beforeHTMLIM
538 return false
539}
540
541// Section 12.2.6.4.2.
542func beforeHTMLIM(p *parser) bool {
543 switch p.tok.Type {
544 case DoctypeToken:
545 // Ignore the token.
546 return true
547 case TextToken:
548 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
549 if len(p.tok.Data) == 0 {
550 // It was all whitespace, so ignore it.
551 return true
552 }
553 case StartTagToken:
554 if p.tok.DataAtom == a.Html {
555 p.addElement()
556 p.im = beforeHeadIM
557 return true
558 }
559 case EndTagToken:
560 switch p.tok.DataAtom {
561 case a.Head, a.Body, a.Html, a.Br:
562 p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
563 return false
564 default:
565 // Ignore the token.
566 return true
567 }
568 case CommentToken:
569 p.doc.AppendChild(&Node{
570 Type: CommentNode,
571 Data: p.tok.Data,
572 })
573 return true
574 }
575 p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
576 return false
577}
578
579// Section 12.2.6.4.3.
580func beforeHeadIM(p *parser) bool {
581 switch p.tok.Type {
582 case TextToken:
583 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
584 if len(p.tok.Data) == 0 {
585 // It was all whitespace, so ignore it.
586 return true
587 }
588 case StartTagToken:
589 switch p.tok.DataAtom {
590 case a.Head:
591 p.addElement()
592 p.head = p.top()
593 p.im = inHeadIM
594 return true
595 case a.Html:
596 return inBodyIM(p)
597 }
598 case EndTagToken:
599 switch p.tok.DataAtom {
600 case a.Head, a.Body, a.Html, a.Br:
601 p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
602 return false
603 default:
604 // Ignore the token.
605 return true
606 }
607 case CommentToken:
608 p.addChild(&Node{
609 Type: CommentNode,
610 Data: p.tok.Data,
611 })
612 return true
613 case DoctypeToken:
614 // Ignore the token.
615 return true
616 }
617
618 p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
619 return false
620}
621
622// Section 12.2.6.4.4.
623func inHeadIM(p *parser) bool {
624 switch p.tok.Type {
625 case TextToken:
626 s := strings.TrimLeft(p.tok.Data, whitespace)
627 if len(s) < len(p.tok.Data) {
628 // Add the initial whitespace to the current node.
629 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
630 if s == "" {
631 return true
632 }
633 p.tok.Data = s
634 }
635 case StartTagToken:
636 switch p.tok.DataAtom {
637 case a.Html:
638 return inBodyIM(p)
639 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta:
640 p.addElement()
641 p.oe.pop()
642 p.acknowledgeSelfClosingTag()
643 return true
644 case a.Noscript:
645 if p.scripting {
646 p.parseGenericRawTextElement()
647 return true
648 }
649 p.addElement()
650 p.im = inHeadNoscriptIM
651 // Don't let the tokenizer go into raw text mode when scripting is disabled.
652 p.tokenizer.NextIsNotRawText()
653 return true
654 case a.Script, a.Title:
655 p.addElement()
656 p.setOriginalIM()
657 p.im = textIM
658 return true
659 case a.Noframes, a.Style:
660 p.parseGenericRawTextElement()
661 return true
662 case a.Head:
663 // Ignore the token.
664 return true
665 case a.Template:
666 // TODO: remove this divergence from the HTML5 spec.
667 //
668 // We don't handle all of the corner cases when mixing foreign
669 // content (i.e. <math> or <svg>) with <template>. Without this
670 // early return, we can get into an infinite loop, possibly because
671 // of the "TODO... further divergence" a little below.
672 //
673 // As a workaround, if we are mixing foreign content and templates,
674 // just ignore the rest of the HTML. Foreign content is rare and a
675 // relatively old HTML feature. Templates are also rare and a
676 // relatively new HTML feature. Their combination is very rare.
677 for _, e := range p.oe {
678 if e.Namespace != "" {
679 p.im = ignoreTheRemainingTokens
680 return true
681 }
682 }
683
684 p.addElement()
685 p.afe = append(p.afe, &scopeMarker)
686 p.framesetOK = false
687 p.im = inTemplateIM
688 p.templateStack = append(p.templateStack, inTemplateIM)
689 return true
690 }
691 case EndTagToken:
692 switch p.tok.DataAtom {
693 case a.Head:
694 p.oe.pop()
695 p.im = afterHeadIM
696 return true
697 case a.Body, a.Html, a.Br:
698 p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
699 return false
700 case a.Template:
701 if !p.oe.contains(a.Template) {
702 return true
703 }
704 // TODO: remove this further divergence from the HTML5 spec.
705 //
706 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
707 p.generateImpliedEndTags()
708 for i := len(p.oe) - 1; i >= 0; i-- {
709 if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
710 p.oe = p.oe[:i]
711 break
712 }
713 }
714 p.clearActiveFormattingElements()
715 p.templateStack.pop()
716 p.resetInsertionMode()
717 return true
718 default:
719 // Ignore the token.
720 return true
721 }
722 case CommentToken:
723 p.addChild(&Node{
724 Type: CommentNode,
725 Data: p.tok.Data,
726 })
727 return true
728 case DoctypeToken:
729 // Ignore the token.
730 return true
731 }
732
733 p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
734 return false
735}
736
737// Section 12.2.6.4.5.
738func inHeadNoscriptIM(p *parser) bool {
739 switch p.tok.Type {
740 case DoctypeToken:
741 // Ignore the token.
742 return true
743 case StartTagToken:
744 switch p.tok.DataAtom {
745 case a.Html:
746 return inBodyIM(p)
747 case a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Style:
748 return inHeadIM(p)
749 case a.Head:
750 // Ignore the token.
751 return true
752 case a.Noscript:
753 // Don't let the tokenizer go into raw text mode even when a <noscript>
754 // tag is in "in head noscript" insertion mode.
755 p.tokenizer.NextIsNotRawText()
756 // Ignore the token.
757 return true
758 }
759 case EndTagToken:
760 switch p.tok.DataAtom {
761 case a.Noscript, a.Br:
762 default:
763 // Ignore the token.
764 return true
765 }
766 case TextToken:
767 s := strings.TrimLeft(p.tok.Data, whitespace)
768 if len(s) == 0 {
769 // It was all whitespace.
770 return inHeadIM(p)
771 }
772 case CommentToken:
773 return inHeadIM(p)
774 }
775 p.oe.pop()
776 if p.top().DataAtom != a.Head {
777 panic("html: the new current node will be a head element.")
778 }
779 p.im = inHeadIM
780 if p.tok.DataAtom == a.Noscript {
781 return true
782 }
783 return false
784}
785
786// Section 12.2.6.4.6.
787func afterHeadIM(p *parser) bool {
788 switch p.tok.Type {
789 case TextToken:
790 s := strings.TrimLeft(p.tok.Data, whitespace)
791 if len(s) < len(p.tok.Data) {
792 // Add the initial whitespace to the current node.
793 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
794 if s == "" {
795 return true
796 }
797 p.tok.Data = s
798 }
799 case StartTagToken:
800 switch p.tok.DataAtom {
801 case a.Html:
802 return inBodyIM(p)
803 case a.Body:
804 p.addElement()
805 p.framesetOK = false
806 p.im = inBodyIM
807 return true
808 case a.Frameset:
809 p.addElement()
810 p.im = inFramesetIM
811 return true
812 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
813 p.oe = append(p.oe, p.head)
814 defer p.oe.remove(p.head)
815 return inHeadIM(p)
816 case a.Head:
817 // Ignore the token.
818 return true
819 }
820 case EndTagToken:
821 switch p.tok.DataAtom {
822 case a.Body, a.Html, a.Br:
823 // Drop down to creating an implied <body> tag.
824 case a.Template:
825 return inHeadIM(p)
826 default:
827 // Ignore the token.
828 return true
829 }
830 case CommentToken:
831 p.addChild(&Node{
832 Type: CommentNode,
833 Data: p.tok.Data,
834 })
835 return true
836 case DoctypeToken:
837 // Ignore the token.
838 return true
839 }
840
841 p.parseImpliedToken(StartTagToken, a.Body, a.Body.String())
842 p.framesetOK = true
843 return false
844}
845
846// copyAttributes copies attributes of src not found on dst to dst.
847func copyAttributes(dst *Node, src Token) {
848 if len(src.Attr) == 0 {
849 return
850 }
851 attr := map[string]string{}
852 for _, t := range dst.Attr {
853 attr[t.Key] = t.Val
854 }
855 for _, t := range src.Attr {
856 if _, ok := attr[t.Key]; !ok {
857 dst.Attr = append(dst.Attr, t)
858 attr[t.Key] = t.Val
859 }
860 }
861}
862
863// Section 12.2.6.4.7.
864func inBodyIM(p *parser) bool {
865 switch p.tok.Type {
866 case TextToken:
867 d := p.tok.Data
868 switch n := p.oe.top(); n.DataAtom {
869 case a.Pre, a.Listing:
870 if n.FirstChild == nil {
871 // Ignore a newline at the start of a <pre> block.
872 if d != "" && d[0] == '\r' {
873 d = d[1:]
874 }
875 if d != "" && d[0] == '\n' {
876 d = d[1:]
877 }
878 }
879 }
880 d = strings.Replace(d, "\x00", "", -1)
881 if d == "" {
882 return true
883 }
884 p.reconstructActiveFormattingElements()
885 p.addText(d)
886 if p.framesetOK && strings.TrimLeft(d, whitespace) != "" {
887 // There were non-whitespace characters inserted.
888 p.framesetOK = false
889 }
890 case StartTagToken:
891 switch p.tok.DataAtom {
892 case a.Html:
893 if p.oe.contains(a.Template) {
894 return true
895 }
896 copyAttributes(p.oe[0], p.tok)
897 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
898 return inHeadIM(p)
899 case a.Body:
900 if p.oe.contains(a.Template) {
901 return true
902 }
903 if len(p.oe) >= 2 {
904 body := p.oe[1]
905 if body.Type == ElementNode && body.DataAtom == a.Body {
906 p.framesetOK = false
907 copyAttributes(body, p.tok)
908 }
909 }
910 case a.Frameset:
911 if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body {
912 // Ignore the token.
913 return true
914 }
915 body := p.oe[1]
916 if body.Parent != nil {
917 body.Parent.RemoveChild(body)
918 }
919 p.oe = p.oe[:1]
920 p.addElement()
921 p.im = inFramesetIM
922 return true
923 case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Main, a.Menu, a.Nav, a.Ol, a.P, a.Section, a.Summary, a.Ul:
924 p.popUntil(buttonScope, a.P)
925 p.addElement()
926 case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
927 p.popUntil(buttonScope, a.P)
928 switch n := p.top(); n.DataAtom {
929 case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
930 p.oe.pop()
931 }
932 p.addElement()
933 case a.Pre, a.Listing:
934 p.popUntil(buttonScope, a.P)
935 p.addElement()
936 // The newline, if any, will be dealt with by the TextToken case.
937 p.framesetOK = false
938 case a.Form:
939 if p.form != nil && !p.oe.contains(a.Template) {
940 // Ignore the token
941 return true
942 }
943 p.popUntil(buttonScope, a.P)
944 p.addElement()
945 if !p.oe.contains(a.Template) {
946 p.form = p.top()
947 }
948 case a.Li:
949 p.framesetOK = false
950 for i := len(p.oe) - 1; i >= 0; i-- {
951 node := p.oe[i]
952 switch node.DataAtom {
953 case a.Li:
954 p.oe = p.oe[:i]
955 case a.Address, a.Div, a.P:
956 continue
957 default:
958 if !isSpecialElement(node) {
959 continue
960 }
961 }
962 break
963 }
964 p.popUntil(buttonScope, a.P)
965 p.addElement()
966 case a.Dd, a.Dt:
967 p.framesetOK = false
968 for i := len(p.oe) - 1; i >= 0; i-- {
969 node := p.oe[i]
970 switch node.DataAtom {
971 case a.Dd, a.Dt:
972 p.oe = p.oe[:i]
973 case a.Address, a.Div, a.P:
974 continue
975 default:
976 if !isSpecialElement(node) {
977 continue
978 }
979 }
980 break
981 }
982 p.popUntil(buttonScope, a.P)
983 p.addElement()
984 case a.Plaintext:
985 p.popUntil(buttonScope, a.P)
986 p.addElement()
987 case a.Button:
988 p.popUntil(defaultScope, a.Button)
989 p.reconstructActiveFormattingElements()
990 p.addElement()
991 p.framesetOK = false
992 case a.A:
993 for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
994 if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
995 p.inBodyEndTagFormatting(a.A, "a")
996 p.oe.remove(n)
997 p.afe.remove(n)
998 break
999 }
1000 }
1001 p.reconstructActiveFormattingElements()
1002 p.addFormattingElement()
1003 case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
1004 p.reconstructActiveFormattingElements()
1005 p.addFormattingElement()
1006 case a.Nobr:
1007 p.reconstructActiveFormattingElements()
1008 if p.elementInScope(defaultScope, a.Nobr) {
1009 p.inBodyEndTagFormatting(a.Nobr, "nobr")
1010 p.reconstructActiveFormattingElements()
1011 }
1012 p.addFormattingElement()
1013 case a.Applet, a.Marquee, a.Object:
1014 p.reconstructActiveFormattingElements()
1015 p.addElement()
1016 p.afe = append(p.afe, &scopeMarker)
1017 p.framesetOK = false
1018 case a.Table:
1019 if !p.quirks {
1020 p.popUntil(buttonScope, a.P)
1021 }
1022 p.addElement()
1023 p.framesetOK = false
1024 p.im = inTableIM
1025 return true
1026 case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr:
1027 p.reconstructActiveFormattingElements()
1028 p.addElement()
1029 p.oe.pop()
1030 p.acknowledgeSelfClosingTag()
1031 if p.tok.DataAtom == a.Input {
1032 for _, t := range p.tok.Attr {
1033 if t.Key == "type" {
1034 if strings.ToLower(t.Val) == "hidden" {
1035 // Skip setting framesetOK = false
1036 return true
1037 }
1038 }
1039 }
1040 }
1041 p.framesetOK = false
1042 case a.Param, a.Source, a.Track:
1043 p.addElement()
1044 p.oe.pop()
1045 p.acknowledgeSelfClosingTag()
1046 case a.Hr:
1047 p.popUntil(buttonScope, a.P)
1048 p.addElement()
1049 p.oe.pop()
1050 p.acknowledgeSelfClosingTag()
1051 p.framesetOK = false
1052 case a.Image:
1053 p.tok.DataAtom = a.Img
1054 p.tok.Data = a.Img.String()
1055 return false
1056 case a.Textarea:
1057 p.addElement()
1058 p.setOriginalIM()
1059 p.framesetOK = false
1060 p.im = textIM
1061 case a.Xmp:
1062 p.popUntil(buttonScope, a.P)
1063 p.reconstructActiveFormattingElements()
1064 p.framesetOK = false
1065 p.parseGenericRawTextElement()
1066 case a.Iframe:
1067 p.framesetOK = false
1068 p.parseGenericRawTextElement()
1069 case a.Noembed:
1070 p.parseGenericRawTextElement()
1071 case a.Noscript:
1072 if p.scripting {
1073 p.parseGenericRawTextElement()
1074 return true
1075 }
1076 p.reconstructActiveFormattingElements()
1077 p.addElement()
1078 // Don't let the tokenizer go into raw text mode when scripting is disabled.
1079 p.tokenizer.NextIsNotRawText()
1080 case a.Select:
1081 p.reconstructActiveFormattingElements()
1082 p.addElement()
1083 p.framesetOK = false
1084 p.im = inSelectIM
1085 return true
1086 case a.Optgroup, a.Option:
1087 if p.top().DataAtom == a.Option {
1088 p.oe.pop()
1089 }
1090 p.reconstructActiveFormattingElements()
1091 p.addElement()
1092 case a.Rb, a.Rtc:
1093 if p.elementInScope(defaultScope, a.Ruby) {
1094 p.generateImpliedEndTags()
1095 }
1096 p.addElement()
1097 case a.Rp, a.Rt:
1098 if p.elementInScope(defaultScope, a.Ruby) {
1099 p.generateImpliedEndTags("rtc")
1100 }
1101 p.addElement()
1102 case a.Math, a.Svg:
1103 p.reconstructActiveFormattingElements()
1104 if p.tok.DataAtom == a.Math {
1105 adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
1106 } else {
1107 adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
1108 }
1109 adjustForeignAttributes(p.tok.Attr)
1110 p.addElement()
1111 p.top().Namespace = p.tok.Data
1112 if p.hasSelfClosingToken {
1113 p.oe.pop()
1114 p.acknowledgeSelfClosingTag()
1115 }
1116 return true
1117 case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1118 // Ignore the token.
1119 default:
1120 p.reconstructActiveFormattingElements()
1121 p.addElement()
1122 }
1123 case EndTagToken:
1124 switch p.tok.DataAtom {
1125 case a.Body:
1126 if p.elementInScope(defaultScope, a.Body) {
1127 p.im = afterBodyIM
1128 }
1129 case a.Html:
1130 if p.elementInScope(defaultScope, a.Body) {
1131 p.parseImpliedToken(EndTagToken, a.Body, a.Body.String())
1132 return false
1133 }
1134 return true
1135 case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Main, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul:
1136 p.popUntil(defaultScope, p.tok.DataAtom)
1137 case a.Form:
1138 if p.oe.contains(a.Template) {
1139 i := p.indexOfElementInScope(defaultScope, a.Form)
1140 if i == -1 {
1141 // Ignore the token.
1142 return true
1143 }
1144 p.generateImpliedEndTags()
1145 if p.oe[i].DataAtom != a.Form {
1146 // Ignore the token.
1147 return true
1148 }
1149 p.popUntil(defaultScope, a.Form)
1150 } else {
1151 node := p.form
1152 p.form = nil
1153 i := p.indexOfElementInScope(defaultScope, a.Form)
1154 if node == nil || i == -1 || p.oe[i] != node {
1155 // Ignore the token.
1156 return true
1157 }
1158 p.generateImpliedEndTags()
1159 p.oe.remove(node)
1160 }
1161 case a.P:
1162 if !p.elementInScope(buttonScope, a.P) {
1163 p.parseImpliedToken(StartTagToken, a.P, a.P.String())
1164 }
1165 p.popUntil(buttonScope, a.P)
1166 case a.Li:
1167 p.popUntil(listItemScope, a.Li)
1168 case a.Dd, a.Dt:
1169 p.popUntil(defaultScope, p.tok.DataAtom)
1170 case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
1171 p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
1172 case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
1173 p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data)
1174 case a.Applet, a.Marquee, a.Object:
1175 if p.popUntil(defaultScope, p.tok.DataAtom) {
1176 p.clearActiveFormattingElements()
1177 }
1178 case a.Br:
1179 p.tok.Type = StartTagToken
1180 return false
1181 case a.Template:
1182 return inHeadIM(p)
1183 default:
1184 p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data)
1185 }
1186 case CommentToken:
1187 p.addChild(&Node{
1188 Type: CommentNode,
1189 Data: p.tok.Data,
1190 })
1191 case ErrorToken:
1192 // TODO: remove this divergence from the HTML5 spec.
1193 if len(p.templateStack) > 0 {
1194 p.im = inTemplateIM
1195 return false
1196 }
1197 for _, e := range p.oe {
1198 switch e.DataAtom {
1199 case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th,
1200 a.Thead, a.Tr, a.Body, a.Html:
1201 default:
1202 return true
1203 }
1204 }
1205 }
1206
1207 return true
1208}
1209
1210func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) {
1211 // This is the "adoption agency" algorithm, described at
1212 // https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency
1213
1214 // TODO: this is a fairly literal line-by-line translation of that algorithm.
1215 // Once the code successfully parses the comprehensive test suite, we should
1216 // refactor this code to be more idiomatic.
1217
1218 // Steps 1-2
1219 if current := p.oe.top(); current.Data == tagName && p.afe.index(current) == -1 {
1220 p.oe.pop()
1221 return
1222 }
1223
1224 // Steps 3-5. The outer loop.
1225 for i := 0; i < 8; i++ {
1226 // Step 6. Find the formatting element.
1227 var formattingElement *Node
1228 for j := len(p.afe) - 1; j >= 0; j-- {
1229 if p.afe[j].Type == scopeMarkerNode {
1230 break
1231 }
1232 if p.afe[j].DataAtom == tagAtom {
1233 formattingElement = p.afe[j]
1234 break
1235 }
1236 }
1237 if formattingElement == nil {
1238 p.inBodyEndTagOther(tagAtom, tagName)
1239 return
1240 }
1241
1242 // Step 7. Ignore the tag if formatting element is not in the stack of open elements.
1243 feIndex := p.oe.index(formattingElement)
1244 if feIndex == -1 {
1245 p.afe.remove(formattingElement)
1246 return
1247 }
1248 // Step 8. Ignore the tag if formatting element is not in the scope.
1249 if !p.elementInScope(defaultScope, tagAtom) {
1250 // Ignore the tag.
1251 return
1252 }
1253
1254 // Step 9. This step is omitted because it's just a parse error but no need to return.
1255
1256 // Steps 10-11. Find the furthest block.
1257 var furthestBlock *Node
1258 for _, e := range p.oe[feIndex:] {
1259 if isSpecialElement(e) {
1260 furthestBlock = e
1261 break
1262 }
1263 }
1264 if furthestBlock == nil {
1265 e := p.oe.pop()
1266 for e != formattingElement {
1267 e = p.oe.pop()
1268 }
1269 p.afe.remove(e)
1270 return
1271 }
1272
1273 // Steps 12-13. Find the common ancestor and bookmark node.
1274 commonAncestor := p.oe[feIndex-1]
1275 bookmark := p.afe.index(formattingElement)
1276
1277 // Step 14. The inner loop. Find the lastNode to reparent.
1278 lastNode := furthestBlock
1279 node := furthestBlock
1280 x := p.oe.index(node)
1281 // Step 14.1.
1282 j := 0
1283 for {
1284 // Step 14.2.
1285 j++
1286 // Step. 14.3.
1287 x--
1288 node = p.oe[x]
1289 // Step 14.4. Go to the next step if node is formatting element.
1290 if node == formattingElement {
1291 break
1292 }
1293 // Step 14.5. Remove node from the list of active formatting elements if
1294 // inner loop counter is greater than three and node is in the list of
1295 // active formatting elements.
1296 if ni := p.afe.index(node); j > 3 && ni > -1 {
1297 p.afe.remove(node)
1298 // If any element of the list of active formatting elements is removed,
1299 // we need to take care whether bookmark should be decremented or not.
1300 // This is because the value of bookmark may exceed the size of the
1301 // list by removing elements from the list.
1302 if ni <= bookmark {
1303 bookmark--
1304 }
1305 continue
1306 }
1307 // Step 14.6. Continue the next inner loop if node is not in the list of
1308 // active formatting elements.
1309 if p.afe.index(node) == -1 {
1310 p.oe.remove(node)
1311 continue
1312 }
1313 // Step 14.7.
1314 clone := node.clone()
1315 p.afe[p.afe.index(node)] = clone
1316 p.oe[p.oe.index(node)] = clone
1317 node = clone
1318 // Step 14.8.
1319 if lastNode == furthestBlock {
1320 bookmark = p.afe.index(node) + 1
1321 }
1322 // Step 14.9.
1323 if lastNode.Parent != nil {
1324 lastNode.Parent.RemoveChild(lastNode)
1325 }
1326 node.AppendChild(lastNode)
1327 // Step 14.10.
1328 lastNode = node
1329 }
1330
1331 // Step 15. Reparent lastNode to the common ancestor,
1332 // or for misnested table nodes, to the foster parent.
1333 if lastNode.Parent != nil {
1334 lastNode.Parent.RemoveChild(lastNode)
1335 }
1336 switch commonAncestor.DataAtom {
1337 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1338 p.fosterParent(lastNode)
1339 default:
1340 commonAncestor.AppendChild(lastNode)
1341 }
1342
1343 // Steps 16-18. Reparent nodes from the furthest block's children
1344 // to a clone of the formatting element.
1345 clone := formattingElement.clone()
1346 reparentChildren(clone, furthestBlock)
1347 furthestBlock.AppendChild(clone)
1348
1349 // Step 19. Fix up the list of active formatting elements.
1350 if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
1351 // Move the bookmark with the rest of the list.
1352 bookmark--
1353 }
1354 p.afe.remove(formattingElement)
1355 p.afe.insert(bookmark, clone)
1356
1357 // Step 20. Fix up the stack of open elements.
1358 p.oe.remove(formattingElement)
1359 p.oe.insert(p.oe.index(furthestBlock)+1, clone)
1360 }
1361}
1362
1363// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
1364// "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content
1365// https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign
1366func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) {
1367 for i := len(p.oe) - 1; i >= 0; i-- {
1368 // Two element nodes have the same tag if they have the same Data (a
1369 // string-typed field). As an optimization, for common HTML tags, each
1370 // Data string is assigned a unique, non-zero DataAtom (a uint32-typed
1371 // field), since integer comparison is faster than string comparison.
1372 // Uncommon (custom) tags get a zero DataAtom.
1373 //
1374 // The if condition here is equivalent to (p.oe[i].Data == tagName).
1375 if (p.oe[i].DataAtom == tagAtom) &&
1376 ((tagAtom != 0) || (p.oe[i].Data == tagName)) {
1377 p.oe = p.oe[:i]
1378 break
1379 }
1380 if isSpecialElement(p.oe[i]) {
1381 break
1382 }
1383 }
1384}
1385
1386// Section 12.2.6.4.8.
1387func textIM(p *parser) bool {
1388 switch p.tok.Type {
1389 case ErrorToken:
1390 p.oe.pop()
1391 case TextToken:
1392 d := p.tok.Data
1393 if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil {
1394 // Ignore a newline at the start of a <textarea> block.
1395 if d != "" && d[0] == '\r' {
1396 d = d[1:]
1397 }
1398 if d != "" && d[0] == '\n' {
1399 d = d[1:]
1400 }
1401 }
1402 if d == "" {
1403 return true
1404 }
1405 p.addText(d)
1406 return true
1407 case EndTagToken:
1408 p.oe.pop()
1409 }
1410 p.im = p.originalIM
1411 p.originalIM = nil
1412 return p.tok.Type == EndTagToken
1413}
1414
1415// Section 12.2.6.4.9.
1416func inTableIM(p *parser) bool {
1417 switch p.tok.Type {
1418 case TextToken:
1419 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1)
1420 switch p.oe.top().DataAtom {
1421 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1422 if strings.Trim(p.tok.Data, whitespace) == "" {
1423 p.addText(p.tok.Data)
1424 return true
1425 }
1426 }
1427 case StartTagToken:
1428 switch p.tok.DataAtom {
1429 case a.Caption:
1430 p.clearStackToContext(tableScope)
1431 p.afe = append(p.afe, &scopeMarker)
1432 p.addElement()
1433 p.im = inCaptionIM
1434 return true
1435 case a.Colgroup:
1436 p.clearStackToContext(tableScope)
1437 p.addElement()
1438 p.im = inColumnGroupIM
1439 return true
1440 case a.Col:
1441 p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String())
1442 return false
1443 case a.Tbody, a.Tfoot, a.Thead:
1444 p.clearStackToContext(tableScope)
1445 p.addElement()
1446 p.im = inTableBodyIM
1447 return true
1448 case a.Td, a.Th, a.Tr:
1449 p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String())
1450 return false
1451 case a.Table:
1452 if p.popUntil(tableScope, a.Table) {
1453 p.resetInsertionMode()
1454 return false
1455 }
1456 // Ignore the token.
1457 return true
1458 case a.Style, a.Script, a.Template:
1459 return inHeadIM(p)
1460 case a.Input:
1461 for _, t := range p.tok.Attr {
1462 if t.Key == "type" && strings.ToLower(t.Val) == "hidden" {
1463 p.addElement()
1464 p.oe.pop()
1465 return true
1466 }
1467 }
1468 // Otherwise drop down to the default action.
1469 case a.Form:
1470 if p.oe.contains(a.Template) || p.form != nil {
1471 // Ignore the token.
1472 return true
1473 }
1474 p.addElement()
1475 p.form = p.oe.pop()
1476 case a.Select:
1477 p.reconstructActiveFormattingElements()
1478 switch p.top().DataAtom {
1479 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1480 p.fosterParenting = true
1481 }
1482 p.addElement()
1483 p.fosterParenting = false
1484 p.framesetOK = false
1485 p.im = inSelectInTableIM
1486 return true
1487 }
1488 case EndTagToken:
1489 switch p.tok.DataAtom {
1490 case a.Table:
1491 if p.popUntil(tableScope, a.Table) {
1492 p.resetInsertionMode()
1493 return true
1494 }
1495 // Ignore the token.
1496 return true
1497 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1498 // Ignore the token.
1499 return true
1500 case a.Template:
1501 return inHeadIM(p)
1502 }
1503 case CommentToken:
1504 p.addChild(&Node{
1505 Type: CommentNode,
1506 Data: p.tok.Data,
1507 })
1508 return true
1509 case DoctypeToken:
1510 // Ignore the token.
1511 return true
1512 case ErrorToken:
1513 return inBodyIM(p)
1514 }
1515
1516 p.fosterParenting = true
1517 defer func() { p.fosterParenting = false }()
1518
1519 return inBodyIM(p)
1520}
1521
1522// Section 12.2.6.4.11.
1523func inCaptionIM(p *parser) bool {
1524 switch p.tok.Type {
1525 case StartTagToken:
1526 switch p.tok.DataAtom {
1527 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr:
1528 if !p.popUntil(tableScope, a.Caption) {
1529 // Ignore the token.
1530 return true
1531 }
1532 p.clearActiveFormattingElements()
1533 p.im = inTableIM
1534 return false
1535 case a.Select:
1536 p.reconstructActiveFormattingElements()
1537 p.addElement()
1538 p.framesetOK = false
1539 p.im = inSelectInTableIM
1540 return true
1541 }
1542 case EndTagToken:
1543 switch p.tok.DataAtom {
1544 case a.Caption:
1545 if p.popUntil(tableScope, a.Caption) {
1546 p.clearActiveFormattingElements()
1547 p.im = inTableIM
1548 }
1549 return true
1550 case a.Table:
1551 if !p.popUntil(tableScope, a.Caption) {
1552 // Ignore the token.
1553 return true
1554 }
1555 p.clearActiveFormattingElements()
1556 p.im = inTableIM
1557 return false
1558 case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1559 // Ignore the token.
1560 return true
1561 }
1562 }
1563 return inBodyIM(p)
1564}
1565
1566// Section 12.2.6.4.12.
1567func inColumnGroupIM(p *parser) bool {
1568 switch p.tok.Type {
1569 case TextToken:
1570 s := strings.TrimLeft(p.tok.Data, whitespace)
1571 if len(s) < len(p.tok.Data) {
1572 // Add the initial whitespace to the current node.
1573 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
1574 if s == "" {
1575 return true
1576 }
1577 p.tok.Data = s
1578 }
1579 case CommentToken:
1580 p.addChild(&Node{
1581 Type: CommentNode,
1582 Data: p.tok.Data,
1583 })
1584 return true
1585 case DoctypeToken:
1586 // Ignore the token.
1587 return true
1588 case StartTagToken:
1589 switch p.tok.DataAtom {
1590 case a.Html:
1591 return inBodyIM(p)
1592 case a.Col:
1593 p.addElement()
1594 p.oe.pop()
1595 p.acknowledgeSelfClosingTag()
1596 return true
1597 case a.Template:
1598 return inHeadIM(p)
1599 }
1600 case EndTagToken:
1601 switch p.tok.DataAtom {
1602 case a.Colgroup:
1603 if p.oe.top().DataAtom == a.Colgroup {
1604 p.oe.pop()
1605 p.im = inTableIM
1606 }
1607 return true
1608 case a.Col:
1609 // Ignore the token.
1610 return true
1611 case a.Template:
1612 return inHeadIM(p)
1613 }
1614 case ErrorToken:
1615 return inBodyIM(p)
1616 }
1617 if p.oe.top().DataAtom != a.Colgroup {
1618 return true
1619 }
1620 p.oe.pop()
1621 p.im = inTableIM
1622 return false
1623}
1624
1625// Section 12.2.6.4.13.
1626func inTableBodyIM(p *parser) bool {
1627 switch p.tok.Type {
1628 case StartTagToken:
1629 switch p.tok.DataAtom {
1630 case a.Tr:
1631 p.clearStackToContext(tableBodyScope)
1632 p.addElement()
1633 p.im = inRowIM
1634 return true
1635 case a.Td, a.Th:
1636 p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String())
1637 return false
1638 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:
1639 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {
1640 p.im = inTableIM
1641 return false
1642 }
1643 // Ignore the token.
1644 return true
1645 }
1646 case EndTagToken:
1647 switch p.tok.DataAtom {
1648 case a.Tbody, a.Tfoot, a.Thead:
1649 if p.elementInScope(tableScope, p.tok.DataAtom) {
1650 p.clearStackToContext(tableBodyScope)
1651 p.oe.pop()
1652 p.im = inTableIM
1653 }
1654 return true
1655 case a.Table:
1656 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {
1657 p.im = inTableIM
1658 return false
1659 }
1660 // Ignore the token.
1661 return true
1662 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr:
1663 // Ignore the token.
1664 return true
1665 }
1666 case CommentToken:
1667 p.addChild(&Node{
1668 Type: CommentNode,
1669 Data: p.tok.Data,
1670 })
1671 return true
1672 }
1673
1674 return inTableIM(p)
1675}
1676
1677// Section 12.2.6.4.14.
1678func inRowIM(p *parser) bool {
1679 switch p.tok.Type {
1680 case StartTagToken:
1681 switch p.tok.DataAtom {
1682 case a.Td, a.Th:
1683 p.clearStackToContext(tableRowScope)
1684 p.addElement()
1685 p.afe = append(p.afe, &scopeMarker)
1686 p.im = inCellIM
1687 return true
1688 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1689 if p.popUntil(tableScope, a.Tr) {
1690 p.im = inTableBodyIM
1691 return false
1692 }
1693 // Ignore the token.
1694 return true
1695 }
1696 case EndTagToken:
1697 switch p.tok.DataAtom {
1698 case a.Tr:
1699 if p.popUntil(tableScope, a.Tr) {
1700 p.im = inTableBodyIM
1701 return true
1702 }
1703 // Ignore the token.
1704 return true
1705 case a.Table:
1706 if p.popUntil(tableScope, a.Tr) {
1707 p.im = inTableBodyIM
1708 return false
1709 }
1710 // Ignore the token.
1711 return true
1712 case a.Tbody, a.Tfoot, a.Thead:
1713 if p.elementInScope(tableScope, p.tok.DataAtom) {
1714 p.parseImpliedToken(EndTagToken, a.Tr, a.Tr.String())
1715 return false
1716 }
1717 // Ignore the token.
1718 return true
1719 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th:
1720 // Ignore the token.
1721 return true
1722 }
1723 }
1724
1725 return inTableIM(p)
1726}
1727
1728// Section 12.2.6.4.15.
1729func inCellIM(p *parser) bool {
1730 switch p.tok.Type {
1731 case StartTagToken:
1732 switch p.tok.DataAtom {
1733 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1734 if p.popUntil(tableScope, a.Td, a.Th) {
1735 // Close the cell and reprocess.
1736 p.clearActiveFormattingElements()
1737 p.im = inRowIM
1738 return false
1739 }
1740 // Ignore the token.
1741 return true
1742 case a.Select:
1743 p.reconstructActiveFormattingElements()
1744 p.addElement()
1745 p.framesetOK = false
1746 p.im = inSelectInTableIM
1747 return true
1748 }
1749 case EndTagToken:
1750 switch p.tok.DataAtom {
1751 case a.Td, a.Th:
1752 if !p.popUntil(tableScope, p.tok.DataAtom) {
1753 // Ignore the token.
1754 return true
1755 }
1756 p.clearActiveFormattingElements()
1757 p.im = inRowIM
1758 return true
1759 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html:
1760 // Ignore the token.
1761 return true
1762 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1763 if !p.elementInScope(tableScope, p.tok.DataAtom) {
1764 // Ignore the token.
1765 return true
1766 }
1767 // Close the cell and reprocess.
1768 if p.popUntil(tableScope, a.Td, a.Th) {
1769 p.clearActiveFormattingElements()
1770 }
1771 p.im = inRowIM
1772 return false
1773 }
1774 }
1775 return inBodyIM(p)
1776}
1777
1778// Section 12.2.6.4.16.
1779func inSelectIM(p *parser) bool {
1780 switch p.tok.Type {
1781 case TextToken:
1782 p.addText(strings.Replace(p.tok.Data, "\x00", "", -1))
1783 case StartTagToken:
1784 switch p.tok.DataAtom {
1785 case a.Html:
1786 return inBodyIM(p)
1787 case a.Option:
1788 if p.top().DataAtom == a.Option {
1789 p.oe.pop()
1790 }
1791 p.addElement()
1792 case a.Optgroup:
1793 if p.top().DataAtom == a.Option {
1794 p.oe.pop()
1795 }
1796 if p.top().DataAtom == a.Optgroup {
1797 p.oe.pop()
1798 }
1799 p.addElement()
1800 case a.Select:
1801 if !p.popUntil(selectScope, a.Select) {
1802 // Ignore the token.
1803 return true
1804 }
1805 p.resetInsertionMode()
1806 case a.Input, a.Keygen, a.Textarea:
1807 if p.elementInScope(selectScope, a.Select) {
1808 p.parseImpliedToken(EndTagToken, a.Select, a.Select.String())
1809 return false
1810 }
1811 // In order to properly ignore <textarea>, we need to change the tokenizer mode.
1812 p.tokenizer.NextIsNotRawText()
1813 // Ignore the token.
1814 return true
1815 case a.Script, a.Template:
1816 return inHeadIM(p)
1817 case a.Iframe, a.Noembed, a.Noframes, a.Noscript, a.Plaintext, a.Style, a.Title, a.Xmp:
1818 // Don't let the tokenizer go into raw text mode when there are raw tags
1819 // to be ignored. These tags should be ignored from the tokenizer
1820 // properly.
1821 p.tokenizer.NextIsNotRawText()
1822 // Ignore the token.
1823 return true
1824 }
1825 case EndTagToken:
1826 switch p.tok.DataAtom {
1827 case a.Option:
1828 if p.top().DataAtom == a.Option {
1829 p.oe.pop()
1830 }
1831 case a.Optgroup:
1832 i := len(p.oe) - 1
1833 if p.oe[i].DataAtom == a.Option {
1834 i--
1835 }
1836 if p.oe[i].DataAtom == a.Optgroup {
1837 p.oe = p.oe[:i]
1838 }
1839 case a.Select:
1840 if !p.popUntil(selectScope, a.Select) {
1841 // Ignore the token.
1842 return true
1843 }
1844 p.resetInsertionMode()
1845 case a.Template:
1846 return inHeadIM(p)
1847 }
1848 case CommentToken:
1849 p.addChild(&Node{
1850 Type: CommentNode,
1851 Data: p.tok.Data,
1852 })
1853 case DoctypeToken:
1854 // Ignore the token.
1855 return true
1856 case ErrorToken:
1857 return inBodyIM(p)
1858 }
1859
1860 return true
1861}
1862
1863// Section 12.2.6.4.17.
1864func inSelectInTableIM(p *parser) bool {
1865 switch p.tok.Type {
1866 case StartTagToken, EndTagToken:
1867 switch p.tok.DataAtom {
1868 case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th:
1869 if p.tok.Type == EndTagToken && !p.elementInScope(tableScope, p.tok.DataAtom) {
1870 // Ignore the token.
1871 return true
1872 }
1873 // This is like p.popUntil(selectScope, a.Select), but it also
1874 // matches <math select>, not just <select>. Matching the MathML
1875 // tag is arguably incorrect (conceptually), but it mimics what
1876 // Chromium does.
1877 for i := len(p.oe) - 1; i >= 0; i-- {
1878 if n := p.oe[i]; n.DataAtom == a.Select {
1879 p.oe = p.oe[:i]
1880 break
1881 }
1882 }
1883 p.resetInsertionMode()
1884 return false
1885 }
1886 }
1887 return inSelectIM(p)
1888}
1889
1890// Section 12.2.6.4.18.
1891func inTemplateIM(p *parser) bool {
1892 switch p.tok.Type {
1893 case TextToken, CommentToken, DoctypeToken:
1894 return inBodyIM(p)
1895 case StartTagToken:
1896 switch p.tok.DataAtom {
1897 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
1898 return inHeadIM(p)
1899 case a.Caption, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:
1900 p.templateStack.pop()
1901 p.templateStack = append(p.templateStack, inTableIM)
1902 p.im = inTableIM
1903 return false
1904 case a.Col:
1905 p.templateStack.pop()
1906 p.templateStack = append(p.templateStack, inColumnGroupIM)
1907 p.im = inColumnGroupIM
1908 return false
1909 case a.Tr:
1910 p.templateStack.pop()
1911 p.templateStack = append(p.templateStack, inTableBodyIM)
1912 p.im = inTableBodyIM
1913 return false
1914 case a.Td, a.Th:
1915 p.templateStack.pop()
1916 p.templateStack = append(p.templateStack, inRowIM)
1917 p.im = inRowIM
1918 return false
1919 default:
1920 p.templateStack.pop()
1921 p.templateStack = append(p.templateStack, inBodyIM)
1922 p.im = inBodyIM
1923 return false
1924 }
1925 case EndTagToken:
1926 switch p.tok.DataAtom {
1927 case a.Template:
1928 return inHeadIM(p)
1929 default:
1930 // Ignore the token.
1931 return true
1932 }
1933 case ErrorToken:
1934 if !p.oe.contains(a.Template) {
1935 // Ignore the token.
1936 return true
1937 }
1938 // TODO: remove this divergence from the HTML5 spec.
1939 //
1940 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
1941 p.generateImpliedEndTags()
1942 for i := len(p.oe) - 1; i >= 0; i-- {
1943 if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
1944 p.oe = p.oe[:i]
1945 break
1946 }
1947 }
1948 p.clearActiveFormattingElements()
1949 p.templateStack.pop()
1950 p.resetInsertionMode()
1951 return false
1952 }
1953 return false
1954}
1955
1956// Section 12.2.6.4.19.
1957func afterBodyIM(p *parser) bool {
1958 switch p.tok.Type {
1959 case ErrorToken:
1960 // Stop parsing.
1961 return true
1962 case TextToken:
1963 s := strings.TrimLeft(p.tok.Data, whitespace)
1964 if len(s) == 0 {
1965 // It was all whitespace.
1966 return inBodyIM(p)
1967 }
1968 case StartTagToken:
1969 if p.tok.DataAtom == a.Html {
1970 return inBodyIM(p)
1971 }
1972 case EndTagToken:
1973 if p.tok.DataAtom == a.Html {
1974 if !p.fragment {
1975 p.im = afterAfterBodyIM
1976 }
1977 return true
1978 }
1979 case CommentToken:
1980 // The comment is attached to the <html> element.
1981 if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html {
1982 panic("html: bad parser state: <html> element not found, in the after-body insertion mode")
1983 }
1984 p.oe[0].AppendChild(&Node{
1985 Type: CommentNode,
1986 Data: p.tok.Data,
1987 })
1988 return true
1989 }
1990 p.im = inBodyIM
1991 return false
1992}
1993
1994// Section 12.2.6.4.20.
1995func inFramesetIM(p *parser) bool {
1996 switch p.tok.Type {
1997 case CommentToken:
1998 p.addChild(&Node{
1999 Type: CommentNode,
2000 Data: p.tok.Data,
2001 })
2002 case TextToken:
2003 // Ignore all text but whitespace.
2004 s := strings.Map(func(c rune) rune {
2005 switch c {
2006 case ' ', '\t', '\n', '\f', '\r':
2007 return c
2008 }
2009 return -1
2010 }, p.tok.Data)
2011 if s != "" {
2012 p.addText(s)
2013 }
2014 case StartTagToken:
2015 switch p.tok.DataAtom {
2016 case a.Html:
2017 return inBodyIM(p)
2018 case a.Frameset:
2019 p.addElement()
2020 case a.Frame:
2021 p.addElement()
2022 p.oe.pop()
2023 p.acknowledgeSelfClosingTag()
2024 case a.Noframes:
2025 return inHeadIM(p)
2026 }
2027 case EndTagToken:
2028 switch p.tok.DataAtom {
2029 case a.Frameset:
2030 if p.oe.top().DataAtom != a.Html {
2031 p.oe.pop()
2032 if p.oe.top().DataAtom != a.Frameset {
2033 p.im = afterFramesetIM
2034 return true
2035 }
2036 }
2037 }
2038 default:
2039 // Ignore the token.
2040 }
2041 return true
2042}
2043
2044// Section 12.2.6.4.21.
2045func afterFramesetIM(p *parser) bool {
2046 switch p.tok.Type {
2047 case CommentToken:
2048 p.addChild(&Node{
2049 Type: CommentNode,
2050 Data: p.tok.Data,
2051 })
2052 case TextToken:
2053 // Ignore all text but whitespace.
2054 s := strings.Map(func(c rune) rune {
2055 switch c {
2056 case ' ', '\t', '\n', '\f', '\r':
2057 return c
2058 }
2059 return -1
2060 }, p.tok.Data)
2061 if s != "" {
2062 p.addText(s)
2063 }
2064 case StartTagToken:
2065 switch p.tok.DataAtom {
2066 case a.Html:
2067 return inBodyIM(p)
2068 case a.Noframes:
2069 return inHeadIM(p)
2070 }
2071 case EndTagToken:
2072 switch p.tok.DataAtom {
2073 case a.Html:
2074 p.im = afterAfterFramesetIM
2075 return true
2076 }
2077 default:
2078 // Ignore the token.
2079 }
2080 return true
2081}
2082
2083// Section 12.2.6.4.22.
2084func afterAfterBodyIM(p *parser) bool {
2085 switch p.tok.Type {
2086 case ErrorToken:
2087 // Stop parsing.
2088 return true
2089 case TextToken:
2090 s := strings.TrimLeft(p.tok.Data, whitespace)
2091 if len(s) == 0 {
2092 // It was all whitespace.
2093 return inBodyIM(p)
2094 }
2095 case StartTagToken:
2096 if p.tok.DataAtom == a.Html {
2097 return inBodyIM(p)
2098 }
2099 case CommentToken:
2100 p.doc.AppendChild(&Node{
2101 Type: CommentNode,
2102 Data: p.tok.Data,
2103 })
2104 return true
2105 case DoctypeToken:
2106 return inBodyIM(p)
2107 }
2108 p.im = inBodyIM
2109 return false
2110}
2111
2112// Section 12.2.6.4.23.
2113func afterAfterFramesetIM(p *parser) bool {
2114 switch p.tok.Type {
2115 case CommentToken:
2116 p.doc.AppendChild(&Node{
2117 Type: CommentNode,
2118 Data: p.tok.Data,
2119 })
2120 case TextToken:
2121 // Ignore all text but whitespace.
2122 s := strings.Map(func(c rune) rune {
2123 switch c {
2124 case ' ', '\t', '\n', '\f', '\r':
2125 return c
2126 }
2127 return -1
2128 }, p.tok.Data)
2129 if s != "" {
2130 p.tok.Data = s
2131 return inBodyIM(p)
2132 }
2133 case StartTagToken:
2134 switch p.tok.DataAtom {
2135 case a.Html:
2136 return inBodyIM(p)
2137 case a.Noframes:
2138 return inHeadIM(p)
2139 }
2140 case DoctypeToken:
2141 return inBodyIM(p)
2142 default:
2143 // Ignore the token.
2144 }
2145 return true
2146}
2147
2148func ignoreTheRemainingTokens(p *parser) bool {
2149 return true
2150}
2151
2152const whitespaceOrNUL = whitespace + "\x00"
2153
2154// Section 12.2.6.5
2155func parseForeignContent(p *parser) bool {
2156 switch p.tok.Type {
2157 case TextToken:
2158 if p.framesetOK {
2159 p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == ""
2160 }
2161 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1)
2162 p.addText(p.tok.Data)
2163 case CommentToken:
2164 p.addChild(&Node{
2165 Type: CommentNode,
2166 Data: p.tok.Data,
2167 })
2168 case StartTagToken:
2169 if !p.fragment {
2170 b := breakout[p.tok.Data]
2171 if p.tok.DataAtom == a.Font {
2172 loop:
2173 for _, attr := range p.tok.Attr {
2174 switch attr.Key {
2175 case "color", "face", "size":
2176 b = true
2177 break loop
2178 }
2179 }
2180 }
2181 if b {
2182 for i := len(p.oe) - 1; i >= 0; i-- {
2183 n := p.oe[i]
2184 if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) {
2185 p.oe = p.oe[:i+1]
2186 break
2187 }
2188 }
2189 return false
2190 }
2191 }
2192 current := p.adjustedCurrentNode()
2193 switch current.Namespace {
2194 case "math":
2195 adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
2196 case "svg":
2197 // Adjust SVG tag names. The tokenizer lower-cases tag names, but
2198 // SVG wants e.g. "foreignObject" with a capital second "O".
2199 if x := svgTagNameAdjustments[p.tok.Data]; x != "" {
2200 p.tok.DataAtom = a.Lookup([]byte(x))
2201 p.tok.Data = x
2202 }
2203 adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
2204 default:
2205 panic("html: bad parser state: unexpected namespace")
2206 }
2207 adjustForeignAttributes(p.tok.Attr)
2208 namespace := current.Namespace
2209 p.addElement()
2210 p.top().Namespace = namespace
2211 if namespace != "" {
2212 // Don't let the tokenizer go into raw text mode in foreign content
2213 // (e.g. in an SVG <title> tag).
2214 p.tokenizer.NextIsNotRawText()
2215 }
2216 if p.hasSelfClosingToken {
2217 p.oe.pop()
2218 p.acknowledgeSelfClosingTag()
2219 }
2220 case EndTagToken:
2221 for i := len(p.oe) - 1; i >= 0; i-- {
2222 if p.oe[i].Namespace == "" {
2223 return p.im(p)
2224 }
2225 if strings.EqualFold(p.oe[i].Data, p.tok.Data) {
2226 p.oe = p.oe[:i]
2227 break
2228 }
2229 }
2230 return true
2231 default:
2232 // Ignore the token.
2233 }
2234 return true
2235}
2236
2237// Section 12.2.4.2.
2238func (p *parser) adjustedCurrentNode() *Node {
2239 if len(p.oe) == 1 && p.fragment && p.context != nil {
2240 return p.context
2241 }
2242 return p.oe.top()
2243}
2244
2245// Section 12.2.6.
2246func (p *parser) inForeignContent() bool {
2247 if len(p.oe) == 0 {
2248 return false
2249 }
2250 n := p.adjustedCurrentNode()
2251 if n.Namespace == "" {
2252 return false
2253 }
2254 if mathMLTextIntegrationPoint(n) {
2255 if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark {
2256 return false
2257 }
2258 if p.tok.Type == TextToken {
2259 return false
2260 }
2261 }
2262 if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg {
2263 return false
2264 }
2265 if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) {
2266 return false
2267 }
2268 if p.tok.Type == ErrorToken {
2269 return false
2270 }
2271 return true
2272}
2273
2274// parseImpliedToken parses a token as though it had appeared in the parser's
2275// input.
2276func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) {
2277 realToken, selfClosing := p.tok, p.hasSelfClosingToken
2278 p.tok = Token{
2279 Type: t,
2280 DataAtom: dataAtom,
2281 Data: data,
2282 }
2283 p.hasSelfClosingToken = false
2284 p.parseCurrentToken()
2285 p.tok, p.hasSelfClosingToken = realToken, selfClosing
2286}
2287
2288// parseCurrentToken runs the current token through the parsing routines
2289// until it is consumed.
2290func (p *parser) parseCurrentToken() {
2291 if p.tok.Type == SelfClosingTagToken {
2292 p.hasSelfClosingToken = true
2293 p.tok.Type = StartTagToken
2294 }
2295
2296 consumed := false
2297 for !consumed {
2298 if p.inForeignContent() {
2299 consumed = parseForeignContent(p)
2300 } else {
2301 consumed = p.im(p)
2302 }
2303 }
2304
2305 if p.hasSelfClosingToken {
2306 // This is a parse error, but ignore it.
2307 p.hasSelfClosingToken = false
2308 }
2309}
2310
2311func (p *parser) parse() error {
2312 // Iterate until EOF. Any other error will cause an early return.
2313 var err error
2314 for err != io.EOF {
2315 // CDATA sections are allowed only in foreign content.
2316 n := p.oe.top()
2317 p.tokenizer.AllowCDATA(n != nil && n.Namespace != "")
2318 // Read and parse the next token.
2319 p.tokenizer.Next()
2320 p.tok = p.tokenizer.Token()
2321 if p.tok.Type == ErrorToken {
2322 err = p.tokenizer.Err()
2323 if err != nil && err != io.EOF {
2324 return err
2325 }
2326 }
2327 p.parseCurrentToken()
2328 }
2329 return nil
2330}
2331
2332// Parse returns the parse tree for the HTML from the given Reader.
2333//
2334// It implements the HTML5 parsing algorithm
2335// (https://html.spec.whatwg.org/multipage/syntax.html#tree-construction),
2336// which is very complicated. The resultant tree can contain implicitly created
2337// nodes that have no explicit <tag> listed in r's data, and nodes' parents can
2338// differ from the nesting implied by a naive processing of start and end
2339// <tag>s. Conversely, explicit <tag>s in r's data can be silently dropped,
2340// with no corresponding node in the resulting tree.
2341//
2342// The input is assumed to be UTF-8 encoded.
2343func Parse(r io.Reader) (*Node, error) {
2344 return ParseWithOptions(r)
2345}
2346
2347// ParseFragment parses a fragment of HTML and returns the nodes that were
2348// found. If the fragment is the InnerHTML for an existing element, pass that
2349// element in context.
2350//
2351// It has the same intricacies as Parse.
2352func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
2353 return ParseFragmentWithOptions(r, context)
2354}
2355
2356// ParseOption configures a parser.
2357type ParseOption func(p *parser)
2358
2359// ParseOptionEnableScripting configures the scripting flag.
2360// https://html.spec.whatwg.org/multipage/webappapis.html#enabling-and-disabling-scripting
2361//
2362// By default, scripting is enabled.
2363func ParseOptionEnableScripting(enable bool) ParseOption {
2364 return func(p *parser) {
2365 p.scripting = enable
2366 }
2367}
2368
2369// ParseWithOptions is like Parse, with options.
2370func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error) {
2371 p := &parser{
2372 tokenizer: NewTokenizer(r),
2373 doc: &Node{
2374 Type: DocumentNode,
2375 },
2376 scripting: true,
2377 framesetOK: true,
2378 im: initialIM,
2379 }
2380
2381 for _, f := range opts {
2382 f(p)
2383 }
2384
2385 if err := p.parse(); err != nil {
2386 return nil, err
2387 }
2388 return p.doc, nil
2389}
2390
2391// ParseFragmentWithOptions is like ParseFragment, with options.
2392func ParseFragmentWithOptions(r io.Reader, context *Node, opts ...ParseOption) ([]*Node, error) {
2393 contextTag := ""
2394 if context != nil {
2395 if context.Type != ElementNode {
2396 return nil, errors.New("html: ParseFragment of non-element Node")
2397 }
2398 // The next check isn't just context.DataAtom.String() == context.Data because
2399 // it is valid to pass an element whose tag isn't a known atom. For example,
2400 // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent.
2401 if context.DataAtom != a.Lookup([]byte(context.Data)) {
2402 return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data)
2403 }
2404 contextTag = context.DataAtom.String()
2405 }
2406 p := &parser{
2407 doc: &Node{
2408 Type: DocumentNode,
2409 },
2410 scripting: true,
2411 fragment: true,
2412 context: context,
2413 }
2414 if context != nil && context.Namespace != "" {
2415 p.tokenizer = NewTokenizer(r)
2416 } else {
2417 p.tokenizer = NewTokenizerFragment(r, contextTag)
2418 }
2419
2420 for _, f := range opts {
2421 f(p)
2422 }
2423
2424 root := &Node{
2425 Type: ElementNode,
2426 DataAtom: a.Html,
2427 Data: a.Html.String(),
2428 }
2429 p.doc.AppendChild(root)
2430 p.oe = nodeStack{root}
2431 if context != nil && context.DataAtom == a.Template {
2432 p.templateStack = append(p.templateStack, inTemplateIM)
2433 }
2434 p.resetInsertionMode()
2435
2436 for n := context; n != nil; n = n.Parent {
2437 if n.Type == ElementNode && n.DataAtom == a.Form {
2438 p.form = n
2439 break
2440 }
2441 }
2442
2443 if err := p.parse(); err != nil {
2444 return nil, err
2445 }
2446
2447 parent := p.doc
2448 if context != nil {
2449 parent = root
2450 }
2451
2452 var result []*Node
2453 for c := parent.FirstChild; c != nil; {
2454 next := c.NextSibling
2455 parent.RemoveChild(c)
2456 result = append(result, c)
2457 c = next
2458 }
2459 return result, nil
2460}
Note: See TracBrowser for help on using the repository browser.