Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: code/trunk/vendor/golang.org/x/net/html/token.go@ 145

Last change on this file since 145 was 145, checked in by Izuru Yakumo, 22 months ago

Updated the Makefile and vendored dependencies

Signed-off-by: Izuru Yakumo <yakumo.izuru@…>

File size: 30.7 KB

Line
1	// Copyright 2010 The Go Authors. All rights reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	package html
6
7	import (
8	"bytes"
9	"errors"
10	"io"
11	"strconv"
12	"strings"
13
14	"golang.org/x/net/html/atom"
15	)
16
// A TokenType is the type of a Token. It is an unsigned 32-bit enumeration
// whose named values are ErrorToken through DoctypeToken.
type TokenType uint32
19
// The TokenType values. ErrorToken is the zero value; the others follow in
// declaration order via iota.
const (
	// ErrorToken means that an error occurred during tokenization.
	ErrorToken TokenType = iota
	// TextToken means a text node.
	TextToken
	// A StartTagToken looks like <a>.
	StartTagToken
	// An EndTagToken looks like </a>.
	EndTagToken
	// A SelfClosingTagToken tag looks like <br/>.
	SelfClosingTagToken
	// A CommentToken looks like <!--x-->.
	CommentToken
	// A DoctypeToken looks like <!DOCTYPE x>.
	DoctypeToken
)
36
// ErrBufferExceeded means that the buffering limit was exceeded. The
// tokenizer records it as its error when a positive maximum buffer size is
// configured and the raw bytes of a single token reach that size.
var ErrBufferExceeded = errors.New("max buffer exceeded")
39
40	// String returns a string representation of the TokenType.
41	func (t TokenType) String() string {
42	switch t {
43	case ErrorToken:
44	return "Error"
45	case TextToken:
46	return "Text"
47	case StartTagToken:
48	return "StartTag"
49	case EndTagToken:
50	return "EndTag"
51	case SelfClosingTagToken:
52	return "SelfClosingTag"
53	case CommentToken:
54	return "Comment"
55	case DoctypeToken:
56	return "Doctype"
57	}
58	return "Invalid(" + strconv.Itoa(int(t)) + ")"
59	}
60
// An Attribute is an attribute namespace-key-value triple. Namespace is
// non-empty for foreign attributes like xlink, Key is alphabetic (and hence
// does not contain escapable characters like '&', '<' or '>'), and Val is
// unescaped (it looks like "a<b" rather than "a&lt;b").
//
// Namespace is only used by the parser, not the tokenizer.
type Attribute struct {
	Namespace, Key, Val string
}
70
// A Token consists of a TokenType and some Data (tag name for start and end
// tags, content for text, comments and doctypes). A tag Token may also contain
// a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b"
// rather than "a&lt;b"). For tag Tokens, DataAtom is the atom for Data, or
// zero if Data is not a known tag name.
type Token struct {
	Type     TokenType
	DataAtom atom.Atom
	Data     string
	Attr     []Attribute
}
82
83	// tagString returns a string representation of a tag Token's Data and Attr.
84	func (t Token) tagString() string {
85	if len(t.Attr) == 0 {
86	return t.Data
87	}
88	buf := bytes.NewBufferString(t.Data)
89	for _, a := range t.Attr {
90	buf.WriteByte(' ')
91	buf.WriteString(a.Key)
92	buf.WriteString(`="`)
93	escape(buf, a.Val)
94	buf.WriteByte('"')
95	}
96	return buf.String()
97	}
98
99	// String returns a string representation of the Token.
100	func (t Token) String() string {
101	switch t.Type {
102	case ErrorToken:
103	return ""
104	case TextToken:
105	return EscapeString(t.Data)
106	case StartTagToken:
107	return "<" + t.tagString() + ">"
108	case EndTagToken:
109	return "</" + t.tagString() + ">"
110	case SelfClosingTagToken:
111	return "<" + t.tagString() + "/>"
112	case CommentToken:
113	return "<!--" + EscapeString(t.Data) + "-->"
114	case DoctypeToken:
115	return "<!DOCTYPE " + EscapeString(t.Data) + ">"
116	}
117	return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
118	}
119
// span is a range of bytes in a Tokenizer's buffer. The start is inclusive,
// the end is exclusive, so the bytes it denotes are buf[start:end].
type span struct {
	start, end int
}
125
// A Tokenizer returns a stream of HTML Tokens. It reads from an io.Reader
// into an internal buffer and describes the current token as spans (index
// ranges) into that buffer.
type Tokenizer struct {
	// r is the source of the HTML text.
	r io.Reader
	// tt is the TokenType of the current token.
	tt TokenType
	// err is the first error encountered during tokenization. It is possible
	// for tt != ErrorToken && err != nil to hold: this means that Next returned a
	// valid token but the subsequent Next call will return an error token.
	// For example, if the HTML text input was just "plain", then the first
	// Next call would set z.err to io.EOF but return a TextToken, and all
	// subsequent Next calls would return an ErrorToken.
	// err is never reset. Once it becomes non-nil, it stays non-nil.
	err error
	// readErr is the error returned by the io.Reader r. It is separate from
	// err because it is valid for an io.Reader to return (n int, err1 error)
	// such that n > 0 && err1 != nil, and callers should always process the
	// n > 0 bytes before considering the error err1.
	readErr error
	// buf[raw.start:raw.end] holds the raw bytes of the current token.
	// buf[raw.end:] is buffered input that will yield future tokens.
	raw span
	buf []byte
	// maxBuf limits the data buffered in buf. A value of 0 means unlimited.
	maxBuf int
	// buf[data.start:data.end] holds the raw bytes of the current token's data:
	// a text token's text, a tag token's tag name, etc.
	data span
	// pendingAttr is the attribute key and value currently being tokenized.
	// When complete, pendingAttr is pushed onto attr. nAttrReturned is
	// incremented on each call to TagAttr.
	pendingAttr   [2]span
	attr          [][2]span
	nAttrReturned int
	// rawTag is the "script" in "</script>" that closes the next token. If
	// non-empty, the subsequent call to Next will return a raw or RCDATA text
	// token: one that treats "<p>" as text instead of an element.
	// rawTag's contents are lower-cased.
	rawTag string
	// textIsRaw is whether the current text token's data is not escaped.
	textIsRaw bool
	// convertNUL is whether NUL bytes in the current token's data should
	// be converted into \ufffd replacement characters.
	convertNUL bool
	// allowCDATA is whether CDATA sections are allowed in the current context.
	allowCDATA bool
}
173
// AllowCDATA sets whether or not the tokenizer recognizes <![CDATA[foo]]> as
// the text "foo". The default value is false, which means to recognize it as
// a bogus comment "<!-- [CDATA[foo]] -->" instead.
//
// Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and
// only if tokenizing foreign content, such as MathML and SVG. However,
// tracking foreign-contentness is difficult to do purely in the tokenizer,
// as opposed to the parser, due to HTML integration points: an <svg> element
// can contain a <foreignObject> that is foreign-to-SVG but not foreign-to-
// HTML. For strict compliance with the HTML5 tokenization algorithm, it is the
// responsibility of the user of a tokenizer to call AllowCDATA as appropriate.
// In practice, if using the tokenizer without caring whether MathML or SVG
// CDATA is text or comments, such as tokenizing HTML to find all the anchor
// text, it is acceptable to ignore this responsibility.
//
// The setting stays in effect until AllowCDATA is called again.
func (z *Tokenizer) AllowCDATA(allowCDATA bool) {
	z.allowCDATA = allowCDATA
}
191
// NextIsNotRawText instructs the tokenizer that the next token should not be
// considered as 'raw text'. Some elements, such as script and title elements,
// normally require the next token after the opening tag to be 'raw text' that
// has no child elements. For example, tokenizing "<title>a<b>c</b>d</title>"
// yields a start tag token for "<title>", a text token for "a<b>c</b>d", and
// an end tag token for "</title>". There are no distinct start tag or end tag
// tokens for the "<b>" and "</b>".
//
// This tokenizer implementation will generally look for raw text at the right
// times. Strictly speaking, an HTML5 compliant tokenizer should not look for
// raw text if in foreign content: <title> generally needs raw text, but a
// <title> inside an <svg> does not. Another example is that a <textarea>
// generally needs raw text, but a <textarea> is not allowed as an immediate
// child of a <select>; in normal parsing, a <textarea> implies </select>, but
// one cannot close the implicit element when parsing a <select>'s InnerHTML.
// Similarly to AllowCDATA, tracking the correct moment to override raw-text-
// ness is difficult to do purely in the tokenizer, as opposed to the parser.
// For strict compliance with the HTML5 tokenization algorithm, it is the
// responsibility of the user of a tokenizer to call NextIsNotRawText as
// appropriate. In practice, like AllowCDATA, it is acceptable to ignore this
// responsibility for basic usage.
//
// Note that this 'raw text' concept is different from the one offered by the
// Tokenizer.Raw method.
func (z *Tokenizer) NextIsNotRawText() {
	// Clearing rawTag is what makes the next call to Next skip the
	// raw/RCDATA reading path.
	z.rawTag = ""
}
219
220	// Err returns the error associated with the most recent ErrorToken token.
221	// This is typically io.EOF, meaning the end of tokenization.
222	func (z *Tokenizer) Err() error {
223	if z.tt != ErrorToken {
224	return nil
225	}
226	return z.err
227	}
228
// readByte returns the next byte from the input stream, doing a buffered read
// from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a contiguous byte
// slice that holds all the bytes read so far for the current token.
// It sets z.err if the underlying reader returns an error, and sets it to
// ErrBufferExceeded when z.maxBuf is positive and the current token's raw
// length reaches it. Whenever it sets z.err it returns 0.
// Pre-condition: z.err == nil.
func (z *Tokenizer) readByte() byte {
	if z.raw.end >= len(z.buf) {
		// Our buffer is exhausted and we have to read from z.r. Check if the
		// previous read resulted in an error.
		if z.readErr != nil {
			z.err = z.readErr
			return 0
		}
		// We copy z.buf[z.raw.start:z.raw.end] to the beginning of z.buf. If the length
		// z.raw.end - z.raw.start is more than half the capacity of z.buf, then we
		// allocate a new buffer before the copy.
		c := cap(z.buf)
		d := z.raw.end - z.raw.start
		var buf1 []byte
		if 2*d > c {
			// The new buffer has twice the old capacity.
			buf1 = make([]byte, d, 2*c)
		} else {
			buf1 = z.buf[:d]
		}
		copy(buf1, z.buf[z.raw.start:z.raw.end])
		if x := z.raw.start; x != 0 {
			// Adjust the data/attr spans to refer to the same contents after the copy.
			z.data.start -= x
			z.data.end -= x
			z.pendingAttr[0].start -= x
			z.pendingAttr[0].end -= x
			z.pendingAttr[1].start -= x
			z.pendingAttr[1].end -= x
			for i := range z.attr {
				z.attr[i][0].start -= x
				z.attr[i][0].end -= x
				z.attr[i][1].start -= x
				z.attr[i][1].end -= x
			}
		}
		z.raw.start, z.raw.end, z.buf = 0, d, buf1[:d]
		// Now that we have copied the live bytes to the start of the buffer,
		// we read from z.r into the remainder.
		var n int
		n, z.readErr = readAtLeastOneByte(z.r, buf1[d:cap(buf1)])
		if n == 0 {
			z.err = z.readErr
			return 0
		}
		// n > 0: keep the bytes even if readErr is set; the error is
		// surfaced on the next refill (see the check at the top).
		z.buf = buf1[:d+n]
	}
	x := z.buf[z.raw.end]
	z.raw.end++
	// The byte has already been consumed into the token when the limit
	// check fires.
	if z.maxBuf > 0 && z.raw.end-z.raw.start >= z.maxBuf {
		z.err = ErrBufferExceeded
		return 0
	}
	return x
}
288
// Buffered returns a slice containing data buffered but not yet tokenized.
// The slice is a view into the tokenizer's internal buffer (everything after
// the end of the current token), not a copy.
func (z *Tokenizer) Buffered() []byte {
	return z.buf[z.raw.end:]
}
293
// readAtLeastOneByte wraps an io.Reader so that reading cannot return (0, nil).
// It calls r.Read(b) until a call yields data or an error and returns that
// call's results unchanged. It returns io.ErrNoProgress if the underlying
// r.Read method returns (0, nil) 100 times in succession.
func readAtLeastOneByte(r io.Reader, b []byte) (int, error) {
	const maxAttempts = 100
	for attempt := 0; attempt < maxAttempts; attempt++ {
		n, err := r.Read(b)
		if n != 0 || err != nil {
			return n, err
		}
	}
	return 0, io.ErrNoProgress
}
305
306	// skipWhiteSpace skips past any white space.
307	func (z *Tokenizer) skipWhiteSpace() {
308	if z.err != nil {
309	return
310	}
311	for {
312	c := z.readByte()
313	if z.err != nil {
314	return
315	}
316	switch c {
317	case ' ', '\n', '\r', '\t', '\f':
318	// No-op.
319	default:
320	z.raw.end--
321	return
322	}
323	}
324	}
325
326	// readRawOrRCDATA reads until the next "</foo>", where "foo" is z.rawTag and
327	// is typically something like "script" or "textarea".
328	func (z *Tokenizer) readRawOrRCDATA() {
329	if z.rawTag == "script" {
330	z.readScript()
331	z.textIsRaw = true
332	z.rawTag = ""
333	return
334	}
335	loop:
336	for {
337	c := z.readByte()
338	if z.err != nil {
339	break loop
340	}
341	if c != '<' {
342	continue loop
343	}
344	c = z.readByte()
345	if z.err != nil {
346	break loop
347	}
348	if c != '/' {
349	z.raw.end--
350	continue loop
351	}
352	if z.readRawEndTag() \|\| z.err != nil {
353	break loop
354	}
355	}
356	z.data.end = z.raw.end
357	// A textarea's or title's RCDATA can contain escaped entities.
358	z.textIsRaw = z.rawTag != "textarea" && z.rawTag != "title"
359	z.rawTag = ""
360	}
361
362	// readRawEndTag attempts to read a tag like "</foo>", where "foo" is z.rawTag.
363	// If it succeeds, it backs up the input position to reconsume the tag and
364	// returns true. Otherwise it returns false. The opening "</" has already been
365	// consumed.
366	func (z *Tokenizer) readRawEndTag() bool {
367	for i := 0; i < len(z.rawTag); i++ {
368	c := z.readByte()
369	if z.err != nil {
370	return false
371	}
372	if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') {
373	z.raw.end--
374	return false
375	}
376	}
377	c := z.readByte()
378	if z.err != nil {
379	return false
380	}
381	switch c {
382	case ' ', '\n', '\r', '\t', '\f', '/', '>':
383	// The 3 is 2 for the leading "</" plus 1 for the trailing character c.
384	z.raw.end -= 3 + len(z.rawTag)
385	return true
386	}
387	z.raw.end--
388	return false
389	}
390
391	// readScript reads until the next </script> tag, following the byzantine
392	// rules for escaping/hiding the closing tag.
393	func (z *Tokenizer) readScript() {
394	defer func() {
395	z.data.end = z.raw.end
396	}()
397	var c byte
398
399	scriptData:
400	c = z.readByte()
401	if z.err != nil {
402	return
403	}
404	if c == '<' {
405	goto scriptDataLessThanSign
406	}
407	goto scriptData
408
409	scriptDataLessThanSign:
410	c = z.readByte()
411	if z.err != nil {
412	return
413	}
414	switch c {
415	case '/':
416	goto scriptDataEndTagOpen
417	case '!':
418	goto scriptDataEscapeStart
419	}
420	z.raw.end--
421	goto scriptData
422
423	scriptDataEndTagOpen:
424	if z.readRawEndTag() \|\| z.err != nil {
425	return
426	}
427	goto scriptData
428
429	scriptDataEscapeStart:
430	c = z.readByte()
431	if z.err != nil {
432	return
433	}
434	if c == '-' {
435	goto scriptDataEscapeStartDash
436	}
437	z.raw.end--
438	goto scriptData
439
440	scriptDataEscapeStartDash:
441	c = z.readByte()
442	if z.err != nil {
443	return
444	}
445	if c == '-' {
446	goto scriptDataEscapedDashDash
447	}
448	z.raw.end--
449	goto scriptData
450
451	scriptDataEscaped:
452	c = z.readByte()
453	if z.err != nil {
454	return
455	}
456	switch c {
457	case '-':
458	goto scriptDataEscapedDash
459	case '<':
460	goto scriptDataEscapedLessThanSign
461	}
462	goto scriptDataEscaped
463
464	scriptDataEscapedDash:
465	c = z.readByte()
466	if z.err != nil {
467	return
468	}
469	switch c {
470	case '-':
471	goto scriptDataEscapedDashDash
472	case '<':
473	goto scriptDataEscapedLessThanSign
474	}
475	goto scriptDataEscaped
476
477	scriptDataEscapedDashDash:
478	c = z.readByte()
479	if z.err != nil {
480	return
481	}
482	switch c {
483	case '-':
484	goto scriptDataEscapedDashDash
485	case '<':
486	goto scriptDataEscapedLessThanSign
487	case '>':
488	goto scriptData
489	}
490	goto scriptDataEscaped
491
492	scriptDataEscapedLessThanSign:
493	c = z.readByte()
494	if z.err != nil {
495	return
496	}
497	if c == '/' {
498	goto scriptDataEscapedEndTagOpen
499	}
500	if 'a' <= c && c <= 'z' \|\| 'A' <= c && c <= 'Z' {
501	goto scriptDataDoubleEscapeStart
502	}
503	z.raw.end--
504	goto scriptData
505
506	scriptDataEscapedEndTagOpen:
507	if z.readRawEndTag() \|\| z.err != nil {
508	return
509	}
510	goto scriptDataEscaped
511
512	scriptDataDoubleEscapeStart:
513	z.raw.end--
514	for i := 0; i < len("script"); i++ {
515	c = z.readByte()
516	if z.err != nil {
517	return
518	}
519	if c != "script"[i] && c != "SCRIPT"[i] {
520	z.raw.end--
521	goto scriptDataEscaped
522	}
523	}
524	c = z.readByte()
525	if z.err != nil {
526	return
527	}
528	switch c {
529	case ' ', '\n', '\r', '\t', '\f', '/', '>':
530	goto scriptDataDoubleEscaped
531	}
532	z.raw.end--
533	goto scriptDataEscaped
534
535	scriptDataDoubleEscaped:
536	c = z.readByte()
537	if z.err != nil {
538	return
539	}
540	switch c {
541	case '-':
542	goto scriptDataDoubleEscapedDash
543	case '<':
544	goto scriptDataDoubleEscapedLessThanSign
545	}
546	goto scriptDataDoubleEscaped
547
548	scriptDataDoubleEscapedDash:
549	c = z.readByte()
550	if z.err != nil {
551	return
552	}
553	switch c {
554	case '-':
555	goto scriptDataDoubleEscapedDashDash
556	case '<':
557	goto scriptDataDoubleEscapedLessThanSign
558	}
559	goto scriptDataDoubleEscaped
560
561	scriptDataDoubleEscapedDashDash:
562	c = z.readByte()
563	if z.err != nil {
564	return
565	}
566	switch c {
567	case '-':
568	goto scriptDataDoubleEscapedDashDash
569	case '<':
570	goto scriptDataDoubleEscapedLessThanSign
571	case '>':
572	goto scriptData
573	}
574	goto scriptDataDoubleEscaped
575
576	scriptDataDoubleEscapedLessThanSign:
577	c = z.readByte()
578	if z.err != nil {
579	return
580	}
581	if c == '/' {
582	goto scriptDataDoubleEscapeEnd
583	}
584	z.raw.end--
585	goto scriptDataDoubleEscaped
586
587	scriptDataDoubleEscapeEnd:
588	if z.readRawEndTag() {
589	z.raw.end += len("</script>")
590	goto scriptDataEscaped
591	}
592	if z.err != nil {
593	return
594	}
595	goto scriptDataDoubleEscaped
596	}
597
598	// readComment reads the next comment token starting with "<!--". The opening
599	// "<!--" has already been consumed.
600	func (z *Tokenizer) readComment() {
601	// When modifying this function, consider manually increasing the suffixLen
602	// constant in func TestComments, from 6 to e.g. 9 or more. That increase
603	// should only be temporary, not committed, as it exponentially affects the
604	// test running time.
605
606	z.data.start = z.raw.end
607	defer func() {
608	if z.data.end < z.data.start {
609	// It's a comment with no data, like <!-->.
610	z.data.end = z.data.start
611	}
612	}()
613
614	var dashCount int
615	beginning := true
616	for {
617	c := z.readByte()
618	if z.err != nil {
619	z.data.end = z.calculateAbruptCommentDataEnd()
620	return
621	}
622	switch c {
623	case '-':
624	dashCount++
625	continue
626	case '>':
627	if dashCount >= 2 \|\| beginning {
628	z.data.end = z.raw.end - len("-->")
629	return
630	}
631	case '!':
632	if dashCount >= 2 {
633	c = z.readByte()
634	if z.err != nil {
635	z.data.end = z.calculateAbruptCommentDataEnd()
636	return
637	} else if c == '>' {
638	z.data.end = z.raw.end - len("--!>")
639	return
640	} else if c == '-' {
641	dashCount = 1
642	beginning = false
643	continue
644	}
645	}
646	}
647	dashCount = 0
648	beginning = false
649	}
650	}
651
652	func (z *Tokenizer) calculateAbruptCommentDataEnd() int {
653	raw := z.Raw()
654	const prefixLen = len("<!--")
655	if len(raw) >= prefixLen {
656	raw = raw[prefixLen:]
657	if hasSuffix(raw, "--!") {
658	return z.raw.end - 3
659	} else if hasSuffix(raw, "--") {
660	return z.raw.end - 2
661	} else if hasSuffix(raw, "-") {
662	return z.raw.end - 1
663	}
664	}
665	return z.raw.end
666	}
667
// hasSuffix reports whether the byte slice b ends with the bytes of suffix.
// An empty suffix matches any b.
func hasSuffix(b []byte, suffix string) bool {
	offset := len(b) - len(suffix)
	if offset < 0 {
		return false
	}
	return string(b[offset:]) == suffix
}
680
681	// readUntilCloseAngle reads until the next ">".
682	func (z *Tokenizer) readUntilCloseAngle() {
683	z.data.start = z.raw.end
684	for {
685	c := z.readByte()
686	if z.err != nil {
687	z.data.end = z.raw.end
688	return
689	}
690	if c == '>' {
691	z.data.end = z.raw.end - len(">")
692	return
693	}
694	}
695	}
696
697	// readMarkupDeclaration reads the next token starting with "<!". It might be
698	// a "<!--comment-->", a "<!DOCTYPE foo>", a "<![CDATA[section]]>" or
699	// "<!a bogus comment". The opening "<!" has already been consumed.
700	func (z *Tokenizer) readMarkupDeclaration() TokenType {
701	z.data.start = z.raw.end
702	var c [2]byte
703	for i := 0; i < 2; i++ {
704	c[i] = z.readByte()
705	if z.err != nil {
706	z.data.end = z.raw.end
707	return CommentToken
708	}
709	}
710	if c[0] == '-' && c[1] == '-' {
711	z.readComment()
712	return CommentToken
713	}
714	z.raw.end -= 2
715	if z.readDoctype() {
716	return DoctypeToken
717	}
718	if z.allowCDATA && z.readCDATA() {
719	z.convertNUL = true
720	return TextToken
721	}
722	// It's a bogus comment.
723	z.readUntilCloseAngle()
724	return CommentToken
725	}
726
727	// readDoctype attempts to read a doctype declaration and returns true if
728	// successful. The opening "<!" has already been consumed.
729	func (z *Tokenizer) readDoctype() bool {
730	const s = "DOCTYPE"
731	for i := 0; i < len(s); i++ {
732	c := z.readByte()
733	if z.err != nil {
734	z.data.end = z.raw.end
735	return false
736	}
737	if c != s[i] && c != s[i]+('a'-'A') {
738	// Back up to read the fragment of "DOCTYPE" again.
739	z.raw.end = z.data.start
740	return false
741	}
742	}
743	if z.skipWhiteSpace(); z.err != nil {
744	z.data.start = z.raw.end
745	z.data.end = z.raw.end
746	return true
747	}
748	z.readUntilCloseAngle()
749	return true
750	}
751
752	// readCDATA attempts to read a CDATA section and returns true if
753	// successful. The opening "<!" has already been consumed.
754	func (z *Tokenizer) readCDATA() bool {
755	const s = "[CDATA["
756	for i := 0; i < len(s); i++ {
757	c := z.readByte()
758	if z.err != nil {
759	z.data.end = z.raw.end
760	return false
761	}
762	if c != s[i] {
763	// Back up to read the fragment of "[CDATA[" again.
764	z.raw.end = z.data.start
765	return false
766	}
767	}
768	z.data.start = z.raw.end
769	brackets := 0
770	for {
771	c := z.readByte()
772	if z.err != nil {
773	z.data.end = z.raw.end
774	return true
775	}
776	switch c {
777	case ']':
778	brackets++
779	case '>':
780	if brackets >= 2 {
781	z.data.end = z.raw.end - len("]]>")
782	return true
783	}
784	brackets = 0
785	default:
786	brackets = 0
787	}
788	}
789	}
790
791	// startTagIn returns whether the start tag in z.buf[z.data.start:z.data.end]
792	// case-insensitively matches any element of ss.
793	func (z *Tokenizer) startTagIn(ss ...string) bool {
794	loop:
795	for _, s := range ss {
796	if z.data.end-z.data.start != len(s) {
797	continue loop
798	}
799	for i := 0; i < len(s); i++ {
800	c := z.buf[z.data.start+i]
801	if 'A' <= c && c <= 'Z' {
802	c += 'a' - 'A'
803	}
804	if c != s[i] {
805	continue loop
806	}
807	}
808	return true
809	}
810	return false
811	}
812
813	// readStartTag reads the next start tag token. The opening "<a" has already
814	// been consumed, where 'a' means anything in [A-Za-z].
815	func (z *Tokenizer) readStartTag() TokenType {
816	z.readTag(true)
817	if z.err != nil {
818	return ErrorToken
819	}
820	// Several tags flag the tokenizer's next token as raw.
821	c, raw := z.buf[z.data.start], false
822	if 'A' <= c && c <= 'Z' {
823	c += 'a' - 'A'
824	}
825	switch c {
826	case 'i':
827	raw = z.startTagIn("iframe")
828	case 'n':
829	raw = z.startTagIn("noembed", "noframes", "noscript")
830	case 'p':
831	raw = z.startTagIn("plaintext")
832	case 's':
833	raw = z.startTagIn("script", "style")
834	case 't':
835	raw = z.startTagIn("textarea", "title")
836	case 'x':
837	raw = z.startTagIn("xmp")
838	}
839	if raw {
840	z.rawTag = strings.ToLower(string(z.buf[z.data.start:z.data.end]))
841	}
842	// Look for a self-closing token like "<br/>".
843	if z.err == nil && z.buf[z.raw.end-2] == '/' {
844	return SelfClosingTagToken
845	}
846	return StartTagToken
847	}
848
849	// readTag reads the next tag token and its attributes. If saveAttr, those
850	// attributes are saved in z.attr, otherwise z.attr is set to an empty slice.
851	// The opening "<a" or "</a" has already been consumed, where 'a' means anything
852	// in [A-Za-z].
853	func (z *Tokenizer) readTag(saveAttr bool) {
854	z.attr = z.attr[:0]
855	z.nAttrReturned = 0
856	// Read the tag name and attribute key/value pairs.
857	z.readTagName()
858	if z.skipWhiteSpace(); z.err != nil {
859	return
860	}
861	for {
862	c := z.readByte()
863	if z.err != nil \|\| c == '>' {
864	break
865	}
866	z.raw.end--
867	z.readTagAttrKey()
868	z.readTagAttrVal()
869	// Save pendingAttr if saveAttr and that attribute has a non-empty key.
870	if saveAttr && z.pendingAttr[0].start != z.pendingAttr[0].end {
871	z.attr = append(z.attr, z.pendingAttr)
872	}
873	if z.skipWhiteSpace(); z.err != nil {
874	break
875	}
876	}
877	}
878
879	// readTagName sets z.data to the "div" in "<div k=v>". The reader (z.raw.end)
880	// is positioned such that the first byte of the tag name (the "d" in "<div")
881	// has already been consumed.
882	func (z *Tokenizer) readTagName() {
883	z.data.start = z.raw.end - 1
884	for {
885	c := z.readByte()
886	if z.err != nil {
887	z.data.end = z.raw.end
888	return
889	}
890	switch c {
891	case ' ', '\n', '\r', '\t', '\f':
892	z.data.end = z.raw.end - 1
893	return
894	case '/', '>':
895	z.raw.end--
896	z.data.end = z.raw.end
897	return
898	}
899	}
900	}
901
902	// readTagAttrKey sets z.pendingAttr[0] to the "k" in "<div k=v>".
903	// Precondition: z.err == nil.
904	func (z *Tokenizer) readTagAttrKey() {
905	z.pendingAttr[0].start = z.raw.end
906	for {
907	c := z.readByte()
908	if z.err != nil {
909	z.pendingAttr[0].end = z.raw.end
910	return
911	}
912	switch c {
913	case ' ', '\n', '\r', '\t', '\f', '/':
914	z.pendingAttr[0].end = z.raw.end - 1
915	return
916	case '=', '>':
917	z.raw.end--
918	z.pendingAttr[0].end = z.raw.end
919	return
920	}
921	}
922	}
923
924	// readTagAttrVal sets z.pendingAttr[1] to the "v" in "<div k=v>".
925	func (z *Tokenizer) readTagAttrVal() {
926	z.pendingAttr[1].start = z.raw.end
927	z.pendingAttr[1].end = z.raw.end
928	if z.skipWhiteSpace(); z.err != nil {
929	return
930	}
931	c := z.readByte()
932	if z.err != nil {
933	return
934	}
935	if c != '=' {
936	z.raw.end--
937	return
938	}
939	if z.skipWhiteSpace(); z.err != nil {
940	return
941	}
942	quote := z.readByte()
943	if z.err != nil {
944	return
945	}
946	switch quote {
947	case '>':
948	z.raw.end--
949	return
950
951	case '\'', '"':
952	z.pendingAttr[1].start = z.raw.end
953	for {
954	c := z.readByte()
955	if z.err != nil {
956	z.pendingAttr[1].end = z.raw.end
957	return
958	}
959	if c == quote {
960	z.pendingAttr[1].end = z.raw.end - 1
961	return
962	}
963	}
964
965	default:
966	z.pendingAttr[1].start = z.raw.end - 1
967	for {
968	c := z.readByte()
969	if z.err != nil {
970	z.pendingAttr[1].end = z.raw.end
971	return
972	}
973	switch c {
974	case ' ', '\n', '\r', '\t', '\f':
975	z.pendingAttr[1].end = z.raw.end - 1
976	return
977	case '>':
978	z.raw.end--
979	z.pendingAttr[1].end = z.raw.end
980	return
981	}
982	}
983	}
984	}
985
986	// Next scans the next token and returns its type.
987	func (z *Tokenizer) Next() TokenType {
988	z.raw.start = z.raw.end
989	z.data.start = z.raw.end
990	z.data.end = z.raw.end
991	if z.err != nil {
992	z.tt = ErrorToken
993	return z.tt
994	}
995	if z.rawTag != "" {
996	if z.rawTag == "plaintext" {
997	// Read everything up to EOF.
998	for z.err == nil {
999	z.readByte()
1000	}
1001	z.data.end = z.raw.end
1002	z.textIsRaw = true
1003	} else {
1004	z.readRawOrRCDATA()
1005	}
1006	if z.data.end > z.data.start {
1007	z.tt = TextToken
1008	z.convertNUL = true
1009	return z.tt
1010	}
1011	}
1012	z.textIsRaw = false
1013	z.convertNUL = false
1014
1015	loop:
1016	for {
1017	c := z.readByte()
1018	if z.err != nil {
1019	break loop
1020	}
1021	if c != '<' {
1022	continue loop
1023	}
1024
1025	// Check if the '<' we have just read is part of a tag, comment
1026	// or doctype. If not, it's part of the accumulated text token.
1027	c = z.readByte()
1028	if z.err != nil {
1029	break loop
1030	}
1031	var tokenType TokenType
1032	switch {
1033	case 'a' <= c && c <= 'z' \|\| 'A' <= c && c <= 'Z':
1034	tokenType = StartTagToken
1035	case c == '/':
1036	tokenType = EndTagToken
1037	case c == '!' \|\| c == '?':
1038	// We use CommentToken to mean any of "<!--actual comments-->",
1039	// "<!DOCTYPE declarations>" and "<?xml processing instructions?>".
1040	tokenType = CommentToken
1041	default:
1042	// Reconsume the current character.
1043	z.raw.end--
1044	continue
1045	}
1046
1047	// We have a non-text token, but we might have accumulated some text
1048	// before that. If so, we return the text first, and return the non-
1049	// text token on the subsequent call to Next.
1050	if x := z.raw.end - len("<a"); z.raw.start < x {
1051	z.raw.end = x
1052	z.data.end = x
1053	z.tt = TextToken
1054	return z.tt
1055	}
1056	switch tokenType {
1057	case StartTagToken:
1058	z.tt = z.readStartTag()
1059	return z.tt
1060	case EndTagToken:
1061	c = z.readByte()
1062	if z.err != nil {
1063	break loop
1064	}
1065	if c == '>' {
1066	// "</>" does not generate a token at all. Generate an empty comment
1067	// to allow passthrough clients to pick up the data using Raw.
1068	// Reset the tokenizer state and start again.
1069	z.tt = CommentToken
1070	return z.tt
1071	}
1072	if 'a' <= c && c <= 'z' \|\| 'A' <= c && c <= 'Z' {
1073	z.readTag(false)
1074	if z.err != nil {
1075	z.tt = ErrorToken
1076	} else {
1077	z.tt = EndTagToken
1078	}
1079	return z.tt
1080	}
1081	z.raw.end--
1082	z.readUntilCloseAngle()
1083	z.tt = CommentToken
1084	return z.tt
1085	case CommentToken:
1086	if c == '!' {
1087	z.tt = z.readMarkupDeclaration()
1088	return z.tt
1089	}
1090	z.raw.end--
1091	z.readUntilCloseAngle()
1092	z.tt = CommentToken
1093	return z.tt
1094	}
1095	}
1096	if z.raw.start < z.raw.end {
1097	z.data.end = z.raw.end
1098	z.tt = TextToken
1099	return z.tt
1100	}
1101	z.tt = ErrorToken
1102	return z.tt
1103	}
1104
// Raw returns the unmodified text of the current token. Calling Next, Token,
// Text, TagName or TagAttr may change the contents of the returned slice.
//
// The token stream's raw bytes partition the byte stream (up until an
// ErrorToken). There are no overlaps or gaps between two consecutive token's
// raw bytes. One implication is that the byte offset of the current token is
// the sum of the lengths of all previous tokens' raw bytes.
//
// The returned slice is a view into the tokenizer's internal buffer, not a
// copy.
func (z *Tokenizer) Raw() []byte {
	return z.buf[z.raw.start:z.raw.end]
}
1115
// convertNewlines converts "\r" and "\r\n" in s to "\n".
// The conversion happens in place, but the resulting slice may be shorter.
//
// Lone "\r" bytes are overwritten without moving anything. The first "\r\n"
// pair switches to a compacting copy of the rest of the slice.
func convertNewlines(s []byte) []byte {
	for i, c := range s {
		if c != '\r' {
			continue
		}

		next := i + 1
		if next >= len(s) || s[next] != '\n' {
			// A lone '\r': same length, replace in place.
			s[i] = '\n'
			continue
		}

		// Found "\r\n": from here on the output is shorter than the input,
		// so copy bytes down from read to write.
		write, read := i, next
		for read < len(s) {
			if s[read] == '\r' {
				if read+1 < len(s) && s[read+1] == '\n' {
					read++
				}
				s[write] = '\n'
			} else {
				s[write] = s[read]
			}
			read++
			write++
		}
		return s[:write]
	}
	return s
}
1147
// Byte sequences used when replacing NUL bytes in token text.
var (
	// nul is a single NUL byte, the sequence that is searched for.
	nul = []byte("\x00")
	// replacement is the UTF-8 encoding of U+FFFD, the replacement character.
	replacement = []byte("\ufffd")
)
1152
1153	// Text returns the unescaped text of a text, comment or doctype token. The
1154	// contents of the returned slice may change on the next call to Next.
1155	func (z *Tokenizer) Text() []byte {
1156	switch z.tt {
1157	case TextToken, CommentToken, DoctypeToken:
1158	s := z.buf[z.data.start:z.data.end]
1159	z.data.start = z.raw.end
1160	z.data.end = z.raw.end
1161	s = convertNewlines(s)
1162	if (z.convertNUL \|\| z.tt == CommentToken) && bytes.Contains(s, nul) {
1163	s = bytes.Replace(s, nul, replacement, -1)
1164	}
1165	if !z.textIsRaw {
1166	s = unescape(s, false)
1167	}
1168	return s
1169	}
1170	return nil
1171	}
1172
1173	// TagName returns the lower-cased name of a tag token (the `img` out of
1174	// `<IMG SRC="foo">`) and whether the tag has attributes.
1175	// The contents of the returned slice may change on the next call to Next.
1176	func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
1177	if z.data.start < z.data.end {
1178	switch z.tt {
1179	case StartTagToken, EndTagToken, SelfClosingTagToken:
1180	s := z.buf[z.data.start:z.data.end]
1181	z.data.start = z.raw.end
1182	z.data.end = z.raw.end
1183	return lower(s), z.nAttrReturned < len(z.attr)
1184	}
1185	}
1186	return nil, false
1187	}
1188
1189	// TagAttr returns the lower-cased key and unescaped value of the next unparsed
1190	// attribute for the current tag token and whether there are more attributes.
1191	// The contents of the returned slices may change on the next call to Next.
1192	func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
1193	if z.nAttrReturned < len(z.attr) {
1194	switch z.tt {
1195	case StartTagToken, SelfClosingTagToken:
1196	x := z.attr[z.nAttrReturned]
1197	z.nAttrReturned++
1198	key = z.buf[x[0].start:x[0].end]
1199	val = z.buf[x[1].start:x[1].end]
1200	return lower(key), unescape(convertNewlines(val), true), z.nAttrReturned < len(z.attr)
1201	}
1202	}
1203	return nil, nil, false
1204	}
1205
1206	// Token returns the current Token. The result's Data and Attr values remain
1207	// valid after subsequent Next calls.
1208	func (z *Tokenizer) Token() Token {
1209	t := Token{Type: z.tt}
1210	switch z.tt {
1211	case TextToken, CommentToken, DoctypeToken:
1212	t.Data = string(z.Text())
1213	case StartTagToken, SelfClosingTagToken, EndTagToken:
1214	name, moreAttr := z.TagName()
1215	for moreAttr {
1216	var key, val []byte
1217	key, val, moreAttr = z.TagAttr()
1218	t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val)})
1219	}
1220	if a := atom.Lookup(name); a != 0 {
1221	t.DataAtom, t.Data = a, a.String()
1222	} else {
1223	t.DataAtom, t.Data = 0, string(name)
1224	}
1225	}
1226	return t
1227	}
1228
// SetMaxBuf sets a limit on the amount of data buffered during tokenization.
// A value of 0 means unlimited.
//
// When the limit is reached while reading a token, the tokenizer's error
// becomes ErrBufferExceeded.
func (z *Tokenizer) SetMaxBuf(n int) {
	z.maxBuf = n
}
1234
// NewTokenizer returns a new HTML Tokenizer for the given Reader.
// The input is assumed to be UTF-8 encoded.
//
// It is equivalent to NewTokenizerFragment with an empty context tag.
func NewTokenizer(r io.Reader) *Tokenizer {
	return NewTokenizerFragment(r, "")
}
1240
1241	// NewTokenizerFragment returns a new HTML Tokenizer for the given Reader, for
1242	// tokenizing an existing element's InnerHTML fragment. contextTag is that
1243	// element's tag, such as "div" or "iframe".
1244	//
1245	// For example, how the InnerHTML "a<b" is tokenized depends on whether it is
1246	// for a <p> tag or a <script> tag.
1247	//
1248	// The input is assumed to be UTF-8 encoded.
1249	func NewTokenizerFragment(r io.Reader, contextTag string) *Tokenizer {
1250	z := &Tokenizer{
1251	r: r,
1252	buf: make([]byte, 0, 4096),
1253	}
1254	if contextTag != "" {
1255	switch s := strings.ToLower(contextTag); s {
1256	case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "title", "textarea", "xmp":
1257	z.rawTag = s
1258	}
1259	}
1260	return z
1261	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: