1 | // Copyright 2010 The Go Authors. All rights reserved.
|
---|
2 | // Use of this source code is governed by a BSD-style
|
---|
3 | // license that can be found in the LICENSE file.
|
---|
4 |
|
---|
5 | package html
|
---|
6 |
|
---|
7 | import (
|
---|
8 | "bytes"
|
---|
9 | "strings"
|
---|
10 | "unicode/utf8"
|
---|
11 | )
|
---|
12 |
|
---|
// replacementTable holds the code points that the numeric character
// references 0x80 through 0x9F decode to. Old documents wrote those
// references assuming the Windows-1252 encoding, so they are remapped for
// compatibility. The table is indexed by the reference value minus 0x80.
// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
var replacementTable = [...]rune{
	// 0x80 - 0x87
	'\u20AC', '\u0081', '\u201A', '\u0192', '\u201E', '\u2026', '\u2020', '\u2021',
	// 0x88 - 0x8F
	'\u02C6', '\u2030', '\u0160', '\u2039', '\u0152', '\u008D', '\u017D', '\u008F',
	// 0x90 - 0x97
	'\u0090', '\u2018', '\u2019', '\u201C', '\u201D', '\u2022', '\u2013', '\u2014',
	// 0x98 - 0x9F
	'\u02DC', '\u2122', '\u0161', '\u203A', '\u0153', '\u009D', '\u017E', '\u0178',
	// 0x00->'\uFFFD' is handled programmatically.
	// 0x0D->'\u000D' is a no-op.
}
|
---|
52 |
|
---|
53 | // unescapeEntity reads an entity like "<" from b[src:] and writes the
|
---|
54 | // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
|
---|
55 | // Precondition: b[src] == '&' && dst <= src.
|
---|
56 | // attribute should be true if parsing an attribute value.
|
---|
57 | func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
|
---|
58 | // https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
|
---|
59 |
|
---|
60 | // i starts at 1 because we already know that s[0] == '&'.
|
---|
61 | i, s := 1, b[src:]
|
---|
62 |
|
---|
63 | if len(s) <= 1 {
|
---|
64 | b[dst] = b[src]
|
---|
65 | return dst + 1, src + 1
|
---|
66 | }
|
---|
67 |
|
---|
68 | if s[i] == '#' {
|
---|
69 | if len(s) <= 3 { // We need to have at least "&#.".
|
---|
70 | b[dst] = b[src]
|
---|
71 | return dst + 1, src + 1
|
---|
72 | }
|
---|
73 | i++
|
---|
74 | c := s[i]
|
---|
75 | hex := false
|
---|
76 | if c == 'x' || c == 'X' {
|
---|
77 | hex = true
|
---|
78 | i++
|
---|
79 | }
|
---|
80 |
|
---|
81 | x := '\x00'
|
---|
82 | for i < len(s) {
|
---|
83 | c = s[i]
|
---|
84 | i++
|
---|
85 | if hex {
|
---|
86 | if '0' <= c && c <= '9' {
|
---|
87 | x = 16*x + rune(c) - '0'
|
---|
88 | continue
|
---|
89 | } else if 'a' <= c && c <= 'f' {
|
---|
90 | x = 16*x + rune(c) - 'a' + 10
|
---|
91 | continue
|
---|
92 | } else if 'A' <= c && c <= 'F' {
|
---|
93 | x = 16*x + rune(c) - 'A' + 10
|
---|
94 | continue
|
---|
95 | }
|
---|
96 | } else if '0' <= c && c <= '9' {
|
---|
97 | x = 10*x + rune(c) - '0'
|
---|
98 | continue
|
---|
99 | }
|
---|
100 | if c != ';' {
|
---|
101 | i--
|
---|
102 | }
|
---|
103 | break
|
---|
104 | }
|
---|
105 |
|
---|
106 | if i <= 3 { // No characters matched.
|
---|
107 | b[dst] = b[src]
|
---|
108 | return dst + 1, src + 1
|
---|
109 | }
|
---|
110 |
|
---|
111 | if 0x80 <= x && x <= 0x9F {
|
---|
112 | // Replace characters from Windows-1252 with UTF-8 equivalents.
|
---|
113 | x = replacementTable[x-0x80]
|
---|
114 | } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
|
---|
115 | // Replace invalid characters with the replacement character.
|
---|
116 | x = '\uFFFD'
|
---|
117 | }
|
---|
118 |
|
---|
119 | return dst + utf8.EncodeRune(b[dst:], x), src + i
|
---|
120 | }
|
---|
121 |
|
---|
122 | // Consume the maximum number of characters possible, with the
|
---|
123 | // consumed characters matching one of the named references.
|
---|
124 |
|
---|
125 | for i < len(s) {
|
---|
126 | c := s[i]
|
---|
127 | i++
|
---|
128 | // Lower-cased characters are more common in entities, so we check for them first.
|
---|
129 | if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
|
---|
130 | continue
|
---|
131 | }
|
---|
132 | if c != ';' {
|
---|
133 | i--
|
---|
134 | }
|
---|
135 | break
|
---|
136 | }
|
---|
137 |
|
---|
138 | entityName := string(s[1:i])
|
---|
139 | if entityName == "" {
|
---|
140 | // No-op.
|
---|
141 | } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
|
---|
142 | // No-op.
|
---|
143 | } else if x := entity[entityName]; x != 0 {
|
---|
144 | return dst + utf8.EncodeRune(b[dst:], x), src + i
|
---|
145 | } else if x := entity2[entityName]; x[0] != 0 {
|
---|
146 | dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
|
---|
147 | return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
|
---|
148 | } else if !attribute {
|
---|
149 | maxLen := len(entityName) - 1
|
---|
150 | if maxLen > longestEntityWithoutSemicolon {
|
---|
151 | maxLen = longestEntityWithoutSemicolon
|
---|
152 | }
|
---|
153 | for j := maxLen; j > 1; j-- {
|
---|
154 | if x := entity[entityName[:j]]; x != 0 {
|
---|
155 | return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
|
---|
156 | }
|
---|
157 | }
|
---|
158 | }
|
---|
159 |
|
---|
160 | dst1, src1 = dst+i, src+i
|
---|
161 | copy(b[dst:dst1], b[src:src1])
|
---|
162 | return dst1, src1
|
---|
163 | }
|
---|
164 |
|
---|
165 | // unescape unescapes b's entities in-place, so that "a<b" becomes "a<b".
|
---|
166 | // attribute should be true if parsing an attribute value.
|
---|
167 | func unescape(b []byte, attribute bool) []byte {
|
---|
168 | for i, c := range b {
|
---|
169 | if c == '&' {
|
---|
170 | dst, src := unescapeEntity(b, i, i, attribute)
|
---|
171 | for src < len(b) {
|
---|
172 | c := b[src]
|
---|
173 | if c == '&' {
|
---|
174 | dst, src = unescapeEntity(b, dst, src, attribute)
|
---|
175 | } else {
|
---|
176 | b[dst] = c
|
---|
177 | dst, src = dst+1, src+1
|
---|
178 | }
|
---|
179 | }
|
---|
180 | return b[0:dst]
|
---|
181 | }
|
---|
182 | }
|
---|
183 | return b
|
---|
184 | }
|
---|
185 |
|
---|
// lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
// All other bytes are left untouched. It returns the same slice b.
func lower(b []byte) []byte {
	const caseDelta = 'a' - 'A'
	for i := range b {
		if ch := b[i]; ch >= 'A' && ch <= 'Z' {
			b[i] = ch + caseDelta
		}
	}
	return b
}
|
---|
195 |
|
---|
196 | const escapedChars = "&'<>\"\r"
|
---|
197 |
|
---|
198 | func escape(w writer, s string) error {
|
---|
199 | i := strings.IndexAny(s, escapedChars)
|
---|
200 | for i != -1 {
|
---|
201 | if _, err := w.WriteString(s[:i]); err != nil {
|
---|
202 | return err
|
---|
203 | }
|
---|
204 | var esc string
|
---|
205 | switch s[i] {
|
---|
206 | case '&':
|
---|
207 | esc = "&"
|
---|
208 | case '\'':
|
---|
209 | // "'" is shorter than "'" and apos was not in HTML until HTML5.
|
---|
210 | esc = "'"
|
---|
211 | case '<':
|
---|
212 | esc = "<"
|
---|
213 | case '>':
|
---|
214 | esc = ">"
|
---|
215 | case '"':
|
---|
216 | // """ is shorter than """.
|
---|
217 | esc = """
|
---|
218 | case '\r':
|
---|
219 | esc = " "
|
---|
220 | default:
|
---|
221 | panic("unrecognized escape character")
|
---|
222 | }
|
---|
223 | s = s[i+1:]
|
---|
224 | if _, err := w.WriteString(esc); err != nil {
|
---|
225 | return err
|
---|
226 | }
|
---|
227 | i = strings.IndexAny(s, escapedChars)
|
---|
228 | }
|
---|
229 | _, err := w.WriteString(s)
|
---|
230 | return err
|
---|
231 | }
|
---|
232 |
|
---|
233 | // EscapeString escapes special characters like "<" to become "<". It
|
---|
234 | // escapes only five such characters: <, >, &, ' and ".
|
---|
235 | // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
|
---|
236 | // always true.
|
---|
237 | func EscapeString(s string) string {
|
---|
238 | if strings.IndexAny(s, escapedChars) == -1 {
|
---|
239 | return s
|
---|
240 | }
|
---|
241 | var buf bytes.Buffer
|
---|
242 | escape(&buf, s)
|
---|
243 | return buf.String()
|
---|
244 | }
|
---|
245 |
|
---|
246 | // UnescapeString unescapes entities like "<" to become "<". It unescapes a
|
---|
247 | // larger range of entities than EscapeString escapes. For example, "á"
|
---|
248 | // unescapes to "á", as does "á" and "&xE1;".
|
---|
249 | // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
|
---|
250 | // always true.
|
---|
251 | func UnescapeString(s string) string {
|
---|
252 | for _, c := range s {
|
---|
253 | if c == '&' {
|
---|
254 | return string(unescape([]byte(s), false))
|
---|
255 | }
|
---|
256 | }
|
---|
257 | return s
|
---|
258 | }
|
---|