// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
|
---|
4 |
|
---|
// Package runes provides transforms for UTF-8 encoded text.
|
---|
6 | package runes // import "golang.org/x/text/runes"
|
---|
7 |
|
---|
8 | import (
|
---|
9 | "unicode"
|
---|
10 | "unicode/utf8"
|
---|
11 |
|
---|
12 | "golang.org/x/text/transform"
|
---|
13 | )
|
---|
14 |
|
---|
// Set describes a collection of runes that can be queried for membership.
type Set interface {
	// Contains reports whether the rune r is a member of the set.
	Contains(r rune) bool
}
|
---|
20 |
|
---|
// setFunc adapts a plain predicate function so that it can be used wherever a
// value with a Contains method is expected.
type setFunc func(rune) bool

// Contains reports whether r is in the set by calling the wrapped predicate.
func (fn setFunc) Contains(r rune) bool {
	member := fn(r)
	return member
}
|
---|
26 |
|
---|
// Note: using funcs here instead of wrapping types results in cleaner
// documentation and a smaller API.
|
---|
29 |
|
---|
30 | // In creates a Set with a Contains method that returns true for all runes in
|
---|
31 | // the given RangeTable.
|
---|
32 | func In(rt *unicode.RangeTable) Set {
|
---|
33 | return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
|
---|
34 | }
|
---|
35 |
|
---|
36 | // NotIn creates a Set with a Contains method that returns true for all runes not
|
---|
37 | // in the given RangeTable.
|
---|
38 | func NotIn(rt *unicode.RangeTable) Set {
|
---|
39 | return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
|
---|
40 | }
|
---|
41 |
|
---|
42 | // Predicate creates a Set with a Contains method that returns f(r).
|
---|
43 | func Predicate(f func(rune) bool) Set {
|
---|
44 | return setFunc(f)
|
---|
45 | }
|
---|
46 |
|
---|
47 | // Transformer implements the transform.Transformer interface.
|
---|
48 | type Transformer struct {
|
---|
49 | t transform.SpanningTransformer
|
---|
50 | }
|
---|
51 |
|
---|
52 | func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
---|
53 | return t.t.Transform(dst, src, atEOF)
|
---|
54 | }
|
---|
55 |
|
---|
56 | func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {
|
---|
57 | return t.t.Span(b, atEOF)
|
---|
58 | }
|
---|
59 |
|
---|
60 | func (t Transformer) Reset() { t.t.Reset() }
|
---|
61 |
|
---|
62 | // Bytes returns a new byte slice with the result of converting b using t. It
|
---|
63 | // calls Reset on t. It returns nil if any error was found. This can only happen
|
---|
64 | // if an error-producing Transformer is passed to If.
|
---|
65 | func (t Transformer) Bytes(b []byte) []byte {
|
---|
66 | b, _, err := transform.Bytes(t, b)
|
---|
67 | if err != nil {
|
---|
68 | return nil
|
---|
69 | }
|
---|
70 | return b
|
---|
71 | }
|
---|
72 |
|
---|
73 | // String returns a string with the result of converting s using t. It calls
|
---|
74 | // Reset on t. It returns the empty string if any error was found. This can only
|
---|
75 | // happen if an error-producing Transformer is passed to If.
|
---|
76 | func (t Transformer) String(s string) string {
|
---|
77 | s, _, err := transform.String(t, s)
|
---|
78 | if err != nil {
|
---|
79 | return ""
|
---|
80 | }
|
---|
81 | return s
|
---|
82 | }
|
---|
83 |
|
---|
84 | // TODO:
|
---|
85 | // - Copy: copying strings and bytes in whole-rune units.
|
---|
86 | // - Validation (maybe)
|
---|
87 | // - Well-formed-ness (maybe)
|
---|
88 |
|
---|
// runeErrorString is the UTF-8 encoding of utf8.RuneError, held as a string so
// that its three bytes can be written to an output buffer directly.
const runeErrorString = string(utf8.RuneError)
|
---|
90 |
|
---|
91 | // Remove returns a Transformer that removes runes r for which s.Contains(r).
|
---|
92 | // Illegal input bytes are replaced by RuneError before being passed to f.
|
---|
93 | func Remove(s Set) Transformer {
|
---|
94 | if f, ok := s.(setFunc); ok {
|
---|
95 | // This little trick cuts the running time of BenchmarkRemove for sets
|
---|
96 | // created by Predicate roughly in half.
|
---|
97 | // TODO: special-case RangeTables as well.
|
---|
98 | return Transformer{remove(f)}
|
---|
99 | }
|
---|
100 | return Transformer{remove(s.Contains)}
|
---|
101 | }
|
---|
102 |
|
---|
103 | // TODO: remove transform.RemoveFunc.
|
---|
104 |
|
---|
// remove is a rune predicate used as a transformer: its Transform method
// drops the runes for which the predicate returns true.
type remove func(r rune) bool

// Reset is a no-op; a remove carries no state between calls.
func (remove) Reset() {}
|
---|
108 |
|
---|
109 | // Span implements transform.Spanner.
|
---|
110 | func (t remove) Span(src []byte, atEOF bool) (n int, err error) {
|
---|
111 | for r, size := rune(0), 0; n < len(src); {
|
---|
112 | if r = rune(src[n]); r < utf8.RuneSelf {
|
---|
113 | size = 1
|
---|
114 | } else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
|
---|
115 | // Invalid rune.
|
---|
116 | if !atEOF && !utf8.FullRune(src[n:]) {
|
---|
117 | err = transform.ErrShortSrc
|
---|
118 | } else {
|
---|
119 | err = transform.ErrEndOfSpan
|
---|
120 | }
|
---|
121 | break
|
---|
122 | }
|
---|
123 | if t(r) {
|
---|
124 | err = transform.ErrEndOfSpan
|
---|
125 | break
|
---|
126 | }
|
---|
127 | n += size
|
---|
128 | }
|
---|
129 | return
|
---|
130 | }
|
---|
131 |
|
---|
132 | // Transform implements transform.Transformer.
|
---|
133 | func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
---|
134 | for r, size := rune(0), 0; nSrc < len(src); {
|
---|
135 | if r = rune(src[nSrc]); r < utf8.RuneSelf {
|
---|
136 | size = 1
|
---|
137 | } else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
|
---|
138 | // Invalid rune.
|
---|
139 | if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
---|
140 | err = transform.ErrShortSrc
|
---|
141 | break
|
---|
142 | }
|
---|
143 | // We replace illegal bytes with RuneError. Not doing so might
|
---|
144 | // otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
|
---|
145 | // The resulting byte sequence may subsequently contain runes
|
---|
146 | // for which t(r) is true that were passed unnoticed.
|
---|
147 | if !t(utf8.RuneError) {
|
---|
148 | if nDst+3 > len(dst) {
|
---|
149 | err = transform.ErrShortDst
|
---|
150 | break
|
---|
151 | }
|
---|
152 | dst[nDst+0] = runeErrorString[0]
|
---|
153 | dst[nDst+1] = runeErrorString[1]
|
---|
154 | dst[nDst+2] = runeErrorString[2]
|
---|
155 | nDst += 3
|
---|
156 | }
|
---|
157 | nSrc++
|
---|
158 | continue
|
---|
159 | }
|
---|
160 | if t(r) {
|
---|
161 | nSrc += size
|
---|
162 | continue
|
---|
163 | }
|
---|
164 | if nDst+size > len(dst) {
|
---|
165 | err = transform.ErrShortDst
|
---|
166 | break
|
---|
167 | }
|
---|
168 | for i := 0; i < size; i++ {
|
---|
169 | dst[nDst] = src[nSrc]
|
---|
170 | nDst++
|
---|
171 | nSrc++
|
---|
172 | }
|
---|
173 | }
|
---|
174 | return
|
---|
175 | }
|
---|
176 |
|
---|
177 | // Map returns a Transformer that maps the runes in the input using the given
|
---|
178 | // mapping. Illegal bytes in the input are converted to utf8.RuneError before
|
---|
179 | // being passed to the mapping func.
|
---|
180 | func Map(mapping func(rune) rune) Transformer {
|
---|
181 | return Transformer{mapper(mapping)}
|
---|
182 | }
|
---|
183 |
|
---|
// mapper is a rune mapping func used as a transformer: its Transform method
// replaces each rune r of the input with the result of calling the func on r.
type mapper func(rune) rune

// Reset is a no-op; a mapper carries no state between calls.
func (mapper) Reset() {}
|
---|
187 |
|
---|
188 | // Span implements transform.Spanner.
|
---|
189 | func (t mapper) Span(src []byte, atEOF bool) (n int, err error) {
|
---|
190 | for r, size := rune(0), 0; n < len(src); n += size {
|
---|
191 | if r = rune(src[n]); r < utf8.RuneSelf {
|
---|
192 | size = 1
|
---|
193 | } else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
|
---|
194 | // Invalid rune.
|
---|
195 | if !atEOF && !utf8.FullRune(src[n:]) {
|
---|
196 | err = transform.ErrShortSrc
|
---|
197 | } else {
|
---|
198 | err = transform.ErrEndOfSpan
|
---|
199 | }
|
---|
200 | break
|
---|
201 | }
|
---|
202 | if t(r) != r {
|
---|
203 | err = transform.ErrEndOfSpan
|
---|
204 | break
|
---|
205 | }
|
---|
206 | }
|
---|
207 | return n, err
|
---|
208 | }
|
---|
209 |
|
---|
210 | // Transform implements transform.Transformer.
|
---|
211 | func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
---|
212 | var replacement rune
|
---|
213 | var b [utf8.UTFMax]byte
|
---|
214 |
|
---|
215 | for r, size := rune(0), 0; nSrc < len(src); {
|
---|
216 | if r = rune(src[nSrc]); r < utf8.RuneSelf {
|
---|
217 | if replacement = t(r); replacement < utf8.RuneSelf {
|
---|
218 | if nDst == len(dst) {
|
---|
219 | err = transform.ErrShortDst
|
---|
220 | break
|
---|
221 | }
|
---|
222 | dst[nDst] = byte(replacement)
|
---|
223 | nDst++
|
---|
224 | nSrc++
|
---|
225 | continue
|
---|
226 | }
|
---|
227 | size = 1
|
---|
228 | } else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
|
---|
229 | // Invalid rune.
|
---|
230 | if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
---|
231 | err = transform.ErrShortSrc
|
---|
232 | break
|
---|
233 | }
|
---|
234 |
|
---|
235 | if replacement = t(utf8.RuneError); replacement == utf8.RuneError {
|
---|
236 | if nDst+3 > len(dst) {
|
---|
237 | err = transform.ErrShortDst
|
---|
238 | break
|
---|
239 | }
|
---|
240 | dst[nDst+0] = runeErrorString[0]
|
---|
241 | dst[nDst+1] = runeErrorString[1]
|
---|
242 | dst[nDst+2] = runeErrorString[2]
|
---|
243 | nDst += 3
|
---|
244 | nSrc++
|
---|
245 | continue
|
---|
246 | }
|
---|
247 | } else if replacement = t(r); replacement == r {
|
---|
248 | if nDst+size > len(dst) {
|
---|
249 | err = transform.ErrShortDst
|
---|
250 | break
|
---|
251 | }
|
---|
252 | for i := 0; i < size; i++ {
|
---|
253 | dst[nDst] = src[nSrc]
|
---|
254 | nDst++
|
---|
255 | nSrc++
|
---|
256 | }
|
---|
257 | continue
|
---|
258 | }
|
---|
259 |
|
---|
260 | n := utf8.EncodeRune(b[:], replacement)
|
---|
261 |
|
---|
262 | if nDst+n > len(dst) {
|
---|
263 | err = transform.ErrShortDst
|
---|
264 | break
|
---|
265 | }
|
---|
266 | for i := 0; i < n; i++ {
|
---|
267 | dst[nDst] = b[i]
|
---|
268 | nDst++
|
---|
269 | }
|
---|
270 | nSrc += size
|
---|
271 | }
|
---|
272 | return
|
---|
273 | }
|
---|
274 |
|
---|
275 | // ReplaceIllFormed returns a transformer that replaces all input bytes that are
|
---|
276 | // not part of a well-formed UTF-8 code sequence with utf8.RuneError.
|
---|
277 | func ReplaceIllFormed() Transformer {
|
---|
278 | return Transformer{&replaceIllFormed{}}
|
---|
279 | }
|
---|
280 |
|
---|
281 | type replaceIllFormed struct{ transform.NopResetter }
|
---|
282 |
|
---|
283 | func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) {
|
---|
284 | for n < len(src) {
|
---|
285 | // ASCII fast path.
|
---|
286 | if src[n] < utf8.RuneSelf {
|
---|
287 | n++
|
---|
288 | continue
|
---|
289 | }
|
---|
290 |
|
---|
291 | r, size := utf8.DecodeRune(src[n:])
|
---|
292 |
|
---|
293 | // Look for a valid non-ASCII rune.
|
---|
294 | if r != utf8.RuneError || size != 1 {
|
---|
295 | n += size
|
---|
296 | continue
|
---|
297 | }
|
---|
298 |
|
---|
299 | // Look for short source data.
|
---|
300 | if !atEOF && !utf8.FullRune(src[n:]) {
|
---|
301 | err = transform.ErrShortSrc
|
---|
302 | break
|
---|
303 | }
|
---|
304 |
|
---|
305 | // We have an invalid rune.
|
---|
306 | err = transform.ErrEndOfSpan
|
---|
307 | break
|
---|
308 | }
|
---|
309 | return n, err
|
---|
310 | }
|
---|
311 |
|
---|
312 | func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
---|
313 | for nSrc < len(src) {
|
---|
314 | // ASCII fast path.
|
---|
315 | if r := src[nSrc]; r < utf8.RuneSelf {
|
---|
316 | if nDst == len(dst) {
|
---|
317 | err = transform.ErrShortDst
|
---|
318 | break
|
---|
319 | }
|
---|
320 | dst[nDst] = r
|
---|
321 | nDst++
|
---|
322 | nSrc++
|
---|
323 | continue
|
---|
324 | }
|
---|
325 |
|
---|
326 | // Look for a valid non-ASCII rune.
|
---|
327 | if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 {
|
---|
328 | if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
|
---|
329 | err = transform.ErrShortDst
|
---|
330 | break
|
---|
331 | }
|
---|
332 | nDst += size
|
---|
333 | nSrc += size
|
---|
334 | continue
|
---|
335 | }
|
---|
336 |
|
---|
337 | // Look for short source data.
|
---|
338 | if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
---|
339 | err = transform.ErrShortSrc
|
---|
340 | break
|
---|
341 | }
|
---|
342 |
|
---|
343 | // We have an invalid rune.
|
---|
344 | if nDst+3 > len(dst) {
|
---|
345 | err = transform.ErrShortDst
|
---|
346 | break
|
---|
347 | }
|
---|
348 | dst[nDst+0] = runeErrorString[0]
|
---|
349 | dst[nDst+1] = runeErrorString[1]
|
---|
350 | dst[nDst+2] = runeErrorString[2]
|
---|
351 | nDst += 3
|
---|
352 | nSrc++
|
---|
353 | }
|
---|
354 | return nDst, nSrc, err
|
---|
355 | }
|
---|