source: code/trunk/vendor/github.com/andybalholm/brotli/utf8_util.go@ 145

Last change on this file since 145 was 145, checked in by Izuru Yakumo, 22 months ago

Updated the Makefile and vendored dependencies

Signed-off-by: Izuru Yakumo <yakumo.izuru@…>

File size: 1.9 KB
Line 
package brotli

/* Copyright 2013 Google Inc. All Rights Reserved.

   Distributed under MIT license.
   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
*/

/* Heuristics for deciding about the UTF8-ness of strings. */
10
/* kMinUTF8Ratio is a UTF8-fraction threshold of 0.75. It is not referenced
   in this file; presumably callers pass it as min_fraction to isMostlyUTF8.
   TODO confirm against the call sites. */
const kMinUTF8Ratio float64 = 0.75
12
/* parseAsUTF8 decodes a single symbol from the front of input and returns the
   number of bytes it occupies (1 to 4).

   symbol receives the decoded code point when the leading bytes form an
   accepted UTF8 sequence. Otherwise it receives 0x110000 | input[0], a value
   above the UTF8 code space, and 1 is returned so the caller can skip the
   offending byte.

   size is the number of bytes the caller allows to be examined. It is
   additionally capped at len(input), so a multi-byte lead byte sitting at the
   very end of the slice is reported as non-UTF8 instead of causing an
   out-of-range panic. input must hold at least one byte.

   Acceptance rules, as implemented below:
     - a NUL byte is reported as non-UTF8 (symbol 0x110000);
     - overlong 2-, 3- and 4-byte encodings are rejected;
     - 4-byte sequences above 0x10FFFF are rejected;
     - 3-byte sequences are only checked for being above 0x7FF, so the
       surrogate range is not filtered out. */
func parseAsUTF8(symbol *int, input []byte, size uint) uint {
	/* Never trust size beyond what the slice really holds. */
	if avail := uint(len(input)); size > avail {
		size = avail
	}

	lead := input[0]

	/* ASCII (NUL is deliberately excluded and falls through to "not UTF8"). */
	if lead&0x80 == 0 {
		*symbol = int(lead)
		if *symbol > 0 {
			return 1
		}
	}

	/* 2-byte UTF8 */
	if size > 1 && lead&0xE0 == 0xC0 && input[1]&0xC0 == 0x80 {
		*symbol = (int(lead)&0x1F)<<6 | (int(input[1]) & 0x3F)
		/* Values up to 0x7F here would be an overlong encoding. */
		if *symbol > 0x7F {
			return 2
		}
	}

	/* 3-byte UTF8 */
	if size > 2 && lead&0xF0 == 0xE0 && input[1]&0xC0 == 0x80 && input[2]&0xC0 == 0x80 {
		*symbol = (int(lead)&0x0F)<<12 | (int(input[1])&0x3F)<<6 | (int(input[2]) & 0x3F)
		/* Values up to 0x7FF here would be an overlong encoding. */
		if *symbol > 0x7FF {
			return 3
		}
	}

	/* 4-byte UTF8 */
	if size > 3 && lead&0xF8 == 0xF0 && input[1]&0xC0 == 0x80 && input[2]&0xC0 == 0x80 && input[3]&0xC0 == 0x80 {
		*symbol = (int(lead)&0x07)<<18 | (int(input[1])&0x3F)<<12 | (int(input[2])&0x3F)<<6 | (int(input[3]) & 0x3F)
		/* Reject overlong encodings and values past the last code point. */
		if *symbol > 0xFFFF && *symbol <= 0x10FFFF {
			return 4
		}
	}

	/* Not UTF8, emit a special symbol above the UTF8-code space */
	*symbol = 0x110000 | int(lead)

	return 1
}
54
55/* Returns 1 if at least min_fraction of the data is UTF8-encoded.*/
56func isMostlyUTF8(data []byte, pos uint, mask uint, length uint, min_fraction float64) bool {
57 var size_utf8 uint = 0
58 var i uint = 0
59 for i < length {
60 var symbol int
61 current_data := data[(pos+i)&mask:]
62 var bytes_read uint = parseAsUTF8(&symbol, current_data, length-i)
63 i += bytes_read
64 if symbol < 0x110000 {
65 size_utf8 += bytes_read
66 }
67 }
68
69 return float64(size_utf8) > min_fraction*float64(length)
70}
Note: See TracBrowser for help on using the repository browser.