1 | package brotli
|
---|
2 |
|
---|
3 | /* Copyright 2013 Google Inc. All Rights Reserved.
|
---|
4 |
|
---|
5 | Distributed under MIT license.
|
---|
6 | See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
|
---|
7 | */
|
---|
8 |
|
---|
9 | /* Heuristics for deciding about the UTF8-ness of strings. */
|
---|
10 |
|
---|
/* Threshold of 0.75, typed float64. NOTE(review): not referenced in this
   section; presumably passed as min_fraction to isMostlyUTF8 - confirm at
   the call sites. */
11 | const kMinUTF8Ratio float64 = 0.75
|
---|
12 |
|
---|
/* parseAsUTF8 decodes one symbol from the start of input and returns the
   number of bytes consumed (1 to 4).

   At most size bytes are examined. size is additionally capped at
   len(input), so a multi-byte sequence cut off by the end of the slice is
   classified as non-UTF8 instead of indexing past the slice. input must hold
   at least one byte.

   A sequence is accepted when its lead and continuation bytes have the
   expected bit patterns and the decoded value is not an overlong encoding
   (and, for 4-byte sequences, does not exceed 0x10FFFF); *symbol then
   receives the decoded value. Anything else, including a NUL byte, consumes
   exactly one byte and stores 0x110000 | input[0] in *symbol, a value above
   the UTF8 code space. */
func parseAsUTF8(symbol *int, input []byte, size uint) uint {
	/* Never look past the end of the slice, whatever the caller claims. */
	if avail := uint(len(input)); size > avail {
		size = avail
	}

	/* ASCII */
	if input[0]&0x80 == 0 {
		*symbol = int(input[0])
		/* NUL is deliberately not accepted here; it falls through to the
		   non-UTF8 result at the bottom. */
		if *symbol > 0 {
			return 1
		}
	}

	/* 2-byte UTF8 */
	if size > 1 && input[0]&0xE0 == 0xC0 && input[1]&0xC0 == 0x80 {
		*symbol = (int(input[0])&0x1F)<<6 | int(input[1])&0x3F
		/* Values up to 0x7F would be an overlong encoding. */
		if *symbol > 0x7F {
			return 2
		}
	}

	/* 3-byte UTF8 */
	if size > 2 && input[0]&0xF0 == 0xE0 && input[1]&0xC0 == 0x80 && input[2]&0xC0 == 0x80 {
		*symbol = (int(input[0])&0x0F)<<12 | (int(input[1])&0x3F)<<6 | int(input[2])&0x3F
		/* Values up to 0x7FF would be an overlong encoding. */
		if *symbol > 0x7FF {
			return 3
		}
	}

	/* 4-byte UTF8 */
	if size > 3 && input[0]&0xF8 == 0xF0 && input[1]&0xC0 == 0x80 && input[2]&0xC0 == 0x80 && input[3]&0xC0 == 0x80 {
		*symbol = (int(input[0])&0x07)<<18 | (int(input[1])&0x3F)<<12 | (int(input[2])&0x3F)<<6 | int(input[3])&0x3F
		/* Reject overlong encodings and values beyond the code space. */
		if *symbol > 0xFFFF && *symbol <= 0x10FFFF {
			return 4
		}
	}

	/* Not UTF8, emit a special symbol above the UTF8-code space */
	*symbol = 0x110000 | int(input[0])

	return 1
}
|
---|
54 |
|
---|
55 | /* Returns 1 if at least min_fraction of the data is UTF8-encoded.*/
|
---|
56 | func isMostlyUTF8(data []byte, pos uint, mask uint, length uint, min_fraction float64) bool {
|
---|
57 | var size_utf8 uint = 0
|
---|
58 | var i uint = 0
|
---|
59 | for i < length {
|
---|
60 | var symbol int
|
---|
61 | current_data := data[(pos+i)&mask:]
|
---|
62 | var bytes_read uint = parseAsUTF8(&symbol, current_data, length-i)
|
---|
63 | i += bytes_read
|
---|
64 | if symbol < 0x110000 {
|
---|
65 | size_utf8 += bytes_read
|
---|
66 | }
|
---|
67 | }
|
---|
68 |
|
---|
69 | return float64(size_utf8) > min_fraction*float64(length)
|
---|
70 | }
|
---|