1 | package brotli
|
---|
2 |
|
---|
3 | func utf8Position(last uint, c uint, clamp uint) uint {
|
---|
4 | if c < 128 {
|
---|
5 | return 0 /* Next one is the 'Byte 1' again. */
|
---|
6 | } else if c >= 192 { /* Next one is the 'Byte 2' of utf-8 encoding. */
|
---|
7 | return brotli_min_size_t(1, clamp)
|
---|
8 | } else {
|
---|
9 | /* Let's decide over the last byte if this ends the sequence. */
|
---|
10 | if last < 0xE0 {
|
---|
11 | return 0 /* Completed two or three byte coding. */ /* Next one is the 'Byte 3' of utf-8 encoding. */
|
---|
12 | } else {
|
---|
13 | return brotli_min_size_t(2, clamp)
|
---|
14 | }
|
---|
15 | }
|
---|
16 | }
|
---|
17 |
|
---|
18 | func decideMultiByteStatsLevel(pos uint, len uint, mask uint, data []byte) uint {
|
---|
19 | var counts = [3]uint{0} /* should be 2, but 1 compresses better. */
|
---|
20 | var max_utf8 uint = 1
|
---|
21 | var last_c uint = 0
|
---|
22 | var i uint
|
---|
23 | for i = 0; i < len; i++ {
|
---|
24 | var c uint = uint(data[(pos+i)&mask])
|
---|
25 | counts[utf8Position(last_c, c, 2)]++
|
---|
26 | last_c = c
|
---|
27 | }
|
---|
28 |
|
---|
29 | if counts[2] < 500 {
|
---|
30 | max_utf8 = 1
|
---|
31 | }
|
---|
32 |
|
---|
33 | if counts[1]+counts[2] < 25 {
|
---|
34 | max_utf8 = 0
|
---|
35 | }
|
---|
36 |
|
---|
37 | return max_utf8
|
---|
38 | }
|
---|
39 |
|
---|
40 | func estimateBitCostsForLiteralsUTF8(pos uint, len uint, mask uint, data []byte, cost []float32) {
|
---|
41 | var max_utf8 uint = decideMultiByteStatsLevel(pos, uint(len), mask, data)
|
---|
42 | /* Bootstrap histograms. */
|
---|
43 | var histogram = [3][256]uint{[256]uint{0}}
|
---|
44 | var window_half uint = 495
|
---|
45 | var in_window uint = brotli_min_size_t(window_half, uint(len))
|
---|
46 | var in_window_utf8 = [3]uint{0}
|
---|
47 | /* max_utf8 is 0 (normal ASCII single byte modeling),
|
---|
48 | 1 (for 2-byte UTF-8 modeling), or 2 (for 3-byte UTF-8 modeling). */
|
---|
49 |
|
---|
50 | var i uint
|
---|
51 | {
|
---|
52 | var last_c uint = 0
|
---|
53 | var utf8_pos uint = 0
|
---|
54 | for i = 0; i < in_window; i++ {
|
---|
55 | var c uint = uint(data[(pos+i)&mask])
|
---|
56 | histogram[utf8_pos][c]++
|
---|
57 | in_window_utf8[utf8_pos]++
|
---|
58 | utf8_pos = utf8Position(last_c, c, max_utf8)
|
---|
59 | last_c = c
|
---|
60 | }
|
---|
61 | }
|
---|
62 |
|
---|
63 | /* Compute bit costs with sliding window. */
|
---|
64 | for i = 0; i < len; i++ {
|
---|
65 | if i >= window_half {
|
---|
66 | var c uint
|
---|
67 | var last_c uint
|
---|
68 | if i < window_half+1 {
|
---|
69 | c = 0
|
---|
70 | } else {
|
---|
71 | c = uint(data[(pos+i-window_half-1)&mask])
|
---|
72 | }
|
---|
73 | if i < window_half+2 {
|
---|
74 | last_c = 0
|
---|
75 | } else {
|
---|
76 | last_c = uint(data[(pos+i-window_half-2)&mask])
|
---|
77 | }
|
---|
78 | /* Remove a byte in the past. */
|
---|
79 |
|
---|
80 | var utf8_pos2 uint = utf8Position(last_c, c, max_utf8)
|
---|
81 | histogram[utf8_pos2][data[(pos+i-window_half)&mask]]--
|
---|
82 | in_window_utf8[utf8_pos2]--
|
---|
83 | }
|
---|
84 |
|
---|
85 | if i+window_half < len {
|
---|
86 | var c uint = uint(data[(pos+i+window_half-1)&mask])
|
---|
87 | var last_c uint = uint(data[(pos+i+window_half-2)&mask])
|
---|
88 | /* Add a byte in the future. */
|
---|
89 |
|
---|
90 | var utf8_pos2 uint = utf8Position(last_c, c, max_utf8)
|
---|
91 | histogram[utf8_pos2][data[(pos+i+window_half)&mask]]++
|
---|
92 | in_window_utf8[utf8_pos2]++
|
---|
93 | }
|
---|
94 | {
|
---|
95 | var c uint
|
---|
96 | var last_c uint
|
---|
97 | if i < 1 {
|
---|
98 | c = 0
|
---|
99 | } else {
|
---|
100 | c = uint(data[(pos+i-1)&mask])
|
---|
101 | }
|
---|
102 | if i < 2 {
|
---|
103 | last_c = 0
|
---|
104 | } else {
|
---|
105 | last_c = uint(data[(pos+i-2)&mask])
|
---|
106 | }
|
---|
107 | var utf8_pos uint = utf8Position(last_c, c, max_utf8)
|
---|
108 | var masked_pos uint = (pos + i) & mask
|
---|
109 | var histo uint = histogram[utf8_pos][data[masked_pos]]
|
---|
110 | var lit_cost float64
|
---|
111 | if histo == 0 {
|
---|
112 | histo = 1
|
---|
113 | }
|
---|
114 |
|
---|
115 | lit_cost = fastLog2(in_window_utf8[utf8_pos]) - fastLog2(histo)
|
---|
116 | lit_cost += 0.02905
|
---|
117 | if lit_cost < 1.0 {
|
---|
118 | lit_cost *= 0.5
|
---|
119 | lit_cost += 0.5
|
---|
120 | }
|
---|
121 |
|
---|
122 | /* Make the first bytes more expensive -- seems to help, not sure why.
|
---|
123 | Perhaps because the entropy source is changing its properties
|
---|
124 | rapidly in the beginning of the file, perhaps because the beginning
|
---|
125 | of the data is a statistical "anomaly". */
|
---|
126 | if i < 2000 {
|
---|
127 | lit_cost += 0.7 - (float64(2000-i) / 2000.0 * 0.35)
|
---|
128 | }
|
---|
129 |
|
---|
130 | cost[i] = float32(lit_cost)
|
---|
131 | }
|
---|
132 | }
|
---|
133 | }
|
---|
134 |
|
---|
135 | func estimateBitCostsForLiterals(pos uint, len uint, mask uint, data []byte, cost []float32) {
|
---|
136 | if isMostlyUTF8(data, pos, mask, uint(len), kMinUTF8Ratio) {
|
---|
137 | estimateBitCostsForLiteralsUTF8(pos, uint(len), mask, data, cost)
|
---|
138 | return
|
---|
139 | } else {
|
---|
140 | var histogram = [256]uint{0}
|
---|
141 | var window_half uint = 2000
|
---|
142 | var in_window uint = brotli_min_size_t(window_half, uint(len))
|
---|
143 | var i uint
|
---|
144 | /* Bootstrap histogram. */
|
---|
145 | for i = 0; i < in_window; i++ {
|
---|
146 | histogram[data[(pos+i)&mask]]++
|
---|
147 | }
|
---|
148 |
|
---|
149 | /* Compute bit costs with sliding window. */
|
---|
150 | for i = 0; i < len; i++ {
|
---|
151 | var histo uint
|
---|
152 | if i >= window_half {
|
---|
153 | /* Remove a byte in the past. */
|
---|
154 | histogram[data[(pos+i-window_half)&mask]]--
|
---|
155 |
|
---|
156 | in_window--
|
---|
157 | }
|
---|
158 |
|
---|
159 | if i+window_half < len {
|
---|
160 | /* Add a byte in the future. */
|
---|
161 | histogram[data[(pos+i+window_half)&mask]]++
|
---|
162 |
|
---|
163 | in_window++
|
---|
164 | }
|
---|
165 |
|
---|
166 | histo = histogram[data[(pos+i)&mask]]
|
---|
167 | if histo == 0 {
|
---|
168 | histo = 1
|
---|
169 | }
|
---|
170 | {
|
---|
171 | var lit_cost float64 = fastLog2(in_window) - fastLog2(histo)
|
---|
172 | lit_cost += 0.029
|
---|
173 | if lit_cost < 1.0 {
|
---|
174 | lit_cost *= 0.5
|
---|
175 | lit_cost += 0.5
|
---|
176 | }
|
---|
177 |
|
---|
178 | cost[i] = float32(lit_cost)
|
---|
179 | }
|
---|
180 | }
|
---|
181 | }
|
---|
182 | }
|
---|