source: code/trunk/vendor/github.com/andybalholm/brotli/literal_cost.go@ 145

Last change on this file since 145 was 145, checked in by Izuru Yakumo, 22 months ago

Updated the Makefile and vendored depedencies

Signed-off-by: Izuru Yakumo <yakumo.izuru@…>

File size: 4.6 KB
Line 
1package brotli
2
3func utf8Position(last uint, c uint, clamp uint) uint {
4 if c < 128 {
5 return 0 /* Next one is the 'Byte 1' again. */
6 } else if c >= 192 { /* Next one is the 'Byte 2' of utf-8 encoding. */
7 return brotli_min_size_t(1, clamp)
8 } else {
9 /* Let's decide over the last byte if this ends the sequence. */
10 if last < 0xE0 {
11 return 0 /* Completed two or three byte coding. */ /* Next one is the 'Byte 3' of utf-8 encoding. */
12 } else {
13 return brotli_min_size_t(2, clamp)
14 }
15 }
16}
17
18func decideMultiByteStatsLevel(pos uint, len uint, mask uint, data []byte) uint {
19 var counts = [3]uint{0} /* should be 2, but 1 compresses better. */
20 var max_utf8 uint = 1
21 var last_c uint = 0
22 var i uint
23 for i = 0; i < len; i++ {
24 var c uint = uint(data[(pos+i)&mask])
25 counts[utf8Position(last_c, c, 2)]++
26 last_c = c
27 }
28
29 if counts[2] < 500 {
30 max_utf8 = 1
31 }
32
33 if counts[1]+counts[2] < 25 {
34 max_utf8 = 0
35 }
36
37 return max_utf8
38}
39
40func estimateBitCostsForLiteralsUTF8(pos uint, len uint, mask uint, data []byte, cost []float32) {
41 var max_utf8 uint = decideMultiByteStatsLevel(pos, uint(len), mask, data)
42 /* Bootstrap histograms. */
43 var histogram = [3][256]uint{[256]uint{0}}
44 var window_half uint = 495
45 var in_window uint = brotli_min_size_t(window_half, uint(len))
46 var in_window_utf8 = [3]uint{0}
47 /* max_utf8 is 0 (normal ASCII single byte modeling),
48 1 (for 2-byte UTF-8 modeling), or 2 (for 3-byte UTF-8 modeling). */
49
50 var i uint
51 {
52 var last_c uint = 0
53 var utf8_pos uint = 0
54 for i = 0; i < in_window; i++ {
55 var c uint = uint(data[(pos+i)&mask])
56 histogram[utf8_pos][c]++
57 in_window_utf8[utf8_pos]++
58 utf8_pos = utf8Position(last_c, c, max_utf8)
59 last_c = c
60 }
61 }
62
63 /* Compute bit costs with sliding window. */
64 for i = 0; i < len; i++ {
65 if i >= window_half {
66 var c uint
67 var last_c uint
68 if i < window_half+1 {
69 c = 0
70 } else {
71 c = uint(data[(pos+i-window_half-1)&mask])
72 }
73 if i < window_half+2 {
74 last_c = 0
75 } else {
76 last_c = uint(data[(pos+i-window_half-2)&mask])
77 }
78 /* Remove a byte in the past. */
79
80 var utf8_pos2 uint = utf8Position(last_c, c, max_utf8)
81 histogram[utf8_pos2][data[(pos+i-window_half)&mask]]--
82 in_window_utf8[utf8_pos2]--
83 }
84
85 if i+window_half < len {
86 var c uint = uint(data[(pos+i+window_half-1)&mask])
87 var last_c uint = uint(data[(pos+i+window_half-2)&mask])
88 /* Add a byte in the future. */
89
90 var utf8_pos2 uint = utf8Position(last_c, c, max_utf8)
91 histogram[utf8_pos2][data[(pos+i+window_half)&mask]]++
92 in_window_utf8[utf8_pos2]++
93 }
94 {
95 var c uint
96 var last_c uint
97 if i < 1 {
98 c = 0
99 } else {
100 c = uint(data[(pos+i-1)&mask])
101 }
102 if i < 2 {
103 last_c = 0
104 } else {
105 last_c = uint(data[(pos+i-2)&mask])
106 }
107 var utf8_pos uint = utf8Position(last_c, c, max_utf8)
108 var masked_pos uint = (pos + i) & mask
109 var histo uint = histogram[utf8_pos][data[masked_pos]]
110 var lit_cost float64
111 if histo == 0 {
112 histo = 1
113 }
114
115 lit_cost = fastLog2(in_window_utf8[utf8_pos]) - fastLog2(histo)
116 lit_cost += 0.02905
117 if lit_cost < 1.0 {
118 lit_cost *= 0.5
119 lit_cost += 0.5
120 }
121
122 /* Make the first bytes more expensive -- seems to help, not sure why.
123 Perhaps because the entropy source is changing its properties
124 rapidly in the beginning of the file, perhaps because the beginning
125 of the data is a statistical "anomaly". */
126 if i < 2000 {
127 lit_cost += 0.7 - (float64(2000-i) / 2000.0 * 0.35)
128 }
129
130 cost[i] = float32(lit_cost)
131 }
132 }
133}
134
135func estimateBitCostsForLiterals(pos uint, len uint, mask uint, data []byte, cost []float32) {
136 if isMostlyUTF8(data, pos, mask, uint(len), kMinUTF8Ratio) {
137 estimateBitCostsForLiteralsUTF8(pos, uint(len), mask, data, cost)
138 return
139 } else {
140 var histogram = [256]uint{0}
141 var window_half uint = 2000
142 var in_window uint = brotli_min_size_t(window_half, uint(len))
143 var i uint
144 /* Bootstrap histogram. */
145 for i = 0; i < in_window; i++ {
146 histogram[data[(pos+i)&mask]]++
147 }
148
149 /* Compute bit costs with sliding window. */
150 for i = 0; i < len; i++ {
151 var histo uint
152 if i >= window_half {
153 /* Remove a byte in the past. */
154 histogram[data[(pos+i-window_half)&mask]]--
155
156 in_window--
157 }
158
159 if i+window_half < len {
160 /* Add a byte in the future. */
161 histogram[data[(pos+i+window_half)&mask]]++
162
163 in_window++
164 }
165
166 histo = histogram[data[(pos+i)&mask]]
167 if histo == 0 {
168 histo = 1
169 }
170 {
171 var lit_cost float64 = fastLog2(in_window) - fastLog2(histo)
172 lit_cost += 0.029
173 if lit_cost < 1.0 {
174 lit_cost *= 0.5
175 lit_cost += 0.5
176 }
177
178 cost[i] = float32(lit_cost)
179 }
180 }
181 }
182}
Note: See TracBrowser for help on using the repository browser.