1 | package brotli
|
---|
2 |
|
---|
3 | import "math"
|
---|
4 |
|
---|
5 | /* Copyright 2013 Google Inc. All Rights Reserved.
|
---|
6 |
|
---|
7 | Distributed under MIT license.
|
---|
8 | See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
|
---|
9 | */
|
---|
10 |
|
---|
11 | /* Computes the bit cost reduction by combining out[idx1] and out[idx2] and if
|
---|
12 | it is below a threshold, stores the pair (idx1, idx2) in the *pairs queue. */
|
---|
13 | func compareAndPushToQueueDistance(out []histogramDistance, cluster_size []uint32, idx1 uint32, idx2 uint32, max_num_pairs uint, pairs []histogramPair, num_pairs *uint) {
|
---|
14 | var is_good_pair bool = false
|
---|
15 | var p histogramPair
|
---|
16 | p.idx2 = 0
|
---|
17 | p.idx1 = p.idx2
|
---|
18 | p.cost_combo = 0
|
---|
19 | p.cost_diff = p.cost_combo
|
---|
20 | if idx1 == idx2 {
|
---|
21 | return
|
---|
22 | }
|
---|
23 |
|
---|
24 | if idx2 < idx1 {
|
---|
25 | var t uint32 = idx2
|
---|
26 | idx2 = idx1
|
---|
27 | idx1 = t
|
---|
28 | }
|
---|
29 |
|
---|
30 | p.idx1 = idx1
|
---|
31 | p.idx2 = idx2
|
---|
32 | p.cost_diff = 0.5 * clusterCostDiff(uint(cluster_size[idx1]), uint(cluster_size[idx2]))
|
---|
33 | p.cost_diff -= out[idx1].bit_cost_
|
---|
34 | p.cost_diff -= out[idx2].bit_cost_
|
---|
35 |
|
---|
36 | if out[idx1].total_count_ == 0 {
|
---|
37 | p.cost_combo = out[idx2].bit_cost_
|
---|
38 | is_good_pair = true
|
---|
39 | } else if out[idx2].total_count_ == 0 {
|
---|
40 | p.cost_combo = out[idx1].bit_cost_
|
---|
41 | is_good_pair = true
|
---|
42 | } else {
|
---|
43 | var threshold float64
|
---|
44 | if *num_pairs == 0 {
|
---|
45 | threshold = 1e99
|
---|
46 | } else {
|
---|
47 | threshold = brotli_max_double(0.0, pairs[0].cost_diff)
|
---|
48 | }
|
---|
49 | var combo histogramDistance = out[idx1]
|
---|
50 | var cost_combo float64
|
---|
51 | histogramAddHistogramDistance(&combo, &out[idx2])
|
---|
52 | cost_combo = populationCostDistance(&combo)
|
---|
53 | if cost_combo < threshold-p.cost_diff {
|
---|
54 | p.cost_combo = cost_combo
|
---|
55 | is_good_pair = true
|
---|
56 | }
|
---|
57 | }
|
---|
58 |
|
---|
59 | if is_good_pair {
|
---|
60 | p.cost_diff += p.cost_combo
|
---|
61 | if *num_pairs > 0 && histogramPairIsLess(&pairs[0], &p) {
|
---|
62 | /* Replace the top of the queue if needed. */
|
---|
63 | if *num_pairs < max_num_pairs {
|
---|
64 | pairs[*num_pairs] = pairs[0]
|
---|
65 | (*num_pairs)++
|
---|
66 | }
|
---|
67 |
|
---|
68 | pairs[0] = p
|
---|
69 | } else if *num_pairs < max_num_pairs {
|
---|
70 | pairs[*num_pairs] = p
|
---|
71 | (*num_pairs)++
|
---|
72 | }
|
---|
73 | }
|
---|
74 | }
|
---|
75 |
|
---|
76 | func histogramCombineDistance(out []histogramDistance, cluster_size []uint32, symbols []uint32, clusters []uint32, pairs []histogramPair, num_clusters uint, symbols_size uint, max_clusters uint, max_num_pairs uint) uint {
|
---|
77 | var cost_diff_threshold float64 = 0.0
|
---|
78 | var min_cluster_size uint = 1
|
---|
79 | var num_pairs uint = 0
|
---|
80 | {
|
---|
81 | /* We maintain a vector of histogram pairs, with the property that the pair
|
---|
82 | with the maximum bit cost reduction is the first. */
|
---|
83 | var idx1 uint
|
---|
84 | for idx1 = 0; idx1 < num_clusters; idx1++ {
|
---|
85 | var idx2 uint
|
---|
86 | for idx2 = idx1 + 1; idx2 < num_clusters; idx2++ {
|
---|
87 | compareAndPushToQueueDistance(out, cluster_size, clusters[idx1], clusters[idx2], max_num_pairs, pairs[0:], &num_pairs)
|
---|
88 | }
|
---|
89 | }
|
---|
90 | }
|
---|
91 |
|
---|
92 | for num_clusters > min_cluster_size {
|
---|
93 | var best_idx1 uint32
|
---|
94 | var best_idx2 uint32
|
---|
95 | var i uint
|
---|
96 | if pairs[0].cost_diff >= cost_diff_threshold {
|
---|
97 | cost_diff_threshold = 1e99
|
---|
98 | min_cluster_size = max_clusters
|
---|
99 | continue
|
---|
100 | }
|
---|
101 |
|
---|
102 | /* Take the best pair from the top of heap. */
|
---|
103 | best_idx1 = pairs[0].idx1
|
---|
104 |
|
---|
105 | best_idx2 = pairs[0].idx2
|
---|
106 | histogramAddHistogramDistance(&out[best_idx1], &out[best_idx2])
|
---|
107 | out[best_idx1].bit_cost_ = pairs[0].cost_combo
|
---|
108 | cluster_size[best_idx1] += cluster_size[best_idx2]
|
---|
109 | for i = 0; i < symbols_size; i++ {
|
---|
110 | if symbols[i] == best_idx2 {
|
---|
111 | symbols[i] = best_idx1
|
---|
112 | }
|
---|
113 | }
|
---|
114 |
|
---|
115 | for i = 0; i < num_clusters; i++ {
|
---|
116 | if clusters[i] == best_idx2 {
|
---|
117 | copy(clusters[i:], clusters[i+1:][:num_clusters-i-1])
|
---|
118 | break
|
---|
119 | }
|
---|
120 | }
|
---|
121 |
|
---|
122 | num_clusters--
|
---|
123 | {
|
---|
124 | /* Remove pairs intersecting the just combined best pair. */
|
---|
125 | var copy_to_idx uint = 0
|
---|
126 | for i = 0; i < num_pairs; i++ {
|
---|
127 | var p *histogramPair = &pairs[i]
|
---|
128 | if p.idx1 == best_idx1 || p.idx2 == best_idx1 || p.idx1 == best_idx2 || p.idx2 == best_idx2 {
|
---|
129 | /* Remove invalid pair from the queue. */
|
---|
130 | continue
|
---|
131 | }
|
---|
132 |
|
---|
133 | if histogramPairIsLess(&pairs[0], p) {
|
---|
134 | /* Replace the top of the queue if needed. */
|
---|
135 | var front histogramPair = pairs[0]
|
---|
136 | pairs[0] = *p
|
---|
137 | pairs[copy_to_idx] = front
|
---|
138 | } else {
|
---|
139 | pairs[copy_to_idx] = *p
|
---|
140 | }
|
---|
141 |
|
---|
142 | copy_to_idx++
|
---|
143 | }
|
---|
144 |
|
---|
145 | num_pairs = copy_to_idx
|
---|
146 | }
|
---|
147 |
|
---|
148 | /* Push new pairs formed with the combined histogram to the heap. */
|
---|
149 | for i = 0; i < num_clusters; i++ {
|
---|
150 | compareAndPushToQueueDistance(out, cluster_size, best_idx1, clusters[i], max_num_pairs, pairs[0:], &num_pairs)
|
---|
151 | }
|
---|
152 | }
|
---|
153 |
|
---|
154 | return num_clusters
|
---|
155 | }
|
---|
156 |
|
---|
157 | /* What is the bit cost of moving histogram from cur_symbol to candidate. */
|
---|
158 | func histogramBitCostDistanceDistance(histogram *histogramDistance, candidate *histogramDistance) float64 {
|
---|
159 | if histogram.total_count_ == 0 {
|
---|
160 | return 0.0
|
---|
161 | } else {
|
---|
162 | var tmp histogramDistance = *histogram
|
---|
163 | histogramAddHistogramDistance(&tmp, candidate)
|
---|
164 | return populationCostDistance(&tmp) - candidate.bit_cost_
|
---|
165 | }
|
---|
166 | }
|
---|
167 |
|
---|
168 | /* Find the best 'out' histogram for each of the 'in' histograms.
|
---|
169 | When called, clusters[0..num_clusters) contains the unique values from
|
---|
170 | symbols[0..in_size), but this property is not preserved in this function.
|
---|
171 | Note: we assume that out[]->bit_cost_ is already up-to-date. */
|
---|
172 | func histogramRemapDistance(in []histogramDistance, in_size uint, clusters []uint32, num_clusters uint, out []histogramDistance, symbols []uint32) {
|
---|
173 | var i uint
|
---|
174 | for i = 0; i < in_size; i++ {
|
---|
175 | var best_out uint32
|
---|
176 | if i == 0 {
|
---|
177 | best_out = symbols[0]
|
---|
178 | } else {
|
---|
179 | best_out = symbols[i-1]
|
---|
180 | }
|
---|
181 | var best_bits float64 = histogramBitCostDistanceDistance(&in[i], &out[best_out])
|
---|
182 | var j uint
|
---|
183 | for j = 0; j < num_clusters; j++ {
|
---|
184 | var cur_bits float64 = histogramBitCostDistanceDistance(&in[i], &out[clusters[j]])
|
---|
185 | if cur_bits < best_bits {
|
---|
186 | best_bits = cur_bits
|
---|
187 | best_out = clusters[j]
|
---|
188 | }
|
---|
189 | }
|
---|
190 |
|
---|
191 | symbols[i] = best_out
|
---|
192 | }
|
---|
193 |
|
---|
194 | /* Recompute each out based on raw and symbols. */
|
---|
195 | for i = 0; i < num_clusters; i++ {
|
---|
196 | histogramClearDistance(&out[clusters[i]])
|
---|
197 | }
|
---|
198 |
|
---|
199 | for i = 0; i < in_size; i++ {
|
---|
200 | histogramAddHistogramDistance(&out[symbols[i]], &in[i])
|
---|
201 | }
|
---|
202 | }
|
---|
203 |
|
---|
204 | /* Reorders elements of the out[0..length) array and changes values in
|
---|
205 | symbols[0..length) array in the following way:
|
---|
206 | * when called, symbols[] contains indexes into out[], and has N unique
|
---|
207 | values (possibly N < length)
|
---|
208 | * on return, symbols'[i] = f(symbols[i]) and
|
---|
209 | out'[symbols'[i]] = out[symbols[i]], for each 0 <= i < length,
|
---|
210 | where f is a bijection between the range of symbols[] and [0..N), and
|
---|
211 | the first occurrences of values in symbols'[i] come in consecutive
|
---|
212 | increasing order.
|
---|
213 | Returns N, the number of unique values in symbols[]. */
|
---|
214 |
|
---|
215 | var histogramReindexDistance_kInvalidIndex uint32 = math.MaxUint32
|
---|
216 |
|
---|
217 | func histogramReindexDistance(out []histogramDistance, symbols []uint32, length uint) uint {
|
---|
218 | var new_index []uint32 = make([]uint32, length)
|
---|
219 | var next_index uint32
|
---|
220 | var tmp []histogramDistance
|
---|
221 | var i uint
|
---|
222 | for i = 0; i < length; i++ {
|
---|
223 | new_index[i] = histogramReindexDistance_kInvalidIndex
|
---|
224 | }
|
---|
225 |
|
---|
226 | next_index = 0
|
---|
227 | for i = 0; i < length; i++ {
|
---|
228 | if new_index[symbols[i]] == histogramReindexDistance_kInvalidIndex {
|
---|
229 | new_index[symbols[i]] = next_index
|
---|
230 | next_index++
|
---|
231 | }
|
---|
232 | }
|
---|
233 |
|
---|
234 | /* TODO: by using idea of "cycle-sort" we can avoid allocation of
|
---|
235 | tmp and reduce the number of copying by the factor of 2. */
|
---|
236 | tmp = make([]histogramDistance, next_index)
|
---|
237 |
|
---|
238 | next_index = 0
|
---|
239 | for i = 0; i < length; i++ {
|
---|
240 | if new_index[symbols[i]] == next_index {
|
---|
241 | tmp[next_index] = out[symbols[i]]
|
---|
242 | next_index++
|
---|
243 | }
|
---|
244 |
|
---|
245 | symbols[i] = new_index[symbols[i]]
|
---|
246 | }
|
---|
247 |
|
---|
248 | new_index = nil
|
---|
249 | for i = 0; uint32(i) < next_index; i++ {
|
---|
250 | out[i] = tmp[i]
|
---|
251 | }
|
---|
252 |
|
---|
253 | tmp = nil
|
---|
254 | return uint(next_index)
|
---|
255 | }
|
---|
256 |
|
---|
257 | func clusterHistogramsDistance(in []histogramDistance, in_size uint, max_histograms uint, out []histogramDistance, out_size *uint, histogram_symbols []uint32) {
|
---|
258 | var cluster_size []uint32 = make([]uint32, in_size)
|
---|
259 | var clusters []uint32 = make([]uint32, in_size)
|
---|
260 | var num_clusters uint = 0
|
---|
261 | var max_input_histograms uint = 64
|
---|
262 | var pairs_capacity uint = max_input_histograms * max_input_histograms / 2
|
---|
263 | var pairs []histogramPair = make([]histogramPair, (pairs_capacity + 1))
|
---|
264 | var i uint
|
---|
265 |
|
---|
266 | /* For the first pass of clustering, we allow all pairs. */
|
---|
267 | for i = 0; i < in_size; i++ {
|
---|
268 | cluster_size[i] = 1
|
---|
269 | }
|
---|
270 |
|
---|
271 | for i = 0; i < in_size; i++ {
|
---|
272 | out[i] = in[i]
|
---|
273 | out[i].bit_cost_ = populationCostDistance(&in[i])
|
---|
274 | histogram_symbols[i] = uint32(i)
|
---|
275 | }
|
---|
276 |
|
---|
277 | for i = 0; i < in_size; i += max_input_histograms {
|
---|
278 | var num_to_combine uint = brotli_min_size_t(in_size-i, max_input_histograms)
|
---|
279 | var num_new_clusters uint
|
---|
280 | var j uint
|
---|
281 | for j = 0; j < num_to_combine; j++ {
|
---|
282 | clusters[num_clusters+j] = uint32(i + j)
|
---|
283 | }
|
---|
284 |
|
---|
285 | num_new_clusters = histogramCombineDistance(out, cluster_size, histogram_symbols[i:], clusters[num_clusters:], pairs, num_to_combine, num_to_combine, max_histograms, pairs_capacity)
|
---|
286 | num_clusters += num_new_clusters
|
---|
287 | }
|
---|
288 | {
|
---|
289 | /* For the second pass, we limit the total number of histogram pairs.
|
---|
290 | After this limit is reached, we only keep searching for the best pair. */
|
---|
291 | var max_num_pairs uint = brotli_min_size_t(64*num_clusters, (num_clusters/2)*num_clusters)
|
---|
292 | if pairs_capacity < (max_num_pairs + 1) {
|
---|
293 | var _new_size uint
|
---|
294 | if pairs_capacity == 0 {
|
---|
295 | _new_size = max_num_pairs + 1
|
---|
296 | } else {
|
---|
297 | _new_size = pairs_capacity
|
---|
298 | }
|
---|
299 | var new_array []histogramPair
|
---|
300 | for _new_size < (max_num_pairs + 1) {
|
---|
301 | _new_size *= 2
|
---|
302 | }
|
---|
303 | new_array = make([]histogramPair, _new_size)
|
---|
304 | if pairs_capacity != 0 {
|
---|
305 | copy(new_array, pairs[:pairs_capacity])
|
---|
306 | }
|
---|
307 |
|
---|
308 | pairs = new_array
|
---|
309 | pairs_capacity = _new_size
|
---|
310 | }
|
---|
311 |
|
---|
312 | /* Collapse similar histograms. */
|
---|
313 | num_clusters = histogramCombineDistance(out, cluster_size, histogram_symbols, clusters, pairs, num_clusters, in_size, max_histograms, max_num_pairs)
|
---|
314 | }
|
---|
315 |
|
---|
316 | pairs = nil
|
---|
317 | cluster_size = nil
|
---|
318 |
|
---|
319 | /* Find the optimal map from original histograms to the final ones. */
|
---|
320 | histogramRemapDistance(in, in_size, clusters, num_clusters, out, histogram_symbols)
|
---|
321 |
|
---|
322 | clusters = nil
|
---|
323 |
|
---|
324 | /* Convert the context map to a canonical form. */
|
---|
325 | *out_size = histogramReindexDistance(out, histogram_symbols, in_size)
|
---|
326 | }
|
---|