source: code/trunk/vendor/github.com/andybalholm/brotli/cluster_distance.go@ 145

Last change on this file since 145 was 145, checked in by Izuru Yakumo, 22 months ago

Updated the Makefile and vendored depedencies

Signed-off-by: Izuru Yakumo <yakumo.izuru@…>

File size: 9.7 KB
Line 
1package brotli
2
3import "math"
4
5/* Copyright 2013 Google Inc. All Rights Reserved.
6
7 Distributed under MIT license.
8 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
9*/
10
11/* Computes the bit cost reduction by combining out[idx1] and out[idx2] and if
12 it is below a threshold, stores the pair (idx1, idx2) in the *pairs queue. */
13func compareAndPushToQueueDistance(out []histogramDistance, cluster_size []uint32, idx1 uint32, idx2 uint32, max_num_pairs uint, pairs []histogramPair, num_pairs *uint) {
14 var is_good_pair bool = false
15 var p histogramPair
16 p.idx2 = 0
17 p.idx1 = p.idx2
18 p.cost_combo = 0
19 p.cost_diff = p.cost_combo
20 if idx1 == idx2 {
21 return
22 }
23
24 if idx2 < idx1 {
25 var t uint32 = idx2
26 idx2 = idx1
27 idx1 = t
28 }
29
30 p.idx1 = idx1
31 p.idx2 = idx2
32 p.cost_diff = 0.5 * clusterCostDiff(uint(cluster_size[idx1]), uint(cluster_size[idx2]))
33 p.cost_diff -= out[idx1].bit_cost_
34 p.cost_diff -= out[idx2].bit_cost_
35
36 if out[idx1].total_count_ == 0 {
37 p.cost_combo = out[idx2].bit_cost_
38 is_good_pair = true
39 } else if out[idx2].total_count_ == 0 {
40 p.cost_combo = out[idx1].bit_cost_
41 is_good_pair = true
42 } else {
43 var threshold float64
44 if *num_pairs == 0 {
45 threshold = 1e99
46 } else {
47 threshold = brotli_max_double(0.0, pairs[0].cost_diff)
48 }
49 var combo histogramDistance = out[idx1]
50 var cost_combo float64
51 histogramAddHistogramDistance(&combo, &out[idx2])
52 cost_combo = populationCostDistance(&combo)
53 if cost_combo < threshold-p.cost_diff {
54 p.cost_combo = cost_combo
55 is_good_pair = true
56 }
57 }
58
59 if is_good_pair {
60 p.cost_diff += p.cost_combo
61 if *num_pairs > 0 && histogramPairIsLess(&pairs[0], &p) {
62 /* Replace the top of the queue if needed. */
63 if *num_pairs < max_num_pairs {
64 pairs[*num_pairs] = pairs[0]
65 (*num_pairs)++
66 }
67
68 pairs[0] = p
69 } else if *num_pairs < max_num_pairs {
70 pairs[*num_pairs] = p
71 (*num_pairs)++
72 }
73 }
74}
75
76func histogramCombineDistance(out []histogramDistance, cluster_size []uint32, symbols []uint32, clusters []uint32, pairs []histogramPair, num_clusters uint, symbols_size uint, max_clusters uint, max_num_pairs uint) uint {
77 var cost_diff_threshold float64 = 0.0
78 var min_cluster_size uint = 1
79 var num_pairs uint = 0
80 {
81 /* We maintain a vector of histogram pairs, with the property that the pair
82 with the maximum bit cost reduction is the first. */
83 var idx1 uint
84 for idx1 = 0; idx1 < num_clusters; idx1++ {
85 var idx2 uint
86 for idx2 = idx1 + 1; idx2 < num_clusters; idx2++ {
87 compareAndPushToQueueDistance(out, cluster_size, clusters[idx1], clusters[idx2], max_num_pairs, pairs[0:], &num_pairs)
88 }
89 }
90 }
91
92 for num_clusters > min_cluster_size {
93 var best_idx1 uint32
94 var best_idx2 uint32
95 var i uint
96 if pairs[0].cost_diff >= cost_diff_threshold {
97 cost_diff_threshold = 1e99
98 min_cluster_size = max_clusters
99 continue
100 }
101
102 /* Take the best pair from the top of heap. */
103 best_idx1 = pairs[0].idx1
104
105 best_idx2 = pairs[0].idx2
106 histogramAddHistogramDistance(&out[best_idx1], &out[best_idx2])
107 out[best_idx1].bit_cost_ = pairs[0].cost_combo
108 cluster_size[best_idx1] += cluster_size[best_idx2]
109 for i = 0; i < symbols_size; i++ {
110 if symbols[i] == best_idx2 {
111 symbols[i] = best_idx1
112 }
113 }
114
115 for i = 0; i < num_clusters; i++ {
116 if clusters[i] == best_idx2 {
117 copy(clusters[i:], clusters[i+1:][:num_clusters-i-1])
118 break
119 }
120 }
121
122 num_clusters--
123 {
124 /* Remove pairs intersecting the just combined best pair. */
125 var copy_to_idx uint = 0
126 for i = 0; i < num_pairs; i++ {
127 var p *histogramPair = &pairs[i]
128 if p.idx1 == best_idx1 || p.idx2 == best_idx1 || p.idx1 == best_idx2 || p.idx2 == best_idx2 {
129 /* Remove invalid pair from the queue. */
130 continue
131 }
132
133 if histogramPairIsLess(&pairs[0], p) {
134 /* Replace the top of the queue if needed. */
135 var front histogramPair = pairs[0]
136 pairs[0] = *p
137 pairs[copy_to_idx] = front
138 } else {
139 pairs[copy_to_idx] = *p
140 }
141
142 copy_to_idx++
143 }
144
145 num_pairs = copy_to_idx
146 }
147
148 /* Push new pairs formed with the combined histogram to the heap. */
149 for i = 0; i < num_clusters; i++ {
150 compareAndPushToQueueDistance(out, cluster_size, best_idx1, clusters[i], max_num_pairs, pairs[0:], &num_pairs)
151 }
152 }
153
154 return num_clusters
155}
156
157/* What is the bit cost of moving histogram from cur_symbol to candidate. */
158func histogramBitCostDistanceDistance(histogram *histogramDistance, candidate *histogramDistance) float64 {
159 if histogram.total_count_ == 0 {
160 return 0.0
161 } else {
162 var tmp histogramDistance = *histogram
163 histogramAddHistogramDistance(&tmp, candidate)
164 return populationCostDistance(&tmp) - candidate.bit_cost_
165 }
166}
167
168/* Find the best 'out' histogram for each of the 'in' histograms.
169 When called, clusters[0..num_clusters) contains the unique values from
170 symbols[0..in_size), but this property is not preserved in this function.
171 Note: we assume that out[]->bit_cost_ is already up-to-date. */
172func histogramRemapDistance(in []histogramDistance, in_size uint, clusters []uint32, num_clusters uint, out []histogramDistance, symbols []uint32) {
173 var i uint
174 for i = 0; i < in_size; i++ {
175 var best_out uint32
176 if i == 0 {
177 best_out = symbols[0]
178 } else {
179 best_out = symbols[i-1]
180 }
181 var best_bits float64 = histogramBitCostDistanceDistance(&in[i], &out[best_out])
182 var j uint
183 for j = 0; j < num_clusters; j++ {
184 var cur_bits float64 = histogramBitCostDistanceDistance(&in[i], &out[clusters[j]])
185 if cur_bits < best_bits {
186 best_bits = cur_bits
187 best_out = clusters[j]
188 }
189 }
190
191 symbols[i] = best_out
192 }
193
194 /* Recompute each out based on raw and symbols. */
195 for i = 0; i < num_clusters; i++ {
196 histogramClearDistance(&out[clusters[i]])
197 }
198
199 for i = 0; i < in_size; i++ {
200 histogramAddHistogramDistance(&out[symbols[i]], &in[i])
201 }
202}
203
204/* Reorders elements of the out[0..length) array and changes values in
205 symbols[0..length) array in the following way:
206 * when called, symbols[] contains indexes into out[], and has N unique
207 values (possibly N < length)
208 * on return, symbols'[i] = f(symbols[i]) and
209 out'[symbols'[i]] = out[symbols[i]], for each 0 <= i < length,
210 where f is a bijection between the range of symbols[] and [0..N), and
211 the first occurrences of values in symbols'[i] come in consecutive
212 increasing order.
213 Returns N, the number of unique values in symbols[]. */
214
215var histogramReindexDistance_kInvalidIndex uint32 = math.MaxUint32
216
217func histogramReindexDistance(out []histogramDistance, symbols []uint32, length uint) uint {
218 var new_index []uint32 = make([]uint32, length)
219 var next_index uint32
220 var tmp []histogramDistance
221 var i uint
222 for i = 0; i < length; i++ {
223 new_index[i] = histogramReindexDistance_kInvalidIndex
224 }
225
226 next_index = 0
227 for i = 0; i < length; i++ {
228 if new_index[symbols[i]] == histogramReindexDistance_kInvalidIndex {
229 new_index[symbols[i]] = next_index
230 next_index++
231 }
232 }
233
234 /* TODO: by using idea of "cycle-sort" we can avoid allocation of
235 tmp and reduce the number of copying by the factor of 2. */
236 tmp = make([]histogramDistance, next_index)
237
238 next_index = 0
239 for i = 0; i < length; i++ {
240 if new_index[symbols[i]] == next_index {
241 tmp[next_index] = out[symbols[i]]
242 next_index++
243 }
244
245 symbols[i] = new_index[symbols[i]]
246 }
247
248 new_index = nil
249 for i = 0; uint32(i) < next_index; i++ {
250 out[i] = tmp[i]
251 }
252
253 tmp = nil
254 return uint(next_index)
255}
256
257func clusterHistogramsDistance(in []histogramDistance, in_size uint, max_histograms uint, out []histogramDistance, out_size *uint, histogram_symbols []uint32) {
258 var cluster_size []uint32 = make([]uint32, in_size)
259 var clusters []uint32 = make([]uint32, in_size)
260 var num_clusters uint = 0
261 var max_input_histograms uint = 64
262 var pairs_capacity uint = max_input_histograms * max_input_histograms / 2
263 var pairs []histogramPair = make([]histogramPair, (pairs_capacity + 1))
264 var i uint
265
266 /* For the first pass of clustering, we allow all pairs. */
267 for i = 0; i < in_size; i++ {
268 cluster_size[i] = 1
269 }
270
271 for i = 0; i < in_size; i++ {
272 out[i] = in[i]
273 out[i].bit_cost_ = populationCostDistance(&in[i])
274 histogram_symbols[i] = uint32(i)
275 }
276
277 for i = 0; i < in_size; i += max_input_histograms {
278 var num_to_combine uint = brotli_min_size_t(in_size-i, max_input_histograms)
279 var num_new_clusters uint
280 var j uint
281 for j = 0; j < num_to_combine; j++ {
282 clusters[num_clusters+j] = uint32(i + j)
283 }
284
285 num_new_clusters = histogramCombineDistance(out, cluster_size, histogram_symbols[i:], clusters[num_clusters:], pairs, num_to_combine, num_to_combine, max_histograms, pairs_capacity)
286 num_clusters += num_new_clusters
287 }
288 {
289 /* For the second pass, we limit the total number of histogram pairs.
290 After this limit is reached, we only keep searching for the best pair. */
291 var max_num_pairs uint = brotli_min_size_t(64*num_clusters, (num_clusters/2)*num_clusters)
292 if pairs_capacity < (max_num_pairs + 1) {
293 var _new_size uint
294 if pairs_capacity == 0 {
295 _new_size = max_num_pairs + 1
296 } else {
297 _new_size = pairs_capacity
298 }
299 var new_array []histogramPair
300 for _new_size < (max_num_pairs + 1) {
301 _new_size *= 2
302 }
303 new_array = make([]histogramPair, _new_size)
304 if pairs_capacity != 0 {
305 copy(new_array, pairs[:pairs_capacity])
306 }
307
308 pairs = new_array
309 pairs_capacity = _new_size
310 }
311
312 /* Collapse similar histograms. */
313 num_clusters = histogramCombineDistance(out, cluster_size, histogram_symbols, clusters, pairs, num_clusters, in_size, max_histograms, max_num_pairs)
314 }
315
316 pairs = nil
317 cluster_size = nil
318
319 /* Find the optimal map from original histograms to the final ones. */
320 histogramRemapDistance(in, in_size, clusters, num_clusters, out, histogram_symbols)
321
322 clusters = nil
323
324 /* Convert the context map to a canonical form. */
325 *out_size = histogramReindexDistance(out, histogram_symbols, in_size)
326}
Note: See TracBrowser for help on using the repository browser.