1 | // Copyright 2009 The Go Authors. All rights reserved.
|
---|
2 | // Use of this source code is governed by a BSD-style
|
---|
3 | // license that can be found in the LICENSE file.
|
---|
4 |
|
---|
5 | // Package gzip implements reading and writing of gzip format compressed files,
|
---|
6 | // as specified in RFC 1952.
|
---|
7 | package gzip
|
---|
8 |
|
---|
9 | import (
|
---|
10 | "bufio"
|
---|
11 | "compress/gzip"
|
---|
12 | "encoding/binary"
|
---|
13 | "hash/crc32"
|
---|
14 | "io"
|
---|
15 | "time"
|
---|
16 |
|
---|
17 | "github.com/klauspost/compress/flate"
|
---|
18 | )
|
---|
19 |
|
---|
// Fixed values checked in the first three bytes of every member header.
const (
	gzipID1     = 0x1f // first identification byte
	gzipID2     = 0x8b // second identification byte
	gzipDeflate = 8    // the only compression method readHeader accepts
)

// Bit masks applied to the header's flag byte.
const (
	flagText    = 1 << iota // bit 0
	flagHdrCrc              // bit 1: a 16-bit header checksum follows the optional fields
	flagExtra               // bit 2: a length-prefixed "extra" field is present
	flagName                // bit 3: a NUL-terminated file name is present
	flagComment             // bit 4: a NUL-terminated comment is present
)
|
---|
30 |
|
---|
// ErrChecksum is returned when reading GZIP data that has an invalid checksum.
// It is the very same value as compress/gzip.ErrChecksum, so callers may
// compare against either one.
var ErrChecksum = gzip.ErrChecksum

// ErrHeader is returned when reading GZIP data that has an invalid header.
// It is the very same value as compress/gzip.ErrHeader.
var ErrHeader = gzip.ErrHeader

// le is the byte order used to decode every multi-byte integer in the
// header and trailer.
var le = binary.LittleEndian
|
---|
39 |
|
---|
// noEOF maps io.EOF to io.ErrUnexpectedEOF and passes every other value,
// nil included, through untouched. It is used where running out of input
// means the stream was cut short rather than cleanly finished.
func noEOF(err error) error {
	switch err {
	case io.EOF:
		return io.ErrUnexpectedEOF
	default:
		return err
	}
}
|
---|
47 |
|
---|
// Header holds the metadata that a gzip file stores ahead of the compressed
// data. It is exposed as the fields of the Writer and Reader structs.
//
// Strings must be UTF-8 encoded and may only contain Unicode code points
// U+0001 through U+00FF, due to limitations of the GZIP file format.
type Header struct {
	// Comment is the free-form comment.
	Comment string
	// Extra is the "extra data" field.
	Extra []byte
	// ModTime is the modification time.
	ModTime time.Time
	// Name is the file name.
	Name string
	// OS is the operating system type.
	OS byte
}
|
---|
60 |
|
---|
61 | // A Reader is an io.Reader that can be read to retrieve
|
---|
62 | // uncompressed data from a gzip-format compressed file.
|
---|
63 | //
|
---|
64 | // In general, a gzip file can be a concatenation of gzip files,
|
---|
65 | // each with its own header. Reads from the Reader
|
---|
66 | // return the concatenation of the uncompressed data of each.
|
---|
67 | // Only the first header is recorded in the Reader fields.
|
---|
68 | //
|
---|
69 | // Gzip files store a length and checksum of the uncompressed data.
|
---|
70 | // The Reader will return a ErrChecksum when Read
|
---|
71 | // reaches the end of the uncompressed data if it does not
|
---|
72 | // have the expected length or checksum. Clients should treat data
|
---|
73 | // returned by Read as tentative until they receive the io.EOF
|
---|
74 | // marking the end of the data.
|
---|
75 | type Reader struct {
|
---|
76 | Header // valid after NewReader or Reader.Reset
|
---|
77 | r flate.Reader
|
---|
78 | br *bufio.Reader
|
---|
79 | decompressor io.ReadCloser
|
---|
80 | digest uint32 // CRC-32, IEEE polynomial (section 8)
|
---|
81 | size uint32 // Uncompressed size (section 2.3.1)
|
---|
82 | buf [512]byte
|
---|
83 | err error
|
---|
84 | multistream bool
|
---|
85 | }
|
---|
86 |
|
---|
87 | // NewReader creates a new Reader reading the given reader.
|
---|
88 | // If r does not also implement io.ByteReader,
|
---|
89 | // the decompressor may read more data than necessary from r.
|
---|
90 | //
|
---|
91 | // It is the caller's responsibility to call Close on the Reader when done.
|
---|
92 | //
|
---|
93 | // The Reader.Header fields will be valid in the Reader returned.
|
---|
94 | func NewReader(r io.Reader) (*Reader, error) {
|
---|
95 | z := new(Reader)
|
---|
96 | if err := z.Reset(r); err != nil {
|
---|
97 | return nil, err
|
---|
98 | }
|
---|
99 | return z, nil
|
---|
100 | }
|
---|
101 |
|
---|
102 | // Reset discards the Reader z's state and makes it equivalent to the
|
---|
103 | // result of its original state from NewReader, but reading from r instead.
|
---|
104 | // This permits reusing a Reader rather than allocating a new one.
|
---|
105 | func (z *Reader) Reset(r io.Reader) error {
|
---|
106 | *z = Reader{
|
---|
107 | decompressor: z.decompressor,
|
---|
108 | multistream: true,
|
---|
109 | }
|
---|
110 | if rr, ok := r.(flate.Reader); ok {
|
---|
111 | z.r = rr
|
---|
112 | } else {
|
---|
113 | // Reuse if we can.
|
---|
114 | if z.br != nil {
|
---|
115 | z.br.Reset(r)
|
---|
116 | } else {
|
---|
117 | z.br = bufio.NewReader(r)
|
---|
118 | }
|
---|
119 | z.r = z.br
|
---|
120 | }
|
---|
121 | z.Header, z.err = z.readHeader()
|
---|
122 | return z.err
|
---|
123 | }
|
---|
124 |
|
---|
125 | // Multistream controls whether the reader supports multistream files.
|
---|
126 | //
|
---|
127 | // If enabled (the default), the Reader expects the input to be a sequence
|
---|
128 | // of individually gzipped data streams, each with its own header and
|
---|
129 | // trailer, ending at EOF. The effect is that the concatenation of a sequence
|
---|
130 | // of gzipped files is treated as equivalent to the gzip of the concatenation
|
---|
131 | // of the sequence. This is standard behavior for gzip readers.
|
---|
132 | //
|
---|
133 | // Calling Multistream(false) disables this behavior; disabling the behavior
|
---|
134 | // can be useful when reading file formats that distinguish individual gzip
|
---|
135 | // data streams or mix gzip data streams with other data streams.
|
---|
136 | // In this mode, when the Reader reaches the end of the data stream,
|
---|
137 | // Read returns io.EOF. If the underlying reader implements io.ByteReader,
|
---|
138 | // it will be left positioned just after the gzip stream.
|
---|
139 | // To start the next stream, call z.Reset(r) followed by z.Multistream(false).
|
---|
140 | // If there is no next stream, z.Reset(r) will return io.EOF.
|
---|
141 | func (z *Reader) Multistream(ok bool) {
|
---|
142 | z.multistream = ok
|
---|
143 | }
|
---|
144 |
|
---|
145 | // readString reads a NUL-terminated string from z.r.
|
---|
146 | // It treats the bytes read as being encoded as ISO 8859-1 (Latin-1) and
|
---|
147 | // will output a string encoded using UTF-8.
|
---|
148 | // This method always updates z.digest with the data read.
|
---|
149 | func (z *Reader) readString() (string, error) {
|
---|
150 | var err error
|
---|
151 | needConv := false
|
---|
152 | for i := 0; ; i++ {
|
---|
153 | if i >= len(z.buf) {
|
---|
154 | return "", ErrHeader
|
---|
155 | }
|
---|
156 | z.buf[i], err = z.r.ReadByte()
|
---|
157 | if err != nil {
|
---|
158 | return "", err
|
---|
159 | }
|
---|
160 | if z.buf[i] > 0x7f {
|
---|
161 | needConv = true
|
---|
162 | }
|
---|
163 | if z.buf[i] == 0 {
|
---|
164 | // Digest covers the NUL terminator.
|
---|
165 | z.digest = crc32.Update(z.digest, crc32.IEEETable, z.buf[:i+1])
|
---|
166 |
|
---|
167 | // Strings are ISO 8859-1, Latin-1 (RFC 1952, section 2.3.1).
|
---|
168 | if needConv {
|
---|
169 | s := make([]rune, 0, i)
|
---|
170 | for _, v := range z.buf[:i] {
|
---|
171 | s = append(s, rune(v))
|
---|
172 | }
|
---|
173 | return string(s), nil
|
---|
174 | }
|
---|
175 | return string(z.buf[:i]), nil
|
---|
176 | }
|
---|
177 | }
|
---|
178 | }
|
---|
179 |
|
---|
180 | // readHeader reads the GZIP header according to section 2.3.1.
|
---|
181 | // This method does not set z.err.
|
---|
182 | func (z *Reader) readHeader() (hdr Header, err error) {
|
---|
183 | if _, err = io.ReadFull(z.r, z.buf[:10]); err != nil {
|
---|
184 | // RFC 1952, section 2.2, says the following:
|
---|
185 | // A gzip file consists of a series of "members" (compressed data sets).
|
---|
186 | //
|
---|
187 | // Other than this, the specification does not clarify whether a
|
---|
188 | // "series" is defined as "one or more" or "zero or more". To err on the
|
---|
189 | // side of caution, Go interprets this to mean "zero or more".
|
---|
190 | // Thus, it is okay to return io.EOF here.
|
---|
191 | return hdr, err
|
---|
192 | }
|
---|
193 | if z.buf[0] != gzipID1 || z.buf[1] != gzipID2 || z.buf[2] != gzipDeflate {
|
---|
194 | return hdr, ErrHeader
|
---|
195 | }
|
---|
196 | flg := z.buf[3]
|
---|
197 | hdr.ModTime = time.Unix(int64(le.Uint32(z.buf[4:8])), 0)
|
---|
198 | // z.buf[8] is XFL and is currently ignored.
|
---|
199 | hdr.OS = z.buf[9]
|
---|
200 | z.digest = crc32.ChecksumIEEE(z.buf[:10])
|
---|
201 |
|
---|
202 | if flg&flagExtra != 0 {
|
---|
203 | if _, err = io.ReadFull(z.r, z.buf[:2]); err != nil {
|
---|
204 | return hdr, noEOF(err)
|
---|
205 | }
|
---|
206 | z.digest = crc32.Update(z.digest, crc32.IEEETable, z.buf[:2])
|
---|
207 | data := make([]byte, le.Uint16(z.buf[:2]))
|
---|
208 | if _, err = io.ReadFull(z.r, data); err != nil {
|
---|
209 | return hdr, noEOF(err)
|
---|
210 | }
|
---|
211 | z.digest = crc32.Update(z.digest, crc32.IEEETable, data)
|
---|
212 | hdr.Extra = data
|
---|
213 | }
|
---|
214 |
|
---|
215 | var s string
|
---|
216 | if flg&flagName != 0 {
|
---|
217 | if s, err = z.readString(); err != nil {
|
---|
218 | return hdr, err
|
---|
219 | }
|
---|
220 | hdr.Name = s
|
---|
221 | }
|
---|
222 |
|
---|
223 | if flg&flagComment != 0 {
|
---|
224 | if s, err = z.readString(); err != nil {
|
---|
225 | return hdr, err
|
---|
226 | }
|
---|
227 | hdr.Comment = s
|
---|
228 | }
|
---|
229 |
|
---|
230 | if flg&flagHdrCrc != 0 {
|
---|
231 | if _, err = io.ReadFull(z.r, z.buf[:2]); err != nil {
|
---|
232 | return hdr, noEOF(err)
|
---|
233 | }
|
---|
234 | digest := le.Uint16(z.buf[:2])
|
---|
235 | if digest != uint16(z.digest) {
|
---|
236 | return hdr, ErrHeader
|
---|
237 | }
|
---|
238 | }
|
---|
239 |
|
---|
240 | z.digest = 0
|
---|
241 | if z.decompressor == nil {
|
---|
242 | z.decompressor = flate.NewReader(z.r)
|
---|
243 | } else {
|
---|
244 | z.decompressor.(flate.Resetter).Reset(z.r, nil)
|
---|
245 | }
|
---|
246 | return hdr, nil
|
---|
247 | }
|
---|
248 |
|
---|
249 | // Read implements io.Reader, reading uncompressed bytes from its underlying Reader.
|
---|
250 | func (z *Reader) Read(p []byte) (n int, err error) {
|
---|
251 | if z.err != nil {
|
---|
252 | return 0, z.err
|
---|
253 | }
|
---|
254 |
|
---|
255 | n, z.err = z.decompressor.Read(p)
|
---|
256 | z.digest = crc32.Update(z.digest, crc32.IEEETable, p[:n])
|
---|
257 | z.size += uint32(n)
|
---|
258 | if z.err != io.EOF {
|
---|
259 | // In the normal case we return here.
|
---|
260 | return n, z.err
|
---|
261 | }
|
---|
262 |
|
---|
263 | // Finished file; check checksum and size.
|
---|
264 | if _, err := io.ReadFull(z.r, z.buf[:8]); err != nil {
|
---|
265 | z.err = noEOF(err)
|
---|
266 | return n, z.err
|
---|
267 | }
|
---|
268 | digest := le.Uint32(z.buf[:4])
|
---|
269 | size := le.Uint32(z.buf[4:8])
|
---|
270 | if digest != z.digest || size != z.size {
|
---|
271 | z.err = ErrChecksum
|
---|
272 | return n, z.err
|
---|
273 | }
|
---|
274 | z.digest, z.size = 0, 0
|
---|
275 |
|
---|
276 | // File is ok; check if there is another.
|
---|
277 | if !z.multistream {
|
---|
278 | return n, io.EOF
|
---|
279 | }
|
---|
280 | z.err = nil // Remove io.EOF
|
---|
281 |
|
---|
282 | if _, z.err = z.readHeader(); z.err != nil {
|
---|
283 | return n, z.err
|
---|
284 | }
|
---|
285 |
|
---|
286 | // Read from next file, if necessary.
|
---|
287 | if n > 0 {
|
---|
288 | return n, nil
|
---|
289 | }
|
---|
290 | return z.Read(p)
|
---|
291 | }
|
---|
292 |
|
---|
293 | // Support the io.WriteTo interface for io.Copy and friends.
|
---|
294 | func (z *Reader) WriteTo(w io.Writer) (int64, error) {
|
---|
295 | total := int64(0)
|
---|
296 | crcWriter := crc32.NewIEEE()
|
---|
297 | for {
|
---|
298 | if z.err != nil {
|
---|
299 | if z.err == io.EOF {
|
---|
300 | return total, nil
|
---|
301 | }
|
---|
302 | return total, z.err
|
---|
303 | }
|
---|
304 |
|
---|
305 | // We write both to output and digest.
|
---|
306 | mw := io.MultiWriter(w, crcWriter)
|
---|
307 | n, err := z.decompressor.(io.WriterTo).WriteTo(mw)
|
---|
308 | total += n
|
---|
309 | z.size += uint32(n)
|
---|
310 | if err != nil {
|
---|
311 | z.err = err
|
---|
312 | return total, z.err
|
---|
313 | }
|
---|
314 |
|
---|
315 | // Finished file; check checksum + size.
|
---|
316 | if _, err := io.ReadFull(z.r, z.buf[0:8]); err != nil {
|
---|
317 | if err == io.EOF {
|
---|
318 | err = io.ErrUnexpectedEOF
|
---|
319 | }
|
---|
320 | z.err = err
|
---|
321 | return total, err
|
---|
322 | }
|
---|
323 | z.digest = crcWriter.Sum32()
|
---|
324 | digest := le.Uint32(z.buf[:4])
|
---|
325 | size := le.Uint32(z.buf[4:8])
|
---|
326 | if digest != z.digest || size != z.size {
|
---|
327 | z.err = ErrChecksum
|
---|
328 | return total, z.err
|
---|
329 | }
|
---|
330 | z.digest, z.size = 0, 0
|
---|
331 |
|
---|
332 | // File is ok; check if there is another.
|
---|
333 | if !z.multistream {
|
---|
334 | return total, nil
|
---|
335 | }
|
---|
336 | crcWriter.Reset()
|
---|
337 | z.err = nil // Remove io.EOF
|
---|
338 |
|
---|
339 | if _, z.err = z.readHeader(); z.err != nil {
|
---|
340 | if z.err == io.EOF {
|
---|
341 | return total, nil
|
---|
342 | }
|
---|
343 | return total, z.err
|
---|
344 | }
|
---|
345 | }
|
---|
346 | }
|
---|
347 |
|
---|
348 | // Close closes the Reader. It does not close the underlying io.Reader.
|
---|
349 | // In order for the GZIP checksum to be verified, the reader must be
|
---|
350 | // fully consumed until the io.EOF.
|
---|
351 | func (z *Reader) Close() error { return z.decompressor.Close() }
|
---|