diff --git a/README.md b/README.md index 06fe14114e..266d613a21 100644 --- a/README.md +++ b/README.md @@ -8,12 +8,14 @@ This package provides various compression algorithms. * [huff0](https://github.com/klauspost/compress/tree/master/huff0) and [FSE](https://github.com/klauspost/compress/tree/master/fse) implementations for raw entropy encoding. * [pgzip](https://github.com/klauspost/pgzip) is a separate package that provides a very fast parallel gzip implementation. +[](https://pkg.go.dev/github.com/klauspost/compress?tab=subdirectories) [](https://travis-ci.org/klauspost/compress) [](https://sourcegraph.com/github.com/klauspost/compress?badge) [](https://fuzzit.dev) # changelog +* Jan 31, 2020: Allow copying content from an existing ZIP file without decompressing+compressing. [#214](https://github.com/klauspost/compress/pull/214) * Jan 28, 2020: Added [S2](https://github.com/klauspost/compress/tree/master/s2#s2-compression) AMD64 assembler and various optimizations. Stream speed >10GB/s. [#186](https://github.com/klauspost/compress/pull/186) * Jan 20,2020 (v1.9.8) Optimize gzip/deflate with better size estimates and faster table generation. [#207](https://github.com/klauspost/compress/pull/207) by [luyu6056](https://github.com/luyu6056), [#206](https://github.com/klauspost/compress/pull/206). * Jan 11, 2020: S2 Encode/Decode will use provided buffer if capacity is big enough. [#204](https://github.com/klauspost/compress/pull/204) @@ -118,12 +120,14 @@ This package provides various compression algorithms. The packages are drop-in replacements for standard libraries. 
Simply replace the import path to use them: -| old import | new import | -|--------------------|-----------------------------------------| -| `compress/gzip` | `github.com/klauspost/compress/gzip` | -| `compress/zlib` | `github.com/klauspost/compress/zlib` | -| `archive/zip` | `github.com/klauspost/compress/zip` | -| `compress/flate` | `github.com/klauspost/compress/flate` | +| old import | new import | Documentation +|--------------------|-----------------------------------------|--------------------| +| `compress/gzip` | `github.com/klauspost/compress/gzip` | [gzip](https://pkg.go.dev/github.com/klauspost/compress/gzip?tab=doc) +| `compress/zlib` | `github.com/klauspost/compress/zlib` | [zlib](https://pkg.go.dev/github.com/klauspost/compress/zlib?tab=doc) +| `archive/zip` | `github.com/klauspost/compress/zip` | [zip](https://pkg.go.dev/github.com/klauspost/compress/zip?tab=doc) +| `compress/flate` | `github.com/klauspost/compress/flate` | [flate](https://pkg.go.dev/github.com/klauspost/compress/flate?tab=doc) + +* Optimized [deflate](https://godoc.org/github.com/klauspost/compress/flate) packages which can be used as a dropin replacement for [gzip](https://godoc.org/github.com/klauspost/compress/gzip), [zip](https://godoc.org/github.com/klauspost/compress/zip) and [zlib](https://godoc.org/github.com/klauspost/compress/zlib). You may also be interested in [pgzip](https://github.com/klauspost/pgzip), which is a drop in replacement for gzip, which support multithreaded compression on big files and the optimized [crc32](https://github.com/klauspost/crc32) package used by these packages. diff --git a/zip/reader.go b/zip/reader.go index 2260b398c3..9fab6e3878 100644 --- a/zip/reader.go +++ b/zip/reader.go @@ -182,6 +182,16 @@ func (f *File) Open() (io.ReadCloser, error) { return rc, nil } +// OpenRaw returns a Reader that returns the *compressed* output of the file. 
+func (f *File) OpenRaw() (io.Reader, error) { + bodyOffset, err := f.findBodyOffset() + if err != nil { + return nil, err + } + size := int64(f.CompressedSize64) + return io.NewSectionReader(f.zipr, f.headerOffset+bodyOffset, size), nil +} + type checksumReader struct { rc io.ReadCloser hash hash.Hash32 diff --git a/zip/writer.go b/zip/writer.go index cdc534eaf0..4d0628a879 100644 --- a/zip/writer.go +++ b/zip/writer.go @@ -20,11 +20,16 @@ var ( errLongExtra = errors.New("zip: FileHeader.Extra too long") ) +type lastWriter interface { + Close() error + Closed() bool +} + // Writer implements a zip file writer. type Writer struct { cw *countWriter dir []*header - last *fileWriter + last lastWriter closed bool compressors map[uint16]Compressor comment string @@ -72,10 +77,10 @@ func (w *Writer) SetComment(comment string) error { } // Close finishes writing the zip file by writing the central directory. -// It does not close the underlying writer. +// It does not Close the underlying writer. func (w *Writer) Close() error { - if w.last != nil && !w.last.closed { - if err := w.last.close(); err != nil { + if w.last != nil && !w.last.Closed() { + if err := w.last.Close(); err != nil { return err } w.last = nil @@ -222,6 +227,25 @@ func (w *Writer) Create(name string) (io.Writer, error) { return w.CreateHeader(header) } +// Copy will copy raw content from input file. +// Optionally a different name can be given to the new file. +func (w *Writer) Copy(name string, src *File) error { + header := src.FileHeader + if name != "" { + header.Name = name + } + raw, err := src.OpenRaw() + if err != nil { + return err + } + wr, err := w.CreateHeaderRaw(&header) + if err != nil { + return err + } + _, err = io.Copy(wr, raw) + return err +} + // detectUTF8 reports whether s is a valid UTF-8 string, and whether the string // must be considered UTF-8 encoding (i.e., not compatible with CP-437, ASCII, // or any other common encoding). 
@@ -251,10 +275,10 @@ func detectUTF8(s string) (valid, require bool) { // // This returns a Writer to which the file contents should be written. // The file's contents must be written to the io.Writer before the next -// call to Create, CreateHeader, or Close. +// call to Create, Copy, CreateHeader, CreateHeaderRaw or Close. func (w *Writer) CreateHeader(fh *FileHeader) (io.Writer, error) { - if w.last != nil && !w.last.closed { - if err := w.last.close(); err != nil { + if w.last != nil && !w.last.Closed() { + if err := w.last.Close(); err != nil { return nil, err } } @@ -319,10 +343,7 @@ func (w *Writer) CreateHeader(fh *FileHeader) (io.Writer, error) { fh.Extra = append(fh.Extra, mbuf[:]...) } - var ( - ow io.Writer - fw *fileWriter - ) + var ow io.Writer h := &header{ FileHeader: fh, offset: uint64(w.cw.count), @@ -343,10 +364,11 @@ func (w *Writer) CreateHeader(fh *FileHeader) (io.Writer, error) { fh.UncompressedSize64 = 0 ow = dirWriter{} + w.last = nil } else { fh.Flags |= 0x8 // we will write a data descriptor - fw = &fileWriter{ + fw := &fileWriter{ zipw: w.cw, compCount: &countWriter{w: w.cw}, crc32: crc32.NewIEEE(), @@ -363,13 +385,131 @@ func (w *Writer) CreateHeader(fh *FileHeader) (io.Writer, error) { fw.rawCount = &countWriter{w: fw.comp} fw.header = h ow = fw + w.last = fw + } + w.dir = append(w.dir, h) + if err := writeHeader(w.cw, fh); err != nil { + return nil, err + } + // If we're creating a directory, fw is nil. + return ow, nil +} + +// CreateHeaderRaw adds a file to the zip archive using the provided FileHeader +// for the file metadata. Writer takes ownership of fh and may mutate +// its fields. The caller must not modify fh after calling CreateHeaderRaw. +// +// This returns a Writer to which the compressed file contents should be written. +// The file's contents must be written to the io.Writer before the next +// call to Create, Copy, CreateHeader, CreateHeaderRaw or Close. 
+// +// Using this requires knowledge of populating the FileHeader correctly. +// Generally using the Copy() function is recommended. +func (w *Writer) CreateHeaderRaw(fh *FileHeader) (io.Writer, error) { + if w.last != nil && !w.last.Closed() { + if err := w.last.Close(); err != nil { + return nil, err + } + } + if len(w.dir) > 0 && w.dir[len(w.dir)-1].FileHeader == fh { + // See https://golang.org/issue/11144 confusion. + return nil, errors.New("archive/zip: invalid duplicate FileHeader") + } + + // The ZIP format has a sad state of affairs regarding character encoding. + // Officially, the name and comment fields are supposed to be encoded + // in CP-437 (which is mostly compatible with ASCII), unless the UTF-8 + // flag bit is set. However, there are several problems: + // + // * Many ZIP readers still do not support UTF-8. + // * If the UTF-8 flag is cleared, several readers simply interpret the + // name and comment fields as whatever the local system encoding is. + // + // In order to avoid breaking readers without UTF-8 support, + // we avoid setting the UTF-8 flag if the strings are CP-437 compatible. + // However, if the strings require multibyte UTF-8 encoding and is a + // valid UTF-8 string, then we set the UTF-8 bit. + // + // For the case, where the user explicitly wants to specify the encoding + // as UTF-8, they will need to set the flag bit themselves. + utf8Valid1, utf8Require1 := detectUTF8(fh.Name) + utf8Valid2, utf8Require2 := detectUTF8(fh.Comment) + switch { + case fh.NonUTF8: + fh.Flags &^= 0x800 + case (utf8Require1 || utf8Require2) && (utf8Valid1 && utf8Valid2): + fh.Flags |= 0x800 + } + + fh.CreatorVersion = fh.CreatorVersion&0xff00 | zipVersion20 // preserve compatibility byte + fh.ReaderVersion = zipVersion20 + + // If Modified is set, this takes precedence over MS-DOS timestamp fields. 
+ if !fh.Modified.IsZero() { + // Contrary to the FileHeader.SetModTime method, we intentionally + // do not convert to UTC, because we assume the user intends to encode + // the date using the specified timezone. A user may want this control + // because many legacy ZIP readers interpret the timestamp according + // to the local timezone. + // + // The timezone is only non-UTC if a user directly sets the Modified + // field directly themselves. All other approaches sets UTC. + fh.ModifiedDate, fh.ModifiedTime = timeToMsDosTime(fh.Modified) + + // Use "extended timestamp" format since this is what Info-ZIP uses. + // Nearly every major ZIP implementation uses a different format, + // but at least most seem to be able to understand the other formats. + // + // This format happens to be identical for both local and central header + // if modification time is the only timestamp being encoded. + var mbuf [9]byte // 2*SizeOf(uint16) + SizeOf(uint8) + SizeOf(uint32) + mt := uint32(fh.Modified.Unix()) + eb := writeBuf(mbuf[:]) + eb.uint16(extTimeExtraID) + eb.uint16(5) // Size: SizeOf(uint8) + SizeOf(uint32) + eb.uint8(1) // Flags: ModTime + eb.uint32(mt) // ModTime + fh.Extra = append(fh.Extra, mbuf[:]...) + } + + var ow io.Writer + h := &header{ + FileHeader: fh, + offset: uint64(w.cw.count), + } + + if strings.HasSuffix(fh.Name, "/") { + // Set the compression method to Store to ensure data length is truly zero, + // which the writeHeader method always encodes for the size fields. + // This is necessary as most compression formats have non-zero lengths + // even when compressing an empty string. + fh.Method = Store + fh.Flags &^= 0x8 // we will not write a data descriptor + + // Explicitly clear sizes as they have no meaning for directories. 
+ fh.CompressedSize = 0 + fh.CompressedSize64 = 0 + fh.UncompressedSize = 0 + fh.UncompressedSize64 = 0 + + ow = dirWriter{} + w.last = nil + } else { + fh.Flags |= 0x8 // we will write a data descriptor + + fw := &rawWriter{ + header: h, + zipw: w.cw, + rawCount: &countWriter{w: w.cw}, + } + ow = fw + w.last = fw } w.dir = append(w.dir, h) if err := writeHeader(w.cw, fh); err != nil { return nil, err } // If we're creating a directory, fw is nil. - w.last = fw return ow, nil } @@ -450,7 +590,11 @@ func (w *fileWriter) Write(p []byte) (int, error) { return w.rawCount.Write(p) } -func (w *fileWriter) close() error { +func (w *fileWriter) Closed() bool { + return w.closed +} + +func (w *fileWriter) Close() error { if w.closed { return errors.New("zip: file closed twice") } @@ -499,6 +643,57 @@ func (w *fileWriter) close() error { return err } +type rawWriter struct { + *header + zipw io.Writer + rawCount *countWriter + closed bool +} + +func (w *rawWriter) Write(p []byte) (int, error) { + if w.closed { + return 0, errors.New("zip: write to closed file") + } + return w.rawCount.Write(p) +} + +func (w *rawWriter) Closed() bool { + return w.closed +} + +func (w *rawWriter) Close() error { + if w.closed { + return errors.New("zip: file closed twice") + } + w.closed = true + fh := w.FileHeader + fh.CompressedSize64 = uint64(w.rawCount.count) + + // Write data descriptor. This is more complicated than one would + // think, see e.g. comments in zipfile.c:putextended() and + // http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=7073588. + // The approach here is to write 8 byte sizes if needed without + // adding a zip64 extra in the local header (too late anyway). 
+ var buf []byte + if fh.isZip64() { + buf = make([]byte, dataDescriptor64Len) + } else { + buf = make([]byte, dataDescriptorLen) + } + b := writeBuf(buf) + b.uint32(dataDescriptorSignature) // de-facto standard, required by OS X + b.uint32(fh.CRC32) + if fh.isZip64() { + b.uint64(fh.CompressedSize64) + b.uint64(fh.UncompressedSize64) + } else { + b.uint32(fh.CompressedSize) + b.uint32(fh.UncompressedSize) + } + _, err := w.zipw.Write(buf) + return err +} + type countWriter struct { w io.Writer count int64 diff --git a/zip/writer_test.go b/zip/writer_test.go index 1fedfd85e8..61540fbd4d 100644 --- a/zip/writer_test.go +++ b/zip/writer_test.go @@ -247,6 +247,58 @@ func TestWriterTime(t *testing.T) { } } +func TestWriterCopy(t *testing.T) { + want, err := ioutil.ReadFile("testdata/test.zip") + if err != nil { + t.Fatalf("unexpected ReadFile error: %v", err) + } + r, err := NewReader(bytes.NewReader(want), int64(len(want))) + if err != nil { + t.Fatalf("unexpected NewReader error: %v", err) + } + var buf bytes.Buffer + w := NewWriter(&buf) + for _, f := range r.File { + err := w.Copy("", f) + if err != nil { + t.Fatalf("unexpected Copy error: %v", err) + } + } + if err := w.Close(); err != nil { + t.Fatalf("unexpected Close error: %v", err) + } + + // Read back... 
+ got := buf.Bytes() + r2, err := NewReader(bytes.NewReader(got), int64(len(got))) + if err != nil { + t.Fatalf("unexpected NewReader error: %v", err) + } + for i, fWant := range r.File { + wantR, err := fWant.Open() + if err != nil { + t.Fatalf("unexpected Open error: %v", err) + } + want, err := ioutil.ReadAll(wantR) + if err != nil { + t.Fatalf("unexpected ReadAll error: %v", err) + } + + fGot := r2.File[i] + gotR, err := fGot.Open() + if err != nil { + t.Fatalf("unexpected Open error: %v", err) + } + got, err := ioutil.ReadAll(gotR) + if err != nil { + t.Fatalf("unexpected ReadAll error: %v", err) + } + if !bytes.Equal(got, want) { + fmt.Printf("%x\n%x\n", got, want) + t.Error("contents of copied file mismatch") + } + } +} func TestWriterOffset(t *testing.T) { largeData := make([]byte, 1<<17) if _, err := rand.Read(largeData); err != nil {