Skip to content

Commit

Permalink
Add compressed ZIP copying (#214)
Browse files Browse the repository at this point in the history
* Add compressed ZIP copying

Allow copying content from an existing ZIP file.
  • Loading branch information
klauspost authored Feb 1, 2020
1 parent e1f2a6e commit 26bacdf
Show file tree
Hide file tree
Showing 4 changed files with 281 additions and 20 deletions.
16 changes: 10 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@ This package provides various compression algorithms.
* [huff0](https://github.com/klauspost/compress/tree/master/huff0) and [FSE](https://github.com/klauspost/compress/tree/master/fse) implementations for raw entropy encoding.
* [pgzip](https://github.com/klauspost/pgzip) is a separate package that provides a very fast parallel gzip implementation.

[![Documentation](https://godoc.org/github.com/klauspost/compress?status.svg)](https://pkg.go.dev/github.com/klauspost/compress?tab=subdirectories)
[![Build Status](https://travis-ci.org/klauspost/compress.svg?branch=master)](https://travis-ci.org/klauspost/compress)
[![Sourcegraph Badge](https://sourcegraph.com/github.com/klauspost/compress/-/badge.svg)](https://sourcegraph.com/github.com/klauspost/compress?badge)
[![fuzzit](https://app.fuzzit.dev/badge?org_id=klauspost)](https://fuzzit.dev)

# changelog

* Jan 31, 2020: Allow copying content from an existing ZIP file without decompressing+compressing. [#214](https://github.com/klauspost/compress/pull/214)
* Jan 28, 2020: Added [S2](https://github.com/klauspost/compress/tree/master/s2#s2-compression) AMD64 assembler and various optimizations. Stream speed >10GB/s. [#186](https://github.com/klauspost/compress/pull/186)
* Jan 20, 2020 (v1.9.8): Optimize gzip/deflate with better size estimates and faster table generation. [#207](https://github.com/klauspost/compress/pull/207) by [luyu6056](https://github.com/luyu6056), [#206](https://github.com/klauspost/compress/pull/206).
* Jan 11, 2020: S2 Encode/Decode will use provided buffer if capacity is big enough. [#204](https://github.com/klauspost/compress/pull/204)
Expand Down Expand Up @@ -118,12 +120,14 @@ This package provides various compression algorithms.

The packages are drop-in replacements for standard libraries. Simply replace the import path to use them:

| old import | new import |
|--------------------|-----------------------------------------|
| `compress/gzip` | `github.com/klauspost/compress/gzip` |
| `compress/zlib` | `github.com/klauspost/compress/zlib` |
| `archive/zip` | `github.com/klauspost/compress/zip` |
| `compress/flate` | `github.com/klauspost/compress/flate` |
| old import | new import | Documentation |
|--------------------|-----------------------------------------|--------------------|
| `compress/gzip` | `github.com/klauspost/compress/gzip` | [gzip](https://pkg.go.dev/github.com/klauspost/compress/gzip?tab=doc) |
| `compress/zlib` | `github.com/klauspost/compress/zlib` | [zlib](https://pkg.go.dev/github.com/klauspost/compress/zlib?tab=doc) |
| `archive/zip` | `github.com/klauspost/compress/zip` | [zip](https://pkg.go.dev/github.com/klauspost/compress/zip?tab=doc) |
| `compress/flate` | `github.com/klauspost/compress/flate` | [flate](https://pkg.go.dev/github.com/klauspost/compress/flate?tab=doc) |

* Optimized [deflate](https://godoc.org/github.com/klauspost/compress/flate) packages which can be used as a dropin replacement for [gzip](https://godoc.org/github.com/klauspost/compress/gzip), [zip](https://godoc.org/github.com/klauspost/compress/zip) and [zlib](https://godoc.org/github.com/klauspost/compress/zlib).

You may also be interested in [pgzip](https://github.com/klauspost/pgzip), which is a drop-in replacement for gzip that supports multithreaded compression on big files, and the optimized [crc32](https://github.com/klauspost/crc32) package used by these packages.

Expand Down
10 changes: 10 additions & 0 deletions zip/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,16 @@ func (f *File) Open() (io.ReadCloser, error) {
return rc, nil
}

// OpenRaw returns a Reader that returns the *compressed* output of the file.
// No decompression is performed; the bytes are exactly as stored in the archive.
func (f *File) OpenRaw() (io.Reader, error) {
	offset, err := f.findBodyOffset()
	if err != nil {
		return nil, err
	}
	sr := io.NewSectionReader(f.zipr, f.headerOffset+offset, int64(f.CompressedSize64))
	return sr, nil
}

type checksumReader struct {
rc io.ReadCloser
hash hash.Hash32
Expand Down
223 changes: 209 additions & 14 deletions zip/writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,16 @@ var (
errLongExtra = errors.New("zip: FileHeader.Extra too long")
)

// lastWriter is the behavior Writer needs from the most recently created
// entry writer (implemented by both *fileWriter and *rawWriter) so the
// pending entry can be finalized before the next entry or the central
// directory is written.
type lastWriter interface {
	// Close finalizes the entry, writing its data descriptor.
	Close() error
	// Closed reports whether Close has already been called.
	Closed() bool
}

// Writer implements a zip file writer.
type Writer struct {
cw *countWriter
dir []*header
last *fileWriter
last lastWriter
closed bool
compressors map[uint16]Compressor
comment string
Expand Down Expand Up @@ -72,10 +77,10 @@ func (w *Writer) SetComment(comment string) error {
}

// Close finishes writing the zip file by writing the central directory.
// It does not close the underlying writer.
// It does not Close the underlying writer.
func (w *Writer) Close() error {
if w.last != nil && !w.last.closed {
if err := w.last.close(); err != nil {
if w.last != nil && !w.last.Closed() {
if err := w.last.Close(); err != nil {
return err
}
w.last = nil
Expand Down Expand Up @@ -222,6 +227,25 @@ func (w *Writer) Create(name string) (io.Writer, error) {
return w.CreateHeader(header)
}

// Copy will copy raw content from input file.
// The compressed bytes of src are transferred as-is, skipping the
// decompress/recompress round trip. Optionally a different name can be
// given to the new file by passing a non-empty name.
func (w *Writer) Copy(name string, src *File) error {
	raw, err := src.OpenRaw()
	if err != nil {
		return err
	}
	// Work on a copy of the header so src is left untouched.
	fh := src.FileHeader
	if name != "" {
		fh.Name = name
	}
	dst, err := w.CreateHeaderRaw(&fh)
	if err != nil {
		return err
	}
	_, err = io.Copy(dst, raw)
	return err
}

// detectUTF8 reports whether s is a valid UTF-8 string, and whether the string
// must be considered UTF-8 encoding (i.e., not compatible with CP-437, ASCII,
// or any other common encoding).
Expand Down Expand Up @@ -251,10 +275,10 @@ func detectUTF8(s string) (valid, require bool) {
//
// This returns a Writer to which the file contents should be written.
// The file's contents must be written to the io.Writer before the next
// call to Create, CreateHeader, or Close.
// call to Create, Copy, CreateHeader, CreateHeaderRaw or Close.
func (w *Writer) CreateHeader(fh *FileHeader) (io.Writer, error) {
if w.last != nil && !w.last.closed {
if err := w.last.close(); err != nil {
if w.last != nil && !w.last.Closed() {
if err := w.last.Close(); err != nil {
return nil, err
}
}
Expand Down Expand Up @@ -319,10 +343,7 @@ func (w *Writer) CreateHeader(fh *FileHeader) (io.Writer, error) {
fh.Extra = append(fh.Extra, mbuf[:]...)
}

var (
ow io.Writer
fw *fileWriter
)
var ow io.Writer
h := &header{
FileHeader: fh,
offset: uint64(w.cw.count),
Expand All @@ -343,10 +364,11 @@ func (w *Writer) CreateHeader(fh *FileHeader) (io.Writer, error) {
fh.UncompressedSize64 = 0

ow = dirWriter{}
w.last = nil
} else {
fh.Flags |= 0x8 // we will write a data descriptor

fw = &fileWriter{
fw := &fileWriter{
zipw: w.cw,
compCount: &countWriter{w: w.cw},
crc32: crc32.NewIEEE(),
Expand All @@ -363,13 +385,131 @@ func (w *Writer) CreateHeader(fh *FileHeader) (io.Writer, error) {
fw.rawCount = &countWriter{w: fw.comp}
fw.header = h
ow = fw
w.last = fw
}
w.dir = append(w.dir, h)
if err := writeHeader(w.cw, fh); err != nil {
return nil, err
}
// If we're creating a directory, fw is nil.
return ow, nil
}

// CreateHeaderRaw adds a file to the zip archive using the provided FileHeader
// for the file metadata. Writer takes ownership of fh and may mutate
// its fields. The caller must not modify fh after calling CreateHeaderRaw.
//
// This returns a Writer to which the compressed file contents should be written.
// The file's contents must be written to the io.Writer before the next
// call to Create, Copy, CreateHeader, CreateHeaderRaw or Close.
//
// Using this requires knowledge of populating the FileHeader correctly
// (Method, CRC32, UncompressedSize64, ...).
// Generally using the Copy() function is recommended.
func (w *Writer) CreateHeaderRaw(fh *FileHeader) (io.Writer, error) {
	// Finalize the previous entry before starting a new one.
	if w.last != nil && !w.last.Closed() {
		if err := w.last.Close(); err != nil {
			return nil, err
		}
	}
	if len(w.dir) > 0 && w.dir[len(w.dir)-1].FileHeader == fh {
		// See https://golang.org/issue/11144 confusion.
		return nil, errors.New("archive/zip: invalid duplicate FileHeader")
	}

	// The ZIP format has a sad state of affairs regarding character encoding.
	// Officially, the name and comment fields are supposed to be encoded
	// in CP-437 (which is mostly compatible with ASCII), unless the UTF-8
	// flag bit is set. However, there are several problems:
	//
	//	* Many ZIP readers still do not support UTF-8.
	//	* If the UTF-8 flag is cleared, several readers simply interpret the
	//	name and comment fields as whatever the local system encoding is.
	//
	// In order to avoid breaking readers without UTF-8 support,
	// we avoid setting the UTF-8 flag if the strings are CP-437 compatible.
	// However, if the strings require multibyte UTF-8 encoding and are
	// valid UTF-8 strings, then we set the UTF-8 bit.
	//
	// For the case where the user explicitly wants to specify the encoding
	// as UTF-8, they will need to set the flag bit themselves.
	utf8Valid1, utf8Require1 := detectUTF8(fh.Name)
	utf8Valid2, utf8Require2 := detectUTF8(fh.Comment)
	switch {
	case fh.NonUTF8:
		fh.Flags &^= 0x800
	case (utf8Require1 || utf8Require2) && (utf8Valid1 && utf8Valid2):
		fh.Flags |= 0x800
	}

	fh.CreatorVersion = fh.CreatorVersion&0xff00 | zipVersion20 // preserve compatibility byte
	fh.ReaderVersion = zipVersion20

	// If Modified is set, this takes precedence over MS-DOS timestamp fields.
	if !fh.Modified.IsZero() {
		// Contrary to the FileHeader.SetModTime method, we intentionally
		// do not convert to UTC, because we assume the user intends to encode
		// the date using the specified timezone. A user may want this control
		// because many legacy ZIP readers interpret the timestamp according
		// to the local timezone.
		//
		// The timezone is only non-UTC if a user directly sets the Modified
		// field directly themselves. All other approaches set UTC.
		fh.ModifiedDate, fh.ModifiedTime = timeToMsDosTime(fh.Modified)

		// Use "extended timestamp" format since this is what Info-ZIP uses.
		// Nearly every major ZIP implementation uses a different format,
		// but at least most seem to be able to understand the other formats.
		//
		// This format happens to be identical for both local and central header
		// if modification time is the only timestamp being encoded.
		var mbuf [9]byte // 2*SizeOf(uint16) + SizeOf(uint8) + SizeOf(uint32)
		mt := uint32(fh.Modified.Unix())
		eb := writeBuf(mbuf[:])
		eb.uint16(extTimeExtraID)
		eb.uint16(5)  // Size: SizeOf(uint8) + SizeOf(uint32)
		eb.uint8(1)   // Flags: ModTime
		eb.uint32(mt) // ModTime
		fh.Extra = append(fh.Extra, mbuf[:]...)
	}

	var ow io.Writer
	h := &header{
		FileHeader: fh,
		offset:     uint64(w.cw.count),
	}

	if strings.HasSuffix(fh.Name, "/") {
		// Set the compression method to Store to ensure data length is truly zero,
		// which the writeHeader method always encodes for the size fields.
		// This is necessary as most compression formats have non-zero lengths
		// even when compressing an empty string.
		fh.Method = Store
		fh.Flags &^= 0x8 // we will not write a data descriptor

		// Explicitly clear sizes as they have no meaning for directories.
		fh.CompressedSize = 0
		fh.CompressedSize64 = 0
		fh.UncompressedSize = 0
		fh.UncompressedSize64 = 0

		ow = dirWriter{}
		w.last = nil
	} else {
		fh.Flags |= 0x8 // we will write a data descriptor

		fw := &rawWriter{
			header:   h,
			zipw:     w.cw,
			rawCount: &countWriter{w: w.cw},
		}
		ow = fw
		w.last = fw
	}
	w.dir = append(w.dir, h)
	if err := writeHeader(w.cw, fh); err != nil {
		return nil, err
	}
	// Note: w.last was already set in each branch above (nil for directories,
	// the rawWriter otherwise); no assignment is needed here.
	return ow, nil
}

Expand Down Expand Up @@ -450,7 +590,11 @@ func (w *fileWriter) Write(p []byte) (int, error) {
return w.rawCount.Write(p)
}

func (w *fileWriter) close() error {
// Closed reports whether the file writer has been finalized with Close.
// It allows *fileWriter to satisfy the lastWriter interface.
func (w *fileWriter) Closed() bool {
	return w.closed
}

func (w *fileWriter) Close() error {
if w.closed {
return errors.New("zip: file closed twice")
}
Expand Down Expand Up @@ -499,6 +643,57 @@ func (w *fileWriter) close() error {
return err
}

// rawWriter writes a file entry whose contents are supplied in
// already-compressed form (see Writer.CreateHeaderRaw); the bytes written
// to it pass through to the archive unmodified.
type rawWriter struct {
	*header                // entry metadata; its FileHeader is updated on Close
	zipw     io.Writer     // the underlying archive writer
	rawCount *countWriter  // counts compressed bytes for the data descriptor
	closed   bool          // set once Close has been called
}

// Write passes p through to the archive unmodified, tallying the number of
// compressed bytes so the data descriptor can be written on Close.
func (w *rawWriter) Write(p []byte) (n int, err error) {
	if w.closed {
		err = errors.New("zip: write to closed file")
		return
	}
	return w.rawCount.Write(p)
}

// Closed reports whether Close has already been called.
// It allows *rawWriter to satisfy the lastWriter interface.
func (w *rawWriter) Closed() bool {
	return w.closed
}

// Close finalizes the raw entry: it records the number of compressed bytes
// written and emits the trailing data descriptor. Closing twice is an error.
func (w *rawWriter) Close() error {
	if w.closed {
		return errors.New("zip: file closed twice")
	}
	w.closed = true

	fh := w.FileHeader
	fh.CompressedSize64 = uint64(w.rawCount.count)

	// Write data descriptor. This is more complicated than one would
	// think, see e.g. comments in zipfile.c:putextended() and
	// http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=7073588.
	// The approach here is to write 8 byte sizes if needed without
	// adding a zip64 extra in the local header (too late anyway).
	zip64 := fh.isZip64()
	descLen := dataDescriptorLen
	if zip64 {
		descLen = dataDescriptor64Len
	}
	buf := make([]byte, descLen)
	b := writeBuf(buf)
	b.uint32(dataDescriptorSignature) // de-facto standard, required by OS X
	b.uint32(fh.CRC32)
	if zip64 {
		b.uint64(fh.CompressedSize64)
		b.uint64(fh.UncompressedSize64)
	} else {
		b.uint32(fh.CompressedSize)
		b.uint32(fh.UncompressedSize)
	}
	_, err := w.zipw.Write(buf)
	return err
}

type countWriter struct {
w io.Writer
count int64
Expand Down
Loading

0 comments on commit 26bacdf

Please sign in to comment.