bzip2: reject missing repeater when decoding RLE1

The last repeater value in RLE1 can technically be omitted and still be non-ambiguous since the end of the buffer implies that the data is complete. However, the C implementation does not handle this case, so we must reject it as well.
dsnet · Dec 31, 2016 · f98e3a5 · f98e3a5
1 parent ca9a1d2
commit f98e3a5
Show file tree

Hide file tree

Showing 5 changed files with 23 additions and 14 deletions.
diff --git a/bzip2/reader.go b/bzip2/reader.go
@@ -53,7 +53,10 @@ func (zr *Reader) Reset(r io.Reader) {
 
 func (zr *Reader) Read(buf []byte) (int, error) {
 	for {
-		cnt, _ := zr.rle.Read(buf)
+		cnt, err := zr.rle.Read(buf)
+		if err != rleDone && zr.err == nil {
+			zr.err = err
+		}
 		if cnt > 0 {
 			zr.wantBlkCRC = zr.crc.update(zr.wantBlkCRC, buf[:cnt])
 			zr.OutputOffset += int64(cnt)
@@ -105,7 +108,6 @@ func (zr *Reader) Read(buf []byte) (int, error) {
 			buf := zr.decodeBlock()
 			zr.rle.Init(buf)
 		}()
-		var err error
 		if zr.InputOffset, err = zr.rd.Flush(); zr.err == nil {
 			zr.err = err
 		}

diff --git a/bzip2/reader_test.go b/bzip2/reader_test.go
@@ -180,10 +180,6 @@ func TestReader(t *testing.T) {
 		output: db(`>>> X:41*4`),
 	}, {
 		// RLE1 stage with missing repeater value.
-		//
-		// NOTE: The C library rejects this file. Currently, our implementation
-		// liberally allows this since it simplifies the implementation.
-		// This may change in the future.
 		name: "RLE3",
 		input: db(`>>>
 			"BZh1"
@@ -196,6 +192,7 @@ func TestReader(t *testing.T) {
 			> H48:177245385090 H32:e16e6571
 		`),
 		output: db(`>>> X:41*4`),
+		errf:   "IsCorrupted",
 	}, {
 		// RLE1 stage with sub-optimal repeater usage.
 		name: "RLE4",

diff --git a/bzip2/rle1.go b/bzip2/rle1.go
@@ -8,15 +8,15 @@ import (
 	"github.com/dsnet/compress/internal/errors"
 )
 
-// rleDone is a special "error" to indicate that the RLE buffer is depleted.
-var rleDone = errorf(errors.Unknown, "RLE1 buffer is full")
+// rleDone is a special "error" to indicate that the RLE stage is done.
+var rleDone = errorf(errors.Unknown, "RLE1 stage is completed")
 
 // runLengthEncoding implements the first RLE stage of bzip2. Every sequence
 // of 4..255 duplicated bytes is replaced by only the first 4 bytes, and a
 // single byte representing the repeat length. Similar to the C bzip2
 // implementation, the encoder will always terminate repeat sequences with a
 // count (even if it is the end of the buffer), and it will also never produce
-// run lengths of 256..259. The decoder, however, will handle both cases.
+// run lengths of 256..259. The decoder can handle the latter case.
 //
 // For example, if the input was:
 //	input:  "AAAAAAABBBBCCCD"
@@ -75,7 +75,7 @@ func (rle *runLengthEncoding) Read(buf []byte) (int, error) {
 		switch {
 		case rle.lastCnt == -4:
 			if rle.idx >= len(rle.buf) {
-				return i, rleDone
+				return i, errorf(errors.Corrupted, "missing terminating run-length repeater")
 			}
 			rle.lastCnt = int(rle.buf[rle.idx])
 			rle.idx++

diff --git a/bzip2/rle1_test.go b/bzip2/rle1_test.go
@@ -71,9 +71,10 @@ func TestRunLengthEncoder(t *testing.T) {
 
 	buf := make([]byte, 3)
 	for i, v := range vectors {
+		rd := strings.NewReader(v.input)
 		rle := new(runLengthEncoding)
 		rle.Init(make([]byte, v.size))
-		_, err := io.CopyBuffer(rle, strings.NewReader(v.input), buf)
+		_, err := io.CopyBuffer(rle, struct{ io.Reader }{rd}, buf)
 		output := string(rle.Bytes())
 
 		if output != v.output {
@@ -89,6 +90,7 @@ func TestRunLengthDecoder(t *testing.T) {
 	vectors := []struct {
 		input  string
 		output string
+		fail   bool
 	}{{
 		input:  "",
 		output: "",
@@ -98,9 +100,11 @@ func TestRunLengthDecoder(t *testing.T) {
 	}, {
 		input:  "aaaa",
 		output: "aaaa",
+		fail:   true,
 	}, {
 		input:  "baaaa\x00aaaa",
 		output: "baaaaaaaa",
+		fail:   true,
 	}, {
 		input:  "abcccc\x00",
 		output: "abcccc",
@@ -141,14 +145,17 @@ func TestRunLengthDecoder(t *testing.T) {
 
 	buf := make([]byte, 3)
 	for i, v := range vectors {
-		rle := new(runLengthEncoding)
 		wr := new(bytes.Buffer)
+		rle := new(runLengthEncoding)
 		rle.Init([]byte(v.input))
-		io.CopyBuffer(wr, rle, buf)
+		_, err := io.CopyBuffer(struct{ io.Writer }{wr}, rle, buf)
 		output := wr.String()
 
 		if output != v.output {
 			t.Errorf("test %d, output mismatch:\ngot  %q\nwant %q", i, output, v.output)
 		}
+		if fail := err != rleDone; fail != v.fail {
+			t.Errorf("test %d, failure mismatch: got %t, want %t", i, fail, v.fail)
+		}
 	}
 }
diff --git a/bzip2/writer.go b/bzip2/writer.go
@@ -76,7 +76,10 @@ func (zw *Writer) Write(buf []byte) (int, error) {
 
 	cnt := len(buf)
 	for {
-		wrCnt, _ := zw.rle.Write(buf)
+		wrCnt, err := zw.rle.Write(buf)
+		if err != rleDone && zw.err == nil {
+			zw.err = err
+		}
 		zw.blkCRC = zw.crc.update(zw.blkCRC, buf[:wrCnt])
 		buf = buf[wrCnt:]
 		if len(buf) == 0 {