diff --git a/README.md b/README.md
index 2d3c88282d..e2682be6fe 100644
--- a/README.md
+++ b/README.md
@@ -15,8 +15,11 @@ This package provides various compression algorithms.
 
 # changelog
 
+* June 1, 2020 (v1.10.7): Added zstd decompression [dictionary support](https://github.com/klauspost/compress/tree/master/zstd#dictionaries).
+* June 1, 2020: Increase zstd decompression speed up to 1.19x.  [#259](https://github.com/klauspost/compress/pull/259)
+* June 1, 2020: Remove internal reset call in zstd compression and reduce allocations. [#263](https://github.com/klauspost/compress/pull/263)
 * May 21, 2020: (v1.10.6) zstd: Reduce allocations while decoding. [#258](https://github.com/klauspost/compress/pull/258), [#252](https://github.com/klauspost/compress/pull/252)
-* May 21, 2020: zstd: Stricter decoding checks.
+* May 21, 2020: zstd: Stricter decompression checks.
 * April 12, 2020: (v1.10.5) s2-commands: Flush output when receiving SIGINT. [#239](https://github.com/klauspost/compress/pull/239)
 * Apr 8, 2020: (v1.10.4) zstd: Minor/special case optimizations. [#251](https://github.com/klauspost/compress/pull/251),  [#250](https://github.com/klauspost/compress/pull/250),  [#249](https://github.com/klauspost/compress/pull/249),  [#247](https://github.com/klauspost/compress/pull/247)
 * Mar 11, 2020: (v1.10.3) s2: Use S2 encoder in pure Go mode for Snappy output as well. [#245](https://github.com/klauspost/compress/pull/245)
diff --git a/huff0/decompress.go b/huff0/decompress.go
index 97ae66a4ac..fb42a398be 100644
--- a/huff0/decompress.go
+++ b/huff0/decompress.go
@@ -155,20 +155,70 @@ func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
 // The length of the supplied input must match the end of a block exactly.
 // Before this is called, the table must be initialized with ReadTable unless
 // the encoder re-used the table.
+// deprecated: Use the stateless Decoder() to get a concurrent version.
 func (s *Scratch) Decompress1X(in []byte) (out []byte, err error) {
-	if len(s.dt.single) == 0 {
+	if cap(s.Out) < s.MaxDecodedSize {
+		s.Out = make([]byte, s.MaxDecodedSize)
+	}
+	s.Out = s.Out[:0:s.MaxDecodedSize]
+	s.Out, err = s.Decoder().Decompress1X(s.Out, in)
+	return s.Out, err
+}
+
+// Decompress4X will decompress a 4X encoded stream.
+// Before this is called, the table must be initialized with ReadTable unless
+// the encoder re-used the table.
+// The length of the supplied input must match the end of a block exactly.
+// The destination size of the uncompressed data must be known and provided.
+// deprecated: Use the stateless Decoder() to get a concurrent version.
+func (s *Scratch) Decompress4X(in []byte, dstSize int) (out []byte, err error) {
+	if dstSize > s.MaxDecodedSize {
+		return nil, ErrMaxDecodedSizeExceeded
+	}
+	if cap(s.Out) < dstSize {
+		s.Out = make([]byte, s.MaxDecodedSize)
+	}
+	s.Out = s.Out[:0:dstSize]
+	s.Out, err = s.Decoder().Decompress4X(s.Out, in)
+	return s.Out, err
+}
+
+// Decoder will return a stateless decoder that can be used by multiple
+// decompressors concurrently.
+// Before this is called, the table must be initialized with ReadTable.
+// The Decoder is still linked to the scratch buffer so that cannot be reused.
+// However, it is safe to discard the scratch.
+func (s *Scratch) Decoder() *Decoder {
+	return &Decoder{
+		dt:             s.dt,
+		actualTableLog: s.actualTableLog,
+	}
+}
+
+// Decoder provides stateless decoding.
+type Decoder struct {
+	dt             dTable
+	actualTableLog uint8
+}
+
+// Decompress1X will decompress a 1X encoded stream.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
 		return nil, errors.New("no table loaded")
 	}
 	var br bitReader
-	err = br.init(in)
+	err := br.init(src)
 	if err != nil {
-		return nil, err
+		return dst, err
 	}
-	s.Out = s.Out[:0]
+	maxDecodedSize := cap(dst)
+	dst = dst[:0]
 
 	decode := func() byte {
-		val := br.peekBitsFast(s.actualTableLog) /* note : actualTableLog >= 1 */
-		v := s.dt.single[val]
+		val := br.peekBitsFast(d.actualTableLog) /* note : actualTableLog >= 1 */
+		v := d.dt.single[val]
 		br.bitsRead += uint8(v.entry)
 		return uint8(v.entry >> 8)
 	}
@@ -180,88 +230,80 @@ func (s *Scratch) Decompress1X(in []byte) (out []byte, err error) {
 	// Avoid bounds check by always having full sized table.
 	const tlSize = 1 << tableLogMax
 	const tlMask = tlSize - 1
-	dt := s.dt.single[:tlSize]
+	dt := d.dt.single[:tlSize]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var tmp = s.huffWeight[:256]
+	var buf [256]byte
 	var off uint8
 
 	for br.off >= 8 {
 		br.fillFast()
-		tmp[off+0] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
-		tmp[off+1] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
+		buf[off+0] = hasDec(dt[br.peekBitsFast(d.actualTableLog)&tlMask])
+		buf[off+1] = hasDec(dt[br.peekBitsFast(d.actualTableLog)&tlMask])
 		br.fillFast()
-		tmp[off+2] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
-		tmp[off+3] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
+		buf[off+2] = hasDec(dt[br.peekBitsFast(d.actualTableLog)&tlMask])
+		buf[off+3] = hasDec(dt[br.peekBitsFast(d.actualTableLog)&tlMask])
 		off += 4
 		if off == 0 {
-			if len(s.Out)+256 > s.MaxDecodedSize {
+			if len(dst)+256 > maxDecodedSize {
 				br.close()
 				return nil, ErrMaxDecodedSizeExceeded
 			}
-			s.Out = append(s.Out, tmp...)
+			dst = append(dst, buf[:]...)
 		}
 	}
 
-	if len(s.Out)+int(off) > s.MaxDecodedSize {
+	if len(dst)+int(off) > maxDecodedSize {
 		br.close()
 		return nil, ErrMaxDecodedSizeExceeded
 	}
-	s.Out = append(s.Out, tmp[:off]...)
+	dst = append(dst, buf[:off]...)
 
 	for !br.finished() {
 		br.fill()
-		if len(s.Out) >= s.MaxDecodedSize {
+		if len(dst) >= maxDecodedSize {
 			br.close()
 			return nil, ErrMaxDecodedSizeExceeded
 		}
-		s.Out = append(s.Out, decode())
+		dst = append(dst, decode())
 	}
-	return s.Out, br.close()
+	return dst, br.close()
 }
 
 // Decompress4X will decompress a 4X encoded stream.
-// Before this is called, the table must be initialized with ReadTable unless
-// the encoder re-used the table.
 // The length of the supplied input must match the end of a block exactly.
-// The destination size of the uncompressed data must be known and provided.
-func (s *Scratch) Decompress4X(in []byte, dstSize int) (out []byte, err error) {
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (s *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 	if len(s.dt.single) == 0 {
 		return nil, errors.New("no table loaded")
 	}
-	if len(in) < 6+(4*1) {
+	if len(src) < 6+(4*1) {
 		return nil, errors.New("input too small")
 	}
-	if dstSize > s.MaxDecodedSize {
-		return nil, ErrMaxDecodedSizeExceeded
-	}
-	// TODO: We do not detect when we overrun a buffer, except if the last one does.
 
 	var br [4]bitReader
 	start := 6
 	for i := 0; i < 3; i++ {
-		length := int(in[i*2]) | (int(in[i*2+1]) << 8)
-		if start+length >= len(in) {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
 			return nil, errors.New("truncated input (or invalid offset)")
 		}
-		err = br[i].init(in[start : start+length])
+		err := br[i].init(src[start : start+length])
 		if err != nil {
 			return nil, err
 		}
 		start += length
 	}
-	err = br[3].init(in[start:])
+	err := br[3].init(src[start:])
 	if err != nil {
 		return nil, err
 	}
 
-	// Prepare output
-	if cap(s.Out) < dstSize {
-		s.Out = make([]byte, 0, dstSize)
-	}
-	s.Out = s.Out[:dstSize]
 	// destination, offset to match first output
-	dstOut := s.Out
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
 	dstEvery := (dstSize + 3) / 4
 
 	const tlSize = 1 << tableLogMax
@@ -276,7 +318,7 @@ func (s *Scratch) Decompress4X(in []byte, dstSize int) (out []byte, err error) {
 	}
 
 	// Use temp table to avoid bound checks/append penalty.
-	var tmp = s.huffWeight[:256]
+	var buf [256]byte
 	var off uint8
 	var decoded int
 
@@ -300,8 +342,8 @@ bigloop:
 
 			val2 := br[stream].peekBitsFast(s.actualTableLog)
 			v2 := single[val2&tlMask]
-			tmp[off+bufoff*stream+1] = uint8(v2.entry >> 8)
-			tmp[off+bufoff*stream] = uint8(v.entry >> 8)
+			buf[off+bufoff*stream+1] = uint8(v2.entry >> 8)
+			buf[off+bufoff*stream] = uint8(v.entry >> 8)
 			br[stream].bitsRead += uint8(v2.entry)
 		}
 
@@ -313,8 +355,8 @@ bigloop:
 
 			val2 := br[stream].peekBitsFast(s.actualTableLog)
 			v2 := single[val2&tlMask]
-			tmp[off+bufoff*stream+1] = uint8(v2.entry >> 8)
-			tmp[off+bufoff*stream] = uint8(v.entry >> 8)
+			buf[off+bufoff*stream+1] = uint8(v2.entry >> 8)
+			buf[off+bufoff*stream] = uint8(v.entry >> 8)
 			br[stream].bitsRead += uint8(v2.entry)
 		}
 
@@ -326,8 +368,8 @@ bigloop:
 
 			val2 := br[stream].peekBitsFast(s.actualTableLog)
 			v2 := single[val2&tlMask]
-			tmp[off+bufoff*stream+1] = uint8(v2.entry >> 8)
-			tmp[off+bufoff*stream] = uint8(v.entry >> 8)
+			buf[off+bufoff*stream+1] = uint8(v2.entry >> 8)
+			buf[off+bufoff*stream] = uint8(v.entry >> 8)
 			br[stream].bitsRead += uint8(v2.entry)
 		}
 
@@ -339,8 +381,8 @@ bigloop:
 
 			val2 := br[stream].peekBitsFast(s.actualTableLog)
 			v2 := single[val2&tlMask]
-			tmp[off+bufoff*stream+1] = uint8(v2.entry >> 8)
-			tmp[off+bufoff*stream] = uint8(v.entry >> 8)
+			buf[off+bufoff*stream+1] = uint8(v2.entry >> 8)
+			buf[off+bufoff*stream] = uint8(v.entry >> 8)
 			br[stream].bitsRead += uint8(v2.entry)
 		}
 
@@ -350,30 +392,30 @@ bigloop:
 			if bufoff > dstEvery {
 				return nil, errors.New("corruption detected: stream overrun 1")
 			}
-			copy(dstOut, tmp[:bufoff])
-			copy(dstOut[dstEvery:], tmp[bufoff:bufoff*2])
-			copy(dstOut[dstEvery*2:], tmp[bufoff*2:bufoff*3])
-			copy(dstOut[dstEvery*3:], tmp[bufoff*3:bufoff*4])
+			copy(out, buf[:bufoff])
+			copy(out[dstEvery:], buf[bufoff:bufoff*2])
+			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
+			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
 			off = 0
-			dstOut = dstOut[bufoff:]
+			out = out[bufoff:]
 			decoded += 256
 			// There must at least be 3 buffers left.
-			if len(dstOut) < dstEvery*3 {
+			if len(out) < dstEvery*3 {
 				return nil, errors.New("corruption detected: stream overrun 2")
 			}
 		}
 	}
 	if off > 0 {
 		ioff := int(off)
-		if len(dstOut) < dstEvery*3+ioff {
+		if len(out) < dstEvery*3+ioff {
 			return nil, errors.New("corruption detected: stream overrun 3")
 		}
-		copy(dstOut, tmp[:off])
-		copy(dstOut[dstEvery:dstEvery+ioff], tmp[bufoff:bufoff*2])
-		copy(dstOut[dstEvery*2:dstEvery*2+ioff], tmp[bufoff*2:bufoff*3])
-		copy(dstOut[dstEvery*3:dstEvery*3+ioff], tmp[bufoff*3:bufoff*4])
+		copy(out, buf[:off])
+		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
+		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
+		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
 		decoded += int(off) * 4
-		dstOut = dstOut[off:]
+		out = out[off:]
 	}
 
 	// Decode remaining.
@@ -382,10 +424,10 @@ bigloop:
 		br := &br[i]
 		for !br.finished() {
 			br.fill()
-			if offset >= len(dstOut) {
+			if offset >= len(out) {
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}
-			dstOut[offset] = decode(br)
+			out[offset] = decode(br)
 			offset++
 		}
 		decoded += offset - dstEvery*i
@@ -397,7 +439,7 @@ bigloop:
 	if dstSize != decoded {
 		return nil, errors.New("corruption detected: short output block")
 	}
-	return s.Out, nil
+	return dst, nil
 }
 
 // matches will compare a decoding table to a coding table.
diff --git a/zstd/README.md b/zstd/README.md
index bc977a3023..f2a80b5d06 100644
--- a/zstd/README.md
+++ b/zstd/README.md
@@ -309,6 +309,20 @@ The decoder can be used for *concurrent* decompression of multiple buffers.
 It will only allow a certain number of concurrent operations to run. 
 To tweak that yourself use the `WithDecoderConcurrency(n)` option when creating the decoder.   
 
+### Dictionaries
+
+Data compressed with [dictionaries](https://github.com/facebook/zstd#the-case-for-small-data-compression) can be decompressed.
+
+Dictionaries are added individually to Decoders.
+Dictionaries are generated by the `zstd --train` command and contains an initial state for the decoder.
+To add a dictionary use the `RegisterDict(data)` with the dictionary data before starting any decompression.
+
+The dictionary will be used automatically for the data that specifies them.
+
+A re-used Decoder will still contain the dictionaries registered.
+
+When registering a dictionary with the same ID it will override the existing.
+
 ### Allocation-less operation
 
 The decoder has been designed to operate without allocations after a warmup. 
diff --git a/zstd/blockdec.go b/zstd/blockdec.go
index 19181caea1..4a14242c76 100644
--- a/zstd/blockdec.go
+++ b/zstd/blockdec.go
@@ -461,26 +461,22 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		if huff == nil {
 			huff = &huff0.Scratch{}
 		}
-		huff.Out = b.literalBuf[:0]
 		huff, literals, err = huff0.ReadTable(literals, huff)
 		if err != nil {
 			println("reading huffman table:", err)
 			return err
 		}
 		// Use our out buffer.
-		huff.Out = b.literalBuf[:0]
-		huff.MaxDecodedSize = litRegenSize
 		if fourStreams {
-			literals, err = huff.Decompress4X(literals, litRegenSize)
+			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
 		} else {
-			literals, err = huff.Decompress1X(literals)
+			literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
 		}
 		if err != nil {
 			println("decoding compressed literals:", err)
 			return err
 		}
 		// Make sure we don't leak our literals buffer
-		huff.Out = nil
 		if len(literals) != litRegenSize {
 			return fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
 		}
@@ -631,15 +627,12 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		var err error
 		// Use our out buffer.
 		huff = hist.huffTree
-		huff.Out = b.literalBuf[:0]
-		huff.MaxDecodedSize = litRegenSize
 		if fourStreams {
-			literals, err = huff.Decompress4X(literals, litRegenSize)
+			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
 		} else {
-			literals, err = huff.Decompress1X(literals)
+			literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
 		}
 		// Make sure we don't leak our literals buffer
-		huff.Out = nil
 		if err != nil {
 			println("decompressing literals:", err)
 			return err
@@ -649,12 +642,13 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		}
 	} else {
 		if hist.huffTree != nil && huff != nil {
-			huffDecoderPool.Put(hist.huffTree)
+			if hist.dict == nil || hist.dict.litDec != hist.huffTree {
+				huffDecoderPool.Put(hist.huffTree)
+			}
 			hist.huffTree = nil
 		}
 	}
 	if huff != nil {
-		huff.Out = nil
 		hist.huffTree = huff
 	}
 	if debug {
@@ -687,14 +681,20 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 	//   If only recent offsets were not transferred, this would be an obvious win.
 	// 	 Also, if first 3 sequences don't reference recent offsets, all sequences can be decoded.
 
-	if err := seqs.initialize(br, hist, literals, b.dst); err != nil {
-		println("initializing sequences:", err)
-		return err
-	}
 	hbytes := hist.b
 	if len(hbytes) > hist.windowSize {
 		hbytes = hbytes[len(hbytes)-hist.windowSize:]
+		// We do not need history any more.
+		if hist.dict != nil {
+			hist.dict.content = nil
+		}
 	}
+
+	if err := seqs.initialize(br, hist, literals, b.dst); err != nil {
+		println("initializing sequences:", err)
+		return err
+	}
+
 	err = seqs.decode(nSeqs, br, hbytes)
 	if err != nil {
 		return err
diff --git a/zstd/bytereader.go b/zstd/bytereader.go
index dc4378b640..f708df1c4c 100644
--- a/zstd/bytereader.go
+++ b/zstd/bytereader.go
@@ -4,6 +4,8 @@
 
 package zstd
 
+import "encoding/binary"
+
 // byteReader provides a byte reader that reads
 // little endian values from a byte stream.
 // The input stream is manually advanced.
@@ -55,12 +57,7 @@ func (b byteReader) Uint32() uint32 {
 		}
 		return v
 	}
-	b2 := b.b[b.off : b.off+4 : b.off+4]
-	v3 := uint32(b2[3])
-	v2 := uint32(b2[2])
-	v1 := uint32(b2[1])
-	v0 := uint32(b2[0])
-	return v0 | (v1 << 8) | (v2 << 16) | (v3 << 24)
+	return binary.LittleEndian.Uint32(b.b[b.off : b.off+4])
 }
 
 // unread returns the unread portion of the input.
diff --git a/zstd/decoder.go b/zstd/decoder.go
index 324347623c..8e34479ff8 100644
--- a/zstd/decoder.go
+++ b/zstd/decoder.go
@@ -32,8 +32,9 @@ type Decoder struct {
 	// Current read position used for Reader functionality.
 	current decoderState
 
-	// Custom dictionaries
-	dicts map[uint32]struct{}
+	// Custom dictionaries.
+	// Always uses copies.
+	dicts map[uint32]dict
 
 	// streamWg is the waitgroup for all streams
 	streamWg sync.WaitGroup
@@ -295,10 +296,18 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 	frame.bBuf = input
 
 	for {
+		frame.history.reset()
 		err := frame.reset(&frame.bBuf)
 		if err == io.EOF {
 			return dst, nil
 		}
+		if frame.DictionaryID != nil {
+			dict, ok := d.dicts[*frame.DictionaryID]
+			if !ok {
+				return nil, ErrUnknownDictionary
+			}
+			frame.history.setDict(&dict)
+		}
 		if err != nil {
 			return dst, err
 		}
@@ -393,6 +402,19 @@ func (d *Decoder) Close() {
 	d.current.err = ErrDecoderClosed
 }
 
+// RegisterDict will load a dictionary
+func (d *Decoder) RegisterDict(b []byte) error {
+	dc, err := loadDict(b)
+	if err != nil {
+		return err
+	}
+	if d.dicts == nil {
+		d.dicts = make(map[uint32]dict, 1)
+	}
+	d.dicts[dc.id] = *dc
+	return nil
+}
+
 // IOReadCloser returns the decoder as an io.ReadCloser for convenience.
 // Any changes to the decoder will be reflected, so the returned ReadCloser
 // can be reused along with the decoder.
@@ -466,6 +488,14 @@ func (d *Decoder) startStreamDecoder(inStream chan decodeStream) {
 			if debug && err != nil {
 				println("Frame decoder returned", err)
 			}
+			if err == nil && frame.DictionaryID != nil {
+				dict, ok := d.dicts[*frame.DictionaryID]
+				if !ok {
+					err = ErrUnknownDictionary
+				} else {
+					frame.history.setDict(&dict)
+				}
+			}
 			if err != nil {
 				stream.output <- decodeOutput{
 					err: err,
diff --git a/zstd/decoder_test.go b/zstd/decoder_test.go
index 7e886d3715..bd6e189a14 100644
--- a/zstd/decoder_test.go
+++ b/zstd/decoder_test.go
@@ -1154,6 +1154,9 @@ func testDecoderDecodeAll(t *testing.T, fn string, dec *Decoder) {
 			if err != nil {
 				t.Error(err)
 			}
+			if len(got) < 10 {
+				t.Fatal("didn't get input back")
+			}
 			got = got[10:]
 			if !bytes.Equal(wantB, got) {
 				if len(wantB)+len(got) < 1000 {
diff --git a/zstd/dict.go b/zstd/dict.go
new file mode 100644
index 0000000000..8eb6f6ba33
--- /dev/null
+++ b/zstd/dict.go
@@ -0,0 +1,104 @@
+package zstd
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+
+	"github.com/klauspost/compress/huff0"
+)
+
+type dict struct {
+	id uint32
+
+	litDec              *huff0.Scratch
+	llDec, ofDec, mlDec sequenceDec
+	offsets             [3]int
+	content             []byte
+}
+
+var dictMagic = [4]byte{0x37, 0xa4, 0x30, 0xec}
+
+// Load a dictionary as described in
+// https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
+func loadDict(b []byte) (*dict, error) {
+	// Check static field size.
+	if len(b) <= 8+(3*4) {
+		return nil, io.ErrUnexpectedEOF
+	}
+	d := dict{
+		llDec: sequenceDec{fse: &fseDecoder{}},
+		ofDec: sequenceDec{fse: &fseDecoder{}},
+		mlDec: sequenceDec{fse: &fseDecoder{}},
+	}
+	if !bytes.Equal(b[:4], dictMagic[:]) {
+		return nil, ErrMagicMismatch
+	}
+	d.id = binary.LittleEndian.Uint32(b[4:8])
+	if d.id == 0 {
+		return nil, errors.New("dictionaries cannot have ID 0")
+	}
+
+	// Read literal table
+	var err error
+	d.litDec, b, err = huff0.ReadTable(b[8:], nil)
+	if err != nil {
+		return nil, err
+	}
+
+	br := byteReader{
+		b:   b,
+		off: 0,
+	}
+	readDec := func(i tableIndex, dec *fseDecoder) error {
+		if err := dec.readNCount(&br, uint16(maxTableSymbol[i])); err != nil {
+			return err
+		}
+		if br.overread() {
+			return io.ErrUnexpectedEOF
+		}
+		err = dec.transform(symbolTableX[i])
+		if err != nil {
+			println("Transform table error:", err)
+			return err
+		}
+		if debug {
+			println("Read table ok", "symbolLen:", dec.symbolLen)
+		}
+		// Set decoders as predefined so they aren't reused.
+		dec.preDefined = true
+		return nil
+	}
+
+	if err := readDec(tableOffsets, d.ofDec.fse); err != nil {
+		return nil, err
+	}
+	if err := readDec(tableMatchLengths, d.mlDec.fse); err != nil {
+		return nil, err
+	}
+	if err := readDec(tableLiteralLengths, d.llDec.fse); err != nil {
+		return nil, err
+	}
+	if br.remain() < 12 {
+		return nil, io.ErrUnexpectedEOF
+	}
+
+	d.offsets[0] = int(br.Uint32())
+	br.advance(4)
+	d.offsets[1] = int(br.Uint32())
+	br.advance(4)
+	d.offsets[2] = int(br.Uint32())
+	br.advance(4)
+	if d.offsets[0] <= 0 || d.offsets[1] <= 0 || d.offsets[2] <= 0 {
+		return nil, errors.New("invalid offset in dictionary")
+	}
+	d.content = make([]byte, br.remain())
+	copy(d.content, br.unread())
+	if d.offsets[0] > len(d.content) || d.offsets[1] > len(d.content) || d.offsets[2] > len(d.content) {
+		return nil, fmt.Errorf("initial offset bigger than dictionary content size %d, offsets: %v", len(d.content), d.offsets)
+	}
+
+	return &d, nil
+}
diff --git a/zstd/dict_test.go b/zstd/dict_test.go
new file mode 100644
index 0000000000..2f4bbc6185
--- /dev/null
+++ b/zstd/dict_test.go
@@ -0,0 +1,137 @@
+package zstd
+
+import (
+	"bytes"
+	"io/ioutil"
+	"strings"
+	"testing"
+
+	"github.com/klauspost/compress/zip"
+)
+
+func TestDecoder_SmallDict(t *testing.T) {
+	// All files have CRC
+	fn := "testdata/dict-tests-small.zip"
+	data, err := ioutil.ReadFile(fn)
+	if err != nil {
+		t.Fatal(err)
+	}
+	zr, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+	dec, err := NewReader(nil, WithDecoderConcurrency(1))
+	if err != nil {
+		t.Fatal(err)
+		return
+	}
+	for _, tt := range zr.File {
+		if !strings.HasSuffix(tt.Name, ".dict") {
+			continue
+		}
+		func() {
+			r, err := tt.Open()
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer r.Close()
+			in, err := ioutil.ReadAll(r)
+			if err != nil {
+				t.Fatal(err)
+			}
+			err = dec.RegisterDict(in)
+			if err != nil {
+				t.Fatal(tt.Name, err)
+			}
+		}()
+	}
+	defer dec.Close()
+	for _, tt := range zr.File {
+		if !strings.HasSuffix(tt.Name, ".zst") {
+			continue
+		}
+		t.Run("decodeall-"+tt.Name, func(t *testing.T) {
+			r, err := tt.Open()
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer r.Close()
+			in, err := ioutil.ReadAll(r)
+			if err != nil {
+				t.Fatal(err)
+			}
+			got, err := dec.DecodeAll(in, nil)
+			if err != nil {
+				t.Fatal(err)
+			}
+			_, err = dec.DecodeAll(in, got[:0])
+			if err != nil {
+				t.Fatal(err)
+			}
+		})
+	}
+}
+
+func TestDecoder_MoreDicts(t *testing.T) {
+	// All files have CRC
+	// https://files.klauspost.com/compress/zstd-dict-tests.zip
+	fn := "testdata/zstd-dict-tests.zip"
+	data, err := ioutil.ReadFile(fn)
+	if err != nil {
+		t.Skip("extended dict test not found.")
+	}
+	zr, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+	dec, err := NewReader(nil, WithDecoderConcurrency(1))
+	if err != nil {
+		t.Fatal(err)
+		return
+	}
+	for _, tt := range zr.File {
+		if !strings.HasSuffix(tt.Name, ".dict") {
+			continue
+		}
+		func() {
+			r, err := tt.Open()
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer r.Close()
+			in, err := ioutil.ReadAll(r)
+			if err != nil {
+				t.Fatal(err)
+			}
+			err = dec.RegisterDict(in)
+			if err != nil {
+				t.Fatal(tt.Name, err)
+			}
+		}()
+	}
+	defer dec.Close()
+	for _, tt := range zr.File {
+		if !strings.HasSuffix(tt.Name, ".zst") {
+			continue
+		}
+		t.Run("decodeall-"+tt.Name, func(t *testing.T) {
+			r, err := tt.Open()
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer r.Close()
+			in, err := ioutil.ReadAll(r)
+			if err != nil {
+				t.Fatal(err)
+			}
+			got, err := dec.DecodeAll(in, nil)
+			if err != nil {
+				t.Fatal(err)
+			}
+			_, err = dec.DecodeAll(in, got[:0])
+			if err != nil {
+				t.Fatal(err)
+			}
+		})
+	}
+}
diff --git a/zstd/framedec.go b/zstd/framedec.go
index 780880ebe4..fc4a566d39 100644
--- a/zstd/framedec.go
+++ b/zstd/framedec.go
@@ -40,7 +40,7 @@ type frameDec struct {
 	FrameContentSize uint64
 	frameDone        sync.WaitGroup
 
-	DictionaryID  uint32
+	DictionaryID  *uint32
 	HasCheckSum   bool
 	SingleSegment bool
 
@@ -142,7 +142,7 @@ func (d *frameDec) reset(br byteBuffer) error {
 
 	// Read Dictionary_ID
 	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id
-	d.DictionaryID = 0
+	d.DictionaryID = nil
 	if size := fhd & 3; size != 0 {
 		if size == 3 {
 			size = 4
@@ -154,19 +154,22 @@ func (d *frameDec) reset(br byteBuffer) error {
 			}
 			return io.ErrUnexpectedEOF
 		}
+		var id uint32
 		switch size {
 		case 1:
-			d.DictionaryID = uint32(b[0])
+			id = uint32(b[0])
 		case 2:
-			d.DictionaryID = uint32(b[0]) | (uint32(b[1]) << 8)
+			id = uint32(b[0]) | (uint32(b[1]) << 8)
 		case 4:
-			d.DictionaryID = uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
+			id = uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
 		}
 		if debug {
-			println("Dict size", size, "ID:", d.DictionaryID)
+			println("Dict size", size, "ID:", id)
 		}
-		if d.DictionaryID != 0 {
-			return ErrUnknownDictionary
+		if id > 0 {
+			// ID 0 means "sorry, no dictionary anyway".
+			// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
+			d.DictionaryID = &id
 		}
 	}
 
@@ -351,8 +354,6 @@ func (d *frameDec) initAsync() {
 // When the frame has finished decoding the *bufio.Reader
 // containing the remaining input will be sent on frameDec.frameDone.
 func (d *frameDec) startDecoder(output chan decodeOutput) {
-	// TODO: Init to dictionary
-	d.history.reset()
 	written := int64(0)
 
 	defer func() {
@@ -445,8 +446,6 @@ func (d *frameDec) startDecoder(output chan decodeOutput) {
 
 // runDecoder will create a sync decoder that will decode a block of data.
 func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
-	// TODO: Init to dictionary
-	d.history.reset()
 	saved := d.history.b
 
 	// We use the history for output to avoid copying it.
diff --git a/zstd/fse_decoder.go b/zstd/fse_decoder.go
index e002be98b9..957cfeb79c 100644
--- a/zstd/fse_decoder.go
+++ b/zstd/fse_decoder.go
@@ -19,7 +19,7 @@ const (
 	 *  Increasing memory usage improves compression ratio
 	 *  Reduced memory usage can improve speed, due to cache effect
 	 *  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
-	maxMemoryUsage = 11
+	maxMemoryUsage = tablelogAbsoluteMax + 2
 
 	maxTableLog    = maxMemoryUsage - 2
 	maxTablesize   = 1 << maxTableLog
diff --git a/zstd/history.go b/zstd/history.go
index e8c419bd53..f418f50fcd 100644
--- a/zstd/history.go
+++ b/zstd/history.go
@@ -17,6 +17,7 @@ type history struct {
 	windowSize    int
 	maxSize       int
 	error         bool
+	dict          *dict
 }
 
 // reset will reset the history to initial state of a frame.
@@ -36,12 +37,27 @@ func (h *history) reset() {
 	}
 	h.decoders = sequenceDecs{}
 	if h.huffTree != nil {
-		huffDecoderPool.Put(h.huffTree)
+		if h.dict == nil || h.dict.litDec != h.huffTree {
+			huffDecoderPool.Put(h.huffTree)
+		}
 	}
 	h.huffTree = nil
+	h.dict = nil
 	//printf("history created: %+v (l: %d, c: %d)", *h, len(h.b), cap(h.b))
 }
 
+func (h *history) setDict(dict *dict) {
+	if dict == nil {
+		return
+	}
+	h.dict = dict
+	h.decoders.litLengths = dict.llDec
+	h.decoders.offsets = dict.ofDec
+	h.decoders.matchLengths = dict.mlDec
+	h.recentOffsets = dict.offsets
+	h.huffTree = dict.litDec
+}
+
 // append bytes to history.
 // This function will make sure there is space for it,
 // if the buffer has been allocated with enough extra space.
diff --git a/zstd/seqdec.go b/zstd/seqdec.go
index 634e6497ad..7ff870400d 100644
--- a/zstd/seqdec.go
+++ b/zstd/seqdec.go
@@ -62,6 +62,7 @@ type sequenceDecs struct {
 	matchLengths sequenceDec
 	prevOffset   [3]int
 	hist         []byte
+	dict         []byte
 	literals     []byte
 	out          []byte
 	windowSize   int
@@ -85,6 +86,10 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out []
 	s.maxBits = s.litLengths.fse.maxBits + s.offsets.fse.maxBits + s.matchLengths.fse.maxBits
 	s.windowSize = hist.windowSize
 	s.out = out
+	s.dict = nil
+	if hist.dict != nil {
+		s.dict = hist.dict.content
+	}
 	return nil
 }
 
@@ -185,20 +190,38 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 		if ml > maxMatchLen {
 			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
 		}
-		if mo > len(s.out)+len(hist)+ll {
-			return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist)+ll)
-		}
-		if mo > s.windowSize {
-			return fmt.Errorf("match offset (%d) bigger than window size (%d)", mo, s.windowSize)
-		}
-		if mo == 0 && ml > 0 {
-			return fmt.Errorf("zero matchoff and matchlen > 0")
-		}
 
+		// Add literals
 		s.out = append(s.out, s.literals[:ll]...)
 		s.literals = s.literals[ll:]
 		out := s.out
 
+		if mo > len(s.out)+len(hist) || mo > s.windowSize {
+			if len(s.dict) == 0 {
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist))
+			}
+
+			// we may be in dictionary.
+			dictO := len(s.dict) - (mo - (len(s.out) + len(hist)))
+			if dictO < 0 || dictO >= len(s.dict) {
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist))
+			}
+			end := dictO + ml
+			if end > len(s.dict) {
+				out = append(out, s.dict[dictO:]...)
+				mo -= len(s.dict) - dictO
+				ml -= len(s.dict) - dictO
+			} else {
+				out = append(out, s.dict[dictO:end]...)
+				mo = 0
+				ml = 0
+			}
+		}
+
+		if mo == 0 && ml > 0 {
+			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
+		}
+
 		// Copy from history.
 		// TODO: Blocks without history could be made to ignore this completely.
 		if v := mo - len(s.out); v > 0 {
diff --git a/zstd/testdata/dict-tests-small.zip b/zstd/testdata/dict-tests-small.zip
new file mode 100644
index 0000000000..89fbf165a8
Binary files /dev/null and b/zstd/testdata/dict-tests-small.zip differ