Initial commit: Go 1.23 release state

This commit is contained in:
Vorapol Rinsatitnon
2024-09-21 23:49:08 +10:00
commit 17cd57a668
13231 changed files with 3114330 additions and 0 deletions

View File

@@ -0,0 +1,82 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bzip2
import (
"bufio"
"io"
)
// bitReader wraps an io.Reader and provides the ability to read values,
// bit-by-bit, from it. Its Read* methods don't return the usual error
// because the error handling was verbose. Instead, any error is kept and can
// be checked afterwards.
type bitReader struct {
r io.ByteReader
n uint64
bits uint
err error
}
// newBitReader returns a new bitReader reading from r. If r is not
// already an io.ByteReader, it will be converted via a bufio.Reader.
func newBitReader(r io.Reader) bitReader {
byter, ok := r.(io.ByteReader)
if !ok {
byter = bufio.NewReader(r)
}
return bitReader{r: byter}
}
// ReadBits64 reads the given number of bits and returns them in the
// least-significant part of a uint64. In the event of an error, it returns 0
// and the error can be obtained by calling bitReader.Err().
func (br *bitReader) ReadBits64(bits uint) (n uint64) {
for bits > br.bits {
b, err := br.r.ReadByte()
if err == io.EOF {
err = io.ErrUnexpectedEOF
}
if err != nil {
br.err = err
return 0
}
br.n <<= 8
br.n |= uint64(b)
br.bits += 8
}
// br.n looks like this (assuming that br.bits = 14 and bits = 6):
// Bit: 111111
// 5432109876543210
//
// (6 bits, the desired output)
// |-----|
// V V
// 0101101101001110
// ^ ^
// |------------|
// br.bits (num valid bits)
//
// The next line right shifts the desired bits into the
// least-significant places and masks off anything above.
n = (br.n >> (br.bits - bits)) & ((1 << bits) - 1)
br.bits -= bits
return
}
func (br *bitReader) ReadBits(bits uint) (n int) {
n64 := br.ReadBits64(bits)
return int(n64)
}
func (br *bitReader) ReadBit() bool {
n := br.ReadBits(1)
return n != 0
}
func (br *bitReader) Err() error {
return br.err
}

500
src/compress/bzip2/bzip2.go Normal file
View File

@@ -0,0 +1,500 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package bzip2 implements bzip2 decompression.
package bzip2
import "io"
// There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
// of guessing: https://en.wikipedia.org/wiki/Bzip2
// The source code to pyflate was useful for debugging:
// http://www.paul.sladen.org/projects/pyflate
// A StructuralError is returned when the bzip2 data is found to be
// syntactically invalid.
type StructuralError string
func (s StructuralError) Error() string {
return "bzip2 data invalid: " + string(s)
}
// A reader decompresses bzip2 compressed data.
type reader struct {
br bitReader
fileCRC uint32
blockCRC uint32
wantBlockCRC uint32
setupDone bool // true if we have parsed the bzip2 header.
eof bool
blockSize int // blockSize in bytes, i.e. 900 * 1000.
c [256]uint // the ``C'' array for the inverse BWT.
tt []uint32 // mirrors the ``tt'' array in the bzip2 source and contains the P array in the upper 24 bits.
tPos uint32 // Index of the next output byte in tt.
preRLE []uint32 // contains the RLE data still to be processed.
preRLEUsed int // number of entries of preRLE used.
lastByte int // the last byte value seen.
byteRepeats uint // the number of repeats of lastByte seen.
repeats uint // the number of copies of lastByte to output.
}
// NewReader returns an io.Reader which decompresses bzip2 data from r.
// If r does not also implement [io.ByteReader],
// the decompressor may read more data than necessary from r.
func NewReader(r io.Reader) io.Reader {
bz2 := new(reader)
bz2.br = newBitReader(r)
return bz2
}
const bzip2FileMagic = 0x425a // "BZ"
const bzip2BlockMagic = 0x314159265359
const bzip2FinalMagic = 0x177245385090
// setup parses the bzip2 header.
func (bz2 *reader) setup(needMagic bool) error {
br := &bz2.br
if needMagic {
magic := br.ReadBits(16)
if magic != bzip2FileMagic {
return StructuralError("bad magic value")
}
}
t := br.ReadBits(8)
if t != 'h' {
return StructuralError("non-Huffman entropy encoding")
}
level := br.ReadBits(8)
if level < '1' || level > '9' {
return StructuralError("invalid compression level")
}
bz2.fileCRC = 0
bz2.blockSize = 100 * 1000 * (level - '0')
if bz2.blockSize > len(bz2.tt) {
bz2.tt = make([]uint32, bz2.blockSize)
}
return nil
}
func (bz2 *reader) Read(buf []byte) (n int, err error) {
if bz2.eof {
return 0, io.EOF
}
if !bz2.setupDone {
err = bz2.setup(true)
brErr := bz2.br.Err()
if brErr != nil {
err = brErr
}
if err != nil {
return 0, err
}
bz2.setupDone = true
}
n, err = bz2.read(buf)
brErr := bz2.br.Err()
if brErr != nil {
err = brErr
}
return
}
func (bz2 *reader) readFromBlock(buf []byte) int {
// bzip2 is a block based compressor, except that it has a run-length
// preprocessing step. The block based nature means that we can
// preallocate fixed-size buffers and reuse them. However, the RLE
// preprocessing would require allocating huge buffers to store the
// maximum expansion. Thus we process blocks all at once, except for
// the RLE which we decompress as required.
n := 0
for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
// We have RLE data pending.
// The run-length encoding works like this:
// Any sequence of four equal bytes is followed by a length
// byte which contains the number of repeats of that byte to
// include. (The number of repeats can be zero.) Because we are
// decompressing on-demand our state is kept in the reader
// object.
if bz2.repeats > 0 {
buf[n] = byte(bz2.lastByte)
n++
bz2.repeats--
if bz2.repeats == 0 {
bz2.lastByte = -1
}
continue
}
bz2.tPos = bz2.preRLE[bz2.tPos]
b := byte(bz2.tPos)
bz2.tPos >>= 8
bz2.preRLEUsed++
if bz2.byteRepeats == 3 {
bz2.repeats = uint(b)
bz2.byteRepeats = 0
continue
}
if bz2.lastByte == int(b) {
bz2.byteRepeats++
} else {
bz2.byteRepeats = 0
}
bz2.lastByte = int(b)
buf[n] = b
n++
}
return n
}
func (bz2 *reader) read(buf []byte) (int, error) {
for {
n := bz2.readFromBlock(buf)
if n > 0 || len(buf) == 0 {
bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n])
return n, nil
}
// End of block. Check CRC.
if bz2.blockCRC != bz2.wantBlockCRC {
bz2.br.err = StructuralError("block checksum mismatch")
return 0, bz2.br.err
}
// Find next block.
br := &bz2.br
switch br.ReadBits64(48) {
default:
return 0, StructuralError("bad magic value found")
case bzip2BlockMagic:
// Start of block.
err := bz2.readBlock()
if err != nil {
return 0, err
}
case bzip2FinalMagic:
// Check end-of-file CRC.
wantFileCRC := uint32(br.ReadBits64(32))
if br.err != nil {
return 0, br.err
}
if bz2.fileCRC != wantFileCRC {
br.err = StructuralError("file checksum mismatch")
return 0, br.err
}
// Skip ahead to byte boundary.
// Is there a file concatenated to this one?
// It would start with BZ.
if br.bits%8 != 0 {
br.ReadBits(br.bits % 8)
}
b, err := br.r.ReadByte()
if err == io.EOF {
br.err = io.EOF
bz2.eof = true
return 0, io.EOF
}
if err != nil {
br.err = err
return 0, err
}
z, err := br.r.ReadByte()
if err != nil {
if err == io.EOF {
err = io.ErrUnexpectedEOF
}
br.err = err
return 0, err
}
if b != 'B' || z != 'Z' {
return 0, StructuralError("bad magic value in continuation file")
}
if err := bz2.setup(false); err != nil {
return 0, err
}
}
}
}
// readBlock reads a bzip2 block. The magic number should already have been consumed.
func (bz2 *reader) readBlock() (err error) {
br := &bz2.br
bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is.
bz2.blockCRC = 0
bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC
randomized := br.ReadBits(1)
if randomized != 0 {
return StructuralError("deprecated randomized files")
}
origPtr := uint(br.ReadBits(24))
// If not every byte value is used in the block (i.e., it's text) then
// the symbol set is reduced. The symbols used are stored as a
// two-level, 16x16 bitmap.
symbolRangeUsedBitmap := br.ReadBits(16)
symbolPresent := make([]bool, 256)
numSymbols := 0
for symRange := uint(0); symRange < 16; symRange++ {
if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
bits := br.ReadBits(16)
for symbol := uint(0); symbol < 16; symbol++ {
if bits&(1<<(15-symbol)) != 0 {
symbolPresent[16*symRange+symbol] = true
numSymbols++
}
}
}
}
if numSymbols == 0 {
// There must be an EOF symbol.
return StructuralError("no symbols in input")
}
// A block uses between two and six different Huffman trees.
numHuffmanTrees := br.ReadBits(3)
if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
return StructuralError("invalid number of Huffman trees")
}
// The Huffman tree can switch every 50 symbols so there's a list of
// tree indexes telling us which tree to use for each 50 symbol block.
numSelectors := br.ReadBits(15)
treeIndexes := make([]uint8, numSelectors)
// The tree indexes are move-to-front transformed and stored as unary
// numbers.
mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
for i := range treeIndexes {
c := 0
for {
inc := br.ReadBits(1)
if inc == 0 {
break
}
c++
}
if c >= numHuffmanTrees {
return StructuralError("tree index too large")
}
treeIndexes[i] = mtfTreeDecoder.Decode(c)
}
// The list of symbols for the move-to-front transform is taken from
// the previously decoded symbol bitmap.
symbols := make([]byte, numSymbols)
nextSymbol := 0
for i := 0; i < 256; i++ {
if symbolPresent[i] {
symbols[nextSymbol] = byte(i)
nextSymbol++
}
}
mtf := newMTFDecoder(symbols)
numSymbols += 2 // to account for RUNA and RUNB symbols
huffmanTrees := make([]huffmanTree, numHuffmanTrees)
// Now we decode the arrays of code-lengths for each tree.
lengths := make([]uint8, numSymbols)
for i := range huffmanTrees {
// The code lengths are delta encoded from a 5-bit base value.
length := br.ReadBits(5)
for j := range lengths {
for {
if length < 1 || length > 20 {
return StructuralError("Huffman length out of range")
}
if !br.ReadBit() {
break
}
if br.ReadBit() {
length--
} else {
length++
}
}
lengths[j] = uint8(length)
}
huffmanTrees[i], err = newHuffmanTree(lengths)
if err != nil {
return err
}
}
selectorIndex := 1 // the next tree index to use
if len(treeIndexes) == 0 {
return StructuralError("no tree selectors given")
}
if int(treeIndexes[0]) >= len(huffmanTrees) {
return StructuralError("tree selector out of range")
}
currentHuffmanTree := huffmanTrees[treeIndexes[0]]
bufIndex := 0 // indexes bz2.buf, the output buffer.
// The output of the move-to-front transform is run-length encoded and
// we merge the decoding into the Huffman parsing loop. These two
// variables accumulate the repeat count. See the Wikipedia page for
// details.
repeat := 0
repeatPower := 0
// The `C' array (used by the inverse BWT) needs to be zero initialized.
clear(bz2.c[:])
decoded := 0 // counts the number of symbols decoded by the current tree.
for {
if decoded == 50 {
if selectorIndex >= numSelectors {
return StructuralError("insufficient selector indices for number of symbols")
}
if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) {
return StructuralError("tree selector out of range")
}
currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
selectorIndex++
decoded = 0
}
v := currentHuffmanTree.Decode(br)
decoded++
if v < 2 {
// This is either the RUNA or RUNB symbol.
if repeat == 0 {
repeatPower = 1
}
repeat += repeatPower << v
repeatPower <<= 1
// This limit of 2 million comes from the bzip2 source
// code. It prevents repeat from overflowing.
if repeat > 2*1024*1024 {
return StructuralError("repeat count too large")
}
continue
}
if repeat > 0 {
// We have decoded a complete run-length so we need to
// replicate the last output symbol.
if repeat > bz2.blockSize-bufIndex {
return StructuralError("repeats past end of block")
}
for i := 0; i < repeat; i++ {
b := mtf.First()
bz2.tt[bufIndex] = uint32(b)
bz2.c[b]++
bufIndex++
}
repeat = 0
}
if int(v) == numSymbols-1 {
// This is the EOF symbol. Because it's always at the
// end of the move-to-front list, and never gets moved
// to the front, it has this unique value.
break
}
// Since two metasymbols (RUNA and RUNB) have values 0 and 1,
// one would expect |v-2| to be passed to the MTF decoder.
// However, the front of the MTF list is never referenced as 0,
// it's always referenced with a run-length of 1. Thus 0
// doesn't need to be encoded and we have |v-1| in the next
// line.
b := mtf.Decode(int(v - 1))
if bufIndex >= bz2.blockSize {
return StructuralError("data exceeds block size")
}
bz2.tt[bufIndex] = uint32(b)
bz2.c[b]++
bufIndex++
}
if origPtr >= uint(bufIndex) {
return StructuralError("origPtr out of bounds")
}
// We have completed the entropy decoding. Now we can perform the
// inverse BWT and setup the RLE buffer.
bz2.preRLE = bz2.tt[:bufIndex]
bz2.preRLEUsed = 0
bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
bz2.lastByte = -1
bz2.byteRepeats = 0
bz2.repeats = 0
return nil
}
// inverseBWT implements the inverse Burrows-Wheeler transform as described in
// http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
// In that document, origPtr is called “I” and c is the “C” array after the
// first pass over the data. It's an argument here because we merge the first
// pass with the Huffman decoding.
//
// This also implements the “single array” method from the bzip2 source code
// which leaves the output, still shuffled, in the bottom 8 bits of tt with the
// index of the next byte in the top 24-bits. The index of the first byte is
// returned.
func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
sum := uint(0)
for i := 0; i < 256; i++ {
sum += c[i]
c[i] = sum - c[i]
}
for i := range tt {
b := tt[i] & 0xff
tt[c[b]] |= uint32(i) << 8
c[b]++
}
return tt[origPtr] >> 8
}
// This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed,
// causing the bits in the input to be processed in the reverse of the usual order.
var crctab [256]uint32
func init() {
const poly = 0x04C11DB7
for i := range crctab {
crc := uint32(i) << 24
for j := 0; j < 8; j++ {
if crc&0x80000000 != 0 {
crc = (crc << 1) ^ poly
} else {
crc <<= 1
}
}
crctab[i] = crc
}
}
// updateCRC updates the crc value to incorporate the data in b.
// The initial value is 0.
func updateCRC(val uint32, b []byte) uint32 {
crc := ^val
for _, v := range b {
crc = crctab[byte(crc>>24)^v] ^ (crc << 8)
}
return ^crc
}

View File

@@ -0,0 +1,240 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bzip2
import (
"bytes"
"encoding/hex"
"fmt"
"io"
"os"
"testing"
)
func mustDecodeHex(s string) []byte {
b, err := hex.DecodeString(s)
if err != nil {
panic(err)
}
return b
}
func mustLoadFile(f string) []byte {
b, err := os.ReadFile(f)
if err != nil {
panic(err)
}
return b
}
func trim(b []byte) string {
const limit = 1024
if len(b) < limit {
return fmt.Sprintf("%q", b)
}
return fmt.Sprintf("%q...", b[:limit])
}
func TestReader(t *testing.T) {
var vectors = []struct {
desc string
input []byte
output []byte
fail bool
}{{
desc: "hello world",
input: mustDecodeHex("" +
"425a68393141592653594eece83600000251800010400006449080200031064c" +
"4101a7a9a580bb9431f8bb9229c28482776741b0",
),
output: []byte("hello world\n"),
}, {
desc: "concatenated files",
input: mustDecodeHex("" +
"425a68393141592653594eece83600000251800010400006449080200031064c" +
"4101a7a9a580bb9431f8bb9229c28482776741b0425a68393141592653594eec" +
"e83600000251800010400006449080200031064c4101a7a9a580bb9431f8bb92" +
"29c28482776741b0",
),
output: []byte("hello world\nhello world\n"),
}, {
desc: "32B zeros",
input: mustDecodeHex("" +
"425a6839314159265359b5aa5098000000600040000004200021008283177245" +
"385090b5aa5098",
),
output: make([]byte, 32),
}, {
desc: "1MiB zeros",
input: mustDecodeHex("" +
"425a683931415926535938571ce50008084000c0040008200030cc0529a60806" +
"c4201e2ee48a70a12070ae39ca",
),
output: make([]byte, 1<<20),
}, {
desc: "random data",
input: mustLoadFile("testdata/pass-random1.bz2"),
output: mustLoadFile("testdata/pass-random1.bin"),
}, {
desc: "random data - full symbol range",
input: mustLoadFile("testdata/pass-random2.bz2"),
output: mustLoadFile("testdata/pass-random2.bin"),
}, {
desc: "random data - uses RLE1 stage",
input: mustDecodeHex("" +
"425a6839314159265359d992d0f60000137dfe84020310091c1e280e100e0428" +
"01099210094806c0110002e70806402000546034000034000000f28300000320" +
"00d3403264049270eb7a9280d308ca06ad28f6981bee1bf8160727c7364510d7" +
"3a1e123083421b63f031f63993a0f40051fbf177245385090d992d0f60",
),
output: mustDecodeHex("" +
"92d5652616ac444a4a04af1a8a3964aca0450d43d6cf233bd03233f4ba92f871" +
"9e6c2a2bd4f5f88db07ecd0da3a33b263483db9b2c158786ad6363be35d17335" +
"ba",
),
}, {
desc: "1MiB sawtooth",
input: mustLoadFile("testdata/pass-sawtooth.bz2"),
output: func() []byte {
b := make([]byte, 1<<20)
for i := range b {
b[i] = byte(i)
}
return b
}(),
}, {
desc: "RLE2 buffer overrun - issue 5747",
input: mustLoadFile("testdata/fail-issue5747.bz2"),
fail: true,
}, {
desc: "out-of-range selector - issue 8363",
input: mustDecodeHex("" +
"425a68393141592653594eece83600000251800010400006449080200031064c" +
"4101a7a9a580bb943117724538509000000000",
),
fail: true,
}, {
desc: "bad block size - issue 13941",
input: mustDecodeHex("" +
"425a683131415926535936dc55330063ffc0006000200020a40830008b0008b8" +
"bb9229c28481b6e2a998",
),
fail: true,
}, {
desc: "bad huffman delta",
input: mustDecodeHex("" +
"425a6836314159265359b1f7404b000000400040002000217d184682ee48a70a" +
"12163ee80960",
),
fail: true,
}}
for i, v := range vectors {
rd := NewReader(bytes.NewReader(v.input))
buf, err := io.ReadAll(rd)
if fail := bool(err != nil); fail != v.fail {
if fail {
t.Errorf("test %d (%s), unexpected failure: %v", i, v.desc, err)
} else {
t.Errorf("test %d (%s), unexpected success", i, v.desc)
}
}
if !v.fail && !bytes.Equal(buf, v.output) {
t.Errorf("test %d (%s), output mismatch:\ngot %s\nwant %s", i, v.desc, trim(buf), trim(v.output))
}
}
}
func TestBitReader(t *testing.T) {
var vectors = []struct {
nbits uint // Number of bits to read
value int // Expected output value (0 for error)
fail bool // Expected operation failure?
}{
{nbits: 1, value: 1},
{nbits: 1, value: 0},
{nbits: 1, value: 1},
{nbits: 5, value: 11},
{nbits: 32, value: 0x12345678},
{nbits: 15, value: 14495},
{nbits: 3, value: 6},
{nbits: 6, value: 13},
{nbits: 1, fail: true},
}
rd := bytes.NewReader([]byte{0xab, 0x12, 0x34, 0x56, 0x78, 0x71, 0x3f, 0x8d})
br := newBitReader(rd)
for i, v := range vectors {
val := br.ReadBits(v.nbits)
if fail := bool(br.err != nil); fail != v.fail {
if fail {
t.Errorf("test %d, unexpected failure: ReadBits(%d) = %v", i, v.nbits, br.err)
} else {
t.Errorf("test %d, unexpected success: ReadBits(%d) = nil", i, v.nbits)
}
}
if !v.fail && val != v.value {
t.Errorf("test %d, mismatching value: ReadBits(%d) = %d, want %d", i, v.nbits, val, v.value)
}
}
}
func TestMTF(t *testing.T) {
var vectors = []struct {
idx int // Input index
sym uint8 // Expected output symbol
}{
{idx: 1, sym: 1}, // [1 0 2 3 4]
{idx: 0, sym: 1}, // [1 0 2 3 4]
{idx: 1, sym: 0}, // [0 1 2 3 4]
{idx: 4, sym: 4}, // [4 0 1 2 3]
{idx: 1, sym: 0}, // [0 4 1 2 3]
}
mtf := newMTFDecoderWithRange(5)
for i, v := range vectors {
sym := mtf.Decode(v.idx)
t.Log(mtf)
if sym != v.sym {
t.Errorf("test %d, symbol mismatch: Decode(%d) = %d, want %d", i, v.idx, sym, v.sym)
}
}
}
func TestZeroRead(t *testing.T) {
b := mustDecodeHex("425a6839314159265359b5aa5098000000600040000004200021008283177245385090b5aa5098")
r := NewReader(bytes.NewReader(b))
if n, err := r.Read(nil); n != 0 || err != nil {
t.Errorf("Read(nil) = (%d, %v), want (0, nil)", n, err)
}
}
var (
digits = mustLoadFile("testdata/e.txt.bz2")
newton = mustLoadFile("testdata/Isaac.Newton-Opticks.txt.bz2")
random = mustLoadFile("testdata/random.data.bz2")
)
func benchmarkDecode(b *testing.B, compressed []byte) {
// Determine the uncompressed size of testfile.
uncompressedSize, err := io.Copy(io.Discard, NewReader(bytes.NewReader(compressed)))
if err != nil {
b.Fatal(err)
}
b.SetBytes(uncompressedSize)
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
r := bytes.NewReader(compressed)
io.Copy(io.Discard, NewReader(r))
}
}
func BenchmarkDecodeDigits(b *testing.B) { benchmarkDecode(b, digits) }
func BenchmarkDecodeNewton(b *testing.B) { benchmarkDecode(b, newton) }
func BenchmarkDecodeRand(b *testing.B) { benchmarkDecode(b, random) }

View File

@@ -0,0 +1,234 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bzip2
import (
"cmp"
"slices"
)
// A huffmanTree is a binary tree which is navigated, bit-by-bit to reach a
// symbol.
type huffmanTree struct {
// nodes contains all the non-leaf nodes in the tree. nodes[0] is the
// root of the tree and nextNode contains the index of the next element
// of nodes to use when the tree is being constructed.
nodes []huffmanNode
nextNode int
}
// A huffmanNode is a node in the tree. left and right contain indexes into the
// nodes slice of the tree. If left or right is invalidNodeValue then the child
// is a left node and its value is in leftValue/rightValue.
//
// The symbols are uint16s because bzip2 encodes not only MTF indexes in the
// tree, but also two magic values for run-length encoding and an EOF symbol.
// Thus there are more than 256 possible symbols.
type huffmanNode struct {
left, right uint16
leftValue, rightValue uint16
}
// invalidNodeValue is an invalid index which marks a leaf node in the tree.
const invalidNodeValue = 0xffff
// Decode reads bits from the given bitReader and navigates the tree until a
// symbol is found.
func (t *huffmanTree) Decode(br *bitReader) (v uint16) {
nodeIndex := uint16(0) // node 0 is the root of the tree.
for {
node := &t.nodes[nodeIndex]
var bit uint16
if br.bits > 0 {
// Get next bit - fast path.
br.bits--
bit = uint16(br.n>>(br.bits&63)) & 1
} else {
// Get next bit - slow path.
// Use ReadBits to retrieve a single bit
// from the underling io.ByteReader.
bit = uint16(br.ReadBits(1))
}
// Trick a compiler into generating conditional move instead of branch,
// by making both loads unconditional.
l, r := node.left, node.right
if bit == 1 {
nodeIndex = l
} else {
nodeIndex = r
}
if nodeIndex == invalidNodeValue {
// We found a leaf. Use the value of bit to decide
// whether is a left or a right value.
l, r := node.leftValue, node.rightValue
if bit == 1 {
v = l
} else {
v = r
}
return
}
}
}
// newHuffmanTree builds a Huffman tree from a slice containing the code
// lengths of each symbol. The maximum code length is 32 bits.
func newHuffmanTree(lengths []uint8) (huffmanTree, error) {
// There are many possible trees that assign the same code length to
// each symbol (consider reflecting a tree down the middle, for
// example). Since the code length assignments determine the
// efficiency of the tree, each of these trees is equally good. In
// order to minimize the amount of information needed to build a tree
// bzip2 uses a canonical tree so that it can be reconstructed given
// only the code length assignments.
if len(lengths) < 2 {
panic("newHuffmanTree: too few symbols")
}
var t huffmanTree
// First we sort the code length assignments by ascending code length,
// using the symbol value to break ties.
pairs := make([]huffmanSymbolLengthPair, len(lengths))
for i, length := range lengths {
pairs[i].value = uint16(i)
pairs[i].length = length
}
slices.SortFunc(pairs, func(a, b huffmanSymbolLengthPair) int {
if c := cmp.Compare(a.length, b.length); c != 0 {
return c
}
return cmp.Compare(a.value, b.value)
})
// Now we assign codes to the symbols, starting with the longest code.
// We keep the codes packed into a uint32, at the most-significant end.
// So branches are taken from the MSB downwards. This makes it easy to
// sort them later.
code := uint32(0)
length := uint8(32)
codes := make([]huffmanCode, len(lengths))
for i := len(pairs) - 1; i >= 0; i-- {
if length > pairs[i].length {
length = pairs[i].length
}
codes[i].code = code
codes[i].codeLen = length
codes[i].value = pairs[i].value
// We need to 'increment' the code, which means treating |code|
// like a |length| bit number.
code += 1 << (32 - length)
}
// Now we can sort by the code so that the left half of each branch are
// grouped together, recursively.
slices.SortFunc(codes, func(a, b huffmanCode) int {
return cmp.Compare(a.code, b.code)
})
t.nodes = make([]huffmanNode, len(codes))
_, err := buildHuffmanNode(&t, codes, 0)
return t, err
}
// huffmanSymbolLengthPair contains a symbol and its code length.
type huffmanSymbolLengthPair struct {
value uint16
length uint8
}
// huffmanCode contains a symbol, its code and code length.
type huffmanCode struct {
code uint32
codeLen uint8
value uint16
}
// buildHuffmanNode takes a slice of sorted huffmanCodes and builds a node in
// the Huffman tree at the given level. It returns the index of the newly
// constructed node.
func buildHuffmanNode(t *huffmanTree, codes []huffmanCode, level uint32) (nodeIndex uint16, err error) {
test := uint32(1) << (31 - level)
// We have to search the list of codes to find the divide between the left and right sides.
firstRightIndex := len(codes)
for i, code := range codes {
if code.code&test != 0 {
firstRightIndex = i
break
}
}
left := codes[:firstRightIndex]
right := codes[firstRightIndex:]
if len(left) == 0 || len(right) == 0 {
// There is a superfluous level in the Huffman tree indicating
// a bug in the encoder. However, this bug has been observed in
// the wild so we handle it.
// If this function was called recursively then we know that
// len(codes) >= 2 because, otherwise, we would have hit the
// "leaf node" case, below, and not recurred.
//
// However, for the initial call it's possible that len(codes)
// is zero or one. Both cases are invalid because a zero length
// tree cannot encode anything and a length-1 tree can only
// encode EOF and so is superfluous. We reject both.
if len(codes) < 2 {
return 0, StructuralError("empty Huffman tree")
}
// In this case the recursion doesn't always reduce the length
// of codes so we need to ensure termination via another
// mechanism.
if level == 31 {
// Since len(codes) >= 2 the only way that the values
// can match at all 32 bits is if they are equal, which
// is invalid. This ensures that we never enter
// infinite recursion.
return 0, StructuralError("equal symbols in Huffman tree")
}
if len(left) == 0 {
return buildHuffmanNode(t, right, level+1)
}
return buildHuffmanNode(t, left, level+1)
}
nodeIndex = uint16(t.nextNode)
node := &t.nodes[t.nextNode]
t.nextNode++
if len(left) == 1 {
// leaf node
node.left = invalidNodeValue
node.leftValue = left[0].value
} else {
node.left, err = buildHuffmanNode(t, left, level+1)
}
if err != nil {
return
}
if len(right) == 1 {
// leaf node
node.right = invalidNodeValue
node.rightValue = right[0].value
} else {
node.right, err = buildHuffmanNode(t, right, level+1)
}
return
}

View File

@@ -0,0 +1,53 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bzip2
// moveToFrontDecoder implements a move-to-front list. Such a list is an
// efficient way to transform a string with repeating elements into one with
// many small valued numbers, which is suitable for entropy encoding. It works
// by starting with an initial list of symbols and references symbols by their
// index into that list. When a symbol is referenced, it's moved to the front
// of the list. Thus, a repeated symbol ends up being encoded with many zeros,
// as the symbol will be at the front of the list after the first access.
type moveToFrontDecoder []byte
// newMTFDecoder creates a move-to-front decoder with an explicit initial list
// of symbols.
func newMTFDecoder(symbols []byte) moveToFrontDecoder {
if len(symbols) > 256 {
panic("too many symbols")
}
return moveToFrontDecoder(symbols)
}
// newMTFDecoderWithRange creates a move-to-front decoder with an initial
// symbol list of 0...n-1.
func newMTFDecoderWithRange(n int) moveToFrontDecoder {
if n > 256 {
panic("newMTFDecoderWithRange: cannot have > 256 symbols")
}
m := make([]byte, n)
for i := 0; i < n; i++ {
m[i] = byte(i)
}
return moveToFrontDecoder(m)
}
func (m moveToFrontDecoder) Decode(n int) (b byte) {
// Implement move-to-front with a simple copy. This approach
// beats more sophisticated approaches in benchmarking, probably
// because it has high locality of reference inside of a
// single cache line (most move-to-front operations have n < 64).
b = m[n]
copy(m[1:], m[:n])
m[0] = b
return
}
// First returns the symbol at the front of the list.
func (m moveToFrontDecoder) First() byte {
return m[0]
}

Binary file not shown.

BIN
src/compress/bzip2/testdata/e.txt.bz2 vendored Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1 @@
<EFBFBD><EFBFBD>e&<16>DJJ<04><1A>9d<39><64>E

Binary file not shown.

Binary file not shown.

Binary file not shown.