update deps

10 years ago · 7aae3661ee
12 changed files with 775 additions and 4 deletions
--- a/5
+++ b/5
@ -6,15 +6,12 @@ SOURCE_DIR = ./weed
 all: build
 # Command-style targets (they do not produce a file of the same name).
 # Declaring them phony keeps a stray file or directory called e.g. "fmt" or
 # "all" from making the target look up to date. This single list is the union
 # of the two earlier declarations plus the `all` and `fmt` targets.
 .PHONY : all clean deps godep fmt build linux vet
 # clean: remove Go build/install artifacts for $(SOURCE_DIR) (go clean -i)
 # and delete the built binary; rm -f does not fail if the binary is absent.
 clean:
 	go clean -i $(GO_FLAGS) $(SOURCE_DIR)
 	rm -f $(BINARY)
 # deps: fetch the packages $(SOURCE_DIR) depends on; -d stops after download.
 deps:
 	go get $(GO_FLAGS) -d $(SOURCE_DIR)
 # fmt: rewrite the Go sources under $(SOURCE_DIR) in place (-w), also
 # applying gofmt's code simplifications (-s).
 fmt:
 	gofmt -w -s $(SOURCE_DIR)
--- a/weed/Godeps/Godeps.json
+++ b/weed/Godeps/Godeps.json
@ -83,6 +83,11 @@
 			"ImportPath": "github.com/hailocab/go-hostpool",
 			"Rev": "0637eae892be221164aff5fcbccc57171aea6406"
 		},
 		{
 			"ImportPath": "github.com/klauspost/crc32",
 			"Comment": "v1.0",
 			"Rev": "19b0b332c9e4516a6370a0456e6182c3b5036720"
 		},
 		{
 			"ImportPath": "github.com/pierrec/lz4",
 			"Rev": "0b67ae4bb1ab03691079e38dddbc3909d68de64f"
--- a/weed/vendor/github.com/klauspost/crc32/.gitignore
+++ b/weed/vendor/github.com/klauspost/crc32/.gitignore
@ -0,0 +1,24 @@
 # Compiled Object files, Static and Dynamic libs (Shared Objects)
 *.o
 *.a
 *.so
 # Folders
 _obj
 _test
 # Architecture specific extensions/prefixes
 *.[568vq]
 [568vq].out
 *.cgo1.go
 *.cgo2.c
 _cgo_defun.c
 _cgo_gotypes.go
 _cgo_export.*
 _testmain.go
 *.exe
 *.test
 *.prof
--- a/weed/vendor/github.com/klauspost/crc32/.travis.yml
+++ b/weed/vendor/github.com/klauspost/crc32/.travis.yml
@ -0,0 +1,12 @@
 language: go
 go:
  - 1.3
  - 1.4
  - 1.5
  - 1.6
  - tip
 script: 
 - go test -v .
 - go test -v -race .
--- a/weed/vendor/github.com/klauspost/crc32/LICENSE
+++ b/weed/vendor/github.com/klauspost/crc32/LICENSE
@ -0,0 +1,28 @@
 Copyright (c) 2012 The Go Authors. All rights reserved.
 Copyright (c) 2015 Klaus Post
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
   * Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
   * Redistributions in binary form must reproduce the above
 copyright notice, this list of conditions and the following disclaimer
 in the documentation and/or other materials provided with the
 distribution.
   * Neither the name of Google Inc. nor the names of its
 contributors may be used to endorse or promote products derived from
 this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/weed/vendor/github.com/klauspost/crc32/README.md
+++ b/weed/vendor/github.com/klauspost/crc32/README.md
@ -0,0 +1,84 @@
 # crc32
 CRC32 hash with x64 optimizations
 This package is a drop-in replacement for the standard library `hash/crc32` package, that features SSE 4.2 optimizations on x64 platforms, for a 10x speedup.
 [![Build Status](https://travis-ci.org/klauspost/crc32.svg?branch=master)](https://travis-ci.org/klauspost/crc32)
 # usage
 Install using `go get github.com/klauspost/crc32`. This library is based on Go 1.5 code and requires Go 1.3 or newer.
 Replace `import "hash/crc32"` with `import "github.com/klauspost/crc32"` and you are good to go.
 # changes
 * Dec 4, 2015: Uses the "slice-by-8" trick more extensively, which gives a 1.5 to 2.5x speedup if assembler is unavailable.
 # performance
 For IEEE tables (the most common), there is approximately a factor 10 speedup with "CLMUL" (Carryless multiplication) instruction:
 ```
 benchmark            old ns/op     new ns/op     delta
 BenchmarkCrc32KB     99955         10258         -89.74%
 benchmark            old MB/s     new MB/s     speedup
 BenchmarkCrc32KB     327.83       3194.20      9.74x
 ```
 For other tables and "CLMUL"  capable machines the performance is the same as the standard library.
 Here are some detailed benchmarks, comparing to go 1.5 standard library with and without assembler enabled.
 ```
 Std:   Standard Go 1.5 library
 Crc:   Indicates IEEE type CRC.
 40B:   Size of each slice encoded.
 NoAsm: Assembler was disabled (ie. not an AMD64 or SSE 4.2+ capable machine).
 Castagnoli: Castagnoli CRC type.
 BenchmarkStdCrc40B-4            10000000               158 ns/op         252.88 MB/s
 BenchmarkCrc40BNoAsm-4          20000000               105 ns/op         377.38 MB/s (slice8)
 BenchmarkCrc40B-4               20000000               105 ns/op         378.77 MB/s (slice8)
 BenchmarkStdCrc1KB-4              500000              3604 ns/op         284.10 MB/s
 BenchmarkCrc1KBNoAsm-4           1000000              1463 ns/op         699.79 MB/s (slice8)
 BenchmarkCrc1KB-4                3000000               396 ns/op        2583.69 MB/s (asm)
 BenchmarkStdCrc8KB-4              200000             11417 ns/op         717.48 MB/s (slice8)
 BenchmarkCrc8KBNoAsm-4            200000             11317 ns/op         723.85 MB/s (slice8)
 BenchmarkCrc8KB-4                 500000              2919 ns/op        2805.73 MB/s (asm)
 BenchmarkStdCrc32KB-4              30000             45749 ns/op         716.24 MB/s (slice8)
 BenchmarkCrc32KBNoAsm-4            30000             45109 ns/op         726.42 MB/s (slice8)
 BenchmarkCrc32KB-4                100000             11497 ns/op        2850.09 MB/s (asm)
 BenchmarkStdNoAsmCastagnol40B-4 10000000               161 ns/op         246.94 MB/s
 BenchmarkStdCastagnoli40B-4     50000000              28.4 ns/op        1410.69 MB/s (asm)
 BenchmarkCastagnoli40BNoAsm-4   20000000               100 ns/op         398.01 MB/s (slice8)
 BenchmarkCastagnoli40B-4        50000000              28.2 ns/op        1419.54 MB/s (asm)
 BenchmarkStdNoAsmCastagnoli1KB-4  500000              3622 ns/op        282.67 MB/s
 BenchmarkStdCastagnoli1KB-4     10000000               144 ns/op        7099.78 MB/s (asm)
 BenchmarkCastagnoli1KBNoAsm-4    1000000              1475 ns/op         694.14 MB/s (slice8)
 BenchmarkCastagnoli1KB-4        10000000               146 ns/op        6993.35 MB/s (asm)
 BenchmarkStdNoAsmCastagnoli8KB-4  50000              28781 ns/op         284.63 MB/s
 BenchmarkStdCastagnoli8KB-4      1000000              1029 ns/op        7957.89 MB/s (asm)
 BenchmarkCastagnoli8KBNoAsm-4     200000             11410 ns/op         717.94 MB/s (slice8)
 BenchmarkCastagnoli8KB-4         1000000              1000 ns/op        8188.71 MB/s (asm)
 BenchmarkStdNoAsmCastagnoli32KB-4  10000            115426 ns/op         283.89 MB/s
 BenchmarkStdCastagnoli32KB-4      300000              4065 ns/op        8059.13 MB/s (asm)
 BenchmarkCastagnoli32KBNoAsm-4     30000             45171 ns/op         725.41 MB/s (slice8)
 BenchmarkCastagnoli32KB-4         500000              4077 ns/op        8035.89 MB/s (asm)
 ```
 The IEEE assembler optimizations have been submitted and will be part of the Go 1.6 standard library.
 However, the improved use of slice-by-8 has not, but will probably be submitted for Go 1.7.
 # license
 Standard Go license. Changes are Copyright (c) 2015 Klaus Post under same conditions.
--- a/weed/vendor/github.com/klauspost/crc32/crc32.go
+++ b/weed/vendor/github.com/klauspost/crc32/crc32.go
@ -0,0 +1,186 @@
 // Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // Package crc32 implements the 32-bit cyclic redundancy check, or CRC-32,
 // checksum. See http://en.wikipedia.org/wiki/Cyclic_redundancy_check for
 // information.
 //
 // Polynomials are represented in LSB-first form also known as reversed representation.
 //
 // See http://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks#Reversed_representations_and_reciprocal_polynomials
 // for information.
 package crc32
 import (
 	"hash"
 	"sync"
 )
 // Size is the size of a CRC-32 checksum in bytes.
 const Size = 4
 // Predefined polynomials, written in the reversed (LSB-first) representation
 // described in the package comment.
 const (
 	// IEEE is by far and away the most common CRC-32 polynomial.
 	// Used by ethernet (IEEE 802.3), v.42, fddi, gzip, zip, png, ...
 	IEEE = 0xedb88320
 	// Castagnoli's polynomial, used in iSCSI.
 	// Has better error detection characteristics than IEEE.
 	// http://dx.doi.org/10.1109/26.231911
 	Castagnoli = 0x82f63b78
 	// Koopman's polynomial.
 	// Also has better error detection characteristics than IEEE.
 	// http://dx.doi.org/10.1109/DSN.2002.1028931
 	Koopman = 0xeb31d82e
 )
 // Table is a 256-word table representing the polynomial for efficient processing.
 // Entry i is the CRC remainder for the single byte value i (see makeTable).
 type Table [256]uint32
 // castagnoliTable points to a lazily initialized Table for the Castagnoli
 // polynomial. MakeTable will always return this value when asked to make a
 // Castagnoli table so we can compare against it to find when the caller is
 // using this polynomial.
 var castagnoliTable *Table
 // castagnoliTable8 is the slicing-by-8 table set for the Castagnoli
 // polynomial. It is filled in by castagnoliInit together with castagnoliTable.
 var castagnoliTable8 *slicing8Table
 // castagnoliOnce guards castagnoliInit; MakeTable runs it through Do.
 var castagnoliOnce sync.Once
 func castagnoliInit() {
 	castagnoliTable = makeTable(Castagnoli)
 	castagnoliTable8 = makeTable8(Castagnoli)
 }
 // IEEETable is the table for the IEEE polynomial. It is built once, when the
 // package is initialized.
 var IEEETable = makeTable(IEEE)
 // slicing8Table is array of 8 Tables. Index 0 is the ordinary Table for the
 // polynomial; makeTable8 derives the other seven from it.
 type slicing8Table [8]Table
 // ieeeTable8 is the slicing8Table for IEEE. It stays nil until updateIEEE
 // builds it on first use.
 var ieeeTable8 *slicing8Table
 // ieeeTable8Once guards the one-time construction of ieeeTable8.
 var ieeeTable8Once sync.Once
 // MakeTable returns the Table for the given polynomial. The IEEE and
 // Castagnoli polynomials always yield the package's shared tables (the
 // Castagnoli one is created on first request); any other polynomial gets a
 // freshly built Table.
 // The contents of the returned Table must not be modified.
 func MakeTable(poly uint32) *Table {
 	if poly == IEEE {
 		return IEEETable
 	}
 	if poly == Castagnoli {
 		castagnoliOnce.Do(castagnoliInit)
 		return castagnoliTable
 	}
 	return makeTable(poly)
 }
 // makeTable builds a new Table for poly. Entry i is obtained by starting from
 // i and performing eight shift-right steps, XORing in poly whenever the bit
 // shifted out was set.
 func makeTable(poly uint32) *Table {
 	var tab Table
 	for idx := range tab {
 		rem := uint32(idx)
 		for bit := 8; bit > 0; bit-- {
 			lsb := rem & 1
 			rem >>= 1
 			if lsb != 0 {
 				rem ^= poly
 			}
 		}
 		tab[idx] = rem
 	}
 	return &tab
 }
 // makeTable8 builds the slicing8Table for poly. Table 0 is the ordinary
 // byte-wise table; each following table entry is derived from the previous
 // table's entry at the same index by one more table-driven step.
 func makeTable8(poly uint32) *slicing8Table {
 	tabs := new(slicing8Table)
 	base := &tabs[0]
 	*base = *makeTable(poly)
 	for idx, v := range base {
 		for k := 1; k < len(tabs); k++ {
 			v = base[v&0xFF] ^ (v >> 8)
 			tabs[k][idx] = v
 		}
 	}
 	return tabs
 }
 // digest represents the partial evaluation of a checksum.
 type digest struct {
 	crc uint32 // checksum of everything written so far; Reset sets it to 0
 	tab *Table // polynomial table handed to Update on every Write
 }
 // New returns a hash.Hash32 that computes the CRC-32 checksum for the
 // polynomial described by tab. The running checksum starts at zero.
 // Its Sum method will lay the value out in big-endian byte order.
 func New(tab *Table) hash.Hash32 {
 	d := &digest{tab: tab}
 	return d
 }
 // NewIEEE creates a new hash.Hash32 computing the CRC-32 checksum
 // using the IEEE polynomial.
 // Its Sum method will lay the value out in big-endian byte order.
 func NewIEEE() hash.Hash32 { return New(IEEETable) }
 // Size returns the number of bytes Sum appends (the package constant Size).
 func (d *digest) Size() int {
 	return Size
 }
 // BlockSize returns 1.
 func (d *digest) BlockSize() int {
 	return 1
 }
 // Reset sets the running checksum back to zero.
 func (d *digest) Reset() {
 	d.crc = 0
 }
 // update folds the bytes of p into crc one byte at a time using tab and
 // returns the new checksum. The value is bit-inverted on entry and again on
 // exit.
 func update(crc uint32, tab *Table, p []byte) uint32 {
 	acc := ^crc
 	for i := 0; i < len(p); i++ {
 		idx := byte(acc) ^ p[i]
 		acc = (acc >> 8) ^ tab[idx]
 	}
 	return ^acc
 }
 // updateSlicingBy8 updates CRC using Slicing-by-8: p is consumed eight bytes
 // per iteration with one lookup in each of the eight tables, and whatever is
 // left (fewer than eight bytes) is finished by the byte-wise update using
 // table 0.
 //
 // The returned checksum is the same as update(crc, &tab[0], p) would give.
 func updateSlicingBy8(crc uint32, tab *slicing8Table, p []byte) uint32 {
 	crc = ^crc
 	// ">= 8" (not "> 8") so that a final group of exactly eight bytes is also
 	// handled here instead of falling through to the byte-at-a-time path.
 	for len(p) >= 8 {
 		// Mix the first four input bytes (little-endian) into the CRC ...
 		crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
 		// ... then advance eight bytes in one step with eight table lookups.
 		crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^
 			tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^
 			tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF]
 		p = p[8:]
 	}
 	crc = ^crc
 	if len(p) == 0 {
 		return crc
 	}
 	// update performs its own inversion on entry and exit, which is why the
 	// inversion was undone above.
 	return update(crc, &tab[0], p)
 }
 // Update returns the result of adding the bytes in p to the crc.
 // tab is compared by pointer against the package's Castagnoli and IEEE
 // tables; a match selects the specialised routine for that polynomial, and
 // any other table uses the generic byte-wise update.
 func Update(crc uint32, tab *Table, p []byte) uint32 {
 	switch tab {
 	case castagnoliTable:
 		return updateCastagnoli(crc, p)
 	case IEEETable:
 		return updateIEEE(crc, p)
 	}
 	return update(crc, tab, p)
 }
 // Write adds the bytes of p to the running checksum. It always reports
 // len(p) bytes written and a nil error.
 func (d *digest) Write(p []byte) (n int, err error) {
 	n = len(p)
 	d.crc = Update(d.crc, d.tab, p)
 	return n, nil
 }
 // Sum32 returns the checksum of everything written so far.
 func (d *digest) Sum32() uint32 {
 	return d.crc
 }
 // Sum appends the current checksum to in as four bytes, most significant
 // byte first, and returns the resulting slice.
 func (d *digest) Sum(in []byte) []byte {
 	v := d.Sum32()
 	var be [Size]byte
 	be[0], be[1], be[2], be[3] = byte(v>>24), byte(v>>16), byte(v>>8), byte(v)
 	return append(in, be[:]...)
 }
 // Checksum returns the CRC-32 checksum of data
 // using the polynomial represented by the Table.
 // It is Update applied to a starting checksum of zero.
 func Checksum(data []byte, tab *Table) uint32 {
 	const initial uint32 = 0
 	return Update(initial, tab, data)
 }
 // ChecksumIEEE returns the CRC-32 checksum of data
 // using the IEEE polynomial.
 // It calls updateIEEE directly with a starting checksum of zero.
 func ChecksumIEEE(data []byte) uint32 {
 	const initial uint32 = 0
 	return updateIEEE(initial, data)
 }
--- a/weed/vendor/github.com/klauspost/crc32/crc32_amd64.go
+++ b/weed/vendor/github.com/klauspost/crc32/crc32_amd64.go
@ -0,0 +1,62 @@
 // Copyright 2011 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build !appengine,!gccgo
 package crc32
 // This file contains the code to call the SSE 4.2 version of the Castagnoli
 // and IEEE CRC.
 // haveSSE41/haveSSE42/haveCLMUL are defined in crc32_amd64.s and use
 // CPUID to test for SSE 4.1, 4.2 and CLMUL support.
 func haveSSE41() bool
 func haveSSE42() bool
 func haveCLMUL() bool
 // castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE4.2 CRC32
 // instruction. It takes and returns the checksum in the same form as update.
 //go:noescape
 func castagnoliSSE42(crc uint32, p []byte) uint32
 // ieeeCLMUL is defined in crc32_amd64.s and uses the PCLMULQDQ
 // instruction as well as SSE 4.1.
 // len(p) must be at least 64 and a multiple of 16. updateIEEE inverts the
 // checksum before the call and inverts the result afterwards.
 //go:noescape
 func ieeeCLMUL(crc uint32, p []byte) uint32
 // sse42 records, once at package initialization, whether the SSE 4.2
 // Castagnoli routine may be used.
 var sse42 = haveSSE42()
 // useFastIEEE records whether ieeeCLMUL may be used; it needs both CLMUL and
 // SSE 4.1.
 var useFastIEEE = haveCLMUL() && haveSSE41()
 // updateCastagnoli adds p to a Castagnoli checksum. The SSE 4.2 routine is
 // used whenever the CPU supports it; otherwise inputs of 16 bytes or more go
 // through slicing-by-8 and shorter ones through the byte-wise update.
 func updateCastagnoli(crc uint32, p []byte) uint32 {
 	switch {
 	case sse42:
 		return castagnoliSSE42(crc, p)
 	case len(p) >= 16:
 		return updateSlicingBy8(crc, castagnoliTable8, p)
 	default:
 		return update(crc, castagnoliTable, p)
 	}
 }
 // updateIEEE adds p to an IEEE checksum, picking the fastest usable path:
 // the CLMUL assembly for inputs of at least 64 bytes (when the CPU supports
 // it), slicing-by-8 for inputs of at least 16 bytes, and the byte-wise update
 // for anything shorter.
 func updateIEEE(crc uint32, p []byte) uint32 {
 	n := len(p)
 	if useFastIEEE && n >= 64 {
 		// ieeeCLMUL only accepts a multiple of 16 bytes; the tail (at most 15
 		// bytes) is finished with the table.
 		bulk := n &^ 15
 		crc = ^ieeeCLMUL(^crc, p[:bulk])
 		if bulk == n {
 			return crc
 		}
 		return update(crc, IEEETable, p[bulk:])
 	}
 	if n < 16 {
 		return update(crc, IEEETable, p)
 	}
 	// The slicing-by-8 tables are built on first use.
 	ieeeTable8Once.Do(func() {
 		ieeeTable8 = makeTable8(IEEE)
 	})
 	return updateSlicingBy8(crc, ieeeTable8, p)
 }
--- a/weed/vendor/github.com/klauspost/crc32/crc32_amd64.s
+++ b/weed/vendor/github.com/klauspost/crc32/crc32_amd64.s
@ -0,0 +1,237 @@
 // Copyright 2011 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build gc
 #define NOSPLIT 4
 #define RODATA 8
 // func castagnoliSSE42(crc uint32, p []byte) uint32
 //
 // Adds the bytes of p to crc using the CRC32B/CRC32Q instructions.
 // Registers: AX = running CRC, SI = read pointer, CX = bytes remaining,
 // BX = scratch for the alignment test.
 TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
 	MOVL crc+0(FP), AX    // CRC value
 	MOVQ p+8(FP), SI      // data pointer
 	MOVQ p_len+16(FP), CX // len(p)
 	NOTL AX               // invert on entry; inverted back at done
 	// If there's less than 8 bytes to process, we do it byte-by-byte.
 	CMPQ CX, $8
 	JL   cleanup
 	// Process individual bytes until the input is 8-byte aligned.
 startup:
 	MOVQ SI, BX
 	ANDQ $7, BX           // low three address bits; zero means aligned
 	JZ   aligned
 	CRC32B (SI), AX
 	DECQ   CX
 	INCQ   SI
 	JMP    startup
 aligned:
 	// The input is now 8-byte aligned and we can process 8-byte chunks.
 	CMPQ CX, $8
 	JL   cleanup
 	CRC32Q (SI), AX
 	ADDQ   $8, SI
 	SUBQ   $8, CX
 	JMP    aligned
 cleanup:
 	// We may have some bytes left over that we process one at a time.
 	CMPQ CX, $0
 	JE   done
 	CRC32B (SI), AX
 	INCQ   SI
 	DECQ   CX
 	JMP    cleanup
 done:
 	NOTL AX               // undo the inversion from the top
 	MOVL AX, ret+32(FP)
 	RET
 // func haveSSE42() bool
 //
 // Runs CPUID with AX = 1 and returns bit 20 of CX.
 TEXT ·haveSSE42(SB), NOSPLIT, $0
 	XORQ AX, AX
 	INCL AX             // AX = 1
 	CPUID
 	SHRQ $20, CX        // move bit 20 down to bit 0
 	ANDQ $1, CX         // keep only that bit
 	MOVB CX, ret+0(FP)
 	RET
 // func haveCLMUL() bool
 //
 // Runs CPUID with AX = 1 and returns bit 1 of CX.
 TEXT ·haveCLMUL(SB), NOSPLIT, $0
 	XORQ AX, AX
 	INCL AX             // AX = 1
 	CPUID
 	SHRQ $1, CX         // move bit 1 down to bit 0
 	ANDQ $1, CX         // keep only that bit
 	MOVB CX, ret+0(FP)
 	RET
 // func haveSSE41() bool
 //
 // Runs CPUID with AX = 1 and returns bit 19 of CX.
 TEXT ·haveSSE41(SB), NOSPLIT, $0
 	XORQ AX, AX
 	INCL AX             // AX = 1
 	CPUID
 	SHRQ $19, CX        // move bit 19 down to bit 0
 	ANDQ $1, CX         // keep only that bit
 	MOVB CX, ret+0(FP)
 	RET
 // CRC32 polynomial data
 //
 // These constants are lifted from the
 // Linux kernel, since they avoid the costly
 // PSHUFB 16 byte reversal proposed in the
 // original Intel paper.
 //
 // r2r1kp, r4r3kp and rupolykp are 16 bytes each (two 8-byte halves) and are
 // loaded whole with MOVOA by ieeeCLMUL; r5kp is a single 8-byte value that
 // ieeeCLMUL loads with MOVQ. All four are declared with the RODATA flag.
 DATA r2r1kp<>+0(SB)/8, $0x154442bd4
 DATA r2r1kp<>+8(SB)/8, $0x1c6e41596
 DATA r4r3kp<>+0(SB)/8, $0x1751997d0
 DATA r4r3kp<>+8(SB)/8, $0x0ccaa009e
 DATA rupolykp<>+0(SB)/8, $0x1db710641
 DATA rupolykp<>+8(SB)/8, $0x1f7011641
 DATA r5kp<>+0(SB)/8, $0x163cd6124
 GLOBL r2r1kp<>(SB), RODATA, $16
 GLOBL r4r3kp<>(SB), RODATA, $16
 GLOBL rupolykp<>(SB), RODATA, $16
 GLOBL r5kp<>(SB), RODATA, $8
 // Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
 // len(p) must be at least 64, and must be a multiple of 16.
 // func ieeeCLMUL(crc uint32, p []byte) uint32
 //
 // Registers: SI = read pointer, CX = bytes remaining, X1-X4 = the four
 // 16-byte accumulators, X0 = current folding constant, X5-X8 and X10-X14 =
 // scratch.
 TEXT ·ieeeCLMUL(SB), NOSPLIT, $0
 	MOVL crc+0(FP), X0    // Initial CRC value
 	MOVQ p+8(FP), SI      // data pointer
 	MOVQ p_len+16(FP), CX // len(p)
 	// Load the first 64 bytes and mix the initial CRC into the first 16.
 	MOVOU (SI), X1
 	MOVOU 16(SI), X2
 	MOVOU 32(SI), X3
 	MOVOU 48(SI), X4
 	PXOR  X0, X1
 	ADDQ  $64, SI    // buf+=64
 	SUBQ  $64, CX    // len-=64
 	CMPQ  CX, $64    // Less than 64 bytes left
 	JB    remain64
 	MOVOA r2r1kp<>+0(SB), X0
 	// Main loop: fold each accumulator with the constant in X0 and XOR in
 	// the next 64 bytes of input.
 loopback64:
 	MOVOA X1, X5
 	MOVOA X2, X6
 	MOVOA X3, X7
 	MOVOA X4, X8
 	PCLMULQDQ $0, X0, X1
 	PCLMULQDQ $0, X0, X2
 	PCLMULQDQ $0, X0, X3
 	PCLMULQDQ $0, X0, X4
 	// Load next early
 	MOVOU (SI), X11
 	MOVOU 16(SI), X12
 	MOVOU 32(SI), X13
 	MOVOU 48(SI), X14
 	PCLMULQDQ $0x11, X0, X5
 	PCLMULQDQ $0x11, X0, X6
 	PCLMULQDQ $0x11, X0, X7
 	PCLMULQDQ $0x11, X0, X8
 	PXOR X5, X1
 	PXOR X6, X2
 	PXOR X7, X3
 	PXOR X8, X4
 	PXOR X11, X1
 	PXOR X12, X2
 	PXOR X13, X3
 	PXOR X14, X4
 	// (An "ADDQ $0x40, DI" used to sit here. DI is never loaded or read in
 	// this function, so the instruction had no effect and was dropped.)
 	ADDQ $64, SI    // buf+=64
 	SUBQ $64, CX    // len-=64
 	CMPQ CX, $64    // Less than 64 bytes left?
 	JGE  loopback64
 	// Fold result into a single register (X1)
 remain64:
 	MOVOA r4r3kp<>+0(SB), X0
 	MOVOA     X1, X5
 	PCLMULQDQ $0, X0, X1
 	PCLMULQDQ $0x11, X0, X5
 	PXOR      X5, X1
 	PXOR      X2, X1
 	MOVOA     X1, X5
 	PCLMULQDQ $0, X0, X1
 	PCLMULQDQ $0x11, X0, X5
 	PXOR      X5, X1
 	PXOR      X3, X1
 	MOVOA     X1, X5
 	PCLMULQDQ $0, X0, X1
 	PCLMULQDQ $0x11, X0, X5
 	PXOR      X5, X1
 	PXOR      X4, X1
 	// More than 16 bytes left?
 	CMPQ CX, $16
 	JB   finish
 	// Encode 16 bytes
 remain16:
 	MOVOU     (SI), X10
 	MOVOA     X1, X5
 	PCLMULQDQ $0, X0, X1
 	PCLMULQDQ $0x11, X0, X5
 	PXOR      X5, X1
 	PXOR      X10, X1
 	SUBQ      $16, CX
 	ADDQ      $16, SI
 	CMPQ      CX, $16
 	JGE       remain16
 finish:
 	// Fold final result into 32 bits and return it
 	PCMPEQB   X3, X3          // X3 = all ones (register compared with itself)
 	PCLMULQDQ $1, X1, X0
 	PSRLDQ    $8, X1
 	PXOR      X0, X1
 	MOVOA X1, X2
 	MOVQ  r5kp<>+0(SB), X0
 	// Creates 32 bit mask. Note that we don't care about upper half.
 	PSRLQ $32, X3
 	PSRLDQ    $4, X2
 	PAND      X3, X1
 	PCLMULQDQ $0, X0, X1
 	PXOR      X2, X1
 	MOVOA rupolykp<>+0(SB), X0
 	MOVOA     X1, X2
 	PAND      X3, X1
 	PCLMULQDQ $0x10, X0, X1
 	PAND      X3, X1
 	PCLMULQDQ $0, X0, X1
 	PXOR      X2, X1
 	// PEXTRD   $1, X1, AX  (SSE 4.1)
 	// Emitted as raw bytes rather than by mnemonic.
 	BYTE $0x66; BYTE $0x0f; BYTE $0x3a
 	BYTE $0x16; BYTE $0xc8; BYTE $0x01
 	MOVL AX, ret+32(FP)
 	RET
--- a/weed/vendor/github.com/klauspost/crc32/crc32_amd64p32.go
+++ b/weed/vendor/github.com/klauspost/crc32/crc32_amd64p32.go
@ -0,0 +1,40 @@
 // Copyright 2011 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build !appengine,!gccgo
 package crc32
 // This file contains the code to call the SSE 4.2 version of the Castagnoli
 // CRC.
 // haveSSE42 is defined in crc32_amd64p32.s and uses CPUID to test for SSE 4.2
 // support.
 func haveSSE42() bool
 // castagnoliSSE42 is defined in crc32_amd64p32.s and uses the SSE4.2 CRC32
 // instruction.
 //go:noescape
 func castagnoliSSE42(crc uint32, p []byte) uint32
 // sse42 records, once at package initialization, whether the SSE 4.2
 // Castagnoli routine may be used.
 var sse42 = haveSSE42()
 // updateCastagnoli adds p to a Castagnoli checksum, using the SSE 4.2
 // routine when the CPU supports it and the byte-wise table update otherwise.
 func updateCastagnoli(crc uint32, p []byte) uint32 {
 	if !sse42 {
 		return update(crc, castagnoliTable, p)
 	}
 	return castagnoliSSE42(crc, p)
 }
 func updateIEEE(crc uint32, p []byte) uint32 {
 	// only use slicing-by-8 when input is >= 4KB
 	if len(p) >= 4096 {
 		ieeeTable8Once.Do(func() {
 			ieeeTable8 = makeTable8(IEEE)
 		})
 		return updateSlicingBy8(crc, ieeeTable8, p)
 	}
 	return update(crc, IEEETable, p)
 }
--- a/weed/vendor/github.com/klauspost/crc32/crc32_amd64p32.s
+++ b/weed/vendor/github.com/klauspost/crc32/crc32_amd64p32.s
@ -0,0 +1,67 @@
 // Copyright 2011 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build gc
 #define NOSPLIT 4
 #define RODATA 8
 // func castagnoliSSE42(crc uint32, p []byte) uint32
 //
 // Adds the bytes of p to crc using the CRC32B/CRC32Q instructions.
 // Registers: AX = running CRC, SI = read pointer, CX = bytes remaining,
 // BX = scratch for the alignment test. The arguments are read with 32-bit
 // loads from 4-byte-spaced frame slots.
 TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
 	MOVL crc+0(FP), AX   // CRC value
 	MOVL p+4(FP), SI     // data pointer
 	MOVL p_len+8(FP), CX // len(p)
 	NOTL AX              // invert on entry; inverted back at done
 	// If there's less than 8 bytes to process, we do it byte-by-byte.
 	CMPQ CX, $8
 	JL   cleanup
 	// Process individual bytes until the input is 8-byte aligned.
 startup:
 	MOVQ SI, BX
 	ANDQ $7, BX          // low three address bits; zero means aligned
 	JZ   aligned
 	CRC32B (SI), AX
 	DECQ   CX
 	INCQ   SI
 	JMP    startup
 aligned:
 	// The input is now 8-byte aligned and we can process 8-byte chunks.
 	CMPQ CX, $8
 	JL   cleanup
 	CRC32Q (SI), AX
 	ADDQ   $8, SI
 	SUBQ   $8, CX
 	JMP    aligned
 cleanup:
 	// We may have some bytes left over that we process one at a time.
 	CMPQ CX, $0
 	JE   done
 	CRC32B (SI), AX
 	INCQ   SI
 	DECQ   CX
 	JMP    cleanup
 done:
 	NOTL AX              // undo the inversion from the top
 	MOVL AX, ret+16(FP)
 	RET
 // func haveSSE42() bool
 //
 // Runs CPUID with AX = 1 and returns bit 20 of CX.
 TEXT ·haveSSE42(SB), NOSPLIT, $0
 	XORQ AX, AX
 	INCL AX             // AX = 1
 	CPUID
 	SHRQ $20, CX        // move bit 20 down to bit 0
 	ANDQ $1, CX         // keep only that bit
 	MOVB CX, ret+0(FP)
 	RET
--- a/weed/vendor/github.com/klauspost/crc32/crc32_generic.go
+++ b/weed/vendor/github.com/klauspost/crc32/crc32_generic.go
@ -0,0 +1,29 @@
 // Copyright 2011 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build !amd64,!amd64p32 appengine gccgo
 package crc32
 // This file contains the generic version of updateCastagnoli which does
 // slicing-by-8, or uses the fallback for very small sizes.
 func updateCastagnoli(crc uint32, p []byte) uint32 {
 	// only use slicing-by-8 when input is >= 16 Bytes
 	if len(p) >= 16 {
 		return updateSlicingBy8(crc, castagnoliTable8, p)
 	}
 	return update(crc, castagnoliTable, p)
 }
 func updateIEEE(crc uint32, p []byte) uint32 {
 	// only use slicing-by-8 when input is >= 16 Bytes
 	if len(p) >= 16 {
 		ieeeTable8Once.Do(func() {
 			ieeeTable8 = makeTable8(IEEE)
 		})
 		return updateSlicingBy8(crc, ieeeTable8, p)
 	}
 	return update(crc, IEEETable, p)
 }