update deps

10 years ago · 7aae3661ee
12 changed files with 775 additions and 4 deletions
--- a/5
+++ b/5
@ -6,15 +6,12 @@ SOURCE_DIR = ./weed

 all: build

-.PHONY : clean deps build linux vet
+.PHONY : clean godep build linux vet

 clean:
 	go clean -i $(GO_FLAGS) $(SOURCE_DIR)
 	rm -f $(BINARY)

-deps:
-	go get $(GO_FLAGS) -d $(SOURCE_DIR)
-
 fmt:
 	gofmt -w -s $(SOURCE_DIR)

--- a/weed/Godeps/Godeps.json
+++ b/weed/Godeps/Godeps.json
@ -83,6 +83,11 @@
 			"ImportPath": "github.com/hailocab/go-hostpool",
 			"Rev": "0637eae892be221164aff5fcbccc57171aea6406"
 		},
+		{
+			"ImportPath": "github.com/klauspost/crc32",
+			"Comment": "v1.0",
+			"Rev": "19b0b332c9e4516a6370a0456e6182c3b5036720"
+		},
 		{
 			"ImportPath": "github.com/pierrec/lz4",
 			"Rev": "0b67ae4bb1ab03691079e38dddbc3909d68de64f"
--- a/weed/vendor/github.com/klauspost/crc32/.gitignore
+++ b/weed/vendor/github.com/klauspost/crc32/.gitignore
@ -0,0 +1,24 @@
+# Compiled Object files, Static and Dynamic libs (Shared Objects)
+*.o
+*.a
+*.so
+
+# Folders
+_obj
+_test
+
+# Architecture specific extensions/prefixes
+*.[568vq]
+[568vq].out
+
+*.cgo1.go
+*.cgo2.c
+_cgo_defun.c
+_cgo_gotypes.go
+_cgo_export.*
+
+_testmain.go
+
+*.exe
+*.test
+*.prof
--- a/weed/vendor/github.com/klauspost/crc32/.travis.yml
+++ b/weed/vendor/github.com/klauspost/crc32/.travis.yml
@ -0,0 +1,12 @@
+language: go
+
+go:
+  - 1.3
+  - 1.4
+  - 1.5
+  - 1.6
+  - tip
+
+script: 
+ - go test -v .
+ - go test -v -race .
--- a/weed/vendor/github.com/klauspost/crc32/LICENSE
+++ b/weed/vendor/github.com/klauspost/crc32/LICENSE
@ -0,0 +1,28 @@
+Copyright (c) 2012 The Go Authors. All rights reserved.
+Copyright (c) 2015 Klaus Post
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/weed/vendor/github.com/klauspost/crc32/README.md
+++ b/weed/vendor/github.com/klauspost/crc32/README.md
@ -0,0 +1,84 @@
+# crc32
+CRC32 hash with x64 optimizations
+
+This package is a drop-in replacement for the standard library `hash/crc32` package, that features SSE 4.2 optimizations on x64 platforms, for a 10x speedup.
+
+[![Build Status](https://travis-ci.org/klauspost/crc32.svg?branch=master)](https://travis-ci.org/klauspost/crc32)
+
+# usage
+
+Install using `go get github.com/klauspost/crc32`. This library is based on Go 1.5 code and requires Go 1.3 or newer.
+
+Replace `import "hash/crc32"` with `import "github.com/klauspost/crc32"` and you are good to go.
+
+# changes
+
+* Dec 4, 2015: Uses the "slice-by-8" trick more extensively, which gives a 1.5 to 2.5x speedup if assembler is unavailable.
+
+
+# performance
+
+For IEEE tables (the most common), there is approximately a factor 10 speedup with "CLMUL" (Carryless multiplication) instruction:
+```
+benchmark            old ns/op     new ns/op     delta
+BenchmarkCrc32KB     99955         10258         -89.74%
+
+benchmark            old MB/s     new MB/s     speedup
+BenchmarkCrc32KB     327.83       3194.20      9.74x
+```
+
+For other tables and "CLMUL"  capable machines the performance is the same as the standard library.
+
+Here are some detailed benchmarks, comparing to go 1.5 standard library with and without assembler enabled.
+
+```
+Std:   Standard Go 1.5 library
+Crc:   Indicates IEEE type CRC.
+40B:   Size of each slice encoded.
+NoAsm: Assembler was disabled (ie. not an AMD64 or SSE 4.2+ capable machine).
+Castagnoli: Castagnoli CRC type.
+
+BenchmarkStdCrc40B-4            10000000               158 ns/op         252.88 MB/s
+BenchmarkCrc40BNoAsm-4          20000000               105 ns/op         377.38 MB/s (slice8)
+BenchmarkCrc40B-4               20000000               105 ns/op         378.77 MB/s (slice8)
+
+BenchmarkStdCrc1KB-4              500000              3604 ns/op         284.10 MB/s
+BenchmarkCrc1KBNoAsm-4           1000000              1463 ns/op         699.79 MB/s (slice8)
+BenchmarkCrc1KB-4                3000000               396 ns/op        2583.69 MB/s (asm)
+
+BenchmarkStdCrc8KB-4              200000             11417 ns/op         717.48 MB/s (slice8)
+BenchmarkCrc8KBNoAsm-4            200000             11317 ns/op         723.85 MB/s (slice8)
+BenchmarkCrc8KB-4                 500000              2919 ns/op        2805.73 MB/s (asm)
+
+BenchmarkStdCrc32KB-4              30000             45749 ns/op         716.24 MB/s (slice8)
+BenchmarkCrc32KBNoAsm-4            30000             45109 ns/op         726.42 MB/s (slice8)
+BenchmarkCrc32KB-4                100000             11497 ns/op        2850.09 MB/s (asm)
+
+BenchmarkStdNoAsmCastagnol40B-4 10000000               161 ns/op         246.94 MB/s
+BenchmarkStdCastagnoli40B-4     50000000              28.4 ns/op        1410.69 MB/s (asm)
+BenchmarkCastagnoli40BNoAsm-4   20000000               100 ns/op         398.01 MB/s (slice8)
+BenchmarkCastagnoli40B-4        50000000              28.2 ns/op        1419.54 MB/s (asm)
+
+BenchmarkStdNoAsmCastagnoli1KB-4  500000              3622 ns/op        282.67 MB/s
+BenchmarkStdCastagnoli1KB-4     10000000               144 ns/op        7099.78 MB/s (asm)
+BenchmarkCastagnoli1KBNoAsm-4    1000000              1475 ns/op         694.14 MB/s (slice8)
+BenchmarkCastagnoli1KB-4        10000000               146 ns/op        6993.35 MB/s (asm)
+
+BenchmarkStdNoAsmCastagnoli8KB-4  50000              28781 ns/op         284.63 MB/s
+BenchmarkStdCastagnoli8KB-4      1000000              1029 ns/op        7957.89 MB/s (asm)
+BenchmarkCastagnoli8KBNoAsm-4     200000             11410 ns/op         717.94 MB/s (slice8)
+BenchmarkCastagnoli8KB-4         1000000              1000 ns/op        8188.71 MB/s (asm)
+
+BenchmarkStdNoAsmCastagnoli32KB-4  10000            115426 ns/op         283.89 MB/s
+BenchmarkStdCastagnoli32KB-4      300000              4065 ns/op        8059.13 MB/s (asm)
+BenchmarkCastagnoli32KBNoAsm-4     30000             45171 ns/op         725.41 MB/s (slice8)
+BenchmarkCastagnoli32KB-4         500000              4077 ns/op        8035.89 MB/s (asm)
+```
+
+The IEEE assembler optimizations has been submitted and will be part of the Go 1.6 standard library.
+
+However, the improved use of slice-by-8 has not, but will probably be submitted for Go 1.7.
+
+# license
+
+Standard Go license. Changes are Copyright (c) 2015 Klaus Post under same conditions.
--- a/weed/vendor/github.com/klauspost/crc32/crc32.go
+++ b/weed/vendor/github.com/klauspost/crc32/crc32.go
@ -0,0 +1,186 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package crc32 implements the 32-bit cyclic redundancy check, or CRC-32,
+// checksum. See http://en.wikipedia.org/wiki/Cyclic_redundancy_check for
+// information.
+//
+// Polynomials are represented in LSB-first form also known as reversed representation.
+//
+// See http://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks#Reversed_representations_and_reciprocal_polynomials
+// for information.
+package crc32
+
+import (
+	"hash"
+	"sync"
+)
+
+// The size of a CRC-32 checksum in bytes.
+const Size = 4
+
+// Predefined polynomials.
+const (
+	// IEEE is by far and away the most common CRC-32 polynomial.
+	// Used by ethernet (IEEE 802.3), v.42, fddi, gzip, zip, png, ...
+	IEEE = 0xedb88320
+
+	// Castagnoli's polynomial, used in iSCSI.
+	// Has better error detection characteristics than IEEE.
+	// http://dx.doi.org/10.1109/26.231911
+	Castagnoli = 0x82f63b78
+
+	// Koopman's polynomial.
+	// Also has better error detection characteristics than IEEE.
+	// http://dx.doi.org/10.1109/DSN.2002.1028931
+	Koopman = 0xeb31d82e
+)
+
+// Table is a 256-word table representing the polynomial for efficient processing.
+type Table [256]uint32
+
+// castagnoliTable points to a lazily initialized Table for the Castagnoli
+// polynomial. MakeTable will always return this value when asked to make a
+// Castagnoli table so we can compare against it to find when the caller is
+// using this polynomial.
+var castagnoliTable *Table
+var castagnoliTable8 *slicing8Table
+var castagnoliOnce sync.Once
+
+func castagnoliInit() {
+	castagnoliTable = makeTable(Castagnoli)
+	castagnoliTable8 = makeTable8(Castagnoli)
+}
+
+// IEEETable is the table for the IEEE polynomial.
+var IEEETable = makeTable(IEEE)
+
+// slicing8Table is array of 8 Tables
+type slicing8Table [8]Table
+
+// ieeeTable8 is the slicing8Table for IEEE
+var ieeeTable8 *slicing8Table
+var ieeeTable8Once sync.Once
+
+// MakeTable returns a Table constructed from the specified polynomial.
+// The contents of this Table must not be modified.
+func MakeTable(poly uint32) *Table {
+	switch poly {
+	case IEEE:
+		return IEEETable
+	case Castagnoli:
+		castagnoliOnce.Do(castagnoliInit)
+		return castagnoliTable
+	}
+	return makeTable(poly)
+}
+
+// makeTable returns the Table constructed from the specified polynomial.
+func makeTable(poly uint32) *Table {
+	t := new(Table)
+	for i := 0; i < 256; i++ {
+		crc := uint32(i)
+		for j := 0; j < 8; j++ {
+			if crc&1 == 1 {
+				crc = (crc >> 1) ^ poly
+			} else {
+				crc >>= 1
+			}
+		}
+		t[i] = crc
+	}
+	return t
+}
+
+// makeTable8 returns slicing8Table constructed from the specified polynomial.
+func makeTable8(poly uint32) *slicing8Table {
+	t := new(slicing8Table)
+	t[0] = *makeTable(poly)
+	for i := 0; i < 256; i++ {
+		crc := t[0][i]
+		for j := 1; j < 8; j++ {
+			crc = t[0][crc&0xFF] ^ (crc >> 8)
+			t[j][i] = crc
+		}
+	}
+	return t
+}
+
+// digest represents the partial evaluation of a checksum.
+type digest struct {
+	crc uint32
+	tab *Table
+}
+
+// New creates a new hash.Hash32 computing the CRC-32 checksum
+// using the polynomial represented by the Table.
+// Its Sum method will lay the value out in big-endian byte order.
+func New(tab *Table) hash.Hash32 { return &digest{0, tab} }
+
+// NewIEEE creates a new hash.Hash32 computing the CRC-32 checksum
+// using the IEEE polynomial.
+// Its Sum method will lay the value out in big-endian byte order.
+func NewIEEE() hash.Hash32 { return New(IEEETable) }
+
+func (d *digest) Size() int { return Size }
+
+func (d *digest) BlockSize() int { return 1 }
+
+func (d *digest) Reset() { d.crc = 0 }
+
+func update(crc uint32, tab *Table, p []byte) uint32 {
+	crc = ^crc
+	for _, v := range p {
+		crc = tab[byte(crc)^v] ^ (crc >> 8)
+	}
+	return ^crc
+}
+
+// updateSlicingBy8 updates CRC using Slicing-by-8
+func updateSlicingBy8(crc uint32, tab *slicing8Table, p []byte) uint32 {
+	crc = ^crc
+	for len(p) > 8 {
+		crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
+		crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^
+			tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^
+			tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF]
+		p = p[8:]
+	}
+	crc = ^crc
+	if len(p) == 0 {
+		return crc
+	}
+	return update(crc, &tab[0], p)
+}
+
+// Update returns the result of adding the bytes in p to the crc.
+func Update(crc uint32, tab *Table, p []byte) uint32 {
+	if tab == castagnoliTable {
+		return updateCastagnoli(crc, p)
+	}
+	if tab == IEEETable {
+		return updateIEEE(crc, p)
+	}
+	return update(crc, tab, p)
+}
+
+func (d *digest) Write(p []byte) (n int, err error) {
+	d.crc = Update(d.crc, d.tab, p)
+	return len(p), nil
+}
+
+func (d *digest) Sum32() uint32 { return d.crc }
+
+func (d *digest) Sum(in []byte) []byte {
+	s := d.Sum32()
+	return append(in, byte(s>>24), byte(s>>16), byte(s>>8), byte(s))
+}
+
+// Checksum returns the CRC-32 checksum of data
+// using the polynomial represented by the Table.
+func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) }
+
+// ChecksumIEEE returns the CRC-32 checksum of data
+// using the IEEE polynomial.
+func ChecksumIEEE(data []byte) uint32 { return updateIEEE(0, data) }
--- a/weed/vendor/github.com/klauspost/crc32/crc32_amd64.go
+++ b/weed/vendor/github.com/klauspost/crc32/crc32_amd64.go
@ -0,0 +1,62 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine,!gccgo
+
+package crc32
+
+// This file contains the code to call the SSE 4.2 version of the Castagnoli
+// and IEEE CRC.
+
+// haveSSE41/haveSSE42/haveCLMUL are defined in crc_amd64.s and use
+// CPUID to test for SSE 4.1, 4.2 and CLMUL support.
+func haveSSE41() bool
+func haveSSE42() bool
+func haveCLMUL() bool
+
+// castagnoliSSE42 is defined in crc_amd64.s and uses the SSE4.2 CRC32
+// instruction.
+//go:noescape
+func castagnoliSSE42(crc uint32, p []byte) uint32
+
+// ieeeCLMUL is defined in crc_amd64.s and uses the PCLMULQDQ
+// instruction as well as SSE 4.1.
+//go:noescape
+func ieeeCLMUL(crc uint32, p []byte) uint32
+
+var sse42 = haveSSE42()
+var useFastIEEE = haveCLMUL() && haveSSE41()
+
+func updateCastagnoli(crc uint32, p []byte) uint32 {
+	if sse42 {
+		return castagnoliSSE42(crc, p)
+	}
+	// only use slicing-by-8 when input is >= 16 Bytes
+	if len(p) >= 16 {
+		return updateSlicingBy8(crc, castagnoliTable8, p)
+	}
+	return update(crc, castagnoliTable, p)
+}
+
+func updateIEEE(crc uint32, p []byte) uint32 {
+	if useFastIEEE && len(p) >= 64 {
+		left := len(p) & 15
+		do := len(p) - left
+		crc = ^ieeeCLMUL(^crc, p[:do])
+		if left > 0 {
+			crc = update(crc, IEEETable, p[do:])
+		}
+		return crc
+	}
+
+	// only use slicing-by-8 when input is >= 16 Bytes
+	if len(p) >= 16 {
+		ieeeTable8Once.Do(func() {
+			ieeeTable8 = makeTable8(IEEE)
+		})
+		return updateSlicingBy8(crc, ieeeTable8, p)
+	}
+
+	return update(crc, IEEETable, p)
+}
--- a/weed/vendor/github.com/klauspost/crc32/crc32_amd64.s
+++ b/weed/vendor/github.com/klauspost/crc32/crc32_amd64.s
@ -0,0 +1,237 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build gc
+
+#define NOSPLIT 4
+#define RODATA 8
+
+// func castagnoliSSE42(crc uint32, p []byte) uint32
+TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
+	MOVL crc+0(FP), AX    // CRC value
+	MOVQ p+8(FP), SI      // data pointer
+	MOVQ p_len+16(FP), CX // len(p)
+
+	NOTL AX
+
+	// If there's less than 8 bytes to process, we do it byte-by-byte.
+	CMPQ CX, $8
+	JL   cleanup
+
+	// Process individual bytes until the input is 8-byte aligned.
+startup:
+	MOVQ SI, BX
+	ANDQ $7, BX
+	JZ   aligned
+
+	CRC32B (SI), AX
+	DECQ   CX
+	INCQ   SI
+	JMP    startup
+
+aligned:
+	// The input is now 8-byte aligned and we can process 8-byte chunks.
+	CMPQ CX, $8
+	JL   cleanup
+
+	CRC32Q (SI), AX
+	ADDQ   $8, SI
+	SUBQ   $8, CX
+	JMP    aligned
+
+cleanup:
+	// We may have some bytes left over that we process one at a time.
+	CMPQ CX, $0
+	JE   done
+
+	CRC32B (SI), AX
+	INCQ   SI
+	DECQ   CX
+	JMP    cleanup
+
+done:
+	NOTL AX
+	MOVL AX, ret+32(FP)
+	RET
+
+// func haveSSE42() bool
+TEXT ·haveSSE42(SB), NOSPLIT, $0
+	XORQ AX, AX
+	INCL AX
+	CPUID
+	SHRQ $20, CX
+	ANDQ $1, CX
+	MOVB CX, ret+0(FP)
+	RET
+
+// func haveCLMUL() bool
+TEXT ·haveCLMUL(SB), NOSPLIT, $0
+	XORQ AX, AX
+	INCL AX
+	CPUID
+	SHRQ $1, CX
+	ANDQ $1, CX
+	MOVB CX, ret+0(FP)
+	RET
+
+// func haveSSE41() bool
+TEXT ·haveSSE41(SB), NOSPLIT, $0
+	XORQ AX, AX
+	INCL AX
+	CPUID
+	SHRQ $19, CX
+	ANDQ $1, CX
+	MOVB CX, ret+0(FP)
+	RET
+
+// CRC32 polynomial data
+//
+// These constants are lifted from the
+// Linux kernel, since they avoid the costly
+// PSHUFB 16 byte reversal proposed in the
+// original Intel paper.
+DATA r2r1kp<>+0(SB)/8, $0x154442bd4
+DATA r2r1kp<>+8(SB)/8, $0x1c6e41596
+DATA r4r3kp<>+0(SB)/8, $0x1751997d0
+DATA r4r3kp<>+8(SB)/8, $0x0ccaa009e
+DATA rupolykp<>+0(SB)/8, $0x1db710641
+DATA rupolykp<>+8(SB)/8, $0x1f7011641
+DATA r5kp<>+0(SB)/8, $0x163cd6124
+
+GLOBL r2r1kp<>(SB), RODATA, $16
+GLOBL r4r3kp<>(SB), RODATA, $16
+GLOBL rupolykp<>(SB), RODATA, $16
+GLOBL r5kp<>(SB), RODATA, $8
+
+// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+// len(p) must be at least 64, and must be a multiple of 16.
+
+// func ieeeCLMUL(crc uint32, p []byte) uint32
+TEXT ·ieeeCLMUL(SB), NOSPLIT, $0
+	MOVL crc+0(FP), X0    // Initial CRC value
+	MOVQ p+8(FP), SI      // data pointer
+	MOVQ p_len+16(FP), CX // len(p)
+
+	MOVOU (SI), X1
+	MOVOU 16(SI), X2
+	MOVOU 32(SI), X3
+	MOVOU 48(SI), X4
+	PXOR  X0, X1
+	ADDQ  $64, SI    // buf+=64
+	SUBQ  $64, CX    // len-=64
+	CMPQ  CX, $64    // Less than 64 bytes left
+	JB    remain64
+
+	MOVOA r2r1kp<>+0(SB), X0
+
+loopback64:
+	MOVOA X1, X5
+	MOVOA X2, X6
+	MOVOA X3, X7
+	MOVOA X4, X8
+
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0, X0, X2
+	PCLMULQDQ $0, X0, X3
+	PCLMULQDQ $0, X0, X4
+
+	// Load next early
+	MOVOU (SI), X11
+	MOVOU 16(SI), X12
+	MOVOU 32(SI), X13
+	MOVOU 48(SI), X14
+
+	PCLMULQDQ $0x11, X0, X5
+	PCLMULQDQ $0x11, X0, X6
+	PCLMULQDQ $0x11, X0, X7
+	PCLMULQDQ $0x11, X0, X8
+
+	PXOR X5, X1
+	PXOR X6, X2
+	PXOR X7, X3
+	PXOR X8, X4
+
+	PXOR X11, X1
+	PXOR X12, X2
+	PXOR X13, X3
+	PXOR X14, X4
+
+	ADDQ $0x40, DI
+	ADDQ $64, SI    // buf+=64
+	SUBQ $64, CX    // len-=64
+	CMPQ CX, $64    // Less than 64 bytes left?
+	JGE  loopback64
+
+	// Fold result into a single register (X1)
+remain64:
+	MOVOA r4r3kp<>+0(SB), X0
+
+	MOVOA     X1, X5
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0x11, X0, X5
+	PXOR      X5, X1
+	PXOR      X2, X1
+
+	MOVOA     X1, X5
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0x11, X0, X5
+	PXOR      X5, X1
+	PXOR      X3, X1
+
+	MOVOA     X1, X5
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0x11, X0, X5
+	PXOR      X5, X1
+	PXOR      X4, X1
+
+	// More than 16 bytes left?
+	CMPQ CX, $16
+	JB   finish
+
+	// Encode 16 bytes
+remain16:
+	MOVOU     (SI), X10
+	MOVOA     X1, X5
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0x11, X0, X5
+	PXOR      X5, X1
+	PXOR      X10, X1
+	SUBQ      $16, CX
+	ADDQ      $16, SI
+	CMPQ      CX, $16
+	JGE       remain16
+
+finish:
+	// Fold final result into 32 bits and return it
+	PCMPEQB   X3, X3
+	PCLMULQDQ $1, X1, X0
+	PSRLDQ    $8, X1
+	PXOR      X0, X1
+
+	MOVOA X1, X2
+	MOVQ  r5kp<>+0(SB), X0
+
+	// Creates 32 bit mask. Note that we don't care about upper half.
+	PSRLQ $32, X3
+
+	PSRLDQ    $4, X2
+	PAND      X3, X1
+	PCLMULQDQ $0, X0, X1
+	PXOR      X2, X1
+
+	MOVOA rupolykp<>+0(SB), X0
+
+	MOVOA     X1, X2
+	PAND      X3, X1
+	PCLMULQDQ $0x10, X0, X1
+	PAND      X3, X1
+	PCLMULQDQ $0, X0, X1
+	PXOR      X2, X1
+
+	// PEXTRD   $1, X1, AX  (SSE 4.1)
+	BYTE $0x66; BYTE $0x0f; BYTE $0x3a
+	BYTE $0x16; BYTE $0xc8; BYTE $0x01
+	MOVL AX, ret+32(FP)
+
+	RET
--- a/weed/vendor/github.com/klauspost/crc32/crc32_amd64p32.go
+++ b/weed/vendor/github.com/klauspost/crc32/crc32_amd64p32.go
@ -0,0 +1,40 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine,!gccgo
+
+package crc32
+
+// This file contains the code to call the SSE 4.2 version of the Castagnoli
+// CRC.
+
+// haveSSE42 is defined in crc_amd64p32.s and uses CPUID to test for SSE 4.2
+// support.
+func haveSSE42() bool
+
+// castagnoliSSE42 is defined in crc_amd64.s and uses the SSE4.2 CRC32
+// instruction.
+//go:noescape
+func castagnoliSSE42(crc uint32, p []byte) uint32
+
+var sse42 = haveSSE42()
+
+func updateCastagnoli(crc uint32, p []byte) uint32 {
+	if sse42 {
+		return castagnoliSSE42(crc, p)
+	}
+	return update(crc, castagnoliTable, p)
+}
+
+func updateIEEE(crc uint32, p []byte) uint32 {
+	// only use slicing-by-8 when input is >= 4KB
+	if len(p) >= 4096 {
+		ieeeTable8Once.Do(func() {
+			ieeeTable8 = makeTable8(IEEE)
+		})
+		return updateSlicingBy8(crc, ieeeTable8, p)
+	}
+
+	return update(crc, IEEETable, p)
+}
--- a/weed/vendor/github.com/klauspost/crc32/crc32_amd64p32.s
+++ b/weed/vendor/github.com/klauspost/crc32/crc32_amd64p32.s
@ -0,0 +1,67 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build gc
+
+#define NOSPLIT 4
+#define RODATA 8
+
+// func castagnoliSSE42(crc uint32, p []byte) uint32
+TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
+	MOVL crc+0(FP), AX   // CRC value
+	MOVL p+4(FP), SI     // data pointer
+	MOVL p_len+8(FP), CX // len(p)
+
+	NOTL AX
+
+	// If there's less than 8 bytes to process, we do it byte-by-byte.
+	CMPQ CX, $8
+	JL   cleanup
+
+	// Process individual bytes until the input is 8-byte aligned.
+startup:
+	MOVQ SI, BX
+	ANDQ $7, BX
+	JZ   aligned
+
+	CRC32B (SI), AX
+	DECQ   CX
+	INCQ   SI
+	JMP    startup
+
+aligned:
+	// The input is now 8-byte aligned and we can process 8-byte chunks.
+	CMPQ CX, $8
+	JL   cleanup
+
+	CRC32Q (SI), AX
+	ADDQ   $8, SI
+	SUBQ   $8, CX
+	JMP    aligned
+
+cleanup:
+	// We may have some bytes left over that we process one at a time.
+	CMPQ CX, $0
+	JE   done
+
+	CRC32B (SI), AX
+	INCQ   SI
+	DECQ   CX
+	JMP    cleanup
+
+done:
+	NOTL AX
+	MOVL AX, ret+16(FP)
+	RET
+
+// func haveSSE42() bool
+TEXT ·haveSSE42(SB), NOSPLIT, $0
+	XORQ AX, AX
+	INCL AX
+	CPUID
+	SHRQ $20, CX
+	ANDQ $1, CX
+	MOVB CX, ret+0(FP)
+	RET
+
--- a/weed/vendor/github.com/klauspost/crc32/crc32_generic.go
+++ b/weed/vendor/github.com/klauspost/crc32/crc32_generic.go
@ -0,0 +1,29 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64,!amd64p32 appengine gccgo
+
+package crc32
+
+// This file contains the generic version of updateCastagnoli which does
+// slicing-by-8, or uses the fallback for very small sizes.
+
+func updateCastagnoli(crc uint32, p []byte) uint32 {
+	// only use slicing-by-8 when input is >= 16 Bytes
+	if len(p) >= 16 {
+		return updateSlicingBy8(crc, castagnoliTable8, p)
+	}
+	return update(crc, castagnoliTable, p)
+}
+
+func updateIEEE(crc uint32, p []byte) uint32 {
+	// only use slicing-by-8 when input is >= 16 Bytes
+	if len(p) >= 16 {
+		ieeeTable8Once.Do(func() {
+			ieeeTable8 = makeTable8(IEEE)
+		})
+		return updateSlicingBy8(crc, ieeeTable8, p)
+	}
+	return update(crc, IEEETable, p)
+}