From 7aae3661eedb02dc0cf2b6e97bc8da666ab6fdf7 Mon Sep 17 00:00:00 2001 From: tnextday Date: Mon, 4 Apr 2016 16:28:14 +0800 Subject: [PATCH] update deps --- Makefile | 5 +- weed/Godeps/Godeps.json | 5 + .../github.com/klauspost/crc32/.gitignore | 24 ++ .../github.com/klauspost/crc32/.travis.yml | 12 + .../vendor/github.com/klauspost/crc32/LICENSE | 28 +++ .../github.com/klauspost/crc32/README.md | 84 +++++++ .../github.com/klauspost/crc32/crc32.go | 186 ++++++++++++++ .../github.com/klauspost/crc32/crc32_amd64.go | 62 +++++ .../github.com/klauspost/crc32/crc32_amd64.s | 237 ++++++++++++++++++ .../klauspost/crc32/crc32_amd64p32.go | 40 +++ .../klauspost/crc32/crc32_amd64p32.s | 67 +++++ .../klauspost/crc32/crc32_generic.go | 29 +++ 12 files changed, 775 insertions(+), 4 deletions(-) create mode 100644 weed/vendor/github.com/klauspost/crc32/.gitignore create mode 100644 weed/vendor/github.com/klauspost/crc32/.travis.yml create mode 100644 weed/vendor/github.com/klauspost/crc32/LICENSE create mode 100644 weed/vendor/github.com/klauspost/crc32/README.md create mode 100644 weed/vendor/github.com/klauspost/crc32/crc32.go create mode 100644 weed/vendor/github.com/klauspost/crc32/crc32_amd64.go create mode 100644 weed/vendor/github.com/klauspost/crc32/crc32_amd64.s create mode 100644 weed/vendor/github.com/klauspost/crc32/crc32_amd64p32.go create mode 100644 weed/vendor/github.com/klauspost/crc32/crc32_amd64p32.s create mode 100644 weed/vendor/github.com/klauspost/crc32/crc32_generic.go diff --git a/Makefile b/Makefile index 5e40c7b31..f80bc071b 100644 --- a/Makefile +++ b/Makefile @@ -6,15 +6,12 @@ SOURCE_DIR = ./weed all: build -.PHONY : clean deps build linux vet +.PHONY : clean godep build linux vet clean: go clean -i $(GO_FLAGS) $(SOURCE_DIR) rm -f $(BINARY) -deps: - go get $(GO_FLAGS) -d $(SOURCE_DIR) - fmt: gofmt -w -s $(SOURCE_DIR) diff --git a/weed/Godeps/Godeps.json b/weed/Godeps/Godeps.json index e3872f2ca..6448800a3 100644 --- a/weed/Godeps/Godeps.json +++ b/weed/Godeps/Godeps.json @@ -83,6 +83,11 @@ "ImportPath": "github.com/hailocab/go-hostpool", "Rev": "0637eae892be221164aff5fcbccc57171aea6406" }, + { + "ImportPath": "github.com/klauspost/crc32", + "Comment": "v1.0", + "Rev": "19b0b332c9e4516a6370a0456e6182c3b5036720" + }, { "ImportPath": "github.com/pierrec/lz4", "Rev": "0b67ae4bb1ab03691079e38dddbc3909d68de64f" diff --git a/weed/vendor/github.com/klauspost/crc32/.gitignore b/weed/vendor/github.com/klauspost/crc32/.gitignore new file mode 100644 index 000000000..daf913b1b --- /dev/null +++ b/weed/vendor/github.com/klauspost/crc32/.gitignore @@ -0,0 +1,24 @@ +# Compiled Object files, Static and Dynamic libs (Shared Objects) +*.o +*.a +*.so + +# Folders +_obj +_test + +# Architecture specific extensions/prefixes +*.[568vq] +[568vq].out + +*.cgo1.go +*.cgo2.c +_cgo_defun.c +_cgo_gotypes.go +_cgo_export.* + +_testmain.go + +*.exe +*.test +*.prof diff --git a/weed/vendor/github.com/klauspost/crc32/.travis.yml b/weed/vendor/github.com/klauspost/crc32/.travis.yml new file mode 100644 index 000000000..977179953 --- /dev/null +++ b/weed/vendor/github.com/klauspost/crc32/.travis.yml @@ -0,0 +1,12 @@ +language: go + +go: + - 1.3 + - 1.4 + - 1.5 + - 1.6 + - tip + +script: + - go test -v . + - go test -v -race . diff --git a/weed/vendor/github.com/klauspost/crc32/LICENSE b/weed/vendor/github.com/klauspost/crc32/LICENSE new file mode 100644 index 000000000..4fd5963e3 --- /dev/null +++ b/weed/vendor/github.com/klauspost/crc32/LICENSE @@ -0,0 +1,28 @@ +Copyright (c) 2012 The Go Authors. All rights reserved. +Copyright (c) 2015 Klaus Post + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/weed/vendor/github.com/klauspost/crc32/README.md b/weed/vendor/github.com/klauspost/crc32/README.md new file mode 100644 index 000000000..440541c7f --- /dev/null +++ b/weed/vendor/github.com/klauspost/crc32/README.md @@ -0,0 +1,84 @@ +# crc32 +CRC32 hash with x64 optimizations + +This package is a drop-in replacement for the standard library `hash/crc32` package, that features SSE 4.2 optimizations on x64 platforms, for a 10x speedup. + +[![Build Status](https://travis-ci.org/klauspost/crc32.svg?branch=master)](https://travis-ci.org/klauspost/crc32) + +# usage + +Install using `go get github.com/klauspost/crc32`. This library is based on Go 1.5 code and requires Go 1.3 or newer. + +Replace `import "hash/crc32"` with `import "github.com/klauspost/crc32"` and you are good to go. + +# changes + +* Dec 4, 2015: Uses the "slice-by-8" trick more extensively, which gives a 1.5 to 2.5x speedup if assembler is unavailable. + + +# performance + +For IEEE tables (the most common), there is approximately a factor 10 speedup with "CLMUL" (Carryless multiplication) instruction: +``` +benchmark old ns/op new ns/op delta +BenchmarkCrc32KB 99955 10258 -89.74% + +benchmark old MB/s new MB/s speedup +BenchmarkCrc32KB 327.83 3194.20 9.74x +``` + +For other tables and "CLMUL" capable machines the performance is the same as the standard library. + +Here are some detailed benchmarks, comparing to go 1.5 standard library with and without assembler enabled. + +``` +Std: Standard Go 1.5 library +Crc: Indicates IEEE type CRC. +40B: Size of each slice encoded. +NoAsm: Assembler was disabled (ie. not an AMD64 or SSE 4.2+ capable machine). +Castagnoli: Castagnoli CRC type. + +BenchmarkStdCrc40B-4 10000000 158 ns/op 252.88 MB/s +BenchmarkCrc40BNoAsm-4 20000000 105 ns/op 377.38 MB/s (slice8) +BenchmarkCrc40B-4 20000000 105 ns/op 378.77 MB/s (slice8) + +BenchmarkStdCrc1KB-4 500000 3604 ns/op 284.10 MB/s +BenchmarkCrc1KBNoAsm-4 1000000 1463 ns/op 699.79 MB/s (slice8) +BenchmarkCrc1KB-4 3000000 396 ns/op 2583.69 MB/s (asm) + +BenchmarkStdCrc8KB-4 200000 11417 ns/op 717.48 MB/s (slice8) +BenchmarkCrc8KBNoAsm-4 200000 11317 ns/op 723.85 MB/s (slice8) +BenchmarkCrc8KB-4 500000 2919 ns/op 2805.73 MB/s (asm) + +BenchmarkStdCrc32KB-4 30000 45749 ns/op 716.24 MB/s (slice8) +BenchmarkCrc32KBNoAsm-4 30000 45109 ns/op 726.42 MB/s (slice8) +BenchmarkCrc32KB-4 100000 11497 ns/op 2850.09 MB/s (asm) + +BenchmarkStdNoAsmCastagnol40B-4 10000000 161 ns/op 246.94 MB/s +BenchmarkStdCastagnoli40B-4 50000000 28.4 ns/op 1410.69 MB/s (asm) +BenchmarkCastagnoli40BNoAsm-4 20000000 100 ns/op 398.01 MB/s (slice8) +BenchmarkCastagnoli40B-4 50000000 28.2 ns/op 1419.54 MB/s (asm) + +BenchmarkStdNoAsmCastagnoli1KB-4 500000 3622 ns/op 282.67 MB/s +BenchmarkStdCastagnoli1KB-4 10000000 144 ns/op 7099.78 MB/s (asm) +BenchmarkCastagnoli1KBNoAsm-4 1000000 1475 ns/op 694.14 MB/s (slice8) +BenchmarkCastagnoli1KB-4 10000000 146 ns/op 6993.35 MB/s (asm) + +BenchmarkStdNoAsmCastagnoli8KB-4 50000 28781 ns/op 284.63 MB/s +BenchmarkStdCastagnoli8KB-4 1000000 1029 ns/op 7957.89 MB/s (asm) +BenchmarkCastagnoli8KBNoAsm-4 200000 11410 ns/op 717.94 MB/s (slice8) +BenchmarkCastagnoli8KB-4 1000000 1000 ns/op 8188.71 MB/s (asm) + +BenchmarkStdNoAsmCastagnoli32KB-4 10000 115426 ns/op 283.89 MB/s +BenchmarkStdCastagnoli32KB-4 300000 4065 ns/op 8059.13 MB/s (asm) +BenchmarkCastagnoli32KBNoAsm-4 30000 45171 ns/op 725.41 MB/s (slice8) +BenchmarkCastagnoli32KB-4 500000 4077 ns/op 8035.89 MB/s (asm) +``` + +The IEEE assembler optimizations has been submitted and will be part of the Go 1.6 standard library. + +However, the improved use of slice-by-8 has not, but will probably be submitted for Go 1.7. + +# license + +Standard Go license. Changes are Copyright (c) 2015 Klaus Post under same conditions. diff --git a/weed/vendor/github.com/klauspost/crc32/crc32.go b/weed/vendor/github.com/klauspost/crc32/crc32.go new file mode 100644 index 000000000..8d6ba5d3d --- /dev/null +++ b/weed/vendor/github.com/klauspost/crc32/crc32.go @@ -0,0 +1,186 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package crc32 implements the 32-bit cyclic redundancy check, or CRC-32, +// checksum. See http://en.wikipedia.org/wiki/Cyclic_redundancy_check for +// information. +// +// Polynomials are represented in LSB-first form also known as reversed representation. +// +// See http://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks#Reversed_representations_and_reciprocal_polynomials +// for information. +package crc32 + +import ( + "hash" + "sync" +) + +// The size of a CRC-32 checksum in bytes. +const Size = 4 + +// Predefined polynomials. +const ( + // IEEE is by far and away the most common CRC-32 polynomial. + // Used by ethernet (IEEE 802.3), v.42, fddi, gzip, zip, png, ... + IEEE = 0xedb88320 + + // Castagnoli's polynomial, used in iSCSI. + // Has better error detection characteristics than IEEE. + // http://dx.doi.org/10.1109/26.231911 + Castagnoli = 0x82f63b78 + + // Koopman's polynomial. + // Also has better error detection characteristics than IEEE. + // http://dx.doi.org/10.1109/DSN.2002.1028931 + Koopman = 0xeb31d82e +) + +// Table is a 256-word table representing the polynomial for efficient processing. +type Table [256]uint32 + +// castagnoliTable points to a lazily initialized Table for the Castagnoli +// polynomial. MakeTable will always return this value when asked to make a +// Castagnoli table so we can compare against it to find when the caller is +// using this polynomial. +var castagnoliTable *Table +var castagnoliTable8 *slicing8Table +var castagnoliOnce sync.Once + +func castagnoliInit() { + castagnoliTable = makeTable(Castagnoli) + castagnoliTable8 = makeTable8(Castagnoli) +} + +// IEEETable is the table for the IEEE polynomial. +var IEEETable = makeTable(IEEE) + +// slicing8Table is array of 8 Tables +type slicing8Table [8]Table + +// ieeeTable8 is the slicing8Table for IEEE +var ieeeTable8 *slicing8Table +var ieeeTable8Once sync.Once + +// MakeTable returns a Table constructed from the specified polynomial. +// The contents of this Table must not be modified. +func MakeTable(poly uint32) *Table { + switch poly { + case IEEE: + return IEEETable + case Castagnoli: + castagnoliOnce.Do(castagnoliInit) + return castagnoliTable + } + return makeTable(poly) +} + +// makeTable returns the Table constructed from the specified polynomial. +func makeTable(poly uint32) *Table { + t := new(Table) + for i := 0; i < 256; i++ { + crc := uint32(i) + for j := 0; j < 8; j++ { + if crc&1 == 1 { + crc = (crc >> 1) ^ poly + } else { + crc >>= 1 + } + } + t[i] = crc + } + return t +} + +// makeTable8 returns slicing8Table constructed from the specified polynomial. +func makeTable8(poly uint32) *slicing8Table { + t := new(slicing8Table) + t[0] = *makeTable(poly) + for i := 0; i < 256; i++ { + crc := t[0][i] + for j := 1; j < 8; j++ { + crc = t[0][crc&0xFF] ^ (crc >> 8) + t[j][i] = crc + } + } + return t +} + +// digest represents the partial evaluation of a checksum. +type digest struct { + crc uint32 + tab *Table +} + +// New creates a new hash.Hash32 computing the CRC-32 checksum +// using the polynomial represented by the Table. +// Its Sum method will lay the value out in big-endian byte order. +func New(tab *Table) hash.Hash32 { return &digest{0, tab} } + +// NewIEEE creates a new hash.Hash32 computing the CRC-32 checksum +// using the IEEE polynomial. +// Its Sum method will lay the value out in big-endian byte order. +func NewIEEE() hash.Hash32 { return New(IEEETable) } + +func (d *digest) Size() int { return Size } + +func (d *digest) BlockSize() int { return 1 } + +func (d *digest) Reset() { d.crc = 0 } + +func update(crc uint32, tab *Table, p []byte) uint32 { + crc = ^crc + for _, v := range p { + crc = tab[byte(crc)^v] ^ (crc >> 8) + } + return ^crc +} + +// updateSlicingBy8 updates CRC using Slicing-by-8 +func updateSlicingBy8(crc uint32, tab *slicing8Table, p []byte) uint32 { + crc = ^crc + for len(p) > 8 { + crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24 + crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^ + tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^ + tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF] + p = p[8:] + } + crc = ^crc + if len(p) == 0 { + return crc + } + return update(crc, &tab[0], p) +} + +// Update returns the result of adding the bytes in p to the crc. +func Update(crc uint32, tab *Table, p []byte) uint32 { + if tab == castagnoliTable { + return updateCastagnoli(crc, p) + } + if tab == IEEETable { + return updateIEEE(crc, p) + } + return update(crc, tab, p) +} + +func (d *digest) Write(p []byte) (n int, err error) { + d.crc = Update(d.crc, d.tab, p) + return len(p), nil +} + +func (d *digest) Sum32() uint32 { return d.crc } + +func (d *digest) Sum(in []byte) []byte { + s := d.Sum32() + return append(in, byte(s>>24), byte(s>>16), byte(s>>8), byte(s)) +} + +// Checksum returns the CRC-32 checksum of data +// using the polynomial represented by the Table. +func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) } + +// ChecksumIEEE returns the CRC-32 checksum of data +// using the IEEE polynomial. +func ChecksumIEEE(data []byte) uint32 { return updateIEEE(0, data) } diff --git a/weed/vendor/github.com/klauspost/crc32/crc32_amd64.go b/weed/vendor/github.com/klauspost/crc32/crc32_amd64.go new file mode 100644 index 000000000..4827128ea --- /dev/null +++ b/weed/vendor/github.com/klauspost/crc32/crc32_amd64.go @@ -0,0 +1,62 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !appengine,!gccgo + +package crc32 + +// This file contains the code to call the SSE 4.2 version of the Castagnoli +// and IEEE CRC. + +// haveSSE41/haveSSE42/haveCLMUL are defined in crc_amd64.s and use +// CPUID to test for SSE 4.1, 4.2 and CLMUL support. +func haveSSE41() bool +func haveSSE42() bool +func haveCLMUL() bool + +// castagnoliSSE42 is defined in crc_amd64.s and uses the SSE4.2 CRC32 +// instruction. +//go:noescape +func castagnoliSSE42(crc uint32, p []byte) uint32 + +// ieeeCLMUL is defined in crc_amd64.s and uses the PCLMULQDQ +// instruction as well as SSE 4.1. +//go:noescape +func ieeeCLMUL(crc uint32, p []byte) uint32 + +var sse42 = haveSSE42() +var useFastIEEE = haveCLMUL() && haveSSE41() + +func updateCastagnoli(crc uint32, p []byte) uint32 { + if sse42 { + return castagnoliSSE42(crc, p) + } + // only use slicing-by-8 when input is >= 16 Bytes + if len(p) >= 16 { + return updateSlicingBy8(crc, castagnoliTable8, p) + } + return update(crc, castagnoliTable, p) +} + +func updateIEEE(crc uint32, p []byte) uint32 { + if useFastIEEE && len(p) >= 64 { + left := len(p) & 15 + do := len(p) - left + crc = ^ieeeCLMUL(^crc, p[:do]) + if left > 0 { + crc = update(crc, IEEETable, p[do:]) + } + return crc + } + + // only use slicing-by-8 when input is >= 16 Bytes + if len(p) >= 16 { + ieeeTable8Once.Do(func() { + ieeeTable8 = makeTable8(IEEE) + }) + return updateSlicingBy8(crc, ieeeTable8, p) + } + + return update(crc, IEEETable, p) +} diff --git a/weed/vendor/github.com/klauspost/crc32/crc32_amd64.s b/weed/vendor/github.com/klauspost/crc32/crc32_amd64.s new file mode 100644 index 000000000..9bf05d89b --- /dev/null +++ b/weed/vendor/github.com/klauspost/crc32/crc32_amd64.s @@ -0,0 +1,237 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build gc + +#define NOSPLIT 4 +#define RODATA 8 + +// func castagnoliSSE42(crc uint32, p []byte) uint32 +TEXT ·castagnoliSSE42(SB), NOSPLIT, $0 + MOVL crc+0(FP), AX // CRC value + MOVQ p+8(FP), SI // data pointer + MOVQ p_len+16(FP), CX // len(p) + + NOTL AX + + // If there's less than 8 bytes to process, we do it byte-by-byte. + CMPQ CX, $8 + JL cleanup + + // Process individual bytes until the input is 8-byte aligned. +startup: + MOVQ SI, BX + ANDQ $7, BX + JZ aligned + + CRC32B (SI), AX + DECQ CX + INCQ SI + JMP startup + +aligned: + // The input is now 8-byte aligned and we can process 8-byte chunks. + CMPQ CX, $8 + JL cleanup + + CRC32Q (SI), AX + ADDQ $8, SI + SUBQ $8, CX + JMP aligned + +cleanup: + // We may have some bytes left over that we process one at a time. + CMPQ CX, $0 + JE done + + CRC32B (SI), AX + INCQ SI + DECQ CX + JMP cleanup + +done: + NOTL AX + MOVL AX, ret+32(FP) + RET + +// func haveSSE42() bool +TEXT ·haveSSE42(SB), NOSPLIT, $0 + XORQ AX, AX + INCL AX + CPUID + SHRQ $20, CX + ANDQ $1, CX + MOVB CX, ret+0(FP) + RET + +// func haveCLMUL() bool +TEXT ·haveCLMUL(SB), NOSPLIT, $0 + XORQ AX, AX + INCL AX + CPUID + SHRQ $1, CX + ANDQ $1, CX + MOVB CX, ret+0(FP) + RET + +// func haveSSE41() bool +TEXT ·haveSSE41(SB), NOSPLIT, $0 + XORQ AX, AX + INCL AX + CPUID + SHRQ $19, CX + ANDQ $1, CX + MOVB CX, ret+0(FP) + RET + +// CRC32 polynomial data +// +// These constants are lifted from the +// Linux kernel, since they avoid the costly +// PSHUFB 16 byte reversal proposed in the +// original Intel paper. +DATA r2r1kp<>+0(SB)/8, $0x154442bd4 +DATA r2r1kp<>+8(SB)/8, $0x1c6e41596 +DATA r4r3kp<>+0(SB)/8, $0x1751997d0 +DATA r4r3kp<>+8(SB)/8, $0x0ccaa009e +DATA rupolykp<>+0(SB)/8, $0x1db710641 +DATA rupolykp<>+8(SB)/8, $0x1f7011641 +DATA r5kp<>+0(SB)/8, $0x163cd6124 + +GLOBL r2r1kp<>(SB), RODATA, $16 +GLOBL r4r3kp<>(SB), RODATA, $16 +GLOBL rupolykp<>(SB), RODATA, $16 +GLOBL r5kp<>(SB), RODATA, $8 + +// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +// len(p) must be at least 64, and must be a multiple of 16. + +// func ieeeCLMUL(crc uint32, p []byte) uint32 +TEXT ·ieeeCLMUL(SB), NOSPLIT, $0 + MOVL crc+0(FP), X0 // Initial CRC value + MOVQ p+8(FP), SI // data pointer + MOVQ p_len+16(FP), CX // len(p) + + MOVOU (SI), X1 + MOVOU 16(SI), X2 + MOVOU 32(SI), X3 + MOVOU 48(SI), X4 + PXOR X0, X1 + ADDQ $64, SI // buf+=64 + SUBQ $64, CX // len-=64 + CMPQ CX, $64 // Less than 64 bytes left + JB remain64 + + MOVOA r2r1kp<>+0(SB), X0 + +loopback64: + MOVOA X1, X5 + MOVOA X2, X6 + MOVOA X3, X7 + MOVOA X4, X8 + + PCLMULQDQ $0, X0, X1 + PCLMULQDQ $0, X0, X2 + PCLMULQDQ $0, X0, X3 + PCLMULQDQ $0, X0, X4 + + // Load next early + MOVOU (SI), X11 + MOVOU 16(SI), X12 + MOVOU 32(SI), X13 + MOVOU 48(SI), X14 + + PCLMULQDQ $0x11, X0, X5 + PCLMULQDQ $0x11, X0, X6 + PCLMULQDQ $0x11, X0, X7 + PCLMULQDQ $0x11, X0, X8 + + PXOR X5, X1 + PXOR X6, X2 + PXOR X7, X3 + PXOR X8, X4 + + PXOR X11, X1 + PXOR X12, X2 + PXOR X13, X3 + PXOR X14, X4 + + ADDQ $0x40, DI + ADDQ $64, SI // buf+=64 + SUBQ $64, CX // len-=64 + CMPQ CX, $64 // Less than 64 bytes left? + JGE loopback64 + + // Fold result into a single register (X1) +remain64: + MOVOA r4r3kp<>+0(SB), X0 + + MOVOA X1, X5 + PCLMULQDQ $0, X0, X1 + PCLMULQDQ $0x11, X0, X5 + PXOR X5, X1 + PXOR X2, X1 + + MOVOA X1, X5 + PCLMULQDQ $0, X0, X1 + PCLMULQDQ $0x11, X0, X5 + PXOR X5, X1 + PXOR X3, X1 + + MOVOA X1, X5 + PCLMULQDQ $0, X0, X1 + PCLMULQDQ $0x11, X0, X5 + PXOR X5, X1 + PXOR X4, X1 + + // More than 16 bytes left? + CMPQ CX, $16 + JB finish + + // Encode 16 bytes +remain16: + MOVOU (SI), X10 + MOVOA X1, X5 + PCLMULQDQ $0, X0, X1 + PCLMULQDQ $0x11, X0, X5 + PXOR X5, X1 + PXOR X10, X1 + SUBQ $16, CX + ADDQ $16, SI + CMPQ CX, $16 + JGE remain16 + +finish: + // Fold final result into 32 bits and return it + PCMPEQB X3, X3 + PCLMULQDQ $1, X1, X0 + PSRLDQ $8, X1 + PXOR X0, X1 + + MOVOA X1, X2 + MOVQ r5kp<>+0(SB), X0 + + // Creates 32 bit mask. Note that we don't care about upper half. + PSRLQ $32, X3 + + PSRLDQ $4, X2 + PAND X3, X1 + PCLMULQDQ $0, X0, X1 + PXOR X2, X1 + + MOVOA rupolykp<>+0(SB), X0 + + MOVOA X1, X2 + PAND X3, X1 + PCLMULQDQ $0x10, X0, X1 + PAND X3, X1 + PCLMULQDQ $0, X0, X1 + PXOR X2, X1 + + // PEXTRD $1, X1, AX (SSE 4.1) + BYTE $0x66; BYTE $0x0f; BYTE $0x3a + BYTE $0x16; BYTE $0xc8; BYTE $0x01 + MOVL AX, ret+32(FP) + + RET diff --git a/weed/vendor/github.com/klauspost/crc32/crc32_amd64p32.go b/weed/vendor/github.com/klauspost/crc32/crc32_amd64p32.go new file mode 100644 index 000000000..926473e7c --- /dev/null +++ b/weed/vendor/github.com/klauspost/crc32/crc32_amd64p32.go @@ -0,0 +1,40 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !appengine,!gccgo + +package crc32 + +// This file contains the code to call the SSE 4.2 version of the Castagnoli +// CRC. + +// haveSSE42 is defined in crc_amd64p32.s and uses CPUID to test for SSE 4.2 +// support. +func haveSSE42() bool + +// castagnoliSSE42 is defined in crc_amd64.s and uses the SSE4.2 CRC32 +// instruction. +//go:noescape +func castagnoliSSE42(crc uint32, p []byte) uint32 + +var sse42 = haveSSE42() + +func updateCastagnoli(crc uint32, p []byte) uint32 { + if sse42 { + return castagnoliSSE42(crc, p) + } + return update(crc, castagnoliTable, p) +} + +func updateIEEE(crc uint32, p []byte) uint32 { + // only use slicing-by-8 when input is >= 4KB + if len(p) >= 4096 { + ieeeTable8Once.Do(func() { + ieeeTable8 = makeTable8(IEEE) + }) + return updateSlicingBy8(crc, ieeeTable8, p) + } + + return update(crc, IEEETable, p) +} diff --git a/weed/vendor/github.com/klauspost/crc32/crc32_amd64p32.s b/weed/vendor/github.com/klauspost/crc32/crc32_amd64p32.s new file mode 100644 index 000000000..a578d685c --- /dev/null +++ b/weed/vendor/github.com/klauspost/crc32/crc32_amd64p32.s @@ -0,0 +1,67 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build gc + +#define NOSPLIT 4 +#define RODATA 8 + +// func castagnoliSSE42(crc uint32, p []byte) uint32 +TEXT ·castagnoliSSE42(SB), NOSPLIT, $0 + MOVL crc+0(FP), AX // CRC value + MOVL p+4(FP), SI // data pointer + MOVL p_len+8(FP), CX // len(p) + + NOTL AX + + // If there's less than 8 bytes to process, we do it byte-by-byte. + CMPQ CX, $8 + JL cleanup + + // Process individual bytes until the input is 8-byte aligned. +startup: + MOVQ SI, BX + ANDQ $7, BX + JZ aligned + + CRC32B (SI), AX + DECQ CX + INCQ SI + JMP startup + +aligned: + // The input is now 8-byte aligned and we can process 8-byte chunks. + CMPQ CX, $8 + JL cleanup + + CRC32Q (SI), AX + ADDQ $8, SI + SUBQ $8, CX + JMP aligned + +cleanup: + // We may have some bytes left over that we process one at a time. + CMPQ CX, $0 + JE done + + CRC32B (SI), AX + INCQ SI + DECQ CX + JMP cleanup + +done: + NOTL AX + MOVL AX, ret+16(FP) + RET + +// func haveSSE42() bool +TEXT ·haveSSE42(SB), NOSPLIT, $0 + XORQ AX, AX + INCL AX + CPUID + SHRQ $20, CX + ANDQ $1, CX + MOVB CX, ret+0(FP) + RET + diff --git a/weed/vendor/github.com/klauspost/crc32/crc32_generic.go b/weed/vendor/github.com/klauspost/crc32/crc32_generic.go new file mode 100644 index 000000000..a53cf96a0 --- /dev/null +++ b/weed/vendor/github.com/klauspost/crc32/crc32_generic.go @@ -0,0 +1,29 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !amd64,!amd64p32 appengine gccgo + +package crc32 + +// This file contains the generic version of updateCastagnoli which does +// slicing-by-8, or uses the fallback for very small sizes. + +func updateCastagnoli(crc uint32, p []byte) uint32 { + // only use slicing-by-8 when input is >= 16 Bytes + if len(p) >= 16 { + return updateSlicingBy8(crc, castagnoliTable8, p) + } + return update(crc, castagnoliTable, p) +} + +func updateIEEE(crc uint32, p []byte) uint32 { + // only use slicing-by-8 when input is >= 16 Bytes + if len(p) >= 16 { + ieeeTable8Once.Do(func() { + ieeeTable8 = makeTable8(IEEE) + }) + return updateSlicingBy8(crc, ieeeTable8, p) + } + return update(crc, IEEETable, p) +}