12 changed files with 775 additions and 4 deletions
-
5Makefile
-
5weed/Godeps/Godeps.json
-
24weed/vendor/github.com/klauspost/crc32/.gitignore
-
12weed/vendor/github.com/klauspost/crc32/.travis.yml
-
28weed/vendor/github.com/klauspost/crc32/LICENSE
-
84weed/vendor/github.com/klauspost/crc32/README.md
-
186weed/vendor/github.com/klauspost/crc32/crc32.go
-
62weed/vendor/github.com/klauspost/crc32/crc32_amd64.go
-
237weed/vendor/github.com/klauspost/crc32/crc32_amd64.s
-
40weed/vendor/github.com/klauspost/crc32/crc32_amd64p32.go
-
67weed/vendor/github.com/klauspost/crc32/crc32_amd64p32.s
-
29weed/vendor/github.com/klauspost/crc32/crc32_generic.go
@ -0,0 +1,24 @@ |
|||
# Compiled Object files, Static and Dynamic libs (Shared Objects) |
|||
*.o |
|||
*.a |
|||
*.so |
|||
|
|||
# Folders |
|||
_obj |
|||
_test |
|||
|
|||
# Architecture specific extensions/prefixes |
|||
*.[568vq] |
|||
[568vq].out |
|||
|
|||
*.cgo1.go |
|||
*.cgo2.c |
|||
_cgo_defun.c |
|||
_cgo_gotypes.go |
|||
_cgo_export.* |
|||
|
|||
_testmain.go |
|||
|
|||
*.exe |
|||
*.test |
|||
*.prof |
|||
@ -0,0 +1,12 @@ |
|||
language: go |
|||
|
|||
go: |
|||
- 1.3 |
|||
- 1.4 |
|||
- 1.5 |
|||
- 1.6 |
|||
- tip |
|||
|
|||
script: |
|||
- go test -v . |
|||
- go test -v -race . |
|||
@ -0,0 +1,28 @@ |
|||
Copyright (c) 2012 The Go Authors. All rights reserved. |
|||
Copyright (c) 2015 Klaus Post |
|||
|
|||
Redistribution and use in source and binary forms, with or without |
|||
modification, are permitted provided that the following conditions are |
|||
met: |
|||
|
|||
* Redistributions of source code must retain the above copyright |
|||
notice, this list of conditions and the following disclaimer. |
|||
* Redistributions in binary form must reproduce the above |
|||
copyright notice, this list of conditions and the following disclaimer |
|||
in the documentation and/or other materials provided with the |
|||
distribution. |
|||
* Neither the name of Google Inc. nor the names of its |
|||
contributors may be used to endorse or promote products derived from |
|||
this software without specific prior written permission. |
|||
|
|||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
|||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
|||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
|||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
|||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
|||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
|||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
|||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|||
@ -0,0 +1,84 @@ |
|||
# crc32 |
|||
CRC32 hash with x64 optimizations |
|||
|
|||
This package is a drop-in replacement for the standard library `hash/crc32` package, that features SSE 4.2 optimizations on x64 platforms, for a 10x speedup. |
|||
|
|||
[![Build Status](https://travis-ci.org/klauspost/crc32.svg?branch=master)](https://travis-ci.org/klauspost/crc32) |
|||
|
|||
# usage |
|||
|
|||
Install using `go get github.com/klauspost/crc32`. This library is based on Go 1.5 code and requires Go 1.3 or newer. |
|||
|
|||
Replace `import "hash/crc32"` with `import "github.com/klauspost/crc32"` and you are good to go. |
|||
|
|||
# changes |
|||
|
|||
* Dec 4, 2015: Uses the "slice-by-8" trick more extensively, which gives a 1.5 to 2.5x speedup if assembler is unavailable. |
|||
|
|||
|
|||
# performance |
|||
|
|||
For IEEE tables (the most common), there is approximately a factor of 10 speedup with the "CLMUL" (carry-less multiplication) instruction: |
|||
``` |
|||
benchmark old ns/op new ns/op delta |
|||
BenchmarkCrc32KB 99955 10258 -89.74% |
|||
|
|||
benchmark old MB/s new MB/s speedup |
|||
BenchmarkCrc32KB 327.83 3194.20 9.74x |
|||
``` |
|||
|
|||
For other tables and "CLMUL" capable machines the performance is the same as the standard library. |
|||
|
|||
Here are some detailed benchmarks, comparing to go 1.5 standard library with and without assembler enabled. |
|||
|
|||
``` |
|||
Std: Standard Go 1.5 library |
|||
Crc: Indicates IEEE type CRC. |
|||
40B: Size of each slice encoded. |
|||
NoAsm: Assembler was disabled (i.e. not an AMD64 or SSE 4.2+ capable machine). |
|||
Castagnoli: Castagnoli CRC type. |
|||
|
|||
BenchmarkStdCrc40B-4 10000000 158 ns/op 252.88 MB/s |
|||
BenchmarkCrc40BNoAsm-4 20000000 105 ns/op 377.38 MB/s (slice8) |
|||
BenchmarkCrc40B-4 20000000 105 ns/op 378.77 MB/s (slice8) |
|||
|
|||
BenchmarkStdCrc1KB-4 500000 3604 ns/op 284.10 MB/s |
|||
BenchmarkCrc1KBNoAsm-4 1000000 1463 ns/op 699.79 MB/s (slice8) |
|||
BenchmarkCrc1KB-4 3000000 396 ns/op 2583.69 MB/s (asm) |
|||
|
|||
BenchmarkStdCrc8KB-4 200000 11417 ns/op 717.48 MB/s (slice8) |
|||
BenchmarkCrc8KBNoAsm-4 200000 11317 ns/op 723.85 MB/s (slice8) |
|||
BenchmarkCrc8KB-4 500000 2919 ns/op 2805.73 MB/s (asm) |
|||
|
|||
BenchmarkStdCrc32KB-4 30000 45749 ns/op 716.24 MB/s (slice8) |
|||
BenchmarkCrc32KBNoAsm-4 30000 45109 ns/op 726.42 MB/s (slice8) |
|||
BenchmarkCrc32KB-4 100000 11497 ns/op 2850.09 MB/s (asm) |
|||
|
|||
BenchmarkStdNoAsmCastagnol40B-4 10000000 161 ns/op 246.94 MB/s |
|||
BenchmarkStdCastagnoli40B-4 50000000 28.4 ns/op 1410.69 MB/s (asm) |
|||
BenchmarkCastagnoli40BNoAsm-4 20000000 100 ns/op 398.01 MB/s (slice8) |
|||
BenchmarkCastagnoli40B-4 50000000 28.2 ns/op 1419.54 MB/s (asm) |
|||
|
|||
BenchmarkStdNoAsmCastagnoli1KB-4 500000 3622 ns/op 282.67 MB/s |
|||
BenchmarkStdCastagnoli1KB-4 10000000 144 ns/op 7099.78 MB/s (asm) |
|||
BenchmarkCastagnoli1KBNoAsm-4 1000000 1475 ns/op 694.14 MB/s (slice8) |
|||
BenchmarkCastagnoli1KB-4 10000000 146 ns/op 6993.35 MB/s (asm) |
|||
|
|||
BenchmarkStdNoAsmCastagnoli8KB-4 50000 28781 ns/op 284.63 MB/s |
|||
BenchmarkStdCastagnoli8KB-4 1000000 1029 ns/op 7957.89 MB/s (asm) |
|||
BenchmarkCastagnoli8KBNoAsm-4 200000 11410 ns/op 717.94 MB/s (slice8) |
|||
BenchmarkCastagnoli8KB-4 1000000 1000 ns/op 8188.71 MB/s (asm) |
|||
|
|||
BenchmarkStdNoAsmCastagnoli32KB-4 10000 115426 ns/op 283.89 MB/s |
|||
BenchmarkStdCastagnoli32KB-4 300000 4065 ns/op 8059.13 MB/s (asm) |
|||
BenchmarkCastagnoli32KBNoAsm-4 30000 45171 ns/op 725.41 MB/s (slice8) |
|||
BenchmarkCastagnoli32KB-4 500000 4077 ns/op 8035.89 MB/s (asm) |
|||
``` |
|||
|
|||
The IEEE assembler optimizations have been submitted and will be part of the Go 1.6 standard library. |
|||
|
|||
However, the improved use of slice-by-8 has not, but will probably be submitted for Go 1.7. |
|||
|
|||
# license |
|||
|
|||
Standard Go license. Changes are Copyright (c) 2015 Klaus Post under same conditions. |
|||
@ -0,0 +1,186 @@ |
|||
// Copyright 2009 The Go Authors. All rights reserved.
|
|||
// Use of this source code is governed by a BSD-style
|
|||
// license that can be found in the LICENSE file.
|
|||
|
|||
// Package crc32 implements the 32-bit cyclic redundancy check, or CRC-32,
|
|||
// checksum. See http://en.wikipedia.org/wiki/Cyclic_redundancy_check for
|
|||
// information.
|
|||
//
|
|||
// Polynomials are represented in LSB-first form also known as reversed representation.
|
|||
//
|
|||
// See http://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks#Reversed_representations_and_reciprocal_polynomials
|
|||
// for information.
|
|||
package crc32 |
|||
|
|||
import ( |
|||
"hash" |
|||
"sync" |
|||
) |
|||
|
|||
// The size of a CRC-32 checksum in bytes.
|
|||
const Size = 4 |
|||
|
|||
// Predefined polynomials.
|
|||
const ( |
|||
// IEEE is by far and away the most common CRC-32 polynomial.
|
|||
// Used by ethernet (IEEE 802.3), v.42, fddi, gzip, zip, png, ...
|
|||
IEEE = 0xedb88320 |
|||
|
|||
// Castagnoli's polynomial, used in iSCSI.
|
|||
// Has better error detection characteristics than IEEE.
|
|||
// http://dx.doi.org/10.1109/26.231911
|
|||
Castagnoli = 0x82f63b78 |
|||
|
|||
// Koopman's polynomial.
|
|||
// Also has better error detection characteristics than IEEE.
|
|||
// http://dx.doi.org/10.1109/DSN.2002.1028931
|
|||
Koopman = 0xeb31d82e |
|||
) |
|||
|
|||
// Table is a 256-word table representing the polynomial for efficient processing.
|
|||
type Table [256]uint32 |
|||
|
|||
// castagnoliTable points to a lazily initialized Table for the Castagnoli
|
|||
// polynomial. MakeTable will always return this value when asked to make a
|
|||
// Castagnoli table so we can compare against it to find when the caller is
|
|||
// using this polynomial.
|
|||
var castagnoliTable *Table |
|||
var castagnoliTable8 *slicing8Table |
|||
var castagnoliOnce sync.Once |
|||
|
|||
func castagnoliInit() { |
|||
castagnoliTable = makeTable(Castagnoli) |
|||
castagnoliTable8 = makeTable8(Castagnoli) |
|||
} |
|||
|
|||
// IEEETable is the table for the IEEE polynomial.
|
|||
var IEEETable = makeTable(IEEE) |
|||
|
|||
// slicing8Table is an array of 8 Tables, as used by the slicing-by-8 update.
|
|||
type slicing8Table [8]Table |
|||
|
|||
// ieeeTable8 is the slicing8Table for IEEE
|
|||
var ieeeTable8 *slicing8Table |
|||
var ieeeTable8Once sync.Once |
|||
|
|||
// MakeTable returns a Table constructed from the specified polynomial.
|
|||
// The contents of this Table must not be modified.
|
|||
func MakeTable(poly uint32) *Table { |
|||
switch poly { |
|||
case IEEE: |
|||
return IEEETable |
|||
case Castagnoli: |
|||
castagnoliOnce.Do(castagnoliInit) |
|||
return castagnoliTable |
|||
} |
|||
return makeTable(poly) |
|||
} |
|||
|
|||
// makeTable returns the Table constructed from the specified polynomial.
|
|||
func makeTable(poly uint32) *Table { |
|||
t := new(Table) |
|||
for i := 0; i < 256; i++ { |
|||
crc := uint32(i) |
|||
for j := 0; j < 8; j++ { |
|||
if crc&1 == 1 { |
|||
crc = (crc >> 1) ^ poly |
|||
} else { |
|||
crc >>= 1 |
|||
} |
|||
} |
|||
t[i] = crc |
|||
} |
|||
return t |
|||
} |
|||
|
|||
// makeTable8 returns slicing8Table constructed from the specified polynomial.
|
|||
func makeTable8(poly uint32) *slicing8Table { |
|||
t := new(slicing8Table) |
|||
t[0] = *makeTable(poly) |
|||
for i := 0; i < 256; i++ { |
|||
crc := t[0][i] |
|||
for j := 1; j < 8; j++ { |
|||
crc = t[0][crc&0xFF] ^ (crc >> 8) |
|||
t[j][i] = crc |
|||
} |
|||
} |
|||
return t |
|||
} |
|||
|
|||
// digest represents the partial evaluation of a checksum.
|
|||
type digest struct { |
|||
crc uint32 |
|||
tab *Table |
|||
} |
|||
|
|||
// New creates a new hash.Hash32 computing the CRC-32 checksum
|
|||
// using the polynomial represented by the Table.
|
|||
// Its Sum method will lay the value out in big-endian byte order.
|
|||
func New(tab *Table) hash.Hash32 { return &digest{0, tab} } |
|||
|
|||
// NewIEEE creates a new hash.Hash32 computing the CRC-32 checksum
|
|||
// using the IEEE polynomial.
|
|||
// Its Sum method will lay the value out in big-endian byte order.
|
|||
func NewIEEE() hash.Hash32 { return New(IEEETable) } |
|||
|
|||
func (d *digest) Size() int { return Size } |
|||
|
|||
func (d *digest) BlockSize() int { return 1 } |
|||
|
|||
func (d *digest) Reset() { d.crc = 0 } |
|||
|
|||
func update(crc uint32, tab *Table, p []byte) uint32 { |
|||
crc = ^crc |
|||
for _, v := range p { |
|||
crc = tab[byte(crc)^v] ^ (crc >> 8) |
|||
} |
|||
return ^crc |
|||
} |
|||
|
|||
// updateSlicingBy8 updates CRC using Slicing-by-8
|
|||
func updateSlicingBy8(crc uint32, tab *slicing8Table, p []byte) uint32 { |
|||
crc = ^crc |
|||
for len(p) > 8 { |
|||
crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24 |
|||
crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^ |
|||
tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^ |
|||
tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF] |
|||
p = p[8:] |
|||
} |
|||
crc = ^crc |
|||
if len(p) == 0 { |
|||
return crc |
|||
} |
|||
return update(crc, &tab[0], p) |
|||
} |
|||
|
|||
// Update returns the result of adding the bytes in p to the crc.
|
|||
func Update(crc uint32, tab *Table, p []byte) uint32 { |
|||
if tab == castagnoliTable { |
|||
return updateCastagnoli(crc, p) |
|||
} |
|||
if tab == IEEETable { |
|||
return updateIEEE(crc, p) |
|||
} |
|||
return update(crc, tab, p) |
|||
} |
|||
|
|||
func (d *digest) Write(p []byte) (n int, err error) { |
|||
d.crc = Update(d.crc, d.tab, p) |
|||
return len(p), nil |
|||
} |
|||
|
|||
func (d *digest) Sum32() uint32 { return d.crc } |
|||
|
|||
func (d *digest) Sum(in []byte) []byte { |
|||
s := d.Sum32() |
|||
return append(in, byte(s>>24), byte(s>>16), byte(s>>8), byte(s)) |
|||
} |
|||
|
|||
// Checksum returns the CRC-32 checksum of data
|
|||
// using the polynomial represented by the Table.
|
|||
func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) } |
|||
|
|||
// ChecksumIEEE returns the CRC-32 checksum of data
|
|||
// using the IEEE polynomial.
|
|||
func ChecksumIEEE(data []byte) uint32 { return updateIEEE(0, data) } |
|||
@ -0,0 +1,62 @@ |
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
|||
// Use of this source code is governed by a BSD-style
|
|||
// license that can be found in the LICENSE file.
|
|||
|
|||
// +build !appengine,!gccgo
|
|||
|
|||
package crc32 |
|||
|
|||
// This file contains the code to call the SSE 4.2 version of the Castagnoli
|
|||
// and IEEE CRC.
|
|||
|
|||
// haveSSE41/haveSSE42/haveCLMUL are defined in crc32_amd64.s and use
|
|||
// CPUID to test for SSE 4.1, 4.2 and CLMUL support.
|
|||
func haveSSE41() bool |
|||
func haveSSE42() bool |
|||
func haveCLMUL() bool |
|||
|
|||
// castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE4.2 CRC32
|
|||
// instruction.
|
|||
//go:noescape
|
|||
func castagnoliSSE42(crc uint32, p []byte) uint32 |
|||
|
|||
// ieeeCLMUL is defined in crc32_amd64.s and uses the PCLMULQDQ
|
|||
// instruction as well as SSE 4.1.
|
|||
//go:noescape
|
|||
func ieeeCLMUL(crc uint32, p []byte) uint32 |
|||
|
|||
var sse42 = haveSSE42() |
|||
var useFastIEEE = haveCLMUL() && haveSSE41() |
|||
|
|||
func updateCastagnoli(crc uint32, p []byte) uint32 { |
|||
if sse42 { |
|||
return castagnoliSSE42(crc, p) |
|||
} |
|||
// only use slicing-by-8 when input is >= 16 Bytes
|
|||
if len(p) >= 16 { |
|||
return updateSlicingBy8(crc, castagnoliTable8, p) |
|||
} |
|||
return update(crc, castagnoliTable, p) |
|||
} |
|||
|
|||
func updateIEEE(crc uint32, p []byte) uint32 { |
|||
if useFastIEEE && len(p) >= 64 { |
|||
left := len(p) & 15 |
|||
do := len(p) - left |
|||
crc = ^ieeeCLMUL(^crc, p[:do]) |
|||
if left > 0 { |
|||
crc = update(crc, IEEETable, p[do:]) |
|||
} |
|||
return crc |
|||
} |
|||
|
|||
// only use slicing-by-8 when input is >= 16 Bytes
|
|||
if len(p) >= 16 { |
|||
ieeeTable8Once.Do(func() { |
|||
ieeeTable8 = makeTable8(IEEE) |
|||
}) |
|||
return updateSlicingBy8(crc, ieeeTable8, p) |
|||
} |
|||
|
|||
return update(crc, IEEETable, p) |
|||
} |
|||
@ -0,0 +1,237 @@ |
|||
// Copyright 2011 The Go Authors. All rights reserved. |
|||
// Use of this source code is governed by a BSD-style |
|||
// license that can be found in the LICENSE file. |
|||
|
|||
// +build gc |
|||
|
|||
#define NOSPLIT 4 |
|||
#define RODATA 8 |
|||
|
|||
// castagnoliSSE42 feeds p into crc with the CRC32B/CRC32Q instructions and
// returns the updated checksum. The checksum is bit-inverted (NOTL) on entry
// and again before it is stored to the result slot.
// Register use: AX = running CRC, SI = current data pointer, CX = bytes left.
// func castagnoliSSE42(crc uint32, p []byte) uint32 |
|||
TEXT ·castagnoliSSE42(SB), NOSPLIT, $0 |
|||
MOVL crc+0(FP), AX // CRC value |
|||
MOVQ p+8(FP), SI // data pointer |
|||
MOVQ p_len+16(FP), CX // len(p) |
|||
|
|||
NOTL AX |
|||
|
|||
// If there's less than 8 bytes to process, we do it byte-by-byte. |
|||
CMPQ CX, $8 |
|||
JL cleanup |
|||
|
|||
// Process individual bytes until the input is 8-byte aligned. |
|||
startup: |
|||
// BX = low three bits of the data pointer; zero means 8-byte aligned.
MOVQ SI, BX |
|||
ANDQ $7, BX |
|||
JZ aligned |
|||
|
|||
CRC32B (SI), AX |
|||
DECQ CX |
|||
INCQ SI |
|||
JMP startup |
|||
|
|||
aligned: |
|||
// The input is now 8-byte aligned and we can process 8-byte chunks. |
|||
CMPQ CX, $8 |
|||
JL cleanup |
|||
|
|||
CRC32Q (SI), AX |
|||
ADDQ $8, SI |
|||
SUBQ $8, CX |
|||
JMP aligned |
|||
|
|||
cleanup: |
|||
// We may have some bytes left over that we process one at a time. |
|||
CMPQ CX, $0 |
|||
JE done |
|||
|
|||
CRC32B (SI), AX |
|||
INCQ SI |
|||
DECQ CX |
|||
JMP cleanup |
|||
|
|||
done: |
|||
// Undo the initial inversion and store the result.
NOTL AX |
|||
MOVL AX, ret+32(FP) |
|||
RET |
|||
|
|||
// haveSSE42 runs CPUID with AX = 1 and returns bit 20 of CX, the SSE 4.2
// feature flag.
// func haveSSE42() bool |
|||
TEXT ·haveSSE42(SB), NOSPLIT, $0 |
|||
// AX = 1 (clear, then increment) selects the CPUID leaf.
XORQ AX, AX |
|||
INCL AX |
|||
CPUID |
|||
// Move bit 20 down to bit 0 and mask everything else off.
SHRQ $20, CX |
|||
ANDQ $1, CX |
|||
MOVB CX, ret+0(FP) |
|||
RET |
|||
|
|||
// haveCLMUL runs CPUID with AX = 1 and returns bit 1 of CX, the PCLMULQDQ
// feature flag.
// func haveCLMUL() bool |
|||
TEXT ·haveCLMUL(SB), NOSPLIT, $0 |
|||
// AX = 1 (clear, then increment) selects the CPUID leaf.
XORQ AX, AX |
|||
INCL AX |
|||
CPUID |
|||
// Move bit 1 down to bit 0 and mask everything else off.
SHRQ $1, CX |
|||
ANDQ $1, CX |
|||
MOVB CX, ret+0(FP) |
|||
RET |
|||
|
|||
// haveSSE41 runs CPUID with AX = 1 and returns bit 19 of CX, the SSE 4.1
// feature flag.
// func haveSSE41() bool |
|||
TEXT ·haveSSE41(SB), NOSPLIT, $0 |
|||
// AX = 1 (clear, then increment) selects the CPUID leaf.
XORQ AX, AX |
|||
INCL AX |
|||
CPUID |
|||
// Move bit 19 down to bit 0 and mask everything else off.
SHRQ $19, CX |
|||
ANDQ $1, CX |
|||
MOVB CX, ret+0(FP) |
|||
RET |
|||
|
|||
// CRC32 polynomial data |
|||
// |
|||
// These constants are lifted from the |
|||
// Linux kernel, since they avoid the costly |
|||
// PSHUFB 16 byte reversal proposed in the |
|||
// original Intel paper. |
|||
// Use in ieeeCLMUL: r2r1kp is loaded before the 64-byte loop, r4r3kp at
// remain64 for the 16-byte folds, r5kp and rupolykp in the final reduction
// at finish.
DATA r2r1kp<>+0(SB)/8, $0x154442bd4 |
|||
DATA r2r1kp<>+8(SB)/8, $0x1c6e41596 |
|||
DATA r4r3kp<>+0(SB)/8, $0x1751997d0 |
|||
DATA r4r3kp<>+8(SB)/8, $0x0ccaa009e |
|||
DATA rupolykp<>+0(SB)/8, $0x1db710641 |
|||
DATA rupolykp<>+8(SB)/8, $0x1f7011641 |
|||
DATA r5kp<>+0(SB)/8, $0x163cd6124 |
|||
|
|||
// The three two-quadword tables are 16 bytes each; r5kp is a single
// 8-byte quadword. All are flagged RODATA.
GLOBL r2r1kp<>(SB), RODATA, $16 |
|||
GLOBL r4r3kp<>(SB), RODATA, $16 |
|||
GLOBL rupolykp<>(SB), RODATA, $16 |
|||
GLOBL r5kp<>(SB), RODATA, $8 |
|||
|
|||
// ieeeCLMUL folds p into crc for the IEEE polynomial using PCLMULQDQ and
// returns the resulting 32-bit value. The caller (updateIEEE) passes the
// bit-inverted checksum and inverts the result.
//
// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
// len(p) must be at least 64, and must be a multiple of 16.
//
// Register use: SI = current data pointer, CX = bytes left, X1..X4 = the four
// 16-byte accumulators, X0 = the folding constants currently in use.

// func ieeeCLMUL(crc uint32, p []byte) uint32
TEXT ·ieeeCLMUL(SB), NOSPLIT, $0
	MOVL crc+0(FP), X0 // Initial CRC value
	MOVQ p+8(FP), SI // data pointer
	MOVQ p_len+16(FP), CX // len(p)

	// Load the first 64 bytes and mix the initial CRC into the first lane.
	MOVOU (SI), X1
	MOVOU 16(SI), X2
	MOVOU 32(SI), X3
	MOVOU 48(SI), X4
	PXOR X0, X1
	ADDQ $64, SI // buf+=64
	SUBQ $64, CX // len-=64
	CMPQ CX, $64 // Less than 64 bytes left
	JB remain64

	MOVOA r2r1kp<>+0(SB), X0

loopback64:
	// Keep copies so both 64-bit halves of each accumulator can be multiplied.
	MOVOA X1, X5
	MOVOA X2, X6
	MOVOA X3, X7
	MOVOA X4, X8

	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0, X0, X2
	PCLMULQDQ $0, X0, X3
	PCLMULQDQ $0, X0, X4

	// Load next early
	MOVOU (SI), X11
	MOVOU 16(SI), X12
	MOVOU 32(SI), X13
	MOVOU 48(SI), X14

	PCLMULQDQ $0x11, X0, X5
	PCLMULQDQ $0x11, X0, X6
	PCLMULQDQ $0x11, X0, X7
	PCLMULQDQ $0x11, X0, X8

	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3
	PXOR X8, X4

	PXOR X11, X1
	PXOR X12, X2
	PXOR X13, X3
	PXOR X14, X4

	ADDQ $64, SI // buf+=64
	SUBQ $64, CX // len-=64
	CMPQ CX, $64 // Less than 64 bytes left?
	JGE loopback64

	// Fold result into a single register (X1)
remain64:
	MOVOA r4r3kp<>+0(SB), X0

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X2, X1

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X3, X1

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X4, X1

	// More than 16 bytes left?
	CMPQ CX, $16
	JB finish

	// Encode 16 bytes
remain16:
	MOVOU (SI), X10
	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X10, X1
	SUBQ $16, CX
	ADDQ $16, SI
	CMPQ CX, $16
	JGE remain16

finish:
	// Fold final result into 32 bits and return it
	PCMPEQB X3, X3 // X3 = all ones, turned into a mask below
	PCLMULQDQ $1, X1, X0
	PSRLDQ $8, X1
	PXOR X0, X1

	MOVOA X1, X2
	MOVQ r5kp<>+0(SB), X0

	// Creates 32 bit mask. Note that we don't care about upper half.
	PSRLQ $32, X3

	PSRLDQ $4, X2
	PAND X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR X2, X1

	MOVOA rupolykp<>+0(SB), X0

	MOVOA X1, X2
	PAND X3, X1
	PCLMULQDQ $0x10, X0, X1
	PAND X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR X2, X1

	// PEXTRD $1, X1, AX (SSE 4.1)
	BYTE $0x66; BYTE $0x0f; BYTE $0x3a
	BYTE $0x16; BYTE $0xc8; BYTE $0x01
	MOVL AX, ret+32(FP)

	RET
|||
@ -0,0 +1,40 @@ |
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
|||
// Use of this source code is governed by a BSD-style
|
|||
// license that can be found in the LICENSE file.
|
|||
|
|||
// +build !appengine,!gccgo
|
|||
|
|||
package crc32 |
|||
|
|||
// This file contains the code to call the SSE 4.2 version of the Castagnoli
|
|||
// CRC.
|
|||
|
|||
// haveSSE42 is defined in crc32_amd64p32.s and uses CPUID to test for SSE 4.2
|
|||
// support.
|
|||
func haveSSE42() bool |
|||
|
|||
// castagnoliSSE42 is defined in crc32_amd64p32.s and uses the SSE4.2 CRC32
|
|||
// instruction.
|
|||
//go:noescape
|
|||
func castagnoliSSE42(crc uint32, p []byte) uint32 |
|||
|
|||
var sse42 = haveSSE42() |
|||
|
|||
func updateCastagnoli(crc uint32, p []byte) uint32 { |
|||
if sse42 { |
|||
return castagnoliSSE42(crc, p) |
|||
} |
|||
return update(crc, castagnoliTable, p) |
|||
} |
|||
|
|||
func updateIEEE(crc uint32, p []byte) uint32 { |
|||
// only use slicing-by-8 when input is >= 4KB
|
|||
if len(p) >= 4096 { |
|||
ieeeTable8Once.Do(func() { |
|||
ieeeTable8 = makeTable8(IEEE) |
|||
}) |
|||
return updateSlicingBy8(crc, ieeeTable8, p) |
|||
} |
|||
|
|||
return update(crc, IEEETable, p) |
|||
} |
|||
@ -0,0 +1,67 @@ |
|||
// Copyright 2011 The Go Authors. All rights reserved. |
|||
// Use of this source code is governed by a BSD-style |
|||
// license that can be found in the LICENSE file. |
|||
|
|||
// +build gc |
|||
|
|||
#define NOSPLIT 4 |
|||
#define RODATA 8 |
|||
|
|||
// castagnoliSSE42 feeds p into crc with the CRC32B/CRC32Q instructions and
// returns the updated checksum. The checksum is bit-inverted (NOTL) on entry
// and again before it is stored to the result slot.
// Register use: AX = running CRC, SI = current data pointer, CX = bytes left.
// The arguments are read with 32-bit loads from offsets 0, 4 and 8.
// func castagnoliSSE42(crc uint32, p []byte) uint32 |
|||
TEXT ·castagnoliSSE42(SB), NOSPLIT, $0 |
|||
MOVL crc+0(FP), AX // CRC value |
|||
MOVL p+4(FP), SI // data pointer |
|||
MOVL p_len+8(FP), CX // len(p) |
|||
|
|||
NOTL AX |
|||
|
|||
// If there's less than 8 bytes to process, we do it byte-by-byte. |
|||
CMPQ CX, $8 |
|||
JL cleanup |
|||
|
|||
// Process individual bytes until the input is 8-byte aligned. |
|||
startup: |
|||
// BX = low three bits of the data pointer; zero means 8-byte aligned.
MOVQ SI, BX |
|||
ANDQ $7, BX |
|||
JZ aligned |
|||
|
|||
CRC32B (SI), AX |
|||
DECQ CX |
|||
INCQ SI |
|||
JMP startup |
|||
|
|||
aligned: |
|||
// The input is now 8-byte aligned and we can process 8-byte chunks. |
|||
CMPQ CX, $8 |
|||
JL cleanup |
|||
|
|||
CRC32Q (SI), AX |
|||
ADDQ $8, SI |
|||
SUBQ $8, CX |
|||
JMP aligned |
|||
|
|||
cleanup: |
|||
// We may have some bytes left over that we process one at a time. |
|||
CMPQ CX, $0 |
|||
JE done |
|||
|
|||
CRC32B (SI), AX |
|||
INCQ SI |
|||
DECQ CX |
|||
JMP cleanup |
|||
|
|||
done: |
|||
// Undo the initial inversion and store the result.
NOTL AX |
|||
MOVL AX, ret+16(FP) |
|||
RET |
|||
|
|||
// haveSSE42 runs CPUID with AX = 1 and returns bit 20 of CX, the SSE 4.2
// feature flag.
// func haveSSE42() bool |
|||
TEXT ·haveSSE42(SB), NOSPLIT, $0 |
|||
// AX = 1 (clear, then increment) selects the CPUID leaf.
XORQ AX, AX |
|||
INCL AX |
|||
CPUID |
|||
// Move bit 20 down to bit 0 and mask everything else off.
SHRQ $20, CX |
|||
ANDQ $1, CX |
|||
MOVB CX, ret+0(FP) |
|||
RET |
|||
|
|||
@ -0,0 +1,29 @@ |
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
|||
// Use of this source code is governed by a BSD-style
|
|||
// license that can be found in the LICENSE file.
|
|||
|
|||
// +build !amd64,!amd64p32 appengine gccgo
|
|||
|
|||
package crc32 |
|||
|
|||
// This file contains the generic version of updateCastagnoli which does
|
|||
// slicing-by-8, or uses the fallback for very small sizes.
|
|||
|
|||
func updateCastagnoli(crc uint32, p []byte) uint32 { |
|||
// only use slicing-by-8 when input is >= 16 Bytes
|
|||
if len(p) >= 16 { |
|||
return updateSlicingBy8(crc, castagnoliTable8, p) |
|||
} |
|||
return update(crc, castagnoliTable, p) |
|||
} |
|||
|
|||
func updateIEEE(crc uint32, p []byte) uint32 { |
|||
// only use slicing-by-8 when input is >= 16 Bytes
|
|||
if len(p) >= 16 { |
|||
ieeeTable8Once.Do(func() { |
|||
ieeeTable8 = makeTable8(IEEE) |
|||
}) |
|||
return updateSlicingBy8(crc, ieeeTable8, p) |
|||
} |
|||
return update(crc, IEEETable, p) |
|||
} |
|||
Write
Preview
Loading…
Cancel
Save
Reference in new issue