12 changed files with 775 additions and 4 deletions
-
5Makefile
-
5weed/Godeps/Godeps.json
-
24weed/vendor/github.com/klauspost/crc32/.gitignore
-
12weed/vendor/github.com/klauspost/crc32/.travis.yml
-
28weed/vendor/github.com/klauspost/crc32/LICENSE
-
84weed/vendor/github.com/klauspost/crc32/README.md
-
186weed/vendor/github.com/klauspost/crc32/crc32.go
-
62weed/vendor/github.com/klauspost/crc32/crc32_amd64.go
-
237weed/vendor/github.com/klauspost/crc32/crc32_amd64.s
-
40weed/vendor/github.com/klauspost/crc32/crc32_amd64p32.go
-
67weed/vendor/github.com/klauspost/crc32/crc32_amd64p32.s
-
29weed/vendor/github.com/klauspost/crc32/crc32_generic.go
@ -0,0 +1,24 @@ |
|||||
|
# Compiled Object files, Static and Dynamic libs (Shared Objects) |
||||
|
*.o |
||||
|
*.a |
||||
|
*.so |
||||
|
|
||||
|
# Folders |
||||
|
_obj |
||||
|
_test |
||||
|
|
||||
|
# Architecture specific extensions/prefixes |
||||
|
*.[568vq] |
||||
|
[568vq].out |
||||
|
|
||||
|
*.cgo1.go |
||||
|
*.cgo2.c |
||||
|
_cgo_defun.c |
||||
|
_cgo_gotypes.go |
||||
|
_cgo_export.* |
||||
|
|
||||
|
_testmain.go |
||||
|
|
||||
|
*.exe |
||||
|
*.test |
||||
|
*.prof |
||||
@ -0,0 +1,12 @@ |
|||||
|
language: go |
||||
|
|
||||
|
go: |
||||
|
- 1.3 |
||||
|
- 1.4 |
||||
|
- 1.5 |
||||
|
- 1.6 |
||||
|
- tip |
||||
|
|
||||
|
script: |
||||
|
- go test -v . |
||||
|
- go test -v -race . |
||||
@ -0,0 +1,28 @@ |
|||||
|
Copyright (c) 2012 The Go Authors. All rights reserved. |
||||
|
Copyright (c) 2015 Klaus Post |
||||
|
|
||||
|
Redistribution and use in source and binary forms, with or without |
||||
|
modification, are permitted provided that the following conditions are |
||||
|
met: |
||||
|
|
||||
|
* Redistributions of source code must retain the above copyright |
||||
|
notice, this list of conditions and the following disclaimer. |
||||
|
* Redistributions in binary form must reproduce the above |
||||
|
copyright notice, this list of conditions and the following disclaimer |
||||
|
in the documentation and/or other materials provided with the |
||||
|
distribution. |
||||
|
* Neither the name of Google Inc. nor the names of its |
||||
|
contributors may be used to endorse or promote products derived from |
||||
|
this software without specific prior written permission. |
||||
|
|
||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
||||
@ -0,0 +1,84 @@ |
|||||
|
# crc32 |
||||
|
CRC32 hash with x64 optimizations |
||||
|
|
||||
|
This package is a drop-in replacement for the standard library `hash/crc32` package, that features SSE 4.2 optimizations on x64 platforms, for a 10x speedup. |
||||
|
|
||||
|
[![Build Status](https://travis-ci.org/klauspost/crc32.svg?branch=master)](https://travis-ci.org/klauspost/crc32) |
||||
|
|
||||
|
# usage |
||||
|
|
||||
|
Install using `go get github.com/klauspost/crc32`. This library is based on Go 1.5 code and requires Go 1.3 or newer. |
||||
|
|
||||
|
Replace `import "hash/crc32"` with `import "github.com/klauspost/crc32"` and you are good to go. |
||||
|
|
||||
|
# changes |
||||
|
|
||||
|
* Dec 4, 2015: Uses the "slice-by-8" trick more extensively, which gives a 1.5 to 2.5x speedup if assembler is unavailable. |
||||
|
|
||||
|
|
||||
|
# performance |
||||
|
|
||||
|
For IEEE tables (the most common), there is approximately a factor 10 speedup with "CLMUL" (Carryless multiplication) instruction: |
||||
|
``` |
||||
|
benchmark old ns/op new ns/op delta |
||||
|
BenchmarkCrc32KB 99955 10258 -89.74% |
||||
|
|
||||
|
benchmark old MB/s new MB/s speedup |
||||
|
BenchmarkCrc32KB 327.83 3194.20 9.74x |
||||
|
``` |
||||
|
|
||||
|
For other tables and "CLMUL" capable machines the performance is the same as the standard library. |
||||
|
|
||||
|
Here are some detailed benchmarks, comparing to go 1.5 standard library with and without assembler enabled. |
||||
|
|
||||
|
``` |
||||
|
Std: Standard Go 1.5 library |
||||
|
Crc: Indicates IEEE type CRC. |
||||
|
40B: Size of each slice encoded. |
||||
|
NoAsm: Assembler was disabled (ie. not an AMD64 or SSE 4.2+ capable machine). |
||||
|
Castagnoli: Castagnoli CRC type. |
||||
|
|
||||
|
BenchmarkStdCrc40B-4 10000000 158 ns/op 252.88 MB/s |
||||
|
BenchmarkCrc40BNoAsm-4 20000000 105 ns/op 377.38 MB/s (slice8) |
||||
|
BenchmarkCrc40B-4 20000000 105 ns/op 378.77 MB/s (slice8) |
||||
|
|
||||
|
BenchmarkStdCrc1KB-4 500000 3604 ns/op 284.10 MB/s |
||||
|
BenchmarkCrc1KBNoAsm-4 1000000 1463 ns/op 699.79 MB/s (slice8) |
||||
|
BenchmarkCrc1KB-4 3000000 396 ns/op 2583.69 MB/s (asm) |
||||
|
|
||||
|
BenchmarkStdCrc8KB-4 200000 11417 ns/op 717.48 MB/s (slice8) |
||||
|
BenchmarkCrc8KBNoAsm-4 200000 11317 ns/op 723.85 MB/s (slice8) |
||||
|
BenchmarkCrc8KB-4 500000 2919 ns/op 2805.73 MB/s (asm) |
||||
|
|
||||
|
BenchmarkStdCrc32KB-4 30000 45749 ns/op 716.24 MB/s (slice8) |
||||
|
BenchmarkCrc32KBNoAsm-4 30000 45109 ns/op 726.42 MB/s (slice8) |
||||
|
BenchmarkCrc32KB-4 100000 11497 ns/op 2850.09 MB/s (asm) |
||||
|
|
||||
|
BenchmarkStdNoAsmCastagnol40B-4 10000000 161 ns/op 246.94 MB/s |
||||
|
BenchmarkStdCastagnoli40B-4 50000000 28.4 ns/op 1410.69 MB/s (asm) |
||||
|
BenchmarkCastagnoli40BNoAsm-4 20000000 100 ns/op 398.01 MB/s (slice8) |
||||
|
BenchmarkCastagnoli40B-4 50000000 28.2 ns/op 1419.54 MB/s (asm) |
||||
|
|
||||
|
BenchmarkStdNoAsmCastagnoli1KB-4 500000 3622 ns/op 282.67 MB/s |
||||
|
BenchmarkStdCastagnoli1KB-4 10000000 144 ns/op 7099.78 MB/s (asm) |
||||
|
BenchmarkCastagnoli1KBNoAsm-4 1000000 1475 ns/op 694.14 MB/s (slice8) |
||||
|
BenchmarkCastagnoli1KB-4 10000000 146 ns/op 6993.35 MB/s (asm) |
||||
|
|
||||
|
BenchmarkStdNoAsmCastagnoli8KB-4 50000 28781 ns/op 284.63 MB/s |
||||
|
BenchmarkStdCastagnoli8KB-4 1000000 1029 ns/op 7957.89 MB/s (asm) |
||||
|
BenchmarkCastagnoli8KBNoAsm-4 200000 11410 ns/op 717.94 MB/s (slice8) |
||||
|
BenchmarkCastagnoli8KB-4 1000000 1000 ns/op 8188.71 MB/s (asm) |
||||
|
|
||||
|
BenchmarkStdNoAsmCastagnoli32KB-4 10000 115426 ns/op 283.89 MB/s |
||||
|
BenchmarkStdCastagnoli32KB-4 300000 4065 ns/op 8059.13 MB/s (asm) |
||||
|
BenchmarkCastagnoli32KBNoAsm-4 30000 45171 ns/op 725.41 MB/s (slice8) |
||||
|
BenchmarkCastagnoli32KB-4 500000 4077 ns/op 8035.89 MB/s (asm) |
||||
|
``` |
||||
|
|
||||
|
The IEEE assembler optimizations have been submitted and will be part of the Go 1.6 standard library. |
||||
|
|
||||
|
However, the improved use of slice-by-8 has not been submitted yet; it will probably be submitted for Go 1.7. |
||||
|
|
||||
|
# license |
||||
|
|
||||
|
Standard Go license. Changes are Copyright (c) 2015 Klaus Post under same conditions. |
||||
@ -0,0 +1,186 @@ |
|||||
|
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
|
// Use of this source code is governed by a BSD-style
|
||||
|
// license that can be found in the LICENSE file.
|
||||
|
|
||||
|
// Package crc32 implements the 32-bit cyclic redundancy check, or CRC-32,
|
||||
|
// checksum. See http://en.wikipedia.org/wiki/Cyclic_redundancy_check for
|
||||
|
// information.
|
||||
|
//
|
||||
|
// Polynomials are represented in LSB-first form also known as reversed representation.
|
||||
|
//
|
||||
|
// See http://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks#Reversed_representations_and_reciprocal_polynomials
|
||||
|
// for information.
|
||||
|
package crc32 |
||||
|
|
||||
|
import ( |
||||
|
"hash" |
||||
|
"sync" |
||||
|
) |
||||
|
|
||||
|
// The size of a CRC-32 checksum in bytes.
|
||||
|
const Size = 4 |
||||
|
|
||||
|
// Predefined polynomials.
|
||||
|
const ( |
||||
|
// IEEE is by far and away the most common CRC-32 polynomial.
|
||||
|
// Used by ethernet (IEEE 802.3), v.42, fddi, gzip, zip, png, ...
|
||||
|
IEEE = 0xedb88320 |
||||
|
|
||||
|
// Castagnoli's polynomial, used in iSCSI.
|
||||
|
// Has better error detection characteristics than IEEE.
|
||||
|
// http://dx.doi.org/10.1109/26.231911
|
||||
|
Castagnoli = 0x82f63b78 |
||||
|
|
||||
|
// Koopman's polynomial.
|
||||
|
// Also has better error detection characteristics than IEEE.
|
||||
|
// http://dx.doi.org/10.1109/DSN.2002.1028931
|
||||
|
Koopman = 0xeb31d82e |
||||
|
) |
||||
|
|
||||
|
// Table is a 256-word table representing the polynomial for efficient processing.
|
||||
|
type Table [256]uint32 |
||||
|
|
||||
|
// castagnoliTable points to a lazily initialized Table for the Castagnoli
|
||||
|
// polynomial. MakeTable will always return this value when asked to make a
|
||||
|
// Castagnoli table so we can compare against it to find when the caller is
|
||||
|
// using this polynomial.
|
||||
|
var castagnoliTable *Table |
||||
|
var castagnoliTable8 *slicing8Table |
||||
|
var castagnoliOnce sync.Once |
||||
|
|
||||
|
func castagnoliInit() { |
||||
|
castagnoliTable = makeTable(Castagnoli) |
||||
|
castagnoliTable8 = makeTable8(Castagnoli) |
||||
|
} |
||||
|
|
||||
|
// IEEETable is the table for the IEEE polynomial.
|
||||
|
var IEEETable = makeTable(IEEE) |
||||
|
|
||||
|
// slicing8Table is array of 8 Tables
|
||||
|
type slicing8Table [8]Table |
||||
|
|
||||
|
// ieeeTable8 is the slicing8Table for IEEE
|
||||
|
var ieeeTable8 *slicing8Table |
||||
|
var ieeeTable8Once sync.Once |
||||
|
|
||||
|
// MakeTable returns a Table constructed from the specified polynomial.
|
||||
|
// The contents of this Table must not be modified.
|
||||
|
func MakeTable(poly uint32) *Table { |
||||
|
switch poly { |
||||
|
case IEEE: |
||||
|
return IEEETable |
||||
|
case Castagnoli: |
||||
|
castagnoliOnce.Do(castagnoliInit) |
||||
|
return castagnoliTable |
||||
|
} |
||||
|
return makeTable(poly) |
||||
|
} |
||||
|
|
||||
|
// makeTable returns the Table constructed from the specified polynomial.
|
||||
|
func makeTable(poly uint32) *Table { |
||||
|
t := new(Table) |
||||
|
for i := 0; i < 256; i++ { |
||||
|
crc := uint32(i) |
||||
|
for j := 0; j < 8; j++ { |
||||
|
if crc&1 == 1 { |
||||
|
crc = (crc >> 1) ^ poly |
||||
|
} else { |
||||
|
crc >>= 1 |
||||
|
} |
||||
|
} |
||||
|
t[i] = crc |
||||
|
} |
||||
|
return t |
||||
|
} |
||||
|
|
||||
|
// makeTable8 returns slicing8Table constructed from the specified polynomial.
|
||||
|
func makeTable8(poly uint32) *slicing8Table { |
||||
|
t := new(slicing8Table) |
||||
|
t[0] = *makeTable(poly) |
||||
|
for i := 0; i < 256; i++ { |
||||
|
crc := t[0][i] |
||||
|
for j := 1; j < 8; j++ { |
||||
|
crc = t[0][crc&0xFF] ^ (crc >> 8) |
||||
|
t[j][i] = crc |
||||
|
} |
||||
|
} |
||||
|
return t |
||||
|
} |
||||
|
|
||||
|
// digest represents the partial evaluation of a checksum.
|
||||
|
type digest struct { |
||||
|
crc uint32 |
||||
|
tab *Table |
||||
|
} |
||||
|
|
||||
|
// New creates a new hash.Hash32 computing the CRC-32 checksum
|
||||
|
// using the polynomial represented by the Table.
|
||||
|
// Its Sum method will lay the value out in big-endian byte order.
|
||||
|
func New(tab *Table) hash.Hash32 { return &digest{0, tab} } |
||||
|
|
||||
|
// NewIEEE creates a new hash.Hash32 computing the CRC-32 checksum
|
||||
|
// using the IEEE polynomial.
|
||||
|
// Its Sum method will lay the value out in big-endian byte order.
|
||||
|
func NewIEEE() hash.Hash32 { return New(IEEETable) } |
||||
|
|
||||
|
func (d *digest) Size() int { return Size } |
||||
|
|
||||
|
func (d *digest) BlockSize() int { return 1 } |
||||
|
|
||||
|
func (d *digest) Reset() { d.crc = 0 } |
||||
|
|
||||
|
func update(crc uint32, tab *Table, p []byte) uint32 { |
||||
|
crc = ^crc |
||||
|
for _, v := range p { |
||||
|
crc = tab[byte(crc)^v] ^ (crc >> 8) |
||||
|
} |
||||
|
return ^crc |
||||
|
} |
||||
|
|
||||
|
// updateSlicingBy8 updates CRC using Slicing-by-8
|
||||
|
func updateSlicingBy8(crc uint32, tab *slicing8Table, p []byte) uint32 { |
||||
|
crc = ^crc |
||||
|
for len(p) > 8 { |
||||
|
crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24 |
||||
|
crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^ |
||||
|
tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^ |
||||
|
tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF] |
||||
|
p = p[8:] |
||||
|
} |
||||
|
crc = ^crc |
||||
|
if len(p) == 0 { |
||||
|
return crc |
||||
|
} |
||||
|
return update(crc, &tab[0], p) |
||||
|
} |
||||
|
|
||||
|
// Update returns the result of adding the bytes in p to the crc.
|
||||
|
func Update(crc uint32, tab *Table, p []byte) uint32 { |
||||
|
if tab == castagnoliTable { |
||||
|
return updateCastagnoli(crc, p) |
||||
|
} |
||||
|
if tab == IEEETable { |
||||
|
return updateIEEE(crc, p) |
||||
|
} |
||||
|
return update(crc, tab, p) |
||||
|
} |
||||
|
|
||||
|
func (d *digest) Write(p []byte) (n int, err error) { |
||||
|
d.crc = Update(d.crc, d.tab, p) |
||||
|
return len(p), nil |
||||
|
} |
||||
|
|
||||
|
func (d *digest) Sum32() uint32 { return d.crc } |
||||
|
|
||||
|
func (d *digest) Sum(in []byte) []byte { |
||||
|
s := d.Sum32() |
||||
|
return append(in, byte(s>>24), byte(s>>16), byte(s>>8), byte(s)) |
||||
|
} |
||||
|
|
||||
|
// Checksum returns the CRC-32 checksum of data
|
||||
|
// using the polynomial represented by the Table.
|
||||
|
func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) } |
||||
|
|
||||
|
// ChecksumIEEE returns the CRC-32 checksum of data
|
||||
|
// using the IEEE polynomial.
|
||||
|
func ChecksumIEEE(data []byte) uint32 { return updateIEEE(0, data) } |
||||
@ -0,0 +1,62 @@ |
|||||
|
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
|
// Use of this source code is governed by a BSD-style
|
||||
|
// license that can be found in the LICENSE file.
|
||||
|
|
||||
|
// +build !appengine,!gccgo
|
||||
|
|
||||
|
package crc32 |
||||
|
|
||||
|
// This file contains the code to call the SSE 4.2 version of the Castagnoli
|
||||
|
// and IEEE CRC.
|
||||
|
|
||||
|
// haveSSE41/haveSSE42/haveCLMUL are defined in crc32_amd64.s and use
|
||||
|
// CPUID to test for SSE 4.1, 4.2 and CLMUL support.
|
||||
|
func haveSSE41() bool |
||||
|
func haveSSE42() bool |
||||
|
func haveCLMUL() bool |
||||
|
|
||||
|
// castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE4.2 CRC32
|
||||
|
// instruction.
|
||||
|
//go:noescape
|
||||
|
func castagnoliSSE42(crc uint32, p []byte) uint32 |
||||
|
|
||||
|
// ieeeCLMUL is defined in crc32_amd64.s and uses the PCLMULQDQ
|
||||
|
// instruction as well as SSE 4.1.
|
||||
|
//go:noescape
|
||||
|
func ieeeCLMUL(crc uint32, p []byte) uint32 |
||||
|
|
||||
|
var sse42 = haveSSE42() |
||||
|
var useFastIEEE = haveCLMUL() && haveSSE41() |
||||
|
|
||||
|
func updateCastagnoli(crc uint32, p []byte) uint32 { |
||||
|
if sse42 { |
||||
|
return castagnoliSSE42(crc, p) |
||||
|
} |
||||
|
// only use slicing-by-8 when input is >= 16 Bytes
|
||||
|
if len(p) >= 16 { |
||||
|
return updateSlicingBy8(crc, castagnoliTable8, p) |
||||
|
} |
||||
|
return update(crc, castagnoliTable, p) |
||||
|
} |
||||
|
|
||||
|
func updateIEEE(crc uint32, p []byte) uint32 { |
||||
|
if useFastIEEE && len(p) >= 64 { |
||||
|
left := len(p) & 15 |
||||
|
do := len(p) - left |
||||
|
crc = ^ieeeCLMUL(^crc, p[:do]) |
||||
|
if left > 0 { |
||||
|
crc = update(crc, IEEETable, p[do:]) |
||||
|
} |
||||
|
return crc |
||||
|
} |
||||
|
|
||||
|
// only use slicing-by-8 when input is >= 16 Bytes
|
||||
|
if len(p) >= 16 { |
||||
|
ieeeTable8Once.Do(func() { |
||||
|
ieeeTable8 = makeTable8(IEEE) |
||||
|
}) |
||||
|
return updateSlicingBy8(crc, ieeeTable8, p) |
||||
|
} |
||||
|
|
||||
|
return update(crc, IEEETable, p) |
||||
|
} |
||||
@ -0,0 +1,237 @@ |
|||||
|
// Copyright 2011 The Go Authors. All rights reserved. |
||||
|
// Use of this source code is governed by a BSD-style |
||||
|
// license that can be found in the LICENSE file. |
||||
|
|
||||
|
// +build gc |
||||
|
|
||||
|
#define NOSPLIT 4 |
||||
|
#define RODATA 8 |
||||
|
|
||||
|
// func castagnoliSSE42(crc uint32, p []byte) uint32
//
// Folds p into crc with the CRC32 instruction: single bytes until the data
// pointer is 8-byte aligned, then 8 bytes per step, then any leftover bytes.
// The CRC is inverted on entry and on exit.
TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
	MOVL crc+0(FP), AX    // AX = running CRC
	MOVQ p+8(FP), SI      // SI = current data pointer
	MOVQ p_len+16(FP), CX // CX = bytes remaining

	NOTL AX

	// Fewer than 8 bytes in total: skip alignment, go byte by byte.
	CMPQ CX, $8
	JL   tail_bytes

head_bytes:
	// One byte at a time until SI is a multiple of 8.
	MOVQ SI, BX
	ANDQ $7, BX
	JZ   quad_words

	CRC32B (SI), AX
	DECQ   CX
	INCQ   SI
	JMP    head_bytes

quad_words:
	// SI is 8-byte aligned here; consume 8 bytes per iteration.
	CMPQ CX, $8
	JL   tail_bytes

	CRC32Q (SI), AX
	ADDQ   $8, SI
	SUBQ   $8, CX
	JMP    quad_words

tail_bytes:
	// Whatever is left (fewer than 8 bytes) is handled one byte at a time.
	CMPQ CX, $0
	JE   crc_done

	CRC32B (SI), AX
	INCQ   SI
	DECQ   CX
	JMP    tail_bytes

crc_done:
	NOTL AX
	MOVL AX, ret+32(FP)
	RET
||||
|
|
||||
|
// func haveSSE42() bool
//
// Returns bit 20 of ECX from CPUID leaf 1 (the SSE 4.2 feature flag).
TEXT ·haveSSE42(SB), NOSPLIT, $0
	MOVL $1, AX // select CPUID leaf 1
	CPUID
	SHRQ $20, CX
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET
||||
|
|
||||
|
// func haveCLMUL() bool
//
// Returns bit 1 of ECX from CPUID leaf 1 (the PCLMULQDQ feature flag).
TEXT ·haveCLMUL(SB), NOSPLIT, $0
	MOVL $1, AX // select CPUID leaf 1
	CPUID
	SHRQ $1, CX
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET
||||
|
|
||||
|
// func haveSSE41() bool
//
// Returns bit 19 of ECX from CPUID leaf 1 (the SSE 4.1 feature flag).
TEXT ·haveSSE41(SB), NOSPLIT, $0
	MOVL $1, AX // select CPUID leaf 1
	CPUID
	SHRQ $19, CX
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET
||||
|
|
||||
|
// CRC32 polynomial data |
||||
|
// |
||||
|
// These constants are lifted from the |
||||
|
// Linux kernel, since they avoid the costly |
||||
|
// PSHUFB 16 byte reversal proposed in the |
||||
|
// original Intel paper. |
||||
|
DATA r2r1kp<>+0(SB)/8, $0x154442bd4 |
||||
|
DATA r2r1kp<>+8(SB)/8, $0x1c6e41596 |
||||
|
DATA r4r3kp<>+0(SB)/8, $0x1751997d0 |
||||
|
DATA r4r3kp<>+8(SB)/8, $0x0ccaa009e |
||||
|
DATA rupolykp<>+0(SB)/8, $0x1db710641 |
||||
|
DATA rupolykp<>+8(SB)/8, $0x1f7011641 |
||||
|
DATA r5kp<>+0(SB)/8, $0x163cd6124 |
||||
|
|
||||
|
GLOBL r2r1kp<>(SB), RODATA, $16 |
||||
|
GLOBL r4r3kp<>(SB), RODATA, $16 |
||||
|
GLOBL rupolykp<>(SB), RODATA, $16 |
||||
|
GLOBL r5kp<>(SB), RODATA, $8 |
||||
|
|
||||
|
// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
// len(p) must be at least 64, and must be a multiple of 16.
//
// The routine does not invert the CRC itself: the Go caller passes ^crc and
// inverts the returned value.

// func ieeeCLMUL(crc uint32, p []byte) uint32
TEXT ·ieeeCLMUL(SB), NOSPLIT, $0
	MOVL crc+0(FP), X0    // Initial CRC value
	MOVQ p+8(FP), SI      // data pointer
	MOVQ p_len+16(FP), CX // len(p)

	// Load the first 64 bytes and mix the initial CRC into the first lane.
	MOVOU (SI), X1
	MOVOU 16(SI), X2
	MOVOU 32(SI), X3
	MOVOU 48(SI), X4
	PXOR  X0, X1
	ADDQ  $64, SI // buf+=64
	SUBQ  $64, CX // len-=64
	CMPQ  CX, $64 // Less than 64 bytes left
	JB    remain64

	MOVOA r2r1kp<>+0(SB), X0

	// Fold four 16-byte lanes (X1..X4) over the next 64 bytes of input.
loopback64:
	MOVOA X1, X5
	MOVOA X2, X6
	MOVOA X3, X7
	MOVOA X4, X8

	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0, X0, X2
	PCLMULQDQ $0, X0, X3
	PCLMULQDQ $0, X0, X4

	// Load next early
	MOVOU (SI), X11
	MOVOU 16(SI), X12
	MOVOU 32(SI), X13
	MOVOU 48(SI), X14

	PCLMULQDQ $0x11, X0, X5
	PCLMULQDQ $0x11, X0, X6
	PCLMULQDQ $0x11, X0, X7
	PCLMULQDQ $0x11, X0, X8

	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3
	PXOR X8, X4

	PXOR X11, X1
	PXOR X12, X2
	PXOR X13, X3
	PXOR X14, X4

	ADDQ $64, SI // buf+=64
	SUBQ $64, CX // len-=64
	CMPQ CX, $64 // Less than 64 bytes left?
	JGE  loopback64

	// Fold result into a single register (X1)
remain64:
	MOVOA r4r3kp<>+0(SB), X0

	MOVOA     X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR      X5, X1
	PXOR      X2, X1

	MOVOA     X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR      X5, X1
	PXOR      X3, X1

	MOVOA     X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR      X5, X1
	PXOR      X4, X1

	// At least 16 bytes left?
	CMPQ CX, $16
	JB   finish

	// Encode 16 bytes
remain16:
	MOVOU     (SI), X10
	MOVOA     X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR      X5, X1
	PXOR      X10, X1
	SUBQ      $16, CX
	ADDQ      $16, SI
	CMPQ      CX, $16
	JGE       remain16

finish:
	// Fold final result into 32 bits and return it
	PCMPEQB   X3, X3
	PCLMULQDQ $1, X1, X0
	PSRLDQ    $8, X1
	PXOR      X0, X1

	MOVOA X1, X2
	MOVQ  r5kp<>+0(SB), X0

	// Creates 32 bit mask. Note that we don't care about upper half.
	PSRLQ $32, X3

	PSRLDQ    $4, X2
	PAND      X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR      X2, X1

	MOVOA rupolykp<>+0(SB), X0

	MOVOA     X1, X2
	PAND      X3, X1
	PCLMULQDQ $0x10, X0, X1
	PAND      X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR      X2, X1

	// PEXTRD $1, X1, AX (SSE 4.1)
	BYTE $0x66; BYTE $0x0f; BYTE $0x3a
	BYTE $0x16; BYTE $0xc8; BYTE $0x01
	MOVL AX, ret+32(FP)

	RET
||||
@ -0,0 +1,40 @@ |
|||||
|
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
|
// Use of this source code is governed by a BSD-style
|
||||
|
// license that can be found in the LICENSE file.
|
||||
|
|
||||
|
// +build !appengine,!gccgo
|
||||
|
|
||||
|
package crc32 |
||||
|
|
||||
|
// This file contains the code to call the SSE 4.2 version of the Castagnoli
|
||||
|
// CRC.
|
||||
|
|
||||
|
// haveSSE42 is defined in crc32_amd64p32.s and uses CPUID to test for SSE 4.2
|
||||
|
// support.
|
||||
|
func haveSSE42() bool |
||||
|
|
||||
|
// castagnoliSSE42 is defined in crc32_amd64p32.s and uses the SSE4.2 CRC32
|
||||
|
// instruction.
|
||||
|
//go:noescape
|
||||
|
func castagnoliSSE42(crc uint32, p []byte) uint32 |
||||
|
|
||||
|
var sse42 = haveSSE42() |
||||
|
|
||||
|
func updateCastagnoli(crc uint32, p []byte) uint32 { |
||||
|
if sse42 { |
||||
|
return castagnoliSSE42(crc, p) |
||||
|
} |
||||
|
return update(crc, castagnoliTable, p) |
||||
|
} |
||||
|
|
||||
|
func updateIEEE(crc uint32, p []byte) uint32 { |
||||
|
// only use slicing-by-8 when input is >= 4KB
|
||||
|
if len(p) >= 4096 { |
||||
|
ieeeTable8Once.Do(func() { |
||||
|
ieeeTable8 = makeTable8(IEEE) |
||||
|
}) |
||||
|
return updateSlicingBy8(crc, ieeeTable8, p) |
||||
|
} |
||||
|
|
||||
|
return update(crc, IEEETable, p) |
||||
|
} |
||||
@ -0,0 +1,67 @@ |
|||||
|
// Copyright 2011 The Go Authors. All rights reserved. |
||||
|
// Use of this source code is governed by a BSD-style |
||||
|
// license that can be found in the LICENSE file. |
||||
|
|
||||
|
// +build gc |
||||
|
|
||||
|
#define NOSPLIT 4 |
||||
|
#define RODATA 8 |
||||
|
|
||||
|
// func castagnoliSSE42(crc uint32, p []byte) uint32
//
// Folds p into crc with the CRC32 instruction: single bytes until the data
// pointer is 8-byte aligned, then 8 bytes per step, then any leftover bytes.
// The CRC is inverted on entry and on exit. Pointer and length are loaded
// as 32-bit values from the argument frame.
TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
	MOVL crc+0(FP), AX   // AX = running CRC
	MOVL p+4(FP), SI     // SI = current data pointer
	MOVL p_len+8(FP), CX // CX = bytes remaining

	NOTL AX

	// Fewer than 8 bytes in total: skip alignment, go byte by byte.
	CMPQ CX, $8
	JL   tail_bytes

head_bytes:
	// One byte at a time until SI is a multiple of 8.
	MOVQ SI, BX
	ANDQ $7, BX
	JZ   quad_words

	CRC32B (SI), AX
	DECQ   CX
	INCQ   SI
	JMP    head_bytes

quad_words:
	// SI is 8-byte aligned here; consume 8 bytes per iteration.
	CMPQ CX, $8
	JL   tail_bytes

	CRC32Q (SI), AX
	ADDQ   $8, SI
	SUBQ   $8, CX
	JMP    quad_words

tail_bytes:
	// Whatever is left (fewer than 8 bytes) is handled one byte at a time.
	CMPQ CX, $0
	JE   crc_done

	CRC32B (SI), AX
	INCQ   SI
	DECQ   CX
	JMP    tail_bytes

crc_done:
	NOTL AX
	MOVL AX, ret+16(FP)
	RET
||||
|
|
||||
|
// func haveSSE42() bool
//
// Returns bit 20 of ECX from CPUID leaf 1 (the SSE 4.2 feature flag).
TEXT ·haveSSE42(SB), NOSPLIT, $0
	MOVL $1, AX // select CPUID leaf 1
	CPUID
	SHRQ $20, CX
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET
||||
|
|
||||
@ -0,0 +1,29 @@ |
|||||
|
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
|
// Use of this source code is governed by a BSD-style
|
||||
|
// license that can be found in the LICENSE file.
|
||||
|
|
||||
|
// +build !amd64,!amd64p32 appengine gccgo
|
||||
|
|
||||
|
package crc32 |
||||
|
|
||||
|
// This file contains the generic version of updateCastagnoli which does
|
||||
|
// slicing-by-8, or uses the fallback for very small sizes.
|
||||
|
|
||||
|
func updateCastagnoli(crc uint32, p []byte) uint32 { |
||||
|
// only use slicing-by-8 when input is >= 16 Bytes
|
||||
|
if len(p) >= 16 { |
||||
|
return updateSlicingBy8(crc, castagnoliTable8, p) |
||||
|
} |
||||
|
return update(crc, castagnoliTable, p) |
||||
|
} |
||||
|
|
||||
|
func updateIEEE(crc uint32, p []byte) uint32 { |
||||
|
// only use slicing-by-8 when input is >= 16 Bytes
|
||||
|
if len(p) >= 16 { |
||||
|
ieeeTable8Once.Do(func() { |
||||
|
ieeeTable8 = makeTable8(IEEE) |
||||
|
}) |
||||
|
return updateSlicingBy8(crc, ieeeTable8, p) |
||||
|
} |
||||
|
return update(crc, IEEETable, p) |
||||
|
} |
||||
Write
Preview
Loading…
Cancel
Save
Reference in new issue