Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions common/bitutil/and_amd64.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// inspired by: https://github.com/golang/go/blob/4a3cef2036097d323b6cc0bbe90fc4d8c7588660/src/crypto/internal/fips140/subtle/xor_amd64.s

//go:build !purego

#include "textflag.h"

// func andBytesASM(dst, a, b *byte, n int)
//
// Stores a[i] & b[i] into dst[i] for every i in [0, n).
//
// n must be greater than zero: with n == 0 execution falls straight into
// loop16b, which handles 16 bytes before its first exit test and only stops
// when its offset equals n.
//
// Strategy: while n is not a multiple of 16, trailing bytes are processed from
// the end of the buffers (one byte at a time until n is a multiple of 8, then
// at most one 8-byte word), shrinking n as they go. The remaining
// multiple-of-16 prefix is processed front to back, 16 bytes per iteration, in
// loop16b. "Aligned" in the label names refers to the remaining length, not to
// the pointers; MOVOU accepts unaligned addresses.
TEXT ·andBytesASM(SB), NOSPLIT, $0
MOVQ dst+0(FP), BX // BX = dst
MOVQ a+8(FP), SI // SI = a
MOVQ b+16(FP), CX // CX = b
MOVQ n+24(FP), DX // DX = n, the number of bytes still to process
TESTQ $15, DX // n is not a multiple of 16: trim the tail first.
JNZ not_aligned

aligned:
MOVQ $0, AX // AX = byte offset into all three buffers

PCALIGN $16 // start the 16-byte loop on a 16-byte code boundary
loop16b:
MOVOU (SI)(AX*1), X0 // X0 = 16 bytes of a at offset AX (walking forwards)
MOVOU (CX)(AX*1), X1 // X1 = 16 bytes of b at offset AX
PAND X1, X0 // X0 &= X1
MOVOU X0, (BX)(AX*1) // store the 16 result bytes to dst at offset AX
ADDQ $16, AX
CMPQ DX, AX // done once the offset reaches the remaining length
JNE loop16b
RET

PCALIGN $16
loop_1b:
SUBQ $1, DX // step back one byte; DX now indexes the last unprocessed byte
MOVB (SI)(DX*1), DI
MOVB (CX)(DX*1), AX
ANDB AX, DI
MOVB DI, (BX)(DX*1)
TESTQ $7, DX // keep going byte by byte until DX is a multiple of 8
JNZ loop_1b
CMPQ DX, $0 // nothing left: return
JE ret
TESTQ $15, DX // DX is a non-zero multiple of 16: finish in the 16-byte loop
JZ aligned

not_aligned:
TESTQ $7, DX // DX is not a multiple of 8: peel single bytes first
JNE loop_1b
SUBQ $8, DX // DX was 8 mod 16: process one 8-byte word from the end
MOVQ (SI)(DX*1), DI
MOVQ (CX)(DX*1), AX
ANDQ AX, DI
MOVQ DI, (BX)(DX*1)
CMPQ DX, $16 // DX is now a multiple of 16; if any bytes remain, use the 16-byte loop
JGE aligned

ret:
RET
70 changes: 70 additions & 0 deletions common/bitutil/and_arm64.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// inspired by: https://github.com/golang/go/blob/4a3cef2036097d323b6cc0bbe90fc4d8c7588660/src/crypto/internal/fips140/subtle/xor_arm64.s

//go:build !purego

#include "textflag.h"

// func andBytesASM(dst, a, b *byte, n int)
//
// Stores a[i] & b[i] into dst[i] for every i in [0, n); n == 0 is a no-op.
//
// Full 64-byte chunks are handled with vector registers in loop_64. The
// remaining n < 64 bytes are then handled by testing the low six bits of n in
// turn, from bit 5 (32 bytes) down to bit 0 (1 byte). The .P suffix selects
// post-increment addressing, so R0, R1 and R2 advance past each chunk as it is
// processed.
TEXT ·andBytesASM(SB), NOSPLIT|NOFRAME, $0
MOVD dst+0(FP), R0 // R0 = dst cursor
MOVD a+8(FP), R1 // R1 = a cursor
MOVD b+16(FP), R2 // R2 = b cursor
MOVD n+24(FP), R3 // R3 = bytes still to process
CMP $64, R3
BLT tail // fewer than 64 bytes: skip the vector loop
loop_64:
VLD1.P 64(R1), [V0.B16, V1.B16, V2.B16, V3.B16] // load 64 bytes of a
VLD1.P 64(R2), [V4.B16, V5.B16, V6.B16, V7.B16] // load 64 bytes of b
VAND V0.B16, V4.B16, V4.B16
VAND V1.B16, V5.B16, V5.B16
VAND V2.B16, V6.B16, V6.B16
VAND V3.B16, V7.B16, V7.B16
VST1.P [V4.B16, V5.B16, V6.B16, V7.B16], 64(R0) // store 64 result bytes to dst
SUBS $64, R3 // R3 -= 64 (the flags it sets are replaced by the CMP below)
CMP $64, R3
BGE loop_64 // another full 64-byte chunk remains
tail:
// Nothing left: return immediately.
CBZ R3, end
TBZ $5, R3, less_than32 // bit 5 clear: no 32-byte chunk
VLD1.P 32(R1), [V0.B16, V1.B16]
VLD1.P 32(R2), [V2.B16, V3.B16]
VAND V0.B16, V2.B16, V2.B16
VAND V1.B16, V3.B16, V3.B16
VST1.P [V2.B16, V3.B16], 32(R0)
less_than32:
TBZ $4, R3, less_than16 // bit 4 clear: no 16-byte chunk
LDP.P 16(R1), (R11, R12)
LDP.P 16(R2), (R13, R14)
AND R11, R13, R13
AND R12, R14, R14
STP.P (R13, R14), 16(R0)
less_than16:
TBZ $3, R3, less_than8 // bit 3 clear: no 8-byte chunk
MOVD.P 8(R1), R11
MOVD.P 8(R2), R12
AND R11, R12, R12
MOVD.P R12, 8(R0)
less_than8:
TBZ $2, R3, less_than4 // bit 2 clear: no 4-byte chunk
MOVWU.P 4(R1), R13
MOVWU.P 4(R2), R14
ANDW R13, R14, R14
MOVWU.P R14, 4(R0)
less_than4:
TBZ $1, R3, less_than2 // bit 1 clear: no 2-byte chunk
MOVHU.P 2(R1), R15
MOVHU.P 2(R2), R16
ANDW R15, R16, R16
MOVHU.P R16, 2(R0)
less_than2:
TBZ $0, R3, end // bit 0 clear: no final single byte
// Last byte: no post-increment needed because nothing follows.
MOVBU (R1), R17
MOVBU (R2), R19
ANDW R17, R19, R19
MOVBU R19, (R0)
end:
RET
20 changes: 20 additions & 0 deletions common/bitutil/and_asm.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// inspired by: https://github.com/golang/go/blob/4a3cef2036097d323b6cc0bbe90fc4d8c7588660/src/crypto/internal/fips140/subtle/xor_asm.go

//go:build (amd64 || arm64) && !purego

package bitutil

// andBytes stores a[i] & b[i] into dst[i] for the first
// n = min(len(a), len(b)) bytes, using the assembly kernel, and returns n.
//
// It panics if dst holds fewer than n bytes. The assembly routine works on raw
// pointers and performs no bounds checks of its own, so the destination length
// has to be validated here, before any byte is written.
func andBytes(dst, a, b []byte) int {
	n := min(len(a), len(b))
	if n == 0 {
		// Nothing to do. Returning early also keeps &a[0] / &b[0] below
		// from indexing an empty slice, and the kernel is never called
		// with a zero length.
		return 0
	}
	// Single up-front bounds check: without it a dst with 0 < len(dst) < n
	// would let the kernel write past the end of dst's backing array.
	_ = dst[n-1]
	andBytesASM(&dst[0], &a[0], &b[0], n)
	return n
}

// andBytesASM is implemented in assembly (and_amd64.s, and_arm64.s). It stores
// a[i] & b[i] into dst[i] for every i in [0, n). It works on raw pointers, so
// the caller must guarantee that dst, a and b each address at least n bytes.
// The amd64 implementation additionally requires n > 0.
//
//go:noescape
func andBytesASM(dst, a, b *byte, n int)
38 changes: 38 additions & 0 deletions common/bitutil/and_generic.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build (!amd64 && !arm64) || purego

package bitutil

import "unsafe"

// andBytes is the portable implementation behind ANDBytes. It picks the
// word-at-a-time routine when the platform tolerates unaligned memory access
// and falls back to the byte-at-a-time routine otherwise.
func andBytes(dst, a, b []byte) int {
	if !supportsUnaligned {
		// Unaligned word access is not available: go byte by byte.
		return safeANDBytes(dst, a, b)
	}
	return fastANDBytes(dst, a, b)
}

// fastANDBytes ands in bulk. It only works on architectures that support
// unaligned read/writes.
//
// It stores a[i] & b[i] into dst[i] for the first n = min(len(a), len(b))
// bytes and returns n. It panics if dst holds fewer than n bytes.
func fastANDBytes(dst, a, b []byte) int {
	n := min(len(a), len(b))
	if n == 0 {
		return 0
	}
	// Validate dst once, before anything is written. The word loop below
	// goes through unsafe views, so it cannot rely on per-element checks of
	// the byte slice to catch a destination that is too short.
	_ = dst[n-1]

	w := n / wordSize
	if w > 0 {
		// View the first w*wordSize bytes of each slice as exactly w
		// machine words. unsafe.Slice yields views with the correct length,
		// unlike reinterpreting the []byte header, whose length is counted
		// in bytes.
		dw := unsafe.Slice((*uintptr)(unsafe.Pointer(&dst[0])), w)
		aw := unsafe.Slice((*uintptr)(unsafe.Pointer(&a[0])), w)
		bw := unsafe.Slice((*uintptr)(unsafe.Pointer(&b[0])), w)
		for i := range dw {
			dw[i] = aw[i] & bw[i]
		}
	}
	// Finish the bytes that do not fill a whole word.
	for i := w * wordSize; i < n; i++ {
		dst[i] = a[i] & b[i]
	}
	return n
}
27 changes: 1 addition & 26 deletions common/bitutil/bitutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,32 +62,7 @@ func safeXORBytes(dst, a, b []byte) int {
// ANDBytes computes the bitwise AND of a and b, byte by byte, and stores the
// result in dst. Only the first min(len(a), len(b)) bytes are processed, and
// that count is returned. The destination is assumed to have enough space.
func ANDBytes(dst, a, b []byte) int {
if supportsUnaligned {
return fastANDBytes(dst, a, b)
}
return safeANDBytes(dst, a, b)
}

// fastANDBytes ands in bulk. It only works on architectures that support
// unaligned read/writes.
func fastANDBytes(dst, a, b []byte) int {
n := len(a)
if len(b) < n {
n = len(b)
}
w := n / wordSize
if w > 0 {
dw := *(*[]uintptr)(unsafe.Pointer(&dst))
aw := *(*[]uintptr)(unsafe.Pointer(&a))
bw := *(*[]uintptr)(unsafe.Pointer(&b))
for i := 0; i < w; i++ {
dw[i] = aw[i] & bw[i]
}
}
for i := n - n%wordSize; i < n; i++ {
dst[i] = a[i] & b[i]
}
return n
return andBytes(dst, a, b)
}

// safeANDBytes ands one by one. It works on all architectures, independent if
Expand Down