Skip to content

Commit

Permalink
use popcnt if available on amd64
Browse files Browse the repository at this point in the history
  • Loading branch information
salviati committed Feb 16, 2014
1 parent 3582df1 commit c2dab7c
Show file tree
Hide file tree
Showing 5 changed files with 263 additions and 52 deletions.
62 changes: 10 additions & 52 deletions bitset.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,37 +213,10 @@ func (b *BitSet) Copy(c *BitSet) (count uint) {
return
}

// From Wikipedia: http://en.wikipedia.org/wiki/Hamming_weight
const m1 uint64 = 0x5555555555555555 //binary: 0101...
const m2 uint64 = 0x3333333333333333 //binary: 00110011..
const m4 uint64 = 0x0f0f0f0f0f0f0f0f //binary: 4 zeros, 4 ones ...
const m8 uint64 = 0x00ff00ff00ff00ff //binary: 8 zeros, 8 ones ...
const m16 uint64 = 0x0000ffff0000ffff //binary: 16 zeros, 16 ones ...
const m32 uint64 = 0x00000000ffffffff //binary: 32 zeros, 32 ones
const hff uint64 = 0xffffffffffffffff //binary: all ones
const h01 uint64 = 0x0101010101010101 //the sum of 256 to the power of 0,1,2,3...

// From Wikipedia: count number of set bits.
// This is algorithm popcount_2 in the article retrieved May 9, 2011

func popcount_2(x uint64) uint64 {
x -= (x >> 1) & m1 //put count of each 2 bits into those 2 bits
x = (x & m2) + ((x >> 2) & m2) //put count of each 4 bits into those 4 bits
x = (x + (x >> 4)) & m4 //put count of each 8 bits into those 8 bits
x += x >> 8 //put count of each 16 bits into their lowest 8 bits
x += x >> 16 //put count of each 32 bits into their lowest 8 bits
x += x >> 32 //put count of each 64 bits into their lowest 8 bits
return x & 0x7f
}

// Count returns the number of set bits in b.
// It returns 0 when b is nil or when its word slice b.set is nil.
func (b *BitSet) Count() uint {
if b != nil && b.set != nil {
cnt := uint64(0)
for _, word := range b.set {
cnt += popcount_2(word)
}
return uint(cnt)
return uint(popcntSlice(b.set))
}
return 0
}
Expand Down Expand Up @@ -313,12 +286,8 @@ func (b *BitSet) DifferenceCardinality(compare *BitSet) uint {
l = int(b.wordCount())
}
cnt := uint64(0)
for i := 0; i < l; i++ {
cnt += popcount_2(b.set[i] &^ compare.set[i])
}
for i := l; i < len(b.set); i++ {
cnt += popcount_2(b.set[i])
}
cnt += popcntMaskSlice(b.set[:l], compare.set[:l])
cnt += popcntSlice(b.set[l:])
return uint(cnt)
}

Expand Down Expand Up @@ -365,10 +334,7 @@ func (b *BitSet) IntersectionCardinality(compare *BitSet) uint {
panicIfNull(b)
panicIfNull(compare)
b, compare = sortByLength(b, compare)
cnt := uint64(0)
for i, word := range b.set {
cnt += popcount_2(word & compare.set[i])
}
cnt := popcntAndSlice(b.set, compare.set)
return uint(cnt)
}

Expand Down Expand Up @@ -410,14 +376,10 @@ func (b *BitSet) UnionCardinality(compare *BitSet) uint {
panicIfNull(b)
panicIfNull(compare)
b, compare = sortByLength(b, compare)
cnt := uint64(0)
for i, word := range b.set {
cnt += popcount_2(word | compare.set[i])
cnt := popcntOrSlice(b.set, compare.set)
if len(compare.set) > len(b.set) {
cnt += popcntSlice(compare.set[len(b.set):])
}
for i := len(b.set); i < len(compare.set); i++ {
cnt += popcount_2(compare.set[i])
}

return uint(cnt)
}

Expand Down Expand Up @@ -462,14 +424,10 @@ func (b *BitSet) SymmetricDifferenceCardinality(compare *BitSet) uint {
panicIfNull(b)
panicIfNull(compare)
b, compare = sortByLength(b, compare)
cnt := uint64(0)
for i, word := range b.set {
cnt += popcount_2(word ^ compare.set[i])
cnt := popcntXorSlice(b.set, compare.set)
if len(compare.set) > len(b.set) {
cnt += popcntSlice(compare.set[len(b.set):])
}
for i := len(b.set); i < len(compare.set); i++ {
cnt += popcount_2(compare.set[i])
}

return uint(cnt)
}

Expand Down
64 changes: 64 additions & 0 deletions popcnt.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package bitset

// Bit masks used by the parallel bit-counting routine below.
// From Wikipedia: http://en.wikipedia.org/wiki/Hamming_weight
const (
	m1  uint64 = 0x5555555555555555 // binary: 0101...
	m2  uint64 = 0x3333333333333333 // binary: 00110011..
	m4  uint64 = 0x0f0f0f0f0f0f0f0f // binary: 4 zeros, 4 ones ...
	m8  uint64 = 0x00ff00ff00ff00ff // binary: 8 zeros, 8 ones ...
	m16 uint64 = 0x0000ffff0000ffff // binary: 16 zeros, 16 ones ...
	m32 uint64 = 0x00000000ffffffff // binary: 32 zeros, 32 ones
	hff uint64 = 0xffffffffffffffff // binary: all ones
	h01 uint64 = 0x0101010101010101 // the sum of 256 to the power of 0,1,2,3...
)

// popcount_2 returns the number of set bits in x.
// From Wikipedia: this is algorithm popcount_2 in the Hamming weight
// article retrieved May 9, 2011. Partial counts are added in parallel in
// progressively wider fields of the word.
func popcount_2(x uint64) uint64 {
	// Each 2-bit field now holds the count of its own 2 bits.
	pairs := x - ((x >> 1) & m1)
	// Each 4-bit field now holds the count of its own 4 bits.
	nibbles := (pairs & m2) + ((pairs >> 2) & m2)
	// Each byte now holds the count of its own 8 bits.
	sums := (nibbles + (nibbles >> 4)) & m4
	// Fold the per-byte counts down into the lowest byte.
	sums += sums >> 8
	sums += sums >> 16
	sums += sums >> 32
	// The total is at most 64, so it fits in the low 7 bits.
	return sums & 0x7f
}

// popcntSliceGo returns the total number of set bits across all words of s.
func popcntSliceGo(s []uint64) uint64 {
	var total uint64
	for i := 0; i < len(s); i++ {
		total += popcount_2(s[i])
	}
	return total
}

// popcntMaskSliceGo returns the number of set bits in s[i] &^ m[i],
// summed over every index of s. m is indexed by the positions of s.
func popcntMaskSliceGo(s, m []uint64) uint64 {
	var total uint64
	for i, word := range s {
		total += popcount_2(word &^ m[i])
	}
	return total
}

// popcntAndSliceGo returns the number of set bits in s[i] & m[i],
// summed over every index of s. m is indexed by the positions of s.
func popcntAndSliceGo(s, m []uint64) uint64 {
	var total uint64
	for i, word := range s {
		total += popcount_2(word & m[i])
	}
	return total
}

// popcntOrSliceGo returns the number of set bits in s[i] | m[i],
// summed over every index of s. m is indexed by the positions of s.
func popcntOrSliceGo(s, m []uint64) uint64 {
	var total uint64
	for i, word := range s {
		total += popcount_2(word | m[i])
	}
	return total
}

// popcntXorSliceGo returns the number of set bits in s[i] ^ m[i],
// summed over every index of s. m is indexed by the positions of s.
func popcntXorSliceGo(s, m []uint64) uint64 {
	var total uint64
	for i, word := range s {
		total += popcount_2(word ^ m[i])
	}
	return total
}
102 changes: 102 additions & 0 deletions popcnt_amd64.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// func hasAsm() bool
// Runs CPUID with AX = 1 and returns bit 23 of CX as the boolean result.
// In CPUID leaf 1 that ECX bit is the POPCNT feature flag.
TEXT ·hasAsm(SB),4,$0
// Select CPUID leaf 1.
MOVQ $1, AX
CPUID
// Shift bit 23 down to bit 0 and discard every other bit.
SHRQ $23, CX
ANDQ $1, CX
// Store the resulting 0/1 byte as the return value.
MOVB CX, ret+0(FP)
RET


// POPCNTQ_DX_DX emits the raw machine-code bytes for POPCNTQ DX, DX
// (F3 REX.W 0F B8 /r with ModRM 0xd2): DX is replaced by its own bit count.
#define POPCNTQ_DX_DX BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xb8; BYTE $0xd2

// func popcntSliceAsm(s []uint64) uint64
// Returns the sum of POPCNT over every word of s.
// AX accumulates the total, SI points at the current word and CX holds
// the number of words still to process.
TEXT ·popcntSliceAsm(SB),4,$0-32
// total = 0
XORQ AX, AX
// SI = pointer to the first word, CX = len(s)
MOVQ s+0(FP), SI
MOVQ s+8(FP), CX
// An empty slice skips the loop entirely: LOOP decrements CX before
// testing it, so the loop body must never be entered with CX == 0.
TESTQ CX, CX
JZ popcntSliceEnd
popcntSliceLoop:
BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xb8; BYTE $0x16 // POPCNTQ (SI), DX
ADDQ DX, AX
// Advance to the next 8-byte word.
ADDQ $8, SI
// Decrement CX and repeat while it is non-zero.
LOOP popcntSliceLoop
popcntSliceEnd:
MOVQ AX, ret+24(FP)
RET

// func popcntMaskSliceAsm(s, m []uint64) uint64
// Returns the sum of POPCNT(s[i] &^ m[i]) for i in [0, len(s)).
// Only len(s) is read; the length of m is never consulted, so the caller
// must supply an m with at least len(s) words.
TEXT ·popcntMaskSliceAsm(SB),4,$0-56
// total = 0
XORQ AX, AX
// SI = pointer to s, CX = len(s)
MOVQ s+0(FP), SI
MOVQ s+8(FP), CX
// An empty s skips the loop, which must not be entered with CX == 0.
TESTQ CX, CX
JZ popcntMaskSliceEnd
// DI = pointer to m
MOVQ m+24(FP), DI
popcntMaskSliceLoop:
// DX = ^m[i] & s[i]
MOVQ (DI), DX
NOTQ DX
ANDQ (SI), DX
// DX = popcount(DX)
POPCNTQ_DX_DX
ADDQ DX, AX
// Advance both pointers by one 8-byte word.
ADDQ $8, SI
ADDQ $8, DI
// Decrement CX and repeat while it is non-zero.
LOOP popcntMaskSliceLoop
popcntMaskSliceEnd:
MOVQ AX, ret+48(FP)
RET

// func popcntAndSliceAsm(s, m []uint64) uint64
// Returns the sum of POPCNT(s[i] & m[i]) for i in [0, len(s)).
// Only len(s) is read; the length of m is never consulted, so the caller
// must supply an m with at least len(s) words.
TEXT ·popcntAndSliceAsm(SB),4,$0-56
// total = 0
XORQ AX, AX
// SI = pointer to s, CX = len(s)
MOVQ s+0(FP), SI
MOVQ s+8(FP), CX
// An empty s skips the loop, which must not be entered with CX == 0.
TESTQ CX, CX
JZ popcntAndSliceEnd
// DI = pointer to m
MOVQ m+24(FP), DI
popcntAndSliceLoop:
// DX = m[i] & s[i]
MOVQ (DI), DX
ANDQ (SI), DX
// DX = popcount(DX)
POPCNTQ_DX_DX
ADDQ DX, AX
// Advance both pointers by one 8-byte word.
ADDQ $8, SI
ADDQ $8, DI
// Decrement CX and repeat while it is non-zero.
LOOP popcntAndSliceLoop
popcntAndSliceEnd:
MOVQ AX, ret+48(FP)
RET

// func popcntOrSliceAsm(s, m []uint64) uint64
// Returns the sum of POPCNT(s[i] | m[i]) for i in [0, len(s)).
// Only len(s) is read; the length of m is never consulted, so the caller
// must supply an m with at least len(s) words.
TEXT ·popcntOrSliceAsm(SB),4,$0-56
// total = 0
XORQ AX, AX
// SI = pointer to s, CX = len(s)
MOVQ s+0(FP), SI
MOVQ s+8(FP), CX
// An empty s skips the loop, which must not be entered with CX == 0.
TESTQ CX, CX
JZ popcntOrSliceEnd
// DI = pointer to m
MOVQ m+24(FP), DI
popcntOrSliceLoop:
// DX = m[i] | s[i]
MOVQ (DI), DX
ORQ (SI), DX
// DX = popcount(DX)
POPCNTQ_DX_DX
ADDQ DX, AX
// Advance both pointers by one 8-byte word.
ADDQ $8, SI
ADDQ $8, DI
// Decrement CX and repeat while it is non-zero.
LOOP popcntOrSliceLoop
popcntOrSliceEnd:
MOVQ AX, ret+48(FP)
RET

// func popcntXorSliceAsm(s, m []uint64) uint64
// Returns the sum of POPCNT(s[i] ^ m[i]) for i in [0, len(s)).
// Only len(s) is read; the length of m is never consulted, so the caller
// must supply an m with at least len(s) words.
TEXT ·popcntXorSliceAsm(SB),4,$0-56
// total = 0
XORQ AX, AX
// SI = pointer to s, CX = len(s)
MOVQ s+0(FP), SI
MOVQ s+8(FP), CX
// An empty s skips the loop, which must not be entered with CX == 0.
TESTQ CX, CX
JZ popcntXorSliceEnd
// DI = pointer to m
MOVQ m+24(FP), DI
popcntXorSliceLoop:
// DX = m[i] ^ s[i]
MOVQ (DI), DX
XORQ (SI), DX
// DX = popcount(DX)
POPCNTQ_DX_DX
ADDQ DX, AX
// Advance both pointers by one 8-byte word.
ADDQ $8, SI
ADDQ $8, DI
// Decrement CX and repeat while it is non-zero.
LOOP popcntXorSliceLoop
popcntXorSliceEnd:
MOVQ AX, ret+48(FP)
RET
64 changes: 64 additions & 0 deletions popcnt_asm.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
// +build amd64

package bitset

// hasAsm is implemented in popcnt_amd64.s. It runs CPUID with AX = 1 and
// returns bit 23 of CX, which is the POPCNT feature flag.
//go:noescape

func hasAsm() bool

// useAsm is evaluated once at package initialisation. When true, the
// popcnt*Slice wrappers in this file call the assembly routines instead of
// the pure-Go ones.
var useAsm = hasAsm()

// The functions below are implemented in popcnt_amd64.s using the POPCNT
// instruction. The two-slice variants iterate over len(s) words and never
// read the length of m, so m must hold at least len(s) words.

// popcntSliceAsm returns the number of set bits in all words of s.
//go:noescape

func popcntSliceAsm(s []uint64) uint64

// popcntMaskSliceAsm returns the number of set bits in s[i] &^ m[i],
// summed over every index of s.
//go:noescape

func popcntMaskSliceAsm(s, m []uint64) uint64

// popcntAndSliceAsm returns the number of set bits in s[i] & m[i],
// summed over every index of s.
//go:noescape

func popcntAndSliceAsm(s, m []uint64) uint64

// popcntOrSliceAsm returns the number of set bits in s[i] | m[i],
// summed over every index of s.
//go:noescape

func popcntOrSliceAsm(s, m []uint64) uint64

// popcntXorSliceAsm returns the number of set bits in s[i] ^ m[i],
// summed over every index of s.
//go:noescape

func popcntXorSliceAsm(s, m []uint64) uint64

// popcntSlice counts the set bits in s, dispatching to the assembly
// routine when useAsm is true and to the pure-Go loop otherwise.
func popcntSlice(s []uint64) uint64 {
	if !useAsm {
		return popcntSliceGo(s)
	}
	return popcntSliceAsm(s)
}

// popcntMaskSlice counts the set bits of s &^ m word by word, dispatching
// to the assembly routine when useAsm is true and to the pure-Go loop
// otherwise.
func popcntMaskSlice(s, m []uint64) uint64 {
	if !useAsm {
		return popcntMaskSliceGo(s, m)
	}
	return popcntMaskSliceAsm(s, m)
}

// popcntAndSlice counts the set bits of s & m word by word, dispatching
// to the assembly routine when useAsm is true and to the pure-Go loop
// otherwise.
func popcntAndSlice(s, m []uint64) uint64 {
	if !useAsm {
		return popcntAndSliceGo(s, m)
	}
	return popcntAndSliceAsm(s, m)
}

// popcntOrSlice counts the set bits of s | m word by word, dispatching
// to the assembly routine when useAsm is true and to the pure-Go loop
// otherwise.
func popcntOrSlice(s, m []uint64) uint64 {
	if !useAsm {
		return popcntOrSliceGo(s, m)
	}
	return popcntOrSliceAsm(s, m)
}

// popcntXorSlice counts the set bits of s ^ m word by word, dispatching
// to the assembly routine when useAsm is true and to the pure-Go loop
// otherwise.
func popcntXorSlice(s, m []uint64) uint64 {
	if !useAsm {
		return popcntXorSliceGo(s, m)
	}
	return popcntXorSliceAsm(s, m)
}
23 changes: 23 additions & 0 deletions popcnt_generic.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// +build !amd64

package bitset

// popcntSlice counts the set bits in s. In this build (selected by the
// !amd64 constraint) it always delegates to the pure-Go implementation.
func popcntSlice(s []uint64) uint64 {
	total := popcntSliceGo(s)
	return total
}

// popcntMaskSlice counts the set bits of s &^ m word by word. In this
// build (selected by the !amd64 constraint) it always delegates to the
// pure-Go implementation.
func popcntMaskSlice(s, m []uint64) uint64 {
	total := popcntMaskSliceGo(s, m)
	return total
}

// popcntAndSlice counts the set bits of s & m word by word. In this
// build (selected by the !amd64 constraint) it always delegates to the
// pure-Go implementation.
func popcntAndSlice(s, m []uint64) uint64 {
	total := popcntAndSliceGo(s, m)
	return total
}

// popcntOrSlice counts the set bits of s | m word by word. In this
// build (selected by the !amd64 constraint) it always delegates to the
// pure-Go implementation.
func popcntOrSlice(s, m []uint64) uint64 {
	total := popcntOrSliceGo(s, m)
	return total
}

func popcntXorSlice(s, m []uint64) uint64 {
return popcntSliceGo(s, m)
}

0 comments on commit c2dab7c

Please sign in to comment.