From c2dab7c620d976dc85bcf62252cd90ce044a1fa8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Utkan=20G=C3=BCng=C3=B6rd=C3=BC?=
Date: Sun, 16 Feb 2014 18:57:36 +0900
Subject: [PATCH] use popcnt if available on amd64

Replace the per-word popcount_2 loops in bitset.go with slice helpers:
popcntSlice, popcntMaskSlice (s &^ m), popcntAndSlice, popcntOrSlice
and popcntXorSlice.

popcnt.go keeps the portable popcount_2 code and provides the *Go
fallback for each helper. popcnt_amd64.s implements the helpers with
the POPCNT instruction, plus hasAsm, which executes CPUID with EAX=1
and reports bit 23 of ECX. popcnt_asm.go (amd64 only) evaluates
hasAsm once into useAsm and dispatches to assembly or Go accordingly.
popcnt_generic.go (!amd64) always forwards to the Go fallbacks.

The two-slice helpers iterate over len(s) words only, so callers must
pass an m that holds at least len(s) words.

---
 bitset.go         |  62 +++++-----------------------
 popcnt.go         |  64 +++++++++++++++++++++++++++++
 popcnt_amd64.s    | 102 ++++++++++++++++++++++++++++++++++++++++++++++
 popcnt_asm.go     |  64 +++++++++++++++++++++++++++++
 popcnt_generic.go |  23 +++++++++++
 5 files changed, 263 insertions(+), 52 deletions(-)
 create mode 100644 popcnt.go
 create mode 100644 popcnt_amd64.s
 create mode 100644 popcnt_asm.go
 create mode 100644 popcnt_generic.go

diff --git a/bitset.go b/bitset.go
index 53c0915..bed232e 100644
--- a/bitset.go
+++ b/bitset.go
@@ -213,37 +213,10 @@ func (b *BitSet) Copy(c *BitSet) (count uint) {
 	return
 }
 
-// From Wikipedia: http://en.wikipedia.org/wiki/Hamming_weight
-const m1 uint64 = 0x5555555555555555 //binary: 0101...
-const m2 uint64 = 0x3333333333333333 //binary: 00110011..
-const m4 uint64 = 0x0f0f0f0f0f0f0f0f //binary: 4 zeros, 4 ones ...
-const m8 uint64 = 0x00ff00ff00ff00ff //binary: 8 zeros, 8 ones ...
-const m16 uint64 = 0x0000ffff0000ffff //binary: 16 zeros, 16 ones ...
-const m32 uint64 = 0x00000000ffffffff //binary: 32 zeros, 32 ones
-const hff uint64 = 0xffffffffffffffff //binary: all ones
-const h01 uint64 = 0x0101010101010101 //the sum of 256 to the power of 0,1,2,3...
-
-// From Wikipedia: count number of set bits.
-// This is algorithm popcount_2 in the article retrieved May 9, 2011
-
-func popcount_2(x uint64) uint64 {
-	x -= (x >> 1) & m1 //put count of each 2 bits into those 2 bits
-	x = (x & m2) + ((x >> 2) & m2) //put count of each 4 bits into those 4 bits
-	x = (x + (x >> 4)) & m4 //put count of each 8 bits into those 8 bits
-	x += x >> 8 //put count of each 16 bits into their lowest 8 bits
-	x += x >> 16 //put count of each 32 bits into their lowest 8 bits
-	x += x >> 32 //put count of each 64 bits into their lowest 8 bits
-	return x & 0x7f
-}
-
 // Count (number of set bits)
 func (b *BitSet) Count() uint {
 	if b != nil && b.set != nil {
-		cnt := uint64(0)
-		for _, word := range b.set {
-			cnt += popcount_2(word)
-		}
-		return uint(cnt)
+		return uint(popcntSlice(b.set))
 	}
 	return 0
 }
@@ -313,12 +286,8 @@ func (b *BitSet) DifferenceCardinality(compare *BitSet) uint {
 		l = int(b.wordCount())
 	}
 	cnt := uint64(0)
-	for i := 0; i < l; i++ {
-		cnt += popcount_2(b.set[i] &^ compare.set[i])
-	}
-	for i := l; i < len(b.set); i++ {
-		cnt += popcount_2(b.set[i])
-	}
+	cnt += popcntMaskSlice(b.set[:l], compare.set[:l])
+	cnt += popcntSlice(b.set[l:])
 	return uint(cnt)
 }
 
@@ -365,10 +334,7 @@ func (b *BitSet) IntersectionCardinality(compare *BitSet) uint {
 	panicIfNull(b)
 	panicIfNull(compare)
 	b, compare = sortByLength(b, compare)
-	cnt := uint64(0)
-	for i, word := range b.set {
-		cnt += popcount_2(word & compare.set[i])
-	}
+	cnt := popcntAndSlice(b.set, compare.set)
 	return uint(cnt)
 }
 
@@ -410,14 +376,10 @@ func (b *BitSet) UnionCardinality(compare *BitSet) uint {
 	panicIfNull(b)
 	panicIfNull(compare)
 	b, compare = sortByLength(b, compare)
-	cnt := uint64(0)
-	for i, word := range b.set {
-		cnt += popcount_2(word | compare.set[i])
+	cnt := popcntOrSlice(b.set, compare.set)
+	if len(compare.set) > len(b.set) {
+		cnt += popcntSlice(compare.set[len(b.set):])
 	}
-	for i := len(b.set); i < len(compare.set); i++ {
-		cnt += popcount_2(compare.set[i])
-	}
-
 	return uint(cnt)
 }
 
@@ -462,14 +424,10 @@ func (b *BitSet) SymmetricDifferenceCardinality(compare *BitSet) uint {
 	panicIfNull(b)
 	panicIfNull(compare)
 	b, compare = sortByLength(b, compare)
-	cnt := uint64(0)
-	for i, word := range b.set {
-		cnt += popcount_2(word ^ compare.set[i])
+	cnt := popcntXorSlice(b.set, compare.set)
+	if len(compare.set) > len(b.set) {
+		cnt += popcntSlice(compare.set[len(b.set):])
 	}
-	for i := len(b.set); i < len(compare.set); i++ {
-		cnt += popcount_2(compare.set[i])
-	}
-
 	return uint(cnt)
 }
 
diff --git a/popcnt.go b/popcnt.go
new file mode 100644
index 0000000..36be0d1
--- /dev/null
+++ b/popcnt.go
@@ -0,0 +1,64 @@
+package bitset
+
+// From Wikipedia: http://en.wikipedia.org/wiki/Hamming_weight
+const m1 uint64 = 0x5555555555555555 //binary: 0101...
+const m2 uint64 = 0x3333333333333333 //binary: 00110011..
+const m4 uint64 = 0x0f0f0f0f0f0f0f0f //binary: 4 zeros, 4 ones ...
+const m8 uint64 = 0x00ff00ff00ff00ff //binary: 8 zeros, 8 ones ...
+const m16 uint64 = 0x0000ffff0000ffff //binary: 16 zeros, 16 ones ...
+const m32 uint64 = 0x00000000ffffffff //binary: 32 zeros, 32 ones
+const hff uint64 = 0xffffffffffffffff //binary: all ones
+const h01 uint64 = 0x0101010101010101 //the sum of 256 to the power of 0,1,2,3...
+
+// From Wikipedia: count number of set bits.
+// This is algorithm popcount_2 in the article retrieved May 9, 2011
+
+func popcount_2(x uint64) uint64 {
+	x -= (x >> 1) & m1 //put count of each 2 bits into those 2 bits
+	x = (x & m2) + ((x >> 2) & m2) //put count of each 4 bits into those 4 bits
+	x = (x + (x >> 4)) & m4 //put count of each 8 bits into those 8 bits
+	x += x >> 8 //put count of each 16 bits into their lowest 8 bits
+	x += x >> 16 //put count of each 32 bits into their lowest 8 bits
+	x += x >> 32 //put count of each 64 bits into their lowest 8 bits
+	return x & 0x7f
+}
+
+func popcntSliceGo(s []uint64) uint64 {
+	cnt := uint64(0)
+	for _, x := range s {
+		cnt += popcount_2(x)
+	}
+	return cnt
+}
+
+func popcntMaskSliceGo(s, m []uint64) uint64 {
+	cnt := uint64(0)
+	for i := range s {
+		cnt += popcount_2(s[i] &^ m[i])
+	}
+	return cnt
+}
+
+func popcntAndSliceGo(s, m []uint64) uint64 {
+	cnt := uint64(0)
+	for i := range s {
+		cnt += popcount_2(s[i] & m[i])
+	}
+	return cnt
+}
+
+func popcntOrSliceGo(s, m []uint64) uint64 {
+	cnt := uint64(0)
+	for i := range s {
+		cnt += popcount_2(s[i] | m[i])
+	}
+	return cnt
+}
+
+func popcntXorSliceGo(s, m []uint64) uint64 {
+	cnt := uint64(0)
+	for i := range s {
+		cnt += popcount_2(s[i] ^ m[i])
+	}
+	return cnt
+}
diff --git a/popcnt_amd64.s b/popcnt_amd64.s
new file mode 100644
index 0000000..121ce7b
--- /dev/null
+++ b/popcnt_amd64.s
@@ -0,0 +1,102 @@
+TEXT ·hasAsm(SB),4,$0
+MOVQ $1, AX
+CPUID
+SHRQ $23, CX
+ANDQ $1, CX
+MOVB CX, ret+0(FP)
+RET
+
+
+#define POPCNTQ_DX_DX BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xb8; BYTE $0xd2
+
+TEXT ·popcntSliceAsm(SB),4,$0-32
+XORQ AX, AX
+MOVQ s+0(FP), SI
+MOVQ s+8(FP), CX
+TESTQ CX, CX
+JZ popcntSliceEnd
+popcntSliceLoop:
+BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xb8; BYTE $0x16 // POPCNTQ (SI), DX
+ADDQ DX, AX
+ADDQ $8, SI
+LOOP popcntSliceLoop
+popcntSliceEnd:
+MOVQ AX, ret+24(FP)
+RET
+
+TEXT ·popcntMaskSliceAsm(SB),4,$0-56
+XORQ AX, AX
+MOVQ s+0(FP), SI
+MOVQ s+8(FP), CX
+TESTQ CX, CX
+JZ popcntMaskSliceEnd
+MOVQ m+24(FP), DI
+popcntMaskSliceLoop:
+MOVQ (DI), DX
+NOTQ DX
+ANDQ (SI), DX
+POPCNTQ_DX_DX
+ADDQ DX, AX
+ADDQ $8, SI
+ADDQ $8, DI
+LOOP popcntMaskSliceLoop
+popcntMaskSliceEnd:
+MOVQ AX, ret+48(FP)
+RET
+
+TEXT ·popcntAndSliceAsm(SB),4,$0-56
+XORQ AX, AX
+MOVQ s+0(FP), SI
+MOVQ s+8(FP), CX
+TESTQ CX, CX
+JZ popcntAndSliceEnd
+MOVQ m+24(FP), DI
+popcntAndSliceLoop:
+MOVQ (DI), DX
+ANDQ (SI), DX
+POPCNTQ_DX_DX
+ADDQ DX, AX
+ADDQ $8, SI
+ADDQ $8, DI
+LOOP popcntAndSliceLoop
+popcntAndSliceEnd:
+MOVQ AX, ret+48(FP)
+RET
+
+TEXT ·popcntOrSliceAsm(SB),4,$0-56
+XORQ AX, AX
+MOVQ s+0(FP), SI
+MOVQ s+8(FP), CX
+TESTQ CX, CX
+JZ popcntOrSliceEnd
+MOVQ m+24(FP), DI
+popcntOrSliceLoop:
+MOVQ (DI), DX
+ORQ (SI), DX
+POPCNTQ_DX_DX
+ADDQ DX, AX
+ADDQ $8, SI
+ADDQ $8, DI
+LOOP popcntOrSliceLoop
+popcntOrSliceEnd:
+MOVQ AX, ret+48(FP)
+RET
+
+TEXT ·popcntXorSliceAsm(SB),4,$0-56
+XORQ AX, AX
+MOVQ s+0(FP), SI
+MOVQ s+8(FP), CX
+TESTQ CX, CX
+JZ popcntXorSliceEnd
+MOVQ m+24(FP), DI
+popcntXorSliceLoop:
+MOVQ (DI), DX
+XORQ (SI), DX
+POPCNTQ_DX_DX
+ADDQ DX, AX
+ADDQ $8, SI
+ADDQ $8, DI
+LOOP popcntXorSliceLoop
+popcntXorSliceEnd:
+MOVQ AX, ret+48(FP)
+RET
diff --git a/popcnt_asm.go b/popcnt_asm.go
new file mode 100644
index 0000000..ef75be7
--- /dev/null
+++ b/popcnt_asm.go
@@ -0,0 +1,64 @@
+// +build amd64
+
+package bitset
+
+//go:noescape
+
+func hasAsm() bool
+
+var useAsm = hasAsm()
+
+//go:noescape
+
+func popcntSliceAsm(s []uint64) uint64
+
+//go:noescape
+
+func popcntMaskSliceAsm(s, m []uint64) uint64
+
+//go:noescape
+
+func popcntAndSliceAsm(s, m []uint64) uint64
+
+//go:noescape
+
+func popcntOrSliceAsm(s, m []uint64) uint64
+
+//go:noescape
+
+func popcntXorSliceAsm(s, m []uint64) uint64
+
+func popcntSlice(s []uint64) uint64 {
+	if useAsm {
+		return popcntSliceAsm(s)
+	}
+	return popcntSliceGo(s)
+}
+
+func popcntMaskSlice(s, m []uint64) uint64 {
+	if useAsm {
+		return popcntMaskSliceAsm(s, m)
+	}
+	return popcntMaskSliceGo(s, m)
+}
+
+func popcntAndSlice(s, m []uint64) uint64 {
+	if useAsm {
+		return popcntAndSliceAsm(s, m)
+	}
+	return popcntAndSliceGo(s, m)
+}
+
+func popcntOrSlice(s, m []uint64) uint64 {
+	if useAsm {
+		return popcntOrSliceAsm(s, m)
+	}
+	return popcntOrSliceGo(s, m)
+}
+
+func popcntXorSlice(s, m []uint64) uint64 {
+	if useAsm {
+		return popcntXorSliceAsm(s, m)
+	}
+	return popcntXorSliceGo(s, m)
+}
diff --git a/popcnt_generic.go b/popcnt_generic.go
new file mode 100644
index 0000000..6fb2ad8
--- /dev/null
+++ b/popcnt_generic.go
@@ -0,0 +1,23 @@
+// +build !amd64
+
+package bitset
+
+func popcntSlice(s []uint64) uint64 {
+	return popcntSliceGo(s)
+}
+
+func popcntMaskSlice(s, m []uint64) uint64 {
+	return popcntMaskSliceGo(s, m)
+}
+
+func popcntAndSlice(s, m []uint64) uint64 {
+	return popcntAndSliceGo(s, m)
+}
+
+func popcntOrSlice(s, m []uint64) uint64 {
+	return popcntOrSliceGo(s, m)
+}
+
+func popcntXorSlice(s, m []uint64) uint64 {
+	return popcntXorSliceGo(s, m)
+}