Skip to content

Commit 6f986f0

Browse files
Petko Nikolov authored and rxin committed
[SPARK-1268] Adding XOR and AND-NOT operations to spark.util.collection.BitSet
Symmetric difference (xor) in particular is useful for computing some distance metrics (e.g. Hamming). Unit tests added. Author: Petko Nikolov <[email protected]> Closes apache#172 from petko-nikolov/bitset-imprv and squashes the following commits: 451f28b [Petko Nikolov] fixed style mistakes 5beba18 [Petko Nikolov] rm outer loop in andNot test 0e61035 [Petko Nikolov] conform to spark style; rm redundant asserts; more unit tests added; use arraycopy instead of loop d53cdb9 [Petko Nikolov] rm incidentally added space 4e1df43 [Petko Nikolov] adding xor and and-not to BitSet; unit tests added
1 parent 53953d0 commit 6f986f0

File tree

2 files changed

+122
-0
lines changed

2 files changed

+122
-0
lines changed

core/src/main/scala/org/apache/spark/util/collection/BitSet.scala

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,45 @@ class BitSet(numBits: Int) extends Serializable {
8888
newBS
8989
}
9090

91+
/**
 * Returns a new set containing the bits that are set in exactly one of this set and `other`
 * (bit-wise XOR, i.e. the symmetric difference). Neither operand is modified. The result is
 * constructed with the larger of the two capacities.
 */
def ^(other: BitSet): BitSet = {
  val result = new BitSet(math.max(capacity, other.capacity))
  val shared = math.min(numWords, other.numWords)

  // Words beyond the shared prefix exist in only one operand, so they are copied over
  // unchanged. Because `shared` is the smaller word count, at most one branch can apply.
  if (shared < numWords) {
    Array.copy(words, shared, result.words, shared, numWords - shared)
  } else if (shared < other.numWords) {
    Array.copy(other.words, shared, result.words, shared, other.numWords - shared)
  }

  // XOR the words that both operands have.
  var i = 0
  while (i < shared) {
    result.words(i) = words(i) ^ other.words(i)
    i += 1
  }
  result
}
111+
112+
/**
 * Returns a new set containing the bits that are set in this set but not in `other`
 * (bit-wise AND-NOT, i.e. the set difference). Neither operand is modified. The result is
 * constructed with this set's capacity.
 */
def andNot(other: BitSet): BitSet = {
  val result = new BitSet(capacity)
  val shared = math.min(numWords, other.numWords)

  // Words of this set that `other` has no counterpart for are copied over unchanged.
  if (shared < numWords) {
    Array.copy(words, shared, result.words, shared, numWords - shared)
  }

  // Clear every bit of the shared words that is also set in `other`.
  var i = 0
  while (i < shared) {
    result.words(i) = words(i) & ~other.words(i)
    i += 1
  }
  result
}
129+
91130
/**
92131
* Sets the bit at the specified index to true.
93132
* @param index the bit index

core/src/test/scala/org/apache/spark/util/collection/BitSetSuite.scala

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,4 +69,87 @@ class BitSetSuite extends FunSuite {
6969
assert(bitset.nextSetBit(96) === 96)
7070
assert(bitset.nextSetBit(97) === -1)
7171
}
72+
73+
// XOR where the left operand is created with fewer bits (60) than the right one (100).
test("xor len(bitsetX) < len(bitsetY)") {
  val shorter = new BitSet(60)
  Seq(0, 2, 3, 37, 41).foreach(bit => shorter.set(bit))
  val longer = new BitSet(100)
  Seq(0, 1, 3, 37, 38, 41, 85).foreach(bit => longer.set(bit))

  val xored = shorter ^ longer

  // Bits set in exactly one operand: 1, 2, 38 and 85.
  assert(xored.nextSetBit(0) === 1)
  assert(xored.nextSetBit(1) === 1)
  assert(xored.nextSetBit(2) === 2)
  assert(xored.nextSetBit(3) === 38)
  assert(xored.nextSetBit(38) === 38)
  assert(xored.nextSetBit(39) === 85)
  assert(xored.nextSetBit(42) === 85)
  assert(xored.nextSetBit(85) === 85)
  assert(xored.nextSetBit(86) === -1)
}
94+
95+
// XOR where the left operand is created with more bits (100) than the right one (60).
test("xor len(bitsetX) > len(bitsetY)") {
  val longer = new BitSet(100)
  Seq(0, 1, 3, 37, 38, 41, 85).foreach(bit => longer.set(bit))
  val shorter = new BitSet(60)
  Seq(0, 2, 3, 37, 41).foreach(bit => shorter.set(bit))

  val xored = longer ^ shorter

  // Bits set in exactly one operand: 1, 2, 38 and 85.
  assert(xored.nextSetBit(0) === 1)
  assert(xored.nextSetBit(1) === 1)
  assert(xored.nextSetBit(2) === 2)
  assert(xored.nextSetBit(3) === 38)
  assert(xored.nextSetBit(38) === 38)
  assert(xored.nextSetBit(39) === 85)
  assert(xored.nextSetBit(42) === 85)
  assert(xored.nextSetBit(85) === 85)
  assert(xored.nextSetBit(86) === -1)
}
116+
117+
// AND-NOT where the left operand is created with fewer bits (60) than the right one (100).
test("andNot len(bitsetX) < len(bitsetY)") {
  val shorter = new BitSet(60)
  Seq(0, 2, 3, 37, 41, 48).foreach(bit => shorter.set(bit))
  val longer = new BitSet(100)
  Seq(0, 1, 3, 37, 38, 41, 85).foreach(bit => longer.set(bit))

  val difference = shorter.andNot(longer)

  // Bits set in the left operand only: 2 and 48.
  assert(difference.nextSetBit(0) === 2)
  assert(difference.nextSetBit(1) === 2)
  assert(difference.nextSetBit(2) === 2)
  assert(difference.nextSetBit(3) === 48)
  assert(difference.nextSetBit(48) === 48)
  assert(difference.nextSetBit(49) === -1)
  assert(difference.nextSetBit(65) === -1)
}
135+
136+
// AND-NOT where the left operand is created with more bits (100) than the right one (60).
test("andNot len(bitsetX) > len(bitsetY)") {
  val longer = new BitSet(100)
  Seq(0, 1, 3, 37, 38, 41, 85).foreach(bit => longer.set(bit))
  val shorter = new BitSet(60)
  Seq(0, 2, 3, 37, 41, 48).foreach(bit => shorter.set(bit))

  val difference = longer.andNot(shorter)

  // Bits set in the left operand only: 1, 38 and 85.
  assert(difference.nextSetBit(0) === 1)
  assert(difference.nextSetBit(1) === 1)
  assert(difference.nextSetBit(2) === 38)
  assert(difference.nextSetBit(3) === 38)
  assert(difference.nextSetBit(38) === 38)
  assert(difference.nextSetBit(39) === 85)
  assert(difference.nextSetBit(85) === 85)
  assert(difference.nextSetBit(86) === -1)
}
72155
}

0 commit comments

Comments
 (0)