
Commit 3a79a99

issue #433: bugfix on message drain for panic in visits (#434)
* issue 433: bugfix on message drain for panic in visits
1 parent 185d24a · commit 3a79a99

File tree

3 files changed: +123 -17 lines changed

  options.go (+2)
  partition_processor.go (+22 -8)
  systemtest/processor_visit_test.go (+99 -9)

options.go (+2)

@@ -50,6 +50,8 @@ const (
 	defaultBackoffStep = 10 * time.Second
 	// maximum duration to wait for the backoff
 	defaultBackoffMax = 120 * time.Second
+
+	defaultPPVisitChannelSize = 100
 )
 
 // DefaultProcessorStoragePath is the default path where processor state

partition_processor.go (+22 -8)

@@ -156,7 +156,7 @@ func newPartitionProcessor(partition int32,
 		joins:          make(map[string]*PartitionTable),
 		input:          make(chan *message, opts.partitionChannelSize),
 		inputTopics:    topicList,
-		visitInput:     make(chan *visit, 100),
+		visitInput:     make(chan *visit, defaultPPVisitChannelSize),
 		visitCallbacks: visitCallbacks,
 		graph:          graph,
 		stats:          newPartitionProcStats(topicList, outputList),
@@ -693,12 +693,25 @@ func (pp *PartitionProcessor) VisitValues(ctx context.Context, name string, meta
 	}
 
 	var wg sync.WaitGroup
-	// drain the channel and set all items to done we have added.
-	// Otherwise the caller will wait forever on the waitgroup
-	drainVisitInput := func() {
+
+	// drains the channel and drops out when closed.
+	// This is done when the processor shuts down during visit
+	// and makes sure the waitgroup is fully counted down.
+	drainUntilClose := func() {
+		for range pp.visitInput {
+			wg.Done()
+		}
+	}
+
+	// drains the input channel until there are no more items.
+	// does not wait for close, because the channel stays open for the next visit
+	drainUntilEmpty := func() {
 		for {
 			select {
-			case <-pp.visitInput:
+			case _, ok := <-pp.visitInput:
+				if !ok {
+					return
+				}
 				wg.Done()
 			default:
 				return
@@ -717,11 +730,11 @@ func (pp *PartitionProcessor) VisitValues(ctx context.Context, name string, meta
 		wg.Add(1)
 		select {
 		case <-stopping:
-			drainVisitInput()
+			drainUntilClose()
 			wg.Done()
 			return ErrVisitAborted
 		case <-ctx.Done():
-			drainVisitInput()
+			drainUntilEmpty()
 			wg.Done()
 			return ctx.Err()
 		// enqueue the visit
@@ -747,9 +760,10 @@ func (pp *PartitionProcessor) VisitValues(ctx context.Context, name string, meta
 	}()
 	select {
 	case <-stopping:
-		drainVisitInput()
+		drainUntilClose()
 		return ErrVisitAborted
 	case <-ctx.Done():
+		drainUntilEmpty()
 		return ctx.Err()
 	case <-wgDone:
 	}
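The heart of the change is the split of the old drainVisitInput helper into drainUntilClose and drainUntilEmpty. The following standalone Go sketch (illustrative only: the channel, the item type, and the counts are made up and are not goka's internals) shows why the two termination conditions differ: the for-range form only returns once the channel is closed, which is what a shutting-down processor needs to balance the WaitGroup, while the select/default form returns as soon as the buffer is empty and leaves the channel open for the next visit.

// Minimal standalone sketch of the two drain strategies used above.
// The channel, waitgroup and item type are illustrative, not goka's API.
package main

import (
	"fmt"
	"sync"
)

type visit struct{ key string }

func main() {
	var wg sync.WaitGroup
	ch := make(chan *visit, 4)

	// drainUntilClose: keeps receiving until the channel is closed.
	// Use this when the consumer is gone for good (processor shutdown),
	// so every enqueued item is accounted for before returning.
	drainUntilClose := func() {
		for range ch {
			wg.Done()
		}
	}

	// drainUntilEmpty: removes whatever is buffered right now and returns.
	// Use this when the channel must stay open for the next visit
	// (e.g. the caller's context was canceled but the processor keeps running).
	drainUntilEmpty := func() {
		for {
			select {
			case _, ok := <-ch:
				if !ok {
					return
				}
				wg.Done()
			default:
				return
			}
		}
	}

	// enqueue a few items that will never be processed by a consumer
	for i := 0; i < 3; i++ {
		wg.Add(1)
		ch <- &visit{key: fmt.Sprintf("k-%d", i)}
	}

	drainUntilEmpty() // channel stays open, waitgroup is balanced
	wg.Wait()
	fmt.Println("drained while open, channel still usable")

	wg.Add(1)
	ch <- &visit{key: "last"}
	close(ch)
	drainUntilClose() // consumes the remaining item, exits on close
	wg.Wait()
	fmt.Println("drained after close")
}

In VisitValues the first form is used on the stopping path, where the visit channel is about to be closed for good, and the second on context cancellation, where the processor keeps running and the channel must remain usable.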

systemtest/processor_visit_test.go (+99 -9)

@@ -13,6 +13,12 @@ import (
 	"github.com/stretchr/testify/require"
 )
 
+// size of the channel used for the visitor, defined in goka/config.go
+var (
+	visitChannelSize = 100
+	numPartitions    = 10
+)
+
 // TestProcessorVisit tests the visiting functionality.
 func TestProcessorVisit(t *testing.T) {
 	brokers := initSystemTest(t)
@@ -34,7 +40,7 @@ func TestProcessorVisit(t *testing.T) {
 	}
 
 	createEmitter := func(topic goka.Stream) (*goka.Emitter, func()) {
-		err = tm.EnsureStreamExists(string(topic), 10)
+		err = tm.EnsureStreamExists(string(topic), numPartitions)
 		require.NoError(t, err)
 
 		em, err := goka.NewEmitter(brokers, topic, new(codec.Int64),
@@ -90,7 +96,7 @@ func TestProcessorVisit(t *testing.T) {
 
 		pollTimed(t, "recovered", proc.Recovered)
 
-		em.EmitSync("value1", int64(1))
+		_ = em.EmitSync("value1", int64(1))
 
 		pollTimed(t, "value-ok", func() bool {
 			val1, _ := proc.Get("value1")
@@ -114,7 +120,7 @@ func TestProcessorVisit(t *testing.T) {
 
 		pollTimed(t, "recovered", proc.Recovered)
 
-		em.EmitSync("value1", int64(1))
+		_ = em.EmitSync("value1", int64(1))
 
 		pollTimed(t, "value-ok", func() bool {
 			val1, _ := proc.Get("value1")
@@ -128,6 +134,90 @@ func TestProcessorVisit(t *testing.T) {
 		require.Error(t, <-done)
 	})
 
+	// Tests if a panic occurs while visiting, while the iterator is still pushing
+	// messages into the partition processor's visit-channel.
+	// Regression test for https://github.com/lovoo/goka/issues/433
+	t.Run("visit-panic-slow", func(t *testing.T) {
+		group, input := nextTopics()
+		em, finish := createEmitter(input)
+		defer finish()
+		proc, cancel, done := runProc(createProc(group, input, 500*time.Millisecond))
+
+		pollTimed(t, "recovered", proc.Recovered)
+
+		// create twice as many items in the table as the visit-channel's size.
+		// This way we can make sure that the visitor will have to block on
+		// pushing it to the partition-processor visitInputChannel.
+		numMsgs := visitChannelSize * numPartitions * 2
+		for i := 0; i < numMsgs; i++ {
+			_, _ = em.Emit(fmt.Sprintf("value-%d", i), int64(1))
+		}
+
+		// wait for all messages to have propagated
+		pollTimed(t, "value-ok", func() bool {
+			val1, _ := proc.Get(fmt.Sprintf("value-%d", numMsgs-1))
+			return val1 != nil && val1.(int64) == 1
+		})
+
+		// pass wrong type to visitor -> which will be passed to the visit --> will panic
+		require.Error(t, proc.VisitAll(context.Background(), "visitor", "asdf"))
+
+		// no need to cancel, the visitAll will kill the processor.
+		_ = cancel
+		require.Error(t, <-done)
+	})
+
+	// Verifies a visit is gracefully shut down when the processor is canceled while
+	// the visit is running.
+	t.Run("visit-shutdown-slow", func(t *testing.T) {
+		group, input := nextTopics()
+		em, finish := createEmitter(input)
+		defer finish()
+		proc, cancel, done := runProc(createProc(group, input, 1*time.Second))
+
+		pollTimed(t, "recovered", proc.Recovered)
+
+		// create twice as many items in the table as the visit-channel's size.
+		// This way we can make sure that the visitor will have to block on
+		// pushing it to the partition-processor visitInputChannel.
+		numMsgs := visitChannelSize * numPartitions * 2
+		for i := 0; i < numMsgs; i++ {
+			_, _ = em.Emit(fmt.Sprintf("value-%d", i), int64(1))
+		}
+
+		// wait for all messages to have propagated
+		pollTimed(t, "value-ok", func() bool {
+			val1, _ := proc.Get(fmt.Sprintf("value-%d", numMsgs-1))
+			return val1 != nil && val1.(int64) == 1
+		})
+
+		visitCtx, visitCancel := context.WithCancel(context.Background())
+		defer visitCancel()
+
+		var (
+			visitErr  error
+			visitDone = make(chan struct{})
+		)
+
+		// start the visitor
+		go func() {
+			defer close(visitDone)
+			visitErr = proc.VisitAll(visitCtx, "visitor", int64(25))
+		}()
+
+		// wait half of what the processor takes per message, so we can stop it in the middle
+		time.Sleep(500 * time.Millisecond)
+		// stop the visit
+		visitCancel()
+
+		// wait for visiting done
+		<-visitDone
+		require.ErrorContains(t, visitErr, "canceled")
+
+		cancel()
+		require.NoError(t, <-done)
+	})
+
 	t.Run("visit-shutdown", func(t *testing.T) {
 		group, input := nextTopics()
 		em, finish := createEmitter(input)
@@ -138,8 +228,8 @@ func TestProcessorVisit(t *testing.T) {
 
 		// emit two values where goka.DefaultHasher says they're in the same partition.
 		// We need to achieve this to test that a shutdown will visit one value but not the other
-		em.EmitSync("0", int64(1))
-		em.EmitSync("02", int64(1))
+		_ = em.EmitSync("0", int64(1))
+		_ = em.EmitSync("02", int64(1))
 
 		pollTimed(t, "value-ok", func() bool {
 			val1, _ := proc.Get("02")
@@ -196,7 +286,7 @@ func TestProcessorVisit(t *testing.T) {
 		defer emFinish()
 		// create the group table manually, otherwise the proc and the view are racing
 
-		tm.EnsureTableExists(string(goka.GroupTable(group)), 10)
+		_ = tm.EnsureTableExists(string(goka.GroupTable(group)), 10)
 		// scenario: sleep in visit, processor shuts down--> visit should cancel too
 		proc, cancel, done := runProc(createProc(group, input, 500*time.Millisecond))
 		view, viewCancel, viewDone := runView(createView(group))
@@ -207,7 +297,7 @@ func TestProcessorVisit(t *testing.T) {
 		// emit two values where goka.DefaultHasher says they're in the same partition.
 		// We need to achieve this to test that a shutdown will visit one value but not the other
 		for i := 0; i < 100; i++ {
-			em.Emit(fmt.Sprintf("value-%d", i), int64(1))
+			_, _ = em.Emit(fmt.Sprintf("value-%d", i), int64(1))
 		}
 		// emFinish()
 
@@ -251,7 +341,7 @@ func TestProcessorVisit(t *testing.T) {
 		em, finish := createEmitter(input)
 		defer finish()
 		// create the group table manually, otherwise the proc and the view are racing
-		tm.EnsureTableExists(string(goka.GroupTable(group)), 10)
+		_ = tm.EnsureTableExists(string(goka.GroupTable(group)), 10)
 		// scenario: sleep in visit, processor shuts down--> visit should cancel too
 		proc1, cancel1, done1 := runProc(createProc(group, input, 500*time.Millisecond))
 
@@ -260,7 +350,7 @@ func TestProcessorVisit(t *testing.T) {
 		// emit two values where goka.DefaultHasher says they're in the same partition.
 		// We need to achieve this to test that a shutdown will visit one value but not the other
 		for i := 0; i < 100; i++ {
-			em.Emit(fmt.Sprintf("value-%d", i), int64(1))
+			_, _ = em.Emit(fmt.Sprintf("value-%d", i), int64(1))
 		}
 
 		// poll until all values are there
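For context on what the new system tests exercise, here is a hedged usage sketch of VisitAll from an application's point of view; it is not part of the commit. The broker address, group/topic names, and the visitor body are illustrative, and the visitor edge is assumed to be registered via goka.Visitor, much as the test's createProc helper presumably does.

// Hedged usage sketch: how a caller of VisitAll observes the two abort
// paths exercised above. Names and values here are illustrative.
package main

import (
	"context"
	"errors"
	"log"
	"time"

	"github.com/lovoo/goka"
	"github.com/lovoo/goka/codec"
)

func main() {
	g := goka.DefineGroup("visit-example-group", // assumed group name
		goka.Input("visit-example-input", new(codec.Int64), func(ctx goka.Context, msg interface{}) {
			ctx.SetValue(msg)
		}),
		// visitor callback invoked once per table key by VisitAll
		goka.Visitor("visitor", func(ctx goka.Context, meta interface{}) {
			time.Sleep(100 * time.Millisecond) // simulate slow visits
		}),
		goka.Persist(new(codec.Int64)),
	)

	proc, err := goka.NewProcessor([]string{"localhost:9092"}, g) // assumed broker
	if err != nil {
		log.Fatal(err)
	}

	procCtx, procCancel := context.WithCancel(context.Background())
	defer procCancel()
	go func() { _ = proc.Run(procCtx) }()

	// wait until the processor is recovered before visiting
	for !proc.Recovered() {
		time.Sleep(100 * time.Millisecond)
	}

	visitCtx, visitCancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer visitCancel()

	// VisitAll returns the context error if visitCtx ends mid-visit,
	// and ErrVisitAborted if the processor itself shuts down while visiting.
	if err := proc.VisitAll(visitCtx, "visitor", int64(25)); err != nil {
		switch {
		case errors.Is(err, goka.ErrVisitAborted):
			log.Println("processor stopped while visiting:", err)
		case errors.Is(err, context.Canceled), errors.Is(err, context.DeadlineExceeded):
			log.Println("visit canceled by caller:", err)
		default:
			log.Println("visit failed:", err)
		}
	}
}

The error handling mirrors the two drain paths patched above: a processor shutdown surfaces as ErrVisitAborted, while canceling the caller's context surfaces as the context error.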
