tools: pingpong total latency #4757

Changes from all commits

@@ -17,10 +17,13 @@
package pingpong

import (
	"bufio"
	"compress/gzip"
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"math"
	"math/rand"
	"os"

@@ -34,6 +37,7 @@ import (
	"github.com/algorand/go-algorand/crypto"
	"github.com/algorand/go-algorand/daemon/algod/api/server/v2/generated/model"
	"github.com/algorand/go-algorand/data/basics"
	"github.com/algorand/go-algorand/data/bookkeeping"
	"github.com/algorand/go-algorand/data/transactions"
	"github.com/algorand/go-algorand/data/transactions/logic"
	"github.com/algorand/go-algorand/libgoal"

@@ -128,6 +132,11 @@ func (ppa *pingPongAccount) String() string {
	return ow.String()
}

type txidSendTime struct {
	txid string
	when time.Time
}

// WorkerState object holds a running pingpong worker
type WorkerState struct {
	cfg PpConfig

@@ -149,6 +158,11 @@ type WorkerState struct {
	refreshPos int

	client *libgoal.Client

	// TotalLatencyOut stuff
	sentTxid      chan txidSendTime
	latencyBlocks chan bookkeeping.Block
	latencyOuts   []io.Writer // latencyOuts is a chain of *os.File, gzip, etc. Write to last element. .Close() last to first.
}

// returns the number of boxes per app

@@ -345,6 +359,25 @@ func (pps *WorkerState) schedule(n int) {
	//fmt.Printf("schedule now=%s next=%s\n", now, pps.nextSendTime)
}

func (pps *WorkerState) recordTxidSent(txid string, err error) {
	if err != nil {
		return
	}
	if pps.sentTxid == nil {
		return
	}
	rec := txidSendTime{
		txid: txid,
		when: time.Now(),
	}
	select {
	case pps.sentTxid <- rec:
		// ok!
	default:
		// drop, oh well
	}
}

func (pps *WorkerState) fundAccounts(client *libgoal.Client) error {
	var srcFunds, minFund uint64
	var err error

@@ -545,6 +578,9 @@ func (pps *WorkerState) RunPingPong(ctx context.Context, ac *libgoal.Client) {
	// error = fundAccounts()
	// }

	if pps.cfg.TotalLatencyOut != "" {
		pps.startTxLatency(ctx, ac)
	}
	pps.nextSendTime = time.Now()
	ac.SetSuggestedParamsCacheAge(200 * time.Millisecond)
	pps.client = ac

@@ -773,7 +809,9 @@ func (pps *WorkerState) sendFromTo(

		sentCount++
		pps.schedule(1)
		_, sendErr = client.BroadcastTransaction(stxn)
		var txid string
		txid, sendErr = client.BroadcastTransaction(stxn)
		pps.recordTxidSent(txid, sendErr)
	} else {
		// Generate txn group

@@ -844,6 +882,8 @@ func (pps *WorkerState) sendFromTo(
		sentCount += uint64(len(txGroup))
		pps.schedule(len(txGroup))
		sendErr = client.BroadcastTransactionGroup(stxGroup)
		txid := txGroup[0].ID().String()
		pps.recordTxidSent(txid, sendErr)
	}

	if sendErr != nil {

@@ -1298,3 +1338,152 @@ func signTxn(signer *pingPongAccount, txn transactions.Transaction, cfg PpConfig
	}
	return
}

func (pps *WorkerState) startTxLatency(ctx context.Context, ac *libgoal.Client) {
	fout, err := os.Create(pps.cfg.TotalLatencyOut)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%s: %v", pps.cfg.TotalLatencyOut, err)
		return
	}
	pps.latencyOuts = append(pps.latencyOuts, fout)
	if strings.HasSuffix(pps.cfg.TotalLatencyOut, ".gz") {
		gzout := gzip.NewWriter(fout)
		pps.latencyOuts = append(pps.latencyOuts, gzout)
	} else {
		bw := bufio.NewWriter(fout)
		pps.latencyOuts = append(pps.latencyOuts, bw)
	}
	pps.sentTxid = make(chan txidSendTime, 1000)
	pps.latencyBlocks = make(chan bookkeeping.Block, 1)
	go pps.txidLatency(ctx)
	go pps.txidLatencyBlockWaiter(ctx, ac)
}

type txidSendTimeIndexed struct {
	txidSendTime
	index int
}

const txidLatencySampleSize = 10000

// thread which handles measuring total send-to-commit latency
func (pps *WorkerState) txidLatency(ctx context.Context) {
	byTxid := make(map[string]txidSendTimeIndexed, txidLatencySampleSize)
	txidList := make([]string, 0, txidLatencySampleSize)
	out := pps.latencyOuts[len(pps.latencyOuts)-1]
	for {
		select {
		case st := <-pps.sentTxid:
			if len(txidList) < txidLatencySampleSize {
				index := len(txidList)
				txidList = append(txidList, st.txid)
				byTxid[st.txid] = txidSendTimeIndexed{
					st,
					index,
				}
			} else {
				// random replacement
				evict := rand.Intn(len(txidList))
				delete(byTxid, txidList[evict])
				txidList[evict] = st.txid
				byTxid[st.txid] = txidSendTimeIndexed{
					st,
					evict,
				}
			}

Comment on lines +1376 to +1393

Contributor: I really like this random replacement scheme, just thinking out loud -- if your sample size is smaller than the number of data points, why not just do a circular buffer? The advantages I see would be that the data points would still be well ordered and you would not be missing any data for the range of time the sample was collected. The way it works now, the most recent data points are most likely to be included and the least recent data points are least likely to be included, which would also be the case with a circular buffer.

Author: If the rate is larger than the buffer, then a circular buffer could lose almost all of the data. With a buffer of 10,000 but 26,000 transactions in a block, it would only know about the most recent transactions and only measure their latency. Better to measure over a longer duration.

Contributor: Sorry, why not make it 26,000 then?

Author: Old habit from working in RAM-scarce environments. And to make up some more justification: maybe I don't even want to log all of the txns, but just a sample, because we also don't need to process a full 6,000 TPS of this data.
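
For comparison with the random-replacement loop above, here is a minimal sketch of the circular-buffer alternative discussed in this thread. It is illustrative only and not part of the PR; the txidRing type and its methods are invented for this example, and it assumes the txidSendTime type from the diff.

```go
// Illustrative only: a fixed-size ring that keeps the most recent entries.
// Once full, every new record overwrites the oldest one, so under sustained
// load larger than the ring size only the most recent transactions survive --
// the behavior the author argues against above.
type txidRing struct {
	buf  []txidSendTime
	next int // index of the oldest entry once the ring is full
}

func newTxidRing(n int) *txidRing {
	return &txidRing{buf: make([]txidSendTime, 0, n)}
}

func (r *txidRing) add(st txidSendTime) {
	if len(r.buf) < cap(r.buf) {
		r.buf = append(r.buf, st) // still filling up
		return
	}
	r.buf[r.next] = st // overwrite the oldest entry
	r.next = (r.next + 1) % len(r.buf)
}
```
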
		case bl := <-pps.latencyBlocks:
			now := time.Now()
			txns, err := bl.DecodePaysetFlat()
			if err != nil {
				fmt.Fprintf(os.Stderr, "block[%d] payset err %v", bl.Round(), err)
				return
			}
			for _, stxn := range txns {
				txid := stxn.ID().String()
				st, ok := byTxid[txid]
				if ok {
					dt := now.Sub(st.when)
					fmt.Fprintf(out, "%d\n", dt.Nanoseconds())
				}
			}
		case <-ctx.Done():
			return
		}
	}
}

type flusher interface {
	Flush() error
}

func (pps *WorkerState) txidLatencyDone() {
	for i := len(pps.latencyOuts) - 1; i >= 0; i-- {
		xo := pps.latencyOuts[i]
		if fl, ok := xo.(flusher); ok {
			err := fl.Flush()
			if err != nil {
				fmt.Fprintf(os.Stderr, "%s: %v", pps.cfg.TotalLatencyOut, err)
			}
		}
		if cl, ok := xo.(io.Closer); ok {
			err := cl.Close()
			if err != nil {
				fmt.Fprintf(os.Stderr, "%s: %v", pps.cfg.TotalLatencyOut, err)
			}
		}
	}
}

const errRestartTime = time.Second

func (pps *WorkerState) txidLatencyBlockWaiter(ctx context.Context, ac *libgoal.Client) {
	defer close(pps.latencyBlocks)
	done := ctx.Done()
	isDone := func(err error) bool {
		select {
		case <-done:
			return true
		default:
		}
		fmt.Fprintf(os.Stderr, "block waiter st : %v", err)
		time.Sleep(errRestartTime)
		return false
	}
restart:
	select {
	case <-done:
		return
	default:
	}
	st, err := ac.Status()
	if err != nil {
		if isDone(err) {
			return
		}
		goto restart
	}
	nextRound := st.LastRound
	for {
		select {
		case <-done:
			return
		default:
		}
		st, err = ac.WaitForRound(nextRound)
		if err != nil {
			if isDone(err) {
				return
			}
			goto restart
		}
		bb, err := ac.BookkeepingBlock(st.LastRound)
		if err != nil {
			if isDone(err) {
				return
			}
			goto restart
		}
		pps.latencyBlocks <- bb
		nextRound = st.LastRound
	}
}
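
The latency file written by txidLatency holds one nanosecond value per line, gzip-compressed when TotalLatencyOut ends in .gz. As a usage note, a small standalone reader along these lines could summarize a run; this is a sketch only, not part of the PR, and the percentile choices are arbitrary.

```go
package main

import (
	"bufio"
	"compress/gzip"
	"fmt"
	"io"
	"os"
	"sort"
	"strconv"
	"strings"
	"time"
)

func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "usage: summarize <latency-file>")
		os.Exit(1)
	}
	path := os.Args[1]
	f, err := os.Open(path)
	if err != nil {
		panic(err)
	}
	defer f.Close()
	var r io.Reader = f
	if strings.HasSuffix(path, ".gz") {
		gz, err := gzip.NewReader(f)
		if err != nil {
			panic(err)
		}
		defer gz.Close()
		r = gz
	}
	var ns []int64
	sc := bufio.NewScanner(r)
	for sc.Scan() {
		line := strings.TrimSpace(sc.Text())
		if line == "" {
			continue
		}
		v, err := strconv.ParseInt(line, 10, 64)
		if err != nil {
			continue // skip malformed lines
		}
		ns = append(ns, v)
	}
	if len(ns) == 0 {
		fmt.Println("no samples")
		return
	}
	sort.Slice(ns, func(i, j int) bool { return ns[i] < ns[j] })
	var sum int64
	for _, v := range ns {
		sum += v
	}
	fmt.Printf("n=%d mean=%v p50=%v p99=%v\n",
		len(ns),
		time.Duration(sum/int64(len(ns))),
		time.Duration(ns[len(ns)/2]),
		time.Duration(ns[len(ns)*99/100]))
}
```
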
True sampling should be done on the pps.sentTxid writer side; otherwise the 10k samples will be fully overwritten within a few rounds under full TPS.
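
A rough sketch of what that writer-side sampling could look like, using classic reservoir sampling (Algorithm R) at the point where recordTxidSent currently does a non-blocking channel send. This is illustrative only, not part of the PR; the txidReservoir type is invented here, and it assumes the math/rand import and the txidSendTime and txidLatencySampleSize names from the diff.

```go
// Illustrative only: reservoir sampling on the sending side, so every
// transaction sent during the run has an equal chance of being measured,
// no matter how high the TPS is relative to the reservoir size.
type txidReservoir struct {
	samples []txidSendTime // at most txidLatencySampleSize entries
	seen    int            // total records offered so far
}

func (r *txidReservoir) offer(st txidSendTime) {
	r.seen++
	if len(r.samples) < txidLatencySampleSize {
		r.samples = append(r.samples, st)
		return
	}
	// Keep the new record with probability txidLatencySampleSize/seen,
	// replacing a uniformly chosen existing sample.
	if j := rand.Intn(r.seen); j < txidLatencySampleSize {
		r.samples[j] = st
	}
}
```
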