Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion gfp_arm64.s
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ TEXT ·gfpMul(SB),0,$0-24
MOVD b+16(FP), R0
loadBlock(0(R0), R5,R6,R7,R8)

mul(R9,R10,R11,R12,R13,R14,R15,R16)
mul(R9,R10,R11,R12,R13,R14,R15,R16,R17,EMPTY)
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: R17 is used later, but at this point it is uninitialized, so we can use it and not care about it's value being modified.

gfpReduce()

MOVD c+0(FP), R0
Expand Down
35 changes: 35 additions & 0 deletions gfp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@ import (
"encoding/binary"
"io"
"math/big"
"runtime"
"sync"
"testing"
"time"
)

// randomGF returns a random integer between 0 and p-1.
Expand Down Expand Up @@ -112,6 +115,38 @@ func TestGFp(t *testing.T) {
}
})

t.Run("mul_fp_corruption", func(t *testing.T) {
// By enabling the mutex profiling, the go runtime will traverse the
// stack to measure mutex operations when a mutex is unlocked when a
// goroutine is blocking on it.
runtime.SetMutexProfileFraction(1)

var (
lock sync.Mutex
wg sync.WaitGroup
)
wg.Add(testTimes)
for i := 0; i < testTimes; i++ {
go func() {
defer wg.Done()

a := togfP(randomGF(rand.Reader))
b := togfP(randomGF(rand.Reader))
c := &gfP{}
gfpMul(c, a, b)

lock.Lock()
// Make it more likely for goroutines to block on this mutex.
time.Sleep(time.Microsecond)

// If the frame pointer was corrupted, and another goroutine is
// blocked on this mutex, then this will segfault.
lock.Unlock()
Comment on lines +138 to +144
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could make this an arm specific test and include some asm code to explicitly return R27 and R29 so that we verify gfpMul doesn't modify them. I felt like that was a bit more work than I wanted to do... But this test is a bit contrived as it is right now (and it only verifies R29 as not being corrupted)

}()
}
wg.Wait()
})

t.Run("neg", func(t *testing.T) {
c := &gfP{}
bigC := new(big.Int)
Expand Down
49 changes: 29 additions & 20 deletions mul_arm64.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
#define mul(c0,c1,c2,c3,c4,c5,c6,c7) \
#define EMPTY

#define RELOAD \
MOVD ·p2+0(SB), R5

#define mul(c0,c1,c2,c3,c4,c5,c6,c7,t0,reset) \
Comment on lines +3 to +6
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might make this a bit more clear.

Suggested change
#define RELOAD \
MOVD ·p2+0(SB), R5
#define mul(c0,c1,c2,c3,c4,c5,c6,c7,t0,reset) \
#define ResetR5 \
MOVD ·p2+0(SB), R5
#define mul(c0,c1,c2,c3,c4,c5,c6,c7,maybeR5,maybeResetR5) \

MUL R1, R5, c0 \
UMULH R1, R5, c1 \
MUL R1, R6, R0 \
Expand All @@ -16,54 +21,56 @@
UMULH R2, R5, R26 \
MUL R2, R6, R0 \
ADDS R0, R26 \
UMULH R2, R6, R27 \
UMULH R2, R6, c6 \
MUL R2, R7, R0 \
ADCS R0, R27 \
UMULH R2, R7, R29 \
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could continue to use R29 if we decided to just save the value onto the stack at the start of gfpMul and then we restore it prior to RET. That would mean that we wouldn't need the t0 and reset modifications to this macro (as we can replace R27 with R29 and only restore R6 in gfpReduce.

This would allocate 8 bytes onto the stack, but would keep the same number of additional instructions (3). The current modification has 3 MOVD instructions, this would have a push+pop+MOVD.

If that seems preferable then I can switch to that approach.

ADCS R0, c6 \
UMULH R2, R7, c7 \
MUL R2, R8, R0 \
ADCS R0, R29 \
ADCS R0, c7 \
UMULH R2, R8, c5 \
ADCS ZR, c5 \
ADDS R1, c1 \
ADCS R26, c2 \
ADCS R27, c3 \
ADCS R29, c4 \
ADCS c6, c3 \
ADCS c7, c4 \
ADCS ZR, c5 \
\
MUL R3, R5, R1 \
UMULH R3, R5, R26 \
MUL R3, R6, R0 \
ADDS R0, R26 \
UMULH R3, R6, R27 \
UMULH R3, R6, t0 \
MUL R3, R7, R0 \
ADCS R0, R27 \
UMULH R3, R7, R29 \
ADCS R0, t0 \
UMULH R3, R7, c7 \
MUL R3, R8, R0 \
ADCS R0, R29 \
ADCS R0, c7 \
UMULH R3, R8, c6 \
ADCS ZR, c6 \
ADDS R1, c2 \
ADCS R26, c3 \
ADCS R27, c4 \
ADCS R29, c5 \
ADCS t0, c4 \
ADCS c7, c5 \
ADCS ZR, c6 \
\
reset \
\
MUL R4, R5, R1 \
UMULH R4, R5, R26 \
MUL R4, R6, R0 \
ADDS R0, R26 \
UMULH R4, R6, R27 \
UMULH R4, R6, R5 \
MUL R4, R7, R0 \
ADCS R0, R27 \
UMULH R4, R7, R29 \
ADCS R0, R5 \
UMULH R4, R7, R6 \
MUL R4, R8, R0 \
ADCS R0, R29 \
ADCS R0, R6 \
UMULH R4, R8, c7 \
ADCS ZR, c7 \
ADDS R1, c3 \
ADCS R26, c4 \
ADCS R27, c5 \
ADCS R29, c6 \
ADCS R5, c5 \
ADCS R6, c6 \
ADCS ZR, c7

#define gfpReduce() \
Expand Down Expand Up @@ -107,7 +114,7 @@
\
\ // m * N
loadModulus(R5,R6,R7,R8) \
mul(R17,R25,R19,R20,R21,R22,R23,R24) \
mul(R17,R25,R19,R20,R21,R22,R23,R24,R5,RELOAD) \
\
\ // Add the 512-bit intermediate to m*N
MOVD ZR, R0 \
Expand All @@ -121,6 +128,8 @@
ADCS R16, R24 \
ADCS ZR, R0 \
\
MOVD ·p2+0(SB), R5 \ // Restore R5
MOVD ·p2+8(SB), R6 \ // Restore R6
\ // Our output is R21:R22:R23:R24. Reduce mod p if necessary.
SUBS R5, R21, R10 \
SBCS R6, R22, R11 \
Expand Down