cloudflare · StephenButtolph · May 11, 2025 · May 11, 2025 · May 11, 2025 · May 11, 2025
diff --git a/gfp_arm64.s b/gfp_arm64.s
@@ -105,7 +105,7 @@ TEXT ·gfpMul(SB),0,$0-24
 	MOVD b+16(FP), R0
 	loadBlock(0(R0), R5,R6,R7,R8)
 
-	mul(R9,R10,R11,R12,R13,R14,R15,R16)
+	mul(R9,R10,R11,R12,R13,R14,R15,R16,R17,EMPTY)
 	gfpReduce()
 
 	MOVD c+0(FP), R0

diff --git a/gfp_test.go b/gfp_test.go
@@ -5,7 +5,10 @@ import (
 	"encoding/binary"
 	"io"
 	"math/big"
+	"runtime"
+	"sync"
 	"testing"
+	"time"
 )
 
 // randomGF returns a random integer between 0 and p-1.
@@ -112,6 +115,38 @@ func TestGFp(t *testing.T) {
 		}
 	})
 
+	t.Run("mul_fp_corruption", func(t *testing.T) {
+		// With mutex profiling enabled, the Go runtime traverses the stack
+		// whenever a mutex is unlocked while another goroutine is blocked on
+		// it. Sample every such event, and restore the old rate on return.
+		defer runtime.SetMutexProfileFraction(runtime.SetMutexProfileFraction(1))
+
+		var (
+			lock sync.Mutex
+			wg   sync.WaitGroup
+		)
+		wg.Add(testTimes)
+		for i := 0; i < testTimes; i++ {
+			go func() {
+				defer wg.Done()
+
+				a := togfP(randomGF(rand.Reader))
+				b := togfP(randomGF(rand.Reader))
+				c := &gfP{}
+				gfpMul(c, a, b)
+
+				lock.Lock()
+				// Make it more likely for goroutines to block on this mutex.
+				time.Sleep(time.Microsecond)
+
+				// If the frame pointer was corrupted, and another goroutine is
+				// blocked on this mutex, then this will segfault.
+				lock.Unlock()
+			}()
+		}
+		wg.Wait()
+	})
+
 	t.Run("neg", func(t *testing.T) {
 		c := &gfP{}
 		bigC := new(big.Int)

diff --git a/mul_arm64.h b/mul_arm64.h
@@ -1,4 +1,9 @@
-#define mul(c0,c1,c2,c3,c4,c5,c6,c7) \
+#define EMPTY
+
+#define RELOAD \
+	MOVD ·p2+0(SB), R5
+
+#define mul(c0,c1,c2,c3,c4,c5,c6,c7,t0,reset) \
 	MUL R1, R5, c0 \
 	UMULH R1, R5, c1 \
 	MUL R1, R6, R0 \
@@ -16,54 +21,56 @@
 	UMULH R2, R5, R26 \
 	MUL R2, R6, R0 \
 	ADDS R0, R26 \
-	UMULH R2, R6, R27 \
+	UMULH R2, R6, c6 \
 	MUL R2, R7, R0 \
-	ADCS R0, R27 \
-	UMULH R2, R7, R29 \
+	ADCS R0, c6 \
+	UMULH R2, R7, c7 \
 	MUL R2, R8, R0 \
-	ADCS R0, R29 \
+	ADCS R0, c7 \
 	UMULH R2, R8, c5 \
 	ADCS ZR, c5 \
 	ADDS R1, c1 \
 	ADCS R26, c2 \
-	ADCS R27, c3 \
-	ADCS R29, c4 \
+	ADCS c6, c3 \
+	ADCS c7, c4 \
 	ADCS  ZR, c5 \
 	\
 	MUL R3, R5, R1 \
 	UMULH R3, R5, R26 \
 	MUL R3, R6, R0 \
 	ADDS R0, R26 \
-	UMULH R3, R6, R27 \
+	UMULH R3, R6, t0 \
 	MUL R3, R7, R0 \
-	ADCS R0, R27 \
-	UMULH R3, R7, R29 \
+	ADCS R0, t0 \
+	UMULH R3, R7, c7 \
 	MUL R3, R8, R0 \
-	ADCS R0, R29 \
+	ADCS R0, c7 \
 	UMULH R3, R8, c6 \
 	ADCS ZR, c6 \
 	ADDS R1, c2 \
 	ADCS R26, c3 \
-	ADCS R27, c4 \
-	ADCS R29, c5 \
+	ADCS t0, c4 \
+	ADCS c7, c5 \
 	ADCS  ZR, c6 \
 	\
+	reset \
+	\
 	MUL R4, R5, R1 \
 	UMULH R4, R5, R26 \
 	MUL R4, R6, R0 \
 	ADDS R0, R26 \
-	UMULH R4, R6, R27 \
+	UMULH R4, R6, R5 \
 	MUL R4, R7, R0 \
-	ADCS R0, R27 \
-	UMULH R4, R7, R29 \
+	ADCS R0, R5 \
+	UMULH R4, R7, R6 \
 	MUL R4, R8, R0 \
-	ADCS R0, R29 \
+	ADCS R0, R6 \
 	UMULH R4, R8, c7 \
 	ADCS ZR, c7 \
 	ADDS R1, c3 \
 	ADCS R26, c4 \
-	ADCS R27, c5 \
-	ADCS R29, c6 \
+	ADCS R5, c5 \
+	ADCS R6, c6 \
 	ADCS  ZR, c7
 
 #define gfpReduce() \
@@ -107,7 +114,7 @@
 	\
 	\ // m * N
 	loadModulus(R5,R6,R7,R8) \
-	mul(R17,R25,R19,R20,R21,R22,R23,R24) \
+	mul(R17,R25,R19,R20,R21,R22,R23,R24,R5,RELOAD) \
 	\
 	\ // Add the 512-bit intermediate to m*N
 	MOVD  ZR, R0 \
@@ -121,6 +128,8 @@
 	ADCS R16, R24 \
 	ADCS  ZR, R0 \
 	\
+	MOVD ·p2+0(SB), R5 \ // Restore R5
+	MOVD ·p2+8(SB), R6 \ // Restore R6
 	\ // Our output is R21:R22:R23:R24. Reduce mod p if necessary.
 	SUBS R5, R21, R10 \
 	SBCS R6, R22, R11 \