Skip to content

Commit caa4d7b

Browse files
committed
Add KangarooTwelve draft -10
1 parent 90d7565 commit caa4d7b

File tree

4 files changed

+481
-0
lines changed

4 files changed

+481
-0
lines changed

internal/sha3/sha3.go

+4
Original file line numberDiff line numberDiff line change
@@ -194,3 +194,7 @@ func (d *State) Sum(in []byte) []byte {
194194
_, _ = dup.Read(hash)
195195
return append(in, hash...)
196196
}
197+
198+
func (d *State) IsAbsorbing() bool {
199+
return d.state == spongeAbsorbing
200+
}

internal/sha3/shake.go

+4
Original file line numberDiff line numberDiff line change
@@ -113,3 +113,7 @@ func TurboShakeSum256(hash, data []byte, D byte) {
113113
_, _ = h.Write(data)
114114
_, _ = h.Read(hash)
115115
}
116+
117+
func (d *State) SwitchDS(D byte) {
118+
d.dsbyte = D
119+
}

xof/k12/k12.go

+377
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,377 @@
1+
// Package k12 implements the KangarooTwelve XOF.
2+
//
3+
// KangarooTwelve is being standardised at the CFRG working group
4+
// of the IRTF. This package implements draft 10.
5+
//
6+
// https://datatracker.ietf.org/doc/draft-irtf-cfrg-kangarootwelve/10/
7+
package k12
8+
9+
import (
10+
"encoding/binary"
11+
12+
"github.com/cloudflare/circl/internal/sha3"
13+
"github.com/cloudflare/circl/simd/keccakf1600"
14+
)
15+
16+
const chunkSize = 8192 // aka B
17+
18+
// KangarooTwelve splits the message into chunks of 8192 bytes each.
19+
// The first chunk is absorbed directly in a TurboSHAKE128 instance, which
20+
// we call the stalk. The subsequent chunks aren't absorbed directly, but
21+
// instead their hash is absorbed: they're like leaves on a stalk.
22+
// If we have a fast TurboSHAKE128 available, we buffer chunks until we have
23+
// enough to do the parallel TurboSHAKE128. If not, we absorb directly into
24+
// a separate TurboSHAKE128 state.
25+
26+
// State tracks an incremental KangarooTwelve (draft -10) computation.
type State struct {
	initialTodo int // Bytes left to absorb for the first chunk.

	// TurboSHAKE128 instance that absorbs the first chunk and the
	// 32-byte chaining values of all subsequent chunks.
	stalk sha3.State

	context []byte // context string "C" provided by the user

	// buffer of incoming data so we can do parallel TurboSHAKE128:
	// nil when we haven't absorbed the first chunk yet;
	// empty if we have, but we do not have a fast parallel TurboSHAKE128;
	// and chunkSize*lanes in length if we have.
	buf []byte

	offset int // offset in buf or bytes written to leaf

	// Number of chunk hashes ("CV_i") absorbed into the stalk.
	chunk uint

	// TurboSHAKE128 instance to compute the leaf in case we don't have
	// a fast parallel TurboSHAKE128, viz when lanes == 1.
	leaf *sha3.State

	lanes uint8 // number of TurboSHAKE128s to compute in parallel
}
50+
51+
// NewDraft10 creates a new instance of Kangaroo12 draft version -10.
52+
func NewDraft10(c []byte) State {
53+
var lanes byte = 1
54+
55+
if keccakf1600.IsEnabledX4() {
56+
lanes = 4
57+
} else if keccakf1600.IsEnabledX2() {
58+
lanes = 2
59+
}
60+
61+
return newDraft10(c, lanes)
62+
}
63+
64+
func newDraft10(c []byte, lanes byte) State {
65+
return State{
66+
initialTodo: chunkSize,
67+
stalk: sha3.NewTurboShake128(0x07),
68+
context: c,
69+
lanes: lanes,
70+
}
71+
}
72+
73+
func (s *State) Reset() {
74+
s.initialTodo = chunkSize
75+
s.stalk.Reset()
76+
s.stalk.SwitchDS(0x07)
77+
s.buf = nil
78+
s.offset = 0
79+
s.chunk = 0
80+
}
81+
82+
func Draft10Sum(hash []byte, msg []byte, c []byte) {
83+
// TODO Tweak number of lanes depending on the length of the message
84+
s := NewDraft10(c)
85+
_, _ = s.Write(msg)
86+
_, _ = s.Read(hash)
87+
}
88+
89+
// Write absorbs more message data. The first chunkSize bytes go
// straight into the stalk; later data is hashed into 32-byte chaining
// values, either chunk-by-chunk through s.leaf (lanes == 1) or buffered
// and absorbed several chunks at a time via writeX.
func (s *State) Write(p []byte) (int, error) {
	written := len(p)

	// The first chunk is written directly to the stalk.
	if s.initialTodo > 0 {
		taken := s.initialTodo
		if len(p) < taken {
			taken = len(p)
		}
		headP := p[:taken]
		_, _ = s.stalk.Write(headP)
		s.initialTodo -= taken
		p = p[taken:]
	}

	if len(p) == 0 {
		return written, nil
	}

	// If this is the first bit of data written after the initial chunk,
	// we're out of the fast-path and allocate some buffers.
	if s.buf == nil {
		if s.lanes != 1 {
			s.buf = make([]byte, int(s.lanes)*chunkSize)
		} else {
			// We create the buffer to signal we're past the first chunk,
			// but do not use it.
			s.buf = make([]byte, 0)
			// 0x0B is the domain-separation byte for leaf hashes.
			h := sha3.NewTurboShake128(0x0B)
			s.leaf = &h
		}
		// Absorb the tree-mode marker (length_encode of 0 lanes prefix per
		// the draft) and switch the stalk to the final-node DS byte 0x06.
		_, _ = s.stalk.Write([]byte{0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00})
		s.stalk.SwitchDS(0x06)
	}

	// If we're just using one lane, we don't need to cache in a buffer
	// for parallel hashing. Instead, we feed directly to TurboSHAKE.
	if s.lanes == 1 {
		for len(p) > 0 {
			// Write to current leaf.
			to := chunkSize - s.offset
			if len(p) < to {
				to = len(p)
			}
			_, _ = s.leaf.Write(p[:to])
			p = p[to:]
			s.offset += to

			// Did we fill the chunk?
			if s.offset == chunkSize {
				// Squeeze the 32-byte chaining value into the stalk and
				// start a fresh leaf for the next chunk.
				var cv [32]byte
				_, _ = s.leaf.Read(cv[:])
				_, _ = s.stalk.Write(cv[:])
				s.leaf.Reset()
				s.offset = 0
				s.chunk++
			}
		}

		return written, nil
	}

	// If we can't fill all our lanes or the buffer isn't empty, we write the
	// data to the buffer.
	if s.offset != 0 || len(p) < len(s.buf) {
		to := len(s.buf) - s.offset
		if len(p) < to {
			to = len(p)
		}
		p2 := p[:to]
		p = p[to:]
		copy(s.buf[s.offset:], p2)
		s.offset += to
	}

	// Absorb the buffer if we filled it
	if s.offset == len(s.buf) {
		s.writeX(s.buf)
		s.offset = 0
	}

	// Note that at this point we may assume that s.offset = 0 if len(p) != 0
	if len(p) != 0 && s.offset != 0 {
		panic("shouldn't happen")
	}

	// Absorb a bunch of chunks at the same time.
	if len(p) >= int(s.lanes)*chunkSize {
		p = s.writeX(p)
	}

	// Put the remainder in the buffer.
	if len(p) > 0 {
		copy(s.buf, p)
		s.offset = len(p)
	}

	return written, nil
}
188+
189+
// Absorb a multiple of a multiple of lanes * chunkSize.
190+
// Returns the remainder.
191+
func (s *State) writeX(p []byte) []byte {
192+
switch s.lanes {
193+
case 4:
194+
return s.writeX4(p)
195+
default:
196+
return s.writeX2(p)
197+
}
198+
}
199+
200+
// writeX4 hashes four chunks at a time with a 4-way interleaved
// Keccak-f[1600], absorbing the four 32-byte chaining values into the
// stalk. Returns the bytes of p left over (fewer than 4*chunkSize).
func (s *State) writeX4(p []byte) []byte {
	for len(p) >= 4*chunkSize {
		var x4 keccakf1600.StateX4
		a := x4.Initialize(true)

		// Each chunk is 8192 = 48*168 + 128 bytes: 48 full rate blocks of
		// 168 bytes (21 lanes of 8 bytes) followed by a 128-byte tail.
		for offset := 0; offset < 48*168; offset += 168 {
			for i := 0; i < 21; i++ {
				// Lane j of the interleaved state absorbs chunk j.
				a[i*4] ^= binary.LittleEndian.Uint64(
					p[8*i+offset:],
				)
				a[i*4+1] ^= binary.LittleEndian.Uint64(
					p[chunkSize+8*i+offset:],
				)
				a[i*4+2] ^= binary.LittleEndian.Uint64(
					p[chunkSize*2+8*i+offset:],
				)
				a[i*4+3] ^= binary.LittleEndian.Uint64(
					p[chunkSize*3+8*i+offset:],
				)
			}

			x4.Permute()
		}

		// Absorb the 128-byte (16-word) tail of each chunk.
		for i := 0; i < 16; i++ {
			a[i*4] ^= binary.LittleEndian.Uint64(
				p[8*i+48*168:],
			)
			a[i*4+1] ^= binary.LittleEndian.Uint64(
				p[chunkSize+8*i+48*168:],
			)
			a[i*4+2] ^= binary.LittleEndian.Uint64(
				p[chunkSize*2+8*i+48*168:],
			)
			a[i*4+3] ^= binary.LittleEndian.Uint64(
				p[chunkSize*3+8*i+48*168:],
			)
		}

		// TurboSHAKE padding: DS byte 0x0B right after the message and
		// 0x80 in the final byte of the 168-byte rate.
		a[16*4] ^= 0x0b
		a[16*4+1] ^= 0x0b
		a[16*4+2] ^= 0x0b
		a[16*4+3] ^= 0x0b
		a[20*4] ^= 0x80 << 56
		a[20*4+1] ^= 0x80 << 56
		a[20*4+2] ^= 0x80 << 56
		a[20*4+3] ^= 0x80 << 56

		x4.Permute()

		// Squeeze the four 32-byte chaining values and feed them to the
		// stalk in chunk order.
		var buf [32 * 4]byte
		for i := 0; i < 4; i++ {
			binary.LittleEndian.PutUint64(buf[8*i:], a[4*i])
			binary.LittleEndian.PutUint64(buf[32+8*i:], a[4*i+1])
			binary.LittleEndian.PutUint64(buf[32*2+8*i:], a[4*i+2])
			binary.LittleEndian.PutUint64(buf[32*3+8*i:], a[4*i+3])
		}

		_, _ = s.stalk.Write(buf[:])
		p = p[chunkSize*4:]
		s.chunk += 4
	}

	return p
}
265+
266+
// writeX2 hashes two chunks at a time with a 2-way interleaved
// Keccak-f[1600], absorbing the two 32-byte chaining values into the
// stalk. Returns the bytes of p left over (fewer than 2*chunkSize).
func (s *State) writeX2(p []byte) []byte {
	// TODO On M2 Pro, 1/3 of the time is spent on this function
	// and LittleEndian.Uint64 excluding the actual permutation.
	// Rewriting in assembler might be worthwhile.
	for len(p) >= 2*chunkSize {
		var x2 keccakf1600.StateX2
		a := x2.Initialize(true)

		// Each chunk is 8192 = 48*168 + 128 bytes: 48 full rate blocks of
		// 168 bytes (21 lanes of 8 bytes) followed by a 128-byte tail.
		for offset := 0; offset < 48*168; offset += 168 {
			for i := 0; i < 21; i++ {
				a[i*2] ^= binary.LittleEndian.Uint64(
					p[8*i+offset:],
				)
				a[i*2+1] ^= binary.LittleEndian.Uint64(
					p[chunkSize+8*i+offset:],
				)
			}

			x2.Permute()
		}

		// Absorb the 128-byte (16-word) tail of each chunk.
		for i := 0; i < 16; i++ {
			a[i*2] ^= binary.LittleEndian.Uint64(
				p[8*i+48*168:],
			)
			a[i*2+1] ^= binary.LittleEndian.Uint64(
				p[chunkSize+8*i+48*168:],
			)
		}

		// TurboSHAKE padding: DS byte 0x0B right after the message and
		// 0x80 in the final byte of the 168-byte rate.
		a[16*2] ^= 0x0b
		a[16*2+1] ^= 0x0b
		a[20*2] ^= 0x80 << 56
		a[20*2+1] ^= 0x80 << 56

		x2.Permute()

		// Squeeze the two 32-byte chaining values and feed them to the
		// stalk in chunk order.
		var buf [32 * 2]byte
		for i := 0; i < 4; i++ {
			binary.LittleEndian.PutUint64(buf[8*i:], a[2*i])
			binary.LittleEndian.PutUint64(buf[32+8*i:], a[2*i+1])
		}

		_, _ = s.stalk.Write(buf[:])
		p = p[chunkSize*2:]
		s.chunk += 2
	}

	return p
}
316+
317+
func (s *State) Read(p []byte) (int, error) {
318+
if s.stalk.IsAbsorbing() {
319+
// Write context string C
320+
_, _ = s.Write(s.context)
321+
322+
// Write length_encode( |C| )
323+
var buf [9]byte
324+
binary.BigEndian.PutUint64(buf[:8], uint64(len(s.context)))
325+
326+
// Find first non-zero digit in big endian encoding of context length
327+
i := 0
328+
for buf[i] == 0 && i < 8 {
329+
i++
330+
}
331+
332+
buf[8] = byte(8 - i) // number of bytes to represent |C|
333+
_, _ = s.Write(buf[i:])
334+
335+
// We need to write the chunk number if we're past the first chunk.
336+
if s.buf != nil {
337+
// Write last remaining chunk(s)
338+
var cv [32]byte
339+
if s.lanes == 1 {
340+
if s.offset != 0 {
341+
_, _ = s.leaf.Read(cv[:])
342+
_, _ = s.stalk.Write(cv[:])
343+
s.chunk++
344+
}
345+
} else {
346+
remainingBuf := s.buf[:s.offset]
347+
for len(remainingBuf) > 0 {
348+
h := sha3.NewTurboShake128(0x0B)
349+
to := chunkSize
350+
if len(remainingBuf) < to {
351+
to = len(remainingBuf)
352+
}
353+
_, _ = h.Write(remainingBuf[:to])
354+
_, _ = h.Read(cv[:])
355+
_, _ = s.stalk.Write(cv[:])
356+
s.chunk++
357+
remainingBuf = remainingBuf[to:]
358+
}
359+
}
360+
361+
// Write length_encode( chunk )
362+
binary.BigEndian.PutUint64(buf[:8], uint64(s.chunk))
363+
364+
// Find first non-zero digit in big endian encoding of number of chunks
365+
i = 0
366+
for buf[i] == 0 && i < 8 {
367+
i++
368+
}
369+
370+
buf[8] = byte(8 - i) // number of bytes to represent number of chunks.
371+
_, _ = s.stalk.Write(buf[i:])
372+
_, _ = s.stalk.Write([]byte{0xff, 0xff})
373+
}
374+
}
375+
376+
return s.stalk.Read(p)
377+
}

0 commit comments

Comments
 (0)