Source File
chacha8_generic.go
Belonging Package
internal/chacha8rand
// Copyright 2023 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.// ChaCha8 is ChaCha with 8 rounds.// See https://cr.yp.to/chacha/chacha-20080128.pdf.//// ChaCha8 operates on a 4x4 matrix of uint32 values, initially set to://// const1 const2 const3 const4// seed seed seed seed// seed seed seed seed// counter64 0 0//// We use the same constants as ChaCha20 does, a random seed,// and a counter. Running ChaCha8 on this input produces// a 4x4 matrix of pseudo-random values with as much entropy// as the seed.//// Given SIMD registers that can hold N uint32s, it is possible// to run N ChaCha8 block transformations in parallel by filling// the first register with the N copies of const1, the second// with N copies of const2, and so on, and then running the operations.//// Each iteration of ChaCha8Rand operates over 32 bytes of input and// produces 992 bytes of RNG output, plus 32 bytes of input for the next// iteration.//// The 32 bytes of input are used as a ChaCha8 key, with a zero nonce, to// produce 1024 bytes of output (16 blocks, with counters 0 to 15).// First, for each block, the values 0x61707865, 0x3320646e, 0x79622d32,// 0x6b206574 are subtracted from the 32-bit little-endian words at// position 0, 1, 2, and 3 respectively, and an increasing counter// starting at zero is subtracted from each word at position 12. Then,// this stream is permuted such that for each sequence of four blocks,// first we output the first four bytes of each block, then the next four// bytes of each block, and so on. Finally, the last 32 bytes of output// are used as the input of the next iteration, and the remaining 992// bytes are the RNG output.//// See https://c2sp.org/chacha8rand for additional details.//// Normal ChaCha20 implementations for encryption use this same// parallelism but then have to deinterlace the results so that// it appears the blocks were generated separately. For the purposes// of generating random numbers, the interlacing is fine.// We are simply locked in to preserving the 4-way interlacing// in any future optimizations.package chacha8randimport ()// setup sets up 4 ChaCha8 blocks in b32 with the counter and seed.// Note that b32 is [16][4]uint32 not [4][16]uint32: the blocks are interlaced// the same way they would be in a 4-way SIMD implementations.func ( *[4]uint64, *[16][4]uint32, uint32) {// Convert to uint64 to do half as many stores to memory.:= (*[16][2]uint64)(unsafe.Pointer())// Constants; same as in ChaCha20: "expand 32-byte k"[0][0] = 0x61707865_61707865[0][1] = 0x61707865_61707865[1][0] = 0x3320646e_3320646e[1][1] = 0x3320646e_3320646e[2][0] = 0x79622d32_79622d32[2][1] = 0x79622d32_79622d32[3][0] = 0x6b206574_6b206574[3][1] = 0x6b206574_6b206574// Seed values.var uint64var uint32= uint32([0])= uint64()<<32 | uint64()[4][0] =[4][1] == uint32([0] >> 32)= uint64()<<32 | uint64()[5][0] =[5][1] == uint32([1])= uint64()<<32 | uint64()[6][0] =[6][1] == uint32([1] >> 32)= uint64()<<32 | uint64()[7][0] =[7][1] == uint32([2])= uint64()<<32 | uint64()[8][0] =[8][1] == uint32([2] >> 32)= uint64()<<32 | uint64()[9][0] =[9][1] == uint32([3])= uint64()<<32 | uint64()[10][0] =[10][1] == uint32([3] >> 32)= uint64()<<32 | uint64()[11][0] =[11][1] =// Counters.if goarch.BigEndian {[12][0] = uint64(+0)<<32 | uint64(+1)[12][1] = uint64(+2)<<32 | uint64(+3)} else {[12][0] = uint64(+0) | uint64(+1)<<32[12][1] = uint64(+2) | uint64(+3)<<32}// Zeros.[13][0] = 0[13][1] = 0[14][0] = 0[14][1] = 0[15][0] = 0[15][1] = 0}func () {// block and block_generic must have same type:= block= block_generic_ =}// block_generic is the non-assembly block implementation,// for use on systems without special assembly.// Even on such systems, it is quite fast: on GOOS=386,// ChaCha8 using this code generates random values faster than PCG-DXSM.func ( *[4]uint64, *[32]uint64, uint32) {:= (*[16][4]uint32)(unsafe.Pointer())setup(, , )for := range [0] {// Load block i from b[*][i] into local variables.:= [0][]:= [1][]:= [2][]:= [3][]:= [4][]:= [5][]:= [6][]:= [7][]:= [8][]:= [9][]:= [10][]:= [11][]:= [12][]:= [13][]:= [14][]:= [15][]// 4 iterations of eight quarter-rounds each is 8 roundsfor := 0; < 4; ++ {, , , = qr(, , , ), , , = qr(, , , ), , , = qr(, , , ), , , = qr(, , , ), , , = qr(, , , ), , , = qr(, , , ), , , = qr(, , , ), , , = qr(, , , )}// Store block i back into b[*][i].// Add b4..b11 back to the original key material,// like in ChaCha20, to avoid trivial invertibility.// There is no entropy in b0..b3 and b12..b15// so we can skip the additions and save some time.[0][] =[1][] =[2][] =[3][] =[4][] +=[5][] +=[6][] +=[7][] +=[8][] +=[9][] +=[10][] +=[11][] +=[12][] =[13][] =[14][] =[15][] =}if goarch.BigEndian {// On a big-endian system, reading the uint32 pairs as uint64s// will word-swap them compared to little-endian, so we word-swap// them here first to make the next swap get the right answer.for , := range {[] = >>32 | <<32}}}// qr is the (inlinable) ChaCha8 quarter round.func (, , , uint32) (, , , uint32) {+=^== <<16 | >>16+=^== <<12 | >>20+=^== <<8 | >>24+=^== <<7 | >>25return , , ,}
The pages are generated with Golds v0.7.6. (GOOS=linux GOARCH=amd64)