diff --git a/KeccakF-1600-int-set.h b/KeccakF-1600-int-set.h
new file mode 100644
index 0000000..4ec3523
--- /dev/null
+++ b/KeccakF-1600-int-set.h
@@ -0,0 +1 @@
+#define ProvideFast1088
diff --git a/KeccakF-1600-interface.h b/KeccakF-1600-interface.h
new file mode 100644
index 0000000..22185a4
--- /dev/null
+++ b/KeccakF-1600-interface.h
@@ -0,0 +1,46 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef KECCAK_PERMUTATION_INTERFACE_H
+#define KECCAK_PERMUTATION_INTERFACE_H
+
+#include "KeccakF-1600-int-set.h"
+
+void KeccakInitialize( void );
+void KeccakInitializeState(unsigned char *state);
+void KeccakPermutation(unsigned char *state);
+#ifdef ProvideFast576
+void KeccakAbsorb576bits(unsigned char *state, const unsigned char *data);
+#endif
+#ifdef ProvideFast832
+void KeccakAbsorb832bits(unsigned char *state, const unsigned char *data);
+#endif
+#ifdef ProvideFast1024
+void KeccakAbsorb1024bits(unsigned char *state, const unsigned char *data);
+#endif
+#ifdef ProvideFast1088
+void KeccakAbsorb1088bits(unsigned char *state, const unsigned char *data);
+#endif
+#ifdef ProvideFast1152
+void KeccakAbsorb1152bits(unsigned char *state, const unsigned char *data);
+#endif
+#ifdef ProvideFast1344
+void KeccakAbsorb1344bits(unsigned char *state, const unsigned char *data);
+#endif
+void KeccakAbsorb(unsigned char *state, const unsigned char *data, unsigned int laneCount);
+#ifdef ProvideFast1024
+void KeccakExtract1024bits(const unsigned char *state, unsigned char *data);
+#endif
+void KeccakExtract(const unsigned char *state, unsigned char *data, unsigned int laneCount);
+
+#endif
diff --git a/KeccakF-1600-x86-64-asm.c b/KeccakF-1600-x86-64-asm.c
new file mode 100644
index 0000000..68fb9bd
--- /dev/null
+++ b/KeccakF-1600-x86-64-asm.c
@@ -0,0 +1,56 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by Ronny Van Keer,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <string.h>
+#include "KeccakF-1600-interface.h"
+
+#define UseBebigokimisa
+
+typedef unsigned char UINT8;
+typedef unsigned long long int UINT64;
+
+/* Intentionally empty: this implementation has no set-up work to do. */
+void KeccakInitialize(void)
+{
+}
+
+/*
+ * Copy the first laneCount 64-bit lanes (laneCount*8 bytes) of state to data.
+ *
+ * When UseBebigokimisa is defined, lanes 1, 2, 8, 12, 17 and 20 are stored
+ * complemented in the state (KeccakInitializeState() starts them as all-ones),
+ * so every one of those lanes that lies inside the copied range is
+ * complemented again on the way out.  This is done byte by byte: data is only
+ * a byte pointer, so it may be unaligned and must not be written through a
+ * 64-bit lvalue.
+ */
+void KeccakExtract(const unsigned char *state, unsigned char *data, unsigned int laneCount)
+{
+#ifdef UseBebigokimisa
+    static const unsigned char complementedLanes[] = { 1, 2, 8, 12, 17, 20 };
+    unsigned int i, j;
+#endif
+
+    memcpy(data, state, laneCount*8);
+#ifdef UseBebigokimisa
+    for (i = 0; i < sizeof(complementedLanes); i++) {
+        unsigned char *lane;
+
+        if (complementedLanes[i] >= laneCount)
+            break; /* the table is ascending, so no later lane was copied */
+        lane = data + complementedLanes[i]*8;
+        for (j = 0; j < 8; j++)
+            lane[j] = (unsigned char)~lane[j];
+    }
+#endif
+}
diff --git a/KeccakF-1600-x86-64-shld-gas.s b/KeccakF-1600-x86-64-shld-gas.s
new file mode 100644
index 0000000..bc84762
--- /dev/null
+++ b/KeccakF-1600-x86-64-shld-gas.s
@@ -0,0 +1,766 @@
+#
+# The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+# Michaël Peeters and Gilles Van Assche.
For more information, feedback or
+# questions, please refer to our website: http://keccak.noekeon.org/
+#
+# Implementation by Ronny Van Keer,
+# hereby denoted as "the implementer".
+#
+# To the extent possible under law, the implementer has waived all copyright
+# and related or neighboring rights to the source code in this file.
+# http://creativecommons.org/publicdomain/zero/1.0/
+#
+
+    .text
+
+
+#// --- defines
+
+.equ UseSIMD, 1
+
+# byte offsets of the 25 64-bit lanes inside the state
+.equ _ba,  0*8
+.equ _be,  1*8
+.equ _bi,  2*8
+.equ _bo,  3*8
+.equ _bu,  4*8
+.equ _ga,  5*8
+.equ _ge,  6*8
+.equ _gi,  7*8
+.equ _go,  8*8
+.equ _gu,  9*8
+.equ _ka, 10*8
+.equ _ke, 11*8
+.equ _ki, 12*8
+.equ _ko, 13*8
+.equ _ku, 14*8
+.equ _ma, 15*8
+.equ _me, 16*8
+.equ _mi, 17*8
+.equ _mo, 18*8
+.equ _mu, 19*8
+.equ _sa, 20*8
+.equ _se, 21*8
+.equ _si, 22*8
+.equ _so, 23*8
+.equ _su, 24*8
+
+
+# arguments
+.equ apState, %rdi
+.equ apInput, %rsi
+.equ aNbrWords, %rdx
+
+# xor input into state section
+.equ xpState, %r9
+
+# round vars
+.equ rT1, %rax
+.equ rpState, %rdi
+.equ rpStack, %rsp
+
+.equ rDa, %rbx
+.equ rDe, %rcx
+.equ rDi, %rdx
+.equ rDo, %r8
+.equ rDu, %r9
+
+.equ rBa, %r10
+.equ rBe, %r11
+.equ rBi, %r12
+.equ rBo, %r13
+.equ rBu, %r14
+
+.equ rCa, %rsi
+.equ rCe, %rbp
+.equ rCi, rBi
+.equ rCo, rBo
+.equ rCu, %r15
+# mKeccakRound: reads the lanes at iState, writes the lanes at oState, rc is xored into lane ba; lastRound=1 omits the rCa/rCe/rCu updates
+.macro mKeccakRound iState, oState, rc, lastRound
+
+    movq rCe, rDa
+    shld $1, rDa, rDa
+
+    movq _bi(\iState), rCi
+    xorq _gi(\iState), rDi
+    xorq _ki(\iState), rCi
+    xorq rCu, rDa
+    xorq _mi(\iState), rDi
+    xorq rDi, rCi
+
+    movq rCi, rDe
+    shld $1, rDe, rDe
+
+    movq _bo(\iState), rCo
+    xorq _go(\iState), rDo
+    xorq _ko(\iState), rCo
+    xorq rCa, rDe
+    xorq _mo(\iState), rDo
+    xorq rDo, rCo
+
+    movq rCo, rDi
+    shld $1, rDi, rDi
+
+    movq rCu, rDo
+    xorq rCe, rDi
+    shld $1, rDo, rDo
+
+    movq rCa, rDu
+    xorq rCi, rDo
+    shld $1, rDu, rDu
+
+    movq _ba(\iState), rBa
+    movq _ge(\iState), rBe
+    xorq rCo, rDu
+    movq _ki(\iState), rBi
+    movq _mo(\iState), rBo
+    movq _su(\iState), rBu
+    xorq rDe, rBe
+    shld $44, rBe, rBe
+    xorq rDi, rBi
+    xorq rDa, rBa
+    shld $43, rBi, rBi
+
+    movq rBe, rCa
+    movq $\rc, rT1
+    orq rBi, rCa
+    xorq rBa, rT1
+    xorq rT1, rCa
+    movq rCa, _ba(\oState)
+
+    xorq rDu, rBu
+    shld $14, rBu, rBu
+    movq rBa, rCu
+    andq rBe, rCu
+    xorq rBu, rCu
+    movq rCu, _bu(\oState)
+
+    xorq rDo, rBo
+    shld $21, rBo, rBo
+    movq rBo, rT1
+    andq rBu, rT1
+    xorq rBi, rT1
+    movq rT1, _bi(\oState)
+
+    notq rBi
+    orq rBa, rBu
+    orq rBo, rBi
+    xorq rBo, rBu
+    xorq rBe, rBi
+    movq rBu, _bo(\oState)
+    movq rBi, _be(\oState)
+    .if \lastRound == 0
+        movq rBi, rCe
+    .endif
+
+
+    movq _gu(\iState), rBe
+    xorq rDu, rBe
+    movq _ka(\iState), rBi
+    shld $20, rBe, rBe
+    xorq rDa, rBi
+    shld $3, rBi, rBi
+    movq _bo(\iState), rBa
+    movq rBe, rT1
+    orq rBi, rT1
+    xorq rDo, rBa
+    movq _me(\iState), rBo
+    movq _si(\iState), rBu
+    shld $28, rBa, rBa
+    xorq rBa, rT1
+    movq rT1, _ga(\oState)
+    .if \lastRound == 0
+        xor rT1, rCa
+    .endif
+
+    xorq rDe, rBo
+    shld $45, rBo, rBo
+    movq rBi, rT1
+    andq rBo, rT1
+    xorq rBe, rT1
+    movq rT1, _ge(\oState)
+    .if \lastRound == 0
+        xorq rT1, rCe
+    .endif
+
+    xorq rDi, rBu
+    shld $61, rBu, rBu
+    movq rBu, rT1
+    orq rBa, rT1
+    xorq rBo, rT1
+    movq rT1, _go(\oState)
+
+    andq rBe, rBa
+    xorq rBu, rBa
+    movq rBa, _gu(\oState)
+    notq rBu
+    .if \lastRound == 0
+        xorq rBa, rCu
+    .endif
+
+    orq rBu, rBo
+    xorq rBi, rBo
+    movq rBo, _gi(\oState)
+
+
+    movq _be(\iState), rBa
+    movq _gi(\iState), rBe
+    movq _ko(\iState), rBi
+    movq _mu(\iState), rBo
+    movq _sa(\iState), rBu
+    xorq rDi, rBe
+    shld $6, rBe, rBe
+    xorq rDo, rBi
+    shld $25, rBi, rBi
+    movq rBe, rT1
+    orq rBi, rT1
+    xorq rDe, rBa
+    shld $1, rBa, rBa
+    xorq rBa, rT1
+    movq rT1, _ka(\oState)
+    .if \lastRound == 0
+        xor rT1, rCa
+    .endif
+
+    xorq rDu, rBo
+    shld $8, rBo, rBo
+    movq rBi, rT1
+    andq rBo, rT1
+    xorq rBe, rT1
+    movq rT1, _ke(\oState)
+    .if \lastRound == 0
+        xorq rT1, rCe
+    .endif
+
+    xorq rDa, rBu
+    shld $18, rBu, rBu
+    notq rBo
+    movq rBo, rT1
+    andq rBu, rT1
+    xorq rBi, rT1
+    movq rT1, _ki(\oState)
+
+    movq rBu, rT1
+    orq rBa, rT1
+    xorq rBo, rT1
+    movq rT1, _ko(\oState)
+
+    andq rBe, rBa
+    xorq rBu, rBa
+    movq rBa, _ku(\oState)
+    .if \lastRound == 0
+        xorq rBa, rCu
+    .endif
+
+    movq _ga(\iState), rBe
+    xorq rDa, rBe
+    movq _ke(\iState), rBi
+    shld $36, rBe, rBe
+    xorq rDe, rBi
+    movq _bu(\iState), rBa
+    shld $10, rBi, rBi
+    movq rBe, rT1
+    movq _mi(\iState), rBo
+    andq rBi, rT1
+    xorq rDu, rBa
+    movq _so(\iState), rBu
+    shld $27, rBa, rBa
+    xorq rBa, rT1
+    movq rT1, _ma(\oState)
+    .if \lastRound == 0
+        xor rT1, rCa
+    .endif
+
+    xorq rDi, rBo
+    shld $15, rBo, rBo
+    movq rBi, rT1
+    orq rBo, rT1
+    xorq rBe, rT1
+    movq rT1, _me(\oState)
+    .if \lastRound == 0
+        xorq rT1, rCe
+    .endif
+
+    xorq rDo, rBu
+    shld $56, rBu, rBu
+    notq rBo
+    movq rBo, rT1
+    orq rBu, rT1
+    xorq rBi, rT1
+    movq rT1, _mi(\oState)
+
+    orq rBa, rBe
+    xorq rBu, rBe
+    movq rBe, _mu(\oState)
+
+    andq rBa, rBu
+    xorq rBo, rBu
+    movq rBu, _mo(\oState)
+    .if \lastRound == 0
+        xorq rBe, rCu
+    .endif
+
+
+    movq _bi(\iState), rBa
+    movq _go(\iState), rBe
+    movq _ku(\iState), rBi
+    xorq rDi, rBa
+    movq _ma(\iState), rBo
+    shld $62, rBa, rBa
+    xorq rDo, rBe
+    movq _se(\iState), rBu
+    shld $55, rBe, rBe
+
+    xorq rDu, rBi
+    movq rBa, rDu
+    xorq rDe, rBu
+    shld $2, rBu, rBu
+    andq rBe, rDu
+    xorq rBu, rDu
+    movq rDu, _su(\oState)
+
+    shld $39, rBi, rBi
+    .if \lastRound == 0
+        xorq rDu, rCu
+    .endif
+    notq rBe
+    xorq rDa, rBo
+    movq rBe, rDa
+    andq rBi, rDa
+    xorq rBa, rDa
+    movq rDa, _sa(\oState)
+    .if \lastRound == 0
+        xor rDa, rCa
+    .endif
+
+    shld $41, rBo, rBo
+    movq rBi, rDe
+    orq rBo, rDe
+    xorq rBe, rDe
+    movq rDe, _se(\oState)
+    .if \lastRound == 0
+        xorq rDe, rCe
+    .endif
+
+    movq rBo, rDi
+    movq rBu, rDo
+    andq rBu, rDi
+    orq rBa, rDo
+    xorq rBi, rDi
+    xorq rBo, rDo
+    movq rDi, _si(\oState)
+    movq rDo, _so(\oState)
+
+    .endm
+# mKeccakPermutation: 24 rounds alternating between the state at rpState and a 25-lane scratch area reserved on the stack
+.macro mKeccakPermutation
+
+    subq $8*25, %rsp
+
+    movq _ba(rpState), rCa
+    movq _be(rpState), rCe
+    movq _bu(rpState), rCu
+
+    xorq _ga(rpState), rCa
+    xorq _ge(rpState), rCe
+    xorq _gu(rpState), rCu
+
+    xorq _ka(rpState), rCa
+    xorq _ke(rpState), rCe
+    xorq _ku(rpState), rCu
+
+    xorq _ma(rpState), rCa
+    xorq _me(rpState), rCe
+    xorq _mu(rpState), rCu
+
+    xorq _sa(rpState), rCa
+    xorq _se(rpState), rCe
+    movq _si(rpState), rDi
+    movq _so(rpState), rDo
+    xorq _su(rpState), rCu
+
+
+    mKeccakRound rpState, rpStack, 0x0000000000000001, 0
+    mKeccakRound rpStack, rpState, 0x0000000000008082, 0
+    mKeccakRound rpState, rpStack, 0x800000000000808a, 0
+    mKeccakRound rpStack, rpState, 0x8000000080008000, 0
+    mKeccakRound rpState, rpStack, 0x000000000000808b, 0
+    mKeccakRound rpStack, rpState, 0x0000000080000001, 0
+
+    mKeccakRound rpState, rpStack, 0x8000000080008081, 0
+    mKeccakRound rpStack, rpState, 0x8000000000008009, 0
+    mKeccakRound rpState, rpStack, 0x000000000000008a, 0
+    mKeccakRound rpStack, rpState, 0x0000000000000088, 0
+    mKeccakRound rpState, rpStack, 0x0000000080008009, 0
+    mKeccakRound rpStack, rpState, 0x000000008000000a, 0
+
+    mKeccakRound rpState, rpStack, 0x000000008000808b, 0
+    mKeccakRound rpStack, rpState, 0x800000000000008b, 0
+    mKeccakRound rpState, rpStack, 0x8000000000008089, 0
+    mKeccakRound rpStack, rpState, 0x8000000000008003, 0
+    mKeccakRound rpState, rpStack, 0x8000000000008002, 0
+    mKeccakRound rpStack, rpState, 0x8000000000000080, 0
+
+    mKeccakRound rpState, rpStack, 0x000000000000800a, 0
+    mKeccakRound rpStack, rpState, 0x800000008000000a, 0
+    mKeccakRound rpState, rpStack, 0x8000000080008081, 0
+    mKeccakRound rpStack, rpState, 0x8000000000008080, 0
+    mKeccakRound rpState, rpStack, 0x0000000080000001, 0
+    mKeccakRound rpStack, rpState, 0x8000000080008008, 1
+
+    addq $8*25, %rsp
+
+    .endm
+
+.macro mPushRegs
+
+    pushq %rbx
+    pushq %rbp
+    pushq %r12
+    pushq %r13
+    pushq %r14
+    pushq %r15
+
+    .endm
+
+# mPopRegs: restores the registers saved by mPushRegs, in reverse order
+.macro mPopRegs
+
+    popq %r15
+    popq %r14
+    popq %r13
+    popq %r12
+    popq %rbp
+    popq %rbx
+
+    .endm
+
+# mXorStateNNN: xor NNN bits read at offset(input) into offset(state); the UseSIMD form's pxor takes the state as a memory operand
+.macro mXorState128 input, state, offset
+    .if UseSIMD == 0
+    movq \offset(\input), %rax
+    movq \offset+8(\input), %rcx
+    xorq %rax, \offset(\state)
+    xorq %rcx, \offset+8(\state)
+    .else
+    movdqu \offset(\input), %xmm0
+    pxor \offset(\state), %xmm0
+    movdqu %xmm0, \offset(\state)
+    .endif
+    .endm
+
+.macro mXorState256 input, state, offset
+    .if UseSIMD == 0
+    movq \offset(\input), %rax
+    movq \offset+8(\input), %r10
+    movq \offset+16(\input), %rcx
+    movq \offset+24(\input), %r8
+    xorq %rax, \offset(\state)
+    xorq %r10, \offset+8(\state)
+    xorq %rcx, \offset+16(\state)
+    xorq %r8, \offset+24(\state)
+    .else
+    movdqu \offset(\input), %xmm0
+    pxor \offset(\state), %xmm0
+    movdqu \offset+16(\input), %xmm1
+    pxor \offset+16(\state), %xmm1
+    movdqu %xmm0, \offset(\state)
+    movdqu %xmm1, \offset+16(\state)
+    .endif
+    .endm
+
+.macro mXorState512 input, state, offset
+    .if UseSIMD == 0
+    mXorState256 \input, \state, \offset
+    mXorState256 \input, \state, \offset+32
+    .else
+    movdqu \offset(\input), %xmm0
+    movdqu \offset+16(\input), %xmm1
+    pxor \offset(\state), %xmm0
+    movdqu \offset+32(\input), %xmm2
+    pxor \offset+16(\state), %xmm1
+    movdqu %xmm0, \offset(\state)
+    movdqu \offset+48(\input), %xmm3
+    pxor \offset+32(\state), %xmm2
+    movdqu %xmm1, \offset+16(\state)
+    pxor \offset+48(\state), %xmm3
+    movdqu %xmm2, \offset+32(\state)
+    movdqu %xmm3, \offset+48(\state)
+    .endif
+    .endm
+
+# -------------------------------------------------------------------------
+
+    .align 2
+    .global KeccakPermutation
+    .type KeccakPermutation, %function
+KeccakPermutation:
+
+    mPushRegs
+    mKeccakPermutation
+    mPopRegs
+    ret
+    .size KeccakPermutation, .-KeccakPermutation    # after the body, so that '.' is the end of the function
+
+# -------------------------------------------------------------------------
+
+    .align 2
+    .global KeccakAbsorb576bits
+    .type KeccakAbsorb576bits, %function
+KeccakAbsorb576bits:
+
+    mXorState512 apInput, apState, 0
+    movq 64(apInput), %rax
+    xorq %rax, 64(apState)
+    mPushRegs
+    mKeccakPermutation
+    mPopRegs
+    ret
+    .size KeccakAbsorb576bits, .-KeccakAbsorb576bits
+
+# 
-------------------------------------------------------------------------
+
+    .align 2
+    .global KeccakAbsorb832bits
+    .type KeccakAbsorb832bits, %function
+KeccakAbsorb832bits:
+
+    mXorState512 apInput, apState, 0
+    mXorState256 apInput, apState, 64
+    movq 96(apInput), %rax
+    xorq %rax, 96(apState)
+    mPushRegs
+    mKeccakPermutation
+    mPopRegs
+    ret
+    .size KeccakAbsorb832bits, .-KeccakAbsorb832bits    # after the body, so that '.' is the end of the function
+
+# -------------------------------------------------------------------------
+
+    .align 2
+    .global KeccakAbsorb1024bits
+    .type KeccakAbsorb1024bits, %function
+KeccakAbsorb1024bits:
+
+    mXorState512 apInput, apState, 0
+    mXorState512 apInput, apState, 64
+    mPushRegs
+    mKeccakPermutation
+    mPopRegs
+    ret
+    .size KeccakAbsorb1024bits, .-KeccakAbsorb1024bits
+
+# -------------------------------------------------------------------------
+
+    .align 2
+    .global KeccakAbsorb1088bits
+    .type KeccakAbsorb1088bits, %function
+KeccakAbsorb1088bits:
+
+    mXorState512 apInput, apState, 0
+    mXorState512 apInput, apState, 64
+    movq 128(apInput), %rax
+    xorq %rax, 128(apState)
+    mPushRegs
+    mKeccakPermutation
+    mPopRegs
+    ret
+    .size KeccakAbsorb1088bits, .-KeccakAbsorb1088bits
+
+# -------------------------------------------------------------------------
+
+    .align 2
+    .global KeccakAbsorb1152bits
+    .type KeccakAbsorb1152bits, %function
+KeccakAbsorb1152bits:
+
+    mXorState512 apInput, apState, 0
+    mXorState512 apInput, apState, 64
+    mXorState128 apInput, apState, 128
+    mPushRegs
+    mKeccakPermutation
+    mPopRegs
+    ret
+    .size KeccakAbsorb1152bits, .-KeccakAbsorb1152bits
+
+# -------------------------------------------------------------------------
+
+    .align 2
+    .global KeccakAbsorb1344bits
+    .type KeccakAbsorb1344bits, %function
+KeccakAbsorb1344bits:
+
+    mXorState512 apInput, apState, 0
+    mXorState512 apInput, apState, 64
+    mXorState256 apInput, apState, 128
+    movq 160(apInput), %rax
+    xorq %rax, 160(apState)
+    mPushRegs
+    mKeccakPermutation
+    mPopRegs
+    ret
+    .size KeccakAbsorb1344bits, .-KeccakAbsorb1344bits
+
+# -------------------------------------------------------------------------
+
+    .align 2
+    .global KeccakAbsorb
+    .type KeccakAbsorb, %function
+KeccakAbsorb:
+
+    movq apState, xpState                # work on a copy: apState must still point at lane 0 for the permutation
+
+    test $16, aNbrWords                  # lane count has bit 4 set: xor 16 lanes (128 bytes)
+    jz xorInputToState8
+    mXorState512 apInput, xpState, 0
+    mXorState512 apInput, xpState, 64
+    addq $128, apInput
+    addq $128, xpState
+
+xorInputToState8:
+    test $8, aNbrWords                   # bit 3: 8 lanes (64 bytes)
+    jz xorInputToState4
+    mXorState512 apInput, xpState, 0
+    addq $64, apInput
+    addq $64, xpState
+
+xorInputToState4:
+    test $4, aNbrWords                   # bit 2: 4 lanes (32 bytes)
+    jz xorInputToState2
+    mXorState256 apInput, xpState, 0
+    addq $32, apInput
+    addq $32, xpState
+
+xorInputToState2:
+    test $2, aNbrWords                   # bit 1: 2 lanes (16 bytes)
+    jz xorInputToState1
+    mXorState128 apInput, xpState, 0
+    addq $16, apInput
+    addq $16, xpState
+
+xorInputToState1:
+    test $1, aNbrWords                   # bit 0: one last lane
+    jz xorInputToStateDone
+    movq (apInput), %rax
+    xorq %rax, (xpState)
+
+xorInputToStateDone:
+
+    mPushRegs
+    mKeccakPermutation
+    mPopRegs
+    ret
+    .size KeccakAbsorb, .-KeccakAbsorb
+
+# -------------------------------------------------------------------------
+
+    .align 2
+    .global KeccakInitializeState
+    .type KeccakInitializeState, %function
+KeccakInitializeState:
+    xorq %rax, %rax                      # rax = 0
+    xorq %rcx, %rcx
+    notq %rcx                            # rcx = all ones, stored to lanes 1, 2, 8, 12, 17 and 20
+
+    .if UseSIMD == 0
+    movq %rax, 0*8(apState)
+    movq %rcx, 1*8(apState)
+    movq %rcx, 2*8(apState)
+    movq %rax, 3*8(apState)
+    movq %rax, 4*8(apState)
+    movq %rax, 5*8(apState)
+    movq %rax, 6*8(apState)
+    movq %rax, 7*8(apState)
+    movq %rcx, 8*8(apState)
+    movq %rax, 9*8(apState)
+    movq %rax, 10*8(apState)
+    movq %rax, 11*8(apState)
+    movq %rcx, 12*8(apState)
+    movq %rax, 13*8(apState)
+    movq %rax, 14*8(apState)
+    movq %rax, 15*8(apState)
+    movq %rax, 16*8(apState)
+    movq %rcx, 17*8(apState)
+    movq %rax, 18*8(apState)
+    movq %rax, 19*8(apState)
+    movq %rcx, 20*8(apState)
+    movq %rax, 21*8(apState)
+    movq %rax, 22*8(apState)
+    movq %rax, 23*8(apState)
+    movq %rax, 24*8(apState)
+    .else
+    pxor %xmm0, %xmm0
+
+    movq %rax, 0*8(apState)
+    movq %rcx, 1*8(apState)
+    movq %rcx, 2*8(apState)
+    movq %rax, 3*8(apState)
+    movdqu %xmm0, 4*8(apState)
+    movdqu %xmm0, 6*8(apState)
+    movq %rcx, 8*8(apState)
+    movq %rax, 9*8(apState)
+    movdqu %xmm0, 10*8(apState)
+    movq %rcx, 12*8(apState)
+    movq %rax, 13*8(apState)
+    movdqu %xmm0, 14*8(apState)
+    movq %rax, 16*8(apState)
+    movq %rcx, 17*8(apState)
+    movdqu %xmm0, 18*8(apState)
+    movq %rcx, 20*8(apState)
+    movq %rax, 21*8(apState)
+    movdqu %xmm0, 22*8(apState)
+    movq %rax, 24*8(apState)
+    .endif
+    ret
+    .size KeccakInitializeState, .-KeccakInitializeState
+
+# -------------------------------------------------------------------------
+
+    .align 2
+    .global KeccakExtract1024bits
+    .type KeccakExtract1024bits, %function
+KeccakExtract1024bits:
+
+    movq 0*8(apState), %rax
+    movq 1*8(apState), %rcx
+    movq 2*8(apState), %rdx
+    movq 3*8(apState), %r8
+    notq %rcx
+    notq %rdx
+    movq %rax, 0*8(%rsi)
+    movq %rcx, 1*8(%rsi)
+    movq %rdx, 2*8(%rsi)
+    movq %r8, 3*8(%rsi)
+
+    movq 4*8(apState), %rax
+    movq 5*8(apState), %rcx
+    movq 6*8(apState), %rdx
+    movq 7*8(apState), %r8
+    movq %rax, 4*8(%rsi)
+    movq %rcx, 5*8(%rsi)
+    movq %rdx, 6*8(%rsi)
+    movq %r8, 7*8(%rsi)
+
+    movq 8*8(apState), %rax
+    movq 9*8(apState), %rcx
+    movq 10*8(apState), %rdx
+    movq 11*8(apState), %r8
+    notq %rax
+    movq %rax, 8*8(%rsi)
+    movq %rcx, 9*8(%rsi)
+    movq %rdx, 10*8(%rsi)
+    movq %r8, 11*8(%rsi)
+
+    movq 12*8(apState), %rax
+    movq 13*8(apState), %rcx
+    movq 14*8(apState), %rdx
+    movq 15*8(apState), %r8
+    notq %rax
+    movq %rax, 12*8(%rsi)
+    movq %rcx, 13*8(%rsi)
+    movq %rdx, 14*8(%rsi)
+    movq %r8, 15*8(%rsi)
+    ret
+    .size KeccakExtract1024bits, .-KeccakExtract1024bits
+
diff --git a/KeccakSponge.c b/KeccakSponge.c
new file mode 100644
index 0000000..5939ba4
--- /dev/null
+++ b/KeccakSponge.c
@@ -0,0 +1,266 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche.
For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <string.h>
+#include "KeccakSponge.h"
+#include "KeccakF-1600-interface.h"
+#ifdef KeccakReference
+#include "displayIntermediateValues.h"
+#endif
+
+int InitSponge(spongeState *state, unsigned int rate, unsigned int capacity)
+{
+    if (rate+capacity != 1600)
+        return 1;
+    if ((rate == 0) || (rate >= 1600) || ((rate % 64) != 0))
+        return 1;
+    KeccakInitialize();
+    state->rate = rate;
+    state->capacity = capacity;
+    state->fixedOutputLength = 0;
+    KeccakInitializeState(state->state);
+    memset(state->dataQueue, 0, KeccakMaximumRateInBytes);
+    state->bitsInQueue = 0;
+    state->squeezing = 0;
+    state->bitsAvailableForSqueezing = 0;
+
+    return 0;
+}
+
+void AbsorbQueue(spongeState *state) // absorb the whole queue (rate bits) into the state and mark the queue empty
+{
+    // state->bitsInQueue is assumed to be equal to state->rate
+    #ifdef KeccakReference
+    displayBytes(1, "Block to be absorbed", state->dataQueue, state->rate/8);
+    #endif
+#ifdef ProvideFast576
+    if (state->rate == 576)
+        KeccakAbsorb576bits(state->state, state->dataQueue);
+    else
+#endif
+#ifdef ProvideFast832
+    if (state->rate == 832)
+        KeccakAbsorb832bits(state->state, state->dataQueue);
+    else
+#endif
+#ifdef ProvideFast1024
+    if (state->rate == 1024)
+        KeccakAbsorb1024bits(state->state, state->dataQueue);
+    else
+#endif
+#ifdef ProvideFast1088
+    if (state->rate == 1088)
+        KeccakAbsorb1088bits(state->state, state->dataQueue);
+    else
+#endif
+#ifdef ProvideFast1152
+    if (state->rate == 1152)
+        KeccakAbsorb1152bits(state->state, state->dataQueue);
+    else
+#endif
+#ifdef ProvideFast1344
+    if (state->rate == 1344)
+        KeccakAbsorb1344bits(state->state, state->dataQueue);
+    else
+#endif
+        KeccakAbsorb(state->state, state->dataQueue, state->rate/64);
+    state->bitsInQueue = 0;
+}
+
+int Absorb(spongeState *state, const unsigned char *data, unsigned long long databitlen)
+{
+    unsigned long long i, j, wholeBlocks;
+    unsigned int partialBlock, partialByte;
+    const unsigned char *curData;
+
+    if ((state->bitsInQueue % 8) != 0)
+        return 1; // Only the last call may contain a partial byte
+    if (state->squeezing)
+        return 1; // Too late for additional input
+
+    i = 0;
+    while(i < databitlen) {
+        if ((state->bitsInQueue == 0) && (databitlen >= state->rate) && (i <= (databitlen-state->rate))) {
+            wholeBlocks = (databitlen-i)/state->rate; // queue is empty: absorb whole blocks straight from the caller's buffer
+            curData = data+i/8;
+#ifdef ProvideFast576
+            if (state->rate == 576) {
+                for(j=0; j<wholeBlocks; j++, curData+=576/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb576bits(state->state, curData);
+                }
+            }
+            else
+#endif
+#ifdef ProvideFast832
+            if (state->rate == 832) {
+                for(j=0; j<wholeBlocks; j++, curData+=832/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb832bits(state->state, curData);
+                }
+            }
+            else
+#endif
+#ifdef ProvideFast1024
+            if (state->rate == 1024) {
+                for(j=0; j<wholeBlocks; j++, curData+=1024/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb1024bits(state->state, curData);
+                }
+            }
+            else
+#endif
+#ifdef ProvideFast1088
+            if (state->rate == 1088) {
+                for(j=0; j<wholeBlocks; j++, curData+=1088/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb1088bits(state->state, curData);
+                }
+            }
+            else
+#endif
+#ifdef ProvideFast1152
+            if (state->rate == 1152) {
+                for(j=0; j<wholeBlocks; j++, curData+=1152/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb1152bits(state->state, curData);
+                }
+            }
+            else
+#endif
+#ifdef ProvideFast1344
+            if (state->rate == 1344) {
+                for(j=0; j<wholeBlocks; j++, curData+=1344/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb1344bits(state->state, curData);
+                }
+            }
+            else
+#endif
+            {
+                for(j=0; j<wholeBlocks; j++, curData+=state->rate/8) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, state->rate/8);
+                    #endif
+                    KeccakAbsorb(state->state, curData, state->rate/64);
+                }
+            }
+            i += wholeBlocks*state->rate;
+        }
+        else {
+            partialBlock = state->rate-state->bitsInQueue; // room left in the queue
+            if (databitlen - i < partialBlock) // compared in 64 bits: casting first could truncate to 0 and stall the loop
+                partialBlock = (unsigned int)(databitlen - i);
+            partialByte = partialBlock % 8;
+            partialBlock -= partialByte;
+            memcpy(state->dataQueue+state->bitsInQueue/8, data+i/8, partialBlock/8);
+            state->bitsInQueue += partialBlock;
+            i += partialBlock;
+            if (state->bitsInQueue == state->rate)
+                AbsorbQueue(state);
+            if (partialByte > 0) {
+                unsigned char mask = (1 << partialByte)-1;
+                state->dataQueue[state->bitsInQueue/8] = data[i/8] & mask;
+                state->bitsInQueue += partialByte;
+                i += partialByte;
+            }
+        }
+    }
+    return 0;
+}
+
+void PadAndSwitchToSqueezingPhase(spongeState *state)
+{
+    // Note: the bits are numbered from 0=LSB to 7=MSB
+    if (state->bitsInQueue + 1 == state->rate) {
+        state->dataQueue[state->bitsInQueue/8 ] |= 1 << (state->bitsInQueue % 8);
+        AbsorbQueue(state);
+        memset(state->dataQueue, 0, state->rate/8);
+    }
+    else {
+        memset(state->dataQueue + (state->bitsInQueue+7)/8, 0, state->rate/8 - (state->bitsInQueue+7)/8);
+        state->dataQueue[state->bitsInQueue/8 ] |= 1 << (state->bitsInQueue % 8);
+    }
+    state->dataQueue[(state->rate-1)/8] |= 1 << ((state->rate-1) % 8);
+    AbsorbQueue(state);
+
+    #ifdef KeccakReference
+    displayText(1, "--- Switching to squeezing phase ---");
+    #endif
+#ifdef ProvideFast1024
+    if (state->rate == 1024) {
+        KeccakExtract1024bits(state->state, state->dataQueue);
+        state->bitsAvailableForSqueezing = 1024;
+    }
+    else
+#endif
+    {
+        KeccakExtract(state->state, state->dataQueue, state->rate/64);
+        state->bitsAvailableForSqueezing = state->rate;
+    }
+    #ifdef KeccakReference
+    displayBytes(1, "Block available for squeezing", state->dataQueue, state->bitsAvailableForSqueezing/8);
+    #endif
+    state->squeezing = 1;
+}
+
+int Squeeze(spongeState *state, unsigned char *output, unsigned long long outputLength)
+{
+    unsigned long long i;
+    unsigned int partialBlock;
+
+    if ((outputLength % 8) != 0) // validated first, so that a rejected request leaves the sponge in its current phase
+        return 1; // Only multiple of 8 bits are allowed, truncation can be done at user level
+    if (!state->squeezing)
+        PadAndSwitchToSqueezingPhase(state);
+
+    i = 0;
+    while(i < outputLength) {
+        if (state->bitsAvailableForSqueezing == 0) {
+            KeccakPermutation(state->state);
+#ifdef ProvideFast1024
+            if (state->rate == 1024) {
+                KeccakExtract1024bits(state->state, state->dataQueue);
+                state->bitsAvailableForSqueezing = 1024;
+            }
+            else
+#endif
+            {
+                KeccakExtract(state->state, state->dataQueue, state->rate/64);
+                state->bitsAvailableForSqueezing = state->rate;
+            }
+            #ifdef KeccakReference
+            displayBytes(1, "Block available for squeezing", state->dataQueue, state->bitsAvailableForSqueezing/8);
+            #endif
+        }
+        partialBlock = state->bitsAvailableForSqueezing;
+        if ((unsigned long long)partialBlock > outputLength - i)
+            partialBlock = (unsigned int)(outputLength - i);
+        memcpy(output+i/8, state->dataQueue+(state->rate-state->bitsAvailableForSqueezing)/8, partialBlock/8);
+        state->bitsAvailableForSqueezing -= partialBlock;
+        i += partialBlock;
+    }
+    return 0;
+}
diff --git a/KeccakSponge.h b/KeccakSponge.h
new file mode 100644
index 0000000..df3d797
--- /dev/null
+++ b/KeccakSponge.h
@@ -0,0 +1,76 @@
+/*
+The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+Michaël Peeters and Gilles Van Assche. For more information, feedback or
+questions, please refer to our website: http://keccak.noekeon.org/
+
+Implementation by the designers,
+hereby denoted as "the implementer".
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef KECCAK_SPONGE_H
+#define KECCAK_SPONGE_H
+
+#define KeccakPermutationSize 1600
+#define KeccakPermutationSizeInBytes (KeccakPermutationSize/8)
+#define KeccakMaximumRate 1536
+#define KeccakMaximumRateInBytes (KeccakMaximumRate/8)
+
+#if defined(__GNUC__)
+#define ALIGN __attribute__ ((aligned(32)))
+#elif defined(_MSC_VER)
+#define ALIGN __declspec(align(32))
+#else
+#define ALIGN
+#endif
+
+ALIGN typedef struct spongeStateStruct {
+    ALIGN unsigned char state[KeccakPermutationSizeInBytes];      /* the 1600-bit permutation state */
+    ALIGN unsigned char dataQueue[KeccakMaximumRateInBytes];      /* input not yet absorbed, or output not yet delivered */
+    unsigned int rate;                                            /* r, in bits */
+    unsigned int capacity;                                        /* c, in bits */
+    unsigned int bitsInQueue;                                     /* number of input bits waiting in dataQueue */
+    unsigned int fixedOutputLength;                               /* set to 0 by InitSponge() */
+    int squeezing;                                                /* 0 while absorbing, 1 once squeezing has started */
+    unsigned int bitsAvailableForSqueezing;                       /* output bits of dataQueue not yet delivered */
+} spongeState;
+
+/**
+  * Function to initialize the state of the Keccak[r, c] sponge function.
+  * The sponge function is set to the absorbing phase.
+  * @param  state       Pointer to the state of the sponge function to be initialized.
+  * @param  rate        The value of the rate r.
+  * @param  capacity    The value of the capacity c.
+  * @pre    One must have r+c=1600 and the rate a multiple of 64 bits in this implementation.
+  * @return Zero if successful, 1 otherwise.
+  */
+int InitSponge(spongeState *state, unsigned int rate, unsigned int capacity);
+/**
+  * Function to give input data for the sponge function to absorb.
+  * @param  state       Pointer to the state of the sponge function initialized by InitSponge().
+  * @param  data        Pointer to the input data.
+  *                     When @a databitlen is not a multiple of 8, the last bits of data must be
+  *                     in the least significant bits of the last byte.
+  * @param  databitlen  The number of input bits provided in the input data.
+  * @pre    In the previous call to Absorb(), databitlen was a multiple of 8.
+  * @pre    The sponge function must be in the absorbing phase,
+  *         i.e., Squeeze() must not have been called before.
+  * @return Zero if successful, 1 otherwise.
+ */ +int Absorb(spongeState *state, const unsigned char *data, unsigned long long databitlen); +/** + * Function to squeeze output data from the sponge function. + * If the sponge function was in the absorbing phase, this function + * switches it to the squeezing phase. + * @param state Pointer to the state of the sponge function initialized by InitSponge(). + * @param output Pointer to the buffer where to store the output data. + * @param outputLength The number of output bits desired. + * It must be a multiple of 8. + * @return Zero if successful, 1 otherwise. + */ +int Squeeze(spongeState *state, unsigned char *output, unsigned long long outputLength); + +#endif diff --git a/Makefile.am b/Makefile.am index 164c350..bc2c038 100644 --- a/Makefile.am +++ b/Makefile.am @@ -17,7 +17,7 @@ minerd_SOURCES = elist.h miner.h compat.h \ cpu-miner.c util.c \ sha2.c sha2-arm.S sha2-x86.S sha2-x64.S \ scrypt.c scrypt-arm.S scrypt-x86.S scrypt-x64.S \ - keccak.c maxcoin.c + keccak.c maxcoin.c KeccakSponge.c KeccakF-1600-x86-64-asm.c KeccakF-1600-x86-64-shld-gas.s minerd_LDFLAGS = $(PTHREAD_FLAGS) minerd_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ minerd_CPPFLAGS = @LIBCURL_CPPFLAGS@ diff --git a/NEWS b/NEWS index c24e16e..c8af9f9 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,8 @@ +Version 2.3.3 - Feb 27, 2014 + - asm optimizations for keccak (4100 to 8100 khash/s) + asm code by Ronny Van Keer + added to cpuminer by otila from bitcointalk.org + Version 2.3.2a - Feb 2014 - Added support for MaxCoin (Keccak/SHA3) diff --git a/configure.ac b/configure.ac index 6a4d222..b461e42 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer], [2.3.2]) +AC_INIT([cpuminer], [2.3.3]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index f275b36..70448ca 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -38,6 +38,7 @@ #include #include "compat.h" #include "miner.h" +#include "KeccakSponge.h" #define PROGRAM_NAME "minerd" #define 
DEF_RPC_URL "http://127.0.0.1:8669/" @@ -1291,6 +1292,8 @@ void signal_handler(int sig) } #endif +extern spongeState keccak512_init; + int main(int argc, char *argv[]) { struct thr_info *thr; @@ -1310,6 +1313,8 @@ int main(int argc, char *argv[]) pthread_mutex_init(&stratum.sock_lock, NULL); pthread_mutex_init(&stratum.work_lock, NULL); + InitSponge(&keccak512_init, 1088, 512); + flags = strncmp(rpc_url, "https:", 6) ? (CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL) : CURL_GLOBAL_ALL; diff --git a/maxcoin.c b/maxcoin.c index 5c8c21f..90b6d0a 100644 --- a/maxcoin.c +++ b/maxcoin.c @@ -1,21 +1,19 @@ -#include "cpuminer-config.h" -#include "miner.h" - #include #include -#include "sph_keccak.h" +#include "cpuminer-config.h" +#include "miner.h" +#include "KeccakSponge.h" + +spongeState keccak512_init; static void keccakhash(void *state, const void *input) { - sph_keccak256_context ctx_keccak; - uint32_t hash[32]; - - sph_keccak256_init(&ctx_keccak); - sph_keccak256 (&ctx_keccak,input, 80); - sph_keccak256_close(&ctx_keccak, hash); + spongeState keccak512_tmp; - memcpy(state, hash, 32); + memcpy(&keccak512_tmp, &keccak512_init, sizeof(keccak512_init)); + Absorb(&keccak512_tmp, input, 80*8); + Squeeze(&keccak512_tmp, state, 32*8); } int scanhash_keccak(int thr_id, uint32_t *pdata, const uint32_t *ptarget, --- /dev/null 2014-02-15 15:59:56.749282015 +0200 +++ cpuminer-1gh/.gitignore 2014-02-27 16:42:02.807681711 +0200 @@ -0,0 +1,18 @@ +.*.swp +*~ +/autom4te.cache/ +/.deps/ +config.log +config.status +configure +cpuminer-config.h.in +cpuminer-config.h +Makefile +Makefile.in +stamp-h1 +aclocal.m4 + +ID +tags +*.o +minerd