diff --git a/.gitignore b/.gitignore
index 1eea430..3a9a32b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,9 @@
-
 minerd
 minerd.exe
 *.o
+*~
+ID
+tags
 
 autom4te.cache
 .deps
diff --git a/Makefile.am b/Makefile.am
index e69a01b..6ee01d7 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -18,7 +18,7 @@ dist_man_MANS = minerd.1
 minerd_SOURCES = elist.h miner.h compat.h \
 		  cpu-miner.c util.c \
 		  sha2.c scrypt.c \
-		  keccak.c groestl.c blake.c \
+		  keccak.c groestl512.c blake512_sse41.c \
 		  hefty1.c heavy.c
 if ARCH_x86
 minerd_SOURCES += sha2-x86.S scrypt-x86.S
diff --git a/blake512_sse41.c b/blake512_sse41.c
new file mode 100644
index 0000000..aa5341d
--- /dev/null
+++ b/blake512_sse41.c
@@ -0,0 +1,272 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <smmintrin.h>
+
+/* CONFIG START */
+#define AVOID_BRANCHING 1
+//#define HAVE_XOP 1
+/* CONFIG END */
+
+#include "blake512_sse41.h"
+#include "blake512_sse41_rounds.h"
+
+#define U8TO32(p) \
+  (((u32)((p)[0]) << 24) | ((u32)((p)[1]) << 16) | \
+   ((u32)((p)[2]) <<  8) | ((u32)((p)[3])      ))
+#define U8TO64(p) \
+  (((u64)U8TO32(p) << 32) | (u64)U8TO32((p) + 4))
+#define U32TO8(p, v) \
+  (p)[0] = (u8)((v) >> 24); (p)[1] = (u8)((v) >> 16); \
+  (p)[2] = (u8)((v) >>  8); (p)[3] = (u8)((v)      );
+#define U64TO8(p, v) \
+  U32TO8((p),     (u32)((v) >> 32)); \
+  U32TO8((p) + 4, (u32)((v)      ));
+
+static const u8 padding[129] =
+{
+  0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+};
+
+static inline int blake512_compress( blake512_state * state, const u8 * datablock )
+{
+  __m128i row1l,row1h;
+  __m128i row2l,row2h;
+  __m128i row3l,row3h;
+  __m128i row4l,row4h;
+
+  const __m128i r16 = _mm_setr_epi8(2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9);
+  const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
+
+  __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+  __m128i t0, t1, t2, t3, t4, t5, t6, t7;
+  __m128i b0, b1, b2, b3;
+
+  m0 = _mm_loadu_si128((__m128i*)(datablock +   0));
+  m1 = _mm_loadu_si128((__m128i*)(datablock +  16));
+  m2 = _mm_loadu_si128((__m128i*)(datablock +  32));
+  m3 = _mm_loadu_si128((__m128i*)(datablock +  48));
+  m4 = _mm_loadu_si128((__m128i*)(datablock +  64));
+  m5 = _mm_loadu_si128((__m128i*)(datablock +  80));
+  m6 = _mm_loadu_si128((__m128i*)(datablock +  96));
+  m7 = _mm_loadu_si128((__m128i*)(datablock + 112));
+
+  m0 = BSWAP64(m0);
+  m1 = BSWAP64(m1);
+  m2 = BSWAP64(m2);
+  m3 = BSWAP64(m3);
+  m4 = BSWAP64(m4);
+  m5 = BSWAP64(m5);
+  m6 = BSWAP64(m6);
+  m7 = BSWAP64(m7);
+
+  row1l = state->h[0];
+  row1h = state->h[1];
+  row2l = state->h[2];
+  row2h = state->h[3];
+  row3l = _mm_set_epi64x(0x13198A2E03707344ULL, 0x243F6A8885A308D3ULL);
+  row3h = _mm_set_epi64x(0x082EFA98EC4E6C89ULL, 0xA4093822299F31D0ULL);
+
+  row4l = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL);
+  row4h = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xC0AC29B7C97C50DDULL);
+
+#ifdef AVOID_BRANCHING
+  do
+  {
+    const __m128i mask = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_set1_epi32(state->nullt));
+    const __m128i xor1 = _mm_and_si128(_mm_set1_epi64x(state->t[0]), mask);
+    const __m128i xor2 = _mm_and_si128(_mm_set1_epi64x(state->t[1]), mask);
+    row4l = _mm_xor_si128(row4l, xor1);
+    row4h = _mm_xor_si128(row4h, xor2);
+  } while(0);
+#else
+  if(!state->nullt)
+  {
+    row4l = _mm_xor_si128(row4l, _mm_set1_epi64x(state->t[0]));
+    row4h = _mm_xor_si128(row4h, _mm_set1_epi64x(state->t[1]));
+  }
+#endif
+
+  ROUND( 0);
+  ROUND( 1);
+  ROUND( 2);
+  ROUND( 3);
+  ROUND( 4);
+  ROUND( 5);
+  ROUND( 6);
+  ROUND( 7);
+  ROUND( 8);
+  ROUND( 9);
+  ROUND(10);
+  ROUND(11);
+  ROUND(12);
+  ROUND(13);
+  ROUND(14);
+  ROUND(15);
+
+  row1l = _mm_xor_si128(row3l,row1l);
+  row1h = _mm_xor_si128(row3h,row1h);
+
+  state->h[0] = _mm_xor_si128(row1l, state->h[0]);
+  state->h[1] = _mm_xor_si128(row1h, state->h[1]);
+
+  row2l = _mm_xor_si128(row4l,row2l);
+  row2h = _mm_xor_si128(row4h,row2h);
+
+  state->h[2] = _mm_xor_si128(row2l, state->h[2]);
+  state->h[3] = _mm_xor_si128(row2h, state->h[3]);
+
+  return 0;
+}
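+
+/*
+ * Note on the streaming API below: lengths are passed in BITS, not bytes,
+ * and the counter state->t[] counts bits as well (1024 bits per 128-byte
+ * block), so a caller hashing an 80-byte block header passes 80*8.
+ */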
+
+void blake512_update( blake512_state * S, const u8 * data, u64 datalen )
+{
+  int left = (S->buflen >> 3);
+  int fill = 128 - left;
+
+  if( left && ( ((datalen >> 3) & 0x7F) >= fill ) ) {
+    memcpy( (void *) (S->buf + left), (void *) data, fill );
+    S->t[0] += 1024;
+    blake512_compress( S, S->buf );
+    data += fill;
+    datalen -= (fill << 3);
+    left = 0;
+  }
+
+  while( datalen >= 1024 ) {
+    S->t[0] += 1024;
+    blake512_compress( S, data );
+    data += 128;
+    datalen -= 1024;
+  }
+
+  if( datalen > 0 ) {
+    memcpy( (void *) (S->buf + left), (void *) data, ( datalen>>3 ) & 0x7F );
+    S->buflen = (left<<3) + datalen;
+  }
+  else S->buflen=0;
+}
+
+void blake512_final( blake512_state * S, u8 * digest )
+{
+  u8 msglen[16], zo=0x01, oo=0x81;
+  u64 lo=S->t[0] + S->buflen, hi = S->t[1];
+  if ( lo < S->buflen ) hi++;
+  U64TO8( msglen + 0, hi );
+  U64TO8( msglen + 8, lo );
+
+  if ( S->buflen == 888 ) /* one padding byte */
+  {
+    S->t[0] -= 8;
+    blake512_update( S, &oo, 8 );
+  }
+  else
+  {
+    if ( S->buflen < 888 ) /* enough space to fill the block */
+    {
+      if ( S->buflen == 0 ) S->nullt=1;
+      S->t[0] -= 888 - S->buflen;
+      blake512_update( S, padding, 888 - S->buflen );
+    }
+    else /* NOT enough space, need 2 compressions */
+    {
+      S->t[0] -= 1024 - S->buflen;
+      blake512_update( S, padding, 1024 - S->buflen );
+      S->t[0] -= 888;
+      blake512_update( S, padding+1, 888 );
+      S->nullt = 1;
+    }
+    blake512_update( S, &zo, 8 );
+    S->t[0] -= 8;
+  }
+  S->t[0] -= 128;
+  blake512_update( S, msglen, 128 );
+
+  do
+  {
+    const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
+    _mm_storeu_si128((__m128i*)(digest +  0), BSWAP64(S->h[0]));
+    _mm_storeu_si128((__m128i*)(digest + 16), BSWAP64(S->h[1]));
+    _mm_storeu_si128((__m128i*)(digest + 32), BSWAP64(S->h[2]));
+    _mm_storeu_si128((__m128i*)(digest + 48), BSWAP64(S->h[3]));
+  } while(0);
+}
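+
+/* Usage sketch (illustrative helper, kept compiled out like the self-test
+ * below): one-shot hashing of an 80-byte block header with the streaming
+ * API above; note the bit-length convention. */
+#if 0
+static void blake512_hash80( u8 * digest, const u8 * header )
+{
+  blake512_state S;
+
+  blake512_init( &S );
+  blake512_update( &S, header, 80*8 );  /* 80 bytes = 640 bits */
+  blake512_final( &S, digest );
+}
+#endif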
+
+#if 0
+int crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen )
+{
+  blake512_state S;
+
+  blake512_init( &S );
+  blake512_update( &S, in, inlen*8 );
+  blake512_final( &S, out );
+  return 0;
+}
+
+int main()
+{
+#if 1
+  int i;
+  uint64_t foo[10] = {0xbeef, 0xbabe, 0xf00d};
+  blake512_state S, S_tmp;
+  char out[64];
+
+  blake512_init( &S );
+  for (i = 0; i < 5000000; i++) {
+    memcpy(&S_tmp, &S, sizeof(S));
+    blake512_update(&S_tmp, (const unsigned char *)foo, 80*8);
+    blake512_final( &S_tmp, out);
+    foo[0]++;
+  }
+  write(1, out, 64);
+  exit(0);
+#else
+  int i, v;
+  u8 data[144], digest[64];
+  u8 test1[]= {0x97, 0x96, 0x15, 0x87, 0xF6, 0xD9, 0x70, 0xFA, 0xBA, 0x6D, 0x24, 0x78, 0x04, 0x5D, 0xE6, 0xD1,
+               0xFA, 0xBD, 0x09, 0xB6, 0x1A, 0xE5, 0x09, 0x32, 0x05, 0x4D, 0x52, 0xBC, 0x29, 0xD3, 0x1B, 0xE4,
+               0xFF, 0x91, 0x02, 0xB9, 0xF6, 0x9E, 0x2B, 0xBD, 0xB8, 0x3B, 0xE1, 0x3D, 0x4B, 0x9C, 0x06, 0x09,
+               0x1E, 0x5F, 0xA0, 0xB4, 0x8B, 0xD0, 0x81, 0xB6, 0x34, 0x05, 0x8B, 0xE0, 0xEC, 0x49, 0xBE, 0xB3};
+  u8 test2[]= {0x31, 0x37, 0x17, 0xD6, 0x08, 0xE9, 0xCF, 0x75, 0x8D, 0xCB, 0x1E, 0xB0, 0xF0, 0xC3, 0xCF, 0x9F,
+               0xC1, 0x50, 0xB2, 0xD5, 0x00, 0xFB, 0x33, 0xF5, 0x1C, 0x52, 0xAF, 0xC9, 0x9D, 0x35, 0x8A, 0x2F,
+               0x13, 0x74, 0xB8, 0xA3, 0x8B, 0xBA, 0x79, 0x74, 0xE7, 0xF6, 0xEF, 0x79, 0xCA, 0xB1, 0x6F, 0x22,
+               0xCE, 0x1E, 0x64, 0x9D, 0x6E, 0x01, 0xAD, 0x95, 0x89, 0xC2, 0x13, 0x04, 0x5D, 0x54, 0x5D, 0xDE};
+
+  for(i=0; i<144; ++i) data[i]=0;
+
+  crypto_hash( digest, data, 1 );
+  v=0;
+  for(i=0; i<64; ++i) {
+    printf("%02X", digest[i]);
+    if ( digest[i] != test1[i]) v=1;
+  }
+  if (v) printf("\nerror\n");
+  else printf("\nok\n");
+
+  for(i=0; i<144; ++i) data[i]=0;
+
+  crypto_hash( digest, data, 144 );
+  v=0;
+  for(i=0; i<64; ++i) {
+    printf("%02X", digest[i]);
+    if ( digest[i] != test2[i]) v=1;
+  }
+  if (v) printf("\nerror\n");
+  else printf("\nok\n");
+
+  return 0;
+#endif
+}
+
+#endif
diff --git a/blake512_sse41.h b/blake512_sse41.h
new file mode 100644
index 0000000..6b16485
--- /dev/null
+++ b/blake512_sse41.h
@@ -0,0 +1,32 @@
+#ifndef _BLAKE512_SSE41_H_
+#define _BLAKE512_SSE41_H_
+
+#include <stdint.h>
+#include <smmintrin.h>
+
+typedef uint64_t u64;
+typedef uint32_t u32;
+typedef uint8_t u8;
+
+typedef struct
+{
+  __m128i h[4];
+  u64 s[4], t[2];
+  u32 buflen, nullt;
+  u8 buf[128];
+} blake512_state __attribute__ ((aligned (64)));
+
+static inline void blake512_init( blake512_state * S )
+{
+  memset(S, 0, sizeof(blake512_state));
+  S->h[0] = _mm_set_epi64x(0xBB67AE8584CAA73BULL, 0x6A09E667F3BCC908ULL);
+  S->h[1] = _mm_set_epi64x(0xA54FF53A5F1D36F1ULL, 0x3C6EF372FE94F82BULL);
+  S->h[2] = _mm_set_epi64x(0x9B05688C2B3E6C1FULL, 0x510E527FADE682D1ULL);
+  S->h[3] = _mm_set_epi64x(0x5BE0CD19137E2179ULL, 0x1F83D9ABFB41BD6BULL);
+}
+
+extern void blake512_update(blake512_state * S, const u8 * data, u64 datalen );
+extern void blake512_final(blake512_state * S, u8 * digest );
+
+#endif
diff --git a/blake512_sse41_rounds.h b/blake512_sse41_rounds.h
new file mode 100644
index 0000000..303bd11
--- /dev/null
+++ b/blake512_sse41_rounds.h
@@ -0,0 +1,871 @@
+
+#ifndef __BLAKE512_ROUNDS_H__
+#define __BLAKE512_ROUNDS_H__
+
+#ifndef HAVE_XOP
+  #define BSWAP64(x) _mm_shuffle_epi8((x), u8to64)
+
+  #define _mm_roti_epi64(x, c) \
+      (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
+    : (-(c) == 16) ? 
_mm_shuffle_epi8((x), r16) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-c))) +#else + #define BSWAP64(x) _mm_perm_epi8((x),(x),u8to64) +#endif + + +#define LOAD_MSG_0_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m0, m1); \ +t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x13198A2E03707344ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m2, m3); \ +t3 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xBE5466CF34E90C6CULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_0_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m0, m1); \ +t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x243F6A8885A308D3ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m2, m3); \ +t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x452821E638D01377ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_0_3(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m4, m5); \ +t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xD1310BA698DFB5ACULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m6, m7); \ +t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x24A19947B3916CF7ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_0_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m4, m5); \ +t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x9216D5D98979FB1BULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m6, m7); \ +t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_1_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m7, m2); \ +t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m4, m6); \ +t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x636920D871574E69ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_1_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m5, m4); \ +t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_alignr_epi8(m3, m7, 8); \ +t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xD1310BA698DFB5ACULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_1_3(b0, b1) \ +do \ +{ \ +t0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ +t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0xBA7C9045F12C7F99ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m5, m2); \ +t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_1_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m6, m1); \ +t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x13198A2E03707344ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m3, m1); \ +t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xB8E1AFED6A267E96ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_2_1(b0, b1) \ +do \ +{ \ +t0 = _mm_alignr_epi8(m6, m5, 8); \ +t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x9216D5D98979FB1BULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m2, m7); \ +t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xA4093822299F31D0ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_2_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m4, m0); \ +t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0xB8E1AFED6A267E96ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m1, m6, 0xF0); \ +t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_2_3(b0, b1) \ 
+do \ +{ \ +t0 = _mm_blend_epi16(m5, m1, 0xF0); \ +t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x801F2E2858EFC16ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m3, m4); \ +t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x13198A2E03707344ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_2_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m7, m3); \ +t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x2FFD72DBD01ADFB7ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_alignr_epi8(m2, m0, 8); \ +t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x3F84D5B5B5470917ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_3_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m3, m1); \ +t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xD1310BA698DFB5ACULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m6, m5); \ +t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_3_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m4, m0); \ +t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m6, m7); \ +t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x24A19947B3916CF7ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_3_3(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m1, m2, 0xF0); \ +t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m2, m7, 0xF0); \ +t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_3_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m3, m5); \ +t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xA4093822299F31D0ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m0, m4); \ +t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_4_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m4, m2); \ +t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x243F6A8885A308D3ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m1, m5); \ +t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_4_2(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m0, m3, 0xF0); \ +t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xD1310BA698DFB5ACULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m2, m7, 0xF0); \ +t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xA4093822299F31D0ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_4_3(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m7, m5, 0xF0); \ +t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0x13198A2E03707344ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m3, m1, 0xF0); \ +t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x9216D5D98979FB1BULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_4_4(b0, b1) \ +do \ +{ \ +t0 = _mm_alignr_epi8(m6, m0, 8); \ +t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x801F2E2858EFC16ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m4, m6, 0xF0); \ +t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xC0AC29B7C97C50DDULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_5_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m1, m3); \ +t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xBA7C9045F12C7F99ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m0, m4); \ +t3 = 
_mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xB8E1AFED6A267E96ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_5_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m6, m5); \ +t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0xA4093822299F31D0ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m5, m1); \ +t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_5_3(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m2, m3, 0xF0); \ +t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x24A19947B3916CF7ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m7, m0); \ +t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x801F2E2858EFC16ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_5_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m6, m2); \ +t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x452821E638D01377ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m7, m4, 0xF0); \ +t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x636920D871574E69ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_6_1(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m6, m0, 0xF0); \ +t1 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m7, m2); \ +t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x24A19947B3916CF7ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_6_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m2, m7); \ +t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xBA7C9045F12C7F99ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_alignr_epi8(m5, m6, 8); \ +t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_6_3(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m0, m3); \ +t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \ +t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xA4093822299F31D0ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_6_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m3, m1); \ +t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x243F6A8885A308D3ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m1, m5, 0xF0); \ +t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0xD1310BA698DFB5ACULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_7_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m6, m3); \ +t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xB8E1AFED6A267E96ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m6, m1, 0xF0); \ +t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x13198A2E03707344ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_7_2(b0, b1) \ +do \ +{ \ +t0 = _mm_alignr_epi8(m7, m5, 8); \ +t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x24A19947B3916CF7ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m0, m4); \ +t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xBA7C9045F12C7F99ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_7_3(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m2, m7); \ +t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x243F6A8885A308D3ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m4, m1); \ +t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_7_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m0, m2); \ +t1 = 
_mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m3, m5); \ +t3 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x9216D5D98979FB1BULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_8_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m3, m7); \ +t1 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x636920D871574E69ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_alignr_epi8(m0, m5, 8); \ +t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x82EFA98EC4E6C89ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_8_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m7, m4); \ +t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xC0AC29B7C97C50DDULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_alignr_epi8(m4, m1, 8); \ +t3 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0xB8E1AFED6A267E96ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_8_3(b0, b1) \ +do \ +{ \ +t0 = m6; \ +t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xA4093822299F31D0ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_alignr_epi8(m5, m0, 8); \ +t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_8_4(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m1, m3, 0xF0); \ +t1 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xBA7C9045F12C7F99ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = m2; \ +t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x13198A2E03707344ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_9_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m5, m4); \ +t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0xA4093822299F31D0ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m3, m0); \ +t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xC0AC29B7C97C50DDULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_9_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m1, m2); \ +t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m3, m2, 0xF0); \ +t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x3F84D5B5B5470917ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_9_3(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m7, m4); \ +t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xB8E1AFED6A267E96ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m1, m6); \ +t3 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0xBA7C9045F12C7F99ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_9_4(b0, b1) \ +do \ +{ \ +t0 = _mm_alignr_epi8(m7, m5, 8); \ +t1 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x636920D871574E69ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m6, m0); \ +t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x82EFA98EC4E6C89ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_10_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m0, m1); \ +t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x13198A2E03707344ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m2, m3); \ +t3 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xBE5466CF34E90C6CULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_10_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m0, m1); \ +t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x243F6A8885A308D3ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m2, m3); \ +t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x452821E638D01377ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define 
LOAD_MSG_10_3(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m4, m5); \ +t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xD1310BA698DFB5ACULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m6, m7); \ +t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x24A19947B3916CF7ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_10_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m4, m5); \ +t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x9216D5D98979FB1BULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m6, m7); \ +t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_11_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m7, m2); \ +t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m4, m6); \ +t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x636920D871574E69ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_11_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m5, m4); \ +t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_alignr_epi8(m3, m7, 8); \ +t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xD1310BA698DFB5ACULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_11_3(b0, b1) \ +do \ +{ \ +t0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ +t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0xBA7C9045F12C7F99ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m5, m2); \ +t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_11_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m6, m1); \ +t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x13198A2E03707344ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m3, m1); \ +t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xB8E1AFED6A267E96ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_12_1(b0, b1) \ +do \ +{ \ +t0 = _mm_alignr_epi8(m6, m5, 8); \ +t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x9216D5D98979FB1BULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m2, m7); \ +t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xA4093822299F31D0ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_12_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m4, m0); \ +t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0xB8E1AFED6A267E96ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m1, m6, 0xF0); \ +t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_12_3(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m5, m1, 0xF0); \ +t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x801F2E2858EFC16ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m3, m4); \ +t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x13198A2E03707344ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_12_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m7, m3); \ +t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x2FFD72DBD01ADFB7ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_alignr_epi8(m2, m0, 8); \ +t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x3F84D5B5B5470917ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_13_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m3, m1); \ +t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xD1310BA698DFB5ACULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = 
_mm_unpackhi_epi64(m6, m5); \ +t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_13_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m4, m0); \ +t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m6, m7); \ +t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x24A19947B3916CF7ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_13_3(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m1, m2, 0xF0); \ +t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m2, m7, 0xF0); \ +t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_13_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m3, m5); \ +t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xA4093822299F31D0ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m0, m4); \ +t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_14_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m4, m2); \ +t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x243F6A8885A308D3ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m1, m5); \ +t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_14_2(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m0, m3, 0xF0); \ +t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xD1310BA698DFB5ACULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m2, m7, 0xF0); \ +t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xA4093822299F31D0ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_14_3(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m7, m5, 0xF0); \ +t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0x13198A2E03707344ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m3, m1, 0xF0); \ +t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x9216D5D98979FB1BULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_14_4(b0, b1) \ +do \ +{ \ +t0 = _mm_alignr_epi8(m6, m0, 8); \ +t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x801F2E2858EFC16ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m4, m6, 0xF0); \ +t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xC0AC29B7C97C50DDULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_15_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m1, m3); \ +t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xBA7C9045F12C7F99ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m0, m4); \ +t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xB8E1AFED6A267E96ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_15_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m6, m5); \ +t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0xA4093822299F31D0ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m5, m1); \ +t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_15_3(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m2, m3, 0xF0); \ +t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x24A19947B3916CF7ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m7, m0); \ +t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x801F2E2858EFC16ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_15_4(b0, b1) \ +do \ +{ \ +t0 = 
_mm_unpackhi_epi64(m6, m2); \ +t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x452821E638D01377ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m7, m4, 0xF0); \ +t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x636920D871574E69ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + + + + + +#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -32); \ + row4h = _mm_roti_epi64(row4h, -32); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -25); \ + row2h = _mm_roti_epi64(row2h, -25); \ + +#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -16); \ + row4h = _mm_roti_epi64(row4h, -16); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -11); \ + row2h = _mm_roti_epi64(row2h, -11); \ + + +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2h, row2l, 8); \ + t1 = _mm_alignr_epi8(row2l, row2h, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4h, row4l, 8); \ + t1 = _mm_alignr_epi8(row4l, row4h, 8); \ + row4l = t1; \ + row4h = t0; + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2l, row2h, 8); \ + t1 = _mm_alignr_epi8(row2h, row2l, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4l, row4h, 8); \ + t1 = _mm_alignr_epi8(row4h, row4l, 8); \ + row4l = t1; \ + row4h = t0; + +#define ROUND(r) \ + LOAD_MSG_ ##r ##_1(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_2(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + LOAD_MSG_ ##r ##_3(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_4(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); + +#endif + diff --git a/brg_endian.h b/brg_endian.h new file mode 100644 index 0000000..e3cf0d1 --- /dev/null +++ b/brg_endian.h @@ -0,0 +1,133 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The redistribution and use of this software (with or without changes) + is allowed without the payment of fees or royalties provided that: + + 1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + 2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + + 3. 
the name of the copyright holder is not used to endorse products
+      built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 20/12/2007
+*/
+
+#ifndef _BRG_ENDIAN_H
+#define _BRG_ENDIAN_H
+
+#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
+#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */
+
+/* Include files where endian defines and byteswap functions may reside */
+#if defined( __sun )
+#  include <sys/isa_defs.h>
+#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
+#  include <sys/endian.h>
+#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
+      defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
+#  include <machine/endian.h>
+#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
+#  if !defined( __MINGW32__ ) && !defined( _AIX )
+#    include <endian.h>
+#    if !defined( __BEOS__ )
+#      include <byteswap.h>
+#    endif
+#  endif
+#endif
+
+/* Now attempt to set the define for platform byte order using any  */
+/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which  */
+/* seem to encompass most endian symbol definitions                 */
+
+#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
+#  if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
+#  if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( _BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( _LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
+#  if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( __BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
+#  if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( __BIG_ENDIAN__ )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN__ )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
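+
+/* Code that includes this header then branches on the result, e.g.:
+ *
+ *     #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+ *         (little-endian load/store code)
+ *     #elif PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
+ *         (big-endian load/store code)
+ *     #endif
+ */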
+
+/* if the platform byte order could not be determined, then try to */
+/* set this define using common machine defines                    */
+#if !defined(PLATFORM_BYTE_ORDER)
+
+#if   defined( __alpha__ ) || defined( __alpha ) || defined( i386 )       || \
+      defined( __i386__ )  || defined( _M_I86 )  || defined( _M_IX86 )    || \
+      defined( __OS2__ )   || defined( sun386 )  || defined( __TURBOC__ ) || \
+      defined( vax )       || defined( vms )     || defined( VMS )        || \
+      defined( __VMS )     || defined( _M_X64 )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+
+#elif defined( AMIGA )    || defined( applec )    || defined( __AS400__ )  || \
+      defined( _CRAY )    || defined( __hppa )    || defined( __hp9000 )   || \
+      defined( ibm370 )   || defined( mc68000 )   || defined( m68k )       || \
+      defined( __MRC__ )  || defined( __MVS__ )   || defined( __MWERKS__ ) || \
+      defined( sparc )    || defined( __sparc)    || defined( SYMANTEC_C ) || \
+      defined( __VOS__ )  || defined( __TIGCC__ ) || defined( __TANDEM )   || \
+      defined( THINK_C )  || defined( __VMCMS__ ) || defined( _AIX )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+
+#elif 0     /* **** EDIT HERE IF NECESSARY **** */
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#elif 0     /* **** EDIT HERE IF NECESSARY **** */
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#else
+#  error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order
+#endif
+
+#endif
+
+#endif
diff --git a/brg_types.h b/brg_types.h
new file mode 100644
index 0000000..fd603b7
--- /dev/null
+++ b/brg_types.h
@@ -0,0 +1,234 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
+
+ (a few lines added by Soeren S. Thomsen, October 2008)
+
+ LICENSE TERMS
+
+ The redistribution and use of this software (with or without changes)
+ is allowed without the payment of fees or royalties provided that:
+
+  1. source code distributions include the above copyright notice, this
+     list of conditions and the following disclaimer;
+
+  2. binary distributions include the above copyright notice, this list
+     of conditions and the following disclaimer in their documentation;
+
+  3. the name of the copyright holder is not used to endorse products
+     built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 20/12/2007
+
+ The unsigned integer types defined here are of the form uint_<nn>t where
+ <nn> is the length of the type; for example, the unsigned 32-bit type is
+ 'uint_32t'.  These are NOT the same as the 'C99 integer types' that are
+ defined in the inttypes.h and stdint.h headers since attempts to use these
+ types have shown that support for them is still highly variable.  However,
+ since the latter are of the form uint<nn>_t, a regular expression search
+ and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t')
+ can be used to convert the types used here to the C99 standard types.
+*/
+
+#ifndef _BRG_TYPES_H
+#define _BRG_TYPES_H
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include <limits.h>
+
+#if defined( _MSC_VER ) && ( _MSC_VER >= 1300 )
+#  include <stddef.h>
+#  define ptrint_t intptr_t
+#elif defined( __GNUC__ ) && ( __GNUC__ >= 3 )
+#  include <stdint.h>
+#  define ptrint_t intptr_t
+#else
+#  define ptrint_t int
+#endif
+
+#ifndef BRG_UI8
+#  define BRG_UI8
+#  if UCHAR_MAX == 255u
+     typedef unsigned char uint_8t;
+#  else
+#    error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h
+#  endif
+#endif
+
+#ifndef BRG_UI16
+#  define BRG_UI16
+#  if USHRT_MAX == 65535u
+     typedef unsigned short uint_16t;
+#  else
+#    error Please define uint_16t as a 16-bit unsigned short type in brg_types.h
+#  endif
+#endif
+
+#ifndef BRG_UI32
+#  define BRG_UI32
+#  if UINT_MAX == 4294967295u
+#    define li_32(h) 0x##h##u
+     typedef unsigned int uint_32t;
+#  elif ULONG_MAX == 4294967295u
+#    define li_32(h) 0x##h##ul
+     typedef unsigned long uint_32t;
+#  elif defined( _CRAY )
+#    error This code needs 32-bit data types, which Cray machines do not provide
+#  else
+#    error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h
+#  endif
+#endif
+
+#ifndef BRG_UI64
+#  if defined( __BORLANDC__ ) && !defined( __MSDOS__ )
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ui64
+     typedef unsigned __int64 uint_64t;
+#  elif defined( _MSC_VER ) && ( _MSC_VER < 1300 )    /* 1300 == VC++ 7.0 */
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ui64
+     typedef unsigned __int64 uint_64t;
+#  elif defined( __sun ) && defined( ULONG_MAX ) && ULONG_MAX == 0xfffffffful
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ull
+     typedef unsigned long long uint_64t;
+#  elif defined( __MVS__ )
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ull
+     typedef unsigned int long long uint_64t;
+#  elif defined( UINT_MAX ) && UINT_MAX > 4294967295u
+#    if UINT_MAX == 18446744073709551615u
+#      define BRG_UI64
+#      define li_64(h) 0x##h##u
+       typedef unsigned int uint_64t;
+#    endif
+#  elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u
+#    if ULONG_MAX == 18446744073709551615ul
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ul
+       typedef unsigned long uint_64t;
+#    endif
+#  elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u
+#    if ULLONG_MAX == 18446744073709551615ull
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ull
+       typedef unsigned long long uint_64t;
+#    endif
+#  elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u
+#    if ULONG_LONG_MAX == 18446744073709551615ull
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ull
+       typedef unsigned long long uint_64t;
+#    endif
+#  endif
+#endif
+
+#if !defined( BRG_UI64 )
+#  if defined( NEED_UINT_64T )
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ull
+     typedef unsigned long long uint_64t;
+     /*#  error Please define uint_64t as an unsigned 64 bit type in brg_types.h*/
+#  endif
+#endif
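+
+/* The li_64() helper selected above attaches whatever suffix the chosen
+ * 64-bit type needs, e.g. li_64(0123456789abcdef) expands to
+ * 0x0123456789abcdefull when uint_64t is unsigned long long. */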
+
+#ifndef RETURN_VALUES
+#  define RETURN_VALUES
+#  if defined( DLL_EXPORT )
+#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+#      define VOID_RETURN __declspec( dllexport ) void __stdcall
+#      define INT_RETURN  __declspec( dllexport ) int  __stdcall
+#    elif defined( __GNUC__ )
+#      define VOID_RETURN __declspec( __dllexport__ ) void
+#      define INT_RETURN  __declspec( __dllexport__ ) int
+#    else
+#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#    endif
+#  elif defined( DLL_IMPORT )
+#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+#      define VOID_RETURN __declspec( dllimport ) void __stdcall
+#      define INT_RETURN  __declspec( dllimport ) int  __stdcall
+#    elif defined( __GNUC__ )
+#      define VOID_RETURN __declspec( __dllimport__ ) void
+#      define INT_RETURN  __declspec( __dllimport__ ) int
+#    else
+#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#    endif
+#  elif defined( __WATCOMC__ )
+#    define VOID_RETURN void __cdecl
+#    define INT_RETURN  int  __cdecl
+#  else
+#    define VOID_RETURN void
+#    define INT_RETURN  int
+#  endif
+#endif
+
+/*  These defines are used to detect and set the memory alignment of pointers.
+    Note that offsets are in bytes.
+
+    ALIGN_OFFSET(x,n)   return the positive or zero offset of
+                        the memory addressed by the pointer 'x'
+                        from an address that is aligned on an
+                        'n' byte boundary ('n' is a power of 2)
+
+    ALIGN_FLOOR(x,n)    return a pointer that points to memory
+                        that is aligned on an 'n' byte boundary
+                        and is not higher than the memory address
+                        pointed to by 'x' ('n' is a power of 2)
+
+    ALIGN_CEIL(x,n)     return a pointer that points to memory
+                        that is aligned on an 'n' byte boundary
+                        and is not lower than the memory address
+                        pointed to by 'x' ('n' is a power of 2)
+*/
+
+#define ALIGN_OFFSET(x,n)  (((ptrint_t)(x)) & ((n) - 1))
+#define ALIGN_FLOOR(x,n)   ((uint_8t*)(x) - ( ((ptrint_t)(x)) & ((n) - 1)))
+#define ALIGN_CEIL(x,n)    ((uint_8t*)(x) + (-((ptrint_t)(x)) & ((n) - 1)))
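+
+/* Example: ALIGN_OFFSET(p,16) is zero when 'p' is already suitable for an
+ * aligned SSE load, and ALIGN_CEIL(p,16) rounds 'p' up to the next
+ * 16-byte boundary. */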
+
+/*  These defines are used to declare buffers in a way that allows
+    faster operations on longer variables to be used.  In all these
+    defines 'size' must be a power of 2 and >= 8. NOTE that the
+    buffer size is in bytes but the type length is in bits
+
+    UNIT_TYPEDEF(x,size)        declares a variable 'x' of length
+                                'size' bits
+
+    BUFR_TYPEDEF(x,size,bsize)  declares a buffer 'x' of length 'bsize'
+                                bytes defined as an array of variables
+                                each of 'size' bits (bsize must be a
+                                multiple of size / 8)
+
+    UNIT_CAST(x,size)           casts a variable to a type of
+                                length 'size' bits
+
+    UPTR_CAST(x,size)           casts a pointer to a pointer to a
+                                variable of length 'size' bits
+*/
+
+#define UI_TYPE(size)               uint_##size##t
+#define UNIT_TYPEDEF(x,size)        typedef UI_TYPE(size) x
+#define BUFR_TYPEDEF(x,size,bsize)  typedef UI_TYPE(size) x[bsize / (size >> 3)]
+#define UNIT_CAST(x,size)           ((UI_TYPE(size)  )(x))
+#define UPTR_CAST(x,size)           ((UI_TYPE(size)* )(x))
+
+/* Added by Soeren S. Thomsen (begin) */
+#define u8  uint_8t
+#define u32 uint_32t
+#define u64 uint_64t
+/* (end) */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/cpu-miner.c b/cpu-miner.c
index 4a0ade7..1a64649 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -35,6 +35,7 @@
 #endif
 #include <jansson.h>
 #include <curl/curl.h>
+
 #include "compat.h"
 #include "miner.h"
 
@@ -63,7 +64,7 @@ static inline void affine_to_cpu(int id, int cpu)
 
 	CPU_ZERO(&set);
 	CPU_SET(cpu, &set);
-	sched_setaffinity(0, sizeof(&set), &set);
+	sched_setaffinity(0, sizeof(set), &set);
 }
 #elif defined(__FreeBSD__) /* FreeBSD specific policy and affinity management */
 #include <sys/cpuset.h>
@@ -263,6 +264,8 @@ struct work {
 static struct work g_work;
 static time_t g_work_time;
 static pthread_mutex_t g_work_lock;
+static pthread_mutex_t g_restart_lock;
+static pthread_mutex_t g_work_time_lock;
 
 static bool jobj_binary(const json_t *obj, const char *key,
 			void *buf, size_t buflen)
@@ -706,6 +709,35 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 	diff_to_target(work->target, sctx->job.diff);
 }
 
+static inline time_t get_g_work_time(void)
+{
+	time_t gnow;
+
+	pthread_mutex_lock(&g_work_time_lock);
+	gnow = g_work_time;
+	pthread_mutex_unlock(&g_work_time_lock);
+	return gnow;
+}
+
+static inline void set_g_work_time(time_t now)
+{
+	pthread_mutex_lock(&g_work_time_lock);
+	g_work_time = now;
+	pthread_mutex_unlock(&g_work_time_lock);
+}
+
+static inline void add_g_work_time(time_t addtime)
+{
+	pthread_mutex_lock(&g_work_time_lock);
+	g_work_time += addtime;
+	pthread_mutex_unlock(&g_work_time_lock);
+}
+
+static inline void sub_g_work_time(time_t addtime)
+{
+	add_g_work_time(-addtime);
+}
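+
+/* Every access to g_work_time outside of g_work_lock now goes through
+ * these accessors, so the longpoll, stratum and miner threads do not
+ * race on the bare time_t. */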
+
 static void *miner_thread(void *userdata)
 {
 	struct thr_info *mythr = userdata;
@@ -746,7 +778,7 @@ static void *miner_thread(void *userdata)
 		int rc;
 
 		if (have_stratum) {
-			while (time(NULL) >= g_work_time + 120)
+			while (time(NULL) >= get_g_work_time() + 120)
 				sleep(1);
 			pthread_mutex_lock(&g_work_lock);
 			if (work.data[19] >= end_nonce)
@@ -755,7 +787,7 @@ static void *miner_thread(void *userdata)
 		/* obtain new work from internal workio thread */
 		pthread_mutex_lock(&g_work_lock);
 		if (!have_stratum && (!have_longpoll ||
-				time(NULL) >= g_work_time + LP_SCANTIME*3/4 ||
+				time(NULL) >= get_g_work_time() + LP_SCANTIME*3/4 ||
 				work.data[19] >= end_nonce)) {
 			if (unlikely(!get_work(mythr, &g_work))) {
 				applog(LOG_ERR, "work retrieval failed, exiting "
@@ -763,7 +795,7 @@ static void *miner_thread(void *userdata)
 				pthread_mutex_unlock(&g_work_lock);
 				goto out;
 			}
-			g_work_time = have_stratum ? 0 : time(NULL);
+			set_g_work_time(have_stratum ? 0 : time(NULL));
 		}
 		if (have_stratum) {
 			pthread_mutex_unlock(&g_work_lock);
@@ -775,16 +807,19 @@ static void *miner_thread(void *userdata)
 				work.data[19] = 0xffffffffU / opt_n_threads * thr_id;
 		} else
 			work.data[19]++;
-		pthread_mutex_unlock(&g_work_lock);
+		pthread_mutex_lock(&g_restart_lock);
 		work_restart[thr_id].restart = 0;
+		pthread_mutex_unlock(&g_restart_lock);
 
 		/* adjust max_nonce to meet target scan time */
 		if (have_stratum)
 			max64 = LP_SCANTIME;
 		else
-			max64 = g_work_time + (have_longpoll ? LP_SCANTIME : opt_scantime)
+			max64 = get_g_work_time() + (have_longpoll ? LP_SCANTIME : opt_scantime)
 				- time(NULL);
+		pthread_mutex_lock(&stats_lock);
 		max64 *= thr_hashrates[thr_id];
+		pthread_mutex_unlock(&stats_lock);
 		if (max64 <= 0)
 			max64 = opt_algo == ALGO_SCRYPT ? 0xfffLL : 0x1fffffLL;
 		if (work.data[19] + max64 > end_nonce)
@@ -793,6 +828,8 @@ static void *miner_thread(void *userdata)
 			max_nonce = work.data[19] + max64;
 
 		hashes_done = 0;
+		pthread_mutex_unlock(&g_work_lock);
+
 		gettimeofday(&tv_start, NULL);
 
 		/* scan nonces for a proof-of-work hash */
@@ -825,7 +862,7 @@ static void *miner_thread(void *userdata)
 //		fprintf(stderr, " vote = %u\n", ext[0]);
 //		fprintf(stderr, " reward = %u\n", ext[1]);
 
-		if (opt_vote > work.maxvote) {
+		if (opt_vote > work.maxvote && !opt_benchmark) {
 			printf("Warning: Your block reward vote (%hu) exceeds "
 			       "the maxvote reported by the pool (%hu).\n",
 			       opt_vote, work.maxvote);
@@ -891,15 +928,23 @@ static void *miner_thread(void *userdata)
 			pthread_mutex_unlock(&stats_lock);
 		}
 		if (!opt_quiet) {
-			sprintf(s, thr_hashrates[thr_id] >= 1e6 ? "%.0f" : "%.2f",
-				1e-3 * thr_hashrates[thr_id]);
+			double myrate;
+
+			pthread_mutex_lock(&stats_lock);
+			myrate = thr_hashrates[thr_id];
+			pthread_mutex_unlock(&stats_lock);
+
+			sprintf(s, myrate >= 1e6 ? "%.0f" : "%.2f",
+				1e-3 * myrate);
 			applog(LOG_INFO, "thread %d: %lu hashes, %s khash/s",
 				thr_id, hashes_done, s);
 		}
 		if (opt_benchmark && thr_id == opt_n_threads - 1) {
 			double hashrate = 0.;
+			pthread_mutex_lock(&stats_lock);
 			for (i = 0; i < opt_n_threads && thr_hashrates[i]; i++)
 				hashrate += thr_hashrates[i];
+			pthread_mutex_unlock(&stats_lock);
 			if (i == opt_n_threads) {
 				sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", 1e-3 * hashrate);
 				applog(LOG_INFO, "Total: %s khash/s", s);
@@ -921,8 +966,10 @@ static void restart_threads(void)
 {
 	int i;
 
+	pthread_mutex_lock(&g_restart_lock);
 	for (i = 0; i < opt_n_threads; i++)
 		work_restart[i].restart = 1;
+	pthread_mutex_unlock(&g_restart_lock);
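+	/* Pairs with the g_restart_lock section in miner_thread(), which
+	 * clears the flag before each scan range. */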
 }
 
 static void *longpoll_thread(void *userdata)
@@ -983,15 +1030,13 @@ start:
 		if (work_decode(json_object_get(val, "result"), &g_work)) {
 			if (opt_debug)
 				applog(LOG_DEBUG, "DEBUG: got new work");
-			time(&g_work_time);
+			set_g_work_time(time(NULL));
 			restart_threads();
 		}
 		pthread_mutex_unlock(&g_work_lock);
 		json_decref(val);
 	} else {
-		pthread_mutex_lock(&g_work_lock);
-		g_work_time -= LP_SCANTIME;
-		pthread_mutex_unlock(&g_work_lock);
+		sub_g_work_time(LP_SCANTIME);
 		if (err == CURLE_OPERATION_TIMEDOUT) {
 			restart_threads();
 		} else {
@@ -1061,7 +1106,7 @@ static void *stratum_thread(void *userdata)
 
 	while (!stratum.curl) {
 		pthread_mutex_lock(&g_work_lock);
-		g_work_time = 0;
+		set_g_work_time(0);
 		pthread_mutex_unlock(&g_work_lock);
 		restart_threads();
 
@@ -1080,10 +1125,10 @@ static void *stratum_thread(void *userdata)
 		}
 
 		if (stratum.job.job_id &&
-		    (strcmp(stratum.job.job_id, g_work.job_id) || !g_work_time)) {
+		    (strcmp(stratum.job.job_id, g_work.job_id) || !get_g_work_time())) {
 			pthread_mutex_lock(&g_work_lock);
 			stratum_gen_work(&stratum, &g_work);
-			time(&g_work_time);
+			set_g_work_time(time(NULL));
 			pthread_mutex_unlock(&g_work_lock);
 			if (stratum.job.clean) {
 				applog(LOG_INFO, "Stratum detected new block");
@@ -1290,6 +1335,7 @@ static void parse_arg (int key, char *arg)
 		want_longpoll = false;
 		want_stratum = false;
 		have_stratum = false;
+		opt_vote = 1024;
 		break;
 	case 1003:
 		want_longpoll = false;
@@ -1395,6 +1441,8 @@ static void signal_handler(int sig)
 }
 #endif
 
+extern void init_hashstates(void);
+
 int main(int argc, char *argv[])
 {
 	struct thr_info *thr;
@@ -1419,9 +1467,13 @@ int main(int argc, char *argv[])
 		sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
 	}
 
+	init_hashstates();
+
 	pthread_mutex_init(&applog_lock, NULL);
 	pthread_mutex_init(&stats_lock, NULL);
 	pthread_mutex_init(&g_work_lock, NULL);
+	pthread_mutex_init(&g_work_time_lock, NULL);
+	pthread_mutex_init(&g_restart_lock, NULL);
 	pthread_mutex_init(&stratum.sock_lock, NULL);
 	pthread_mutex_init(&stratum.work_lock, NULL);
diff --git a/groestl-intr-aes.h b/groestl-intr-aes.h
new file mode 100644
index 0000000..d12be98
--- /dev/null
+++ b/groestl-intr-aes.h
@@ -0,0 +1,962 @@
+/* groestl-intr-aes.h     Aug 2011
+ *
+ * Groestl implementation with intrinsics using ssse3, sse4.1, and aes
+ * instructions.
+ * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
+ *
+ * This code is placed in the public domain
+ */
+
+#include <smmintrin.h>
+#include <wmmintrin.h>
+
+#include "groestl512.h"
+
+/* global constants */
+__m128i ROUND_CONST_Lx;
+__m128i ROUND_CONST_L0[ROUNDS512];
+__m128i ROUND_CONST_L7[ROUNDS512];
+__m128i ROUND_CONST_P[ROUNDS1024];
+__m128i ROUND_CONST_Q[ROUNDS1024];
+__m128i TRANSP_MASK;
+__m128i SUBSH_MASK[8];
+__m128i ALL_1B;
+__m128i ALL_FF;
+
+#define tos(a)    #a
+#define tostr(a)  tos(a)
+
+/* xmm[i] will be multiplied by 2
+ * xmm[j] will be lost
+ * xmm[k] has to be all 0x1b */
+#define MUL2(i, j, k){\
+  j = _mm_xor_si128(j, j);\
+  j = _mm_cmpgt_epi8(j, i);\
+  i = _mm_add_epi8(i, i);\
+  j = _mm_and_si128(j, k);\
+  i = _mm_xor_si128(i, j);\
+}/**/
+
+/* Yet another implementation of MixBytes.
+   This time we use the formulae (3) from the paper "Byte Slicing Groestl".
+   Input: a0, ..., a7
+   Output: b0, ..., b7 = MixBytes(a0,...,a7).
+   but we use the relations:
+   t_i = a_i + a_{i+1}
+   x_i = t_i + t_{i+3}
+   y_i = t_i + t_{i+2} + a_{i+6}
+   z_i = 2*x_i
+   w_i = z_i + y_{i+4}
+   v_i = 2*w_i
+   b_i = v_{i+3} + y_{i+4}
+   We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
+   and then adding v_i computed in the meantime in registers xmm0..xmm7.
+   We almost fit into 16 registers, need only 3 spills to memory.
+   This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
+   K. Matusiewicz, 2011/05/29 */
+#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
+  /* t_i = a_i + a_{i+1} */\
+  b6 = a0;\
+  b7 = a1;\
+  a0 = _mm_xor_si128(a0, a1);\
+  b0 = a2;\
+  a1 = _mm_xor_si128(a1, a2);\
+  b1 = a3;\
+  a2 = _mm_xor_si128(a2, a3);\
+  b2 = a4;\
+  a3 = _mm_xor_si128(a3, a4);\
+  b3 = a5;\
+  a4 = _mm_xor_si128(a4, a5);\
+  b4 = a6;\
+  a5 = _mm_xor_si128(a5, a6);\
+  b5 = a7;\
+  a6 = _mm_xor_si128(a6, a7);\
+  a7 = _mm_xor_si128(a7, b6);\
+  \
+  /* build y4 y5 y6 ... 
in regs xmm8, xmm9, xmm10 by adding t_i*/\ + b0 = _mm_xor_si128(b0, a4);\ + b6 = _mm_xor_si128(b6, a4);\ + b1 = _mm_xor_si128(b1, a5);\ + b7 = _mm_xor_si128(b7, a5);\ + b2 = _mm_xor_si128(b2, a6);\ + b0 = _mm_xor_si128(b0, a6);\ + /* spill values y_4, y_5 to memory */\ + TEMP0 = b0;\ + b3 = _mm_xor_si128(b3, a7);\ + b1 = _mm_xor_si128(b1, a7);\ + TEMP1 = b1;\ + b4 = _mm_xor_si128(b4, a0);\ + b2 = _mm_xor_si128(b2, a0);\ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b5 = _mm_xor_si128(b5, a1);\ + b3 = _mm_xor_si128(b3, a1);\ + b1 = a1;\ + b6 = _mm_xor_si128(b6, a2);\ + b4 = _mm_xor_si128(b4, a2);\ + TEMP2 = a2;\ + b7 = _mm_xor_si128(b7, a3);\ + b5 = _mm_xor_si128(b5, a3);\ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm_xor_si128(a0, a3);\ + a1 = _mm_xor_si128(a1, a4);\ + a2 = _mm_xor_si128(a2, a5);\ + a3 = _mm_xor_si128(a3, a6);\ + a4 = _mm_xor_si128(a4, a7);\ + a5 = _mm_xor_si128(a5, b0);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, TEMP2);\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = ALL_1B;\ + MUL2(a0, b0, b1);\ + a0 = _mm_xor_si128(a0, TEMP0);\ + MUL2(a1, b0, b1);\ + a1 = _mm_xor_si128(a1, TEMP1);\ + MUL2(a2, b0, b1);\ + a2 = _mm_xor_si128(a2, b2);\ + MUL2(a3, b0, b1);\ + a3 = _mm_xor_si128(a3, b3);\ + MUL2(a4, b0, b1);\ + a4 = _mm_xor_si128(a4, b4);\ + MUL2(a5, b0, b1);\ + a5 = _mm_xor_si128(a5, b5);\ + MUL2(a6, b0, b1);\ + a6 = _mm_xor_si128(a6, b6);\ + MUL2(a7, b0, b1);\ + a7 = _mm_xor_si128(a7, b7);\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... */\ + MUL2(a0, b0, b1);\ + b5 = _mm_xor_si128(b5, a0);\ + MUL2(a1, b0, b1);\ + b6 = _mm_xor_si128(b6, a1);\ + MUL2(a2, b0, b1);\ + b7 = _mm_xor_si128(b7, a2);\ + MUL2(a5, b0, b1);\ + b2 = _mm_xor_si128(b2, a5);\ + MUL2(a6, b0, b1);\ + b3 = _mm_xor_si128(b3, a6);\ + MUL2(a7, b0, b1);\ + b4 = _mm_xor_si128(b4, a7);\ + MUL2(a3, b0, b1);\ + MUL2(a4, b0, b1);\ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm_xor_si128(b0, a3);\ + b1 = _mm_xor_si128(b1, a4);\ +}/*MixBytes*/ + +#if (LENGTH <= 256) + +#define SET_CONSTANTS(){\ + ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\ + SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\ + SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\ + SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\ + SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\ + SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\ + SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\ + SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ + }\ + ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ +}while(0); + +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant */\ + b1 = ROUND_CONST_Lx;\ + a0 = 
_mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ + a1 = _mm_xor_si128(a1, b1);\ + a2 = _mm_xor_si128(a2, b1);\ + a3 = _mm_xor_si128(a3, b1);\ + a4 = _mm_xor_si128(a4, b1);\ + a5 = _mm_xor_si128(a5, b1);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ + \ + /* ShiftBytes + SubBytes (interleaved) */\ + b0 = _mm_xor_si128(b0, b0);\ + a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ + a0 = _mm_aesenclast_si128(a0, b0);\ + a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ + a1 = _mm_aesenclast_si128(a1, b0);\ + a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ + a2 = _mm_aesenclast_si128(a2, b0);\ + a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ + a3 = _mm_aesenclast_si128(a3, b0);\ + a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ + a4 = _mm_aesenclast_si128(a4, b0);\ + a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ + a5 = _mm_aesenclast_si128(a5, b0);\ + a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ + a6 = _mm_aesenclast_si128(a6, b0);\ + a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ + a7 = _mm_aesenclast_si128(a7, b0);\ + \ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ +} + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + t0 = TRANSP_MASK;\ + \ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ + \ + o1 = i0;\ + t0 = i2;\ + \ + i0 = _mm_unpacklo_epi16(i0, i1);\ + o1 = _mm_unpackhi_epi16(o1, i1);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + t0 = _mm_unpackhi_epi16(t0, i3);\ + \ + i0 = _mm_shuffle_epi32(i0, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + t0 = _mm_shuffle_epi32(t0, 216);\ + \ + o2 = i0;\ + o3 = o1;\ + \ + i0 = _mm_unpacklo_epi32(i0, i2);\ + o1 = _mm_unpacklo_epi32(o1, t0);\ + o2 = _mm_unpackhi_epi32(o2, i2);\ + o3 = _mm_unpackhi_epi32(o3, t0);\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output 
reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + o1 = i0;\ + o2 = i1;\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + o1 = _mm_unpackhi_epi64(o1, i4);\ + o3 = i1;\ + o4 = i2;\ + o2 = _mm_unpacklo_epi64(o2, i5);\ + o3 = _mm_unpackhi_epi64(o3, i5);\ + o5 = i2;\ + o6 = i3;\ + o4 = _mm_unpacklo_epi64(o4, i6);\ + o5 = _mm_unpackhi_epi64(o5, i6);\ + o7 = i3;\ + o6 = _mm_unpacklo_epi64(o6, i7);\ + o7 = _mm_unpackhi_epi64(o7, i7);\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + o0 = i0;\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o0 = _mm_unpackhi_epi64(o0, i1);\ + o1 = i2;\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + o1 = _mm_unpackhi_epi64(o1, i3);\ + o2 = i4;\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + o2 = _mm_unpackhi_epi64(o2, i5);\ + o3 = i6;\ + i6 = _mm_unpacklo_epi64(i6, i7);\ + o3 = _mm_unpackhi_epi64(o3, i7);\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + t0 = _mm_xor_si128(t0, t0);\ + i1 = i0;\ + i3 = i2;\ + i5 = i4;\ + i7 = i6;\ + i0 = _mm_unpacklo_epi64(i0, t0);\ + i1 = _mm_unpackhi_epi64(i1, t0);\ + i2 = _mm_unpacklo_epi64(i2, t0);\ + i3 = _mm_unpackhi_epi64(i3, t0);\ + i4 = _mm_unpacklo_epi64(i4, t0);\ + i5 = _mm_unpackhi_epi64(i5, t0);\ + i6 = _mm_unpacklo_epi64(i6, t0);\ + i7 = _mm_unpackhi_epi64(i7, t0);\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ +}/**/ + + +void INIT(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7; + static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm12 - xmm15 */ + xmm12 = chaining[0]; + xmm13 = chaining[1]; + xmm14 = chaining[2]; + xmm15 = chaining[3]; + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* store transposed IV */ + chaining[0] = xmm12; + chaining[1] = xmm2; + chaining[2] = xmm6; + chaining[3] = xmm7; +} + +void TF512(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm12 - xmm15 */ + xmm12 = message[0]; + xmm13 = message[1]; + xmm14 = 
message[2]; + xmm15 = message[3]; + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + xmm8 = chaining[0]; + xmm0 = chaining[1]; + xmm4 = chaining[2]; + xmm5 = chaining[3]; + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + xmm8 = _mm_xor_si128(xmm8, xmm12); + xmm0 = _mm_xor_si128(xmm0, xmm2); + xmm4 = _mm_xor_si128(xmm4, xmm6); + xmm5 = _mm_xor_si128(xmm5, xmm7); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, xmm8); + xmm1 = _mm_xor_si128(xmm1, xmm10); + xmm2 = _mm_xor_si128(xmm2, xmm12); + xmm3 = _mm_xor_si128(xmm3, xmm14); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, (chaining[0])); + xmm1 = _mm_xor_si128(xmm1, (chaining[1])); + xmm2 = _mm_xor_si128(xmm2, (chaining[2])); + xmm3 = _mm_xor_si128(xmm3, (chaining[3])); + + /* store CV */ + chaining[0] = xmm0; + chaining[1] = xmm1; + chaining[2] = xmm2; + chaining[3] = xmm3; + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF512(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + xmm8 = chaining[0]; + xmm10 = chaining[1]; + xmm12 = chaining[2]; + xmm14 = chaining[3]; + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm10 = _mm_xor_si128(xmm10, (chaining[1])); + xmm12 = _mm_xor_si128(xmm12, (chaining[2])); + xmm14 = _mm_xor_si128(xmm14, (chaining[3])); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); + + /* we only need to return the truncated half of the state */ + chaining[2] = xmm9; + chaining[3] = xmm11; +} +#endif + +#if (LENGTH > 
256) + +#define SET_CONSTANTS(){\ + ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ + ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + SUBSH_MASK[0] = _mm_set_epi32(0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00);\ + SUBSH_MASK[1] = _mm_set_epi32(0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01);\ + SUBSH_MASK[2] = _mm_set_epi32(0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02);\ + SUBSH_MASK[3] = _mm_set_epi32(0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003);\ + SUBSH_MASK[4] = _mm_set_epi32(0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104);\ + SUBSH_MASK[5] = _mm_set_epi32(0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205);\ + SUBSH_MASK[6] = _mm_set_epi32(0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306);\ + SUBSH_MASK[7] = _mm_set_epi32(0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b);\ + for(i = 0; i < ROUNDS1024; i++)\ + {\ + ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\ + }\ +}while(0); + +/* one round + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* SubBytes */\ + b0 = _mm_xor_si128(b0, b0);\ + a0 = _mm_aesenclast_si128(a0, b0);\ + a1 = _mm_aesenclast_si128(a1, b0);\ + a2 = _mm_aesenclast_si128(a2, b0);\ + a3 = _mm_aesenclast_si128(a3, b0);\ + a4 = _mm_aesenclast_si128(a4, b0);\ + a5 = _mm_aesenclast_si128(a5, b0);\ + a6 = _mm_aesenclast_si128(a6, b0);\ + a7 = _mm_aesenclast_si128(a7, b0);\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +#define ROUNDS_P(){\ + u8 round_counter = 0;\ + for(round_counter = 0; round_counter < 14; round_counter+=2) {\ + /* AddRoundConstant P1024 */\ + xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\ + xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\ + xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\ + xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[3]));\ + xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[4]));\ + xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[5]));\ + xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[6]));\ + xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[7]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + \ + /* AddRoundConstant P1024 */\ + xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\ + xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\ + xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\ + xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[3]));\ + xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[4]));\ + xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\ + xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\ + xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + }\ +} + +#define ROUNDS_Q(){\ + u8 round_counter = 0;\ + for(round_counter = 0; round_counter < 14; round_counter+=2) {\ + /* AddRoundConstant 
Q1024 */\ + xmm1 = ALL_FF;\ + xmm8 = _mm_xor_si128(xmm8, xmm1);\ + xmm9 = _mm_xor_si128(xmm9, xmm1);\ + xmm10 = _mm_xor_si128(xmm10, xmm1);\ + xmm11 = _mm_xor_si128(xmm11, xmm1);\ + xmm12 = _mm_xor_si128(xmm12, xmm1);\ + xmm13 = _mm_xor_si128(xmm13, xmm1);\ + xmm14 = _mm_xor_si128(xmm14, xmm1);\ + xmm15 = _mm_xor_si128(xmm15, (ROUND_CONST_Q[round_counter]));\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[1]));\ + xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[3]));\ + xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[5]));\ + xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[7]));\ + xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[0]));\ + xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[2]));\ + xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[4]));\ + xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[6]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + \ + /* AddRoundConstant Q1024 */\ + xmm9 = ALL_FF;\ + xmm0 = _mm_xor_si128(xmm0, xmm9);\ + xmm1 = _mm_xor_si128(xmm1, xmm9);\ + xmm2 = _mm_xor_si128(xmm2, xmm9);\ + xmm3 = _mm_xor_si128(xmm3, xmm9);\ + xmm4 = _mm_xor_si128(xmm4, xmm9);\ + xmm5 = _mm_xor_si128(xmm5, xmm9);\ + xmm6 = _mm_xor_si128(xmm6, xmm9);\ + xmm7 = _mm_xor_si128(xmm7, (ROUND_CONST_Q[round_counter+1]));\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[1]));\ + xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[3]));\ + xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[5]));\ + xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[7]));\ + xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[0]));\ + xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[2]));\ + xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[4]));\ + xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[6]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + }\ +} + +/* Matrix Transpose + * input is a 1024-bit state with two columns in one xmm + * output is a 1024-bit state with two rows in one xmm + * inputs: i0-i7 + * outputs: i0-i7 + * clobbers: t0-t7 + */ +#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ + t0 = TRANSP_MASK;\ +\ + i6 = _mm_shuffle_epi8(i6, t0);\ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ + t1 = i2;\ + i4 = _mm_shuffle_epi8(i4, t0);\ + i5 = _mm_shuffle_epi8(i5, t0);\ + t2 = i4;\ + t3 = i6;\ + i7 = _mm_shuffle_epi8(i7, t0);\ +\ + /* continue with unpack using 4 temp registers */\ + t0 = i0;\ + t2 = _mm_unpackhi_epi16(t2, i5);\ + i4 = _mm_unpacklo_epi16(i4, i5);\ + t3 = _mm_unpackhi_epi16(t3, i7);\ + i6 = _mm_unpacklo_epi16(i6, i7);\ + t0 = _mm_unpackhi_epi16(t0, i1);\ + t1 = _mm_unpackhi_epi16(t1, i3);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + i0 = _mm_unpacklo_epi16(i0, i1);\ +\ + /* shuffle with immediate */\ + t0 = _mm_shuffle_epi32(t0, 216);\ + t1 = _mm_shuffle_epi32(t1, 216);\ + t2 = _mm_shuffle_epi32(t2, 216);\ + t3 = _mm_shuffle_epi32(t3, 216);\ + i0 = _mm_shuffle_epi32(i0, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + i4 = _mm_shuffle_epi32(i4, 216);\ + i6 = _mm_shuffle_epi32(i6, 216);\ +\ + /* continue with unpack */\ + t4 = i0;\ + i0 = _mm_unpacklo_epi32(i0, i2);\ + t4 = _mm_unpackhi_epi32(t4, i2);\ + t5 = t0;\ + t0 = _mm_unpacklo_epi32(t0, t1);\ + t5 = _mm_unpackhi_epi32(t5, t1);\ + t6 = i4;\ + i4 = _mm_unpacklo_epi32(i4, i6);\ + t7 = t2;\ + t6 = _mm_unpackhi_epi32(t6, i6);\ + i2 = 
t0;\ + t2 = _mm_unpacklo_epi32(t2, t3);\ + i3 = t0;\ + t7 = _mm_unpackhi_epi32(t7, t3);\ +\ + /* there are now 2 rows in each xmm */\ + /* unpack to get 1 row of CV in each xmm */\ + i1 = i0;\ + i1 = _mm_unpackhi_epi64(i1, i4);\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + i4 = t4;\ + i3 = _mm_unpackhi_epi64(i3, t2);\ + i5 = t4;\ + i2 = _mm_unpacklo_epi64(i2, t2);\ + i6 = t5;\ + i5 = _mm_unpackhi_epi64(i5, t6);\ + i7 = t5;\ + i4 = _mm_unpacklo_epi64(i4, t6);\ + i7 = _mm_unpackhi_epi64(i7, t7);\ + i6 = _mm_unpacklo_epi64(i6, t7);\ + /* transpose done */\ +}/**/ + +/* Matrix Transpose Inverse + * input is a 1024-bit state with two rows in one xmm + * output is a 1024-bit state with two columns in one xmm + * inputs: i0-i7 + * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) + * clobbers: t0-t4 + */ +#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ + /* transpose matrix to get output format */\ + o1 = i0;\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o1 = _mm_unpackhi_epi64(o1, i1);\ + t0 = i2;\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + t0 = _mm_unpackhi_epi64(t0, i3);\ + t1 = i4;\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + t1 = _mm_unpackhi_epi64(t1, i5);\ + t2 = i6;\ + o0 = TRANSP_MASK;\ + i6 = _mm_unpacklo_epi64(i6, i7);\ + t2 = _mm_unpackhi_epi64(t2, i7);\ + /* load transpose mask into a register, because it will be used 8 times */\ + i0 = _mm_shuffle_epi8(i0, o0);\ + i2 = _mm_shuffle_epi8(i2, o0);\ + i4 = _mm_shuffle_epi8(i4, o0);\ + i6 = _mm_shuffle_epi8(i6, o0);\ + o1 = _mm_shuffle_epi8(o1, o0);\ + t0 = _mm_shuffle_epi8(t0, o0);\ + t1 = _mm_shuffle_epi8(t1, o0);\ + t2 = _mm_shuffle_epi8(t2, o0);\ + /* continue with unpack using 4 temp registers */\ + t3 = i4;\ + o2 = o1;\ + o0 = i0;\ + t4 = t1;\ + \ + t3 = _mm_unpackhi_epi16(t3, i6);\ + i4 = _mm_unpacklo_epi16(i4, i6);\ + o0 = _mm_unpackhi_epi16(o0, i2);\ + i0 = _mm_unpacklo_epi16(i0, i2);\ + o2 = _mm_unpackhi_epi16(o2, t0);\ + o1 = _mm_unpacklo_epi16(o1, t0);\ + t4 = _mm_unpackhi_epi16(t4, t2);\ + t1 = _mm_unpacklo_epi16(t1, t2);\ + /* shuffle with immediate */\ + i4 = _mm_shuffle_epi32(i4, 216);\ + t3 = _mm_shuffle_epi32(t3, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + o2 = _mm_shuffle_epi32(o2, 216);\ + i0 = _mm_shuffle_epi32(i0, 216);\ + o0 = _mm_shuffle_epi32(o0, 216);\ + t1 = _mm_shuffle_epi32(t1, 216);\ + t4 = _mm_shuffle_epi32(t4, 216);\ + /* continue with unpack */\ + i1 = i0;\ + i3 = o0;\ + i5 = o1;\ + i7 = o2;\ + i0 = _mm_unpacklo_epi32(i0, i4);\ + i1 = _mm_unpackhi_epi32(i1, i4);\ + o0 = _mm_unpacklo_epi32(o0, t3);\ + i3 = _mm_unpackhi_epi32(i3, t3);\ + o1 = _mm_unpacklo_epi32(o1, t1);\ + i5 = _mm_unpackhi_epi32(i5, t1);\ + o2 = _mm_unpacklo_epi32(o2, t4);\ + i7 = _mm_unpackhi_epi32(i7, t4);\ + /* transpose done */\ +}/**/ + + +void INIT(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm8 - xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + /* transform chaining value from column ordering into row ordering */ + Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store transposed IV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + 
chaining[6] = xmm14; + chaining[7] = xmm15; +} + +void TF1024(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i QTEMP[8]; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm8 - xmm15 (Q = message) */ + xmm8 = message[0]; + xmm9 = message[1]; + xmm10 = message[2]; + xmm11 = message[3]; + xmm12 = message[4]; + xmm13 = message[5]; + xmm14 = message[6]; + xmm15 = message[7]; + + /* transform message M from column ordering into row ordering */ + Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store message M (Q input) for later */ + QTEMP[0] = xmm8; + QTEMP[1] = xmm9; + QTEMP[2] = xmm10; + QTEMP[3] = xmm11; + QTEMP[4] = xmm12; + QTEMP[5] = xmm13; + QTEMP[6] = xmm14; + QTEMP[7] = xmm15; + + /* xor CV to message to get P input */ + /* result: CV+M in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, (chaining[7])); + + /* compute permutation P */ + /* result: P(CV+M) in xmm8...xmm15 */ + ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV+M)+CV in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, (chaining[7])); + + /* store P(CV+M)+CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; + + /* load message M (Q input) into xmm8-15 */ + xmm8 = QTEMP[0]; + xmm9 = QTEMP[1]; + xmm10 = QTEMP[2]; + xmm11 = QTEMP[3]; + xmm12 = QTEMP[4]; + xmm13 = QTEMP[5]; + xmm14 = QTEMP[6]; + xmm15 = QTEMP[7]; + + /* compute permutation Q */ + /* result: Q(M) in xmm8...xmm15 */ + ROUNDS_Q(); + + /* xor Q output */ + /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, (chaining[7])); + + /* store CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; + +#ifdef IACA_TRACE + IACA_END; +#endif + + return; +} + +void OF1024(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + + /* 
load CV into registers xmm8 - xmm15 */
+  xmm8 = chaining[0];
+  xmm9 = chaining[1];
+  xmm10 = chaining[2];
+  xmm11 = chaining[3];
+  xmm12 = chaining[4];
+  xmm13 = chaining[5];
+  xmm14 = chaining[6];
+  xmm15 = chaining[7];
+
+  /* compute permutation P */
+  /* result: P(CV) in xmm8...xmm15 */
+  ROUNDS_P();
+
+  /* xor CV to P output (feed-forward) */
+  /* result: P(CV)+CV in xmm8...xmm15 */
+  xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
+  xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
+  xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
+  xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
+  xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
+  xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
+  xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
+  xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
+
+  /* transpose CV back from row ordering to column ordering */
+  /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */
+  Matrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7);
+
+  /* we only need to return the truncated half of the state */
+  chaining[4] = xmm0;
+  chaining[5] = xmm6;
+  chaining[6] = xmm13;
+  chaining[7] = xmm15;
+
+  return;
+}
+
+#endif
+
diff --git a/groestl512.c b/groestl512.c
new file mode 100644
index 0000000..4ee4c26
--- /dev/null
+++ b/groestl512.c
@@ -0,0 +1,221 @@
+/* hash.c Aug 2011
+ *
+ * Groestl implementation for different versions.
+ * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
+ *
+ * This code is placed in the public domain
+ */
+
+#include "groestl-intr-aes.h"
+
+/* digest up to len bytes of input (full blocks only) */
+static void Transform(groestl512_hashState *ctx,
+                      const u8 *in,
+                      unsigned long long len) {
+
+  /* increment block counter */
+  ctx->block_counter += len/SIZE;
+
+  /* digest message, one block at a time */
+  for (; len >= SIZE; len -= SIZE, in += SIZE)
+#if LENGTH<=256
+    TF512((u64*)ctx->chaining, (u64*)in);
+#else
+    TF1024((u64*)ctx->chaining, (u64*)in);
+#endif
+
+  asm volatile ("emms");
+}
+
+/* given state h, do h <- P(h)+h */
+void OutputTransformation(groestl512_hashState *ctx) {
+
+  /* determine variant */
+#if (LENGTH <= 256)
+  OF512((u64*)ctx->chaining);
+#else
+  OF1024((u64*)ctx->chaining);
+#endif
+
+  asm volatile ("emms");
+}
+
+/* initialise context */
+HashReturn groestl512_Init(groestl512_hashState* ctx) {
+  u8 i = 0;
+
+  /* output size (in bits) must be a positive integer less than or
+     equal to 512, and divisible by 8 */
+  if (LENGTH <= 0 || (LENGTH%8) || LENGTH > 512)
+    return BAD_HASHLEN;
+
+  /* set number of state columns and state size depending on
+     variant */
+  ctx->columns = COLS;
+  ctx->statesize = SIZE;
+#if (LENGTH <= 256)
+  ctx->v = SHORT;
+#else
+  ctx->v = LONG;
+#endif
+
+  SET_CONSTANTS();
+
+  for (i=0; i<SIZE/8; i++) ctx->chaining[i] = 0;
+  for (i=0; i<SIZE; i++) ctx->buffer[i] = 0;
+
+  if (ctx->chaining == NULL || ctx->buffer == NULL)
+    return FAIL;
+
+  /* set initial value */
+  ctx->chaining[ctx->columns-1] = U64BIG((u64)LENGTH);
+
+  INIT(ctx->chaining);
+
+  /* set other variables */
+  ctx->buf_ptr = 0;
+  ctx->block_counter = 0;
+  ctx->bits_in_last_byte = 0;
+
+  return SUCCESS;
+}
+
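+/* Illustrative sketch only, not part of the original patch: minimal use of
+   the streaming API defined in this file.  Note that databitlen counts BITS,
+   not bytes; example_hash and its byte-aligned input are assumptions of
+   this sketch. */
+#if 0
+static void example_hash(const unsigned char *msg,
+                         unsigned long long msgbytes,
+                         unsigned char digest[LENGTH/8])
+{
+  groestl512_hashState ctx;
+  groestl512_Init(&ctx);
+  groestl512_Update(&ctx, msg, (DataLength)msgbytes*8); /* length in bits */
+  groestl512_Final(&ctx, digest);
+}
+#endif
+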
+/* update state with databitlen bits of input */
+HashReturn groestl512_Update(groestl512_hashState* ctx,
+                             const BitSequence* input,
+                             DataLength databitlen) {
+  int index = 0;
+  int msglen = (int)(databitlen/8);
+  int rem = (int)(databitlen%8);
+
+  /* non-integral number of message bytes can only be supplied in the
+     last call to this function */
+  if (ctx->bits_in_last_byte) return FAIL;
+
+  /* if the buffer contains data that has not yet been digested, first
+     add data to buffer until full */
+  if (ctx->buf_ptr) {
+    while (ctx->buf_ptr < ctx->statesize && index < msglen) {
+      ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
+    }
+    if (ctx->buf_ptr < ctx->statesize) {
+      /* buffer still not full, return */
+      if (rem) {
+        ctx->bits_in_last_byte = rem;
+        ctx->buffer[(int)ctx->buf_ptr++] = input[index];
+      }
+      return SUCCESS;
+    }
+
+    /* digest buffer */
+    ctx->buf_ptr = 0;
+    Transform(ctx, ctx->buffer, ctx->statesize);
+  }
+
+  /* digest bulk of message */
+  Transform(ctx, input+index, msglen-index);
+  index += ((msglen-index)/ctx->statesize)*ctx->statesize;
+
+  /* store remaining data in buffer */
+  while (index < msglen) {
+    ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
+  }
+
+  /* if non-integral number of bytes have been supplied, store
+     remaining bits in last byte, together with information about
+     number of bits */
+  if (rem) {
+    ctx->bits_in_last_byte = rem;
+    ctx->buffer[(int)ctx->buf_ptr++] = input[index];
+  }
+  return SUCCESS;
+}
+
+#define BILB ctx->bits_in_last_byte
+
+/* finalise: process remaining data (including padding), perform
+   output transformation, and write hash result to 'output' */
+HashReturn groestl512_Final(groestl512_hashState* ctx,
+                            BitSequence* output) {
+  int i, j = 0, hashbytelen = LENGTH/8;
+  u8 *s = (BitSequence*)ctx->chaining;
+
+  /* pad with '1'-bit and first few '0'-bits */
+  if (BILB) {
+    ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<BILB)-1)<<(8-BILB);
+    ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-BILB);
+    BILB = 0;
+  }
+  else ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
+
+  /* pad with '0'-bits */
+  if (ctx->buf_ptr > ctx->statesize-LENGTHFIELDLEN) {
+    /* padding requires two blocks */
+    while (ctx->buf_ptr < ctx->statesize) {
+      ctx->buffer[(int)ctx->buf_ptr++] = 0;
+    }
+    /* digest first padding block */
+    Transform(ctx, ctx->buffer, ctx->statesize);
+    ctx->buf_ptr = 0;
+  }
+  while (ctx->buf_ptr < ctx->statesize-LENGTHFIELDLEN) {
+    ctx->buffer[(int)ctx->buf_ptr++] = 0;
+  }
+
+  /* length padding */
+  ctx->block_counter++;
+  ctx->buf_ptr = ctx->statesize;
+  while (ctx->buf_ptr > ctx->statesize-LENGTHFIELDLEN) {
+    ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter;
+    ctx->block_counter >>= 8;
+  }
+
+  /* digest final padding block */
+  Transform(ctx, ctx->buffer, ctx->statesize);
+  /* perform output transformation */
+  OutputTransformation(ctx);
+
+  /* store hash result in output */
+  for (i = ctx->statesize-hashbytelen; i < ctx->statesize; i++,j++) {
+    output[j] = s[i];
+  }
+
+  /* zeroise relevant variables and deallocate memory */
+
+  for (i = 0; i < ctx->columns; i++) {
+    ctx->chaining[i] = 0;
+  }
+
+  for (i = 0; i < ctx->statesize; i++) {
+    ctx->buffer[i] = 0;
+  }
+// free(ctx->chaining);
+// free(ctx->buffer);
+
+  return SUCCESS;
+}
+
+/* hash bit sequence */
+HashReturn groestl512_Hash(int hashbitlen,
+                           const BitSequence* data,
+                           DataLength databitlen,
+                           BitSequence* hashval) {
+  HashReturn ret;
+  groestl512_hashState context;
+
+  /* initialise */
+  if ((ret = groestl512_Init(&context)) != SUCCESS)
+    return ret;
+
+  /* process message */
+  if ((ret = groestl512_Update(&context, data, databitlen)) != SUCCESS)
+    return ret;
+
+  /* finalise */
+  ret = groestl512_Final(&context, hashval);
+
+  return ret;
+}
+
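The padding written by groestl512_Final() above is standard Merkle-Damgård
strengthening: one 0x80 marker byte, a zero fill, then the total block count
(message plus padding blocks) stored big-endian in the last
LENGTHFIELDLEN = 8 bytes of the final block. As a minimal sketch of the
resulting size, assuming the SIZE and LENGTHFIELDLEN constants from
groestl512.h below (padded_len itself is a hypothetical helper, not part of
this patch):

    /* bytes occupied by a byte-aligned message after padding:
       1 marker byte + 8 length-field bytes, rounded up to whole blocks */
    static unsigned long long padded_len(unsigned long long msgbytes)
    {
        unsigned long long blocks =
            (msgbytes + 1 + LENGTHFIELDLEN + SIZE - 1) / SIZE;
        return blocks * SIZE;
    }

With LENGTH == 512 (so SIZE == 128), an 80-byte input pads to a single
128-byte block: 0x80 at offset 80, zeros through offset 119, and the block
count 1 in bytes 120-127.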
diff --git a/groestl512.h b/groestl512.h
new file mode 100644
index 0000000..8cac9d3
--- /dev/null
+++ b/groestl512.h
@@ -0,0 +1,94 @@
+/* hash.h Aug 2011
+ *
+ * Groestl implementation for different versions.
+ * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
+ *
+ * This code is placed in the public domain
+ */
+
+#ifndef __hash_h
+#define __hash_h
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#define crypto_hash_BYTES 64
+#define LENGTH (crypto_hash_BYTES*8)
+typedef uint8_t u8;
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+#include "brg_endian.h"
+#define NEED_UINT_64T
+#include "brg_types.h"
+
+#ifdef IACA_TRACE
+  #include IACA_MARKS
+#endif
+
+#ifndef LENGTH
+#define LENGTH 256
+#endif
+
+/* some sizes (number of bytes) */
+#define ROWS 8
+#define LENGTHFIELDLEN ROWS
+#define COLS512 8
+#define COLS1024 16
+#define SIZE512 (ROWS*COLS512)
+#define SIZE1024 (ROWS*COLS1024)
+#define ROUNDS512 10
+#define ROUNDS1024 14
+
+#if LENGTH<=256
+#define COLS COLS512
+#define SIZE SIZE512
+#define ROUNDS ROUNDS512
+#else
+#define COLS COLS1024
+#define SIZE SIZE1024
+#define ROUNDS ROUNDS1024
+#endif
+
+#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
+
+#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
+#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
+#define U64BIG(a) (a)
+#endif /* IS_BIG_ENDIAN */
+
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
+#define U64BIG(a) \
+  ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
+   (ROTL64(a,24) & li_64(0000FF000000FF00)) | \
+   (ROTL64(a,40) & li_64(00FF000000FF0000)) | \
+   (ROTL64(a,56) & li_64(FF000000FF000000)))
+#endif /* IS_LITTLE_ENDIAN */
+
+typedef enum { LONG, SHORT } Var;
+
+/* NIST API begin */
+typedef unsigned char BitSequence;
+typedef unsigned long long DataLength;
+typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2 } HashReturn;
+typedef struct {
+  __attribute__ ((aligned (32))) u64 chaining[SIZE/8];     /* actual state */
+  __attribute__ ((aligned (32))) BitSequence buffer[SIZE]; /* data buffer */
+  u64 block_counter;     /* message block counter */
+  int buf_ptr;           /* data buffer pointer */
+  int bits_in_last_byte; /* no. of message bits in last byte of
+                            data buffer */
+  int columns;           /* no. of columns in state */
+  int statesize;         /* total no.
of bytes in state */ + Var v; /* LONG or SHORT */ +} groestl512_hashState; + +HashReturn groestl512_Init(groestl512_hashState*); +HashReturn groestl512_Update(groestl512_hashState*, const BitSequence*, DataLength); +HashReturn groestl512_Final(groestl512_hashState*, BitSequence*); +HashReturn groestl512_Hash(int, const BitSequence*, DataLength, BitSequence*); +/* NIST API end */ + +#endif /* __hash_h */ diff --git a/heavy.c b/heavy.c index 24efe59..478db47 100644 --- a/heavy.c +++ b/heavy.c @@ -4,8 +4,17 @@ #include "miner.h" #include "hefty1.h" #include "sph_keccak.h" -#include "sph_blake.h" -#include "sph_groestl.h" +#include "blake512_sse41.h" +#include "groestl512.h" + +static groestl512_hashState groestlCtx_init; +static sph_keccak512_context keccakCtx_init; + +void init_hashstates(void) +{ + groestl512_Init(&groestlCtx_init); + sph_keccak512_init(&keccakCtx_init); +} /* Combines top 64-bits from each hash into a single hash */ static void combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4) @@ -33,11 +42,11 @@ int heavycoin_scanhash(unsigned char* output, const unsigned char* input, int le HEFTY1(input, len, hash1); DATA_ALIGN64(uint32_t hash5[16]); - sph_blake512_context blakeCtx; - sph_blake512_init(&blakeCtx); - sph_blake512(&blakeCtx, input, len); - sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1)); - sph_blake512_close(&blakeCtx, (void *)&hash5); + blake512_state blakeCtx; + blake512_init(&blakeCtx); + blake512_update(&blakeCtx, input, len*8); + blake512_update(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1)*8); + blake512_final(&blakeCtx, (void *)&hash5); if ((*((unsigned char *)hash5 + 31) & 0xF0) != 0) return 0; @@ -52,7 +61,7 @@ int heavycoin_scanhash(unsigned char* output, const unsigned char* input, int le DATA_ALIGN64(uint32_t hash3[16]); sph_keccak512_context keccakCtx; - sph_keccak512_init(&keccakCtx); + memcpy(&keccakCtx, &keccakCtx_init, sizeof(keccakCtx)); sph_keccak512(&keccakCtx, input, len); sph_keccak512(&keccakCtx, hash1, sizeof(hash1)); sph_keccak512_close(&keccakCtx, (void *)&hash3); @@ -60,11 +69,11 @@ int heavycoin_scanhash(unsigned char* output, const unsigned char* input, int le return 0; DATA_ALIGN64(uint32_t hash4[16]); - sph_groestl512_context groestlCtx; - sph_groestl512_init(&groestlCtx); - sph_groestl512(&groestlCtx, input, len); - sph_groestl512(&groestlCtx, hash1, sizeof(hash1)); - sph_groestl512_close(&groestlCtx, (void *)&hash4); + groestl512_hashState groestlCtx; + memcpy(&groestlCtx, &groestlCtx_init, sizeof(groestlCtx)); + groestl512_Update(&groestlCtx, input, len*8); + groestl512_Update(&groestlCtx, hash1, sizeof(hash1)*8); + groestl512_Final(&groestlCtx, (void *)&hash4); if ((*((unsigned char *)hash4 + 31) & 0xF0) != 0) return 0; @@ -76,7 +85,7 @@ int heavycoin_scanhash(unsigned char* output, const unsigned char* input, int le void heavycoin_hash(unsigned char* output, const unsigned char* input, int len) { - unsigned char hash1[32]; + unsigned char DATA_ALIGN64(hash1[32]); HEFTY1(input, len, hash1); /* HEFTY1 is new, so take an extra security measure to eliminate @@ -86,7 +95,7 @@ void heavycoin_hash(unsigned char* output, const unsigned char* input, int len) * * N.B. '+' is concatenation. */ - unsigned char hash2[32];; + DATA_ALIGN64(unsigned char hash2[32]); SHA256_CTX ctx; SHA256_Init(&ctx); SHA256_Update(&ctx, input, len); @@ -99,26 +108,26 @@ void heavycoin_hash(unsigned char* output, const unsigned char* input, int len) * and BLAKE512. 
*/ - uint32_t hash3[16]; + DATA_ALIGN64(uint32_t hash3[16]); sph_keccak512_context keccakCtx; - sph_keccak512_init(&keccakCtx); + memcpy(&keccakCtx, &keccakCtx_init, sizeof(keccakCtx)); sph_keccak512(&keccakCtx, input, len); sph_keccak512(&keccakCtx, hash1, sizeof(hash1)); sph_keccak512_close(&keccakCtx, (void *)&hash3); - uint32_t hash4[16]; - sph_groestl512_context groestlCtx; - sph_groestl512_init(&groestlCtx); - sph_groestl512(&groestlCtx, input, len); - sph_groestl512(&groestlCtx, hash1, sizeof(hash1)); - sph_groestl512_close(&groestlCtx, (void *)&hash4); - - uint32_t hash5[16]; - sph_blake512_context blakeCtx; - sph_blake512_init(&blakeCtx); - sph_blake512(&blakeCtx, input, len); - sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1)); - sph_blake512_close(&blakeCtx, (void *)&hash5); + DATA_ALIGN64(uint32_t hash4[16]); + groestl512_hashState groestlCtx; + memcpy(&groestlCtx, &groestlCtx_init, sizeof(groestlCtx)); + groestl512_Update(&groestlCtx, input, len*8); + groestl512_Update(&groestlCtx, hash1, sizeof(hash1)*8); + groestl512_Final(&groestlCtx, (void *)&hash4); + + DATA_ALIGN64(uint32_t hash5[16]); + blake512_state blakeCtx; + blake512_init(&blakeCtx); + blake512_update(&blakeCtx, input, len*8); + blake512_update(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1)*8); + blake512_final(&blakeCtx, (void *)&hash5); uint32_t *final = (uint32_t *)output; combine_hashes(final, (uint32_t *)hash2, hash3, hash4, hash5); diff --git a/hefty1.c b/hefty1.c index a8808c4..a0588d9 100644 --- a/hefty1.c +++ b/hefty1.c @@ -34,6 +34,8 @@ #include "hefty1.h" +#define NDEBUG + #define Min(A, B) (A <= B ? A : B) #define RoundFunc(ctx, A, B, C, D, E, F, G, H, W, K) \ { \ @@ -158,52 +160,59 @@ static inline uint8_t Smoosh4(uint8_t X) /* Smoosh 32-bit word into 2-bits */ static inline uint8_t Smoosh2(uint32_t X) { - uint16_t w = (X >> 16) ^ (X & 0xffff); - uint8_t n = Smoosh4((w >> 8) ^ (w & 0xff)); - return (n >> 2) ^ (n & 0x3); + X ^= X >> 16; + X ^= X >> 8; + X ^= X >> 4; + X ^= X >> 2; + return X & 3; } static void Mangle(uint32_t *S) { uint32_t *R = S; uint32_t *C = &S[1]; - uint8_t r0 = Smoosh4(R[0] >> 24); uint8_t r1 = Smoosh4(R[0] >> 16); uint8_t r2 = Smoosh4(R[0] >> 8); uint8_t r3 = Smoosh4(R[0] & 0xff); - int i; - - /* Diffuse */ uint32_t tmp = 0; - for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++) { - uint8_t r = Smoosh2(tmp); - switch (r) { - case 0: - C[i] ^= Rr(R[0], i + r0); - break; - case 1: - C[i] += Rr(~R[0], i + r1); - break; - case 2: - C[i] &= Rr(~R[0], i + r2); - break; - case 3: - C[i] ^= Rr(R[0], i + r3); - break; - } - tmp ^= C[i]; + C[0] ^= Rr(R[0], r0); + tmp ^= C[0]; + + switch (Smoosh2(tmp)) { + case 0: + C[1] ^= Rr(R[0], 1 + r0); + break; + case 1: + C[1] += Rr(~R[0], 1 + r1); + break; + case 2: + C[1] &= Rr(~R[0], 1 + r2); + break; + case 3: + C[1] ^= Rr(R[0], 1 + r3); + break; + } + tmp ^= C[1]; + + switch (Smoosh2(tmp)) { + case 0: + C[2] ^= Rr(R[0], 2 + r0); + break; + case 1: + C[2] += Rr(~R[0], 2 + r1); + break; + case 2: + C[2] &= Rr(~R[0], 2 + r2); + break; + case 3: + C[2] ^= Rr(R[0], 2 + r3); + break; } /* Compress */ - tmp = 0; - for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++) - if (i % 2) - tmp ^= C[i]; - else - tmp += C[i]; - R[0] ^= tmp; + R[0] ^= ((C[0] ^ C[1]) + C[2]); } static void Absorb(uint32_t *S, uint32_t X) @@ -224,26 +233,24 @@ static uint32_t Squeeze(uint32_t *S) static inline uint32_t Br(HEFTY1_CTX *ctx, uint32_t X) { uint32_t R = Squeeze(ctx->sponge); - uint8_t r0 = R >> 8; uint8_t r1 = R & 0xff; - - uint32_t Y = 1 << (r0 % 32); + uint32_t Y; 
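+    /* Y is only needed when a bit is actually cleared, set, or flipped,
+       so it is computed lazily in the cases below; the r1 % 4 == 0 path
+       returns X without paying for the variable shift */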
switch (r1 % 4) { case 0: - /* Do nothing */ - break; + return X; case 1: + Y = 1 << (r0 % 32); return X & ~Y; case 2: + Y = 1 << (r0 % 32); return X | Y; case 3: + Y = 1 << (r0 % 32); return X ^ Y; } - - return X; } static void HashBlock(HEFTY1_CTX *ctx) @@ -287,16 +294,6 @@ static void HashBlock(HEFTY1_CTX *ctx) ctx->h[6] += G; ctx->h[7] += H; - A = 0; - B = 0; - C = 0; - D = 0; - E = 0; - F = 0; - G = 0; - H = 0; - - memset(W, 0, sizeof(W)); } /* Public interface */ @@ -359,7 +356,6 @@ void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *ctx) ctx->h[i] = Reverse32(ctx->h[i]); memcpy(digest, ctx->h, sizeof(ctx->h)); - memset(ctx, 0, sizeof(HEFTY1_CTX)); } unsigned char* HEFTY1(const unsigned char *buf, size_t len, unsigned char *digest)