diff --git a/.gitignore b/.gitignore
index 1eea430..3a9a32b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,9 @@
-
 minerd
 minerd.exe
 *.o
+*~
+ID
+tags
 
 autom4te.cache
 .deps
diff --git a/Makefile.am b/Makefile.am
index e69a01b..6ee01d7 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -18,7 +18,7 @@ dist_man_MANS = minerd.1
 minerd_SOURCES = elist.h miner.h compat.h \
 		  cpu-miner.c util.c \
 		  sha2.c scrypt.c \
-		  keccak.c groestl.c blake.c \
+		  keccak.c groestl512.c blake512_sse41.c \
 		  hefty1.c heavy.c
 if ARCH_x86
 minerd_SOURCES += sha2-x86.S scrypt-x86.S
diff --git a/blake512_sse41.c b/blake512_sse41.c
new file mode 100644
index 0000000..aa5341d
--- /dev/null
+++ b/blake512_sse41.c
@@ -0,0 +1,272 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <smmintrin.h>
+
+/* CONFIG START */
+#define AVOID_BRANCHING 1
+//#define HAVE_XOP 1
+/* CONFIG END */
+
+#include "blake512_sse41.h"
+#include "blake512_sse41_rounds.h"
+
+#define U8TO32(p) \
+  (((u32)((p)[0]) << 24) | ((u32)((p)[1]) << 16) | \
+   ((u32)((p)[2]) <<  8) | ((u32)((p)[3])      ))
+#define U8TO64(p) \
+  (((u64)U8TO32(p) << 32) | (u64)U8TO32((p) + 4))
+#define U32TO8(p, v) \
+  (p)[0] = (u8)((v) >> 24); (p)[1] = (u8)((v) >> 16); \
+  (p)[2] = (u8)((v) >>  8); (p)[3] = (u8)((v)      );
+#define U64TO8(p, v) \
+  U32TO8((p),     (u32)((v) >> 32)); \
+  U32TO8((p) + 4, (u32)((v)      ));
+
+static const u8 padding[129] =
+{
+  0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+};
+
+static inline int blake512_compress( blake512_state * state, const u8 * datablock )
+{
+  __m128i row1l,row1h;
+  __m128i row2l,row2h;
+  __m128i row3l,row3h;
+  __m128i row4l,row4h;
+
+  const __m128i r16 = _mm_setr_epi8(2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9);
+  const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
+
+  __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+  __m128i t0, t1, t2, t3, t4, t5, t6, t7;
+  __m128i b0, b1, b2, b3;
+
+  m0 = _mm_loadu_si128((__m128i*)(datablock +   0));
+  m1 = _mm_loadu_si128((__m128i*)(datablock +  16));
+  m2 = _mm_loadu_si128((__m128i*)(datablock +  32));
+  m3 = _mm_loadu_si128((__m128i*)(datablock +  48));
+  m4 = _mm_loadu_si128((__m128i*)(datablock +  64));
+  m5 = _mm_loadu_si128((__m128i*)(datablock +  80));
+  m6 = _mm_loadu_si128((__m128i*)(datablock +  96));
+  m7 = _mm_loadu_si128((__m128i*)(datablock + 112));
+
+  m0 = BSWAP64(m0);
+  m1 = BSWAP64(m1);
+  m2 = BSWAP64(m2);
+  m3 = BSWAP64(m3);
+  m4 = BSWAP64(m4);
+  m5 = BSWAP64(m5);
+  m6 = BSWAP64(m6);
+  m7 = BSWAP64(m7);
+
+  row1l = state->h[0];
+  row1h = state->h[1];
+  row2l = state->h[2];
+  row2h = state->h[3];
+  row3l = _mm_set_epi64x(0x13198A2E03707344ULL, 0x243F6A8885A308D3ULL);
+  row3h = _mm_set_epi64x(0x082EFA98EC4E6C89ULL, 0xA4093822299F31D0ULL);
+
+  row4l = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL);
+  row4h = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xC0AC29B7C97C50DDULL);
+
+#ifdef AVOID_BRANCHING
+  do
+  {
+    const __m128i mask = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_set1_epi32(state->nullt));
+    const __m128i xor1 = _mm_and_si128(_mm_set1_epi64x(state->t[0]), mask);
+    const __m128i xor2 = _mm_and_si128(_mm_set1_epi64x(state->t[1]), mask);
+    row4l = _mm_xor_si128(row4l, xor1);
+    row4h = _mm_xor_si128(row4h, xor2);
+  } while(0);
+#else
+  if(!state->nullt)
+  {
+    row4l = _mm_xor_si128(row4l, _mm_set1_epi64x(state->t[0]));
+    row4h = _mm_xor_si128(row4h, _mm_set1_epi64x(state->t[1]));
+  }
+#endif
+
+  ROUND( 0);
+  ROUND( 1);
+  ROUND( 2);
+  ROUND( 3);
+  ROUND( 4);
+  ROUND( 5);
+  ROUND( 6);
+  ROUND( 7);
+  ROUND( 8);
+  ROUND( 9);
+  ROUND(10);
+  ROUND(11);
+  ROUND(12);
+  ROUND(13);
+  ROUND(14);
+  ROUND(15);
+
+  row1l = _mm_xor_si128(row3l,row1l);
+  row1h = _mm_xor_si128(row3h,row1h);
+
+  state->h[0] = _mm_xor_si128(row1l, state->h[0]);
+  state->h[1] = _mm_xor_si128(row1h, state->h[1]);
+
+  row2l = _mm_xor_si128(row4l,row2l);
+  row2h = _mm_xor_si128(row4h,row2h);
+
+  state->h[2] = _mm_xor_si128(row2l, state->h[2]);
+  state->h[3] = _mm_xor_si128(row2h, state->h[3]);
+
+  return 0;
+}
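+
+/*
+ * Note on the streaming API below: lengths are passed in BITS, not bytes,
+ * and the counter state->t[] counts bits as well (1024 bits per 128-byte
+ * block), so a caller hashing an 80-byte block header passes 80*8.
+ */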
+
+void blake512_update( blake512_state * S, const u8 * data, u64 datalen )
+{
+  int left = (S->buflen >> 3);
+  int fill = 128 - left;
+
+  if( left && ( ((datalen >> 3) & 0x7F) >= fill ) ) {
+    memcpy( (void *) (S->buf + left), (void *) data, fill );
+    S->t[0] += 1024;
+    blake512_compress( S, S->buf );
+    data += fill;
+    datalen -= (fill << 3);
+    left = 0;
+  }
+
+  while( datalen >= 1024 ) {
+    S->t[0] += 1024;
+    blake512_compress( S, data );
+    data += 128;
+    datalen -= 1024;
+  }
+
+  if( datalen > 0 ) {
+    memcpy( (void *) (S->buf + left), (void *) data, ( datalen>>3 ) & 0x7F );
+    S->buflen = (left<<3) + datalen;
+  }
+  else S->buflen=0;
+}
+
+void blake512_final( blake512_state * S, u8 * digest )
+{
+  u8 msglen[16], zo=0x01, oo=0x81;
+  u64 lo=S->t[0] + S->buflen, hi = S->t[1];
+  if ( lo < S->buflen ) hi++;
+  U64TO8( msglen + 0, hi );
+  U64TO8( msglen + 8, lo );
+
+  if ( S->buflen == 888 ) /* one padding byte */
+  {
+    S->t[0] -= 8;
+    blake512_update( S, &oo, 8 );
+  }
+  else
+  {
+    if ( S->buflen < 888 ) /* enough space to fill the block */
+    {
+      if ( S->buflen == 0 ) S->nullt=1;
+      S->t[0] -= 888 - S->buflen;
+      blake512_update( S, padding, 888 - S->buflen );
+    }
+    else /* NOT enough space, need 2 compressions */
+    {
+      S->t[0] -= 1024 - S->buflen;
+      blake512_update( S, padding, 1024 - S->buflen );
+      S->t[0] -= 888;
+      blake512_update( S, padding+1, 888 );
+      S->nullt = 1;
+    }
+    blake512_update( S, &zo, 8 );
+    S->t[0] -= 8;
+  }
+  S->t[0] -= 128;
+  blake512_update( S, msglen, 128 );
+
+  do
+  {
+    const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
+    _mm_storeu_si128((__m128i*)(digest +  0), BSWAP64(S->h[0]));
+    _mm_storeu_si128((__m128i*)(digest + 16), BSWAP64(S->h[1]));
+    _mm_storeu_si128((__m128i*)(digest + 32), BSWAP64(S->h[2]));
+    _mm_storeu_si128((__m128i*)(digest + 48), BSWAP64(S->h[3]));
+  } while(0);
+}
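+
+/* Usage sketch (illustrative helper, kept compiled out like the self-test
+ * below): one-shot hashing of an 80-byte block header with the streaming
+ * API above; note the bit-length convention. */
+#if 0
+static void blake512_hash80( u8 * digest, const u8 * header )
+{
+  blake512_state S;
+
+  blake512_init( &S );
+  blake512_update( &S, header, 80*8 );  /* 80 bytes = 640 bits */
+  blake512_final( &S, digest );
+}
+#endif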
+
+#if 0
+int crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen )
+{
+  blake512_state S;
+
+  blake512_init( &S );
+  blake512_update( &S, in, inlen*8 );
+  blake512_final( &S, out );
+  return 0;
+}
+
+int main()
+{
+#if 1
+  int i;
+  uint64_t foo[10] = {0xbeef, 0xbabe, 0xf00d};
+  blake512_state S, S_tmp;
+  char out[64];
+
+  blake512_init( &S );
+  for (i = 0; i < 5000000; i++) {
+    memcpy(&S_tmp, &S, sizeof(S));
+    blake512_update(&S_tmp, (const unsigned char *)foo, 80*8);
+    blake512_final( &S_tmp, out);
+    foo[0]++;
+  }
+  write(1, out, 64);
+  exit(0);
+#else
+  int i, v;
+  u8 data[144], digest[64];
+  u8 test1[]= {0x97, 0x96, 0x15, 0x87, 0xF6, 0xD9, 0x70, 0xFA, 0xBA, 0x6D, 0x24, 0x78, 0x04, 0x5D, 0xE6, 0xD1,
+               0xFA, 0xBD, 0x09, 0xB6, 0x1A, 0xE5, 0x09, 0x32, 0x05, 0x4D, 0x52, 0xBC, 0x29, 0xD3, 0x1B, 0xE4,
+               0xFF, 0x91, 0x02, 0xB9, 0xF6, 0x9E, 0x2B, 0xBD, 0xB8, 0x3B, 0xE1, 0x3D, 0x4B, 0x9C, 0x06, 0x09,
+               0x1E, 0x5F, 0xA0, 0xB4, 0x8B, 0xD0, 0x81, 0xB6, 0x34, 0x05, 0x8B, 0xE0, 0xEC, 0x49, 0xBE, 0xB3};
+  u8 test2[]= {0x31, 0x37, 0x17, 0xD6, 0x08, 0xE9, 0xCF, 0x75, 0x8D, 0xCB, 0x1E, 0xB0, 0xF0, 0xC3, 0xCF, 0x9F,
+               0xC1, 0x50, 0xB2, 0xD5, 0x00, 0xFB, 0x33, 0xF5, 0x1C, 0x52, 0xAF, 0xC9, 0x9D, 0x35, 0x8A, 0x2F,
+               0x13, 0x74, 0xB8, 0xA3, 0x8B, 0xBA, 0x79, 0x74, 0xE7, 0xF6, 0xEF, 0x79, 0xCA, 0xB1, 0x6F, 0x22,
+               0xCE, 0x1E, 0x64, 0x9D, 0x6E, 0x01, 0xAD, 0x95, 0x89, 0xC2, 0x13, 0x04, 0x5D, 0x54, 0x5D, 0xDE};
+
+  for(i=0; i<144; ++i) data[i]=0;
+
+  crypto_hash( digest, data, 1 );
+  v=0;
+  for(i=0; i<64; ++i) {
+    printf("%02X", digest[i]);
+    if ( digest[i] != test1[i]) v=1;
+  }
+  if (v) printf("\nerror\n");
+  else printf("\nok\n");
+
+  for(i=0; i<144; ++i) data[i]=0;
+
+  crypto_hash( digest, data, 144 );
+  v=0;
+  for(i=0; i<64; ++i) {
+    printf("%02X", digest[i]);
+    if ( digest[i] != test2[i]) v=1;
+  }
+  if (v) printf("\nerror\n");
+  else printf("\nok\n");
+
+  return 0;
+#endif
+}
+
+#endif
diff --git a/blake512_sse41.h b/blake512_sse41.h
new file mode 100644
index 0000000..6b16485
--- /dev/null
+++ b/blake512_sse41.h
@@ -0,0 +1,32 @@
+#ifndef _BLAKE512_SSE41_H_
+#define _BLAKE512_SSE41_H_
+
+#include <stdint.h>
+#include <smmintrin.h>
+
+typedef uint64_t u64;
+typedef uint32_t u32;
+typedef uint8_t u8;
+
+typedef struct
+{
+  __m128i h[4];
+  u64 s[4], t[2];
+  u32 buflen, nullt;
+  u8 buf[128];
+} blake512_state __attribute__ ((aligned (64)));
+
+static inline void blake512_init( blake512_state * S )
+{
+  memset(S, 0, sizeof(blake512_state));
+  S->h[0] = _mm_set_epi64x(0xBB67AE8584CAA73BULL, 0x6A09E667F3BCC908ULL);
+  S->h[1] = _mm_set_epi64x(0xA54FF53A5F1D36F1ULL, 0x3C6EF372FE94F82BULL);
+  S->h[2] = _mm_set_epi64x(0x9B05688C2B3E6C1FULL, 0x510E527FADE682D1ULL);
+  S->h[3] = _mm_set_epi64x(0x5BE0CD19137E2179ULL, 0x1F83D9ABFB41BD6BULL);
+}
+
+extern void blake512_update(blake512_state * S, const u8 * data, u64 datalen );
+extern void blake512_final(blake512_state * S, u8 * digest );
+
+#endif
diff --git a/blake512_sse41_rounds.h b/blake512_sse41_rounds.h
new file mode 100644
index 0000000..303bd11
--- /dev/null
+++ b/blake512_sse41_rounds.h
@@ -0,0 +1,871 @@
+
+#ifndef __BLAKE512_ROUNDS_H__
+#define __BLAKE512_ROUNDS_H__
+
+#ifndef HAVE_XOP
+  #define BSWAP64(x) _mm_shuffle_epi8((x), u8to64)
+
+  #define _mm_roti_epi64(x, c) \
+      (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
+    : (-(c) == 16) ? 
_mm_shuffle_epi8((x), r16) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-c))) +#else + #define BSWAP64(x) _mm_perm_epi8((x),(x),u8to64) +#endif + + +#define LOAD_MSG_0_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m0, m1); \ +t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x13198A2E03707344ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m2, m3); \ +t3 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xBE5466CF34E90C6CULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_0_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m0, m1); \ +t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x243F6A8885A308D3ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m2, m3); \ +t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x452821E638D01377ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_0_3(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m4, m5); \ +t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xD1310BA698DFB5ACULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m6, m7); \ +t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x24A19947B3916CF7ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_0_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m4, m5); \ +t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x9216D5D98979FB1BULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m6, m7); \ +t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_1_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m7, m2); \ +t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m4, m6); \ +t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x636920D871574E69ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_1_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m5, m4); \ +t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_alignr_epi8(m3, m7, 8); \ +t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xD1310BA698DFB5ACULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_1_3(b0, b1) \ +do \ +{ \ +t0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ +t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0xBA7C9045F12C7F99ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m5, m2); \ +t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_1_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m6, m1); \ +t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x13198A2E03707344ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m3, m1); \ +t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xB8E1AFED6A267E96ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_2_1(b0, b1) \ +do \ +{ \ +t0 = _mm_alignr_epi8(m6, m5, 8); \ +t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x9216D5D98979FB1BULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m2, m7); \ +t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xA4093822299F31D0ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_2_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m4, m0); \ +t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0xB8E1AFED6A267E96ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m1, m6, 0xF0); \ +t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_2_3(b0, b1) \ 
+do \ +{ \ +t0 = _mm_blend_epi16(m5, m1, 0xF0); \ +t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x801F2E2858EFC16ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m3, m4); \ +t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x13198A2E03707344ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_2_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m7, m3); \ +t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x2FFD72DBD01ADFB7ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_alignr_epi8(m2, m0, 8); \ +t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x3F84D5B5B5470917ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_3_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m3, m1); \ +t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xD1310BA698DFB5ACULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m6, m5); \ +t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_3_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m4, m0); \ +t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m6, m7); \ +t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x24A19947B3916CF7ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_3_3(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m1, m2, 0xF0); \ +t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m2, m7, 0xF0); \ +t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_3_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m3, m5); \ +t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xA4093822299F31D0ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m0, m4); \ +t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_4_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m4, m2); \ +t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x243F6A8885A308D3ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m1, m5); \ +t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_4_2(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m0, m3, 0xF0); \ +t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xD1310BA698DFB5ACULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m2, m7, 0xF0); \ +t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xA4093822299F31D0ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_4_3(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m7, m5, 0xF0); \ +t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0x13198A2E03707344ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m3, m1, 0xF0); \ +t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x9216D5D98979FB1BULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_4_4(b0, b1) \ +do \ +{ \ +t0 = _mm_alignr_epi8(m6, m0, 8); \ +t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x801F2E2858EFC16ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m4, m6, 0xF0); \ +t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xC0AC29B7C97C50DDULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_5_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m1, m3); \ +t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xBA7C9045F12C7F99ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m0, m4); \ +t3 = 
_mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xB8E1AFED6A267E96ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_5_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m6, m5); \ +t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0xA4093822299F31D0ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m5, m1); \ +t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_5_3(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m2, m3, 0xF0); \ +t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x24A19947B3916CF7ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m7, m0); \ +t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x801F2E2858EFC16ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_5_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m6, m2); \ +t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x452821E638D01377ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m7, m4, 0xF0); \ +t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x636920D871574E69ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_6_1(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m6, m0, 0xF0); \ +t1 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m7, m2); \ +t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x24A19947B3916CF7ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_6_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m2, m7); \ +t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xBA7C9045F12C7F99ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_alignr_epi8(m5, m6, 8); \ +t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_6_3(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m0, m3); \ +t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \ +t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xA4093822299F31D0ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_6_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m3, m1); \ +t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x243F6A8885A308D3ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m1, m5, 0xF0); \ +t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0xD1310BA698DFB5ACULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_7_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m6, m3); \ +t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xB8E1AFED6A267E96ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m6, m1, 0xF0); \ +t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x13198A2E03707344ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_7_2(b0, b1) \ +do \ +{ \ +t0 = _mm_alignr_epi8(m7, m5, 8); \ +t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x24A19947B3916CF7ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m0, m4); \ +t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xBA7C9045F12C7F99ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_7_3(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m2, m7); \ +t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x243F6A8885A308D3ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m4, m1); \ +t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_7_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m0, m2); \ +t1 = 
_mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m3, m5); \ +t3 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x9216D5D98979FB1BULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_8_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m3, m7); \ +t1 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x636920D871574E69ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_alignr_epi8(m0, m5, 8); \ +t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x82EFA98EC4E6C89ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_8_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m7, m4); \ +t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xC0AC29B7C97C50DDULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_alignr_epi8(m4, m1, 8); \ +t3 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0xB8E1AFED6A267E96ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_8_3(b0, b1) \ +do \ +{ \ +t0 = m6; \ +t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xA4093822299F31D0ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_alignr_epi8(m5, m0, 8); \ +t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_8_4(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m1, m3, 0xF0); \ +t1 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xBA7C9045F12C7F99ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = m2; \ +t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x13198A2E03707344ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_9_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m5, m4); \ +t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0xA4093822299F31D0ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m3, m0); \ +t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xC0AC29B7C97C50DDULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_9_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m1, m2); \ +t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m3, m2, 0xF0); \ +t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x3F84D5B5B5470917ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_9_3(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m7, m4); \ +t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xB8E1AFED6A267E96ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m1, m6); \ +t3 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0xBA7C9045F12C7F99ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_9_4(b0, b1) \ +do \ +{ \ +t0 = _mm_alignr_epi8(m7, m5, 8); \ +t1 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x636920D871574E69ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m6, m0); \ +t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x82EFA98EC4E6C89ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_10_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m0, m1); \ +t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x13198A2E03707344ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m2, m3); \ +t3 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xBE5466CF34E90C6CULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_10_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m0, m1); \ +t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x243F6A8885A308D3ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m2, m3); \ +t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x452821E638D01377ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define 
LOAD_MSG_10_3(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m4, m5); \ +t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xD1310BA698DFB5ACULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m6, m7); \ +t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x24A19947B3916CF7ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_10_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m4, m5); \ +t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x9216D5D98979FB1BULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m6, m7); \ +t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_11_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m7, m2); \ +t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m4, m6); \ +t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x636920D871574E69ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_11_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m5, m4); \ +t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_alignr_epi8(m3, m7, 8); \ +t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xD1310BA698DFB5ACULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_11_3(b0, b1) \ +do \ +{ \ +t0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ +t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0xBA7C9045F12C7F99ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m5, m2); \ +t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_11_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m6, m1); \ +t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x13198A2E03707344ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m3, m1); \ +t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xB8E1AFED6A267E96ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_12_1(b0, b1) \ +do \ +{ \ +t0 = _mm_alignr_epi8(m6, m5, 8); \ +t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x9216D5D98979FB1BULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m2, m7); \ +t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xA4093822299F31D0ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_12_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m4, m0); \ +t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0xB8E1AFED6A267E96ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m1, m6, 0xF0); \ +t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_12_3(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m5, m1, 0xF0); \ +t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x801F2E2858EFC16ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m3, m4); \ +t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x13198A2E03707344ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_12_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m7, m3); \ +t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x2FFD72DBD01ADFB7ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_alignr_epi8(m2, m0, 8); \ +t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x3F84D5B5B5470917ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_13_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m3, m1); \ +t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xD1310BA698DFB5ACULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = 
_mm_unpackhi_epi64(m6, m5); \ +t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_13_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m4, m0); \ +t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m6, m7); \ +t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x24A19947B3916CF7ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_13_3(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m1, m2, 0xF0); \ +t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m2, m7, 0xF0); \ +t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_13_4(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m3, m5); \ +t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xA4093822299F31D0ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m0, m4); \ +t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_14_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpackhi_epi64(m4, m2); \ +t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x243F6A8885A308D3ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m1, m5); \ +t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_14_2(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m0, m3, 0xF0); \ +t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xD1310BA698DFB5ACULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m2, m7, 0xF0); \ +t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xA4093822299F31D0ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_14_3(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m7, m5, 0xF0); \ +t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0x13198A2E03707344ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m3, m1, 0xF0); \ +t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x9216D5D98979FB1BULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_14_4(b0, b1) \ +do \ +{ \ +t0 = _mm_alignr_epi8(m6, m0, 8); \ +t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x801F2E2858EFC16ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m4, m6, 0xF0); \ +t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xC0AC29B7C97C50DDULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_15_1(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m1, m3); \ +t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xBA7C9045F12C7F99ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpacklo_epi64(m0, m4); \ +t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xB8E1AFED6A267E96ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_15_2(b0, b1) \ +do \ +{ \ +t0 = _mm_unpacklo_epi64(m6, m5); \ +t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0xA4093822299F31D0ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m5, m1); \ +t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_15_3(b0, b1) \ +do \ +{ \ +t0 = _mm_blend_epi16(m2, m3, 0xF0); \ +t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x24A19947B3916CF7ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_unpackhi_epi64(m7, m0); \ +t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x801F2E2858EFC16ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + +#define LOAD_MSG_15_4(b0, b1) \ +do \ +{ \ +t0 = 
_mm_unpackhi_epi64(m6, m2); \ +t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x452821E638D01377ULL); \ +b0 = _mm_xor_si128(t0, t1); \ +t2 = _mm_blend_epi16(m7, m4, 0xF0); \ +t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x636920D871574E69ULL); \ +b1 = _mm_xor_si128(t2, t3); \ +} while(0) + + + + + + +#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -32); \ + row4h = _mm_roti_epi64(row4h, -32); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -25); \ + row2h = _mm_roti_epi64(row2h, -25); \ + +#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -16); \ + row4h = _mm_roti_epi64(row4h, -16); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -11); \ + row2h = _mm_roti_epi64(row2h, -11); \ + + +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2h, row2l, 8); \ + t1 = _mm_alignr_epi8(row2l, row2h, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4h, row4l, 8); \ + t1 = _mm_alignr_epi8(row4l, row4h, 8); \ + row4l = t1; \ + row4h = t0; + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2l, row2h, 8); \ + t1 = _mm_alignr_epi8(row2h, row2l, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4l, row4h, 8); \ + t1 = _mm_alignr_epi8(row4h, row4l, 8); \ + row4l = t1; \ + row4h = t0; + +#define ROUND(r) \ + LOAD_MSG_ ##r ##_1(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_2(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + LOAD_MSG_ ##r ##_3(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_4(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); + +#endif + diff --git a/brg_endian.h b/brg_endian.h new file mode 100644 index 0000000..e3cf0d1 --- /dev/null +++ b/brg_endian.h @@ -0,0 +1,133 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The redistribution and use of this software (with or without changes) + is allowed without the payment of fees or royalties provided that: + + 1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + 2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + + 3. 
the name of the copyright holder is not used to endorse products
+      built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 20/12/2007
+*/
+
+#ifndef _BRG_ENDIAN_H
+#define _BRG_ENDIAN_H
+
+#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
+#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */
+
+/* Include files where endian defines and byteswap functions may reside */
+#if defined( __sun )
+#  include <sys/isa_defs.h>
+#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
+#  include <sys/endian.h>
+#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
+      defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
+#  include <machine/endian.h>
+#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
+#  if !defined( __MINGW32__ ) && !defined( _AIX )
+#    include <endian.h>
+#    if !defined( __BEOS__ )
+#      include <byteswap.h>
+#    endif
+#  endif
+#endif
+
+/* Now attempt to set the define for platform byte order using any  */
+/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which  */
+/* seem to encompass most endian symbol definitions                 */
+
+#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
+#  if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
+#  if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( _BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( _LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
+#  if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( __BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
+#  if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( __BIG_ENDIAN__ )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN__ )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
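+
+/* Code that includes this header then branches on the result, e.g.:
+ *
+ *     #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+ *         (little-endian load/store code)
+ *     #elif PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
+ *         (big-endian load/store code)
+ *     #endif
+ */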
+
+/* if the platform byte order could not be determined, then try to */
+/* set this define using common machine defines                    */
+#if !defined(PLATFORM_BYTE_ORDER)
+
+#if   defined( __alpha__ ) || defined( __alpha ) || defined( i386 )       || \
+      defined( __i386__ )  || defined( _M_I86 )  || defined( _M_IX86 )    || \
+      defined( __OS2__ )   || defined( sun386 )  || defined( __TURBOC__ ) || \
+      defined( vax )       || defined( vms )     || defined( VMS )        || \
+      defined( __VMS )     || defined( _M_X64 )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+
+#elif defined( AMIGA )    || defined( applec )    || defined( __AS400__ )  || \
+      defined( _CRAY )    || defined( __hppa )    || defined( __hp9000 )   || \
+      defined( ibm370 )   || defined( mc68000 )   || defined( m68k )       || \
+      defined( __MRC__ )  || defined( __MVS__ )   || defined( __MWERKS__ ) || \
+      defined( sparc )    || defined( __sparc)    || defined( SYMANTEC_C ) || \
+      defined( __VOS__ )  || defined( __TIGCC__ ) || defined( __TANDEM )   || \
+      defined( THINK_C )  || defined( __VMCMS__ ) || defined( _AIX )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+
+#elif 0     /* **** EDIT HERE IF NECESSARY **** */
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#elif 0     /* **** EDIT HERE IF NECESSARY **** */
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#else
+#  error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order
+#endif
+
+#endif
+
+#endif
diff --git a/brg_types.h b/brg_types.h
new file mode 100644
index 0000000..fd603b7
--- /dev/null
+++ b/brg_types.h
@@ -0,0 +1,234 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
+
+ (a few lines added by Soeren S. Thomsen, October 2008)
+
+ LICENSE TERMS
+
+ The redistribution and use of this software (with or without changes)
+ is allowed without the payment of fees or royalties provided that:
+
+  1. source code distributions include the above copyright notice, this
+     list of conditions and the following disclaimer;
+
+  2. binary distributions include the above copyright notice, this list
+     of conditions and the following disclaimer in their documentation;
+
+  3. the name of the copyright holder is not used to endorse products
+     built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 20/12/2007
+
+ The unsigned integer types defined here are of the form uint_<nn>t where
+ <nn> is the length of the type; for example, the unsigned 32-bit type is
+ 'uint_32t'.  These are NOT the same as the 'C99 integer types' that are
+ defined in the inttypes.h and stdint.h headers since attempts to use these
+ types have shown that support for them is still highly variable.  However,
+ since the latter are of the form uint<nn>_t, a regular expression search
+ and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t')
+ can be used to convert the types used here to the C99 standard types.
+*/
+
+#ifndef _BRG_TYPES_H
+#define _BRG_TYPES_H
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include <limits.h>
+
+#if defined( _MSC_VER ) && ( _MSC_VER >= 1300 )
+#  include <stddef.h>
+#  define ptrint_t intptr_t
+#elif defined( __GNUC__ ) && ( __GNUC__ >= 3 )
+#  include <stdint.h>
+#  define ptrint_t intptr_t
+#else
+#  define ptrint_t int
+#endif
+
+#ifndef BRG_UI8
+#  define BRG_UI8
+#  if UCHAR_MAX == 255u
+     typedef unsigned char uint_8t;
+#  else
+#    error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h
+#  endif
+#endif
+
+#ifndef BRG_UI16
+#  define BRG_UI16
+#  if USHRT_MAX == 65535u
+     typedef unsigned short uint_16t;
+#  else
+#    error Please define uint_16t as a 16-bit unsigned short type in brg_types.h
+#  endif
+#endif
+
+#ifndef BRG_UI32
+#  define BRG_UI32
+#  if UINT_MAX == 4294967295u
+#    define li_32(h) 0x##h##u
+     typedef unsigned int uint_32t;
+#  elif ULONG_MAX == 4294967295u
+#    define li_32(h) 0x##h##ul
+     typedef unsigned long uint_32t;
+#  elif defined( _CRAY )
+#    error This code needs 32-bit data types, which Cray machines do not provide
+#  else
+#    error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h
+#  endif
+#endif
+
+#ifndef BRG_UI64
+#  if defined( __BORLANDC__ ) && !defined( __MSDOS__ )
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ui64
+     typedef unsigned __int64 uint_64t;
+#  elif defined( _MSC_VER ) && ( _MSC_VER < 1300 )    /* 1300 == VC++ 7.0 */
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ui64
+     typedef unsigned __int64 uint_64t;
+#  elif defined( __sun ) && defined( ULONG_MAX ) && ULONG_MAX == 0xfffffffful
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ull
+     typedef unsigned long long uint_64t;
+#  elif defined( __MVS__ )
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ull
+     typedef unsigned int long long uint_64t;
+#  elif defined( UINT_MAX ) && UINT_MAX > 4294967295u
+#    if UINT_MAX == 18446744073709551615u
+#      define BRG_UI64
+#      define li_64(h) 0x##h##u
+       typedef unsigned int uint_64t;
+#    endif
+#  elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u
+#    if ULONG_MAX == 18446744073709551615ul
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ul
+       typedef unsigned long uint_64t;
+#    endif
+#  elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u
+#    if ULLONG_MAX == 18446744073709551615ull
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ull
+       typedef unsigned long long uint_64t;
+#    endif
+#  elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u
+#    if ULONG_LONG_MAX == 18446744073709551615ull
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ull
+       typedef unsigned long long uint_64t;
+#    endif
+#  endif
+#endif
+
+#if !defined( BRG_UI64 )
+#  if defined( NEED_UINT_64T )
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ull
+     typedef unsigned long long uint_64t;
+     /*#  error Please define uint_64t as an unsigned 64 bit type in brg_types.h*/
+#  endif
+#endif
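+
+/* The li_64() helper selected above attaches whatever suffix the chosen
+ * 64-bit type needs, e.g. li_64(0123456789abcdef) expands to
+ * 0x0123456789abcdefull when uint_64t is unsigned long long. */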
+
+#ifndef RETURN_VALUES
+#  define RETURN_VALUES
+#  if defined( DLL_EXPORT )
+#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+#      define VOID_RETURN __declspec( dllexport ) void __stdcall
+#      define INT_RETURN  __declspec( dllexport ) int  __stdcall
+#    elif defined( __GNUC__ )
+#      define VOID_RETURN __declspec( __dllexport__ ) void
+#      define INT_RETURN  __declspec( __dllexport__ ) int
+#    else
+#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#    endif
+#  elif defined( DLL_IMPORT )
+#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+#      define VOID_RETURN __declspec( dllimport ) void __stdcall
+#      define INT_RETURN  __declspec( dllimport ) int  __stdcall
+#    elif defined( __GNUC__ )
+#      define VOID_RETURN __declspec( __dllimport__ ) void
+#      define INT_RETURN  __declspec( __dllimport__ ) int
+#    else
+#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#    endif
+#  elif defined( __WATCOMC__ )
+#    define VOID_RETURN void __cdecl
+#    define INT_RETURN  int  __cdecl
+#  else
+#    define VOID_RETURN void
+#    define INT_RETURN  int
+#  endif
+#endif
+
+/*  These defines are used to detect and set the memory alignment of pointers.
+    Note that offsets are in bytes.
+
+    ALIGN_OFFSET(x,n)   return the positive or zero offset of
+                        the memory addressed by the pointer 'x'
+                        from an address that is aligned on an
+                        'n' byte boundary ('n' is a power of 2)
+
+    ALIGN_FLOOR(x,n)    return a pointer that points to memory
+                        that is aligned on an 'n' byte boundary
+                        and is not higher than the memory address
+                        pointed to by 'x' ('n' is a power of 2)
+
+    ALIGN_CEIL(x,n)     return a pointer that points to memory
+                        that is aligned on an 'n' byte boundary
+                        and is not lower than the memory address
+                        pointed to by 'x' ('n' is a power of 2)
+*/
+
+#define ALIGN_OFFSET(x,n)  (((ptrint_t)(x)) & ((n) - 1))
+#define ALIGN_FLOOR(x,n)   ((uint_8t*)(x) - ( ((ptrint_t)(x)) & ((n) - 1)))
+#define ALIGN_CEIL(x,n)    ((uint_8t*)(x) + (-((ptrint_t)(x)) & ((n) - 1)))
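+
+/* Example: ALIGN_OFFSET(p,16) is zero when 'p' is already suitable for an
+ * aligned SSE load, and ALIGN_CEIL(p,16) rounds 'p' up to the next
+ * 16-byte boundary. */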
+
+/*  These defines are used to declare buffers in a way that allows
+    faster operations on longer variables to be used.  In all these
+    defines 'size' must be a power of 2 and >= 8. NOTE that the
+    buffer size is in bytes but the type length is in bits
+
+    UNIT_TYPEDEF(x,size)        declares a variable 'x' of length
+                                'size' bits
+
+    BUFR_TYPEDEF(x,size,bsize)  declares a buffer 'x' of length 'bsize'
+                                bytes defined as an array of variables
+                                each of 'size' bits (bsize must be a
+                                multiple of size / 8)
+
+    UNIT_CAST(x,size)           casts a variable to a type of
+                                length 'size' bits
+
+    UPTR_CAST(x,size)           casts a pointer to a pointer to a
+                                variable of length 'size' bits
+*/
+
+#define UI_TYPE(size)               uint_##size##t
+#define UNIT_TYPEDEF(x,size)        typedef UI_TYPE(size) x
+#define BUFR_TYPEDEF(x,size,bsize)  typedef UI_TYPE(size) x[bsize / (size >> 3)]
+#define UNIT_CAST(x,size)           ((UI_TYPE(size)  )(x))
+#define UPTR_CAST(x,size)           ((UI_TYPE(size)* )(x))
+
+/* Added by Soeren S. Thomsen (begin) */
+#define u8  uint_8t
+#define u32 uint_32t
+#define u64 uint_64t
+/* (end) */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/cpu-miner.c b/cpu-miner.c
index 4a0ade7..1a64649 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -35,6 +35,7 @@
 #endif
 #include <jansson.h>
 #include <curl/curl.h>
+
 #include "compat.h"
 #include "miner.h"
 
@@ -63,7 +64,7 @@ static inline void affine_to_cpu(int id, int cpu)
 
 	CPU_ZERO(&set);
 	CPU_SET(cpu, &set);
-	sched_setaffinity(0, sizeof(&set), &set);
+	sched_setaffinity(0, sizeof(set), &set);
 }
 #elif defined(__FreeBSD__) /* FreeBSD specific policy and affinity management */
 #include <sys/cpuset.h>
@@ -263,6 +264,8 @@ struct work {
 static struct work g_work;
 static time_t g_work_time;
 static pthread_mutex_t g_work_lock;
+static pthread_mutex_t g_restart_lock;
+static pthread_mutex_t g_work_time_lock;
 
 static bool jobj_binary(const json_t *obj, const char *key,
 			void *buf, size_t buflen)
@@ -706,6 +709,35 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 	diff_to_target(work->target, sctx->job.diff);
 }
 
+static inline time_t get_g_work_time(void)
+{
+	time_t gnow;
+
+	pthread_mutex_lock(&g_work_time_lock);
+	gnow = g_work_time;
+	pthread_mutex_unlock(&g_work_time_lock);
+	return gnow;
+}
+
+static inline void set_g_work_time(time_t now)
+{
+	pthread_mutex_lock(&g_work_time_lock);
+	g_work_time = now;
+	pthread_mutex_unlock(&g_work_time_lock);
+}
+
+static inline void add_g_work_time(time_t addtime)
+{
+	pthread_mutex_lock(&g_work_time_lock);
+	g_work_time += addtime;
+	pthread_mutex_unlock(&g_work_time_lock);
+}
+
+static inline void sub_g_work_time(time_t addtime)
+{
+	add_g_work_time(-addtime);
+}
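+
+/* Every access to g_work_time outside of g_work_lock now goes through
+ * these accessors, so the longpoll, stratum and miner threads do not
+ * race on the bare time_t. */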
+
 static void *miner_thread(void *userdata)
 {
 	struct thr_info *mythr = userdata;
@@ -746,7 +778,7 @@ static void *miner_thread(void *userdata)
 		int rc;
 
 		if (have_stratum) {
-			while (time(NULL) >= g_work_time + 120)
+			while (time(NULL) >= get_g_work_time() + 120)
 				sleep(1);
 			pthread_mutex_lock(&g_work_lock);
 			if (work.data[19] >= end_nonce)
@@ -755,7 +787,7 @@ static void *miner_thread(void *userdata)
 		/* obtain new work from internal workio thread */
 		pthread_mutex_lock(&g_work_lock);
 		if (!have_stratum && (!have_longpoll ||
-				time(NULL) >= g_work_time + LP_SCANTIME*3/4 ||
+				time(NULL) >= get_g_work_time() + LP_SCANTIME*3/4 ||
 				work.data[19] >= end_nonce)) {
 			if (unlikely(!get_work(mythr, &g_work))) {
 				applog(LOG_ERR, "work retrieval failed, exiting "
@@ -763,7 +795,7 @@ static void *miner_thread(void *userdata)
 				pthread_mutex_unlock(&g_work_lock);
 				goto out;
 			}
-			g_work_time = have_stratum ? 0 : time(NULL);
+			set_g_work_time(have_stratum ? 0 : time(NULL));
 		}
 		if (have_stratum) {
 			pthread_mutex_unlock(&g_work_lock);
@@ -775,16 +807,19 @@ static void *miner_thread(void *userdata)
 				work.data[19] = 0xffffffffU / opt_n_threads * thr_id;
 		} else
 			work.data[19]++;
-		pthread_mutex_unlock(&g_work_lock);
+		pthread_mutex_lock(&g_restart_lock);
 		work_restart[thr_id].restart = 0;
+		pthread_mutex_unlock(&g_restart_lock);
 
 		/* adjust max_nonce to meet target scan time */
 		if (have_stratum)
 			max64 = LP_SCANTIME;
 		else
-			max64 = g_work_time + (have_longpoll ? LP_SCANTIME : opt_scantime)
+			max64 = get_g_work_time() + (have_longpoll ? LP_SCANTIME : opt_scantime)
 				- time(NULL);
+		pthread_mutex_lock(&stats_lock);
 		max64 *= thr_hashrates[thr_id];
+		pthread_mutex_unlock(&stats_lock);
 		if (max64 <= 0)
 			max64 = opt_algo == ALGO_SCRYPT ? 0xfffLL : 0x1fffffLL;
 		if (work.data[19] + max64 > end_nonce)
@@ -793,6 +828,8 @@ static void *miner_thread(void *userdata)
 			max_nonce = work.data[19] + max64;
 
 		hashes_done = 0;
+		pthread_mutex_unlock(&g_work_lock);
+
 		gettimeofday(&tv_start, NULL);
 
 		/* scan nonces for a proof-of-work hash */
@@ -825,7 +862,7 @@ static void *miner_thread(void *userdata)
 //		fprintf(stderr, " vote = %u\n", ext[0]);
 //		fprintf(stderr, " reward = %u\n", ext[1]);
 
-		if (opt_vote > work.maxvote) {
+		if (opt_vote > work.maxvote && !opt_benchmark) {
 			printf("Warning: Your block reward vote (%hu) exceeds "
 			       "the maxvote reported by the pool (%hu).\n",
 			       opt_vote, work.maxvote);
@@ -891,15 +928,23 @@ static void *miner_thread(void *userdata)
 			pthread_mutex_unlock(&stats_lock);
 		}
 		if (!opt_quiet) {
-			sprintf(s, thr_hashrates[thr_id] >= 1e6 ? "%.0f" : "%.2f",
-				1e-3 * thr_hashrates[thr_id]);
+			double myrate;
+
+			pthread_mutex_lock(&stats_lock);
+			myrate = thr_hashrates[thr_id];
+			pthread_mutex_unlock(&stats_lock);
+
+			sprintf(s, myrate >= 1e6 ? "%.0f" : "%.2f",
+				1e-3 * myrate);
 			applog(LOG_INFO, "thread %d: %lu hashes, %s khash/s",
 				thr_id, hashes_done, s);
 		}
 		if (opt_benchmark && thr_id == opt_n_threads - 1) {
 			double hashrate = 0.;
+			pthread_mutex_lock(&stats_lock);
 			for (i = 0; i < opt_n_threads && thr_hashrates[i]; i++)
 				hashrate += thr_hashrates[i];
+			pthread_mutex_unlock(&stats_lock);
 			if (i == opt_n_threads) {
 				sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", 1e-3 * hashrate);
 				applog(LOG_INFO, "Total: %s khash/s", s);
@@ -921,8 +966,10 @@ static void restart_threads(void)
 {
 	int i;
 
+	pthread_mutex_lock(&g_restart_lock);
 	for (i = 0; i < opt_n_threads; i++)
 		work_restart[i].restart = 1;
+	pthread_mutex_unlock(&g_restart_lock);
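+	/* Pairs with the g_restart_lock section in miner_thread(), which
+	 * clears the flag before each scan range. */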
 }
 
 static void *longpoll_thread(void *userdata)
@@ -983,15 +1030,13 @@ start:
 		if (work_decode(json_object_get(val, "result"), &g_work)) {
 			if (opt_debug)
 				applog(LOG_DEBUG, "DEBUG: got new work");
-			time(&g_work_time);
+			set_g_work_time(time(NULL));
 			restart_threads();
 		}
 		pthread_mutex_unlock(&g_work_lock);
 		json_decref(val);
 	} else {
-		pthread_mutex_lock(&g_work_lock);
-		g_work_time -= LP_SCANTIME;
-		pthread_mutex_unlock(&g_work_lock);
+		sub_g_work_time(LP_SCANTIME);
 		if (err == CURLE_OPERATION_TIMEDOUT) {
 			restart_threads();
 		} else {
@@ -1061,7 +1106,7 @@ static void *stratum_thread(void *userdata)
 
 	while (!stratum.curl) {
 		pthread_mutex_lock(&g_work_lock);
-		g_work_time = 0;
+		set_g_work_time(0);
 		pthread_mutex_unlock(&g_work_lock);
 		restart_threads();
 
@@ -1080,10 +1125,10 @@ static void *stratum_thread(void *userdata)
 		}
 
 		if (stratum.job.job_id &&
-		    (strcmp(stratum.job.job_id, g_work.job_id) || !g_work_time)) {
+		    (strcmp(stratum.job.job_id, g_work.job_id) || !get_g_work_time())) {
 			pthread_mutex_lock(&g_work_lock);
 			stratum_gen_work(&stratum, &g_work);
-			time(&g_work_time);
+			set_g_work_time(time(NULL));
 			pthread_mutex_unlock(&g_work_lock);
 			if (stratum.job.clean) {
 				applog(LOG_INFO, "Stratum detected new block");
@@ -1290,6 +1335,7 @@ static void parse_arg (int key, char *arg)
 		want_longpoll = false;
 		want_stratum = false;
 		have_stratum = false;
+		opt_vote = 1024;
 		break;
 	case 1003:
 		want_longpoll = false;
@@ -1395,6 +1441,8 @@ static void signal_handler(int sig)
 }
 #endif
 
+extern void init_hashstates(void);
+
 int main(int argc, char *argv[])
 {
 	struct thr_info *thr;
@@ -1419,9 +1467,13 @@ int main(int argc, char *argv[])
 		sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
 	}
 
+	init_hashstates();
+
 	pthread_mutex_init(&applog_lock, NULL);
 	pthread_mutex_init(&stats_lock, NULL);
 	pthread_mutex_init(&g_work_lock, NULL);
+	pthread_mutex_init(&g_work_time_lock, NULL);
+	pthread_mutex_init(&g_restart_lock, NULL);
 	pthread_mutex_init(&stratum.sock_lock, NULL);
 	pthread_mutex_init(&stratum.work_lock, NULL);
diff --git a/groestl-intr-aes.h b/groestl-intr-aes.h
new file mode 100644
index 0000000..d12be98
--- /dev/null
+++ b/groestl-intr-aes.h
@@ -0,0 +1,962 @@
+/* groestl-intr-aes.h     Aug 2011
+ *
+ * Groestl implementation with intrinsics using ssse3, sse4.1, and aes
+ * instructions.
+ * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
+ *
+ * This code is placed in the public domain
+ */
+
+#include <smmintrin.h>
+#include <wmmintrin.h>
+
+#include "groestl512.h"
+
+/* global constants */
+__m128i ROUND_CONST_Lx;
+__m128i ROUND_CONST_L0[ROUNDS512];
+__m128i ROUND_CONST_L7[ROUNDS512];
+__m128i ROUND_CONST_P[ROUNDS1024];
+__m128i ROUND_CONST_Q[ROUNDS1024];
+__m128i TRANSP_MASK;
+__m128i SUBSH_MASK[8];
+__m128i ALL_1B;
+__m128i ALL_FF;
+
+#define tos(a)    #a
+#define tostr(a)  tos(a)
+
+/* xmm[i] will be multiplied by 2
+ * xmm[j] will be lost
+ * xmm[k] has to be all 0x1b */
+#define MUL2(i, j, k){\
+  j = _mm_xor_si128(j, j);\
+  j = _mm_cmpgt_epi8(j, i);\
+  i = _mm_add_epi8(i, i);\
+  j = _mm_and_si128(j, k);\
+  i = _mm_xor_si128(i, j);\
+}/**/
+
+/* Yet another implementation of MixBytes.
+   This time we use the formulae (3) from the paper "Byte Slicing Groestl".
+   Input: a0, ..., a7
+   Output: b0, ..., b7 = MixBytes(a0,...,a7).
+   but we use the relations:
+   t_i = a_i + a_{i+1}
+   x_i = t_i + t_{i+3}
+   y_i = t_i + t_{i+2} + a_{i+6}
+   z_i = 2*x_i
+   w_i = z_i + y_{i+4}
+   v_i = 2*w_i
+   b_i = v_{i+3} + y_{i+4}
+   We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
+   and then adding v_i computed in the meantime in registers xmm0..xmm7.
+   We almost fit into 16 registers, need only 3 spills to memory.
+   This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
+   K. Matusiewicz, 2011/05/29 */
+#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
+  /* t_i = a_i + a_{i+1} */\
+  b6 = a0;\
+  b7 = a1;\
+  a0 = _mm_xor_si128(a0, a1);\
+  b0 = a2;\
+  a1 = _mm_xor_si128(a1, a2);\
+  b1 = a3;\
+  a2 = _mm_xor_si128(a2, a3);\
+  b2 = a4;\
+  a3 = _mm_xor_si128(a3, a4);\
+  b3 = a5;\
+  a4 = _mm_xor_si128(a4, a5);\
+  b4 = a6;\
+  a5 = _mm_xor_si128(a5, a6);\
+  b5 = a7;\
+  a6 = _mm_xor_si128(a6, a7);\
+  a7 = _mm_xor_si128(a7, b6);\
+  \
+  /* build y4 y5 y6 ... 
in regs xmm8, xmm9, xmm10 by adding t_i*/\ + b0 = _mm_xor_si128(b0, a4);\ + b6 = _mm_xor_si128(b6, a4);\ + b1 = _mm_xor_si128(b1, a5);\ + b7 = _mm_xor_si128(b7, a5);\ + b2 = _mm_xor_si128(b2, a6);\ + b0 = _mm_xor_si128(b0, a6);\ + /* spill values y_4, y_5 to memory */\ + TEMP0 = b0;\ + b3 = _mm_xor_si128(b3, a7);\ + b1 = _mm_xor_si128(b1, a7);\ + TEMP1 = b1;\ + b4 = _mm_xor_si128(b4, a0);\ + b2 = _mm_xor_si128(b2, a0);\ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b5 = _mm_xor_si128(b5, a1);\ + b3 = _mm_xor_si128(b3, a1);\ + b1 = a1;\ + b6 = _mm_xor_si128(b6, a2);\ + b4 = _mm_xor_si128(b4, a2);\ + TEMP2 = a2;\ + b7 = _mm_xor_si128(b7, a3);\ + b5 = _mm_xor_si128(b5, a3);\ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm_xor_si128(a0, a3);\ + a1 = _mm_xor_si128(a1, a4);\ + a2 = _mm_xor_si128(a2, a5);\ + a3 = _mm_xor_si128(a3, a6);\ + a4 = _mm_xor_si128(a4, a7);\ + a5 = _mm_xor_si128(a5, b0);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, TEMP2);\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = ALL_1B;\ + MUL2(a0, b0, b1);\ + a0 = _mm_xor_si128(a0, TEMP0);\ + MUL2(a1, b0, b1);\ + a1 = _mm_xor_si128(a1, TEMP1);\ + MUL2(a2, b0, b1);\ + a2 = _mm_xor_si128(a2, b2);\ + MUL2(a3, b0, b1);\ + a3 = _mm_xor_si128(a3, b3);\ + MUL2(a4, b0, b1);\ + a4 = _mm_xor_si128(a4, b4);\ + MUL2(a5, b0, b1);\ + a5 = _mm_xor_si128(a5, b5);\ + MUL2(a6, b0, b1);\ + a6 = _mm_xor_si128(a6, b6);\ + MUL2(a7, b0, b1);\ + a7 = _mm_xor_si128(a7, b7);\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... */\ + MUL2(a0, b0, b1);\ + b5 = _mm_xor_si128(b5, a0);\ + MUL2(a1, b0, b1);\ + b6 = _mm_xor_si128(b6, a1);\ + MUL2(a2, b0, b1);\ + b7 = _mm_xor_si128(b7, a2);\ + MUL2(a5, b0, b1);\ + b2 = _mm_xor_si128(b2, a5);\ + MUL2(a6, b0, b1);\ + b3 = _mm_xor_si128(b3, a6);\ + MUL2(a7, b0, b1);\ + b4 = _mm_xor_si128(b4, a7);\ + MUL2(a3, b0, b1);\ + MUL2(a4, b0, b1);\ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm_xor_si128(b0, a3);\ + b1 = _mm_xor_si128(b1, a4);\ +}/*MixBytes*/ + +#if (LENGTH <= 256) + +#define SET_CONSTANTS(){\ + ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\ + SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\ + SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\ + SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\ + SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\ + SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\ + SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\ + SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ + }\ + ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ +}while(0); + +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant */\ + b1 = ROUND_CONST_Lx;\ + a0 = 
_mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ + a1 = _mm_xor_si128(a1, b1);\ + a2 = _mm_xor_si128(a2, b1);\ + a3 = _mm_xor_si128(a3, b1);\ + a4 = _mm_xor_si128(a4, b1);\ + a5 = _mm_xor_si128(a5, b1);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ + \ + /* ShiftBytes + SubBytes (interleaved) */\ + b0 = _mm_xor_si128(b0, b0);\ + a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ + a0 = _mm_aesenclast_si128(a0, b0);\ + a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ + a1 = _mm_aesenclast_si128(a1, b0);\ + a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ + a2 = _mm_aesenclast_si128(a2, b0);\ + a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ + a3 = _mm_aesenclast_si128(a3, b0);\ + a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ + a4 = _mm_aesenclast_si128(a4, b0);\ + a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ + a5 = _mm_aesenclast_si128(a5, b0);\ + a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ + a6 = _mm_aesenclast_si128(a6, b0);\ + a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ + a7 = _mm_aesenclast_si128(a7, b0);\ + \ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ +} + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + t0 = TRANSP_MASK;\ + \ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ + \ + o1 = i0;\ + t0 = i2;\ + \ + i0 = _mm_unpacklo_epi16(i0, i1);\ + o1 = _mm_unpackhi_epi16(o1, i1);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + t0 = _mm_unpackhi_epi16(t0, i3);\ + \ + i0 = _mm_shuffle_epi32(i0, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + t0 = _mm_shuffle_epi32(t0, 216);\ + \ + o2 = i0;\ + o3 = o1;\ + \ + i0 = _mm_unpacklo_epi32(i0, i2);\ + o1 = _mm_unpacklo_epi32(o1, t0);\ + o2 = _mm_unpackhi_epi32(o2, i2);\ + o3 = _mm_unpackhi_epi32(o3, t0);\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output 
reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + o1 = i0;\ + o2 = i1;\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + o1 = _mm_unpackhi_epi64(o1, i4);\ + o3 = i1;\ + o4 = i2;\ + o2 = _mm_unpacklo_epi64(o2, i5);\ + o3 = _mm_unpackhi_epi64(o3, i5);\ + o5 = i2;\ + o6 = i3;\ + o4 = _mm_unpacklo_epi64(o4, i6);\ + o5 = _mm_unpackhi_epi64(o5, i6);\ + o7 = i3;\ + o6 = _mm_unpacklo_epi64(o6, i7);\ + o7 = _mm_unpackhi_epi64(o7, i7);\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + o0 = i0;\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o0 = _mm_unpackhi_epi64(o0, i1);\ + o1 = i2;\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + o1 = _mm_unpackhi_epi64(o1, i3);\ + o2 = i4;\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + o2 = _mm_unpackhi_epi64(o2, i5);\ + o3 = i6;\ + i6 = _mm_unpacklo_epi64(i6, i7);\ + o3 = _mm_unpackhi_epi64(o3, i7);\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + t0 = _mm_xor_si128(t0, t0);\ + i1 = i0;\ + i3 = i2;\ + i5 = i4;\ + i7 = i6;\ + i0 = _mm_unpacklo_epi64(i0, t0);\ + i1 = _mm_unpackhi_epi64(i1, t0);\ + i2 = _mm_unpacklo_epi64(i2, t0);\ + i3 = _mm_unpackhi_epi64(i3, t0);\ + i4 = _mm_unpacklo_epi64(i4, t0);\ + i5 = _mm_unpackhi_epi64(i5, t0);\ + i6 = _mm_unpacklo_epi64(i6, t0);\ + i7 = _mm_unpackhi_epi64(i7, t0);\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ +}/**/ + + +void INIT(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7; + static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm12 - xmm15 */ + xmm12 = chaining[0]; + xmm13 = chaining[1]; + xmm14 = chaining[2]; + xmm15 = chaining[3]; + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* store transposed IV */ + chaining[0] = xmm12; + chaining[1] = xmm2; + chaining[2] = xmm6; + chaining[3] = xmm7; +} + +void TF512(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm12 - xmm15 */ + xmm12 = message[0]; + xmm13 = message[1]; + xmm14 = 
message[2]; + xmm15 = message[3]; + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + xmm8 = chaining[0]; + xmm0 = chaining[1]; + xmm4 = chaining[2]; + xmm5 = chaining[3]; + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + xmm8 = _mm_xor_si128(xmm8, xmm12); + xmm0 = _mm_xor_si128(xmm0, xmm2); + xmm4 = _mm_xor_si128(xmm4, xmm6); + xmm5 = _mm_xor_si128(xmm5, xmm7); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, xmm8); + xmm1 = _mm_xor_si128(xmm1, xmm10); + xmm2 = _mm_xor_si128(xmm2, xmm12); + xmm3 = _mm_xor_si128(xmm3, xmm14); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, (chaining[0])); + xmm1 = _mm_xor_si128(xmm1, (chaining[1])); + xmm2 = _mm_xor_si128(xmm2, (chaining[2])); + xmm3 = _mm_xor_si128(xmm3, (chaining[3])); + + /* store CV */ + chaining[0] = xmm0; + chaining[1] = xmm1; + chaining[2] = xmm2; + chaining[3] = xmm3; + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF512(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + xmm8 = chaining[0]; + xmm10 = chaining[1]; + xmm12 = chaining[2]; + xmm14 = chaining[3]; + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm10 = _mm_xor_si128(xmm10, (chaining[1])); + xmm12 = _mm_xor_si128(xmm12, (chaining[2])); + xmm14 = _mm_xor_si128(xmm14, (chaining[3])); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); + + /* we only need to return the truncated half of the state */ + chaining[2] = xmm9; + chaining[3] = xmm11; +} +#endif + +#if (LENGTH > 
256) + +#define SET_CONSTANTS(){\ + ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ + ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + SUBSH_MASK[0] = _mm_set_epi32(0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00);\ + SUBSH_MASK[1] = _mm_set_epi32(0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01);\ + SUBSH_MASK[2] = _mm_set_epi32(0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02);\ + SUBSH_MASK[3] = _mm_set_epi32(0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003);\ + SUBSH_MASK[4] = _mm_set_epi32(0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104);\ + SUBSH_MASK[5] = _mm_set_epi32(0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205);\ + SUBSH_MASK[6] = _mm_set_epi32(0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306);\ + SUBSH_MASK[7] = _mm_set_epi32(0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b);\ + for(i = 0; i < ROUNDS1024; i++)\ + {\ + ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\ + }\ +}while(0); + +/* one round + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* SubBytes */\ + b0 = _mm_xor_si128(b0, b0);\ + a0 = _mm_aesenclast_si128(a0, b0);\ + a1 = _mm_aesenclast_si128(a1, b0);\ + a2 = _mm_aesenclast_si128(a2, b0);\ + a3 = _mm_aesenclast_si128(a3, b0);\ + a4 = _mm_aesenclast_si128(a4, b0);\ + a5 = _mm_aesenclast_si128(a5, b0);\ + a6 = _mm_aesenclast_si128(a6, b0);\ + a7 = _mm_aesenclast_si128(a7, b0);\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +#define ROUNDS_P(){\ + u8 round_counter = 0;\ + for(round_counter = 0; round_counter < 14; round_counter+=2) {\ + /* AddRoundConstant P1024 */\ + xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\ + xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\ + xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\ + xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[3]));\ + xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[4]));\ + xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[5]));\ + xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[6]));\ + xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[7]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + \ + /* AddRoundConstant P1024 */\ + xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\ + xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\ + xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\ + xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[3]));\ + xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[4]));\ + xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\ + xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\ + xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + }\ +} + +#define ROUNDS_Q(){\ + u8 round_counter = 0;\ + for(round_counter = 0; round_counter < 14; round_counter+=2) {\ + /* AddRoundConstant 
Q1024 */\ + xmm1 = ALL_FF;\ + xmm8 = _mm_xor_si128(xmm8, xmm1);\ + xmm9 = _mm_xor_si128(xmm9, xmm1);\ + xmm10 = _mm_xor_si128(xmm10, xmm1);\ + xmm11 = _mm_xor_si128(xmm11, xmm1);\ + xmm12 = _mm_xor_si128(xmm12, xmm1);\ + xmm13 = _mm_xor_si128(xmm13, xmm1);\ + xmm14 = _mm_xor_si128(xmm14, xmm1);\ + xmm15 = _mm_xor_si128(xmm15, (ROUND_CONST_Q[round_counter]));\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[1]));\ + xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[3]));\ + xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[5]));\ + xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[7]));\ + xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[0]));\ + xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[2]));\ + xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[4]));\ + xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[6]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + \ + /* AddRoundConstant Q1024 */\ + xmm9 = ALL_FF;\ + xmm0 = _mm_xor_si128(xmm0, xmm9);\ + xmm1 = _mm_xor_si128(xmm1, xmm9);\ + xmm2 = _mm_xor_si128(xmm2, xmm9);\ + xmm3 = _mm_xor_si128(xmm3, xmm9);\ + xmm4 = _mm_xor_si128(xmm4, xmm9);\ + xmm5 = _mm_xor_si128(xmm5, xmm9);\ + xmm6 = _mm_xor_si128(xmm6, xmm9);\ + xmm7 = _mm_xor_si128(xmm7, (ROUND_CONST_Q[round_counter+1]));\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[1]));\ + xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[3]));\ + xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[5]));\ + xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[7]));\ + xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[0]));\ + xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[2]));\ + xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[4]));\ + xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[6]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + }\ +} + +/* Matrix Transpose + * input is a 1024-bit state with two columns in one xmm + * output is a 1024-bit state with two rows in one xmm + * inputs: i0-i7 + * outputs: i0-i7 + * clobbers: t0-t7 + */ +#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ + t0 = TRANSP_MASK;\ +\ + i6 = _mm_shuffle_epi8(i6, t0);\ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ + t1 = i2;\ + i4 = _mm_shuffle_epi8(i4, t0);\ + i5 = _mm_shuffle_epi8(i5, t0);\ + t2 = i4;\ + t3 = i6;\ + i7 = _mm_shuffle_epi8(i7, t0);\ +\ + /* continue with unpack using 4 temp registers */\ + t0 = i0;\ + t2 = _mm_unpackhi_epi16(t2, i5);\ + i4 = _mm_unpacklo_epi16(i4, i5);\ + t3 = _mm_unpackhi_epi16(t3, i7);\ + i6 = _mm_unpacklo_epi16(i6, i7);\ + t0 = _mm_unpackhi_epi16(t0, i1);\ + t1 = _mm_unpackhi_epi16(t1, i3);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + i0 = _mm_unpacklo_epi16(i0, i1);\ +\ + /* shuffle with immediate */\ + t0 = _mm_shuffle_epi32(t0, 216);\ + t1 = _mm_shuffle_epi32(t1, 216);\ + t2 = _mm_shuffle_epi32(t2, 216);\ + t3 = _mm_shuffle_epi32(t3, 216);\ + i0 = _mm_shuffle_epi32(i0, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + i4 = _mm_shuffle_epi32(i4, 216);\ + i6 = _mm_shuffle_epi32(i6, 216);\ +\ + /* continue with unpack */\ + t4 = i0;\ + i0 = _mm_unpacklo_epi32(i0, i2);\ + t4 = _mm_unpackhi_epi32(t4, i2);\ + t5 = t0;\ + t0 = _mm_unpacklo_epi32(t0, t1);\ + t5 = _mm_unpackhi_epi32(t5, t1);\ + t6 = i4;\ + i4 = _mm_unpacklo_epi32(i4, i6);\ + t7 = t2;\ + t6 = _mm_unpackhi_epi32(t6, i6);\ + i2 = 
t0;\ + t2 = _mm_unpacklo_epi32(t2, t3);\ + i3 = t0;\ + t7 = _mm_unpackhi_epi32(t7, t3);\ +\ + /* there are now 2 rows in each xmm */\ + /* unpack to get 1 row of CV in each xmm */\ + i1 = i0;\ + i1 = _mm_unpackhi_epi64(i1, i4);\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + i4 = t4;\ + i3 = _mm_unpackhi_epi64(i3, t2);\ + i5 = t4;\ + i2 = _mm_unpacklo_epi64(i2, t2);\ + i6 = t5;\ + i5 = _mm_unpackhi_epi64(i5, t6);\ + i7 = t5;\ + i4 = _mm_unpacklo_epi64(i4, t6);\ + i7 = _mm_unpackhi_epi64(i7, t7);\ + i6 = _mm_unpacklo_epi64(i6, t7);\ + /* transpose done */\ +}/**/ + +/* Matrix Transpose Inverse + * input is a 1024-bit state with two rows in one xmm + * output is a 1024-bit state with two columns in one xmm + * inputs: i0-i7 + * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) + * clobbers: t0-t4 + */ +#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ + /* transpose matrix to get output format */\ + o1 = i0;\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o1 = _mm_unpackhi_epi64(o1, i1);\ + t0 = i2;\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + t0 = _mm_unpackhi_epi64(t0, i3);\ + t1 = i4;\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + t1 = _mm_unpackhi_epi64(t1, i5);\ + t2 = i6;\ + o0 = TRANSP_MASK;\ + i6 = _mm_unpacklo_epi64(i6, i7);\ + t2 = _mm_unpackhi_epi64(t2, i7);\ + /* load transpose mask into a register, because it will be used 8 times */\ + i0 = _mm_shuffle_epi8(i0, o0);\ + i2 = _mm_shuffle_epi8(i2, o0);\ + i4 = _mm_shuffle_epi8(i4, o0);\ + i6 = _mm_shuffle_epi8(i6, o0);\ + o1 = _mm_shuffle_epi8(o1, o0);\ + t0 = _mm_shuffle_epi8(t0, o0);\ + t1 = _mm_shuffle_epi8(t1, o0);\ + t2 = _mm_shuffle_epi8(t2, o0);\ + /* continue with unpack using 4 temp registers */\ + t3 = i4;\ + o2 = o1;\ + o0 = i0;\ + t4 = t1;\ + \ + t3 = _mm_unpackhi_epi16(t3, i6);\ + i4 = _mm_unpacklo_epi16(i4, i6);\ + o0 = _mm_unpackhi_epi16(o0, i2);\ + i0 = _mm_unpacklo_epi16(i0, i2);\ + o2 = _mm_unpackhi_epi16(o2, t0);\ + o1 = _mm_unpacklo_epi16(o1, t0);\ + t4 = _mm_unpackhi_epi16(t4, t2);\ + t1 = _mm_unpacklo_epi16(t1, t2);\ + /* shuffle with immediate */\ + i4 = _mm_shuffle_epi32(i4, 216);\ + t3 = _mm_shuffle_epi32(t3, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + o2 = _mm_shuffle_epi32(o2, 216);\ + i0 = _mm_shuffle_epi32(i0, 216);\ + o0 = _mm_shuffle_epi32(o0, 216);\ + t1 = _mm_shuffle_epi32(t1, 216);\ + t4 = _mm_shuffle_epi32(t4, 216);\ + /* continue with unpack */\ + i1 = i0;\ + i3 = o0;\ + i5 = o1;\ + i7 = o2;\ + i0 = _mm_unpacklo_epi32(i0, i4);\ + i1 = _mm_unpackhi_epi32(i1, i4);\ + o0 = _mm_unpacklo_epi32(o0, t3);\ + i3 = _mm_unpackhi_epi32(i3, t3);\ + o1 = _mm_unpacklo_epi32(o1, t1);\ + i5 = _mm_unpackhi_epi32(i5, t1);\ + o2 = _mm_unpacklo_epi32(o2, t4);\ + i7 = _mm_unpackhi_epi32(i7, t4);\ + /* transpose done */\ +}/**/ + + +void INIT(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm8 - xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + /* transform chaining value from column ordering into row ordering */ + Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store transposed IV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + 
chaining[6] = xmm14; + chaining[7] = xmm15; +} + +void TF1024(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i QTEMP[8]; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm8 - xmm15 (Q = message) */ + xmm8 = message[0]; + xmm9 = message[1]; + xmm10 = message[2]; + xmm11 = message[3]; + xmm12 = message[4]; + xmm13 = message[5]; + xmm14 = message[6]; + xmm15 = message[7]; + + /* transform message M from column ordering into row ordering */ + Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store message M (Q input) for later */ + QTEMP[0] = xmm8; + QTEMP[1] = xmm9; + QTEMP[2] = xmm10; + QTEMP[3] = xmm11; + QTEMP[4] = xmm12; + QTEMP[5] = xmm13; + QTEMP[6] = xmm14; + QTEMP[7] = xmm15; + + /* xor CV to message to get P input */ + /* result: CV+M in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, (chaining[7])); + + /* compute permutation P */ + /* result: P(CV+M) in xmm8...xmm15 */ + ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV+M)+CV in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, (chaining[7])); + + /* store P(CV+M)+CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; + + /* load message M (Q input) into xmm8-15 */ + xmm8 = QTEMP[0]; + xmm9 = QTEMP[1]; + xmm10 = QTEMP[2]; + xmm11 = QTEMP[3]; + xmm12 = QTEMP[4]; + xmm13 = QTEMP[5]; + xmm14 = QTEMP[6]; + xmm15 = QTEMP[7]; + + /* compute permutation Q */ + /* result: Q(M) in xmm8...xmm15 */ + ROUNDS_Q(); + + /* xor Q output */ + /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, (chaining[7])); + + /* store CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; + +#ifdef IACA_TRACE + IACA_END; +#endif + + return; +} + +void OF1024(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + + /* 
load CV into registers xmm8 - xmm15 */
+  xmm8 = chaining[0];
+  xmm9 = chaining[1];
+  xmm10 = chaining[2];
+  xmm11 = chaining[3];
+  xmm12 = chaining[4];
+  xmm13 = chaining[5];
+  xmm14 = chaining[6];
+  xmm15 = chaining[7];
+
+  /* compute permutation P */
+  /* result: P(CV) in xmm8...xmm15 */
+  ROUNDS_P();
+
+  /* xor CV to P output (feed-forward) */
+  /* result: P(CV)+CV in xmm8...xmm15 */
+  xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
+  xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
+  xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
+  xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
+  xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
+  xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
+  xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
+  xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
+
+  /* transpose CV back from row ordering to column ordering */
+  /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */
+  Matrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7);
+
+  /* we only need to return the truncated half of the state */
+  chaining[4] = xmm0;
+  chaining[5] = xmm6;
+  chaining[6] = xmm13;
+  chaining[7] = xmm15;
+
+  return;
+}
+
+#endif
+
diff --git a/groestl512.c b/groestl512.c
new file mode 100644
index 0000000..4ee4c26
--- /dev/null
+++ b/groestl512.c
@@ -0,0 +1,221 @@
+/* hash.c Aug 2011
+ *
+ * Groestl implementation for different versions.
+ * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
+ *
+ * This code is placed in the public domain
+ */
+
+#include "groestl-intr-aes.h"
+
+/* digest up to len bytes of input (full blocks only) */
+static void Transform(groestl512_hashState *ctx,
+                      const u8 *in,
+                      unsigned long long len) {
+
+  /* increment block counter */
+  ctx->block_counter += len/SIZE;
+
+  /* digest message, one block at a time */
+  for (; len >= SIZE; len -= SIZE, in += SIZE)
+#if LENGTH<=256
+    TF512((u64*)ctx->chaining, (u64*)in);
+#else
+    TF1024((u64*)ctx->chaining, (u64*)in);
+#endif
+
+  asm volatile ("emms");
+}
+
+/* given state h, do h <- P(h)+h */
+void OutputTransformation(groestl512_hashState *ctx) {
+
+  /* determine variant */
+#if (LENGTH <= 256)
+  OF512((u64*)ctx->chaining);
+#else
+  OF1024((u64*)ctx->chaining);
+#endif
+
+  asm volatile ("emms");
+}
+
+/* initialise context */
+HashReturn groestl512_Init(groestl512_hashState* ctx) {
+  u8 i = 0;
+
+  /* output size (in bits) must be a positive integer less than or
+     equal to 512, and divisible by 8 */
+  if (LENGTH <= 0 || (LENGTH%8) || LENGTH > 512)
+    return BAD_HASHLEN;
+
+  /* set number of state columns and state size depending on
+     variant */
+  ctx->columns = COLS;
+  ctx->statesize = SIZE;
+#if (LENGTH <= 256)
+  ctx->v = SHORT;
+#else
+  ctx->v = LONG;
+#endif
+
+  SET_CONSTANTS();
+
+  for (i=0; i<SIZE/8; i++) ctx->chaining[i] = 0;
+  for (i=0; i<SIZE; i++) ctx->buffer[i] = 0;
+
+  if (ctx->chaining == NULL || ctx->buffer == NULL)
+    return FAIL;
+
+  /* set initial value */
+  ctx->chaining[ctx->columns-1] = U64BIG((u64)LENGTH);
+
+  INIT(ctx->chaining);
+
+  /* set other variables */
+  ctx->buf_ptr = 0;
+  ctx->block_counter = 0;
+  ctx->bits_in_last_byte = 0;
+
+  return SUCCESS;
+}
+
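+/* Illustrative sketch only, not part of the original patch: minimal use of
+   the streaming API defined in this file.  Note that databitlen counts BITS,
+   not bytes; example_hash and its byte-aligned input are assumptions of
+   this sketch. */
+#if 0
+static void example_hash(const unsigned char *msg,
+                         unsigned long long msgbytes,
+                         unsigned char digest[LENGTH/8])
+{
+  groestl512_hashState ctx;
+  groestl512_Init(&ctx);
+  groestl512_Update(&ctx, msg, (DataLength)msgbytes*8); /* length in bits */
+  groestl512_Final(&ctx, digest);
+}
+#endif
+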
+/* update state with databitlen bits of input */
+HashReturn groestl512_Update(groestl512_hashState* ctx,
+                             const BitSequence* input,
+                             DataLength databitlen) {
+  int index = 0;
+  int msglen = (int)(databitlen/8);
+  int rem = (int)(databitlen%8);
+
+  /* non-integral number of message bytes can only be supplied in the
+     last call to this function */
+  if (ctx->bits_in_last_byte) return FAIL;
+
+  /* if the buffer contains data that has not yet been digested, first
+     add data to buffer until full */
+  if (ctx->buf_ptr) {
+    while (ctx->buf_ptr < ctx->statesize && index < msglen) {
+      ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
+    }
+    if (ctx->buf_ptr < ctx->statesize) {
+      /* buffer still not full, return */
+      if (rem) {
+        ctx->bits_in_last_byte = rem;
+        ctx->buffer[(int)ctx->buf_ptr++] = input[index];
+      }
+      return SUCCESS;
+    }
+
+    /* digest buffer */
+    ctx->buf_ptr = 0;
+    Transform(ctx, ctx->buffer, ctx->statesize);
+  }
+
+  /* digest bulk of message */
+  Transform(ctx, input+index, msglen-index);
+  index += ((msglen-index)/ctx->statesize)*ctx->statesize;
+
+  /* store remaining data in buffer */
+  while (index < msglen) {
+    ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
+  }
+
+  /* if non-integral number of bytes have been supplied, store
+     remaining bits in last byte, together with information about
+     number of bits */
+  if (rem) {
+    ctx->bits_in_last_byte = rem;
+    ctx->buffer[(int)ctx->buf_ptr++] = input[index];
+  }
+  return SUCCESS;
+}
+
+#define BILB ctx->bits_in_last_byte
+
+/* finalise: process remaining data (including padding), perform
+   output transformation, and write hash result to 'output' */
+HashReturn groestl512_Final(groestl512_hashState* ctx,
+                            BitSequence* output) {
+  int i, j = 0, hashbytelen = LENGTH/8;
+  u8 *s = (BitSequence*)ctx->chaining;
+
+  /* pad with '1'-bit and first few '0'-bits */
+  if (BILB) {
+    ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<BILB)-1)<<(8-BILB);
+    ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-BILB);
+    BILB = 0;
+  }
+  else ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
+
+  /* pad with '0'-bits */
+  if (ctx->buf_ptr > ctx->statesize-LENGTHFIELDLEN) {
+    /* padding requires two blocks */
+    while (ctx->buf_ptr < ctx->statesize) {
+      ctx->buffer[(int)ctx->buf_ptr++] = 0;
+    }
+    /* digest first padding block */
+    Transform(ctx, ctx->buffer, ctx->statesize);
+    ctx->buf_ptr = 0;
+  }
+  while (ctx->buf_ptr < ctx->statesize-LENGTHFIELDLEN) {
+    ctx->buffer[(int)ctx->buf_ptr++] = 0;
+  }
+
+  /* length padding */
+  ctx->block_counter++;
+  ctx->buf_ptr = ctx->statesize;
+  while (ctx->buf_ptr > ctx->statesize-LENGTHFIELDLEN) {
+    ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter;
+    ctx->block_counter >>= 8;
+  }
+
+  /* digest final padding block */
+  Transform(ctx, ctx->buffer, ctx->statesize);
+  /* perform output transformation */
+  OutputTransformation(ctx);
+
+  /* store hash result in output */
+  for (i = ctx->statesize-hashbytelen; i < ctx->statesize; i++,j++) {
+    output[j] = s[i];
+  }
+
+  /* zeroise relevant variables and deallocate memory */
+
+  for (i = 0; i < ctx->columns; i++) {
+    ctx->chaining[i] = 0;
+  }
+
+  for (i = 0; i < ctx->statesize; i++) {
+    ctx->buffer[i] = 0;
+  }
+// free(ctx->chaining);
+// free(ctx->buffer);
+
+  return SUCCESS;
+}
+
+/* hash bit sequence */
+HashReturn groestl512_Hash(int hashbitlen,
+                           const BitSequence* data,
+                           DataLength databitlen,
+                           BitSequence* hashval) {
+  HashReturn ret;
+  groestl512_hashState context;
+
+  /* initialise */
+  if ((ret = groestl512_Init(&context)) != SUCCESS)
+    return ret;
+
+  /* process message */
+  if ((ret = groestl512_Update(&context, data, databitlen)) != SUCCESS)
+    return ret;
+
+  /* finalise */
+  ret = groestl512_Final(&context, hashval);
+
+  return ret;
+}
+
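The padding written by groestl512_Final() above is standard Merkle-Damgård
strengthening: one 0x80 marker byte, a zero fill, then the total block count
(message plus padding blocks) stored big-endian in the last
LENGTHFIELDLEN = 8 bytes of the final block. As a minimal sketch of the
resulting size, assuming the SIZE and LENGTHFIELDLEN constants from
groestl512.h below (padded_len itself is a hypothetical helper, not part of
this patch):

    /* bytes occupied by a byte-aligned message after padding:
       1 marker byte + 8 length-field bytes, rounded up to whole blocks */
    static unsigned long long padded_len(unsigned long long msgbytes)
    {
        unsigned long long blocks =
            (msgbytes + 1 + LENGTHFIELDLEN + SIZE - 1) / SIZE;
        return blocks * SIZE;
    }

With LENGTH == 512 (so SIZE == 128), an 80-byte input pads to a single
128-byte block: 0x80 at offset 80, zeros through offset 119, and the block
count 1 in bytes 120-127.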
diff --git a/groestl512.h b/groestl512.h
new file mode 100644
index 0000000..8cac9d3
--- /dev/null
+++ b/groestl512.h
@@ -0,0 +1,94 @@
+/* hash.h Aug 2011
+ *
+ * Groestl implementation for different versions.
+ * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
+ *
+ * This code is placed in the public domain
+ */
+
+#ifndef __hash_h
+#define __hash_h
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#define crypto_hash_BYTES 64
+#define LENGTH (crypto_hash_BYTES*8)
+typedef uint8_t u8;
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+#include "brg_endian.h"
+#define NEED_UINT_64T
+#include "brg_types.h"
+
+#ifdef IACA_TRACE
+  #include IACA_MARKS
+#endif
+
+#ifndef LENGTH
+#define LENGTH 256
+#endif
+
+/* some sizes (number of bytes) */
+#define ROWS 8
+#define LENGTHFIELDLEN ROWS
+#define COLS512 8
+#define COLS1024 16
+#define SIZE512 (ROWS*COLS512)
+#define SIZE1024 (ROWS*COLS1024)
+#define ROUNDS512 10
+#define ROUNDS1024 14
+
+#if LENGTH<=256
+#define COLS COLS512
+#define SIZE SIZE512
+#define ROUNDS ROUNDS512
+#else
+#define COLS COLS1024
+#define SIZE SIZE1024
+#define ROUNDS ROUNDS1024
+#endif
+
+#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
+
+#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
+#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
+#define U64BIG(a) (a)
+#endif /* IS_BIG_ENDIAN */
+
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
+#define U64BIG(a) \
+  ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
+   (ROTL64(a,24) & li_64(0000FF000000FF00)) | \
+   (ROTL64(a,40) & li_64(00FF000000FF0000)) | \
+   (ROTL64(a,56) & li_64(FF000000FF000000)))
+#endif /* IS_LITTLE_ENDIAN */
+
+typedef enum { LONG, SHORT } Var;
+
+/* NIST API begin */
+typedef unsigned char BitSequence;
+typedef unsigned long long DataLength;
+typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2 } HashReturn;
+typedef struct {
+  __attribute__ ((aligned (32))) u64 chaining[SIZE/8];     /* actual state */
+  __attribute__ ((aligned (32))) BitSequence buffer[SIZE]; /* data buffer */
+  u64 block_counter;     /* message block counter */
+  int buf_ptr;           /* data buffer pointer */
+  int bits_in_last_byte; /* no. of message bits in last byte of
+                            data buffer */
+  int columns;           /* no. of columns in state */
+  int statesize;         /* total no.
of bytes in state */ + Var v; /* LONG or SHORT */ +} groestl512_hashState; + +HashReturn groestl512_Init(groestl512_hashState*); +HashReturn groestl512_Update(groestl512_hashState*, const BitSequence*, DataLength); +HashReturn groestl512_Final(groestl512_hashState*, BitSequence*); +HashReturn groestl512_Hash(int, const BitSequence*, DataLength, BitSequence*); +/* NIST API end */ + +#endif /* __hash_h */ diff --git a/heavy.c b/heavy.c index 24efe59..478db47 100644 --- a/heavy.c +++ b/heavy.c @@ -4,8 +4,17 @@ #include "miner.h" #include "hefty1.h" #include "sph_keccak.h" -#include "sph_blake.h" -#include "sph_groestl.h" +#include "blake512_sse41.h" +#include "groestl512.h" + +static groestl512_hashState groestlCtx_init; +static sph_keccak512_context keccakCtx_init; + +void init_hashstates(void) +{ + groestl512_Init(&groestlCtx_init); + sph_keccak512_init(&keccakCtx_init); +} /* Combines top 64-bits from each hash into a single hash */ static void combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4) @@ -33,11 +42,11 @@ int heavycoin_scanhash(unsigned char* output, const unsigned char* input, int le HEFTY1(input, len, hash1); DATA_ALIGN64(uint32_t hash5[16]); - sph_blake512_context blakeCtx; - sph_blake512_init(&blakeCtx); - sph_blake512(&blakeCtx, input, len); - sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1)); - sph_blake512_close(&blakeCtx, (void *)&hash5); + blake512_state blakeCtx; + blake512_init(&blakeCtx); + blake512_update(&blakeCtx, input, len*8); + blake512_update(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1)*8); + blake512_final(&blakeCtx, (void *)&hash5); if ((*((unsigned char *)hash5 + 31) & 0xF0) != 0) return 0; @@ -52,7 +61,7 @@ int heavycoin_scanhash(unsigned char* output, const unsigned char* input, int le DATA_ALIGN64(uint32_t hash3[16]); sph_keccak512_context keccakCtx; - sph_keccak512_init(&keccakCtx); + memcpy(&keccakCtx, &keccakCtx_init, sizeof(keccakCtx)); sph_keccak512(&keccakCtx, input, len); sph_keccak512(&keccakCtx, hash1, sizeof(hash1)); sph_keccak512_close(&keccakCtx, (void *)&hash3); @@ -60,11 +69,11 @@ int heavycoin_scanhash(unsigned char* output, const unsigned char* input, int le return 0; DATA_ALIGN64(uint32_t hash4[16]); - sph_groestl512_context groestlCtx; - sph_groestl512_init(&groestlCtx); - sph_groestl512(&groestlCtx, input, len); - sph_groestl512(&groestlCtx, hash1, sizeof(hash1)); - sph_groestl512_close(&groestlCtx, (void *)&hash4); + groestl512_hashState groestlCtx; + memcpy(&groestlCtx, &groestlCtx_init, sizeof(groestlCtx)); + groestl512_Update(&groestlCtx, input, len*8); + groestl512_Update(&groestlCtx, hash1, sizeof(hash1)*8); + groestl512_Final(&groestlCtx, (void *)&hash4); if ((*((unsigned char *)hash4 + 31) & 0xF0) != 0) return 0; @@ -76,7 +85,7 @@ int heavycoin_scanhash(unsigned char* output, const unsigned char* input, int le void heavycoin_hash(unsigned char* output, const unsigned char* input, int len) { - unsigned char hash1[32]; + unsigned char DATA_ALIGN64(hash1[32]); HEFTY1(input, len, hash1); /* HEFTY1 is new, so take an extra security measure to eliminate @@ -86,7 +95,7 @@ void heavycoin_hash(unsigned char* output, const unsigned char* input, int len) * * N.B. '+' is concatenation. */ - unsigned char hash2[32];; + DATA_ALIGN64(unsigned char hash2[32]); SHA256_CTX ctx; SHA256_Init(&ctx); SHA256_Update(&ctx, input, len); @@ -99,26 +108,26 @@ void heavycoin_hash(unsigned char* output, const unsigned char* input, int len) * and BLAKE512. 
*/ - uint32_t hash3[16]; + DATA_ALIGN64(uint32_t hash3[16]); sph_keccak512_context keccakCtx; - sph_keccak512_init(&keccakCtx); + memcpy(&keccakCtx, &keccakCtx_init, sizeof(keccakCtx)); sph_keccak512(&keccakCtx, input, len); sph_keccak512(&keccakCtx, hash1, sizeof(hash1)); sph_keccak512_close(&keccakCtx, (void *)&hash3); - uint32_t hash4[16]; - sph_groestl512_context groestlCtx; - sph_groestl512_init(&groestlCtx); - sph_groestl512(&groestlCtx, input, len); - sph_groestl512(&groestlCtx, hash1, sizeof(hash1)); - sph_groestl512_close(&groestlCtx, (void *)&hash4); - - uint32_t hash5[16]; - sph_blake512_context blakeCtx; - sph_blake512_init(&blakeCtx); - sph_blake512(&blakeCtx, input, len); - sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1)); - sph_blake512_close(&blakeCtx, (void *)&hash5); + DATA_ALIGN64(uint32_t hash4[16]); + groestl512_hashState groestlCtx; + memcpy(&groestlCtx, &groestlCtx_init, sizeof(groestlCtx)); + groestl512_Update(&groestlCtx, input, len*8); + groestl512_Update(&groestlCtx, hash1, sizeof(hash1)*8); + groestl512_Final(&groestlCtx, (void *)&hash4); + + DATA_ALIGN64(uint32_t hash5[16]); + blake512_state blakeCtx; + blake512_init(&blakeCtx); + blake512_update(&blakeCtx, input, len*8); + blake512_update(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1)*8); + blake512_final(&blakeCtx, (void *)&hash5); uint32_t *final = (uint32_t *)output; combine_hashes(final, (uint32_t *)hash2, hash3, hash4, hash5); diff --git a/hefty1.c b/hefty1.c index a8808c4..a0588d9 100644 --- a/hefty1.c +++ b/hefty1.c @@ -34,6 +34,8 @@ #include "hefty1.h" +#define NDEBUG + #define Min(A, B) (A <= B ? A : B) #define RoundFunc(ctx, A, B, C, D, E, F, G, H, W, K) \ { \ @@ -158,52 +160,59 @@ static inline uint8_t Smoosh4(uint8_t X) /* Smoosh 32-bit word into 2-bits */ static inline uint8_t Smoosh2(uint32_t X) { - uint16_t w = (X >> 16) ^ (X & 0xffff); - uint8_t n = Smoosh4((w >> 8) ^ (w & 0xff)); - return (n >> 2) ^ (n & 0x3); + X ^= X >> 16; + X ^= X >> 8; + X ^= X >> 4; + X ^= X >> 2; + return X & 3; } static void Mangle(uint32_t *S) { uint32_t *R = S; uint32_t *C = &S[1]; - uint8_t r0 = Smoosh4(R[0] >> 24); uint8_t r1 = Smoosh4(R[0] >> 16); uint8_t r2 = Smoosh4(R[0] >> 8); uint8_t r3 = Smoosh4(R[0] & 0xff); - int i; - - /* Diffuse */ uint32_t tmp = 0; - for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++) { - uint8_t r = Smoosh2(tmp); - switch (r) { - case 0: - C[i] ^= Rr(R[0], i + r0); - break; - case 1: - C[i] += Rr(~R[0], i + r1); - break; - case 2: - C[i] &= Rr(~R[0], i + r2); - break; - case 3: - C[i] ^= Rr(R[0], i + r3); - break; - } - tmp ^= C[i]; + C[0] ^= Rr(R[0], r0); + tmp ^= C[0]; + + switch (Smoosh2(tmp)) { + case 0: + C[1] ^= Rr(R[0], 1 + r0); + break; + case 1: + C[1] += Rr(~R[0], 1 + r1); + break; + case 2: + C[1] &= Rr(~R[0], 1 + r2); + break; + case 3: + C[1] ^= Rr(R[0], 1 + r3); + break; + } + tmp ^= C[1]; + + switch (Smoosh2(tmp)) { + case 0: + C[2] ^= Rr(R[0], 2 + r0); + break; + case 1: + C[2] += Rr(~R[0], 2 + r1); + break; + case 2: + C[2] &= Rr(~R[0], 2 + r2); + break; + case 3: + C[2] ^= Rr(R[0], 2 + r3); + break; } /* Compress */ - tmp = 0; - for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++) - if (i % 2) - tmp ^= C[i]; - else - tmp += C[i]; - R[0] ^= tmp; + R[0] ^= ((C[0] ^ C[1]) + C[2]); } static void Absorb(uint32_t *S, uint32_t X) @@ -224,26 +233,24 @@ static uint32_t Squeeze(uint32_t *S) static inline uint32_t Br(HEFTY1_CTX *ctx, uint32_t X) { uint32_t R = Squeeze(ctx->sponge); - uint8_t r0 = R >> 8; uint8_t r1 = R & 0xff; - - uint32_t Y = 1 << (r0 % 32); + uint32_t Y; 
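+    /* Y is only needed when a bit is actually cleared, set, or flipped,
+       so it is computed lazily in the cases below; the r1 % 4 == 0 path
+       returns X without paying for the variable shift */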
switch (r1 % 4) { case 0: - /* Do nothing */ - break; + return X; case 1: + Y = 1 << (r0 % 32); return X & ~Y; case 2: + Y = 1 << (r0 % 32); return X | Y; case 3: + Y = 1 << (r0 % 32); return X ^ Y; } - - return X; } static void HashBlock(HEFTY1_CTX *ctx) @@ -287,16 +294,6 @@ static void HashBlock(HEFTY1_CTX *ctx) ctx->h[6] += G; ctx->h[7] += H; - A = 0; - B = 0; - C = 0; - D = 0; - E = 0; - F = 0; - G = 0; - H = 0; - - memset(W, 0, sizeof(W)); } /* Public interface */ @@ -359,7 +356,6 @@ void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *ctx) ctx->h[i] = Reverse32(ctx->h[i]); memcpy(digest, ctx->h, sizeof(ctx->h)); - memset(ctx, 0, sizeof(HEFTY1_CTX)); } unsigned char* HEFTY1(const unsigned char *buf, size_t len, unsigned char *digest)