#ifndef CRYPTOPP_IMPORTS
#ifndef CRYPTOPP_GENERATE_X64_MASM

#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS)
# define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1
#endif

// MAYBE_CONST drops the const qualifier on the AES-NI subkey pointers for SunCC 5.13+ (compiler workaround)
#if (__SUNPRO_CC >= 0x5130)
# define MAYBE_CONST
#else
# define MAYBE_CONST const
#endif

#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
# if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
using namespace rdtable;
# else
static word64 Te[256];
# endif
static word64 Td[256];
#else // Not CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
# if defined(CRYPTOPP_X64_MASM_AVAILABLE)
namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
# endif
CRYPTOPP_ALIGN_DATA(16) static word32 Te[256*4];
CRYPTOPP_ALIGN_DATA(16) static word32 Td[256*4];
#endif // CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS

static volatile bool s_TeFilled = false, s_TdFilled = false;
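
// The QUARTER_ROUND* macros below each consume one 32-bit state word t: every
// byte of t indexes a table entry that is XORed into one of the four output
// words (a, b, c, d).  The _LE and _LD variants implement the final round,
// which needs only the (inverse) S-box byte rather than a full table word.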
#define QUARTER_ROUND(L, T, t, a, b, c, d)	\
	a ^= L(T, 3, byte(t)); t >>= 8;\
	b ^= L(T, 2, byte(t)); t >>= 8;\
	c ^= L(T, 1, byte(t)); t >>= 8;\
	d ^= L(T, 0, t);

#define QUARTER_ROUND_LE(t, a, b, c, d)	\
	tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
	tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
	tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
	tempBlock[d] = ((byte *)(Te+t))[1];
#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
#define QUARTER_ROUND_LD(t, a, b, c, d)	\
	tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
	tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
	tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
	tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
#else
#define QUARTER_ROUND_LD(t, a, b, c, d)	\
	tempBlock[a] = Sd[byte(t)]; t >>= 8;\
	tempBlock[b] = Sd[byte(t)]; t >>= 8;\
	tempBlock[c] = Sd[byte(t)]; t >>= 8;\
	tempBlock[d] = Sd[t];
#endif

#define QUARTER_ROUND_E(t, a, b, c, d)	QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
#define QUARTER_ROUND_D(t, a, b, c, d)	QUARTER_ROUND(TL_M, Td, t, a, b, c, d)
#ifdef IS_LITTLE_ENDIAN
	#define QUARTER_ROUND_FE(t, a, b, c, d)	QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
	#define QUARTER_ROUND_FD(t, a, b, c, d)	QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
	#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
		#define TL_F(T, i, x)	(*(word32 *)(void *)((byte *)T + x*8 + (6-i)%4+1))
		#define TL_M(T, i, x)	(*(word32 *)(void *)((byte *)T + x*8 + (i+3)%4+1))
	#else
		#define TL_F(T, i, x)	rotrFixed(T[x], (3-i)*8)
		#define TL_M(T, i, x)	T[i*256 + x]
	#endif
#else
	#define QUARTER_ROUND_FE(t, a, b, c, d)	QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
	#define QUARTER_ROUND_FD(t, a, b, c, d)	QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
	#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
		#define TL_F(T, i, x)	(*(word32 *)(void *)((byte *)T + x*8 + (4-i)%4))
		#define TL_M			TL_F
	#else
		#define TL_F(T, i, x)	rotrFixed(T[x], i*8)
		#define TL_M(T, i, x)	T[i*256 + x]
	#endif
#endif
#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

#define f3(x) (f2(x) ^ x)
#define f9(x) (f8(x) ^ x)
#define fb(x) (f8(x) ^ f2(x) ^ x)
#define fd(x) (f8(x) ^ f4(x) ^ x)
#define fe(x) (f8(x) ^ f4(x) ^ f2(x))
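
// f2/f4/f8 multiply a byte by 2, 4 and 8 in GF(2^8) modulo the AES polynomial
// 0x11b; f3, f9, fb, fd and fe combine them into the MixColumns (1, 2, 3) and
// InvMixColumns (9, b, d, e) coefficients.  FillEncTable and FillDecTable use
// these to precompute the combined SubBytes+MixColumns (Te) and
// InvSubBytes+InvMixColumns (Td) lookup tables.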
void Rijndael::Base::FillEncTable()
{
	for (int i=0; i<256; i++)
	{
		byte x = Se[i];
#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
		word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
		Te[i] = word64(y | f3(x))<<32 | y;
#else
		word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
		for (int j=0; j<4; j++)
		{
			Te[i+j*256] = y;
			y = rotlFixed(y, 8);
		}
#endif
	}
#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
	Te[256] = Te[257] = 0;
#endif
	s_TeFilled = true;
}
void Rijndael::Base::FillDecTable()
{
	for (int i=0; i<256; i++)
	{
		byte x = Sd[i];
#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
		word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
		Td[i] = word64(y | fb(x))<<32 | y | x;
#else
		word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
		for (int j=0; j<4; j++)
		{
			Td[i+j*256] = y;
			y = rotlFixed(y, 8);
		}
#endif
	}
	s_TdFilled = true;
}
void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
{
	AssertValidKeyLength(keylen);

	m_rounds = keylen/4 + 6;
	m_key.New(4*(m_rounds+1));
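	// 16-, 24- and 32-byte keys give keylen/4 = 4, 6 or 8 words and hence 10,
	// 12 or 14 rounds; the schedule holds 4*(m_rounds+1) 32-bit words (one
	// 128-bit round key per round plus the initial whitening key).  Two
	// expansion paths follow: an AES-NI path built on _mm_aeskeygenassist_si128
	// and a portable path using the Se S-box and the rcon constants.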
	word32 *rk = m_key;

#if (CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
	static const word32 rcLE[] = {
		0x01, 0x02, 0x04, 0x08,
		0x10, 0x20, 0x40, 0x80,
		0x1B, 0x36			// AES never needs more than 10 round constants
	};
	const word32 *rc = rcLE;

	__m128i temp = _mm_loadu_si128((__m128i *)(void *)(userKey+keylen-16));
	memcpy(rk, userKey, keylen);

	rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
	rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
	rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
	rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
	if (rk + keylen/4 + 4 == m_key.end())
		break;

	if (keylen == 24)
	{
		rk[10] = rk[ 4] ^ rk[ 9];
		rk[11] = rk[ 5] ^ rk[10];
		temp = _mm_insert_epi32(temp, rk[11], 3);
	}
	else if (keylen == 32)
	{
		temp = _mm_insert_epi32(temp, rk[11], 3);
		rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
		rk[13] = rk[ 5] ^ rk[12];
		rk[14] = rk[ 6] ^ rk[13];
		rk[15] = rk[ 7] ^ rk[14];
		temp = _mm_insert_epi32(temp, rk[15], 3);
	}
	else
		temp = _mm_insert_epi32(temp, rk[7], 3);
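
	// For decryption, the AES-NI path below reorders the schedule: the first and
	// last round keys are swapped end for end and the middle keys are run through
	// _mm_aesimc_si128 (InvMixColumns), producing the "equivalent inverse cipher"
	// schedule that the AESDEC-based decryption routines expect.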
	if (!IsForwardTransformation())
	{
		rk = m_key;
		unsigned int i, j;

#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
		vec_swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));
#else
		std::swap(*(__m128i *)(void *)(rk), *(__m128i *)(void *)(rk+4*m_rounds));
#endif

		for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
		{
			temp = _mm_aesimc_si128(*(__m128i *)(void *)(rk+i));
			*(__m128i *)(void *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(void *)(rk+j));
			*(__m128i *)(void *)(rk+j) = temp;
		}

		*(__m128i *)(void *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(void *)(rk+i));
	const word32 *rc = rcon;
	word32 temp;

	while (true)
	{
		temp = rk[keylen/4-1];
		word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
		rk[keylen/4] = rk[0] ^ x ^ *(rc++);
		rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
		rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
		rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];

		if (rk + keylen/4 + 4 == m_key.end())
			break;

		if (keylen == 24)
		{
			rk[10] = rk[ 4] ^ rk[ 9];
			rk[11] = rk[ 5] ^ rk[10];
		}
		else if (keylen == 32)
		{
			temp = rk[11];
			rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
			rk[13] = rk[ 5] ^ rk[12];
			rk[14] = rk[ 6] ^ rk[13];
			rk[15] = rk[ 7] ^ rk[14];
		}
		rk += keylen/4;
	}
	if (IsForwardTransformation())

#define InverseMixColumn(x)	TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
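	// Td folds InvSubBytes into InvMixColumns, so indexing it with Se[...]
	// cancels the S-box and InverseMixColumn(x) computes plain InvMixColumns(x).
	// Applying it to the middle round keys below yields the
	// equivalent-inverse-cipher schedule used by the table-driven decryption path.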
	for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
	{
		temp = InverseMixColumn(rk[i    ]); rk[i    ] = InverseMixColumn(rk[j    ]); rk[j    ] = temp;
		temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
		temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
		temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
	}

	rk[i+0] = InverseMixColumn(rk[i+0]);
	rk[i+1] = InverseMixColumn(rk[i+1]);
	rk[i+2] = InverseMixColumn(rk[i+2]);
	rk[i+3] = InverseMixColumn(rk[i+3]);

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
	if (HasSSE2())
#else
	if (HasAESNI())
#endif
		return (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
#endif

	word32 s0, s1, s2, s3, t0, t1, t2, t3;
	Block::Get(inBlock)(s0)(s1)(s2)(s3);

	const word32 *rk = m_key;
	s0 ^= rk[0]; s1 ^= rk[1]; s2 ^= rk[2]; s3 ^= rk[3];
	t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
	rk += 8;

	const int cacheLineSize = GetCacheLineSize();
	unsigned int i;
	volatile word32 _u = 0;
	word32 u = _u;

#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
	for (i=0; i<2048; i+=cacheLineSize)
#else
	for (i=0; i<1024; i+=cacheLineSize)
#endif
		u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
	s0 |= u; s1 |= u; s2 |= u; s3 |= u;
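	// The loop above touches one word from every cache line of Te before any
	// key-dependent indexing starts (u is derived from a volatile zero, and
	// OR-ing it into the state keeps the reads from being optimized away).
	// This serves as a countermeasure against cache-timing attacks on the
	// table-driven implementation.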
	QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
	QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
	QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
	QUARTER_ROUND_FE(s0, t1, t2, t3, t0)

	unsigned int r = m_rounds/2 - 1;
	do
	{
		s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

		QUARTER_ROUND_E(t3, s0, s1, s2, s3)
		QUARTER_ROUND_E(t2, s3, s0, s1, s2)
		QUARTER_ROUND_E(t1, s2, s3, s0, s1)
		QUARTER_ROUND_E(t0, s1, s2, s3, s0)

		t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

		QUARTER_ROUND_E(s3, t0, t1, t2, t3)
		QUARTER_ROUND_E(s2, t3, t0, t1, t2)
		QUARTER_ROUND_E(s1, t2, t3, t0, t1)
		QUARTER_ROUND_E(s0, t1, t2, t3, t0)

		rk += 8;
	} while (--r);

	word32 tbw[4];
	byte *const tempBlock = (byte *)tbw;

	QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
	QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
	QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
	QUARTER_ROUND_LE(t3, 3, 6, 9, 12)

	Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}
void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
	if (HasAESNI())
	{
		Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
		return;
	}
#endif

	word32 s0, s1, s2, s3, t0, t1, t2, t3;
	Block::Get(inBlock)(s0)(s1)(s2)(s3);

	const word32 *rk = m_key;
	s0 ^= rk[0]; s1 ^= rk[1]; s2 ^= rk[2]; s3 ^= rk[3];
	t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
	rk += 8;

	const int cacheLineSize = GetCacheLineSize();
	unsigned int i;
	volatile word32 _u = 0;
	word32 u = _u;

#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
	for (i=0; i<2048; i+=cacheLineSize)
#else
	for (i=0; i<1024; i+=cacheLineSize)
#endif
		u &= *(const word32 *)(const void *)(((const byte *)Td)+i);
	s0 |= u; s1 |= u; s2 |= u; s3 |= u;
	QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
	QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
	QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
	QUARTER_ROUND_FD(s0, t3, t2, t1, t0)

	unsigned int r = m_rounds/2 - 1;
	do
	{
		s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

		QUARTER_ROUND_D(t3, s2, s1, s0, s3)
		QUARTER_ROUND_D(t2, s1, s0, s3, s2)
		QUARTER_ROUND_D(t1, s0, s3, s2, s1)
		QUARTER_ROUND_D(t0, s3, s2, s1, s0)

		t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

		QUARTER_ROUND_D(s3, t2, t1, t0, t3)
		QUARTER_ROUND_D(s2, t1, t0, t3, t2)
		QUARTER_ROUND_D(s1, t0, t3, t2, t1)
		QUARTER_ROUND_D(s0, t3, t2, t1, t0)

		rk += 8;
	} while (--r);

#if !(defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS))
	// timing attack countermeasure: touch the inverse S-box before the
	// byte-wise final round below
	for (i=0; i<256; i+=cacheLineSize)
		u &= *(const word32 *)(const void *)(Sd+i);
	u &= *(const word32 *)(const void *)(Sd+252);
	t0 |= u; t1 |= u; t2 |= u; t3 |= u;
#endif

	word32 tbw[4];
	byte *const tempBlock = (byte *)tbw;

	QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
	QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
	QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
	QUARTER_ROUND_LD(t3, 11, 6, 1, 12)

	Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}
#if CRYPTOPP_MSC_VERSION
# pragma warning(disable: 4731)	// frame pointer register 'ebp' modified by inline assembly code
#endif

#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
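
// Hand-written SSE2/MMX implementation of the bulk encryption loop.  It works
// on the 256-byte-aligned Locals block that Rijndael::Enc::AdvancedProcessBlocks
// (further below) prepares on the stack -- subkeys, input/output pointers,
// increments and length -- and addresses the word64 Te table through AS_REG_7.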
CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
{
	CRYPTOPP_UNUSED(locals); CRYPTOPP_UNUSED(k);

#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32

#define L_INDEX(i) (L_REG+768+i)
#define L_INXORBLOCKS L_INBLOCKS+4
#define L_OUTXORBLOCKS L_INBLOCKS+8
#define L_OUTBLOCKS L_INBLOCKS+12
#define L_INCREMENTS L_INDEX(16*15)
#define L_SP L_INDEX(16*16)
#define L_LENGTH L_INDEX(16*16+4)
#define L_KEYS_BEGIN L_INDEX(16*16+8)

#define MXOR(a,b,c) \
	AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
	AS2( pxor MM(a), mm7)\

#define MMOV(a,b,c) \
	AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#else

#define L_INDEX(i) (L_REG+i)
#define L_INXORBLOCKS L_INBLOCKS+8
#define L_OUTXORBLOCKS L_INBLOCKS+16
#define L_OUTBLOCKS L_INBLOCKS+24
#define L_INCREMENTS L_INDEX(16*16)
#define L_LENGTH L_INDEX(16*18+8)
#define L_KEYS_BEGIN L_INDEX(16*19)

#define MXOR(a,b,c) \
	AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MMOV(a,b,c) \
	AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#endif

#define L_SUBKEYS L_INDEX(0)
#define L_SAVED_X L_SUBKEYS
#define L_KEY12 L_INDEX(16*12)
#define L_LASTROUND L_INDEX(16*13)
#define L_INBLOCKS L_INDEX(16*14)
#define MAP0TO4(i) (ASM_MOD(i+3,4)+1)

	AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

	AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
#ifdef CRYPTOPP_GENERATE_X64_MASM
Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
	mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
	mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
#elif defined(__GNUC__)
#if CRYPTOPP_BOOL_X64
	AS2( mov AS_REG_7, WORD_REG(si))
	AS2( lea AS_REG_7, [Te])
	AS2( mov edi, [g_cacheLineSize])
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
	AS2( mov [ecx+16*12+16*4], esp)
	AS2( lea esp, [ecx-768])
	AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
	AS2( mov WORD_REG(ax), 16)
	AS2( and WORD_REG(ax), WORD_REG(si))
	AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)])
	AS2( movdqa [L_KEY12], xmm3)
	AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
	AS2( sub WORD_REG(ax), WORD_REG(si))
	AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
	AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
	AS2( add WORD_REG(si), 16)
	AS2( cmp WORD_REG(si), 16*12)
	AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)])
	AS2( movdqa xmm1, [WORD_REG(dx)])
	AS2( MOVD MM(1), [WORD_REG(dx)+4*4])
	AS2( mov ebx, [WORD_REG(dx)+5*4])
	AS2( mov ecx, [WORD_REG(dx)+6*4])
	AS2( mov edx, [WORD_REG(dx)+7*4])
	AS2( xor WORD_REG(ax), WORD_REG(ax))
	AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
	AS2( add WORD_REG(ax), WORD_REG(di))
	AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
	AS2( add WORD_REG(ax), WORD_REG(di))
	AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
	AS2( add WORD_REG(ax), WORD_REG(di))
	AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
	AS2( add WORD_REG(ax), WORD_REG(di))
	AS2( cmp WORD_REG(ax), 2048)
	AS2( test DWORD PTR [L_LENGTH], 1)
	AS2( mov WORD_REG(si), [L_INBLOCKS])
	AS2( movdqu xmm2, [WORD_REG(si)])
	AS2( pxor xmm2, xmm1)
	AS2( psrldq xmm1, 14)
	AS2( mov al, BYTE PTR [WORD_REG(si)+15])
	AS2( MOVD MM(2), eax)
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
	AS2( mov eax, [L_KEY12+0*4])
	AS2( mov edi, [L_KEY12+2*4])
	AS2( MOVD MM(0), [L_KEY12+3*4])
	AS2( xor ebx, [L_KEY12+1*4])
	AS2( MOVD edx, MM(1))
	AS2( MOVD [L_SAVED_X+3*4], MM(0))
	AS2( mov [L_SAVED_X+0*4], eax)
	AS2( mov [L_SAVED_X+1*4], ebx)
	AS2( mov [L_SAVED_X+2*4], edi)
	AS2( MOVD MM(1), [L_KEY12+0*4])
	AS2( mov ebx, [L_KEY12+1*4])
	AS2( mov ecx, [L_KEY12+2*4])
	AS2( mov edx, [L_KEY12+3*4])
	AS2( mov WORD_REG(ax), [L_INBLOCKS])
	AS2( movdqu xmm2, [WORD_REG(ax)])
	AS2( mov WORD_REG(si), [L_INXORBLOCKS])
	AS2( movdqu xmm5, [WORD_REG(si)])
	AS2( pxor xmm2, xmm1)
	AS2( pxor xmm2, xmm5)
	AS2( MOVD eax, MM(1))
	AS2( add L_REG, [L_KEYS_BEGIN])
	AS2( add L_REG, 4*16)
	AS2( MOVD ecx, MM(2))
	AS2( MOVD edx, MM(1))
	AS2( mov eax, [L_SAVED_X+0*4])
	AS2( mov ebx, [L_SAVED_X+1*4])
	AS2( and WORD_REG(cx), 255)
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
	AS2( paddb MM(2), mm3)
	AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
	AS2( xor ecx, [L_SAVED_X+2*4])
	AS2( xor edx, [L_SAVED_X+3*4])
	AS2( add L_REG, [L_KEYS_BEGIN])
	AS2( add L_REG, 3*16)
	AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
	AS2( mov edi, [L_SUBKEYS-4*16+2*4])
	AS2( xor eax, [L_SUBKEYS-4*16+0*4])
	AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
	AS2( MOVD edx, MM(0))
	AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
	AS2( mov edi, [L_SUBKEYS-4*16+6*4])
	AS2( xor eax, [L_SUBKEYS-4*16+4*4])
	AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
	AS2( MOVD edx, MM(0))
	AS2( test L_REG, 255)
	AS2( sub L_REG, 16*16)

#define LAST(a, b, c) \
	AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
	AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
	AS2( mov WORD PTR [L_LASTROUND+c], di )\
	AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
	AS2( mov WORD_REG(bx), [L_OUTBLOCKS])
	AS2( mov WORD_REG(cx), [L_LENGTH])
	AS2( sub WORD_REG(cx), 16)
	AS2( movdqu xmm2, [WORD_REG(ax)])
	AS2( pxor xmm2, xmm4)
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
	AS2( movdqa xmm0, [L_INCREMENTS])
	AS2( paddd xmm0, [L_INBLOCKS])
	AS2( movdqa [L_INBLOCKS], xmm0)
	AS2( movdqa xmm0, [L_INCREMENTS+16])
	AS2( paddq xmm0, [L_INBLOCKS+16])
	AS2( movdqa [L_INBLOCKS+16], xmm0)
	AS2( pxor xmm2, [L_LASTROUND])
	AS2( movdqu [WORD_REG(bx)], xmm2)
	AS2( mov [L_LENGTH], WORD_REG(cx))
	AS2( test WORD_REG(cx), 1)
#if CRYPTOPP_BOOL_X64
	AS2( movdqa xmm0, [L_INCREMENTS])
	AS2( paddq xmm0, [L_INBLOCKS])
	AS2( movdqa [L_INBLOCKS], xmm0)
	AS2( xorps xmm0, xmm0)
	AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
	AS2( movaps [WORD_REG(ax)-7*16], xmm0)
	AS2( movaps [WORD_REG(ax)-6*16], xmm0)
	AS2( movaps [WORD_REG(ax)-5*16], xmm0)
	AS2( movaps [WORD_REG(ax)-4*16], xmm0)
	AS2( movaps [WORD_REG(ax)-3*16], xmm0)
	AS2( movaps [WORD_REG(ax)-2*16], xmm0)
	AS2( movaps [WORD_REG(ax)-1*16], xmm0)
	AS2( movaps [WORD_REG(ax)+0*16], xmm0)
	AS2( movaps [WORD_REG(ax)+1*16], xmm0)
	AS2( movaps [WORD_REG(ax)+2*16], xmm0)
	AS2( movaps [WORD_REG(ax)+3*16], xmm0)
	AS2( movaps [WORD_REG(ax)+4*16], xmm0)
	AS2( movaps [WORD_REG(ax)+5*16], xmm0)
	AS2( movaps [WORD_REG(ax)+6*16], xmm0)
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
	AS2( mov esp, [L_SP])
#if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
#ifdef CRYPTOPP_GENERATE_X64_MASM
Rijndael_Enc_AdvancedProcessBlocks ENDP
1017 :
"c" (locals),
"d" (k),
"S" (Te),
"D" (g_cacheLineSize)
1018 :
"memory",
"cc",
"%eax"
1019 #
if CRYPTOPP_BOOL_X64
1020 ,
"%rbx",
"%r8",
"%r9",
"%r10",
"%r11",
"%r12"
#ifndef CRYPTOPP_GENERATE_X64_MASM

#ifdef CRYPTOPP_X64_MASM_AVAILABLE
void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
#endif

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
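
// AliasedWithTable reports whether [begin, end) overlaps the Te table's offsets
// within a 4 KB page; AdvancedProcessBlocks below retries its alloca placement
// until the on-stack Locals workspace does not alias the table this way.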
static inline bool AliasedWithTable(const byte *begin, const byte *end)
{
	size_t s0 = size_t(begin)%4096, s1 = size_t(end)%4096;
	size_t t0 = size_t(Te)%4096, t1 = (size_t(Te)+sizeof(Te))%4096;
	if (t1 > t0)
		return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
	else
		return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
}
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
	block = _mm_xor_si128(block, subkeys[0]);
	for (unsigned int i=1; i<rounds-1; i+=2)
	{
		block = _mm_aesenc_si128(block, subkeys[i]);
		block = _mm_aesenc_si128(block, subkeys[i+1]);
	}
	block = _mm_aesenc_si128(block, subkeys[rounds-1]);
	block = _mm_aesenclast_si128(block, subkeys[rounds]);
}
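
// AESNI_Enc_Block runs a single block through the full round sequence.  The
// 4-block variants below interleave four independent blocks so the multi-cycle
// latency of AESENC/AESDEC is hidden by keeping several instructions in flight.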
inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
	__m128i rk = subkeys[0];
	block0 = _mm_xor_si128(block0, rk);
	block1 = _mm_xor_si128(block1, rk);
	block2 = _mm_xor_si128(block2, rk);
	block3 = _mm_xor_si128(block3, rk);
	for (unsigned int i=1; i<rounds; i++)
	{
		rk = subkeys[i];
		block0 = _mm_aesenc_si128(block0, rk);
		block1 = _mm_aesenc_si128(block1, rk);
		block2 = _mm_aesenc_si128(block2, rk);
		block3 = _mm_aesenc_si128(block3, rk);
	}
	rk = subkeys[rounds];
	block0 = _mm_aesenclast_si128(block0, rk);
	block1 = _mm_aesenclast_si128(block1, rk);
	block2 = _mm_aesenclast_si128(block2, rk);
	block3 = _mm_aesenclast_si128(block3, rk);
}
inline void AESNI_Dec_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
	block = _mm_xor_si128(block, subkeys[0]);
	for (unsigned int i=1; i<rounds-1; i+=2)
	{
		block = _mm_aesdec_si128(block, subkeys[i]);
		block = _mm_aesdec_si128(block, subkeys[i+1]);
	}
	block = _mm_aesdec_si128(block, subkeys[rounds-1]);
	block = _mm_aesdeclast_si128(block, subkeys[rounds]);
}
inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
	__m128i rk = subkeys[0];
	block0 = _mm_xor_si128(block0, rk);
	block1 = _mm_xor_si128(block1, rk);
	block2 = _mm_xor_si128(block2, rk);
	block3 = _mm_xor_si128(block3, rk);
	for (unsigned int i=1; i<rounds; i++)
	{
		rk = subkeys[i];
		block0 = _mm_aesdec_si128(block0, rk);
		block1 = _mm_aesdec_si128(block1, rk);
		block2 = _mm_aesdec_si128(block2, rk);
		block3 = _mm_aesdec_si128(block3, rk);
	}
	rk = subkeys[rounds];
	block0 = _mm_aesdeclast_si128(block0, rk);
	block1 = _mm_aesdeclast_si128(block1, rk);
	block2 = _mm_aesdeclast_si128(block2, rk);
	block3 = _mm_aesdeclast_si128(block3, rk);
}
CRYPTOPP_ALIGN_DATA(16)
static const word32 s_one[] = {0, 0, 0, 1<<24};
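
// s_one places a 1 in the byte that maps to the last byte of a block, so
// _mm_add_epi32 with it advances a big-endian counter stored in the block's
// final word; the CTR branch of the parallel loop below uses it to derive
// block1..block3 from block0 and to write back the next counter value.  Note
// that carries out of that 32-bit lane are not propagated here.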
template <typename F1, typename F4>
inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, MAYBE_CONST __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
	size_t blockSize = 16;
	size_t xorIncrement = xorBlocks ? blockSize : 0;

	if (flags & BlockTransformation::BT_ReverseDirection)
	{
		assert(length % blockSize == 0);
		inBlocks += length - blockSize;
		xorBlocks += length - blockSize;
		outBlocks += length - blockSize;
		inIncrement = 0-inIncrement;
		xorIncrement = 0-xorIncrement;
		outIncrement = 0-outIncrement;
	}
	if (flags & BlockTransformation::BT_AllowParallel)
	{
		while (length >= 4*blockSize)
		{
			__m128i block0 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks), block1, block2, block3;
			if (flags & BlockTransformation::BT_InBlockIsCounter)
			{
				const __m128i be1 = *(const __m128i *)(const void *)s_one;
				block1 = _mm_add_epi32(block0, be1);
				block2 = _mm_add_epi32(block1, be1);
				block3 = _mm_add_epi32(block2, be1);
				_mm_storeu_si128((__m128i *)(void *)inBlocks, _mm_add_epi32(block3, be1));
			}
			else
			{
				inBlocks += inIncrement;
				block1 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
				inBlocks += inIncrement;
				block2 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
				inBlocks += inIncrement;
				block3 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
				inBlocks += inIncrement;
			}
			if (flags & BlockTransformation::BT_XorInput)
			{
				block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
				xorBlocks += xorIncrement;
				block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
				xorBlocks += xorIncrement;
				block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
				xorBlocks += xorIncrement;
				block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
				xorBlocks += xorIncrement;
			}

			func4(block0, block1, block2, block3, subkeys, rounds);

			if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
			{
				block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
				xorBlocks += xorIncrement;
				block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
				xorBlocks += xorIncrement;
				block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
				xorBlocks += xorIncrement;
				block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
				xorBlocks += xorIncrement;
			}

			_mm_storeu_si128((__m128i *)(void *)outBlocks, block0);
			outBlocks += outIncrement;
			_mm_storeu_si128((__m128i *)(void *)outBlocks, block1);
			outBlocks += outIncrement;
			_mm_storeu_si128((__m128i *)(void *)outBlocks, block2);
			outBlocks += outIncrement;
			_mm_storeu_si128((__m128i *)(void *)outBlocks, block3);
			outBlocks += outIncrement;

			length -= 4*blockSize;
		}
	}
	while (length >= blockSize)
	{
		__m128i block = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);

		if (flags & BlockTransformation::BT_XorInput)
			block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));

		if (flags & BlockTransformation::BT_InBlockIsCounter)
			const_cast<byte *>(inBlocks)[15]++;

		func1(block, subkeys, rounds);

		if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
			block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));

		_mm_storeu_si128((__m128i *)(void *)outBlocks, block);

		inBlocks += inIncrement;
		outBlocks += outIncrement;
		xorBlocks += xorIncrement;
		length -= blockSize;
	}

	return length;
}
size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
	if (HasAESNI())
		return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (MAYBE_CONST __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
	if (HasSSE2())
	{
		if (length < BLOCKSIZE)
			return length;

		struct Locals
		{
			word32 subkeys[4*12], workspace[8];
			const byte *inBlocks, *inXorBlocks, *outXorBlocks;
			byte *outBlocks;
			size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
			size_t regSpill, lengthAndCounterFlag, keysBegin;
		};

		const byte* zeros = (byte *)(Te+256);
		byte *space = NULL;

		do {
#if (CRYPTOPP_MSC_VERSION >= 1400)
			space = (byte *)_malloca(255+sizeof(Locals));
			space += (256-(size_t)space%256)%256;
#else
			space = (byte *)alloca(255+sizeof(Locals));
			space += (256-(size_t)space%256)%256;
#endif
		}
		while (AliasedWithTable(space, space+sizeof(Locals)));
		size_t increment = BLOCKSIZE;
		if (flags & BT_ReverseDirection)
		{
			assert(length % BLOCKSIZE == 0);
			inBlocks += length - BLOCKSIZE;
			xorBlocks += length - BLOCKSIZE;
			outBlocks += length - BLOCKSIZE;
			increment = 0-increment;
		}

		Locals &locals = *(Locals *)(void *)space;

		locals.inBlocks = inBlocks;
		locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
		locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
		locals.outBlocks = outBlocks;

		locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
		locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
		locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
		locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;

		locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
		int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
		locals.keysBegin = (12-keysToCopy)*16;

		Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);

#if (CRYPTOPP_MSC_VERSION >= 1400)
		_freea(space);
#endif

		return length % BLOCKSIZE;
	}
#endif
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
	if (HasAESNI())
		return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (MAYBE_CONST __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);

	return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}

#endif // #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE