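// rijndael.cpp - Rijndael (AES) block cipher implementation for Crypto++.
// Table-driven C++ code with cache-timing countermeasures, plus an SSE2
// assembly bulk-processing path (inline asm, or generated x64 MASM).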
#include "pch.h"

#ifndef CRYPTOPP_IMPORTS
#ifndef CRYPTOPP_GENERATE_X64_MASM

#include "rijndael.h"
#include "misc.h"
#include "cpu.h"

#ifdef __sun
#include <alloca.h>
#endif

NAMESPACE_BEGIN(CryptoPP)

#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
using namespace rdtable;
#else
static word64 Te[256];
#endif
static word64 Td[256];
#else
static word32 Te[256*4], Td[256*4];
#endif
static bool s_TeFilled = false, s_TdFilled = false;
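// Table layout: when unaligned loads are allowed, each 64-bit entry packs
// the four byte rotations of one round-table word, so a single 256-entry
// table (2 KB) serves all four byte positions; in the assembly build the two
// extra entries double as a 16-byte block of zeros (see FillEncTable and the
// `zeros` pointer below). Otherwise four rotated 256-entry word32 tables are
// kept. The tables are filled lazily on first key setup.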

#define QUARTER_ROUND(L, T, t, a, b, c, d) \
    a ^= L(T, 3, byte(t)); t >>= 8;\
    b ^= L(T, 2, byte(t)); t >>= 8;\
    c ^= L(T, 1, byte(t)); t >>= 8;\
    d ^= L(T, 0, t);

#define QUARTER_ROUND_LE(t, a, b, c, d) \
    tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[d] = ((byte *)(Te+t))[1];

#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
    #define QUARTER_ROUND_LD(t, a, b, c, d) \
        tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
        tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
        tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
        tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
#else
    #define QUARTER_ROUND_LD(t, a, b, c, d) \
        tempBlock[a] = Sd[byte(t)]; t >>= 8;\
        tempBlock[b] = Sd[byte(t)]; t >>= 8;\
        tempBlock[c] = Sd[byte(t)]; t >>= 8;\
        tempBlock[d] = Sd[t];
#endif
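// The _LE/_LD forms serve the final round, which needs plain S-box bytes
// rather than mixed columns: byte 1 of a Te entry equals Se[t] in either
// byte order, and for Td the offset GetNativeByteOrder()*7 selects the
// Sd[t] byte -- offset 0 on little-endian, 7 on big-endian builds.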

#define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
#define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)

#ifdef IS_LITTLE_ENDIAN
    #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
    #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
    #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
        #define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (6-i)%4+1))
        #define TL_M(T, i, x) (*(word32 *)((byte *)T + x*8 + (i+3)%4+1))
    #else
        #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
        #define TL_M(T, i, x) T[i*256 + x]
    #endif
#else
    #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
    #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
    #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
        #define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (4-i)%4))
        #define TL_M TL_F
    #else
        #define TL_F(T, i, x) rotrFixed(T[x], i*8)
        #define TL_M(T, i, x) T[i*256 + x]
    #endif
#endif

#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

#define f3(x) (f2(x) ^ x)
#define f9(x) (f8(x) ^ x)
#define fb(x) (f8(x) ^ f2(x) ^ x)
#define fd(x) (f8(x) ^ f4(x) ^ x)
#define fe(x) (f8(x) ^ f4(x) ^ f2(x))
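// The f-macros above multiply a byte by 2, 4, 8, 3, 9, 0x0b, 0x0d and 0x0e
// in GF(2^8) modulo the AES polynomial x^8+x^4+x^3+x+1 (0x11b); every bit
// shifted out of the top is reduced by XORing a correspondingly shifted copy
// of 0x11b. Worked example:
//   f2(0x80) = (0x80<<1) ^ (1*0x11b) = 0x100 ^ 0x11b = 0x1b.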

void Rijndael::Base::FillEncTable()
{
    for (int i=0; i<256; i++)
    {
        byte x = Se[i];
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
        word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
        Te[i] = word64(y | f3(x))<<32 | y;
#else
        word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
        for (int j=0; j<4; j++)
        {
            Te[i+j*256] = y;
            y = rotrFixed(y, 8);
        }
#endif
    }
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
    Te[256] = Te[257] = 0;
#endif
    s_TeFilled = true;
}

void Rijndael::Base::FillDecTable()
{
    for (int i=0; i<256; i++)
    {
        byte x = Sd[i];
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
        word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
        Td[i] = word64(y | fb(x))<<32 | y | x;
#else
        word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
        for (int j=0; j<4; j++)
        {
            Td[i+j*256] = y;
            y = rotrFixed(y, 8);
        }
#endif
    }
    s_TdFilled = true;
}
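// Each Td entry packs the InverseMixColumns multiples {0e,09,0d,0b} of the
// inverse S-box output Sd[i]; in the word64 layout the low byte additionally
// carries Sd[i] itself, so QUARTER_ROUND_LD can serve the final decryption
// round from the same table.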

void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
{
    AssertValidKeyLength(keylen);

    m_rounds = keylen/4 + 6;
    m_key.New(4*(m_rounds+1));

    word32 temp, *rk = m_key;
    const word32 *rc = rcon;

    GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);

    while (true)
    {
        temp = rk[keylen/4-1];
        rk[keylen/4] = rk[0] ^
            (word32(Se[GETBYTE(temp, 2)]) << 24) ^
            (word32(Se[GETBYTE(temp, 1)]) << 16) ^
            (word32(Se[GETBYTE(temp, 0)]) << 8) ^
            Se[GETBYTE(temp, 3)] ^
            *(rc++);
        rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
        rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
        rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];

        if (rk + keylen/4 + 4 == m_key.end())
            break;

        if (keylen == 24)
        {
            rk[10] = rk[ 4] ^ rk[ 9];
            rk[11] = rk[ 5] ^ rk[10];
        }
        else if (keylen == 32)
        {
            temp = rk[11];
            rk[12] = rk[ 4] ^
                (word32(Se[GETBYTE(temp, 3)]) << 24) ^
                (word32(Se[GETBYTE(temp, 2)]) << 16) ^
                (word32(Se[GETBYTE(temp, 1)]) << 8) ^
                Se[GETBYTE(temp, 0)];
            rk[13] = rk[ 5] ^ rk[12];
            rk[14] = rk[ 6] ^ rk[13];
            rk[15] = rk[ 7] ^ rk[14];
        }
        rk += keylen/4;
    }

    if (IsForwardTransformation())
    {
        if (!s_TeFilled)
            FillEncTable();
    }
    else
    {
        if (!s_TdFilled)
            FillDecTable();

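        // The decryption schedule is the encryption schedule with the round
        // keys in reverse order and InverseMixColumns applied to all but the
        // first and last; the swap loop and InverseMixColumn below do that
        // in place.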
        unsigned int i, j;
        rk = m_key;

        for (i = 0, j = 4*m_rounds; i < j; i += 4, j -= 4) {
            temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
            temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
            temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
            temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
        }

#define InverseMixColumn(x) x = TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])

        for (i = 1; i < m_rounds; i++) {
            rk += 4;
            InverseMixColumn(rk[0]);
            InverseMixColumn(rk[1]);
            InverseMixColumn(rk[2]);
            InverseMixColumn(rk[3]);
        }
    }

    ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key.begin(), m_key.begin(), 16);
    ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16);
}
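// Key-schedule sanity check (FIPS-197 Appendix A.1): for the 128-bit key
// 2b7e1516 28aed2a6 abf71588 09cf4f3c, the first expanded word is
// w[4] = SubWord(RotWord(w[3])) ^ Rcon[1] ^ w[0] = a0fafe17 -- as computed
// by the while loop above, before the byte-order fixups applied to the
// first and last round keys.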

void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
    if (HasSSE2())
    {
        Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
        return;
    }
#endif

    typedef BlockGetAndPut<word32, NativeByteOrder> Block;

    word32 s0, s1, s2, s3, t0, t1, t2, t3;
    Block::Get(inBlock)(s0)(s1)(s2)(s3);

    const word32 *rk = m_key;
    s0 ^= rk[0];
    s1 ^= rk[1];
    s2 ^= rk[2];
    s3 ^= rk[3];
    t0 = rk[4];
    t1 = rk[5];
    t2 = rk[6];
    t3 = rk[7];
    rk += 8;

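    // Timing-attack countermeasure: read one word per cache line across the
    // whole table so every line is resident before any key-dependent lookup.
    // u can only remain zero (it starts at 0 and is only AND-ed), so OR-ing
    // it into the state is a no-op that keeps the loads alive.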
    const int cacheLineSize = GetCacheLineSize();
    unsigned int i;
    word32 u = 0;
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
    for (i=0; i<2048; i+=cacheLineSize)
#else
    for (i=0; i<1024; i+=cacheLineSize)
#endif
        u &= *(const word32 *)(((const byte *)Te)+i);
    u &= Te[255];
    s0 |= u; s1 |= u; s2 |= u; s3 |= u;

    QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
    QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
    QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
    QUARTER_ROUND_FE(s0, t1, t2, t3, t0)

    unsigned int r = m_rounds/2 - 1;
    do
    {
        s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

        QUARTER_ROUND_E(t3, s0, s1, s2, s3)
        QUARTER_ROUND_E(t2, s3, s0, s1, s2)
        QUARTER_ROUND_E(t1, s2, s3, s0, s1)
        QUARTER_ROUND_E(t0, s1, s2, s3, s0)

        t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

        QUARTER_ROUND_E(s3, t0, t1, t2, t3)
        QUARTER_ROUND_E(s2, t3, t0, t1, t2)
        QUARTER_ROUND_E(s1, t2, t3, t0, t1)
        QUARTER_ROUND_E(s0, t1, t2, t3, t0)

        rk += 8;
    } while (--r);

    word32 tbw[4];
    byte *const tempBlock = (byte *)tbw;

    QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
    QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
    QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
    QUARTER_ROUND_LE(t3, 3, 6, 9, 12)

    Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}
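/*
Usage sketch (illustrative only, not part of this file; assumes the public
Crypto++ headers aes.h and modes.h): a single-block ECB encryption ends up
in Rijndael::Enc::ProcessAndXorBlock above (or in the SSE2 bulk path).

    #include "aes.h"      // CryptoPP::AES
    #include "modes.h"    // CryptoPP::ECB_Mode

    void EncryptOneBlock()
    {
        using namespace CryptoPP;
        byte key[AES::DEFAULT_KEYLENGTH] = {0};  // all-zero demo key only
        byte block[AES::BLOCKSIZE] = {0};
        ECB_Mode<AES>::Encryption enc(key, sizeof(key));
        enc.ProcessData(block, block, sizeof(block));  // one 16-byte block
    }
*/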

void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
    typedef BlockGetAndPut<word32, NativeByteOrder> Block;

    word32 s0, s1, s2, s3, t0, t1, t2, t3;
    Block::Get(inBlock)(s0)(s1)(s2)(s3);

    const word32 *rk = m_key;
    s0 ^= rk[0];
    s1 ^= rk[1];
    s2 ^= rk[2];
    s3 ^= rk[3];
    t0 = rk[4];
    t1 = rk[5];
    t2 = rk[6];
    t3 = rk[7];
    rk += 8;

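    // Same cache-priming countermeasure as the encryption path, over Td.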
    const int cacheLineSize = GetCacheLineSize();
    unsigned int i;
    word32 u = 0;
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
    for (i=0; i<2048; i+=cacheLineSize)
#else
    for (i=0; i<1024; i+=cacheLineSize)
#endif
        u &= *(const word32 *)(((const byte *)Td)+i);
    u &= Td[255];
    s0 |= u; s1 |= u; s2 |= u; s3 |= u;

    QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
    QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
    QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
    QUARTER_ROUND_FD(s0, t3, t2, t1, t0)

    unsigned int r = m_rounds/2 - 1;
    do
    {
        s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

        QUARTER_ROUND_D(t3, s2, s1, s0, s3)
        QUARTER_ROUND_D(t2, s1, s0, s3, s2)
        QUARTER_ROUND_D(t1, s0, s3, s2, s1)
        QUARTER_ROUND_D(t0, s3, s2, s1, s0)

        t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

        QUARTER_ROUND_D(s3, t2, t1, t0, t3)
        QUARTER_ROUND_D(s2, t1, t0, t3, t2)
        QUARTER_ROUND_D(s1, t0, t3, t2, t1)
        QUARTER_ROUND_D(s0, t3, t2, t1, t0)

        rk += 8;
    } while (--r);

#ifndef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
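    // The final round reads the Sd byte table directly, which has not been
    // touched yet, so prime its cache lines too. (The unaligned build's
    // QUARTER_ROUND_LD reads Td instead, which is already resident.)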
    u = 0;
    for (i=0; i<256; i+=cacheLineSize)
        u &= *(const word32 *)(Sd+i);
    u &= *(const word32 *)(Sd+252);
    t0 |= u; t1 |= u; t2 |= u; t3 |= u;
#endif

    word32 tbw[4];
    byte *const tempBlock = (byte *)tbw;

    QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
    QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
    QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
    QUARTER_ROUND_LD(t3, 11, 6, 1, 12)

    Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}

#ifdef _MSC_VER
#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
#endif

#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_GENERATE_X64_MASM)

CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
{
#if CRYPTOPP_BOOL_X86

#define L_REG           esp
#define L_INDEX(i)      (L_REG+512+i)
#define L_INXORBLOCKS   L_INBLOCKS+4
#define L_OUTXORBLOCKS  L_INBLOCKS+8
#define L_OUTBLOCKS     L_INBLOCKS+12
#define L_INCREMENTS    L_INDEX(16*15)
#define L_SP            L_INDEX(16*16)
#define L_LENGTH        L_INDEX(16*16+4)
#define L_KEYS_BEGIN    L_INDEX(16*16+8)

#define MOVD            movd
#define MM(i)           mm##i

#define MXOR(a,b,c) \
    AS2(    movzx   esi, b)\
    AS2(    movd    mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
    AS2(    pxor    MM(a), mm7)\

#define MMOV(a,b,c) \
    AS2(    movzx   esi, b)\
    AS2(    movd    MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#else

#define L_REG           r8
#define L_INDEX(i)      (L_REG+i)
#define L_INXORBLOCKS   L_INBLOCKS+8
#define L_OUTXORBLOCKS  L_INBLOCKS+16
#define L_OUTBLOCKS     L_INBLOCKS+24
#define L_INCREMENTS    L_INDEX(16*16)
#define L_LENGTH        L_INDEX(16*18+8)
#define L_KEYS_BEGIN    L_INDEX(16*19)

#define MOVD            mov
#define MM_0            r9d
#define MM_1            r12d
#ifdef __GNUC__
#define MM_2            r11d
#else
#define MM_2            r10d
#endif
#define MM(i)           MM_##i

#define MXOR(a,b,c) \
    AS2(    movzx   esi, b)\
    AS2(    xor     MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MMOV(a,b,c) \
    AS2(    movzx   esi, b)\
    AS2(    mov     MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#endif

#define L_SUBKEYS       L_INDEX(0)
#define L_SAVED_X       L_SUBKEYS
#define L_KEY12         L_INDEX(16*12)
#define L_LASTROUND     L_INDEX(16*13)
#define L_INBLOCKS      L_INDEX(16*14)
#define MAP0TO4(i)      (ASM_MOD(i+3,4)+1)
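// Frame layout: the locals block (see struct Locals below) is addressed via
// L_INDEX; the subkey copies occupy the first 12*16 bytes and the L_SAVED_X
// scratch area aliases them. MAP0TO4 maps rotation index 0 to byte offset 4
// (1..3 stay put): each word64 table entry holds the four 32-bit rotations
// of a round-table word at byte offsets 1 through 4.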

#define XOR(a,b,c) \
    AS2(    movzx   esi, b)\
    AS2(    xor     a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MOV(a,b,c) \
    AS2(    movzx   esi, b)\
    AS2(    mov     a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#ifdef CRYPTOPP_GENERATE_X64_MASM
    ALIGN   8
    Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
    rex_push_reg rsi
    push_reg rdi
    push_reg rbx
    push_reg r12
    .endprolog
    mov L_REG, rcx
    mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
    mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
#elif defined(__GNUC__)
    __asm__ __volatile__
    (
    ".intel_syntax noprefix;"
    #if CRYPTOPP_BOOL_X64
    AS2(    mov     L_REG, rcx)
    #endif
    AS_PUSH_IF86(bx)
    AS_PUSH_IF86(bp)
    AS2(    mov     AS_REG_7, WORD_REG(si))
#else
    AS_PUSH_IF86(si)
    AS_PUSH_IF86(di)
#if !defined(_MSC_VER) || (_MSC_VER < 1400)
    AS_PUSH_IF86(bx)
#endif
    AS_PUSH_IF86(bp)
    AS2(    lea     AS_REG_7, [Te])
    AS2(    mov     edi, [g_cacheLineSize])
#endif

#if CRYPTOPP_BOOL_X86
    AS2(    mov     [ecx+16*12+16*4], esp)
    AS2(    lea     esp, [ecx-512])
#endif

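    // Copy the tail of the key schedule into the stack frame: L_KEYS_BEGIN
    // selects where the copy starts so the copied subkeys always end at a
    // fixed frame offset; subkeys 0 and 1 are then loaded into registers and
    // the last subkey is kept in xmm4.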
    AS2(    mov     WORD_REG(si), [L_KEYS_BEGIN])
    AS2(    mov     WORD_REG(ax), 16)
    AS2(    and     WORD_REG(ax), WORD_REG(si))
    AS2(    movdqa  xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)])
    AS2(    movdqa  [L_KEY12], xmm3)
    AS2(    lea     WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
    AS2(    sub     WORD_REG(ax), WORD_REG(si))
    ASL(0)
    AS2(    movdqa  xmm0, [WORD_REG(ax)+WORD_REG(si)])
    AS2(    movdqa  XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
    AS2(    add     WORD_REG(si), 16)
    AS2(    cmp     WORD_REG(si), 16*12)
    ASJ(    jl,     0, b)

    AS2(    movdqa  xmm4, [WORD_REG(ax)+WORD_REG(si)])
    AS2(    movdqa  xmm1, [WORD_REG(dx)])
    AS2(    MOVD    MM(1), [WORD_REG(dx)+4*4])
    AS2(    mov     ebx, [WORD_REG(dx)+5*4])
    AS2(    mov     ecx, [WORD_REG(dx)+6*4])
    AS2(    mov     edx, [WORD_REG(dx)+7*4])

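    // Prefetch the whole 2 KB table: four loads per iteration, stepping by
    // the runtime cache line size in edi/rdi, so that every line of Te is
    // resident before any key-dependent lookup; the lfence keeps later
    // loads from starting early.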
    AS2(    xor     WORD_REG(ax), WORD_REG(ax))
    ASL(9)
    AS2(    mov     esi, [AS_REG_7+WORD_REG(ax)])
    AS2(    add     WORD_REG(ax), WORD_REG(di))
    AS2(    mov     esi, [AS_REG_7+WORD_REG(ax)])
    AS2(    add     WORD_REG(ax), WORD_REG(di))
    AS2(    mov     esi, [AS_REG_7+WORD_REG(ax)])
    AS2(    add     WORD_REG(ax), WORD_REG(di))
    AS2(    mov     esi, [AS_REG_7+WORD_REG(ax)])
    AS2(    add     WORD_REG(ax), WORD_REG(di))
    AS2(    cmp     WORD_REG(ax), 2048)
    ASJ(    jl,     9, b)
    AS1(    lfence)

    AS2(    test    DWORD PTR [L_LENGTH], 1)
    ASJ(    jz,     8, f)

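    // Counter-mode one-time setup (the low bit of L_LENGTH doubles as the
    // flag): fold the first round and most of the second round of the fixed
    // counter bytes into L_SAVED_X, so the per-block work only has to finish
    // off the one byte that increments.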
    AS2(    mov     WORD_REG(si), [L_INBLOCKS])
    AS2(    movdqu  xmm2, [WORD_REG(si)])
    AS2(    pxor    xmm2, xmm1)
    AS2(    psrldq  xmm1, 14)
    AS2(    movd    eax, xmm1)
    AS2(    mov     al, BYTE PTR [WORD_REG(si)+15])
    AS2(    MOVD    MM(2), eax)
#if CRYPTOPP_BOOL_X86
    AS2(    mov     eax, 1)
    AS2(    movd    mm3, eax)
#endif

    AS2(    movd    eax, xmm2)
    AS2(    psrldq  xmm2, 4)
    AS2(    movd    edi, xmm2)
    AS2(    psrldq  xmm2, 4)
    MXOR(   1, al, 0)
    XOR(    edx, ah, 1)
    AS2(    shr     eax, 16)
    XOR(    ecx, al, 2)
    XOR(    ebx, ah, 3)
    AS2(    mov     eax, edi)
    AS2(    movd    edi, xmm2)
    AS2(    psrldq  xmm2, 4)
    XOR(    ebx, al, 0)
    MXOR(   1, ah, 1)
    AS2(    shr     eax, 16)
    XOR(    edx, al, 2)
    XOR(    ecx, ah, 3)
    AS2(    mov     eax, edi)
    AS2(    movd    edi, xmm2)
    XOR(    ecx, al, 0)
    XOR(    ebx, ah, 1)
    AS2(    shr     eax, 16)
    MXOR(   1, al, 2)
    XOR(    edx, ah, 3)
    AS2(    mov     eax, edi)
    XOR(    edx, al, 0)
    XOR(    ecx, ah, 1)
    AS2(    shr     eax, 16)
    XOR(    ebx, al, 2)
    AS2(    psrldq  xmm2, 3)

    AS2(    mov     eax, [L_KEY12+0*4])
    AS2(    mov     edi, [L_KEY12+2*4])
    AS2(    MOVD    MM(0), [L_KEY12+3*4])
    MXOR(   0, cl, 3)
    XOR(    edi, bl, 3)
    MXOR(   0, bh, 2)
    AS2(    shr     ebx, 16)
    XOR(    eax, bl, 1)
    MOV(    ebx, bh, 0)
    AS2(    xor     ebx, [L_KEY12+1*4])
    XOR(    eax, ch, 2)
    AS2(    shr     ecx, 16)
    XOR(    eax, dl, 3)
    XOR(    ebx, dh, 2)
    AS2(    shr     edx, 16)
    XOR(    edi, ch, 0)
    XOR(    ebx, cl, 1)
    XOR(    edi, dl, 1)
    MXOR(   0, dh, 0)

    AS2(    movd    ecx, xmm2)
    AS2(    MOVD    edx, MM(1))
    AS2(    MOVD    [L_SAVED_X+3*4], MM(0))
    AS2(    mov     [L_SAVED_X+0*4], eax)
    AS2(    mov     [L_SAVED_X+1*4], ebx)
    AS2(    mov     [L_SAVED_X+2*4], edi)
    ASJ(    jmp,    5, f)

    ASL(3)
    AS2(    MOVD    MM(1), [L_KEY12+0*4])
    AS2(    mov     ebx, [L_KEY12+1*4])
    AS2(    mov     ecx, [L_KEY12+2*4])
    AS2(    mov     edx, [L_KEY12+3*4])
    ASL(8)
    AS2(    mov     WORD_REG(ax), [L_INBLOCKS])
    AS2(    movdqu  xmm2, [WORD_REG(ax)])
    AS2(    mov     WORD_REG(si), [L_INXORBLOCKS])
    AS2(    movdqu  xmm5, [WORD_REG(si)])
    AS2(    pxor    xmm2, xmm1)
    AS2(    pxor    xmm2, xmm5)

    AS2(    movd    eax, xmm2)
    AS2(    psrldq  xmm2, 4)
    AS2(    movd    edi, xmm2)
    AS2(    psrldq  xmm2, 4)
    MXOR(   1, al, 0)
    XOR(    edx, ah, 1)
    AS2(    shr     eax, 16)
    XOR(    ecx, al, 2)
    XOR(    ebx, ah, 3)
    AS2(    mov     eax, edi)
    AS2(    movd    edi, xmm2)
    AS2(    psrldq  xmm2, 4)
    XOR(    ebx, al, 0)
    MXOR(   1, ah, 1)
    AS2(    shr     eax, 16)
    XOR(    edx, al, 2)
    XOR(    ecx, ah, 3)
    AS2(    mov     eax, edi)
    AS2(    movd    edi, xmm2)
    XOR(    ecx, al, 0)
    XOR(    ebx, ah, 1)
    AS2(    shr     eax, 16)
    MXOR(   1, al, 2)
    XOR(    edx, ah, 3)
    AS2(    mov     eax, edi)
    XOR(    edx, al, 0)
    XOR(    ecx, ah, 1)
    AS2(    shr     eax, 16)
    XOR(    ebx, al, 2)
    MXOR(   1, ah, 3)
    AS2(    MOVD    eax, MM(1))

    AS2(    add     L_REG, [L_KEYS_BEGIN])
    AS2(    add     L_REG, 4*16)
    ASJ(    jmp,    2, f)

    ASL(1)
    AS2(    MOVD    ecx, MM(2))
    AS2(    MOVD    edx, MM(1))
    AS2(    mov     eax, [L_SAVED_X+0*4])
    AS2(    mov     ebx, [L_SAVED_X+1*4])
    AS2(    xor     cl, ch)
    AS2(    and     WORD_REG(cx), 255)
    ASL(5)
#if CRYPTOPP_BOOL_X86
    AS2(    paddb   MM(2), mm3)
#else
    AS2(    add     MM(2), 1)
#endif

    AS2(    xor     edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
    XOR(    ebx, dl, 3)
    MOV(    ecx, dh, 2)
    AS2(    shr     edx, 16)
    AS2(    xor     ecx, [L_SAVED_X+2*4])
    XOR(    eax, dh, 0)
    MOV(    edx, dl, 1)
    AS2(    xor     edx, [L_SAVED_X+3*4])

    AS2(    add     L_REG, [L_KEYS_BEGIN])
    AS2(    add     L_REG, 3*16)
    ASJ(    jmp,    4, f)

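// One full table-lookup round over the four 32-bit state words. The byte
// shuffling between eax/ebx/ecx/edx tracks AES ShiftRows; esi is the scratch
// index register used by the XOR/MOV/MXOR macros, and edi accumulates one of
// the four output words.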
#define ROUND() \
    MXOR(   0, cl, 3)   \
    AS2(    mov     cl, al) \
    XOR(    edi, ah, 2) \
    AS2(    shr     eax, 16)    \
    XOR(    edi, bl, 3) \
    MXOR(   0, bh, 2)   \
    AS2(    shr     ebx, 16)    \
    MXOR(   0, al, 1)   \
    MOV(    eax, ah, 0) \
    XOR(    eax, bl, 1) \
    MOV(    ebx, bh, 0) \
    XOR(    eax, ch, 2) \
    XOR(    ebx, cl, 3) \
    AS2(    shr     ecx, 16)    \
    XOR(    eax, dl, 3) \
    XOR(    ebx, dh, 2) \
    AS2(    shr     edx, 16)    \
    XOR(    edi, ch, 0) \
    XOR(    ebx, cl, 1) \
    XOR(    edi, dl, 1) \
    MXOR(   0, dh, 0)   \

    ASL(2)
    AS2(    MOVD    MM(0), [L_SUBKEYS-4*16+3*4])
    AS2(    mov     edi, [L_SUBKEYS-4*16+2*4])
    ROUND()
    AS2(    mov     ecx, edi)
    AS2(    xor     eax, [L_SUBKEYS-4*16+0*4])
    AS2(    xor     ebx, [L_SUBKEYS-4*16+1*4])
    AS2(    MOVD    edx, MM(0))

    ASL(4)
    AS2(    MOVD    MM(0), [L_SUBKEYS-4*16+7*4])
    AS2(    mov     edi, [L_SUBKEYS-4*16+6*4])
    ROUND()
    AS2(    mov     ecx, edi)
    AS2(    xor     eax, [L_SUBKEYS-4*16+4*4])
    AS2(    xor     ebx, [L_SUBKEYS-4*16+5*4])
    AS2(    MOVD    edx, MM(0))

    AS2(    add     L_REG, 32)
    AS2(    test    L_REG, 255)
    ASJ(    jnz,    2, b)
    AS2(    sub     L_REG, 16*16)

#define LAST(a, b, c) \
    AS2(    movzx   esi, a)\
    AS2(    movzx   edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1])\
    AS2(    movzx   esi, b)\
    AS2(    xor     edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0])\
    AS2(    mov     WORD PTR [L_LASTROUND+c], di)\

    LAST(ch, dl, 2)
    LAST(dh, al, 6)
    AS2(    shr     edx, 16)
    LAST(ah, bl, 10)
    AS2(    shr     eax, 16)
    LAST(bh, cl, 14)
    AS2(    shr     ebx, 16)
    LAST(dh, al, 12)
    AS2(    shr     ecx, 16)
    LAST(ah, bl, 0)
    LAST(bh, cl, 4)
    LAST(ch, dl, 8)

    AS2(    mov     WORD_REG(ax), [L_OUTXORBLOCKS])
    AS2(    mov     WORD_REG(bx), [L_OUTBLOCKS])

    AS2(    mov     WORD_REG(cx), [L_LENGTH])
    AS2(    sub     WORD_REG(cx), 16)

    AS2(    movdqu  xmm2, [WORD_REG(ax)])
    AS2(    pxor    xmm2, xmm4)

#if CRYPTOPP_BOOL_X86
    AS2(    movdqa  xmm0, [L_INCREMENTS])
    AS2(    paddd   xmm0, [L_INBLOCKS])
    AS2(    movdqa  [L_INBLOCKS], xmm0)
#else
    AS2(    movdqa  xmm0, [L_INCREMENTS+16])
    AS2(    paddq   xmm0, [L_INBLOCKS+16])
    AS2(    movdqa  [L_INBLOCKS+16], xmm0)
#endif

    AS2(    pxor    xmm2, [L_LASTROUND])
    AS2(    movdqu  [WORD_REG(bx)], xmm2)

    ASJ(    jle,    7, f)
    AS2(    mov     [L_LENGTH], WORD_REG(cx))
    AS2(    test    WORD_REG(cx), 1)
    ASJ(    jnz,    1, b)
#if CRYPTOPP_BOOL_X64
    AS2(    movdqa  xmm0, [L_INCREMENTS])
    AS2(    paddq   xmm0, [L_INBLOCKS])
    AS2(    movdqa  [L_INBLOCKS], xmm0)
#endif
    ASJ(    jmp,    3, b)

    ASL(7)

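    // Wipe the subkey copies from the stack workspace before returning.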
    AS2(    xorps   xmm0, xmm0)
    AS2(    lea     WORD_REG(ax), [L_SUBKEYS+7*16])
    AS2(    movaps  [WORD_REG(ax)-7*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)-6*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)-5*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)-4*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)-3*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)-2*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)-1*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+0*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+1*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+2*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+3*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+4*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+5*16], xmm0)
    AS2(    movaps  [WORD_REG(ax)+6*16], xmm0)
#if CRYPTOPP_BOOL_X86
    AS2(    mov     esp, [L_SP])
    AS1(    emms)
#endif
    AS_POP_IF86(bp)
#if !defined(_MSC_VER) || (_MSC_VER < 1400)
    AS_POP_IF86(bx)
#endif
#if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
    AS_POP_IF86(di)
    AS_POP_IF86(si)
    AS1(    ret)
#endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
    pop r12
    pop rbx
    pop rdi
    pop rsi
    ret
    Rijndael_Enc_AdvancedProcessBlocks ENDP
#endif
#ifdef __GNUC__
    ".att_syntax prefix;"
    :
    : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
    : "memory", "cc", "%eax"
    #if CRYPTOPP_BOOL_X64
    , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
    #endif
    );
#endif
}

#endif

#ifndef CRYPTOPP_GENERATE_X64_MASM

#ifdef CRYPTOPP_X64_MASM_AVAILABLE
extern "C" {
void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
}
#endif

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)

static inline bool AliasedWithTable(const byte *begin, const byte *end)
{
    size_t s0 = size_t(begin)%4096, s1 = size_t(end)%4096;
    size_t t0 = size_t(Te)%4096, t1 = (size_t(Te)+sizeof(Te))%4096;
    if (t1 > t0)
        return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
    else
        return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
}
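// AliasedWithTable reports whether [begin, end) shares any 4 KB page offsets
// with Te. The caller below re-allocates its stack workspace until there is
// no overlap, so the workspace is unlikely to contend for the same cache
// sets as the pre-loaded table -- a collision would both slow the code and
// reopen the timing side channel the priming loop closes.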

size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
    if (length < BLOCKSIZE)
        return length;

    if (HasSSE2())
    {
        struct Locals
        {
            word32 subkeys[4*12], workspace[8];
            const byte *inBlocks, *inXorBlocks, *outXorBlocks;
            byte *outBlocks;
            size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
            size_t regSpill, lengthAndCounterFlag, keysBegin;
        };

        size_t increment = BLOCKSIZE;
        const byte* zeros = (byte *)(Te+256);
        byte *space;

        do {
            space = (byte *)alloca(255+sizeof(Locals));
            space += (256-(size_t)space%256)%256;
        }
        while (AliasedWithTable(space, space+sizeof(Locals)));

        if (flags & BT_ReverseDirection)
        {
            assert(length % BLOCKSIZE == 0);
            inBlocks += length - BLOCKSIZE;
            xorBlocks += length - BLOCKSIZE;
            outBlocks += length - BLOCKSIZE;
            increment = 0-increment;
        }

        Locals &locals = *(Locals *)space;

        locals.inBlocks = inBlocks;
        locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
        locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
        locals.outBlocks = outBlocks;

        locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
        locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
        locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
        locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;

        locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
        int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
        locals.keysBegin = (12-keysToCopy)*16;

        Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
        return length%16;
    }
    else
        return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
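/*
Bulk-processing sketch (illustrative only; assumes the public Crypto++
headers aes.h and modes.h): CTR mode drives AdvancedProcessBlocks with
BT_InBlockIsCounter set, which is where the counter-mode fast path in the
assembly above pays off.

    #include "aes.h"
    #include "modes.h"

    void EncryptCtr(const byte key[16], const byte iv[16], byte *data, size_t len)
    {
        using namespace CryptoPP;
        CTR_Mode<AES>::Encryption enc(key, 16, iv);
        enc.ProcessData(data, data, len);  // batches blocks internally
    }
*/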

#endif

NAMESPACE_END

#endif
#endif