/* ==================================================================== * Copyright (c) 2010 The OpenSSL Project. All rights reserved. * * Redistribution and use is governed by OpenSSL license. * ==================================================================== */ #include #if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__) typedef __int64 i64; typedef unsigned __int64 u64; #define U64(C) C##UI64 #elif defined(__arch64__) typedef long i64; typedef unsigned long u64; #define U64(C) C##UL #else typedef long long i64; typedef unsigned long long u64; #define U64(C) C##ULL #endif typedef unsigned int u32; typedef unsigned char u8; #define STRICT_ALIGNMENT 1 #if defined(__i386) || defined(__i386__) || \ defined(__x86_64) || defined(__x86_64__) || \ defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ defined(__s390__) || defined(__s390x__) # undef STRICT_ALIGNMENT #endif #if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPNESSL_NO_INLINE_ASM) #if defined(__GNUC__) && __GNUC__>=2 # if defined(__x86_64) || defined(__x86_64__) # define BSWAP8(x) ({ u64 ret=(x); \ asm volatile ("bswapq %0" \ : "+r"(ret)); ret; }) # define BSWAP4(x) ({ u32 ret=(x); \ asm volatile ("bswapl %0" \ : "+r"(ret)); ret; }) # elif (defined(__i386) || defined(__i386__)) # define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \ asm volatile ("bswapl %0; bswapl %1" \ : "+r"(hi),"+r"(lo)); \ (u64)hi<<32|lo; }) # define BSWAP4(x) ({ u32 ret=(x); \ asm volatile ("bswapl %0" \ : "+r"(ret)); ret; }) # endif #elif defined(_MSC_VER) # if _MSC_VER>=1300 # pragma intrinsic(_byteswap_uint64,_byteswap_ulong) # define BSWAP8(x) _byteswap_uint64((u64)(x)) # define BSWAP4(x) _byteswap_ulong((u32)(x)) # elif defined(_M_IX86) __inline u32 _bswap4(u32 val) { _asm mov eax,val _asm bswap eax } # define BSWAP4(x) _bswap4(x) # endif #endif #endif #if defined(BSWAP4) && !defined(STRICT_ALIGNMENT) #define GETU32(p) BSWAP4(*(const u32 *)(p)) #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v) #else #define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3]) #define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v)) #endif /* GCM definitions */ typedef struct { u64 hi,lo; } u128; #ifdef TABLE_BITS #undef TABLE_BITS #endif /* * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should * never be set to 8. 8 is effectively reserved for testing purposes. * TABLE_BITS>1 are lookup-table-driven implementations referred to as * "Shoup's" in GCM specification. In other words OpenSSL does not cover * whole spectrum of possible table driven implementations. Why? In * non-"Shoup's" case memory access pattern is segmented in such manner, * that it's trivial to see that cache timing information can reveal * fair portion of intermediate hash value. Given that ciphertext is * always available to attacker, it's possible for him to attempt to * deduce secret parameter H and if successful, tamper with messages * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's * not as trivial, but there is no reason to believe that it's resistant * to cache-timing attack. And the thing about "8-bit" implementation is * that it consumes 16 (sixteen) times more memory, 4KB per individual * key + 1KB shared. Well, on pros side it should be twice as fast as * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version * was observed to run ~75% faster, closer to 100% for commercial * compilers... Yet "4-bit" procedure is preferred, because it's * believed to provide better security-performance balance and adequate * all-round performance. "All-round" refers to things like: * * - shorter setup time effectively improves overall timing for * handling short messages; * - larger table allocation can become unbearable because of VM * subsystem penalties (for example on Windows large enough free * results in VM working set trimming, meaning that consequent * malloc would immediately incur working set expansion); * - larger table has larger cache footprint, which can affect * performance of other code paths (not necessarily even from same * thread in Hyper-Threading world); */ #define TABLE_BITS 4 struct gcm128_context { /* Following 6 names follow names in GCM specification */ union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0, Xi,H,len; /* Pre-computed table used by gcm_gmult_* */ #if TABLE_BITS==8 u128 Htable[256]; #else u128 Htable[16]; void (*gmult)(u64 Xi[2],const u128 Htable[16]); void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); #endif unsigned int mres, ares; block128_f block; void *key; };