/* * Byte Swapping Operations * (C) 1999-2011 Jack Lloyd * (C) 2007 Yves Jerschow * * Distributed under the terms of the Botan license */ #ifndef BOTAN_BYTE_SWAP_H__ #define BOTAN_BYTE_SWAP_H__ #include <botan/types.h> #include <botan/rotate.h> #if defined(BOTAN_TARGET_CPU_HAS_SSE2) && !defined(BOTAN_NO_SSE_INTRINSICS) #include <emmintrin.h> #endif namespace Botan { /** * Swap a 16 bit integer */ inline u16bit reverse_bytes(u16bit val) { return rotate_left(val, 8); } /** * Swap a 32 bit integer */ inline u32bit reverse_bytes(u32bit val) { #if BOTAN_GCC_VERSION >= 430 && !defined(BOTAN_TARGET_CPU_IS_ARM_FAMILY) /* GCC intrinsic added in 4.3, works for a number of CPUs However avoid under ARM, as it branches to a function in libgcc instead of generating inline asm, so slower even than the generic rotate version below. */ return __builtin_bswap32(val); #elif BOTAN_USE_GCC_INLINE_ASM && defined(BOTAN_TARGET_CPU_IS_X86_FAMILY) // GCC-style inline assembly for x86 or x86-64 asm("bswapl %0" : "=r" (val) : "0" (val)); return val; #elif BOTAN_USE_GCC_INLINE_ASM && defined(BOTAN_TARGET_CPU_IS_ARM_FAMILY) asm ("eor r3, %1, %1, ror #16\n\t" "bic r3, r3, #0x00FF0000\n\t" "mov %0, %1, ror #8\n\t" "eor %0, %0, r3, lsr #8" : "=r" (val) : "0" (val) : "r3", "cc"); return val; #elif defined(_MSC_VER) && defined(BOTAN_TARGET_ARCH_IS_X86_32) // Visual C++ inline asm for 32-bit x86, by Yves Jerschow __asm mov eax, val; __asm bswap eax; #else // Generic implementation return (rotate_right(val, 8) & 0xFF00FF00) | (rotate_left (val, 8) & 0x00FF00FF); #endif } /** * Swap a 64 bit integer */ inline u64bit reverse_bytes(u64bit val) { #if BOTAN_GCC_VERSION >= 430 // GCC intrinsic added in 4.3, works for a number of CPUs return __builtin_bswap64(val); #elif BOTAN_USE_GCC_INLINE_ASM && defined(BOTAN_TARGET_ARCH_IS_X86_64) // GCC-style inline assembly for x86-64 asm("bswapq %0" : "=r" (val) : "0" (val)); return val; #else /* Generic implementation. Defined in terms of 32-bit bswap so any * optimizations in that version can help here (particularly * useful for 32-bit x86). */ u32bit hi = static_cast<u32bit>(val >> 32); u32bit lo = static_cast<u32bit>(val); hi = reverse_bytes(hi); lo = reverse_bytes(lo); return (static_cast<u64bit>(lo) << 32) | hi; #endif } /** * Swap 4 Ts in an array */ template<typename T> inline void bswap_4(T x[4]) { x[0] = reverse_bytes(x[0]); x[1] = reverse_bytes(x[1]); x[2] = reverse_bytes(x[2]); x[3] = reverse_bytes(x[3]); } #if defined(BOTAN_TARGET_CPU_HAS_SSE2) && !defined(BOTAN_NO_SSE_INTRINSICS) /** * Swap 4 u32bits in an array using SSE2 shuffle instructions */ template<> inline void bswap_4(u32bit x[4]) { __m128i T = _mm_loadu_si128(reinterpret_cast<const __m128i*>(x)); T = _mm_shufflehi_epi16(T, _MM_SHUFFLE(2, 3, 0, 1)); T = _mm_shufflelo_epi16(T, _MM_SHUFFLE(2, 3, 0, 1)); T = _mm_or_si128(_mm_srli_epi16(T, 8), _mm_slli_epi16(T, 8)); _mm_storeu_si128(reinterpret_cast<__m128i*>(x), T); } #endif } #endif