1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
|
/*
* Byte Swapping Operations
* (C) 1999-2011 Jack Lloyd
* (C) 2007 Yves Jerschow
*
* Distributed under the terms of the Botan license
*/
#ifndef BOTAN_BYTE_SWAP_H__
#define BOTAN_BYTE_SWAP_H__
#include <botan/types.h>
#include <botan/rotate.h>
#if defined(BOTAN_TARGET_CPU_HAS_SSE2) && !defined(BOTAN_NO_SSE_INTRINSICS)
#include <emmintrin.h>
#endif
namespace Botan {
/**
* Reverse the byte order of a 16-bit integer
* @param val the value whose two bytes are to be exchanged
* @return val rotated left by 8 bits
*/
inline u16bit reverse_bytes(u16bit val)
{
   // With only two bytes, rotating by one byte width exchanges them.
   const u16bit swapped = rotate_left(val, 8);
   return swapped;
}
/**
* Reverse the byte order of a 32-bit integer
*
* Exactly one of several implementations is chosen by the preprocessor,
* depending on the compiler and target CPU macros; all of them are
* intended to produce the same result as the portable version in the
* final #else branch.
*
* @param val the value to byte-swap
* @return val with its four bytes in the opposite order
*/
inline u32bit reverse_bytes(u32bit val)
{
#if BOTAN_GCC_VERSION >= 430 && !defined(BOTAN_TARGET_CPU_IS_ARM_FAMILY)

   /*
   * __builtin_bswap32 exists from GCC 4.3 onwards and covers many CPUs.
   * ARM is deliberately excluded: there the builtin becomes a call into
   * libgcc rather than inline code, which ends up slower than even the
   * portable rotate-based version at the bottom of this function.
   */
   return __builtin_bswap32(val);

#elif BOTAN_USE_GCC_INLINE_ASM && defined(BOTAN_TARGET_CPU_IS_X86_FAMILY)

   // x86 / x86-64: a single bswapl, written as GCC-style inline assembly.
   // The "0" constraint ties the input to the same register as the output.
   asm("bswapl %0" : "=r" (val) : "0" (val));
   return val;

#elif BOTAN_USE_GCC_INLINE_ASM && defined(BOTAN_TARGET_CPU_IS_ARM_FAMILY)

   // ARM: four-instruction byte reversal. r3 serves as scratch space and
   // is listed as clobbered, together with the condition codes.
   asm ("eor r3, %1, %1, ror #16\n\t"
        "bic r3, r3, #0x00FF0000\n\t"
        "mov %0, %1, ror #8\n\t"
        "eor %0, %0, r3, lsr #8"
        : "=r" (val)
        : "0" (val)
        : "r3", "cc");
   return val;

#elif defined(_MSC_VER) && defined(BOTAN_TARGET_ARCH_IS_IA32)

   // Visual C++ inline assembly for 32-bit x86, contributed by Yves Jerschow.
   // There is intentionally no return statement in this branch: the swapped
   // value is left in EAX, and the code relies on the compiler taking the
   // function result from that register.
   __asm mov eax, val;
   __asm bswap eax;

#else

   // Portable fallback: rotating right by one byte puts bytes 0 and 2 into
   // their mirrored positions, rotating left does the same for bytes 1
   // and 3. Mask out the correctly placed pair from each and merge them.
   const u32bit from_right_rotation = rotate_right(val, 8) & 0xFF00FF00;
   const u32bit from_left_rotation = rotate_left(val, 8) & 0x00FF00FF;
   return from_right_rotation | from_left_rotation;

#endif
}
/**
* Reverse the byte order of a 64-bit integer
* @param val the value to byte-swap
* @return val with its eight bytes in the opposite order
*/
inline u64bit reverse_bytes(u64bit val)
{
#if BOTAN_GCC_VERSION >= 430

   // __builtin_bswap64 exists from GCC 4.3 onwards and covers many CPUs.
   return __builtin_bswap64(val);

#elif BOTAN_USE_GCC_INLINE_ASM && defined(BOTAN_TARGET_ARCH_IS_AMD64)

   // x86-64 only: a single bswapq, written as GCC-style inline assembly.
   asm("bswapq %0" : "=r" (val) : "0" (val));
   return val;

#else

   /*
   * Portable fallback, expressed through the 32-bit overload so that
   * whatever optimized path that overload selects is reused here (this
   * matters most on 32-bit x86). Each half is byte-swapped on its own,
   * then the two halves trade places.
   */
   const u32bit swapped_top = reverse_bytes(static_cast<u32bit>(val >> 32));
   const u32bit swapped_bottom = reverse_bytes(static_cast<u32bit>(val));

   return (static_cast<u64bit>(swapped_bottom) << 32) | swapped_top;

#endif
}
/**
* Byte-swap four consecutive values in place
* @param x array of at least four elements; each of x[0] through x[3]
*        is replaced by the result of reverse_bytes applied to it
*/
template<typename T>
inline void bswap_4(T x[4])
{
   // Walk the four elements in order, overwriting each with its swapped form.
   for(T* elem = x; elem != x + 4; ++elem)
      *elem = reverse_bytes(*elem);
}
#if defined(BOTAN_TARGET_CPU_HAS_SSE2) && !defined(BOTAN_NO_SSE_INTRINSICS)
/**
* SSE2 specialization: byte-swap four u32bits in place with one
* 128-bit register instead of four scalar swaps
* @param x array of four 32-bit words; read and written with the
*        unaligned load/store intrinsics
*/
template<>
inline void bswap_4(u32bit x[4])
{
   __m128i words = _mm_loadu_si128(reinterpret_cast<const __m128i*>(x));

   // Step 1: exchange the two 16-bit halves of every 32-bit word. The low
   // and high 64-bit portions of the register are shuffled separately.
   words = _mm_shufflelo_epi16(words, _MM_SHUFFLE(2, 3, 0, 1));
   words = _mm_shufflehi_epi16(words, _MM_SHUFFLE(2, 3, 0, 1));

   // Step 2: exchange the two bytes inside every 16-bit lane by combining
   // the lane shifted down by one byte with the lane shifted up by one byte.
   const __m128i shifted_down = _mm_srli_epi16(words, 8);
   const __m128i shifted_up = _mm_slli_epi16(words, 8);
   words = _mm_or_si128(shifted_down, shifted_up);

   _mm_storeu_si128(reinterpret_cast<__m128i*>(x), words);
}
#endif
}
#endif
|