/*
 * NOTE(review): this chunk is an extraction-damaged listing of a
 * carry-less-multiplication (PCLMUL) GHASH/POLYVAL implementation.
 * Many intermediate source lines are missing, and the digits fused onto
 * the start of each line appear to be listing line numbers from the
 * original file.  Comments below describe visible intent only.
 */
/* Presumably stubs out __attribute__ for compilers without GNU attribute
 * support -- the #if that normally guards this stub is not visible in
 * this chunk; TODO confirm against the undamaged source. */
33#define __attribute__(x)
/* BR_TARGET(x) tags a single function with per-function ISA targets
 * (see the "ssse3,pclmul" uses below), so the file can be compiled
 * without enabling PCLMUL globally. */
36#define BR_TARGET(x) __attribute__((target(x)))
/* On GCC (but not clang), additionally force the required ISA
 * extensions for the whole translation unit via a target pragma. */
38#if defined(__GNUC__) && !defined(__clang__)
39 _Pragma(
"GCC target(\"sse2,ssse3,sse4.1,aes,pclmul\")")
/* Fragment of the CPU-capability probe (its function head is missing
 * here): mask 0x00000002 on the third argument presumably tests the
 * PCLMULQDQ feature bit (CPUID.1:ECX bit 1) -- TODO confirm against
 * br_cpuid()'s parameter contract. */
53 return br_cpuid(0, 0, 0x00000002, 0);
/*
 * Returns a pointer to the PCLMUL-accelerated GHASH implementation when
 * the CPU supports carry-less multiplication, or 0 (null) otherwise, so
 * callers can fall back to a portable implementation.
 * NOTE(review): the return type and braces of this function are missing
 * from this damaged listing. */
58br_ghash_pclmul_get(
void)
60 return pclmul_supported() ? &br_ghash_pclmul : 0;
/*
 * NOTE(review): "__clang__AND_NOT_WORKING" can never be predefined by a
 * compiler, so the inline-asm variants below are dead code and the
 * intrinsic #define fallbacks further down are always used.  This looks
 * like a deliberate hand-edit to disable the asm path (the original
 * guard was presumably plain __clang__) -- confirm the intent before
 * cleaning it up.  The #else/#endif lines are missing from this listing.
 */
75#ifdef __clang__AND_NOT_WORKING
/* pclmulqdq00: carry-less multiply of the LOW 64-bit halves of x and y
 * (pclmulqdq immediate 0x00), computed in place via inline asm.
 * Function head/return lines are truncated in this listing. */
78pclmulqdq00(__m128i x, __m128i y)
80 __asm__ (
"pclmulqdq $0x00, %1, %0" :
"+x" (x) :
"x" (y));
/* pclmulqdq11: carry-less multiply of the HIGH 64-bit halves
 * (pclmulqdq immediate 0x11). */
85pclmulqdq11(__m128i x, __m128i y)
87 __asm__ (
"pclmulqdq $0x11, %1, %0" :
"+x" (x) :
"x" (y));
/* Intrinsic fallbacks: identical semantics via _mm_clmulepi64_si128
 * (and, given the guard above, the only live path). */
91#define pclmulqdq00(x, y) _mm_clmulepi64_si128(x, y, 0x00)
92#define pclmulqdq11(x, y) _mm_clmulepi64_si128(x, y, 0x11)
/*
 * BK(kw, kx): from a 128-bit key word kw, derive kx = kw XOR
 * (high/low 64-bit halves swapped); the 0x0E shuffle moves the high
 * 64 bits into the low position.  kx is the "folded" operand used for
 * the Karatsuba-style middle product in the multiply routines below.
 *
 * PBK(k0, k1, kw, kx): pack two key halves into one word
 * (kw = low64(k1) : low64(k0)) and their XOR into kx.
 *
 * NOTE(review): both macros are truncated in this listing -- their
 * closing "} while (0)" lines are missing, and every surviving line is
 * a backslash continuation, so no comments are inserted inside them.
 */
99#define BK(kw, kx) do { \
100 kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \
107#define PBK(k0, k1, kw, kx) do { \
108 kw = _mm_unpacklo_epi64(k1, k0); \
109 kx = _mm_xor_si128(k0, k1); \
/*
 * REDUCE_F128(x0, x1, x2, x3): fold a 256-bit carry-less product
 * (held in four 64-bit-lane words x0..x3) back to 128 bits modulo the
 * GF(2^128) field polynomial.  The visible shift counts (right by
 * 1, 2, 7; left by 63, 62, 57) match reduction by
 * x^128 + x^127 + x^126 + x^121 + 1, i.e. the bit-reversed GHASH
 * polynomial -- presumably the standard two-step shift/XOR reduction.
 * NOTE(review): several argument lines of this macro are missing from
 * this damaged listing, so the exact expression nesting (and which
 * words feed which XOR) cannot be verified here; all lines below are
 * backslash continuations, so no comments are inserted inside.
 */
116#define REDUCE_F128(x0, x1, x2, x3) do { \
117 x1 = _mm_xor_si128( \
122 _mm_srli_epi64(x3, 1)), \
124 _mm_srli_epi64(x3, 2), \
125 _mm_srli_epi64(x3, 7)))); \
126 x2 = _mm_xor_si128( \
129 _mm_slli_epi64(x3, 63)), \
131 _mm_slli_epi64(x3, 62), \
132 _mm_slli_epi64(x3, 57))); \
133 x0 = _mm_xor_si128( \
138 _mm_srli_epi64(x2, 1)), \
140 _mm_srli_epi64(x2, 2), \
141 _mm_srli_epi64(x2, 7)))); \
142 x1 = _mm_xor_si128( \
145 _mm_slli_epi64(x2, 63)), \
147 _mm_slli_epi64(x2, 62), \
148 _mm_slli_epi64(x2, 57))); \
/*
 * expand_key_pclmul: precompute successive powers of the hash key H
 * into out->k[] so the bulk loop can absorb PV_BLOCK_STRIDE blocks per
 * field reduction (one key power per block position).
 * NOTE(review): this listing is missing the function's opening brace,
 * the declarations/seeding of h1w/h1x and lastw/lastx, and the
 * per-iteration BK() folding -- comments describe only what is visible.
 */
152BR_TARGET(
"ssse3,pclmul")
154expand_key_pclmul(const
polyval_t *pv, pv_expanded_key_t *out)
157 __m128i lastw, lastx;
158 __m128i t0, t1, t2, t3;
160 h1w = PCLMUL_MEMBER(pv->key.h);
/* Walk key powers from high stride index down to 0; each iteration
 * multiplies the previous power (lastw/lastx) by H (h1w/h1x). */
164 for (
int i = PV_BLOCK_STRIDE - 2; i >= 0; --i) {
/* Karatsuba-style 128x128 carry-less multiply: t1 = hi*hi,
 * t3 = lo*lo, t2 = folded middle product. */
167 t1 = pclmulqdq11(lastw, h1w);
168 t3 = pclmulqdq00(lastw, h1w);
169 t2 = pclmulqdq00(lastx, h1x);
/* Karatsuba correction: middle term = t2 ^ t1 ^ t3. */
170 t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
/* Recombine the three partial products into a 256-bit value
 * spread across t0..t3 (0x0E shuffle swaps 64-bit halves). */
171 t0 = _mm_shuffle_epi32(t1, 0x0E);
172 t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
173 t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
/* Fold back to 128 bits and record the next key power. */
174 REDUCE_F128(t0, t1, t2, t3);
175 out->k[i] = lastw = _mm_unpacklo_epi64(t1, t0);
/*
 * Bulk absorption routine (its name line is missing from this damaged
 * listing): folds PV_BLOCK_STRIDE consecutive 16-byte blocks from
 * `input` into the POLYVAL/GHASH accumulator pv->y, using precomputed
 * key powers from `expanded`.  All per-block partial products are
 * accumulated lazily in t1/t2/t3 and reduced once at the end.
 * NOTE(review): the `pv` parameter, the declarations of aw/ax/hw/hx,
 * the else-branch selecting expanded->k[...] for non-final blocks, and
 * the BK() folding lines are all missing here.
 */
180BR_TARGET(
"ssse3,pclmul")
183 const uint8_t *input,
184 const pv_expanded_key_t *expanded)
186 __m128i t0, t1, t2, t3;
/* Clear the lazily-reduced product accumulators. */
188 t1 = _mm_setzero_si128();
189 t2 = _mm_setzero_si128();
190 t3 = _mm_setzero_si128();
192 for (
int i = 0; i < PV_BLOCK_STRIDE; ++i, input += 16) {
193 __m128i aw = _mm_loadu_si128((
void *)(input));
/* XOR the running accumulator into a block -- presumably guarded by
 * an i == 0 check on a line missing from this listing (Horner form
 * gives the first block the highest key power); TODO confirm. */
197 aw = _mm_xor_si128(aw, PCLMUL_MEMBER(pv->y));
/* The final block pairs with H^1 (the raw key); earlier blocks
 * presumably use expanded->k[i] via a missing else-branch. */
199 if (i == PV_BLOCK_STRIDE - 1) {
200 hw = PCLMUL_MEMBER(pv->key.h);
/* Accumulate this block's three Karatsuba partial products. */
206 t1 = _mm_xor_si128(t1, pclmulqdq11(aw, hw));
207 t3 = _mm_xor_si128(t3, pclmulqdq00(aw, hw));
208 t2 = _mm_xor_si128(t2, pclmulqdq00(ax, hx));
/* After the loop: Karatsuba correction, recombine into 256 bits,
 * then a single field reduction for the whole stride. */
211 t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
212 t0 = _mm_shuffle_epi32(t1, 0x0E);
213 t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
214 t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
216 REDUCE_F128(t0, t1, t2, t3);
217 PCLMUL_MEMBER(pv->y) = _mm_unpacklo_epi64(t1, t0);
/*
 * Finalization-style routine (its name/head lines are missing from this
 * damaged listing): multiplies the accumulator pv->y by the key H one
 * more time, applies SL_256 -- a macro not visible in this chunk,
 * presumably a 256-bit left shift by one bit (the mulX adjustment that
 * POLYVAL's bit order requires; TODO confirm) -- before the final field
 * reduction, then stores the result back into pv->y.
 * NOTE(review): declarations of aw/ax and the BK() folding for h1x are
 * among the missing lines.
 */
222BR_TARGET(
"ssse3,pclmul")
226 __m128i yw, h1w, h1x;
228 h1w = PCLMUL_MEMBER(pv->key.h);
233 __m128i t0, t1, t2, t3;
235 aw = PCLMUL_MEMBER(pv->y);
/* Karatsuba 128x128 carry-less multiply of y by H (same pattern as
 * the key-expansion and bulk routines above). */
238 t1 = pclmulqdq11(aw, h1w);
239 t3 = pclmulqdq00(aw, h1w);
240 t2 = pclmulqdq00(ax, h1x);
241 t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
242 t0 = _mm_shuffle_epi32(t1, 0x0E);
243 t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
244 t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
/* Extra shift step unique to this routine -- see note above. */
246 SL_256(t0, t1, t2, t3);
248 REDUCE_F128(t0, t1, t2, t3);
249 yw = _mm_unpacklo_epi64(t1, t0);
252 PCLMUL_MEMBER(pv->y) = yw;