/* pclmul.c — Tor 0.4.9.2-alpha-dev (extracted source; Doxygen page navigation removed) */
1/*
2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25/*
26 * This is the GHASH implementation that leverages the pclmulqdq opcode
27 * (from the AES-NI instructions).
28 */
29
30#include <wmmintrin.h>
31
#ifndef __GNUC__
/*
 * Compilers without GCC-style extensions (e.g. MSVC) do not understand
 * __attribute__; define it away so the BR_TARGET() uses below expand to
 * nothing.
 */
#define __attribute__(x)
#endif

/*
 * Per-function target attribute: enables the named ISA extensions
 * (SSE/PCLMUL) for just the annotated function on GCC/Clang, so the
 * rest of the file can be compiled for a generic baseline.
 */
#define BR_TARGET(x) __attribute__((target(x)))

#if defined(__GNUC__) && !defined(__clang__)
/*
 * GCC only: additionally set the target ISA for the whole translation
 * unit. NOTE(review): this overlaps with the per-function BR_TARGET()
 * annotations below — presumably kept for older GCC versions; confirm.
 */
 _Pragma("GCC target(\"sse2,ssse3,sse4.1,aes,pclmul\")")
#endif
41
#if 0
/*
 * Test CPU support for PCLMULQDQ.
 *
 * NOTE(review): this whole section is compiled out (#if 0). It is the
 * original BearSSL runtime-detection hook; br_cpuid, br_ghash and
 * BR_TARGETS_X86_UP are not defined in this file. Presumably Tor does
 * its own CPU detection elsewhere — kept for reference against
 * upstream BearSSL.
 */
static inline int
pclmul_supported(void)
{
	/*
	 * Bit mask for features in ECX:
	 *    1   PCLMULQDQ support
	 */
	return br_cpuid(0, 0, 0x00000002, 0);
}

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return pclmul_supported() ? &br_ghash_pclmul : 0;
}

BR_TARGETS_X86_UP
#endif
65/*
66 * Call pclmulqdq. Clang appears to have trouble with the intrinsic, so,
67 * for that compiler, we use inline assembly. Inline assembly is
68 * potentially a bit slower because the compiler does not understand
69 * what the opcode does, and thus cannot optimize instruction
70 * scheduling.
71 *
72 * We use a target of "sse2" only, so that Clang may still handle the
73 * '__m128i' type and allocate SSE2 registers.
74 */
75#ifdef __clang__AND_NOT_WORKING
76 BR_TARGET("sse2")
77static inline __m128i
78pclmulqdq00(__m128i x, __m128i y)
79{
80 __asm__ ("pclmulqdq $0x00, %1, %0" : "+x" (x) : "x" (y));
81 return x;
82}
83BR_TARGET("sse2")
84static inline __m128i
85pclmulqdq11(__m128i x, __m128i y)
86{
87 __asm__ ("pclmulqdq $0x11, %1, %0" : "+x" (x) : "x" (y));
88 return x;
89}
90#else
91#define pclmulqdq00(x, y) _mm_clmulepi64_si128(x, y, 0x00)
92#define pclmulqdq11(x, y) _mm_clmulepi64_si128(x, y, 0x11)
93#endif
94
95/*
96 * From a 128-bit value kw, compute kx as the XOR of the two 64-bit
97 * halves of kw (into the right half of kx; left half is unspecified).
98 */
99#define BK(kw, kx) do { \
100 kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \
101 } while (0)
102
103/*
104 * Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
105 * the XOR of the two values (kx).
106 */
107#define PBK(k0, k1, kw, kx) do { \
108 kw = _mm_unpacklo_epi64(k1, k0); \
109 kx = _mm_xor_si128(k0, k1); \
110 } while (0)
111
112/*
113 * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
114 * result is written in x0..x1.
115 */
116#define REDUCE_F128(x0, x1, x2, x3) do { \
117 x1 = _mm_xor_si128( \
118 x1, \
119 _mm_xor_si128( \
120 _mm_xor_si128( \
121 x3, \
122 _mm_srli_epi64(x3, 1)), \
123 _mm_xor_si128( \
124 _mm_srli_epi64(x3, 2), \
125 _mm_srli_epi64(x3, 7)))); \
126 x2 = _mm_xor_si128( \
127 _mm_xor_si128( \
128 x2, \
129 _mm_slli_epi64(x3, 63)), \
130 _mm_xor_si128( \
131 _mm_slli_epi64(x3, 62), \
132 _mm_slli_epi64(x3, 57))); \
133 x0 = _mm_xor_si128( \
134 x0, \
135 _mm_xor_si128( \
136 _mm_xor_si128( \
137 x2, \
138 _mm_srli_epi64(x2, 1)), \
139 _mm_xor_si128( \
140 _mm_srli_epi64(x2, 2), \
141 _mm_srli_epi64(x2, 7)))); \
142 x1 = _mm_xor_si128( \
143 _mm_xor_si128( \
144 x1, \
145 _mm_slli_epi64(x2, 63)), \
146 _mm_xor_si128( \
147 _mm_slli_epi64(x2, 62), \
148 _mm_slli_epi64(x2, 57))); \
149 } while (0)
150
151
152BR_TARGET("ssse3,pclmul")
153static inline void
154expand_key_pclmul(const polyval_t *pv, pv_expanded_key_t *out)
155{
156 __m128i h1w, h1x;
157 __m128i lastw, lastx;
158 __m128i t0, t1, t2, t3;
159
160 h1w = PCLMUL_MEMBER(pv->key.h);
161 BK(h1w, h1x);
162 lastw = h1w;
163
164 for (int i = PV_BLOCK_STRIDE - 2; i >= 0; --i) {
165 BK(lastw, lastx);
166
167 t1 = pclmulqdq11(lastw, h1w);
168 t3 = pclmulqdq00(lastw, h1w);
169 t2 = pclmulqdq00(lastx, h1x);
170 t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
171 t0 = _mm_shuffle_epi32(t1, 0x0E);
172 t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
173 t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
174 REDUCE_F128(t0, t1, t2, t3);
175 out->k[i] = lastw = _mm_unpacklo_epi64(t1, t0);
176 }
177}
178
179// Add PCLMUL_BLOCK_STRIDE * 16 bytes from input.
180BR_TARGET("ssse3,pclmul")
181static inline void
182pv_add_multiple_pclmul(polyval_t *pv,
183 const uint8_t *input,
184 const pv_expanded_key_t *expanded)
185{
186 __m128i t0, t1, t2, t3;
187
188 t1 = _mm_setzero_si128();
189 t2 = _mm_setzero_si128();
190 t3 = _mm_setzero_si128();
191
192 for (int i = 0; i < PV_BLOCK_STRIDE; ++i, input += 16) {
193 __m128i aw = _mm_loadu_si128((void *)(input));
194 __m128i ax;
195 __m128i hx, hw;
196 if (i == 0) {
197 aw = _mm_xor_si128(aw, PCLMUL_MEMBER(pv->y));
198 }
199 if (i == PV_BLOCK_STRIDE - 1) {
200 hw = PCLMUL_MEMBER(pv->key.h);
201 } else {
202 hw = expanded->k[i];
203 }
204 BK(aw, ax);
205 BK(hw, hx);
206 t1 = _mm_xor_si128(t1, pclmulqdq11(aw, hw));
207 t3 = _mm_xor_si128(t3, pclmulqdq00(aw, hw));
208 t2 = _mm_xor_si128(t2, pclmulqdq00(ax, hx));
209 }
210
211 t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
212 t0 = _mm_shuffle_epi32(t1, 0x0E);
213 t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
214 t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
215
216 REDUCE_F128(t0, t1, t2, t3);
217 PCLMUL_MEMBER(pv->y) = _mm_unpacklo_epi64(t1, t0);
218}
219
220
221/* see bearssl_hash.h */
222BR_TARGET("ssse3,pclmul")
223static inline void
224pv_mul_y_h_pclmul(polyval_t *pv)
225{
226 __m128i yw, h1w, h1x;
227
228 h1w = PCLMUL_MEMBER(pv->key.h);
229 BK(h1w, h1x);
230
231 {
232 __m128i aw, ax;
233 __m128i t0, t1, t2, t3;
234
235 aw = PCLMUL_MEMBER(pv->y);
236 BK(aw, ax);
237
238 t1 = pclmulqdq11(aw, h1w);
239 t3 = pclmulqdq00(aw, h1w);
240 t2 = pclmulqdq00(ax, h1x);
241 t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
242 t0 = _mm_shuffle_epi32(t1, 0x0E);
243 t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
244 t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
245#if 0 // This step is GHASH-only.
246 SL_256(t0, t1, t2, t3);
247#endif
248 REDUCE_F128(t0, t1, t2, t3);
249 yw = _mm_unpacklo_epi64(t1, t0);
250 }
251
252 PCLMUL_MEMBER(pv->y) = yw;
253}