#include <stdint.h>
#include <string.h>
#include <sys/types.h>

#include <immintrin.h>

/*
 * Mode flags accepted by b64encode() and b64decode().
 * They are independent bits and may be combined with bitwise OR.
 */
#define MODE_URL  0x01 /* use the URL-safe alphabet ('-' and '_' replace '+' and '/') */
#define MODE_RAW  0x02 /* no '=' padding: the encoder omits it, the decoder rejects it */
#define MODE_AVX2 0x04 /* allow the AVX2 fast paths */
#define MODE_JSON 0x08 /* decoder only: interpret JSON backslash escapes in the input */

/*
 * Reinterpret the bytes at address `v` as a 32-/64-bit lvalue.
 * NOTE(review): no alignment is enforced by these casts; callers presumably
 * rely on x86 tolerating unaligned scalar accesses - confirm before porting.
 */
#define as_m32v(v)  (*(uint32_t *)(v))
#define as_m64v(v)  (*(uint64_t *)(v))

/* Pointer casts for SIMD stores (writable destinations). */
#define as_m128p(v) ((__m128i *)(v))
#define as_m256p(v) ((__m256i *)(v))

/* Pointer casts for byte and SIMD loads (read-only sources). */
#define as_m8c(v)   ((const uint8_t *)(v))
#define as_m128c(v) ((const __m128i *)(v))
#define as_m256c(v) ((const __m256i *)(v))

/* Force inlining of the small hot-path helpers (GCC/Clang attribute syntax). */
#define always_inline inline __attribute__((always_inline))

/*
 * Byte buffer described by a base pointer, a used length and a capacity.
 * b64encode() and b64decode() append at `buf + len` and add the number of
 * bytes produced to `len`; b64decode() treats `buf + cap` as the end of the
 * writable area.
 */
struct slice_t {
    char  *buf; /* start of the storage */
    size_t len; /* bytes currently in use */
    size_t cap; /* total bytes available at `buf` */
};

/** Exported Functions **/

/**
 * Append the Base64 encoding of `src` to `out` and advance `out->len`.
 *
 * @param out   destination; encoded text is written at `out->buf + out->len`
 * @param src   bytes to encode (`src->buf`, `src->len`)
 * @param mode  bitwise OR of MODE_URL, MODE_RAW and MODE_AVX2
 */
void b64encode(struct slice_t *out, const struct slice_t *src, int mode);

/**
 * Decode `nb` bytes of Base64 text at `src`, appending to `out`.
 *
 * @param out   destination; decoded bytes are written at `out->buf + out->len`
 * @param src   encoded input
 * @param nb    number of input bytes
 * @param mode  bitwise OR of MODE_URL, MODE_RAW, MODE_AVX2 and MODE_JSON
 * @return      number of bytes decoded (also added to `out->len`), or a
 *              negative value when the input is rejected
 */
ssize_t b64decode(struct slice_t *out, const char *src, size_t nb, int mode);

/** Encoder Helper Functions **/

/*
 * Scalar encoder lookup: 6-bit value -> ASCII character.
 * Each literal is exactly 64 characters long, so the arrays hold the alphabet
 * only and no terminating NUL is stored.
 */
static const char TabEncodeCharsetStd[64] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789+/";

/* Same as above, with the URL-safe characters '-' and '_' for 62 and 63. */
static const char TabEncodeCharsetURL[64] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-_";

/*
 * Byte-shuffle control used by encode_avx2().  Within each 128-bit lane the
 * g-th 3-byte input group (bytes 3g, 3g+1, 3g+2) is expanded into one 32-bit
 * word laid out as { 3g+1, 3g, 3g+2, 3g+1 }.  Both lanes use the same pattern.
 */
static const uint8_t VecEncodeShuffles[32] = {
    /* low lane  */
     1,  0,  2,  1,    4,  3,  5,  4,    7,  6,  8,  7,   10,  9, 11, 10,
    /* high lane */
     1,  0,  2,  1,    4,  3,  5,  4,    7,  6,  8,  7,   10,  9, 11, 10,
};

/*
 * Offset table for encode_avx2(), standard alphabet.  The encoder reduces
 * every 6-bit index to a class number and adds the byte stored for that class
 * (modulo 256) to the index to obtain the ASCII character:
 *
 *   class 13     : indices  0..25  -> 'A'..'Z'
 *   class 0      : indices 26..51  -> 'a'..'z'
 *   classes 1-10 : indices 52..61  -> '0'..'9'
 *   class 11     : index   62      -> '+'
 *   class 12     : index   63      -> '/'
 *
 * Classes 14 and 15 are never produced.  The 16 entries are repeated for the
 * upper 128-bit lane.
 */
#define OFS(first_char, first_index) ((uint8_t)((first_char) - (first_index)))
static const uint8_t VecEncodeCharsetStd[32] = {
    /* low lane */
    OFS('a', 26),
    OFS('0', 52), OFS('0', 52), OFS('0', 52), OFS('0', 52), OFS('0', 52),
    OFS('0', 52), OFS('0', 52), OFS('0', 52), OFS('0', 52), OFS('0', 52),
    OFS('+', 62), OFS('/', 63), OFS('A', 0), 0, 0,
    /* high lane */
    OFS('a', 26),
    OFS('0', 52), OFS('0', 52), OFS('0', 52), OFS('0', 52), OFS('0', 52),
    OFS('0', 52), OFS('0', 52), OFS('0', 52), OFS('0', 52), OFS('0', 52),
    OFS('+', 62), OFS('/', 63), OFS('A', 0), 0, 0,
};
#undef OFS

/*
 * Offset table for encode_avx2(), URL-safe alphabet.  Identical to the
 * standard table except for the last two classes:
 *
 *   class 13     : indices  0..25  -> 'A'..'Z'
 *   class 0      : indices 26..51  -> 'a'..'z'
 *   classes 1-10 : indices 52..61  -> '0'..'9'
 *   class 11     : index   62      -> '-'
 *   class 12     : index   63      -> '_'
 *
 * The stored byte is added (modulo 256) to the 6-bit index.  The 16 entries
 * are repeated for the upper 128-bit lane.
 */
#define OFS(first_char, first_index) ((uint8_t)((first_char) - (first_index)))
static const uint8_t VecEncodeCharsetURL[32] = {
    /* low lane */
    OFS('a', 26),
    OFS('0', 52), OFS('0', 52), OFS('0', 52), OFS('0', 52), OFS('0', 52),
    OFS('0', 52), OFS('0', 52), OFS('0', 52), OFS('0', 52), OFS('0', 52),
    OFS('-', 62), OFS('_', 63), OFS('A', 0), 0, 0,
    /* high lane */
    OFS('a', 26),
    OFS('0', 52), OFS('0', 52), OFS('0', 52), OFS('0', 52), OFS('0', 52),
    OFS('0', 52), OFS('0', 52), OFS('0', 52), OFS('0', 52), OFS('0', 52),
    OFS('-', 62), OFS('_', 63), OFS('A', 0), 0, 0,
};
#undef OFS

57static always_inline __m256i encode_avx2(__m128i v0, __m128i v1, const uint8_t *tab) {
58 __m256i vv = _mm256_set_m128i (v1, v0);
59 __m256i sh = _mm256_loadu_si256 (as_m256c(VecEncodeShuffles));
60 __m256i in = _mm256_shuffle_epi8 (vv, sh);
61 __m256i t0 = _mm256_and_si256 (in, _mm256_set1_epi32(0x0fc0fc00));
62 __m256i t1 = _mm256_mulhi_epu16 (t0, _mm256_set1_epi32(0x04000040));
63 __m256i t2 = _mm256_and_si256 (in, _mm256_set1_epi32(0x003f03f0));
64 __m256i t3 = _mm256_mullo_epi16 (t2, _mm256_set1_epi32(0x01000010));
65 __m256i vi = _mm256_or_si256 (t1, t3);
66 __m256i s0 = _mm256_cmpgt_epi8 (_mm256_set1_epi8(26), vi);
67 __m256i s1 = _mm256_and_si256 (_mm256_set1_epi8(13), s0);
68 __m256i s2 = _mm256_loadu_si256 (as_m256c(tab));
69 __m256i r0 = _mm256_subs_epu8 (vi, _mm256_set1_epi8(51));
70 __m256i r1 = _mm256_or_si256 (r0, s1);
71 __m256i r2 = _mm256_shuffle_epi8 (s2, r1);
72 __m256i r3 = _mm256_add_epi8 (vi, r2);
73 return r3;
74}
75
76/** Function Implementations **/
77
78void b64encode(struct slice_t *out, const struct slice_t *src, int mode) {
79 char * ob = out->buf + out->len;
80 char * op = out->buf + out->len;
81 const char * ip = src->buf;
82 const char * ie = src->buf + src->len;
83 const char * st = TabEncodeCharsetStd;
84 const uint8_t * vt = VecEncodeCharsetStd;
85
86 /* check for empty string */
87 if (src->len == 0) {
88 return;
89 }
90
91 /* check for URL encoding */
92 if (mode & MODE_URL) {
93 st = TabEncodeCharsetURL;
94 vt = VecEncodeCharsetURL;
95 }
96
97 /* SIMD 24 bytes loop, but the SIMD instruction will load 4 bytes
98 * past the end, so it's safe only if there are 28 bytes or more left */
99 while ((ip <= ie - 28) && (mode & MODE_AVX2) != 0) {
100 __m128i v0 = _mm_loadu_si128 (as_m128c(ip));
101 __m128i v1 = _mm_loadu_si128 (as_m128c(ip + 12));
102 __m256i vv = encode_avx2 (v0, v1, vt);
103
104 /* store the result, and advance buffer pointers */
105 _mm256_storeu_si256(as_m256p(op), vv);
106 op += 32;
107 ip += 24;
108 }
109
110 /* can do one more 24 bytes round, but needs special handling */
111 if ((ip <= ie - 24) && (mode & MODE_AVX2) != 0) {
112 __m128i v0 = _mm_loadu_si128 (as_m128c(ip));
113 __m128i v1 = _mm_loadu_si128 (as_m128c(ip + 8));
114 __m128i v2 = _mm_srli_si128 (v1, 4);
115 __m256i vv = encode_avx2 (v0, v2, vt);
116
117 /* store the result, and advance buffer pointers */
118 _mm256_storeu_si256(as_m256p(op), vv);
119 op += 32;
120 ip += 24;
121 }
122
123 /* no more bytes */
124 if (ip == ie) {
125 out->len += op - ob;
126 return;
127 }
128
129 /* handle the remaining bytes with scalar code (with 4 bytes load) */
130 while (ip <= ie - 4) {
131 uint32_t v0 = __builtin_bswap32(*(const uint32_t *)ip);
132 uint8_t v1 = (v0 >> 26) & 0x3f;
133 uint8_t v2 = (v0 >> 20) & 0x3f;
134 uint8_t v3 = (v0 >> 14) & 0x3f;
135 uint8_t v4 = (v0 >> 8) & 0x3f;
136
137 /* encode the characters, and move to next block */
138 ip += 3;
139 *op++ = st[v1];
140 *op++ = st[v2];
141 *op++ = st[v3];
142 *op++ = st[v4];
143 }
144
145 /* load the last bytes */
146 size_t dp = ie - ip;
147 uint32_t v0 = (uint32_t)(uint8_t)ip[0] << 16;
148
149#define B2 v0 |= (uint32_t)(uint8_t)ip[2]
150#define B1 v0 |= (uint32_t)(uint8_t)ip[1] << 8
151
152#define R4 *op++ = st[(v0 >> 0) & 0x3f]
153#define R3 *op++ = st[(v0 >> 6) & 0x3f]
154#define R2 *op++ = st[(v0 >> 12) & 0x3f]
155#define R1 *op++ = st[(v0 >> 18) & 0x3f]
156
157#define NB { out->len += op - ob; }
158#define PD { if ((mode & MODE_RAW) == 0) { *op++ = '='; } }
159
160 /* encode the last few bytes */
161 switch (dp) {
162 case 3 : B2; B1; R1; R2; R3; R4; NB; break;
163 case 2 : B1; R1; R2; R3; PD; NB; break;
164 case 1 : R1; R2; PD; PD; NB; break;
165 default : NB; break;
166 }
167
168#undef PD
169#undef NB
170#undef R1
171#undef R2
172#undef R3
173#undef R4
174#undef B1
175#undef B2
176}
177
/** Decoder Helper Functions **/

/*
 * Byte-shuffle control used by decode_avx2() to compact its result.  In each
 * 128-bit lane, the three low bytes of every 32-bit word are emitted in
 * reversed order (2,1,0 / 6,5,4 / ...), giving 12 contiguous bytes; an index
 * of 128 has its top bit set, which makes the shuffle write zero.
 */
static const uint8_t VecPacking[32] = {
    /* low lane  */
      2,   1,   0,     6,   5,   4,    10,   9,   8,    14,  13,  12,
    128, 128, 128, 128,
    /* high lane */
      2,   1,   0,     6,   5,   4,    10,   9,   8,    14,  13,  12,
    128, 128, 128, 128,
};

/*
 * High-nibble -> single-bit table used by decode_avx2() for validation:
 * entry h is (1 << h) for h = 0..7 and 0 for h = 8..15, so any byte with its
 * top bit set can never match a validity mask.  Repeated for both lanes.
 */
static const uint8_t VecDecodeBits[32] = {
    /* low lane  */
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    /* high lane */
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
};

/*
 * Tables consumed by decode_avx2() for the standard alphabet: four 32-byte
 * vectors, each a 16-byte pattern repeated for both lanes.
 *
 *   [  0.. 31] shift by HIGH nibble: value added (mod 256) to the character
 *              so that its low 6 bits become the Base64 index
 *   [ 32.. 63] validity mask by LOW nibble: bit h is set when the character
 *              with that low nibble and high nibble h is in the alphabet
 *   [ 64.. 95] the one character whose shift differs from the rest of its
 *              high-nibble row ('/')
 *   [ 96..127] the shift used for that character instead
 */
static const uint8_t VecDecodeTableStd[128] = {
    /* shift by high nibble: 2 -> '+', 3 -> digits, 4/5 -> upper, 6/7 -> lower */
    0x00, 0x00, 0x13, 0x04, 0xbf, 0xbf, 0xb9, 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x13, 0x04, 0xbf, 0xbf, 0xb9, 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    /* validity mask by low nibble */
    0xa8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf0, 0x54, 0x50, 0x50, 0x50, 0x54,
    0xa8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf0, 0x54, 0x50, 0x50, 0x50, 0x54,
    /* special character: '/' */
    0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
    0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
    /* shift for the special character: 0x2f + 0x10 = 0x3f (63) */
    0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
    0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
};

/*
 * Tables consumed by decode_avx2() for the URL-safe alphabet: four 32-byte
 * vectors, each a 16-byte pattern repeated for both lanes.
 *
 *   [  0.. 31] shift by HIGH nibble: value added (mod 256) to the character
 *              so that its low 6 bits become the Base64 index
 *   [ 32.. 63] validity mask by LOW nibble: bit h is set when the character
 *              with that low nibble and high nibble h is in the alphabet
 *   [ 64.. 95] the one character whose shift differs from the rest of its
 *              high-nibble row ('_')
 *   [ 96..127] the shift used for that character instead
 */
static const uint8_t VecDecodeTableURL[128] = {
    /* shift by high nibble: 2 -> '-', 3 -> digits, 4/5 -> upper, 6/7 -> lower */
    0x00, 0x00, 0x11, 0x04, 0xbf, 0xbf, 0xb9, 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x11, 0x04, 0xbf, 0xbf, 0xb9, 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    /* validity mask by low nibble */
    0xa8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf0, 0x50, 0x50, 0x54, 0x50, 0x70,
    0xa8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf0, 0x50, 0x50, 0x54, 0x50, 0x70,
    /* special character: '_' */
    0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f,
    0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f,
    /* shift for the special character: 0x5f + 0xe0 = 0x13f, low 6 bits 0x3f (63) */
    0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
    0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
};

/*
 * Scalar decoder lookup for the standard alphabet: input byte -> 6-bit value,
 * or 0xff (XX) for every byte that is not an alphabet character.  '=' is not
 * in the table; padding is handled by decode_block().
 */
#define XX 0xff
static const uint8_t VecDecodeCharsetStd[256] = {
    /* 0x00 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
    /* 0x10 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
    /* 0x20 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, 62, XX, XX, XX, 63,
    /* 0x30 */ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, XX, XX, XX, XX, XX, XX,
    /* 0x40 */ XX,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    /* 0x50 */ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, XX, XX, XX, XX, XX,
    /* 0x60 */ XX, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    /* 0x70 */ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, XX, XX, XX, XX, XX,
    /* 0x80 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
    /* 0x90 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
    /* 0xa0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
    /* 0xb0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
    /* 0xc0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
    /* 0xd0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
    /* 0xe0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
    /* 0xf0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
};
#undef XX

/*
 * Scalar decoder lookup for the URL-safe alphabet: input byte -> 6-bit value,
 * or 0xff (XX) for every byte that is not an alphabet character.  Differs from
 * the standard table only in that '-' maps to 62 and '_' to 63, while '+' and
 * '/' are rejected.
 */
#define XX 0xff
static const uint8_t VecDecodeCharsetURL[256] = {
    /* 0x00 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
    /* 0x10 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
    /* 0x20 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, 62, XX, XX,
    /* 0x30 */ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, XX, XX, XX, XX, XX, XX,
    /* 0x40 */ XX,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    /* 0x50 */ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, XX, XX, XX, XX, 63,
    /* 0x60 */ XX, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    /* 0x70 */ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, XX, XX, XX, XX, XX,
    /* 0x80 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
    /* 0x90 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
    /* 0xa0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
    /* 0xb0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
    /* 0xc0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
    /* 0xd0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
    /* 0xe0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
    /* 0xf0 */ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX,
};
#undef XX

250static always_inline void memcopy_24(char *dp, const uint8_t *sp) {
251 *(uint64_t *)(dp + 0) = *(const uint64_t *)(sp + 0);
252 *(uint64_t *)(dp + 8) = *(const uint64_t *)(sp + 8);
253 *(uint64_t *)(dp + 16) = *(const uint64_t *)(sp + 16);
254}
255
256static always_inline __m256i decode_avx2(__m256i v0, int *pos, const uint8_t *tab) {
257 __m256i v1 = _mm256_srli_epi32 (v0, 4);
258 __m256i vl = _mm256_and_si256 (v0, _mm256_set1_epi8(0x0f));
259 __m256i vh = _mm256_and_si256 (v1, _mm256_set1_epi8(0x0f));
260 __m256i st = _mm256_loadu_si256 (as_m256c(tab));
261 __m256i mt = _mm256_loadu_si256 (as_m256c(tab + 32));
262 __m256i et = _mm256_loadu_si256 (as_m256c(tab + 64));
263 __m256i rt = _mm256_loadu_si256 (as_m256c(tab + 96));
264 __m256i pt = _mm256_loadu_si256 (as_m256c(VecPacking));
265 __m256i bt = _mm256_loadu_si256 (as_m256c(VecDecodeBits));
266 __m256i sh = _mm256_shuffle_epi8 (st, vh);
267 __m256i eq = _mm256_cmpeq_epi8 (v0, et);
268 __m256i sv = _mm256_blendv_epi8 (sh, rt, eq);
269 __m256i bm = _mm256_shuffle_epi8 (mt, vl);
270 __m256i bv = _mm256_shuffle_epi8 (bt, vh);
271 __m256i mr = _mm256_and_si256 (bm, bv);
272 __m256i nm = _mm256_cmpeq_epi8 (mr, _mm256_setzero_si256());
273 __m256i sr = _mm256_add_epi8 (v0, sv);
274 __m256i r0 = _mm256_and_si256 (sr, _mm256_set1_epi8(0x3f));
275 __m256i r1 = _mm256_maddubs_epi16 (r0, _mm256_set1_epi32(0x01400140));
276 __m256i r2 = _mm256_madd_epi16 (r1, _mm256_set1_epi32(0x00011000));
277 __m256i r3 = _mm256_shuffle_epi8 (r2, pt);
278 __m256i r4 = _mm256_permutevar8x32_epi32 (r3, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, 3, 7));
279 int64_t mp = _mm256_movemask_epi8 (nm);
280 int32_t np = __builtin_ctzll (mp | 0xffffffff00000000);
281 return (*pos = np), r4;
282}
283
284
285#define ALL_01h (~0ul / 255)
286#define ALL_7fh (ALL_01h * 127)
287#define ALL_80h (ALL_01h * 128)
288
289static always_inline uint32_t hasless(uint32_t x, uint8_t n) {
290 return (x - ALL_01h * n) & ~x & ALL_80h;
291}
292
293static always_inline uint32_t hasmore(uint32_t x, uint8_t n) {
294 return (x + ALL_01h * (127 - n) | x) & ALL_80h;
295}
296
297static always_inline uint32_t hasbetween(uint32_t x, uint8_t m, uint8_t n) {
298 return (ALL_01h * (127 + n) - (x & ALL_7fh) & ~x & (x & ALL_7fh) + ALL_01h * (127 - m)) & ALL_80h;
299}
300
301#undef ALL_01h
302#undef ALL_7fh
303#undef ALL_80h
304
305static always_inline char unhex16_is(const uint8_t *s) {
306 uint32_t v = *(uint32_t *)s;
307 return !(hasless(v, '0') || hasmore(v, 'f') || hasbetween(v, '9', 'A') || hasbetween(v, 'F', 'a'));
308}
309
310static always_inline uint32_t unhex16_fast(const uint8_t *s) {
311 uint32_t a = __builtin_bswap32(*(uint32_t *)s);
312 uint32_t b = 9 * ((~a & 0x10101010) >> 4) + (a & 0x0f0f0f0f);
313 uint32_t c = (b >> 4) | b;
314 uint32_t d = ((c >> 8) & 0xff00) | (c & 0x00ff);
315 return d;
316}
317
318static always_inline uint8_t unescape_asc(const uint8_t * ie, const uint8_t ** ipp) {
319 const uint8_t * ee = (*ipp) + 1;
320 uint32_t ch = 0xff;
321 /* check eof */
322 if (ee > ie) {
323 return 0xff;
324 }
325 switch (ee[-1]) {
326 case 'r': ch = '\r'; break;
327 case 'n': ch = '\n'; break;
328 case '/': ch = '/'; break;
329 case 'u': /* neee more 4 bytes */
330 if (ie - ee >= 4 && unhex16_is(ee)) {
331 ch = unhex16_fast(ee);
332 /* if not ascii, as 0xff */
333 ch = ch < 128 ? ch : 0xff;
334 ee += 4;
335 }
336 break;
337 }
338 *ipp = ee;
339 return ch;
340}
341
342/* Return 0 if success, otherwise return the error position + 1 */
343static always_inline int64_t decode_block(
344 const uint8_t * ie,
345 const uint8_t ** ipp,
346 char ** opp,
347 const uint8_t * tab,
348 int mode
349) {
350 int nb = 0;
351 uint32_t v0 = 0;
352
353 /* buffer pointers */
354 char * op = *opp;
355 const uint8_t * ip = *ipp;
356 uint8_t id = 0;
357 uint8_t ch = 0;
358 int pad = 0;
359
360#define may_unescape() { if (ch == '\\' && (mode & MODE_JSON)) ch = unescape_asc(ie, &ip); }
361#define skip_newlines() { if (ch == '\r' || ch == '\n') continue; }
362
363 /* load up to 4 characters */
364 while (ip < ie && nb < 4) {
365 ch = *ip++;
366 may_unescape();
367 skip_newlines();
368
369 /* lookup the index, and check for invalid characters */
370 if ((id = tab[ch]) == 0xff) {
371 if ((mode & MODE_RAW) || ch != '=' || nb < 2) goto error;
372 pad++; goto tail;
373 }
374
375 /* decode the character */
376 v0 = (v0 << 6) | id;
377 nb++;
378 }
379
380 if (nb == 0) {
381 /* update the pointers */
382 *ipp = ip;
383 return 0;
384 }
385
386 /* check eof, MODE_STD need paddings */
387 if (ip >= ie && nb != 4) {
388 if (!(mode & MODE_RAW) || nb == 1) goto error;
389 }
390
391decode:
392 v0 <<= 6 * (4 - nb);
393 /* ends with eof or 4 characters, decode into output */
394 switch (nb) {
395 case 4: op[2] = (v0 >> 0) & 0xff;
396 case 3: op[1] = (v0 >> 8) & 0xff;
397 case 2: op[0] = (v0 >> 16) & 0xff;
398 }
399
400 /* update the pointers */
401 *ipp = ip;
402 *opp = op + nb - 1;
403 return 0;
404
405tail:
406 /* loop for more paddings */
407 while (ip < ie) {
408 ch = *ip++;
409 may_unescape();
410 skip_newlines();
411 if (ch != '=') goto error;
412 if (++pad + nb > 4) goto error;
413 }
414 goto decode;
415#undef may_unescape
416#undef skip_newlines
417
418error:
419 /* update eof error position */
420 if (ip == ie) ip++;
421 return ip - *ipp;
422
423
424}
425
426ssize_t b64decode(struct slice_t *out, const char *src, size_t nb, int mode) {
427 int ep;
428 __m256i vv;
429 int64_t dv;
430 uint8_t buf[32] = {0};
431
432 /* check for empty input */
433 if (nb == 0) {
434 return 0;
435 }
436
437 /* output buffer */
438 char *ob = out->buf + out->len;
439 char *op = out->buf + out->len;
440 char *oe = out->buf + out->cap;
441
442 /* input buffer */
443 const uint8_t *dt = VecDecodeTableStd;
444 const uint8_t *st = VecDecodeCharsetStd;
445 const uint8_t *ib = (const uint8_t *)src;
446 const uint8_t *ip = (const uint8_t *)src;
447 const uint8_t *ie = (const uint8_t *)src + nb;
448
449 /* check for URL encoding */
450 if (mode & MODE_URL) {
451 dt = VecDecodeTableURL;
452 st = VecDecodeCharsetURL;
453 }
454
455 /* decode every 32 bytes, the final round should be handled separately, because the
456 * SIMD instruction performs 32-byte store, and it might store past the end of the
457 * output buffer */
458 if ((mode & MODE_AVX2) == 0) {
459 goto scalar;
460 }
461 while ((ip <= ie - 32) && (op <= oe - 32)) {
462 vv = _mm256_loadu_si256(as_m256c(ip));
463 vv = decode_avx2(vv, &ep, dt);
464
465 /* check for invalid characters (or '=' paddings) */
466 if (ep < 32) {
467 if ((dv = decode_block(ie, &ip, &op, st, mode)) != 0) {
468 return ib - ip - dv;
469 } else {
470 continue;
471 }
472 }
473
474 _mm256_storeu_si256(as_m256p(op), vv);
475
476 /* move to next block */
477 ip += 32;
478 op += 24;
479 }
480
481scalar:
482 /* handle the remaining bytes with scalar code (8 byte loop) */
483 while (ip <= ie - 8 && op <= oe - 8) {
484 uint8_t v0 = st[ip[0]];
485 uint8_t v1 = st[ip[1]];
486 uint8_t v2 = st[ip[2]];
487 uint8_t v3 = st[ip[3]];
488 uint8_t v4 = st[ip[4]];
489 uint8_t v5 = st[ip[5]];
490 uint8_t v6 = st[ip[6]];
491 uint8_t v7 = st[ip[7]];
492
493 /* check for invalid bytes */
494 if ((v0 | v1 | v2 | v3 | v4 | v5 | v6 | v7) == 0xff) {
495 if ((dv = decode_block(ie, &ip, &op, st, mode)) != 0) {
496 return ib - ip - dv;
497 } else {
498 continue;
499 }
500 }
501
502 /* construct the characters */
503 uint64_t vv = __builtin_bswap64(
504 ((uint64_t)v0 << 58) |
505 ((uint64_t)v1 << 52) |
506 ((uint64_t)v2 << 46) |
507 ((uint64_t)v3 << 40) |
508 ((uint64_t)v4 << 34) |
509 ((uint64_t)v5 << 28) |
510 ((uint64_t)v6 << 22) |
511 ((uint64_t)v7 << 16)
512 );
513
514 /* store the result, and move to next block */
515 as_m64v(op) = vv;
516 ip += 8;
517 op += 6;
518 }
519
520 /* handle the remaining bytes with scalar code (4 byte loop) */
521 while (ip <= ie - 4 && op <= oe - 4) {
522 uint8_t v0 = st[ip[0]];
523 uint8_t v1 = st[ip[1]];
524 uint8_t v2 = st[ip[2]];
525 uint8_t v3 = st[ip[3]];
526
527 /* check for invalid bytes */
528 if ((v0 | v1 | v2 | v3) == 0xff) {
529 if ((dv = decode_block(ie, &ip, &op, st, mode)) != 0) {
530 return ib - ip - dv;
531 } else {
532 continue;
533 }
534 }
535
536 /* construct the characters */
537 uint32_t vv = __builtin_bswap32(
538 ((uint32_t)v0 << 26) |
539 ((uint32_t)v1 << 20) |
540 ((uint32_t)v2 << 14) |
541 ((uint32_t)v3 << 8)
542 );
543
544 /* store the result, and move to next block */
545 as_m32v(op) = vv;
546 ip += 4;
547 op += 3;
548 }
549
550 /* decode the last few bytes */
551 while (ip < ie) {
552 if ((dv = decode_block(ie, &ip, &op, st, mode)) != 0) {
553 return ib - ip - dv;
554 }
555 }
556
557 /* update the result length */
558 out->len += op - ob;
559 return op - ob;
560}
View as plain text