1/*
2 * Copyright 2021 ByteDance Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "native.h"
18#include "utils.h"
19#include <stdint.h>
20
21/** String Quoting **/
/* Size in bytes of the `s` buffer of every quoted_t entry. quote() also uses
 * it as the worst-case output expansion per input byte when it decides
 * whether the bounds-check-free fast path may be taken. */
#define MAX_ESCAPED_BYTES 8

/* One entry of a per-byte escape table. */
typedef struct {
    /* Length of the escape sequence in `s`; 0 means the byte has no entry
     * and is copied through unchanged by the users of these tables. */
    const long n;
    /* The escape sequence itself (initialized from string literals). */
    const char s[MAX_ESCAPED_BYTES];
} quoted_t;
27
/*
 * Escape table used by quote() when F_DBLUNQ is not set, indexed by byte value.
 * Only the control characters 0x00-0x1f, '"' and '\\' have entries; every
 * other slot is zero-initialized, i.e. `n == 0`.
 * Tab, newline and carriage return use their two-character short forms; the
 * remaining control characters (including backspace and form feed) use the
 * six-character \u00XX form.
 */
static const quoted_t _SingleQuoteTab[256] = {
    ['\x00'] = { .n = 6, .s = "\\u0000" },
    ['\x01'] = { .n = 6, .s = "\\u0001" },
    ['\x02'] = { .n = 6, .s = "\\u0002" },
    ['\x03'] = { .n = 6, .s = "\\u0003" },
    ['\x04'] = { .n = 6, .s = "\\u0004" },
    ['\x05'] = { .n = 6, .s = "\\u0005" },
    ['\x06'] = { .n = 6, .s = "\\u0006" },
    ['\x07'] = { .n = 6, .s = "\\u0007" },
    ['\b'  ] = { .n = 6, .s = "\\u0008" },
    ['\t'  ] = { .n = 2, .s = "\\t"     },
    ['\n'  ] = { .n = 2, .s = "\\n"     },
    ['\x0b'] = { .n = 6, .s = "\\u000b" },
    ['\f'  ] = { .n = 6, .s = "\\u000c" },
    ['\r'  ] = { .n = 2, .s = "\\r"     },
    ['\x0e'] = { .n = 6, .s = "\\u000e" },
    ['\x0f'] = { .n = 6, .s = "\\u000f" },
    ['\x10'] = { .n = 6, .s = "\\u0010" },
    ['\x11'] = { .n = 6, .s = "\\u0011" },
    ['\x12'] = { .n = 6, .s = "\\u0012" },
    ['\x13'] = { .n = 6, .s = "\\u0013" },
    ['\x14'] = { .n = 6, .s = "\\u0014" },
    ['\x15'] = { .n = 6, .s = "\\u0015" },
    ['\x16'] = { .n = 6, .s = "\\u0016" },
    ['\x17'] = { .n = 6, .s = "\\u0017" },
    ['\x18'] = { .n = 6, .s = "\\u0018" },
    ['\x19'] = { .n = 6, .s = "\\u0019" },
    ['\x1a'] = { .n = 6, .s = "\\u001a" },
    ['\x1b'] = { .n = 6, .s = "\\u001b" },
    ['\x1c'] = { .n = 6, .s = "\\u001c" },
    ['\x1d'] = { .n = 6, .s = "\\u001d" },
    ['\x1e'] = { .n = 6, .s = "\\u001e" },
    ['\x1f'] = { .n = 6, .s = "\\u001f" },
    ['"'   ] = { .n = 2, .s = "\\\""    },
    ['\\'  ] = { .n = 2, .s = "\\\\"    },
};
64
/*
 * Escape table used by quote() when F_DBLUNQ is set, indexed by byte value.
 * It covers the same bytes as _SingleQuoteTab, but each escape sequence is
 * itself escaped once more: the leading backslash is doubled, and '"' / '\\'
 * become the four-character sequences \\\" and \\\\.
 * The longest entries are 7 characters, which with the literal's NUL
 * terminator exactly fills the MAX_ESCAPED_BYTES-sized `s` buffer.
 */
static const quoted_t _DoubleQuoteTab[256] = {
    ['\x00'] = { .n = 7, .s = "\\\\u0000" },
    ['\x01'] = { .n = 7, .s = "\\\\u0001" },
    ['\x02'] = { .n = 7, .s = "\\\\u0002" },
    ['\x03'] = { .n = 7, .s = "\\\\u0003" },
    ['\x04'] = { .n = 7, .s = "\\\\u0004" },
    ['\x05'] = { .n = 7, .s = "\\\\u0005" },
    ['\x06'] = { .n = 7, .s = "\\\\u0006" },
    ['\x07'] = { .n = 7, .s = "\\\\u0007" },
    ['\b'  ] = { .n = 7, .s = "\\\\u0008" },
    ['\t'  ] = { .n = 3, .s = "\\\\t"     },
    ['\n'  ] = { .n = 3, .s = "\\\\n"     },
    ['\x0b'] = { .n = 7, .s = "\\\\u000b" },
    ['\f'  ] = { .n = 7, .s = "\\\\u000c" },
    ['\r'  ] = { .n = 3, .s = "\\\\r"     },
    ['\x0e'] = { .n = 7, .s = "\\\\u000e" },
    ['\x0f'] = { .n = 7, .s = "\\\\u000f" },
    ['\x10'] = { .n = 7, .s = "\\\\u0010" },
    ['\x11'] = { .n = 7, .s = "\\\\u0011" },
    ['\x12'] = { .n = 7, .s = "\\\\u0012" },
    ['\x13'] = { .n = 7, .s = "\\\\u0013" },
    ['\x14'] = { .n = 7, .s = "\\\\u0014" },
    ['\x15'] = { .n = 7, .s = "\\\\u0015" },
    ['\x16'] = { .n = 7, .s = "\\\\u0016" },
    ['\x17'] = { .n = 7, .s = "\\\\u0017" },
    ['\x18'] = { .n = 7, .s = "\\\\u0018" },
    ['\x19'] = { .n = 7, .s = "\\\\u0019" },
    ['\x1a'] = { .n = 7, .s = "\\\\u001a" },
    ['\x1b'] = { .n = 7, .s = "\\\\u001b" },
    ['\x1c'] = { .n = 7, .s = "\\\\u001c" },
    ['\x1d'] = { .n = 7, .s = "\\\\u001d" },
    ['\x1e'] = { .n = 7, .s = "\\\\u001e" },
    ['\x1f'] = { .n = 7, .s = "\\\\u001f" },
    ['"'   ] = { .n = 4, .s = "\\\\\\\""  },
    ['\\'  ] = { .n = 4, .s = "\\\\\\\\"  },
};
101
/*
 * Escape table used by html_escape(), indexed by byte value.
 * '<', '>' and '&' map directly to their \u00XX forms.
 * U+2028 and U+2029 are multi-byte in UTF-8, so they are keyed by their
 * final byte (0xa8 / 0xa9); html_escape() only performs that lookup after it
 * has matched the 0xe2 0x80 prefix itself.
 */
static const quoted_t _HtmlQuoteTab[256] = {
    ['<'] = { .n = 6, .s = "\\u003c" },
    ['>'] = { .n = 6, .s = "\\u003e" },
    ['&'] = { .n = 6, .s = "\\u0026" },
    // \u2028 and \u2029 are encoded as [E2 80 A8] and [E2 80 A9]
    // the lead byte 0xe2 has no escape of its own (explicit empty entry)
    [0xe2] = { .n = 0, .s = {0} },
    [0xa8] = { .n = 6, .s = "\\u2028" },
    [0xa9] = { .n = 6, .s = "\\u2029" },
};
111
112static inline __m128i _mm_find_quote(__m128i vv) {
113 __m128i e1 = _mm_cmpgt_epi8 (vv, _mm_set1_epi8(-1));
114 __m128i e2 = _mm_cmpgt_epi8 (vv, _mm_set1_epi8(31));
115 __m128i e3 = _mm_cmpeq_epi8 (vv, _mm_set1_epi8('"'));
116 __m128i e4 = _mm_cmpeq_epi8 (vv, _mm_set1_epi8('\\'));
117 __m128i r1 = _mm_andnot_si128 (e2, e1);
118 __m128i r2 = _mm_or_si128 (e3, e4);
119 __m128i rv = _mm_or_si128 (r1, r2);
120 return rv;
121}
122
#if USE_AVX2
/*
 * 32-byte variant of _mm_find_quote: per-byte mask (0xff / 0x00) of control
 * characters 0x00-0x1f, '"' and '\\' in `vv`.
 */
static inline __m256i _mm256_find_quote(__m256i vv) {
    /* signed compares: bytes >= 0x80 are negative and fail the "> -1" test */
    const __m256i non_negative = _mm256_cmpgt_epi8(vv, _mm256_set1_epi8(-1));
    const __m256i above_ctrl   = _mm256_cmpgt_epi8(vv, _mm256_set1_epi8(31));
    const __m256i is_dquote    = _mm256_cmpeq_epi8(vv, _mm256_set1_epi8('"'));
    const __m256i is_bslash    = _mm256_cmpeq_epi8(vv, _mm256_set1_epi8('\\'));

    /* 0 <= byte <= 31 */
    const __m256i is_ctrl = _mm256_andnot_si256(above_ctrl, non_negative);

    return _mm256_or_si256(is_ctrl, _mm256_or_si256(is_dquote, is_bslash));
}
#endif
135
136static inline ssize_t memcchr_quote(const char *sp, ssize_t nb, char *dp, ssize_t dn) {
137 uint32_t mm;
138 const char * ss = sp;
139
140#if USE_AVX2
141 /* 32-byte loop, full store */
142 while (nb >= 32 && dn >= 32) {
143 __m256i vv = _mm256_loadu_si256 ((const void *)sp);
144 __m256i rv = _mm256_find_quote (vv);
145 _mm256_storeu_si256 ((void *)dp, vv);
146
147 /* check for matches */
148 if ((mm = _mm256_movemask_epi8(rv)) != 0) {
149 return sp - ss + __builtin_ctz(mm);
150 }
151
152 /* move to next block */
153 sp += 32;
154 dp += 32;
155 nb -= 32;
156 dn -= 32;
157 }
158
159 /* 32-byte test, partial store */
160 if (nb >= 32) {
161 __m256i vv = _mm256_loadu_si256 ((const void *)sp);
162 __m256i rv = _mm256_find_quote (vv);
163 uint32_t mv = _mm256_movemask_epi8 (rv);
164 uint32_t fv = __builtin_ctzll ((uint64_t)mv | 0x0100000000);
165
166 /* copy at most `dn` characters */
167 if (fv <= dn) {
168 memcpy_p32(dp, sp, fv);
169 return sp - ss + fv;
170 } else {
171 memcpy_p32(dp, sp, dn);
172 return -(sp - ss + dn) - 1;
173 }
174 }
175
176 /* clear upper half to avoid AVX-SSE transition penalty */
177 _mm256_zeroupper();
178#endif
179
180 /* 16-byte loop, full store */
181 while (nb >= 16 && dn >= 16) {
182 __m128i vv = _mm_loadu_si128 ((const void *)sp);
183 __m128i rv = _mm_find_quote (vv);
184 _mm_storeu_si128 ((void *)dp, vv);
185
186 /* check for matches */
187 if ((mm = _mm_movemask_epi8(rv)) != 0) {
188 return sp - ss + __builtin_ctz(mm);
189 }
190
191 /* move to next block */
192 sp += 16;
193 dp += 16;
194 nb -= 16;
195 dn -= 16;
196 }
197
198 /* 16-byte test, partial store */
199 if (nb >= 16) {
200 __m128i vv = _mm_loadu_si128 ((const void *)sp);
201 __m128i rv = _mm_find_quote (vv);
202 uint32_t mv = _mm_movemask_epi8 (rv);
203 uint32_t fv = __builtin_ctz (mv | 0x010000);
204
205 /* copy at most `dn` characters */
206 if (fv <= dn) {
207 memcpy_p16(dp, sp, fv);
208 return sp - ss + fv;
209 } else {
210 memcpy_p16(dp, sp, dn);
211 return -(sp - ss + dn) - 1;
212 }
213 }
214
215 /* handle the remaining bytes with scalar code */
216 while (nb > 0 && dn > 0) {
217 if (_SingleQuoteTab[*(uint8_t *)sp].n) {
218 return sp - ss;
219 } else {
220 dn--, nb--;
221 *dp++ = *sp++;
222 }
223 }
224
225 /* check for dest buffer */
226 if (nb == 0) {
227 return sp - ss;
228 } else {
229 return -(sp - ss) - 1;
230 }
231}
232
/*
 * Per-byte flag table: 1 for every byte that must be escaped when quoting
 * (control characters 0x00-0x1f, '"' at 0x22 and '\\' at 0x5c), 0 otherwise.
 * Used by the scalar paths of memcchr_quote_unsafe().
 */
static const bool _EscTab[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00-0x0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10-0x1F
    //    '"' (0x22)
    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20-0x2F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30-0x3F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40-0x4F
    //                                  '\\' (0x5c)
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, // 0x50-0x5F
    // 0x60-0xFF are zeroes
};
244
245static inline uint8_t escape_mask4(const char *sp) {
246 return _EscTab[*(uint8_t *)(sp)] | (_EscTab[*(uint8_t *)(sp + 1)] << 1) | (_EscTab[*(uint8_t *)(sp + 2)] << 2) | (_EscTab[*(uint8_t *)(sp + 3)] << 3);
247}
248
/*
 * Quote `nb` bytes from `sp` into `dp` WITHOUT any destination bounds checks,
 * escaping bytes via `tab`, and return the number of bytes written.
 *
 * The function stores whole 32/16/8/4-byte words to `dp` before it knows how
 * many of them are valid, and writes every escape as a fixed 8-byte word, so
 * it deliberately writes past the logical end of its output. quote() only
 * calls it when the destination holds at least nb * MAX_ESCAPED_BYTES bytes.
 *
 * Control flow is a small state machine:
 *   simd_copy   -> bulk copy until an escapable byte is seen
 *   scalar_copy -> same for the last < 16 bytes (8, then 4, then 1 at a time)
 *   escape      -> emit escape(s), then jump back to simd_copy
 */
static inline ssize_t memcchr_quote_unsafe(const char *sp, ssize_t nb, char *dp, const quoted_t * tab) {
    uint32_t     mm;
    const char * ds = dp;
    size_t       cn = 0;

simd_copy:

    /* too short for a vector: go straight to the scalar tail */
    if (nb < 16) goto scalar_copy;

#if USE_AVX2
    /* 32-byte loop, full store */
    while (nb >= 32) {
        __m256i vv = _mm256_loadu_si256  ((const void *)sp);
        __m256i rv = _mm256_find_quote   (vv);
        _mm256_storeu_si256              ((void *)dp, vv);

        /* check for matches */
        if ((mm = _mm256_movemask_epi8(rv)) != 0) {
            /* keep only the bytes before the first match */
            cn = __builtin_ctz(mm);
            sp += cn;
            nb -= cn;
            dp += cn;
            goto escape;
        }

        /* move to next block */
        sp += 32;
        dp += 32;
        nb -= 32;
    }

    /* clear upper half to avoid AVX-SSE transition penalty */
    _mm256_zeroupper();
#endif

    /* 16-byte loop, full store */
    while (nb >= 16) {
        __m128i vv = _mm_loadu_si128  ((const void *)sp);
        __m128i rv = _mm_find_quote   (vv);
        _mm_storeu_si128              ((void *)dp, vv);

        /* check for matches */
        if ((mm = _mm_movemask_epi8(rv)) != 0) {
            /* keep only the bytes before the first match */
            cn = __builtin_ctz(mm);
            sp += cn;
            nb -= cn;
            dp += cn;
            goto escape;
        }

        /* move to next block */
        sp += 16;
        dp += 16;
        nb -= 16;
    }

    /* The remaining bytes are handled by the unrolled 8/4/1-byte steps below
     * rather than by a plain byte-at-a-time loop. */

scalar_copy:
    if (nb >= 8) {
        /* copy 8 bytes at once, then test them in two groups of four */
        uint8_t mask1 = escape_mask4(sp);
        *(uint64_t *)dp = *(const uint64_t *)sp;
        if (unlikely(mask1)) {
            cn = __builtin_ctz(mask1);
            sp += cn;
            nb -= cn;
            dp += cn;
            goto escape;
        }
        uint8_t mask2 = escape_mask4(sp + 4);
        if (unlikely(mask2)) {
            /* match is in the upper half: account for the first 4 bytes too */
            cn = __builtin_ctz(mask2);
            sp += cn + 4;
            nb -= cn + 4;
            dp += cn + 4;
            goto escape;
        }
        dp += 8, sp += 8, nb -= 8;
    }

    if (nb >= 4) {
        uint8_t mask2 = escape_mask4(sp);
        *(uint32_t *)dp = *(const uint32_t *)sp;
        if (unlikely(mask2)) {
            cn = __builtin_ctz(mask2);
            sp += cn;
            nb -= cn;
            dp += cn;
            goto escape;
        }
        dp += 4, sp += 4, nb -= 4;
    }

    while (nb > 0) {
        if (unlikely(_EscTab[*(uint8_t *)(sp)])) goto escape;
        *dp++ = *sp++, nb--;
    }
    /* all quote done */
    return dp - ds;
escape:
    /* get the escape entry, handle consecutive quotes */
    do {
        uint8_t ch = *(uint8_t *)sp;
        int     nc = tab[ch].n;
        /* Copy the quoted value.
         * Note: dp always has at least 8 bytes (MAX_ESCAPED_BYTES) of room
         * here, so the whole 8-byte entry is stored with one word write
         * instead of memcpy_p8(dp, tab[ch].s, nc); only `nc` bytes of it
         * are kept.
         */
        *(uint64_t *)dp = *(const uint64_t *)tab[ch].s;
        sp++;
        nb--;
        dp += nc;
        if (nb <= 0) break;
        /* next byte is plain: resume bulk copying */
        if (_EscTab[*(uint8_t *)(sp)] == 0) {
            goto simd_copy;
        }
    } while (true);
    return dp - ds;
}
378
379ssize_t quote(const char *sp, ssize_t nb, char *dp, ssize_t *dn, uint64_t flags) {
380 ssize_t nd = *dn;
381 const char * ds = dp;
382 const char * ss = sp;
383 const quoted_t * tab;
384
385 /* select quoting table */
386 if (!(flags & F_DBLUNQ)) {
387 tab = _SingleQuoteTab;
388 } else {
389 tab = _DoubleQuoteTab;
390 }
391
392 if (*dn >= nb * MAX_ESCAPED_BYTES) {
393 *dn = memcchr_quote_unsafe(sp, nb, dp, tab);
394 return nb;
395 }
396
397 /* find the special characters, copy on the fly */
398 while (nb != 0) {
399 int nc;
400 uint8_t ch;
401 ssize_t rb = memcchr_quote(sp, nb, dp, nd);
402
403 /* not enough buffer space */
404 if (rb < 0) {
405 *dn = dp - ds - rb - 1;
406 return -(sp - ss - rb - 1) - 1;
407 }
408
409 /* skip already copied bytes */
410 sp += rb;
411 dp += rb;
412 nb -= rb;
413 nd -= rb;
414
415 /* get the escape entry, handle consecutive quotes */
416 while (nb != 0) {
417 ch = *(uint8_t *)sp;
418 nc = tab[ch].n;
419
420 /* check for escape character */
421 if (nc == 0) {
422 break;
423 }
424
425 /* check for buffer space */
426 if (nc > nd) {
427 *dn = dp - ds;
428 return -(sp - ss) - 1;
429 }
430
431 /* copy the quoted value */
432 memcpy_p8(dp, tab[ch].s, nc);
433 sp++;
434 nb--;
435 dp += nc;
436 nd -= nc;
437 }
438 }
439
440 /* all done */
441 *dn = dp - ds;
442 return sp - ss;
443}
444
445/** String Unquoting **/
446
/*
 * Maps the byte that follows a backslash to the byte it decodes to.
 *   0  : not a valid escape designator (all unlisted bytes)
 *   -1 : 'u', i.e. a \uXXXX escape that needs further decoding
 * unquote() and unescape() compare the looked-up value against -1 through a
 * plain `char`.
 * NOTE(review): that comparison relies on `char` being signed — confirm for
 * any target where it is not.
 */
static const char _UnquoteTab[256] = {
    ['/' ] = '/',
    ['"' ] = '"',
    ['b' ] = '\b',
    ['f' ] = '\f',
    ['n' ] = '\n',
    ['r' ] = '\r',
    ['t' ] = '\t',
    ['u' ] = -1,
    ['\\'] = '\\',
};
458
459static inline ssize_t memcchr_p32(const char *s, ssize_t nb, char *p) {
460 int64_t r;
461 ssize_t n = nb;
462 const char * q = s;
463
464#if USE_AVX2
465 __m256i u;
466 __m256i v;
467 __m256i b = _mm256_set1_epi8('\\');
468
469 /* process every 32 bytes */
470 while (n >= 32) {
471 u = _mm256_loadu_si256 ((const void *)s);
472 v = _mm256_cmpeq_epi8 (u, b);
473 _mm256_storeu_si256 ((void *)p, u);
474
475 /* check for matches */
476 if ((r = _mm256_movemask_epi8(v)) != 0) {
477 return s - q + __builtin_ctzll(r);
478 }
479
480 /* move to the next 32 bytes */
481 s += 32;
482 p += 32;
483 n -= 32;
484 }
485
486 /* clear upper half to avoid AVX-SSE transition penalty */
487 _mm256_zeroupper();
488#endif
489
490 /* initialze with '\\' */
491 __m128i x;
492 __m128i y;
493 __m128i a = _mm_set1_epi8('\\');
494
495 /* process every 16 bytes */
496 while (n >= 16) {
497 x = _mm_loadu_si128 ((const void *)s);
498 y = _mm_cmpeq_epi8 (x, a);
499 _mm_storeu_si128 ((void *)p, x);
500
501 /* check for matches */
502 if ((r = _mm_movemask_epi8(y)) != 0) {
503 return s - q + __builtin_ctzll(r);
504 }
505
506 /* move to the next 16 bytes */
507 s += 16;
508 p += 16;
509 n -= 16;
510 }
511
512 /* remaining bytes, do with scalar code */
513 while (n--) {
514 if (*s != '\\') {
515 *p++ = *s++;
516 } else {
517 return s - q;
518 }
519 }
520
521 /* nothing found, but everything was copied */
522 return -1;
523}
524
/* Word-wide constants with every byte equal to 0x01 / 0x7f / 0x80. */
#define ALL_01h     (~0ul / 255)
#define ALL_7fh     (ALL_01h * 127)
#define ALL_80h     (ALL_01h * 128)

/*
 * SWAR byte tests on the four bytes packed in `x`. Each returns non-zero when
 * at least one byte satisfies the predicate and zero when none does.
 */

/* some byte of `x` is < n */
static inline uint32_t hasless(uint32_t x, uint8_t n) {
    const uint32_t      inverted = ~x;
    const unsigned long diff     = x - ALL_01h * n;

    return (diff & inverted) & ALL_80h;
}

/* some byte of `x` is > n */
static inline uint32_t hasmore(uint32_t x, uint8_t n) {
    const unsigned long sum = x + ALL_01h * (127 - n);

    return (sum | x) & ALL_80h;
}

/* some byte of `x` is strictly between m and n */
static inline uint32_t hasbetween(uint32_t x, uint8_t m, uint8_t n) {
    const uint32_t      inverted = ~x;
    const unsigned long low7     = x & ALL_7fh;
    const unsigned long below_n  = ALL_01h * (127 + n) - low7;
    const unsigned long above_m  = low7 + ALL_01h * (127 - m);

    return ((below_n & inverted) & above_m) & ALL_80h;
}

#undef ALL_01h
#undef ALL_7fh
#undef ALL_80h
544
/* Return 1 when `c` is an ASCII hexadecimal digit (0-9, a-f, A-F), else 0. */
static inline char ishex(char c) {
    if (c >= '0' && c <= '9') {
        return 1;
    }
    if (c >= 'a' && c <= 'f') {
        return 1;
    }
    return c >= 'A' && c <= 'F';
}
548
/*
 * Append the UTF-8 encoding of U+FFFD (EF BF BD) at `*dp` and advance `*dp`
 * past the three bytes written.
 */
static inline void unirep(char **dp) {
    char *out = *dp;

    out[0] = 0xef;
    out[1] = 0xbf;
    out[2] = 0xbd;

    *dp = out + 3;
}
554
/*
 * Return 1 when the four bytes at `s` are all hexadecimal digits, else 0.
 * All four bytes are loaded as one 32-bit word and range-checked together.
 */
static inline char unhex16_is(const char *s) {
    const uint32_t quad = *(uint32_t *)s;

    /* below '0' or above 'f' */
    if (hasless(quad, '0') || hasmore(quad, 'f')) {
        return 0;
    }

    /* in the gap between '9' and 'A' */
    if (hasbetween(quad, '9', 'A')) {
        return 0;
    }

    /* in the gap between 'F' and 'a' */
    return !hasbetween(quad, 'F', 'a');
}
559
/*
 * Convert the four hexadecimal digits at `s` into their 16-bit value.
 * The digits are not validated here; callers check them with unhex16_is()
 * first.
 */
static inline uint32_t unhex16_fast(const char *s) {
    /* byte-swap so that s[0] lands in the most significant byte */
    const uint32_t digits = __builtin_bswap32(*(uint32_t *)s);

    /* bit 4 is set for '0'-'9' and clear for 'a'-'f' / 'A'-'F' */
    const uint32_t is_letter = (~digits & 0x10101010) >> 4;

    /* per byte: low nibble, plus 9 for letters => value 0..15 */
    const uint32_t nibbles = 9 * is_letter + (digits & 0x0f0f0f0f);

    /* merge neighbouring nibbles, then pick the two resulting bytes */
    const uint32_t merged = (nibbles >> 4) | nibbles;

    return ((merged >> 8) & 0xff00) | (merged & 0x00ff);
}
567
/*
 * Decode the JSON escape sequences in the `nb` bytes at `sp`, writing the
 * unquoted bytes to `dp`.
 *
 *   ep    : on error, receives a byte offset into the source related to the
 *           failure (the total length `nb` for ERR_EOF).
 *   flags : F_DBLUNQ - the input is escaped twice (every escape is introduced
 *                      by two backslashes);
 *           F_UNIREP - invalid surrogates are replaced by U+FFFD instead of
 *                      failing with ERR_UNICODE.
 *
 * Returns the number of bytes written to `dp`, or a negated ERR_* code.
 *
 * NOTE(review): nothing here bounds writes to `dp`; presumably the caller
 * provides at least `nb` bytes, plus the slack needed by the vector stores
 * in memcchr_p32 — confirm against the callers.
 */
ssize_t unquote(const char *sp, ssize_t nb, char *dp, ssize_t *ep, uint64_t flags) {
    ssize_t      n;
    ssize_t      x = nb;
    const char * s = sp;
    const char * p = dp;

    /* scan & copy all the non-escape characters; memcchr_p32 returns -1 once
     * the rest of the input contains no backslash */
    while (nb && (n = (*sp == '\\' ? 0 : memcchr_p32(sp, nb, dp))) != -1) {
        char     cc;
        uint32_t r0;
        uint32_t r1;

        /* skip the plain text, the backslash and the escape designator;
         * from here on sp[-1] is the designator */
        dp += n;
        sp += n + 2;
        nb -= n + 2;

        /* check for EOF */
        if (nb < 0) {
            *ep = x;
            return -ERR_EOF;
        }

        /* check for double unquote */
        if (unlikely(flags & F_DBLUNQ)) {
            int  nr = nb;
            char c1 = sp[-1];

            /* must have at least 1 character left */
            if (nr == 0) {
                *ep = x;
                return -ERR_EOF;
            }

            /* every quote must be a double quote */
            if (c1 != '\\') {
                *ep = sp - s - 1;
                return -ERR_INVAL;
            }

            /* special case of '\\\\' and '\\\"' */
            if (*sp == '\\') {
                if (nr < 2) {
                    *ep = x;
                    return -ERR_EOF;
                } else if (sp[1] != '"' && sp[1] != '\\') {
                    *ep = sp - s + 1;
                    return -ERR_INVAL;
                } else {
                    sp++;
                    nb--;
                }
            }

            /* skip the second escape */
            sp++;
            nb--;
        }

        /* check for escape sequence; 0 means "not a valid designator" */
        if ((cc = _UnquoteTab[(uint8_t)sp[-1]]) == 0) {
            *ep = sp - s - 1;
            return -ERR_ESCAPE;
        }

        /* check for simple escape sequence; -1 marks 'u' */
        if (cc != -1) {
            *dp++ = cc;
            continue;
        }

        /* must have at least 4 characters */
        if (nb < 4) {
            *ep = x;
            return -ERR_EOF;
        }

        /* check for hexadecimal characters; on failure advance *ep to the
         * first digit that is not hexadecimal */
        if (!unhex16_is(sp)) {
            *ep = sp - s;
            for (int i = 0; i < 4 && ishex(*sp); i++, sp++) ++*ep;
            return -ERR_INVAL;
        }

        /* decode the code-point */
        r0 = unhex16_fast(sp);
        sp += 4;
        nb -= 4;

        /* re-entered from the "it must be the other half" branch below, with
         * r0 replaced by the second code unit that was just decoded */
    retry_decode:

        /* ASCII characters, unlikely */
        if (unlikely(r0 <= 0x7f)) {
            *dp++ = (char)r0;
            continue;
        }

        /* 2-byte characters (up to U+07FF), unlikely */
        if (unlikely(r0 <= 0x07ff)) {
            *dp++ = (char)(0xc0 | (r0 >> 6));
            *dp++ = (char)(0x80 | (r0 & 0x3f));
            continue;
        }

        /* 3-byte characters, likely */
        if (likely(r0 < 0xd800 || r0 > 0xdfff)) {
            *dp++ = (char)(0xe0 | ((r0 >> 12)       ));
            *dp++ = (char)(0x80 | ((r0 >>  6) & 0x3f));
            *dp++ = (char)(0x80 | ((r0      ) & 0x3f));
            continue;
        }

        /* check for double unquote: the second half of a surrogate pair is
         * then introduced by one extra backslash, which is consumed here */
        if (unlikely(flags & F_DBLUNQ)) {
            if (nb < 1) {
                if (likely(flags & F_UNIREP)) {
                    unirep(&dp);
                    continue;
                } else {
                    *ep = x;
                    return -ERR_EOF;
                }
            } else {
                if (sp[0] == '\\') {
                    nb--;
                    sp++;
                } else if (likely(flags & F_UNIREP)) {
                    unirep(&dp);
                    continue;
                } else {
                    *ep = sp - s - 4;
                    return -ERR_UNICODE;
                }
            }
        }

        /* surrogate half, must be followed by the other half.
         * NOTE(review): with F_DBLUNQ | F_UNIREP the backslash consumed just
         * above is not restored when this branch substitutes U+FFFD and
         * continues — confirm that is intended. */
        if (nb < 6 || r0 > 0xdbff || sp[0] != '\\' || sp[1] != 'u') {
            if (likely(flags & F_UNIREP)) {
                unirep(&dp);
                continue;
            } else {
                *ep = sp - s - ((flags & F_DBLUNQ) ? 5 : 4);
                return -ERR_UNICODE;
            }
        }

        /* check the hexadecimal escape */
        if (!unhex16_is(sp + 2)) {
            *ep = sp - s + 2;
            for (int i = 2; i < 6 && ishex(sp[i]); i++) ++*ep;
            return -ERR_INVAL;
        }

        /* decode the second code-point */
        r1 = unhex16_fast(sp + 2);
        sp += 6;
        nb -= 6;

        /* it must be the other half */
        if (r1 < 0xdc00 || r1 > 0xdfff) {
            if (unlikely(!(flags & F_UNIREP))) {
                *ep = sp - s - 4;
                return -ERR_UNICODE;
            } else {
                /* replace the lone high surrogate, then decode the second
                 * code unit on its own */
                r0 = r1;
                unirep(&dp);
                goto retry_decode;
            }
        }

        /* merge two surrogates */
        r0 = (r0 - 0xd800) << 10;
        r1 = (r1 - 0xdc00) + 0x010000;
        r0 += r1;

        /* check the code point range.
         * NOTE(review): with r0 in [0xd800, 0xdbff] and r1 in
         * [0xdc00, 0xdfff] the merged value cannot exceed 0x10ffff, so this
         * branch looks unreachable — confirm. */
        if (r0 > 0x10ffff) {
            if (likely(!(flags & F_UNIREP))) {
                *ep = sp - s - 4;
                return -ERR_UNICODE;
            } else {
                unirep(&dp);
                continue;
            }
        }

        /* encode the character */
        *dp++ = (char)(0xf0 | ((r0 >> 18)       ));
        *dp++ = (char)(0x80 | ((r0 >> 12) & 0x3f));
        *dp++ = (char)(0x80 | ((r0 >>  6) & 0x3f));
        *dp++ = (char)(0x80 | ((r0      ) & 0x3f));
    }

    /* calculate the result length: when the loop ended because no further
     * backslash was found, the last `nb` plain bytes were copied by
     * memcchr_p32 without `dp` being advanced */
    return dp + nb - p;
}
766
767static inline __m128i _mm_find_html(__m128i vv) {
768 __m128i e1 = _mm_cmpeq_epi8 (vv, _mm_set1_epi8('<'));
769 __m128i e2 = _mm_cmpeq_epi8 (vv, _mm_set1_epi8('>'));
770 __m128i e3 = _mm_cmpeq_epi8 (vv, _mm_set1_epi8('&'));
771 __m128i e4 = _mm_cmpeq_epi8 (vv, _mm_set1_epi8('\xe2'));
772 __m128i r1 = _mm_or_si128 (e1, e2);
773 __m128i r2 = _mm_or_si128 (e3, e4);
774 __m128i rv = _mm_or_si128 (r1, r2);
775 return rv;
776}
777
#if USE_AVX2
/*
 * 32-byte variant of _mm_find_html: per-byte mask (0xff / 0x00) of '<', '>',
 * '&' and the UTF-8 lead byte 0xe2 in `vv`.
 */
static inline __m256i _mm256_find_html(__m256i vv) {
    const __m256i is_lt   = _mm256_cmpeq_epi8(vv, _mm256_set1_epi8('<'));
    const __m256i is_gt   = _mm256_cmpeq_epi8(vv, _mm256_set1_epi8('>'));
    const __m256i is_amp  = _mm256_cmpeq_epi8(vv, _mm256_set1_epi8('&'));
    const __m256i is_lead = _mm256_cmpeq_epi8(vv, _mm256_set1_epi8('\xe2'));

    const __m256i brackets = _mm256_or_si256(is_lt, is_gt);
    const __m256i others   = _mm256_or_si256(is_amp, is_lead);

    return _mm256_or_si256(brackets, others);
}
#endif
790
791static inline ssize_t memcchr_html_quote(const char *sp, ssize_t nb, char *dp, ssize_t dn) {
792 uint32_t mm;
793 const char * ss = sp;
794
795#if USE_AVX2
796 /* 32-byte loop, full store */
797 while (nb >= 32 && dn >= 32) {
798 __m256i vv = _mm256_loadu_si256 ((const void *)sp);
799 __m256i rv = _mm256_find_html (vv);
800 _mm256_storeu_si256 ((void *)dp, vv);
801
802 /* check for matches */
803 if ((mm = _mm256_movemask_epi8(rv)) != 0) {
804 return sp - ss + __builtin_ctz(mm);
805 }
806
807 /* move to next block */
808 sp += 32;
809 dp += 32;
810 nb -= 32;
811 dn -= 32;
812 }
813
814 /* 32-byte test, partial store */
815 if (nb >= 32) {
816 __m256i vv = _mm256_loadu_si256 ((const void *)sp);
817 __m256i rv = _mm256_find_html (vv);
818 uint32_t mv = _mm256_movemask_epi8 (rv);
819 uint32_t fv = __builtin_ctzll ((uint64_t)mv | 0x0100000000);
820
821 /* copy at most `dn` characters */
822 if (fv <= dn) {
823 memcpy_p32(dp, sp, fv);
824 return sp - ss + fv;
825 } else {
826 memcpy_p32(dp, sp, dn);
827 return -(sp - ss + dn) - 1;
828 }
829 }
830
831 /* clear upper half to avoid AVX-SSE transition penalty */
832 _mm256_zeroupper();
833#endif
834
835 /* 16-byte loop, full store */
836 while (nb >= 16 && dn >= 16) {
837 __m128i vv = _mm_loadu_si128 ((const void *)sp);
838 __m128i rv = _mm_find_html (vv);
839 _mm_storeu_si128 ((void *)dp, vv);
840
841 /* check for matches */
842 if ((mm = _mm_movemask_epi8(rv)) != 0) {
843 return sp - ss + __builtin_ctz(mm);
844 }
845
846 /* move to next block */
847 sp += 16;
848 dp += 16;
849 nb -= 16;
850 dn -= 16;
851 }
852
853 /* 16-byte test, partial store */
854 if (nb >= 16) {
855 __m128i vv = _mm_loadu_si128 ((const void *)sp);
856 __m128i rv = _mm_find_html (vv);
857 uint32_t mv = _mm_movemask_epi8 (rv);
858 uint32_t fv = __builtin_ctz (mv | 0x010000);
859
860 /* copy at most `dn` characters */
861 if (fv <= dn) {
862 memcpy_p16(dp, sp, fv);
863 return sp - ss + fv;
864 } else {
865 memcpy_p16(dp, sp, dn);
866 return -(sp - ss + dn) - 1;
867 }
868 }
869
870 /* handle the remaining bytes with scalar code */
871 while (nb > 0 && dn > 0) {
872 if (*sp == '<' || *sp == '>' || *sp == '&' || *sp == '\xe2') {
873 return sp - ss;
874 } else {
875 dn--, nb--;
876 *dp++ = *sp++;
877 }
878 }
879
880 /* check for dest buffer */
881 if (nb == 0) {
882 return sp - ss;
883 } else {
884 return -(sp - ss) - 1;
885 }
886}
887
888ssize_t html_escape(const char *sp, ssize_t nb, char *dp, ssize_t *dn) {
889 ssize_t nd = *dn;
890 const char * ds = dp;
891 const char * ss = sp;
892 const quoted_t * tab = _HtmlQuoteTab;
893
894 /* find the special characters, copy on the fly */
895 while (nb > 0) {
896 int nc = 0;
897 uint8_t ch = 0;
898 ssize_t rb = 0;
899 const char * cur = 0;
900
901 /* not enough buffer space */
902 if (nd <= 0) {
903 return -(sp - ss) - 1;
904 }
905
906 /* find and copy */
907 if ((rb = memcchr_html_quote(sp, nb, dp, nd)) < 0) {
908 *dn = dp - ds - rb - 1;
909 return -(sp - ss - rb - 1) - 1;
910 }
911
912 /* skip already copied bytes */
913 sp += rb;
914 dp += rb;
915 nb -= rb;
916 nd -= rb;
917
918 /* stop if already finished */
919 if (nb <= 0) {
920 break;
921 }
922
923 /* mark cur postion */
924 cur = sp;
925
926 /* check for \u2028 and \u2029, binary is \xe2\x80\xa8 and \xe2\x80\xa9 */
927 if (unlikely(*sp == '\xe2')) {
928 if (nb >= 3 && *(sp+1) == '\x80' && (*(sp+2) == '\xa8' || *(sp+2) == '\xa9')) {
929 sp += 2, nb -= 2;
930 } else if (nd > 0) {
931 *dp++ = *sp++;
932 nb--, nd--;
933 continue;
934 } else {
935 return -(sp - ss) - 1;
936 }
937 }
938
939 /* get the escape entry, handle consecutive quotes */
940 ch = * (uint8_t*) sp;
941 nc = tab[ch].n;
942
943
944 /* check for buffer space */
945 if (nd < nc) {
946 *dn = dp - ds;
947 return -(cur - ss) - 1;
948 }
949
950 /* copy the quoted value */
951 memcpy_p8(dp, tab[ch].s, nc);
952 sp++;
953 nb--;
954 dp += nc;
955 nd -= nc;
956 }
957
958 /* all done */
959 *dn = dp - ds;
960 return sp - ss;
961}
962
963#undef MAX_ESCAPED_BYTES
964
965static inline long unescape(const char** src, const char* end, char* dp) {
966 const char* sp = *src;
967 long nb = end - sp;
968 char cc = 0;
969 uint32_t r0, r1;
970
971 if (nb <= 0) return -ERR_EOF;
972
973 if ((cc = _UnquoteTab[(uint8_t)sp[1]]) == 0) {
974 *src += 1;
975 return -ERR_ESCAPE;
976 }
977
978 if (cc != -1) {
979 *dp = cc;
980 *src += 2;
981 return 1;
982 }
983
984 if (nb < 4) {
985 *src += 1;
986 return -ERR_EOF;
987 }
988
989 /* check for hexadecimal characters */
990 if (!unhex16_is(sp + 2)) {
991 *src += 2;
992 return -ERR_INVAL;
993 }
994
995 /* decode the code-point */
996 r0 = unhex16_fast(sp + 2);
997 sp += 6;
998 *src = sp;
999
1000 /* ASCII characters, unlikely */
1001 if (unlikely(r0 <= 0x7f)) {
1002 *dp++ = (char)r0;
1003 return 1;
1004 }
1005
1006 /* latin-1 characters, unlikely */
1007 if (unlikely(r0 <= 0x07ff)) {
1008 *dp++ = (char)(0xc0 | (r0 >> 6));
1009 *dp++ = (char)(0x80 | (r0 & 0x3f));
1010 return 2;
1011 }
1012
1013 /* 3-byte characters, likely */
1014 if (likely(r0 < 0xd800 || r0 > 0xdfff)) {
1015 *dp++ = (char)(0xe0 | ((r0 >> 12) ));
1016 *dp++ = (char)(0x80 | ((r0 >> 6) & 0x3f));
1017 *dp++ = (char)(0x80 | ((r0 ) & 0x3f));
1018 return 3;
1019 }
1020
1021 /* surrogate half, must follows by the other half */
1022 if (nb < 6 || r0 > 0xdbff || sp[0] != '\\' || sp[1] != 'u') {
1023 return -ERR_UNICODE;
1024 }
1025
1026 /* check the hexadecimal escape */
1027 if (!unhex16_is(sp + 2)) {
1028 *src += 2;
1029 return -ERR_INVAL;
1030 }
1031
1032 /* decode the second code-point */
1033 r1 = unhex16_fast(sp + 2);
1034
1035 /* it must be the other half */
1036 if (r1 < 0xdc00 || r1 > 0xdfff) {
1037 *src += 2;
1038 return -ERR_UNICODE;
1039 }
1040
1041 /* merge two surrogates */
1042 r0 = (r0 - 0xd800) << 10;
1043 r1 = (r1 - 0xdc00) + 0x010000;
1044 r0 += r1;
1045
1046 /* encode the character */
1047 *dp++ = (char)(0xf0 | ((r0 >> 18) ));
1048 *dp++ = (char)(0x80 | ((r0 >> 12) & 0x3f));
1049 *dp++ = (char)(0x80 | ((r0 >> 6) & 0x3f));
1050 *dp++ = (char)(0x80 | ((r0 ) & 0x3f));
1051 *src = sp + 6;
1052 return 4;
1053}
View as plain text