...

Text file src/github.com/bytedance/sonic/native/parsing.c

Documentation: github.com/bytedance/sonic/native

     1/*
     2 * Copyright 2021 ByteDance Inc.
     3 *
     4 * Licensed under the Apache License, Version 2.0 (the "License");
     5 * you may not use this file except in compliance with the License.
     6 * You may obtain a copy of the License at
     7 *
     8 *     http://www.apache.org/licenses/LICENSE-2.0
     9 *
    10 * Unless required by applicable law or agreed to in writing, software
    11 * distributed under the License is distributed on an "AS IS" BASIS,
    12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13 * See the License for the specific language governing permissions and
    14 * limitations under the License.
    15 */
    16
    17#include "native.h"
    18#include "utils.h"
    19#include <stdint.h>
    20
    21/** String Quoting **/
/* Size, in bytes, of the replacement buffer stored in every quoting-table entry. */
#define MAX_ESCAPED_BYTES 8

/*
 * One entry of a 256-slot quoting table, indexed by the value of the input byte.
 */
typedef struct {
    /* length of the replacement in bytes; 0 means the byte has no replacement */
    const long n;
    /* replacement bytes, only the first `n` are meaningful */
    const char s[MAX_ESCAPED_BYTES];
} quoted_t;
    27
    28static const quoted_t _SingleQuoteTab[256] = {
    29    ['\x00'] = { .n = 6, .s = "\\u0000" },
    30    ['\x01'] = { .n = 6, .s = "\\u0001" },
    31    ['\x02'] = { .n = 6, .s = "\\u0002" },
    32    ['\x03'] = { .n = 6, .s = "\\u0003" },
    33    ['\x04'] = { .n = 6, .s = "\\u0004" },
    34    ['\x05'] = { .n = 6, .s = "\\u0005" },
    35    ['\x06'] = { .n = 6, .s = "\\u0006" },
    36    ['\x07'] = { .n = 6, .s = "\\u0007" },
    37    ['\b'  ] = { .n = 6, .s = "\\u0008" },
    38    ['\t'  ] = { .n = 2, .s = "\\t"     },
    39    ['\n'  ] = { .n = 2, .s = "\\n"     },
    40    ['\x0b'] = { .n = 6, .s = "\\u000b" },
    41    ['\f'  ] = { .n = 6, .s = "\\u000c" },
    42    ['\r'  ] = { .n = 2, .s = "\\r"     },
    43    ['\x0e'] = { .n = 6, .s = "\\u000e" },
    44    ['\x0f'] = { .n = 6, .s = "\\u000f" },
    45    ['\x10'] = { .n = 6, .s = "\\u0010" },
    46    ['\x11'] = { .n = 6, .s = "\\u0011" },
    47    ['\x12'] = { .n = 6, .s = "\\u0012" },
    48    ['\x13'] = { .n = 6, .s = "\\u0013" },
    49    ['\x14'] = { .n = 6, .s = "\\u0014" },
    50    ['\x15'] = { .n = 6, .s = "\\u0015" },
    51    ['\x16'] = { .n = 6, .s = "\\u0016" },
    52    ['\x17'] = { .n = 6, .s = "\\u0017" },
    53    ['\x18'] = { .n = 6, .s = "\\u0018" },
    54    ['\x19'] = { .n = 6, .s = "\\u0019" },
    55    ['\x1a'] = { .n = 6, .s = "\\u001a" },
    56    ['\x1b'] = { .n = 6, .s = "\\u001b" },
    57    ['\x1c'] = { .n = 6, .s = "\\u001c" },
    58    ['\x1d'] = { .n = 6, .s = "\\u001d" },
    59    ['\x1e'] = { .n = 6, .s = "\\u001e" },
    60    ['\x1f'] = { .n = 6, .s = "\\u001f" },
    61    ['"'   ] = { .n = 2, .s = "\\\""    },
    62    ['\\'  ] = { .n = 2, .s = "\\\\"    },
    63};
    64
    65static const quoted_t _DoubleQuoteTab[256] = {
    66    ['\x00'] = { .n = 7, .s = "\\\\u0000" },
    67    ['\x01'] = { .n = 7, .s = "\\\\u0001" },
    68    ['\x02'] = { .n = 7, .s = "\\\\u0002" },
    69    ['\x03'] = { .n = 7, .s = "\\\\u0003" },
    70    ['\x04'] = { .n = 7, .s = "\\\\u0004" },
    71    ['\x05'] = { .n = 7, .s = "\\\\u0005" },
    72    ['\x06'] = { .n = 7, .s = "\\\\u0006" },
    73    ['\x07'] = { .n = 7, .s = "\\\\u0007" },
    74    ['\b'  ] = { .n = 7, .s = "\\\\u0008" },
    75    ['\t'  ] = { .n = 3, .s = "\\\\t"     },
    76    ['\n'  ] = { .n = 3, .s = "\\\\n"     },
    77    ['\x0b'] = { .n = 7, .s = "\\\\u000b" },
    78    ['\f'  ] = { .n = 7, .s = "\\\\u000c" },
    79    ['\r'  ] = { .n = 3, .s = "\\\\r"     },
    80    ['\x0e'] = { .n = 7, .s = "\\\\u000e" },
    81    ['\x0f'] = { .n = 7, .s = "\\\\u000f" },
    82    ['\x10'] = { .n = 7, .s = "\\\\u0010" },
    83    ['\x11'] = { .n = 7, .s = "\\\\u0011" },
    84    ['\x12'] = { .n = 7, .s = "\\\\u0012" },
    85    ['\x13'] = { .n = 7, .s = "\\\\u0013" },
    86    ['\x14'] = { .n = 7, .s = "\\\\u0014" },
    87    ['\x15'] = { .n = 7, .s = "\\\\u0015" },
    88    ['\x16'] = { .n = 7, .s = "\\\\u0016" },
    89    ['\x17'] = { .n = 7, .s = "\\\\u0017" },
    90    ['\x18'] = { .n = 7, .s = "\\\\u0018" },
    91    ['\x19'] = { .n = 7, .s = "\\\\u0019" },
    92    ['\x1a'] = { .n = 7, .s = "\\\\u001a" },
    93    ['\x1b'] = { .n = 7, .s = "\\\\u001b" },
    94    ['\x1c'] = { .n = 7, .s = "\\\\u001c" },
    95    ['\x1d'] = { .n = 7, .s = "\\\\u001d" },
    96    ['\x1e'] = { .n = 7, .s = "\\\\u001e" },
    97    ['\x1f'] = { .n = 7, .s = "\\\\u001f" },
    98    ['"'   ] = { .n = 4, .s = "\\\\\\\""  },
    99    ['\\'  ] = { .n = 4, .s = "\\\\\\\\"  },
   100};
   101
   102static const quoted_t _HtmlQuoteTab[256] = {
   103    ['<'] = { .n = 6, .s = "\\u003c" },
   104    ['>'] = { .n = 6, .s = "\\u003e" },
   105    ['&'] = { .n = 6, .s = "\\u0026" },
   106    // \u2028 and \u2029 is [E2 80 A8] and [E2 80 A9]
   107    [0xe2] = { .n = 0, .s = {0} },
   108    [0xa8] = { .n = 6, .s = "\\u2028" },
   109    [0xa9] = { .n = 6, .s = "\\u2029" },
   110};
   111
   112static inline __m128i _mm_find_quote(__m128i vv) {
   113    __m128i e1 = _mm_cmpgt_epi8   (vv, _mm_set1_epi8(-1));
   114    __m128i e2 = _mm_cmpgt_epi8   (vv, _mm_set1_epi8(31));
   115    __m128i e3 = _mm_cmpeq_epi8   (vv, _mm_set1_epi8('"'));
   116    __m128i e4 = _mm_cmpeq_epi8   (vv, _mm_set1_epi8('\\'));
   117    __m128i r1 = _mm_andnot_si128 (e2, e1);
   118    __m128i r2 = _mm_or_si128     (e3, e4);
   119    __m128i rv = _mm_or_si128     (r1, r2);
   120    return rv;
   121}
   122
   123#if USE_AVX2
   124static inline __m256i _mm256_find_quote(__m256i vv) {
   125    __m256i e1 = _mm256_cmpgt_epi8   (vv, _mm256_set1_epi8(-1));
   126    __m256i e2 = _mm256_cmpgt_epi8   (vv, _mm256_set1_epi8(31));
   127    __m256i e3 = _mm256_cmpeq_epi8   (vv, _mm256_set1_epi8('"'));
   128    __m256i e4 = _mm256_cmpeq_epi8   (vv, _mm256_set1_epi8('\\'));
   129    __m256i r1 = _mm256_andnot_si256 (e2, e1);
   130    __m256i r2 = _mm256_or_si256     (e3, e4);
   131    __m256i rv = _mm256_or_si256     (r1, r2);
   132    return rv;
   133}
   134#endif
   135
   136static inline ssize_t memcchr_quote(const char *sp, ssize_t nb, char *dp, ssize_t dn) {
   137    uint32_t     mm;
   138    const char * ss = sp;
   139
   140#if USE_AVX2
   141    /* 32-byte loop, full store */
   142    while (nb >= 32 && dn >= 32) {
   143        __m256i vv = _mm256_loadu_si256  ((const void *)sp);
   144        __m256i rv = _mm256_find_quote   (vv);
   145                     _mm256_storeu_si256 ((void *)dp, vv);
   146
   147        /* check for matches */
   148        if ((mm = _mm256_movemask_epi8(rv)) != 0) {
   149            return sp - ss + __builtin_ctz(mm);
   150        }
   151
   152        /* move to next block */
   153        sp += 32;
   154        dp += 32;
   155        nb -= 32;
   156        dn -= 32;
   157    }
   158
   159    /* 32-byte test, partial store */
   160    if (nb >= 32) {
   161        __m256i  vv = _mm256_loadu_si256   ((const void *)sp);
   162        __m256i  rv = _mm256_find_quote    (vv);
   163        uint32_t mv = _mm256_movemask_epi8 (rv);
   164        uint32_t fv = __builtin_ctzll      ((uint64_t)mv | 0x0100000000);
   165
   166        /* copy at most `dn` characters */
   167        if (fv <= dn) {
   168            memcpy_p32(dp, sp, fv);
   169            return sp - ss + fv;
   170        } else {
   171            memcpy_p32(dp, sp, dn);
   172            return -(sp - ss + dn) - 1;
   173        }
   174    }
   175
   176    /* clear upper half to avoid AVX-SSE transition penalty */
   177    _mm256_zeroupper();
   178#endif
   179
   180    /* 16-byte loop, full store */
   181    while (nb >= 16 && dn >= 16) {
   182        __m128i vv = _mm_loadu_si128  ((const void *)sp);
   183        __m128i rv = _mm_find_quote   (vv);
   184                     _mm_storeu_si128 ((void *)dp, vv);
   185
   186        /* check for matches */
   187        if ((mm = _mm_movemask_epi8(rv)) != 0) {
   188            return sp - ss + __builtin_ctz(mm);
   189        }
   190
   191        /* move to next block */
   192        sp += 16;
   193        dp += 16;
   194        nb -= 16;
   195        dn -= 16;
   196    }
   197
   198    /* 16-byte test, partial store */
   199    if (nb >= 16) {
   200        __m128i  vv = _mm_loadu_si128   ((const void *)sp);
   201        __m128i  rv = _mm_find_quote    (vv);
   202        uint32_t mv = _mm_movemask_epi8 (rv);
   203        uint32_t fv = __builtin_ctz     (mv | 0x010000);
   204
   205        /* copy at most `dn` characters */
   206        if (fv <= dn) {
   207            memcpy_p16(dp, sp, fv);
   208            return sp - ss + fv;
   209        } else {
   210            memcpy_p16(dp, sp, dn);
   211            return -(sp - ss + dn) - 1;
   212        }
   213    }
   214
   215    /* handle the remaining bytes with scalar code */
   216    while (nb > 0 && dn > 0) {
   217        if (_SingleQuoteTab[*(uint8_t *)sp].n) {
   218            return sp - ss;
   219        } else {
   220            dn--, nb--;
   221            *dp++ = *sp++;
   222        }
   223    }
   224
   225    /* check for dest buffer */
   226    if (nb == 0) {
   227        return sp - ss;
   228    } else {
   229        return -(sp - ss) - 1;
   230    }
   231}
   232
   233static const bool _EscTab[256] = {
   234    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00-0x0F
   235    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10-0x1F
   236    //   '"'
   237    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20-0x2F
   238    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30-0x3F
   239    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40-0x4F
   240    //                                 '""
   241    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, // 0x50-0x5F
   242    // 0x60-0xFF are zeroes
   243};
   244
   245static inline uint8_t escape_mask4(const char *sp) {
   246    return _EscTab[*(uint8_t *)(sp)] | (_EscTab[*(uint8_t *)(sp + 1)] << 1) | (_EscTab[*(uint8_t *)(sp + 2)] << 2) | (_EscTab[*(uint8_t *)(sp + 3)]  << 3);
   247}
   248
/*
 * Quote `nb` bytes from `sp` into `dp` using the replacement table `tab`,
 * without any destination bounds checks.
 *
 * The visible caller, quote(), only takes this path when the destination holds
 * at least nb * MAX_ESCAPED_BYTES bytes.  That slack is what makes the
 * over-wide stores below (whole vectors, 8-byte and 4-byte words, 8-byte table
 * entries) safe: each store may write past the logically copied bytes, and the
 * overshoot is simply overwritten by whatever is emitted next.
 *
 * Control flow is a small state machine:
 *   simd_copy   -> bulk copy of plain bytes, jumps to `escape` on the first
 *                  byte flagged by the find-quote mask
 *   scalar_copy -> same for the last (< 16) bytes, using _EscTab
 *   escape      -> emits replacements for consecutive special bytes, then
 *                  jumps back to simd_copy
 *
 * Returns the number of bytes written to `dp`.
 */
static inline ssize_t memcchr_quote_unsafe(const char *sp, ssize_t nb, char *dp, const quoted_t * tab) {
    uint32_t     mm;
    const char * ds = dp;
    size_t cn = 0;

simd_copy:

    /* too short for a vector, go straight to the scalar tail */
    if (nb < 16) goto scalar_copy;

#if USE_AVX2
    /* 32-byte loop, full store */
    while (nb >= 32) {
        __m256i vv = _mm256_loadu_si256  ((const void *)sp);
        __m256i rv = _mm256_find_quote   (vv);
                     _mm256_storeu_si256 ((void *)dp, vv);

        /* check for matches; advance only up to the first special byte */
        if ((mm = _mm256_movemask_epi8(rv)) != 0) {
            cn = __builtin_ctz(mm);
            sp += cn;
            nb -= cn;
            dp += cn;
            goto escape;
        }

        /* move to next block */
        sp += 32;
        dp += 32;
        nb -= 32;
    }

    /* clear upper half to avoid AVX-SSE transition penalty */
    _mm256_zeroupper();
#endif

    /* 16-byte loop, full store */
    while (nb >= 16) {
        __m128i vv = _mm_loadu_si128  ((const void *)sp);
        __m128i rv = _mm_find_quote   (vv);
                     _mm_storeu_si128 ((void *)dp, vv);

        /* check for matches; advance only up to the first special byte */
        if ((mm = _mm_movemask_epi8(rv)) != 0) {
            cn =  __builtin_ctz(mm);
            sp += cn;
            nb -= cn;
            dp += cn;
            goto escape;
        }

        /* move to next block */
        sp += 16;
        dp += 16;
        nb -= 16;
    }

    /*
     * Handle the remaining (< 16) bytes with scalar code.  This is an unrolled
     * form of a byte-by-byte "test _EscTab, then copy" loop: one 8-byte step,
     * one 4-byte step, then single bytes.
     */

scalar_copy:
    if (nb >= 8) {
        /* store all 8 bytes up front, then classify them 4 at a time */
        uint8_t mask1 = escape_mask4(sp);
        *(uint64_t *)dp = *(const uint64_t *)sp;
        if (unlikely(mask1)) {
            cn =  __builtin_ctz(mask1);
            sp += cn;
            nb -= cn;
            dp += cn;
            goto escape;
        }
        uint8_t mask2 = escape_mask4(sp + 4);
        if (unlikely(mask2)) {
            /* the special byte is in the upper half, hence the extra 4 */
            cn =  __builtin_ctz(mask2);
            sp += cn + 4;
            nb -= cn + 4;
            dp += cn + 4;
            goto escape;
        }
        dp += 8, sp += 8, nb -= 8;
    }

    if (nb >= 4) {
        uint8_t mask2 = escape_mask4(sp);
        *(uint32_t *)dp = *(const uint32_t *)sp;
        if (unlikely(mask2)) {
            cn =  __builtin_ctz(mask2);
            sp += cn;
            nb -= cn;
            dp += cn;
            goto escape;
        }
        dp += 4, sp += 4, nb -= 4;
    }

    while (nb > 0) {
        if (unlikely(_EscTab[*(uint8_t *)(sp)])) goto escape;
        *dp++ = *sp++, nb--;
    }
    /* all quote done */
    return dp - ds;
escape:
     /* get the escape entry, handle consecutive quotes */
     do {
        uint8_t ch = *(uint8_t *)sp;
        int nc = tab[ch].n;
        /*
         * Copy the quoted value.  The destination always has at least
         * MAX_ESCAPED_BYTES (8) bytes available here, so the whole 8-byte
         * entry is stored with a single word write instead of
         * memcpy_p8(dp, tab[ch].s, nc); only `nc` bytes are kept.
         */
        *(uint64_t *)dp = *(const uint64_t *)tab[ch].s;
        sp++;
        nb--;
        dp += nc;
        if (nb <= 0) break;
        /* next byte is plain: resume bulk copying */
        if (_EscTab[*(uint8_t *)(sp)] == 0) {
            goto simd_copy;
        }
    } while (true);
    return dp - ds;
}
   378
   379ssize_t quote(const char *sp, ssize_t nb, char *dp, ssize_t *dn, uint64_t flags) {
   380    ssize_t          nd = *dn;
   381    const char *     ds = dp;
   382    const char *     ss = sp;
   383    const quoted_t * tab;
   384
   385    /* select quoting table */
   386    if (!(flags & F_DBLUNQ)) {
   387        tab = _SingleQuoteTab;
   388    } else {
   389        tab = _DoubleQuoteTab;
   390    }
   391
   392    if (*dn >= nb * MAX_ESCAPED_BYTES) {
   393        *dn = memcchr_quote_unsafe(sp, nb, dp, tab);
   394        return nb;
   395    }
   396
   397    /* find the special characters, copy on the fly */
   398    while (nb != 0) {
   399        int     nc;
   400        uint8_t ch;
   401        ssize_t rb = memcchr_quote(sp, nb, dp, nd);
   402
   403        /* not enough buffer space */
   404        if (rb < 0) {
   405            *dn = dp - ds - rb - 1;
   406            return -(sp - ss - rb - 1) - 1;
   407        }
   408
   409        /* skip already copied bytes */
   410        sp += rb;
   411        dp += rb;
   412        nb -= rb;
   413        nd -= rb;
   414
   415        /* get the escape entry, handle consecutive quotes */
   416        while (nb != 0) {
   417            ch = *(uint8_t *)sp;
   418            nc = tab[ch].n;
   419
   420            /* check for escape character */
   421            if (nc == 0) {
   422                break;
   423            }
   424
   425            /* check for buffer space */
   426            if (nc > nd) {
   427                *dn = dp - ds;
   428                return -(sp - ss) - 1;
   429            }
   430
   431            /* copy the quoted value */
   432            memcpy_p8(dp, tab[ch].s, nc);
   433            sp++;
   434            nb--;
   435            dp += nc;
   436            nd -= nc;
   437        }
   438    }
   439
   440    /* all done */
   441    *dn = dp - ds;
   442    return sp - ss;
   443}
   444
   445/** String Unquoting **/
   446
   447static const char _UnquoteTab[256] = {
   448    ['/' ] = '/',
   449    ['"' ] = '"',
   450    ['b' ] = '\b',
   451    ['f' ] = '\f',
   452    ['n' ] = '\n',
   453    ['r' ] = '\r',
   454    ['t' ] = '\t',
   455    ['u' ] = -1,
   456    ['\\'] = '\\',
   457};
   458
   459static inline ssize_t memcchr_p32(const char *s, ssize_t nb, char *p) {
   460    int64_t      r;
   461    ssize_t      n = nb;
   462    const char * q = s;
   463
   464#if USE_AVX2
   465    __m256i u;
   466    __m256i v;
   467    __m256i b = _mm256_set1_epi8('\\');
   468
   469    /* process every 32 bytes */
   470    while (n >= 32) {
   471        u = _mm256_loadu_si256  ((const void *)s);
   472        v = _mm256_cmpeq_epi8   (u, b);
   473            _mm256_storeu_si256 ((void *)p, u);
   474
   475        /* check for matches */
   476        if ((r = _mm256_movemask_epi8(v)) != 0) {
   477            return s - q + __builtin_ctzll(r);
   478        }
   479
   480        /* move to the next 32 bytes */
   481        s += 32;
   482        p += 32;
   483        n -= 32;
   484    }
   485
   486    /* clear upper half to avoid AVX-SSE transition penalty */
   487    _mm256_zeroupper();
   488#endif
   489
   490    /* initialze with '\\' */
   491    __m128i x;
   492    __m128i y;
   493    __m128i a = _mm_set1_epi8('\\');
   494
   495    /* process every 16 bytes */
   496    while (n >= 16) {
   497        x = _mm_loadu_si128  ((const void *)s);
   498        y = _mm_cmpeq_epi8   (x, a);
   499            _mm_storeu_si128 ((void *)p, x);
   500
   501        /* check for matches */
   502        if ((r = _mm_movemask_epi8(y)) != 0) {
   503            return s - q + __builtin_ctzll(r);
   504        }
   505
   506        /* move to the next 16 bytes */
   507        s += 16;
   508        p += 16;
   509        n -= 16;
   510    }
   511
   512    /* remaining bytes, do with scalar code */
   513    while (n--) {
   514        if (*s != '\\') {
   515            *p++ = *s++;
   516        } else {
   517            return s - q;
   518        }
   519    }
   520
   521    /* nothing found, but everything was copied */
   522    return -1;
   523}
   524
   525#define ALL_01h     (~0ul / 255)
   526#define ALL_7fh     (ALL_01h * 127)
   527#define ALL_80h     (ALL_01h * 128)
   528
   529static inline uint32_t hasless(uint32_t x, uint8_t n) {
   530    return (x - ALL_01h * n) & ~x & ALL_80h;
   531}
   532
   533static inline uint32_t hasmore(uint32_t x, uint8_t n) {
   534    return (x + ALL_01h * (127 - n) | x) & ALL_80h;
   535}
   536
   537static inline uint32_t hasbetween(uint32_t x, uint8_t m, uint8_t n) {
   538    return (ALL_01h * (127 + n) - (x & ALL_7fh) & ~x & (x & ALL_7fh) + ALL_01h * (127 - m)) & ALL_80h;
   539}
   540
   541#undef ALL_01h
   542#undef ALL_7fh
   543#undef ALL_80h
   544
   545static inline char ishex(char c) {
   546    return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
   547}
   548
   549static inline void unirep(char **dp) {
   550    *(*dp)++ = 0xef;
   551    *(*dp)++ = 0xbf;
   552    *(*dp)++ = 0xbd;
   553}
   554
   555static inline char unhex16_is(const char *s) {
   556    uint32_t v = *(uint32_t *)s;
   557    return !(hasless(v, '0') || hasmore(v, 'f') || hasbetween(v, '9', 'A') || hasbetween(v, 'F', 'a'));
   558}
   559
   560static inline uint32_t unhex16_fast(const char *s) {
   561    uint32_t a = __builtin_bswap32(*(uint32_t *)s);
   562    uint32_t b = 9 * ((~a & 0x10101010) >> 4) + (a & 0x0f0f0f0f);
   563    uint32_t c = (b >> 4) | b;
   564    uint32_t d = ((c >> 8) & 0xff00) | (c & 0x00ff);
   565    return d;
   566}
   567
/*
 * Decode the escape sequences in the `nb` bytes at `sp`, writing the result
 * to `dp`.
 *
 *   ep     receives the source offset associated with a failure
 *   flags  F_DBLUNQ: every escape is expected to carry one extra level of
 *                    backslash quoting, which is validated and skipped
 *          F_UNIREP: invalid surrogate escapes are replaced with the bytes
 *                    EF BF BD (U+FFFD) instead of failing
 *
 * Returns the number of bytes written to `dp`, or a negated ERR_* code
 * (-ERR_EOF, -ERR_INVAL, -ERR_ESCAPE, -ERR_UNICODE) with `*ep` set.
 *
 * No destination size is passed in: the bulk copier may store up to the whole
 * remaining input at `dp` in one call.
 * NOTE(review): presumably callers provide at least `nb` bytes at `dp` — confirm.
 */
ssize_t unquote(const char *sp, ssize_t nb, char *dp, ssize_t *ep, uint64_t flags) {
    ssize_t      n;
    ssize_t      x = nb;
    const char * s = sp;
    const char * p = dp;

    /*
     * Scan & copy all the non-escape characters.  `n` is the length of the
     * plain run before the next backslash; memcchr_p32() returns -1 once no
     * backslash is left, after copying the rest of the input.
     */
    while (nb && (n = (*sp == '\\' ? 0 : memcchr_p32(sp, nb, dp))) != -1) {
        char     cc;
        uint32_t r0;
        uint32_t r1;

        /* skip the plain text, the backslash and the escape character */
        dp += n;
        sp += n + 2;
        nb -= n + 2;

        /* check for EOF */
        if (nb < 0) {
            *ep = x;
            return -ERR_EOF;
        }

        /* check for double unquote */
        if (unlikely(flags & F_DBLUNQ)) {
            int  nr = nb;
            char c1 = sp[-1];

            /* must have at least 1 character left */
            if (nr == 0) {
                *ep = x;
                return -ERR_EOF;
            }

            /* every quote must be a double quote */
            if (c1 != '\\') {
                *ep = sp - s - 1;
                return -ERR_INVAL;
            }

            /* special case of '\\\\' and '\\\"' */
            if (*sp == '\\') {
                if (nr < 2) {
                    *ep = x;
                    return -ERR_EOF;
                } else if (sp[1] != '"' && sp[1] != '\\') {
                    *ep = sp - s + 1;
                    return -ERR_INVAL;
                } else {
                    sp++;
                    nb--;
                }
            }

            /* skip the second escape */
            sp++;
            nb--;
        }

        /* check for escape sequence; sp[-1] is the escape character */
        if ((cc = _UnquoteTab[(uint8_t)sp[-1]]) == 0) {
            *ep = sp - s - 1;
            return -ERR_ESCAPE;
        }

        /* check for simple escape sequence (-1 marks the 'u' escape) */
        if (cc != -1) {
            *dp++ = cc;
            continue;
        }

        /* must have at least 4 characters */
        if (nb < 4) {
            *ep = x;
            return -ERR_EOF;
        }

        /* check for hexadecimal characters; report the first non-hex one */
        if (!unhex16_is(sp)) {
            *ep = sp - s;
            for (int i = 0; i < 4 && ishex(*sp); i++, sp++) ++*ep;
            return -ERR_INVAL;
        }

        /* decode the code-point */
        r0 = unhex16_fast(sp);
        sp += 4;
        nb -= 4;

    /*
     * Re-entered from the "it must be the other half" check below: when the
     * second escape is not a low surrogate and F_UNIREP is set, the first half
     * is replaced and the second value is decoded again as r0.
     */
    retry_decode:

        /* ASCII characters, unlikely */
        if (unlikely(r0 <= 0x7f)) {
            *dp++ = (char)r0;
            continue;
        }

        /* two-byte UTF-8 sequences (up to U+07FF), unlikely */
        if (unlikely(r0 <= 0x07ff)) {
            *dp++ = (char)(0xc0 | (r0 >> 6));
            *dp++ = (char)(0x80 | (r0 & 0x3f));
            continue;
        }

        /* 3-byte characters, likely */
        if (likely(r0 < 0xd800 || r0 > 0xdfff)) {
            *dp++ = (char)(0xe0 | ((r0 >> 12)       ));
            *dp++ = (char)(0x80 | ((r0 >>  6) & 0x3f));
            *dp++ = (char)(0x80 | ((r0      ) & 0x3f));
            continue;
        }

        /* check for double unquote: consume the extra backslash before the second half */
        if (unlikely(flags & F_DBLUNQ)) {
            if (nb < 1) {
                if (likely(flags & F_UNIREP)) {
                    unirep(&dp);
                    continue;
                } else {
                    *ep = x;
                    return -ERR_EOF;
                }
            } else {
                if (sp[0] == '\\') {
                    nb--;
                    sp++;
                } else if (likely(flags & F_UNIREP)) {
                    unirep(&dp);
                    continue;
                } else {
                    *ep = sp - s - 4;
                    return -ERR_UNICODE;
                }
            }
        }

        /* surrogate half, must be followed by the other half */
        if (nb < 6 || r0 > 0xdbff || sp[0] != '\\' || sp[1] != 'u') {
            if (likely(flags & F_UNIREP)) {
                unirep(&dp);
                continue;
            } else {
                *ep = sp - s - ((flags & F_DBLUNQ) ? 5 : 4);
                return -ERR_UNICODE;
            }
        }

        /* check the hexadecimal escape */
        if (!unhex16_is(sp + 2)) {
            *ep = sp - s + 2;
            for (int i = 2; i < 6 && ishex(sp[i]); i++) ++*ep;
            return -ERR_INVAL;
        }

        /* decode the second code-point */
        r1 = unhex16_fast(sp + 2);
        sp += 6;
        nb -= 6;

        /* it must be the other half */
        if (r1 < 0xdc00 || r1 > 0xdfff) {
            if (unlikely(!(flags & F_UNIREP))) {
                *ep = sp - s - 4;
                return -ERR_UNICODE;
            } else {
                r0 = r1;
                unirep(&dp);
                goto retry_decode;
            }
        }

        /* merge two surrogates */
        r0 = (r0 - 0xd800) << 10;
        r1 = (r1 - 0xdc00) + 0x010000;
        r0 += r1;

        /* check the code point range */
        if (r0 > 0x10ffff) {
            if (likely(!(flags & F_UNIREP))) {
                *ep = sp - s - 4;
                return -ERR_UNICODE;
            } else {
                unirep(&dp);
                continue;
            }
        }

        /* encode the character */
        *dp++ = (char)(0xf0 | ((r0 >> 18)       ));
        *dp++ = (char)(0x80 | ((r0 >> 12) & 0x3f));
        *dp++ = (char)(0x80 | ((r0 >>  6) & 0x3f));
        *dp++ = (char)(0x80 | ((r0      ) & 0x3f));
    }

    /*
     * Calculate the result length.  When the loop ended because no backslash
     * was left, the final `nb` plain bytes were copied without advancing `dp`,
     * so they are added here.
     */
    return dp + nb - p;
}
   766
   767static inline __m128i _mm_find_html(__m128i vv) {
   768    __m128i e1 = _mm_cmpeq_epi8   (vv, _mm_set1_epi8('<'));
   769    __m128i e2 = _mm_cmpeq_epi8   (vv, _mm_set1_epi8('>'));
   770    __m128i e3 = _mm_cmpeq_epi8   (vv, _mm_set1_epi8('&'));
   771    __m128i e4 = _mm_cmpeq_epi8   (vv, _mm_set1_epi8('\xe2'));
   772    __m128i r1 = _mm_or_si128     (e1, e2);
   773    __m128i r2 = _mm_or_si128     (e3, e4);
   774    __m128i rv = _mm_or_si128     (r1, r2);
   775    return rv;
   776}
   777
   778#if USE_AVX2
   779static inline __m256i _mm256_find_html(__m256i vv) {
   780    __m256i e1 = _mm256_cmpeq_epi8   (vv, _mm256_set1_epi8('<'));
   781    __m256i e2 = _mm256_cmpeq_epi8   (vv, _mm256_set1_epi8('>'));
   782    __m256i e3 = _mm256_cmpeq_epi8   (vv, _mm256_set1_epi8('&'));
   783    __m256i e4 = _mm256_cmpeq_epi8   (vv, _mm256_set1_epi8('\xe2'));
   784    __m256i r1 = _mm256_or_si256     (e1, e2);
   785    __m256i r2 = _mm256_or_si256     (e3, e4);
   786    __m256i rv = _mm256_or_si256     (r1, r2);
   787    return rv;
   788}
   789#endif
   790
   791static inline ssize_t memcchr_html_quote(const char *sp, ssize_t nb, char *dp, ssize_t dn) {
   792    uint32_t     mm;
   793    const char * ss = sp;
   794
   795#if USE_AVX2
   796    /* 32-byte loop, full store */
   797    while (nb >= 32 && dn >= 32) {
   798        __m256i vv = _mm256_loadu_si256  ((const void *)sp);
   799        __m256i rv = _mm256_find_html    (vv);
   800                     _mm256_storeu_si256 ((void *)dp, vv);
   801
   802        /* check for matches */
   803        if ((mm = _mm256_movemask_epi8(rv)) != 0) {
   804            return sp - ss + __builtin_ctz(mm);
   805        }
   806
   807        /* move to next block */
   808        sp += 32;
   809        dp += 32;
   810        nb -= 32;
   811        dn -= 32;
   812    }
   813
   814    /* 32-byte test, partial store */
   815    if (nb >= 32) {
   816        __m256i  vv = _mm256_loadu_si256   ((const void *)sp);
   817        __m256i  rv = _mm256_find_html     (vv);
   818        uint32_t mv = _mm256_movemask_epi8 (rv);
   819        uint32_t fv = __builtin_ctzll      ((uint64_t)mv | 0x0100000000);
   820
   821        /* copy at most `dn` characters */
   822        if (fv <= dn) {
   823            memcpy_p32(dp, sp, fv);
   824            return sp - ss + fv;
   825        } else {
   826            memcpy_p32(dp, sp, dn);
   827            return -(sp - ss + dn) - 1;
   828        }
   829    }
   830
   831    /* clear upper half to avoid AVX-SSE transition penalty */
   832    _mm256_zeroupper();
   833#endif
   834
   835    /* 16-byte loop, full store */
   836    while (nb >= 16 && dn >= 16) {
   837        __m128i vv = _mm_loadu_si128  ((const void *)sp);
   838        __m128i rv =  _mm_find_html   (vv);
   839                     _mm_storeu_si128 ((void *)dp, vv);
   840
   841        /* check for matches */
   842        if ((mm = _mm_movemask_epi8(rv)) != 0) {
   843            return sp - ss + __builtin_ctz(mm);
   844        }
   845
   846        /* move to next block */
   847        sp += 16;
   848        dp += 16;
   849        nb -= 16;
   850        dn -= 16;
   851    }
   852
   853    /* 16-byte test, partial store */
   854    if (nb >= 16) {
   855        __m128i  vv = _mm_loadu_si128   ((const void *)sp);
   856        __m128i  rv =  _mm_find_html    (vv);
   857        uint32_t mv = _mm_movemask_epi8 (rv);
   858        uint32_t fv = __builtin_ctz     (mv | 0x010000);
   859
   860        /* copy at most `dn` characters */
   861        if (fv <= dn) {
   862            memcpy_p16(dp, sp, fv);
   863            return sp - ss + fv;
   864        } else {
   865            memcpy_p16(dp, sp, dn);
   866            return -(sp - ss + dn) - 1;
   867        }
   868    }
   869
   870    /* handle the remaining bytes with scalar code */
   871    while (nb > 0 && dn > 0) {
   872        if (*sp == '<' || *sp == '>' || *sp == '&' || *sp == '\xe2') {
   873            return sp - ss;
   874        } else {
   875            dn--, nb--;
   876            *dp++ = *sp++;
   877        }
   878    }
   879
   880    /* check for dest buffer */
   881    if (nb == 0) {
   882        return sp - ss;
   883    } else {
   884        return -(sp - ss) - 1;
   885    }
   886}
   887
   888ssize_t html_escape(const char *sp, ssize_t nb, char *dp, ssize_t *dn) {
   889    ssize_t          nd  = *dn;
   890    const char     * ds  = dp;
   891    const char     * ss  = sp;
   892    const quoted_t * tab = _HtmlQuoteTab;
   893
   894    /* find the special characters, copy on the fly */
   895    while (nb > 0) {
   896        int     nc = 0;
   897        uint8_t ch = 0;
   898        ssize_t rb = 0;
   899        const char * cur = 0;
   900
   901        /* not enough buffer space */
   902        if (nd <= 0) {
   903            return -(sp - ss) - 1;
   904        }
   905
   906        /* find and copy */
   907        if ((rb = memcchr_html_quote(sp, nb, dp, nd)) < 0) {
   908            *dn = dp - ds - rb - 1;
   909            return -(sp - ss - rb - 1) - 1;
   910        }
   911
   912        /* skip already copied bytes */
   913        sp += rb;
   914        dp += rb;
   915        nb -= rb;
   916        nd -= rb;
   917
   918        /* stop if already finished */
   919        if (nb <= 0) {
   920            break;
   921        }
   922
   923        /* mark cur postion */
   924        cur = sp;
   925
   926        /* check for \u2028 and \u2029, binary is \xe2\x80\xa8 and \xe2\x80\xa9 */
   927        if (unlikely(*sp == '\xe2')) {
   928            if (nb >= 3 && *(sp+1) == '\x80' && (*(sp+2) == '\xa8' || *(sp+2) == '\xa9')) {
   929                sp += 2, nb -= 2;
   930            } else if (nd > 0) {
   931                *dp++ = *sp++;
   932                nb--, nd--;
   933                continue;
   934            } else {
   935                return -(sp - ss) - 1;
   936            }
   937        }
   938
   939        /* get the escape entry, handle consecutive quotes */
   940        ch = * (uint8_t*) sp;
   941        nc = tab[ch].n;
   942
   943
   944        /* check for buffer space */
   945        if (nd < nc) {
   946            *dn = dp - ds;
   947            return -(cur - ss) - 1;
   948        }
   949
   950        /* copy the quoted value */
   951        memcpy_p8(dp, tab[ch].s, nc);
   952        sp++;
   953        nb--;
   954        dp += nc;
   955        nd -= nc;
   956    }
   957
   958    /* all done */
   959    *dn = dp - ds;
   960    return sp - ss;
   961}
   962
   963#undef MAX_ESCAPED_BYTES
   964
   965static inline long unescape(const char** src, const char* end, char* dp) {
   966    const char* sp = *src;
   967    long nb = end - sp;
   968    char cc = 0;
   969    uint32_t r0, r1;
   970
   971    if (nb <= 0) return -ERR_EOF;
   972
   973    if ((cc = _UnquoteTab[(uint8_t)sp[1]]) == 0) {
   974        *src += 1;
   975        return -ERR_ESCAPE;
   976    }
   977
   978    if (cc != -1) {
   979        *dp = cc;
   980        *src += 2;
   981        return 1;
   982    }
   983
   984    if (nb < 4) {
   985        *src += 1;
   986        return -ERR_EOF;
   987    }
   988
   989    /* check for hexadecimal characters */
   990    if (!unhex16_is(sp + 2)) {
   991        *src += 2;
   992        return -ERR_INVAL;
   993    }
   994
   995    /* decode the code-point */
   996    r0 = unhex16_fast(sp + 2);
   997    sp += 6;
   998    *src = sp;
   999
  1000    /* ASCII characters, unlikely */
  1001    if (unlikely(r0 <= 0x7f)) {
  1002        *dp++ = (char)r0;
  1003        return 1;
  1004    }
  1005
  1006    /* latin-1 characters, unlikely */
  1007    if (unlikely(r0 <= 0x07ff)) {
  1008        *dp++ = (char)(0xc0 | (r0 >> 6));
  1009        *dp++ = (char)(0x80 | (r0 & 0x3f));
  1010        return 2;
  1011    }
  1012
  1013    /* 3-byte characters, likely */
  1014    if (likely(r0 < 0xd800 || r0 > 0xdfff)) {
  1015        *dp++ = (char)(0xe0 | ((r0 >> 12)       ));
  1016        *dp++ = (char)(0x80 | ((r0 >>  6) & 0x3f));
  1017        *dp++ = (char)(0x80 | ((r0      ) & 0x3f));
  1018        return 3;
  1019    }
  1020
  1021    /* surrogate half, must follows by the other half */
  1022    if (nb < 6 || r0 > 0xdbff || sp[0] != '\\' || sp[1] != 'u') {
  1023        return -ERR_UNICODE;
  1024    }
  1025
  1026    /* check the hexadecimal escape */
  1027    if (!unhex16_is(sp + 2)) {
  1028        *src += 2;
  1029        return -ERR_INVAL;
  1030    }
  1031
  1032    /* decode the second code-point */
  1033    r1 = unhex16_fast(sp + 2);
  1034
  1035    /* it must be the other half */
  1036    if (r1 < 0xdc00 || r1 > 0xdfff) {
  1037        *src += 2;
  1038        return -ERR_UNICODE;
  1039    }
  1040
  1041    /* merge two surrogates */
  1042    r0 = (r0 - 0xd800) << 10;
  1043    r1 = (r1 - 0xdc00) + 0x010000;
  1044    r0 += r1;
  1045
  1046    /* encode the character */
  1047    *dp++ = (char)(0xf0 | ((r0 >> 18)       ));
  1048    *dp++ = (char)(0x80 | ((r0 >> 12) & 0x3f));
  1049    *dp++ = (char)(0x80 | ((r0 >>  6) & 0x3f));
  1050    *dp++ = (char)(0x80 | ((r0      ) & 0x3f));
  1051    *src = sp + 6;
  1052    return 4;
  1053}

View as plain text