native.c

Documentation: github.com/chenzhuoyu/base64x/native

     1#include <stdint.h>
     2#include <immintrin.h>
     3#include <sys/types.h>
     4
/* Bit flags accepted in the `mode` argument of b64encode() / b64decode(). */

/* Use the URL-safe alphabet ('-' and '_') instead of the standard one ('+' and '/'). */
#define MODE_URL        1
/* Unpadded variant: the encoder emits no '=', the decoder rejects '=' and
 * accepts an incomplete final group of 2 or 3 characters. */
#define MODE_RAW        2
/* Enable the AVX2 fast paths; without it only the scalar loops run. */
#define MODE_AVX2       4
/* Decoder only: interpret backslash escapes found in the input. */
#define MODE_JSON       8

/* Access the memory at `v` as a 32-bit / 64-bit integer lvalue. */
#define as_m32v(v)      (*(uint32_t *)(v))
#define as_m64v(v)      (*(uint64_t *)(v))

/* Reinterpret `v` as a pointer to a writable 128-bit / 256-bit vector. */
#define as_m128p(v)     ((__m128i *)(v))
#define as_m256p(v)     ((__m256i *)(v))

/* Reinterpret `v` as a pointer to read-only bytes / 128-bit / 256-bit vectors. */
#define as_m8c(v)       ((const uint8_t *)(v))
#define as_m128c(v)     ((const __m128i *)(v))
#define as_m256c(v)     ((const __m256i *)(v))

/* Force inlining of the small helpers (GCC/Clang attribute syntax). */
#define always_inline   inline __attribute__((always_inline))
    21
/* A byte buffer described by pointer, used length and capacity. */
struct slice_t {
    char * buf;     /* start of the buffer */
    size_t len;     /* bytes already in use; output is appended at buf + len */
    size_t cap;     /* total size; the decoder treats buf + cap as the end of writable space */
};
    27
    28/** Exported Functions **/
    29
/* Append the Base64 encoding of `src` to `out` and grow out->len accordingly.
 * `mode` is a combination of the MODE_* flags. */
void    b64encode(struct slice_t *out, const struct slice_t *src, int mode);
/* Decode `nb` bytes of Base64 text at `src`, appending to `out`.  Returns the
 * number of bytes produced, or a negative value when the input is rejected. */
ssize_t b64decode(struct slice_t *out, const char *src, size_t nb, int mode);
    32
    33/** Encoder Helper Functions **/
    34
/* Scalar encoder alphabets, indexed by a 6-bit value.  Each initializer is
 * exactly 64 characters long, so the arrays carry no terminating NUL. */
static const char TabEncodeCharsetStd[64] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
static const char TabEncodeCharsetURL[64] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
    37
/* Byte-shuffle control used by encode_avx2(): within each 128-bit lane, every
 * group of three source bytes (b0, b1, b2) is spread over four bytes in the
 * order (b1, b0, b2, b1).  Only source bytes 0..11 of a lane are referenced. */
static const uint8_t VecEncodeShuffles[32] = {
    1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10,
    1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10,
};
    42
/* Per-range offsets that encode_avx2() adds to a 6-bit value to obtain its
 * ASCII character (standard alphabet).  The selector computed there is:
 *   0      for values 26..51  ('a'..'z')
 *   1..10  for values 52..61  ('0'..'9')
 *   11     for value  62
 *   12     for value  63
 *   13     for values 0..25   ('A'..'Z')
 * The same 16 entries are repeated for both 128-bit lanes. */
static const uint8_t VecEncodeCharsetStd[32] = {
    'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
    '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A'     ,        0,        0,
    'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
    '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A'     ,        0,        0,
};

/* URL-safe counterpart of the table above: values 62 and 63 map to '-' and '_'. */
static const uint8_t VecEncodeCharsetURL[32] = {
    'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
    '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A'     ,        0,        0,
    'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
    '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A'     ,        0,        0,
};
    56
    57static always_inline __m256i encode_avx2(__m128i v0, __m128i v1, const uint8_t *tab) {
    58    __m256i vv = _mm256_set_m128i    (v1, v0);
    59    __m256i sh = _mm256_loadu_si256  (as_m256c(VecEncodeShuffles));
    60    __m256i in = _mm256_shuffle_epi8 (vv, sh);
    61    __m256i t0 = _mm256_and_si256    (in, _mm256_set1_epi32(0x0fc0fc00));
    62    __m256i t1 = _mm256_mulhi_epu16  (t0, _mm256_set1_epi32(0x04000040));
    63    __m256i t2 = _mm256_and_si256    (in, _mm256_set1_epi32(0x003f03f0));
    64    __m256i t3 = _mm256_mullo_epi16  (t2, _mm256_set1_epi32(0x01000010));
    65    __m256i vi = _mm256_or_si256     (t1, t3);
    66    __m256i s0 = _mm256_cmpgt_epi8   (_mm256_set1_epi8(26), vi);
    67    __m256i s1 = _mm256_and_si256    (_mm256_set1_epi8(13), s0);
    68    __m256i s2 = _mm256_loadu_si256  (as_m256c(tab));
    69    __m256i r0 = _mm256_subs_epu8    (vi, _mm256_set1_epi8(51));
    70    __m256i r1 = _mm256_or_si256     (r0, s1);
    71    __m256i r2 = _mm256_shuffle_epi8 (s2, r1);
    72    __m256i r3 = _mm256_add_epi8     (vi, r2);
    73    return r3;
    74}
    75
    76/** Function Implementations **/
    77
    78void b64encode(struct slice_t *out, const struct slice_t *src, int mode) {
    79    char *          ob = out->buf + out->len;
    80    char *          op = out->buf + out->len;
    81    const char *    ip = src->buf;
    82    const char *    ie = src->buf + src->len;
    83    const char *    st = TabEncodeCharsetStd;
    84    const uint8_t * vt = VecEncodeCharsetStd;
    85
    86    /* check for empty string */
    87    if (src->len == 0) {
    88        return;
    89    }
    90
    91    /* check for URL encoding */
    92    if (mode & MODE_URL) {
    93        st = TabEncodeCharsetURL;
    94        vt = VecEncodeCharsetURL;
    95    }
    96
    97    /* SIMD 24 bytes loop, but the SIMD instruction will load 4 bytes
    98     * past the end, so it's safe only if there are 28 bytes or more left */
    99    while ((ip <= ie - 28) && (mode & MODE_AVX2) != 0) {
   100        __m128i v0 = _mm_loadu_si128 (as_m128c(ip));
   101        __m128i v1 = _mm_loadu_si128 (as_m128c(ip + 12));
   102        __m256i vv = encode_avx2     (v0, v1, vt);
   103
   104        /* store the result, and advance buffer pointers */
   105        _mm256_storeu_si256(as_m256p(op), vv);
   106        op += 32;
   107        ip += 24;
   108    }
   109
   110    /* can do one more 24 bytes round, but needs special handling */
   111    if ((ip <= ie - 24) && (mode & MODE_AVX2) != 0) {
   112        __m128i v0 = _mm_loadu_si128 (as_m128c(ip));
   113        __m128i v1 = _mm_loadu_si128 (as_m128c(ip + 8));
   114        __m128i v2 = _mm_srli_si128  (v1, 4);
   115        __m256i vv = encode_avx2     (v0, v2, vt);
   116
   117        /* store the result, and advance buffer pointers */
   118        _mm256_storeu_si256(as_m256p(op), vv);
   119        op += 32;
   120        ip += 24;
   121    }
   122
   123    /* no more bytes */
   124    if (ip == ie) {
   125        out->len += op - ob;
   126        return;
   127    }
   128
   129    /* handle the remaining bytes with scalar code (with 4 bytes load) */
   130    while (ip <= ie - 4) {
   131        uint32_t v0 = __builtin_bswap32(*(const uint32_t *)ip);
   132        uint8_t  v1 = (v0 >> 26) & 0x3f;
   133        uint8_t  v2 = (v0 >> 20) & 0x3f;
   134        uint8_t  v3 = (v0 >> 14) & 0x3f;
   135        uint8_t  v4 = (v0 >>  8) & 0x3f;
   136
   137        /* encode the characters, and move to next block */
   138        ip += 3;
   139        *op++ = st[v1];
   140        *op++ = st[v2];
   141        *op++ = st[v3];
   142        *op++ = st[v4];
   143    }
   144
   145    /* load the last bytes */
   146    size_t   dp = ie - ip;
   147    uint32_t v0 = (uint32_t)(uint8_t)ip[0] << 16;
   148
   149#define B2 v0 |= (uint32_t)(uint8_t)ip[2]
   150#define B1 v0 |= (uint32_t)(uint8_t)ip[1] << 8
   151
   152#define R4 *op++ = st[(v0 >>  0) & 0x3f]
   153#define R3 *op++ = st[(v0 >>  6) & 0x3f]
   154#define R2 *op++ = st[(v0 >> 12) & 0x3f]
   155#define R1 *op++ = st[(v0 >> 18) & 0x3f]
   156
   157#define NB { out->len += op - ob; }
   158#define PD { if ((mode & MODE_RAW) == 0) { *op++ = '='; } }
   159
   160    /* encode the last few bytes */
   161    switch (dp) {
   162        case 3  : B2; B1; R1; R2; R3; R4; NB; break;
   163        case 2  :     B1; R1; R2; R3; PD; NB; break;
   164        case 1  :         R1; R2; PD; PD; NB; break;
   165        default :                         NB; break;
   166    }
   167
   168#undef PD
   169#undef NB
   170#undef R1
   171#undef R2
   172#undef R3
   173#undef R4
   174#undef B1
   175#undef B2
   176}
   177
   178/** Decoder Helper Functions **/
   179
/* Byte-shuffle control used by decode_avx2(): within each 128-bit lane, take
 * the three low bytes of every 32-bit element in reversed order and pack them
 * into the first 12 bytes; index 128 has the high bit set, which makes the
 * shuffle write a zero byte. */
static const uint8_t VecPacking[32] = {
    2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 128, 128, 128, 128,
    2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 128, 128, 128, 128
};

/* Maps a high nibble h to the single-bit mask (1 << h) for h in 0..7 and to 0
 * for h in 8..15; decode_avx2() ANDs it with the per-low-nibble validity mask. */
static const uint8_t VecDecodeBits[32] = {
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
   189
/* Four consecutive 32-byte vectors consumed by decode_avx2() (standard alphabet):
 *   [  0.. 31] offset added to a character, selected by its high nibble
 *   [ 32.. 63] validity bitmask selected by the low nibble; bit h is set when
 *              the byte with that low nibble and high nibble h is accepted
 *   [ 64.. 95] the one character that needs its own offset ('/' = 0x2f)
 *   [ 96..127] the offset used for that character
 * Each 16-byte pattern is repeated for both 128-bit lanes. */
static const uint8_t VecDecodeTableStd[128] = {
    0x00, 0x00, 0x13, 0x04, 0xbf, 0xbf, 0xb9, 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x13, 0x04, 0xbf, 0xbf, 0xb9, 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0xa8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf0, 0x54, 0x50, 0x50, 0x50, 0x54,
    0xa8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf0, 0x54, 0x50, 0x50, 0x50, 0x54,
    0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
    0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
    0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
    0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
};

/* URL-safe counterpart with the same four-vector layout; the specially
 * handled character is '_' (0x5f). */
static const uint8_t VecDecodeTableURL[128] = {
    0x00, 0x00, 0x11, 0x04, 0xbf, 0xbf, 0xb9, 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x11, 0x04, 0xbf, 0xbf, 0xb9, 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0xa8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf0, 0x50, 0x50, 0x54, 0x50, 0x70,
    0xa8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf0, 0x50, 0x50, 0x54, 0x50, 0x70,
    0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f,
    0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f, 0x5f,
    0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
    0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
};
   211
/* Scalar decode table for the standard alphabet: maps an input byte to its
 * 6-bit value (0..63), or to 0xff when the byte is not part of the alphabet
 * ('=' is therefore 0xff as well).  One row covers 16 consecutive byte values. */
static const uint8_t VecDecodeCharsetStd[256] = {
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,   62, 0xff, 0xff, 0xff,   63,
      52,   53,   54,   55,   56,   57,   58,   59,   60,   61, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff,    0,    1,    2,    3,    4,    5,    6,    7,    8,    9,   10,   11,   12,   13,   14,
      15,   16,   17,   18,   19,   20,   21,   22,   23,   24,   25, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff,   26,   27,   28,   29,   30,   31,   32,   33,   34,   35,   36,   37,   38,   39,   40,
      41,   42,   43,   44,   45,   46,   47,   48,   49,   50,   51, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

/* Scalar decode table for the URL-safe alphabet: same layout as the standard
 * table, but '-' maps to 62 and '_' maps to 63 while '+' and '/' are invalid. */
static const uint8_t VecDecodeCharsetURL[256] = {
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,   62, 0xff, 0xff,
      52,   53,   54,   55,   56,   57,   58,   59,   60,   61, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff,    0,    1,    2,    3,    4,    5,    6,    7,    8,    9,   10,   11,   12,   13,   14,
      15,   16,   17,   18,   19,   20,   21,   22,   23,   24,   25, 0xff, 0xff, 0xff, 0xff,   63,
    0xff,   26,   27,   28,   29,   30,   31,   32,   33,   34,   35,   36,   37,   38,   39,   40,
      41,   42,   43,   44,   45,   46,   47,   48,   49,   50,   51, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
   249
   250static always_inline void memcopy_24(char *dp, const uint8_t *sp) {
   251    *(uint64_t *)(dp +  0) = *(const uint64_t *)(sp +  0);
   252    *(uint64_t *)(dp +  8) = *(const uint64_t *)(sp +  8);
   253    *(uint64_t *)(dp + 16) = *(const uint64_t *)(sp + 16);
   254}
   255
   256static always_inline __m256i decode_avx2(__m256i v0, int *pos, const uint8_t *tab) {
   257    __m256i v1 = _mm256_srli_epi32           (v0, 4);
   258    __m256i vl = _mm256_and_si256            (v0, _mm256_set1_epi8(0x0f));
   259    __m256i vh = _mm256_and_si256            (v1, _mm256_set1_epi8(0x0f));
   260    __m256i st = _mm256_loadu_si256          (as_m256c(tab));
   261    __m256i mt = _mm256_loadu_si256          (as_m256c(tab + 32));
   262    __m256i et = _mm256_loadu_si256          (as_m256c(tab + 64));
   263    __m256i rt = _mm256_loadu_si256          (as_m256c(tab + 96));
   264    __m256i pt = _mm256_loadu_si256          (as_m256c(VecPacking));
   265    __m256i bt = _mm256_loadu_si256          (as_m256c(VecDecodeBits));
   266    __m256i sh = _mm256_shuffle_epi8         (st, vh);
   267    __m256i eq = _mm256_cmpeq_epi8           (v0, et);
   268    __m256i sv = _mm256_blendv_epi8          (sh, rt, eq);
   269    __m256i bm = _mm256_shuffle_epi8         (mt, vl);
   270    __m256i bv = _mm256_shuffle_epi8         (bt, vh);
   271    __m256i mr = _mm256_and_si256            (bm, bv);
   272    __m256i nm = _mm256_cmpeq_epi8           (mr, _mm256_setzero_si256());
   273    __m256i sr = _mm256_add_epi8             (v0, sv);
   274    __m256i r0 = _mm256_and_si256            (sr, _mm256_set1_epi8(0x3f));
   275    __m256i r1 = _mm256_maddubs_epi16        (r0, _mm256_set1_epi32(0x01400140));
   276    __m256i r2 = _mm256_madd_epi16           (r1, _mm256_set1_epi32(0x00011000));
   277    __m256i r3 = _mm256_shuffle_epi8         (r2, pt);
   278    __m256i r4 = _mm256_permutevar8x32_epi32 (r3, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, 3, 7));
   279    int64_t mp = _mm256_movemask_epi8        (nm);
   280    int32_t np = __builtin_ctzll             (mp | 0xffffffff00000000);
   281    return (*pos = np), r4;
   282}
   283
   284
   285#define ALL_01h     (~0ul / 255)
   286#define ALL_7fh     (ALL_01h * 127)
   287#define ALL_80h     (ALL_01h * 128)
   288
   289static always_inline uint32_t hasless(uint32_t x, uint8_t n) {
   290    return (x - ALL_01h * n) & ~x & ALL_80h;
   291}
   292
   293static always_inline uint32_t hasmore(uint32_t x, uint8_t n) {
   294    return (x + ALL_01h * (127 - n) | x) & ALL_80h;
   295}
   296
   297static always_inline uint32_t hasbetween(uint32_t x, uint8_t m, uint8_t n) {
   298    return (ALL_01h * (127 + n) - (x & ALL_7fh) & ~x & (x & ALL_7fh) + ALL_01h * (127 - m)) & ALL_80h;
   299}
   300
   301#undef ALL_01h
   302#undef ALL_7fh
   303#undef ALL_80h
   304
   305static always_inline char unhex16_is(const uint8_t *s) {
   306    uint32_t v = *(uint32_t *)s;
   307    return !(hasless(v, '0') || hasmore(v, 'f') || hasbetween(v, '9', 'A') || hasbetween(v, 'F', 'a'));
   308}
   309
   310static always_inline uint32_t unhex16_fast(const uint8_t *s) {
   311    uint32_t a = __builtin_bswap32(*(uint32_t *)s);
   312    uint32_t b = 9 * ((~a & 0x10101010) >> 4) + (a & 0x0f0f0f0f);
   313    uint32_t c = (b >> 4) | b;
   314    uint32_t d = ((c >> 8) & 0xff00) | (c & 0x00ff);
   315    return d;
   316}
   317
   318static always_inline uint8_t unescape_asc(const uint8_t * ie, const uint8_t ** ipp) {
   319    const uint8_t * ee = (*ipp) + 1;
   320    uint32_t ch = 0xff; 
   321    /* check eof */
   322    if (ee > ie) {
   323        return 0xff;
   324    }
   325    switch (ee[-1]) {
   326        case 'r': ch = '\r'; break;
   327        case 'n': ch = '\n'; break;
   328        case '/': ch = '/'; break;
   329        case 'u': /* neee more 4 bytes */
   330        if (ie - ee >= 4 && unhex16_is(ee)) { 
   331            ch = unhex16_fast(ee);
   332            /* if not ascii, as 0xff */
   333            ch = ch < 128 ? ch : 0xff;
   334            ee += 4;
   335        }
   336        break;
   337    }
   338    *ipp = ee;
   339    return ch;
   340}
   341
/* Decode one group of up to 4 Base64 characters with scalar code.
 *
 * ie   - end of the input.
 * ipp  - in/out: current input position; advanced past the consumed
 *        characters on success, left unchanged on error.
 * opp  - in/out: current output position; advanced by the number of bytes
 *        written (1 to 3) on success.  No output bound is checked here.
 * tab  - 256-entry table mapping a byte to its 6-bit value, 0xff if invalid.
 * mode - MODE_RAW forbids '=' and allows a short final group; MODE_JSON
 *        enables backslash unescaping.
 *
 * '\r' and '\n' are skipped anywhere in the input.
 *
 * Return 0 if success, otherwise return the error position + 1, measured
 * from the initial *ipp.
 *
 * NOTE(review): when the offending character is the very last input byte,
 * the end-of-input adjustment at the `error` label also applies, so the
 * returned value is one larger than for the same character elsewhere --
 * confirm this is what the caller expects.
 * NOTE(review): once a '=' has been seen, reaching the end of input with
 * fewer characters than a full group (for example two data characters and a
 * single '=') is accepted -- confirm this leniency is intended. */
static always_inline int64_t decode_block(
    const uint8_t *  ie,
    const uint8_t ** ipp,
    char **          opp,
    const uint8_t *  tab,
    int              mode
) {
    int      nb = 0;    /* number of data characters collected so far */
    uint32_t v0 = 0;    /* accumulated 6-bit values */

    /* buffer pointers */
    char *          op = *opp;
    const uint8_t * ip = *ipp;
    uint8_t id = 0;     /* table value of the current character */
    uint8_t ch = 0;     /* current (possibly unescaped) character */
    int pad = 0;        /* number of '=' characters seen */

/* both macros rely on the local variables above; skip_newlines() must be
 * expanded directly inside a loop body because it uses `continue` */
#define may_unescape() { if (ch == '\\' && (mode & MODE_JSON)) ch = unescape_asc(ie, &ip); }
#define skip_newlines() { if (ch == '\r' || ch == '\n') continue; }

    /* load up to 4 characters */
    while (ip < ie && nb < 4) {
        ch = *ip++;
        may_unescape();
        skip_newlines();

        /* lookup the index, and check for invalid characters */
        if ((id = tab[ch]) == 0xff) {
            /* only '=' after at least two data characters is tolerated, and
             * never in raw mode */
            if ((mode & MODE_RAW) || ch != '=' || nb < 2) goto error;
            pad++; goto tail;
        }

        /* decode the character */
        v0 = (v0 << 6) | id;
        nb++;
    }

    /* nothing but skipped newlines: consume them and report success */
    if (nb == 0) {
        /* update the pointers */
        *ipp = ip;
        return 0;
    }

    /* input ended inside a group: padded mode requires the '=' characters,
     * raw mode only rejects a lone character */
    if (ip >= ie && nb != 4) {
        if (!(mode & MODE_RAW) || nb == 1) goto error;
    }

decode:
    /* left-align a short group as if it had 4 characters */
    v0 <<= 6 * (4 - nb);
    /* ends with eof or 4 characters, decode into output; every case falls
     * through on purpose, so nb characters produce nb - 1 bytes */
    switch (nb) {
        case 4: op[2] = (v0 >>  0) & 0xff;
        case 3: op[1] = (v0 >>  8) & 0xff;
        case 2: op[0] = (v0 >> 16) & 0xff;
    }

    /* update the pointers */
    *ipp = ip;
    *opp = op + nb - 1;
    return 0;

tail:
    /* loop for more paddings: everything up to the end of input must be '='
     * (or a skipped newline), and data plus padding may not exceed 4 */
    while (ip < ie) {
        ch = *ip++;
        may_unescape();
        skip_newlines();
        if (ch != '=') goto error;
        if (++pad + nb > 4) goto error;
    }
    goto decode;
#undef may_unescape
#undef skip_newlines

error:
    /* update eof error position */
    if (ip == ie) ip++;
    return ip - *ipp;


}
   425
   426ssize_t b64decode(struct slice_t *out, const char *src, size_t nb, int mode) {
   427    int     ep;
   428    __m256i vv;
   429    int64_t dv;
   430    uint8_t buf[32] = {0};
   431
   432    /* check for empty input */
   433    if (nb == 0) {
   434        return 0;
   435    }
   436
   437    /* output buffer */
   438    char *ob = out->buf + out->len;
   439    char *op = out->buf + out->len;
   440    char *oe = out->buf + out->cap;
   441
   442    /* input buffer */
   443    const uint8_t *dt = VecDecodeTableStd;
   444    const uint8_t *st = VecDecodeCharsetStd;
   445    const uint8_t *ib = (const uint8_t *)src;
   446    const uint8_t *ip = (const uint8_t *)src;
   447    const uint8_t *ie = (const uint8_t *)src + nb;
   448
   449    /* check for URL encoding */
   450    if (mode & MODE_URL) {
   451        dt = VecDecodeTableURL;
   452        st = VecDecodeCharsetURL;
   453    }
   454
   455    /* decode every 32 bytes, the final round should be handled separately, because the
   456     * SIMD instruction performs 32-byte store, and it might store past the end of the
   457     * output buffer */
   458    if ((mode & MODE_AVX2) == 0) {
   459        goto scalar;
   460    }
   461    while ((ip <= ie - 32) && (op <= oe - 32)) {
   462        vv = _mm256_loadu_si256(as_m256c(ip));
   463        vv = decode_avx2(vv, &ep, dt);
   464
   465        /* check for invalid characters (or '=' paddings) */
   466        if (ep < 32) {
   467            if ((dv = decode_block(ie, &ip, &op, st, mode)) != 0) {
   468                return ib - ip - dv;
   469            } else {
   470                continue;
   471            }
   472        }
   473
   474        _mm256_storeu_si256(as_m256p(op), vv);
   475
   476        /* move to next block */
   477        ip += 32;
   478        op += 24;
   479    }
   480
   481scalar:
   482    /* handle the remaining bytes with scalar code (8 byte loop) */
   483    while (ip <= ie - 8 && op <= oe - 8) {
   484        uint8_t v0 = st[ip[0]];
   485        uint8_t v1 = st[ip[1]];
   486        uint8_t v2 = st[ip[2]];
   487        uint8_t v3 = st[ip[3]];
   488        uint8_t v4 = st[ip[4]];
   489        uint8_t v5 = st[ip[5]];
   490        uint8_t v6 = st[ip[6]];
   491        uint8_t v7 = st[ip[7]];
   492
   493        /* check for invalid bytes */
   494        if ((v0 | v1 | v2 | v3 | v4 | v5 | v6 | v7) == 0xff) {
   495            if ((dv = decode_block(ie, &ip, &op, st, mode)) != 0) {
   496                return ib - ip - dv;
   497            } else {
   498                continue;
   499            }
   500        }
   501
   502        /* construct the characters */
   503        uint64_t vv = __builtin_bswap64(
   504            ((uint64_t)v0 << 58) |
   505            ((uint64_t)v1 << 52) |
   506            ((uint64_t)v2 << 46) |
   507            ((uint64_t)v3 << 40) |
   508            ((uint64_t)v4 << 34) |
   509            ((uint64_t)v5 << 28) |
   510            ((uint64_t)v6 << 22) |
   511            ((uint64_t)v7 << 16)
   512        );
   513
   514        /* store the result, and move to next block */
   515        as_m64v(op) = vv;
   516        ip += 8;
   517        op += 6;
   518    }
   519
   520    /* handle the remaining bytes with scalar code (4 byte loop) */
   521    while (ip <= ie - 4 && op <= oe - 4) {
   522        uint8_t v0 = st[ip[0]];
   523        uint8_t v1 = st[ip[1]];
   524        uint8_t v2 = st[ip[2]];
   525        uint8_t v3 = st[ip[3]];
   526
   527        /* check for invalid bytes */
   528        if ((v0 | v1 | v2 | v3) == 0xff) {
   529            if ((dv = decode_block(ie, &ip, &op, st, mode)) != 0) {
   530                return ib - ip - dv;
   531            } else {
   532                continue;
   533            }
   534        }
   535
   536        /* construct the characters */
   537        uint32_t vv = __builtin_bswap32(
   538            ((uint32_t)v0 << 26) |
   539            ((uint32_t)v1 << 20) |
   540            ((uint32_t)v2 << 14) |
   541            ((uint32_t)v3 <<  8)
   542        );
   543
   544        /* store the result, and move to next block */
   545        as_m32v(op) = vv;
   546        ip += 4;
   547        op += 3;
   548    }
   549
   550    /* decode the last few bytes */
   551    while (ip < ie) {
   552        if ((dv = decode_block(ie, &ip, &op, st, mode)) != 0) {
   553            return ib - ip - dv;
   554        }
   555    }
   556
   557    /* update the result length */
   558    out->len += op - ob;
   559    return op - ob;
   560}
View as plain text