1/*
2 * Copyright 2021 ByteDance Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "native.h"
18#include "tab.h"
19
/* Sixteen ASCII '0' bytes. The SSE paths in this file add this vector to packed
 * digit values (0..9) to turn them into characters, and compare against it to
 * find leading '0' characters. */
static const char Vec16xA0[16] __attribute__((aligned(16))) = {
    '0', '0', '0', '0', '0', '0', '0', '0',
    '0', '0', '0', '0', '0', '0', '0', '0',
};
24
/* Eight 16-bit lanes holding 10; itoa8_sse2 multiplies the per-lane quotients
 * by this to subtract the higher digits from each lane. */
static const uint16_t Vec8x10[8] __attribute__((aligned(16))) = {
    10, 10, 10, 10,
    10, 10, 10, 10,
};
29
/* Four 32-bit lanes holding 10000; itoa8_sse2 multiplies the quotient
 * (v / 10000) by this to derive the remainder (v % 10000). */
static const uint32_t Vec4x10k[4] __attribute__((aligned(16))) = {
    10000,
    10000,
    10000,
    10000,
};
36
/* Four 32-bit lanes holding 0xd1b71759 == ceil(2^45 / 10000). itoa8_sse2
 * multiplies by this and shifts the 64-bit product right by 45 bits, which
 * divides by 10000 without a division instruction. */
static const uint32_t Vec4xDiv10k[4] __attribute__((aligned(16))) = {
    0xd1b71759,
    0xd1b71759,
    0xd1b71759,
    0xd1b71759,
};
43
/* Fixed-point reciprocals for the 16-bit high-multiply in itoa8_sse2, one set
 * per 64-bit half of the vector:
 *   0x20c5 == ceil(2^23 / 1000), 0x147b == ceil(2^19 / 100),
 *   0x3334 == ceil(2^17 / 10),   0x8000 == 2^15 (plain scaling, no division).
 * The differing scales are evened out by VecShiftPowers. */
static const uint16_t VecDivPowers[8] __attribute__((aligned(16))) = {
    0x20c5, 0x147b,
    0x3334, 0x8000,
    0x20c5, 0x147b,
    0x3334, 0x8000,
};
50
/* Powers of two (2^7, 2^11, 2^13, 2^15) used with a 16-bit high-multiply in
 * itoa8_sse2. Taking the high half of x * 2^k equals x >> (16 - k), so the
 * lanes are shifted right by 9, 5, 3 and 1 bits respectively, completing the
 * divisions started with VecDivPowers. */
static const uint16_t VecShiftPowers[8] __attribute__((aligned(16))) = {
    0x0080, 0x0800,
    0x2000, 0x8000,
    0x0080, 0x0800,
    0x2000, 0x8000,
};
57
/* Nine 16-byte shuffle masks for _mm_shuffle_epi8, indexed as row * 16.
 * Row k selects source bytes k..15 followed by k bytes of 0xff, i.e. it moves
 * the vector contents k bytes toward the start. A mask byte with its high bit
 * set (0xff) yields a zero byte in the shuffle result. */
static const uint8_t VecShiftShuffles[144] __attribute__((aligned(16))) = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff,
    0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff,
    0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff,
    0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff,
    0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff,
    0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
};
69
/*
 * Emit a single decimal digit.
 *
 * out - destination buffer
 * n   - index in out at which the digit is written
 * v   - digit value; the callers in this file pass 0..9
 *
 * Returns the index just past the written character (n + 1).
 */
static inline int itoa1(char *out, int n, uint32_t v) {
    out[n] = (char)v + '0';
    return n + 1;
}
74
75static inline int itoa2(char *out, int n, uint32_t v) {
76 out[n++] = Digits[v];
77 out[n++] = Digits[v + 1];
78 return n;
79}
80
81static inline __m128i itoa8_sse2(uint32_t v) {
82 __m128i v00 = _mm_cvtsi32_si128 (v);
83 __m128i v01 = _mm_mul_epu32 (v00, as_m128v(Vec4xDiv10k));
84 __m128i v02 = _mm_srli_epi64 (v01, 45);
85 __m128i v03 = _mm_mul_epu32 (v02, as_m128v(Vec4x10k));
86 __m128i v04 = _mm_sub_epi32 (v00, v03);
87 __m128i v05 = _mm_unpacklo_epi16 (v02, v04);
88 __m128i v06 = _mm_slli_epi64 (v05, 2);
89 __m128i v07 = _mm_unpacklo_epi16 (v06, v06);
90 __m128i v08 = _mm_unpacklo_epi32 (v07, v07);
91 __m128i v09 = _mm_mulhi_epu16 (v08, as_m128v(VecDivPowers));
92 __m128i v10 = _mm_mulhi_epu16 (v09, as_m128v(VecShiftPowers));
93 __m128i v11 = _mm_mullo_epi16 (v10, as_m128v(Vec8x10));
94 __m128i v12 = _mm_slli_epi64 (v11, 16);
95 __m128i v13 = _mm_sub_epi16 (v10, v12);
96 return v13;
97}
98
99static inline int u32toa_small(char *out, uint32_t val) {
100 int n = 0;
101 uint32_t d1 = (val / 100) << 1;
102 uint32_t d2 = (val % 100) << 1;
103
104 /* 1000-th digit */
105 if (val >= 1000) {
106 out[n++] = Digits[d1];
107 }
108
109 /* 100-th digit */
110 if (val >= 100) {
111 out[n++] = Digits[d1 + 1];
112 }
113
114 /* 10-th digit */
115 if (val >= 10) {
116 out[n++] = Digits[d2];
117 }
118
119 /* last digit */
120 out[n++] = Digits[d2 + 1];
121 return n;
122}
123
124static inline int u32toa_medium(char *out, uint32_t val) {
125 int n = 0;
126 uint32_t b = val / 10000;
127 uint32_t c = val % 10000;
128 uint32_t d1 = (b / 100) << 1;
129 uint32_t d2 = (b % 100) << 1;
130 uint32_t d3 = (c / 100) << 1;
131 uint32_t d4 = (c % 100) << 1;
132
133 /* 10000000-th digit */
134 if (val >= 10000000) {
135 out[n++] = Digits[d1];
136 }
137
138 /* 1000000-th digit */
139 if (val >= 1000000) {
140 out[n++] = Digits[d1 + 1];
141 }
142
143 /* 100000-th digit */
144 if (val >= 100000) {
145 out[n++] = Digits[d2];
146 }
147
148 /* remaining digits */
149 out[n++] = Digits[d2 + 1];
150 out[n++] = Digits[d3];
151 out[n++] = Digits[d3 + 1];
152 out[n++] = Digits[d4];
153 out[n++] = Digits[d4 + 1];
154 return n;
155}
156
157static inline int u64toa_large_sse2(char *out, uint64_t val) {
158 uint32_t a = (uint32_t)(val / 100000000);
159 uint32_t b = (uint32_t)(val % 100000000);
160
161 /* convert to digits */
162 __m128i v0 = itoa8_sse2(a);
163 __m128i v1 = itoa8_sse2(b);
164
165 /* convert to bytes, add '0' */
166 __m128i v2 = _mm_packus_epi16 (v0, v1);
167 __m128i v3 = _mm_add_epi8 (v2, as_m128v(Vec16xA0));
168
169 /* count number of digit */
170 __m128i v4 = _mm_cmpeq_epi8 (v3, as_m128v(Vec16xA0));
171 uint32_t bm = _mm_movemask_epi8 (v4);
172 uint32_t nd = __builtin_ctz (~bm | 0x8000);
173
174 /* shift digits to the beginning */
175 __m128i p = _mm_loadu_si128 (as_m128c(&VecShiftShuffles[nd * 16]));
176 __m128i r = _mm_shuffle_epi8 (v3, p);
177
178 /* store the result */
179 _mm_storeu_si128(as_m128p(out), r);
180 return 16 - nd;
181}
182
183static inline int u64toa_xlarge_sse2(char *out, uint64_t val) {
184 int n = 0;
185 uint64_t b = val % 10000000000000000;
186 uint32_t a = (uint32_t)(val / 10000000000000000);
187
188 /* the highest 4 digits */
189 if (a < 10) {
190 n = itoa1(out, n, a);
191 } else if (a < 100) {
192 n = itoa2(out, n, a << 1);
193 } else if (a < 1000) {
194 n = itoa1(out, n, a / 100);
195 n = itoa2(out, n, (a % 100) << 1);
196 } else {
197 n = itoa2(out, n, (a / 100) << 1);
198 n = itoa2(out, n, (a % 100) << 1);
199 }
200
201 /* remaining digits */
202 __m128i v0 = itoa8_sse2 ((uint32_t)(b / 100000000));
203 __m128i v1 = itoa8_sse2 ((uint32_t)(b % 100000000));
204 __m128i v2 = _mm_packus_epi16 (v0, v1);
205 __m128i v3 = _mm_add_epi8 (v2, as_m128v(Vec16xA0));
206
207 /* convert to bytes, add '0' */
208 _mm_storeu_si128(as_m128p(&out[n]), v3);
209 return n + 16;
210}
211
/*
 * Write the decimal representation of a signed 64-bit integer to out.
 *
 * out - destination buffer; must have room for an optional '-' plus whatever
 *       u64toa stores for the magnitude. No terminator is written here.
 * val - value to format
 *
 * Returns the number of characters produced, including the '-' sign for
 * negative values.
 */
int i64toa(char *out, int64_t val) {
    if (likely(val >= 0)) {
        return u64toa(out, (uint64_t)val);
    }

    /* Negate in unsigned arithmetic: `-val` overflows (undefined behavior)
     * when val == INT64_MIN, whereas 0 - (uint64_t)val wraps modulo 2^64 and
     * yields the correct magnitude for every negative input. */
    *out = '-';
    return u64toa(out + 1, 0 - (uint64_t)val) + 1;
}
220
/*
 * Write the decimal representation of an unsigned 64-bit integer to out.
 *
 * Dispatches on magnitude: up to 4 digits, 5..8 digits, 9..16 digits and
 * 17..20 digits each have a dedicated helper.
 *
 * out - destination buffer; no terminator is written. The 9..16 digit helper
 *       always stores a full 16 bytes, so out needs at least that much room
 *       once val >= 100000000.
 * val - value to format
 *
 * Returns the number of characters produced.
 */
int u64toa(char *out, uint64_t val) {
    if (likely(val < 10000)) {
        return u32toa_small(out, (uint32_t)val);
    }

    if (likely(val < 100000000)) {
        return u32toa_medium(out, (uint32_t)val);
    }

    if (likely(val < 10000000000000000)) {
        return u64toa_large_sse2(out, val);
    }

    return u64toa_xlarge_sse2(out, val);
}
View as plain text