-
Notifications
You must be signed in to change notification settings - Fork 55
/
Copy pathdiv_u64_u16.cc
218 lines (192 loc) · 5.74 KB
/
div_u64_u16.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
/**
* For specific case of unsigned long division by unsigned short.
* The division code borrowed from libdivide
* https://github.com/ridiculousfish/libdivide
*
* The code is distributed under libdivide license.
*
* Results on my computer:
*
* $ grep -m 1 'model name' /proc/cpuinfo
* model name : Intel(R) Core(TM) i7-6500U CPU @ 2.50GHz
*
* $ g++ -march=native -mtune=native -O2 div_u64_u16.cc
* $ ./a.out
* Libdivision:
* 109ms
*
* Hardware division:
* 263ms
*/
#include <assert.h>
#include <stdlib.h>
#include <chrono>
#include <functional>
#include <iomanip>
#include <iostream>
static const size_t ITER = 20 * 1000;
static const size_t NUMS = 1024;
static const unsigned char LIBDIVIDE_ADD_MARKER = 0x40;
static const unsigned char LIBDIVIDE_64_SHIFT_MASK = 0x3F;
struct libdivide_u64_t {
uint64_t magic;
uint8_t more;
};
static unsigned long longs1[NUMS] __attribute__((aligned(4096)));
static unsigned long longs2[NUMS] __attribute__((aligned(4096)));
static unsigned short shorts[NUMS] __attribute__((aligned(4096)));
static struct libdivide_u64_t denoms[NUMS] __attribute__((aligned(4096)));
static inline uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y)
{
uint64_t xl = x, yl = y;
uint64_t rl = xl * yl;
return (uint32_t)(rl >> 32);
}
static inline uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y)
{
#if defined(__SIZEOF_INT128__)
// we go here, not sure about kernel
__uint128_t xl = x, yl = y;
__uint128_t rl = xl * yl;
return (uint64_t)(rl >> 64);
#else
// full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
uint32_t mask = 0xFFFFFFFF;
uint32_t x0 = (uint32_t)(x & mask);
uint32_t x1 = (uint32_t)(x >> 32);
uint32_t y0 = (uint32_t)(y & mask);
uint32_t y1 = (uint32_t)(y >> 32);
uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0);
uint64_t x0y1 = x0 * (uint64_t)y1;
uint64_t x1y0 = x1 * (uint64_t)y0;
uint64_t x1y1 = x1 * (uint64_t)y1;
uint64_t temp = x1y0 + x0y0_hi;
uint64_t temp_lo = temp & mask;
uint64_t temp_hi = temp >> 32;
return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32);
#endif
}
uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom)
{
uint8_t more = denom->more;
// Code from libdivide_u64_gen
if (!denom->magic) {
return numer >> more;
}
else {
uint64_t q = libdivide_mullhi_u64(denom->magic, numer);
if (more & LIBDIVIDE_ADD_MARKER) {
uint64_t t = ((numer - q) >> 1) + q;
return t >> (more & LIBDIVIDE_64_SHIFT_MASK);
}
else {
// All upper bits are 0,
// don't need to mask them off.
return q >> more;
}
}
}
// libdivide_128_div_64_to_64: divides a 128-bit uint {u1, u0} by a 64-bit
// uint {v}. The result must fit in 64 bits.
// Returns the quotient directly and the remainder in *r
static uint64_t
libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r)
{
uint64_t result;
#if defined(LIBDIVIDE_X86_64) && \
defined(LIBDIVIDE_GCC_STYLE_ASM)
__asm__("divq %[v]"
: "=a"(result), "=d"(*r)
: [v] "r"(v), "a"(u0), "d"(u1)
);
#elif defined(__SIZEOF_INT128__)
__uint128_t n = ((__uint128_t)u1 << 64) | u0;
result = (uint64_t)(n / v);
*r = (uint64_t)(n - result * (__uint128_t)v);
#else
#error slow platform
#endif
return result;
}
static inline struct libdivide_u64_t
libdivide_internal_u64_gen(uint64_t d)
{
assert(d);
struct libdivide_u64_t result;
uint32_t floor_log_2_d = 63 - __builtin_clzll(d); // we have similar in kernel
// Power of 2
if ((d & (d - 1)) == 0) {
result.magic = 0;
result.more = (uint8_t)floor_log_2_d;
} else {
uint64_t proposed_m, rem;
uint8_t more;
// (1 << (64 + floor_log_2_d)) / d
proposed_m = libdivide_128_div_64_to_64(1ULL << floor_log_2_d, 0, d, &rem);
assert(rem > 0 && rem < d);
const uint64_t e = d - rem;
// This power works if e < 2**floor_log_2_d.
if (e < (1ULL << floor_log_2_d)) {
// This power works
more = floor_log_2_d;
} else {
// We have to use the general 65-bit algorithm. We need to compute
// (2**power) / d. However, we already have (2**(power-1))/d and
// its remainder. By doubling both, and then correcting the
// remainder, we can compute the larger division.
// don't care about overflow here - in fact, we expect it
proposed_m += proposed_m;
const uint64_t twice_rem = rem + rem;
if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
}
result.magic = 1 + proposed_m;
result.more = more;
// result.more's shift should in general be ceil_log_2_d. But if we
// used the smaller power, we subtract one from the shift because we're
// using the smaller power. If we're using the larger power, we
// subtract one from the shift because it's taken care of by the add
// indicator. So floor_log_2_d happens to be correct in both cases,
// which is why we do it outside of the if statement.
}
return result;
}
void
benchmark(const char *desc, std::function<void (int i)> div_cb)
{
using namespace std::chrono;
std::cout << desc << ":" << std::endl;
auto t(steady_clock::now());
for (auto i = 0; i < ITER; ++i) {
for (auto n = 0; n < NUMS; ++n)
div_cb(n);
}
auto dt = steady_clock::now() - t;
std::cout << duration_cast<milliseconds>(dt).count()
<< "ms" << std::endl;
std::cout << std::endl;
}
int
main()
{
for (auto i = 0; i < NUMS; ++i) {
longs1[i] = longs2[i] = ((long)rand() << 32) | rand();
shorts[i] = rand();
/* Usually we know the denominators in config time. */
denoms[i] = libdivide_internal_u64_gen(shorts[i]);
}
benchmark("Libdivision",
[&](int i)
{
longs1[i] = libdivide_u64_do(longs1[i], &denoms[i]);
});
benchmark("Hardware division",
[&](int i)
{
longs2[i] = longs2[i] / shorts[i];
});
for (auto i = 0; i < NUMS; ++i)
if (longs1[i] != longs2[i])
std::cerr << "Mismatch at " << i << std::endl;
return 0;
}