Skip to content

Commit dbb88c7

Browse files
authored
[PERF] taking upper 32bit of PyObject_Hash into account (#39592)
1 parent 545cbbf commit dbb88c7

File tree

3 files changed

+35
-5
lines changed

3 files changed

+35
-5
lines changed

asv_bench/benchmarks/hash_functions.py

+9
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,15 @@ def time_isin_outside(self, dtype, exponent):
2525
self.s.isin(self.values_outside)
2626

2727

28+
class UniqueForLargePyObjectInts:
    """Benchmark ``pd.unique`` on an object-dtype array of large Python ints.

    Every element is a multiple of ``2**32``, so the values differ only in
    the bits above the low 32.  A hash function that looks only at the low
    32 bits would therefore map all of them to the same value.
    """

    def setup(self):
        """Build the 5000-element object array used by the timing method."""
        step = 1 << 32
        shifted_ints = [step * k for k in range(5000)]
        self.arr = np.array(shifted_ints, dtype=np.object_)

    def time_unique(self):
        """Time ``pd.unique`` on the prepared array; the result is discarded."""
        pd.unique(self.arr)
35+
36+
2837
class IsinWithRandomFloat:
2938
params = [
3039
[np.float64, np.object],

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,7 @@ Performance improvements
253253
- Performance improvement in :meth:`DataFrame.corr` for method=kendall (:issue:`28329`)
254254
- Performance improvement in :meth:`core.window.rolling.Rolling.corr` and :meth:`core.window.rolling.Rolling.cov` (:issue:`39388`)
255255
- Performance improvement in :meth:`core.window.rolling.RollingGroupby.corr`, :meth:`core.window.rolling.RollingGroupby.cov`, :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` (:issue:`39591`)
256+
- Performance improvement in :func:`unique` for object data type (:issue:`37615`)
256257

257258
.. ---------------------------------------------------------------------------
258259

pandas/_libs/src/klib/khash_python.h

+25-5
Original file line numberDiff line numberDiff line change
@@ -178,11 +178,31 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
178178
return result;
179179
}
180180

181-
// For PyObject_Hash holds:
182-
// hash(0.0) == 0 == hash(-0.0)
183-
// hash(X) == 0 if X is a NaN-value
184-
// so it is OK to use it directly
185-
#define kh_python_hash_func(key) (PyObject_Hash(key))
181+
182+
khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){
    /*
     * Reduce the Python-level hash of `key` to a 32-bit hash value.
     *
     * For PyObject_Hash holds:
     *    hash(0.0) == 0 == hash(-0.0)
     *    hash(X) == 0 if X is a NaN-value
     * so for doubles the Python hash can be used as-is.
     *
     * Returns 0 when PyObject_Hash reports failure (-1); the pending
     * Python error is cleared in that case.
     */
    const Py_hash_t py_hash = PyObject_Hash(key);
    if (py_hash == -1) {
        PyErr_Clear();
        return 0;
    }
#if SIZEOF_PY_HASH_T == 4
    /* Py_hash_t is already a 32-bit value: nothing to fold. */
    return py_hash;
#else
    /*
     * 64-bit builds: fold the upper 32 bits into the lower 32 bits so
     * that information from the upper half is not lost (see GH 37615).
     * The arithmetic is done on an unsigned value to avoid the undefined
     * behavior of signed ints.
     */
    const khuint64_t bits = (khuint64_t) py_hash;
    return bits ^ (bits >> 32);
#endif
}
204+
205+
186206
// Key-equality test paired with kh_python_hash_func: expands to a call of
// pyobject_cmp(a, b) and yields whatever that function returns.
#define kh_python_hash_equal(a, b) (pyobject_cmp(a, b))
187207

188208

0 commit comments

Comments
 (0)