From e8695f94a1f68c13f3866060047a2bf7c460894b Mon Sep 17 00:00:00 2001 From: Ulrik Sverdrup Date: Thu, 27 Aug 2015 19:02:51 +0200 Subject: [PATCH 1/2] Hasher and Hash updates - str now emits a delimiter of its own length - str and [u8] hash the same - Hasher::delimit customizes how a delimiter is handled Add method `fn delimit(&mut self, len: usize)` to Hasher. This method makes the hasher emit a delimiter for a chunk of length `len`. For example, str and slices both emit a delimiter for their length during hashing. The Hasher impl decides how to implement the delimiter. By default it emits the whole `usize` as data to the hashing stream. SipHash will ignore the first delimiter and hash the others as data. Since it hashes in the total length, hashing all but one of the delimiters is equivalent to hashing all lengths. For the next example, take something like farmhash that is not designed for streaming hashing. It could be implemented like this: - Every call to Hasher::write runs the whole hashing algorithm. The previous hash is xored together with the new result. - Delimiters are ignored, since the length of each chunk to write is already hashed in. What follows is a sketch of how siphash and farmhash could work with this change: When hashing a: &[u8] - SipHash: `write(a); finish();` - Farmhash: `hash = write(a); hash` Both SipHash and Farmhash will hash just the bytes of a string in a single Hasher::write and a single Hasher::finish. 
When hashing (a: &[u8], b: &[u8]): - SipHash: `write(a); write(b.len()); write(b); finish();` - Farmhash: `hash = write(a); hash ^= write(b); hash` --- src/libcore/hash/mod.rs | 12 ++++++++++-- src/libcore/hash/sip.rs | 8 ++++++++ src/libstd/sys/common/wtf8.rs | 19 ++----------------- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/libcore/hash/mod.rs b/src/libcore/hash/mod.rs index 2a4c909d6384c..bb80b81a03ab7 100644 --- a/src/libcore/hash/mod.rs +++ b/src/libcore/hash/mod.rs @@ -118,6 +118,13 @@ pub trait Hasher { #[stable(feature = "rust1", since = "1.0.0")] fn write(&mut self, bytes: &[u8]); + /// Emit a delimiter for data of length `len` + #[inline] + #[unstable(feature = "hash_delimit", since = "1.4.0", issue="0")] + fn delimit(&mut self, len: usize) { + self.write_usize(len); + } + /// Write a single `u8` into this hasher #[inline] #[stable(feature = "hasher_write", since = "1.3.0")] @@ -230,8 +237,9 @@ mod impls { #[stable(feature = "rust1", since = "1.0.0")] impl Hash for str { fn hash(&self, state: &mut H) { + // Delimit with the length, as the `[T]` impl does, so str and [u8] hash the same + state.delimit(self.len()); state.write(self.as_bytes()); - state.write_u8(0xff) } } @@ -272,7 +280,7 @@ mod impls { #[stable(feature = "rust1", since = "1.0.0")] impl Hash for [T] { fn hash(&self, state: &mut H) { - self.len().hash(state); + state.delimit(self.len()); Hash::hash_slice(self, state) } } diff --git a/src/libcore/hash/sip.rs b/src/libcore/hash/sip.rs index 4dcd513a0d2d6..5e6c463ed21fa 100644 --- a/src/libcore/hash/sip.rs +++ b/src/libcore/hash/sip.rs @@ -192,6 +192,14 @@ impl Hasher for SipHasher { self.write(msg) } + #[inline] + fn delimit(&mut self, len: usize) { + // skip the first delimiter + if self.length > 0 { + self.write_usize(len); + } + } + #[inline] fn finish(&self) -> u64 { let mut v0 = self.v0; diff --git a/src/libstd/sys/common/wtf8.rs b/src/libstd/sys/common/wtf8.rs index 9e4a80a411bb1..8183f60c653f8 100644 --- a/src/libstd/sys/common/wtf8.rs +++ 
b/src/libstd/sys/common/wtf8.rs @@ -124,7 +124,7 @@ impl CodePoint { /// /// Similar to `String`, but can additionally contain surrogate code points /// if they’re not in a surrogate pair. -#[derive(Eq, PartialEq, Ord, PartialOrd, Clone)] +#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Hash)] pub struct Wtf8Buf { bytes: Vec } @@ -382,6 +382,7 @@ impl Extend for Wtf8Buf { /// /// Similar to `&str`, but can additionally contain surrogate code points /// if they’re not in a surrogate pair. +#[derive(Hash)] pub struct Wtf8 { bytes: [u8] } @@ -796,22 +797,6 @@ impl Hash for CodePoint { } } -impl Hash for Wtf8Buf { - #[inline] - fn hash(&self, state: &mut H) { - state.write(&self.bytes); - 0xfeu8.hash(state) - } -} - -impl Hash for Wtf8 { - #[inline] - fn hash(&self, state: &mut H) { - state.write(&self.bytes); - 0xfeu8.hash(state) - } -} - impl AsciiExt for Wtf8 { type Owned = Wtf8Buf; From a661c5c9b435999c557f182e0c69c29d6a1d9d8b Mon Sep 17 00:00:00 2001 From: Ulrik Sverdrup Date: Thu, 27 Aug 2015 21:08:15 +0200 Subject: [PATCH 2/2] Fixup hash tests and benchmarks Remove assert_eq! from the benchmarks. Their role is not to check the value (and returning the value is enough to black_box it). 
--- src/libcoretest/hash/mod.rs | 2 +- src/libcoretest/hash/sip.rs | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/libcoretest/hash/mod.rs b/src/libcoretest/hash/mod.rs index 4ea42644ecdfd..8ac43f199b7a0 100644 --- a/src/libcoretest/hash/mod.rs +++ b/src/libcoretest/hash/mod.rs @@ -61,7 +61,7 @@ fn test_writer_hasher() { assert_eq!(hash(&'a'), 97); let s: &str = "a"; - assert_eq!(hash(& s), 97 + 0xFF); + assert_eq!(hash(&s), 1 + 97); // FIXME (#18283) Enable test //let s: Box = box "a"; //assert_eq!(hasher.hash(& s), 97 + 0xFF); diff --git a/src/libcoretest/hash/sip.rs b/src/libcoretest/hash/sip.rs index 9b6cedd25b741..2b9e62a12123a 100644 --- a/src/libcoretest/hash/sip.rs +++ b/src/libcoretest/hash/sip.rs @@ -239,7 +239,7 @@ fn test_hash_no_concat_alias() { fn bench_str_under_8_bytes(b: &mut Bencher) { let s = "foo"; b.iter(|| { - assert_eq!(hash(&s), 16262950014981195938); + hash(&s) }) } @@ -247,7 +247,7 @@ fn bench_str_under_8_bytes(b: &mut Bencher) { fn bench_str_of_8_bytes(b: &mut Bencher) { let s = "foobar78"; b.iter(|| { - assert_eq!(hash(&s), 4898293253460910787); + hash(&s) }) } @@ -255,7 +255,7 @@ fn bench_str_of_8_bytes(b: &mut Bencher) { fn bench_str_over_8_bytes(b: &mut Bencher) { let s = "foobarbaz0"; b.iter(|| { - assert_eq!(hash(&s), 10581415515220175264); + hash(&s) }) } @@ -268,7 +268,7 @@ irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nul pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui \ officia deserunt mollit anim id est laborum."; b.iter(|| { - assert_eq!(hash(&s), 17717065544121360093); + hash(&s) }) }