From 37a614ec19a0429212da5239f229a94d19778d27 Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Wed, 30 Dec 2015 21:45:24 +0100 Subject: [PATCH 01/10] Unify computation of length in `EscapeUnicode` The `offset` value was computed both in `next` and in `size_hint`; computing it in a single place ensures consistency and makes it easier to apply improvements. The value is now computed as soon as the iterator is constructed. This means that the time to compute it is spent immediately and cannot be avoided, but it also guarantees that it is only spent once. --- src/libcore/char.rs | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/src/libcore/char.rs b/src/libcore/char.rs index 0c3807d8ca0b5..df6044fa83928 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -299,7 +299,15 @@ impl CharExt for char { #[inline] fn escape_unicode(self) -> EscapeUnicode { - EscapeUnicode { c: self, state: EscapeUnicodeState::Backslash } + let mut n = 0; + while (self as u32) >> (4 * (n + 1)) != 0 { + n += 1; + } + EscapeUnicode { + c: self, + state: EscapeUnicodeState::Backslash, + offset: n, + } } #[inline] @@ -420,7 +428,8 @@ pub fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option { #[stable(feature = "rust1", since = "1.0.0")] pub struct EscapeUnicode { c: char, - state: EscapeUnicodeState + state: EscapeUnicodeState, + offset: usize, } #[derive(Clone)] @@ -428,7 +437,7 @@ enum EscapeUnicodeState { Backslash, Type, LeftBrace, - Value(usize), + Value, RightBrace, Done, } @@ -448,19 +457,15 @@ impl Iterator for EscapeUnicode { Some('u') } EscapeUnicodeState::LeftBrace => { - let mut n = 0; - while (self.c as u32) >> (4 * (n + 1)) != 0 { - n += 1; - } - self.state = EscapeUnicodeState::Value(n); + self.state = EscapeUnicodeState::Value; Some('{') } - EscapeUnicodeState::Value(offset) => { - let c = from_digit(((self.c as u32) >> (offset * 4)) & 0xf, 16).unwrap(); - if offset == 0 { + EscapeUnicodeState::Value => { + let c = from_digit(((self.c as u32) >> (self.offset * 4)) & 0xf, 16).unwrap(); + if self.offset == 0 { self.state = EscapeUnicodeState::RightBrace; } else { - self.state = EscapeUnicodeState::Value(offset - 1); + self.offset -= 1; } Some(c) } @@ -473,18 +478,15 @@ impl Iterator for EscapeUnicode { } fn size_hint(&self) -> (usize, Option) { - let mut n = 0; - while (self.c as usize) >> (4 * (n + 1)) != 0 { - n += 1; - } let n = match self.state { - EscapeUnicodeState::Backslash => n + 5, - EscapeUnicodeState::Type => n + 4, - EscapeUnicodeState::LeftBrace => n + 3, - EscapeUnicodeState::Value(offset) => offset + 2, + EscapeUnicodeState::Backslash => 5, + EscapeUnicodeState::Type => 4, + EscapeUnicodeState::LeftBrace => 3, + EscapeUnicodeState::Value => 2, EscapeUnicodeState::RightBrace => 1, EscapeUnicodeState::Done => 0, }; + let n = n + self.offset; (n, Some(n)) } } From 1027e6a5714748802841234242c7c842122eb853 Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Wed, 30 Dec 2015 22:03:35 +0100 Subject: [PATCH 02/10] Improve computation of `EscapeUnicode` offset field Instead of iteratively scanning the bits, use `leading_zeros`. --- src/libcore/char.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/libcore/char.rs b/src/libcore/char.rs index df6044fa83928..1df2d0d3bc8cc 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -299,14 +299,16 @@ impl CharExt for char { #[inline] fn escape_unicode(self) -> EscapeUnicode { - let mut n = 0; - while (self as u32) >> (4 * (n + 1)) != 0 { - n += 1; - } + let c = self as u32; + // or-ing 1 ensures that for c==0 the code computes that one + // digit should be printed and (which is the same) avoids the + // (31 - 32) underflow + let msb = 31 - (c | 1).leading_zeros(); + let msdigit = msb / 4; EscapeUnicode { c: self, state: EscapeUnicodeState::Backslash, - offset: n, + offset: msdigit as usize, } } From 7f5eae753349513653cdc397d0de7426e975a7da Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Wed, 30 Dec 2015 10:08:28 +0100 Subject: [PATCH 03/10] `EscapeUnicode` and `EscapeDefault` are `ExactSizeIterator`s In #28662, `size_hint` was made exact for `EscapeUnicode` and `EscapeDefault`, but neither was marked as `ExactSizeIterator`. --- src/libcore/char.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/libcore/char.rs b/src/libcore/char.rs index 1df2d0d3bc8cc..5cb2c4be0011d 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -15,7 +15,7 @@ #![allow(non_snake_case)] #![stable(feature = "core_char", since = "1.2.0")] -use iter::Iterator; +use iter::{Iterator, ExactSizeIterator}; use mem::transmute; use option::Option::{None, Some}; use option::Option; @@ -493,6 +493,9 @@ impl Iterator for EscapeUnicode { } } +#[stable(feature = "rust1", since = "1.7.0")] +impl ExactSizeIterator for EscapeUnicode { } + /// An iterator that yields the literal escape code of a `char`. /// /// This `struct` is created by the [`escape_default()`] method on [`char`]. See @@ -587,3 +590,6 @@ impl Iterator for EscapeDefault { } } } + +#[stable(feature = "rust1", since = "1.7.0")] +impl ExactSizeIterator for EscapeDefault { } From 009e24373d5a9922aa2f83537bc2d542437431b4 Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Wed, 30 Dec 2015 16:42:52 +0100 Subject: [PATCH 04/10] Implement `count` for `EscapeDefault` and `EscapeUnicode` Trivial implementation, as both are `ExactSizeIterator`s. Part of #24214. --- src/libcore/char.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/libcore/char.rs b/src/libcore/char.rs index 5cb2c4be0011d..7ac4a4104c5dd 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -491,6 +491,11 @@ impl Iterator for EscapeUnicode { let n = n + self.offset; (n, Some(n)) } + + #[inline] + fn count(self) -> usize { + self.len() + } } #[stable(feature = "rust1", since = "1.7.0")] @@ -545,13 +550,9 @@ impl Iterator for EscapeDefault { } } + #[inline] fn count(self) -> usize { - match self.state { - EscapeDefaultState::Char(_) => 1, - EscapeDefaultState::Unicode(iter) => iter.count(), - EscapeDefaultState::Done => 0, - EscapeDefaultState::Backslash(_) => 2, - } + self.len() } fn nth(&mut self, n: usize) -> Option { From ad7f68d7999a523de8ecb882d91564fafc77a3f6 Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Mon, 18 Jan 2016 17:46:53 +0100 Subject: [PATCH 05/10] Implement `last` for `EscapeUnicode` Part of #24214. --- src/libcore/char.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/libcore/char.rs b/src/libcore/char.rs index 7ac4a4104c5dd..50cfa20e241ee 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -496,6 +496,18 @@ impl Iterator for EscapeUnicode { fn count(self) -> usize { self.len() } + + fn last(self) -> Option { + match self.state { + EscapeUnicodeState::Done => None, + + EscapeUnicodeState::RightBrace | + EscapeUnicodeState::Value | + EscapeUnicodeState::LeftBrace | + EscapeUnicodeState::Type | + EscapeUnicodeState::Backslash => Some('}'), + } + } } #[stable(feature = "rust1", since = "1.7.0")] From 113b366b8de87f13a437483cb8eb00bea2f76a02 Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Mon, 18 Jan 2016 17:36:12 +0100 Subject: [PATCH 06/10] Move length computation to `ExactSizeIterator` impls and reuse it in `size_hint`. --- src/libcore/char.rs | 60 +++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/src/libcore/char.rs b/src/libcore/char.rs index 50cfa20e241ee..8b112cdb2759e 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -436,12 +436,12 @@ pub struct EscapeUnicode { #[derive(Clone)] enum EscapeUnicodeState { - Backslash, - Type, - LeftBrace, - Value, - RightBrace, Done, + RightBrace, + Value, + LeftBrace, + Type, + Backslash, } #[stable(feature = "rust1", since = "1.0.0")] @@ -479,16 +479,9 @@ impl Iterator for EscapeUnicode { } } + #[inline] fn size_hint(&self) -> (usize, Option) { - let n = match self.state { - EscapeUnicodeState::Backslash => 5, - EscapeUnicodeState::Type => 4, - EscapeUnicodeState::LeftBrace => 3, - EscapeUnicodeState::Value => 2, - EscapeUnicodeState::RightBrace => 1, - EscapeUnicodeState::Done => 0, - }; - let n = n + self.offset; + let n = self.len(); (n, Some(n)) } @@ -511,7 +504,20 @@ impl Iterator for EscapeUnicode { } #[stable(feature = "rust1", since = "1.7.0")] -impl ExactSizeIterator for EscapeUnicode { } +impl ExactSizeIterator for EscapeUnicode { + #[inline] + fn len(&self) -> usize { + // The match is a single memory access with no branching + self.offset + self.state { + EscapeUnicodeState::Done => 0, + EscapeUnicodeState::RightBrace => 1, + EscapeUnicodeState::Value => 2, + EscapeUnicodeState::LeftBrace => 3, + EscapeUnicodeState::Type => 4, + EscapeUnicodeState::Backslash => 5, + } + } +} /// An iterator that yields the literal escape code of a `char`. /// @@ -528,9 +534,9 @@ pub struct EscapeDefault { #[derive(Clone)] enum EscapeDefaultState { - Backslash(char), - Char(char), Done, + Char(char), + Backslash(char), Unicode(EscapeUnicode), } @@ -553,13 +559,10 @@ impl Iterator for EscapeDefault { } } + #[inline] fn size_hint(&self) -> (usize, Option) { - match self.state { - EscapeDefaultState::Char(_) => (1, Some(1)), - EscapeDefaultState::Backslash(_) => (2, Some(2)), - EscapeDefaultState::Unicode(ref iter) => iter.size_hint(), - EscapeDefaultState::Done => (0, Some(0)), - } + let n = self.len(); + (n, Some(n)) } #[inline] @@ -605,4 +608,13 @@ impl Iterator for EscapeDefault { } #[stable(feature = "rust1", since = "1.7.0")] -impl ExactSizeIterator for EscapeDefault { } +impl ExactSizeIterator for EscapeDefault { + fn len(&self) -> usize { + match self.state { + EscapeDefaultState::Done => 0, + EscapeDefaultState::Char(_) => 1, + EscapeDefaultState::Backslash(_) => 2, + EscapeDefaultState::Unicode(ref iter) => iter.len(), + } + } +} From f6c8757e8fc3b3871ccada65b7f4a81d85ddcb39 Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Tue, 19 Jan 2016 11:21:48 +0100 Subject: [PATCH 07/10] Extract stepping from `EscapeUnicode::next` Extract a function that updates the iterator state and returns the result of an arbitrary step of iteration. This implements the same logic as `next`, but it can be shared with `nth`. --- src/libcore/char.rs | 78 ++++++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 29 deletions(-) diff --git a/src/libcore/char.rs b/src/libcore/char.rs index 8b112cdb2759e..7e3298980255b 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -449,34 +449,10 @@ impl Iterator for EscapeUnicode { type Item = char; fn next(&mut self) -> Option { - match self.state { - EscapeUnicodeState::Backslash => { - self.state = EscapeUnicodeState::Type; - Some('\\') - } - EscapeUnicodeState::Type => { - self.state = EscapeUnicodeState::LeftBrace; - Some('u') - } - EscapeUnicodeState::LeftBrace => { - self.state = EscapeUnicodeState::Value; - Some('{') - } - EscapeUnicodeState::Value => { - let c = from_digit(((self.c as u32) >> (self.offset * 4)) & 0xf, 16).unwrap(); - if self.offset == 0 { - self.state = EscapeUnicodeState::RightBrace; - } else { - self.offset -= 1; - } - Some(c) - } - EscapeUnicodeState::RightBrace => { - self.state = EscapeUnicodeState::Done; - Some('}') - } - EscapeUnicodeState::Done => None, - } + let state = self.state_len(); + let offset = self.offset; + + self.step(state, offset) } #[inline] @@ -507,8 +483,15 @@ impl Iterator for EscapeUnicode { impl ExactSizeIterator for EscapeUnicode { #[inline] fn len(&self) -> usize { + self.offset + self.state_len() + } +} + +impl EscapeUnicode { + #[inline] + fn state_len(&self) -> usize { // The match is a single memory access with no branching - self.offset + self.state { + match self.state { EscapeUnicodeState::Done => 0, EscapeUnicodeState::RightBrace => 1, EscapeUnicodeState::Value => 2, @@ -517,6 +500,43 @@ impl ExactSizeIterator for EscapeUnicode { EscapeUnicodeState::Backslash => 5, } } + + #[inline] + fn step(&mut self, state: usize, offset: usize) -> Option { + self.offset = offset; + + match state { + 5 => { + self.state = EscapeUnicodeState::Type; + Some('\\') + } + 4 => { + self.state = EscapeUnicodeState::LeftBrace; + Some('u') + } + 3 => { + self.state = EscapeUnicodeState::LeftBrace; + Some('{') + } + 2 => { + self.state = if offset == 0 { + EscapeUnicodeState::RightBrace + } else { + self.offset -= 1; + EscapeUnicodeState::Value + }; + from_digit(((self.c as u32) >> (offset * 4)) & 0xf, 16) + } + 1 => { + self.state = EscapeUnicodeState::Done; + Some('}') + } + _ => { + self.state = EscapeUnicodeState::Done; + None + } + } + } } /// An iterator that yields the literal escape code of a `char`. From 23c5da0570eca1d624d53b83dfbf55dff1e670fb Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Tue, 19 Jan 2016 11:22:00 +0100 Subject: [PATCH 08/10] Implement `EscapeUnicode::nth` as a step from the appropriate state. Part of #24214. --- src/libcore/char.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/libcore/char.rs b/src/libcore/char.rs index 7e3298980255b..35e29f93639fb 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -466,6 +466,23 @@ impl Iterator for EscapeUnicode { self.len() } + fn nth(&mut self, n: usize) -> Option { + let remaining = self.len().saturating_sub(n); + + // offset = (number of hex digits still to be emitted) - 1 + // It can be computed from the remaining number of items by keeping + // into account that: + // - offset can never increase + // - the last 2 items (last hex digit, '}') are not counted in offset + let offset = ::cmp::min(self.offset, remaining.saturating_sub(2)); + + // state = number of items to be emitted for the state (as per state_len()) + // It can be computed because (remaining number of items) = state + offset + let state = remaining - offset; + + self.step(state, offset) + } + fn last(self) -> Option { match self.state { EscapeUnicodeState::Done => None, From 90ddfc7dc0f18be6abd83a31df264813f342cda1 Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Wed, 20 Jan 2016 14:17:34 +0100 Subject: [PATCH 09/10] Use the same structure for non-Unicode variants of `EscapeDefaultState` This makes it easier to have a unique path for handling all of them. --- src/libcore/char.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/libcore/char.rs b/src/libcore/char.rs index 35e29f93639fb..cd1924e0911c0 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -571,7 +571,7 @@ pub struct EscapeDefault { #[derive(Clone)] enum EscapeDefaultState { - Done, + Done(char), Char(char), Backslash(char), Unicode(EscapeUnicode), @@ -588,10 +588,10 @@ impl Iterator for EscapeDefault { Some('\\') } EscapeDefaultState::Char(c) => { - self.state = EscapeDefaultState::Done; + self.state = EscapeDefaultState::Done(c); Some(c) } - EscapeDefaultState::Done => None, + EscapeDefaultState::Done(_) => None, EscapeDefaultState::Unicode(ref mut iter) => iter.next(), } } @@ -614,15 +614,15 @@ impl Iterator for EscapeDefault { Some('\\') }, EscapeDefaultState::Backslash(c) if n == 1 => { - self.state = EscapeDefaultState::Done; + self.state = EscapeDefaultState::Done(c); Some(c) }, - EscapeDefaultState::Backslash(_) => { - self.state = EscapeDefaultState::Done; + EscapeDefaultState::Backslash(c) => { + self.state = EscapeDefaultState::Done(c); None }, EscapeDefaultState::Char(c) => { - self.state = EscapeDefaultState::Done; + self.state = EscapeDefaultState::Done(c); if n == 0 { Some(c) @@ -630,7 +630,7 @@ impl Iterator for EscapeDefault { None } }, - EscapeDefaultState::Done => return None, + EscapeDefaultState::Done(_) => return None, EscapeDefaultState::Unicode(ref mut i) => return i.nth(n), } } @@ -638,7 +638,7 @@ impl Iterator for EscapeDefault { fn last(self) -> Option { match self.state { EscapeDefaultState::Unicode(iter) => iter.last(), - EscapeDefaultState::Done => None, + EscapeDefaultState::Done(_) => None, EscapeDefaultState::Backslash(c) | EscapeDefaultState::Char(c) => Some(c), } } @@ -648,7 +648,7 @@ impl Iterator for EscapeDefault { impl ExactSizeIterator for EscapeDefault { fn len(&self) -> usize { match self.state { - EscapeDefaultState::Done => 0, + EscapeDefaultState::Done(_) => 0, EscapeDefaultState::Char(_) => 1, EscapeDefaultState::Backslash(_) => 2, EscapeDefaultState::Unicode(ref iter) => iter.len(), From 266da8ecc71977562d39e42f7951e8d056a4fadf Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Wed, 20 Jan 2016 14:19:32 +0100 Subject: [PATCH 10/10] Unify `EscapeDefault::next` and `EscapeDefault::nth` by extracting a shared `step` function. --- src/libcore/char.rs | 68 +++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 36 deletions(-) diff --git a/src/libcore/char.rs b/src/libcore/char.rs index cd1924e0911c0..7855d1e379418 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -582,17 +582,10 @@ impl Iterator for EscapeDefault { type Item = char; fn next(&mut self) -> Option { - match self.state { - EscapeDefaultState::Backslash(c) => { - self.state = EscapeDefaultState::Char(c); - Some('\\') - } - EscapeDefaultState::Char(c) => { - self.state = EscapeDefaultState::Done(c); - Some(c) - } - EscapeDefaultState::Done(_) => None, - EscapeDefaultState::Unicode(ref mut iter) => iter.next(), + if let EscapeDefaultState::Unicode(ref mut iter) = self.state { + iter.next() + } else { + self.step(0) } } @@ -608,31 +601,7 @@ impl Iterator for EscapeDefault { } fn nth(&mut self, n: usize) -> Option { - match self.state { - EscapeDefaultState::Backslash(c) if n == 0 => { - self.state = EscapeDefaultState::Char(c); - Some('\\') - }, - EscapeDefaultState::Backslash(c) if n == 1 => { - self.state = EscapeDefaultState::Done(c); - Some(c) - }, - EscapeDefaultState::Backslash(c) => { - self.state = EscapeDefaultState::Done(c); - None - }, - EscapeDefaultState::Char(c) => { - self.state = EscapeDefaultState::Done(c); - - if n == 0 { - Some(c) - } else { - None - } - }, - EscapeDefaultState::Done(_) => return None, - EscapeDefaultState::Unicode(ref mut i) => return i.nth(n), - } + self.step(n) } fn last(self) -> Option { @@ -655,3 +624,30 @@ impl ExactSizeIterator for EscapeDefault { } } } + +impl EscapeDefault { + #[inline] + fn step(&mut self, n: usize) -> Option { + let (remaining, c) = match self.state { + EscapeDefaultState::Done(c) => (0usize, c), + EscapeDefaultState::Char(c) => (1, c), + EscapeDefaultState::Backslash(c) => (2, c), + EscapeDefaultState::Unicode(ref mut iter) => return iter.nth(n), + }; + + match remaining.saturating_sub(n) { + 2 => { + self.state = EscapeDefaultState::Char(c); + Some('\\') + } + 1 => { + self.state = EscapeDefaultState::Done(c); + Some(c) + } + _ => { + self.state = EscapeDefaultState::Done(c); + None + } + } + } +}