From 1614173b589620ba33aa91c49ea82513067566fb Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Tue, 15 Sep 2015 15:11:07 +0200 Subject: [PATCH 1/6] Specialize `PartialOrd` for totally ordered primitive types Knowing the result of equality comparison can enable additional optimizations in LLVM. Additionally, this makes it obvious that `partial_cmp` on totally ordered types cannot return `None`. --- src/libcore/cmp.rs | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/src/libcore/cmp.rs b/src/libcore/cmp.rs index aea5feb4be1ff..dc550fc2173af 100644 --- a/src/libcore/cmp.rs +++ b/src/libcore/cmp.rs @@ -463,17 +463,35 @@ mod impls { } } - partial_ord_impl! { char usize u8 u16 u32 u64 isize i8 i16 i32 i64 f32 f64 } + partial_ord_impl! { f32 f64 } macro_rules! ord_impl { ($($t:ty)*) => ($( + #[stable(feature = "rust1", since = "1.0.0")] + impl PartialOrd for $t { + #[inline] + fn partial_cmp(&self, other: &$t) -> Option<Ordering> { + if *self == *other { Some(Equal) } + else if *self < *other { Some(Less) } + else { Some(Greater) } + } + #[inline] + fn lt(&self, other: &$t) -> bool { (*self) < (*other) } + #[inline] + fn le(&self, other: &$t) -> bool { (*self) <= (*other) } + #[inline] + fn ge(&self, other: &$t) -> bool { (*self) >= (*other) } + #[inline] + fn gt(&self, other: &$t) -> bool { (*self) > (*other) } + } + #[stable(feature = "rust1", since = "1.0.0")] impl Ord for $t { #[inline] fn cmp(&self, other: &$t) -> Ordering { - if *self < *other { Less } - else if *self > *other { Greater } - else { Equal } + if *self == *other { Equal } + else if *self < *other { Less } + else { Greater } } } )*) From d04b8b5818819470457fc00f87176b273d756980 Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Tue, 15 Sep 2015 23:20:18 +0200 Subject: [PATCH 2/6] Improve PartialOrd for slices Reusing the same idea as in #26884, we can exploit the fact that the length of slices is known, hence we can use a counted loop instead of iterators, which 
means that we only need a single counter, instead of having to increment and check one pointer for each iterator. Using the generic implementation of the boolean comparison operators (`lt`, `le`, `gt`, `ge`) provides further speedup for simple types. This happens because the loop scans elements checking for equality and dispatches to element comparison or length comparison depending on the result of the prefix comparison. ``` test u8_cmp ... bench: 14,043 ns/iter (+/- 1,732) test u8_lt ... bench: 16,156 ns/iter (+/- 1,864) test u8_partial_cmp ... bench: 16,250 ns/iter (+/- 2,608) test u16_cmp ... bench: 15,764 ns/iter (+/- 1,420) test u16_lt ... bench: 19,833 ns/iter (+/- 2,826) test u16_partial_cmp ... bench: 19,811 ns/iter (+/- 2,240) test u32_cmp ... bench: 15,792 ns/iter (+/- 3,409) test u32_lt ... bench: 18,577 ns/iter (+/- 2,075) test u32_partial_cmp ... bench: 18,603 ns/iter (+/- 5,666) test u64_cmp ... bench: 16,337 ns/iter (+/- 2,511) test u64_lt ... bench: 18,074 ns/iter (+/- 7,914) test u64_partial_cmp ... bench: 17,909 ns/iter (+/- 1,105) ``` ``` test u8_cmp ... bench: 6,511 ns/iter (+/- 982) test u8_lt ... bench: 6,671 ns/iter (+/- 919) test u8_partial_cmp ... bench: 7,118 ns/iter (+/- 1,623) test u16_cmp ... bench: 6,689 ns/iter (+/- 921) test u16_lt ... bench: 6,712 ns/iter (+/- 947) test u16_partial_cmp ... bench: 6,725 ns/iter (+/- 780) test u32_cmp ... bench: 7,704 ns/iter (+/- 1,294) test u32_lt ... bench: 7,611 ns/iter (+/- 3,062) test u32_partial_cmp ... bench: 7,640 ns/iter (+/- 1,149) test u64_cmp ... bench: 7,517 ns/iter (+/- 2,164) test u64_lt ... bench: 7,579 ns/iter (+/- 1,048) test u64_partial_cmp ... 
bench: 7,629 ns/iter (+/- 1,195) ``` --- src/libcore/slice.rs | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/src/libcore/slice.rs b/src/libcore/slice.rs index 8d3d798afef13..7af808d1e8283 100644 --- a/src/libcore/slice.rs +++ b/src/libcore/slice.rs @@ -1558,8 +1558,18 @@ impl<T: Eq> Eq for [T] {} #[stable(feature = "rust1", since = "1.0.0")] impl<T: Ord> Ord for [T] { + #[inline] fn cmp(&self, other: &[T]) -> Ordering { - self.iter().cmp(other.iter()) + let l = cmp::min(self.len(), other.len()); + + for i in 0..l { + match self[i].cmp(&other[i]) { + Ordering::Equal => (), + non_eq => return non_eq, + } + } + + self.len().cmp(&other.len()) } } @@ -1567,22 +1577,15 @@ impl<T: Ord> Ord for [T] { impl<T: PartialOrd> PartialOrd for [T] { #[inline] fn partial_cmp(&self, other: &[T]) -> Option<Ordering> { - self.iter().partial_cmp(other.iter()) - } - #[inline] - fn lt(&self, other: &[T]) -> bool { - self.iter().lt(other.iter()) - } - #[inline] - fn le(&self, other: &[T]) -> bool { - self.iter().le(other.iter()) - } - #[inline] - fn ge(&self, other: &[T]) -> bool { - self.iter().ge(other.iter()) - } - #[inline] - fn gt(&self, other: &[T]) -> bool { - self.iter().gt(other.iter()) + let l = cmp::min(self.len(), other.len()); + + for i in 0..l { + match self[i].partial_cmp(&other[i]) { + Some(Ordering::Equal) => (), + non_eq => return non_eq, + } + } + + self.len().partial_cmp(&other.len()) } } From bf9254a75e06f61ecd837e2f90ed0afc22ffdede Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Wed, 16 Sep 2015 15:25:51 +0200 Subject: [PATCH 3/6] Reuse cmp in totally ordered types Instead of manually defining it, `partial_cmp` can simply wrap the result of `cmp` for totally ordered types. 
--- src/libcore/cmp.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/libcore/cmp.rs b/src/libcore/cmp.rs index dc550fc2173af..3344d7ea5d7fc 100644 --- a/src/libcore/cmp.rs +++ b/src/libcore/cmp.rs @@ -471,9 +471,7 @@ mod impls { impl PartialOrd for $t { #[inline] fn partial_cmp(&self, other: &$t) -> Option<Ordering> { - if *self == *other { Some(Equal) } - else if *self < *other { Some(Less) } - else { Some(Greater) } + Some(self.cmp(other)) } #[inline] fn lt(&self, other: &$t) -> bool { (*self) < (*other) } From 369a9dc302582145e37cf335c454fb6bd74906c6 Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Wed, 16 Sep 2015 15:27:14 +0200 Subject: [PATCH 4/6] Remove boundary checks in slice comparison operators In order to get rid of all range checks, the compiler needs to explicitly see that the slices it iterates over are as long as the loop variable upper bound. This further improves the performance of slice comparison: ``` test u8_cmp ... bench: 4,761 ns/iter (+/- 1,203) test u8_lt ... bench: 4,579 ns/iter (+/- 649) test u8_partial_cmp ... bench: 4,768 ns/iter (+/- 761) test u16_cmp ... bench: 4,607 ns/iter (+/- 580) test u16_lt ... bench: 4,681 ns/iter (+/- 567) test u16_partial_cmp ... bench: 4,607 ns/iter (+/- 967) test u32_cmp ... bench: 4,448 ns/iter (+/- 891) test u32_lt ... bench: 4,546 ns/iter (+/- 992) test u32_partial_cmp ... bench: 4,415 ns/iter (+/- 646) test u64_cmp ... bench: 4,380 ns/iter (+/- 1,184) test u64_lt ... bench: 5,684 ns/iter (+/- 602) test u64_partial_cmp ... 
bench: 4,663 ns/iter (+/- 1,158) ``` --- src/libcore/slice.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/libcore/slice.rs b/src/libcore/slice.rs index 7af808d1e8283..af8db18a8737c 100644 --- a/src/libcore/slice.rs +++ b/src/libcore/slice.rs @@ -1561,9 +1561,11 @@ impl<T: Ord> Ord for [T] { #[inline] fn cmp(&self, other: &[T]) -> Ordering { let l = cmp::min(self.len(), other.len()); + let lhs = &self[..l]; + let rhs = &other[..l]; for i in 0..l { - match self[i].cmp(&other[i]) { + match lhs[i].cmp(&rhs[i]) { Ordering::Equal => (), non_eq => return non_eq, } @@ -1578,9 +1580,11 @@ impl<T: PartialOrd> PartialOrd for [T] { #[inline] fn partial_cmp(&self, other: &[T]) -> Option<Ordering> { let l = cmp::min(self.len(), other.len()); + let lhs = &self[..l]; + let rhs = &other[..l]; for i in 0..l { - match self[i].partial_cmp(&other[i]) { + match lhs[i].partial_cmp(&rhs[i]) { Some(Ordering::Equal) => (), non_eq => return non_eq, } From 08b9edfe94c2680bde224b9dc6dd4a0de6616a07 Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Wed, 16 Sep 2015 16:09:01 +0200 Subject: [PATCH 5/6] Remove inline attribute Be more conservative with inlining. 
--- src/libcore/slice.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/libcore/slice.rs b/src/libcore/slice.rs index af8db18a8737c..dbebea37f6fc8 100644 --- a/src/libcore/slice.rs +++ b/src/libcore/slice.rs @@ -1558,7 +1558,6 @@ impl<T: Eq> Eq for [T] {} #[stable(feature = "rust1", since = "1.0.0")] impl<T: Ord> Ord for [T] { - #[inline] fn cmp(&self, other: &[T]) -> Ordering { let l = cmp::min(self.len(), other.len()); let lhs = &self[..l]; @@ -1577,7 +1576,6 @@ impl<T: Ord> Ord for [T] { #[stable(feature = "rust1", since = "1.0.0")] impl<T: PartialOrd> PartialOrd for [T] { - #[inline] fn partial_cmp(&self, other: &[T]) -> Option<Ordering> { let l = cmp::min(self.len(), other.len()); let lhs = &self[..l]; From 74dc146f4296c209bf688d7cddff51eab48e8496 Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Wed, 16 Sep 2015 16:09:23 +0200 Subject: [PATCH 6/6] Explain explicit slicing in slice cmp and partial_cmp methods The explicit slicing is needed in order to enable additional range check optimizations in the compiler. --- src/libcore/slice.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/libcore/slice.rs b/src/libcore/slice.rs index dbebea37f6fc8..5518bacb019e8 100644 --- a/src/libcore/slice.rs +++ b/src/libcore/slice.rs @@ -1560,6 +1560,9 @@ impl<T: Eq> Eq for [T] {} impl<T: Ord> Ord for [T] { fn cmp(&self, other: &[T]) -> Ordering { let l = cmp::min(self.len(), other.len()); + + // Slice to the loop iteration range to enable bound check + // elimination in the compiler let lhs = &self[..l]; let rhs = &other[..l]; @@ -1578,6 +1581,9 @@ impl<T: Ord> Ord for [T] { impl<T: PartialOrd> PartialOrd for [T] { fn partial_cmp(&self, other: &[T]) -> Option<Ordering> { let l = cmp::min(self.len(), other.len()); + + // Slice to the loop iteration range to enable bound check
+ // elimination in the compiler let lhs = &self[..l]; let rhs = &other[..l];