Rollup of 14 pull requests #77350

Closed
wants to merge 28 commits
Changes from all commits
3e485d7
BTreeMap: keep an eye out on the size of the main components
ssomers Sep 26, 2020
37f7956
library: Forward compiler-builtins "mem" feature
josephlr Sep 28, 2020
d537067
Don't warn if the config file is somewhere other than `config.toml`
jyn514 Sep 28, 2020
db5b70f
move candidate_from_obligation_no_cache
lcnr Sep 28, 2020
63bb51d
Add unstable book docs for `-Zunsound-mir-opts`
wesleywiser Sep 29, 2020
c6107c5
Don't fire `const_item_mutation` lint on writes through a pointer
Aaron1011 Sep 29, 2020
a2526b4
Use `rtassert!` instead of `assert!` from the child process after for…
Sep 29, 2020
9340ee4
Ensure that all LLVM components requested by tests are available on CI
petrochenkov Sep 27, 2020
2c38504
Add test for async/await combined with const-generics.
hameerabbasi Sep 29, 2020
b141e49
Fix typo in alloc vec comment
pickfire Sep 29, 2020
f9b625f
Alloc vec use imported path
pickfire Sep 29, 2020
eb0a88f
SipHasher128: fix platform-independence confusion
tgnottingham Sep 29, 2020
f4d5275
Update books
ehuss Sep 29, 2020
15c3573
Update cargo
ehuss Sep 29, 2020
f8c5e12
Rollup merge of #77233 - ssomers:btree_size_matters, r=Mark-Simulacrum
jonas-schievink Sep 29, 2020
85b1a87
Rollup merge of #77280 - petrochenkov:llvmcomp, r=Mark-Simulacrum
jonas-schievink Sep 29, 2020
4bcd715
Rollup merge of #77284 - josephlr:mem, r=Mark-Simulacrum
jonas-schievink Sep 29, 2020
3b86eaf
Rollup merge of #77298 - jyn514:bootstrap-config, r=Mark-Simulacrum
jonas-schievink Sep 29, 2020
f5df84e
Rollup merge of #77305 - lcnr:candidate_from_obligation, r=davidtwco
jonas-schievink Sep 29, 2020
bf9ae51
Rollup merge of #77319 - tgnottingham:siphasher_endianness, r=nagisa
jonas-schievink Sep 29, 2020
368aa91
Rollup merge of #77322 - rust-lang:wesleywiser-patch-1, r=steveklabnik
jonas-schievink Sep 29, 2020
28552f6
Rollup merge of #77324 - Aaron1011:fix/const-item-mutation-ptr, r=pet…
jonas-schievink Sep 29, 2020
4741196
Rollup merge of #77328 - hyd-dev:assert-to-rtassert, r=Amanieu
jonas-schievink Sep 29, 2020
ad70681
Rollup merge of #77331 - hameerabbasi:issue-74906, r=lcnr
jonas-schievink Sep 29, 2020
3757db4
Rollup merge of #77338 - pickfire:patch-7, r=jyn514
jonas-schievink Sep 29, 2020
cde658f
Rollup merge of #77340 - pickfire:patch-9, r=kennytm
jonas-schievink Sep 29, 2020
cb79984
Rollup merge of #77348 - ehuss:update-books, r=ehuss
jonas-schievink Sep 29, 2020
9a978cd
Rollup merge of #77349 - ehuss:update-cargo, r=ehuss
jonas-schievink Sep 29, 2020
70 changes: 46 additions & 24 deletions compiler/rustc_data_structures/src/sip128.rs
@@ -8,6 +8,13 @@ use std::ptr;
#[cfg(test)]
mod tests;

/// When hashing something that ends up affecting properties like symbol names,
/// we want these symbol names to be calculated independently of other factors
/// like what architecture you're compiling *from*.
///
/// To that end, we always convert integers to little-endian format or handle
/// them in an endian-independent way, and extend the architecture-dependent
/// `isize` and `usize` types to 64 bits if needed before hashing.
#[derive(Debug, Clone)]
pub struct SipHasher128 {
k0: u64,
@@ -125,15 +132,17 @@ impl SipHasher128 {

// A specialized write function for values with size <= 8.
//
// The hashing of multi-byte integers depends on endianness. E.g.:
// - little-endian: `write_u32(0xDDCCBBAA)` == `write([0xAA, 0xBB, 0xCC, 0xDD])`
// - big-endian: `write_u32(0xDDCCBBAA)` == `write([0xDD, 0xCC, 0xBB, 0xAA])`
// The input must be zero-extended to 64-bits by the caller. The extension
// isn't hashed, but the implementation requires it for correctness.
//
// This function, given the same integer type and value, has the same effect
// on both little- and big-endian hardware. It operates on values without
// depending on their sequence in memory, so is independent of endianness.
//
// This function does the right thing for little-endian hardware. On
// big-endian hardware `x` must be byte-swapped first to give the right
// behaviour. After any byte-swapping, the input must be zero-extended to
// 64-bits. The caller is responsible for the byte-swapping and
// zero-extension.
// The equivalent write() call *does* need the value's bytes converted to
// little-endian (without zero-extension) for equivalent behavior on little-
// and big-endian hardware, as write() *does* operate on byte sequences.
// I.e. write_u32(0xDDCCBBAA) == write(&0xDDCCBBAA_u32.to_le_bytes()).
#[inline]
fn short_write<T>(&mut self, _x: T, x: u64) {
let size = mem::size_of::<T>();
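
The zero-extension contract described above is why the signed write_i* methods later in this diff cast through the unsigned type of the same width (e.g. `i as u16 as u64`) rather than casting straight to `u64`. A small standalone sketch (not part of the patch) of the difference between the two casts:

fn main() {
    let i: i16 = -2;
    // A direct cast sign-extends: the upper 48 bits are all ones, which
    // violates short_write's requirement that the value be zero-extended.
    assert_eq!(i as u64, 0xFFFF_FFFF_FFFF_FFFE);
    // Casting through the unsigned type of the same width reinterprets the
    // 16-bit pattern and then zero-extends, which is what the callers below do.
    assert_eq!(i as u16 as u64, 0x0000_0000_0000_FFFE);
}
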
@@ -167,12 +176,9 @@ impl SipHasher128 {
// left-shift it five bytes, giving 0xHHGG_FF00_0000_0000. We then
// bitwise-OR that value into `self.tail`, resulting in
// 0xHHGG_FFEE_DDCC_BBAA. `self.tail` is now full, and we can use it
// to update `self.state`. (As mentioned above, this assumes a
// little-endian machine; on a big-endian machine we would have
// byte-swapped 0xIIHH_GGFF in the caller, giving 0xFFGG_HHII, and we
// would then end up bitwise-ORing 0xGGHH_II00_0000_0000 into
// `self.tail`).
//
// to update `self.state`. The analysis is the same whether we are on
// a little-endian or big-endian machine, as the bitwise operations
// are endian-independent.
self.tail |= x << (8 * self.ntail);
if size < needed {
self.ntail += size;
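
As a concrete instance of the scenario walked through in the comment above (not part of the patch): with five bytes AA..EE already buffered and a zero-extended 4-byte value 0x4433_2211 arriving, the bitwise operations fill the tail and carry the leftover high byte forward. Because this is pure value arithmetic, it behaves identically on little- and big-endian hosts:

fn main() {
    let ntail: u64 = 5; // five bytes (AA BB CC DD EE) already buffered
    let tail: u64 = 0x0000_00EE_DDCC_BBAA;
    let x: u64 = 0x0000_0000_4433_2211; // zero-extended u32 input

    // Only the low three bytes of `x` fit; they are shifted past the five
    // buffered bytes and OR-ed in.
    let filled = tail | (x << (8 * ntail));
    assert_eq!(filled, 0x3322_11EE_DDCC_BBAA);

    // The leftover high byte (0x44) becomes the start of the next tail.
    let leftover = x >> (8 * (8 - ntail));
    assert_eq!(leftover, 0x44);
}
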
@@ -186,8 +192,7 @@

// Continuing scenario 2: we have one byte left over from the input. We
// set `self.ntail` to 1 and `self.tail` to `0x0000_0000_IIHH_GGFF >>
// 8*3`, which is 0x0000_0000_0000_00II. (Or on a big-endian machine
// the prior byte-swapping would leave us with 0x0000_0000_0000_00FF.)
// 8*3`, which is 0x0000_0000_0000_00II.
//
// The `if` is needed to avoid shifting by 64 bits, which Rust
// complains about.
@@ -222,22 +227,30 @@ impl Hasher for SipHasher128 {

#[inline]
fn write_u16(&mut self, i: u16) {
self.short_write(i, i.to_le() as u64);
self.short_write(i, i as u64);
}

#[inline]
fn write_u32(&mut self, i: u32) {
self.short_write(i, i.to_le() as u64);
self.short_write(i, i as u64);
}

#[inline]
fn write_u64(&mut self, i: u64) {
self.short_write(i, i.to_le() as u64);
self.short_write(i, i as u64);
}

#[inline]
fn write_u128(&mut self, i: u128) {
self.write(&i.to_le_bytes());
}

#[inline]
fn write_usize(&mut self, i: usize) {
self.short_write(i, i.to_le() as u64);
// Always treat usize as u64 so we get the same results on 32 and 64 bit
// platforms. This is important for symbol hashes when cross compiling,
// for example.
self.write_u64(i as u64);
}

#[inline]
@@ -247,22 +260,31 @@ impl Hasher for SipHasher128 {

#[inline]
fn write_i16(&mut self, i: i16) {
self.short_write(i, (i as u16).to_le() as u64);
self.short_write(i, i as u16 as u64);
}

#[inline]
fn write_i32(&mut self, i: i32) {
self.short_write(i, (i as u32).to_le() as u64);
self.short_write(i, i as u32 as u64);
}

#[inline]
fn write_i64(&mut self, i: i64) {
self.short_write(i, (i as u64).to_le() as u64);
self.short_write(i, i as u64);
}

#[inline]
fn write_i128(&mut self, i: i128) {
self.write(&i.to_le_bytes());
}

#[inline]
fn write_isize(&mut self, i: isize) {
self.short_write(i, (i as usize).to_le() as u64);
// Always treat isize as i64 so we get the same results on 32 and 64 bit
// platforms. This is important for symbol hashes when cross compiling,
// for example. Sign extending here is preferable as it means that the
// same negative number hashes the same on both 32 and 64 bit platforms.
self.write_i64(i as i64);
}

#[inline]
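
Taken together, these changes give SipHasher128 a documented equivalence: a value-based write_* call hashes the same as a byte-based write of the value's little-endian encoding, on any host. A minimal sketch of that property (assuming rustc_data_structures is available as a dependency, as it is inside the compiler tree; the test file below checks the same thing over a longer write sequence):

use rustc_data_structures::sip128::SipHasher128;
use std::hash::Hasher;

fn main() {
    // Value-based write: endian-independent by construction.
    let mut h1 = SipHasher128::new_with_keys(0, 0);
    h1.write_u32(0xDDCC_BBAA);

    // Byte-based write of the little-endian encoding: documented above to
    // produce the same hash on little- and big-endian hardware.
    let mut h2 = SipHasher128::new_with_keys(0, 0);
    h2.write(&0xDDCC_BBAA_u32.to_le_bytes());

    assert_eq!(h1.finish128(), h2.finish128());
}
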
59 changes: 48 additions & 11 deletions compiler/rustc_data_structures/src/sip128/tests.rs
@@ -1,7 +1,6 @@
use super::*;

use std::hash::{Hash, Hasher};
use std::{mem, slice};

// Hash just the bytes of the slice, without length prefix
struct Bytes<'a>(&'a [u8]);
@@ -399,20 +398,58 @@ fn test_hash_no_concat_alias() {
}

#[test]
fn test_write_short_works() {
let test_usize = 0xd0c0b0a0usize;
fn test_short_write_works() {
let test_u8 = 0xFF_u8;
let test_u16 = 0x1122_u16;
let test_u32 = 0x22334455_u32;
let test_u64 = 0x33445566_778899AA_u64;
let test_u128 = 0x11223344_55667788_99AABBCC_DDEEFF77_u128;
let test_usize = 0xD0C0B0A0_usize;

let test_i8 = -1_i8;
let test_i16 = -2_i16;
let test_i32 = -3_i32;
let test_i64 = -4_i64;
let test_i128 = -5_i128;
let test_isize = -6_isize;

let mut h1 = SipHasher128::new_with_keys(0, 0);
h1.write_usize(test_usize);
h1.write(b"bytes");
h1.write(b"string");
h1.write_u8(0xFFu8);
h1.write_u8(0x01u8);
h1.write_u8(test_u8);
h1.write_u16(test_u16);
h1.write_u32(test_u32);
h1.write_u64(test_u64);
h1.write_u128(test_u128);
h1.write_usize(test_usize);
h1.write_i8(test_i8);
h1.write_i16(test_i16);
h1.write_i32(test_i32);
h1.write_i64(test_i64);
h1.write_i128(test_i128);
h1.write_isize(test_isize);

let mut h2 = SipHasher128::new_with_keys(0, 0);
h2.write(unsafe {
slice::from_raw_parts(&test_usize as *const _ as *const u8, mem::size_of::<usize>())
});
h2.write(b"bytes");
h2.write(b"string");
h2.write(&[0xFFu8, 0x01u8]);
assert_eq!(h1.finish128(), h2.finish128());
h2.write(&test_u8.to_le_bytes());
h2.write(&test_u16.to_le_bytes());
h2.write(&test_u32.to_le_bytes());
h2.write(&test_u64.to_le_bytes());
h2.write(&test_u128.to_le_bytes());
h2.write(&(test_usize as u64).to_le_bytes());
h2.write(&test_i8.to_le_bytes());
h2.write(&test_i16.to_le_bytes());
h2.write(&test_i32.to_le_bytes());
h2.write(&test_i64.to_le_bytes());
h2.write(&test_i128.to_le_bytes());
h2.write(&(test_isize as i64).to_le_bytes());

let h1_hash = h1.finish128();
let h2_hash = h2.finish128();

let expected = (5926600258011434223, 10938367019217336666);

assert_eq!(h1_hash, expected);
assert_eq!(h2_hash, expected);
}
32 changes: 16 additions & 16 deletions compiler/rustc_data_structures/src/stable_hasher.rs
@@ -5,6 +5,9 @@ use smallvec::SmallVec;
use std::hash::{BuildHasher, Hash, Hasher};
use std::mem;

#[cfg(test)]
mod tests;

/// When hashing something that ends up affecting properties like symbol names,
/// we want these symbol names to be calculated independently of other factors
/// like what architecture you're compiling *from*.
@@ -57,6 +60,9 @@ impl StableHasher {
}
}

// SipHasher128 currently handles ensuring platform-independent results with
// respect to endianness and `isize` and `usize` differences (to the extent
// possible). The write functions below don't need to handle this at this time.
impl Hasher for StableHasher {
fn finish(&self) -> u64 {
panic!("use StableHasher::finalize instead");
@@ -74,30 +80,27 @@

#[inline]
fn write_u16(&mut self, i: u16) {
self.state.write_u16(i.to_le());
self.state.write_u16(i);
}

#[inline]
fn write_u32(&mut self, i: u32) {
self.state.write_u32(i.to_le());
self.state.write_u32(i);
}

#[inline]
fn write_u64(&mut self, i: u64) {
self.state.write_u64(i.to_le());
self.state.write_u64(i);
}

#[inline]
fn write_u128(&mut self, i: u128) {
self.state.write_u128(i.to_le());
self.state.write_u128(i);
}

#[inline]
fn write_usize(&mut self, i: usize) {
// Always treat usize as u64 so we get the same results on 32 and 64 bit
// platforms. This is important for symbol hashes when cross compiling,
// for example.
self.state.write_u64((i as u64).to_le());
self.state.write_usize(i);
}

#[inline]
@@ -107,30 +110,27 @@

#[inline]
fn write_i16(&mut self, i: i16) {
self.state.write_i16(i.to_le());
self.state.write_i16(i);
}

#[inline]
fn write_i32(&mut self, i: i32) {
self.state.write_i32(i.to_le());
self.state.write_i32(i);
}

#[inline]
fn write_i64(&mut self, i: i64) {
self.state.write_i64(i.to_le());
self.state.write_i64(i);
}

#[inline]
fn write_i128(&mut self, i: i128) {
self.state.write_i128(i.to_le());
self.state.write_i128(i);
}

#[inline]
fn write_isize(&mut self, i: isize) {
// Always treat isize as i64 so we get the same results on 32 and 64 bit
// platforms. This is important for symbol hashes when cross compiling,
// for example.
self.state.write_i64((i as i64).to_le());
self.state.write_isize(i);
}
}

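
The StableHasher change above is purely about layering: exactly one layer (the inner SipHasher128) now owns the portability adjustments, and StableHasher forwards values unchanged instead of applying its own `.to_le()` conversions on top. A stand-in sketch of that convention (the type names here are illustrative, not rustc's):

use std::hash::Hasher;

// Stand-in for SipHasher128: the single layer that owns the portability
// adjustments (value-based writes, usize widened to 64 bits).
struct Inner<H: Hasher>(H);

impl<H: Hasher> Hasher for Inner<H> {
    fn finish(&self) -> u64 {
        self.0.finish()
    }
    fn write(&mut self, bytes: &[u8]) {
        self.0.write(bytes)
    }
    fn write_usize(&mut self, i: usize) {
        // Widen here, and only here.
        self.0.write_u64(i as u64)
    }
}

// Stand-in for StableHasher: a thin wrapper that forwards unchanged rather
// than converting a second time.
struct Outer<H: Hasher>(Inner<H>);

impl<H: Hasher> Hasher for Outer<H> {
    fn finish(&self) -> u64 {
        self.0.finish()
    }
    fn write(&mut self, bytes: &[u8]) {
        self.0.write(bytes)
    }
    fn write_usize(&mut self, i: usize) {
        self.0.write_usize(i) // forward; no second conversion
    }
}
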
73 changes: 73 additions & 0 deletions compiler/rustc_data_structures/src/stable_hasher/tests.rs
@@ -0,0 +1,73 @@
use super::*;

// The tests below compare the computed hashes to particular expected values
// in order to test that we produce the same results on different platforms,
// regardless of endianness and `usize` and `isize` size differences (this
// of course assumes we run these tests on platforms that differ in those
// ways). The expected values depend on the hashing algorithm used, so they
// need to be updated whenever StableHasher changes its hashing algorithm.

#[test]
fn test_hash_integers() {
// Test that integers are handled consistently across platforms.
let test_u8 = 0xAB_u8;
let test_u16 = 0xFFEE_u16;
let test_u32 = 0x445577AA_u32;
let test_u64 = 0x01234567_13243546_u64;
let test_u128 = 0x22114433_66557788_99AACCBB_EEDDFF77_u128;
let test_usize = 0xD0C0B0A0_usize;

let test_i8 = -100_i8;
let test_i16 = -200_i16;
let test_i32 = -300_i32;
let test_i64 = -400_i64;
let test_i128 = -500_i128;
let test_isize = -600_isize;

let mut h = StableHasher::new();
test_u8.hash(&mut h);
test_u16.hash(&mut h);
test_u32.hash(&mut h);
test_u64.hash(&mut h);
test_u128.hash(&mut h);
test_usize.hash(&mut h);
test_i8.hash(&mut h);
test_i16.hash(&mut h);
test_i32.hash(&mut h);
test_i64.hash(&mut h);
test_i128.hash(&mut h);
test_isize.hash(&mut h);

// This depends on the hashing algorithm. See note at top of file.
let expected = (2736651863462566372, 8121090595289675650);

assert_eq!(h.finalize(), expected);
}

#[test]
fn test_hash_usize() {
// Test that usize specifically is handled consistently across platforms.
let test_usize = 0xABCDEF01_usize;

let mut h = StableHasher::new();
test_usize.hash(&mut h);

// This depends on the hashing algorithm. See note at top of file.
let expected = (5798740672699530587, 11186240177685111648);

assert_eq!(h.finalize(), expected);
}

#[test]
fn test_hash_isize() {
// Test that isize specifically is handled consistently across platforms.
let test_isize = -7_isize;

let mut h = StableHasher::new();
test_isize.hash(&mut h);

// This depends on the hashing algorithm. See note at top of file.
let expected = (14721296605626097289, 11385941877786388409);

assert_eq!(h.finalize(), expected);
}
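
The tests above call `Hash::hash` on plain integers rather than the write_* methods directly; that works because the standard integer `Hash` impls forward to the matching `Hasher::write_*` method, so overriding those methods is all StableHasher needs in order to control how integers are hashed. A small stand-in sketch of that call chain (the recording hasher here is purely illustrative):

use std::hash::{Hash, Hasher};

// Records which write_* methods are invoked, to show where `u32::hash` and
// `usize::hash` bottom out.
#[derive(Default)]
struct Recorder {
    calls: Vec<&'static str>,
}

impl Hasher for Recorder {
    fn finish(&self) -> u64 {
        0
    }
    fn write(&mut self, _bytes: &[u8]) {
        self.calls.push("write");
    }
    fn write_u32(&mut self, _i: u32) {
        self.calls.push("write_u32");
    }
    fn write_usize(&mut self, _i: usize) {
        self.calls.push("write_usize");
    }
}

fn main() {
    let mut r = Recorder::default();
    0x445577AA_u32.hash(&mut r);
    0xD0C0B0A0_usize.hash(&mut r);
    // Integer Hash impls forward to the matching write_* method, which is
    // where StableHasher (and SipHasher128 beneath it) make the result
    // platform-independent.
    assert_eq!(r.calls, vec!["write_u32", "write_usize"]);
}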