From a27699393bfeb34b582d6f79c3d99cf5a9773f1d Mon Sep 17 00:00:00 2001 From: Akzhan Abdulin Date: Sat, 11 Nov 2017 21:07:46 +0300 Subject: [PATCH 1/6] Introduces real Number normalization for Crystal::Hasher. As declared by Crystal language reference, 1i32.hash should equal to 1f64.hash. Extracted from #4675, also replaces #4581. --- spec/std/crystal/hasher_spec.cr | 24 +++++- src/crystal/hasher.cr | 145 +++++++++++++++++++++++++++++++- 2 files changed, 164 insertions(+), 5 deletions(-) diff --git a/spec/std/crystal/hasher_spec.cr b/spec/std/crystal/hasher_spec.cr index 123909b69dcd..ebb67ef94968 100644 --- a/spec/std/crystal/hasher_spec.cr +++ b/spec/std/crystal/hasher_spec.cr @@ -1,5 +1,6 @@ require "spec" require "bit_array" +require "big" require "random/secure" struct Crystal::Hasher @@ -51,6 +52,15 @@ describe "Crystal::Hasher" do 2.hash.should eq(2_u64.hash) end + it "Big i64 numbers should be hashed ok" do + Int64::MAX.hash.should eq (Int64::MAX.hash) + end + + pending "128bit types should be hashed ok" do + 1.to_i128.hash.should eq (1_i8.hash) + 1.to_u128.hash.should eq (1_u8.hash) + end + it "#float should change state and differ" do hasher = TestHasher.for_test hasher1 = 1.0.hash(hasher) @@ -191,8 +201,8 @@ describe "Crystal::Hasher" do hasher = TestHasher.for_test hasher1 = 1.0.hash(hasher) hasher2 = 2.0.hash(hasher) - hasher1.result.should eq(0xecfbe7798e8f67f2_u64) - hasher2.result.should eq(0x72847386c9572c30_u64) + hasher1.result.should eq(10728791798497425537_u64) + hasher2.result.should eq(12628815283865879015_u64) end it "#string should match test vectors" do @@ -229,4 +239,14 @@ describe "Crystal::Hasher" do hasher.inspect.should_not contain(hasher.@b.to_s(16)) end end + + describe "normalization of numbers" do + it "should 1_i32 and 1_f64 hashes equal" do + 1_i32.hash.should eq(1_f64.hash) + end + + it "should 1_f32 and 1.to_big_f hashes equal" do + 1_f32.hash.should eq(1.to_big_f.hash) + end + end end diff --git a/src/crystal/hasher.cr 
b/src/crystal/hasher.cr index 91b6feb6e4e4..f404c31cb136 100644 --- a/src/crystal/hasher.cr +++ b/src/crystal/hasher.cr @@ -35,6 +35,52 @@ struct Crystal::Hasher # Do not output calculated hash value to user's console/form/ # html/api response, etc. Use some from digest package instead. + # Based on https://github.com/python/cpython/blob/f051e43/Python/pyhash.c#L34 + # + # For numeric types, the hash of a number x is based on the reduction + # of x modulo the Mersenne prime P = 2**HASH_BITS - 1. It's designed + # so that hash(x) == hash(y) whenever x and y are numerically equal, + # even if x and y have different types. + # A quick summary of the hashing strategy: + # (1) First define the 'reduction of x modulo P' for any rational + # number x; this is a standard extension of the usual notion of + # reduction modulo P for integers. If x == p/q (written in lowest + # terms), the reduction is interpreted as the reduction of p times + # the inverse of the reduction of q, all modulo P; if q is exactly + # divisible by P then define the reduction to be infinity. So we've + # got a well-defined map + # reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }. + # (2) Now for a rational number x, define hash(x) by: + # reduce(x) if x >= 0 + # -reduce(-x) if x < 0 + # If the result of the reduction is infinity (this is impossible for + # integers, floats and Decimals) then use the predefined hash value + # HASH_INF_PLUS for x >= 0, or HASH_INF_MINUS for x < 0, instead. + # HASH_INF_PLUS, HASH_INF_MINUS and HASH_NAN are also used for the + # hashes of float and Decimal infinities and nans. + # A selling point for the above strategy is that it makes it possible + # to compute hashes of decimal and binary floating-point numbers + # efficiently, even if the exponent of the binary or decimal number + # is large. The key point is that + # reduce(x * y) == reduce(x) * reduce(y) (modulo HASH_MODULUS) + # provided that {reduce(x), reduce(y)} != {0, infinity}. 
The reduction of a + # binary or decimal float is never infinity, since the denominator is a power + # of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have, + # for nonnegative x, + # reduce(x * 2**e) == reduce(x) * reduce(2**e) % HASH_MODULUS + # reduce(x * 10**e) == reduce(x) * reduce(10**e) % HASH_MODULUS + # and reduce(10**e) can be computed efficiently by the usual modular + # exponentiation algorithm. For reduce(2**e) it's even better: since + # P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication + # by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits. + + private HASH_BITS = 61 + private HASH_MODULUS = (1_i64 << HASH_BITS) - 1 + + private HASH_NAN = 0_u64 + private HASH_INF_PLUS = 314159_u64 + private HASH_INF_MINUS = (-314159_i64).unsafe_as(UInt64) + @@seed = uninitialized UInt64[2] Random::Secure.random_bytes(Slice.new(pointerof(@@seed).as(UInt8*), sizeof(typeof(@@seed)))) @@ -75,12 +121,105 @@ struct Crystal::Hasher (value ? 1 : 0).hash(self) end - def int(value) + def int(value : Int8 | Int16 | Int32) + permute(value.to_i64.unsafe_as(UInt64)) + end + + def int(value : UInt8 | UInt16 | UInt32) permute(value.to_u64) end - def float(value) - permute(value.to_f64.unsafe_as(UInt64)) + def int(value : Int::Unsigned) + permute(value.remainder(HASH_MODULUS).to_u64) + end + + def int(value : Int) + permute(value.remainder(HASH_MODULUS).to_i64.unsafe_as(UInt64)) + end + + # This function is the reference implementation, and it is used for `BigFloat`. + # For `Float64` and `Float32` all supported architectures allow a more efficient + # bitwise calculation. + # Arguments `frac` and `exp` are the result of an equivalent of `Math.frexp`, though + # for `BigFloat` a custom calculation is used for more precision. + private def float_normalize_reference(value, frac, exp) + if value < 0 + frac = -frac + end + # process 28 bits at a time; this should work well both for binary + # and hexadecimal floating point. 
+ x = 0_i64 + while frac > 0 + x = ((x << 28) & HASH_MODULUS) | x >> (HASH_BITS - 28) + frac *= 268435456.0 # 2**28 + exp -= 28 + y = frac.to_u32 # pull out integer part + frac -= y + x += y + x -= HASH_MODULUS if x >= HASH_MODULUS + end + {x, exp} + end + + private def float_normalize_wrap(value) + return HASH_NAN if value.nan? + if value.infinite? + return value > 0 ? HASH_INF_PLUS : HASH_INF_MINUS + end + + x, exp = yield value + + # adjust for the exponent; first reduce it modulo HASH_BITS + exp = exp >= 0 ? exp % HASH_BITS : HASH_BITS - 1 - ((-1 - exp) % HASH_BITS) + x = ((x << exp) & HASH_MODULUS) | x >> (HASH_BITS - exp) + + (x * (value < 0 ? -1 : 1)).to_i64.unsafe_as(UInt64) + end + + def float(value : Float32) + permute(float_normalize_wrap(value) do |value| + # This optimized version works on every architecture where endianess + # of Float32 and Int32 matches and float is IEEE754. All supported + # architectures fall into this category. + unsafe_int = value.unsafe_as(Int32) + exp = (((unsafe_int >> 23) & 0xff) - 127) + mantissa = unsafe_int & ((1 << 23) - 1) + if exp > -127 + exp -= 23 + mantissa |= 1 << 23 + else + # subnormals + exp -= 22 + end + {mantissa.to_i64, exp} + end) + end + + def float(value : Float64) + permute(float_normalize_wrap(value) do |value| + # This optimized version works on every architecture where endianess + # of Float64 and Int64 matches and float is IEEE754. All supported + # architectures fall into this category. 
+ unsafe_int = value.unsafe_as(Int64) + exp = (((unsafe_int >> 52) & 0x7ff) - 1023) + mantissa = unsafe_int & ((1_u64 << 52) - 1) + if exp > -1023 + exp -= 52 + mantissa |= 1_u64 << 52 + else + # subnormals + exp -= 51 + end + + {mantissa.to_i64, exp} + end) + end + + def float(value : Float) + frac, exp = Math.frexp value + permute(float_normalize_wrap(value) do |value| + float_normalize_reference(value, frac, exp) + end) end def char(value) From c46503e0fd30ddfeb17546719e6c83541988f48f Mon Sep 17 00:00:00 2001 From: Akzhan Abdulin Date: Sun, 12 Nov 2017 05:07:30 +0300 Subject: [PATCH 2/6] hash specializations for BigInt, BigFloat, BigRational. --- spec/std/crystal/hasher_spec.cr | 8 ++++++++ src/big/big_float.cr | 18 ++++++++++++++++++ src/big/big_int.cr | 18 ++++++++++++++++++ src/big/big_rational.cr | 18 ++++++++++++++++++ src/crystal/hasher.cr | 2 +- 5 files changed, 63 insertions(+), 1 deletion(-) diff --git a/spec/std/crystal/hasher_spec.cr b/spec/std/crystal/hasher_spec.cr index ebb67ef94968..c8f1b25be257 100644 --- a/spec/std/crystal/hasher_spec.cr +++ b/spec/std/crystal/hasher_spec.cr @@ -248,5 +248,13 @@ describe "Crystal::Hasher" do it "should 1_f32 and 1.to_big_f hashes equal" do 1_f32.hash.should eq(1.to_big_f.hash) end + + it "should 1_f32 and 1.to_big_r hashes equal" do + 1_f32.hash.should eq(1.to_big_r.hash) + end + + it "should 1_f32 and 1.to_big_i hashes equal" do + 1_f32.hash.should eq(1.to_big_i.hash) + end end end diff --git a/src/big/big_float.cr b/src/big/big_float.cr index 9efdecaa20a6..a16017a05831 100644 --- a/src/big/big_float.cr +++ b/src/big/big_float.cr @@ -293,3 +293,21 @@ module Math BigFloat.new { |mpf| LibGMP.mpf_sqrt(mpf, value) } end end + +# :nodoc: +struct Crystal::Hasher + def float(value : BigFloat) + permute(float_normalize_wrap(value) do |value| + # more exact version of `Math.frexp` + LibGMP.mpf_get_d_2exp(out exp, value) + frac = BigFloat.new do |mpf| + if exp >= 0 + LibGMP.mpf_div_2exp(mpf, value, exp) + else + 
LibGMP.mpf_mul_2exp(mpf, value, -exp) + end + end + float_normalize_reference(value, frac, exp) + end) + end +end diff --git a/src/big/big_int.cr b/src/big/big_int.cr index d73f97b5928b..1513028c719d 100644 --- a/src/big/big_int.cr +++ b/src/big/big_int.cr @@ -551,3 +551,21 @@ module Math sqrt(value.to_big_f) end end + +# :nodoc: +struct Crystal::Hasher + private HASH_MODULUS_INT_P = BigInt.new((1_u64 << HASH_BITS) - 1) + private HASH_MODULUS_INT_N = -BigInt.new((1_u64 << HASH_BITS) - 1) + + def int(value : BigInt) + # it should calculate `remainder(HASH_MODULUS)` + if LibGMP::ULong == UInt64 + v = LibGMP.tdiv_ui(value, HASH_MODULUS).to_i64 + value < 0 ? -v : v + elsif value >= HASH_MODULUS_INT_P || value <= HASH_MODULUS_INT_N + value.unsafe_truncated_mod(HASH_MODULUS_INT_P).to_i64 + else + value.to_i64 + end + end +end diff --git a/src/big/big_rational.cr b/src/big/big_rational.cr index e6cd39640a40..8949c8077f2c 100644 --- a/src/big/big_rational.cr +++ b/src/big/big_rational.cr @@ -275,3 +275,21 @@ module Math sqrt(value.to_big_f) end end + +# :nodoc: +struct Crystal::Hasher + private HASH_MODULUS_RAT_P = BigRational.new((1_u64 << HASH_BITS) - 1) + private HASH_MODULUS_RAT_N = -BigRational.new((1_u64 << HASH_BITS) - 1) + + def float(value : BigRational) + rem = value + if value >= HASH_MODULUS_RAT_P || value <= HASH_MODULUS_RAT_N + num = value.numerator + denom = value.denominator + div = num.tdiv(denom) + floor = div.tdiv(HASH_MODULUS) + rem -= floor * HASH_MODULUS + end + rem.to_big_f.hash + end +end diff --git a/src/crystal/hasher.cr b/src/crystal/hasher.cr index f404c31cb136..3e9808970aa9 100644 --- a/src/crystal/hasher.cr +++ b/src/crystal/hasher.cr @@ -216,8 +216,8 @@ struct Crystal::Hasher end def float(value : Float) - frac, exp = Math.frexp value permute(float_normalize_wrap(value) do |value| + frac, exp = Math.frexp value float_normalize_reference(value, frac, exp) end) end From 46b2eef06f132220452994991a4589e3ba5a33bc Mon Sep 17 00:00:00 2001 From: 
Akzhan Abdulin Date: Mon, 13 Nov 2017 20:02:10 +0300 Subject: [PATCH 3/6] follow @luislavena --- spec/std/crystal/hasher_spec.cr | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spec/std/crystal/hasher_spec.cr b/spec/std/crystal/hasher_spec.cr index c8f1b25be257..fb4b5e568e7a 100644 --- a/spec/std/crystal/hasher_spec.cr +++ b/spec/std/crystal/hasher_spec.cr @@ -53,12 +53,12 @@ describe "Crystal::Hasher" do end it "Big i64 numbers should be hashed ok" do - Int64::MAX.hash.should eq (Int64::MAX.hash) + Int64::MAX.hash.should eq(Int64::MAX.hash) end pending "128bit types should be hashed ok" do - 1.to_i128.hash.should eq (1_i8.hash) - 1.to_u128.hash.should eq (1_u8.hash) + 1.to_i128.hash.should eq(1_i8.hash) + 1.to_u128.hash.should eq(1_u8.hash) end it "#float should change state and differ" do From 252539c89fd7175f7085e6dcd86d4caf1335d4a5 Mon Sep 17 00:00:00 2001 From: Akzhan Abdulin Date: Sat, 18 Nov 2017 19:55:34 +0300 Subject: [PATCH 4/6] Followed @RX14 --- src/crystal/hasher.cr | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/crystal/hasher.cr b/src/crystal/hasher.cr index 3e9808970aa9..333b3bca9f93 100644 --- a/src/crystal/hasher.cr +++ b/src/crystal/hasher.cr @@ -177,7 +177,7 @@ struct Crystal::Hasher end def float(value : Float32) - permute(float_normalize_wrap(value) do |value| + norm_hash = float_normalize_wrap(value) do |value| # This optimized version works on every architecture where endianess # of Float32 and Int32 matches and float is IEEE754. All supported # architectures fall into this category. @@ -192,11 +192,12 @@ struct Crystal::Hasher exp -= 22 end {mantissa.to_i64, exp} - end) + end + permute(norm_hash) end def float(value : Float64) - permute(float_normalize_wrap(value) do |value| + norm_hash = float_normalize_wrap(value) do |value| # This optimized version works on every architecture where endianess # of Float64 and Int64 matches and float is IEEE754. 
All supported # architectures fall into this category. @@ -212,14 +213,16 @@ struct Crystal::Hasher end {mantissa.to_i64, exp} - end) + end + permute(norm_hash) end def float(value : Float) - permute(float_normalize_wrap(value) do |value| + norm_hash = float_normalize_wrap(value) do |value| frac, exp = Math.frexp value float_normalize_reference(value, frac, exp) - end) + end + permute(norm_hash) end def char(value) From c92aa1784a4224f8d5e1ac6f71cb1d3e25fdc52d Mon Sep 17 00:00:00 2001 From: Akzhan Abdulin Date: Sat, 18 Nov 2017 20:47:28 +0300 Subject: [PATCH 5/6] normalized_hash --- src/crystal/hasher.cr | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/crystal/hasher.cr b/src/crystal/hasher.cr index 333b3bca9f93..92de7044b1ff 100644 --- a/src/crystal/hasher.cr +++ b/src/crystal/hasher.cr @@ -177,7 +177,7 @@ struct Crystal::Hasher end def float(value : Float32) - norm_hash = float_normalize_wrap(value) do |value| + normalized_hash = float_normalize_wrap(value) do |value| # This optimized version works on every architecture where endianess # of Float32 and Int32 matches and float is IEEE754. All supported # architectures fall into this category. @@ -193,11 +193,11 @@ struct Crystal::Hasher end {mantissa.to_i64, exp} end - permute(norm_hash) + permute(normalized_hash) end def float(value : Float64) - norm_hash = float_normalize_wrap(value) do |value| + normalized_hash = float_normalize_wrap(value) do |value| # This optimized version works on every architecture where endianess # of Float64 and Int64 matches and float is IEEE754. All supported # architectures fall into this category. 
@@ -214,15 +214,15 @@ struct Crystal::Hasher {mantissa.to_i64, exp} end - permute(norm_hash) + permute(normalized_hash) end def float(value : Float) - norm_hash = float_normalize_wrap(value) do |value| + normalized_hash = float_normalize_wrap(value) do |value| frac, exp = Math.frexp value float_normalize_reference(value, frac, exp) end - permute(norm_hash) + permute(normalized_hash) end def char(value) From d6d295272ef6638ef4c865df5446282f053fafd9 Mon Sep 17 00:00:00 2001 From: Akzhan Abdulin Date: Sat, 18 Nov 2017 23:54:51 +0300 Subject: [PATCH 6/6] oops @Sija :) --- src/big/big_float.cr | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/big/big_float.cr b/src/big/big_float.cr index a16017a05831..a428ff04e04a 100644 --- a/src/big/big_float.cr +++ b/src/big/big_float.cr @@ -297,7 +297,7 @@ end # :nodoc: struct Crystal::Hasher def float(value : BigFloat) - permute(float_normalize_wrap(value) do |value| + normalized_hash = float_normalize_wrap(value) do |value| # more exact version of `Math.frexp` LibGMP.mpf_get_d_2exp(out exp, value) frac = BigFloat.new do |mpf| @@ -308,6 +308,7 @@ struct Crystal::Hasher end end float_normalize_reference(value, frac, exp) - end) + end + permute(normalized_hash) end end