From 38561367fec8f068ab8ad4271e7bc8011575e0e7 Mon Sep 17 00:00:00 2001 From: Sokolov Yura aka funny_falcon Date: Sun, 25 Jun 2017 22:59:52 +0300 Subject: [PATCH] change computation of hash value. To protect against Hash DoS, change the way the hash value is computed. A Class|Struct should define the method `def hash(hasher)` and call `hasher << @ivar` inside. As an option, for speed and for backward compatibility, `def hash` can still be implemented. It will be used for a Hash of the matching type. `Thread#hash` and `Signal#hash` are implemented as unseeded because they are used before `StdHasher @@seed` is initialized. But it is better to implement `def hash(hasher)`. StdHasher is the default hasher that uses `hash(hasher)`, and it is used as the default seeded hasher. It also implements `unseeded` for `Enums`. Also, number normalization for hashing is introduced, i.e. the rule 'equality forces hash equality' is enforced (`a == b` => `a.hash == b.hash`). The normalization idea is borrowed from the Python implementation. (idea by Akzhan Abdulin @akzhan) Fixes #4578 Prerequisite for #4557 Replaces #4581 --- spec/std/big/big_int_spec.cr | 3 +- spec/std/bool_spec.cr | 5 +- spec/std/enum_spec.cr | 2 +- spec/std/hash_spec.cr | 4 +- spec/std/struct_spec.cr | 7 +- spec/std/time/span_spec.cr | 2 +- src/big/big_float.cr | 16 ++- src/big/big_int.cr | 12 +- src/big/big_rational.cr | 32 ++++- src/big/lib_gmp.cr | 3 + src/bool.cr | 7 +- src/char.cr | 6 + src/class.cr | 5 +- src/compiler/crystal/syntax/ast.cr | 15 ++- src/enum.cr | 7 +- src/event/signal_handler.cr | 1 + src/float.cr | 62 +++++++-- src/hash.cr | 21 ++- src/http/headers.cr | 13 +- src/indexable.cr | 12 +- src/int.cr | 44 ++++++- src/json/any.cr | 5 - src/named_tuple.cr | 14 +- src/nil.cr | 7 +- src/number.cr | 21 +++ src/number/hash_normalize.cr | 95 ++++++++++++++ src/object.cr | 39 ++++-- src/prelude.cr | 1 + src/proc.cr | 5 +- src/reference.cr | 7 +- src/set.cr | 4 - src/signal.cr | 7 + src/stdhasher.cr | 199 +++++++++++++++++++++++++++++ src/string.cr | 
13 +- src/struct.cr | 9 +- src/symbol.cr | 9 +- src/thread.cr | 6 + src/time.cr | 4 - src/tuple.cr | 11 +- src/xml/namespace.cr | 5 +- src/xml/node.cr | 5 +- src/xml/node_set.cr | 5 +- src/yaml/any.cr | 5 - 43 files changed, 617 insertions(+), 138 deletions(-) create mode 100644 src/number/hash_normalize.cr create mode 100644 src/stdhasher.cr diff --git a/spec/std/big/big_int_spec.cr b/spec/std/big/big_int_spec.cr index b9ecfa13aa6f..7de5a4d8c8de 100644 --- a/spec/std/big/big_int_spec.cr +++ b/spec/std/big/big_int_spec.cr @@ -287,8 +287,7 @@ describe "BigInt" do it "#hash" do hash = 5.to_big_i.hash - hash.should eq(5) - typeof(hash).should eq(UInt64) + hash.should eq(5.hash) end it "clones" do diff --git a/spec/std/bool_spec.cr b/spec/std/bool_spec.cr index 960ed8653f33..fc5eb6e548c6 100644 --- a/spec/std/bool_spec.cr +++ b/spec/std/bool_spec.cr @@ -28,8 +28,9 @@ describe "Bool" do end describe "hash" do - it { true.hash.should eq(1) } - it { false.hash.should eq(0) } + it { true.hash.should eq(true.hash) } + it { false.hash.should eq(false.hash) } + it { true.hash.should_not eq(false.hash) } end describe "to_s" do diff --git a/spec/std/enum_spec.cr b/spec/std/enum_spec.cr index ff55cdd6e014..a3c17aaf6575 100644 --- a/spec/std/enum_spec.cr +++ b/spec/std/enum_spec.cr @@ -142,7 +142,7 @@ describe Enum do end it "has hash" do - SpecEnum::Two.hash.should eq(1.hash) + SpecEnum::Two.hash.should_not eq(SpecEnum::One.hash) end it "parses" do diff --git a/spec/std/hash_spec.cr b/spec/std/hash_spec.cr index 4097a4aefecb..328df1564b38 100644 --- a/spec/std/hash_spec.cr +++ b/spec/std/hash_spec.cr @@ -145,8 +145,8 @@ describe "Hash" do end end - it "works with mixed types" do - {1 => :a, "a" => 1, 1.0 => "a", :a => 1.0}.values_at(1, "a", 1.0, :a).should eq({:a, 1, "a", 1.0}) + it "works with mixed types and normalized numbers" do + {1 => :a, "a" => 1, 2.0 => "a", :a => 1.0}.values_at(1, 2, "a", 1.0, 2.0, :a).should eq({:a, "a", 1, :a, "a", 1.0}) end end diff --git 
a/spec/std/struct_spec.cr b/spec/std/struct_spec.cr index f266068d92e6..79a40c1de934 100644 --- a/spec/std/struct_spec.cr +++ b/spec/std/struct_spec.cr @@ -42,11 +42,14 @@ describe "Struct" do it "does hash" do s = StructSpec::TestClass.new(1, "hello") - s.hash.should eq(31 + "hello".hash) + hasher = StdHasher.new + hasher << 1 + hasher << "hello" + s.hash.should eq(hasher.digest) end it "does hash for struct wrapper (#1940)" do - StructSpec::BigIntWrapper.new(BigInt.new(0)).hash.should eq(0) + StructSpec::BigIntWrapper.new(BigInt.new(0)).hash.should eq(BigInt.new(0).hash) end it "does dup" do diff --git a/spec/std/time/span_spec.cr b/spec/std/time/span_spec.cr index 472472fa664f..1405cb63ffb5 100644 --- a/spec/std/time/span_spec.cr +++ b/spec/std/time/span_spec.cr @@ -176,7 +176,7 @@ describe Time::Span do end it "test hash code" do - Time::Span.new(77).hash.should eq(77) + Time::Span.new(77).hash.should eq(77.hash) end it "test subtract" do diff --git a/src/big/big_float.cr b/src/big/big_float.cr index e4d8f0e946de..d12d7bd4ca2d 100644 --- a/src/big/big_float.cr +++ b/src/big/big_float.cr @@ -17,6 +17,18 @@ struct BigFloat < Float LibGMP.mpf_init_set_str(out @mpf, str, 10) end + def initialize(num : BigInt) + # Probably should detect precision and use mpf_init2 + LibGMP.mpf_init(out @mpf) + LibGMP.mpf_set_z(self, num) + end + + def initialize(num : BigRational) + # Probably should detect precision and use mpf_init2 + LibGMP.mpf_init(out @mpf) + LibGMP.mpf_set_q(self, num) + end + def initialize(num : Number) LibGMP.mpf_init_set_d(out @mpf, num.to_f64) end @@ -35,8 +47,8 @@ struct BigFloat < Float new(mpf) end - def hash - to_f64.hash + def hash_normalize + remainder(HASH_MODULUS).to_f64.hash_normalize end def self.default_precision diff --git a/src/big/big_int.cr b/src/big/big_int.cr index 6f843c3be285..e3468d5e695b 100644 --- a/src/big/big_int.cr +++ b/src/big/big_int.cr @@ -267,8 +267,16 @@ struct BigInt < Int to_s io end - def hash - to_u64 + def 
hash_normalize + # remainder(HASH_MODULUS) + uv = LibGMP.tdiv_ui(self, HASH_MODULUS) + v = + {% if HASH_BITS == 31 %} + uv.to_i32 + {% else %} + uv.to_i64 + {% end %} + self < 0 ? -v : v end # Returns a string representation of self. diff --git a/src/big/big_rational.cr b/src/big/big_rational.cr index 0ded929ea985..61072bbf1158 100644 --- a/src/big/big_rational.cr +++ b/src/big/big_rational.cr @@ -41,6 +41,22 @@ struct BigRational < Number initialize(num, 1) end + # Creates a exact representation of float as rational. + # + # It sures that `BigRational.new(f) == f` + # It relies on fact, that mantisa is at most 53 bits + def initialize(num : Float32 | Float64) + frac, exp = Math.frexp num + ifrac = (frac.to_f64 * (1.to_i64 << 53).to_f64).to_i64 + exp -= 53 + initialize ifrac, 1 + if exp > 0 + LibGMP.mpq_mul_2exp(out @mpq, self, exp) + elsif exp < 0 + LibGMP.mpq_div_2exp(out @mpq, self, -exp) + end + end + # :nodoc: def initialize(@mpq : LibGMP::MPQ) end @@ -64,8 +80,12 @@ struct BigRational < Number LibGMP.mpq_cmp(mpq, other) end + def <=>(other : Float32 | Float64) + self <=> BigRational.new(other) + end + def <=>(other : Float) - self.to_f <=> other + BigFloat.new(self) <=> BigFloat.new(other) end def <=>(other : Int) @@ -139,8 +159,14 @@ struct BigRational < Number BigRational.new { |mpq| LibGMP.mpq_abs(mpq, self) } end - def hash - to_f64.hash + def hash_normalize + # self.remainder(HASH_MODULUS).to_f.hash_normalize + num = numerator + denom = denominator + div = num.tdiv(denom) + floor = div.tdiv(HASH_MODULUS) + rem = self - floor * HASH_MODULUS + rem.to_f.hash_normalize end # Returns the `Float64` representing this rational. 
diff --git a/src/big/lib_gmp.cr b/src/big/lib_gmp.cr index 012eeee9750c..66645657d070 100644 --- a/src/big/lib_gmp.cr +++ b/src/big/lib_gmp.cr @@ -63,6 +63,7 @@ lib LibGMP fun tdiv_r = __gmpz_tdiv_r(rop : MPZ*, op1 : MPZ*, op2 : MPZ*) fun tdiv_r_ui = __gmpz_tdiv_r_ui(rop : MPZ*, op1 : MPZ*, op2 : ULong) + fun tdiv_ui = __gmpz_tdiv_ui(op1 : MPZ*, op2 : ULong) : ULong fun neg = __gmpz_neg(rop : MPZ*, op : MPZ*) fun abs = __gmpz_abs(rop : MPZ*, op : MPZ*) @@ -152,6 +153,8 @@ lib LibGMP fun mpf_get_str = __gmpf_get_str(str : UInt8*, expptr : MpExp*, base : Int, n_digits : LibC::SizeT, op : MPF*) : UInt8* fun mpf_get_d = __gmpf_get_d(op : MPF*) : Double fun mpf_set_d = __gmpf_set_d(op : MPF*, op : Double) + fun mpf_set_z = __gmpf_set_z(op : MPF*, op : MPZ*) + fun mpf_set_q = __gmpf_set_q(op : MPF*, op : MPQ*) fun mpf_get_si = __gmpf_get_si(op : MPF*) : Long fun mpf_get_ui = __gmpf_get_ui(op : MPF*) : ULong fun mpf_ceil = __gmpf_ceil(rop : MPF*, op : MPF*) diff --git a/src/bool.cr b/src/bool.cr index 5e7f5f81ae56..898b79ac81d3 100644 --- a/src/bool.cr +++ b/src/bool.cr @@ -41,9 +41,10 @@ struct Bool self != other end - # Returns a hash value for this boolean: 0 for `false`, 1 for `true`. - def hash - self ? 1 : 0 + # Protocol method for generic hashing. + def hash(hasher) + hasher << (self ? 1 : 0) + hasher end # Returns `"true"` for `true` and `"false"` for `false`. diff --git a/src/char.cr b/src/char.cr index 0fd31d1c97c2..3e5619a9abdd 100644 --- a/src/char.cr +++ b/src/char.cr @@ -419,6 +419,12 @@ struct Char ord end + # Protocol method for generic hashing. + def hash(hasher) + hasher.raw ord + hasher + end + # Returns a Char that is one codepoint bigger than this char's codepoint. 
# # ``` diff --git a/src/class.cr b/src/class.cr index cfe17b1e900c..5634b64a0b3f 100644 --- a/src/class.cr +++ b/src/class.cr @@ -3,8 +3,9 @@ class Class to_s(io) end - def hash - crystal_type_id + def hash(hasher) + hasher.raw(crystal_type_id) + hasher end def ==(other : Class) diff --git a/src/compiler/crystal/syntax/ast.cr b/src/compiler/crystal/syntax/ast.cr index 76a5fce49342..911ea3120b39 100644 --- a/src/compiler/crystal/syntax/ast.cr +++ b/src/compiler/crystal/syntax/ast.cr @@ -1175,8 +1175,9 @@ module Crystal self end - def hash - 0 + def hash(hasher) + hasher << 0 + hasher end end @@ -1545,8 +1546,9 @@ module Crystal Self.new end - def hash - 0 + def hash(hasher) + hasher << 0 + hasher end end @@ -2025,8 +2027,9 @@ module Crystal Underscore.new end - def hash - 0 + def hash(hasher) + hasher << 0 + hasher end end diff --git a/src/enum.cr b/src/enum.cr index e9758ba05d13..76deb2dcfafb 100644 --- a/src/enum.cr +++ b/src/enum.cr @@ -274,9 +274,10 @@ struct Enum value == other.value end - # Returns a hash value. This is the hash of the underlying value. - def hash - value.hash + # Protocol method for generic hashing. + def hash(hasher) + hasher.raw(value) + hasher end # Iterates each values in a Flags Enum. diff --git a/src/event/signal_handler.cr b/src/event/signal_handler.cr index 857a2bea9701..de3caf5a2a0e 100644 --- a/src/event/signal_handler.cr +++ b/src/event/signal_handler.cr @@ -1,5 +1,6 @@ require "c/signal" require "c/unistd" +require "signal" # :nodoc: # Singleton that runs Signal events (libevent2) in it's own Fiber. diff --git a/src/float.cr b/src/float.cr index bf60ffc5b9e1..48af60a4cc49 100644 --- a/src/float.cr +++ b/src/float.cr @@ -1,6 +1,7 @@ require "c/stdio" require "c/string" require "./float/printer" +require "./number/hash_normalize" # Float is the base type of all floating point numbers. 
# @@ -148,13 +149,33 @@ struct Float32 Printer.print(self, io) end - def hash - unsafe_as(Int32) - end - def clone self end + + include Number::HashNormalize + + def hash_normalize + float_normalize_wrap do + {% if flag?(:x86) || flag?(:x86_64) || flag(:arm) || flag(:aarch64) %} + # it should work on every architecture where endianess of Float32 and Int32 + # matches and float is IEEE754. + unsafe_int = unsafe_as(Int32) + exp = (((unsafe_int >> 23) & 0xff) - 127) + mantisa = unsafe_int & ((1 << 23) - 1) + if exp > -127 + exp -= 23 + mantisa |= 1 << 23 + else + # subnormals + exp -= 22 + end + {mantisa, exp} + {% else %} + float_normalize_reference + {% end %} + end + end end struct Float64 @@ -206,11 +227,36 @@ struct Float64 Printer.print(self, io) end - def hash - unsafe_as(Int64) - end - def clone self end + + include Number::HashNormalize + + def hash_normalize + float_normalize_wrap do + {% if flag?(:x86) || flag?(:x86_64) || flag(:arm) || flag(:aarch64) %} + # it should work on every architecture where endianess of Float64 and Int64 + # matches and float is IEEE754. 
+ unsafe_int = unsafe_as(Int64) + exp = (((unsafe_int >> 52) & 0x7ff) - 1023) + mantisa = unsafe_int & ((1_u64 << 52) - 1) + if exp > -1023 + exp -= 52 + mantisa |= 1_u64 << 52 + else + # subnormals + exp -= 51 + end + + {% if HASH_BITS == 31 %} + mantisa %= HASH_MODULUS + {% end %} + + {mantisa, exp} + {% else %} + float_normalize_reference + {% end %} + end + end end diff --git a/src/hash.cr b/src/hash.cr index 6d7fe7567b52..05899fec5c6c 100644 --- a/src/hash.cr +++ b/src/hash.cr @@ -710,14 +710,19 @@ class Hash(K, V) # # ``` # foo = {"foo" => "bar"} - # foo.hash # => 3247054 + # foo.hash # => 3247054 (not exactly) # ``` - def hash - hash = size + def hash(hasher) + hasher.raw(size) + digest = hasher.digest each do |key, value| - hash += key.hash ^ value.hash + copy = hasher.clone + copy << key + copy << value + digest += copy.digest end - hash + hasher.raw(digest) + hasher end # Duplicates a `Hash`. @@ -864,7 +869,11 @@ class Hash(K, V) end private def bucket_index(key) - key.hash.to_u32.remainder(@buckets_size).to_i + hash_key(key).to_u32.remainder(@buckets_size).to_i + end + + protected def hash_key(key) + key.hash end private def calculate_new_size(size) diff --git a/src/http/headers.cr b/src/http/headers.cr index e47ff17765b0..f25a2371e2b4 100644 --- a/src/http/headers.cr +++ b/src/http/headers.cr @@ -9,13 +9,12 @@ struct HTTP::Headers record Key, name : String do forward_missing_to @name - def hash - h = 0 - name.each_byte do |c| - c = normalize_byte(c) - h = 31 * h + c + def hash(hasher) + hasher.raw(bytesize.to_u32) + name.each_byte do |b| + hasher.raw normalize_byte(b) end - h + hasher end def ==(key2) @@ -44,7 +43,7 @@ struct HTTP::Headers return byte if char.ascii_lowercase? || char == '-' # Optimize the common case return byte + 32 if char.ascii_uppercase? 
- return '-'.ord if char == '_' + return '-'.ord.to_u8 if char == '_' byte end diff --git a/src/indexable.cr b/src/indexable.cr index fa78b16f7b21..bc873c98497f 100644 --- a/src/indexable.cr +++ b/src/indexable.cr @@ -271,13 +271,13 @@ module Indexable(T) first { nil } end - # Returns a hash code based on `self`'s size and elements. - # - # See also: `Object#hash`. - def hash - reduce(31 * size) do |memo, elem| - 31 * memo + elem.hash + # Protocol method for generic hashing. + def hash(hasher) + hasher.raw(size.to_u32) + each do |elem| + hasher << elem end + hasher end # Returns the index of the first appearance of *value* in `self` diff --git a/src/int.cr b/src/int.cr index 3dfcdb3acd9a..a85baeb55f7c 100644 --- a/src/int.cr +++ b/src/int.cr @@ -316,10 +316,6 @@ struct Int !even? end - def hash - self - end - def succ self + 1 end @@ -575,6 +571,10 @@ struct Int8 def clone self end + + def hash_normalize + self + end end struct Int16 @@ -597,6 +597,10 @@ struct Int16 def clone self end + + def hash_normalize + self + end end struct Int32 @@ -619,6 +623,14 @@ struct Int32 def clone self end + + def hash_normalize + {% if HASH_BITS == 31 %} + unsafe_mod(HASH_MODULUS) + {% else %} + self + {% end %} + end end struct Int64 @@ -641,6 +653,10 @@ struct Int64 def clone self end + + def hash_normalize + unsafe_mod(HASH_MODULUS) + end end struct UInt8 @@ -663,6 +679,10 @@ struct UInt8 def clone self end + + def hash_normalize + self + end end struct UInt16 @@ -685,6 +705,10 @@ struct UInt16 def clone self end + + def hash_normalize + self + end end struct UInt32 @@ -707,6 +731,14 @@ struct UInt32 def clone self end + + def hash_normalize + {% if HASH_BITS == 31 %} + unsafe_mod(HASH_MODULUS) + {% else %} + self + {% end %} + end end struct UInt64 @@ -729,4 +761,8 @@ struct UInt64 def clone self end + + def hash_normalize + unsafe_mod(HASH_MODULUS) + end end diff --git a/src/json/any.cr b/src/json/any.cr index fecfa508bd6d..21c7dbca2143 100644 --- a/src/json/any.cr +++ 
b/src/json/any.cr @@ -261,11 +261,6 @@ struct JSON::Any raw == other end - # :nodoc: - def hash - raw.hash - end - # :nodoc: def to_json(json : JSON::Builder) raw.to_json(json) diff --git a/src/named_tuple.cr b/src/named_tuple.cr index cd7468b0a418..8f8d182df6ca 100644 --- a/src/named_tuple.cr +++ b/src/named_tuple.cr @@ -159,16 +159,14 @@ struct NamedTuple yield end - # Returns a hash value based on this name tuple's size, keys and values. - # - # See also: `Object#hash`. - def hash - hash = 31 * size + # Protocol method for generic hashing. + def hash(hasher) + hasher.raw(size) {% for key in T.keys.sort %} - hash = 31 * hash + {{key.symbolize}}.hash - hash = 31 * hash + self[{{key.symbolize}}].hash + hasher << {{key.symbolize}} + hasher << self[{{key.symbolize}}] {% end %} - hash + hasher end # Same as `to_s`. diff --git a/src/nil.cr b/src/nil.cr index 644fcc98cb71..5948772b3a46 100644 --- a/src/nil.cr +++ b/src/nil.cr @@ -67,9 +67,10 @@ struct Nil false end - # Returns `0`. - def hash - 0 + # Protocol method for generic hashing. + def hash(hasher) + hasher << nil + hasher end # Returns an empty string. diff --git a/src/number.cr b/src/number.cr index cf32b3ae6b96..96605d494a8e 100644 --- a/src/number.cr +++ b/src/number.cr @@ -1,3 +1,5 @@ +require "./number/hash_normalize" + # The top-level number type. struct Number include Comparable(Number) @@ -255,6 +257,25 @@ struct Number self == 0 end + include Number::HashNormalize + + # Protocol method for generic hashing + # All number types should define `hash_normalize`, so equal number will + # produce equal normalized value. + # Integer numbers should calculate `self.remainder(HASH_MODULUS)` + # Float64 and Float32 version generalize it for numbers with fractional part. 
+ # BigFloat and BigRational should calculate it as + # `(v.remainder HASH_MODULUS).to_f64.hash_normalize` + # See comments in "number/hash_normalize.cr" + def hash(hasher) + {% if HASH_BITS == 31 %} + hasher.raw hash_normalize.to_i32 + {% else %} + hasher.raw hash_normalize.to_i64 + {% end %} + hasher + end + private class StepIterator(T, L, B) include Iterator(T) diff --git a/src/number/hash_normalize.cr b/src/number/hash_normalize.cr new file mode 100644 index 000000000000..8c5736736590 --- /dev/null +++ b/src/number/hash_normalize.cr @@ -0,0 +1,95 @@ +module Number::HashNormalize + # Idea by Akzhan Abdulin @akzhan + # Based on https://github.com/python/cpython/blob/f051e43/Python/pyhash.c#L34 + + private HASH_BITS = sizeof(LibC::ULong) == 32 ? 31 : 61 + {% if HASH_BITS == 31 %} + private HASH_MODULUS = (1 << HASH_BITS) - 1 + {% else %} + private HASH_MODULUS = (1_i64 << HASH_BITS) - 1 + {% end %} + + private HASH_NAN = 0 + private HASH_INFINITY = 314159 + + # Following is a copy from python's comment: + # + # For numeric types, the hash of a number x is based on the reduction + # of x modulo the Mersen Prime P = 2**HASH_BITS - 1. It's designed + # so that hash(x) == hash(y) whenever x and y are numerically equal, + # even if x and y have different types. + # A quick summary of the hashing strategy: + # (1) First define the 'reduction of x modulo P' for any rational + # number x; this is a standard extension of the usual notion of + # reduction modulo P for integers. If x == p/q (written in lowest + # terms), the reduction is interpreted as the reduction of p times + # the inverse of the reduction of q, all modulo P; if q is exactly + # divisible by P then define the reduction to be infinity. So we've + # got a well-defined map + # reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }. 
+ # (2) Now for a rational number x, define hash(x) by: + # reduce(x) if x >= 0 + # -reduce(-x) if x < 0 + # If the result of the reduction is infinity (this is impossible for + # integers, floats and Decimals) then use the predefined hash value + # HASH_INF for x >= 0, or -HASH_INF for x < 0, instead. + # HASH_INF, -HASH_INF and HASH_NAN are also used for the + # hashes of float and Decimal infinities and nans. + # A selling point for the above strategy is that it makes it possible + # to compute hashes of decimal and binary floating-point numbers + # efficiently, even if the exponent of the binary or decimal number + # is large. The key point is that + # reduce(x * y) == reduce(x) * reduce(y) (modulo HASH_MODULUS) + # provided that {reduce(x), reduce(y)} != {0, infinity}. The reduction of a + # binary or decimal float is never infinity, since the denominator is a power + # of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have, + # for nonnegative x, + # reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS + # reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS + # and reduce(10**e) can be computed efficiently by the usual modular + # exponentiation algorithm. For reduce(2**e) it's even better: since + # P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication + # by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits. + # + private def float_normalize_wrap + return HASH_NAN if nan? + if infinite? + return self > 0 ? +HASH_INFINITY : -HASH_INFINITY + end + + x, exp = yield + + # adjust for the exponent; first reduce it modulo HASH_BITS + exp = exp >= 0 ? exp % HASH_BITS : HASH_BITS - 1 - ((-1 - exp) % HASH_BITS) + x = ((x << exp) & HASH_MODULUS) | x >> (HASH_BITS - exp) + + x * (self < 0 ? -1 : 1) + end + + # This function if for reference implementation. + # Many architectures allows more effective bitwise calculation. 
+ private def float_normalize_reference + frac, exp = Math.frexp self + if self < 0 + frac = -frac + end + # process 28 bits at a time; this should work well both for binary + # and hexadecimal floating point. + x = + {% if HASH_BITS == 31 %} + 0u32 + {% else %} + 0u64 + {% end %} + while frac > 0 + x = ((x << 28) & HASH_MODULUS) | x >> (HASH_BITS - 28) + frac *= 268435456.0 # 2**28 + exp -= 28 + y = frac.to_u32 # pull out integer part + frac -= y + x += y + x -= HASH_MODULUS if x >= HASH_MODULUS + end + {x, exp} + end +end diff --git a/src/object.cr b/src/object.cr index 1f66f1837494..2f59f38fa236 100644 --- a/src/object.cr +++ b/src/object.cr @@ -64,7 +64,29 @@ class Object # # The hash value is used along with `==` by the `Hash` class to determine if two objects # reference the same hash key. - abstract def hash + def hash + StdHasher.hashit self + end + + # Protocol method for generic hashing. + # + # You should use `hasher << @v` for mixing values. It will recursively call + # `hash(hasher)` on values. `hash(hasher)` on numbers defined to generate + # same hash value for equal number of different types. For performance sake + # use `hasher.raw @v` if you want mix integer as abstract value and not as a + # number. + # + # Cause hasher could be a struct, `hash(hasher)` have to return hasher. + # Also, `hasher.<<` method is not chainable, unlike other `<<` methods. + # + # def hash(hasher) + # hasher.raw @size + # each do |elem| + # hasher << elem + # end + # hasher + # end + abstract def hash(hasher) # Returns a string representation of this object. # @@ -1078,7 +1100,7 @@ class Object {% end %} end - # Defines a `hash` method computed from the given fields. + # Defines a `hash(hasher)` method computed from the given fields. 
# # ``` # class Person @@ -1090,16 +1112,11 @@ class Object # end # ``` macro def_hash(*fields) - def hash - {% if fields.size == 1 %} - {{fields[0]}}.hash - {% else %} - hash = 0 - {% for field in fields %} - hash = 31 * hash + {{field}}.hash - {% end %} - hash + def hash(hasher) + {% for field in fields %} + hasher << {{field}} {% end %} + hasher end end diff --git a/src/prelude.cr b/src/prelude.cr index 4aa5e5954b57..a22293b5adc1 100644 --- a/src/prelude.cr +++ b/src/prelude.cr @@ -17,6 +17,7 @@ require "iterable" require "iterator" require "indexable" require "string" +require "stdhasher" # Alpha-sorted list require "array" diff --git a/src/proc.cr b/src/proc.cr index df48e26f4429..1bf225b6bdec 100644 --- a/src/proc.cr +++ b/src/proc.cr @@ -181,8 +181,9 @@ struct Proc call(other) end - def hash - internal_representation.hash + def hash(hasher) + hasher << internal_representation + hasher end def clone diff --git a/src/reference.cr b/src/reference.cr index 94034e03c3bf..e360bf8179fb 100644 --- a/src/reference.cr +++ b/src/reference.cr @@ -50,9 +50,10 @@ class Reference {% end %} end - # Returns this reference's `object_id` as the hash value. - def hash - object_id + # Protocol method for generic hashing. + def hash(hasher) + hasher.raw object_id + hasher end def inspect(io : IO) : Nil diff --git a/src/set.cr b/src/set.cr index 991196e0867e..48750009fc76 100644 --- a/src/set.cr +++ b/src/set.cr @@ -308,10 +308,6 @@ struct Set(T) pp.list("Set{", self, "}") end - def hash - @hash.hash - end - # Returns `true` if the set and the given set have at least one element in # common. # diff --git a/src/signal.cr b/src/signal.cr index 4fd0c51a0473..7bcb845b14f6 100644 --- a/src/signal.cr +++ b/src/signal.cr @@ -116,6 +116,13 @@ enum Signal Signal::PIPE.ignore Signal::CHLD.reset end + + # There is no much of signals, so don't bother with hashing. + # And we couldn't use seeded hash, because seed is not filled yet. 
+ # :nodoc: + def hash + value + end end # :nodoc: diff --git a/src/stdhasher.cr b/src/stdhasher.cr new file mode 100644 index 000000000000..419602028ec5 --- /dev/null +++ b/src/stdhasher.cr @@ -0,0 +1,199 @@ +require "crystal/system/random" + +# A hasher usable for `def hash(hasher)` should satisfy this protocol: +# class MyHasher +# # Value should implement commutative `+` for `Hash#hash(hasher)` +# alias Value +# +# # must be implemented to mix sizes of collections, and pointers (object_id) +# def raw(v : Int::Primitive) +# # mutate +# nil +# end +# +# # must be implemented for Hash#hash +# def raw(v : Value) +# # mutate +# nil +# end +# +# def <<(b : Bytes) +# # mutate +# nil +# end +# +# def <<(n : Nil) +# # mutate +# nil +# end +# +# def <<(v) +# # v.hash will return hasher +# # if hasher is a struct, then it will be a copy +# copy_from v.hash(self) +# nil +# end +# +# # digest returns the hash sum for the current state without mutating it +# def digest : Value +# end +# +# # should be implemented for `Hash#hash(hasher)` +# def clone +# copy_of_current_state +# end +# end + +# StdHasher is used as the standard hasher in `Object#hash`. +# It has to provide defense against HashDoS, and be reasonably fast. +# To protect against HashDoS, it is seeded with secure random, and has a +# permutation that is hard to forge without knowing the seed and seeing the hash digest. +# +# It also has specialized methods for primitive keys with different seeds. 
+struct StdHasher + alias Value = UInt32 + + @@seed = StaticArray(UInt32, 4).new { |i| 0_u32 } + buf = pointerof(@@seed).as(Pointer(UInt8)) + Crystal::System::Random.random_bytes(buf.to_slice(sizeof(typeof(@@seed)))) + + protected getter a : UInt32 = 0_u32 + protected getter b : UInt32 = 0_u32 + + def initialize + @a, @b = @@seed[0], @@seed[1] + end + + def initialize(@a : UInt32, @b : UInt32) + end + + def self.hashit(v) + s = new(@@seed[0], @@seed[1]) + s << v + s.digest + end + + def clone + self.class.new(@a, @b) + end + + def <<(v : Nil) + permute_nil(@@seed[2]) + nil + end + + # mix raw value without number normalizing + def raw(v : Int8 | UInt8) + permute(v.to_u8, @@seed[2]) + nil + end + + # mix raw value without number normalizing + def raw(v : Int16 | Int32 | UInt16 | UInt32) + permute(v.to_u32, @@seed[2]) + nil + end + + # mix raw value without number normalizing + def raw(v : Int64 | UInt64) + high = (v >> 32).to_u32 + # This condition here cause of some 32bit issue in LLVM binding, + # so compiler_spec doesn't pass without it. + # Fill free to comment and debug. 
+ if high != 0_u32 + permute(high, @@seed[2]) + end + permute(v.to_u32, @@seed[2]) + nil + end + + def <<(b : Bytes) + permute(b, @@seed[2]) + nil + end + + def <<(v) + cp = v.hash(self) + @a, @b = cp.a, cp.b + nil + end + + def digest + a, b = @a, @b + b += @@seed[3] + a ^= a >> 15 + b ^= b >> 16 + a *= 0xb8b34b2d_u32 + b *= 0x52c6a2d9_u32 + a ^= a >> 17 + b ^= b >> 16 + b + a + end + + protected def permute_nil(s : UInt32) + @a += s | 1 + # LFSR + mx = (@b.to_i32 >> 31).to_u32 & 0xa8888eef_u32 + @b = (@b << 1) ^ mx + end + + protected def permute(v : UInt8, s : UInt32) + @a += v.to_u32 ^ s + @a *= 9 + @b += @a + @b = (@b << 7) | (@b >> 25) + @b *= 5 + end + + protected def permute(v : UInt32, s : UInt32) + permute_u32(v, s, @a, @b) + end + + @[NoInline] + protected def permute(buf : Bytes, s : UInt32) + bsz = buf.size + v = bsz.to_u32 << 24 + u = buf.to_unsafe + a, b = @a, @b + bsz.unsafe_div(4).downto(1) do + cv = u.as(Pointer(UInt32)).value + permute_u32(cv, s, a, b) + u += 4 + end + r = (bsz & 3).to_u32 + if r != 0 + v |= u[0].to_u32 | (u[r/2].to_u32 << 8) | (u[r - 1].to_u32 << 16) + end + permute_u32(v, s, a, b) + @a, @b = a, b + self + end + + private macro permute_u32(v, s, a, b) + {{v}} ^= {{s}} + {{v}} *= 0xb8b34b2d_u32 + {{a}} += {{v}} + {{a}} = ({{a}}.unsafe_shl(13)) | ({{a}}.unsafe_shr(19)) + {{b}} ^= {{a}} + {{s}} + {{b}} *= {{9}} + end + + # unseeded is used for types that are used in early startup + def self.unseeded(v : Int8 | Int16 | UInt8 | UInt16 | Int32 | UInt32) + h = v.to_u32 + h ^= h >> 16 + h *= 0x52c6a2d9_u32 + h ^ (h >> 16) + end + + # unseeded is used for types that are used in early startup + def self.unseeded(v : Int64 | UInt64) + h = (v >> 32).to_u32 + h ^= h >> 16 + h *= 0xb8b34b2d_u32 + h += v.to_u32 + h ^= h >> 16 + h *= 0x52c6a2d9_u32 + h ^ (h >> 16) + end +end diff --git a/src/string.cr b/src/string.cr index 43a0449fec6d..6d5c3329f487 100644 --- a/src/string.cr +++ b/src/string.cr @@ -3899,15 +3899,10 @@ class String sprintf 
self, other end - # Returns a hash based on this string’s size and content. - # - # See also: `Object#hash`. - def hash - h = 0 - each_byte do |c| - h = 31 * h + c - end - h + # Protocol method for generic hashing. + def hash(hasher) + hasher << to_slice + hasher end # Returns the number of unicode codepoints in this string. diff --git a/src/struct.cr b/src/struct.cr index 57bda79ad9eb..c730332df280 100644 --- a/src/struct.cr +++ b/src/struct.cr @@ -73,12 +73,13 @@ struct Struct # Returns a hash value based on this struct's instance variables hash values. # # See also: `Object#hash` - def hash : Int32 - hash = 0 + + # Protocol method for generic hashing. + def hash(hasher) {% for ivar in @type.instance_vars %} - hash = 31 * hash + @{{ivar.id}}.hash.to_i32 + hasher << @{{ivar.id}} {% end %} - hash + hasher end # Appends this struct's name and instance variables names and values diff --git a/src/symbol.cr b/src/symbol.cr index 8e90a7207c68..5e46c3eaf05c 100644 --- a/src/symbol.cr +++ b/src/symbol.cr @@ -15,11 +15,10 @@ struct Symbol include Comparable(Symbol) - # Generates an `Int32` hash value for this symbol. - # - # See also: `Object#hash`. - def hash : Int32 - to_i + # Protocol method for generic hashing. + def hash(hasher) + hasher.raw to_i + hasher end # Compares symbol with other based on `String#<=>` method. 
Returns `-1`, `0` diff --git a/src/thread.cr b/src/thread.cr index 8102a033dbe0..223a096099a2 100644 --- a/src/thread.cr +++ b/src/thread.cr @@ -48,6 +48,12 @@ class Thread end end + # override, cause StdHasher's seed is not initialized yet + # :nodoc: + def hash + StdHasher.unseeded object_id + end + # All threads, so the GC can see them (GC doesn't scan thread locals) # and we can find the current thread on platforms that don't support # thread local storage (eg: OpenBSD) diff --git a/src/time.cr b/src/time.cr index 9d2c270f1148..9cc76aad0577 100644 --- a/src/time.cr +++ b/src/time.cr @@ -309,10 +309,6 @@ struct Time end end - def hash - @encoded - end - def self.days_in_month(year, month) : Int32 unless 1 <= month <= 12 raise ArgumentError.new "Invalid month" diff --git a/src/tuple.cr b/src/tuple.cr index f0f4149fc5af..9bdc250630b9 100644 --- a/src/tuple.cr +++ b/src/tuple.cr @@ -306,15 +306,12 @@ struct Tuple size <=> other.size end - # Returns a hash value based on this tuple's length and contents. - # - # See also: `Object#hash`. - def hash - hash = 31 * size + # Protocol method for generic hashing. + def hash(hasher) {% for i in 0...T.size %} - hash = 31 * hash + self[{{i}}].hash + hasher << self[{{i}}] {% end %} - hash + hasher end # Returns a tuple containing cloned elements of this tuple using the `clone` method. diff --git a/src/xml/namespace.cr b/src/xml/namespace.cr index 43fa3d2bad69..9336930bfb58 100644 --- a/src/xml/namespace.cr +++ b/src/xml/namespace.cr @@ -4,8 +4,9 @@ struct XML::Namespace def initialize(@document : Node, @ns : LibXML::NS*) end - def hash - object_id + def hash(hasher) + hasher.raw object_id + hasher end def href diff --git a/src/xml/node.cr b/src/xml/node.cr index 3c23a238036f..f71064d4205e 100644 --- a/src/xml/node.cr +++ b/src/xml/node.cr @@ -160,8 +160,9 @@ struct XML::Node end # Returns this node's `#object_id` as the hash value. 
- def hash - object_id + def hash(hasher) + hasher.raw object_id + hasher end # Returns the content for this Node. diff --git a/src/xml/node_set.cr b/src/xml/node_set.cr index 00c8c759f039..215684278065 100644 --- a/src/xml/node_set.cr +++ b/src/xml/node_set.cr @@ -28,8 +28,9 @@ struct XML::NodeSet size == 0 end - def hash - object_id + def hash(hasher) + hasher.raw object_id + hasher end def inspect(io) diff --git a/src/yaml/any.cr b/src/yaml/any.cr index 1d2956c41261..efb5160a76f4 100644 --- a/src/yaml/any.cr +++ b/src/yaml/any.cr @@ -194,11 +194,6 @@ struct YAML::Any raw == other end - # :nodoc: - def hash - raw.hash - end - # :nodoc: def to_yaml(io) raw.to_yaml(io)