From 043e620a3e811571bd28fefbddc7083e4c18c498 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Thu, 24 Oct 2024 20:25:29 +0000 Subject: [PATCH] Canonicalize away bit width and embed small integers into `IntId`s The first change here is to canonicalize away bit width when tracking integers in our shared value store. This lets us have a more definitive model of "what is the mathematical value". It also frees us to use more efficient bit widths when available, such as bits inside the ID itself. For canonicalizing, we try to minimize the width adjustments and maximize the use of the SSO in APInt, and so we never shrink below 64 bits and grow in multiples of the word bit width in the implementation. We also canonicalize to the signed two's complement representation so we can represent negative numbers in an intuitive way. The canonicalizing requires getting the bit width out of the type and adjusting to it within the toolchain when doing any kind of math, and this PR updates various places to do that, as well as adding some convenience APIs to assist. Then we take advantage of the canonical form and embed small integers into the ID itself rather than allocating storage for them and referencing them with an index. This is especially helpful for the pervasive small integers such as the sizes of types, arrays, etc. Those no longer require indirection at all. Various short-cut APIs to take advantage of this have also been added. This PR improves lexing by about 5% when there are lots of `i32` types. 
--- toolchain/base/BUILD | 33 ++++ toolchain/base/int_store.h | 183 ++++++++++++++++++ toolchain/base/int_store_test.cpp | 150 ++++++++++++++ toolchain/base/shared_value_stores.h | 3 +- toolchain/base/shared_value_stores_test.cpp | 5 +- toolchain/base/value_ids.h | 147 +++++++++++++- toolchain/base/value_store_test.cpp | 13 -- toolchain/check/convert.cpp | 2 +- toolchain/check/eval.cpp | 103 ++++++---- toolchain/check/handle_literal.cpp | 2 +- toolchain/check/import_ref.cpp | 3 +- toolchain/check/member_access.cpp | 12 +- .../driver/testdata/dump_shared_values.carbon | 6 +- toolchain/lex/lex.cpp | 20 +- toolchain/lex/tokenized_buffer.h | 5 +- toolchain/lower/constant.cpp | 9 +- toolchain/sem_ir/file.h | 24 +++ toolchain/sem_ir/inst.h | 5 + toolchain/sem_ir/type.h | 4 + 19 files changed, 643 insertions(+), 86 deletions(-) create mode 100644 toolchain/base/int_store.h create mode 100644 toolchain/base/int_store_test.cpp diff --git a/toolchain/base/BUILD b/toolchain/base/BUILD index ec307eab68277..b7e68fc6835ae 100644 --- a/toolchain/base/BUILD +++ b/toolchain/base/BUILD @@ -47,6 +47,7 @@ cc_library( hdrs = ["value_ids.h"], deps = [ ":index_base", + "//common:check", "//common:ostream", "@llvm-project//llvm:Support", ], @@ -80,10 +81,42 @@ cc_test( ], ) +cc_library( + name = "int_store", + srcs = ["int_store.cpp"], + hdrs = ["int_store.h"], + deps = [ + ":mem_usage", + ":value_ids", + ":value_store", + ":yaml", + "//common:check", + "//common:hashtable_key_context", + "//common:ostream", + "//common:set", + "@llvm-project//llvm:Support", + ], +) + +cc_test( + name = "int_store_test", + size = "small", + srcs = ["int_store_test.cpp"], + deps = [ + ":int_store", + ":value_ids", + "//testing/base:gtest_main", + "//testing/base:test_raw_ostream", + "//toolchain/testing:yaml_test_helpers", + "@googletest//:gtest", + ], +) + cc_library( name = "shared_value_stores", hdrs = ["shared_value_stores.h"], deps = [ + ":int_store", ":mem_usage", ":value_ids", ":value_store", diff 
--git a/toolchain/base/int_store.h b/toolchain/base/int_store.h new file mode 100644 index 0000000000000..e38efe68021f6 --- /dev/null +++ b/toolchain/base/int_store.h @@ -0,0 +1,183 @@ +// Part of the Carbon Language project, under the Apache License v2.0 with LLVM +// Exceptions. See /LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef CARBON_TOOLCHAIN_BASE_INT_STORE_H_ +#define CARBON_TOOLCHAIN_BASE_INT_STORE_H_ + +#include "common/check.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/SmallVector.h" +#include "toolchain/base/mem_usage.h" +#include "toolchain/base/value_ids.h" +#include "toolchain/base/value_store.h" +#include "toolchain/base/yaml.h" + +namespace Carbon { + +// Forward declare a testing peer so we can friend it. +namespace Testing { +struct IntStoreTestPeer; +} // namespace Testing + +// A canonicalizing value store with deep optimizations for integers. +// +// This stores integers as abstract, signed mathematical integers. The bit width +// of specific `APInt` values, either as inputs or outputs, is disregarded for +// the purpose of canonicalization and the returned integer may use a very +// different bit width `APInt` than was used when adding. There are also +// optimized paths for adding integer values representable using native integer +// types. +// +// Because the integers in the store are canonicalized without a specific bit +// width there are helper functions to coerce them to a specific desired bit +// width for use. +// +// This leverages a significant optimization for small integer values -- rather +// than canonicalizing and making them unique in a `ValueStore`, they are +// directly embedded in the `IntId` itself. Only larger integers are stored in an +// array of `APInt` values and represented as an index in the ID. +class IntStore { + public: + // Adds an integer value representable in a host `int64_t` to the store. 
+ // Especially useful when the integer is computed without an `APInt` in the + // first place. + // + // This only accepts a signed `int64_t` and uses the mathematical signed + // integer value of it as the added integer value. + // + // Returns the ID corresponding to this integer value, storing an `APInt` if + // necessary to represent it. + auto Add(int64_t value) -> IntId { + // First try directly making this into an ID. + if (IntId id = IntId::TryMakeValue(value); id.is_valid()) [[likely]] { + return id; + } + + // Fallback for larger values. + return AddLarge(value); + } + + // Stores a canonical copy of a signed value and returns its ID. + auto AddSigned(llvm::APInt value) -> IntId { + // First try directly making this into an ID. + if (IntId id = IntId::TryMakeSignedValue(value); id.is_valid()) [[likely]] { + return id; + } + + // Fallback for larger values. + return AddSignedLarge(std::move(value)); + } + + // Stores a canonical copy of an unsigned value and returns its ID. + auto AddUnsigned(llvm::APInt value) -> IntId { + // First try directly making this into an ID. + if (IntId id = IntId::TryMakeUnsignedValue(value); id.is_valid()) + [[likely]] { + return id; + } + + // Fallback for larger values. + return AddUnsignedLarge(std::move(value)); + } + + // Returns the value for an ID. + auto Get(IntId id) const -> llvm::APInt { + if (id.is_value()) [[likely]] { + return llvm::APInt(MinAPWidth, id.AsValue(), /*isSigned=*/true); + } + return values_.Get(APIntId(id.AsIndex())); + } + + // Returns the value for an ID adjusted to a specific bit width. + // + // Note that because we store canonical mathematical integers as signed + // integers, this always sign extends or truncates to the target width. The + // caller can then use that as a signed or unsigned integer as needed. 
+ auto GetAtWidth(IntId id, int bit_width) const -> llvm::APInt { + llvm::APInt value = Get(id); + if (static_cast(value.getBitWidth()) != bit_width) { + value = value.sextOrTrunc(bit_width); + } + return value; + } + + // Returns the value for an ID adjusted to the bit width specified with + // another integer ID. + // + // This simply looks up the width integer ID, and then calls the above + // `GetAtWidth` overload using the value found for it. See that overload for + // more details. + auto GetAtWidth(IntId id, IntId bit_width_id) const -> llvm::APInt { + const llvm::APInt& bit_width = Get(bit_width_id); + CARBON_CHECK(bit_width.isStrictlyPositive() && + bit_width.isSignedIntN(sizeof(int) * 8), + "Invalid bit width value: {0}", bit_width); + return GetAtWidth(id, bit_width.getSExtValue()); + } + + // Looks up the canonical ID for a value, or returns invalid if not in the + // store. + auto LookupSigned(llvm::APInt value) const -> IntId { + if (IntId id = IntId::TryMakeSignedValue(value); id.is_valid()) [[likely]] { + return id; + } + + // Fallback for larger values. + return LookupSignedLarge(std::move(value)); + } + + // Output a YAML description of this data structure. Note that this will only + // include the integers that required storing, not those successfully embedded + // into the ID space. + auto OutputYaml() const -> Yaml::OutputMapping; + + auto array_ref() const -> llvm::ArrayRef { + return values_.array_ref(); + } + auto size() const -> size_t { return values_.size(); } + + // Collects the memory usage of the separately stored integers. 
+ auto CollectMemUsage(MemUsage& mem_usage, llvm::StringRef label) const + -> void; + + private: + friend struct Testing::IntStoreTestPeer; + + struct APIntId : IdBase, Printable { + using ValueType = llvm::APInt; + static const APIntId Invalid; + using IdBase::IdBase; + auto Print(llvm::raw_ostream& out) const -> void { + out << "ap-int"; + IdBase::Print(out); + } + }; + + static constexpr int MinAPWidth = 64; + + // Pick a canonical bit width for the provided number of significant bits. + static auto CanonicalBitWidth(int significant_bits) -> int; + + // Canonicalize an incoming signed APInt to the correct bit width. + static auto CanonicalizeSigned(llvm::APInt value) -> llvm::APInt; + + // Canonicalize an incoming unsigned APInt to the correct bit width. + static auto CanonicalizeUnsigned(llvm::APInt value) -> llvm::APInt; + + auto AddLarge(int64_t value) -> IntId; + auto AddSignedLarge(llvm::APInt value) -> IntId; + auto AddUnsignedLarge(llvm::APInt value) -> IntId; + + auto LookupSignedLarge(llvm::APInt value) const -> IntId; + + CanonicalValueStore values_; +}; + +constexpr IntStore::APIntId IntStore::APIntId::Invalid( + IntId::Invalid.AsIndex()); + +} // namespace Carbon + +#endif // CARBON_TOOLCHAIN_BASE_INT_STORE_H_ diff --git a/toolchain/base/int_store_test.cpp b/toolchain/base/int_store_test.cpp new file mode 100644 index 0000000000000..cef203dc55ed3 --- /dev/null +++ b/toolchain/base/int_store_test.cpp @@ -0,0 +1,150 @@ +// Part of the Carbon Language project, under the Apache License v2.0 with LLVM +// Exceptions. See /LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "toolchain/base/int_store.h" + +#include +#include + +#include + +#include "toolchain/base/value_ids.h" + +namespace Carbon::Testing { + +struct IntStoreTestPeer { + static constexpr int MinAPWidth = IntStore::MinAPWidth; +}; + +namespace { + +using ::testing::Eq; +using ::testing::Not; + +static constexpr int MinAPWidth = IntStoreTestPeer::MinAPWidth; + +TEST(IntStore, Basic) { + IntStore ints; + IntId id1 = ints.Add(1); + IntId id2 = ints.Add(2); + IntId id3 = ints.Add(999'999'999'999); + + ASSERT_TRUE(id1.is_valid()); + ASSERT_TRUE(id2.is_valid()); + ASSERT_TRUE(id3.is_valid()); + EXPECT_THAT(id1, Not(Eq(id2))); + EXPECT_THAT(id1, Not(Eq(id3))); + EXPECT_THAT(id2, Not(Eq(id3))); + + EXPECT_THAT(ints.Get(id1), Eq(1)); + EXPECT_THAT(ints.Get(id2), Eq(2)); + EXPECT_THAT(ints.Get(id3), Eq(999'999'999'999)); +} + +TEST(IntStore, APSigned) { + IntStore ints; + + llvm::APInt one_ap(MinAPWidth, 1, /*isSigned=*/true); + llvm::APInt two_ap(MinAPWidth, 2, /*isSigned=*/true); + llvm::APInt nines_ap(MinAPWidth, 999'999'999'999, /*isSigned=*/true); + llvm::APInt big_nines_ap = nines_ap.sext(128) * 10'000; + llvm::APInt bigger_nines_ap = nines_ap.sext(512) * 100'000; + llvm::APInt biggest_small_ap( + MinAPWidth, std::numeric_limits::max() >> (32 - TokenIdBits), + /*isSigned=*/true); + llvm::APInt smallest_large_ap = biggest_small_ap + 1; + llvm::APInt biggest_neg_large_ap( + MinAPWidth, std::numeric_limits::min() >> (32 - TokenIdBits + 1), + /*isSigned=*/true); + llvm::APInt smallest_neg_small_ap = biggest_neg_large_ap + 1; + IntId ids[] = { + ints.AddSigned(one_ap), + ints.AddSigned(two_ap), + ints.AddSigned(nines_ap), + ints.AddSigned(big_nines_ap), + ints.AddSigned(bigger_nines_ap), + ints.AddSigned(biggest_small_ap), + ints.AddSigned(smallest_large_ap), + ints.AddSigned(biggest_neg_large_ap), + ints.AddSigned(smallest_neg_small_ap), + }; + + for (IntId id : ids) { + ASSERT_TRUE(id.is_valid()); + } + + 
for (int i : llvm::seq(std::size(ids))) { + for (int j : llvm::seq(i + 1, std::size(ids))) { + EXPECT_THAT(ids[i], Not(Eq(ids[j]))); + } + } + + EXPECT_THAT(ints.Get(ids[0]), Eq(1)); + EXPECT_THAT(ints.Get(ids[1]), Eq(2)); + EXPECT_THAT(ints.Get(ids[2]), Eq(999'999'999'999)); + EXPECT_THAT(ints.Get(ids[3]).sext(big_nines_ap.getBitWidth()), + Eq(big_nines_ap)); + EXPECT_THAT(ints.Get(ids[4]).sext(bigger_nines_ap.getBitWidth()), + Eq(bigger_nines_ap)); + EXPECT_THAT(ints.Get(ids[5]), Eq(biggest_small_ap)); + EXPECT_THAT(ints.Get(ids[6]), Eq(smallest_large_ap)); + EXPECT_THAT(ints.Get(ids[7]), Eq(biggest_neg_large_ap)); + EXPECT_THAT(ints.Get(ids[8]), Eq(smallest_neg_small_ap)); +} + +TEST(IntStore, APUnsigned) { + IntStore ints; + + llvm::APInt one_ap(MinAPWidth, 1); + llvm::APInt two_ap(MinAPWidth, 2); + llvm::APInt nines_ap(MinAPWidth, 999'999'999'999); + llvm::APInt max64_ap(MinAPWidth, std::numeric_limits::max()); + llvm::APInt max64_plus_one_ap = max64_ap.zext(65) + 1; + llvm::APInt big_nines_ap = nines_ap.zext(128) * 10'000; + llvm::APInt bigger_nines_ap = nines_ap.zext(512) * 100'000; + llvm::APInt biggest_small_ap( + 64, std::numeric_limits::max() >> (32 - TokenIdBits)); + llvm::APInt smallest_large_ap = biggest_small_ap + 1; + IntId ids[] = { + ints.AddUnsigned(one_ap), + ints.AddUnsigned(two_ap), + ints.AddUnsigned(nines_ap), + ints.AddUnsigned(max64_ap), + ints.AddUnsigned(max64_plus_one_ap), + ints.AddUnsigned(big_nines_ap), + ints.AddUnsigned(bigger_nines_ap), + ints.AddUnsigned(biggest_small_ap), + ints.AddUnsigned(smallest_large_ap), + }; + + for (IntId id : ids) { + ASSERT_TRUE(id.is_valid()); + } + + for (int i : llvm::seq(std::size(ids))) { + for (int j : llvm::seq(i + 1, std::size(ids))) { + EXPECT_THAT(ids[i], Not(Eq(ids[j]))); + } + } + + EXPECT_THAT(ints.Get(ids[0]), Eq(1)); + EXPECT_THAT(ints.Get(ids[1]), Eq(2)); + EXPECT_THAT(ints.Get(ids[2]), Eq(999'999'999'999)); + EXPECT_THAT(ints.Get(ids[3]).getActiveBits(), Eq(64)); + 
EXPECT_THAT(ints.Get(ids[3]).trunc(64), + Eq(std::numeric_limits::max())); + EXPECT_THAT(ints.Get(ids[4]).truncUSat(max64_plus_one_ap.getBitWidth()), + Eq(max64_plus_one_ap)); + // We have lots of extra bits in our initial AP, so we sign extend here to + // ensure that we don't get a negative number from `Get`. + EXPECT_THAT(ints.Get(ids[5]).sext(big_nines_ap.getBitWidth()), + Eq(big_nines_ap)); + EXPECT_THAT(ints.Get(ids[6]).sext(bigger_nines_ap.getBitWidth()), + Eq(bigger_nines_ap)); + EXPECT_THAT(ints.Get(ids[7]), Eq(biggest_small_ap)); + EXPECT_THAT(ints.Get(ids[8]), Eq(smallest_large_ap)); +} + +} // namespace +} // namespace Carbon::Testing diff --git a/toolchain/base/shared_value_stores.h b/toolchain/base/shared_value_stores.h index 1a9d38d586e9c..2d47b7ba5ff15 100644 --- a/toolchain/base/shared_value_stores.h +++ b/toolchain/base/shared_value_stores.h @@ -5,6 +5,7 @@ #ifndef CARBON_TOOLCHAIN_BASE_SHARED_VALUE_STORES_H_ #define CARBON_TOOLCHAIN_BASE_SHARED_VALUE_STORES_H_ +#include "toolchain/base/int_store.h" #include "toolchain/base/mem_usage.h" #include "toolchain/base/value_ids.h" #include "toolchain/base/value_store.h" @@ -17,7 +18,7 @@ namespace Carbon { class SharedValueStores : public Yaml::Printable { public: // Provide types that can be used by APIs to forward access to these stores. 
- using IntStore = CanonicalValueStore; + using IntStore = IntStore; using RealStore = ValueStore; using FloatStore = CanonicalValueStore; using IdentifierStore = CanonicalValueStore; diff --git a/toolchain/base/shared_value_stores_test.cpp b/toolchain/base/shared_value_stores_test.cpp index 430d0931fa207..26ed889cf594f 100644 --- a/toolchain/base/shared_value_stores_test.cpp +++ b/toolchain/base/shared_value_stores_test.cpp @@ -40,7 +40,8 @@ TEST(SharedValueStores, PrintEmpty) { TEST(SharedValueStores, PrintVals) { SharedValueStores value_stores; llvm::APInt apint(64, 8, /*isSigned=*/true); - value_stores.ints().Add(apint); + value_stores.ints().AddSigned(apint); + value_stores.ints().AddSigned(llvm::APInt(64, 999'999'999'999)); value_stores.reals().Add( Real{.mantissa = apint, .exponent = apint, .is_decimal = true}); value_stores.identifiers().Add("a"); @@ -50,7 +51,7 @@ TEST(SharedValueStores, PrintVals) { EXPECT_THAT(Yaml::Value::FromText(out.TakeStr()), MatchSharedValues( - ElementsAre(Pair("int0", Yaml::Scalar("8"))), + ElementsAre(Pair("ap-int0", Yaml::Scalar("999999999999"))), ElementsAre(Pair("real0", Yaml::Scalar("8*10^8"))), ElementsAre(Pair("identifier0", Yaml::Scalar("a"))), ElementsAre(Pair("string0", Yaml::Scalar("foo'\"baz"))))); diff --git a/toolchain/base/value_ids.h b/toolchain/base/value_ids.h index e632a5589b0f2..60c057a9f7506 100644 --- a/toolchain/base/value_ids.h +++ b/toolchain/base/value_ids.h @@ -5,6 +5,9 @@ #ifndef CARBON_TOOLCHAIN_BASE_VALUE_IDS_H_ #define CARBON_TOOLCHAIN_BASE_VALUE_IDS_H_ +#include + +#include "common/check.h" #include "common/ostream.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" @@ -14,6 +17,12 @@ namespace Carbon { +// Valid IDs which are associated with tokens during lexing need to fit into a +// compressed storage space, which may influence the specific formulation of the +// ID. Note that there may still be IDs either not associated with tokens or +// computed after lexing outside of this range. 
+constexpr int TokenIdBits = 23; + // The value of a real literal token. // // This is either a dyadic fraction (mantissa * 2^exponent) or a decadic @@ -41,20 +50,140 @@ class Real : public Printable { bool is_decimal; }; -// Corresponds to an integer value represented by an APInt. This is used both -// for integer literal tokens, which are unsigned and have an unspecified -// bit-width, and integer values in SemIR, which have a signedness and bit-width -// matching their type. -struct IntId : public IdBase, public Printable { +// Corresponds to a canonicalized integer value. This is used both for integer +// literal tokens, and integer values in SemIR. These always represent the +// abstract mathematical value -- signed and regardless of the needed precision. +// +// Small values are internalized into the ID itself. Large values are +// represented as an index into an array of `APInt`s with a canonicalized bit +// width. +class IntId : public Printable { + public: using ValueType = llvm::APInt; + static const IntId Invalid; - using IdBase::IdBase; + + static auto MakeIndexOrInvalid(int index) -> IntId { + CARBON_DCHECK(index >= 0 && index <= InvalidIndex); + return IntId(ZeroIndexId - index); + } + + static auto MakeFromTokenPayload(uint32_t payload) -> IntId { + // Token-associated IDs are signed `TokenIdBits` integers, so force the sign + // extension from that bit. 
+ constexpr int Shift = 32 - TokenIdBits; + return IntId(static_cast(payload << Shift) >> Shift); + } + + static auto TryMakeValue(int64_t value) -> IntId { + if (MinValue <= value && value <= MaxValue) { + return IntId(value); + } + + return Invalid; + } + + static auto TryMakeSignedValue(llvm::APInt value) -> IntId { + if (value.sge(MinValue) && value.sle(MaxValue)) { + return IntId(value.getSExtValue()); + } + + return Invalid; + } + + static auto TryMakeUnsignedValue(llvm::APInt value) -> IntId { + if (value.ule(MaxValue)) { + return IntId(value.getZExtValue()); + } + + return Invalid; + } + + static auto MakeRaw(int32_t raw_id) -> IntId { return IntId(raw_id); } + + constexpr auto is_valid() const -> bool { return id_ != InvalidId; } + constexpr auto is_value() const -> bool { return id_ > ZeroIndexId; } + constexpr auto is_index() const -> bool { return id_ <= ZeroIndexId; } + + auto AsValue() const -> int { + CARBON_DCHECK(is_value()); + return id_; + } + + constexpr auto AsIndex() const -> int { + CARBON_DCHECK(is_index()); + return ZeroIndexId - id_; + } + + constexpr auto AsRawId() const -> int32_t { return id_; } + auto Print(llvm::raw_ostream& out) const -> void { - out << "int"; - IdBase::Print(out); + out << "int ["; + if (is_value()) { + out << "value: " << AsValue() << "]"; + } else if (is_index()) { + out << "index: " << AsIndex() << "]"; + } else { + CARBON_CHECK(!is_valid()); + out << "invalid]"; + } + } + + friend constexpr auto operator==(IntId lhs, IntId rhs) -> bool { + return lhs.id_ == rhs.id_; + } + friend constexpr auto operator<=>(IntId lhs, IntId rhs) + -> std::strong_ordering { + return lhs.id_ <=> rhs.id_; } + + private: + // We need all the values from maximum to minimum and a healthy range of + // indices to all fit within the token ID bits. + // + // We represent this as a signed TokenIdBits-bit two's complement integer. 
The + // sign extension from TokenIdBits to a register size can be folded into the + // shift used to extract from compressed bitfield storage. + // + // We then divide the smallest 1/4th of the space to indices, and the larger + // 3/4ths to embedded values. For 23-bits total this still gives us 2 million + // unique integers larger than the embedded ones, which would be difficult to + // fill without exceeding the number of tokens we can lex (8 million). For + // non-token based integers, the indices can continue downward to the 32-bit + // signed integer minimum. + // + // Note that the invalid ID can't be used with a token. This is OK as we + // expect invalid tokens to be *error* tokens and not need to represent an + // invalid integer. + static constexpr int TokenIdBitsShift = 32 - TokenIdBits; + static constexpr int32_t MaxValue = + std::numeric_limits::max() >> TokenIdBitsShift; + static constexpr int32_t ZeroIndexId = std::numeric_limits::min() >> + (TokenIdBitsShift + 1); + static constexpr int32_t MinValue = ZeroIndexId + 1; + static constexpr int32_t InvalidId = std::numeric_limits::min(); + static constexpr int32_t InvalidIndex = ZeroIndexId - InvalidId; + + // Document the specific values of these constants to help visualize how the + // bit patterns map from the above computations. 
+ // + // Each bit is either `T` for part of the token or `P` as part + // of the available payload that we use for the ID: + // + // clang-format off: visualizing bit positions + // + // 0bTTTT'TTTT'TPPP'PPPP'PPPP'PPPP'PPPP'PPPP + static_assert(MaxValue == 0b0000'0000'0011'1111'1111'1111'1111'1111); + static_assert(ZeroIndexId == 0b1111'1111'1110'0000'0000'0000'0000'0000); + static_assert(MinValue == 0b1111'1111'1110'0000'0000'0000'0000'0001); + static_assert(InvalidId == 0b1000'0000'0000'0000'0000'0000'0000'0000); + // clang-format on + + constexpr explicit IntId(int32_t id) : id_(id) {} + + int32_t id_; }; -constexpr IntId IntId::Invalid(IntId::InvalidIndex); +constexpr IntId IntId::Invalid(IntId::InvalidId); // Corresponds to a float value represented by an APFloat. This is used for // floating-point values in SemIR. diff --git a/toolchain/base/value_store_test.cpp b/toolchain/base/value_store_test.cpp index 569a9fab6cdc2..dcc58f2efda41 100644 --- a/toolchain/base/value_store_test.cpp +++ b/toolchain/base/value_store_test.cpp @@ -15,19 +15,6 @@ namespace { using ::testing::Eq; using ::testing::Not; -TEST(ValueStore, Int) { - CanonicalValueStore ints; - IntId id1 = ints.Add(llvm::APInt(64, 1)); - IntId id2 = ints.Add(llvm::APInt(64, 2)); - - ASSERT_TRUE(id1.is_valid()); - ASSERT_TRUE(id2.is_valid()); - EXPECT_THAT(id1, Not(Eq(id2))); - - EXPECT_THAT(ints.Get(id1), Eq(1)); - EXPECT_THAT(ints.Get(id2), Eq(2)); -} - TEST(ValueStore, Real) { Real real1{.mantissa = llvm::APInt(64, 1), .exponent = llvm::APInt(64, 11), diff --git a/toolchain/check/convert.cpp b/toolchain/check/convert.cpp index 57bc923c15c6c..4231b1e8ea25c 100644 --- a/toolchain/check/convert.cpp +++ b/toolchain/check/convert.cpp @@ -155,7 +155,7 @@ static auto MakeElementAccessInst(Context& context, SemIR::LocId loc_id, auto index_id = block.template AddInst( loc_id, {.type_id = context.GetBuiltinType(SemIR::BuiltinInstKind::IntType), - .int_id = context.ints().Add(llvm::APInt(32, i))}); + .int_id = 
context.ints().AddUnsigned(llvm::APInt(32, i))}); return block.template AddInst( loc_id, {elem_type_id, aggregate_id, index_id}); } else { diff --git a/toolchain/check/eval.cpp b/toolchain/check/eval.cpp index c7a70861b25a0..373fee5896604 100644 --- a/toolchain/check/eval.cpp +++ b/toolchain/check/eval.cpp @@ -244,7 +244,7 @@ static auto MakeBoolResult(Context& context, SemIR::TypeId bool_type_id, // Converts an APInt value into a ConstantId. static auto MakeIntResult(Context& context, SemIR::TypeId type_id, llvm::APInt value) -> SemIR::ConstantId { - auto result = context.ints().Add(std::move(value)); + auto result = context.ints().AddSigned(std::move(value)); return MakeConstantResult( context, SemIR::IntValue{.type_id = type_id, .int_id = result}, Phase::Template); @@ -629,12 +629,14 @@ static auto PerformBuiltinUnaryIntOp(Context& context, SemIRLoc loc, SemIR::InstId arg_id) -> SemIR::ConstantId { auto op = context.insts().GetAs(arg_id); - auto op_val = context.ints().Get(op.int_id); + auto [is_signed, bit_width_id] = context.sem_ir().GetIntTypeInfo(op.type_id); + CARBON_CHECK(bit_width_id != IntId::Invalid, + "Cannot evaluate a generic bit width integer: {0}", op); + llvm::APInt op_val = context.ints().GetAtWidth(op.int_id, bit_width_id); switch (builtin_kind) { case SemIR::BuiltinFunctionKind::IntSNegate: - if (context.types().IsSignedInt(op.type_id) && - op_val.isMinSignedValue()) { + if (is_signed && op_val.isMinSignedValue()) { CARBON_DIAGNOSTIC(CompileTimeIntegerNegateOverflow, Error, "integer overflow in negation of {0}", TypedInt); context.emitter().Emit(loc, CompileTimeIntegerNegateOverflow, @@ -663,8 +665,6 @@ static auto PerformBuiltinBinaryIntOp(Context& context, SemIRLoc loc, -> SemIR::ConstantId { auto lhs = context.insts().GetAs(lhs_id); auto rhs = context.insts().GetAs(rhs_id); - const auto& lhs_val = context.ints().Get(lhs.int_id); - const auto& rhs_val = context.ints().Get(rhs.int_id); // Check for division by zero. 
switch (builtin_kind) { @@ -672,7 +672,7 @@ static auto PerformBuiltinBinaryIntOp(Context& context, SemIRLoc loc, case SemIR::BuiltinFunctionKind::IntSMod: case SemIR::BuiltinFunctionKind::IntUDiv: case SemIR::BuiltinFunctionKind::IntUMod: - if (rhs_val.isZero()) { + if (context.ints().Get(rhs.int_id).isZero()) { DiagnoseDivisionByZero(context, loc); return SemIR::ConstantId::Error; } @@ -681,9 +681,58 @@ static auto PerformBuiltinBinaryIntOp(Context& context, SemIRLoc loc, break; } - bool overflow = false; + auto [lhs_is_signed, lhs_bit_width_id] = + context.sem_ir().GetIntTypeInfo(lhs.type_id); + llvm::APInt lhs_val = context.ints().GetAtWidth(lhs.int_id, lhs_bit_width_id); + llvm::APInt result_val; + + // First handle shift, which can directly use the canonical RHS and doesn't + // overflow. + switch (builtin_kind) { + // Bit shift. + case SemIR::BuiltinFunctionKind::IntLeftShift: + case SemIR::BuiltinFunctionKind::IntRightShift: { + const auto& rhs_orig_val = context.ints().Get(rhs.int_id); + if (rhs_orig_val.uge(lhs_val.getBitWidth()) || + (rhs_orig_val.isNegative() && lhs_is_signed)) { + CARBON_DIAGNOSTIC( + CompileTimeShiftOutOfRange, Error, + "shift distance not in range [0, {0}) in {1} {2:<<|>>} {3}", + unsigned, TypedInt, BoolAsSelect, TypedInt); + context.emitter().Emit( + loc, CompileTimeShiftOutOfRange, lhs_val.getBitWidth(), + {.type = lhs.type_id, .value = lhs_val}, + builtin_kind == SemIR::BuiltinFunctionKind::IntLeftShift, + {.type = rhs.type_id, .value = rhs_orig_val}); + // TODO: Is it useful to recover by returning 0 or -1? + return SemIR::ConstantId::Error; + } + + if (builtin_kind == SemIR::BuiltinFunctionKind::IntLeftShift) { + result_val = lhs_val.shl(rhs_orig_val); + } else if (lhs_is_signed) { + result_val = lhs_val.ashr(rhs_orig_val); + } else { + result_val = lhs_val.lshr(rhs_orig_val); + } + return MakeIntResult(context, lhs.type_id, std::move(result_val)); + } + + default: + // Break to do additional setup for other builtin kinds. 
+ break; + } + + // Other operations are already checked to be homogeneous, so we can extend + // the RHS with the LHS bit width. + CARBON_CHECK(rhs.type_id == lhs.type_id, "Heterogeneous builtin integer op!"); + llvm::APInt rhs_val = context.ints().GetAtWidth(rhs.int_id, lhs_bit_width_id); + + // We may also need to diagnose overflow for these operations. + bool overflow = false; Lex::TokenKind op_token = Lex::TokenKind::Not; + switch (builtin_kind) { // Arithmetic. case SemIR::BuiltinFunctionKind::IntSAdd: @@ -744,32 +793,9 @@ static auto PerformBuiltinBinaryIntOp(Context& context, SemIRLoc loc, op_token = Lex::TokenKind::Caret; break; - // Bit shift. case SemIR::BuiltinFunctionKind::IntLeftShift: case SemIR::BuiltinFunctionKind::IntRightShift: - if (rhs_val.uge(lhs_val.getBitWidth()) || - (rhs_val.isNegative() && context.types().IsSignedInt(rhs.type_id))) { - CARBON_DIAGNOSTIC( - CompileTimeShiftOutOfRange, Error, - "shift distance not in range [0, {0}) in {1} {2:<<|>>} {3}", - unsigned, TypedInt, BoolAsSelect, TypedInt); - context.emitter().Emit( - loc, CompileTimeShiftOutOfRange, lhs_val.getBitWidth(), - {.type = lhs.type_id, .value = lhs_val}, - builtin_kind == SemIR::BuiltinFunctionKind::IntLeftShift, - {.type = rhs.type_id, .value = rhs_val}); - // TODO: Is it useful to recover by returning 0 or -1? 
- return SemIR::ConstantId::Error; - } - - if (builtin_kind == SemIR::BuiltinFunctionKind::IntLeftShift) { - result_val = lhs_val.shl(rhs_val); - } else if (context.types().IsSignedInt(lhs.type_id)) { - result_val = lhs_val.ashr(rhs_val); - } else { - result_val = lhs_val.lshr(rhs_val); - } - break; + CARBON_FATAL("Handled specially above."); default: CARBON_FATAL("Unexpected operation kind."); @@ -795,10 +821,15 @@ static auto PerformBuiltinIntComparison(Context& context, SemIR::TypeId bool_type_id) -> SemIR::ConstantId { auto lhs = context.insts().GetAs(lhs_id); - const auto& lhs_val = context.ints().Get(lhs.int_id); - const auto& rhs_val = - context.ints().Get(context.insts().GetAs(rhs_id).int_id); - bool is_signed = context.types().IsSignedInt(lhs.type_id); + auto rhs = context.insts().GetAs(rhs_id); + CARBON_CHECK(lhs.type_id == rhs.type_id, + "Builtin comparison with mismatched types!"); + + auto [is_signed, bit_width_id] = context.sem_ir().GetIntTypeInfo(lhs.type_id); + CARBON_CHECK(bit_width_id != IntId::Invalid, + "Cannot evaluate a generic bit width integer: {0}", lhs); + llvm::APInt lhs_val = context.ints().GetAtWidth(lhs.int_id, bit_width_id); + llvm::APInt rhs_val = context.ints().GetAtWidth(rhs.int_id, bit_width_id); bool result; switch (builtin_kind) { diff --git a/toolchain/check/handle_literal.cpp b/toolchain/check/handle_literal.cpp index d975de044809e..6b0ba103ccd73 100644 --- a/toolchain/check/handle_literal.cpp +++ b/toolchain/check/handle_literal.cpp @@ -46,7 +46,7 @@ static auto MakeI32Literal(Context& context, Parse::NodeId node_id, return context.AddInst( node_id, {.type_id = context.GetBuiltinType(SemIR::BuiltinInstKind::IntType), - .int_id = context.ints().Add(i32_val)}); + .int_id = context.ints().AddUnsigned(i32_val)}); } // Forms an IntValue instruction with type `IntLiteral` for a given literal diff --git a/toolchain/check/import_ref.cpp b/toolchain/check/import_ref.cpp index 667badd7b412e..e411963eac3a5 100644 --- 
a/toolchain/check/import_ref.cpp +++ b/toolchain/check/import_ref.cpp @@ -2101,7 +2101,8 @@ class ImportRefResolver { return ResolveAs( {.type_id = context_.GetTypeIdForTypeConstant(type_id), - .int_id = context_.ints().Add(import_ir_.ints().Get(inst.int_id))}); + .int_id = + context_.ints().AddSigned(import_ir_.ints().Get(inst.int_id))}); } auto TryResolveTypedInst(SemIR::IntType inst) -> ResolveResult { diff --git a/toolchain/check/member_access.cpp b/toolchain/check/member_access.cpp index 5db1dfb79e962..c9c3fc99c471f 100644 --- a/toolchain/check/member_access.cpp +++ b/toolchain/check/member_access.cpp @@ -354,8 +354,8 @@ static auto PerformInstanceBinding(Context& context, SemIR::LocId loc_id, static auto ValidateTupleIndex(Context& context, SemIR::LocId loc_id, SemIR::InstId operand_inst_id, SemIR::IntValue index_inst, int size) - -> const llvm::APInt* { - const auto& index_val = context.ints().Get(index_inst.int_id); + -> std::optional { + llvm::APInt index_val = context.ints().Get(index_inst.int_id); if (index_val.uge(size)) { CARBON_DIAGNOSTIC(TupleIndexOutOfBounds, Error, "tuple element index `{0}` is past the end of type {1}", @@ -363,9 +363,9 @@ static auto ValidateTupleIndex(Context& context, SemIR::LocId loc_id, context.emitter().Emit(loc_id, TupleIndexOutOfBounds, {.type = index_inst.type_id, .value = index_val}, operand_inst_id); - return nullptr; + return std::nullopt; } - return &index_val; + return index_val; } auto PerformMemberAccess(Context& context, SemIR::LocId loc_id, @@ -520,8 +520,8 @@ auto PerformTupleAccess(Context& context, SemIR::LocId loc_id, auto index_literal = context.insts().GetAs( context.constant_values().GetInstId(index_const_id)); auto type_block = context.type_blocks().Get(tuple_type->elements_id); - const auto* index_val = ValidateTupleIndex(context, loc_id, tuple_inst_id, - index_literal, type_block.size()); + std::optional index_val = ValidateTupleIndex( + context, loc_id, tuple_inst_id, index_literal, type_block.size()); 
if (!index_val) { return SemIR::InstId::BuiltinError; } diff --git a/toolchain/driver/testdata/dump_shared_values.carbon b/toolchain/driver/testdata/dump_shared_values.carbon index b2da163d9de38..e13e938ed4b84 100644 --- a/toolchain/driver/testdata/dump_shared_values.carbon +++ b/toolchain/driver/testdata/dump_shared_values.carbon @@ -22,11 +22,7 @@ var str2: String = "ab'\"c"; // CHECK:STDOUT: --- // CHECK:STDOUT: filename: dump_shared_values.carbon // CHECK:STDOUT: shared_values: -// CHECK:STDOUT: ints: -// CHECK:STDOUT: int0: 32 -// CHECK:STDOUT: int1: 1 -// CHECK:STDOUT: int2: 8 -// CHECK:STDOUT: int3: 64 +// CHECK:STDOUT: ints: {} // CHECK:STDOUT: reals: // CHECK:STDOUT: real0: 10*10^-1 // CHECK:STDOUT: real1: 8*10^7 diff --git a/toolchain/lex/lex.cpp b/toolchain/lex/lex.cpp index fa4c942044a21..daad543ca4c2f 100644 --- a/toolchain/lex/lex.cpp +++ b/toolchain/lex/lex.cpp @@ -1013,10 +1013,11 @@ auto Lexer::LexNumericLiteral(llvm::StringRef source_text, ssize_t& position) return VariantMatch( literal->ComputeValue(emitter_), [&](NumericLiteral::IntValue&& value) { - return LexTokenWithPayload( - TokenKind::IntLiteral, - buffer_.value_stores_->ints().Add(std::move(value.value)).index, - byte_offset); + return LexTokenWithPayload(TokenKind::IntLiteral, + buffer_.value_stores_->ints() + .AddUnsigned(std::move(value.value)) + .AsRawId(), + byte_offset); }, [&](NumericLiteral::RealValue&& value) { auto real_id = buffer_.value_stores_->reals().Add(Real{ @@ -1222,10 +1223,13 @@ auto Lexer::LexWordAsTypeLiteralToken(llvm::StringRef word, int32_t byte_offset) suffix_value = suffix_value * 10 + (c - '0'); } - return LexTokenWithPayload( - kind, - buffer_.value_stores_->ints().Add(llvm::APInt(64, suffix_value)).index, - byte_offset); + // Add the bit width to our integer store and get its index. We treat it as + // unsigned as that's less expensive and it can't be negative. 
+ CARBON_CHECK(suffix_value >= 0); + auto bit_width_payload = + buffer_.value_stores_->ints().Add(suffix_value).AsRawId(); + + return LexTokenWithPayload(kind, bit_width_payload, byte_offset); } auto Lexer::LexKeywordOrIdentifier(llvm::StringRef source_text, diff --git a/toolchain/lex/tokenized_buffer.h b/toolchain/lex/tokenized_buffer.h index b958eef20ef59..2cd55d1939e7d 100644 --- a/toolchain/lex/tokenized_buffer.h +++ b/toolchain/lex/tokenized_buffer.h @@ -312,7 +312,7 @@ class TokenizedBuffer : public Printable { kind() == TokenKind::IntTypeLiteral || kind() == TokenKind::UnsignedIntTypeLiteral || kind() == TokenKind::FloatTypeLiteral); - return IntId(token_payload_); + return IntId::MakeFromTokenPayload(token_payload_); } auto real_id() const -> RealId { @@ -363,6 +363,9 @@ class TokenizedBuffer : public Printable { static constexpr int PayloadBits = 23; + // Make sure we have enough payload bits to represent token-associated IDs. + static_assert(PayloadBits >= TokenIdBits); + // Constructor for a TokenKind that carries no payload, or where the payload // will be set later. // diff --git a/toolchain/lower/constant.cpp b/toolchain/lower/constant.cpp index 528ea882f54b2..69273aea93c46 100644 --- a/toolchain/lower/constant.cpp +++ b/toolchain/lower/constant.cpp @@ -210,13 +210,18 @@ static auto EmitAsConstant(ConstantContext& context, SemIR::IntValue inst) // IntLiteral is represented as an empty struct. All other integer types are // represented as an LLVM integer type. 
- if (!llvm::isa(type)) { + auto* int_type = llvm::dyn_cast(type); + if (!int_type) { auto* struct_type = llvm::dyn_cast(type); CARBON_CHECK(struct_type && struct_type->getNumElements() == 0); return llvm::ConstantStruct::get(struct_type); } - return llvm::ConstantInt::get(type, context.sem_ir().ints().Get(inst.int_id)); + auto val = context.sem_ir().ints().Get(inst.int_id); + int bit_width = int_type->getBitWidth(); + bool is_signed = context.sem_ir().GetIntTypeInfo(inst.type_id).is_signed; + return llvm::ConstantInt::get(type, is_signed ? val.sextOrTrunc(bit_width) + : val.zextOrTrunc(bit_width)); } static auto EmitAsConstant(ConstantContext& context, SemIR::Namespace inst) diff --git a/toolchain/sem_ir/file.h b/toolchain/sem_ir/file.h index fe871f497754b..cdb01f11362fd 100644 --- a/toolchain/sem_ir/file.h +++ b/toolchain/sem_ir/file.h @@ -70,6 +70,30 @@ class File : public Printable { return types().GetAs(pointer_id).pointee_id; } + struct IntTypeInfo { + bool is_signed; + IntId bit_width = IntId::Invalid; + }; + + // Compute the core integer type information from a type ID. + // + // TODO: When we don't have a builtin int type mixed with actual `IntType` + // instructions, clients should directly query the `IntType` instruction to + // compute this information. + auto GetIntTypeInfo(TypeId int_type_id) const -> IntTypeInfo { + auto inst_id = types().GetInstId(int_type_id); + if (inst_id == InstId::BuiltinIntType) { + return {true, ints().LookupSigned(llvm::APInt(/*numBits=*/64, 32))}; + } + auto int_type = insts().TryGetAs(inst_id); + CARBON_CHECK(int_type, + "Integer type ID associated with an unknown instruction: {0}", + insts().Get(inst_id)); + auto bit_width_inst = insts().TryGetAs(int_type->bit_width_id); + return {int_type->int_kind.is_signed(), + bit_width_inst ? 
bit_width_inst->int_id : IntId::Invalid}; + } + auto check_ir_id() const -> CheckIRId { return check_ir_id_; } auto package_id() const -> IdentifierId { return package_id_; } auto library_id() const -> SemIR::LibraryNameId { return library_id_; } diff --git a/toolchain/sem_ir/inst.h b/toolchain/sem_ir/inst.h index 2152a4ba80c91..fd8ad4bc41ac0 100644 --- a/toolchain/sem_ir/inst.h +++ b/toolchain/sem_ir/inst.h @@ -265,6 +265,7 @@ class Inst : public Printable { // Convert a field to its raw representation, used as `arg0_` / `arg1_`. static constexpr auto ToRaw(IdBase base) -> int32_t { return base.index; } + static constexpr auto ToRaw(IntId id) -> int32_t { return id.AsRawId(); } static constexpr auto ToRaw(BuiltinInstKind kind) -> int32_t { return kind.AsInt(); } @@ -275,6 +276,10 @@ class Inst : public Printable { return T(raw); } template <> + constexpr auto FromRaw(int32_t raw) -> IntId { + return IntId::MakeRaw(raw); + } + template <> constexpr auto FromRaw(int32_t raw) -> BuiltinInstKind { return BuiltinInstKind::FromInt(raw); } diff --git a/toolchain/sem_ir/type.h b/toolchain/sem_ir/type.h index b09be3cd34900..b4cb287c16f24 100644 --- a/toolchain/sem_ir/type.h +++ b/toolchain/sem_ir/type.h @@ -99,6 +99,10 @@ class TypeStore : public Yaml::Printable { } // Determines whether the given type is a signed integer type. + // + // TODO: When we don't have a builtin int type mixed with actual `IntType` + // instructions, clients should directly query the `IntType` instruction to + // compute this information. auto IsSignedInt(TypeId int_type_id) const -> bool { auto inst_id = GetInstId(int_type_id); if (inst_id == InstId::BuiltinIntType) {