Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Canonicalize away bit width and embed small integers into IntIds #4487

Merged
merged 14 commits into from
Nov 13, 2024
33 changes: 33 additions & 0 deletions toolchain/base/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ cc_library(
hdrs = ["value_ids.h"],
deps = [
":index_base",
"//common:check",
"//common:ostream",
"@llvm-project//llvm:Support",
],
Expand Down Expand Up @@ -80,10 +81,42 @@ cc_test(
],
)

# Library target for the integer store: exposes `int_store.h` and builds
# `int_store.cpp`.
cc_library(
name = "int_store",
srcs = ["int_store.cpp"],
hdrs = ["int_store.h"],
deps = [
":mem_usage",
":value_ids",
":value_store",
":yaml",
"//common:check",
"//common:hashtable_key_context",
"//common:ostream",
"//common:set",
"@llvm-project//llvm:Support",
],
)

# Test target built from `int_store_test.cpp`; depends on the `:int_store`
# library above along with the gtest and YAML test helpers.
cc_test(
name = "int_store_test",
size = "small",
srcs = ["int_store_test.cpp"],
deps = [
":int_store",
":value_ids",
"//testing/base:gtest_main",
"//testing/base:test_raw_ostream",
"//toolchain/testing:yaml_test_helpers",
"@googletest//:gtest",
],
)

cc_library(
name = "shared_value_stores",
hdrs = ["shared_value_stores.h"],
deps = [
":int_store",
":mem_usage",
":value_ids",
":value_store",
Expand Down
60 changes: 60 additions & 0 deletions toolchain/base/int_store.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "toolchain/base/int_store.h"

#include <algorithm>
#include <utility>

namespace Carbon {

auto IntStore::CanonicalBitWidth(int significant_bits) -> int {
  // Larger integers are stored as a signed `APInt` whose canonical width is
  // the significant bit count rounded up to a whole number of `APInt` words.
  // The result is never narrower than `MinAPWidth`, which avoids spurious
  // resizing of the most common cases (<= 64 bits).
  static constexpr int BitsPerWord = llvm::APInt::APINT_BITS_PER_WORD;

  const int word_count = (significant_bits + BitsPerWord - 1) / BitsPerWord;
  const int rounded_width = word_count * BitsPerWord;
  return std::max<int>(MinAPWidth, rounded_width);
}

auto IntStore::CanonicalizeSigned(llvm::APInt value) -> llvm::APInt {
  // Pick the canonical width from the bits that are significant in the signed
  // interpretation of `value`, then sign extend or truncate to that width.
  const int canonical_width = CanonicalBitWidth(value.getSignificantBits());
  return value.sextOrTrunc(canonical_width);
}

auto IntStore::CanonicalizeUnsigned(llvm::APInt value) -> llvm::APInt {
  // The canonical representation is signed, so reserve one bit beyond the
  // active bits to hold a zero sign bit before zero extending or truncating.
  const int canonical_width = CanonicalBitWidth(value.getActiveBits() + 1);
  return value.zextOrTrunc(canonical_width);
}

auto IntStore::AddLarge(int64_t value) -> IntId {
  // Build a signed `APInt` at the canonical width for 64 significant bits and
  // add it to the canonical store; the resulting store index becomes the ID.
  const int width = CanonicalBitWidth(64);
  return IntId::MakeIndexOrInvalid(
      values_.Add(llvm::APInt(width, value, /*isSigned=*/true)).index);
}

// Canonicalizes `value` as a signed integer, adds it to the canonical store,
// and returns an ID built from the resulting store index.
auto IntStore::AddSignedLarge(llvm::APInt value) -> IntId {
  // `value` is owned by this function and not used again, so move it into the
  // by-value canonicalization parameter instead of copying it. Copying an
  // `APInt` wider than 64 bits allocates.
  auto ap_id = values_.Add(CanonicalizeSigned(std::move(value)));
  return IntId::MakeIndexOrInvalid(ap_id.index);
}

// Canonicalizes `value` as an unsigned integer (widening as needed so the
// signed canonical form has a zero sign bit), adds it to the canonical store,
// and returns an ID built from the resulting store index.
auto IntStore::AddUnsignedLarge(llvm::APInt value) -> IntId {
  // `value` is owned by this function and not used again, so move it into the
  // by-value canonicalization parameter instead of copying it. Copying an
  // `APInt` wider than 64 bits allocates.
  auto ap_id = values_.Add(CanonicalizeUnsigned(std::move(value)));
  return IntId::MakeIndexOrInvalid(ap_id.index);
}

// Canonicalizes `value` as a signed integer and looks it up in the canonical
// store without adding it. The ID is built from whatever index the lookup
// produces via `IntId::MakeIndexOrInvalid`.
auto IntStore::LookupSignedLarge(llvm::APInt value) const -> IntId {
  // `value` is owned by this function and not used again, so move it into the
  // by-value canonicalization parameter instead of copying it. Copying an
  // `APInt` wider than 64 bits allocates.
  auto ap_id = values_.Lookup(CanonicalizeSigned(std::move(value)));
  return IntId::MakeIndexOrInvalid(ap_id.index);
}

// Forwards to the underlying value store, so the output only describes the
// integers held in `values_`.
auto IntStore::OutputYaml() const -> Yaml::OutputMapping {
return values_.OutputYaml();
}

// Reports `values_` to `mem_usage` under `label`. The label is copied into a
// `std::string` for the call to `Collect`.
auto IntStore::CollectMemUsage(MemUsage& mem_usage, llvm::StringRef label) const
-> void {
mem_usage.Collect(std::string(label), values_);
}

} // namespace Carbon
187 changes: 187 additions & 0 deletions toolchain/base/int_store.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef CARBON_TOOLCHAIN_BASE_INT_STORE_H_
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Organizationally, do you think it'd help if we had toolchain/base/int.h with both IntStore and IntId? We do somewhat similar in sem_ir, with things like type.h (just noting that one since it's in the PR)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SGTM. I'll do the rename from int_store.h to int.h last to preserve review threads as much as I can.

#define CARBON_TOOLCHAIN_BASE_INT_STORE_H_

#include "common/check.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "toolchain/base/mem_usage.h"
#include "toolchain/base/value_ids.h"
#include "toolchain/base/value_store.h"
#include "toolchain/base/yaml.h"

namespace Carbon {

// Forward declare a testing peer so we can friend it.
namespace Testing {
struct IntStoreTestPeer;
} // namespace Testing

// A canonicalizing value store with deep optimizations for integers.
//
// This stores integers as abstract, signed mathematical integers. The bit width
// of specific `APInt` values, either as inputs or outputs, is disregarded for
// the purpose of canonicalization and the returned integer may use a very
// different bit width `APInt` than was used when adding. There are also
// optimized paths for adding integer values representable using native integer
// types.
//
// Because the integers in the store are canonicalized without a specific bit
// width there are helper functions to coerce them to a specific desired bit
// width for use.
//
// This leverages a significant optimization for small integer values -- rather
// than canonicalizing and making them unique in a `ValueStore`, they are
// directly embedded in the `IntId` itself. Only larger integers are stored in
// an array of `APInt` values and represented as an index in the ID.
class IntStore {
public:
// Adds an integer that is representable in a host `int64_t` to the store.
// This is the convenient entry point when the integer was computed without
// ever building an `APInt`.
//
// Only a signed `int64_t` is accepted, and its mathematical signed value is
// the integer that gets added.
//
// Returns the ID for that integer value; an `APInt` is only stored when the
// value cannot be represented directly in the ID.
auto Add(int64_t value) -> IntId {
  // Common case: the value embeds directly into the ID.
  IntId embedded = IntId::TryMakeValue(value);
  if (embedded.is_valid()) [[likely]] {
    return embedded;
  }

  // Too large to embed, so fall back to out-of-line storage.
  return AddLarge(value);
}

// Adds `value`, interpreted as a signed integer, and returns the ID for it.
// An `APInt` is only stored when the value cannot be represented directly in
// the ID.
auto AddSigned(llvm::APInt value) -> IntId {
  // Common case: the value embeds directly into the ID.
  IntId embedded = IntId::TryMakeSignedValue(value);
  if (embedded.is_valid()) [[likely]] {
    return embedded;
  }

  // Too large to embed, so hand the value off to out-of-line storage.
  return AddSignedLarge(std::move(value));
}

// Adds `value`, interpreted as an unsigned integer, and returns the ID of the
// equivalent signed integer value. An `APInt` is only stored when the value
// cannot be represented directly in the ID.
auto AddUnsigned(llvm::APInt value) -> IntId {
  // Common case: the value embeds directly into the ID.
  IntId embedded = IntId::TryMakeUnsignedValue(value);
  if (embedded.is_valid()) [[likely]] {
    return embedded;
  }

  // Too large to embed, so hand the value off to out-of-line storage.
  return AddUnsignedLarge(std::move(value));
}

// Returns the value for an ID.
//
// This will always be a signed `APInt` with a canonical bit width for the
// specific integer value in question. Values embedded directly in the ID are
// materialized at `MinAPWidth` bits; all other IDs are looked up by index in
// the backing store.
auto Get(IntId id) const -> llvm::APInt {
// Fast path: the ID itself carries the value, so no store lookup is needed.
if (id.is_value()) [[likely]] {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're currently using LLVM_LIKELY/LLVM_UNLIKELY, is this equivalent? Are you proposing to switch given the attribute is available?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just noticed that we have standard attributes now. Happy to either switch to LLVM ones until we can move the rest of the code, or move the rest of the code in a follow-up.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My thought is we've generally agreed to use C++ attribute forms so that seems the better choice. I don't think it makes sense to switch this code if the rest changes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do.

return llvm::APInt(MinAPWidth, id.AsValue(), /*isSigned=*/true);
}
// Otherwise the ID holds an index into the out-of-line `APInt` store.
return values_.Get(APIntId(id.AsIndex()));
}

// Returns the value for an ID, resized to exactly `bit_width` bits.
//
// Canonical mathematical integers are stored as signed integers, so resizing
// always sign extends or truncates. The caller is free to treat the result as
// either signed or unsigned afterwards.
auto GetAtWidth(IntId id, int bit_width) const -> llvm::APInt {
  // `sextOrTrunc` hands back the value unchanged when it already has the
  // requested width, so no separate width comparison is needed.
  return Get(id).sextOrTrunc(bit_width);
}

// Returns the value for an ID adjusted to the bit width specified with
// another integer ID.
//
// This looks up the integer for `bit_width_id` and then calls the
// `GetAtWidth(IntId, int)` overload with it; see that overload for how the
// value is resized.
//
// The width must be strictly positive and representable as an `int`; any
// other width is a CHECK failure.
auto GetAtWidth(IntId id, IntId bit_width_id) const -> llvm::APInt {
  const llvm::APInt bit_width = Get(bit_width_id);
  // Ensures `getSExtValue` below is valid and that the width is positive.
  CARBON_CHECK(bit_width.isStrictlyPositive() &&
                   bit_width.isSignedIntN(MinAPWidth),
               "Invalid bit width value: {0}", bit_width);

  // The `int` overload is narrower than the 64-bit value extracted here.
  // Reject widths that don't round-trip through `int` rather than letting an
  // implicit conversion silently truncate them to a different width.
  const int64_t wide_width = bit_width.getSExtValue();
  const int narrow_width = static_cast<int>(wide_width);
  CARBON_CHECK(narrow_width == wide_width, "Invalid bit width value: {0}",
               bit_width);
  return GetAtWidth(id, narrow_width);
}

// Finds the canonical ID for `value`, interpreted as a signed integer,
// without adding anything to the store. Returns an invalid ID when the value
// is not in the store.
auto LookupSigned(llvm::APInt value) const -> IntId {
  // Common case: the value embeds directly into the ID.
  IntId embedded = IntId::TryMakeSignedValue(value);
  if (embedded.is_valid()) [[likely]] {
    return embedded;
  }

  // Too large to embed, so consult the out-of-line storage.
  return LookupSignedLarge(std::move(value));
}

// Output a YAML description of this data structure. Note that this will only
// include the integers that required storing, not those successfully embedded
// into the ID space.
auto OutputYaml() const -> Yaml::OutputMapping;

// Returns a view of the `APInt` values held in the backing store. Integers
// that `Add*` embedded directly into an ID never reach the store, so they do
// not appear here.
auto array_ref() const -> llvm::ArrayRef<llvm::APInt> {
return values_.array_ref();
}
// Returns the size of the backing store; integers embedded directly into an
// ID are not counted.
auto size() const -> size_t { return values_.size(); }

// Collects the memory usage of the separately stored integers.
auto CollectMemUsage(MemUsage& mem_usage, llvm::StringRef label) const
-> void;

private:
friend struct Testing::IntStoreTestPeer;

// ID type used to index `APInt` values in the backing store. It is private to
// `IntStore`; the `Add*`, `Lookup*` and `Get` members translate between it
// and `IntId` through the raw index.
struct APIntId : IdBase, Printable<APIntId> {
using ValueType = llvm::APInt;
// Defined out of line after the enclosing class.
static const APIntId Invalid;
using IdBase::IdBase;
// Prints the ID as "ap_int" followed by whatever `IdBase::Print` emits.
auto Print(llvm::raw_ostream& out) const -> void {
out << "ap_int";
IdBase::Print(out);
}
};

// Minimum bit width of the `APInt`s this class produces: `Get` materializes
// embedded values at this width, and `CanonicalBitWidth` never returns less.
static constexpr int MinAPWidth = 64;

// Pick a canonical bit width for the provided number of significant bits.
static auto CanonicalBitWidth(int significant_bits) -> int;

// Canonicalize an incoming signed APInt to the correct bit width.
static auto CanonicalizeSigned(llvm::APInt value) -> llvm::APInt;

// Canonicalize an incoming unsigned APInt to the correct bit width.
static auto CanonicalizeUnsigned(llvm::APInt value) -> llvm::APInt;

// Out-of-line slow paths for the public `Add*` functions, used once a value
// could not be embedded directly into an `IntId`. Each one adds the
// canonicalized value to `values_` and builds the ID from the store index.
auto AddLarge(int64_t value) -> IntId;
auto AddSignedLarge(llvm::APInt value) -> IntId;
auto AddUnsignedLarge(llvm::APInt value) -> IntId;

// Out-of-line slow path for `LookupSigned`: looks up the canonicalized value
// in `values_` without adding it.
auto LookupSignedLarge(llvm::APInt value) const -> IntId;

// Backing store for integers that could not be embedded directly into an
// `IntId`; entries are canonicalized `APInt`s indexed by `APIntId`.
CanonicalValueStore<APIntId> values_;
};

// The invalid `APIntId` uses the same index as `IntId::Invalid`.
// NOTE(review): presumably this is what lets an invalid store index convert
// to an invalid `IntId` via `IntId::MakeIndexOrInvalid` -- confirm against
// `IntId`.
constexpr IntStore::APIntId IntStore::APIntId::Invalid(
IntId::Invalid.AsIndex());

} // namespace Carbon

#endif // CARBON_TOOLCHAIN_BASE_INT_STORE_H_
Loading
Loading