From 0329772be19a07a03091aa432eb4162a522c387b Mon Sep 17 00:00:00 2001 From: Meghan Date: Tue, 5 Jun 2018 16:37:53 -0700 Subject: [PATCH 1/4] ARM Popcount lowering rule and codegen updates to support reinterpreting and accessing vectors --- HalideIR | 2 +- src/codegen/llvm/codegen_arm.cc | 77 ++++++++++++++++++++++++++++++++ src/codegen/llvm/codegen_llvm.cc | 26 ++++++++++- 3 files changed, 103 insertions(+), 2 deletions(-) diff --git a/HalideIR b/HalideIR index a3698398faff..e20e5e9abb3a 160000 --- a/HalideIR +++ b/HalideIR @@ -1 +1 @@ -Subproject commit a3698398faff7fec1c0fa4e4479357651382db75 +Subproject commit e20e5e9abb3aa43147a90a4ffb3e190f62862970 diff --git a/src/codegen/llvm/codegen_arm.cc b/src/codegen/llvm/codegen_arm.cc index b87b6ec88808..abf30756011c 100644 --- a/src/codegen/llvm/codegen_arm.cc +++ b/src/codegen/llvm/codegen_arm.cc @@ -18,8 +18,85 @@ class CodeGenARM final : public CodeGenCPU { native_vector_bits_ = 16 * 8; CodeGenCPU::InitTarget(tm); } + llvm::Value* CreateIntrinsic(const Call* op) override; + + private: + Expr ARMPopcount(const Call* op); }; +llvm::Value* CodeGenARM::CreateIntrinsic(const Call* op) { + if (op->is_intrinsic("llvm_intrin")) { + llvm::Intrinsic::ID id = static_cast( + op->args[0].as()->value); + if (id == ::llvm::Intrinsic::ctpop) { + Expr e = ARMPopcount(op); + return CodeGenCPU::CreateIntrinsic(e.as()); + } + } + return CodeGenCPU::CreateIntrinsic(op); +} + +Expr CodeGenARM::ARMPopcount(const Call *call) { + using namespace ir; + const Expr& e = call->args[2]; + ::llvm::Intrinsic::ID ctpop_id = ::llvm::Intrinsic::ctpop; + ::llvm::Intrinsic::ID vpaddu_id = ::llvm::Intrinsic::arm_neon_vpaddlu; + + + Type uint8_type = Type(e.type().code(), 8, e.type().bits() * e.type().lanes() / 8); + Type uint16_type = Type(uint8_type.code(), 16, uint8_type.bits() * uint8_type.lanes() / 16); + Type uint32_type = Type(uint16_type.code(), 32, uint8_type.bits() * uint8_type.lanes() / 32); + + // Fallback to default llvm lowering rule if 
input type not a full vector or half vector length + int total_size = call->type.bits() * call->type.lanes(); + if (!call->type.is_vector() || call->type.bits() == 8 || + (total_size != 128 && total_size != 64)) { + Array vcnt_args; + vcnt_args.push_back(ir::UIntImm::make(UInt(32), ctpop_id)); + vcnt_args.push_back(ir::UIntImm::make(UInt(32), 1)); + vcnt_args.push_back(e); + return ir::Call::make(call->type, "llvm_intrin", vcnt_args, Call::PureIntrinsic); + } + + // Interpret input as vector of 8bit values + Expr input8 = reinterpret(uint8_type, e); + // Popcount 8bit->8bit + const Call* c0 = input8.as(); + CHECK(c0 != nullptr); + Array vcnt8_args; + vcnt8_args.push_back(ir::UIntImm::make(UInt(32), ctpop_id)); + vcnt8_args.push_back(ir::UIntImm::make(UInt(32), 1)); + vcnt8_args.push_back(input8); + Expr vcnt8 = ir::Call::make(uint8_type, "llvm_intrin", vcnt8_args, Call::PureIntrinsic); + + // Accumulation 8->16bit + Array vcnt16_args; + vcnt16_args.push_back(ir::UIntImm::make(UInt(32), vpaddu_id)); + vcnt16_args.push_back(ir::UIntImm::make(UInt(32), 1)); + vcnt16_args.push_back(vcnt8); + Expr vcnt16 = ir::Call::make(uint16_type, "llvm_intrin", vcnt16_args, Call::PureIntrinsic); + if (call->type.bits() == 16) { + return vcnt16; + } + + // Accumulation 16->32bit + Array vcnt32_args; + vcnt32_args.push_back(ir::UIntImm::make(UInt(32), vpaddu_id)); + vcnt32_args.push_back(ir::UIntImm::make(UInt(32), 1)); + vcnt32_args.push_back(vcnt16); + Expr vcnt32 = ir::Call::make(uint32_type, "llvm_intrin", vcnt32_args, Call::PureIntrinsic); + if (call->type.bits() == 32) { + return vcnt32; + } + + // Accumulation 32->64bit + Array vcnt64_args; + vcnt64_args.push_back(ir::UIntImm::make(UInt(32), vpaddu_id)); + vcnt64_args.push_back(ir::UIntImm::make(UInt(32), 1)); + vcnt64_args.push_back(vcnt32); + return ir::Call::make(call->type, "llvm_intrin", vcnt64_args, Call::PureIntrinsic); +} + TVM_REGISTER_GLOBAL("tvm.codegen.llvm.target_arm") .set_body([](const TVMArgs& targs, 
TVMRetValue* rv) { CodeGenLLVM* cg = new CodeGenARM(); diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index 934398d9ce09..d0c5b77cbfd5 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -366,7 +366,7 @@ llvm::Value* CodeGenLLVM::CreateBroadcast(llvm::Value* value, int lanes) { llvm::Value* CodeGenLLVM::CreateVecSlice(llvm::Value* vec, int begin, int extent) { int num_elems = static_cast(vec->getType()->getVectorNumElements()); if (extent == num_elems && begin == 0) return vec; - CHECK_LT(begin + extent, num_elems); + CHECK_LT(begin + extent, num_elems+1); std::vector indices; for (int i = 0; i < extent; ++i) { indices.push_back(begin + i); @@ -562,6 +562,10 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { sig_type.push_back(arg_value.back()->getType()); } } + llvm::Type *returnType = LLVMType(op->type); + if (returnType != sig_type[0]) { + sig_type.insert(sig_type.begin(), returnType); + } llvm::Function* f = llvm::Intrinsic::getDeclaration( module_.get(), id, sig_type); return builder_->CreateCall(f, arg_value); @@ -628,6 +632,26 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { value->addIncoming(then_value, then_value_block); value->addIncoming(else_value, else_value_block); return value; + } else if (op->is_intrinsic(Call::reinterpret)) { + llvm::Type * target = LLVMType(op->type); + return builder_->CreateBitCast(MakeValue(op->args[0]), target); + } else if (op->is_intrinsic("vectorlow")) { + llvm::Value *v = MakeValue(op->args[0]); + int l = v->getType()->getVectorNumElements(); + return CreateVecSlice(v, 0, l/2); + } else if (op->is_intrinsic("vectorhigh")) { + llvm::Value *v = MakeValue(op->args[0]); + int l = v->getType()->getVectorNumElements(); + return CreateVecSlice(v, l/2, l/2); + } else if (op->is_intrinsic("vectorcombine")) { + llvm::Value *v0 = MakeValue(op->args[0]); + llvm::Value *v1 = MakeValue(op->args[1]); + int num_elems = 
static_cast(v0->getType()->getVectorNumElements()) * 2; + std::vector indices; + for (int i = 0; i < num_elems; ++i) { + indices.push_back(i); + } + return builder_->CreateShuffleVector(v0, v1, indices); } else { LOG(FATAL) << "unknown intrinsic " << op->name; return nullptr; From 777f9ea69d24d40433d2867c69e3761f9704198a Mon Sep 17 00:00:00 2001 From: Meghan Date: Wed, 6 Jun 2018 16:05:59 -0700 Subject: [PATCH 2/4] Fixes and test case for arm popcount --- HalideIR | 2 +- src/codegen/llvm/codegen_arm.cc | 24 ++++++++++++-------- src/codegen/llvm/codegen_llvm.cc | 2 +- src/codegen/llvm/llvm_module.cc | 38 ++++++++++++++++++++++++++++---- 4 files changed, 51 insertions(+), 15 deletions(-) diff --git a/HalideIR b/HalideIR index e20e5e9abb3a..a3698398faff 160000 --- a/HalideIR +++ b/HalideIR @@ -1 +1 @@ -Subproject commit e20e5e9abb3aa43147a90a4ffb3e190f62862970 +Subproject commit a3698398faff7fec1c0fa4e4479357651382db75 diff --git a/src/codegen/llvm/codegen_arm.cc b/src/codegen/llvm/codegen_arm.cc index abf30756011c..161d6db6e42d 100644 --- a/src/codegen/llvm/codegen_arm.cc +++ b/src/codegen/llvm/codegen_arm.cc @@ -39,13 +39,9 @@ llvm::Value* CodeGenARM::CreateIntrinsic(const Call* op) { Expr CodeGenARM::ARMPopcount(const Call *call) { using namespace ir; const Expr& e = call->args[2]; - ::llvm::Intrinsic::ID ctpop_id = ::llvm::Intrinsic::ctpop; - ::llvm::Intrinsic::ID vpaddu_id = ::llvm::Intrinsic::arm_neon_vpaddlu; - - Type uint8_type = Type(e.type().code(), 8, e.type().bits() * e.type().lanes() / 8); - Type uint16_type = Type(uint8_type.code(), 16, uint8_type.bits() * uint8_type.lanes() / 16); - Type uint32_type = Type(uint16_type.code(), 32, uint8_type.bits() * uint8_type.lanes() / 32); + ::llvm::Intrinsic::ID ctpop_id = ::llvm::Intrinsic::ctpop; + ::llvm::Intrinsic::ID vpaddlu_id = ::llvm::Intrinsic::arm_neon_vpaddlu; // Fallback to default llvm lowering rule if input type not a full vector or half vector length int total_size = call->type.bits() * 
call->type.lanes(); @@ -58,6 +54,16 @@ Expr CodeGenARM::ARMPopcount(const Call *call) { return ir::Call::make(call->type, "llvm_intrin", vcnt_args, Call::PureIntrinsic); } + // Popcount lowering rule: + // Reinterpret input vector as a vector of 8bit values and perform popcount + // Pairwise add between adjacent elements and double width with vpaddlu + // to return back to original input type + + // Divisions are always exact (number of bits = 64 or 128) + Type uint8_type = Type(e.type().code(), 8, e.type().bits() * e.type().lanes() / 8); + Type uint16_type = Type(uint8_type.code(), 16, uint8_type.bits() * uint8_type.lanes() / 16); + Type uint32_type = Type(uint16_type.code(), 32, uint8_type.bits() * uint8_type.lanes() / 32); + // Interpret input as vector of 8bit values Expr input8 = reinterpret(uint8_type, e); // Popcount 8bit->8bit @@ -71,7 +77,7 @@ Expr CodeGenARM::ARMPopcount(const Call *call) { // Accumulation 8->16bit Array vcnt16_args; - vcnt16_args.push_back(ir::UIntImm::make(UInt(32), vpaddu_id)); + vcnt16_args.push_back(ir::UIntImm::make(UInt(32), vpaddlu_id)); vcnt16_args.push_back(ir::UIntImm::make(UInt(32), 1)); vcnt16_args.push_back(vcnt8); Expr vcnt16 = ir::Call::make(uint16_type, "llvm_intrin", vcnt16_args, Call::PureIntrinsic); @@ -81,7 +87,7 @@ Expr CodeGenARM::ARMPopcount(const Call *call) { // Accumulation 16->32bit Array vcnt32_args; - vcnt32_args.push_back(ir::UIntImm::make(UInt(32), vpaddu_id)); + vcnt32_args.push_back(ir::UIntImm::make(UInt(32), vpaddlu_id)); vcnt32_args.push_back(ir::UIntImm::make(UInt(32), 1)); vcnt32_args.push_back(vcnt16); Expr vcnt32 = ir::Call::make(uint32_type, "llvm_intrin", vcnt32_args, Call::PureIntrinsic); @@ -91,7 +97,7 @@ Expr CodeGenARM::ARMPopcount(const Call *call) { // Accumulation 32->64bit Array vcnt64_args; - vcnt64_args.push_back(ir::UIntImm::make(UInt(32), vpaddu_id)); + vcnt64_args.push_back(ir::UIntImm::make(UInt(32), vpaddlu_id)); vcnt64_args.push_back(ir::UIntImm::make(UInt(32), 1)); 
vcnt64_args.push_back(vcnt32); return ir::Call::make(call->type, "llvm_intrin", vcnt64_args, Call::PureIntrinsic); diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index d0c5b77cbfd5..bbf52512d3d5 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -366,7 +366,7 @@ llvm::Value* CodeGenLLVM::CreateBroadcast(llvm::Value* value, int lanes) { llvm::Value* CodeGenLLVM::CreateVecSlice(llvm::Value* vec, int begin, int extent) { int num_elems = static_cast(vec->getType()->getVectorNumElements()); if (extent == num_elems && begin == 0) return vec; - CHECK_LT(begin + extent, num_elems+1); + CHECK_LE(begin + extent, num_elems); std::vector indices; for (int i = 0; i < extent; ++i) { indices.push_back(begin + i); diff --git a/src/codegen/llvm/llvm_module.cc b/src/codegen/llvm/llvm_module.cc index c16af511febc..2bae52b194f5 100644 --- a/src/codegen/llvm/llvm_module.cc +++ b/src/codegen/llvm/llvm_module.cc @@ -117,11 +117,41 @@ class LLVMModuleNode final : public runtime::ModuleNode { } std::string GetSource(const std::string& format) final { + std::string fmt = runtime::GetFileFormat("", format); std::string type_str; - llvm::raw_string_ostream rso(type_str); - CHECK(mptr_ != nullptr); - mptr_->print(rso, nullptr); - return rso.str(); + llvm::SmallString<256> str; + llvm::raw_svector_ostream rso(str); + + if (fmt == "s" || fmt == "asm") { + #if TVM_LLVM_VERSION <= 60 + std::unique_ptr m = llvm::CloneModule(mptr_); + #else + std::unique_ptr m = llvm::CloneModule(*mptr_); + #endif + llvm::legacy::PassManager pass; + CHECK(tm_); + #if TVM_LLVM_VERSION <= 60 + CHECK(tm_->addPassesToEmitFile( + pass, rso, llvm::TargetMachine::CGFT_AssemblyFile) == 0) + << "Cannot emit target CGFT_AssemblyFile"; + #else + CHECK(tm_->addPassesToEmitFile( + pass, rso, nullptr, llvm::TargetMachine::CGFT_AssemblyFile) == 0) + << "Cannot emit target CGFT_AssemblyFile"; + #endif + pass.run(*m); + return rso.str().str(); + } else if (fmt == 
"" || fmt == "ll") { + std::string type_str; + llvm::raw_string_ostream rso(type_str); + CHECK(mptr_ != nullptr); + mptr_->print(rso, nullptr); + return rso.str(); + } else { + LOG(FATAL) << "Do not know how to get source code with format: " + << format << "\'"; + } + return ""; } void Init(const Array& funcs, std::string target) { From 2e56f092e09cc366dc64dcc7480ab8f75869ae56 Mon Sep 17 00:00:00 2001 From: Meghan Date: Wed, 6 Jun 2018 16:07:15 -0700 Subject: [PATCH 3/4] white space fixes --- src/codegen/llvm/codegen_arm.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/codegen/llvm/codegen_arm.cc b/src/codegen/llvm/codegen_arm.cc index 161d6db6e42d..18a0eb54e182 100644 --- a/src/codegen/llvm/codegen_arm.cc +++ b/src/codegen/llvm/codegen_arm.cc @@ -39,7 +39,6 @@ llvm::Value* CodeGenARM::CreateIntrinsic(const Call* op) { Expr CodeGenARM::ARMPopcount(const Call *call) { using namespace ir; const Expr& e = call->args[2]; - ::llvm::Intrinsic::ID ctpop_id = ::llvm::Intrinsic::ctpop; ::llvm::Intrinsic::ID vpaddlu_id = ::llvm::Intrinsic::arm_neon_vpaddlu; From 1d55a8e2a4cfa53f7b6a5a94cb84c73b768785ac Mon Sep 17 00:00:00 2001 From: Meghan Date: Tue, 12 Jun 2018 12:36:29 -0700 Subject: [PATCH 4/4] unit test fixes and arm codegentest --- src/codegen/llvm/codegen_llvm.cc | 6 ++--- tests/python/unittest/test_codegen_arm.py | 30 +++++++++++++++++++++++ 2 files changed, 33 insertions(+), 3 deletions(-) create mode 100644 tests/python/unittest/test_codegen_arm.py diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index bbf52512d3d5..329d7311c9f4 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -562,9 +562,9 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { sig_type.push_back(arg_value.back()->getType()); } } - llvm::Type *returnType = LLVMType(op->type); - if (returnType != sig_type[0]) { - sig_type.insert(sig_type.begin(), returnType); + llvm::Type *return_type = LLVMType(op->type); + if 
(sig_type.size() > 0 && return_type != sig_type[0]) { + sig_type.insert(sig_type.begin(), return_type); } llvm::Function* f = llvm::Intrinsic::getDeclaration( module_.get(), id, sig_type); diff --git a/tests/python/unittest/test_codegen_arm.py b/tests/python/unittest/test_codegen_arm.py new file mode 100644 index 000000000000..24240db72b26 --- /dev/null +++ b/tests/python/unittest/test_codegen_arm.py @@ -0,0 +1,30 @@ +import tvm +import re +import os +import ctypes + +def test_popcount(): + target = 'llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon' + + def check_correct_assembly(type, elements, counts): + n = tvm.convert(elements) + A = tvm.placeholder(n, dtype=type, name='A') + B = tvm.compute(A.shape, lambda i: tvm.popcount(A[i]), name='B') + s = tvm.create_schedule(B.op) + s[B].vectorize(s[B].op.axis[0]) + f = tvm.build(s, [A, B], target) + + # Verify we see the correct number of vpaddl and vcnt instructions in the assembly + assembly = f.get_source('asm') + matches = re.findall("vpaddl", assembly) + assert (len(matches) == counts) + matches = re.findall("vcnt", assembly) + assert (len(matches) == 1) + check_correct_assembly('uint16', 8, 1) + check_correct_assembly('uint16', 4, 1) + check_correct_assembly('uint32', 4, 2) + check_correct_assembly('uint32', 2, 2) + check_correct_assembly('uint64', 2, 3) + +if __name__ == "__main__": + test_popcount()