rebase #1

Merged
merged 9 commits on Jan 21, 2022
Changes from all commits
2 changes: 1 addition & 1 deletion .github/workflows/cc_bot.yml
@@ -43,4 +43,4 @@ jobs:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
set -eux
-python tests/scripts/github_cc_reviewers.py
+python tests/scripts/github_cc_reviewers.py || echo step failed
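
The appended "|| echo step failed" makes this step non-fatal: the script runs under `set -eux`, so a nonzero exit from github_cc_reviewers.py would otherwise abort the job, while `echo` always exits 0. A minimal Python sketch of the same pattern (the wrapper itself is hypothetical; only the script path comes from the diff):

import subprocess
import sys

# Run the CC-reviewers script; mirror `|| echo step failed` by logging
# the failure and exiting 0 so the surrounding CI job keeps going.
result = subprocess.run([sys.executable, "tests/scripts/github_cc_reviewers.py"])
if result.returncode != 0:
    print("step failed")
sys.exit(0)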
5 changes: 1 addition & 4 deletions docker/Dockerfile.ci_qemu
@@ -42,10 +42,7 @@ COPY install/ubuntu_install_rust.sh /install/ubuntu_install_rust.sh
RUN bash /install/ubuntu_install_rust.sh
ENV RUSTUP_HOME /opt/rust
ENV CARGO_HOME /opt/rust

-# wasmtime
-COPY install/ubuntu_install_wasmtime.sh /install/ubuntu_install_wasmtime.sh
-RUN bash /install/ubuntu_install_wasmtime.sh
ENV PATH $PATH:$CARGO_HOME/bin

# AutoTVM deps
COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
2 changes: 1 addition & 1 deletion docs/contribute/git_howto.rst
@@ -24,7 +24,7 @@ Git Usage Tips
Here are some tips for git workflow.

How to resolve a conflict with ``main``
--------------------------------------
+---------------------------------------

- First rebase to most recent main

156 changes: 144 additions & 12 deletions python/tvm/contrib/ethosu/cascader/device_config.py
@@ -16,7 +16,7 @@
# under the License.
# pylint: disable=invalid-name
"""Device config class to hold information about the target hardware"""
-from typing import Tuple, List, Dict
+from typing import Tuple, List, Dict, Optional
from functools import reduce

import math
@@ -332,6 +332,7 @@ def _get_input_block(

def get_kernel_steps(
self,
op_type: str,
dilated_kernel_h: int,
dilated_kernel_w: int,
ifm_dtype: str,
@@ -341,6 +342,9 @@

Parameters
----------
op_type : str
The NPU primitive operator
"ethosu_pooling"
dilated_kernel_h: int
Height of dilated kernel
dilated_kernel_w: int
@@ -355,18 +359,23 @@
List[int]
List where each entry contains the amount of elements in one of the subkernels
"""
if op_type == "ethosu_binary_elementwise":
return [1]

subkernels = self._get_subkernels(dilated_kernel_h, dilated_kernel_w)

# Determine the number of kernel steps per subkernel
kernel_steps = []
for y, x in subkernels:
subkernel_elements = x * y
-if is_partkernel:
-# Part-kernel-first traversal
+if op_type == "ethosu_conv2d" and is_partkernel:
+# Part-kernel-first traversal conv2d
divisor = 4 if ifm_dtype == "int8" else 2
kernel_steps.append(int(_round_up_div(subkernel_elements, divisor)))
elif op_type == "ethosu_depthwise_conv2d":
kernel_steps.append(int(_round_up_div(subkernel_elements, 4)))
else:
-# Depth-first traversal
+# Depth-first traversal conv2d or pooling
kernel_steps.append(int(subkernel_elements))

return kernel_steps
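
# A worked sketch (not part of the PR) of the dispatch above, assuming a
# 3x3 subkernel (9 elements): binary elementwise ops take a single step;
# part-kernel-first conv2d divides the element count by 4 (int8) or 2
# (int16); depthwise conv2d always divides by 4; depth-first conv2d and
# pooling take one step per element.
import math

def kernel_steps_sketch(op_type, elements, ifm_dtype="int8", is_partkernel=False):
    if op_type == "ethosu_binary_elementwise":
        return 1
    if op_type == "ethosu_conv2d" and is_partkernel:
        return math.ceil(elements / (4 if ifm_dtype == "int8" else 2))
    if op_type == "ethosu_depthwise_conv2d":
        return math.ceil(elements / 4)
    return elements

assert kernel_steps_sketch("ethosu_binary_elementwise", 9) == 1
assert kernel_steps_sketch("ethosu_conv2d", 9, is_partkernel=True) == 3
assert kernel_steps_sketch("ethosu_depthwise_conv2d", 9) == 3
assert kernel_steps_sketch("ethosu_pooling", 9) == 9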
@@ -430,11 +439,133 @@ def is_partkernel(

return part_kernel_first_utilization > depth_first_utilization or ifm_channels <= 8

def get_elementwise_block_config(
self,
ifm_propagator: Propagator,
ifm2_propagator: Optional[Propagator],
op_attrs: Dict,
ofm_shape: List[int],
output_layout: str,
input_layout: str,
input2_layout: Optional[str],
ifm_dtype: str,
ofm_dtype: str,
) -> List[BlockConfig]:
"""Get a suitable block config for an elementwise operator

Parameters
----------
ifm_propagator: Propagator,
The propagator containing the data dependencies between input and output
ifm2_propagator: Propagator,
The propagator containing the data dependencies between input2 and output
op_attrs: Dict,
Dictionary containing operator attributes
ofm_shape: List[int],
Shape of the output tensor
output_layout: str,
The layout of the Output Feature Map tensor. Can be "NHWC" or "NHCWB16".
input_layout: str,
The layout of the Input Feature Map tensor. Can be "NHWC" or "NHCWB16".
input2_layout: str,
The layout of the Input2 Feature Map tensor. Can be "NHWC" or "NHCWB16".
ifm_dtype: str,
Datatype of the Input Feature Map tensor (IFM)
ofm_dtype: str,
Datatype of the Output Feature Map tensor (OFM)

Returns
----------
List[BlockConfig]
List containing a single suitable block config
"""
block_config = []
output_shape = [int(a) for a in ofm_shape]

op_type = op_attrs.get("op")
op_str = op_attrs.get("op_str")
activation = op_attrs.get("activation", "NONE")

input_bytewidth = 1 if ifm_dtype == "int8" else 2 if ifm_dtype == "int16" else 4
banks_available = self._total_banks - self._reserved_banks
if activation == "LUT" and not self._lut_reserved:
banks_available -= 2

# Split the block in half until it fits into SHRAM
if output_layout == "NHCWB16":
split_order = (a for a in [1, 3, 2])
output_block = [
output_shape[0],
min(output_shape[1], self._max_block_shape.height),
min(output_shape[2] * output_shape[4], self._max_block_shape.depth),
min(output_shape[3], self._max_block_shape.width),
16,
]
else:
split_order = (a for a in [1, 2, 3])
output_block = [
output_shape[0],
min(output_shape[1], self._max_block_shape.height),
min(output_shape[2], self._max_block_shape.width),
min(output_shape[3], self._max_block_shape.depth),
]
split_axis = next(split_order)
while True:
# Create stripe config for output block
offset = [0] * len(output_block)
stripes = [1] * len(output_block)
order = [1, 2, 4, 3, 0] if output_layout == "NHCWB16" else [1, 2, 3, 4]
output_stripe_config = StripeConfig(
output_block, output_block, output_block, order, stripes, offset
)

# Propagate the output to obtain the two input blocks
input_block = _Shape(ifm_propagator.propagate(output_stripe_config).shape, input_layout)
if ifm2_propagator:
input2_block = _Shape(
ifm2_propagator.propagate(output_stripe_config).shape, input2_layout
)
else:
# Unary elementwise
input2_block = _Shape([0, 0, 0, 0])

input_block.round_up(self._input_micro_block)
input2_block.round_up(self._input_micro_block)

# Banks required for input block
input_bytes = input_block.area() * self._align(input_block.depth * input_bytewidth, 8)
input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
input_banks = _round_up(input_banks, self._input_granularity)

# Banks required for input2 block
input2_bytes = input2_block.area() * self._align(
input2_block.depth * input_bytewidth, 8
)
input2_banks = _round_up_div(input2_bytes, self._bank_size_bytes) * 2
input2_banks = _round_up(input2_banks, self._input_granularity)

# Check whether or not both IFMs fit into SHRAM
if (input_banks + input2_banks) <= banks_available:
output_cycles = self._get_output_cycles(
op_type, op_str, ifm_dtype, ofm_dtype, activation
)
output_cycles *= reduce(lambda a, b: a * b, output_block, 1)
output_cycles = int(math.ceil(output_cycles))
block_config.append(BlockConfig(output_block, 0, output_cycles))
break

if output_block[split_axis] == 1:
split_axis = next(split_order)

output_block[split_axis] = _round_up_div(output_block[split_axis], 2)

return block_config
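
# A condensed sketch (hypothetical numbers, not part of the PR) of the
# halving search above: starting from the full output block, halve one
# axis at a time in split order until the block's bank requirement fits
# into the available SHRAM banks.
def shrink_block_sketch(block, banks_needed, banks_available, split_order):
    axis_iter = iter(split_order)
    axis = next(axis_iter)
    while banks_needed(block) > banks_available:
        if block[axis] == 1:
            axis = next(axis_iter)
        block[axis] = -(-block[axis] // 2)  # same as _round_up_div(x, 2)
    return block

# e.g. a toy NHWC block shrinks along H until its stand-in bank cost fits
print(shrink_block_sketch(
    [1, 64, 64, 32],
    lambda b: b[1] * b[2] * b[3] // 512,  # stand-in for the bank arithmetic
    banks_available=48,
    split_order=(1, 2, 3),
))  # -> [1, 8, 64, 32]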

def get_valid_block_configs(
self,
ifm_propagator: Propagator,
op_attrs: Dict,
-output_shape: List[int],
+ofm_shape: List[int],
ofm_channels: int,
ifm_channels: int,
output_layout: str,
@@ -452,7 +583,7 @@ def get_valid_block_configs(
The propagator containing the data dependencies between input and output
op_attrs: Dict,
Dictionary containing operator attributes
-output_shape: List[int],
+ofm_shape: List[int],
Shape of the output tensor
ofm_channels: int,
Number of output channels
@@ -487,9 +618,9 @@

subkernel_transform = ifm_propagator.transform
if output_layout == "NHCWB16":
-output_shape = _Shape([1, output_shape[1], output_shape[3], ofm_channels])
+output_shape = _Shape([1, ofm_shape[1], ofm_shape[3], ofm_channels])
else:
-output_shape = _Shape(output_shape)
+output_shape = _Shape(ofm_shape)

if input_layout == "NHCWB16":
subkernel_transform[1][-1] = min(
@@ -571,6 +702,7 @@ def get_valid_block_configs(

input_block_shape = _Shape(input_block.shape, input_layout)
input_block_shape.round_up(self._input_micro_block)

output_block_shape = _Shape(output_block, output_layout)

if op_type == "ethosu_conv2d":
@@ -592,12 +724,11 @@
acc_banks = _round_up(acc_banks, self._accumulator_granularity[acc_bytewidth])

if (input_banks + acc_banks) <= banks_available:

output_cycles = self._get_output_cycles(
op_type, op_str, ifm_dtype, ofm_dtype, activation
)
output_cycles *= reduce(lambda a, b: a * b, output_block, 1)
-output_cycles = int(_round_up(output_cycles, 1))
+output_cycles = int(math.ceil(output_cycles))
compute_cycles = self._estimate_compute_cycles_per_block(
op_type,
output_block_shape,
@@ -634,16 +765,17 @@ def _estimate_compute_cycles_per_block(
num_quantum_z = _round_up_div(block_shape.depth, self._micro_block.depth)
num_quantum_xy = num_quantum_x * num_quantum_y

-kernel_steps = self.get_kernel_steps(kernel_h, kernel_w, ifm_dtype, is_partkernel)
+kernel_steps = self.get_kernel_steps(op_type, kernel_h, kernel_w, ifm_dtype, is_partkernel)

wd_cycles = self._get_weight_decoder_cycles(op_type)
delay_cycles = self._get_delay_cycles(op_type, ifm_dtype)
cycle_quantum = 4

compute_cycles = 0
for subkernel_steps in kernel_steps:
subkernel_cycles = 1 if op_type == "ethosu_pooling" else subkernel_steps
compute_cycles += (
-max(wd_cycles, cycle_quantum * num_quantum_xy) * subkernel_steps * num_quantum_z
+max(wd_cycles, cycle_quantum * num_quantum_xy) * subkernel_cycles * num_quantum_z
)

if num_quantum_xy == 1:
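The `subkernel_cycles` change above means pooling is now costed at one pass per subkernel, while convolutions still pay per kernel step. A short sketch with assumed values (wd_cycles and the quantum counts are illustrative, not hardware figures):

cycle_quantum = 4
wd_cycles, num_quantum_xy, num_quantum_z = 32, 6, 2

def block_cycles_sketch(op_type, kernel_steps):
    compute_cycles = 0
    for subkernel_steps in kernel_steps:
        subkernel_cycles = 1 if op_type == "ethosu_pooling" else subkernel_steps
        compute_cycles += (
            max(wd_cycles, cycle_quantum * num_quantum_xy) * subkernel_cycles * num_quantum_z
        )
    return compute_cycles

print(block_cycles_sketch("ethosu_conv2d", [3, 3]))   # 2 * (32 * 3 * 2) = 384
print(block_cycles_sketch("ethosu_pooling", [3, 3]))  # 2 * (32 * 1 * 2) = 128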
34 changes: 21 additions & 13 deletions python/tvm/relay/backend/contrib/ethosu/codegen.py
@@ -18,13 +18,13 @@

import tvm
from tvm import relay
+from tvm import ir
from tvm.relay.backend.contrib.ethosu.tir.compiler import lower_to_tir
from tvm.relay.backend.contrib.ethosu.tir.scheduler import copy_constants
from tvm.relay.backend.contrib.ethosu.legalize import LegalizeEthosU
from tvm.relay.backend.contrib.ethosu import tir_to_cs_translator
from tvm.relay.backend.contrib.ethosu import util
from tvm.relay.expr_functor import ExprMutator
-from tvm.ir.transform import Pass

# pylint: disable=unused-import
from tvm.relay.backend.contrib.ethosu.op import op_attrs
@@ -109,13 +109,11 @@ def visit_call(self, call: tvm.relay.expr.Call) -> tvm.relay.expr.Call:
return new_call


-@relay.transform.function_pass(opt_level=1, name="LUTsOptimizer")
-class LUTsOptimizer(Pass):
+@ir.transform.module_pass(opt_level=1, name="LUTsOptimizer")
+class LUTsOptimizer:
"""Register LUTsOptimizer as a relay pass."""

-def transform_function(
-self, func: tvm.relay.function.Function, mod: tvm.IRModule, _
-) -> tvm.IRModule:
+def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.IRModule:
"""Visit relay nodes in the given module.

Parameters
@@ -131,7 +129,13 @@ def transform_function(
New module with optimized LUTs.
"""
assert len(mod.functions.items()) == 1, "Module can only contain one function."
-return OptimizeLUTs().visit(func)
+global_var, func = mod.functions.items()[0]
+optimized_func = OptimizeLUTs().visit(func)
+mod.update_func(global_var, optimized_func)
+return mod

def __call__(self, *args, **kwargs):
pass


class LayoutOptimization(ExprMutator):
@@ -247,19 +251,23 @@ def visit_call(self, call: tvm.relay.expr.Call) -> tvm.relay.expr.Call:
return super().visit_call(call)


-@relay.transform.function_pass(opt_level=1, name="LayoutOptimizer")
-class LayoutOptimizer(Pass):
+@ir.transform.module_pass(opt_level=1, name="LayoutOptimizer")
+class LayoutOptimizer:
"""Register LayoutOptimizer as a Relay pass."""

-def transform_function(
-self, func: tvm.relay.function.Function, mod: tvm.IRModule, _
-) -> tvm.IRModule:
+def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.IRModule:
"""A pass to optimize the layout of NPU operations. If both the
producer and consumer of a tensor are NPU operators, then the
layout is converted from NHWC to NHCWB16 as this is the layout NPU
uses internally."""
assert len(mod.functions.items()) == 1, "Module can only contain one function."
-return LayoutOptimization().visit(func)
+global_var, func = mod.functions.items()[0]
+optimized_func = LayoutOptimization().visit(func)
+mod.update_func(global_var, optimized_func)
+return mod

def __call__(self, *args, **kwargs):
pass


@tvm._ffi.register_func("relay.ext.ethos-u.constant_updater")
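Both passes now use `ir.transform.module_pass` instead of `relay.transform.function_pass`, so they receive and return a whole IRModule and look up the module's single function themselves. A minimal usage sketch (the toy module is illustrative; the real passes expect NPU-partitioned functions):

import tvm
from tvm import relay

x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8")
mod = tvm.IRModule.from_expr(relay.Function([x], relay.abs(x)))

# Instantiating a class decorated with @ir.transform.module_pass yields a
# pass object, which is applied by calling it on the module.
mod = LUTsOptimizer()(mod)
mod = LayoutOptimizer()(mod)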
2 changes: 1 addition & 1 deletion python/tvm/relay/backend/contrib/ethosu/legalize.py
@@ -230,7 +230,7 @@ def __call__(self, *args, **kwargs):

def sigmoid_calc_func(x: float) -> float:
"""Function to calculate the values for sigmoid"""
-# Thse limits are inherited from TFLite
+# These limits are inherited from TFLite
upper_limit = 8.0
lower_limit = -8.0

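These limits bound the input range over which the sigmoid LUT is evaluated. A hypothetical sketch of how such a calc function is typically completed (only the two limits appear in the diff; the clamping body here is an assumption):

import math

def sigmoid_calc_func_sketch(x: float) -> float:
    """Clamped sigmoid as might be used when building a LUT."""
    upper_limit = 8.0   # inherited from TFLite, per the comment above
    lower_limit = -8.0
    if x <= lower_limit:
        return 0.0
    if x >= upper_limit:
        return 1.0
    return 1.0 / (1.0 + math.exp(-x))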