Skip to content

Commit

Permalink
Update on "[executorch][flat_tensor] implement load into and dont hol…
Browse files Browse the repository at this point in the history
…d onto the segment"

1. Implement load_into in FlatTensorDataMap
2. Do not persist 'data_ro' in the FlatTensorDataMap. From `get_data`, return the FreeableBuffer given by the data loader.

TODO: add test for load_into.

Differential Revision: [D69148652](https://our.internmc.facebook.com/intern/diff/D69148652/)

[ghstack-poisoned]
  • Loading branch information
lucylq committed Feb 20, 2025
2 parents 6a8265d + d5f15d2 commit 0863b58
Show file tree
Hide file tree
Showing 235 changed files with 4,791 additions and 1,694 deletions.
3 changes: 0 additions & 3 deletions .ci/scripts/build_llama_android.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
PYTHON_EXECUTABLE=python3
fi
which "${PYTHON_EXECUTABLE}"
CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"

install_executorch_and_backend_lib() {
echo "Installing executorch and xnnpack backend"
Expand All @@ -28,7 +27,6 @@ install_executorch_and_backend_lib() {
-DANDROID_ABI="${ANDROID_ABI}" \
-DCMAKE_INSTALL_PREFIX=cmake-android-out \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
Expand All @@ -54,7 +52,6 @@ build_llama_runner() {
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
-Bcmake-android-out/examples/models/llama examples/models/llama

cmake --build cmake-android-out/examples/models/llama -j4 --config Release
Expand Down
1 change: 0 additions & 1 deletion .ci/scripts/test_llama.sh
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,6 @@ cmake_install_executorch_libraries() {
rm -rf cmake-out
retry cmake \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" \
-DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
Expand Down
7 changes: 2 additions & 5 deletions .ci/scripts/test_llava.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ NPROC=8
if hash nproc &> /dev/null; then NPROC=$(nproc); fi

python_lib=$($PYTHON_EXECUTABLE -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')
CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
EXECUTORCH_COMMON_CMAKE_ARGS=" \
-DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \
Expand All @@ -48,7 +47,6 @@ EXECUTORCH_COMMON_CMAKE_ARGS=" \
cmake_install_executorch_libraries() {
cmake \
${EXECUTORCH_COMMON_CMAKE_ARGS} \
"-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}" \
-B${BUILD_DIR} .

cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE}
Expand All @@ -59,7 +57,6 @@ cmake_install_executorch_libraries_for_android() {
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
${EXECUTORCH_COMMON_CMAKE_ARGS} \
"-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}" \
-B${BUILD_DIR} .

cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE}
Expand All @@ -80,7 +77,7 @@ cmake_build_llava_runner() {

cmake \
${LLAVA_COMMON_CMAKE_ARGS} \
-DCMAKE_PREFIX_PATH="$python_lib;${CMAKE_PREFIX_PATH}" \
-DCMAKE_PREFIX_PATH="$python_lib" \
-B${BUILD_DIR}/${dir} \
${dir}

Expand All @@ -96,7 +93,7 @@ cmake_build_llava_runner_for_android() {
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
${LLAVA_COMMON_CMAKE_ARGS} \
-DCMAKE_PREFIX_PATH="$python_lib;${CMAKE_PREFIX_PATH}" \
-DCMAKE_PREFIX_PATH="$python_lib" \
-DLLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE=ON \
-B${BUILD_DIR}/${dir} \
${dir}
Expand Down
4 changes: 0 additions & 4 deletions .ci/scripts/test_model.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,10 @@ prepare_artifacts_upload() {

build_cmake_executor_runner() {
echo "Building executor_runner"
CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
rm -rf ${CMAKE_OUTPUT_DIR}
cmake -DCMAKE_BUILD_TYPE=Debug \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
-DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
-B${CMAKE_OUTPUT_DIR} .

cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug
Expand Down Expand Up @@ -100,14 +98,12 @@ test_model() {

build_cmake_xnn_executor_runner() {
echo "Building xnn_executor_runner"
CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"

(rm -rf ${CMAKE_OUTPUT_DIR} \
&& mkdir ${CMAKE_OUTPUT_DIR} \
&& cd ${CMAKE_OUTPUT_DIR} \
&& retry cmake -DCMAKE_BUILD_TYPE=Release \
-DEXECUTORCH_BUILD_XNNPACK=ON \
-DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)

cmake --build ${CMAKE_OUTPUT_DIR} -j4
Expand Down
4 changes: 0 additions & 4 deletions .ci/scripts/test_phi_3_mini.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,8 @@ NPROC=8
if hash nproc &> /dev/null; then NPROC=$(nproc); fi

cmake_install_executorch_libraries() {
CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
cmake -DPYTHON_EXECUTABLE=python \
-DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
-DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
-DEXECUTORCH_ENABLE_LOGGING=1 \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
Expand All @@ -41,10 +39,8 @@ cmake_install_executorch_libraries() {
}

cmake_build_phi_3_mini() {
CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
cmake -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
-DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
-DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
Expand Down
3 changes: 0 additions & 3 deletions .ci/scripts/test_quantized_aot_lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,10 @@ CMAKE_OUTPUT_DIR=cmake-out

build_cmake_quantized_aot_lib() {
echo "Building quantized aot lib"
SITE_PACKAGES="$(${PYTHON_EXECUTABLE} -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch"
(rm -rf ${CMAKE_OUTPUT_DIR} \
&& mkdir ${CMAKE_OUTPUT_DIR} \
&& cd ${CMAKE_OUTPUT_DIR} \
&& retry cmake -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON \
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)

Expand Down
1 change: 0 additions & 1 deletion .ci/scripts/utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,6 @@ cmake_install_executorch_lib() {
clean_executorch_install_folders
retry cmake -DBUCK2="$BUCK" \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_PREFIX_PATH="$($PYTHON_EXECUTABLE -c 'import torch as _; print(_.__path__[0])')" \
-DCMAKE_BUILD_TYPE=Release \
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
-Bcmake-out .
Expand Down
2 changes: 0 additions & 2 deletions .github/workflows/trunk.yml
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,6 @@ jobs:
rm -rf cmake-out
cmake \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_PREFIX_PATH="$(python -c 'import torch as _; print(_.__path__[0])')" \
-DCMAKE_BUILD_TYPE=Release \
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
Expand All @@ -412,7 +411,6 @@ jobs:
cmake \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_PREFIX_PATH="$(python -c 'import torch as _; print(_.__path__[0])')" \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
Expand Down
17 changes: 12 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,10 @@ option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR "Build the Flat Tensor extension"
OFF
)

option(EXECUTORCH_BUILD_EXTENSION_LLM "Build the LLM extension"
OFF
)

option(EXECUTORCH_BUILD_EXTENSION_MODULE "Build the Module extension" OFF)

option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL "Build the Runner Util extension"
Expand Down Expand Up @@ -245,7 +249,7 @@ cmake_dependent_option(
)

if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
set(EXECUTORCH_BUILF_EXTENSION_DATA_LOADER ON)
set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
endif()

if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
Expand Down Expand Up @@ -348,6 +352,7 @@ if(EXECUTORCH_BUILD_PTHREADPOOL)
endif()

if(EXECUTORCH_BUILD_TESTS)
set(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
include(CTest)
endif()

Expand All @@ -373,7 +378,7 @@ if(NOT "${_repo_dir_name}" STREQUAL "executorch")
"fix for this restriction."
)
endif()
set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/runtime/core/portable_type)
set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/runtime/core/portable_type/c10)

#
# The `_<target>_srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}.
Expand Down Expand Up @@ -717,6 +722,10 @@ if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor/serialize)
endif()

if(EXECUTORCH_BUILD_EXTENSION_LLM)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/tokenizer)
endif()

if(EXECUTORCH_BUILD_EXTENSION_MODULE)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
endif()
Expand Down Expand Up @@ -752,9 +761,7 @@ if(EXECUTORCH_BUILD_PYBIND)
endif()

# find pytorch lib, to allow pybind to take at::Tensor as input/output
if(NOT TARGET torch)
find_package(Torch CONFIG REQUIRED)
endif()
find_package_torch()
find_library(
TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib"
)
Expand Down
2 changes: 1 addition & 1 deletion backends/apple/coreml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ target_include_directories(
coremldelegate PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/util
)
target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/..)
target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/runtime/core/portable_type)
target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)
target_compile_definitions(coremldelegate PRIVATE C10_USING_CUSTOM_GENERATED_MACROS)
target_link_libraries(coremldelegate PRIVATE executorch_core)

Expand Down
16 changes: 15 additions & 1 deletion backends/apple/coreml/partition/coreml_partitioner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Please refer to the license found in the LICENSE file in the root directory of the source tree.

import logging
from typing import List, Optional
from typing import Callable, List, Optional, Tuple

import coremltools as ct

Expand Down Expand Up @@ -104,3 +104,17 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
return PartitionResult(
tagged_exported_program=exported_program, partition_tags=partition_tags
)

def ops_to_not_decompose(
    self, ep: ExportedProgram
) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
    """Collect the operator overloads in ``ep`` accepted by the CoreML support check.

    Every node of ``ep.graph`` is visited. A node contributes its target when
    it is a ``call_function`` node, its target is a ``torch._ops.OpOverload``,
    and ``OperatorsSupportedForCoreMLBackend.is_node_supported`` reports it
    as supported.

    Args:
        ep: The exported program whose graph nodes are scanned.

    Returns:
        A ``(targets, None)`` pair. ``targets`` has one entry per matching
        node, in graph order, so an overload used by several nodes is listed
        several times. The second slot (the optional per-node filter callable)
        is always ``None``.
    """
    support_checker = OperatorsSupportedForCoreMLBackend()

    def _is_preserved(candidate):
        # Cheap structural checks come first so the support checker is only
        # consulted for call_function nodes that target an OpOverload.
        if candidate.op != "call_function":
            return False
        if not isinstance(candidate.target, torch._ops.OpOverload):
            return False
        return support_checker.is_node_supported(None, candidate)

    preserved_targets = [
        graph_node.target for graph_node in ep.graph.nodes if _is_preserved(graph_node)
    ]
    return preserved_targets, None
Original file line number Diff line number Diff line change
Expand Up @@ -922,7 +922,7 @@
"$(SRCROOT)/../kvstore",
"$(SRCROOT)/../inmemoryfs",
"$(SRCROOT)/../include",
"$(SRCROOT)/../include/executorch/runtime/core/portable_type",
"$(SRCROOT)/../include/executorch/runtime/core/portable_type/c10",
"$(SRCROOT)/../sdk",
"$(SRCROOT)/../util",
"$(SRCROOT)/../../third-party/nlohmann_json/single_include",
Expand Down Expand Up @@ -954,7 +954,7 @@
"$(SRCROOT)/../kvstore",
"$(SRCROOT)/../inmemoryfs",
"$(SRCROOT)/../include",
"$(SRCROOT)/../include/executorch/runtime/core/portable_type",
"$(SRCROOT)/../include/executorch/runtime/core/portable_type/c10",
"$(SRCROOT)/../sdk",
"$(SRCROOT)/../util",
"$(SRCROOT)/../../third-party/nlohmann_json/single_include",
Expand Down
46 changes: 46 additions & 0 deletions backends/apple/coreml/test/test_coreml_partitioner.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from executorch.backends.apple.coreml.compiler import CoreMLBackend
from executorch.backends.apple.coreml.partition import CoreMLPartitioner
from executorch.exir.backend.utils import format_delegated_graph


class TestCoreMLPartitioner(unittest.TestCase):
Expand Down Expand Up @@ -79,6 +80,50 @@ def test_vit_skip_conv(self):
"getitem",
]

def test_ops_to_not_decompose(self):
    """Check that SDPA is preserved only on the transform-and-lower path.

    A single-op model calling ``scaled_dot_product_attention`` is exported and
    lowered with ``CoreMLPartitioner`` in two ways:

    * ``to_edge_transform_and_lower``: the op is expected to appear in the
      formatted delegated graph.
    * ``to_edge`` followed by ``to_backend``: the op is expected to be absent
      from the formatted delegated graph.
    """

    class Model(torch.nn.Module):
        def forward(self, q, k, v, mask):
            return torch.ops.aten.scaled_dot_product_attention.default(
                q, k, v, attn_mask=mask
            )

    model = Model()
    model.eval()

    batch_size = 1
    n_heads = 12
    seq_len = 1
    max_seq_length = 32
    embedding_dim = 16
    q = torch.randn(batch_size, n_heads, seq_len, embedding_dim)
    k = torch.randn(batch_size, n_heads, max_seq_length, embedding_dim)
    v = torch.randn(batch_size, n_heads, max_seq_length, embedding_dim)
    mask = torch.randn(seq_len, max_seq_length)
    example_inputs = (q, k, v, mask)
    ep = torch.export.export(model, example_inputs)
    coreml_partitioner = CoreMLPartitioner()

    # Name under which the edge-dialect SDPA op shows up in the formatted graph.
    sdpa_edge_op = (
        "executorch.exir.dialects.edge._ops.aten.scaled_dot_product_attention.default"
    )

    # Using to_edge_transform_and_lower, we expect SDPA will be preserved and show up in delegated graph
    edge_program_manager = executorch.exir.to_edge_transform_and_lower(
        ep, partitioner=[coreml_partitioner]
    )
    # assertIn reports both operands on failure, unlike assertTrue(x in y).
    self.assertIn(
        sdpa_edge_op,
        format_delegated_graph(
            edge_program_manager.exported_program().graph_module
        ),
    )

    # Using to_edge flow, we expect SDPA will be decomposed and not show up in delegated graph
    # to_backend hands back the delegated manager, so its return value (not the
    # manager it was called on) is the one whose graph must be inspected.
    edge_program_manager2 = executorch.exir.to_edge(ep).to_backend(
        coreml_partitioner
    )
    self.assertNotIn(
        sdpa_edge_op,
        format_delegated_graph(
            edge_program_manager2.exported_program().graph_module
        ),
    )

def test_buffer(self):
embedding_dim = 3
max_seq_len = 2
Expand Down Expand Up @@ -129,4 +174,5 @@ def forward(self, q, k_val, input_pos):
test_runner = TestCoreMLPartitioner()
test_runner.test_add_sub_skip_mm()
test_runner.test_vit_skip_conv()
test_runner.test_ops_to_not_decompose()
test_runner.test_buffer()
6 changes: 3 additions & 3 deletions backends/arm/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2023 Arm Limited and/or its affiliates.
# Copyright 2023, 2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
Expand All @@ -14,15 +14,15 @@ endif()

include(${EXECUTORCH_ROOT}/build/Utils.cmake)

set(_common_include_directories ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type)
set(_common_include_directories ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)
add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS)

# Third-party folder and Ethos-U driver include
set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
set(DRIVER_ETHOSU_INCLUDE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include")
include_directories(${DRIVER_ETHOSU_INCLUDE_DIR})

set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp
set(_arm_baremetal_sources backends/arm/runtime/EthosUBackend.cpp
backends/arm/runtime/VelaBinStream.cpp
)
list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/")
Expand Down
Loading

0 comments on commit 0863b58

Please sign in to comment.