diff --git a/.ci/scripts/build_llama_android.sh b/.ci/scripts/build_llama_android.sh
index 6b8f851d77..d37c65aa8e 100644
--- a/.ci/scripts/build_llama_android.sh
+++ b/.ci/scripts/build_llama_android.sh
@@ -10,6 +10,12 @@ set -exu
 # shellcheck source=/dev/null
 source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
 
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+which "${PYTHON_EXECUTABLE}"
+CMAKE_PREFIX_PATH="$("${PYTHON_EXECUTABLE}" -c 'import torch as _; print(_.__path__[0])')"
+
 install_executorch_and_backend_lib() {
   echo "Installing executorch and xnnpack backend"
   clean_executorch_install_folders
@@ -22,6 +28,7 @@ install_executorch_and_backend_lib() {
     -DANDROID_ABI="${ANDROID_ABI}" \
     -DCMAKE_INSTALL_PREFIX=cmake-android-out \
     -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
@@ -47,6 +54,7 @@ build_llama_runner() {
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
+    -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
     -Bcmake-android-out/examples/models/llama examples/models/llama
 
   cmake --build cmake-android-out/examples/models/llama -j4 --config Release
diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh
index 9bb881ce8e..9735e26798 100644
--- a/.ci/scripts/test_llama.sh
+++ b/.ci/scripts/test_llama.sh
@@ -154,6 +154,7 @@ cmake_install_executorch_libraries() {
   rm -rf cmake-out
   retry cmake \
     -DCMAKE_INSTALL_PREFIX=cmake-out \
+    -DCMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" \
    -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh
index a9e1313756..c511942be9 100644
--- a/.ci/scripts/test_llava.sh
+++ b/.ci/scripts/test_llava.sh
@@ -30,9 +30,11 @@ fi
 NPROC=8
 if hash nproc &> /dev/null; then NPROC=$(nproc); fi
 
+python_lib=$($PYTHON_EXECUTABLE -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')
+CMAKE_PREFIX_PATH="$($PYTHON_EXECUTABLE -c 'import torch as _; print(_.__path__[0])')"
 EXECUTORCH_COMMON_CMAKE_ARGS=" \
   -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
-  -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \
+  -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \
   -DEXECUTORCH_ENABLE_LOGGING=ON \
   -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
   -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
@@ -46,6 +48,7 @@ EXECUTORCH_COMMON_CMAKE_ARGS=" \
 cmake_install_executorch_libraries() {
   cmake \
     ${EXECUTORCH_COMMON_CMAKE_ARGS} \
+    "-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}" \
     -B${BUILD_DIR} .
 
   cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE}
@@ -56,6 +59,7 @@ cmake_install_executorch_libraries_for_android() {
     -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
     -DANDROID_ABI=arm64-v8a \
     ${EXECUTORCH_COMMON_CMAKE_ARGS} \
+    "-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}" \
     -B${BUILD_DIR} .
 
   cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE}
@@ -76,7 +80,7 @@ cmake_build_llava_runner() {
   cmake \
     ${LLAVA_COMMON_CMAKE_ARGS} \
-    -DCMAKE_PREFIX_PATH="$python_lib" \
+    -DCMAKE_PREFIX_PATH="$python_lib;${CMAKE_PREFIX_PATH}" \
     -B${BUILD_DIR}/${dir} \
     ${dir}
@@ -92,7 +96,7 @@ cmake_build_llava_runner_for_android() {
     -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
     -DANDROID_ABI=arm64-v8a \
     ${LLAVA_COMMON_CMAKE_ARGS} \
-    -DCMAKE_PREFIX_PATH="$python_lib" \
+    -DCMAKE_PREFIX_PATH="$python_lib;${CMAKE_PREFIX_PATH}" \
     -DLLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE=ON \
     -B${BUILD_DIR}/${dir} \
     ${dir}
diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh
index b4fbc4486a..ef4859135c 100755
--- a/.ci/scripts/test_model.sh
+++ b/.ci/scripts/test_model.sh
@@ -50,10 +50,12 @@ prepare_artifacts_upload() {
 build_cmake_executor_runner() {
   echo "Building executor_runner"
+  CMAKE_PREFIX_PATH="$("${PYTHON_EXECUTABLE}" -c 'import torch as _; print(_.__path__[0])')"
   rm -rf ${CMAKE_OUTPUT_DIR}
   cmake -DCMAKE_BUILD_TYPE=Debug \
       -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
       -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
+      -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
       -B${CMAKE_OUTPUT_DIR} .
 
   cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug
@@ -98,8 +100,7 @@ test_model() {
 build_cmake_xnn_executor_runner() {
   echo "Building xnn_executor_runner"
-  SITE_PACKAGES="$(${PYTHON_EXECUTABLE} -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
-  CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch"
+  CMAKE_PREFIX_PATH="$("${PYTHON_EXECUTABLE}" -c 'import torch as _; print(_.__path__[0])')"
 
   (rm -rf ${CMAKE_OUTPUT_DIR} \
     && mkdir ${CMAKE_OUTPUT_DIR} \
diff --git a/.ci/scripts/test_phi_3_mini.sh b/.ci/scripts/test_phi_3_mini.sh
index 40767013e2..64dd6b829d 100644
--- a/.ci/scripts/test_phi_3_mini.sh
+++ b/.ci/scripts/test_phi_3_mini.sh
@@ -22,8 +22,10 @@ NPROC=8
 if hash nproc &> /dev/null; then NPROC=$(nproc); fi
 
 cmake_install_executorch_libraries() {
+  CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
   cmake -DPYTHON_EXECUTABLE=python \
     -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
+    -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
     -DEXECUTORCH_ENABLE_LOGGING=1 \
     -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
@@ -39,8 +41,10 @@ cmake_install_executorch_libraries() {
 }
 
 cmake_build_phi_3_mini() {
+  CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
   cmake -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
     -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
+    -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
     -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh
index be684b7bfa..c21d0bb604 100644
--- a/.ci/scripts/utils.sh
+++ b/.ci/scripts/utils.sh
@@ -136,6 +136,7 @@ cmake_install_executorch_lib() {
   clean_executorch_install_folders
   retry cmake -DBUCK2="$BUCK" \
     -DCMAKE_INSTALL_PREFIX=cmake-out \
+    -DCMAKE_PREFIX_PATH="$($PYTHON_EXECUTABLE -c 'import torch as _; print(_.__path__[0])')" \
     -DCMAKE_BUILD_TYPE=Release \
     -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -Bcmake-out .
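
Note: every script above locates the pip-installed torch package with the same one-liner so that find_package(Torch) can resolve against it. A minimal sketch of what the command produces (the site-packages path below is illustrative and depends on the environment); the relevant detail is that the torch package directory ships share/cmake/Torch/TorchConfig.cmake, which is why the package directory works as a CMAKE_PREFIX_PATH entry:

    $ python3 -c 'import torch as _; print(_.__path__[0])'
    /opt/venv/lib/python3.10/site-packages/torch
    $ ls /opt/venv/lib/python3.10/site-packages/torch/share/cmake/Torch
    TorchConfig.cmake  TorchConfigVersion.cmake
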
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index a2f65f1a7a..fac2319789 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -147,6 +147,8 @@ jobs:
           CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
           conda activate "${CONDA_ENV}"
 
+          source .ci/scripts/utils.sh
+          install_executorch "use-pt-pinned-commit"
           BUILD_TOOL="cmake"
           PYTHON_EXECUTABLE=python \
           bash .ci/scripts/build_llama_android.sh "${BUILD_TOOL}"
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 04a6c96f3e..18e34bff72 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -394,6 +394,7 @@ jobs:
           rm -rf cmake-out
           cmake \
             -DCMAKE_INSTALL_PREFIX=cmake-out \
+            -DCMAKE_PREFIX_PATH="$(python -c 'import torch as _; print(_.__path__[0])')" \
             -DCMAKE_BUILD_TYPE=Release \
             -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
             -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
@@ -411,6 +412,7 @@ jobs:
           cmake \
             -DCMAKE_INSTALL_PREFIX=cmake-out \
             -DCMAKE_BUILD_TYPE=Release \
+            -DCMAKE_PREFIX_PATH="$(python -c 'import torch as _; print(_.__path__[0])')" \
             -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
             -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
             -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2bf5be83ed..ad1f5d2019 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -614,6 +614,8 @@ if(BUILD_EXECUTORCH_PORTABLE_OPS)
 endif()
 
 if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
+  # Find the Torch headers here to make them available to all sub-directories.
+  find_package_torch_headers()
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
 endif()
diff --git a/backends/cadence/aot/export_example.py b/backends/cadence/aot/export_example.py
index 420f1760e3..7166e3acf4 100644
--- a/backends/cadence/aot/export_example.py
+++ b/backends/cadence/aot/export_example.py
@@ -6,7 +6,7 @@
 
 # Example script for exporting simple models to flatbuffer
 
-#pyre-unsafe
+# pyre-unsafe
 
 import logging
 import tempfile
diff --git a/backends/xnnpack/test/quantizer/test_pt2e_quantization.py b/backends/xnnpack/test/quantizer/test_pt2e_quantization.py
index 7155a972a4..ea6116a6f0 100644
--- a/backends/xnnpack/test/quantizer/test_pt2e_quantization.py
+++ b/backends/xnnpack/test/quantizer/test_pt2e_quantization.py
@@ -1,4 +1,8 @@
-# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
 
 # pyre-unsafe
diff --git a/build/Utils.cmake b/build/Utils.cmake
index 2b4d22ea61..dca3f189ec 100644
--- a/build/Utils.cmake
+++ b/build/Utils.cmake
@@ -324,3 +324,22 @@ function(resolve_python_executable)
     )
   endif()
 endfunction()
+
+# find_package(Torch CONFIG REQUIRED) replacement for targets that
+# have a header-only Torch dependency. Because find_package sets
+# variables in the parent scope, we use a macro to preserve this
+# behavior rather than maintaining our own list of those variables.
+macro(find_package_torch_headers)
+  # We cannot simply use CMAKE_FIND_ROOT_PATH_BOTH, because that does
+  # not propagate into TorchConfig.cmake.
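+  # Instead we save the per-category search-mode settings, force them to
+  # BOTH so the host-installed torch package stays visible even when a
+  # cross-compiling toolchain (e.g. the Android NDK) restricts find_*
+  # calls to the sysroot, and restore the saved values afterwards.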
+  foreach(mode_kind IN ITEMS PACKAGE LIBRARY INCLUDE)
+    set(OLD_CMAKE_FIND_ROOT_PATH_MODE_${mode_kind} ${CMAKE_FIND_ROOT_PATH_MODE_${mode_kind}})
+    set(CMAKE_FIND_ROOT_PATH_MODE_${mode_kind} BOTH)
+  endforeach()
+  if(NOT TARGET torch)
+    find_package(Torch CONFIG REQUIRED)
+  endif()
+  foreach(mode_kind IN ITEMS PACKAGE LIBRARY INCLUDE)
+    set(CMAKE_FIND_ROOT_PATH_MODE_${mode_kind} ${OLD_CMAKE_FIND_ROOT_PATH_MODE_${mode_kind}})
+  endforeach()
+endmacro()
diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh
index f8ded21099..b72968037c 100644
--- a/build/build_android_llm_demo.sh
+++ b/build/build_android_llm_demo.sh
@@ -7,6 +7,12 @@
 
 set -ex
 
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+which "${PYTHON_EXECUTABLE}"
+CMAKE_PREFIX_PATH="$("${PYTHON_EXECUTABLE}" -c 'import torch as _; print(_.__path__[0])')"
+
 build_jar() {
   pushd extension/android
   ./gradlew build
@@ -36,6 +42,7 @@ build_android_native_library() {
   fi
 
   cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
+    -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
     -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \
     -DANDROID_ABI="${ANDROID_ABI}" \
     -DANDROID_PLATFORM=android-26 \
@@ -69,6 +76,7 @@ build_android_native_library() {
     -DANDROID_ABI="${ANDROID_ABI}" \
     -DANDROID_PLATFORM=android-26 \
     -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
+    -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
     -DEXECUTORCH_ENABLE_LOGGING=ON \
     -DEXECUTORCH_LOG_LEVEL=Info \
     -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
diff --git a/examples/models/llama/export_llama.py b/examples/models/llama/export_llama.py
index eeb425c338..e25a8a007e 100644
--- a/examples/models/llama/export_llama.py
+++ b/examples/models/llama/export_llama.py
@@ -7,6 +7,12 @@
 # Example script for exporting Llama2 to flatbuffer
 
 import logging
+
+# force=True to ensure logging while in debugger. Set up the logger before
+# any other imports.
+FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
+logging.basicConfig(level=logging.INFO, format=FORMAT, force=True)
+
 import sys
 
 import torch
@@ -16,10 +22,6 @@
 
 sys.setrecursionlimit(4096)
 
-FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
-logging.basicConfig(level=logging.INFO, format=FORMAT)
-
-
 def main() -> None:
     seed = 42
     torch.manual_seed(seed)
diff --git a/exir/_serialize/padding.py b/exir/_serialize/padding.py
index 181dbbccd1..cc294bcc1f 100644
--- a/exir/_serialize/padding.py
+++ b/exir/_serialize/padding.py
@@ -1,4 +1,8 @@
-# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
 
 # pyre-strict
diff --git a/exir/passes/quantize_io_pass.py b/exir/passes/quantize_io_pass.py
index 21ac4c868a..64b6c14d75 100644
--- a/exir/passes/quantize_io_pass.py
+++ b/exir/passes/quantize_io_pass.py
@@ -1,4 +1,9 @@
-# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
 import logging
 
 from typing import Any, Dict, List, Optional, Union
diff --git a/install_executorch.py b/install_executorch.py
index 60655e38ba..a1a6a3f478 100644
--- a/install_executorch.py
+++ b/install_executorch.py
@@ -165,6 +165,8 @@ def main(args):
             if pybind_arg == "training":
                 CMAKE_ARGS += " -DEXECUTORCH_BUILD_EXTENSION_TRAINING=ON"
                 os.environ["EXECUTORCH_BUILD_TRAINING"] = "ON"
+            elif pybind_arg == "mps":
+                CMAKE_ARGS += " -DEXECUTORCH_BUILD_MPS=ON"
             else:
                 CMAKE_ARGS += f" -DEXECUTORCH_BUILD_{pybind_arg.upper()}=ON"
             EXECUTORCH_BUILD_PYBIND = "ON"
diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt
index a8f3670b0b..1f3aff57ec 100644
--- a/kernels/optimized/CMakeLists.txt
+++ b/kernels/optimized/CMakeLists.txt
@@ -61,7 +61,7 @@ message("Generated files ${gen_command_sources}")
 list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/")
 add_library(optimized_kernels ${_optimized_kernels__srcs})
-target_include_directories(optimized_kernels PRIVATE "${EXECUTORCH_ROOT}/third-party/pocketfft")
+target_include_directories(optimized_kernels PRIVATE ${TORCH_INCLUDE_DIRS} "${EXECUTORCH_ROOT}/third-party/pocketfft")
 target_link_libraries(
   optimized_kernels PRIVATE executorch_core cpublas extension_threadpool
 )
diff --git a/kernels/optimized/cpu/op_gelu.cpp b/kernels/optimized/cpu/op_gelu.cpp
index 8859132339..dcb6bbc427 100644
--- a/kernels/optimized/cpu/op_gelu.cpp
+++ b/kernels/optimized/cpu/op_gelu.cpp
@@ -13,6 +13,7 @@
 #include <cmath>
 
+#include <ATen/native/cpu/Gelu.h>
 #include <executorch/kernels/portable/cpu/util/activation_ops_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
@@ -47,48 +48,26 @@ void gelu(
   CTYPE* out_data = output.mutable_data_ptr<CTYPE>();
   size_t lim = input.numel();
 
-  // TODO: Add fast path for tanh using sleef's tanh
   if (approximate == "tanh") {
-    // 0.5 * x * (1 + Tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))
-    for (size_t i = 0; i < lim; ++i) {
-      const CTYPE x = in_data[i];
-      const CTYPE kBeta = M_SQRT2 * M_2_SQRTPI * 0.5;
-      const CTYPE kKappa = 0.044715;
-      auto x_cube = x * x * x;
-      auto inner = kBeta * (x + kKappa * x_cube);
-      out_data[i] = CTYPE(0.5) * x * (CTYPE(1) + std::tanh(inner));
+    using Vec = at::vec::Vectorized<CTYPE>;
+    int i = 0;
+    for (; i < lim - (lim % Vec::size()); i += Vec::size()) {
+      Vec x = Vec::loadu(in_data + i);
+      at::native::vectorized_gelu_approximated_with_tanh(x).store(out_data + i);
     }
-  } else if (approximate == "none") { // dont appx
-    // GELU(x) = x * Φ(x) where Φ(x) is the Cumulative Distribution
-    // Function for Gaussian Distribution.
-
-#ifndef __aarch64__
-    for (size_t i = 0; i < lim; ++i) {
-      const CTYPE x = in_data[i];
-      out_data[i] = CTYPE(0.5) * x * (CTYPE(1) + std::erf(x * M_SQRT1_2));
+    for (; i < lim; ++i) {
+      out_data[i] = at::native::scalar_gelu_approximated_with_tanh(in_data[i]);
     }
-#else
-    size_t i = 0;
-    if constexpr (std::is_same_v<CTYPE, float>) {
-      for (; i + 4 < lim; i += 4) {
-        const float32x4_t in =
-            vld1q_f32(static_cast<const float*>(&in_data[i]));
-        const float32x4_t m_sqrt1_2x4 = {
-            M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2};
-        const float32x4_t ones = vmovq_n_f32(1.0);
-        const float32x4_t halves = vmovq_n_f32(0.5);
-        float32x4_t out = Sleef_erff4_u10(vmulq_f32(in, m_sqrt1_2x4));
-        vst1q_f32(
-            static_cast<float*>(&out_data[i]),
-            vmulq_f32(vmulq_f32(vaddq_f32(out, ones), in), halves));
-      }
+  } else if (approximate == "none") {
+    using Vec = at::vec::Vectorized<CTYPE>;
+    int i = 0;
+    for (; i < lim - (lim % Vec::size()); i += Vec::size()) {
+      Vec x = Vec::loadu(in_data + i);
+      at::native::vectorized_gelu(x).store(out_data + i);
     }
     for (; i < lim; ++i) {
-      const CTYPE x = in_data[i];
-      out_data[i] = CTYPE(0.5) * x * (CTYPE(1) + std::erf(x * M_SQRT1_2));
+      out_data[i] = at::native::scalar_gelu(in_data[i]);
     }
-#endif // __aarch64__
-
   } else {
     ET_KERNEL_CHECK_MSG(
       context,
diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl
index 8fcaf21024..b88c309959 100644
--- a/kernels/optimized/cpu/targets.bzl
+++ b/kernels/optimized/cpu/targets.bzl
@@ -32,13 +32,9 @@ _OPTIMIZED_ATEN_OPS = (
     op_target(name = "op_sigmoid"),
     op_target(
         name = "op_gelu",
-        deps = select({
-            "DEFAULT": [],
-            "ovr_config//cpu:arm64": [
-                "fbsource//third-party/sleef:sleef_arm",
-            ],
-        }) + [
+        deps = [
             "//executorch/kernels/portable/cpu/util:activation_ops_util",
+            "//executorch/runtime/core/portable_type/c10:aten_headers_for_executorch",
         ],
     ),
     op_target(
@@ -100,6 +96,13 @@ _OPTIMIZED_ATEN_OPS = (
     ),
 )
 
+
+def get_sleef_preprocessor_flags():
+    if runtime.is_oss:
+        return []
+    return ["-DAT_BUILD_ARM_VEC256_WITH_SLEEF"]
+
+
 def define_common_targets():
     """Defines targets that should be shared between fbcode and xplat.
diff --git a/kernels/optimized/op_registration_util.bzl b/kernels/optimized/op_registration_util.bzl
index 6839454be2..3af20680b5 100644
--- a/kernels/optimized/op_registration_util.bzl
+++ b/kernels/optimized/op_registration_util.bzl
@@ -90,9 +90,14 @@ def define_op_library(name, deps):
             "//executorch/kernels/test/...",
             "@EXECUTORCH_CLIENTS",
         ],
-        # kernels often have helpers with no prototypes just disabling the warning here as the headers
-        # are codegend and linked in later
-        compiler_flags = ["-Wno-missing-prototypes"] + get_compiler_optimization_flags(),
+        compiler_flags = [
+            # kernels often have helpers with no prototypes; disable the warning
+            # here, as the headers are codegened and linked in later
+            "-Wno-missing-prototypes",
+            # pragma unroll fails with -Os; it doesn't need to warn us and
+            # fail -Werror builds; see https://godbolt.org/z/zvf85vTsr
+            "-Wno-pass-failed",
+        ] + get_compiler_optimization_flags(),
         deps = [
             "//executorch/runtime/kernel:kernel_includes",
         ] + augmented_deps + get_vec_deps(),
diff --git a/kernels/optimized/optimized-oss.yaml b/kernels/optimized/optimized-oss.yaml
index 37a2730f92..a24aa9ca17 100644
--- a/kernels/optimized/optimized-oss.yaml
+++ b/kernels/optimized/optimized-oss.yaml
@@ -1,8 +1,8 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 # This yaml file contains operators that have optimized kernels available.
-# Note that this is a copy of optimized.yaml that does not include gelu and
-# log_softmax, due to the OSS build not currently including sleef.
+# Note that this is a copy of optimized.yaml that does not include log_softmax,
+# due to the OSS build not currently including sleef.
 # TODO (T183193812)
 
 - op: _fft_r2c.out
@@ -45,6 +45,11 @@
     - arg_meta: null
       kernel_name: torch::executor::opt_sigmoid_out
 
+- op: gelu.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_gelu_out
+
 - op: le.Scalar_out
   kernels:
     - arg_meta: null
diff --git a/shim/xplat/executorch/kernels/optimized/op_registration_util.bzl b/shim/xplat/executorch/kernels/optimized/op_registration_util.bzl
index 37a68abaa0..c079b97f63 100644
--- a/shim/xplat/executorch/kernels/optimized/op_registration_util.bzl
+++ b/shim/xplat/executorch/kernels/optimized/op_registration_util.bzl
@@ -134,5 +134,5 @@ def define_op_target(name, deps):
 
 def is_op_disabled(name):
     # TODO (gjcomer) Enable ops with sleef dependency in OSS
-    disabled_ops = ["op_gelu", "op_log_softmax"]
+    disabled_ops = ["op_log_softmax"]
     return name in disabled_ops
diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh
index 2ae8ff8328..012fded59c 100755
--- a/test/run_oss_cpp_tests.sh
+++ b/test/run_oss_cpp_tests.sh
@@ -22,13 +22,20 @@ elif [[ $(uname) == "Linux" ]]; then
   export LLVM_COV="${LLVM_COV:-llvm-cov}"
 fi
 
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+which "${PYTHON_EXECUTABLE}"
+
 build_executorch() {
   BUILD_VULKAN="OFF"
   if [ -x "$(command -v glslc)" ]; then
     BUILD_VULKAN="ON"
   fi
+  CMAKE_PREFIX_PATH="$("${PYTHON_EXECUTABLE}" -c 'import torch as _; print(_.__path__[0])')"
   cmake . \
     -DCMAKE_INSTALL_PREFIX=cmake-out \
+    -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
     -DEXECUTORCH_USE_CPP_CODE_COVERAGE=ON \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \