diff --git a/.ci_support/linux_64_blas_implgenericc_compiler_version13channel_targetsconda-forge_maincuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13is_rcFalse.yaml b/.ci_support/linux_64_blas_implgenericc_compiler_version13channel_targetsconda-forge_maincuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13is_rcFalse.yaml
index 3df53835..ddb79d64 100644
--- a/.ci_support/linux_64_blas_implgenericc_compiler_version13channel_targetsconda-forge_maincuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13is_rcFalse.yaml
+++ b/.ci_support/linux_64_blas_implgenericc_compiler_version13channel_targetsconda-forge_maincuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13is_rcFalse.yaml
@@ -80,3 +80,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/linux_64_blas_implgenericc_compiler_version13channel_targetsconda-forge_maincuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version13is_rcFalse.yaml b/.ci_support/linux_64_blas_implgenericc_compiler_version13channel_targetsconda-forge_maincuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version13is_rcFalse.yaml
index 9f8a62b5..67218f69 100644
--- a/.ci_support/linux_64_blas_implgenericc_compiler_version13channel_targetsconda-forge_maincuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version13is_rcFalse.yaml
+++ b/.ci_support/linux_64_blas_implgenericc_compiler_version13channel_targetsconda-forge_maincuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version13is_rcFalse.yaml
@@ -80,3 +80,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/linux_64_blas_implmklc_compiler_version13channel_targetsconda-forge_maincuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13is_rcFalse.yaml b/.ci_support/linux_64_blas_implmklc_compiler_version13channel_targetsconda-forge_maincuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13is_rcFalse.yaml
index f3434835..9814faf5 100644
--- a/.ci_support/linux_64_blas_implmklc_compiler_version13channel_targetsconda-forge_maincuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13is_rcFalse.yaml
+++ b/.ci_support/linux_64_blas_implmklc_compiler_version13channel_targetsconda-forge_maincuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13is_rcFalse.yaml
@@ -80,3 +80,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/linux_64_blas_implmklc_compiler_version13channel_targetsconda-forge_maincuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version13is_rcFalse.yaml b/.ci_support/linux_64_blas_implmklc_compiler_version13channel_targetsconda-forge_maincuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version13is_rcFalse.yaml
index 3348ffc3..c4f5e713 100644
--- a/.ci_support/linux_64_blas_implmklc_compiler_version13channel_targetsconda-forge_maincuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version13is_rcFalse.yaml
+++ b/.ci_support/linux_64_blas_implmklc_compiler_version13channel_targetsconda-forge_maincuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version13is_rcFalse.yaml
@@ -80,3 +80,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/linux_aarch64_c_compiler_version13channel_targetsconda-forge_maincuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13is_rcFalse.yaml b/.ci_support/linux_aarch64_c_compiler_version13channel_targetsconda-forge_maincuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13is_rcFalse.yaml
index 9a5eb8c5..227d605d 100644
--- a/.ci_support/linux_aarch64_c_compiler_version13channel_targetsconda-forge_maincuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13is_rcFalse.yaml
+++ b/.ci_support/linux_aarch64_c_compiler_version13channel_targetsconda-forge_maincuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13is_rcFalse.yaml
@@ -80,3 +80,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/linux_aarch64_c_compiler_version13channel_targetsconda-forge_maincuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version13is_rcFalse.yaml b/.ci_support/linux_aarch64_c_compiler_version13channel_targetsconda-forge_maincuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version13is_rcFalse.yaml
index 99655f8b..fde13792 100644
--- a/.ci_support/linux_aarch64_c_compiler_version13channel_targetsconda-forge_maincuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version13is_rcFalse.yaml
+++ b/.ci_support/linux_aarch64_c_compiler_version13channel_targetsconda-forge_maincuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version13is_rcFalse.yaml
@@ -80,3 +80,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/osx_64_blas_implgenericchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.10.____cpython.yaml b/.ci_support/osx_64_blas_implgenericchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.10.____cpython.yaml
index c6feec78..501ed4d2 100644
--- a/.ci_support/osx_64_blas_implgenericchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.10.____cpython.yaml
+++ b/.ci_support/osx_64_blas_implgenericchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.10.____cpython.yaml
@@ -67,3 +67,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/osx_64_blas_implgenericchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.11.____cpython.yaml b/.ci_support/osx_64_blas_implgenericchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.11.____cpython.yaml
index 20e3ffe5..aa9f1282 100644
--- a/.ci_support/osx_64_blas_implgenericchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.11.____cpython.yaml
+++ b/.ci_support/osx_64_blas_implgenericchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.11.____cpython.yaml
@@ -67,3 +67,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/osx_64_blas_implgenericchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.12.____cpython.yaml b/.ci_support/osx_64_blas_implgenericchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.12.____cpython.yaml
index db9bacb3..f76824aa 100644
--- a/.ci_support/osx_64_blas_implgenericchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.12.____cpython.yaml
+++ b/.ci_support/osx_64_blas_implgenericchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.12.____cpython.yaml
@@ -67,3 +67,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/osx_64_blas_implgenericchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.9.____cpython.yaml b/.ci_support/osx_64_blas_implgenericchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.9.____cpython.yaml
index 634c6e7a..a7cbd2d6 100644
--- a/.ci_support/osx_64_blas_implgenericchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.9.____cpython.yaml
+++ b/.ci_support/osx_64_blas_implgenericchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.9.____cpython.yaml
@@ -67,3 +67,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/osx_64_blas_implgenericchannel_targetsconda-forge_mainis_rcFalsenumpy2python3.13.____cp313.yaml b/.ci_support/osx_64_blas_implgenericchannel_targetsconda-forge_mainis_rcFalsenumpy2python3.13.____cp313.yaml
index c30fcfeb..543e475e 100644
--- a/.ci_support/osx_64_blas_implgenericchannel_targetsconda-forge_mainis_rcFalsenumpy2python3.13.____cp313.yaml
+++ b/.ci_support/osx_64_blas_implgenericchannel_targetsconda-forge_mainis_rcFalsenumpy2python3.13.____cp313.yaml
@@ -67,3 +67,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/osx_64_blas_implmklchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.10.____cpython.yaml b/.ci_support/osx_64_blas_implmklchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.10.____cpython.yaml
index 54a51bee..e1d6d150 100644
--- a/.ci_support/osx_64_blas_implmklchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.10.____cpython.yaml
+++ b/.ci_support/osx_64_blas_implmklchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.10.____cpython.yaml
@@ -67,3 +67,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/osx_64_blas_implmklchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.11.____cpython.yaml b/.ci_support/osx_64_blas_implmklchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.11.____cpython.yaml
index 5b2d139d..7af20c3b 100644
--- a/.ci_support/osx_64_blas_implmklchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.11.____cpython.yaml
+++ b/.ci_support/osx_64_blas_implmklchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.11.____cpython.yaml
@@ -67,3 +67,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/osx_64_blas_implmklchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.12.____cpython.yaml b/.ci_support/osx_64_blas_implmklchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.12.____cpython.yaml
index 9ba9f925..f0734437 100644
--- a/.ci_support/osx_64_blas_implmklchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.12.____cpython.yaml
+++ b/.ci_support/osx_64_blas_implmklchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.12.____cpython.yaml
@@ -67,3 +67,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/osx_64_blas_implmklchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.9.____cpython.yaml b/.ci_support/osx_64_blas_implmklchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.9.____cpython.yaml
index 532cd94d..807c918f 100644
--- a/.ci_support/osx_64_blas_implmklchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.9.____cpython.yaml
+++ b/.ci_support/osx_64_blas_implmklchannel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.9.____cpython.yaml
@@ -67,3 +67,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/osx_64_blas_implmklchannel_targetsconda-forge_mainis_rcFalsenumpy2python3.13.____cp313.yaml b/.ci_support/osx_64_blas_implmklchannel_targetsconda-forge_mainis_rcFalsenumpy2python3.13.____cp313.yaml
index 56565a9d..19683513 100644
--- a/.ci_support/osx_64_blas_implmklchannel_targetsconda-forge_mainis_rcFalsenumpy2python3.13.____cp313.yaml
+++ b/.ci_support/osx_64_blas_implmklchannel_targetsconda-forge_mainis_rcFalsenumpy2python3.13.____cp313.yaml
@@ -67,3 +67,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/osx_arm64_channel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.10.____cpython.yaml b/.ci_support/osx_arm64_channel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.10.____cpython.yaml
index ecc6bdbc..464936e2 100644
--- a/.ci_support/osx_arm64_channel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.10.____cpython.yaml
+++ b/.ci_support/osx_arm64_channel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.10.____cpython.yaml
@@ -67,3 +67,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/osx_arm64_channel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.11.____cpython.yaml b/.ci_support/osx_arm64_channel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.11.____cpython.yaml
index 8fbb298f..9785408f 100644
--- a/.ci_support/osx_arm64_channel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.11.____cpython.yaml
+++ b/.ci_support/osx_arm64_channel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.11.____cpython.yaml
@@ -67,3 +67,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/osx_arm64_channel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.12.____cpython.yaml b/.ci_support/osx_arm64_channel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.12.____cpython.yaml
index 576ca03c..90557f3d 100644
--- a/.ci_support/osx_arm64_channel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.12.____cpython.yaml
+++ b/.ci_support/osx_arm64_channel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.12.____cpython.yaml
@@ -67,3 +67,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/osx_arm64_channel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.9.____cpython.yaml b/.ci_support/osx_arm64_channel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.9.____cpython.yaml
index ef8363fa..2cee3388 100644
--- a/.ci_support/osx_arm64_channel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.9.____cpython.yaml
+++ b/.ci_support/osx_arm64_channel_targetsconda-forge_mainis_rcFalsenumpy2.0python3.9.____cpython.yaml
@@ -67,3 +67,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/osx_arm64_channel_targetsconda-forge_mainis_rcFalsenumpy2python3.13.____cp313.yaml b/.ci_support/osx_arm64_channel_targetsconda-forge_mainis_rcFalsenumpy2python3.13.____cp313.yaml
index 82cd4601..a7fe35f5 100644
--- a/.ci_support/osx_arm64_channel_targetsconda-forge_mainis_rcFalsenumpy2python3.13.____cp313.yaml
+++ b/.ci_support/osx_arm64_channel_targetsconda-forge_mainis_rcFalsenumpy2python3.13.____cp313.yaml
@@ -67,3 +67,5 @@ zip_keys:
   - is_rc
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/win_64_channel_targetsconda-forge_maincuda_compilerNonecuda_compiler_versionNoneis_rcFalse.yaml b/.ci_support/win_64_channel_targetsconda-forge_maincuda_compilerNonecuda_compiler_versionNoneis_rcFalse.yaml
index ad886cb0..63e8c0dc 100644
--- a/.ci_support/win_64_channel_targetsconda-forge_maincuda_compilerNonecuda_compiler_versionNoneis_rcFalse.yaml
+++ b/.ci_support/win_64_channel_targetsconda-forge_maincuda_compilerNonecuda_compiler_versionNoneis_rcFalse.yaml
@@ -59,3 +59,5 @@ zip_keys:
   - cuda_compiler_version
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.ci_support/win_64_channel_targetsconda-forge_maincuda_compilercuda-nvcccuda_compiler_version12.6is_rcFalse.yaml b/.ci_support/win_64_channel_targetsconda-forge_maincuda_compilercuda-nvcccuda_compiler_version12.6is_rcFalse.yaml
index f5b631f7..51971f6d 100644
--- a/.ci_support/win_64_channel_targetsconda-forge_maincuda_compilercuda-nvcccuda_compiler_version12.6is_rcFalse.yaml
+++ b/.ci_support/win_64_channel_targetsconda-forge_maincuda_compilercuda-nvcccuda_compiler_version12.6is_rcFalse.yaml
@@ -59,3 +59,5 @@ zip_keys:
   - cuda_compiler_version
 - - python
   - numpy
+zlib:
+- '1'
diff --git a/.github/workflows/conda-build.yml b/.github/workflows/conda-build.yml
index 3c036ad9..40a005a9 100644
--- a/.github/workflows/conda-build.yml
+++ b/.github/workflows/conda-build.yml
@@ -16,7 +16,7 @@ jobs:
   build:
     name: ${{ matrix.CONFIG }}
     runs-on: ${{ matrix.runs_on }}
-    timeout-minutes: 1080
+    timeout-minutes: 900
     strategy:
       fail-fast: false
       matrix:
diff --git a/conda-forge.yml b/conda-forge.yml
index e8a1366f..d4eae11d 100644
--- a/conda-forge.yml
+++ b/conda-forge.yml
@@ -5,6 +5,9 @@ azure:
   settings_win:
     variables:
       CONDA_BLD_PATH: C:\\bld\\
+bot:
+  abi_migration_branches:
+    - v2.5.x
 build_platform:
   linux_aarch64: linux_64
   osx_arm64: osx_64
@@ -16,7 +19,7 @@ github:
   tooling_branch_name: main
 github_actions:
   self_hosted: true
-  timeout_minutes: 1080
+  timeout_minutes: 900
   triggers:
   - push
   - pull_request
diff --git a/recipe/bld.bat b/recipe/bld.bat
index 4089b425..7f3ce96e 100644
--- a/recipe/bld.bat
+++ b/recipe/bld.bat
@@ -93,8 +93,7 @@ if not "%cuda_compiler_version%" == "None" (
     set MAGMA_HOME=%LIBRARY_PREFIX%
     set "PATH=%CUDA_BIN_PATH%;%PATH%"
     set CUDNN_INCLUDE_DIR=%LIBRARY_PREFIX%\include
-    @REM turn off very noisy nvcc warnings
-    set "CUDAFLAGS=-w --ptxas-options=-w"
+    set "CUDA_VERSION=%cuda_compiler_version%"
 ) else (
     set USE_CUDA=0
     @REM MKLDNN is an Apache-2.0 licensed library for DNNs and is used
@@ -162,7 +161,7 @@ if EXIST build (
     if %ERRORLEVEL% neq 0 exit 1
 )

-%PYTHON% -m pip %PIP_ACTION% . --no-build-isolation --no-deps %PIP_VERBOSITY% --no-clean
+%PYTHON% -m pip %PIP_ACTION% . --no-build-isolation --no-deps %PIP_VERBOSITY% --no-clean --config-settings=--global-option=-q
 if %ERRORLEVEL% neq 0 exit 1

 @REM Here we split the build into two parts.
@@ -205,7 +204,7 @@ if "%PKG_NAME%" == "libtorch" (

     @REM Remove the python binary file, that is placed in the site-packages
     @REM directory by the specific python specific pytorch package.
-    del %LIBRARY_BIN%\torch_python.* %LIBRARY_LIB%\torch_python.* %LIBRARY_LIB%\_C.lib
+    del %LIBRARY_BIN%\torch_python.* %LIBRARY_LIB%\torch_python.*
     if %ERRORLEVEL% neq 0 exit 1

     popd
@@ -228,8 +227,8 @@ if "%PKG_NAME%" == "libtorch" (

     @REM Copy libtorch_python.lib back -- that's much easier than the for loop
     @REM needed to remove everything else.
-    robocopy /NP /NFL /NDL /NJH /E %LIBRARY_LIB%\ torch\lib\ torch_python.lib
-    robocopy /NP /NFL /NDL /NJH /E %LIBRARY_LIB%\ torch\lib\ _C.lib
+    mkdir %SP_DIR%\torch\lib
+    robocopy /NP /NFL /NDL /NJH /E /MOV %LIBRARY_LIB%\ %SP_DIR%\torch\lib\ torch_python.lib _C.lib
 )

 @REM Show the sccache stats.
diff --git a/recipe/build.sh b/recipe/build.sh
index 22dde8f0..13d48aa6 100644
--- a/recipe/build.sh
+++ b/recipe/build.sh
@@ -34,6 +34,9 @@ export CXXFLAGS="$(echo $CXXFLAGS | sed 's/-std=c++[0-9][0-9]//g')"
 # break users' programs
 export CFLAGS="$(echo $CFLAGS | sed 's/-fvisibility-inlines-hidden//g')"
 export CXXFLAGS="$(echo $CXXFLAGS | sed 's/-fvisibility-inlines-hidden//g')"
+# ignore warnings; blows up the logs for no benefit; they need to be fixed upstream
+export CXXFLAGS="$CXXFLAGS -w"
+
 export LDFLAGS="$(echo $LDFLAGS | sed 's/-Wl,--as-needed//g')"
 # The default conda LDFLAGs include -Wl,-dead_strip_dylibs, which removes all the
 # MKL sequential, core, etc. libraries, resulting in a "Symbol not found: _mkl_blas_caxpy"
@@ -54,6 +57,10 @@ export _GLIBCXX_USE_CXX11_ABI=1
 if [[ "$target_platform" == "osx-64" ]]; then
   export CXXFLAGS="$CXXFLAGS -DTARGET_OS_OSX=1"
   export CFLAGS="$CFLAGS -DTARGET_OS_OSX=1"
+elif [[ "$target_platform" == linux-* ]]; then
+  # Explicitly force non-executable stack to fix compatibility with glibc 2.41, due to:
+  # ittptmark64.S.o: missing .note.GNU-stack section implies executable stack
+  LDFLAGS="${LDFLAGS} -Wl,-z,noexecstack"
 fi

 # Dynamic libraries need to be lazily loaded so that torch
@@ -219,8 +226,10 @@ elif [[ ${cuda_compiler_version} != "None" ]]; then
     export USE_STATIC_CUDNN=0
     export MAGMA_HOME="${PREFIX}"
     export USE_MAGMA=1
-    # turn off noisy nvcc warnings
-    export CMAKE_CUDA_FLAGS="-w --ptxas-options=-w"
+    export CUDA_VERSION=$cuda_compiler_version
+    # ptxas advisories do not get ignored correctly, see
+    # https://github.com/conda-forge/cuda-nvcc-feedstock/issues/60
+    export CMAKE_CUDA_FLAGS="-w -Xptxas -w"
 else
     if [[ "$target_platform" != *-64 ]]; then
         # Breakpad seems to not work on aarch64 or ppc64le
@@ -240,7 +249,12 @@ case ${PKG_NAME} in
   libtorch)
     # Call setup.py directly to avoid spending time on unnecessarily
     # packing and unpacking the wheel.
-    $PREFIX/bin/python setup.py build
+    if [[ "$target_platform" == linux-* ]]; then
+      # filter out extremely noisy ptxas advisories
+      $PREFIX/bin/python setup.py -q build | stdbuf -oL grep -vE "Advisory: Modifier '\.sp::ordered_metadata'"
+    else
+      $PREFIX/bin/python setup.py -q build
+    fi

     mv build/lib.*/torch/bin/* ${PREFIX}/bin/
     mv build/lib.*/torch/lib/* ${PREFIX}/lib/
@@ -253,7 +267,7 @@ case ${PKG_NAME} in
     cp build/CMakeCache.txt build/CMakeCache.txt.orig
     ;;
   pytorch)
-    $PREFIX/bin/python -m pip install . --no-deps --no-build-isolation -v --no-clean \
+    $PREFIX/bin/python -m pip install . --no-deps --no-build-isolation -v --no-clean --config-settings=--global-option=-q \
         | sed "s,${CXX},\$\{CXX\},g" \
         | sed "s,${PREFIX},\$\{PREFIX\},g"
     # Keep this in ${PREFIX}/lib so that the library can be found by
diff --git a/recipe/meta.yaml b/recipe/meta.yaml
index e1c2a2d6..55f1df0b 100644
--- a/recipe/meta.yaml
+++ b/recipe/meta.yaml
@@ -1,6 +1,6 @@
 # if you wish to build release candidate number X, append the version string with ".rcX"
-{% set version = "2.5.1" %}
-{% set build = 12 %}
+{% set version = "2.6.0" %}
+{% set build = 0 %}

 # Use a higher build number for the CUDA variant, to ensure that it's
 # preferred by conda's solver, and it's preferentially
@@ -16,7 +16,7 @@
 # see .ci/docker/ci_commit_pins/triton.txt
 # pytorch and triton are released in tandem, see notes in their release process
 # https://github.com/pytorch/pytorch/blob/main/RELEASE.md#triton-dependency-for-the-release
-{% set triton = "3.1.0" %}
+{% set triton = "3.2.0" %}

 # TODO Temporary pin, remove me
 {% set mkl = "<2025" %}
@@ -32,48 +32,39 @@ source:
 {% else %}
   # The "pytorch-v" tarballs contain submodules; the "pytorch-" ones don't.
   url: https://github.com/pytorch/pytorch/releases/download/v{{ version }}/pytorch-v{{ version }}.tar.gz
-  sha256: 740eb5fff95e33cfe699bad43be83523f569c7cc7f9c285c2a255416443dd266
+  sha256: 3005690eb7b083c443a38c7657938af63902f524ad87a6c83f1aca38c77e3b57
 {% endif %}
   patches:
     - patches/0001-Force-usage-of-python-3-and-error-without-numpy.patch
-    # https://github.com/pytorch/pytorch/pull/137084
+    # backport https://github.com/pytorch/pytorch/pull/137084
     - patches/0002-Help-find-numpy.patch
-    # https://github.com/pytorch/pytorch/pull/138287
-    - patches/0003-Add-USE_SYSTEM_NVTX-option-138287.patch
-    # sympy 1.13.2 was reported to result in test failures on Windows and mac
+    # unpin sympy; 1.13.2 was reported to result in test failures on Windows and mac, see
     # https://github.com/pytorch/pytorch/pull/133235
-    - patches/0004-Update-sympy-version.patch
-    - patches/0005-Fix-duplicate-linker-script.patch  # [cuda_compiler_version != "None" and aarch64]
-    # https://github.com/pytorch/pytorch/pull/136034
-    - patches/0006-fix-3.13-pickle-error-in-serialization.py-136034.patch
-    # https://github.com/pytorch/pytorch/pull/137331
-    - patches/0007-Allow-users-to-overwrite-ld-with-environment-variabl.patch
+    - patches/0003-Update-sympy-version.patch
+    - patches/0004-Fix-duplicate-linker-script.patch  # [cuda_compiler_version != "None" and aarch64]
     # conda-specific patch, lets us override CUDA paths
-    - patches/0008-Allow-overriding-CUDA-related-paths.patch
-    # NumPy 2 fixes:
-    # https://github.com/pytorch/pytorch/pull/136800
-    - patches/0009-Fix-test-test_linalg.py-for-NumPy-2-136800.patch
-    # https://github.com/pytorch/pytorch/pull/137740
-    - patches/0010-Fixes-NumPy-2-test-failures-in-test_torch.py-137740.patch
+    - patches/0005-Allow-overriding-CUDA-related-paths.patch
     # fix BLAS calling convention for openblas
-    - patches/0011-Use-BLAS_USE_CBLAS_DOT-for-OpenBLAS-builds.patch
+    - patches/0006-Use-BLAS_USE_CBLAS_DOT-for-OpenBLAS-builds.patch
     # fix mkl-2024 issue
     # https://github.com/pytorch/pytorch/pull/143894
-    - patches/0012-fix-issue-142484.patch
-    - patches/0013-Fix-FindOpenBLAS.patch
-    # backport https://github.com/pytorch/pytorch/pull/138095
-    - patches/0014-CD-Enable-Python-3.13-on-windows-138095.patch
+    - patches/0007-fix-issue-142484.patch
+    - patches/0008-Fix-FindOpenBLAS.patch
     # backport https://github.com/pytorch/pytorch/pull/145480
-    - patches/0015-simplify-torch.utils.cpp_extension.include_paths-use.patch
+    - patches/0009-simplify-torch.utils.cpp_extension.include_paths-use.patch
     # point to headers that are now living in $PREFIX/include instead of $SP_DIR/torch/include
-    - patches/0016-point-include-paths-to-PREFIX-include.patch
-    - patches/0017-Add-conda-prefix-to-inductor-include-paths.patch
-    - patches/0018-make-ATEN_INCLUDE_DIR-relative-to-TORCH_INSTALL_PREF.patch
-    - patches/0019-remove-DESTINATION-lib-from-CMake-install-TARGETS-di.patch  # [win]
-    - patches/0020-make-library-name-in-test_mutable_custom_op_fixed_la.patch
-    - patches/0021-avoid-deprecated-find_package-CUDA-in-caffe2-CMake-m.patch
+    - patches/0010-point-include-paths-to-PREFIX-include.patch
+    - patches/0011-Add-conda-prefix-to-inductor-include-paths.patch
+    - patches/0012-make-ATEN_INCLUDE_DIR-relative-to-TORCH_INSTALL_PREF.patch
+    - patches/0013-remove-DESTINATION-lib-from-CMake-install-TARGETS-di.patch  # [win]
+    - patches/0014-avoid-deprecated-find_package-CUDA-in-caffe2-CMake-m.patch
+    # backport https://github.com/pytorch/pytorch/pull/140030
+    - patches/0015-export-AOTI_TORCH_EXPORT-on-Windows.-140030.patch
     - patches_submodules/fbgemm/0001-remove-DESTINATION-lib-from-CMake-install-directives.patch  # [win]
     - patches_submodules/tensorpipe/0001-switch-away-from-find_package-CUDA.patch
+    # backport https://github.com/google/XNNPACK/commit/5f23827e66cca435fa400b6e221892ac95af0079
+    # for https://github.com/pytorch/pytorch/issues/141083
+    - patches_submodules/XNNPACK/0001-Fix-bazel-linux-aarch64-gcc13-workflow-and-resolve-a.patch

 build:
   number: {{ build }}
@@ -87,8 +78,8 @@
 {% else %}
   skip: true  # [is_rc]
 {% endif %}
-  string: cuda{{ cuda_compiler_version | replace('.', '') }}_{{ blas_impl }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}  # [cuda_compiler_version != "None"]
-  string: cpu_{{ blas_impl }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}                                                 # [cuda_compiler_version == "None"]
+  string: cuda{{ cuda_compiler_version | replace('.', '') }}_{{ blas_impl }}_h{{ PKG_HASH }}_{{ build }}  # [cuda_compiler_version != "None"]
+  string: cpu_{{ blas_impl }}_h{{ PKG_HASH }}_{{ build }}                                                 # [cuda_compiler_version == "None"]
   detect_binary_files_with_prefix: false
   run_exports:
     - {{ pin_subpackage('libtorch', max_pin='x.x') }}
@@ -127,6 +118,7 @@ requirements:
    - protobuf
    - make      # [linux]
    - sccache   # [win]
+   - grep      # [unix]
    - rsync     # [unix]
  host:
    # GPU requirements
@@ -158,9 +150,7 @@
    - numpy *   # [megabuild]
    - numpy     # [not megabuild]
    - pip
-   # see https://github.com/pytorch/pytorch/issues/136541
-   - setuptools <=72.1.0  # [win]
-   - setuptools           # [not win]
+   - setuptools
    - pyyaml
    - requests
    - six
@@ -195,8 +185,8 @@
    - pytorch-gpu ==99999999       # [cuda_compiler_version == "None"]
    - pytorch-gpu =={{ version }}  # [cuda_compiler_version != "None"]
    - pytorch-cpu ==99999999       # [cuda_compiler_version != "None"]
-   - pytorch {{ version }} cuda{{ cuda_compiler_version | replace('.', '') }}_{{ blas_impl }}_*_{{ PKG_BUILDNUM }}  # [cuda_compiler_version != "None"]
-   - pytorch {{ version }} cpu_{{ blas_impl }}_*_{{ PKG_BUILDNUM }}                                                 # [cuda_compiler_version == "None"]
+   - pytorch {{ version }} cuda{{ cuda_compiler_version | replace('.', '') }}_{{ blas_impl }}_*_{{ build }}  # [cuda_compiler_version != "None"]
+   - pytorch {{ version }} cpu_{{ blas_impl }}_*_{{ build }}                                                 # [cuda_compiler_version == "None"]
    # if using OpenBLAS, ensure that a version compatible with OpenMP is used
    # otherwise, we get the following warnings:
    # OpenBLAS Warning : Detect OpenMP Loop and this application may hang. Please rebuild the library with USE_OPENMP=1 option.
@@ -254,8 +244,8 @@ outputs:
     script: build.sh   # [unix]
     script: bld.bat    # [win]
     build:
-      string: cuda{{ cuda_compiler_version | replace('.', '') }}_{{ blas_impl }}_py{{ CONDA_PY }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}  # [cuda_compiler_version != "None"]
-      string: cpu_{{ blas_impl }}_py{{ CONDA_PY }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}                                                 # [cuda_compiler_version == "None"]
+      string: cuda{{ cuda_compiler_version | replace('.', '') }}_{{ blas_impl }}_py{{ CONDA_PY }}_h{{ PKG_HASH }}_{{ build }}  # [cuda_compiler_version != "None"]
+      string: cpu_{{ blas_impl }}_py{{ CONDA_PY }}_h{{ PKG_HASH }}_{{ build }}                                                 # [cuda_compiler_version == "None"]
       detect_binary_files_with_prefix: false
       run_exports:
         - {{ pin_subpackage('pytorch', max_pin='x.x') }}
@@ -283,6 +273,7 @@ outputs:
        - make      # [linux]
        - sccache   # [win]
      host:
+       - {{ pin_subpackage('libtorch', exact=True) }}
        # GPU requirements
        - cudnn     # [cuda_compiler_version != "None"]
        - nccl      # [cuda_compiler_version != "None" and linux]
@@ -310,9 +301,7 @@
        - python
        - numpy
        - pip
-       # see https://github.com/pytorch/pytorch/issues/136541
-       - setuptools <=72.1.0  # [win]
-       - setuptools           # [not win]
+       - setuptools
        - pyyaml
        - requests
        - six
@@ -325,37 +314,40 @@
        - intel-openmp {{ mkl }}  # [win]
        - libabseil
        - libprotobuf
+       - pybind11
+       - eigen
        - sleef
        - libuv
        - pkg-config  # [unix]
        - typing_extensions
-       - {{ pin_subpackage('libtorch', exact=True) }}
-       - pybind11
-       - eigen
        - zlib
      run:
+       - {{ pin_subpackage('libtorch', exact=True) }}  # [megabuild]
+       # for non-megabuild, allow libtorch from any python version;
+       # pinning build number would be nice but breaks conda
+       - libtorch {{ version }}.*  # [not megabuild]
        - llvm-openmp             # [osx]
        - intel-openmp {{ mkl }}  # [win]
        - libblas * *{{ blas_impl }}  # [blas_impl == "mkl"]
+       - nomkl                       # [blas_impl != "mkl"]
        # GPU requirements without run_exports
-       - {{ pin_compatible('cudnn') }}  # [cuda_compiler_version != "None"]
-       # other requirements
+       - {{ pin_compatible('cudnn') }}  # [cuda_compiler_version != "None"]
+       - triton {{ triton }}            # [cuda_compiler_version != "None" and not win]
+       # avoid that people without GPUs needlessly download ~0.5-1GB
+       - __cuda                         # [cuda_compiler_version != "None"]
        - python
-       - typing_extensions
-       # sympy 1.13.2 was reported to result in test failures on Windows and mac
-       # https://github.com/pytorch/pytorch/pull/133235
-       - sympy >=1.13.1,!=1.13.2
+       # other requirements, see https://github.com/pytorch/pytorch/blame/main/requirements.txt
        - filelock
+       - fsspec
        - jinja2
        - networkx
+       - optree >=0.13.0
        - pybind11
-       - nomkl                   # [blas_impl != "mkl"]
-       - fsspec
-       # avoid that people without GPUs needlessly download ~0.5-1GB
-       - __cuda                  # [cuda_compiler_version != "None"]
-       - libtorch {{ version }}
        - setuptools
-       - triton {{ triton }}     # [cuda_compiler_version != "None" and not win]
+       # sympy 1.13.2 was reported to result in test failures on Windows and mac
+       # https://github.com/pytorch/pytorch/pull/133235
+       - sympy >=1.13.1,!=1.13.2
+       - typing_extensions >=4.10.0
      run_constrained:
        # These constraints ensure conflict between pytorch and
        # pytorch-cpu 1.1 which we built before conda-forge had GPU infrastructure
@@ -384,20 +376,21 @@
        # Required by run_test.py
        - pytest-flakefinder
        - pytest-rerunfailures
-       - pytest-xdist
+       # disabled because GPU tests might run OOM
+       # - pytest-xdist
        # danpetry/TF: Pytorch includes their own edited version of pytest-shard and adding
        # it into the test deps as well results in the --shard-id option being added twice.
        # https://github.com/pytorch/pytorch/blob/main/test/pytest_shard_custom.py
        # - pytest-shard
      imports:
        - torch
+       - torch._C
      source_files:
        # Only include the source_files if we are actually going to run the tests.
        - test
        # tools/ is needed to optimise test run
        # as of pytorch=2.0.0, there is a bug when trying to run tests without the tools
        - tools
-       #- .ci/pytorch/smoke_test/smoke_test.py
      commands:
        # Run pip check so as to ensure that all pytorch packages are installed
        # https://github.com/conda-forge/pytorch-cpu-feedstock/issues/24
@@ -409,43 +402,21 @@
        - python -c "import torch; import numpy"
        - python -c "import numpy; import torch"
        # distributed support is enabled by default on linux; for mac, we enable it manually in build.sh
-       - python -c "import torch; assert torch.distributed.is_available()"     # [linux or osx]
-       - python -c "import torch; assert torch.backends.cuda.is_built()"       # [linux64 and (cuda_compiler_version != "None")]
-       - python -c "import torch; assert torch.backends.cudnn.is_available()"  # [linux64 and (cuda_compiler_version != "None")]
-       - python -c "import torch; assert torch.backends.cudnn.enabled"         # [linux64 and (cuda_compiler_version != "None")]
+       - python -c "import torch; assert torch.distributed.is_available()"     # [linux or osx]
+       - python -c "import torch; assert torch.backends.cuda.is_built()"       # [cuda_compiler_version != "None"]
+       - python -c "import torch; assert torch.backends.cudnn.is_available()"  # [cuda_compiler_version != "None"]
+       - python -c "import torch; assert torch.backends.cudnn.enabled"         # [cuda_compiler_version != "None"]
+       - python -c "import torch; assert torch.version.cuda is not None"       # [cuda_compiler_version != "None"]
        # At conda-forge, we target versions of OSX that are too old for MPS support
        # But if users install a newer version of OSX, they will have MPS support
        # https://github.com/conda-forge/pytorch-cpu-feedstock/pull/123#issuecomment-1186355073
        # - python -c "import torch; assert torch.backends.mps.is_available()"  # [osx]

        # python-version-specific library (default location in SP_DIR symlinks back to this)
-       - test -f $PREFIX/lib/libtorch_python${SHLIB_EXT}      # [unix]
-       - if not exist %LIBRARY_BIN%\torch_python.dll exit 1   # [win]
-       - if not exist %LIBRARY_LIB%\torch_python.lib exit 1   # [win]
-
-       # See here for environment variables needed by the smoke test script
-       # https://github.com/pytorch/pytorch/blob/266fd35c5842902f6304aa8e7713b252cbfb243c/.ci/pytorch/smoke_test/smoke_test.py#L16
-       - set MATRIX_GPU_ARCH_VERSION="{{ '.'.join((cuda_compiler_version or "").split('.')[:2]) }}"     # [(cuda_compiler_version != "None") and (win)]
-       - set MATRIX_GPU_ARCH_TYPE="cuda"        # [(cuda_compiler_version != "None") and (win)]
-       - set MATRIX_GPU_ARCH_VERSION="none"     # [(cuda_compiler_version == "None") and (win)]
-       - set MATRIX_GPU_ARCH_TYPE="none"        # [(cuda_compiler_version == "None") and (win)]
-       - set MATRIX_CHANNEL="defaults"          # [win]
-       - set MATRIX_STABLE_VERSION={{ version }}  # [win]
-       - set MATRIX_PACKAGE_TYPE="conda"        # [win]
-       - set TARGET_OS="windows"                # [win]
-       - set OMP_NUM_THREADS=4                  # [win]
-       - export MATRIX_GPU_ARCH_VERSION="{{ '.'.join((cuda_compiler_version or "").split('.')[:2]) }}"  # [(cuda_compiler_version != "None") and (linux and x86_64)]
-       - export MATRIX_GPU_ARCH_TYPE="cuda"     # [(cuda_compiler_version != "None") and (linux and x86_64)]
-       - export MATRIX_GPU_ARCH_VERSION="none"  # [(cuda_compiler_version == "None") and (not win)]
-       - export MATRIX_GPU_ARCH_TYPE="none"     # [(cuda_compiler_version == "None") and (not win)]
-       - export MATRIX_CHANNEL="defaults"       # [not win]
-       - export MATRIX_STABLE_VERSION="{{ version }}"  # [not win]
-       - export MATRIX_PACKAGE_TYPE="conda"     # [not win]
-       - export TARGET_OS="linux"               # [linux]
-       - export TARGET_OS="macos-arm64"         # [(osx and arm64)]
-       - export TARGET_OS="macos-x86_64"        # [(osx and x86_64)]
-       - export OMP_NUM_THREADS=4               # [not win]
-       #- python ./smoke_test/smoke_test.py --package torchonly
+       - test -f $PREFIX/lib/libtorch_python${SHLIB_EXT}            # [unix]
+       - if not exist %LIBRARY_BIN%\torch_python.dll exit 1         # [win]
+       - if not exist %SP_DIR%\torch\lib\torch_python.lib exit 1    # [win]
+       - if not exist %SP_DIR%\torch\lib\_C.lib exit 1              # [win]

        # a reasonably safe subset of tests that should run under 15 minutes
        {% set tests = " ".join([
            ...
        ]) %}
        # tests torch.compile; avoid on aarch because it adds >4h in test runtime in emulation;
        # they add a lot of runtime (15->60min on windows), so run them for only one python version
-       {% set tests = tests ~ " test/inductor/test_torchinductor.py" %}  # [py==312 and not aarch64]
+       {% set tests = tests ~ " test/inductor/test_torchinductor.py" %}  # [py==312 and not (aarch64 or osx)]

        {% set skips = "(TestTorch and test_print)" %}
-       # tolerance violation with openblas
+       # minor tolerance violations
        {% set skips = skips ~ " or test_1_sized_with_0_strided_cpu_float32" %}  # [osx]
+       {% set skips = skips ~ " or test_batchnorm_nhwc_cpu" %}                  # [unix]
        # timeouts and failures on aarch, see https://github.com/conda-forge/pytorch-cpu-feedstock/pull/298#issuecomment-2555888508
        {% set skips = skips ~ " or test_pynode_destruction_deadlock" %}               # [aarch64]
        {% set skips = skips ~ " or (TestLinalgCPU and test_cholesky_cpu_float32)" %}  # [aarch64]
        {% set skips = skips ~ " or (TestLinalgCPU and test_pca_lowrank_cpu)" %}       # [aarch64]
        {% set skips = skips ~ " or (TestLinalgCPU and test_svd_lowrank_cpu)" %}       # [aarch64]
        {% set skips = skips ~ " or (TestMkldnnCPU and test_lstm_cpu)" %}              # [aarch64]
-       # dynamo does not support python 3.13
-       {% set skips = skips ~ " or (TestCustomOp and test_data_dependent_compile)" %}  # [py==313]
-       {% set skips = skips ~ " or (TestCustomOp and test_functionalize_error)" %}     # [py==313]
-       {% set skips = skips ~ " or (TestCustomOpAPI and test_compile)" %}              # [py==313]
-       {% set skips = skips ~ " or (TestCustomOpAPI and test_fake)" %}                 # [py==313]
-       {% set skips = skips ~ " or test_compile_int4_mm or test_compile_int8_mm" %}    # [py==313]
        # doesn't crash, but gets different result on aarch + CUDA
        {% set skips = skips ~ " or illcondition_matrix_input_should_not_crash_cpu" %}  # [aarch64 and cuda_compiler_version != "None"]
        # may crash spuriously
@@ -485,7 +451,7 @@
        {% set skips = skips ~ " or (TestAutograd and test_profiler_propagation)" %}
        # tests that fail due to resource clean-up issues (non-unique temporary libraries), see
        # https://github.com/conda-forge/pytorch-cpu-feedstock/pull/318#issuecomment-2620080859
-       {% set skips = skips ~ " or test_mutable_custom_op_fixed_layout" %}  # [cuda_compiler_version != "None"]
+       {% set skips = skips ~ " or test_mutable_custom_op_fixed_layout" %}
        # trivial accuracy problems
        {% set skips = skips ~ " or test_BCELoss_weights_no_reduce_cuda" %}  # [unix and cuda_compiler_version != "None"]
        {% set skips = skips ~ " or test_ctc_loss_cudnn_tensor_cuda " %}     # [unix and cuda_compiler_version != "None"]
@@ -494,15 +460,25 @@
        {% set skips = skips ~ " or test_sdpa_inference_mode_aot_compile" %}  # [linux and cuda_compiler_version != "None"]
        {% set skips = skips ~ " or (TestNN and test_grid_sample)" %}         # [linux and cuda_compiler_version != "None"]
        # don't mess with tests that rely on GPU failure handling
+       {% set skips = skips ~ " or test_cublas_config_nondeterministic_alert_cuda" %}  # [linux and cuda_compiler_version != "None"]
+       {% set skips = skips ~ " or test_cross_entropy_loss_2d_out_of_bounds_class" %}  # [linux and cuda_compiler_version != "None"]
        {% set skips = skips ~ " or test_indirect_device_assert" %}           # [linux and cuda_compiler_version != "None"]
        # test that fails to find temporary resource
        {% set skips = skips ~ " or (GPUTests and test_scatter_reduce2)" %}   # [linux and cuda_compiler_version != "None"]
+       # ROCM test whose skip doesn't trigger
+       {% set skips = skips ~ " or test_ck_blas_library_cpu" %}              # [linux and cuda_compiler_version != "None"]
+       # problem with finding output of `torch.cuda.tunable.write_file()`
+       {% set skips = skips ~ " or test_matmul_offline_tunableop_cuda_float16" %}  # [linux and cuda_compiler_version != "None"]
+       # catastrophic accuracy failure in convolution
+       {% set skips = skips ~ " or test_Conv3d_1x1x1_no_bias_cuda" %}        # [linux and cuda_compiler_version != "None"]
+       # skip some very long-running groups of tests (~30 minutes total)
+       {% set skips = skips ~ " or (test_gradgrad_nn_Transformer and _cuda_)" %}  # [linux and cuda_compiler_version != "None"]
+       {% set skips = skips ~ " or test_avg_pool3d_backward2" %}             # [linux and cuda_compiler_version != "None"]
        # MKL problems
        {% set skips = skips ~ " or (TestLinalgCPU and test_inverse_errors_large_cpu)" %}  # [linux and blas_impl == "mkl" and cuda_compiler_version != "None"]
-       {% set skips = skips ~ " or test_reentrant_parent_error_on_cpu_cuda)" %}           # [linux and blas_impl == "mkl" and cuda_compiler_version != "None"]
+       {% set skips = skips ~ " or test_reentrant_parent_error_on_cpu_cuda" %}            # [linux and blas_impl == "mkl" and cuda_compiler_version != "None"]
        # non-MKL problems
-       {% set skips = skips ~ " or test_cross_entropy_loss_2d_out_of_bounds_class_index_cuda" %}  # [linux and blas_impl != "mkl" and cuda_compiler_version != "None"]
-       {% set skips = skips ~ " or test_cublas_config_nondeterministic_alert_cuda " %}            # [linux and blas_impl != "mkl" and cuda_compiler_version != "None"]
+       {% set skips = skips ~ " or test_gather_scatter_cpu or test_index_put2_cpu " %}    # [linux and blas_impl != "mkl" and cuda_compiler_version != "None"]
        # these tests are failing with low -n values
        {% set skips = skips ~ " or test_base_does_not_require_grad_mode_nothing" %}
        {% set skips = skips ~ " or test_base_does_not_require_grad_mode_warn" %}
@@ -524,8 +500,7 @@
        - export OMP_NUM_THREADS=4  # [unix]
        # reduced paralellism to avoid OOM; test only one python version on aarch because emulation is super-slow
        # disable hypothesis because it randomly yields health check errors
-       - python -m pytest -n 2 {{ tests }} -k "not ({{ skips }})" -m "not hypothesis" --durations=50   # [unix and (not aarch64 or py==312)]
-       - python -m pytest -v -s {{ tests }} -k "not ({{ skips }})" -m "not hypothesis" --durations=50  # [win]
+       - pytest -v {{ tests }} -k "not ({{ skips }})" -m "not hypothesis" --durations=50 --disable-warnings  # [not aarch64 or py==312]

        # regression test for https://github.com/conda-forge/pytorch-cpu-feedstock/issues/329, where we picked up
        # duplicate `.pyc` files due to newest py-ver (3.13) in the build environment not matching the one in host;
@@ -540,19 +515,19 @@
 {% set pytorch_cpu_gpu = "pytorch-gpu" %}   # [cuda_compiler_version != "None"]
   - name: {{ pytorch_cpu_gpu }}
     build:
-      string: cuda{{ cuda_compiler_version | replace('.', '') }}_{{ blas_impl }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}                   # [megabuild and cuda_compiler_version != "None"]
-      string: cpu_{{ blas_impl }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}                                                                  # [megabuild and cuda_compiler_version == "None"]
-      string: cuda{{ cuda_compiler_version | replace('.', '') }}_{{ blas_impl }}py{{ CONDA_PY }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}   # [not megabuild and cuda_compiler_version != "None"]
-      string: cpu_{{ blas_impl }}_py{{ CONDA_PY }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}                                                 # [not megabuild and cuda_compiler_version == "None"]
+      string: cuda{{ cuda_compiler_version | replace('.', '') }}_{{ blas_impl }}_h{{ PKG_HASH }}_{{ build }}                   # [megabuild and cuda_compiler_version != "None"]
+      string: cpu_{{ blas_impl }}_h{{ PKG_HASH }}_{{ build }}                                                                  # [megabuild and cuda_compiler_version == "None"]
+      string: cuda{{ cuda_compiler_version | replace('.', '') }}_{{ blas_impl }}py{{ CONDA_PY }}_h{{ PKG_HASH }}_{{ build }}   # [not megabuild and cuda_compiler_version != "None"]
+      string: cpu_{{ blas_impl }}_py{{ CONDA_PY }}_h{{ PKG_HASH }}_{{ build }}                                                 # [not megabuild and cuda_compiler_version == "None"]
       detect_binary_files_with_prefix: false
       # weigh down cpu implementation and give cuda preference
       track_features:
         - pytorch-cpu   # [cuda_compiler_version == "None"]
     requirements:
       run:
-        - pytorch {{ version }}=cuda*_{{ blas_impl }}*{{ PKG_BUILDNUM }}  # [megabuild and cuda_compiler_version != "None"]
-        - pytorch {{ version }}=cpu_{{ blas_impl }}*{{ PKG_BUILDNUM }}    # [megabuild and cuda_compiler_version == "None"]
-        - {{ pin_subpackage("pytorch", exact=True) }}                     # [not megabuild]
+        - pytorch {{ version }}=cuda*_{{ blas_impl }}*{{ build }}  # [megabuild and cuda_compiler_version != "None"]
+        - pytorch {{ version }}=cpu_{{ blas_impl }}*{{ build }}    # [megabuild and cuda_compiler_version == "None"]
+        - {{ pin_subpackage("pytorch", exact=True) }}              # [not megabuild]
     test:
       imports:
         - torch
diff --git a/recipe/patches/0001-Force-usage-of-python-3-and-error-without-numpy.patch b/recipe/patches/0001-Force-usage-of-python-3-and-error-without-numpy.patch
index fda50bcc..5f175049 100644
--- a/recipe/patches/0001-Force-usage-of-python-3-and-error-without-numpy.patch
+++ b/recipe/patches/0001-Force-usage-of-python-3-and-error-without-numpy.patch
@@ -1,17 +1,17 @@
-From f3a0f9aab6dce56eea590b946f60256014b61bf7 Mon Sep 17 00:00:00 2001
+From b1493b8712c1fc4ad02b2640c191f3c7f1fc6c9d Mon Sep 17 00:00:00 2001
 From: Mark Harfouche
 Date: Sun, 1 Sep 2024 17:35:40 -0400
-Subject: [PATCH 01/21] Force usage of python 3 and error without numpy
+Subject: [PATCH 01/15] Force usage of python 3 and error without numpy

 ---
  cmake/Dependencies.cmake | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)

 diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
-index e78305e0a8e..15c62548601 100644
+index 1813f4418a2..36b507f4f6e 100644
 --- a/cmake/Dependencies.cmake
 +++ b/cmake/Dependencies.cmake
-@@ -861,9 +861,9 @@ if(BUILD_PYTHON)
+@@ -848,9 +848,9 @@ if(BUILD_PYTHON)
    if(USE_NUMPY)
      list(APPEND PYTHON_COMPONENTS NumPy)
    endif()
@@ -880,7 +867,7 @@ if(BUILD_PYTHON)
   if(Python_Development.Module_FOUND)
     if(USE_NUMPY)
       if(NOT Python_NumPy_FOUND)
diff --git a/recipe/patches/0002-Help-find-numpy.patch b/recipe/patches/0002-Help-find-numpy.patch
index d660deda..653c4b5d 100644
--- a/recipe/patches/0002-Help-find-numpy.patch
+++ b/recipe/patches/0002-Help-find-numpy.patch
@@ -1,17 +1,17 @@
-From 21c30036b5b86f403c0cf4426165d9a6a50edb1a Mon Sep 17 00:00:00 2001
+From e88ebf63cc47b4471e6be3142cda1c2483b4dc9b Mon Sep 17 00:00:00 2001
 From: Mark Harfouche
 Date: Tue, 1 Oct 2024 00:28:40 -0400
-Subject: [PATCH 02/21] Help find numpy
+Subject: [PATCH 02/15] Help find numpy

 ---
  tools/setup_helpers/cmake.py | 6 ++++++
  1 file changed, 6 insertions(+)

 diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py
-index 4b605fe5975..bde41323c76 100644
+index 84e4dad32d3..8ce7272bea8 100644
 --- a/tools/setup_helpers/cmake.py
 +++ b/tools/setup_helpers/cmake.py
-@@ -305,9 +305,15 @@ class CMake:
+@@ -306,9 +306,15 @@ class CMake:
          sys.exit(1)

          build_options.update(cmake__options)
diff --git a/recipe/patches/0003-Add-USE_SYSTEM_NVTX-option-138287.patch b/recipe/patches/0003-Add-USE_SYSTEM_NVTX-option-138287.patch
deleted file mode 100644
index d44513d4..00000000
--- a/recipe/patches/0003-Add-USE_SYSTEM_NVTX-option-138287.patch
+++ /dev/null
@@ -1,86 +0,0 @@
-From d1826af525db41eda5020a1404f5d5521d67a5dc Mon Sep 17 00:00:00 2001
-From: Jeongseok Lee
-Date: Sat, 19 Oct 2024 04:26:01 +0000
-Subject: [PATCH 03/21] Add USE_SYSTEM_NVTX option (#138287)
-
-## Summary
-
-We are currently [updating](https://github.com/conda-forge/pytorch-cpu-feedstock/pull/277) the [`conda-forge::pytorch`](https://anaconda.org/conda-forge/pytorch) package to version 2.5.0. This update includes a new dependency, the third_party/NVTX submodule. However, like other package management frameworks (e.g., apt), conda-forge prefers using system-installed packages instead of vendor-provided third-party packages.
-
-This pull request aims to add an option, `USE_SYSTEM_NVTX`, to select whether to use the vendored nvtx or the system-installed one, with the default being the vendored one (which is the current behavior).
-
-## Test Plan
-
-The `USE_SYSTEM_NVTX` option is tested by building the `conda-forge::pytorch` package with the change applied as a [patch](https://github.com/conda-forge/pytorch-cpu-feedstock/blob/cd1d2464dd14e48ae4bd2214e6885e2432de483e/recipe/patches/0005-Use-system-nvtx3.patch).
-Pull Request resolved: https://github.com/pytorch/pytorch/pull/138287
-Approved by: https://github.com/albanD
----
- CMakeLists.txt          |  2 ++
- cmake/public/cuda.cmake |  6 +++++-
- setup.py                | 16 +++++++++++++++-
- 3 files changed, 22 insertions(+), 2 deletions(-)
-
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 98593c2de97..ae3c3f2cbd5 100644
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -470,6 +470,7 @@ option(USE_SYSTEM_FXDIV "Use system-provided fxdiv." OFF)
- option(USE_SYSTEM_BENCHMARK "Use system-provided google benchmark." OFF)
- option(USE_SYSTEM_ONNX "Use system-provided onnx." OFF)
- option(USE_SYSTEM_XNNPACK "Use system-provided xnnpack." OFF)
-+OPTION(USE_SYSTEM_NVTX "Use system-provided nvtx." OFF)
- option(USE_GOLD_LINKER "Use ld.gold to link" OFF)
- if(USE_SYSTEM_LIBS)
-   set(USE_SYSTEM_CPUINFO ON)
-@@ -488,6 +489,7 @@ if(USE_SYSTEM_LIBS)
-   if(USE_NCCL)
-     set(USE_SYSTEM_NCCL ON)
-   endif()
-+  set(USE_SYSTEM_NVTX ON)
- endif()
-
- # /Z7 override option When generating debug symbols, CMake default to use the
-diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake
-index afc1bc12abf..152fbdbe6dd 100644
---- a/cmake/public/cuda.cmake
-+++ b/cmake/public/cuda.cmake
-@@ -170,7 +170,11 @@ else()
- endif()
-
- # nvToolsExt
--find_path(nvtx3_dir NAMES nvtx3 PATHS "${PROJECT_SOURCE_DIR}/third_party/NVTX/c/include" NO_DEFAULT_PATH)
-+if(USE_SYSTEM_NVTX)
-+  find_path(nvtx3_dir NAMES nvtx3)
-+else()
-+  find_path(nvtx3_dir NAMES nvtx3 PATHS "${PROJECT_SOURCE_DIR}/third_party/NVTX/c/include" NO_DEFAULT_PATH)
-+endif()
- find_package_handle_standard_args(nvtx3 DEFAULT_MSG nvtx3_dir)
- if(nvtx3_FOUND)
-   add_library(torch::nvtx3 INTERFACE IMPORTED)
-diff --git a/setup.py b/setup.py
-index 2b0cfa99d71..7174777ed4e 100644
---- a/setup.py
-+++ b/setup.py
-@@ -183,7 +183,21 @@
- #   USE_SYSTEM_LIBS (work in progress)
- #      Use system-provided libraries to satisfy the build dependencies.
- #      When turned on, the following cmake variables will be toggled as well:
--#        USE_SYSTEM_CPUINFO=ON USE_SYSTEM_SLEEF=ON BUILD_CUSTOM_PROTOBUF=OFF
-+#        USE_SYSTEM_CPUINFO=ON
-+#        USE_SYSTEM_SLEEF=ON
-+#        USE_SYSTEM_GLOO=ON
-+#        BUILD_CUSTOM_PROTOBUF=OFF
-+#        USE_SYSTEM_EIGEN_INSTALL=ON
-+#        USE_SYSTEM_FP16=ON
-+#        USE_SYSTEM_PTHREADPOOL=ON
-+#        USE_SYSTEM_PSIMD=ON
-+#        USE_SYSTEM_FXDIV=ON
-+#        USE_SYSTEM_BENCHMARK=ON
-+#        USE_SYSTEM_ONNX=ON
-+#        USE_SYSTEM_XNNPACK=ON
-+#        USE_SYSTEM_PYBIND11=ON
-+#        USE_SYSTEM_NCCL=ON
-+#        USE_SYSTEM_NVTX=ON
- #
- # USE_MIMALLOC
- #      Static link mimalloc into C10, and use mimalloc in alloc_cpu & alloc_free.
diff --git a/recipe/patches/0004-Update-sympy-version.patch b/recipe/patches/0003-Update-sympy-version.patch
similarity index 62%
rename from recipe/patches/0004-Update-sympy-version.patch
rename to recipe/patches/0003-Update-sympy-version.patch
index a73a7399..52df04c9 100644
--- a/recipe/patches/0004-Update-sympy-version.patch
+++ b/recipe/patches/0003-Update-sympy-version.patch
@@ -1,20 +1,20 @@
-From e3219c5fe8834753b0cf9e92be4d1ef1e874f370 Mon Sep 17 00:00:00 2001
+From 3fb6b3704a6359521e186bfd4c6644a56aa08d90 Mon Sep 17 00:00:00 2001
 From: Jeongseok Lee
 Date: Thu, 17 Oct 2024 15:04:05 -0700
-Subject: [PATCH 04/21] Update sympy version
+Subject: [PATCH 03/15] Update sympy version

 ---
  setup.py | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

 diff --git a/setup.py b/setup.py
-index 7174777ed4e..65be34e39b1 100644
+index a6a6db7b033..d73cec7dd86 100644
 --- a/setup.py
 +++ b/setup.py
-@@ -1158,7 +1158,7 @@ def main():
-         "typing-extensions>=4.8.0",
+@@ -1099,7 +1099,7 @@ def main():
+         "filelock",
+         "typing-extensions>=4.10.0",
          'setuptools ; python_version >= "3.12"',
 -        'sympy==1.12.1 ; python_version == "3.8"',
 -        'sympy==1.13.1 ; python_version >= "3.9"',
 +        'sympy>=1.13.1,!=1.13.2 ; python_version >= "3.9"',
          "networkx",
diff --git a/recipe/patches/0005-Fix-duplicate-linker-script.patch b/recipe/patches/0004-Fix-duplicate-linker-script.patch
similarity index 80%
rename from recipe/patches/0005-Fix-duplicate-linker-script.patch
rename to recipe/patches/0004-Fix-duplicate-linker-script.patch
index 49e6d72b..8458e4a8 100644
--- a/recipe/patches/0005-Fix-duplicate-linker-script.patch
+++ b/recipe/patches/0004-Fix-duplicate-linker-script.patch
@@ -1,17 +1,17 @@
-From 08a1f44fbc81324aa98d720dfb7b87a261923ac2 Mon Sep 17 00:00:00 2001
+From be785be20dab23d5cee88e13adf40150ce9ead3c Mon Sep 17 00:00:00 2001
 From: Jeongseok Lee
 Date: Sun, 3 Nov 2024 01:12:36 -0700
-Subject: [PATCH 05/21] Fix duplicate linker script
+Subject: [PATCH 04/15] Fix duplicate linker script

 ---
  setup.py | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

 diff --git a/setup.py b/setup.py
-index 65be34e39b1..b0e01e0d1ee 100644
+index d73cec7dd86..75fdfce7e35 100644
 --- a/setup.py
 +++ b/setup.py
-@@ -1184,7 +1184,9 @@ def main():
+@@ -1125,7 +1125,9 @@ def main():
          filein="cmake/prioritized_text.txt", fout="cmake/linker_script.ld"
      )
      linker_script_path = os.path.abspath("cmake/linker_script.ld")
diff --git a/recipe/patches/0008-Allow-overriding-CUDA-related-paths.patch b/recipe/patches/0005-Allow-overriding-CUDA-related-paths.patch
similarity index 89%
rename from recipe/patches/0008-Allow-overriding-CUDA-related-paths.patch
rename to recipe/patches/0005-Allow-overriding-CUDA-related-paths.patch
index b52d1588..23d83bab 100644
--- a/recipe/patches/0008-Allow-overriding-CUDA-related-paths.patch
+++ b/recipe/patches/0005-Allow-overriding-CUDA-related-paths.patch
@@ -1,7 +1,7 @@
-From f03bf82d9da9cccb2cf4d4833c1a6349622dc37d Mon Sep 17 00:00:00 2001
+From e0cb086099287bd51fdbe8e6f847ec2d0646f085 Mon Sep 17 00:00:00 2001
 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?=
 Date: Wed, 27 Nov 2024 13:47:23 +0100
-Subject: [PATCH 08/21] Allow overriding CUDA-related paths
+Subject: [PATCH 05/15] Allow overriding CUDA-related paths

 ---
  cmake/Modules/FindCUDAToolkit.cmake | 2 +-
@@ -22,10 +22,10 @@ index ec9ae530aa6..b7c0bd9fc51 100644
  set(CUDAToolkit_LIBRARY_ROOT "${CMAKE_CUDA_COMPILER_LIBRARY_ROOT}")
  set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_TOOLKIT_VERSION}")
  diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py
-index bde41323c76..b171837cd4a 100644
+index 8ce7272bea8..67b2b495c62 100644
 --- a/tools/setup_helpers/cmake.py
 +++ b/tools/setup_helpers/cmake.py
-@@ -252,7 +252,7 @@ class CMake:
+@@ -253,7 +253,7 @@ class CMake:
          true_var = additional_options.get(var)
          if true_var is not None:
              build_options[true_var] = val
diff --git a/recipe/patches/0011-Use-BLAS_USE_CBLAS_DOT-for-OpenBLAS-builds.patch b/recipe/patches/0006-Use-BLAS_USE_CBLAS_DOT-for-OpenBLAS-builds.patch
similarity index 86%
rename from recipe/patches/0011-Use-BLAS_USE_CBLAS_DOT-for-OpenBLAS-builds.patch
rename to recipe/patches/0006-Use-BLAS_USE_CBLAS_DOT-for-OpenBLAS-builds.patch
index c7b201b6..2ababacc 100644
--- a/recipe/patches/0011-Use-BLAS_USE_CBLAS_DOT-for-OpenBLAS-builds.patch
+++ b/recipe/patches/0006-Use-BLAS_USE_CBLAS_DOT-for-OpenBLAS-builds.patch
@@ -1,7 +1,7 @@
-From 56f1528fa072023fb2724d5abf8790f2f6cc3aaa Mon Sep 17 00:00:00 2001
+From 7e7547dab6c26e7fd324fde6cb6aad5d57bebcf9 Mon Sep 17 00:00:00 2001
 From: Isuru Fernando
 Date: Wed, 18 Dec 2024 03:59:00 +0000
-Subject: [PATCH 11/21] Use BLAS_USE_CBLAS_DOT for OpenBLAS builds
+Subject: [PATCH 06/15] Use BLAS_USE_CBLAS_DOT for OpenBLAS builds

 There are two calling conventions for *dotu functions

@@ -31,10 +31,10 @@ functional calls.
  1 file changed, 1 insertion(+)

 diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
-index 15c62548601..3965416eb29 100644
+index 36b507f4f6e..b94993c34ba 100644
 --- a/cmake/Dependencies.cmake
 +++ b/cmake/Dependencies.cmake
-@@ -182,6 +182,7 @@ elseif(BLAS STREQUAL "OpenBLAS")
+@@ -181,6 +181,7 @@ elseif(BLAS STREQUAL "OpenBLAS")
    set(BLAS_INFO "open")
    set(BLAS_FOUND 1)
    set(BLAS_LIBRARIES ${OpenBLAS_LIB})
diff --git a/recipe/patches/0006-fix-3.13-pickle-error-in-serialization.py-136034.patch b/recipe/patches/0006-fix-3.13-pickle-error-in-serialization.py-136034.patch
deleted file mode 100644
index 99baed0a..00000000
--- a/recipe/patches/0006-fix-3.13-pickle-error-in-serialization.py-136034.patch
+++ /dev/null
@@ -1,46 +0,0 @@
-From 15df314a41c69a31c0443254d5552aa1b39d708d Mon Sep 17 00:00:00 2001
-From: William Wen
-Date: Fri, 13 Sep 2024 13:02:33 -0700
-Subject: [PATCH 06/21] fix 3.13 pickle error in serialization.py (#136034)
-
-Error encountered when adding dynamo 3.13 support.
-Pull Request resolved: https://github.com/pytorch/pytorch/pull/136034
-Approved by: https://github.com/albanD
----
- torch/serialization.py | 16 ++++++++++++----
- 1 file changed, 12 insertions(+), 4 deletions(-)
-
-diff --git a/torch/serialization.py b/torch/serialization.py
-index d936d31d6f5..d937680c031 100644
---- a/torch/serialization.py
-+++ b/torch/serialization.py
-@@ -1005,8 +1005,12 @@ def _legacy_save(obj, f, pickle_module, pickle_protocol) -> None:
-     pickle_module.dump(MAGIC_NUMBER, f, protocol=pickle_protocol)
-     pickle_module.dump(PROTOCOL_VERSION, f, protocol=pickle_protocol)
-     pickle_module.dump(sys_info, f, protocol=pickle_protocol)
--    pickler = pickle_module.Pickler(f, protocol=pickle_protocol)
--    pickler.persistent_id = persistent_id
-+
-+    class PyTorchLegacyPickler(pickle_module.Pickler):
-+        def persistent_id(self, obj):
-+            return persistent_id(obj)
-+
-+    pickler = PyTorchLegacyPickler(f, protocol=pickle_protocol)
-     pickler.dump(obj)
-
-     serialized_storage_keys = sorted(serialized_storages.keys())
-@@ -1083,8 +1087,12 @@ def _save(
-
-     # Write the pickle data for `obj`
-     data_buf = io.BytesIO()
--    pickler = pickle_module.Pickler(data_buf, protocol=pickle_protocol)
--    pickler.persistent_id = persistent_id
-+
-+    class PyTorchPickler(pickle_module.Pickler):  # type: ignore[name-defined]
-+        def persistent_id(self, obj):
-+            return persistent_id(obj)
-+
-+    pickler = PyTorchPickler(data_buf, protocol=pickle_protocol)
-     pickler.dump(obj)
-     data_value = data_buf.getvalue()
-     zip_file.write_record("data.pkl", data_value, len(data_value))
diff --git a/recipe/patches/0007-Allow-users-to-overwrite-ld-with-environment-variabl.patch b/recipe/patches/0007-Allow-users-to-overwrite-ld-with-environment-variabl.patch
deleted file mode 100644
index ae6a94cd..00000000
--- a/recipe/patches/0007-Allow-users-to-overwrite-ld-with-environment-variabl.patch
+++ /dev/null
@@ -1,32 +0,0 @@
-From 655f694854c3eafdd631235b60bc6c1b279218ed Mon Sep 17 00:00:00 2001
-From: Mark Harfouche
-Date: Thu, 3 Oct 2024 22:49:56 -0400
-Subject: [PATCH 07/21] Allow users to overwrite ld with environment variables
-
-This should help in the case of cross compilation.
-
-xref: https://github.com/conda-forge/pytorch-cpu-feedstock/pull/261
----
- tools/setup_helpers/generate_linker_script.py | 5 +++--
- 1 file changed, 3 insertions(+), 2 deletions(-)
-
-diff --git a/tools/setup_helpers/generate_linker_script.py b/tools/setup_helpers/generate_linker_script.py
-index 11c397a9e5f..e66fc197062 100644
---- a/tools/setup_helpers/generate_linker_script.py
-+++ b/tools/setup_helpers/generate_linker_script.py
-@@ -1,3 +1,4 @@
-+import os
- import subprocess
-
-
-@@ -9,8 +10,8 @@ def gen_linker_script(
-     prioritized_text = [
-         line.replace("\n", "") for line in prioritized_text if line != "\n"
-     ]
--
--    linker_script_lines = subprocess.check_output(["ld", "-verbose"], text=True).split(
-+    ld = os.environ.get("LD", "ld")
-+    linker_script_lines = subprocess.check_output([ld, "-verbose"], text=True).split(
-     "\n"
- )
-
diff --git a/recipe/patches/0012-fix-issue-142484.patch b/recipe/patches/0007-fix-issue-142484.patch
similarity index 83%
rename from recipe/patches/0012-fix-issue-142484.patch
rename to recipe/patches/0007-fix-issue-142484.patch
index db13f7ac..30674b10 100644
--- a/recipe/patches/0012-fix-issue-142484.patch
+++ b/recipe/patches/0007-fix-issue-142484.patch
@@ -1,7 +1,7 @@
-From beba58d724cc1bd7ca73660b0a5ad9e61ae0c562 Mon Sep 17 00:00:00 2001
+From 63f0d3218792d874650a7926f2b956ecbe74eac0 Mon Sep 17 00:00:00 2001
 From: "Zheng, Zhaoqiong"
 Date: Fri, 27 Dec 2024 13:49:36 +0800
-Subject: [PATCH 12/21] fix issue 142484
+Subject: [PATCH 07/15] fix issue 142484

 From https://github.com/pytorch/pytorch/pull/143894
 ---
@@ -9,10 +9,10 @@ From https://github.com/pytorch/pytorch/pull/143894
  1 file changed, 11 insertions(+), 1 deletion(-)

 diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp
-index e26cfbf6d8e..c61b76d3205 100644
+index 3d777ecdcf8..2227e492dea 100644
 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp
 +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp
-@@ -477,7 +477,17 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes,
+@@ -478,7 +478,17 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes,
    const auto value_type = c10::toRealValueType(input.scalar_type());
    out.resize_(batched_out_sizes, MemoryFormat::Contiguous);
diff --git a/recipe/patches/0013-Fix-FindOpenBLAS.patch b/recipe/patches/0008-Fix-FindOpenBLAS.patch
similarity index 93%
rename from recipe/patches/0013-Fix-FindOpenBLAS.patch
rename to recipe/patches/0008-Fix-FindOpenBLAS.patch
index 49d43f90..6a430787 100644
--- a/recipe/patches/0013-Fix-FindOpenBLAS.patch
+++ b/recipe/patches/0008-Fix-FindOpenBLAS.patch
@@ -1,7 +1,7 @@
-From 816a248a4425a97350959e412666e6db9012a52e Mon Sep 17 00:00:00 2001
+From 6e00778c46305f6a36670fa99a326c2426203a42 Mon Sep 17 00:00:00 2001
 From: Bas Zalmstra
 Date: Thu, 16 May 2024 10:46:49 +0200
-Subject: [PATCH 13/21] Fix FindOpenBLAS
+Subject: [PATCH 08/15] Fix FindOpenBLAS

 ---
  cmake/Modules/FindOpenBLAS.cmake | 15 +++++++++------
diff --git a/recipe/patches/0009-Fix-test-test_linalg.py-for-NumPy-2-136800.patch b/recipe/patches/0009-Fix-test-test_linalg.py-for-NumPy-2-136800.patch
deleted file mode 100644
index 7d9d1ab5..00000000
--- a/recipe/patches/0009-Fix-test-test_linalg.py-for-NumPy-2-136800.patch
+++ /dev/null
@@ -1,77 +0,0 @@
-From 4b1faf6ba142953ce2730766db44f8d98d161ef0 Mon Sep 17 00:00:00 2001
-From: Haifeng Jin
-Date: Tue, 1 Oct 2024 07:53:24 +0000
-Subject: [PATCH 09/21] Fix test/test_linalg.py for NumPy 2 (#136800)
-
-Related to #107302.
-
-When built and tested with NumPy 2 the following unit tests failed.
-
-```
-=========================================================== short test summary info ============================================================
-FAILED [0.0026s] test/test_linalg.py::TestLinalgCPU::test_householder_product_cpu_complex128 - TypeError: expected np.ndarray (got Tensor)
-FAILED [0.0024s] test/test_linalg.py::TestLinalgCPU::test_householder_product_cpu_complex64 - TypeError: expected np.ndarray (got Tensor)
-FAILED [0.0025s] test/test_linalg.py::TestLinalgCPU::test_householder_product_cpu_float32 - TypeError: expected np.ndarray (got Tensor)
-FAILED [0.0024s] test/test_linalg.py::TestLinalgCPU::test_householder_product_cpu_float64 - TypeError: expected np.ndarray (got Tensor)
-FAILED [0.0016s] test/test_linalg.py::TestLinalgCPU::test_nuclear_norm_axes_small_brute_force_old_cpu - ValueError: Unable to avoid copy while creating an array as requested.
-FAILED [0.0054s] test/test_linalg.py::TestLinalgCPU::test_solve_cpu_complex128 - AssertionError: The values for attribute 'shape' do not match: torch.Size([0, 0]) != torch.Size([0, 0, 0]).
-FAILED [0.0055s] test/test_linalg.py::TestLinalgCPU::test_solve_cpu_complex64 - AssertionError: The values for attribute 'shape' do not match: torch.Size([0, 0]) != torch.Size([0, 0, 0]).
-FAILED [0.0048s] test/test_linalg.py::TestLinalgCPU::test_solve_cpu_float32 - AssertionError: The values for attribute 'shape' do not match: torch.Size([0, 0]) != torch.Size([0, 0, 0]).
-FAILED [0.0054s] test/test_linalg.py::TestLinalgCPU::test_solve_cpu_float64 - AssertionError: The values for attribute 'shape' do not match: torch.Size([0, 0]) != torch.Size([0, 0, 0]).
-=========================================== 9 failed, 1051 passed, 118 skipped in 152.51s (0:02:32) ============================================
-```
-
-This PR fixes them. The test is now compatible with both NumPy 1 & 2.
-
-Some more details:
-
-1. The `np.linalg.solve` has changed its behavior. So I added an adapt function in the unit test to keep its behavior the same no matter it is NumPy 1 or Numpy 2.
-2. The cause of the failure is when passing a `torch.Tensor` to `np.linalg.qr`, the return type in NumPy 1 is `(np.ndarray, np.ndarray)`, while it is `(torch.Tensor, torch.Tensor)` in NumPy 2.
-3. NumPy 2 does not allow `np.array(obj, copy=False)`, but recommended to use `np.asarray(obj)` instead.
-
-Pull Request resolved: https://github.com/pytorch/pytorch/pull/136800
-Approved by: https://github.com/lezcano
----
- test/test_linalg.py | 15 ++++++++++++---
- 1 file changed, 12 insertions(+), 3 deletions(-)
-
-diff --git a/test/test_linalg.py b/test/test_linalg.py
-index e9ec874d695..060bccef2e5 100644
---- a/test/test_linalg.py
-+++ b/test/test_linalg.py
-@@ -2351,7 +2351,7 @@ class TestLinalg(TestCase):
-         if self.device_type != 'cpu' and randrange(100) < 95:
-             return  # too many cpu <==> device copies
-
--        a = np.array(x.cpu(), copy=False)
-+        a = np.asarray(x.cpu())
-         expected = np.linalg.norm(a, "nuc", axis=axes)
-
-         ans = torch.norm(x, "nuc", dim=axes)
-@@ -3082,7 +3082,14 @@ class TestLinalg(TestCase):
-             self.assertEqual(b.expand_as(Ax), Ax)
-
-             # Check against NumPy
--            expected = np.linalg.solve(A.cpu().numpy(), b.expand_as(x).cpu().numpy())
-+            if rhs == ():
-+                # In NumPy 2, "b" can no longer be a vector (i.e. rhs == ()) if has batch dimensions.
-+                # So, reshape it to a matrix and back. Related documentation:
-+                # https://numpy.org/doc/1.26/reference/generated/numpy.linalg.solve.html
-+                # https://numpy.org/doc/2.0/reference/generated/numpy.linalg.solve.html
-+                expected = np.linalg.solve(A.cpu().numpy(), b.cpu().numpy().reshape(*b.shape, 1)).reshape(b.shape)
-+            else:
-+                expected = np.linalg.solve(A.cpu().numpy(), b.cpu().numpy())
-             self.assertEqual(x, expected)
-
-     batches = [(), (0, ), (3, ), (2, 3)]
-@@ -5234,7 +5241,9 @@ class TestLinalg(TestCase):
-         tau_shape = [*A_cpu.shape[:-2], A_cpu.shape[-1]]
-         tau = torch.empty(tau_shape, dtype=dtype).view(-1, A_cpu.shape[-1])
-         for A_i, reflectors_i, tau_i in zip(A_cpu.contiguous().view(*flattened_batch_shape), reflectors, tau):
--            reflectors_tmp, tau_i[:] = map(torch.from_numpy, np.linalg.qr(A_i, mode='raw'))
-+            reflectors_tmp, tau_i[:] = (
-+                torch.from_numpy(x) if isinstance(x, np.ndarray) else x for x in np.linalg.qr(A_i, mode='raw')
-+            )
-             reflectors_i[:] = reflectors_tmp.T
-         reflectors = reflectors.view(*A_cpu.shape)
-         tau = tau.view(tau_shape)
diff --git a/recipe/patches/0015-simplify-torch.utils.cpp_extension.include_paths-use.patch b/recipe/patches/0009-simplify-torch.utils.cpp_extension.include_paths-use.patch
similarity index 83%
rename from recipe/patches/0015-simplify-torch.utils.cpp_extension.include_paths-use.patch
rename to recipe/patches/0009-simplify-torch.utils.cpp_extension.include_paths-use.patch
index 6cf5ea9c..8b898e57 100644
--- a/recipe/patches/0015-simplify-torch.utils.cpp_extension.include_paths-use.patch
+++ b/recipe/patches/0009-simplify-torch.utils.cpp_extension.include_paths-use.patch
@@ -1,7 +1,7 @@
-From 33790dfbf966e7d8ea4ff6798d2ff92474d84079 Mon Sep 17 00:00:00 2001
+From 12a4473ae7a47da2a30121f329a2c3c8f3f456c5 Mon Sep 17 00:00:00 2001
 From: "H. Vetinari"
 Date: Thu, 23 Jan 2025 22:46:58 +1100
-Subject: [PATCH 15/21] simplify torch.utils.cpp_extension.include_paths; use
+Subject: [PATCH 09/15] simplify torch.utils.cpp_extension.include_paths; use
  it in cpp_builder

 The /TH headers have not existed since pytorch 1.11
@@ -11,10 +11,10 @@ The /TH headers have not existed since pytorch 1.11
  2 files changed, 3 insertions(+), 14 deletions(-)

 diff --git a/torch/_inductor/cpp_builder.py b/torch/_inductor/cpp_builder.py
-index 95a0bff86fd..860e7fb062f 100644
+index 92cf88df8eb..9328e3f96e2 100644
 --- a/torch/_inductor/cpp_builder.py
 +++ b/torch/_inductor/cpp_builder.py
-@@ -743,16 +743,9 @@ def _get_build_args_of_chosen_isa(vec_isa: VecISA) -> Tuple[List[str], List[str]
+@@ -764,16 +764,9 @@ def _get_build_args_of_chosen_isa(vec_isa: VecISA) -> Tuple[List[str], List[str]
  def _get_torch_related_args(
      include_pytorch: bool, aot_mode: bool
  ) -> Tuple[List[str], List[str], List[str]]:
@@ -35,10 +35,10 @@ index 95a0bff86fd..860e7fb062f 100644
      libraries = []
      if sys.platform != "darwin" and not config.is_fbcode():
 diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
-index aaa45ea4c90..3f584ef5598 100644
+index b4a70dcc06e..23e2499903c 100644
 --- a/torch/utils/cpp_extension.py
 +++ b/torch/utils/cpp_extension.py
-@@ -1159,10 +1159,6 @@ def include_paths(cuda: bool = False) -> List[str]:
+@@ -1212,10 +1212,6 @@ def include_paths(device_type: str = "cpu") -> List[str]:
      lib_include,
      # Remove this once torch/torch.h is officially no longer supported for C++ extensions.
os.path.join(lib_include, 'torch', 'csrc', 'api', 'include'), @@ -47,5 +47,5 @@ index aaa45ea4c90..3f584ef5598 100644 - os.path.join(lib_include, 'TH'), - os.path.join(lib_include, 'THC') ] - if cuda and IS_HIP_EXTENSION: + if device_type == "cuda" and IS_HIP_EXTENSION: paths.append(os.path.join(lib_include, 'THH')) diff --git a/recipe/patches/0010-Fixes-NumPy-2-test-failures-in-test_torch.py-137740.patch b/recipe/patches/0010-Fixes-NumPy-2-test-failures-in-test_torch.py-137740.patch deleted file mode 100644 index c28fe93a..00000000 --- a/recipe/patches/0010-Fixes-NumPy-2-test-failures-in-test_torch.py-137740.patch +++ /dev/null @@ -1,60 +0,0 @@ -From 032b9be9ca7f9ae174e75554cecc82600ea3ef54 Mon Sep 17 00:00:00 2001 -From: Haifeng Jin -Date: Sat, 12 Oct 2024 02:40:17 +0000 -Subject: [PATCH 10/21] Fixes NumPy 2 test failures in test_torch.py (#137740) - -Related to #107302 - -The breakages are caused by backward incompatibility between NumPy 1 and NumPy 2. -This PR fixes all the corresponding test failures in `test_torch.py`. - -1. The dtype of the return value `np.percentile` when passed a `torch.float32` tensor. -NumPy 1: Return value of `np.float64`. -NumPy 2: Return value of `np.float32`. -Solution: Enforce it with `.astype(np.float64)`. - -2. The type of `np.gradient()` when returning multiple arrays. -NumPy1: A list of arrays. -NumPy2: A tuple of arrays. -Solution: Cast the tuple to a list. -Pull Request resolved: https://github.com/pytorch/pytorch/pull/137740 -Approved by: https://github.com/ezyang ---- - test/test_torch.py | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/test/test_torch.py b/test/test_torch.py -index be4d6180819..c6fd6ac9f19 100644 ---- a/test/test_torch.py -+++ b/test/test_torch.py -@@ -2891,7 +2891,7 @@ else: - - # if the given input arg is not a list, it returns a list of single element: [arg] - def _wrap_to_list(self, input_array): -- return input_array if isinstance(input_array, list) else [input_array] -+ return list(input_array) if isinstance(input_array, (list, tuple)) else [input_array] - - # To ensure inf, -inf, and nan values do not cause divergence between Numpy and PyTorch. - # There are two types of possible divergence: -@@ -3029,7 +3029,7 @@ else: - # Result is given just as real number and all the imaginary parts to be equal to zero. 
- self.assertEqual(expected[i].imag, torch.zeros(actual[i].shape), exact_dtype=False) - else: -- actual, expected = self._inf_nan_preprocess(list(actual), expected) -+ actual, expected = self._inf_nan_preprocess(list(actual), list(expected)) - self.assertEqual(actual, expected, equal_nan=True, exact_dtype=False) - - @onlyNativeDeviceTypes -@@ -7549,10 +7549,10 @@ class TestTorch(TestCase): - torch.mean(sample, dim=0), torch.full((d,), 0.5), atol=2, rtol=2 - ) - torch.testing.assert_close( -- np.percentile(sample, 25, axis=0), np.repeat(0.25, d), atol=2, rtol=2 -+ np.percentile(sample, 25, axis=0).astype(np.float64), np.repeat(0.25, d), atol=2, rtol=2 - ) - torch.testing.assert_close( -- np.percentile(sample, 75, axis=0), np.repeat(0.75, d), atol=2, rtol=2 -+ np.percentile(sample, 75, axis=0).astype(np.float64), np.repeat(0.75, d), atol=2, rtol=2 - ) - - @skipIfTorchDynamo("np.float64 restored as float32 after graph break.") diff --git a/recipe/patches/0016-point-include-paths-to-PREFIX-include.patch b/recipe/patches/0010-point-include-paths-to-PREFIX-include.patch similarity index 51% rename from recipe/patches/0016-point-include-paths-to-PREFIX-include.patch rename to recipe/patches/0010-point-include-paths-to-PREFIX-include.patch index ed6b74f6..4a03ba8d 100644 --- a/recipe/patches/0016-point-include-paths-to-PREFIX-include.patch +++ b/recipe/patches/0010-point-include-paths-to-PREFIX-include.patch @@ -1,24 +1,33 @@ -From 799f6fa59dac93dabbbcf72d46f4e1334e3d65d9 Mon Sep 17 00:00:00 2001 +From 2e9805edf7c26bf7890a8704460047592fff3a79 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 23 Jan 2025 22:58:14 +1100 -Subject: [PATCH 16/21] point include paths to $PREFIX/include +Subject: [PATCH 10/15] point include paths to $PREFIX/include --- - torch/utils/cpp_extension.py | 9 +++++++++ - 1 file changed, 9 insertions(+) + torch/utils/cpp_extension.py | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py -index 3f584ef5598..4210f62b6db 100644 +index 23e2499903c..a8caba3c058 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py -@@ -1155,10 +1155,19 @@ def include_paths(cuda: bool = False) -> List[str]: +@@ -1208,10 +1208,28 @@ def include_paths(device_type: str = "cpu") -> List[str]: A list of include path strings. """ lib_include = os.path.join(_TORCH_PATH, 'include') -+ if os.environ.get("CONDA_BUILD", None) is not None: ++ if (os.environ.get("CONDA_BUILD", None) is not None ++ and os.environ.get("CONDA_BUILD_CROSS_COMPILATION", None) not in (None, "", "0")): ++ # to avoid problems in cross-compilation, we need to point to the same environment ++ # where the currently running pytorch is -- i.e. the BUILD_PREFIX. 
See ++ # https://github.com/conda-forge/pytorch-cpu-feedstock/issues/349 ++ pieces = [os.environ["BUILD_PREFIX"]] + IS_WINDOWS * ["Library"] + ["include"] ++ lib_include = os.path.join(*pieces) ++ elif os.environ.get("CONDA_BUILD", None) is not None: ++ # regular build (& testing) phase --> PREFIX is set + pieces = [os.environ["PREFIX"]] + IS_WINDOWS * ["Library"] + ["include"] + lib_include = os.path.join(*pieces) + elif os.environ.get("CONDA_PREFIX", None) is not None: ++ # final environment + pieces = [os.environ["CONDA_PREFIX"]] + IS_WINDOWS * ["Library"] + ["include"] + lib_include = os.path.join(*pieces) paths = [ @@ -29,5 +38,5 @@ index 3f584ef5598..4210f62b6db 100644 + # $PREFIX/include), as some torch-internal headers are still in this directory + os.path.join(_TORCH_PATH, 'include'), ] - if cuda and IS_HIP_EXTENSION: + if device_type == "cuda" and IS_HIP_EXTENSION: paths.append(os.path.join(lib_include, 'THH')) diff --git a/recipe/patches/0017-Add-conda-prefix-to-inductor-include-paths.patch b/recipe/patches/0011-Add-conda-prefix-to-inductor-include-paths.patch similarity index 79% rename from recipe/patches/0017-Add-conda-prefix-to-inductor-include-paths.patch rename to recipe/patches/0011-Add-conda-prefix-to-inductor-include-paths.patch index aff55f95..00619aca 100644 --- a/recipe/patches/0017-Add-conda-prefix-to-inductor-include-paths.patch +++ b/recipe/patches/0011-Add-conda-prefix-to-inductor-include-paths.patch @@ -1,7 +1,7 @@ -From 9f73a02bacf9680833ac64657fde6762d33ab200 Mon Sep 17 00:00:00 2001 +From e8eef4b33903af5886cbde7b4342ebc2705933ef Mon Sep 17 00:00:00 2001 From: Daniel Petry Date: Tue, 21 Jan 2025 17:45:23 -0600 -Subject: [PATCH 17/21] Add conda prefix to inductor include paths +Subject: [PATCH 11/15] Add conda prefix to inductor include paths Currently inductor doesn't look in conda's includes and libs. This results in errors when it tries to compile, if system versions are being used of @@ -14,10 +14,10 @@ end user provides a _compile_flags.json file. 1 file changed, 1 insertion(+) diff --git a/torch/_inductor/cpp_builder.py b/torch/_inductor/cpp_builder.py -index 860e7fb062f..76c61375d91 100644 +index 9328e3f96e2..5b455ffee17 100644 --- a/torch/_inductor/cpp_builder.py +++ b/torch/_inductor/cpp_builder.py -@@ -1048,6 +1048,7 @@ def get_cpp_torch_options( +@@ -1064,6 +1064,7 @@ def get_cpp_torch_options( + python_include_dirs + torch_include_dirs + omp_include_dir_paths diff --git a/recipe/patches/0018-make-ATEN_INCLUDE_DIR-relative-to-TORCH_INSTALL_PREF.patch b/recipe/patches/0012-make-ATEN_INCLUDE_DIR-relative-to-TORCH_INSTALL_PREF.patch similarity index 83% rename from recipe/patches/0018-make-ATEN_INCLUDE_DIR-relative-to-TORCH_INSTALL_PREF.patch rename to recipe/patches/0012-make-ATEN_INCLUDE_DIR-relative-to-TORCH_INSTALL_PREF.patch index 426e6015..125face3 100644 --- a/recipe/patches/0018-make-ATEN_INCLUDE_DIR-relative-to-TORCH_INSTALL_PREF.patch +++ b/recipe/patches/0012-make-ATEN_INCLUDE_DIR-relative-to-TORCH_INSTALL_PREF.patch @@ -1,7 +1,7 @@ -From b0cfa0f728e96a3a9d6f7434e2c02d74d6daa9a9 Mon Sep 17 00:00:00 2001 +From 7c955a22b748da66317f69f49f2e99d826083a0d Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Tue, 28 Jan 2025 14:15:34 +1100 -Subject: [PATCH 18/21] make ATEN_INCLUDE_DIR relative to TORCH_INSTALL_PREFIX +Subject: [PATCH 12/15] make ATEN_INCLUDE_DIR relative to TORCH_INSTALL_PREFIX we cannot set CMAKE_INSTALL_PREFIX without the pytorch build complaining, but we can use TORCH_INSTALL_PREFIX, which is set correctly relative to our CMake files already: @@ -11,10 +11,10 @@ https://github.com/pytorch/pytorch/blob/v2.5.1/cmake/TorchConfig.cmake.in#L47 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt -index 6d9152a4d07..aa4dd7b05cc 100644 +index f0868ea0489..c20ea20aa32 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt -@@ -563,7 +563,7 @@ if(USE_ROCM) +@@ -604,7 +604,7 @@ if(USE_ROCM) # list(APPEND ATen_HIP_DEPENDENCY_LIBS ATEN_CUDA_FILES_GEN_LIB) endif() diff --git a/recipe/patches/0019-remove-DESTINATION-lib-from-CMake-install-TARGETS-di.patch b/recipe/patches/0013-remove-DESTINATION-lib-from-CMake-install-TARGETS-di.patch similarity index 90% rename from recipe/patches/0019-remove-DESTINATION-lib-from-CMake-install-TARGETS-di.patch rename to recipe/patches/0013-remove-DESTINATION-lib-from-CMake-install-TARGETS-di.patch index 78950986..36b52de7 100644 --- a/recipe/patches/0019-remove-DESTINATION-lib-from-CMake-install-TARGETS-di.patch +++ b/recipe/patches/0013-remove-DESTINATION-lib-from-CMake-install-TARGETS-di.patch @@ -1,7 +1,7 @@ -From f7db4cbfb0af59027ed8bdcd0387dba6fbcb1192 Mon Sep 17 00:00:00 2001 +From c4cc82934faf2e32ab283a2ec2d9266049db9872 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 28 Jan 2025 10:58:29 +1100 -Subject: [PATCH 19/21] remove `DESTINATION lib` from CMake `install(TARGETS` +Subject: [PATCH 13/15] remove `DESTINATION lib` from CMake `install(TARGETS` directives Suggested-By: Silvio Traversaro @@ -16,10 +16,10 @@ Suggested-By: Silvio Traversaro 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt -index 80e172497d5..d7f8987020d 100644 +index 34577caef2e..8f00b3d8cb1 100644 --- a/c10/CMakeLists.txt +++ b/c10/CMakeLists.txt -@@ -162,7 +162,7 @@ if(NOT BUILD_LIBTORCHLESS) +@@ -163,7 +163,7 @@ if(NOT BUILD_LIBTORCHLESS) # Note: for now, we will put all export path into one single Caffe2Targets group # to deal with the cmake deployment need. Inside the Caffe2Targets set, the # individual libraries like libc10.so and libcaffe2.so are still self-contained. 
@@ -68,10 +68,10 @@ index 01f77d61713..437ade657f9 100644 add_subdirectory(test) endif() diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -index 9be7f3732f3..b51c7cc637b 100644 +index 33199c74b8e..fc858f3e0de 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt -@@ -549,7 +549,7 @@ if(USE_CUDA) +@@ -557,7 +557,7 @@ if(USE_CUDA) endif() target_link_libraries(caffe2_nvrtc PRIVATE caffe2::nvrtc ${DELAY_LOAD_FLAGS}) @@ -80,8 +80,8 @@ index 9be7f3732f3..b51c7cc637b 100644 if(USE_NCCL) list(APPEND Caffe2_GPU_SRCS ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp) -@@ -609,7 +609,7 @@ if(USE_ROCM) - target_link_libraries(caffe2_nvrtc ${PYTORCH_HIP_LIBRARIES} ${ROCM_HIPRTC_LIB}) +@@ -628,7 +628,7 @@ if(USE_ROCM) + target_link_libraries(caffe2_nvrtc hip::amdhip64 hiprtc::hiprtc) target_include_directories(caffe2_nvrtc PRIVATE ${CMAKE_BINARY_DIR}) target_compile_definitions(caffe2_nvrtc PRIVATE USE_ROCM __HIP_PLATFORM_AMD__) - install(TARGETS caffe2_nvrtc DESTINATION "${TORCH_INSTALL_LIB_DIR}") @@ -89,7 +89,7 @@ index 9be7f3732f3..b51c7cc637b 100644 endif() if(NOT NO_API AND NOT BUILD_LITE_INTERPRETER) -@@ -995,7 +995,7 @@ elseif(USE_CUDA) +@@ -1031,7 +1031,7 @@ elseif(USE_CUDA) CUDA::culibos ${CMAKE_DL_LIBS}) endif() set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp PROPERTIES COMPILE_FLAGS "-DBUILD_LAZY_CUDA_LINALG") @@ -98,7 +98,7 @@ index 9be7f3732f3..b51c7cc637b 100644 endif() if(USE_PRECOMPILED_HEADERS) -@@ -1467,17 +1467,17 @@ endif() +@@ -1517,17 +1517,17 @@ endif() caffe2_interface_library(torch torch_library) @@ -121,7 +121,7 @@ index 9be7f3732f3..b51c7cc637b 100644 target_link_libraries(torch PUBLIC torch_cpu_library) -@@ -1616,7 +1616,7 @@ if(BUILD_SHARED_LIBS) +@@ -1666,7 +1666,7 @@ if(BUILD_SHARED_LIBS) target_link_libraries(torch_global_deps torch::nvtoolsext) endif() endif() @@ -131,10 +131,10 @@ index 9be7f3732f3..b51c7cc637b 100644 # ---[ Caffe2 HIP sources. diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt -index c74b45431c9..80fb5e7734e 100644 +index b123023d2fd..650319cb1ee 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt -@@ -447,7 +447,7 @@ if(NOT TORCH_PYTHON_LINK_FLAGS STREQUAL "") +@@ -458,7 +458,7 @@ if(NOT TORCH_PYTHON_LINK_FLAGS STREQUAL "") set_target_properties(torch_python PROPERTIES LINK_FLAGS ${TORCH_PYTHON_LINK_FLAGS}) endif() diff --git a/recipe/patches/0014-CD-Enable-Python-3.13-on-windows-138095.patch b/recipe/patches/0014-CD-Enable-Python-3.13-on-windows-138095.patch deleted file mode 100644 index af808376..00000000 --- a/recipe/patches/0014-CD-Enable-Python-3.13-on-windows-138095.patch +++ /dev/null @@ -1,52 +0,0 @@ -From db896f927403f55a18f931b18a6469cb4e37d322 Mon Sep 17 00:00:00 2001 -From: atalman -Date: Tue, 12 Nov 2024 12:28:10 +0000 -Subject: [PATCH 14/21] CD Enable Python 3.13 on windows (#138095) - -Adding CD windows. 
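The `IS_PYTHON_3_13_PLUS` guards in the C changes below correspond to a runtime interpreter-version check; purely as an illustration (this Python snippet is not part of the patch):

```python
import sys

# Build-time C macros such as IS_PYTHON_3_13_PLUS gate on the interpreter
# version being targeted; the equivalent check at the Python level would be:
IS_PYTHON_3_11_PLUS = sys.version_info >= (3, 11)
IS_PYTHON_3_13_PLUS = sys.version_info >= (3, 13)
```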
Part of: https://github.com/pytorch/pytorch/issues/130249 -Builder PR landed with smoke test: https://github.com/pytorch/builder/pull/2035 - -Pull Request resolved: https://github.com/pytorch/pytorch/pull/138095 -Approved by: https://github.com/Skylion007, https://github.com/malfet - -Cherry-pick-note: minus changes in `.github/*` ---- - functorch/csrc/dim/dim.cpp | 1 + - functorch/csrc/dim/dim_opcode.c | 13 ++++++++++++- - 2 files changed, 13 insertions(+), 1 deletion(-) - -diff --git a/functorch/csrc/dim/dim.cpp b/functorch/csrc/dim/dim.cpp -index 722618efbb0..f98818bfdcc 100644 ---- a/functorch/csrc/dim/dim.cpp -+++ b/functorch/csrc/dim/dim.cpp -@@ -38,6 +38,7 @@ PyObject* Dim_init() { - #include "python_variable_simple.h" - - #if IS_PYTHON_3_11_PLUS -+ - #define Py_BUILD_CORE - #include "internal/pycore_opcode.h" - #undef Py_BUILD_CORE -diff --git a/functorch/csrc/dim/dim_opcode.c b/functorch/csrc/dim/dim_opcode.c -index 81ba62a3781..1b5d0677344 100644 ---- a/functorch/csrc/dim/dim_opcode.c -+++ b/functorch/csrc/dim/dim_opcode.c -@@ -1,6 +1,17 @@ - #include - #if defined(_WIN32) && IS_PYTHON_3_11_PLUS - #define Py_BUILD_CORE --#define NEED_OPCODE_TABLES -+#define NEED_OPCODE_TABLES // To get _PyOpcode_Deopt, _PyOpcode_Caches -+ -+#if IS_PYTHON_3_13_PLUS -+#include // To get PyUnstable_Code_GetFirstFree -+#define NEED_OPCODE_METADATA -+#include "internal/pycore_opcode_metadata.h" -+#undef NEED_OPCODE_METADATA -+#else - #include "internal/pycore_opcode.h" - #endif -+ -+#undef NEED_OPCODE_TABLES -+#undef Py_BUILD_CORE -+#endif diff --git a/recipe/patches/0021-avoid-deprecated-find_package-CUDA-in-caffe2-CMake-m.patch b/recipe/patches/0014-avoid-deprecated-find_package-CUDA-in-caffe2-CMake-m.patch similarity index 91% rename from recipe/patches/0021-avoid-deprecated-find_package-CUDA-in-caffe2-CMake-m.patch rename to recipe/patches/0014-avoid-deprecated-find_package-CUDA-in-caffe2-CMake-m.patch index a6f17c5d..9879f126 100644 --- a/recipe/patches/0021-avoid-deprecated-find_package-CUDA-in-caffe2-CMake-m.patch +++ b/recipe/patches/0014-avoid-deprecated-find_package-CUDA-in-caffe2-CMake-m.patch @@ -1,7 +1,7 @@ -From 1780879024ea952f8591aa175a9787f93e697368 Mon Sep 17 00:00:00 2001 +From e644304ce9c67c3f4185141dc603f8196e32a7cd Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Thu, 30 Jan 2025 08:33:44 +1100 -Subject: [PATCH 21/21] avoid deprecated `find_package(CUDA)` in caffe2 CMake +Subject: [PATCH 14/15] avoid deprecated `find_package(CUDA)` in caffe2 CMake metadata vendor the not-available-anymore function torch_cuda_get_nvcc_gencode_flag from CMake @@ -15,10 +15,10 @@ vendor the not-available-anymore function torch_cuda_get_nvcc_gencode_flag from 6 files changed, 153 insertions(+), 50 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -index b51c7cc637b..6e107b5b02a 100644 +index fc858f3e0de..6f9f29dfd54 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt -@@ -906,25 +906,25 @@ if(USE_ROCM) +@@ -942,25 +942,25 @@ if(USE_ROCM) "$<$:ATen/core/ATen_pch.h>") endif() elseif(USE_CUDA) @@ -49,7 +49,7 @@ index b51c7cc637b..6e107b5b02a 100644 torch_compile_options(torch_cuda) # see cmake/public/utils.cmake target_compile_definitions(torch_cuda PRIVATE USE_CUDA) -@@ -973,12 +973,12 @@ elseif(USE_CUDA) +@@ -1009,12 +1009,12 @@ elseif(USE_CUDA) torch_cuda ) if($ENV{ATEN_STATIC_CUDA}) @@ -65,7 +65,7 @@ index b51c7cc637b..6e107b5b02a 100644 CUDA::cusolver_static ${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a # needed for libcusolver_static diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake -index d51c451589c..154f04a89dd 100644 +index d8a4bcf2191..faf3efcc4e5 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -76,7 +76,7 @@ function(caffe2_print_configuration_summary) @@ -103,10 +103,10 @@ index d51c451589c..154f04a89dd 100644 if(${USE_TENSORRT}) message(STATUS " TensorRT runtime library: ${TENSORRT_LIBRARY}") diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in -index cba4d929855..da904fc6a18 100644 +index 8f2b2c30aee..c2b205a5cd8 100644 --- a/cmake/TorchConfig.cmake.in +++ b/cmake/TorchConfig.cmake.in -@@ -125,7 +125,7 @@ if(@USE_CUDA@) +@@ -126,7 +126,7 @@ if(@USE_CUDA@) find_library(CAFFE2_NVRTC_LIBRARY caffe2_nvrtc PATHS "${TORCH_INSTALL_PREFIX}/lib") list(APPEND TORCH_CUDA_LIBRARIES ${CAFFE2_NVRTC_LIBRARY}) else() @@ -116,7 +116,7 @@ index cba4d929855..da904fc6a18 100644 if(TARGET torch::nvtoolsext) list(APPEND TORCH_CUDA_LIBRARIES torch::nvtoolsext) diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake -index 152fbdbe6dd..0d1aeffc59f 100644 +index 72e6d9d71c8..792e2ac78d3 100644 --- a/cmake/public/cuda.cmake +++ b/cmake/public/cuda.cmake @@ -26,8 +26,8 @@ if(NOT MSVC) @@ -128,8 +128,8 @@ index 152fbdbe6dd..0d1aeffc59f 100644 +find_package(CUDAToolkit) +if(NOT CUDAToolkit_FOUND) message(WARNING - "Caffe2: CUDA cannot be found. Depending on whether you are building " - "Caffe2 or a Caffe2 dependent library, the next warning / error will " + "PyTorch: CUDA cannot be found. 
Depending on whether you are building " + "PyTorch or a PyTorch dependent library, the next warning / error will " @@ -36,8 +36,6 @@ if(NOT CUDA_FOUND) return() endif() @@ -154,13 +154,13 @@ index 152fbdbe6dd..0d1aeffc59f 100644 - "V${CUDAToolkit_VERSION} in '${CUDAToolkit_INCLUDE_DIRS}'") -endif() - --message(STATUS "Caffe2: CUDA detected: " ${CUDA_VERSION}) --message(STATUS "Caffe2: CUDA nvcc is: " ${CUDA_NVCC_EXECUTABLE}) --message(STATUS "Caffe2: CUDA toolkit directory: " ${CUDA_TOOLKIT_ROOT_DIR}) +-message(STATUS "PyTorch: CUDA detected: " ${CUDA_VERSION}) +-message(STATUS "PyTorch: CUDA nvcc is: " ${CUDA_NVCC_EXECUTABLE}) +-message(STATUS "PyTorch: CUDA toolkit directory: " ${CUDA_TOOLKIT_ROOT_DIR}) -if(CUDA_VERSION VERSION_LESS 11.0) -+message(STATUS "Caffe2: CUDA detected: " ${CUDAToolkit_VERSION}) -+message(STATUS "Caffe2: CUDA nvcc is: " ${CUDAToolkit_NVCC_EXECUTABLE}) -+message(STATUS "Caffe2: CUDA toolkit directory: " ${CUDAToolkit_ROOT}) ++message(STATUS "PyTorch: CUDA detected: " ${CUDAToolkit_VERSION}) ++message(STATUS "PyTorch: CUDA nvcc is: " ${CUDAToolkit_NVCC_EXECUTABLE}) ++message(STATUS "PyTorch: CUDA toolkit directory: " ${CUDAToolkit_ROOT}) +if(CUDAToolkit_VERSION VERSION_LESS 11.0) message(FATAL_ERROR "PyTorch requires CUDA 11.0 or above.") endif() @@ -182,9 +182,9 @@ index 152fbdbe6dd..0d1aeffc59f 100644 COMPILE_OUTPUT_VARIABLE output_var ) @@ -106,30 +98,14 @@ if(CUDA_FOUND) - message(FATAL_ERROR "Caffe2: Couldn't determine version from header: " ${output_var}) + message(FATAL_ERROR "PyTorch: Couldn't determine version from header: " ${output_var}) endif() - message(STATUS "Caffe2: Header version is: " ${cuda_version_from_header}) + message(STATUS "PyTorch: Header version is: " ${cuda_version_from_header}) - if(NOT cuda_version_from_header STREQUAL ${CUDA_VERSION_STRING}) - # Force CUDA to be processed for again next time - # TODO: I'm not sure if this counts as an implementation detail of @@ -215,7 +215,7 @@ index 152fbdbe6dd..0d1aeffc59f 100644 execute_process( COMMAND Python::Interpreter -c diff --git a/cmake/public/utils.cmake b/cmake/public/utils.cmake -index c6647eb457c..accebfd3457 100644 +index 0c8e91ab7cf..52d6857c918 100644 --- a/cmake/public/utils.cmake +++ b/cmake/public/utils.cmake @@ -306,6 +306,133 @@ macro(torch_hip_get_arch_list store_var) @@ -350,13 +350,13 @@ index c6647eb457c..accebfd3457 100644 +endfunction() + ############################################################################## - # Get the NVCC arch flags specified by TORCH_CUDA_ARCH_LIST and CUDA_ARCH_NAME. + # Get the XPU arch flags specified by TORCH_XPU_ARCH_LIST. # Usage: diff --git a/setup.py b/setup.py -index b0e01e0d1ee..dc21f91d69e 100644 +index 75fdfce7e35..93659023593 100644 --- a/setup.py +++ b/setup.py -@@ -627,7 +627,7 @@ class build_ext(setuptools.command.build_ext.build_ext): +@@ -626,7 +626,7 @@ class build_ext(setuptools.command.build_ext.build_ext): else: report("-- Not using cuDNN") if cmake_cache_vars["USE_CUDA"]: diff --git a/recipe/patches/0015-export-AOTI_TORCH_EXPORT-on-Windows.-140030.patch b/recipe/patches/0015-export-AOTI_TORCH_EXPORT-on-Windows.-140030.patch new file mode 100644 index 00000000..d5f1cbfa --- /dev/null +++ b/recipe/patches/0015-export-AOTI_TORCH_EXPORT-on-Windows.-140030.patch @@ -0,0 +1,66 @@ +From afc5756195b26f0fcbe0ee96a267149db0bbe71c Mon Sep 17 00:00:00 2001 +From: Xu Han +Date: Wed, 15 Jan 2025 23:43:41 +0000 +Subject: [PATCH 15/15] export AOTI_TORCH_EXPORT on Windows. 
(#140030) + +Fixes #139954 + +reproduce UT: +```cmd +pytest test/inductor/test_torchinductor_codegen_dynamic_shapes.py -k test_device_assert_dynamic_shapes_cpu +``` +Issue: +image + +After fixing: +![Image](https://github.com/user-attachments/assets/883846fb-8e92-4b9c-9400-daab32382a3a) + +Reland: +1. Declare export on Windows explicitly. +2. Support cpu, cuda and xpu devices. + +Pull Request resolved: https://github.com/pytorch/pytorch/pull/140030 +Approved by: https://github.com/jgong5, https://github.com/desertfire, https://github.com/malfet + +Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com> +--- + CMakeLists.txt | 3 +++ + torch/csrc/inductor/aoti_torch/c/shim.h | 10 +++++++++- + 2 files changed, 12 insertions(+), 1 deletion(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index c8af5f00b5c..c1733a99e91 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -1097,6 +1097,9 @@ if(NOT MSVC) + append_cxx_flag_if_supported("-Wno-error=redundant-move" CMAKE_CXX_FLAGS) + endif() + else() ++ # Define export functions for AOTI. ++ add_compile_definitions(EXPORT_AOTI_FUNCTIONS) ++ + # skip unwanted includes from windows.h + add_compile_definitions(WIN32_LEAN_AND_MEAN) + # Windows SDK broke compatibility since version 25131, but introduced this +diff --git a/torch/csrc/inductor/aoti_torch/c/shim.h b/torch/csrc/inductor/aoti_torch/c/shim.h +index 4c6c9afcacc..b2202b24b91 100644 +--- a/torch/csrc/inductor/aoti_torch/c/shim.h ++++ b/torch/csrc/inductor/aoti_torch/c/shim.h +@@ -44,8 +44,16 @@ + // to symbol clashes at link time if libtorch is included in a DLL and binary + // that depends on the DLL. As a short term fix, we don't export the symbols. + // In the long term, this will need to be addressed when Windows is supported. +-// #define AOTI_TORCH_EXPORT __declspec(dllexport) ++#ifdef OVRSOURCE ++// Do not export AOTI on Windows for internal builds + #define AOTI_TORCH_EXPORT ++#else /* OVRSOURCE */ ++#ifdef EXPORT_AOTI_FUNCTIONS ++#define AOTI_TORCH_EXPORT __declspec(dllexport) ++#else ++#define AOTI_TORCH_EXPORT __declspec(dllimport) ++#endif ++#endif /* OVRSOURCE */ + #else // !_WIN32 + #define AOTI_TORCH_EXPORT + #endif // _WIN32 diff --git a/recipe/patches/0020-make-library-name-in-test_mutable_custom_op_fixed_la.patch b/recipe/patches/0020-make-library-name-in-test_mutable_custom_op_fixed_la.patch deleted file mode 100644 index 17c54e33..00000000 --- a/recipe/patches/0020-make-library-name-in-test_mutable_custom_op_fixed_la.patch +++ /dev/null @@ -1,57 +0,0 @@ -From 39041f5a78068d2cf58d99f76938aee95a3c7bb5 Mon Sep 17 00:00:00 2001 -From: "H. 
Vetinari" -Date: Thu, 30 Jan 2025 13:23:14 +1100 -Subject: [PATCH 20/21] make library name in - `test_mutable_custom_op_fixed_layout{,2}` unique - -Suggested-By: Daniel Petry ---- - test/inductor/test_torchinductor.py | 14 +++++++++----- - 1 file changed, 9 insertions(+), 5 deletions(-) - -diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py -index 610f5d27332..99e2169febb 100644 ---- a/test/inductor/test_torchinductor.py -+++ b/test/inductor/test_torchinductor.py -@@ -10628,7 +10628,8 @@ class CommonTemplate: - @requires_gpu() - @config.patch(implicit_fallbacks=True) - def test_mutable_custom_op_fixed_layout2(self): -- with torch.library._scoped_library("mylib", "DEF") as lib: -+ unique_lib_name = f"mylib_{id(self)}" # Make unique name using test instance id -+ with torch.library._scoped_library(unique_lib_name, "DEF") as lib: - mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=GPU_TYPE) - inp = torch.rand(2, 3, 128, 128, device=GPU_TYPE) - expected_stride = mod(inp).clone().stride() -@@ -10664,8 +10665,9 @@ class CommonTemplate: - def fn(x): - # Inductor changes the conv to be channels-last - z = mod(x) -- output = torch.ops.mylib.bar(z, torch._dynamo.is_compiling()) -- torch.ops.mylib.add_one(output) -+ mylib = importlib.import_module(f"torch.ops.{unique_lib_name}") -+ output = mylib.bar(z, torch._dynamo.is_compiling()) -+ mylib.add_one(output) - return output**2 - - with torch.no_grad(): -@@ -10681,7 +10683,8 @@ class CommonTemplate: - - @config.patch(implicit_fallbacks=True) - def test_mutable_custom_op_fixed_layout(self): -- with torch.library._scoped_library("mylib", "DEF") as lib: -+ unique_lib_name = f"mylib_{id(self)}" # Make unique name using test instance id -+ with torch.library._scoped_library(unique_lib_name, "DEF") as lib: - lib.define( - "copy_(Tensor(a!) dst, Tensor src) -> ()", - tags=torch.Tag.needs_fixed_stride_order, -@@ -10697,7 +10700,8 @@ class CommonTemplate: - - def f(x): - full_default_3 = torch.full([3], 7.0, device="cpu") -- chunk_cat_default_1 = torch.ops.mylib.copy_.default(full_default_3, x) -+ mylib = importlib.import_module(f"torch.ops.{unique_lib_name}") -+ chunk_cat_default_1 = mylib.copy_.default(full_default_3, x) - mul_out = torch.mul(full_default_3, full_default_3) - return mul_out - diff --git a/recipe/patches_submodules/XNNPACK/0001-Fix-bazel-linux-aarch64-gcc13-workflow-and-resolve-a.patch b/recipe/patches_submodules/XNNPACK/0001-Fix-bazel-linux-aarch64-gcc13-workflow-and-resolve-a.patch new file mode 100644 index 00000000..4f697260 --- /dev/null +++ b/recipe/patches_submodules/XNNPACK/0001-Fix-bazel-linux-aarch64-gcc13-workflow-and-resolve-a.patch @@ -0,0 +1,50 @@ +From 50fbfa98c8a25411993c2423a4c2fabe5023fc56 Mon Sep 17 00:00:00 2001 +From: XNNPACK Team +Date: Mon, 25 Nov 2024 13:00:56 -0800 +Subject: [PATCH] Fix `bazel-linux-aarch64-gcc13` workflow and resolve + accompanying build errors. + +Note that task names should not be prefixes of another task since this messes up retrieving the correct cache for each task. 
+ +PiperOrigin-RevId: 700075031 + +[Cherry-pick note: dropped changes in .github/workflows/build.yml] +--- + src/reference/unary-elementwise.cc | 10 ++++++++++ + src/xnnpack/simd/s16-neon.h | 2 +- + 2 files changed, 11 insertions(+), 1 deletion(-) + +diff --git a/third_party/XNNPACK/src/reference/unary-elementwise.cc b/third_party/XNNPACK/src/reference/unary-elementwise.cc +index bd95ded6c..da892d8be 100644 +--- a/third_party/XNNPACK/src/reference/unary-elementwise.cc ++++ b/third_party/XNNPACK/src/reference/unary-elementwise.cc +@@ -127,6 +127,16 @@ struct ConvertOp { + } + }; + ++#ifdef XNN_HAVE_FLOAT16 ++template <> ++struct ConvertOp { ++ explicit ConvertOp(const xnn_unary_uparams*) {} ++ _Float16 operator()(xnn_bfloat16 x) const { ++ return static_cast<_Float16>(static_cast(x)); ++ } ++}; ++#endif ++ + template + const xnn_unary_elementwise_config* get_convert_config( + std::true_type /*input_quantized*/, std::true_type /*output_quantized*/) { +diff --git a/third_party/XNNPACK/src/xnnpack/simd/s16-neon.h b/third_party/XNNPACK/src/xnnpack/simd/s16-neon.h +index 4e8ebcfbd..e8392f4e9 100644 +--- a/third_party/XNNPACK/src/xnnpack/simd/s16-neon.h ++++ b/third_party/XNNPACK/src/xnnpack/simd/s16-neon.h +@@ -70,7 +70,7 @@ static XNN_INLINE void xnn_store_tail_s16(int16_t* output, xnn_simd_s16_t v, + v_low = vget_high_s16(v); + } + if (num_elements & 2) { +- vst1_lane_s32((void*) output, vreinterpret_s32_s16(v_low), 0); ++ vst1_lane_s32((int32_t*) output, vreinterpret_s32_s16(v_low), 0); + output += 2; + v_low = vext_s16(v_low, v_low, 2); + }