From bea14a48045cf0b61374acbdc4724f485013754f Mon Sep 17 00:00:00 2001
From: Logan Ward
Date: Thu, 23 May 2024 13:22:58 -0400
Subject: [PATCH] Enforce single XPU training (#134)

---
 component-tests/training/run_test.py | 28 +++++-----------------------
 envs/build-aurora.sh                 |  2 +-
 envs/environment-aurora.yml          |  9 +++++++++
 mofa/difflinker_train.py             | 13 +++++++++++++
 4 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/component-tests/training/run_test.py b/component-tests/training/run_test.py
index f7dc1db0..a280cb96 100644
--- a/component-tests/training/run_test.py
+++ b/component-tests/training/run_test.py
@@ -11,7 +11,7 @@
 from parsl.app.python import PythonApp
 from parsl.executors import HighThroughputExecutor
 from parsl.providers import PBSProProvider
-from parsl.launchers import MpiExecLauncher
+from parsl.launchers import SimpleLauncher

 from mofa.model import MOFRecord

@@ -111,29 +111,12 @@ def test_function(model_path: Path, config_path: Path, training_set: list, num_e
             )
         ])
 elif args.config.startswith("sunspot"):
-    if args.config == "sunspot":
-        accel_ids = [
-            f"{gid}.{tid}"
-            for gid in range(6)
-            for tid in range(2)
-        ]
-    elif args.config == "sunspot-device":
-        accel_ids = [
-            f"{gid}.0,{gid}.1"
-            for gid in range(6)
-        ]
-    else:
-        raise ValueError(f'Not supported: {args.config}')
     config = Config(
-        retries=2,
         executors=[
             HighThroughputExecutor(
                 label="sunspot_test",
-                available_accelerators=accel_ids,  # Ensures one worker per accelerator
-                cpu_affinity="block",  # Assigns cpus in sequential order
                 prefetch_capacity=0,
-                max_workers=len(accel_ids),
-                cores_per_worker=208 // len(accel_ids),
+                max_workers=1,
                 provider=PBSProProvider(
                     account="CSC249ADCD08_CNDA",
                     queue="workq",
@@ -147,16 +130,15 @@ def test_function(model_path: Path, config_path: Path, training_set: list, num_e
 module load gcc/12.2.0
 module list

-{"" if len(accel_ids) == 12 else "export IPEX_TILE_AS_DEVICE=0"}
+python -c "import intel_extension_for_pytorch as ipex; print(ipex.xpu.device_count())"
+
 cd $PBS_O_WORKDIR
 pwd
 which python
 hostname
 """,
                     walltime="1:10:00",
-                    launcher=MpiExecLauncher(
-                        bind_cmd="--cpu-bind", overrides="--depth=208 --ppn 1"
-                    ),  # Ensures 1 manger per node and allows it to divide work among all 208 threads
+                    launcher=SimpleLauncher(),
                     select_options="system=sunspot,place=scatter",
                     nodes_per_block=1,
                     min_blocks=0,
diff --git a/envs/build-aurora.sh b/envs/build-aurora.sh
index 70acd73d..49278616 100755
--- a/envs/build-aurora.sh
+++ b/envs/build-aurora.sh
@@ -14,7 +14,7 @@ conda activate ./env

 # Build torch_ccl locally
 # Clone from: https://github.com/intel/torch-ccl
-cd libs/torch_ccl
+cd libs/torch-ccl
 COMPUTE_BACKEND=dpcpp pip install -e .

 # Now install Corey's stuff
diff --git a/envs/environment-aurora.yml b/envs/environment-aurora.yml
index a3c8957d..01e68dea 100644
--- a/envs/environment-aurora.yml
+++ b/envs/environment-aurora.yml
@@ -38,9 +38,18 @@ dependencies:
   - pytorch==2.1.0
   - intel-extension-for-pytorch==2.1.10

+  # Tools to build CCL locally
+  - conda-forge::cmake
+  - ninja
+
   - pip
   - pip:
     - git+https://gitlab.com/ase/ase.git
     - git+https://github.com/exalearn/colmena.git  # Fixes for streaming not yet on PyPI
+
+    # Install ccl manually for now, uncomment when SSL doesn't disagree between
+    # the following wheel's version and Sunspot/Aurora
+    #- --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+    # - oneccl_bind_pt==2.1.200+xpu
     - -e ..[test]
diff --git a/mofa/difflinker_train.py b/mofa/difflinker_train.py
index 303da7b4..af8fc898 100644
--- a/mofa/difflinker_train.py
+++ b/mofa/difflinker_train.py
@@ -5,6 +5,13 @@
 from pytorch_lightning import Trainer, callbacks
 from pytorch_lightning.callbacks import TQDMProgressBar
+from pytorch_lightning.strategies import SingleDeviceStrategy
+
+try:
+    import intel_extension_for_pytorch as ipex  # noqa: F401
+    import oneccl_bindings_for_pytorch  # noqa: F401
+except ImportError:
+    pass

 try:
     import intel_extension_for_pytorch as ipex  # noqa: F401
@@ -150,6 +157,11 @@ def main(
     if '.' in args.train_data_prefix:
         context_node_nf += 1

+    # Lock XPU to single device for now
+    strategy = 'auto'
+    if args.device == 'xpu':
+        strategy = SingleDeviceStrategy(device='xpu')
+
     checkpoint_callback = [callbacks.ModelCheckpoint(
         dirpath=checkpoints_dir,
         filename='difflinker_{epoch:02d}',
@@ -164,6 +176,7 @@ def main(
         accelerator=args.device,
         num_sanity_val_steps=0,
         enable_progress_bar=args.enable_progress_bar,
+        strategy=strategy
     )

     # Add a callback for fit setup
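
Illustrative sketch (not part of the patch): the device-to-strategy selection that the change to
mofa/difflinker_train.py introduces, pulled out as a standalone helper so it can be read in isolation.
The helper name pick_strategy is hypothetical; the 'auto' fallback and the
SingleDeviceStrategy(device='xpu') call mirror the patch itself.

    from pytorch_lightning.strategies import SingleDeviceStrategy

    def pick_strategy(device: str):
        """Return the Lightning strategy for a given accelerator string (hypothetical helper)."""
        if device == 'xpu':
            # Pin training to a single XPU device instead of letting Lightning
            # choose a distributed strategy, which is the point of this patch
            return SingleDeviceStrategy(device='xpu')
        # Any other accelerator keeps Lightning's automatic selection
        return 'auto'

    # Usage, mirroring the patched Trainer call:
    #   Trainer(accelerator=args.device, strategy=pick_strategy(args.device), ...)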