From 8a1f5cea48787ff50d3f3b5565a4ca2ada910aa4 Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Tue, 11 Apr 2023 13:29:27 -0700 Subject: [PATCH 01/11] Add Flash attention code from https://github.com/HazyResearch/flash-attention/blob/main/tests/models/test_gpt_neox.py --- requirements.txt | 1 + test_flash_attn_gpt_neox.py | 92 +++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 test_flash_attn_gpt_neox.py diff --git a/requirements.txt b/requirements.txt index 9f9f1e643..b68471d2e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -49,3 +49,4 @@ pypandoc==1.11 openpyxl==3.1.2 lm_dataformat==0.0.20 bioc==2.0 +git+https://github.com/HazyResearch/flash-attention.git@d478eeec8f16c7939c54e4617dbd36f59b8eeed7 \ No newline at end of file diff --git a/test_flash_attn_gpt_neox.py b/test_flash_attn_gpt_neox.py new file mode 100644 index 000000000..4a0b2b857 --- /dev/null +++ b/test_flash_attn_gpt_neox.py @@ -0,0 +1,92 @@ +import torch +import pytest + +from transformers import GPTNeoXConfig +from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM + +from flash_attn.models.gpt import GPTLMHeadModel +from flash_attn.models.gpt_neox import remap_state_dict_hf_gpt_neox, gpt_neox_config_to_gpt2_config +from flash_attn.utils.pretrained import state_dict_from_pretrained + + +@pytest.mark.parametrize( + 'model_name', + [ + # "EleutherAI/gpt-neox-20b", + 'togethercomputer/GPT-NeoXT-Chat-Base-20B', + ] +) +def test_gptj_state_dict(model_name): + config = gpt_neox_config_to_gpt2_config(GPTNeoXConfig.from_pretrained(model_name)) + pretrained_state_dict = remap_state_dict_hf_gpt_neox(state_dict_from_pretrained(model_name), config) + model = GPTLMHeadModel(config, device='meta') # Without device='meta' init is very slow + state_dict = model.state_dict() + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +@pytest.mark.parametrize( + 'model_name', + [ + # "EleutherAI/gpt-neox-20b", + 'togethercomputer/GPT-NeoXT-Chat-Base-20B', + ] +) +def test_gpt_neox_optimized(model_name): + """Check that our implementation of GPT-NeoX (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. 
+ """ + dtype = torch.float16 + device = 'cuda' + config = gpt_neox_config_to_gpt2_config(GPTNeoXConfig.from_pretrained(model_name)) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True # GPT-NeoX-20B uses "gelu_fast" + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + input_ids = torch.randint(0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, + device=device) + with torch.no_grad(): + out = model.transformer(input_ids) + logits = model(input_ids).logits + del model + + # Need at least 2 GPUs, otherwise we'll OOM + # Without device_map, the model is loaded on the CPU, which is very slow + model_ref = GPTNeoXForCausalLM.from_pretrained(model_name, device_map='auto') + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.gpt_neox(input_ids).last_hidden_state.to(device=device) + logits_ref = model_ref(input_ids).logits.to(device=device) + del model_ref + + model_hf = GPTNeoXForCausalLM.from_pretrained(model_name, torch_dtype=dtype, + device_map={"": device}) + model_hf.eval() + with torch.no_grad(): + out_hf = model_hf.gpt_neox(input_ids).last_hidden_state + logits_hf = model_hf(input_ids).logits + del model_hf + + print(f'Output max diff: {(out - out_ref).abs().max().item()}') + print(f'Output mean diff: {(out - out_ref).abs().mean().item()}') + print(f'HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}') + print(f'HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}') + assert (out - out_ref).abs().max().item() < 2 * (out_hf - out_ref).abs().max().item() + assert (out - out_ref).abs().mean().item() < 2 * (out_hf - out_ref).abs().mean().item() + + print(f'Logits max diff: {(logits - logits_ref).abs().max().item()}') + print(f'Logits mean diff: {(logits - logits_ref).abs().mean().item()}') + print(f'HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}') + print(f'HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}') + assert (logits - logits_ref).abs().max().item() < 2 * (logits_hf - logits_ref).abs().max().item() + assert (logits - logits_ref).abs().mean().item() < 2 * (logits_hf - logits_ref).abs().mean().item() From cec2ed630f3e3ccd0aa2c41bf6519c5ea286d55f Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Tue, 11 Apr 2023 13:58:01 -0700 Subject: [PATCH 02/11] Add instructions to install Apex. --- README.md | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/README.md b/README.md index 4c120433f..d19232ab5 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,126 @@ More information about the models can be found on [H2O.ai's Hugging Face page](h - To fine-tune any LLM models on your data, follow the [fine-tuning instructions](FINETUNE.md). - To create a container for deployment, follow the [Docker instructions](INSTALL-DOCKER.md). +6. 
Compile Apex (for Flash attention, needs CUDA 11.7 or above) [howto src](https://github.com/NVIDIA/apex/#linux)
+
+```bash
+git clone https://github.com/NVIDIA/apex
+cd apex
+pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
+```
+
+Fine-tune on single GPU on single node:
+```
+torchrun finetune.py --base_model='EleutherAI/gpt-j-6B' --data_path=alpaca_data_cleaned.json
+```
+This will download the model, load the data, and generate an output directory `lora-alpaca`.
+
+Fine-tune using 2 nodes with 2 GPUs each:
+```
+WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1" torchrun --nnodes=2 --master_addr="10.10.10.2" --node_rank=0 --nproc_per_node=2 --master_port=1234 finetune.py --data_path=alpaca_data_cleaned.json --run_id=0 --base_model='EleutherAI/gpt-j-6B'
+
+WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1" torchrun --nnodes=2 --master_addr="10.10.10.2" --node_rank=1 --nproc_per_node=2 --master_port=1234 finetune.py --data_path=alpaca_data_cleaned.json --run_id=0 --base_model='EleutherAI/gpt-j-6B'
+```
+
+Fine-tune using 2 24GB GPUs to split up a 30B model:
+```
+WORLD_SIZE=2 python finetune.py --data_path=alpaca_data_cleaned.json --base_model="decapoda-research/llama-30b-hf" --ddp=False
+```
+
+Fine-tune a previously saved model (after running `export_hf_checkpoint.py`):
+```
+WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1" torchrun --nnodes=2 --master_addr="10.10.10.2" --node_rank=0 --nproc_per_node=2 --master_port=1234 finetune.py --num_epochs=2 --micro_batch_size=8 --data_path=alpaca_data_cleaned.json --run_id=3 --base_model='gpt-j-6B.DAIdocs' --tokenizer_base_model='EleutherAI/gpt-j-6B' --output_dir=lora_6B.DAIdocs &> 3.node0.log
+
+WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1" torchrun --nnodes=2 --master_addr="10.10.10.2" --node_rank=1 --nproc_per_node=2 --master_port=1234 finetune.py --num_epochs=2 --micro_batch_size=8 --data_path=alpaca_data_cleaned.json --run_id=3 --base_model='gpt-j-6B.DAIdocs' --tokenizer_base_model='EleutherAI/gpt-j-6B' --output_dir=lora_6B.DAIdocs &> 3.node1.log
+```
+
+Generate on single GPU on single node:
+```
+torchrun generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights=lora-alpaca
+```
+This will download the foundation model and our fine-tuned lora_weights, and open up a GUI with text generation input/output.
+
+
+In case you get peer-to-peer related errors, set this env var:
+```
+export NCCL_P2P_LEVEL=LOC
+```
+
+
+### Docker Setup & Inference
+
+1. Build the container image:
+
+```bash
+docker build -t h2o-llm .
+```
+
+2. Run the container (you can also use `finetune.py` and all of its parameters as shown above for training):
+
+```bash
+docker run --runtime=nvidia --shm-size=64g -p 7860:7860 -v ${HOME}/.cache:/root/.cache --rm h2o-llm -it generate.py \
+    --load_8bit=True --base_model='EleutherAI/gpt-neox-20b' --prompt_type=human_bot
+```
+
+3. Open `https://localhost:7860` in the browser
+
+### Docker Compose Setup & Inference
+
+1. (optional) Change the desired model and weights under `environment` in `docker-compose.yml`
+
+2. Build and run the container
+
+```bash
+docker-compose up -d --build
+```
+
+3. Open `https://localhost:7860` in the browser
+
+4. See logs:
+
+```bash
+docker-compose logs -f
+```
+
+5. Clean everything up:
+
+```bash
+docker-compose down --volumes --rmi all
+```
+
+
+### Tensorboard
+
+```bash
+tensorboard --logdir=runs/
+```
+
+### Plan
+Open-source instruct model for demoable use cases.
+1. Base: Start with fully open source Apache 2.0 models EleutherAI--gpt-j-6B, EleutherAI--gpt-neox-20b,
+GPT-NeoXT-Chat-Base-20B, etc.
+2. Construct Prompt: Set up prompt engineering on 6B-20B as-is to convert a sentence into question/answer or command/response format (a minimal sketch follows the link list below)
+3. Open-Source Instruct Data: Convert wiki data into instruct form
+4. Fine-tune: LORA fine-tune 6B and 20B using DAI docs
+5. Open Data & Model: Submit DAI docs model to Hugging Face
+6. Use the Toolformer approach for external APIs
+
+### Goals
+1. Demonstrate fine-tuning working on some existing corpus
+2. Demonstrate efficiency of LORA for fast and low-memory fine-tuning
+
+
+### Code to consider including
+[flan-alpaca](https://github.com/declare-lab/flan-alpaca)
+[text-generation-webui](https://github.com/oobabooga/text-generation-webui)
+[minimal-llama](https://github.com/zphang/minimal-llama/)
+[finetune GPT-NeoX](https://nn.labml.ai/neox/samples/finetune.html)
+[GPTQ-for_LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa/compare/cuda...Digitous:GPTQ-for-GPT-NeoX:main)
+[OpenChatKit on multi-GPU](https://github.com/togethercomputer/OpenChatKit/issues/20)
+[Non-Causal LLM](https://huggingface.co/docs/transformers/main/en/model_doc/gptj#transformers.GPTJForSequenceClassification)
+[OpenChatKit_Offload](https://github.com/togethercomputer/OpenChatKit/commit/148b5745a57a6059231178c41859ecb09164c157)
+[Flan-alpaca](https://github.com/declare-lab/flan-alpaca/blob/main/training.py)
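To make the prompt-construction and instruct-data steps in the plan above concrete, here is a minimal sketch that turns plain question/answer pairs into Alpaca-style `instruction`/`input`/`output` records of the kind `--data_path=alpaca_data_cleaned.json` points at. The schema and the output filename are assumptions for illustration; the exact fields `finetune.py` expects should be checked against [FINETUNE.md](FINETUNE.md).

```python
# Sketch only: convert raw question/answer pairs into Alpaca-style instruct
# records. The instruction/input/output schema is assumed from
# alpaca_data_cleaned.json, and my_instruct_data.json is an arbitrary name.
import json

raw_pairs = [
    ("What license do the base models use?", "Apache 2.0."),
    ("Which 20B base model is suggested?", "EleutherAI/gpt-neox-20b."),
]

records = [
    {"instruction": question, "input": "", "output": answer}
    for question, answer in raw_pairs
]

with open("my_instruct_data.json", "w") as f:
    json.dump(records, f, indent=2)

print(f"Wrote {len(records)} instruct records to my_instruct_data.json")
```

If the schema matches, the resulting file can be passed to the fine-tuning commands above via `--data_path=my_instruct_data.json`.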
+ ### Help [FAQs](FAQ.md) From 6e163e23664ebed827043bdec9261af666eb025f Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Tue, 11 Apr 2023 14:27:27 -0700 Subject: [PATCH 03/11] Update instructions for flash attention --- README.md | 128 ++++------------------------------------------- requirements.txt | 1 - 2 files changed, 11 insertions(+), 118 deletions(-) diff --git a/README.md b/README.md index d19232ab5..1358c32ef 100644 --- a/README.md +++ b/README.md @@ -74,126 +74,20 @@ More information about the models can be found on [H2O.ai's Hugging Face page](h - To fine-tune any LLM models on your data, follow the [fine-tuning instructions](FINETUNE.md). - To create a container for deployment, follow the [Docker instructions](INSTALL-DOCKER.md). -6. Compile Apex (for Flash attention, needs CUDA 11.7 above) [howto src](https://github.com/NVIDIA/apex/#linux) +6. Add Flash Attention ```bash -git clone https://github.com/NVIDIA/apex -cd apex -pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ +git clone https://github.com/HazyResearch/flash-attention.git +cd flash-attention +python setup.py install +cd csrc/layer_norm +pip install . +cd ../rotary +pip install . +cd ../fused_dense_lib +pip install . +cd ../.. ``` - -Fine-tune on single GPU on single node: -``` -torchrun finetune.py --base_model='EleutherAI/gpt-j-6B' --data_path=alpaca_data_cleaned.json -``` -this will download the model, load the data, and generate an output directory lora-alpaca. - -Fine-tune using 2 nodes with 2 GPUs each: -``` -WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1" torchrun --nnodes=2 --master_addr="10.10.10.2" --node_rank=0 --nproc_per_node=2 --master_port=1234 finetune.py --data_path=alpaca_data_cleaned.json --run_id=0 --base_model='EleutherAI/gpt-j-6B' - -WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1" torchrun --nnodes=2 --master_addr="10.10.10.2" --node_rank=1 --nproc_per_node=2 --master_port=1234 finetune.py --data_path=alpaca_data_cleaned.json --run_id=0 --base_model='EleutherAI/gpt-j-6B' -``` - -Fine-tune using 2 24GB GPUs to split up a 30B model: -``` -WORLD_SIZE=2 python finetune.py --data_path=alpaca_data_cleaned.json --base_model="decapoda-research/llama-30b-hf" --ddp=False -``` - -Fine-tune previously saved model (running `export_hf_checkpoint.py`): -``` -WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1" torchrun --nnodes=2 --master_addr="10.10.10.2" --node_rank=0 --nproc_per_node=2 --master_port=1234 finetune.py --num_epochs=2 --micro_batch_size=8 --data_path=alpaca_data_cleaned.json --run_id=3 --base_model='gpt-j-6B.DAIdocs' --tokenizer_base_model='EleutherAI/gpt-j-6B' --output_dir=lora_6B.DAIdocs &> 3.node0.log - -WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1" torchrun --nnodes=2 --master_addr="10.10.10.2" --node_rank=1 --nproc_per_node=2 --master_port=1234 finetune.py --num_epochs=2 --micro_batch_size=8 --data_path=alpaca_data_cleaned.json --run_id=3 --base_model='gpt-j-6B.DAIdocs' --tokenizer_base_model='EleutherAI/gpt-j-6B' --output_dir=lora_6B.DAIdocs &> 3.node1.log -``` - -Generate on single GPU on single node: -``` -torchrun generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights=lora-alpaca -``` -this will download the foundation model, our fine-tuned lora_weights, and open up a GUI with text generation input/output. - - -In case you get peer to peer related errors, set this env var: -``` -export NCCL_P2P_LEVEL=LOC -``` - - -### Docker Setup & Inference - -1. Build the container image: - -```bash -docker build -t h2o-llm . -``` - -2. 
Run the container (you can also use `finetune.py` and all of its parameters as shown above for training): - -```bash -docker run --runtime=nvidia --shm-size=64g -p 7860:7860 -v ${HOME}/.cache:/root/.cache --rm h2o-llm -it generate.py \ - --load_8bit=True --base_model='EleutherAI/gpt-neox-20b' --prompt_type=human_bot -``` - -3. Open `https://localhost:7860` in the browser - -### Docker Compose Setup & Inference - -1. (optional) Change desired model and weights under `environment` in the `docker-compose.yml` - -2. Build and run the container - -```bash -docker-compose up -d --build -``` - -3. Open `https://localhost:7860` in the browser - -4. See logs: - -```bash -docker-compose logs -f -``` - -5. Clean everything up: - -```bash -docker-compose down --volumes --rmi all -``` - - -### Tensorboard - -```bash -tensorboard --logdir=runs/ -``` - -### Plan -Open source instruct model for demoable usecases. -1. Base: Start with fully open source apache 2.0 models EleutherAI--gpt-j-6B, EleutherAI--gpt-neox-20b, -GPT-NeoXT-Chat-Base-20B, etc. -2. Construct Prompt: Setup prompt engineering on 6B-20B as-is to convert a sentence into question/answer or command/response format -3. Open-Source Instruct Data: Convert wiki data into instruct form -4. Fine-tune: LORA fine-tune 6B and 20B using DAI docs -5. Open Data & Model: Submit DAI docs model huggingface -6. Use toolformer approach for external APIs - -### Goals -1. Demonstrate fine-tuning working on some existing corpus -2. Demonstrate efficiency of LORA for fast and low-memory fine-tuning - - -### Code to consider including -[flan-alpaca](https://github.com/declare-lab/flan-alpaca)
-[text-generation-webui](https://github.com/oobabooga/text-generation-webui)
-[minimal-llama](https://github.com/zphang/minimal-llama/)
-[finetune GPT-NeoX](https://nn.labml.ai/neox/samples/finetune.html)
-[GPTQ-for_LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa/compare/cuda...Digitous:GPTQ-for-GPT-NeoX:main)
-[OpenChatKit on multi-GPU](https://github.com/togethercomputer/OpenChatKit/issues/20)
-[Non-Causal LLM](https://huggingface.co/docs/transformers/main/en/model_doc/gptj#transformers.GPTJForSequenceClassification)
-[OpenChatKit_Offload](https://github.com/togethercomputer/OpenChatKit/commit/148b5745a57a6059231178c41859ecb09164c157)
-[Flan-alpaca](https://github.com/declare-lab/flan-alpaca/blob/main/training.py)
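The `finetune.py` and test changes in this series enable `use_flash_attn`, `fused_bias_fc`, `fused_mlp`, and `fused_dropout_add_ln`, which depend on the extensions built from `csrc/` in the flash-attention install steps above. A quick import check, sketched below with extension module names assumed rather than taken from this repository, can confirm the build before those flags are turned on.

```python
# Rough post-install sanity check for the flash-attention source build above.
# The extension module names are assumptions about what the csrc/ subpackages
# install (backing use_flash_attn, fused_dropout_add_ln, rotary embeddings,
# and fused_bias_fc/fused_mlp respectively); adjust if the build layout differs.
import importlib

for name in ["flash_attn", "dropout_layer_norm", "rotary_emb", "fused_dense_lib"]:
    try:
        importlib.import_module(name)
        print(f"OK      {name}")
    except ImportError as exc:
        print(f"MISSING {name}: {exc}")
```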
- ### Help [FAQs](FAQ.md) diff --git a/requirements.txt b/requirements.txt index b68471d2e..9f9f1e643 100644 --- a/requirements.txt +++ b/requirements.txt @@ -49,4 +49,3 @@ pypandoc==1.11 openpyxl==3.1.2 lm_dataformat==0.0.20 bioc==2.0 -git+https://github.com/HazyResearch/flash-attention.git@d478eeec8f16c7939c54e4617dbd36f59b8eeed7 \ No newline at end of file From ff4307f7f4af2bb2afac674084b7e744da55d9f9 Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Tue, 11 Apr 2023 16:44:05 -0700 Subject: [PATCH 04/11] WIP - nothing working yet. Disable mix_in by default. --- README.md | 2 +- finetune.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1358c32ef..c9bb2a4b6 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ More information about the models can be found on [H2O.ai's Hugging Face page](h 6. Add Flash Attention ```bash -git clone https://github.com/HazyResearch/flash-attention.git +git clone https://github.com/h2oai/flash-attention.git cd flash-attention python setup.py install cd csrc/layer_norm diff --git a/finetune.py b/finetune.py index bd1c696e6..3aea12292 100644 --- a/finetune.py +++ b/finetune.py @@ -173,6 +173,7 @@ def train( save_steps: int = None, # must be round multiple of eval_steps save_total_limit: int = 3, add_eos_token: bool = False, + flash_attention: bool = False, ): if llama_flash_attn: @@ -296,6 +297,37 @@ def train( lora_mappings = mapping.TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy() lora_mappings['distilgpt2'] = ["c_attn"] + if "h2ogpt" in base_model and not llama_type and flash_attention: + log("Enabling Flash attention") + # speed up forward prop for attention layer and reduce memory especially for long context lengths + from flash_attn.models.gpt import GPTLMHeadModel + from flash_attn.models.gpt_neox import gpt_neox_config_to_gpt2_config + from flash_attn.models.gptj import gptj_config_to_gpt2_config + + if "gpt-j" in base_model.lower(): + config = gptj_config_to_gpt2_config(model.config) + else: + config = gpt_neox_config_to_gpt2_config(model.config) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True # GPT-NeoX-20B uses "gelu_fast" + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + lora_target_modules = ['Wqkv'] + model = GPTLMHeadModel.from_pretrained(base_model, config, device='cuda', dtype=torch.float16) + # for v in vars(model2.config): + # setattr(model.config, v, getattr(model2.config, v)) + # model.transformer.config = model.config + # model.transformer.h = model2.transformer.layers + # model.lm_head = model2.lm_head + ### model.transformer.wte = model2.transformer.wte + ### model.transformer.embeddings = model2.transformer.embeddings + print(model) + # FIXME - don't disable LoRA + lora_r = 0 + # FIXME - enable 8-bit + # model = prepare_model_for_int8_training(model) + if lora_weights: from peft import PeftModel From 92335ed06a02e935cae295fac12dfea2bf82a924 Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Wed, 10 May 2023 21:09:00 -0700 Subject: [PATCH 05/11] Remove manual install of flash-attn. --- README.md | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/README.md b/README.md index c9bb2a4b6..4c120433f 100644 --- a/README.md +++ b/README.md @@ -74,20 +74,6 @@ More information about the models can be found on [H2O.ai's Hugging Face page](h - To fine-tune any LLM models on your data, follow the [fine-tuning instructions](FINETUNE.md). 
- To create a container for deployment, follow the [Docker instructions](INSTALL-DOCKER.md). -6. Add Flash Attention - -```bash -git clone https://github.com/h2oai/flash-attention.git -cd flash-attention -python setup.py install -cd csrc/layer_norm -pip install . -cd ../rotary -pip install . -cd ../fused_dense_lib -pip install . -cd ../.. -``` ### Help [FAQs](FAQ.md) From bd1b00905bf37a3256dc83eb34ae228ab269600b Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Wed, 10 May 2023 21:24:16 -0700 Subject: [PATCH 06/11] Upgrade requirements, fixes sm80 issue. --- requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9f9f1e643..8c943d601 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,13 @@ # for generate (gradio server) and finetune -datasets==2.11.0 +datasets==2.12.0 sentencepiece==0.1.97 accelerate==0.18.0 gradio==3.27.0 -huggingface_hub==0.13.4 +huggingface_hub==0.14.1 appdirs==1.4.4 fire==0.5.0 docutils==0.19 -torch==2.0.0 +torch==2.0.1 evaluate==0.4.0 rouge_score==0.1.2 sacrebleu==2.3.1 From af4a98b13fd10e943e6964642863bb776c405714 Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Wed, 10 May 2023 23:04:45 -0700 Subject: [PATCH 07/11] Rebase, rename llama_flash_attn -> flash_attn. --- finetune.py | 78 ++++++++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/finetune.py b/finetune.py index 3aea12292..279b03d2f 100644 --- a/finetune.py +++ b/finetune.py @@ -154,7 +154,7 @@ def train( lora_dropout: float = 0.05, lora_target_modules: List[str] = None, llama_type: bool = None, - llama_flash_attn: bool = False, + flash_attn: bool = False, # llm hyperparams train_on_inputs: bool = True, # if False, masks out inputs in loss @@ -173,10 +173,9 @@ def train( save_steps: int = None, # must be round multiple of eval_steps save_total_limit: int = 3, add_eos_token: bool = False, - flash_attention: bool = False, ): - if llama_flash_attn: + if flash_attn: # Need to call this before importing transformers. from llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn replace_llama_attn_with_flash_attn() @@ -212,21 +211,16 @@ def train( tokenizer_base_model = base_model if llama_type is None: llama_type = "llama" in base_model.lower() - if llama_type and llama_flash_attn: + if flash_attn: import pkg_resources try: pkg_resources.get_distribution('flash_attn') - can_do_flash_attn = True + log("Enabling Flash attention") except (pkg_resources.DistributionNotFound, pkg_resources.ContextualVersionConflict): - can_do_flash_attn = False - - if not can_do_flash_attn: raise RuntimeError("""Flash attention not installed. NOTE: for current pytorch 2.0, flash attention requires installing cuda 11.7 via https://developer.nvidia.com/cuda-11-7-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=20.04&target_type=runfile_local and then when running, to avoid installing driver, docs, samples, just install toolkit. Then when pip installing flash attention do: CUDA_HOME=/usr/local/cuda-11.7 pip install flash-attn""") - from llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn - replace_llama_attn_with_flash_attn() assert ( base_model ), "Please specify a --base_model, e.g. 
--base_model='decapoda-research/llama-7b-hf'" @@ -297,36 +291,40 @@ def train( lora_mappings = mapping.TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy() lora_mappings['distilgpt2'] = ["c_attn"] - if "h2ogpt" in base_model and not llama_type and flash_attention: + if not llama_type and flash_attn: log("Enabling Flash attention") - # speed up forward prop for attention layer and reduce memory especially for long context lengths - from flash_attn.models.gpt import GPTLMHeadModel - from flash_attn.models.gpt_neox import gpt_neox_config_to_gpt2_config - from flash_attn.models.gptj import gptj_config_to_gpt2_config - - if "gpt-j" in base_model.lower(): - config = gptj_config_to_gpt2_config(model.config) - else: - config = gpt_neox_config_to_gpt2_config(model.config) - config.use_flash_attn = True - config.fused_bias_fc = True - config.fused_mlp = True # GPT-NeoX-20B uses "gelu_fast" - config.fused_dropout_add_ln = True - config.residual_in_fp32 = True - lora_target_modules = ['Wqkv'] - model = GPTLMHeadModel.from_pretrained(base_model, config, device='cuda', dtype=torch.float16) - # for v in vars(model2.config): - # setattr(model.config, v, getattr(model2.config, v)) - # model.transformer.config = model.config - # model.transformer.h = model2.transformer.layers - # model.lm_head = model2.lm_head - ### model.transformer.wte = model2.transformer.wte - ### model.transformer.embeddings = model2.transformer.embeddings - print(model) - # FIXME - don't disable LoRA - lora_r = 0 - # FIXME - enable 8-bit - # model = prepare_model_for_int8_training(model) + # from flash_attn.models.gpt import GPTLMHeadModel + # from flash_attn.models.gpt_neox import gpt_neox_config_to_gpt2_config + # from flash_attn.models.gptj import gptj_config_to_gpt2_config + # + # if "gpt-j" in base_model.lower(): + # config = gptj_config_to_gpt2_config(model.config) + # else: + # assert any([x in base_model.lower() for x in ["pythia", "h2ogpt", "gpt-neox"]]) + # config = gpt_neox_config_to_gpt2_config(model.config) + # config.use_flash_attn = True + # config.fused_bias_fc = True + # config.activation_function = 'gelu_fast' # GPT-NeoX-20B uses "gelu_fast" + # config.fused_mlp = True # GPT-NeoX-20B uses "gelu_fast" + # config.fused_dropout_add_ln = True + # config.residual_in_fp32 = True + # lora_target_modules = ['Wqkv'] + # # model = GPTLMHeadModel.from_pretrained(base_model, config, device='cuda', dtype=torch.float16) + # + # model = GPTLMHeadModel(config, base_model, device='cuda', dtype=torch.float16) + # # Load state_dict in cpu because we already initialized the model in GPU, and we don't + # # want extra stuff taking up more GPU memory + # state_dict = state_dict_from_pretrained( + # base_model, device='cpu', dtype=torch.float16 + # ) + # if base_model.startswith('EleutherAI/gpt-j-'): + # state_dict = remap_state_dict_hf_gptj(state_dict, config) + # strict = False # We have rotary_emb.inf_freq buffers not in the GPT-J checkpoint + # else: + # state_dict = remap_state_dict_hf_gpt_neox(state_dict, config) + # if world_size > 1: + # state_dict = shard_state_dict_tp(state_dict, config, world_size, rank) + # model.load_state_dict(state_dict, strict=True) if lora_weights: @@ -672,7 +670,7 @@ def compute_metrics(eval_preds): if torch.__version__ >= "2" and sys.platform != "win32": model = torch.compile(model) # WIP (not generally replacing layers until pytorch 2.1) - if not llama_flash_attn: + if not flash_attn: torch.backends.cuda.enable_flash_sdp(True) if gpus > 1 and not ddp: From b9dcb7da6ac5f49603f7bbe8937ccf1a3d8f3883 
Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Wed, 10 May 2023 23:46:21 -0700 Subject: [PATCH 08/11] WIP. Add back custom install for flash-attn. --- README.md | 14 ++++++++++++++ test_flash_attn_gpt_neox.py | 9 ++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4c120433f..c9bb2a4b6 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,20 @@ More information about the models can be found on [H2O.ai's Hugging Face page](h - To fine-tune any LLM models on your data, follow the [fine-tuning instructions](FINETUNE.md). - To create a container for deployment, follow the [Docker instructions](INSTALL-DOCKER.md). +6. Add Flash Attention + +```bash +git clone https://github.com/h2oai/flash-attention.git +cd flash-attention +python setup.py install +cd csrc/layer_norm +pip install . +cd ../rotary +pip install . +cd ../fused_dense_lib +pip install . +cd ../.. +``` ### Help [FAQs](FAQ.md) diff --git a/test_flash_attn_gpt_neox.py b/test_flash_attn_gpt_neox.py index 4a0b2b857..176369ddb 100644 --- a/test_flash_attn_gpt_neox.py +++ b/test_flash_attn_gpt_neox.py @@ -1,7 +1,8 @@ import torch import pytest +from flash_attn.models.gptj import gptj_config_to_gpt2_config -from transformers import GPTNeoXConfig +from transformers import GPTNeoXConfig, GPTJConfig from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM from flash_attn.models.gpt import GPTLMHeadModel @@ -29,8 +30,9 @@ def test_gptj_state_dict(model_name): @pytest.mark.parametrize( 'model_name', [ - # "EleutherAI/gpt-neox-20b", - 'togethercomputer/GPT-NeoXT-Chat-Base-20B', + "EleutherAI/gpt-neox-20b", + # "h2oai/h2ogpt-oig-oasst1-512-6.9b", + # "h2oai/h2ogpt-oasst1-512-12b", ] ) def test_gpt_neox_optimized(model_name): @@ -41,6 +43,7 @@ def test_gpt_neox_optimized(model_name): dtype = torch.float16 device = 'cuda' config = gpt_neox_config_to_gpt2_config(GPTNeoXConfig.from_pretrained(model_name)) + # config = gptj_config_to_gpt2_config(GPTJConfig.from_pretrained(model_name)) config.use_flash_attn = True config.fused_bias_fc = True config.fused_mlp = True # GPT-NeoX-20B uses "gelu_fast" From ee79bfa458f5f89696083315a687aa03136f8c51 Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Thu, 11 May 2023 14:09:54 -0700 Subject: [PATCH 09/11] Cleanup. --- README.md | 14 ------ finetune.py | 35 -------------- test_flash_attn_gpt_neox.py | 95 ------------------------------------- 3 files changed, 144 deletions(-) delete mode 100644 test_flash_attn_gpt_neox.py diff --git a/README.md b/README.md index c9bb2a4b6..4c120433f 100644 --- a/README.md +++ b/README.md @@ -74,20 +74,6 @@ More information about the models can be found on [H2O.ai's Hugging Face page](h - To fine-tune any LLM models on your data, follow the [fine-tuning instructions](FINETUNE.md). - To create a container for deployment, follow the [Docker instructions](INSTALL-DOCKER.md). -6. Add Flash Attention - -```bash -git clone https://github.com/h2oai/flash-attention.git -cd flash-attention -python setup.py install -cd csrc/layer_norm -pip install . -cd ../rotary -pip install . -cd ../fused_dense_lib -pip install . -cd ../.. 
-``` ### Help [FAQs](FAQ.md) diff --git a/finetune.py b/finetune.py index 279b03d2f..d2ca116c0 100644 --- a/finetune.py +++ b/finetune.py @@ -291,41 +291,6 @@ def train( lora_mappings = mapping.TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy() lora_mappings['distilgpt2'] = ["c_attn"] - if not llama_type and flash_attn: - log("Enabling Flash attention") - # from flash_attn.models.gpt import GPTLMHeadModel - # from flash_attn.models.gpt_neox import gpt_neox_config_to_gpt2_config - # from flash_attn.models.gptj import gptj_config_to_gpt2_config - # - # if "gpt-j" in base_model.lower(): - # config = gptj_config_to_gpt2_config(model.config) - # else: - # assert any([x in base_model.lower() for x in ["pythia", "h2ogpt", "gpt-neox"]]) - # config = gpt_neox_config_to_gpt2_config(model.config) - # config.use_flash_attn = True - # config.fused_bias_fc = True - # config.activation_function = 'gelu_fast' # GPT-NeoX-20B uses "gelu_fast" - # config.fused_mlp = True # GPT-NeoX-20B uses "gelu_fast" - # config.fused_dropout_add_ln = True - # config.residual_in_fp32 = True - # lora_target_modules = ['Wqkv'] - # # model = GPTLMHeadModel.from_pretrained(base_model, config, device='cuda', dtype=torch.float16) - # - # model = GPTLMHeadModel(config, base_model, device='cuda', dtype=torch.float16) - # # Load state_dict in cpu because we already initialized the model in GPU, and we don't - # # want extra stuff taking up more GPU memory - # state_dict = state_dict_from_pretrained( - # base_model, device='cpu', dtype=torch.float16 - # ) - # if base_model.startswith('EleutherAI/gpt-j-'): - # state_dict = remap_state_dict_hf_gptj(state_dict, config) - # strict = False # We have rotary_emb.inf_freq buffers not in the GPT-J checkpoint - # else: - # state_dict = remap_state_dict_hf_gpt_neox(state_dict, config) - # if world_size > 1: - # state_dict = shard_state_dict_tp(state_dict, config, world_size, rank) - # model.load_state_dict(state_dict, strict=True) - if lora_weights: from peft import PeftModel diff --git a/test_flash_attn_gpt_neox.py b/test_flash_attn_gpt_neox.py deleted file mode 100644 index 176369ddb..000000000 --- a/test_flash_attn_gpt_neox.py +++ /dev/null @@ -1,95 +0,0 @@ -import torch -import pytest -from flash_attn.models.gptj import gptj_config_to_gpt2_config - -from transformers import GPTNeoXConfig, GPTJConfig -from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM - -from flash_attn.models.gpt import GPTLMHeadModel -from flash_attn.models.gpt_neox import remap_state_dict_hf_gpt_neox, gpt_neox_config_to_gpt2_config -from flash_attn.utils.pretrained import state_dict_from_pretrained - - -@pytest.mark.parametrize( - 'model_name', - [ - # "EleutherAI/gpt-neox-20b", - 'togethercomputer/GPT-NeoXT-Chat-Base-20B', - ] -) -def test_gptj_state_dict(model_name): - config = gpt_neox_config_to_gpt2_config(GPTNeoXConfig.from_pretrained(model_name)) - pretrained_state_dict = remap_state_dict_hf_gpt_neox(state_dict_from_pretrained(model_name), config) - model = GPTLMHeadModel(config, device='meta') # Without device='meta' init is very slow - state_dict = model.state_dict() - assert state_dict.keys() == pretrained_state_dict.keys() - for k in state_dict.keys(): - assert state_dict[k].shape == pretrained_state_dict[k].shape - - -@pytest.mark.parametrize( - 'model_name', - [ - "EleutherAI/gpt-neox-20b", - # "h2oai/h2ogpt-oig-oasst1-512-6.9b", - # "h2oai/h2ogpt-oasst1-512-12b", - ] -) -def test_gpt_neox_optimized(model_name): - """Check that our implementation of GPT-NeoX (with all 
optimizations enabled) matches the - HF implementation: the output of our forward pass in fp16 should be around the same as the HF - forward pass in fp16, when compared to the HF forward pass in fp32. - """ - dtype = torch.float16 - device = 'cuda' - config = gpt_neox_config_to_gpt2_config(GPTNeoXConfig.from_pretrained(model_name)) - # config = gptj_config_to_gpt2_config(GPTJConfig.from_pretrained(model_name)) - config.use_flash_attn = True - config.fused_bias_fc = True - config.fused_mlp = True # GPT-NeoX-20B uses "gelu_fast" - config.fused_dropout_add_ln = True - config.residual_in_fp32 = True - - model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) - model.eval() - - torch.manual_seed(0) - batch_size = 2 - max_seqlen = 256 - input_ids = torch.randint(0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, - device=device) - with torch.no_grad(): - out = model.transformer(input_ids) - logits = model(input_ids).logits - del model - - # Need at least 2 GPUs, otherwise we'll OOM - # Without device_map, the model is loaded on the CPU, which is very slow - model_ref = GPTNeoXForCausalLM.from_pretrained(model_name, device_map='auto') - model_ref.eval() - with torch.no_grad(): - out_ref = model_ref.gpt_neox(input_ids).last_hidden_state.to(device=device) - logits_ref = model_ref(input_ids).logits.to(device=device) - del model_ref - - model_hf = GPTNeoXForCausalLM.from_pretrained(model_name, torch_dtype=dtype, - device_map={"": device}) - model_hf.eval() - with torch.no_grad(): - out_hf = model_hf.gpt_neox(input_ids).last_hidden_state - logits_hf = model_hf(input_ids).logits - del model_hf - - print(f'Output max diff: {(out - out_ref).abs().max().item()}') - print(f'Output mean diff: {(out - out_ref).abs().mean().item()}') - print(f'HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}') - print(f'HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}') - assert (out - out_ref).abs().max().item() < 2 * (out_hf - out_ref).abs().max().item() - assert (out - out_ref).abs().mean().item() < 2 * (out_hf - out_ref).abs().mean().item() - - print(f'Logits max diff: {(logits - logits_ref).abs().max().item()}') - print(f'Logits mean diff: {(logits - logits_ref).abs().mean().item()}') - print(f'HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}') - print(f'HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}') - assert (logits - logits_ref).abs().max().item() < 2 * (logits_hf - logits_ref).abs().max().item() - assert (logits - logits_ref).abs().mean().item() < 2 * (logits_hf - logits_ref).abs().mean().item() From e286ec31375a84643b1c7b18b0d4a93c24fb1d6c Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Thu, 11 May 2023 14:11:14 -0700 Subject: [PATCH 10/11] Revert name change. --- finetune.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/finetune.py b/finetune.py index d2ca116c0..2ab1e1a2e 100644 --- a/finetune.py +++ b/finetune.py @@ -154,7 +154,7 @@ def train( lora_dropout: float = 0.05, lora_target_modules: List[str] = None, llama_type: bool = None, - flash_attn: bool = False, + llama_flash_attn: bool = False, # llm hyperparams train_on_inputs: bool = True, # if False, masks out inputs in loss @@ -175,7 +175,7 @@ def train( add_eos_token: bool = False, ): - if flash_attn: + if llama_flash_attn: # Need to call this before importing transformers. 
from llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn replace_llama_attn_with_flash_attn() @@ -211,7 +211,7 @@ def train( tokenizer_base_model = base_model if llama_type is None: llama_type = "llama" in base_model.lower() - if flash_attn: + if llama_flash_attn: import pkg_resources try: pkg_resources.get_distribution('flash_attn') @@ -635,7 +635,7 @@ def compute_metrics(eval_preds): if torch.__version__ >= "2" and sys.platform != "win32": model = torch.compile(model) # WIP (not generally replacing layers until pytorch 2.1) - if not flash_attn: + if not llama_flash_attn: torch.backends.cuda.enable_flash_sdp(True) if gpus > 1 and not ddp: From cc43cc56a1c3e41b2dd40f560c74d8c6704f865d Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Thu, 11 May 2023 14:12:32 -0700 Subject: [PATCH 11/11] Revert more changes. --- finetune.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/finetune.py b/finetune.py index 2ab1e1a2e..4b4f8922a 100644 --- a/finetune.py +++ b/finetune.py @@ -211,12 +211,15 @@ def train( tokenizer_base_model = base_model if llama_type is None: llama_type = "llama" in base_model.lower() - if llama_flash_attn: + if llama_type and llama_flash_attn: import pkg_resources try: pkg_resources.get_distribution('flash_attn') - log("Enabling Flash attention") + can_do_flash_attn = True except (pkg_resources.DistributionNotFound, pkg_resources.ContextualVersionConflict): + can_do_flash_attn = False + + if not can_do_flash_attn: raise RuntimeError("""Flash attention not installed. NOTE: for current pytorch 2.0, flash attention requires installing cuda 11.7 via https://developer.nvidia.com/cuda-11-7-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=20.04&target_type=runfile_local and then when running, to avoid installing driver, docs, samples, just install toolkit. Then when pip installing flash attention do: