From 8a1f5cea48787ff50d3f3b5565a4ca2ada910aa4 Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Tue, 11 Apr 2023 13:29:27 -0700 Subject: [PATCH 01/11] Add Flash attention code from https://github.com/HazyResearch/flash-attention/blob/main/tests/models/test_gpt_neox.py --- requirements.txt | 1 + test_flash_attn_gpt_neox.py | 92 +++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 test_flash_attn_gpt_neox.py diff --git a/requirements.txt b/requirements.txt index 9f9f1e643..b68471d2e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -49,3 +49,4 @@ pypandoc==1.11 openpyxl==3.1.2 lm_dataformat==0.0.20 bioc==2.0 +git+https://github.com/HazyResearch/flash-attention.git@d478eeec8f16c7939c54e4617dbd36f59b8eeed7 \ No newline at end of file diff --git a/test_flash_attn_gpt_neox.py b/test_flash_attn_gpt_neox.py new file mode 100644 index 000000000..4a0b2b857 --- /dev/null +++ b/test_flash_attn_gpt_neox.py @@ -0,0 +1,92 @@ +import torch +import pytest + +from transformers import GPTNeoXConfig +from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM + +from flash_attn.models.gpt import GPTLMHeadModel +from flash_attn.models.gpt_neox import remap_state_dict_hf_gpt_neox, gpt_neox_config_to_gpt2_config +from flash_attn.utils.pretrained import state_dict_from_pretrained + + +@pytest.mark.parametrize( + 'model_name', + [ + # "EleutherAI/gpt-neox-20b", + 'togethercomputer/GPT-NeoXT-Chat-Base-20B', + ] +) +def test_gptj_state_dict(model_name): + config = gpt_neox_config_to_gpt2_config(GPTNeoXConfig.from_pretrained(model_name)) + pretrained_state_dict = remap_state_dict_hf_gpt_neox(state_dict_from_pretrained(model_name), config) + model = GPTLMHeadModel(config, device='meta') # Without device='meta' init is very slow + state_dict = model.state_dict() + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +@pytest.mark.parametrize( + 'model_name', + [ + # "EleutherAI/gpt-neox-20b", + 'togethercomputer/GPT-NeoXT-Chat-Base-20B', + ] +) +def test_gpt_neox_optimized(model_name): + """Check that our implementation of GPT-NeoX (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. 
+ """ + dtype = torch.float16 + device = 'cuda' + config = gpt_neox_config_to_gpt2_config(GPTNeoXConfig.from_pretrained(model_name)) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True # GPT-NeoX-20B uses "gelu_fast" + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + input_ids = torch.randint(0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, + device=device) + with torch.no_grad(): + out = model.transformer(input_ids) + logits = model(input_ids).logits + del model + + # Need at least 2 GPUs, otherwise we'll OOM + # Without device_map, the model is loaded on the CPU, which is very slow + model_ref = GPTNeoXForCausalLM.from_pretrained(model_name, device_map='auto') + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.gpt_neox(input_ids).last_hidden_state.to(device=device) + logits_ref = model_ref(input_ids).logits.to(device=device) + del model_ref + + model_hf = GPTNeoXForCausalLM.from_pretrained(model_name, torch_dtype=dtype, + device_map={"": device}) + model_hf.eval() + with torch.no_grad(): + out_hf = model_hf.gpt_neox(input_ids).last_hidden_state + logits_hf = model_hf(input_ids).logits + del model_hf + + print(f'Output max diff: {(out - out_ref).abs().max().item()}') + print(f'Output mean diff: {(out - out_ref).abs().mean().item()}') + print(f'HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}') + print(f'HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}') + assert (out - out_ref).abs().max().item() < 2 * (out_hf - out_ref).abs().max().item() + assert (out - out_ref).abs().mean().item() < 2 * (out_hf - out_ref).abs().mean().item() + + print(f'Logits max diff: {(logits - logits_ref).abs().max().item()}') + print(f'Logits mean diff: {(logits - logits_ref).abs().mean().item()}') + print(f'HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}') + print(f'HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}') + assert (logits - logits_ref).abs().max().item() < 2 * (logits_hf - logits_ref).abs().max().item() + assert (logits - logits_ref).abs().mean().item() < 2 * (logits_hf - logits_ref).abs().mean().item() From cec2ed630f3e3ccd0aa2c41bf6519c5ea286d55f Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Tue, 11 Apr 2023 13:58:01 -0700 Subject: [PATCH 02/11] Add instructions to install Apex. --- README.md | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/README.md b/README.md index 4c120433f..d19232ab5 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,126 @@ More information about the models can be found on [H2O.ai's Hugging Face page](h - To fine-tune any LLM models on your data, follow the [fine-tuning instructions](FINETUNE.md). - To create a container for deployment, follow the [Docker instructions](INSTALL-DOCKER.md). +6. 
Compile Apex (for Flash attention, needs CUDA 11.7 or above) [howto src](https://github.com/NVIDIA/apex/#linux)
+
+```bash
+git clone https://github.com/NVIDIA/apex
+cd apex
+pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
+```
+
+Fine-tune on single GPU on single node:
+```
+torchrun finetune.py --base_model='EleutherAI/gpt-j-6B' --data_path=alpaca_data_cleaned.json
+```
+This will download the model, load the data, and generate an output directory `lora-alpaca`.
+
+Fine-tune using 2 nodes with 2 GPUs each:
+```
+WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1" torchrun --nnodes=2 --master_addr="10.10.10.2" --node_rank=0 --nproc_per_node=2 --master_port=1234 finetune.py --data_path=alpaca_data_cleaned.json --run_id=0 --base_model='EleutherAI/gpt-j-6B'
+
+WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1" torchrun --nnodes=2 --master_addr="10.10.10.2" --node_rank=1 --nproc_per_node=2 --master_port=1234 finetune.py --data_path=alpaca_data_cleaned.json --run_id=0 --base_model='EleutherAI/gpt-j-6B'
+```
+
+Fine-tune using 2 24GB GPUs to split up a 30B model:
+```
+WORLD_SIZE=2 python finetune.py --data_path=alpaca_data_cleaned.json --base_model="decapoda-research/llama-30b-hf" --ddp=False
+```
+
+Fine-tune a previously saved model (after running `export_hf_checkpoint.py`):
+```
+WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1" torchrun --nnodes=2 --master_addr="10.10.10.2" --node_rank=0 --nproc_per_node=2 --master_port=1234 finetune.py --num_epochs=2 --micro_batch_size=8 --data_path=alpaca_data_cleaned.json --run_id=3 --base_model='gpt-j-6B.DAIdocs' --tokenizer_base_model='EleutherAI/gpt-j-6B' --output_dir=lora_6B.DAIdocs &> 3.node0.log
+
+WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1" torchrun --nnodes=2 --master_addr="10.10.10.2" --node_rank=1 --nproc_per_node=2 --master_port=1234 finetune.py --num_epochs=2 --micro_batch_size=8 --data_path=alpaca_data_cleaned.json --run_id=3 --base_model='gpt-j-6B.DAIdocs' --tokenizer_base_model='EleutherAI/gpt-j-6B' --output_dir=lora_6B.DAIdocs &> 3.node1.log
+```
+
+Generate on single GPU on single node:
+```
+torchrun generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights=lora-alpaca
+```
+This will download the foundation model and our fine-tuned lora_weights, and open up a GUI with text generation input/output.
+
+
+In case you get peer-to-peer related errors, set this env var:
+```
+export NCCL_P2P_LEVEL=LOC
+```
+
+
+### Docker Setup & Inference
+
+1. Build the container image:
+
+```bash
+docker build -t h2o-llm .
+```
+
+2. Run the container (you can also use `finetune.py` and all of its parameters as shown above for training):
+
+```bash
+docker run --runtime=nvidia --shm-size=64g -p 7860:7860 -v ${HOME}/.cache:/root/.cache --rm h2o-llm -it generate.py \
+    --load_8bit=True --base_model='EleutherAI/gpt-neox-20b' --prompt_type=human_bot
+```
+
+3. Open `https://localhost:7860` in the browser
+
+### Docker Compose Setup & Inference
+
+1. (optional) Change the desired model and weights under `environment` in `docker-compose.yml`
+
+2. Build and run the container
+
+```bash
+docker-compose up -d --build
+```
+
+3. Open `https://localhost:7860` in the browser
+
+4. See logs:
+
+```bash
+docker-compose logs -f
+```
+
+5. Clean everything up:
+
+```bash
+docker-compose down --volumes --rmi all
+```
+
+
+### Tensorboard
+
+```bash
+tensorboard --logdir=runs/
+```
+
+### Plan
+Open-source instruct model for demoable use cases.
+1. Base: Start with fully open source Apache 2.0 models EleutherAI--gpt-j-6B, EleutherAI--gpt-neox-20b,
+GPT-NeoXT-Chat-Base-20B, etc.
+2. Construct Prompt: Set up prompt engineering on 6B-20B as-is to convert a sentence into question/answer or command/response format (a minimal sketch follows the link list below)
+3. Open-Source Instruct Data: Convert wiki data into instruct form
+4. Fine-tune: LORA fine-tune 6B and 20B using DAI docs
+5. Open Data & Model: Submit DAI docs model to Hugging Face
+6. Use the Toolformer approach for external APIs
+
+### Goals
+1. Demonstrate fine-tuning working on some existing corpus
+2. Demonstrate efficiency of LORA for fast and low-memory fine-tuning
+
+
+### Code to consider including
+[flan-alpaca](https://github.com/declare-lab/flan-alpaca)
+[text-generation-webui](https://github.com/oobabooga/text-generation-webui)
+[minimal-llama](https://github.com/zphang/minimal-llama/)
+[finetune GPT-NeoX](https://nn.labml.ai/neox/samples/finetune.html)
+[GPTQ-for_LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa/compare/cuda...Digitous:GPTQ-for-GPT-NeoX:main)
+[OpenChatKit on multi-GPU](https://github.com/togethercomputer/OpenChatKit/issues/20)
+[Non-Causal LLM](https://huggingface.co/docs/transformers/main/en/model_doc/gptj#transformers.GPTJForSequenceClassification)
+[OpenChatKit_Offload](https://github.com/togethercomputer/OpenChatKit/commit/148b5745a57a6059231178c41859ecb09164c157)
+[Flan-alpaca](https://github.com/declare-lab/flan-alpaca/blob/main/training.py)
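To make the prompt-construction and instruct-data steps in the plan above concrete, here is a minimal sketch that turns plain question/answer pairs into Alpaca-style `instruction`/`input`/`output` records of the kind `--data_path=alpaca_data_cleaned.json` points at. The schema and the output filename are assumptions for illustration; the exact fields `finetune.py` expects should be checked against [FINETUNE.md](FINETUNE.md).

```python
# Sketch only: convert raw question/answer pairs into Alpaca-style instruct
# records. The instruction/input/output schema is assumed from
# alpaca_data_cleaned.json, and my_instruct_data.json is an arbitrary name.
import json

raw_pairs = [
    ("What license do the base models use?", "Apache 2.0."),
    ("Which 20B base model is suggested?", "EleutherAI/gpt-neox-20b."),
]

records = [
    {"instruction": question, "input": "", "output": answer}
    for question, answer in raw_pairs
]

with open("my_instruct_data.json", "w") as f:
    json.dump(records, f, indent=2)

print(f"Wrote {len(records)} instruct records to my_instruct_data.json")
```

If the schema matches, the resulting file can be passed to the fine-tuning commands above via `--data_path=my_instruct_data.json`.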
+ ### Help [FAQs](FAQ.md) From 6e163e23664ebed827043bdec9261af666eb025f Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Tue, 11 Apr 2023 14:27:27 -0700 Subject: [PATCH 03/11] Update instructions for flash attention --- README.md | 128 ++++------------------------------------------- requirements.txt | 1 - 2 files changed, 11 insertions(+), 118 deletions(-) diff --git a/README.md b/README.md index d19232ab5..1358c32ef 100644 --- a/README.md +++ b/README.md @@ -74,126 +74,20 @@ More information about the models can be found on [H2O.ai's Hugging Face page](h - To fine-tune any LLM models on your data, follow the [fine-tuning instructions](FINETUNE.md). - To create a container for deployment, follow the [Docker instructions](INSTALL-DOCKER.md). -6. Compile Apex (for Flash attention, needs CUDA 11.7 above) [howto src](https://github.com/NVIDIA/apex/#linux) +6. Add Flash Attention ```bash -git clone https://github.com/NVIDIA/apex -cd apex -pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ +git clone https://github.com/HazyResearch/flash-attention.git +cd flash-attention +python setup.py install +cd csrc/layer_norm +pip install . +cd ../rotary +pip install . +cd ../fused_dense_lib +pip install . +cd ../.. ``` - -Fine-tune on single GPU on single node: -``` -torchrun finetune.py --base_model='EleutherAI/gpt-j-6B' --data_path=alpaca_data_cleaned.json -``` -this will download the model, load the data, and generate an output directory lora-alpaca. - -Fine-tune using 2 nodes with 2 GPUs each: -``` -WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1" torchrun --nnodes=2 --master_addr="10.10.10.2" --node_rank=0 --nproc_per_node=2 --master_port=1234 finetune.py --data_path=alpaca_data_cleaned.json --run_id=0 --base_model='EleutherAI/gpt-j-6B' - -WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1" torchrun --nnodes=2 --master_addr="10.10.10.2" --node_rank=1 --nproc_per_node=2 --master_port=1234 finetune.py --data_path=alpaca_data_cleaned.json --run_id=0 --base_model='EleutherAI/gpt-j-6B' -``` - -Fine-tune using 2 24GB GPUs to split up a 30B model: -``` -WORLD_SIZE=2 python finetune.py --data_path=alpaca_data_cleaned.json --base_model="decapoda-research/llama-30b-hf" --ddp=False -``` - -Fine-tune previously saved model (running `export_hf_checkpoint.py`): -``` -WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1" torchrun --nnodes=2 --master_addr="10.10.10.2" --node_rank=0 --nproc_per_node=2 --master_port=1234 finetune.py --num_epochs=2 --micro_batch_size=8 --data_path=alpaca_data_cleaned.json --run_id=3 --base_model='gpt-j-6B.DAIdocs' --tokenizer_base_model='EleutherAI/gpt-j-6B' --output_dir=lora_6B.DAIdocs &> 3.node0.log - -WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1" torchrun --nnodes=2 --master_addr="10.10.10.2" --node_rank=1 --nproc_per_node=2 --master_port=1234 finetune.py --num_epochs=2 --micro_batch_size=8 --data_path=alpaca_data_cleaned.json --run_id=3 --base_model='gpt-j-6B.DAIdocs' --tokenizer_base_model='EleutherAI/gpt-j-6B' --output_dir=lora_6B.DAIdocs &> 3.node1.log -``` - -Generate on single GPU on single node: -``` -torchrun generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights=lora-alpaca -``` -this will download the foundation model, our fine-tuned lora_weights, and open up a GUI with text generation input/output. - - -In case you get peer to peer related errors, set this env var: -``` -export NCCL_P2P_LEVEL=LOC -``` - - -### Docker Setup & Inference - -1. Build the container image: - -```bash -docker build -t h2o-llm . -``` - -2. 
Run the container (you can also use `finetune.py` and all of its parameters as shown above for training): - -```bash -docker run --runtime=nvidia --shm-size=64g -p 7860:7860 -v ${HOME}/.cache:/root/.cache --rm h2o-llm -it generate.py \ - --load_8bit=True --base_model='EleutherAI/gpt-neox-20b' --prompt_type=human_bot -``` - -3. Open `https://localhost:7860` in the browser - -### Docker Compose Setup & Inference - -1. (optional) Change desired model and weights under `environment` in the `docker-compose.yml` - -2. Build and run the container - -```bash -docker-compose up -d --build -``` - -3. Open `https://localhost:7860` in the browser - -4. See logs: - -```bash -docker-compose logs -f -``` - -5. Clean everything up: - -```bash -docker-compose down --volumes --rmi all -``` - - -### Tensorboard - -```bash -tensorboard --logdir=runs/ -``` - -### Plan -Open source instruct model for demoable usecases. -1. Base: Start with fully open source apache 2.0 models EleutherAI--gpt-j-6B, EleutherAI--gpt-neox-20b, -GPT-NeoXT-Chat-Base-20B, etc. -2. Construct Prompt: Setup prompt engineering on 6B-20B as-is to convert a sentence into question/answer or command/response format -3. Open-Source Instruct Data: Convert wiki data into instruct form -4. Fine-tune: LORA fine-tune 6B and 20B using DAI docs -5. Open Data & Model: Submit DAI docs model huggingface -6. Use toolformer approach for external APIs - -### Goals -1. Demonstrate fine-tuning working on some existing corpus -2. Demonstrate efficiency of LORA for fast and low-memory fine-tuning - - -### Code to consider including -[flan-alpaca](https://github.com/declare-lab/flan-alpaca)
-[text-generation-webui](https://github.com/oobabooga/text-generation-webui)
-[minimal-llama](https://github.com/zphang/minimal-llama/)
-[finetune GPT-NeoX](https://nn.labml.ai/neox/samples/finetune.html)
-[GPTQ-for_LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa/compare/cuda...Digitous:GPTQ-for-GPT-NeoX:main)
-[OpenChatKit on multi-GPU](https://github.com/togethercomputer/OpenChatKit/issues/20)
-[Non-Causal LLM](https://huggingface.co/docs/transformers/main/en/model_doc/gptj#transformers.GPTJForSequenceClassification)
-[OpenChatKit_Offload](https://github.com/togethercomputer/OpenChatKit/commit/148b5745a57a6059231178c41859ecb09164c157)
-[Flan-alpaca](https://github.com/declare-lab/flan-alpaca/blob/main/training.py)
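The `finetune.py` and test changes in this series enable `use_flash_attn`, `fused_bias_fc`, `fused_mlp`, and `fused_dropout_add_ln`, which depend on the extensions built from `csrc/` in the flash-attention install steps above. A quick import check, sketched below with extension module names assumed rather than taken from this repository, can confirm the build before those flags are turned on.

```python
# Rough post-install sanity check for the flash-attention source build above.
# The extension module names are assumptions about what the csrc/ subpackages
# install (backing use_flash_attn, fused_dropout_add_ln, rotary embeddings,
# and fused_bias_fc/fused_mlp respectively); adjust if the build layout differs.
import importlib

for name in ["flash_attn", "dropout_layer_norm", "rotary_emb", "fused_dense_lib"]:
    try:
        importlib.import_module(name)
        print(f"OK      {name}")
    except ImportError as exc:
        print(f"MISSING {name}: {exc}")
```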
- ### Help [FAQs](FAQ.md) diff --git a/requirements.txt b/requirements.txt index b68471d2e..9f9f1e643 100644 --- a/requirements.txt +++ b/requirements.txt @@ -49,4 +49,3 @@ pypandoc==1.11 openpyxl==3.1.2 lm_dataformat==0.0.20 bioc==2.0 -git+https://github.com/HazyResearch/flash-attention.git@d478eeec8f16c7939c54e4617dbd36f59b8eeed7 \ No newline at end of file From ff4307f7f4af2bb2afac674084b7e744da55d9f9 Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Tue, 11 Apr 2023 16:44:05 -0700 Subject: [PATCH 04/11] WIP - nothing working yet. Disable mix_in by default. --- README.md | 2 +- finetune.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1358c32ef..c9bb2a4b6 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ More information about the models can be found on [H2O.ai's Hugging Face page](h 6. Add Flash Attention ```bash -git clone https://github.com/HazyResearch/flash-attention.git +git clone https://github.com/h2oai/flash-attention.git cd flash-attention python setup.py install cd csrc/layer_norm diff --git a/finetune.py b/finetune.py index bd1c696e6..3aea12292 100644 --- a/finetune.py +++ b/finetune.py @@ -173,6 +173,7 @@ def train( save_steps: int = None, # must be round multiple of eval_steps save_total_limit: int = 3, add_eos_token: bool = False, + flash_attention: bool = False, ): if llama_flash_attn: @@ -296,6 +297,37 @@ def train( lora_mappings = mapping.TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy() lora_mappings['distilgpt2'] = ["c_attn"] + if "h2ogpt" in base_model and not llama_type and flash_attention: + log("Enabling Flash attention") + # speed up forward prop for attention layer and reduce memory especially for long context lengths + from flash_attn.models.gpt import GPTLMHeadModel + from flash_attn.models.gpt_neox import gpt_neox_config_to_gpt2_config + from flash_attn.models.gptj import gptj_config_to_gpt2_config + + if "gpt-j" in base_model.lower(): + config = gptj_config_to_gpt2_config(model.config) + else: + config = gpt_neox_config_to_gpt2_config(model.config) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True # GPT-NeoX-20B uses "gelu_fast" + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + lora_target_modules = ['Wqkv'] + model = GPTLMHeadModel.from_pretrained(base_model, config, device='cuda', dtype=torch.float16) + # for v in vars(model2.config): + # setattr(model.config, v, getattr(model2.config, v)) + # model.transformer.config = model.config + # model.transformer.h = model2.transformer.layers + # model.lm_head = model2.lm_head + ### model.transformer.wte = model2.transformer.wte + ### model.transformer.embeddings = model2.transformer.embeddings + print(model) + # FIXME - don't disable LoRA + lora_r = 0 + # FIXME - enable 8-bit + # model = prepare_model_for_int8_training(model) + if lora_weights: from peft import PeftModel From 92335ed06a02e935cae295fac12dfea2bf82a924 Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Wed, 10 May 2023 21:09:00 -0700 Subject: [PATCH 05/11] Remove manual install of flash-attn. --- README.md | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/README.md b/README.md index c9bb2a4b6..4c120433f 100644 --- a/README.md +++ b/README.md @@ -74,20 +74,6 @@ More information about the models can be found on [H2O.ai's Hugging Face page](h - To fine-tune any LLM models on your data, follow the [fine-tuning instructions](FINETUNE.md). 
- To create a container for deployment, follow the [Docker instructions](INSTALL-DOCKER.md). -6. Add Flash Attention - -```bash -git clone https://github.com/h2oai/flash-attention.git -cd flash-attention -python setup.py install -cd csrc/layer_norm -pip install . -cd ../rotary -pip install . -cd ../fused_dense_lib -pip install . -cd ../.. -``` ### Help [FAQs](FAQ.md) From bd1b00905bf37a3256dc83eb34ae228ab269600b Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Wed, 10 May 2023 21:24:16 -0700 Subject: [PATCH 06/11] Upgrade requirements, fixes sm80 issue. --- requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9f9f1e643..8c943d601 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,13 @@ # for generate (gradio server) and finetune -datasets==2.11.0 +datasets==2.12.0 sentencepiece==0.1.97 accelerate==0.18.0 gradio==3.27.0 -huggingface_hub==0.13.4 +huggingface_hub==0.14.1 appdirs==1.4.4 fire==0.5.0 docutils==0.19 -torch==2.0.0 +torch==2.0.1 evaluate==0.4.0 rouge_score==0.1.2 sacrebleu==2.3.1 From af4a98b13fd10e943e6964642863bb776c405714 Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Wed, 10 May 2023 23:04:45 -0700 Subject: [PATCH 07/11] Rebase, rename llama_flash_attn -> flash_attn. --- finetune.py | 78 ++++++++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/finetune.py b/finetune.py index 3aea12292..279b03d2f 100644 --- a/finetune.py +++ b/finetune.py @@ -154,7 +154,7 @@ def train( lora_dropout: float = 0.05, lora_target_modules: List[str] = None, llama_type: bool = None, - llama_flash_attn: bool = False, + flash_attn: bool = False, # llm hyperparams train_on_inputs: bool = True, # if False, masks out inputs in loss @@ -173,10 +173,9 @@ def train( save_steps: int = None, # must be round multiple of eval_steps save_total_limit: int = 3, add_eos_token: bool = False, - flash_attention: bool = False, ): - if llama_flash_attn: + if flash_attn: # Need to call this before importing transformers. from llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn replace_llama_attn_with_flash_attn() @@ -212,21 +211,16 @@ def train( tokenizer_base_model = base_model if llama_type is None: llama_type = "llama" in base_model.lower() - if llama_type and llama_flash_attn: + if flash_attn: import pkg_resources try: pkg_resources.get_distribution('flash_attn') - can_do_flash_attn = True + log("Enabling Flash attention") except (pkg_resources.DistributionNotFound, pkg_resources.ContextualVersionConflict): - can_do_flash_attn = False - - if not can_do_flash_attn: raise RuntimeError("""Flash attention not installed. NOTE: for current pytorch 2.0, flash attention requires installing cuda 11.7 via https://developer.nvidia.com/cuda-11-7-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=20.04&target_type=runfile_local and then when running, to avoid installing driver, docs, samples, just install toolkit. Then when pip installing flash attention do: CUDA_HOME=/usr/local/cuda-11.7 pip install flash-attn""") - from llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn - replace_llama_attn_with_flash_attn() assert ( base_model ), "Please specify a --base_model, e.g. 
--base_model='decapoda-research/llama-7b-hf'" @@ -297,36 +291,40 @@ def train( lora_mappings = mapping.TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy() lora_mappings['distilgpt2'] = ["c_attn"] - if "h2ogpt" in base_model and not llama_type and flash_attention: + if not llama_type and flash_attn: log("Enabling Flash attention") - # speed up forward prop for attention layer and reduce memory especially for long context lengths - from flash_attn.models.gpt import GPTLMHeadModel - from flash_attn.models.gpt_neox import gpt_neox_config_to_gpt2_config - from flash_attn.models.gptj import gptj_config_to_gpt2_config - - if "gpt-j" in base_model.lower(): - config = gptj_config_to_gpt2_config(model.config) - else: - config = gpt_neox_config_to_gpt2_config(model.config) - config.use_flash_attn = True - config.fused_bias_fc = True - config.fused_mlp = True # GPT-NeoX-20B uses "gelu_fast" - config.fused_dropout_add_ln = True - config.residual_in_fp32 = True - lora_target_modules = ['Wqkv'] - model = GPTLMHeadModel.from_pretrained(base_model, config, device='cuda', dtype=torch.float16) - # for v in vars(model2.config): - # setattr(model.config, v, getattr(model2.config, v)) - # model.transformer.config = model.config - # model.transformer.h = model2.transformer.layers - # model.lm_head = model2.lm_head - ### model.transformer.wte = model2.transformer.wte - ### model.transformer.embeddings = model2.transformer.embeddings - print(model) - # FIXME - don't disable LoRA - lora_r = 0 - # FIXME - enable 8-bit - # model = prepare_model_for_int8_training(model) + # from flash_attn.models.gpt import GPTLMHeadModel + # from flash_attn.models.gpt_neox import gpt_neox_config_to_gpt2_config + # from flash_attn.models.gptj import gptj_config_to_gpt2_config + # + # if "gpt-j" in base_model.lower(): + # config = gptj_config_to_gpt2_config(model.config) + # else: + # assert any([x in base_model.lower() for x in ["pythia", "h2ogpt", "gpt-neox"]]) + # config = gpt_neox_config_to_gpt2_config(model.config) + # config.use_flash_attn = True + # config.fused_bias_fc = True + # config.activation_function = 'gelu_fast' # GPT-NeoX-20B uses "gelu_fast" + # config.fused_mlp = True # GPT-NeoX-20B uses "gelu_fast" + # config.fused_dropout_add_ln = True + # config.residual_in_fp32 = True + # lora_target_modules = ['Wqkv'] + # # model = GPTLMHeadModel.from_pretrained(base_model, config, device='cuda', dtype=torch.float16) + # + # model = GPTLMHeadModel(config, base_model, device='cuda', dtype=torch.float16) + # # Load state_dict in cpu because we already initialized the model in GPU, and we don't + # # want extra stuff taking up more GPU memory + # state_dict = state_dict_from_pretrained( + # base_model, device='cpu', dtype=torch.float16 + # ) + # if base_model.startswith('EleutherAI/gpt-j-'): + # state_dict = remap_state_dict_hf_gptj(state_dict, config) + # strict = False # We have rotary_emb.inf_freq buffers not in the GPT-J checkpoint + # else: + # state_dict = remap_state_dict_hf_gpt_neox(state_dict, config) + # if world_size > 1: + # state_dict = shard_state_dict_tp(state_dict, config, world_size, rank) + # model.load_state_dict(state_dict, strict=True) if lora_weights: @@ -672,7 +670,7 @@ def compute_metrics(eval_preds): if torch.__version__ >= "2" and sys.platform != "win32": model = torch.compile(model) # WIP (not generally replacing layers until pytorch 2.1) - if not llama_flash_attn: + if not flash_attn: torch.backends.cuda.enable_flash_sdp(True) if gpus > 1 and not ddp: From b9dcb7da6ac5f49603f7bbe8937ccf1a3d8f3883 
Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Wed, 10 May 2023 23:46:21 -0700 Subject: [PATCH 08/11] WIP. Add back custom install for flash-attn. --- README.md | 14 ++++++++++++++ test_flash_attn_gpt_neox.py | 9 ++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4c120433f..c9bb2a4b6 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,20 @@ More information about the models can be found on [H2O.ai's Hugging Face page](h - To fine-tune any LLM models on your data, follow the [fine-tuning instructions](FINETUNE.md). - To create a container for deployment, follow the [Docker instructions](INSTALL-DOCKER.md). +6. Add Flash Attention + +```bash +git clone https://github.com/h2oai/flash-attention.git +cd flash-attention +python setup.py install +cd csrc/layer_norm +pip install . +cd ../rotary +pip install . +cd ../fused_dense_lib +pip install . +cd ../.. +``` ### Help [FAQs](FAQ.md) diff --git a/test_flash_attn_gpt_neox.py b/test_flash_attn_gpt_neox.py index 4a0b2b857..176369ddb 100644 --- a/test_flash_attn_gpt_neox.py +++ b/test_flash_attn_gpt_neox.py @@ -1,7 +1,8 @@ import torch import pytest +from flash_attn.models.gptj import gptj_config_to_gpt2_config -from transformers import GPTNeoXConfig +from transformers import GPTNeoXConfig, GPTJConfig from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM from flash_attn.models.gpt import GPTLMHeadModel @@ -29,8 +30,9 @@ def test_gptj_state_dict(model_name): @pytest.mark.parametrize( 'model_name', [ - # "EleutherAI/gpt-neox-20b", - 'togethercomputer/GPT-NeoXT-Chat-Base-20B', + "EleutherAI/gpt-neox-20b", + # "h2oai/h2ogpt-oig-oasst1-512-6.9b", + # "h2oai/h2ogpt-oasst1-512-12b", ] ) def test_gpt_neox_optimized(model_name): @@ -41,6 +43,7 @@ def test_gpt_neox_optimized(model_name): dtype = torch.float16 device = 'cuda' config = gpt_neox_config_to_gpt2_config(GPTNeoXConfig.from_pretrained(model_name)) + # config = gptj_config_to_gpt2_config(GPTJConfig.from_pretrained(model_name)) config.use_flash_attn = True config.fused_bias_fc = True config.fused_mlp = True # GPT-NeoX-20B uses "gelu_fast" From ee79bfa458f5f89696083315a687aa03136f8c51 Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Thu, 11 May 2023 14:09:54 -0700 Subject: [PATCH 09/11] Cleanup. --- README.md | 14 ------ finetune.py | 35 -------------- test_flash_attn_gpt_neox.py | 95 ------------------------------------- 3 files changed, 144 deletions(-) delete mode 100644 test_flash_attn_gpt_neox.py diff --git a/README.md b/README.md index c9bb2a4b6..4c120433f 100644 --- a/README.md +++ b/README.md @@ -74,20 +74,6 @@ More information about the models can be found on [H2O.ai's Hugging Face page](h - To fine-tune any LLM models on your data, follow the [fine-tuning instructions](FINETUNE.md). - To create a container for deployment, follow the [Docker instructions](INSTALL-DOCKER.md). -6. Add Flash Attention - -```bash -git clone https://github.com/h2oai/flash-attention.git -cd flash-attention -python setup.py install -cd csrc/layer_norm -pip install . -cd ../rotary -pip install . -cd ../fused_dense_lib -pip install . -cd ../.. 
-``` ### Help [FAQs](FAQ.md) diff --git a/finetune.py b/finetune.py index 279b03d2f..d2ca116c0 100644 --- a/finetune.py +++ b/finetune.py @@ -291,41 +291,6 @@ def train( lora_mappings = mapping.TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy() lora_mappings['distilgpt2'] = ["c_attn"] - if not llama_type and flash_attn: - log("Enabling Flash attention") - # from flash_attn.models.gpt import GPTLMHeadModel - # from flash_attn.models.gpt_neox import gpt_neox_config_to_gpt2_config - # from flash_attn.models.gptj import gptj_config_to_gpt2_config - # - # if "gpt-j" in base_model.lower(): - # config = gptj_config_to_gpt2_config(model.config) - # else: - # assert any([x in base_model.lower() for x in ["pythia", "h2ogpt", "gpt-neox"]]) - # config = gpt_neox_config_to_gpt2_config(model.config) - # config.use_flash_attn = True - # config.fused_bias_fc = True - # config.activation_function = 'gelu_fast' # GPT-NeoX-20B uses "gelu_fast" - # config.fused_mlp = True # GPT-NeoX-20B uses "gelu_fast" - # config.fused_dropout_add_ln = True - # config.residual_in_fp32 = True - # lora_target_modules = ['Wqkv'] - # # model = GPTLMHeadModel.from_pretrained(base_model, config, device='cuda', dtype=torch.float16) - # - # model = GPTLMHeadModel(config, base_model, device='cuda', dtype=torch.float16) - # # Load state_dict in cpu because we already initialized the model in GPU, and we don't - # # want extra stuff taking up more GPU memory - # state_dict = state_dict_from_pretrained( - # base_model, device='cpu', dtype=torch.float16 - # ) - # if base_model.startswith('EleutherAI/gpt-j-'): - # state_dict = remap_state_dict_hf_gptj(state_dict, config) - # strict = False # We have rotary_emb.inf_freq buffers not in the GPT-J checkpoint - # else: - # state_dict = remap_state_dict_hf_gpt_neox(state_dict, config) - # if world_size > 1: - # state_dict = shard_state_dict_tp(state_dict, config, world_size, rank) - # model.load_state_dict(state_dict, strict=True) - if lora_weights: from peft import PeftModel diff --git a/test_flash_attn_gpt_neox.py b/test_flash_attn_gpt_neox.py deleted file mode 100644 index 176369ddb..000000000 --- a/test_flash_attn_gpt_neox.py +++ /dev/null @@ -1,95 +0,0 @@ -import torch -import pytest -from flash_attn.models.gptj import gptj_config_to_gpt2_config - -from transformers import GPTNeoXConfig, GPTJConfig -from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM - -from flash_attn.models.gpt import GPTLMHeadModel -from flash_attn.models.gpt_neox import remap_state_dict_hf_gpt_neox, gpt_neox_config_to_gpt2_config -from flash_attn.utils.pretrained import state_dict_from_pretrained - - -@pytest.mark.parametrize( - 'model_name', - [ - # "EleutherAI/gpt-neox-20b", - 'togethercomputer/GPT-NeoXT-Chat-Base-20B', - ] -) -def test_gptj_state_dict(model_name): - config = gpt_neox_config_to_gpt2_config(GPTNeoXConfig.from_pretrained(model_name)) - pretrained_state_dict = remap_state_dict_hf_gpt_neox(state_dict_from_pretrained(model_name), config) - model = GPTLMHeadModel(config, device='meta') # Without device='meta' init is very slow - state_dict = model.state_dict() - assert state_dict.keys() == pretrained_state_dict.keys() - for k in state_dict.keys(): - assert state_dict[k].shape == pretrained_state_dict[k].shape - - -@pytest.mark.parametrize( - 'model_name', - [ - "EleutherAI/gpt-neox-20b", - # "h2oai/h2ogpt-oig-oasst1-512-6.9b", - # "h2oai/h2ogpt-oasst1-512-12b", - ] -) -def test_gpt_neox_optimized(model_name): - """Check that our implementation of GPT-NeoX (with all 
optimizations enabled) matches the - HF implementation: the output of our forward pass in fp16 should be around the same as the HF - forward pass in fp16, when compared to the HF forward pass in fp32. - """ - dtype = torch.float16 - device = 'cuda' - config = gpt_neox_config_to_gpt2_config(GPTNeoXConfig.from_pretrained(model_name)) - # config = gptj_config_to_gpt2_config(GPTJConfig.from_pretrained(model_name)) - config.use_flash_attn = True - config.fused_bias_fc = True - config.fused_mlp = True # GPT-NeoX-20B uses "gelu_fast" - config.fused_dropout_add_ln = True - config.residual_in_fp32 = True - - model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) - model.eval() - - torch.manual_seed(0) - batch_size = 2 - max_seqlen = 256 - input_ids = torch.randint(0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, - device=device) - with torch.no_grad(): - out = model.transformer(input_ids) - logits = model(input_ids).logits - del model - - # Need at least 2 GPUs, otherwise we'll OOM - # Without device_map, the model is loaded on the CPU, which is very slow - model_ref = GPTNeoXForCausalLM.from_pretrained(model_name, device_map='auto') - model_ref.eval() - with torch.no_grad(): - out_ref = model_ref.gpt_neox(input_ids).last_hidden_state.to(device=device) - logits_ref = model_ref(input_ids).logits.to(device=device) - del model_ref - - model_hf = GPTNeoXForCausalLM.from_pretrained(model_name, torch_dtype=dtype, - device_map={"": device}) - model_hf.eval() - with torch.no_grad(): - out_hf = model_hf.gpt_neox(input_ids).last_hidden_state - logits_hf = model_hf(input_ids).logits - del model_hf - - print(f'Output max diff: {(out - out_ref).abs().max().item()}') - print(f'Output mean diff: {(out - out_ref).abs().mean().item()}') - print(f'HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}') - print(f'HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}') - assert (out - out_ref).abs().max().item() < 2 * (out_hf - out_ref).abs().max().item() - assert (out - out_ref).abs().mean().item() < 2 * (out_hf - out_ref).abs().mean().item() - - print(f'Logits max diff: {(logits - logits_ref).abs().max().item()}') - print(f'Logits mean diff: {(logits - logits_ref).abs().mean().item()}') - print(f'HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}') - print(f'HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}') - assert (logits - logits_ref).abs().max().item() < 2 * (logits_hf - logits_ref).abs().max().item() - assert (logits - logits_ref).abs().mean().item() < 2 * (logits_hf - logits_ref).abs().mean().item() From e286ec31375a84643b1c7b18b0d4a93c24fb1d6c Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Thu, 11 May 2023 14:11:14 -0700 Subject: [PATCH 10/11] Revert name change. --- finetune.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/finetune.py b/finetune.py index d2ca116c0..2ab1e1a2e 100644 --- a/finetune.py +++ b/finetune.py @@ -154,7 +154,7 @@ def train( lora_dropout: float = 0.05, lora_target_modules: List[str] = None, llama_type: bool = None, - flash_attn: bool = False, + llama_flash_attn: bool = False, # llm hyperparams train_on_inputs: bool = True, # if False, masks out inputs in loss @@ -175,7 +175,7 @@ def train( add_eos_token: bool = False, ): - if flash_attn: + if llama_flash_attn: # Need to call this before importing transformers. 
from llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn replace_llama_attn_with_flash_attn() @@ -211,7 +211,7 @@ def train( tokenizer_base_model = base_model if llama_type is None: llama_type = "llama" in base_model.lower() - if flash_attn: + if llama_flash_attn: import pkg_resources try: pkg_resources.get_distribution('flash_attn') @@ -635,7 +635,7 @@ def compute_metrics(eval_preds): if torch.__version__ >= "2" and sys.platform != "win32": model = torch.compile(model) # WIP (not generally replacing layers until pytorch 2.1) - if not flash_attn: + if not llama_flash_attn: torch.backends.cuda.enable_flash_sdp(True) if gpus > 1 and not ddp: From cc43cc56a1c3e41b2dd40f560c74d8c6704f865d Mon Sep 17 00:00:00 2001 From: Arno Candel Date: Thu, 11 May 2023 14:12:32 -0700 Subject: [PATCH 11/11] Revert more changes. --- finetune.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/finetune.py b/finetune.py index 2ab1e1a2e..4b4f8922a 100644 --- a/finetune.py +++ b/finetune.py @@ -211,12 +211,15 @@ def train( tokenizer_base_model = base_model if llama_type is None: llama_type = "llama" in base_model.lower() - if llama_flash_attn: + if llama_type and llama_flash_attn: import pkg_resources try: pkg_resources.get_distribution('flash_attn') - log("Enabling Flash attention") + can_do_flash_attn = True except (pkg_resources.DistributionNotFound, pkg_resources.ContextualVersionConflict): + can_do_flash_attn = False + + if not can_do_flash_attn: raise RuntimeError("""Flash attention not installed. NOTE: for current pytorch 2.0, flash attention requires installing cuda 11.7 via https://developer.nvidia.com/cuda-11-7-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=20.04&target_type=runfile_local and then when running, to avoid installing driver, docs, samples, just install toolkit. Then when pip installing flash attention do: