[kunlunxin] add kunlun2 llama2-7b #348

Merged: 38 commits, merged on Dec 26, 2023

Changes shown below are from 2 of the 38 commits.

Commits (38)
- 64a1cbf add kunlun2 llama2-7b (shenzhu1993, Dec 1, 2023)
- bd494ab [kunlunxin] add kunlun2 llama2-7b (shenzhu1993, Dec 1, 2023)
- 4405c20 Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 8, 2023)
- 76634d1 Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 8, 2023)
- 1c3aa38 Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 9, 2023)
- ca7ef21 Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 9, 2023)
- c1d1c3b Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 9, 2023)
- dd356d6 Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 9, 2023)
- 429626e Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 15, 2023)
- 362d75b Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 15, 2023)
- c837e02 Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 15, 2023)
- 995ef6f Delete training/run_benchmarks/config/cluster_conf.py (shenzhu1993, Dec 15, 2023)
- 5451cfa Delete training/benchmarks/llama2_7b/deepspeed/config/config_A100x1x8.py (shenzhu1993, Dec 15, 2023)
- 6e60cce Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 15, 2023)
- 37edf66 Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 15, 2023)
- 6bf428a Delete training/benchmarks/llama2_7b/deepspeed/run_llama.sh (shenzhu1993, Dec 15, 2023)
- 8545849 Delete training/benchmarks/llama2_7b/deepspeed/run_llama.sh (shenzhu1993, Dec 15, 2023)
- d0d9673 Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 15, 2023)
- bb75dee Delete training/benchmarks/llama2_7b/deepspeed/dataset/llama_dataset.py (shenzhu1993, Dec 15, 2023)
- 2cae4ea Delete training/benchmarks/llama2_7b/deepspeed/dataset/llama_dataset.py (shenzhu1993, Dec 15, 2023)
- da49983 Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 20, 2023)
- f575bfd Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 20, 2023)
- 476fe5f Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 20, 2023)
- 9f82254 Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 20, 2023)
- 07a09ee Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 20, 2023)
- 476192b Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 20, 2023)
- c118469 Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 20, 2023)
- 0987f8a Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 20, 2023)
- 06c7101 Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 20, 2023)
- 1b07da4 Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 20, 2023)
- c900dfc Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 20, 2023)
- 301ad3f Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 20, 2023)
- de8fb14 Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 20, 2023)
- 4667a79 Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 20, 2023)
- 0c60298 Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 20, 2023)
- e897eae Merge branch 'main' into kunlunxin_llama2 (shenzhu1993, Dec 20, 2023)
- 94ada4e Merge branch 'main' into kunlunxin_llama2 (shenzhu1993, Dec 20, 2023)
- 48e6e08 Merge branch 'kunlunxin_llama2' of https://github.com/shenzhu1993/Fla… (shenzhu1993, Dec 20, 2023)
33 changes: 19 additions & 14 deletions training/benchmarks/llama2_7b/deepspeed/README.md
@@ -10,7 +10,12 @@ Llama 2, a collection of pretrained and fine-tuned large language models (LLMs)

## Data Preparation

-The data is stored in the data/ directory under the current directory.
+The data is stored at the path referenced in the config/test_conf.py file under run_benchmarks.
+
+## Optimization Strategies
+gradient_checkpointing
+fc optimization; see /data/dataset/llama2-7b/fc_autotune_fp16.log
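As a rough illustration of the gradient_checkpointing strategy named above, a HuggingFace-style model can enable it in one call. This is a minimal sketch assuming a transformers-based LLaMA 2 model; it is not taken from this PR's training script, which may wire checkpointing up differently (e.g., via its DeepSpeed config):

```python
# Illustrative sketch only -- not this PR's code.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")

# Recompute activations during the backward pass instead of storing them,
# trading extra compute for a large reduction in activation memory.
model.gradient_checkpointing_enable()
# The KV cache is incompatible with checkpointed training, so disable it.
model.config.use_cache = False
```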
### Kunlunxin XPU Configuration and Runtime Information
@@ -24,7 +29,7 @@ Llama 2, a collection of pretrained and fine-tuned large language models (LLMs)
- OS version: Ubuntu 20.04
- OS kernel version: 5.4.0-26-generic
- Accelerator driver version: 4.0.25
-- Docker image and version: pytorch2.0.1-cu17-ubuntu20.04:v0.01
+- Docker image and version: XPyTorch2.0.1-cu17-ubuntu20.04:v0.01; contact Zhou Wei to obtain it if needed
- Training framework version: xmlir
- Training compiler version: xacc
- Dependency version: pytorch-2.0.1+cu17
@@ -37,22 +42,22 @@ Llama 2, a collection of pretrained and fine-tuned large language models (LLMs)
| Metric name | Metric value | Notes |
| -------------- | ----------------------- | ------------------------------------------- |
| Task category | Natural language understanding | |
| Model | deepspeed-llama2-7b | |
| Dataset | openwebtext | |
| Data precision | precision, see "Performance metrics" | one of fp32/amp/fp16 |
| Hyperparameter changes | fix_hp, see "Performance metrics" | special hyperparameters needed to saturate the hardware when measuring throughput |
| Hardware device (short name) | R300 | |
| Hardware memory usage | memory, see "Performance metrics" | commonly called "device memory", in GiB |
| Throughput | token/p/s, see "Performance metrics" | average tokens processed per second per card |
| Loss | loss, see "Performance metrics" | training loss |
| Compute utilization | MFU, see "Performance metrics" | as defined in the PaLM paper |

* Performance metrics

| Configuration | precision | fix_hp | tokens/p/s | loss | memory | MFU |
| ------------------- | --------- | ------------------- | ---------- | ----- | ------- | ------ |
| R300 single node, 8 cards (1x8) | fp32 | bs=8, seqlength=512 | | 5.4 | | |
| R300 single node, 8 cards (1x8) | fp32 | bs=12, seqlength=512 | | 5.4 | | |
| R300 single node, 8 cards (1x8) | fp16 | bs=12, seqlength=512 | | 6.76 | 26G/32G | |


3 changes: 1 addition & 2 deletions training/benchmarks/llama2_7b/deepspeed/run_pretraining.py
@@ -124,8 +124,7 @@ def get_metric(texts):
 dataloader = DataLoader(dataset,
                         sampler=sampler,
                         batch_size=batchsize,
-                        num_workers=4,
-                        pin_memory=False)
+                        pin_memory=True)

epoch = 0
while epoch < epochs:
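For reference on what this change does in isolation: pin_memory=True allocates page-locked host buffers, which lets host-to-device copies run asynchronously. A minimal, self-contained sketch (the dataset and tensor shapes here are made up for illustration, not taken from this repo):

```python
# Standalone illustration of the pin_memory change -- dummy data, not repo code.
import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.randn(1024, 512))  # hypothetical batches
loader = DataLoader(dataset,
                    batch_size=8,
                    pin_memory=True)  # page-locked host buffers

for (batch,) in loader:
    if torch.cuda.is_available():
        # Pinned source memory is what makes non_blocking copies truly async.
        batch = batch.to("cuda", non_blocking=True)
```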
116 changes: 0 additions & 116 deletions training/kunlunxin/docker_image/deepspeed/Dockerfile.source

This file was deleted.

13 changes: 13 additions & 0 deletions training/run_benchmarks/config/cluster_conf.py
@@ -0,0 +1,13 @@
'''Cluster configs'''

# Hosts to run the benchmark. Each item is an IP address or a hostname.
HOSTS = ["10.1.2.2", "10.1.2.3", "10.1.2.4"]

# Host ports used by the tensorflow distribution_strategy = 'multi_worker_mirrored'
HOSTS_PORTS = ["2222"]

# Master port to connect to
MASTER_PORT = "29501"

# ssh connection port
SSH_PORT = "22"
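For context, a sketch of how a launcher might consume these settings. The `launch_worker` helper and the ssh/torchrun wiring below are illustrative assumptions, not the repository's actual launcher:

```python
# cluster_conf_demo.py -- illustrative only; FlagPerf's real launcher may differ.
import subprocess

from cluster_conf import HOSTS, MASTER_PORT, SSH_PORT

def launch_worker(host: str, rank: int, world_size: int) -> subprocess.Popen:
    """Start one training worker on `host` over ssh (hypothetical helper)."""
    cmd = (
        f"python -m torch.distributed.run "
        f"--nnodes={world_size} --node_rank={rank} "
        f"--master_addr={HOSTS[0]} --master_port={MASTER_PORT} "
        f"run_pretraining.py"
    )
    # Rank 0's host doubles as the rendezvous master.
    return subprocess.Popen(["ssh", "-p", SSH_PORT, host, cmd])

if __name__ == "__main__":
    procs = [launch_worker(h, i, len(HOSTS)) for i, h in enumerate(HOSTS)]
    for p in procs:
        p.wait()
```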
8 changes: 7 additions & 1 deletion training/run_benchmarks/config/test_conf.py
@@ -19,7 +19,7 @@
# "--device=/dev/davinciX --device=/dev/davinci_manager + \
# --device=/dev/devmm_svm --device=/dev/hisi_hdc + \
# -v /usr/local/Ascend/driver -v /usr/local/dcmi -v /usr/local/bin/npu-smi"
-ACCE_CONTAINER_OPT = "--gpus all"
+ACCE_CONTAINER_OPT = " --gpus all"
# XXX_VISIBLE_DEVICE item name in env
# possible value of ACCE_VISIBLE_DEVICE_ENV_NAME are:
# CUDA_VISIBLE_DEVICES for nvidia, iluvatar
@@ -58,18 +58,23 @@
# "glm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/glm/train/",
# "cpm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/cpm/train/",

#"llama2_7b_finetune:pytorch_2.0.1:A100:1:1:1": "/raid/dataset/llama2_finetune/",
# "mobilenetv2:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
# "vit:pytorch_1.13:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
# "efficientnet:pytorch_1.13:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/",

# "faster_rcnn:pytorch_1.8:A100:1:8:1": "/raid/dataset/fasterrcnn/coco2017/",
# "bigtransfer:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/",

#"tacotron2:pytorch_1.13:A100:1:8:1": "/raid/dataset/tacotron2/LJSpeech/",
# "resnet50:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
# "mask_rcnn:pytorch_1.8:A100:1:8:1": "/raid/dataset/maskrcnn/coco2017",

# "wav2vec2:pytorch_1.13:A100:1:8:1": "/raid/dataset/wav2vec2_data/LibriSpeech",
# "WaveGlow:pytorch_1.13:A100:1:8:1": "/raid/dataset/LJSpeech/",

# "distilbert:pytorch_1.12:A100:1:8:1": "/raid/dataset/distilbert/",

# "transformer:pytorch_1.13:A100:1:8:1": "/raid/dataset/transformer/wmt14_en_de_joined_dict",
# "swin_transformer:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
# "transformer_xl:pytorch_1.8:A100:1:8:1": "/raid/dataset/transformer_xl/",
@@ -79,6 +84,7 @@
# "bert_hf:pytorch_1.13:A100:1:8:1": "/raid/dataset/bert_hf_train",
# "longformer:pytorch_1.12:A100:1:8:1": "/raid/dataset/longformer_train/",
# "detr:pytorch_1.13:A100:1:8:1": "/raid/dataset/detr/coco2017/",

# "llama1_7B:paddle_2.5.1:TP1PP1SH2SP8A10040G:1:8:1":"/raid/dataset/llama/"
# "llama1_7B:paddle_2.5.1:TP2PP1SH1SP4A10040G:1:8:1":"/raid/dataset/llama/"
# "llama1_7B:paddle_2.5.1:TP2PP1SH2SP4A10040G:1:8:1":"/raid/dataset/llama/"