diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md
index 7da64a8340..e89774686b 100755
--- a/examples/text-generation/README.md
+++ b/examples/text-generation/README.md
@@ -149,7 +149,6 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \
 
 To run Llama3-405B inference on 8 Gaudi3 cards use the following command:
 ```bash
-DEEPSPEED_USE_HABANA_FRAMEWORKS_DETERMINISTIC_API=true \
 ENABLE_LB_BUNDLE_ALL_COMPUTE_MME=0 ENABLE_EXPERIMENTAL_FLAGS=1 \
 python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \
 --model_name_or_path meta-llama/Llama-3.1-405B-Instruct \
@@ -378,7 +377,6 @@ QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \
 Here is an example to measure the tensor quantization statistics on Llama3-405B with 8 cards:
 > Please note that Llama3-405B requires minimum 16 cards Gaudi2 and 8 cards Gaudi3.
 ```bash
-DEEPSPEED_USE_HABANA_FRAMEWORKS_DETERMINISTIC_API=true \
 QUANT_CONFIG=./quantization_config/maxabs_measure_include_outputs.json python ../gaudi_spawn.py \
 --use_deepspeed --world_size 8 run_lm_eval.py \
 -o acc_llama3_405b_bs1_quant.txt \
@@ -397,7 +395,6 @@ QUANT_CONFIG=./quantization_config/maxabs_measure_include_outputs.json python ..
 Here is an example to quantize the model based on previous measurements for Llama3-405B with 8 cards:
 > Please note that Llama3-405B requires minimum 16 cards Gaudi2 and 8 cards Gaudi3.
 ```bash
-DEEPSPEED_USE_HABANA_FRAMEWORKS_DETERMINISTIC_API=true \
 QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \
 --use_deepspeed --world_size 8 run_generation.py \
 --model_name_or_path meta-llama/Llama-3.1-405B-Instruct \
diff --git a/examples/text-generation/quantization_config/unit_scale_quant.json b/examples/text-generation/quantization_config/unit_scale_quant.json
index 20783ea3f1..216cf27e68 100644
--- a/examples/text-generation/quantization_config/unit_scale_quant.json
+++ b/examples/text-generation/quantization_config/unit_scale_quant.json
@@ -3,10 +3,5 @@
     "mode": "QUANTIZE",
     "observer": "maxabs",
     "scale_method": "unit_scale",
-    "whitelist": {"types": [], "names": []},
-    "blacklist": {"types": [], "names": []},
-    "quantize_weight": false,
-    "dump_stats_path": "./results/hk",
-    "ignore_modules_wo_measures": "True",
-    "dump_stats_xlsx_path": "./run_outputs/fp8stats.xlsx"
+    "dump_stats_path": "./hqt_output/measure"
 }
diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py
index 61270ca218..63a1a32fb7 100644
--- a/examples/text-generation/utils.py
+++ b/examples/text-generation/utils.py
@@ -439,12 +439,7 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger):
     logger.info("DeepSpeed is enabled.")
     deepspeed.init_distributed(dist_backend="hccl")
     config = AutoConfig.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs)
-
-    keep_module_on_host = False
-    if "Llama-3.1-405B" in args.model_name_or_path:
-        keep_module_on_host = True
-
-    load_to_meta = False if keep_module_on_host else model_on_meta(config)
+    load_to_meta = model_on_meta(config)
 
     if args.assistant_model is None:
         assistant_model = None
@@ -499,7 +494,6 @@
 
     # Initialize the model
     ds_inference_kwargs = {"dtype": model_dtype}
-    ds_inference_kwargs["keep_module_on_host"] = keep_module_on_host
     ds_inference_kwargs["tensor_parallel"] = {"tp_size": args.world_size}
     ds_inference_kwargs["enable_cuda_graph"] = args.use_hpu_graphs
     ds_inference_kwargs["injection_policy"] = get_ds_injection_policy(config)
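
For context, a minimal sketch of the DeepSpeed inference path as it stands after this patch: the Llama-3.1-405B `keep_module_on_host` special case is removed, so meta-device loading is again decided solely by `model_on_meta(config)`, and `keep_module_on_host` is no longer passed to `deepspeed.init_inference`. The sketch assumes it runs from `examples/text-generation` (so `model_on_meta` and `get_ds_injection_policy` can be imported from `utils.py`), takes an already-loaded `model`, and uses illustrative parameter names (`world_size`, `use_hpu_graphs`) in place of the parsed CLI arguments.

```python
# Illustrative sketch only -- condensed from setup_distributed_model in
# examples/text-generation/utils.py after this change; not the full code path.
import deepspeed
import torch
from transformers import AutoConfig

# Helpers defined in this repo's examples/text-generation/utils.py
from utils import get_ds_injection_policy, model_on_meta


def init_ds_inference(model, model_name_or_path, world_size, use_hpu_graphs):
    deepspeed.init_distributed(dist_backend="hccl")
    config = AutoConfig.from_pretrained(model_name_or_path, torch_dtype=torch.bfloat16)

    # With the keep_module_on_host override gone, meta-device loading is
    # decided purely by the model heuristic in utils.py.
    load_to_meta = model_on_meta(config)

    ds_inference_kwargs = {
        "dtype": torch.bfloat16,
        "tensor_parallel": {"tp_size": world_size},
        "enable_cuda_graph": use_hpu_graphs,
        "injection_policy": get_ds_injection_policy(config),
        # "keep_module_on_host" is intentionally no longer set here.
    }
    return deepspeed.init_inference(model, **ds_inference_kwargs), load_to_meta
```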