rag_llama3_1.py
import logging
import sys
# Log to stdout; basicConfig already attaches a stream handler, so adding a
# second handler here would print every message twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
import torch
from llama_index.core import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM

# Local path to the converted Hugging Face checkpoint; uncomment one of the
# hub model IDs below to download the weights instead.
selected_model = "./LLMs/llama-models/models/llama3_1/llama-3.1-8B-Instruct-hf"
#selected_model = "meta-llama/Meta-Llama-3-8B-Instruct"
#selected_model = "meta-llama/Meta-Llama-3-8B"
system_prompt = "You are a Q&A assistant on FrameNet. Your goal is to answer questions as accurately as possible based on the instructions, conversation history, and context provided."
# This wraps the default prompts that are internal to llama-index in the
# Llama 3 chat template; llama-index fills in {query_str} at query time.
query_wrapper_prompt = PromptTemplate(
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
    + system_prompt
    + "<|eot_id|><|start_header_id|>user<|end_header_id|>\n"
    "{query_str}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
#tokenizer = AutoTokenizer.from_pretrained(selected_model)

# Stop generation at either the end-of-sequence token or Llama 3's
# end-of-turn token; otherwise the model can ramble past its answer.
stopping_ids = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]
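# The hand-built template above could also be rendered from the tokenizer's
# bundled chat template (a sketch, assuming the checkpoint ships one;
# `messages` is a hypothetical list of {"role": ..., "content": ...} dicts):
#   tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)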
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=2048,
    generate_kwargs={"temperature": 0.85, "do_sample": True},
    #system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=selected_model,
    model_name=selected_model,
    device_map="auto",
    stopping_ids=stopping_ids,
    # change these settings below depending on your GPU
    model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
)
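# Optional smoke test of the bare LLM before wiring up retrieval (a minimal
# check, assuming the weights load; remove once the pipeline works):
#   print(llm.complete("What is FrameNet?"))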
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model used to vectorize both documents and queries.
cache_direc = "./embed_model"
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5", cache_folder=cache_direc)

from llama_index.core import Settings

# Register the LLM and embedding model as llama-index's global defaults.
Settings.llm = llm
Settings.embed_model = embed_model
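# Other pipeline-wide defaults can be set the same way, e.g. the node-parser
# chunk size (a sketch; 512 is an illustrative value, not the library default):
#   Settings.chunk_size = 512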
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex

# Load the FrameNet frame files and papers, then build an in-memory vector index.
documents = SimpleDirectoryReader("./frame_and_papers/").load_data()
index = VectorStoreIndex.from_documents(documents)

# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine()
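# as_query_engine forwards retriever settings; a sketch for pulling more
# context chunks per question (5 is an illustrative value):
#   query_engine = index.as_query_engine(similarity_top_k=5)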
# Simple REPL: `wrapper` accumulates the turn-by-turn history in Llama 3 chat
# format so each query carries the prior exchanges. Note that the history also
# feeds the retriever and grows without bound, so long sessions can exceed the
# 4096-token context window.
wrapper = ""
while True:
    inp = input(">> Enter the prompt: ")  # e.g. "What are the data-driven approaches to FrameNet expansion?"
    response = query_engine.query(wrapper + inp)
    print(">> Response: ", response)
    wrapper += (
        inp
        + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
        + str(response)
        + "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
    )