-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdoc_loader.py
55 lines (42 loc) · 1.48 KB
/
doc_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
os.environ["USER_AGENT"] = "myagent"
import requests
from langchain_community.document_loaders import WebBaseLoader, TextLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Source documents to ingest: one list of plain-text files, one of PDFs.
text_paths = [
    "data/bibbia.txt",
]
pdf_paths = [
    "data/lezionario.pdf",
    "data/messale.pdf",
    "data/ccc.pdf",
]
# Load the plain-text documents.
# The original comment noted that iso-8859-1 is needed for bibbia.txt while
# other text files are utf-8, yet the code applied iso-8859-1 to every file.
# Use a per-file encoding map with a utf-8 default so new files load correctly.
_TEXT_ENCODINGS = {"data/bibbia.txt": "iso-8859-1"}
text_documents = []
for text_path in text_paths:
    text_loader = TextLoader(
        text_path, encoding=_TEXT_ENCODINGS.get(text_path, "utf-8")
    )
    text_documents.extend(text_loader.load())
# Load the PDF documents (PyPDFLoader yields one document per page).
pdf_documents = []
for pdf_path in pdf_paths:
    pdf_loader = PyPDFLoader(pdf_path)
    pdf_documents.extend(pdf_loader.load())
# Merge both sets into a single list for splitting.
docs = text_documents + pdf_documents
# At this point `docs` holds whole loaded documents, not chunks — the original
# message claimed "chunks" before any splitting had happened.
print(f"Loaded {len(docs)} documents")
# Configure the text splitter: ~700-character chunks with a 50-character
# overlap so adjacent chunks share context for retrieval.
chunk_size = 700
chunk_overlap = 50
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
# Split the documents into chunks for the vector store.
all_splits = text_splitter.split_documents(docs)
print(f"Split into {len(all_splits)} chunks of text")
# Send the split chunks to the local vector-store service's /add_document
# endpoint. A timeout is set so the script fails fast instead of hanging
# forever when the service is down (requests has no default timeout).
response = requests.post(
    'http://localhost:5001/add_document',
    json={"documents": [{"page_content": doc.page_content} for doc in all_splits]},
    timeout=120,
)
if response.status_code == 200:
    print("Documents added to vector store")
else:
    print({"status": "failure", "reason": response.text})