-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
150 lines (129 loc) · 4.92 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import telebot
import os
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
import pickle
from dotenv import load_dotenv
import io
from datetime import datetime
from PIL import Image
#import pytesseract
from pdf2image import convert_from_bytes
import base64
from google.cloud import vision
from google.oauth2 import service_account
# Service-account key file for Google Cloud, resolved relative to the
# current working directory.
credentials_path = 'api-key.json'
# Load the service-account credentials and build the Vision client that
# extract_text_from_image uses for text detection.
credentials = service_account.Credentials.from_service_account_file(credentials_path)
client = vision.ImageAnnotatorClient(credentials=credentials)
# Directory holding the pickled vector stores.
pkl_folder = "pkl_files"
# Directory for uploaded PDF files.
pdf_folder = "pdf_files"

# Ensure both folders exist. exist_ok=True avoids the check-then-create
# race and makes restarts a no-op.
os.makedirs(pkl_folder, exist_ok=True)
os.makedirs(pdf_folder, exist_ok=True)
# Pull settings from a local .env file into the process environment.
load_dotenv()

# Initialize the Telegram bot
API_TOKEN = os.getenv('TELE_API')
if not API_TOKEN:
    # Fail fast with a clear message instead of handing None to TeleBot.
    raise RuntimeError(
        "TELE_API is not set; define it in the environment or in the .env file."
    )
bot = telebot.TeleBot(API_TOKEN)

# Name of the most recently processed upload; None until a PDF has been
# handled. Written by handle_document and read by handle_text.
user_store_name = None
def delete_pdf(user_store_name):
    """Remove the file ``<user_store_name>.pdf`` if it exists.

    The ``.pdf`` suffix is appended here, so the argument is expected to be
    the path without its extension. A missing file is ignored.
    """
    target = "{}.pdf".format(user_store_name)
    if not os.path.exists(target):
        return
    os.remove(target)
def extract_text_from_pdf(pdf_path):
    """Return the extracted text of every page of the PDF at *pdf_path*.

    Pages are read in document order through ``PdfReader`` and their text
    is concatenated with no separator between pages.
    """
    reader = PdfReader(pdf_path)
    return "".join(page.extract_text() for page in reader.pages)
# OCR the PDF, embed its text, and cache the resulting vector store
def process_and_store_pdf(pdf_file, store_name):
    """Build, or reload from cache, the FAISS vector store for a PDF.

    Args:
        pdf_file: Raw bytes of the PDF document.
        store_name: Base path of the cache file; ``.pkl`` is appended to it.

    Returns:
        The vector store, either unpickled from ``<store_name>.pkl`` or
        freshly built from the OCR text and then pickled to that path.

    Raises:
        ValueError: If OCR produced no text to index.
    """
    store_path = f"{store_name}.pkl"

    # Look for a cached store before doing any work: rendering and OCR-ing
    # every page is the expensive part and is wasted on a cache hit.
    if os.path.exists(store_path):
        # NOTE(review): pickle.load can execute arbitrary code from the
        # file; this is only safe while these files are written solely by
        # this application.
        with open(store_path, "rb") as f:
            return pickle.load(f)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    chunks = text_splitter.split_text(text=_ocr_pdf_text(pdf_file))
    if not chunks:
        # Nothing to embed; report it clearly instead of letting the
        # vector-store construction fail on an empty list.
        raise ValueError("No text could be extracted from the PDF.")

    vector_store = FAISS.from_texts(chunks, embedding=OpenAIEmbeddings())
    with open(store_path, "wb") as f:
        pickle.dump(vector_store, f)
    return vector_store


def _ocr_pdf_text(pdf_file):
    """Render each page of the PDF bytes to an image and return the OCR text."""
    page_texts = []
    for page_image in convert_from_bytes(pdf_file):
        annotation = extract_text_from_image(image_to_bytes(page_image))
        page_texts.append(annotation.description)
    # Join with a newline so the last word of one page does not fuse with
    # the first word of the next.
    return "\n".join(page_texts)
def image_to_bytes(image):
    """Encode *image* as PNG and return the resulting bytes.

    *image* must provide ``save(fp, format=...)``; it is written to an
    in-memory buffer rather than to disk.
    """
    with io.BytesIO() as png_buffer:
        image.save(png_buffer, format="PNG")
        return png_buffer.getvalue()
def extract_text_from_image(image_bytes):
    """Run Cloud Vision text detection on an encoded image.

    Args:
        image_bytes: Encoded image content, e.g. PNG bytes.

    Returns:
        The first text annotation of the response; callers read the
        detected text from its ``description``. When no text is detected,
        an empty annotation (``description == ""``) is returned instead.

    Raises:
        RuntimeError: If the Vision API reports an error for the request.
    """
    image = vision.Image(content=image_bytes)
    response = client.text_detection(image=image)

    # Request-level failures are reported in the response body, so surface
    # them explicitly instead of treating them as "no text".
    if response.error.message:
        raise RuntimeError(
            f"Vision API text detection failed: {response.error.message}"
        )

    annotations = response.text_annotations
    if not annotations:
        # No text on this image: indexing [0] would raise IndexError.
        return vision.EntityAnnotation()
    return annotations[0]
# Handle document messages (PDF files)
@bot.message_handler(content_types=['document'])
def handle_document(message):
    """Index an uploaded PDF so that later text messages can query it.

    Downloads the document, builds its vector store through
    ``process_and_store_pdf`` and, on success, records the store name in
    the module-level ``user_store_name`` and confirms to the user.

    Args:
        message: Incoming Telegram message carrying a document.
    """
    global user_store_name

    # Check if the document is a PDF
    if message.document.mime_type != 'application/pdf':
        bot.reply_to(message, "Please upload a PDF file.")
        return

    # Name the store after the user ID and the upload time
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    store_name = f"user_{message.from_user.id}_{timestamp}"

    # The download is already the PDF's bytes, so it goes straight to the
    # processing step; no temporary file has to be written or cleaned up.
    file_info = bot.get_file(message.document.file_id)
    pdf_bytes = bot.download_file(file_info.file_path)

    # Process and store the PDF VectorStore
    pkl_path = os.path.join(pkl_folder, store_name + ".pkl")
    try:
        process_and_store_pdf(pdf_bytes, pkl_path)
    except Exception:
        # Tell the user, then re-raise so the failure is still reported by
        # the bot's own error handling.
        bot.reply_to(message, "Sorry, the PDF could not be processed. Please try again.")
        raise

    # Publish the name only once the store is on disk, so handle_text never
    # sees a name whose pickle does not exist yet.
    # NOTE(review): this is one module-level value shared by every chat;
    # the latest upload from any user replaces the previous one.
    user_store_name = store_name
    bot.reply_to(message, "PDF file processed. You can now start asking questions.")
# Handle text messages
@bot.message_handler(func=lambda message: True)
def handle_text(message):
    """Answer a text message from the most recently processed PDF.

    Loads the pickled vector store named by the module-level
    ``user_store_name``, retrieves the 3 chunks most similar to the
    question and replies with the answer produced by the QA chain.

    Args:
        message: Incoming Telegram text message; its text is the question.
    """
    # Check if user_store_name is set
    if user_store_name is None:
        bot.reply_to(message, "Please upload a PDF and process it first.")
        return

    query = message.text

    # The extension is intentionally applied twice: handle_document passes
    # a name already ending in ".pkl" to process_and_store_pdf, which
    # appends ".pkl" again when writing the file.
    store_path = os.path.join(pkl_folder, user_store_name + ".pkl") + ".pkl"

    try:
        # NOTE(review): pickle.load can execute arbitrary code from the
        # file; only safe while these files are written by this bot alone.
        with open(store_path, "rb") as f:
            vector_store = pickle.load(f)
    except FileNotFoundError:
        # The name is known but its store is not on disk (processing has
        # not finished or it failed).
        bot.reply_to(message, "Please upload a PDF and process it first.")
        return

    # Perform the LangChain operations and send the response
    docs = vector_store.similarity_search(query=query, k=3)
    llm = ChatOpenAI(max_tokens=256)
    chain = load_qa_chain(llm=llm, chain_type="stuff")
    response = chain.run(input_documents=docs, question=query)

    # Telegram rejects empty message bodies, so fall back to a fixed reply.
    bot.reply_to(message, response or "Sorry, I could not find an answer in the document.")
# Start the bot's polling. This call blocks and keeps fetching updates,
# dispatching them to the handlers registered above.
bot.infinity_polling()