-
Notifications
You must be signed in to change notification settings - Fork 657
/
Copy pathbooks.py
186 lines (137 loc) · 5.26 KB
/
books.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
"""
Search application using Open Library book data. Requires the following steps to be run:
Install Streamlit
pip install streamlit
Download and prepare data
mkdir openlibrary && cd openlibrary
wget -O works.txt.gz https://openlibrary.org/data/ol_dump_works_latest.txt.gz
gunzip works.txt.gz
grep "\"description\":" works.txt > filtered.txt
Build index
python books.py openlibrary
Run application
streamlit run books.py openlibrary
"""
import html
import json
import os
import sqlite3
import sys
from contextlib import closing

import pandas as pd
import streamlit as st

from txtai.embeddings import Embeddings
class Application:
    """
    Book search application.

    Used both to build the search artifacts (embeddings index + SQLite database) and to
    serve the Streamlit search page on top of them.
    """

    def __init__(self, path):
        """
        Creates a new application.

        Args:
            path: root path to data
        """

        self.path = path

        # Both the embeddings index and books.sqlite are stored under <path>/books
        self.dbpath = os.path.join(self.path, "books")

    @staticmethod
    def _cover(covers):
        """
        Selects the cover id to store for a book.

        Args:
            covers: value of the "covers" field, or None when the field is absent

        Returns:
            first usable cover id as a string, None if there is none
        """

        if not isinstance(covers, list):
            return None

        for cover in covers:
            # A JSON null would otherwise be stored as the literal text "None" and later
            # rendered as a broken image URL.
            # NOTE(review): non-positive ids are assumed to be placeholders - confirm against the data.
            if cover is None or (isinstance(cover, int) and cover <= 0):
                continue

            return str(cover)

        return None

    @staticmethod
    def _result(uid, title, description, cover):
        """
        Builds a single HTML result row.

        All database values are HTML-escaped since they are rendered with escaping disabled.

        Args:
            uid: book id, used as the Open Library path
            title: book title
            description: book description
            cover: cover id, or None/empty when the book has no cover

        Returns:
            dict with Cover, Title and Description columns
        """

        link = f"https://openlibrary.org/{html.escape(str(uid))}"

        # Build cover image, fall back to the generic book icon
        image = (
            f"<img src='https://covers.openlibrary.org/b/id/{html.escape(str(cover))}-M.jpg'/>"
            if cover
            else "<img src='https://openlibrary.org/images/icons/avatar_book-lg.png'/>"
        )

        return {
            "Cover": f"<a target='_blank' href='{link}'>{image}</a>",
            "Title": f"<a target='_blank' href='{link}'>{html.escape(str(title))}</a>",
            "Description": html.escape(str(description)),
        }

    def rows(self, index):
        """
        Iterates over dataset yielding each row.

        Reads <path>/filtered.txt, a tab-separated file where column 1 is the id and
        column 4 is a JSON document. Rows without a title or without a text description
        are skipped.

        Args:
            index: yields rows for embeddings indexing if True, otherwise yields database rows

        Returns:
            generator of (id, text, None) tuples if index is True,
            otherwise (id, title, description, cover) tuples
        """

        with open(os.path.join(self.path, "filtered.txt"), encoding="utf-8") as infile:
            for x, row in enumerate(infile):
                if x % 1000 == 0:
                    print(f"Processed {x} rows", end="\r")

                row = row.split("\t")
                uid, data = row[1], json.loads(row[4])

                # The description is either a plain string or an object with a "value" field.
                # It can also be missing at the top level: the grep filter used to prepare
                # the file matches the key anywhere in the line.
                description = data.get("description")
                if isinstance(description, dict):
                    description = description.get("value")

                if "title" not in data or not isinstance(description, str):
                    continue

                if index:
                    yield (uid, data["title"] + ". " + description, None)
                else:
                    yield (uid, data["title"], description, self._cover(data.get("covers")))

    def database(self):
        """
        Builds a SQLite database.

        Any existing database file is deleted and rebuilt from scratch.
        """

        # Database file path
        dbfile = os.path.join(self.dbpath, "books.sqlite")

        # Make sure the output directory exists so this method also works on its own
        os.makedirs(self.dbpath, exist_ok=True)

        # Delete existing file
        if os.path.exists(dbfile):
            os.remove(dbfile)

        # closing() guarantees the connection is released even if a row fails to load
        with closing(sqlite3.connect(dbfile)) as db:
            # Connection context manager commits on success and rolls back on error
            with db:
                db.execute("CREATE TABLE books (Id TEXT PRIMARY KEY, Title TEXT, Description TEXT, Cover TEXT)")
                db.executemany("INSERT INTO books (Id, Title, Description, Cover) VALUES (?, ?, ?, ?)", self.rows(False))

    def build(self):
        """
        Builds an embeddings index and database.
        """

        # Build embeddings index
        embeddings = Embeddings({"path": "sentence-transformers/msmarco-distilbert-base-v4"})
        embeddings.index(self.rows(True))
        embeddings.save(self.dbpath)

        # Build SQLite DB
        self.database()

    # NOTE(review): st.cache is believed to be deprecated in newer Streamlit releases - confirm
    # the targeted Streamlit version before upgrading.
    @st.cache(allow_output_mutation=True)
    def load(self):
        """
        Loads and caches embeddings index.

        Returns:
            embeddings index
        """

        embeddings = Embeddings()
        embeddings.load(self.dbpath)

        return embeddings

    def run(self):
        """
        Runs a Streamlit application.

        Renders a search box and, for a non-empty query, a table with up to 10 matching
        books having a similarity score of at least 0.5.
        """

        # Load embeddings index
        embeddings = self.load()

        st.title("Book search")

        st.markdown(
            "This application builds a local txtai index using book data from [openlibrary.org](https://openlibrary.org). "
            + "Links to the Open Library pages and covers are shown in the application."
        )

        query = st.text_input("Search query:")
        if not query:
            return

        ids = [uid for uid, score in embeddings.search(query, 10) if score >= 0.5]

        results = []

        # closing() guarantees the connection is released even if rendering fails
        with closing(sqlite3.connect(os.path.join(self.dbpath, "books.sqlite"))) as db:
            cur = db.cursor()

            for uid in ids:
                cur.execute("SELECT Title, Description, Cover FROM books WHERE Id=?", (uid,))
                result = cur.fetchone()

                # Index and database can be out of sync, skip ids without a database row
                if result:
                    results.append(self._result(uid, *result))

        # Values were escaped in _result, so escaping is disabled here to keep the generated links/images
        st.write(pd.DataFrame(results).to_html(escape=False, index=False), unsafe_allow_html=True)
if __name__ == "__main__":
    # Set TOKENIZERS_PARALLELISM=false for this process before any work starts
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # A single Application instance serves both modes; the data root is the first CLI argument
    app = Application(sys.argv[1])

    # Under Streamlit, serve the search page from the existing index/db.
    # Otherwise (plain `python books.py <path>`), build the index and database.
    # pylint: disable=W0212
    action = app.run if st._is_running_with_streamlit else app.build
    action()