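"""main.py: generate an episode of the "Hacker News in Slow <language>" podcast.

Scrapes the Hacker News front page, summarizes the top articles in simple
language via the OpenAI chat API, wraps the summaries in a generated
introduction and ending, saves the transcript to disk, and submits it to
Play.ht for narration.

Requires the OPENAI_API_KEY, PLAYHT_API_KEY and PLAYHT_USER_ID environment
variables.
"""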
from bs4 import BeautifulSoup
import requests
import openai
import os
import time
from typing import Optional
import datetime

date_time_string = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
model = "gpt-3.5-turbo"  # try to cut costs by using GPT-3.5
skip_generate_text = False
skip_narration = False
max_retries = 5
# A token is roughly 4 characters, so allow ~3000 tokens for GPT-3.5 and ~7000 for GPT-4.
max_article_length = 4 * 3000 if model == "gpt-3.5-turbo" else 4 * 7000
openai.api_key = os.getenv("OPENAI_API_KEY")
language = "Italian"
# language = "Spanish"
# language = "Portuguese"
# language = "Slovak"
# language = "German"
number_of_articles = 5
text_file = f"/Users/lukasplatinsky/workspace/hn-slow-italian/episode-transcripts/hn-ep-{language}-transcript-{date_time_string}.txt"
file_audio = f"hn-ep-{language}-audio" + date_time_string
playht_api_key = os.getenv("PLAYHT_API_KEY")
playht_user_id = os.getenv("PLAYHT_USER_ID")
playht_voice = 'it-IT-ElsaNeural'
# playht_voice = "es-ES_LauraV3Voice"
# playht_voice = "pt-PT-FernandaNeural"
# playht_voice = "sk-SK-LukasNeural"
# playht_voice = "VickiNeural"

denylist_urls = [
    # Everything should be handled in the prompt now, fingers crossed.
]
def get_link_content(url: str, retry_attempt: int = 0) -> str:
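    """Download a web page and return its visible text, truncated to
    max_article_length characters. Retries with exponential backoff on failure."""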
    try:
        res = requests.get(url, timeout=30)
        html_page = res.content
        soup = BeautifulSoup(html_page, 'html.parser')
        text = soup.find_all(string=True)
        output = ''
        # Skip text nodes that belong to non-content tags.
        denylist = [
            '[document]',
            'noscript',
            'header',
            'html',
            'meta',
            'head',
            'input',
            'script',
            'style',
        ]
        for t in text:
            if t.parent.name not in denylist:
                output += '{} '.format(t)
        # TODO: Currently long articles are chopped off to fit the GPT limits. May want to address this at some point.
        return output[:max_article_length].strip()
    except Exception as e:
        if retry_attempt < max_retries:
            print("Error downloading article, retrying...")
            time.sleep(2 ** retry_attempt)  # exponential backoff; note `^` is XOR in Python
            return get_link_content(url, retry_attempt + 1)
        else:
            raise Exception("Article download failed", e)
def make_gpt4_call(prompt: str, retry_attempt: int = 0) -> str:
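    """Send a single-message chat completion request and return the response
    text. Retries with exponential backoff on retryable API errors."""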
    print(prompt)
    print("---------------------------------")
    try:
        completion = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        print(completion.choices[0].message.content)
        print("---------------------------------")
        return completion.choices[0].message.content
    except Exception as e:
        # Retry only on errors that are typically transient. The original check
        # (`e.error.type == "server_error"`) raises AttributeError for
        # exceptions that carry no `error` attribute.
        retryable = (
            openai.error.APIError,
            openai.error.ServiceUnavailableError,
            openai.error.RateLimitError,
            openai.error.Timeout,
        )
        if isinstance(e, retryable) and retry_attempt < max_retries:
            print("Server error, retrying...")
            time.sleep(2 ** retry_attempt)  # exponential backoff
            return make_gpt4_call(prompt, retry_attempt + 1)
        else:
            raise Exception("GPT call failed", e)
def get_hn_links():
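    """Scrape the Hacker News front page and return a list of
    (url, title) tuples for external, non-PDF, non-denylisted links."""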
    hn_url = 'https://news.ycombinator.com/'
    res = requests.get(hn_url, timeout=30)
    html_page = res.content
    soup = BeautifulSoup(html_page, 'html.parser')
    links = soup.find_all('a')
    articles = []
    for link in links:
        href = link.get('href') or ''  # guard against anchors without an href
        # Skip internal HN links, YC links and PDFs.
        if (href.startswith('http')
                and not href.startswith('https://news.ycombinator.com')
                and not href.startswith('https://www.ycombinator.com/')
                and "[pdf]" not in link.text):
            if not any(url in href for url in denylist_urls):
                articles.append((href, link.text))
    return articles
def create_introduction(episode) -> str:
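    """Ask the model for a short, conversational introduction to the episode."""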
prompt = f"Imagine you are creating an episode for a podcast called \"Hacker News in Slow {language}\" that summarizes the top {number_of_articles} articles on Hacker News that should be accessible for people learning {language}. Use simple, short sentences, and the most common words in {language}.\n\n"
prompt += f"This is the content of the episode:\n{episode}\n\n"
prompt += "\nCreate a short introduction paragraph for the episode to get people excited about today's content. Use an easy-to-follow and conversational style as you would in a podcast.\n\n"
return make_gpt4_call(prompt)
def create_ending(episode) -> str:
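    """Ask the model for a short, conversational ending for the episode."""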
prompt = f"Imagine you are creating an episode for a podcast called \"Hacker News in Slow {language}\" that summarizes the top {number_of_articles} articles on Hacker News that should be accessible for people learning {language}. Use simple, short sentences, and the most common words in {language}. "
prompt += f"Here is the text of the episode:\n{episode}\n\n"
prompt += "Create a short ending for the episode. Use an easy-to-follow and conversational style as you would in a podcast.\n\n"
return make_gpt4_call(prompt)
def summarize_article(link, index: int) -> Optional[str]:
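    """Download the linked page and ask the model for a learner-friendly
    summary in the target language. Returns None if scraping or
    summarization fails."""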
    url, title = link
    try:
        content = get_link_content(url)
        prompt = f"Create a summary of the content of a website (usually this contains an article, but it could also be a GitHub repo or a forum). Make the summary accessible to people learning {language}. Use simple grammar, short sentences, and only the most commonly used words in {language}.\n\n"
        prompt += f"This is the text scraped from the website:\n<text>{content}</text>\n\n"
        prompt += "If no substantial content was returned due to an error during scraping, return just \"ERROR\".\n\n"
        prompt += f"Otherwise, generate an engaging and in-depth summary of the most interesting information (3-5 paragraphs long) in {language}. Use an easy-to-follow and conversational style. Start by mentioning the website's title \"{title}\"."
        response = make_gpt4_call(prompt)
        if response.lower().startswith("error"):
            return None
        return response
    except Exception:
        return None
def generate_episode_text() -> str:
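    """Build the full episode transcript: walk down the front page until
    enough articles summarize successfully, then wrap the summaries in a
    generated introduction and ending."""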
print("Getting links...")
links = get_hn_links()
print("Summarising articles...")
summaries = []
index = 0
generated = []
skipped = []
while len(summaries) < number_of_articles and index < len(links):
summary = summarize_article(links[index], index)
if summary is not None:
summaries.append(summary)
generated.append(links[index])
else:
skipped.append(links[index])
index += 1
content = "\n\n".join(summaries)
print("Generating beginning and ending...")
transcript = create_introduction(content) + "\n\n" + content + "\n\n" + create_ending(content)
print("Generated links:", generated)
print("Skipped links:", skipped)
return transcript
def playht_narrate(text):
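    """Submit the transcript to the Play.ht v1 convert API for narration,
    then print the conversion status."""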
url = "https://play.ht/api/v1/convert"
payload = {
"content": [s for s in text.split("\n") if len(s) > 0],
"voice": playht_voice,
"globalSpeed": "85",
"title": file_audio,
}
headers = {
"accept": "application/json",
"content-type": "application/json",
"AUTHORIZATION": playht_api_key,
"X-USER-ID": playht_user_id
}
response = requests.post(url, json=payload, headers=headers)
print(response.text)
# TODO: Poll until finished, then download.
url = "https://play.ht/api/v1/articleStatus?transcriptionId=-NOjGbpFU1Exf3DxhLHU"
headers = {
"accept": "application/json",
"AUTHORIZATION": playht_api_key,
"X-USER-ID": playht_user_id
}
response = requests.get(url, headers=headers)
print(response.text)
if __name__ == '__main__':
    if not skip_generate_text:
        print("Generating episode text...")
        text = generate_episode_text()
        with open(text_file, "w") as f:
            f.write(text)
    if not skip_narration:
        with open(text_file, "r") as f:
            text = f.read()
        print("Generating episode audio...")
        playht_narrate(text)
    print("Done!")