main.py
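"""Convert an arXiv paper to an EPUB via its ar5iv HTML rendering."""
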
import argparse

import requests
from bs4 import BeautifulSoup

from converter.converter import html_to_epub


class NoAr5ivCache(Exception):
    """Raised when ar5iv has no cached HTML rendering for a paper."""

def download_html(url):
    """Fetch a page and return its raw bytes; raise NoAr5ivCache otherwise."""
    response = requests.get(url)
    if response.status_code == 200:
        return response.content
    raise NoAr5ivCache(url)

def download_website_assets(url):
    """Download a page's HTML and every image it references.

    Returns a (html_bytes, {image_url: image_bytes}) tuple.
    """
    html_content = download_html(url)
    soup = BeautifulSoup(html_content, 'html.parser')
    images = {}
    for img in soup.find_all('img'):
        img_url = img.get('src')
        if not img_url:
            continue
        # Skip ar5iv's own interface assets.
        if 'ar5iv' in img_url:
            continue
        # Resolve relative image paths against the page URL.
        img_url = requests.compat.urljoin(url, img_url)
        if not img_url.startswith('http'):
            continue
        img_response = requests.get(img_url)
        if img_response.status_code == 200:
            images[img_url] = img_response.content
    return html_content, images

def get_author(url):
    """Return 'Lastname' or 'Lastname et al.' scraped from the abstract page."""
    soup = BeautifulSoup(download_html(url), 'html.parser')
    authors_div = soup.find('div', class_='authors')
    try:
        # Each author is a separate <a> tag inside the authors div.
        authors = authors_div.find_all('a')
        first_author_last_name = authors[0].text.split(' ')[-1]
        if len(authors) > 1:
            return first_author_last_name + ' et al.'
        return first_author_last_name
    except (AttributeError, IndexError):
        # No authors div, or no links inside it.
        return 'Unknown'

def get_title(url):
    """Return the paper title scraped from the abstract page."""
    soup = BeautifulSoup(download_html(url), 'html.parser')
    title = soup.find('h1', class_='title').text
    title = title.replace('&', 'and')
    return title.replace('Title:', '').strip()

def convert_latex_source_to_xml(url):
    # TODO: fallback for papers ar5iv has not rendered; convert the
    # LaTeX source directly. Not implemented yet.
    return ""

def arxiv_to_paper(arxiv_link):
    """Build an EPUB from an arxiv.org abstract link."""
    assert "arxiv" in arxiv_link
    title = get_title(arxiv_link)
    author = get_author(arxiv_link)
    try:
        # ar5iv serves an HTML rendering keyed by the arXiv identifier,
        # e.g. https://ar5iv.labs.arxiv.org/html/<id>; images are fetched too.
        website_data = download_website_assets(
            "https://ar5iv.labs.arxiv.org/html/" + arxiv_link.split('/')[-1])
    except NoAr5ivCache:
        # Fallback for papers without an ar5iv cache; currently a stub.
        website_data = convert_latex_source_to_xml(arxiv_link)
    html_to_epub(website_data, title, author)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Convert an arXiv paper to an EPUB.")
    parser.add_argument('arxiv_link', type=str, help='The arXiv abstract link to process')
    args = parser.parse_args()
    arxiv_to_paper(args.arxiv_link)
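
# Example invocation (any arXiv abstract link should work, assuming ar5iv
# has a cached rendering for that paper):
#   python main.py https://arxiv.org/abs/1706.03762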