jr_yify.py
#!/usr/bin/env python3
#VERSION: 0.1
#AUTHORS: Justin Riddell
import sys
import aiohttp
import asyncio
# This has been written in a different (and more liberating) style from my
# usual stuff, in the sense that there is no error checking at all. The idea
# is that failure is non-critical and relatively easy to reproduce, test and
# fix. This makes the code faster to write and more compact.
### Start of pagination.py
# This was a separate file, however the instructions on making your own search
# plugin (to me) do not make it explicit that your plugin MUST be in one file
# only - they state that "a qBittorrent search engine plugin is actually a
# Python class file", however this does NOT say one and only one file.
# It turns out the importer does some shenanigans with module imports (in
# nova2.py), and in the actual app, when the import fails you just get a
# generic error message. I wish this was made clear on the page, or that the
# error message wasn't a generic "failed to load plugin".
# Simple solution: dump all of this into one file.
# The way this works:
# 1. Download the search results page, e.g. yify/search/batman
# 2. Parse and download any additional results pages for batman movies
# 3. For each results page, scrape the set of urls for actual movie pages
# 4. Download all actual movie pages and scrape metadata including magnet link
# 5. Print these magnet links
# (The url shapes involved at each step are sketched below.)
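# To make the steps above concrete, a rough sketch of the url shapes involved
# (paths are illustrative; the real site layout may differ):
#   step 1:   http://www.yify-movies.net/search/batman/
#   step 2:   http://www.yify-movies.net/search/batman/time/2/
#   step 3/4: http://www.yify-movies.net/movies/<some-title>-1080p.html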
import bs4
import functools
import re
import novaprinter
# Three types of pages:
# One page of results
# http://www.yify-movies.net/search/mission impossible/
# Multiple pages of results but no "last" link
# http://www.yify-movies.net/search/batman/
# So many pages of results that there's a "last" link and not all page links
# fit on the first page
# http://www.yify-movies.net/search/the/
def additional_urls(page):
    """Returns list of additional results pages from the first results page"""
    soup = bs4.BeautifulSoup(page, features = "html.parser")
    pages_div = soup.find("div", class_ = "pagination")
    # Likely no results at all for this title
    if not pages_div:
        return []
    # If pagination div is empty then there are no additional results pages
    hrefs = pages_div.find_all("a")
    if not hrefs:
        return []
    # Highest page number among the remaining pagination links
    last = functools.reduce(max,
                            (int(s["href"].split("/")[-2]) for s in hrefs[1:]))
    return ["/".join(["time", str(i), ""]) for i in range(2, last + 1)]
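# For example, if the pagination links on the first results page go up to
# page 3, this returns ["time/2/", "time/3/"]; main_async appends each of
# these to the search url to fetch the remaining results pages.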
async def get(session, url):
    """Given a session and url (awaits and) fetches the url content"""
    async with session.get(url) as resp:
        return await resp.text()
def page_movie_urls(page):
    """Returns the set of relative movie page urls found on a results page"""
    return {m for m in re.findall(r"href=\"(/movies[^\"]*)\"", page)}
async def metadata_from_url(session, url, root):
    """Downloads a movie page from url, returning a dict of metadata"""
    # print("Getting")
    metadata = {
        "engine_url": root,
        "desc_link": url,
        "url": url,
    }
    page = await get(session, url)
    # page = str(requests.get(url).content)
    data = metadata_from_page(page)
    metadata.update(data)
    # doneso()
    return metadata
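# A hypothetical example of the dict this returns (values invented purely for
# illustration; the required field names are documented near the bottom of
# this file):
# {
#     "name": "Batman Begins (2005) 1080p",
#     "year": "2005",
#     "size": "1.85 GB",
#     "quality": "1080p",
#     "seeds": "120",
#     "leech": "-1",
#     "link": "magnet:?xt=urn:btih:...",
#     "url": "http://www.yify-movies.net/movies/batman-begins-2005-1080p.html",
#     "desc_link": "http://www.yify-movies.net/movies/batman-begins-2005-1080p.html",
#     "engine_url": "http://www.yify-movies.net",
# }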
# For debugging to see how long each request is taking
# def doneso():
#     if not hasattr(doneso, "i"):
#         doneso.i = 0
#     print(doneso.i)
#     doneso.i += 1
def metadata_from_page(page):
    """Parses page to grab appropriate metadata including magnet link"""
    metadata = {"leech": "-1",}
    soup = bs4.BeautifulSoup(page, features = "html.parser")
    e = soup.find("div", class_ = "heading")
    e.contents[0].unwrap()
    name = e.string
    if name.endswith(" YIFY Movie"):
        name = name[:-len(" YIFY Movie")]
    metadata["name"] = name
    # Year is taken from the last "(...)" in the name
    year_start = name.rfind("(") + 1
    year_end = name.rfind(")", year_start)
    metadata["year"] = name[year_start: year_end]
    metadata["link"] = soup.find("a", href = re.compile(r"^magnet:.*"))["href"]
    for tag in soup.find("div", class_ = "available").ul.find_all("li"):
        # Remove bold tag from first element
        tag.contents[0].unwrap()
        if tag.contents[0] == "Size:":
            metadata["size"] = tag.contents[1].strip()
        if tag.contents[0] == "Quality:":
            metadata["quality"] = tag.contents[1].strip()
        if tag.contents[0] == "Peers/Seeds:":
            peers, seeds = tag.contents[1].replace(" ", "").split("/")
            metadata["seeds"] = seeds
    return metadata
async def main_async(session, title, root):
    # Get first set of pages
    root_search_title = root + "/search/" + title + "/"
    pages = [await get(session, root_search_title)]
    urls = [root_search_title + url for url in additional_urls(pages[0])]
    if urls:
        pages += await asyncio.gather(*[get(session, url) for url in urls])
    movie_urls = [url for page in pages for url in page_movie_urls(page)]
    # print(len(pages))
    # print(len(movie_urls))
    # Remove lower quality or 3d versions: drop a 720p page when the matching
    # 1080p page exists, and drop 3d pages outright
    remove = [url for url in movie_urls if
              (url.endswith("720p.html")
               and url.replace("720p.html", "1080p.html") in movie_urls)
              or url.endswith("3d.html")]
    movie_urls = [root + url for url in movie_urls if url not in remove]
    # print(len(movie_urls))
    # Get actual metadata including magnet links
    # print(movie_urls)
    magnets = await asyncio.gather(
        *[metadata_from_url(session, url, root) for url in movie_urls])
    movies = sorted(magnets,
                    key = lambda movie: (int(movie["year"]), movie["name"]))
    # Print results
    for movie in movies:
        # print(movie["name"], "-", movie["year"])
        novaprinter.prettyPrinter(movie)
async def main_a(title, root):
    """Asynchronous entry point"""
    async with aiohttp.ClientSession(
            connector = aiohttp.TCPConnector(limit = 500)) as session:
        await main_async(session, title, root)
def search_me(title, root):
    """Synchronous entry point, root being yify url root"""
    asyncio.get_event_loop().run_until_complete(main_a(title, root))
### End of pagination.py
_ = """
Required by qBittorrent (use -1 if unavailable):
link => A string corresponding to the download link (the .torrent file or magnet link)
name => A unicode string corresponding to the torrent's name (i.e: "Ubuntu Linux v6.06")
size => A string corresponding to the torrent size (i.e: "6 MB" or "200 KB" or "1.2 GB"...)
seeds => The number of seeds for this torrent (as a string)
leech => The number of leechers for this torrent (as a string)
engine_url => The search engine url (i.e: http://www.mininova.org)
desc_link => A string corresponding to the description page for the torrent
Mine:
url => Search url
quality => Quality e.g. 1080p, 3d
year => Year released
"""
# Issues with regard to:
# https://aiohttp.readthedocs.io/en/stable/faq.html#why-is-creating-a-clientsession-outside-of-an-event-loop-dangerous
# The event loop should always live at least as long as the async objects
# scheduled in it. For now this does a full tear down and creation each time,
# which is probably less efficient. Would this break nicely (or at all?) if
# qbittorrent used asyncio underneath and messed with event loops?
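# One possible alternative (not used here): on Python 3.7+, search_me could
# call asyncio.run(main_a(title, root)), which likewise creates and tears down
# a fresh event loop on every call.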
class jr_yify:
    url = "http://www.yify-movies.net"
    name = "yify-movies.net"
    supported_categories = {"all": "0", "movies": "6",}

    def search(self, what, cat = "movies"):
        search_me(what, jr_yify.url)
def main(argv):
    searcher = jr_yify()
    searcher.search("scott pilgrim")
    searcher.search("batman")

if __name__ == "__main__":
    sys.exit(main(sys.argv))
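# For a quick manual test, the file can be run standalone (assuming
# novaprinter.py from qBittorrent's search plugin tree is importable alongside
# it):
#   python3 jr_yify.py
# This runs the two example searches in main() and prints one
# novaprinter.prettyPrinter line per movie found.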