4chan_ripper.py
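"""
4chan_ripper.py

Interactive ripper for a single 4chan thread. It prompts for a thread URL
(https://boards.4chan.org/<BOARD>/thread/<TID>), a pass count (0 = keep looping
until the thread dies), an optional delay between passes, whether to clone the
thread HTML, and an existing download directory. Each pass scrapes the thread,
extracts hyperlinks to media files (.webm, .png, .jpg, .jpeg, .gif, .mp4) and
downloads anything not already grabbed on an earlier pass.
"""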
import os
import re
import sys
import time

import requests
from bs4 import BeautifulSoup
board = ''
previous = []
#setup colors
w = '\033[1m\033[37m' #white
g = '\033[1m\033[32m' #green
r = '\033[1m\033[31m' #red
m = '\033[1m\033[35m' #magenta
def sort_links(extracted):
#acceptable file-types to rip
ftypes = [
'.webm','.png','.jpg','.jpeg','.gif','.mp4'
]
#all good hyperlinks here
to_download = []
#setup prefix url
#url = f'https://i.4cdn.org/{board}'
#iterate through extracted hyperlinks
for link in extracted:
#iterate through whitelisted file-types
for ftype in ftypes:
if link.lower().endswith(ftype):
                #normalise to an absolute https URL (4chan media links are typically protocol-relative, e.g. //i.4cdn.org/...)
                new_link = link
                if new_link.startswith('//'):
                    new_link = 'https:' + new_link
                elif not new_link.startswith(('http://', 'https://')):
                    new_link = 'https://' + new_link.lstrip('/')
                #add to list
                to_download.append(new_link)
#if nothing captures
if not to_download:
sys.exit('\r\nUnable to rip files!!! Thread down? Exiting...\r\n')
#ensure no repeated links (common in 4chan hyperlink extraction)
to_download = list(set(to_download))
#send all valid hyperlinks back to main routine
return to_download
def get_links(content):
try:
#parse with BeautifulSoup
soup = BeautifulSoup(content, 'html.parser')
#extract all embedded resources from <a> tags
new_links = [a.get('href') for a in soup.find_all('a', href=True)]
return new_links
except Exception as ex:
        sys.exit(f'\r\nError extracting hyperlinks!\r\n{ex}\r\n')
def get_html(url):
try:
        #send out http-get probe (timeout keeps a dead connection from hanging the pass)
        response = requests.get(url, timeout=30)
if response.status_code == 200:
#get HTML content
return response.text
else:
sys.exit('\r\nThread is dead! Complete.\r\n')
except Exception as ex:
sys.exit(f'\r\nError scraping HTML!\r\n{ex}\r\n')
def verify_url(url):
global board
    #create regex pattern for a valid thread URL (a trailing slug after the thread-id is tolerated)
url_pattern = r"https://boards\.4chan\.org/([^/]+)/thread/([^/]+)"
#ensure board and thread-id exists
match = re.match(url_pattern, url)
if match:
board = match.group(1) # Extracted board
tid = match.group(2) # Extracted thread ID
else:
sys.exit(f'\r\nInvalid URL!\r\n')
def main():
global previous
    #clear the terminal
    os.system('cls' if os.name == 'nt' else 'clear')
#display banner
print(f'''{g} _ _ _
| | | __| |_ __ _ _ _
|_ _/ _| ' \/ _` | ' \
|_|\__|_||_\__,_|_||_|
_____ _ _ ___ _
|_ _| |_ _ _ ___ __ _ __| | | _ (_)_ __ _ __ ___ _ _
| | | ' \| '_/ -_) _` / _` | | / | '_ \ '_ \/ -_) '_|
|_| |_||_|_| \___\__,_\__,_| |_|_\_| .__/ .__/\___|_|
|_| |_|
{w}Format: https://boards.4chan.org/{g}<BOARD>{w}/thread/{g}<TID>
''')
#capture user-input
try:
        url = input(f'{w}Thread URL:{m} ')
        verify_url(url)
passes = int(input(f'{w}Pass count (0=infinite):{m} '))
wait = 0
if passes != 1:
wait = int(input(f'{w}Seconds delay between passes:{m} '))
html = input(f'{w}Clone HTML as well? (Y/n):{m} ')
path = input(f'{w}Download path (ex- /tmp/):{m} ')
if not os.path.isdir(path):
sys.exit(f'\r\n{r}Error! Invalid path...\r\n')
input(f'\r\n{w}Ready? Strike <ENTER> to rip and <CTRL+C> to end...\r\n')
except KeyboardInterrupt:
sys.exit()
except Exception as e:
sys.exit(e)
#begin scraping
passed = 0
while True:
        passed += 1
if passes != 0:
print(f'{m}[PASS {passed}/{passes}]\r\n')
else:
print(f'{m}[PASS {passed}/∞]\r\n')
print(f'{w}[~] Scraping HTML...')
content = get_html(url)
print('[~] Extracting hyperlinks...')
new_links = get_links(content)
print('[~] Ignoring non-local hyperlinks...')
resources = sort_links(new_links)
if previous:
print('[!] Checking for new hyperlinks...')
resources = [link for link in resources if link not in previous]
if not resources:
print(f'\r\n{w}Nothing new in this iteration!')
else:
print('[!] Now downloading!\r\n')
        for resource in resources:
            try:
                #get filename
                filename = os.path.basename(resource)
                #setup download path
                filepath = os.path.join(path, filename)
                #timeout so one stalled download cannot hang the whole pass
                response = requests.get(resource, timeout=30)
                if response.status_code == 200:
                    print(f'{w}----> Downloading: {g}{resource}')
                    #the with-block closes the file automatically
                    with open(filepath, 'wb') as file:
                        file.write(response.content)
                else:
                    print(f'{w}----> Unable to access {r}{resource}')
            except Exception as ex:
                print(ex)
        #download newest html layout if requested (blank answer counts as yes, per the (Y/n) prompt)
        if not html.strip() or html.lower().startswith("y"):
try:
#setup download path
html_path = os.path.join(path, 'thread.html')
                #send http-get request
                response = requests.get(url, timeout=30)
                #write the page source to an html file (the with-block closes it automatically)
                with open(html_path, 'w', encoding='utf-8') as file:
                    file.write(response.text)
            except Exception:
                #HTML cloning is best-effort; ignore failures
                pass
        #remember every link grabbed so far so later passes only fetch new ones
        previous.extend(resources)
        if passes != 0 and passed >= passes:
            sys.exit(f'\r\n{w}Done!\r\n')
else:
print(f'\r\n{m}Sleeping for {wait} seconds...\r\n')
time.sleep(wait)
if __name__ == '__main__':
main()
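
#Illustrative session (the thread URL and delay below are made-up example values;
#the download directory must already exist):
#  $ python3 4chan_ripper.py
#  Thread URL: https://boards.4chan.org/wg/thread/1234567
#  Pass count (0=infinite): 3
#  Seconds delay between passes: 60
#  Clone HTML as well? (Y/n): y
#  Download path (ex- /tmp/): /tmp/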