Commit

second commit
UngeheurenUngeziefer committed Jun 17, 2020
1 parent 6bd8646 commit c73a1c7
Showing 5 changed files with 130 additions and 48 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
+/downloads/
7 changes: 7 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default.

23 changes: 13 additions & 10 deletions Duplicate_cleaner.py
@@ -1,6 +1,6 @@
 import hashlib
 from hashlib import md5
-from scipy import *
+from imageio import imread
 import matplotlib.pyplot as plt
 import time
 import numpy as np
@@ -26,15 +26,18 @@ def file_hash(filepath):
         hash_keys[filehash] = index
     else:
         duplicates.append((index, hash_keys[filehash]))
 
-# print(duplicates)
-# 405 and 4
-
-for file_indexes in duplicates[:30]:
-    plt.subplot(121), plt.imshow(imread(files_list[file_indexes[1]]))
-    plt.title(file_indexes[1]), plt.xticks([]), plt.yticks([])
-
-    plt.subplot(122), plt.imshow(imread(files_list[file_indexes[0]]))
-    plt.title(str(file_indexes[0]) + ' duplicate', plt.xticks([]), plt.yticks([]))
-    plt.show()
+for file_indexes in duplicates[:30]:
+    try:
+        plt.subplot(121), plt.imshow(imread(files_list[file_indexes[1]]))
+        plt.title(file_indexes[1]), plt.xticks([]), plt.yticks([])
+        plt.subplot(122), plt.imshow(imread(files_list[file_indexes[0]]))
+        plt.title(str(file_indexes[0]) + ' duplicate'), plt.xticks([]), plt.yticks([])
+        plt.show()
+    except OSError as e:
+        continue
+
+for index in duplicates:
+    os.remove(files_list[index[0]])
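
Note: the hashing that produces hash_keys and duplicates is collapsed behind the hunk header above. A minimal sketch of what that section plausibly contains, reconstructed from the visible names (file_hash, hash_keys, duplicates, files_list); the directory path is an assumption based on the .gitignore entry, not part of the commit:

import hashlib
import os

def file_hash(filepath):
    # Hash the file's raw bytes; identical files produce identical digests.
    with open(filepath, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()

# Assumed location; the commit's .gitignore entry suggests './downloads'.
files_list = [os.path.join('./downloads', f) for f in os.listdir('./downloads')]

hash_keys = {}
duplicates = []
for index, filepath in enumerate(files_list):
    filehash = file_hash(filepath)
    if filehash not in hash_keys:
        hash_keys[filehash] = index  # first file seen with this digest
    else:
        duplicates.append((index, hash_keys[filehash]))  # (duplicate, original)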
100 changes: 62 additions & 38 deletions Image_scraper.py
@@ -4,41 +4,65 @@
 import re
 import os.path
 
-http = urllib3.PoolManager()
-url = requests.get('https://rare-pepe.com/')
-soup = BeautifulSoup(url.text, 'html.parser')
-
-links = []
-links2 = []
-links3 = []
-links4 = []
-
-# search for all <img>'s
-for img in soup.find_all('img'):
-    links.append(img)
-strlinks = str(links)
-
-# search for all 'data-orig-file' in <img>'s
-urls = re.findall(r'(data-orig-file="https?://\S+)', strlinks)
-links2.append(urls)
-links2 = str(links2)
-urls2 = re.findall(r'(https?://\S+)', links2)
-
-for line in urls2:
-    links3.append(line.replace(r"'", ''))
-
-for line in links3:
-    links4.append(line.replace(r'",', ''))
-links4[-1] = links4[-1][0:-3]
-# print(links4)
-
-counter = 0
-
-for url in links4:
-    r = http.request('GET', url)
-    counter += 1
-    num = str(counter+1)
-    frmt = url[-4:]
-    with open(os.path.join('.\downloads', 'pepe_' + num + frmt), 'wb') as final_image:
-        final_image.write(r.data)
-    print('downloading pic ' + str(counter))
+class ImageScraper:
+    def __init__(self, web_url, tag_name, regex_request, regex_request2, folder, img_name):
+        self.web_url = web_url
+        self.tag_name = tag_name
+        self.regex_request = regex_request
+        self.regex_request2 = regex_request2
+        self.folder = folder
+        self.img_name = img_name
+        self.http = urllib3.PoolManager()
+        self.url = requests.get(self.web_url)
+
+    def TagSearcher(self):
+        soup = BeautifulSoup(self.url.text, 'html.parser')
+        links = []
+        # search for all <img>'s
+        for img in soup.find_all(self.tag_name):
+            links.append(img)
+        self.strlinks = str(links)
+
+    def Regex(self):
+        # search for all 'data-orig-file' in <img>'s
+        urls = re.findall(self.regex_request, self.strlinks)
+        links2 = []
+        links2.append(urls)
+        links2 = str(links2)
+        self.urls2 = re.findall(self.regex_request2, links2)
+
+    def ClearingArtifacts(self):
+        # clearing artifacts
+        links3 = []
+        for line in self.urls2:
+            links3.append(line.replace(r"'", ''))
+        self.links4 = []
+        for line in links3:
+            self.links4.append(line.replace(r'",', ''))
+        self.links4[-1] = self.links4[-1][0:-3]
+        # print(links4)
+
+    def Downloader(self):
+        counter = 0
+
+        for url in self.links4:
+            r = self.http.request('GET', url)
+            counter += 1
+            num = str(counter + 1)
+            frmt = url[-4:]
+            if not os.path.exists(self.folder):
+                os.makedirs(self.folder)
+            with open(os.path.join(self.folder, self.img_name + num + frmt), 'wb') as final_image:
+                final_image.write(r.data)
+            print('downloading pic ' + str(counter))
+
+# ImageScraper(url address, tag, regex requests, folder to download to, name of files)
+Obj = ImageScraper('https://rare-pepe.com/', 'img', r'(data-orig-file="https?://\S+)',
+                   r'(https?://\S+)', '.\downloads2', 'pepe_')
+
+if __name__ == '__main__':
+    Obj.TagSearcher()
+    Obj.Regex()
+    Obj.ClearingArtifacts()
+    Obj.Downloader()
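
Note: Regex and ClearingArtifacts work by regex-matching str(links) and then stripping leftover quotes and commas; the same URLs can be read directly off each tag, which makes the artifact cleanup and the final slicing of links4[-1] unnecessary. A sketch under the same dependencies, not part of the commit; scrape_image_urls and download_all are illustrative names:

import os
import requests
from bs4 import BeautifulSoup

def scrape_image_urls(web_url, attr='data-orig-file'):
    # Read the attribute straight off each <img> tag instead of
    # regex-matching the string form of the tag list.
    soup = BeautifulSoup(requests.get(web_url).text, 'html.parser')
    return [img[attr] for img in soup.find_all('img') if img.has_attr(attr)]

def download_all(urls, folder, img_name):
    os.makedirs(folder, exist_ok=True)  # replaces the exists()/makedirs() pair
    for counter, url in enumerate(urls, start=1):
        ext = os.path.splitext(url)[1] or '.jpg'  # fall back if the URL has no extension
        with open(os.path.join(folder, img_name + str(counter) + ext), 'wb') as f:
            f.write(requests.get(url).content)
        print('downloading pic ' + str(counter))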
47 changes: 47 additions & 0 deletions Telegram_poster.py
@@ -0,0 +1,47 @@
+from telegram.ext import Updater, CommandHandler
+
+import requests
+import re
+import glob
+import os
+
+proxies = {'https': '169.51.80.228:3128',
+           'http': '159.203.82.173:3128'}
+
+TOKEN = '1279456273:AAECOMcJPNP7x5G5sD4zIzQukDcy34sG5KU'
+MAIN_URL = f'https://api.telegram.org/bot{TOKEN}'
+
+chat_id = '-1001200942722'
+
+# creating list of addresses to all imgs
+def img_path():
+    path = './downloads'
+    img_list = []
+    addresses_list = []
+
+    os.chdir(path)
+    for file in glob.glob("*.*"):
+        img_list.append(file)
+    for img in img_list:
+        addresses_list.append(path + '/' + img)
+    # print(addresses_list)
+    for path in addresses_list:
+        return path
+
+def bop(bot, update):
+    chat_id = update.message.chat_id
+    bot.send_photo(chat_id=chat_id, photo=img_path())
+
+def main():
+    updater = Updater('1279456273:AAECOMcJPNP7x5G5sD4zIzQukDcy34sG5KU')
+    dp = updater.dispatcher
+    dp.add_handler(CommandHandler('bop', bop))
+    updater.start_polling()
+    updater.idle()
+
+if __name__ == '__main__':
+    main()
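
Note: in img_path() above, the loop "for path in addresses_list: return path" exits on its first iteration, so /bop always posts the same image, and os.chdir permanently changes the working directory. A sketch of a cycling alternative under the same handler signature; image_paths and images are names introduced here, and a non-empty ./downloads folder is assumed:

import glob
import itertools
import os

def image_paths(path='./downloads'):
    # Build the list once, without os.chdir, and cycle through it
    # so each call yields the next image instead of the first one.
    files = sorted(glob.glob(os.path.join(path, '*.*')))
    return itertools.cycle(files)  # assumes the folder is non-empty

images = image_paths()

def bop(bot, update):
    # Each /bop posts the next image in the cycle.
    with open(next(images), 'rb') as photo:
        bot.send_photo(chat_id=update.message.chat_id, photo=photo)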



