Commit

second commit
UngeheurenUngeziefer committed Jun 17, 2020
1 parent 6bd8646 commit c73a1c7
Showing 5 changed files with 130 additions and 48 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
+/downloads/
7 changes: 7 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default.

23 changes: 13 additions & 10 deletions Duplicate_cleaner.py
@@ -1,6 +1,6 @@
 import hashlib
 from hashlib import md5
-from scipy import *
+from imageio import imread
 import matplotlib.pyplot as plt
 import time
 import numpy as np
@@ -26,15 +26,18 @@ def file_hash(filepath):
         hash_keys[filehash] = index
     else:
         duplicates.append((index, hash_keys[filehash]))
 
-# print(duplicates)
-# 405 and 4
-
-for file_indexes in duplicates[:30]:
-    plt.subplot(121), plt.imshow(imread(files_list[file_indexes[1]]))
-    plt.title(file_indexes[1]), plt.xticks([]), plt.yticks([])
-
-    plt.subplot(122), plt.imshow(imread(files_list[file_indexes[0]]))
-    plt.title(str(file_indexes[0]) + ' duplicate', plt.xticks([]), plt.yticks([]))
-    plt.show()
+for file_indexes in duplicates[:30]:
+    try:
+        plt.subplot(121), plt.imshow(imread(files_list[file_indexes[1]]))
+        plt.title(file_indexes[1]), plt.xticks([]), plt.yticks([])
+        plt.subplot(122), plt.imshow(imread(files_list[file_indexes[0]]))
+        plt.title(str(file_indexes[0]) + ' duplicate'), plt.xticks([]), plt.yticks([])
+        plt.show()
+    except OSError as e:
+        continue
+
+for index in duplicates:
+    os.remove(files_list[index[0]])
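
Note: the hashing that produces hash_keys and duplicates is collapsed behind the hunk header above. A minimal sketch of what that section plausibly contains, reconstructed from the visible names (file_hash, hash_keys, duplicates, files_list); the directory path is an assumption based on the .gitignore entry, not part of the commit:

import hashlib
import os

def file_hash(filepath):
    # Hash the file's raw bytes; identical files produce identical digests.
    with open(filepath, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()

# Assumed location; the commit's .gitignore entry suggests './downloads'.
files_list = [os.path.join('./downloads', f) for f in os.listdir('./downloads')]

hash_keys = {}
duplicates = []
for index, filepath in enumerate(files_list):
    filehash = file_hash(filepath)
    if filehash not in hash_keys:
        hash_keys[filehash] = index  # first file seen with this digest
    else:
        duplicates.append((index, hash_keys[filehash]))  # (duplicate, original)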
100 changes: 62 additions & 38 deletions Image_scraper.py
@@ -4,41 +4,65 @@
 import re
 import os.path
 
-http = urllib3.PoolManager()
-url = requests.get('https://rare-pepe.com/')
-soup = BeautifulSoup(url.text, 'html.parser')
-
-links = []
-links2 = []
-links3 = []
-links4 = []
-
-# search for all <img>'s
-for img in soup.find_all('img'):
-    links.append(img)
-strlinks = str(links)
-
-# search for all 'data-orig-file' in <img>'s
-urls = re.findall(r'(data-orig-file="https?://\S+)', strlinks)
-links2.append(urls)
-links2 = str(links2)
-urls2 = re.findall(r'(https?://\S+)', links2)
-
-for line in urls2:
-    links3.append(line.replace(r"'", ''))
-
-for line in links3:
-    links4.append(line.replace(r'",', ''))
-links4[-1] = links4[-1][0:-3]
-# print(links4)
-
-counter = 0
-
-for url in links4:
-    r = http.request('GET', url)
-    counter += 1
-    num = str(counter+1)
-    frmt = url[-4:]
-    with open(os.path.join('.\downloads', 'pepe_' + num + frmt), 'wb') as final_image:
-        final_image.write(r.data)
-    print('downloading pic ' + str(counter))
+class ImageScraper:
+    def __init__(self, web_url, tag_name, regex_request, regex_request2, folder, img_name):
+        self.web_url = web_url
+        self.tag_name = tag_name
+        self.regex_request = regex_request
+        self.regex_request2 = regex_request2
+        self.folder = folder
+        self.img_name = img_name
+        self.http = urllib3.PoolManager()
+        self.url = requests.get(self.web_url)
+
+    def TagSearcher(self):
+        soup = BeautifulSoup(self.url.text, 'html.parser')
+        links = []
+        # search for all <img>'s
+        for img in soup.find_all(self.tag_name):
+            links.append(img)
+        self.strlinks = str(links)
+
+    def Regex(self):
+        # search for all 'data-orig-file' in <img>'s
+        urls = re.findall(self.regex_request, self.strlinks)
+        links2 = []
+        links2.append(urls)
+        links2 = str(links2)
+        self.urls2 = re.findall(self.regex_request2, links2)
+
+    def ClearingArtifacts(self):
+        # clearing artifacts
+        links3 = []
+        for line in self.urls2:
+            links3.append(line.replace(r"'", ''))
+        self.links4 = []
+        for line in links3:
+            self.links4.append(line.replace(r'",', ''))
+        self.links4[-1] = self.links4[-1][0:-3]
+        # print(links4)
+
+    def Downloader(self):
+        counter = 0
+
+        for url in self.links4:
+            r = self.http.request('GET', url)
+            counter += 1
+            num = str(counter + 1)
+            frmt = url[-4:]
+            if not os.path.exists(self.folder):
+                os.makedirs(self.folder)
+            with open(os.path.join(self.folder, self.img_name + num + frmt), 'wb') as final_image:
+                final_image.write(r.data)
+            print('downloading pic ' + str(counter))
+
+# ImageScraper(url address, tag, regex requests, folder to download to, name of files)
+Obj = ImageScraper('https://rare-pepe.com/', 'img', r'(data-orig-file="https?://\S+)',
+                   r'(https?://\S+)', '.\downloads2', 'pepe_')
+
+if __name__ == '__main__':
+    Obj.TagSearcher()
+    Obj.Regex()
+    Obj.ClearingArtifacts()
+    Obj.Downloader()
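
Note: Regex and ClearingArtifacts work by regex-matching str(links) and then stripping leftover quotes and commas; the same URLs can be read directly off each tag, which makes the artifact cleanup and the final slicing of links4[-1] unnecessary. A sketch under the same dependencies, not part of the commit; scrape_image_urls and download_all are illustrative names:

import os
import requests
from bs4 import BeautifulSoup

def scrape_image_urls(web_url, attr='data-orig-file'):
    # Read the attribute straight off each <img> tag instead of
    # regex-matching the string form of the tag list.
    soup = BeautifulSoup(requests.get(web_url).text, 'html.parser')
    return [img[attr] for img in soup.find_all('img') if img.has_attr(attr)]

def download_all(urls, folder, img_name):
    os.makedirs(folder, exist_ok=True)  # replaces the exists()/makedirs() pair
    for counter, url in enumerate(urls, start=1):
        ext = os.path.splitext(url)[1] or '.jpg'  # fall back if the URL has no extension
        with open(os.path.join(folder, img_name + str(counter) + ext), 'wb') as f:
            f.write(requests.get(url).content)
        print('downloading pic ' + str(counter))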
47 changes: 47 additions & 0 deletions Telegram_poster.py
@@ -0,0 +1,47 @@
+from telegram.ext import Updater, CommandHandler
+
+import requests
+import re
+import glob
+import os
+
+proxies = {'https': '169.51.80.228:3128',
+           'http': '159.203.82.173:3128'}
+
+TOKEN = '1279456273:AAECOMcJPNP7x5G5sD4zIzQukDcy34sG5KU'
+MAIN_URL = f'https://api.telegram.org/bot{TOKEN}'
+
+chat_id = '-1001200942722'
+
+# creating list of addresses to all imgs
+def img_path():
+    path = './downloads'
+    img_list = []
+    addresses_list = []
+
+    os.chdir(path)
+    for file in glob.glob("*.*"):
+        img_list.append(file)
+    for img in img_list:
+        addresses_list.append(path + '/' + img)
+    # print(addresses_list)
+    for path in addresses_list:
+        return path
+
+def bop(bot, update):
+    chat_id = update.message.chat_id
+    bot.send_photo(chat_id=chat_id, photo=img_path())
+
+def main():
+    updater = Updater('1279456273:AAECOMcJPNP7x5G5sD4zIzQukDcy34sG5KU')
+    dp = updater.dispatcher
+    dp.add_handler(CommandHandler('bop', bop))
+    updater.start_polling()
+    updater.idle()
+
+if __name__ == '__main__':
+    main()
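
Note: in img_path() above, the loop "for path in addresses_list: return path" exits on its first iteration, so /bop always posts the same image, and os.chdir permanently changes the working directory. A sketch of a cycling alternative under the same handler signature; image_paths and images are names introduced here, and a non-empty ./downloads folder is assumed:

import glob
import itertools
import os

def image_paths(path='./downloads'):
    # Build the list once, without os.chdir, and cycle through it
    # so each call yields the next image instead of the first one.
    files = sorted(glob.glob(os.path.join(path, '*.*')))
    return itertools.cycle(files)  # assumes the folder is non-empty

images = image_paths()

def bop(bot, update):
    # Each /bop posts the next image in the cycle.
    with open(next(images), 'rb') as photo:
        bot.send_photo(chat_id=update.message.chat_id, photo=photo)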



