From 3b054ccf10e91e0c6658bf27b7a64baaad5205c2 Mon Sep 17 00:00:00 2001
From: Ayushman
Date: Fri, 1 Nov 2024 22:26:43 +0530
Subject: [PATCH] Image Scrapper from a website

---
 .../Readme.md        |  6 ++
 .../requirements.txt |  1 +
 .../scrap_img.py     | 59 +++++++++++++++++++
 3 files changed, 66 insertions(+)
 create mode 100644 PROJECTS/Download_images_from_websites.py/Readme.md
 create mode 100644 PROJECTS/Download_images_from_websites.py/requirements.txt
 create mode 100644 PROJECTS/Download_images_from_websites.py/scrap_img.py

diff --git a/PROJECTS/Download_images_from_websites.py/Readme.md b/PROJECTS/Download_images_from_websites.py/Readme.md
new file mode 100644
index 000000000..9e60dd938
--- /dev/null
+++ b/PROJECTS/Download_images_from_websites.py/Readme.md
@@ -0,0 +1,6 @@
+# Scrape images from a URL
+
+1. Download ChromeDriver for your installed version of Chrome.
+2. Run the scrap_img.py file: `py scrap_img.py`
+3. `Enter Path : E:\webscraping\chromedriver_win32\chromedriver.exe`
+   `Enter URL : https://dribbble.com/`
diff --git a/PROJECTS/Download_images_from_websites.py/requirements.txt b/PROJECTS/Download_images_from_websites.py/requirements.txt
new file mode 100644
index 000000000..27bc3be5d
--- /dev/null
+++ b/PROJECTS/Download_images_from_websites.py/requirements.txt
@@ -0,0 +1,4 @@
+beautifulsoup4
+lxml
+requests
+selenium==3.141.0
diff --git a/PROJECTS/Download_images_from_websites.py/scrap_img.py b/PROJECTS/Download_images_from_websites.py/scrap_img.py
new file mode 100644
index 000000000..a5d8f11f4
--- /dev/null
+++ b/PROJECTS/Download_images_from_websites.py/scrap_img.py
@@ -0,0 +1,82 @@
+"""Download every image referenced by a web page rendered in Chrome."""
+import os
+import time
+from urllib.parse import urljoin
+
+import requests as rq
+from bs4 import BeautifulSoup
+from selenium import webdriver
+
+# Directory (relative to the working directory) that receives the images.
+output = "output"
+
+# Seconds to let the page run its scripts before the DOM is captured.
+PAGE_LOAD_WAIT = 60
+
+# Seconds before a single image request is abandoned.
+REQUEST_TIMEOUT = 30
+
+# Recognised image suffixes; anything else is saved as ".jpg".
+EXTENSIONS = (".jpeg", ".jpg", ".png", ".gif")
+
+
+def get_url(path, url):
+    """Return the rendered HTML of ``url`` using the chromedriver at ``path``."""
+    # executable_path is the Selenium 3 keyword pinned by requirements.txt.
+    driver = webdriver.Chrome(executable_path=path)
+    try:
+        driver.get(url)
+        print("loading.....")
+        # Wait while the browser is still open so that lazily loaded images
+        # are part of the captured markup.
+        time.sleep(PAGE_LOAD_WAIT)
+        return driver.execute_script("return document.documentElement.outerHTML")
+    finally:
+        # Always shut the browser down, even when loading fails.
+        driver.quit()
+
+
+def get_img_links(res):
+    """Return every img tag in ``res`` that has a ``src`` attribute."""
+    soup = BeautifulSoup(res, "lxml")
+    return soup.find_all("img", src=True)
+
+
+def download_img(img_link, index):
+    """Save the image at ``img_link`` as ``output/{index + 1}{extension}``.
+
+    Failures are reported and skipped so one bad link does not end the run.
+    """
+    lowered = img_link.lower()
+    extension = next((ext for ext in EXTENSIONS if ext in lowered), ".jpg")
+    target = os.path.join(output, str(index + 1) + extension)
+    try:
+        response = rq.get(img_link, timeout=REQUEST_TIMEOUT)
+        response.raise_for_status()
+        with open(target, "wb") as f:
+            f.write(response.content)
+    except (rq.RequestException, OSError) as err:
+        print("Skipping {}: {}".format(img_link, err))
+
+
+def main():
+    """Prompt for the driver path and page URL, then download the images."""
+    # Example path: E:\web scraping\chromedriver_win32\chromedriver.exe
+    path = input("Enter Path : ")
+    url = input("Enter URL : ")
+
+    result = get_url(path, url)
+    img_links = get_img_links(result)
+    os.makedirs(output, exist_ok=True)
+
+    for index, img_link in enumerate(img_links):
+        src = img_link["src"]
+        print("Downloading...")
+        if src:
+            # Resolve relative and protocol-relative sources against the page.
+            download_img(urljoin(url, src), index)
+    print("Download Complete!!")
+
+
+if __name__ == "__main__":
+    main()