-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding the base files (Amazon-req.py, Amazon.py) and adding the descr…
…iption file (README.md)
- Loading branch information
0 parents
commit fc028cc
Showing
3 changed files
with
168 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
# Scrape product titles and prices from an Amazon.eg search listing using
# requests + BeautifulSoup, then save them to Amazon.csv and Amazon.txt.

#Import Libraries.
import csv
import time  # Kept from the original script; currently unused.
from datetime import datetime

import requests
from bs4 import BeautifulSoup

#Search-results URL; only the page number changes between requests.
SEARCH_URL = (
    "https://www.amazon.eg/s?i=home&bbn=18021933031&rh=n%3A21863947031"
    "&fs=true&page={page}&language=en&qid=1711754513&ref=sr_pg_1"
)
#Number of the last result page to fetch.
LAST_PAGE = 35
#Seconds to wait for a response before giving up on a page.
REQUEST_TIMEOUT = 30
#Placeholder stored when a product card lacks the requested field.
MISSING = "not found"
#CSS class values identifying the title and price <span> of a product card.
TITLE_CLASS = "a-size-medium a-color-base a-text-normal"
PRICE_CLASS = "a-price-whole"


def text_or_missing(card, css_class):
    """Return the text of the first <span> with *css_class* inside *card*.

    Returns MISSING when the card has no such span, so that the title and
    price lists always receive exactly one entry per product card.
    """
    span = card.find("span", {"class": css_class})
    return MISSING if span is None else span.text


#Count time(Starting point).
start = datetime.now()

#Create lists.
title_list = []
price_list = []

#The main loop: one request per result page.
for pag in range(1, LAST_PAGE + 1):
    #Make a connection to the website. A timeout keeps a stalled
    #connection from hanging the whole run.
    try:
        req = requests.get(SEARCH_URL.format(page=pag), timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print(f"Page {pag}: request failed ({err}); skipping.")
        continue

    #An error page contains no product cards; report it instead of
    #silently collecting nothing.
    if not req.ok:
        print(f"Page {pag}: HTTP {req.status_code}; skipping.")
        continue

    #Parse source code.
    soup = BeautifulSoup(req.content, "lxml")

    #Loop over products to get all data and store it in lists.
    for card in soup.find_all("div", {"class": "puis-card-container"}):
        title_list.append(text_or_missing(card, TITLE_CLASS))
        price_list.append(text_or_missing(card, PRICE_CLASS))

print("✔Project is done✔")

#Create CSV file.
print(title_list, price_list)
with open("Amazon.csv", "w", newline='', encoding='utf-8') as file:
    wr = csv.writer(file)
    wr.writerow(["Title", "prices"])
    wr.writerows(zip(title_list, price_list))

#Create TXT file. The context manager closes the file even if a write fails.
with open("Amazon.txt", "w", newline='', encoding='utf-8') as file:
    file.write("Title" + "\t" + "prices" + "\n")
    for title, price in zip(title_list, price_list):
        file.write(f"Title: {title}\nPrice: {price}\n" + '*' * 70 + '\n')

#Count time(Ending point).
end = datetime.now()
print("excution taken: ", end - start)
#It's done.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
# Scrape product titles and prices from an Amazon.eg search listing using
# headless Chrome (selenium) + BeautifulSoup, then save them to
# Amazontest.csv and Amazontest.txt.

#Import Libraries.
import csv
import time
from datetime import datetime

from bs4 import BeautifulSoup
from selenium import webdriver

#Search-results URL; only the page number changes between requests.
SEARCH_URL = (
    "https://www.amazon.eg/s?i=home&bbn=18021933031&rh=n%3A21863947031"
    "&fs=true&page={page}&language=en&qid=1711754513&ref=sr_pg_1"
)
#Number of the last result page to fetch.
LAST_PAGE = 35
#Placeholder stored when a product card lacks the requested field.
MISSING = "not found"
#CSS class values identifying the title and price <span> of a product card.
TITLE_CLASS = "a-size-medium a-color-base a-text-normal"
PRICE_CLASS = "a-price-whole"


def text_or_missing(card, css_class):
    """Return the text of the first <span> with *css_class* inside *card*.

    Returns MISSING when the card has no such span, so that the title and
    price lists always receive exactly one entry per product card.
    """
    span = card.find("span", {"class": css_class})
    return MISSING if span is None else span.text


#Count time(Starting point).
start = datetime.now()

#Create lists.
title_list = []
price_list = []

#Create options for Webdriver.
options = webdriver.ChromeOptions()
#Use headless mode.
options.add_argument("--headless=new")

#Block Images and JavaScript.
prefs = {
    "profile.managed_default_content_settings.images": 2,
    "profile.managed_default_content_settings.javascript": 2,
}
options.add_experimental_option("prefs", prefs)

#Pass Options to Webdriver. The browser is started once and reused for
#every page instead of being relaunched on each iteration.
driver = webdriver.Chrome(options=options)
try:
    #The main loop: one navigation per result page.
    for pag in range(1, LAST_PAGE + 1):
        #Make a request.
        driver.get(SEARCH_URL.format(page=pag))
        time.sleep(1)

        #Parse source code.
        soup = BeautifulSoup(driver.page_source, "lxml")

        #Loop over products to get all data and store it in lists.
        for card in soup.find_all("div", {"class": "puis-card-container"}):
            title_list.append(text_or_missing(card, TITLE_CLASS))
            price_list.append(text_or_missing(card, PRICE_CLASS))
finally:
    #Close Webdriver, even when a page fails, so no Chrome process is leaked.
    driver.quit()

print("✔Project is done✔")

#Create CSV file.
with open("Amazontest.csv", "w", newline='', encoding='utf-8') as file:
    wr = csv.writer(file)
    wr.writerow(["Title", "prices"])
    wr.writerows(zip(title_list, price_list))

#Create TXT file. The context manager closes the file even if a write fails.
with open("Amazontest.txt", "w", newline='', encoding='utf-8') as file:
    file.write("Title" + "\t" + "prices" + "\n")
    for title, price in zip(title_list, price_list):
        file.write(f"Title: {title}\nPrice: {price}\n" + '*' * 70 + '\n')

#Count time(Ending point).
end = datetime.now()
print("excution taken: ", end - start)
#It's done.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
<h1>Amazon Scrape Project:</h1><br/> | ||
This is a web scraping project aimed at gathering data from Amazon regarding washing machines and saving it in TXT and CSV files. | ||
I used two different methods: the first one (Amazon.py) uses the selenium library in headless mode, and the second one (Amazon-req.py) uses the requests library. | ||
<hr /> | ||
<h2>How can you benefit from this project?</h2> | ||
1- You can fork it and modify it to use as a template for your own scraping project.<br /> | ||
2- You can understand how the scraping operation is performed. | ||
<hr /> | ||
I hope I have helped you in some way. I wish you success. |