-
Notifications
You must be signed in to change notification settings - Fork 86
/
Copy pathweb_scrapper.py
75 lines (53 loc) · 1.54 KB
/
web_scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python
# Requirements:
#   requests
#   bs4
#   selenium
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
# Ask the user which page to scrape and download it.
#
# ``raw_input`` only exists on Python 2; on Python 3 the same behaviour is
# provided by ``input``.  Pick whichever is available so the script starts on
# both, and never fall back to Python 2's evaluating ``input``.
try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3

# Strip stray whitespace that tends to come along when a URL is pasted.
url = _read_line("enter url: ").strip()
source = requests.get(url)
def get_chrome_web_driver(options):
    """Create a Chrome WebDriver that uses the given browser options.

    The driver executable is taken from ``./chromedriver``, a path relative
    to the current working directory.

    Args:
        options: Passed through as the driver's ``chrome_options``
            (presumably the object built by ``get_web_driver_options``).

    Returns:
        Whatever ``webdriver.Chrome`` returns for these arguments.

    NOTE(review): newer Selenium releases deprecate the ``chrome_options``
    keyword in favour of ``options`` -- confirm against the installed version.
    """
    driver_path = "./chromedriver"
    return webdriver.Chrome(driver_path, chrome_options=options)
def get_web_driver_options():
    """Return a fresh ``webdriver.ChromeOptions`` instance."""
    chrome_options = webdriver.ChromeOptions()
    return chrome_options
def set_ignore_certificate_error(options):
    """Add the ``--ignore-certificate-errors`` argument to ``options``.

    ``options`` is modified in place through its ``add_argument`` method;
    nothing is returned.
    """
    flag = '--ignore-certificate-errors'
    options.add_argument(flag)
def set_browser_as_incognito(options):
    """Add the ``--incognito`` argument to ``options``.

    ``options`` is modified in place through its ``add_argument`` method;
    nothing is returned.
    """
    flag = '--incognito'
    options.add_argument(flag)
# ---------------------------------------------------------------------------
# Inspect the page downloaded from the user-supplied URL.
#
# find() returns None when a tag is absent and tag['attr'] raises KeyError
# when an attribute is absent.  Arbitrary pages routinely lack an <h1>, a
# class on their first link, or a <div> inside their second link, so every
# lookup below is guarded instead of assuming one particular page layout.
# ---------------------------------------------------------------------------
soup = BeautifulSoup(source.text, 'html')

# Printing a Tag shows its full markup, tags included.
title = soup.find('title')
print("this is with html tags :", title)

# .text gives only the text content; print None when there is no <h1>.
qwery = soup.find('h1')
print("this is without html tags:", qwery.text if qwery is not None else None)

# First anchor on the page and two of its attributes.
links = soup.find('a')
print(links)
if links is not None:
    # .get() yields None for a missing attribute rather than raising.
    print(links.get('href'))
    print(links.get('class'))

# Every anchor on the page; show the count and the first six.
many_link = soup.find_all('a')
total_links = len(many_link)
print("total links in my website :", total_links)
print()
for i in many_link[:6]:
    print(i)

# Drill into the second anchor, but only if the page has one.
if total_links > 1:
    second_link = many_link[1]
    print(second_link)
    print()
    print("href is :", second_link.get('href'))

    nested_div = second_link.find('div')
    print(nested_div)
    print()

    if nested_div is not None:
        # A missing class attribute becomes an empty list so the join below
        # still works.
        z = nested_div.get('class', [])
        print(z)
        print(type(z))
        print()
        print("class name of div is :", " ".join(z))
# ---------------------------------------------------------------------------
# Second demo: fetch the English Wikipedia article on World War II and print
# its title tag, every <div class="toc"> and every
# <table class="infobox vevent"> found in the page.
# ---------------------------------------------------------------------------
WW2_URL = "https://en.wikipedia.org/wiki/World_War_II"

wiki = requests.get(WW2_URL)
soup = BeautifulSoup(wiki.text, 'html')

page_title = soup.find('title')
print(page_title)

# Text of each matching table-of-contents container.
ww2_contents = soup.find_all("div", class_='toc')
for toc_block in ww2_contents:
    print(toc_block.text)

# Text of each matching infobox table.
overview = soup.find_all('table', class_='infobox vevent')
for infobox in overview:
    print(infobox.text)