hotel_link_extracttion_updated.py
import http.client
import os
import re
import sys
from urllib.request import urlopen

from bs4 import BeautifulSoup
'''
This program extracts the links to all the hotel pages for a given location
on TripAdvisor (here, New York City).
Even if your connection fails, don't worry: the script takes care of that.
The only painful part is that it then starts again from the beginning. At the
end it gives you only the distinct hotel links.
You just have to supply the first page of the hotel listing in the URL, and
you don't need to worry about how many pages of results your location has.
One more point: if your location has only one page of results (fewer than 30
hotels), come on, do it by eye. Don't disturb this code.
'''
# Hotel links are print()-ed as they are found, so capture stdout in a file.
orig_stdout = sys.stdout
f = open('input_file.txt', 'w')
sys.stdout = f

# Give the first page of the hotel list for your location, i.e. the page that
# comes up when you search for the location plus "Hotels" (here, New York City).
main_url = 'https://www.tripadvisor.com.sg/Hotels-g60763-New_York_City_New_York-Hotels.html'
def hotellinst(url):
    thepage = urlopen(url).read().decode('utf8')
    soup = BeautifulSoup(thepage, "html.parser")
    # The "last page" link in the pagination bar carries the offset of the
    # final result page (the original code pulled this number out with
    # fixed-position string slicing of the serialized tag, which points at a
    # data-offset attribute in the markup this script was written against).
    # With 30 hotels per page, the page count follows directly.
    link = soup.find_all(attrs={"class": "pageNum last taLnk "})
    offset = int(re.search(r'data-offset="(\d+)"', str(link[0])).group(1))
    max_len = offset // 30 + 1
    for j in range(1, max_len + 1):
        # Page 1 is main_url itself; page j splices an "-oa<offset>" token in
        # after the geo id, with offsets counted in steps of 30.
        if j == 1:
            page_url = url
        else:
            prefix = '-'.join(main_url.split('-')[:2])
            page_url = prefix + '-oa' + str((j - 1) * 30) + main_url[len(prefix):]
        thepage = urlopen(page_url).read().decode('utf8')
        soup = BeautifulSoup(thepage, "html.parser")
        listings = soup.find_all(attrs={"class": "prw_rup prw_meta_hsx_responsive_listing bottom-sep"})
        # The href alone is enough; there is no need to fetch each hotel page here.
        for listing in listings:
            urlhotel = "http://www.tripadvisor.com" + listing.find('a')['href']
            print(urlhotel)
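# Worked example of the pagination pattern above (for illustration only):
# with the main_url given here, '-'.join(main_url.split('-')[:2]) is
# 'https://www.tripadvisor.com.sg/Hotels-g60763', so page 2 of the listing is
# https://www.tripadvisor.com.sg/Hotels-g60763-oa30-New_York_City_New_York-Hotels.html
# and page 3 swaps in -oa60, and so on in steps of 30.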
def have_internet():
    # Probe connectivity with a cheap HEAD request to Google; if we are
    # online, run the crawl. Any network failure, including one that hits
    # mid-crawl, lands in the except branch.
    conn = http.client.HTTPConnection("www.google.com", timeout=1)
    try:
        conn.request("HEAD", "/")
        hotellinst(main_url)
        return True
    except Exception:
        return False
    finally:
        conn.close()

# If the connection dies mid-run, start over from the beginning (the
# "painful part" mentioned in the docstring).
while not have_internet():
    pass
sys.stdout = orig_stdout
f.close()
def remove_duplicate_lines(input_path, output_path):
    # Refuse to clobber an existing output file.
    if os.path.isfile(output_path):
        raise OSError('File at {} (output file location) exists.'.format(output_path))
    with open(input_path, 'r') as input_file, open(output_path, 'w') as output_file:
        # Store hashes rather than the lines themselves to keep memory low;
        # a hash collision could, in principle, drop a distinct line.
        seen_lines = set()
        for line in input_file:
            if hash(line) not in seen_lines:
                seen_lines.add(hash(line))
                output_file.write(line)
remove_duplicate_lines('input_file.txt', 'output_file_all.txt')
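# The original script imported pandas without ever using it; presumably the
# links were meant to be loaded downstream for the review-scraping step. A
# minimal sketch of reading the deduplicated links back, assuming one URL per
# line (the column name 'hotel_url' is made up for illustration):
import pandas as pd

links = pd.read_csv('output_file_all.txt', header=None, names=['hotel_url'])
print('{} distinct hotel pages found'.format(len(links)))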