-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtieba_cat.py
96 lines (78 loc) · 3.32 KB
/
tieba_cat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import json
import os
import sys
from urllib.parse import urljoin

import requests
from lxml import etree
class Tieba_cat(object):
    """Crawl a Baidu Tieba forum, recording threads and downloading images.

    Each thread is written to ``tieba.json`` as one JSON object per line
    (followed by a comma), and every image found in a thread is saved under
    the ``image`` directory.
    """

    # Base against which relative and protocol-relative links are resolved.
    BASE_URL = 'http://tieba.baidu.com/'

    def __init__(self, tieba_name):
        """Build the start URL and request headers, and open the output file.

        Args:
            tieba_name: Forum name, inserted as the ``kw`` query value.
        """
        self.url = 'http://tieba.baidu.com/f?kw={}'.format(tieba_name)
        print(self.url)
        self.headers = {
            # Original author's note: with a Chrome User-Agent the page HTML
            # comes back commented out, so an older IE User-Agent is sent.
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0) '
        }
        self.file = open('tieba.json', 'w', encoding='utf8')

    def get_data(self, url, timeout=10):
        """Fetch *url* with the crawler's headers and return the raw body.

        Args:
            url: Address to request.
            timeout: Seconds to wait before ``requests`` gives up; without a
                timeout a stalled connection would block the crawl forever.

        Returns:
            The response body as bytes.
        """
        response = requests.get(url, headers=self.headers, timeout=timeout)
        return response.content

    def parse_list_page(self, data):
        """Extract thread entries and the next-page URL from a list page.

        Args:
            data: HTML source of a forum list page.

        Returns:
            A ``(data_list, next_url)`` tuple. ``data_list`` holds dicts with
            ``title`` and ``url`` keys; ``next_url`` is ``None`` when the
            pager link cannot be found.
        """
        # Turn the page source into an element tree.
        html = etree.HTML(data)
        # Anchor nodes of the individual threads.
        node_list = html.xpath(
            '//li[@class=" j_thread_list clearfix"]/div/div[2]/div[1]/div[1]/a')

        data_list = []
        for node in node_list:
            titles = node.xpath('./text()')
            hrefs = node.xpath('./@href')
            # Skip anchors without text or target instead of raising
            # IndexError and aborting the whole page.
            if not titles or not hrefs:
                continue
            data_list.append({
                'title': titles[0],
                # urljoin avoids the double slash produced by concatenating
                # the base with a root-relative href.
                'url': urljoin(self.BASE_URL, hrefs[0]),
            })

        # The second-to-last pager anchor is taken as the "next page" link
        # (presumably absent on the final page -- verify against live markup).
        pager_links = html.xpath(
            '//*[@id="frs_list_pager"]/a[last()-1]/@href')
        if pager_links:
            # Handles protocol-relative ("//host/path") and relative hrefs.
            next_url = urljoin(self.BASE_URL, pager_links[0])
        else:
            next_url = None
        return data_list, next_url

    def parse_detail_page(self, data):
        """Return the ``src`` of every post image found in a thread page.

        Args:
            data: HTML source of a thread page.

        Returns:
            A list of image URLs (possibly empty).
        """
        html = etree.HTML(data)
        image_list = html.xpath(
            '//div[@class="p_content p_content p_content_nameplate"]/cc/*[@class="d_post_content j_d_post_content clearfix"]/img/@src'
        )
        return image_list

    def download(self, image_list):
        """Download every URL in *image_list* into the ``image`` directory.

        Args:
            image_list: Iterable of image URLs.
        """
        os.makedirs('image', exist_ok=True)
        for url in image_list:
            # Use the last path segment as the file name, without any query
            # string ("?" is not a valid file-name character everywhere).
            name = url.split('/')[-1].split('?')[0]
            if not name:
                # URL ends with "/": there is no usable file name.
                continue
            filename = 'image' + os.sep + name
            data = self.get_data(url)
            with open(filename, 'wb') as f:
                f.write(data)

    def save_data(self, data):
        """Append *data* to the output file as a JSON line ending in a comma.

        Args:
            data: JSON-serialisable object describing one thread.
        """
        str_data = json.dumps(data, ensure_ascii=False) + ',\n'
        self.file.write(str_data)

    def close(self):
        """Close the output file if it was opened; safe to call repeatedly."""
        output = getattr(self, 'file', None)
        if output is not None and not output.closed:
            output.close()

    def __del__(self):
        # ``file`` may be missing when ``open`` failed inside ``__init__``,
        # so go through ``close`` rather than touching the attribute directly.
        self.close()

    def run(self):
        """Crawl list pages from ``self.url`` until no next page is reported.

        For every thread on every list page, the thread page is fetched, its
        images are downloaded, and the thread record (with an added
        ``images`` key) is written to the output file.
        """
        next_url = self.url
        visited = set()
        # Stop when there is no next page, or when the pager points back to a
        # page already crawled (which would otherwise loop forever).
        while next_url and next_url not in visited:
            visited.add(next_url)
            data = self.get_data(next_url)
            # Thread entries on this page plus the URL of the following page.
            detail_list, next_url = self.parse_list_page(data)
            for detail in detail_list:
                # Fetch the thread page and collect its images.
                detail_data = self.get_data(detail['url'])
                image_list = self.parse_detail_page(detail_data)
                self.download(image_list)
                # Persist the thread record together with its image URLs.
                detail['images'] = image_list
                self.save_data(detail)
if __name__ == '__main__':
    # Script entry point: crawl the "猫" forum from its first list page.
    crawler = Tieba_cat('猫')
    crawler.run()