-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path数据(json)的提取_requests.py
76 lines (63 loc) · 2.33 KB
/
数据(json)的提取_requests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# -*- coding: utf-8 -*-
"""
Created on Sat May 1 09:35:20 2021
@author: Team317
"""
import time
import requests
import json
from bs4 import BeautifulSoup
# Endpoint of the Zhihu recommend-feed API. The trailing '?' is kept so a
# query string can be appended directly to it.
base_url = 'https://www.zhihu.com/api/v3/feed/topstory/recommend?'

# HTTP headers sent with every request.
# NOTE(review): both values end in '***' and look redacted; presumably they
# must be replaced with a real cookie / user-agent before running - confirm.
headers = {
    'cookie': '_zap=787a4cc3-5307-4***',
    'user-agent': 'Mozilla/5.0 (Windows***',
}

# `page` and `after_id` can be adjusted as needed.
page, after_id = 0, 6

# Query parameters for the request; insertion order is the order in which
# they appear in the query string.
params = dict(
    session_token='7ff1929781f57d1262b18480fe3011c2',
    desktop='true',
    page_number=page,
    limit='6',
    action='down',
    after_id=after_id,
    ad_interval='1',
)
def get_page(params, base_url, headers, timeout=10):
    """Fetch one page of JSON from ``base_url`` using ``params`` as the query.

    Args:
        params: Mapping of query-parameter names to values.
        base_url: Endpoint URL; a trailing ``?`` is accepted.
        headers: Mapping of HTTP request headers.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body when the server answers with status 200.
        On any failure (network error, timeout, non-200 status, body that
        is not valid JSON) a short diagnostic is printed and an empty list
        is returned, so callers always get a falsy value on failure.
    """
    try:
        # Let requests build the query string so that every value is
        # percent-encoded instead of being concatenated verbatim.
        response = requests.get(
            base_url, params=params, headers=headers, timeout=timeout
        )
    except requests.RequestException as e:
        # Covers connection errors as well as timeouts and invalid URLs.
        print('Error', e.args)
        return []

    if response.status_code != 200:
        print('Error', response.status_code)
        return []

    try:
        return response.json()
    except ValueError as e:
        # The server answered 200 but the body was not valid JSON.
        print('Error', e.args)
        return []
# Fetch the JSON payload of the recommend feed.
payload = get_page(params, base_url, headers)

# Keep a pretty-printed copy on disk so the structure can be inspected.
with open('recommend.json', 'w', encoding='utf-8') as file:
    file.write(json.dumps(payload, indent=2, ensure_ascii=False))

# Read the saved copy back in.
with open('recommend.json', 'r', encoding='utf-8') as file:
    payload = json.load(file)

# get_page yields a non-dict value (e.g. an empty list) when the request
# failed, so only look up 'data' when the payload really is a JSON object.
data = (payload.get('data') or []) if isinstance(payload, dict) else []
num = len(data)

# Next, extract the wanted information. This requires inspecting the layout
# of the JSON data and locating the needed fields step by step.
# The JSON carries no ready-made link for each recommendation, so the link
# is assembled from the ids found in the data.
# Links look like: https://www.zhihu.com/question/377886499/answer/1849697584
for item in data:
    # The 'target' field holds the main link information.
    target = item.get('target')
    if not target:
        continue
    answer_id = target.get('id')

    # Entries without a 'question' field are not question/answer
    # recommendations, so no link can be built for them.
    question = target.get('question')
    if question is None:
        continue
    question_id = question.get('id')

    # Assemble the link to the recommended content.
    url = 'https://www.zhihu.com/question/{q_id}/answer/{id}'.format(q_id=question_id, id=answer_id)
    title = question.get('title')
    print('{title}\n{url}\n'.format(title=title, url=url))