updater.py
"""Updater Lambda for the bot behind LGUHUANGLIBOT_LAMBDA_NAME.

Scrapes the www.cuhk.edu.cn main site and the SME/SSE/HSS sub-sites for news
links not yet recorded in news.txt, appends their NER entries, regenerates the
word bank and merged data with newstools, and redeploys the rebuilt package to
the bot's Lambda function.
"""
import logging
import os
import shutil
import zipfile
from concurrent.futures import ThreadPoolExecutor, wait
from threading import Thread
from urllib.request import urlopen

import boto3
import requests
from bs4 import BeautifulSoup

import newstools

logger = logging.getLogger()
logger.setLevel(logging.INFO)
def get_links_in_page(page_url, legacy=True, starts_with_str=None):
    """Return the absolute news links found on a listing page.

    Legacy (HTML) pages are scraped for <a> tags whose href starts with
    ``starts_with_str``; non-legacy pages are JSON API responses.
    """
    page_response = requests.get(page_url)
    if legacy:
        page_text = page_response.text
        soup = BeautifulSoup(page_text, 'html.parser')
        # The first 22 characters of page_url are the scheme and host
        # (e.g. 'http://sme.cuhk.edu.cn'), so prefixing them turns relative
        # hrefs into absolute links. Anchors without an href are skipped.
        news_links = [page_url[:22] + x.get('href') for x in soup.find_all('a')
                      if x.get('href') and x.get('href').startswith(starts_with_str)]
    else:
        page_json = page_response.json()
        news_links = ['http://www.cuhk.edu.cn' + x['link'] for x in page_json['data']['lists']]
    return news_links
def add_ner_entry(link, lst):
    """Fetch the NER entry for a single link and append it to lst."""
    lst.append(newstools.get_ner_entry(link))


def _get_updates0(page_link, pool, futures, original_links_set, updates, legacy=True, starts_with_str=None):
    """Queue NER extraction for any links on page_link that are not yet known."""
    links = set(get_links_in_page(page_link, legacy=legacy, starts_with_str=starts_with_str))
    new_links = links - original_links_set
    logger.info('New links from {}: {}'.format(page_link, new_links))
    if new_links:
        for link in new_links:
            future = pool.submit(add_ner_entry, link, updates)
            futures.append(future)
def get_updates(original: list):
    """Check every monitored listing page and return NER entries for new links.

    ``original`` holds the existing entries; the first element of each entry
    is its link, which is used to decide what counts as new.
    """
    original_links_set = set(map(lambda x: x[0], original))
    pool = ThreadPoolExecutor()
    futures = []
    updates = []
    # Main site
    _get_updates0('http://www.cuhk.edu.cn/zh-hans/api/lists?page=0&type=all', pool, futures, original_links_set,
                  updates, legacy=False)
    # SME
    _get_updates0('http://sme.cuhk.edu.cn/zh-hans/sme/news?page=0', pool, futures, original_links_set, updates,
                  legacy=True, starts_with_str='/zh-hans/news/')
    # SSE
    _get_updates0('http://sse.cuhk.edu.cn/zh-hans/sse/news-events?page=0', pool, futures, original_links_set, updates,
                  legacy=True, starts_with_str='/zh-hans/node/')
    # HSS
    _get_updates0('http://hss.cuhk.edu.cn/zh-hans/subsite/common/lists/event/10/event/0?page=0', pool, futures,
                  original_links_set, updates,
                  legacy=True, starts_with_str='/zh-hans/node/')
    _get_updates0('http://hss.cuhk.edu.cn/zh-hans/subsite/common/lists/news/10/news/90?page=0', pool, futures,
                  original_links_set, updates,
                  legacy=True, starts_with_str='/zh-hans/node/')
    _get_updates0('http://hss.cuhk.edu.cn/zh-hans/subsite/common/lists/news/10/news/89?page=0', pool, futures,
                  original_links_set, updates,
                  legacy=True, starts_with_str='/zh-hans/node/')
    wait(futures)
    return updates
def update_news_file(file='news.txt') -> bool:
    """Append newly found entries to the news file.

    Each line of the file is the repr() of one entry, so existing lines are
    read back with eval() and new entries are written with repr().
    Returns True if anything was added.
    """
    original = []
    with open(file) as fle:
        for line in fle:
            line = line.strip()
            if not line:
                continue
            original.append(eval(line))
    updates = get_updates(original)
    if updates:
        with open(file, 'a') as fle:
            for update in updates:
                fle.write(repr(update) + '\n')
                logger.info('Saved: {}'.format(update[0]))
        return True
    else:
        return False
def lambda_handle(event, context):
    """Entry point for the updater Lambda.

    Downloads the bot's current deployment package and its data files from S3
    in parallel, refreshes news.txt, regenerates the word bank and merged
    data, then uploads the rebuilt package back to the bot's Lambda function.
    """
    lambda_client = boto3.client('lambda')

    def process_bot_code():
        # Download and unpack the bot's current deployment package into /tmp/work.
        logger.info('Preparing to download bot code')
        code_url = lambda_client.get_function(FunctionName=os.environ['LGUHUANGLIBOT_LAMBDA_NAME'])['Code']['Location']
        code_zip_data = urlopen(code_url).read()
        with open('/tmp/code.zip', 'wb') as fle:
            fle.write(code_zip_data)
        logger.info('Preparing to extract bot code')
        os.system('mkdir /tmp/work')
        with zipfile.ZipFile('/tmp/code.zip') as fle:
            fle.extractall('/tmp/work')

    bot_code_thread = Thread(target=process_bot_code, name='process_bot_code')
    bot_code_thread.start()
    logger.info('Preparing to update')

    def process_s3():
        # Fetch the custom word list and templates from the data bucket.
        logger.info('Preparing to get update from S3')
        os.mkdir('/tmp/s3')
        bucket = boto3.resource('s3').Bucket(os.environ['LGUHUANGLIBOT_DATA_BUCKET_NAME'])
        bucket.download_file('custom.txt', '/tmp/s3/custom.txt')
        bucket.download_file('templates.txt', '/tmp/s3/templates.txt')

    s3_thread = Thread(target=process_s3, name='process_s3')
    s3_thread.start()
    bot_code_thread.join()
    s3_thread.join()
    logger.info('Preparing to get update from news')
    update_news_file('/tmp/work/news.txt')
    logger.info('Preparing to regenerate word bank and merged data')
    os.chdir('/tmp/work')
    shutil.copy('/tmp/s3/custom.txt', '/tmp/work/custom.txt')
    shutil.copy('/tmp/s3/templates.txt', '/tmp/work/templates.txt')
    newstools.generate_word_bank('news.txt', custom='custom.txt', noref_output='noref.txt', output='wordbank.txt')
    newstools.generate_merged_data('wordbank.txt', noref_words_file='noref.txt', templates_file='templates.txt',
                                   output='merged.json')
    logger.info('Preparing to make deploy.zip')
    shutil.make_archive('/tmp/deploy', 'zip', '/tmp/work')
    logger.info('Preparing to upload')
    with open('/tmp/deploy.zip', 'rb') as fle:
        lambda_client.update_function_code(FunctionName=os.environ['LGUHUANGLIBOT_LAMBDA_NAME'],
                                           ZipFile=fle.read())
    logger.info('Done. Returning')
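

if __name__ == '__main__':
    # Minimal local sketch, not part of the Lambda flow: it assumes a news.txt
    # file in the current directory and network access to the listing pages,
    # and runs only the scrape-and-append step, without touching S3 or Lambda.
    logging.basicConfig(level=logging.INFO)
    changed = update_news_file('news.txt')
    logger.info('news.txt updated: {}'.format(changed))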