-
Notifications
You must be signed in to change notification settings - Fork 29
/
Copy pathparse_slack.py
executable file
·59 lines (46 loc) · 1.83 KB
/
parse_slack.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env python
"""
Parses a Slack export folder (data/slack_export/), cleans up the messages written by the configured user,
and then writes them, one message per line, to data/corpus/slack_msgs.txt.
"""
import json
import os
import re
from typing import List
# Username whose messages are exported when this file is run as a script; it is
# compared against each message's user_profile["name"] field.
USERNAME_TO_EXPORT = "erik"
def extract_slack_msgs(export_root_path: str, username: str) -> List[str]:
    """Extract the cleaned text of every message written by ``username``.

    Walks ``export_root_path`` recursively, loads every ``.json`` file found
    and collects the ``text`` of each entry whose ``user_profile["name"]``
    equals ``username``. Each text is passed through ``clean_slack_msg``
    before being returned.

    Args:
        export_root_path: Root directory of the export to scan.
        username: Value of ``user_profile["name"]`` to match.

    Returns:
        The cleaned messages, in sorted directory / file-name order.

    Raises:
        json.JSONDecodeError: If a ``.json`` file does not hold valid JSON.
    """
    msgs: List[str] = []
    for dirpath, dirnames, filenames in os.walk(export_root_path):
        # Sorting in place makes os.walk descend in a stable order, so the
        # result does not depend on filesystem enumeration order.
        dirnames.sort()
        for filename in sorted(filenames):
            # Stray non-JSON files would make json.load raise; skip them.
            if not filename.lower().endswith(".json"):
                continue
            with open(os.path.join(dirpath, filename), encoding="utf8") as file:
                channel = json.load(file)
            if not isinstance(channel, list):
                # Not a list of message objects, nothing to extract here.
                continue
            for msg in channel:
                try:
                    user = msg["user_profile"]["name"]
                    text = msg["text"]
                except (KeyError, TypeError):
                    # there's some other junk that we don't care about
                    # (entries without a profile/text, or non-dict entries)
                    continue
                if user == username and isinstance(text, str):
                    msgs.append(clean_slack_msg(text))
    return msgs
def clean_slack_msg(text: str) -> str:
    """Clean up a slack msg to do NLP on it.

    The result is a single line of lowercase ASCII letters and digits
    separated by single spaces.

    Args:
        text: Raw message text.

    Returns:
        The normalized message; may be an empty string when nothing but
        tags, emojis or punctuation was present.
    """
    text = re.sub(r"<.*?>", "", text)  # filter slack tags
    # Slack escapes &, < and > as HTML entities; drop them after the tag pass
    # so they do not turn into the junk tokens "amp", "lt" and "gt".
    text = re.sub(r"&(?:amp|lt|gt);", " ", text)
    text = re.sub(r":\S*:", "", text)  # filter emojis
    # don't bother with contractions (straight and typographic apostrophes)
    text = re.sub(r"['\u2019]", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)  # get rid of other non text
    # Collapse every whitespace run, including a lone newline or tab, so that
    # a message always fits on a single line.
    text = re.sub(r"\s+", " ", text)
    return text.strip().lower()
if __name__ == "__main__":
    # Script entry point: export USERNAME_TO_EXPORT's messages to the corpus file.
    slack_export_root = "data/slack_export"
    corpus_path = "data/corpus/slack_msgs.txt"
    texts = extract_slack_msgs(slack_export_root, USERNAME_TO_EXPORT)
    # Make sure the output directory exists so open() does not fail on a
    # fresh checkout.
    os.makedirs(os.path.dirname(corpus_path), exist_ok=True)
    # write each msg as a line to a file; messages that cleaned down to an
    # empty string are skipped so the corpus has no blank lines
    with open(corpus_path, "w", encoding="utf8") as f:
        f.writelines(text + "\n" for text in texts if text)