-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathfind_morals.py
70 lines (59 loc) · 2.4 KB
/
find_morals.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# (c) Facebook, Inc. and its affiliates. Confidential and proprietary.
import pandas as pd
import numpy as np
from tqdm import tqdm
from glob import glob
import multiprocessing
import re
from collections import defaultdict
import argparse
import os
def get_moral_dict(path='resources/Enhanced_Morality_Lexicon_V1.1.txt'):
    """Parse the morality lexicon into {moral_foundation: set of tokens}.

    Each lexicon line is pipe-delimited; field 0 carries the token after an
    8-character prefix and field 4 carries the moral foundation after a
    9-character prefix (format of the Enhanced Morality Lexicon file).
    """
    foundation_tokens = defaultdict(set)
    with open(path, 'r') as infile:
        for raw_line in infile:
            fields = raw_line.split('|')
            word = fields[0][8:]
            foundation = fields[4][9:]
            foundation_tokens[foundation].add(word)
    return dict(foundation_tokens)
def setdict_to_regexes(dic):
    """Convert {key: set-of-tokens} into {key: alternation regex string}.

    Each token is lowercased, regex-escaped, and wrapped in ``\\b`` word
    boundaries, then all tokens for a key are joined into one alternation.

    Fix: tokens are now passed through ``re.escape`` — the previous code
    injected them into the pattern verbatim, so any lexicon entry containing
    a regex metacharacter (``.``, ``*``, ``(`` …) silently corrupted the
    pattern instead of matching literally.
    """
    regexes = {}
    for key, tokens in dic.items():
        escaped = [re.escape(token.lower()) for token in tokens]
        regexes[key] = r'\b' + r'\b|\b'.join(escaped) + r'\b'
    return regexes
def worker(name, df, regexes, args):
    """Count occurrences of regexes[name] in every row of df[args.column].

    Saves the per-row match counts as a ``.npy`` array: under
    ``<args.indir>_agg/`` when an input directory was given, otherwise under
    the default Reddit-QA corpus directory. Intended to run either serially
    or as the target of a multiprocessing.Process (one process per regex).
    """
    which = args.column
    print('starting', name)
    # Hoist the compiled pattern out of the loop instead of re-parsing it
    # (via re.findall) for every row.
    pattern = re.compile(regexes[name], flags=re.IGNORECASE)
    counts = np.zeros(len(df), dtype=int)
    for i, txt in tqdm(enumerate(df[which].fillna('').values), total=len(df)):
        counts[i] = len(pattern.findall(txt))
    if len(args.indir):
        outdir = args.indir + "_agg"
        # exist_ok=True: sibling worker processes race to create this
        # directory; the old exists()-then-makedirs() check could raise
        # FileExistsError when two workers passed the check simultaneously.
        os.makedirs(outdir, exist_ok=True)
        np.save(os.path.join(outdir, "%s_%s.npy" % (which, name)), counts)
    else:
        np.save('data/raw/Reddit-QA-Corpus/%s_%s.npy' % (which, name), counts)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--indir", type=str, default="",
                        help="Directory of *.csv files to score instead of the default Reddit corpus")
    parser.add_argument("--column", type=str, default='questions',
                        help="DataFrame column to search for moral-foundation terms")
    parser.add_argument("--multiprocessing", action="store_true", help="Use multiprocessing")
    args = parser.parse_args()

    if len(args.indir):
        print("using", args.indir)
        df = pd.concat([pd.read_csv(fn) for fn in sorted(glob(os.path.join(args.indir, '*.csv')))])
    else:
        # Only read the default corpus when no --indir was given. The original
        # opened these two files unconditionally (crashing when they are absent
        # even though --indir made them unnecessary) and never closed them.
        with open('data/raw/Reddit-QA-Corpus/Questions_R.txt', 'r') as qfile:
            questions = qfile.readlines()
        with open('data/raw/Reddit-QA-Corpus/Answers_R.txt', 'r') as afile:
            answers = afile.readlines()
        df = pd.DataFrame.from_dict({'questions': questions, 'answers': answers}, orient='columns')

    regexes = setdict_to_regexes(get_moral_dict())
    if args.multiprocessing:
        jobs = []
        for regex_name in regexes:
            p = multiprocessing.Process(target=worker, args=(regex_name, df, regexes, args))
            jobs.append(p)
            p.start()
        # Block until every worker finishes so the script only exits once all
        # .npy outputs are written (the original never joined its processes).
        for p in jobs:
            p.join()
    else:
        for regex_name in regexes:
            worker(regex_name, df, regexes, args)