-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathconvert_collection_to_jsonl.py
72 lines (55 loc) · 2.25 KB
/
convert_collection_to_jsonl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
'''Converts MSMARCO's tsv collection to Anserini jsonl files.'''
import json
import os
from absl import app
from absl import flags
# Global flag registry; values are populated once absl has parsed argv.
FLAGS = flags.FLAGS

# Inputs: the collection to convert and the predicted queries to attach.
flags.DEFINE_string(
    name='collection_path',
    default=None,
    help='MS MARCO .tsv collection file.')
flags.DEFINE_string(
    name='predictions',
    default=None,
    help='Query Predictions that will be attached to documents.')

# Output location for the generated jsonl files.
flags.DEFINE_string(
    name='output_folder',
    default=None,
    help='Folder to write the jsonl files.')

# Number of consecutive prediction lines that belong to each document.
flags.DEFINE_integer(
    name='beam_size',
    default=None,
    help='Number of predictions made per document in the predictions file.')

# Sharding: a new output file is started after this many documents.
flags.DEFINE_integer(
    name='max_docs_per_file',
    default=1000000,
    help='Maximum number of documents in each jsonl file.')
def convert_collection():
    '''Appends predicted queries to each document and writes jsonl shards.

    Reads FLAGS.collection_path line by line, where each line holds a document
    id and the document text separated by a single tab. For every document,
    FLAGS.beam_size lines are consumed from FLAGS.predictions, joined with
    spaces and appended to the document text. Each document is written as one
    JSON object ({'id': ..., 'contents': ...}) per line into files named
    docs00.json, docs01.json, ... inside FLAGS.output_folder; a new file is
    started every FLAGS.max_docs_per_file documents.

    All files are closed even if an error interrupts the conversion, and an
    empty collection produces no output file instead of failing.

    Raises:
      ValueError: if a collection line does not split into exactly two
        tab-separated fields.
    '''
    print('Converting collection...')

    file_index = 0
    # Stays None until the first document is seen, so that an empty
    # collection does not fail when closing the (never opened) output file.
    output_jsonl_file = None

    try:
        # Predictions are opened first, as before, so a missing predictions
        # file is still the first error reported.
        with open(FLAGS.predictions) as predictions_file, \
                open(FLAGS.collection_path) as collection_file:
            for i, line in enumerate(collection_file):
                # Start writing to a new file when the current one has
                # reached its maximum capacity.
                if i % FLAGS.max_docs_per_file == 0:
                    if output_jsonl_file is not None:
                        output_jsonl_file.close()
                    output_path = os.path.join(
                        FLAGS.output_folder,
                        'docs{:02d}.json'.format(file_index))
                    output_jsonl_file = open(output_path, 'w')
                    file_index += 1

                doc_id, doc_text = line.rstrip().split('\t')

                # Read this document's predictions and merge them into the
                # original document text.
                pred_text = ' '.join(
                    predictions_file.readline().strip()
                    for _ in range(FLAGS.beam_size))
                pred_text = pred_text.replace(' / ', ' ')
                text = doc_text + ' ' + pred_text

                output_dict = {'id': doc_id, 'contents': text}
                output_jsonl_file.write(json.dumps(output_dict) + '\n')

                if i % 100000 == 0:
                    print('Converted {} docs in {} files'.format(
                        i, file_index))
    finally:
        # The output file changes during the loop, so it cannot be managed
        # by a single `with`; close whichever shard is currently open.
        if output_jsonl_file is not None:
            output_jsonl_file.close()
def main(_):
    '''Creates the output folder if needed and runs the conversion.

    Args:
      _: argument passed in by `app.run`; unused.
    '''
    # exist_ok=True replaces the separate existence check, which could race
    # with another process creating the folder between check and creation.
    os.makedirs(FLAGS.output_folder, exist_ok=True)

    convert_collection()
    print('Done!')
if __name__ == '__main__':
    # These flags have no default value, so the script cannot run without
    # them; absl reports a usage error if any of them is missing.
    flags.mark_flags_as_required([
        'collection_path',
        'predictions',
        'output_folder',
        'beam_size',
    ])
    app.run(main)