# load_original_data (121 lines, 5.13 KB)
# NOTE: the GitHub page header and line-number gutter that preceded the code
# in this copy were scrape residue, not part of the script, and were removed.
import os
import json
import pickle
import torch
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import numpy as np
import random
import clip
import re
import string
# Punctuation characters that load_dataset strips from captions before
# tokenization. Original author's note: CLIP's tokenizer copes poorly with long
# character runs, especially digits and symbols -- every single digit is
# treated as its own token.
punctuation_string = string.punctuation
# Original author's note: CLIP also tokenizes pinyin and rare words poorly,
# which easily pushes a caption past the maximum token length (76 per the
# original note), e.g. "PsittacosaurusYou" is split into 7 pieces.
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Loads the CLIP model and its image preprocessing transform at import time.
# Backbones listed by the original author:
# ['RN50', 'RN101', 'RN50x4', 'RN50x16', 'ViT-B/32', 'ViT-B/16']
# NOTE(review): only `preprocess` is used in the visible code; `model` may be
# used elsewhere -- confirm before removing.
model, preprocess = clip.load("RN50", device=device)
def _clean_caption(text, punct_re, digit_re, max_words=50):
    """Keep the first ``max_words`` space-separated words, then drop punctuation and digits."""
    text = ' '.join(text.split(' ')[:max_words])
    text = punct_re.sub('', text)
    return digit_re.sub('', text)


def _process_split(loc, split_file):
    """Tokenize captions and preprocess images for every sample in one split file.

    Each non-blank line of ``split_file`` must look like ``<sample_id>:<label>``.
    The caption is read from ``<loc>/text/<sample_id>.json`` and the image from
    ``<loc>/wiki_image/<sample_id>.jpg``.

    A sample is skipped (and counted) when its caption cannot be tokenized by
    CLIP or its label is not an integer. All four lists are appended to only
    after every step succeeded, so they always stay index-aligned.

    Returns:
        Tuple ``(images, captions, labels, ids)`` of equally long lists.
    """
    # re.escape is required: string.punctuation contains ``\``, ``]``, ``^``
    # and ``-``, which are special inside a character class. Without escaping,
    # ``\]`` is read as an escaped ``]`` and backslashes are never removed.
    punct_re = re.compile('[{}]'.format(re.escape(punctuation_string)))
    digit_re = re.compile(r'\d')

    images, captions, labels, ids = [], [], [], []
    skipped = 0
    with open(split_file, 'r') as split_fh:
        for line in split_fh:
            entry = line.strip()
            if not entry:
                # Tolerate blank/trailing lines instead of failing the unpack.
                continue
            sample_id, label = entry.split(':')

            text_path = os.path.join(loc, 'text', sample_id + '.json')
            with open(text_path, 'r', encoding='UTF_8') as text_fh:
                text = json.load(text_fh)
            text = _clean_caption(text, punct_re, digit_re)

            try:
                # clip.tokenize raises RuntimeError for captions that exceed
                # the model's context length; int() raises ValueError for a
                # malformed label. Both mean "skip this sample".
                token_ids = clip.tokenize(text).detach().cpu().numpy()
                label_id = int(label)
            except (RuntimeError, ValueError):
                skipped += 1
                continue

            image_path = os.path.join(loc, 'wiki_image', sample_id + '.jpg')
            # The context manager closes the underlying file once the pixels
            # have been turned into a tensor.
            with Image.open(image_path) as image:
                pixels = preprocess(image).unsqueeze(0).numpy()

            images.append(pixels)
            captions.append(token_ids)
            labels.append(label_id)
            ids.append(sample_id)

    if skipped:
        print('Skipped {} sample(s) from {}'.format(skipped, split_file))
    return images, captions, labels, ids


def load_dataset(name='wikipedia_dataset', root='/u01/isi/zzx/benchpark/'):
    """
    Preprocess the test, train and validation splits of a dataset with CLIP
    and pickle each split into the current working directory.

    For every split the captions are truncated to 50 words, stripped of
    punctuation and digits and tokenized with ``clip.tokenize``; the images
    are run through the CLIP ``preprocess`` transform. The results are written
    as ``test.pkl``, ``train.pkl`` and ``dev.pkl``, each a dict with the keys
    ``'image'``, ``'text'``, ``'label'`` and ``'ids'``.

    Args:
        name: Dataset directory name below ``root``. The original docstring
            lists wikipedia, NUS-WIDE-10K, Pascal-Sentence and xmedia as
            options, but images are always read from a ``wiki_image``
            sub-directory -- NOTE(review): confirm the other datasets share
            that layout.
        root: Directory that contains the dataset directories. Defaults to the
            path that used to be hard-coded.

    Returns:
        None. The function's output is the three pickle files.

    Note:
        For backward compatibility with existing consumers of the pickles,
        only the dev split stores ``'image'`` and ``'text'`` as numpy arrays;
        the test and train splits store plain lists of arrays.
    """
    loc = os.path.join(root, name)

    # (split file, output pickle, success message, store as numpy arrays)
    # Order and messages match the original script: test, train, then dev.
    splits = (
        ('split/test.txt', 'test.pkl', ' Successfully process test data', False),
        ('split/train.txt', 'train.pkl', ' Successfully process training data', False),
        ('split/valid.txt', 'dev.pkl', 'Successfully process dev data', True),
    )

    for split_rel_path, pickle_name, done_message, as_arrays in splits:
        imgs, caps, labs, ids = _process_split(loc, os.path.join(loc, split_rel_path))
        print(len(imgs))
        print(len(caps))
        if as_arrays:
            imgs = np.array(imgs)
            caps = np.array(caps)
        split_data = {'image': imgs, "text": caps, "label": labs, 'ids': ids}
        with open(pickle_name, 'wb') as out_fh:
            pickle.dump(split_data, out_fh)
        # Report success only once the pickle has actually been written.
        print(done_message)
# Run the preprocessing with the default dataset when executed as a script;
# importing this module does not trigger it.
if __name__ == '__main__':
    load_dataset()