Siamese batched inference + base image and GitHub Actions upgrades #208

Merged
merged 11 commits on May 30, 2023
2 changes: 1 addition & 1 deletion .devcontainer/Dockerfile
@@ -1,5 +1,5 @@
# Image for a Python 3 development environment
FROM python:3.8-slim
FROM registry.access.redhat.com/ubi8/python-39:1-105

# Add any tools that are needed beyond Python
RUN apt-get update && \
6 changes: 3 additions & 3 deletions .github/workflows/tca-release.yml
@@ -44,15 +44,15 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: checkout
uses: actions/checkout@v2
uses: actions/checkout@v3
with:
ref: main
- name : get tag
run: echo ${{needs.tagged-release.get_release_tag.outputs.tags}}
- name: Set up Python
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: '3.8'
python-version: '3.9'
- name: Cleanup
run: |
bash clean.sh
6 changes: 3 additions & 3 deletions .github/workflows/tca-unit-test.yml
@@ -5,11 +5,11 @@ jobs:
name: Run unit test cases
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: '3.8'
python-version: '3.9'
- name: Cleanup
run: |
bash clean.sh
40 changes: 23 additions & 17 deletions benchmarks/run_models.py
@@ -30,8 +30,9 @@

def parser():
parser = argparse.ArgumentParser(description="Train and evaluate TCA entity standardization models")
parser.add_argument("-model_type", type=str, default="tf_idf", help="tf_idf (default) | siamese | wiki_data_api | all")
parser.add_argument("-model_type", type=str, default="siamese", help=" siamese (default) | tf_idf | wiki_data_api | all")
parser.add_argument("-mode", type=str, default="deploy", help="deploy (default) | benchmark")
parser.add_argument("-batch_size", type=int, default=0, help="optional, batch size for siamese model. Default is 0, meaning no batching")
parser.add_argument("-show", action="store_true", help="False (default)")

return parser.parse_args()
@@ -147,6 +148,7 @@ def sim_metrics(json_data, label):

model_type = args.model_type
mode = args.mode
batch_size = args.batch_size
show = args.show

table_data = {}
@@ -155,19 +157,21 @@ def sim_metrics(json_data, label):

common = os.path.join("config", "common.ini")
kg = os.path.join("config", "kg.ini")
config.read([common, kg])
deploy_configs = os.listdir(os.path.join("config", "deploy"))
deploys = [os.path.join("config", "deploy", x) for x in deploy_configs]
config.read([common, kg]+deploys)

try:
data_dir = config['general']['data_dir']
kg_dir = config['general']['kg_dir']
threshold= float(config['Thresholds']['HIGH_THRESHOLD'])
threshold= float(config[f"infer_thresholds_{model_type}"]["HIGH_THRESHOLD"])
except KeyError as k:
logging.error(f'{k} is not a key in your common.ini file.')
exit()

tasks = {'sim':'tca', 'tca': 'tca', 'wikidata':'wikidata', 'deploy': 'deploy'}
task = tasks[mode]
tca_infer_file_name = os.path.join(data_dir, mode, "infer.json")
tca_infer_file_name = os.path.join(data_dir, 'tca', "infer.json")
with open(tca_infer_file_name, 'r', encoding='utf-8') as tca_infer_file:
tca_infer_data = json.load(tca_infer_file)
num_pos_data = len(tca_infer_data["data"])
@@ -214,24 +218,26 @@ def sim_metrics(json_data, label):

siamese_start = time.time()
siamese_infer = copy.deepcopy(tca_infer_data)
print(len(siamese_infer['data']))
label = siamese_infer.get("label", None)
siamese_infer = siamese.infer(siamese_infer)
siamese_infer = siamese.infer(siamese_infer, batch_size=batch_size)
siamese_end = time.time()
siamese_time = (siamese_end - siamese_start)

if label: # Classification task
if mode != 'deploy':
if mode != 'deploy':
if label: # Classification task
# if mode != 'deploy':
cls_metrics(siamese_infer, "Siamese")
threshold = float(siamese.config['Thresholds']['HIGH_THRESHOLD'])
siamese_topk = topk(siamese_infer, threshold)
table_data["siamese"] = {}
table_data["siamese"]["topk"] = siamese_topk["topk"]
table_data["siamese"]["kns"] = siamese_topk["kns"]
table_data["siamese"]["fpr"] = siamese_topk["fpr"]
table_data["siamese"]["unks"] = siamese_topk["unks"]
table_data["siamese"]["time"] = siamese_time
else:
(score_auc, score_f1) = sim_metrics(siamese_infer, "Siamese")
threshold = float(siamese.config['infer_thresholds_siamese']['HIGH_THRESHOLD'])
siamese_topk = topk(siamese_infer, threshold)
table_data["siamese"] = {}
table_data["siamese"]["topk"] = siamese_topk["topk"]
table_data["siamese"]["kns"] = siamese_topk["kns"]
table_data["siamese"]["fpr"] = siamese_topk["fpr"]
table_data["siamese"]["unks"] = siamese_topk["unks"]
table_data["siamese"]["time"] = siamese_time
else:
(score_auc, score_f1) = sim_metrics(siamese_infer, "Siamese")
'''
if model_type == "gnn" or model_type == "all":
logging.info("----------- GNN -------------")
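To make the config change above concrete, here is a minimal, self-contained sketch of the lookup the benchmark now performs: every file under config/deploy/ is merged on top of common.ini and kg.ini, and HIGH_THRESHOLD is read from a per-model infer_thresholds_<model_type> section. The paths and key names come from the diff; the assumption that the siamese section lives in config/deploy/siamese.ini is inferred, not shown in this PR.

```
# Sketch only: mirrors the merged-config lookup added in run_models.py.
# Paths and section/key names are taken from the diff; everything else is illustrative.
import os
from configparser import ConfigParser

config = ConfigParser()
common = os.path.join("config", "common.ini")
kg = os.path.join("config", "kg.ini")
deploy_configs = os.listdir(os.path.join("config", "deploy"))
deploys = [os.path.join("config", "deploy", x) for x in deploy_configs]
config.read([common, kg] + deploys)  # files listed later override earlier keys

model_type = "siamese"  # new default for -model_type
try:
    # assumed to be defined in the matching config/deploy/<model>.ini file
    threshold = float(config[f"infer_thresholds_{model_type}"]["HIGH_THRESHOLD"])
except KeyError as k:
    raise SystemExit(f"{k} is not a key in the merged config files.")
print(f"HIGH_THRESHOLD for {model_type}: {threshold}")
```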
2 changes: 1 addition & 1 deletion config/deploy/siamese.ini
@@ -1,5 +1,5 @@
[model]
backbone = prajjwal1/bert-small
backbone = ./models/deploy/siamese_model

[train]
seed = 0
6 changes: 3 additions & 3 deletions entity_standardizer/README.md
@@ -16,11 +16,11 @@ of entities.
# Training and Inference with single-shot models
Entity standardizer package contains the following two single-shot models:

1. *tfidf* - A supervised approach that computes tfidf vectors for a given training dataset.
2. *siamese* - A supervised approach that uses BERT models connected in a Siamese network. The default model is the siamese model.
1. *siamese* - A supervised approach that uses BERT models connected in a Siamese network. The default model is the siamese model.
2. *tfidf* - A supervised approach that computes tfidf vectors for a given training dataset.

Use the following steps to install, train, and inference with the entity standardizer models as a standalone package.
The entity standardizer package requires python >= 3.6 environment.
The entity standardizer package requires python >= 3.9 environment.

1. Run *setup.sh* to install dependencies and entity standardizer package
```
62 changes: 52 additions & 10 deletions entity_standardizer/entity_standardizer/siamese/main.py
@@ -39,55 +39,96 @@ def __init__(self, task_name):
self.config["task"] = {}
self.config["task"]["name"] = self.task_name

def infer(self, infer_data):
def infer(self, infer_data, batch_size=0):

if self.config['train']['disable_cuda']=='False' and torch.cuda.is_available():
logging.info(f"batch_size {batch_size}.")

if self.config['train']['disable_cuda'] == 'False' and torch.cuda.is_available():
device = torch.device('cuda')
else:
device = torch.device('cpu')
logging.info(f"Device = {device}")

model_dir = os.path.join(self.config['general']['model_dir'], self.config['task']['name'])
model_name = 'siamese.pt'
model_dir = os.path.join(self.config['general']['model_dir'], self.config['task']['name'],'siamese_model')
model_name = 'pytorch_model.bin' #'siamese.pt'
model_path = os.path.join(model_dir, model_name)
model = Model(self.config)
model = Model(self.config)

if not os.path.exists(model_path):
logging.info(f"{model_name} does not exist, running training to generate model.")
model = self.train()
else:
logging.info(f"Loading training parameters from {model_path}.")
model.load_state_dict(torch.load(model_path, map_location=device))
logging.info(f"Done.")

model.to(device=device)
model.eval()

entity_vector_name = 'entity_vector.pickle'
entity_vector_path = os.path.join(model_dir, entity_vector_name)
if not os.path.exists(entity_vector_path):
logging.info(f"Computing training embeddings to {entity_vector_path}.")
_, train_entity_id_to_name = loader(self.config)
train_entities, labels = list(train_entity_id_to_name.values()), list(train_entity_id_to_name.keys())
cls = model(train_entities, device)
embeddings = cls.detach().cpu().numpy()

if batch_size > 0:
for i in tqdm(range(0, int(len(train_entities)/batch_size)+1)):
if i * batch_size < len(train_entities) - 1:
# logging.info(f'{i}/{int(len(train_entities)/batch_size)}')
cls = model(train_entities[i*batch_size:min(len(train_entities),(i+1)*batch_size)], device)
if i == 0:
embeddings = cls.detach().cpu().numpy()
else:
embeddings = np.concatenate((embeddings, cls.detach().cpu().numpy()), axis=0)
else:
cls = model(train_entities, device)
embeddings = cls.detach().cpu().numpy()

logging.info(f'writing to {entity_vector_path}')
with open(entity_vector_path, 'wb') as f:
pickle.dump((embeddings, labels), f)

logging.info('done')
else:
with open(entity_vector_path, 'rb') as f:
embeddings, labels = pickle.load(f)
num_entities = len(embeddings)
logging.info(f"Loading embeddings of {num_entities} entities from {entity_vector_path}.")
logging.info(f"Loading embeddings of {num_entities} entities from {entity_vector_path}.")


inf_start = time.time()
label = infer_data.get("label", None)
if label:
logging.info('doing test embeddings extraction')

x_test = [d['mention(s)'] for _, d in infer_data['data'].items()]
# y_test = [d['entity_id'] for _, d in infer_data['data'].items()]

knn = KNeighborsClassifier(n_neighbors=1, metric='cosine').fit(embeddings, labels)
cls_test = model(x_test, device)

if batch_size > 0:
for i in tqdm(range(0, int(len(x_test)/batch_size)+1)):
if i*batch_size < len(x_test)-1:
# logging.info(f'{i}/{int(len(train_entities)/batch_size)}')
cls_test = model(x_test[i*batch_size:min(len(x_test),(i+1)*batch_size)], device)
if i == 0:
test_embeddings = cls_test.detach().cpu().numpy()
else:
test_embeddings = np.concatenate((test_embeddings, cls_test.detach().cpu().numpy()), axis=0)
else:
cls_test = model(x_test, device)
test_embeddings = cls_test.detach().cpu().numpy()



n_neighbors = int(self.config['infer'].get('topk', 10))
distances, indices = knn.kneighbors(cls_test.detach().cpu().numpy(), n_neighbors=n_neighbors)

logging.info('doing knn matching')

distances, indices = knn.kneighbors(test_embeddings, n_neighbors=n_neighbors)
distances, indices = distances.tolist(), indices.tolist()

pred_label_ids = []
for pred in indices:
label = [labels[i] for i in pred]
@@ -96,6 +137,7 @@ def infer(self, infer_data):
predictions = list(zip(pred_label_ids[idx], [1-d for d in distances[idx]]))
infer_data['data'][inf_id]['predictions'] = predictions
else:
logging.info('get infer data')
data = infer_data["data"]
slice_size = 5000
with tqdm(range(int(np.ceil(len(data)/slice_size))), ncols=100) as progress:
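For readers tracing the new batching logic, the loop the PR inlines twice (once for the training entities, once for the test mentions) boils down to the helper sketched below. The helper name and the torch.no_grad() wrapper are additions for illustration only; the slicing-and-concatenate pattern is what the diff does.

```
# Standalone sketch of the batched-embedding pattern used above; not code from the PR.
# `model` is assumed to be the Siamese Model, i.e. a callable mapping a list of
# strings to a tensor of embeddings, as in Model.forward(inputs, device).
import numpy as np
import torch

def embed_in_batches(model, texts, device, batch_size=0):
    """Encode texts in chunks of batch_size; batch_size <= 0 means one full-size call."""
    if batch_size <= 0:
        with torch.no_grad():
            return model(texts, device).detach().cpu().numpy()
    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]  # last chunk may be shorter
            chunks.append(model(batch, device).detach().cpu().numpy())
    return np.concatenate(chunks, axis=0)  # same shape as the unbatched result
```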
6 changes: 3 additions & 3 deletions entity_standardizer/entity_standardizer/siamese/model.py
@@ -15,16 +15,16 @@
################################################################################

import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from transformers import AutoModel, BertTokenizer# AutoTokenizer
import transformers
transformers.utils.logging.set_verbosity_error()

class Model(nn.Module):
def __init__(self, params):
super().__init__()
self.backbone = params["model"].get("backbone", "prajjwal1/bert-small")
self.tokenizer = AutoTokenizer.from_pretrained(self.backbone)
self.encoder = AutoModel.from_pretrained(self.backbone)
self.tokenizer = BertTokenizer.from_pretrained(self.backbone, cache_dir='models/deploy/siamese_model')
self.encoder = AutoModel.from_pretrained(self.backbone, cache_dir='models/deploy/siamese_model')

def forward(self, inputs, device):
inputs = self.tokenizer(inputs, padding=True, return_tensors='pt')
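To see what the backbone change means at load time, here is a short sketch under the assumption that ./models/deploy/siamese_model (the new backbone value in config/deploy/siamese.ini, unpacked by setup.sh) is a standard Hugging Face model directory; the sample mention is illustrative.

```
# Sketch of the tokenizer/encoder loading that model.py now performs.
# The local backbone directory is assumed to contain a saved BERT checkpoint.
from transformers import AutoModel, BertTokenizer

backbone = "./models/deploy/siamese_model"  # previously "prajjwal1/bert-small"
tokenizer = BertTokenizer.from_pretrained(backbone, cache_dir="models/deploy/siamese_model")
encoder = AutoModel.from_pretrained(backbone, cache_dir="models/deploy/siamese_model")

inputs = tokenizer(["apache tomcat 8.5"], padding=True, return_tensors="pt")
outputs = encoder(**inputs)
cls_embedding = outputs.last_hidden_state[:, 0]  # [CLS] vector used for similarity
```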
7 changes: 4 additions & 3 deletions setup.sh
@@ -109,10 +109,11 @@ echo "----------------Generated KG Utility Files--------------------"
######################################################################
echo "--------------Generating Entity Standardizer Models------------------"
$python benchmarks/generate_data.py
# wget https://ibm.box.com/shared/static/mnp323fxslbel8qjecfmryvs8yypooka.pt -O "./models/deploy/siamese.pt"
mkdir -p models/deploy
wget https://www.dropbox.com/s/efpx2qy7n9z5niu/siamese.pt -O "./models/deploy/siamese.pt"
$python benchmarks/run_models.py -model_type siamese
wget https://www.dropbox.com/s/bobcey7ufklw7mr/siamese_model.zip -O "./models/deploy/siamese_model.zip"
unzip models/deploy/siamese_model.zip -d models/deploy/
rm models/deploy/siamese_model.zip
$python benchmarks/run_models.py -model_type siamese -batch_size 5
echo "---------Generated Entity Standardizer Models--------------"

echo "+---------------------------------------------------------+"
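The last setup step above exercises batched inference end to end. As a hedged usage sketch (the import path and class name below are assumptions; only the infer() signature and the payload keys appear in this PR), calling the siamese standardizer directly looks roughly like this:

```
# Hypothetical usage sketch; the import path and class name are assumptions for illustration.
from entity_standardizer.siamese import SIAMESE  # assumed module/class name

siamese = SIAMESE("deploy")  # task name, matching config/deploy/siamese.ini
payload = {"data": {"0": {"mention(s)": "apache tomcat 8.5"},
                    "1": {"mention(s)": "postgres database"}}}

result = siamese.infer(payload, batch_size=5)  # batch_size=0 disables batching
for inf_id, entry in result["data"].items():
    # predictions: list of (entity_id, similarity) pairs, best match first
    print(inf_id, entry["predictions"][:3])
```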