Merge pull request #340 from dice-group/tdl
tDL, Verbalization, and CV
Demirrr authored Jan 18, 2024
2 parents ac2d69d + 857d8df commit 7a70145
Showing 4 changed files with 588 additions and 210 deletions.
134 changes: 60 additions & 74 deletions examples/concept_learning_cv_evaluation.py
@@ -1,24 +1,33 @@
"""
StratifiedKFold Cross-Validation of DL Concept Learning Algorithms
Usage
python examples/concept_learning_cv_evaluation.py
--lps LPs/Family/lps.json
--kb KGs/Family/family.owl
--max_runtime 30
--report family.csv
"""
import json
import os
import time
import pandas as pd
from ontolearn.knowledge_base import KnowledgeBase
from ontolearn.concept_learner import CELOE, OCEL, EvoLearner
from ontolearn.learners import Drill, TDL
from ontolearn.learning_problem import PosNegLPStandard
from ontolearn.metrics import Accuracy, F1
from owlapy.model import OWLClass, OWLNamedIndividual, IRI
from ontolearn.metrics import F1
from owlapy.model import OWLNamedIndividual, IRI
import argparse
from rdflib import Graph
from sklearn.model_selection import StratifiedKFold
import numpy as np

pd.set_option("display.precision", 5)


# @TODO This should be a standalone function that can be imported from ontolearn/static_funcs.py
def compute_f1_score(individuals, pos, neg):
tp = len(pos.intersection(individuals))
tn = len(neg.difference(individuals))
# tn = len(neg.difference(individuals))

fp = len(neg.intersection(individuals))
fn = len(pos.difference(individuals))
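
# For illustration, the standalone helper asked for in the TODO above could look
# like the following sketch (assuming the standard precision/recall-based F1;
# names are illustrative, not the exact ontolearn implementation):
def compute_f1_score_standalone(individuals: set, pos: set, neg: set) -> float:
    """F1 score of a retrieved individual set against positive/negative examples."""
    tp = len(pos.intersection(individuals))  # positives correctly retrieved
    fp = len(neg.intersection(individuals))  # negatives wrongly retrieved
    fn = len(pos.difference(individuals))    # positives missed
    if tp == 0:
        return 0.0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)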
@@ -45,18 +54,27 @@ def dl_concept_learning(args):
settings = json.load(json_file)

kb = KnowledgeBase(path=args.kb)
ocel = OCEL(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(),
max_runtime=args.max_runtime)
celoe = CELOE(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(),
max_runtime=args.max_runtime)
drill = Drill(knowledge_base=KnowledgeBase(path=args.kb), path_pretrained_kge=args.path_pretrained_kge,
quality_func=F1(), max_runtime=args.max_runtime)
tdl = TDL(knowledge_base=KnowledgeBase(path=args.kb),
dataframe_triples=pd.DataFrame(
data=sorted([(str(s), str(p), str(o)) for s, p, o in Graph().parse(args.kb)], key=lambda x: len(x)),
columns=['subject', 'relation', 'object'], dtype=str),
kwargs_classifier={"random_state": 0},
max_runtime=args.max_runtime)

# dictionary to store the data
data = dict()
for str_target_concept, examples in settings['problems'].items():
print('Target concept: ', str_target_concept)
p = examples['positive_examples']
n = examples['negative_examples']
print('\n\n')

print('Target concept: ', str_target_concept)

# Take p and n, generate Kfolds
kf = StratifiedKFold(n_splits=10, shuffle=False)
kf = StratifiedKFold(n_splits=args.folds, shuffle=True, random_state=args.random_seed)
X = np.array(p + n)
y = np.array([1.0 for _ in p] + [0.0 for _ in n])
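
# Why StratifiedKFold rather than plain KFold: each test fold keeps roughly the
# same positive/negative ratio as the full example set. A minimal standalone
# sketch (toy data, illustrative only):
#
#     toy_X = np.array([f"ind_{i}" for i in range(10)])
#     toy_y = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
#     for _, test_idx in StratifiedKFold(n_splits=5, shuffle=True, random_state=1).split(toy_X, toy_y):
#         print(toy_y[test_idx].mean())  # 0.5 in every fold: one positive, one negative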

@@ -67,6 +85,7 @@ def dl_concept_learning(args):
# () Extract positive and negative examples from train fold
train_pos = {pos_individual for pos_individual in X[train_index][y[train_index] == 1]}
train_neg = {neg_individual for neg_individual in X[train_index][y[train_index] == 0]}

# Sanity checking for individuals used for training.
assert train_pos.issubset(examples['positive_examples'])
assert train_neg.issubset(examples['negative_examples'])
@@ -82,23 +101,20 @@ def dl_concept_learning(args):
neg=set(map(OWLNamedIndividual, map(IRI.create, train_neg))))

test_lp = PosNegLPStandard(pos=set(map(OWLNamedIndividual, map(IRI.create, test_pos))),
neg=set(map(OWLNamedIndividual, map(IRI.create, test_neg))))

neg=set(map(OWLNamedIndividual, map(IRI.create, test_neg))))
print("OCEL starts..", end="\t")
start_time = time.time()
model = OCEL(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), max_runtime=args.max_runtime)
pred_ocel = model.fit(train_lp).best_hypotheses(n=1)
pred_ocel = ocel.fit(train_lp).best_hypotheses(n=1)
rt_ocel = time.time() - start_time
print("OCEL ends..", end="\t")

# () Quality on the training data
train_f1_ocel = compute_f1_score(individuals={i for i in kb.individuals(pred_ocel.concept)},
pos=train_lp.pos,
neg=train_lp.neg)
pos=train_lp.pos,
neg=train_lp.neg)
# () Quality on test data
test_f1_ocel = compute_f1_score(individuals={i for i in kb.individuals(pred_ocel.concept)},
pos=test_lp.pos,
neg=test_lp.neg)
pos=test_lp.pos,
neg=test_lp.neg)
# Reporting
data.setdefault("Train-F1-OCEL", []).append(train_f1_ocel)
data.setdefault("Test-F1-OCEL", []).append(test_f1_ocel)
@@ -107,23 +123,19 @@ def dl_concept_learning(args):
print(f"OCEL Test Quality: {test_f1_ocel:.3f}", end="\t")
print(f"OCEL Runtime: {rt_ocel:.3f}")



print("CELOE starts..", end="\t")
start_time = time.time()
model = CELOE(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), max_runtime=args.max_runtime)
pred_celoe = model.fit(train_lp).best_hypotheses(n=1)
pred_celoe = celoe.fit(train_lp).best_hypotheses(n=1)
rt_celoe = time.time() - start_time
print("CELOE ends..", end="\t")

# () Quality on the training data
train_f1_celoe = compute_f1_score(individuals={i for i in kb.individuals(pred_celoe.concept)},
pos=train_lp.pos,
neg=train_lp.neg)
pos=train_lp.pos,
neg=train_lp.neg)
# () Quality on test data
test_f1_celoe = compute_f1_score(individuals={i for i in kb.individuals(pred_celoe.concept)},
pos=test_lp.pos,
neg=test_lp.neg)
pos=test_lp.pos,
neg=test_lp.neg)
# Reporting
data.setdefault("Train-F1-CELOE", []).append(train_f1_celoe)
data.setdefault("Test-F1-CELOE", []).append(test_f1_celoe)
@@ -132,12 +144,14 @@ def dl_concept_learning(args):
print(f"CELOE Test Quality: {test_f1_celoe:.3f}", end="\t")
print(f"CELOE Runtime: {rt_celoe:.3f}")



print("Evo starts..", end="\t")
start_time = time.time()
model = EvoLearner(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), max_runtime=args.max_runtime)
pred_evo = model.fit(train_lp).best_hypotheses(n=1)
# BUG: EvoLearner needs to be initialized for each learning problem
evolearner = EvoLearner(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(),
max_runtime=args.max_runtime,
use_data_properties=False,
use_inverse=False, use_card_restrictions=False)
pred_evo = evolearner.fit(train_lp).best_hypotheses(n=1)
rt_evo = time.time() - start_time
print("Evo ends..", end="\t")

@@ -157,41 +171,34 @@ def dl_concept_learning(args):
print(f"Evo Test Quality: {test_f1_evo:.3f}", end="\t")
print(f"Evo Runtime: {rt_evo:.3f}")


print("DRILL starts..", end="\t")
start_time = time.time()
model = Drill(knowledge_base=KnowledgeBase(path=args.kb), path_pretrained_kge=args.path_pretrained_kge,quality_func=F1(), max_runtime=args.max_runtime)
pred_drill = model.fit(train_lp).best_hypotheses(n=1)
pred_drill = drill.fit(train_lp).best_hypotheses(n=1)
rt_drill = time.time() - start_time
print("DRILL ends..", end="\t")

# () Quality on the training data
train_f1_drill = compute_f1_score(individuals={i for i in kb.individuals(pred_drill.concept)},
pos=train_lp.pos,
neg=train_lp.neg)
pos=train_lp.pos,
neg=train_lp.neg)
# () Quality on test data
test_f1_drill = compute_f1_score(individuals={i for i in kb.individuals(pred_drill.concept)},
pos=test_lp.pos,
neg=test_lp.neg)
pos=test_lp.pos,
neg=test_lp.neg)
# Reporting
data.setdefault("Train-F1-DRILL", []).append(train_f1_drill)
data.setdefault("Test-F1-DRILL", []).append(test_f1_drill)
data.setdefault("RT-DRILL", []).append(rt_drill)
print(f"DRILL Train Quality: {train_f1_drill:.3f}", end="\t")
print(f"DRILL Test Quality: {test_f1_drill:.3f}", end="\t")
print(f"DRILL Runtime: {rt_drill:.3f}")

print("TDL starts..", end="\t")
start_time = time.time()
model = TDL(knowledge_base=KnowledgeBase(path=args.kb), dataframe_triples=pd.DataFrame(
data=[(str(s), str(p), str(o)) for s, p, o in Graph().parse(args.kb)],
columns=['subject', 'relation', 'object'], dtype=str).sort_values('subject'),
kwargs_classifier={"criterion": "gini", "random_state": 0},
max_runtime=args.max_runtime)
# () Fit the model on the training dataset
pred_tdl = model.fit(train_lp).best_hypotheses(n=1)
pred_tdl = tdl.fit(train_lp).best_hypotheses(n=1)
print("TDL ends..", end="\t")
rt_tdl = time.time() - start_time

# () Quality on the training data
train_f1_tdl = compute_f1_score(individuals={i for i in kb.individuals(pred_tdl)},
pos=train_lp.pos,
@@ -216,33 +223,12 @@ def dl_concept_learning(args):

if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Description Logic Concept Learning')

parser.add_argument("--max_runtime", type=int, default=1)
parser.add_argument("--lps", type=str, required=True)
parser.add_argument("--kb", type=str, required=True)
parser.add_argument("--max_runtime", type=int, default=10, help="Max runtime")
parser.add_argument("--lps", type=str, required=True, help="Path fto the learning problems")
parser.add_argument("--folds", type=int, default=10, help="Number of folds of cross validation.")
parser.add_argument("--kb", type=str, required=True,
help="Knowledge base")
parser.add_argument("--path_pretrained_kge", type=str, default=None)
parser.add_argument("--report", type=str, default="report.csv")
dl_concept_learning(parser.parse_args())


"""
# Benchmarking: Run a bash script tdl_stratified_kfold_cv_experiments.sh with the following commands
mkdir CVFamilyBenchmarkResults
python examples/concept_learning_cv_evaluation.py --lps LPs/Family/lps.json --kb KGs/Family/family.owl --max_runtime 60 --report cv_family_results.csv && mv cv_family_results.csv CVFamilyBenchmarkResults
mkdir CVMutagenesisBenchmarkResults
python examples/concept_learning_cv_evaluation.py --lps LPs/Mutagenesis/lps.json --kb KGs/Mutagenesis/mutagenesis.owl --max_runtime 60 --report cv_mutagenesis_results.csv && mv cv_mutagenesis_results.csv CVMutagenesisBenchmarkResults
mkdir CVCarcinogenesisBenchmarkResults
python examples/concept_learning_cv_evaluation.py --lps LPs/Carcinogenesis/lps.json --kb KGs/Carcinogenesis/carcinogenesis.owl --max_runtime 60 --report cv_carcinogenesis_results.csv && mv cv_carcinogenesis_results.csv CVCarcinogenesisBenchmarkResults
# Analysing results
import pandas as pd
pd.set_option("display.precision", 3)
pd.set_option('display.max_columns', None)
path="CVCarcinogenesisBenchmarkResults/cv_carcinogenesis_results.csv"
df = pd.read_csv(path, index_col=0)
df_mean_by_lp = df.groupby(by=df.index).mean()
filter_col = [col for col in df if col.startswith('Test-F1') or col.startswith('RT')]
print(df_mean_by_lp[filter_col])
print(df_mean_by_lp[filter_col].to_latex(index=True, formatters={"name": str.upper}, float_format="{:.1f}".format))
"""
parser.add_argument("--random_seed", type=int, default=1)
dl_concept_learning(parser.parse_args())
56 changes: 35 additions & 21 deletions examples/concept_learning_evaluation.py
@@ -1,3 +1,12 @@
"""
Fitting DL Concept Learning Algorithms:
Given E^+ and E^-, a learner finds a concept H, and the F1 score is computed w.r.t. E^+, E^-, and the retrieval R(H) of H.
python examples/concept_learning_evaluation.py --lps LPs/Family/lps.json --kb KGs/Family/family.owl --max_runtime 30 --report family.csv
"""

import json
import os
import time
@@ -44,6 +53,18 @@ def dl_concept_learning(args):

kb = KnowledgeBase(path=args.kb)

ocel = OCEL(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime)
celoe = CELOE(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime)
drill = Drill(knowledge_base=KnowledgeBase(path=args.kb),
path_pretrained_kge=args.path_pretrained_kge,
quality_func=F1(),
max_runtime=args.max_runtime)
tdl = TDL(knowledge_base=KnowledgeBase(path=args.kb),
dataframe_triples=pd.DataFrame(
data=sorted([(str(s), str(p), str(o)) for s, p, o in Graph().parse(args.kb)], key=lambda x: len(x)),
columns=['subject', 'relation', 'object'], dtype=str),
kwargs_classifier={"random_state": 0},
max_runtime=args.max_runtime)
# dictionary to store the data
data = dict()
for str_target_concept, examples in settings['problems'].items():
@@ -59,9 +80,8 @@ def dl_concept_learning(args):
lp = PosNegLPStandard(pos=typed_pos, neg=typed_neg)

print("OCEL starts..", end="\t")
model = OCEL(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), max_runtime=args.max_runtime)
start_time = time.time()
pred_ocel = model.fit(lp).best_hypotheses(n=1)
pred_ocel = ocel.fit(lp).best_hypotheses(n=1)
print("OCEL ends..", end="\t")
rt_ocel = time.time() - start_time
f1_ocel = compute_f1_score(individuals={i for i in kb.individuals(pred_ocel.concept)}, pos=lp.pos, neg=lp.neg)
@@ -71,9 +91,8 @@ def dl_concept_learning(args):
print(f"OCEL Runtime: {rt_ocel:.3f}")

print("CELOE starts..", end="\t")
model = CELOE(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), max_runtime=args.max_runtime)
start_time = time.time()
pred_celoe = model.fit(lp).best_hypotheses(n=1)
pred_celoe = celoe.fit(lp).best_hypotheses(n=1)
print("CELOE Ends..", end="\t")
rt_celoe = time.time() - start_time
f1_celoe = compute_f1_score(individuals={i for i in kb.individuals(pred_celoe.concept)}, pos=lp.pos, neg=lp.neg)
@@ -83,9 +102,10 @@ def dl_concept_learning(args):
print(f"CELOE Runtime: {rt_celoe:.3f}")

print("Evo starts..", end="\t")
model = EvoLearner(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), max_runtime=args.max_runtime)
start_time = time.time()
pred_evo = model.fit(lp).best_hypotheses(n=1)
# EvoLearner has a bug and the KB needs to be reloaded for each learning problem
evo = EvoLearner(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), max_runtime=args.max_runtime)
pred_evo = evo.fit(lp).best_hypotheses(n=1)
print("Evo ends..", end="\t")
rt_evo = time.time() - start_time
f1_evo = compute_f1_score(individuals={i for i in kb.individuals(pred_evo.concept)}, pos=lp.pos, neg=lp.neg)
@@ -96,11 +116,7 @@ def dl_concept_learning(args):

print("DRILL starts..", end="\t")
start_time = time.time()
model = Drill(knowledge_base=KnowledgeBase(path=args.kb),
path_pretrained_kge=args.path_pretrained_kge,
quality_func=F1(),
max_runtime=args.max_runtime)
pred_drill = model.fit(lp).best_hypotheses(n=1)
pred_drill = drill.fit(lp).best_hypotheses(n=1)
print("DRILL ends..", end="\t")
rt_drill = time.time() - start_time
f1_drill = compute_f1_score(individuals=set(kb.individuals(pred_drill.concept)), pos=lp.pos, neg=lp.neg)
@@ -111,21 +127,20 @@ def dl_concept_learning(args):

print("TDL starts..", end="\t")
start_time = time.time()
model = TDL(knowledge_base=KnowledgeBase(path=args.kb), dataframe_triples=pd.DataFrame(
data=[(str(s), str(p), str(o)) for s, p, o in Graph().parse(args.kb)],
columns=['subject', 'relation', 'object'], dtype=str).sort_values('subject'),
kwargs_classifier={"criterion": "gini", "random_state": 0},
max_runtime=args.max_runtime)
pred_tdl = model.fit(lp).best_hypotheses(n=1)
# () Fit the model on the training dataset
pred_tdl = tdl.fit(lp).best_hypotheses(n=1)
print("TDL ends..", end="\t")
rt_tdl = time.time() - start_time
# Compute quality of best prediction
f1_tdl = compute_f1_score(individuals={i for i in kb.individuals(pred_tdl)}, pos=lp.pos, neg=lp.neg)

# () Quality on the training data
f1_tdl = compute_f1_score(individuals={i for i in kb.individuals(pred_tdl)},
pos=lp.pos,
neg=lp.neg)

data.setdefault("F1-TDL", []).append(f1_tdl)
data.setdefault("RT-TDL", []).append(rt_tdl)
print(f"TDL Quality: {f1_tdl:.3f}", end="\t")
print(f"TDL Runtime: {rt_tdl:.3f}")

df = pd.DataFrame.from_dict(data)
df.to_csv(args.report, index=False)
print(df)
@@ -134,7 +149,6 @@ def dl_concept_learning(args):

if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Description Logic Concept Learning')

parser.add_argument("--max_runtime", type=int, default=1)
parser.add_argument("--lps", type=str, required=True)
parser.add_argument("--kb", type=str, required=True)
3 changes: 0 additions & 3 deletions ontolearn/knowledge_base.py
@@ -482,9 +482,6 @@ def encode_learning_problem(self, lp: PosNegLPStandard):
Return:
EncodedPosNegLPStandard: The encoded learning problem.
"""

assert len(self.class_hierarchy) > 0

if lp.all is None:
kb_all = self.all_individuals_set()
else: