-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathretrieval_eval.py
293 lines (263 loc) · 13 KB
/
retrieval_eval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
"""$ python examples/retrieval_eval.py --path_kg KGs/Family/father.owl
##################################################
Description of generated Concepts
NC denotes the named concepts |NC|=3
NNC denotes the negated named concepts |NNC|=3
|NC UNION NC|=9
|NC Intersection NC|=9
NC* denotes the union of named concepts and negated named concepts |NC*|=6
|NC* UNION NC*|=36
|NC* Intersection NC*|=36
|exist R* NC*|=12
|forall R* NC*|=12
|Max Cardinalities|=36
|Min Cardinalities|=36
|exist R* Nominals|=40
##################################################
Expression: ∃ hasChild⁻.{heinz , stefan , martin} | Jaccard Similarity:1.0000 | F1 :1.0000 | Runtime Benefits:-0.003: 0%| | 0/232 [00:00<?, ?it/s]Invalid IRI detected: Nec8a193431bd4ba6a57485770b54e72b, error: string index out of range
Expression: male ⊔ (¬female) | Jaccard Similarity:1.0000 | F1 :1.0000 | Runtime Benefits:-0.002: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 232/232 [00:00<00:00, 285.35it/s]
Type
OWLClass 3
OWLObjectAllValuesFrom 12
OWLObjectComplementOf 3
OWLObjectIntersectionOf 45
OWLObjectMaxCardinality 36
OWLObjectMinCardinality 36
OWLObjectSomeValuesFrom 52
OWLObjectUnionOf 45
Name: Type, dtype: int64
Jaccard Similarity F1 Runtime Benefits
Type
OWLClass 1.0 1.0 -0.001484
OWLObjectAllValuesFrom 1.0 1.0 -0.004071
OWLObjectComplementOf 1.0 1.0 -0.001592
OWLObjectIntersectionOf 1.0 1.0 -0.002914
OWLObjectMaxCardinality 1.0 1.0 0.000511
OWLObjectMinCardinality 1.0 1.0 -0.002798
OWLObjectSomeValuesFrom 1.0 1.0 -0.001701
OWLObjectUnionOf 1.0 1.0 -0.002939
"""
from ontolearn.owl_neural_reasoner import TripleStoreNeuralReasoner
from ontolearn.knowledge_base import KnowledgeBase
from ontolearn.triple_store import TripleStore
from ontolearn.utils import jaccard_similarity, f1_set_similarity, concept_reducer, concept_reducer_properties
from owlapy.class_expression import (
OWLObjectUnionOf,
OWLObjectIntersectionOf,
OWLObjectSomeValuesFrom,
OWLObjectAllValuesFrom,
OWLObjectMinCardinality,
OWLObjectMaxCardinality,
OWLObjectOneOf,
)
import time
from typing import Tuple, Set
import pandas as pd
from owlapy import owl_expression_to_dl
from itertools import chain
from argparse import ArgumentParser
import os
from tqdm import tqdm
import random
import itertools
import ast
# Configure the pandas display options once, at import time, so that the
# summary tables printed by this script are shown in full.
for _option_name, _option_value in (
    ("display.max_rows", None),
    ("display.max_columns", None),
    ("display.width", None),
    ("display.colheader_justify", "left"),
    ("display.expand_frame_repr", False),
):
    pd.set_option(_option_name, _option_value)
def _subsample(items, ratio):
    """Return ``items`` as a set, randomly subsampled when ``ratio`` is truthy.

    At least one element is kept and the sample size is clamped to
    ``len(items)`` so that a ratio above 1 does not make ``random.sample`` raise.
    """
    if ratio and len(items) > 0:
        k = min(len(items), max(1, int(len(items) * ratio)))
        return set(random.sample(items, k))
    return set(items)


def execute(args):
    """Compare neural and symbolic instance retrieval on generated class expressions.

    A pool of OWL class expressions is generated from the knowledge base: named
    concepts, their complements, unions, intersections, existential and
    universal restrictions, min/max cardinality restrictions with
    n in {1, 2, 3}, and existential restrictions over nominal triples. For
    every expression the instances are retrieved with the symbolic knowledge
    base and with the ``TripleStoreNeuralReasoner``; the Jaccard similarity,
    the F1 score and the runtimes are appended as one row to the CSV file
    ``args.path_report`` (an existing file is overwritten). Finally the number
    of expressions per type and the per-type means of the numeric columns are
    printed.

    Args:
        args: Namespace providing ``path_kg``, ``path_kge_model``,
            ``endpoint_triple_store``, ``gamma``, ``seed``,
            ``ratio_sample_nc``, ``ratio_sample_object_prop``,
            ``num_nominals``, ``min_jaccard_similarity`` and ``path_report``.

    Returns:
        Tuple ``(jaccard_sim, f1_sim)`` holding the scores of the last
        expression that was evaluated (the evaluation order is shuffled).

    Raises:
        AssertionError: If ``args.path_kg`` is not a file, if the mean Jaccard
            similarity is below ``args.min_jaccard_similarity``, or if the
            stored retrieval results are not read back as sets.
    """
    # (1) Initialize knowledge base.
    assert os.path.isfile(args.path_kg)
    if args.endpoint_triple_store:
        # NOTE(review): the endpoint URL is hard-coded; the value passed via
        # --endpoint_triple_store only acts as a switch. Confirm this is intended.
        symbolic_kb = TripleStore(url="http://localhost:3030/family")
    else:
        symbolic_kb = KnowledgeBase(path=args.path_kg)
    # (2) Initialize Neural OWL Reasoner.
    if args.path_kge_model:
        neural_owl_reasoner = TripleStoreNeuralReasoner(path_neural_embedding=args.path_kge_model, gamma=args.gamma)
    else:
        neural_owl_reasoner = TripleStoreNeuralReasoner(path_of_kb=args.path_kg, gamma=args.gamma)
    # Fix the random seed.
    random.seed(args.seed)
    ###################################################################
    # GENERATE DL CONCEPTS TO EVALUATE RETRIEVAL PERFORMANCES
    # (3) R: Extract object properties (sorted, so that sampling is reproducible under the seed).
    object_properties = sorted(set(symbolic_kb.get_object_properties()))
    # (3.1) Subsample if required.
    object_properties = _subsample(object_properties, args.ratio_sample_object_prop)
    # (4) R⁻: Inverse of object properties.
    object_properties_inverse = {i.get_inverse_property() for i in object_properties}
    # (5) R*: R UNION R⁻.
    object_properties_and_inverse = object_properties.union(object_properties_inverse)
    # (6) NC: Named owl concepts (sorted, so that sampling is reproducible under the seed).
    nc = sorted(set(symbolic_kb.get_concepts()))
    # (6.1) Subsample if required.
    nc = _subsample(nc, args.ratio_sample_nc)
    # (7) NC⁻: Complement of NC.
    nnc = {i.get_object_complement_of() for i in nc}
    # (8) NC*: NC UNION NC⁻.
    nc_star = nc.union(nnc)
    # (9) Retrieve at most args.num_nominals random Nominals.
    all_individuals = symbolic_kb.all_individuals_set()
    if len(all_individuals) > args.num_nominals:
        # random.sample() rejects sets on Python >= 3.11; sample from a list
        # ordered by IRI string so that the draw is reproducible under the seed.
        nominals = set(random.sample(sorted(all_individuals, key=lambda ind: ind.str), args.num_nominals))
    else:
        nominals = all_individuals
    # (10) All combinations of 3 for Nominals, e.g. {martin, heinz, markus}
    nominal_combinations = {OWLObjectOneOf(combination) for combination in itertools.combinations(nominals, 3)}
    # (11) NC UNION NC.
    unions = concept_reducer(nc, opt=OWLObjectUnionOf)
    # (12) NC INTERSECTION NC.
    intersections = concept_reducer(nc, opt=OWLObjectIntersectionOf)
    # (13) NC* UNION NC*.
    unions_nc_star = concept_reducer(nc_star, opt=OWLObjectUnionOf)
    # (14) NC* INTERSECTION NC*.
    intersections_nc_star = concept_reducer(nc_star, opt=OWLObjectIntersectionOf)
    # (15) \exist r. C s.t. C \in NC* and r \in R* .
    exist_nc_star = concept_reducer_properties(
        concepts=nc_star,
        properties=object_properties_and_inverse,
        cls=OWLObjectSomeValuesFrom,
    )
    # (16) \forall r. C s.t. C \in NC* and r \in R* .
    for_all_nc_star = concept_reducer_properties(
        concepts=nc_star,
        properties=object_properties_and_inverse,
        cls=OWLObjectAllValuesFrom,
    )
    # (17) >= n r. C  and =< n r. C, s.t. C \in NC* and r \in R* .
    min_cardinality_nc_star_1, min_cardinality_nc_star_2, min_cardinality_nc_star_3 = (
        concept_reducer_properties(
            concepts=nc_star,
            properties=object_properties_and_inverse,
            cls=OWLObjectMinCardinality,
            cardinality=i,
        )
        for i in [1, 2, 3]
    )
    max_cardinality_nc_star_1, max_cardinality_nc_star_2, max_cardinality_nc_star_3 = (
        concept_reducer_properties(
            concepts=nc_star,
            properties=object_properties_and_inverse,
            cls=OWLObjectMaxCardinality,
            cardinality=i,
        )
        for i in [1, 2, 3]
    )
    # (18) \exist r. Nominal s.t. Nominal \in Nominals and r \in R* .
    exist_nominals = concept_reducer_properties(
        concepts=nominal_combinations,
        properties=object_properties_and_inverse,
        cls=OWLObjectSomeValuesFrom,
    )

    ###################################################################
    # Retrieval Results
    def concept_retrieval(retriever_func, c) -> Tuple[Set[str], float]:
        """Return the IRIs of the instances of ``c`` and the elapsed seconds."""
        start_time = time.perf_counter()
        retrieved = {i.str for i in retriever_func.individuals(c)}
        return retrieved, time.perf_counter() - start_time

    # () Converted to list so that the progress bar works.
    concepts = list(
        chain(
            nc,  # named concepts (C)
            nnc,  # negated named concepts (\neg C)
            unions_nc_star,  # unions over NC*
            intersections_nc_star,  # intersections over NC*
            exist_nc_star,
            for_all_nc_star,
            min_cardinality_nc_star_1, min_cardinality_nc_star_2, min_cardinality_nc_star_3,
            max_cardinality_nc_star_1, max_cardinality_nc_star_2, max_cardinality_nc_star_3,
            exist_nominals))
    print("\n")
    print("#"*50)
    print("Description of generated Concepts")
    print(f"NC denotes the named concepts\t|NC|={len(nc)}")
    print(f"NNC denotes the negated named concepts\t|NNC|={len(nnc)}")
    print(f"|NC UNION NC|={len(unions)}")
    print(f"|NC Intersection NC|={len(intersections)}")
    print(f"NC* denotes the union of named concepts and negated named concepts\t|NC*|={len(nc_star)}")
    print(f"|NC* UNION NC*|={len(unions_nc_star)}")
    print(f"|NC* Intersection NC*|={len(intersections_nc_star)}")
    print(f"|exist R* NC*|={len(exist_nc_star)}")
    print(f"|forall R* NC*|={len(for_all_nc_star)}")
    print(f"|Max Cardinalities|={len(max_cardinality_nc_star_1) + len(max_cardinality_nc_star_2)+ len(max_cardinality_nc_star_3)}")
    print(f"|Min Cardinalities|={len(min_cardinality_nc_star_1) + len(min_cardinality_nc_star_2)+ len(min_cardinality_nc_star_3)}")
    print(f"|exist R* Nominals|={len(exist_nominals)}")
    print("#" * 50,end="\n\n")
    # () Shuffled the data so that the progress bar is not influenced by the order of concepts.
    random.shuffle(concepts)
    # Delete an already existing report, because rows are appended below and
    # the report of a previous run must be overwritten.
    if os.path.exists(args.path_report):
        os.remove(args.path_report)
    # The CSV header is only written together with the first row.
    file_exists = False
    # () Iterate over single OWL Class Expressions in ALCQIHO
    for expression in (tqdm_bar := tqdm(concepts, position=0, leave=True)):
        retrieval_y: Set[str]
        runtime_y: float
        # () Retrieve the true set of individuals and elapsed runtime.
        retrieval_y, runtime_y = concept_retrieval(symbolic_kb, expression)
        # () Retrieve a set of inferred individuals and elapsed runtime.
        retrieval_neural_y, runtime_neural_y = concept_retrieval(neural_owl_reasoner, expression)
        # () Compute the Jaccard similarity.
        jaccard_sim = jaccard_similarity(retrieval_y, retrieval_neural_y)
        # () Compute the F1-score.
        f1_sim = f1_set_similarity(retrieval_y, retrieval_neural_y)
        # Render the expression and the runtime difference once; both are used
        # for the CSV row and for the progress bar.
        dl_expression = owl_expression_to_dl(expression)
        runtime_benefit = runtime_y - runtime_neural_y
        # () Store the data.
        df_row = pd.DataFrame(
            [{
                "Expression": dl_expression,
                "Type": type(expression).__name__,
                "Jaccard Similarity": jaccard_sim,
                "F1": f1_sim,
                "Runtime Benefits": runtime_benefit,
                "Runtime Neural": runtime_neural_y,
                "Symbolic_Retrieval": retrieval_y,
                "Symbolic_Retrieval_Neural": retrieval_neural_y,
            }])
        # Append the row to the CSV file
        df_row.to_csv(args.path_report, mode='a', header=not file_exists, index=False)
        file_exists = True
        # () Update the progress bar.
        tqdm_bar.set_description_str(
            f"Expression: {dl_expression} | Jaccard Similarity:{jaccard_sim:.4f} | F1 :{f1_sim:.4f} | Runtime Benefits:{runtime_benefit:.3f}"
        )
    # () Read the data into pandas dataframe; the retrieval columns were
    # written as set literals and are parsed back into sets.
    df = pd.read_csv(args.path_report, index_col=0, converters={'Symbolic_Retrieval': ast.literal_eval,
                                                                'Symbolic_Retrieval_Neural': ast.literal_eval})
    # () Assert that the mean Jaccard Similarity meets the threshold
    assert df["Jaccard Similarity"].mean() >= args.min_jaccard_similarity
    # () Ensure 'Symbolic_Retrieval_Neural' contains sets
    x = df["Symbolic_Retrieval_Neural"].iloc[0]
    assert isinstance(x, set)
    # () Extract numerical features
    numerical_df = df.select_dtypes(include=["number"])
    # () Group by the type of OWL concepts
    df_g = df.groupby(by="Type")
    print(df_g["Type"].count())
    # () Compute mean of numerical columns per group
    mean_df = df_g[numerical_df.columns].mean()
    print(mean_df)
    # Scores of the last evaluated expression (loop variables of the final iteration).
    return jaccard_sim, f1_sim
def get_default_arguments(argv=None):
    """Build the command-line parser of this script and parse the arguments.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) the arguments are read from ``sys.argv[1:]``, which is
            the behaviour of a plain ``parse_args()`` call. Passing a list
            allows the defaults to be obtained programmatically.

    Returns:
        The ``argparse.Namespace`` with the attributes ``path_kg``,
        ``path_kge_model``, ``endpoint_triple_store``, ``gamma``, ``seed``,
        ``ratio_sample_nc``, ``ratio_sample_object_prop``,
        ``min_jaccard_similarity``, ``num_nominals`` and ``path_report``.
    """
    parser = ArgumentParser()
    parser.add_argument("--path_kg", type=str, default="KGs/Family/father.owl")
    parser.add_argument("--path_kge_model", type=str, default=None)
    parser.add_argument("--endpoint_triple_store", type=str, default=None)
    parser.add_argument("--gamma", type=float, default=0.9)
    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--ratio_sample_nc", type=float, default=0.2, help="To sample OWL Classes.")
    parser.add_argument("--ratio_sample_object_prop", type=float, default=0.1, help="To sample OWL Object Properties.")
    parser.add_argument("--min_jaccard_similarity", type=float, default=0.0, help="Minimum Jaccard similarity to be achieve by the reasoner")
    parser.add_argument("--num_nominals", type=int, default=10, help="Number of OWL named individuals to be sampled.")
    # H is obtained if the forward chain is applied on KG.
    parser.add_argument("--path_report", type=str, default="ALCQHI_Retrieval_Results.csv")
    return parser.parse_args(argv)
if __name__ == "__main__":
    # Script entry point: parse the command-line options, then run the evaluation.
    cli_args = get_default_arguments()
    execute(cli_args)