diff --git a/examples/concept_learning_cv_evaluation.py b/examples/concept_learning_cv_evaluation.py index c77cef03..0f0a0f04 100644 --- a/examples/concept_learning_cv_evaluation.py +++ b/examples/concept_learning_cv_evaluation.py @@ -1,13 +1,22 @@ +""" +StratifiedKFold Cross Validating DL Concept Learning Algorithms +Usage +python examples/concept_learning_evaluation.py + --lps LPs/Family/lps.json + --kb KGs/Family/family.owl + --max_runtime 30 + --report family.csv + +""" import json -import os import time import pandas as pd from ontolearn.knowledge_base import KnowledgeBase from ontolearn.concept_learner import CELOE, OCEL, EvoLearner from ontolearn.learners import Drill, TDL from ontolearn.learning_problem import PosNegLPStandard -from ontolearn.metrics import Accuracy, F1 -from owlapy.model import OWLClass, OWLNamedIndividual, IRI +from ontolearn.metrics import F1 +from owlapy.model import OWLNamedIndividual, IRI import argparse from rdflib import Graph from sklearn.model_selection import StratifiedKFold @@ -15,10 +24,10 @@ pd.set_option("display.precision", 5) - +# @TODO This should be standalone function that can be imported from ontolearn/static_funcs.py def compute_f1_score(individuals, pos, neg): tp = len(pos.intersection(individuals)) - tn = len(neg.difference(individuals)) + # tn = len(neg.difference(individuals)) fp = len(neg.intersection(individuals)) fn = len(pos.difference(individuals)) @@ -45,18 +54,27 @@ def dl_concept_learning(args): settings = json.load(json_file) kb = KnowledgeBase(path=args.kb) + ocel = OCEL(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), + max_runtime=args.max_runtime) + celoe = CELOE(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), + max_runtime=args.max_runtime) + drill = Drill(knowledge_base=KnowledgeBase(path=args.kb), path_pretrained_kge=args.path_pretrained_kge, + quality_func=F1(), max_runtime=args.max_runtime) + tdl = TDL(knowledge_base=KnowledgeBase(path=args.kb), + 
dataframe_triples=pd.DataFrame( + data=sorted([(str(s), str(p), str(o)) for s, p, o in Graph().parse(args.kb)], key=lambda x: len(x)), + columns=['subject', 'relation', 'object'], dtype=str), + kwargs_classifier={"random_state": 0}, + max_runtime=args.max_runtime) # dictionary to store the data data = dict() for str_target_concept, examples in settings['problems'].items(): + print('Target concept: ', str_target_concept) p = examples['positive_examples'] n = examples['negative_examples'] - print('\n\n') - print('Target concept: ', str_target_concept) - - # Take p and n, generate Kfolds - kf = StratifiedKFold(n_splits=10, shuffle=False) + kf = StratifiedKFold(n_splits=args.folds, shuffle=True, random_state=args.random_seed) X = np.array(p + n) y = np.array([1.0 for _ in p] + [0.0 for _ in n]) @@ -67,6 +85,7 @@ def dl_concept_learning(args): # () Extract positive and negative examples from train fold train_pos = {pos_individual for pos_individual in X[train_index][y[train_index] == 1]} train_neg = {neg_individual for neg_individual in X[train_index][y[train_index] == 0]} + # Sanity checking for individuals used for training. 
assert train_pos.issubset(examples['positive_examples']) assert train_neg.issubset(examples['negative_examples']) @@ -82,23 +101,20 @@ def dl_concept_learning(args): neg=set(map(OWLNamedIndividual, map(IRI.create, train_neg)))) test_lp = PosNegLPStandard(pos=set(map(OWLNamedIndividual, map(IRI.create, test_pos))), - neg=set(map(OWLNamedIndividual, map(IRI.create, test_neg)))) - + neg=set(map(OWLNamedIndividual, map(IRI.create, test_neg)))) print("OCEL starts..", end="\t") start_time = time.time() - model = OCEL(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), max_runtime=args.max_runtime) - pred_ocel = model.fit(train_lp).best_hypotheses(n=1) + pred_ocel = ocel.fit(train_lp).best_hypotheses(n=1) rt_ocel = time.time() - start_time print("OCEL ends..", end="\t") - # () Quality on the training data train_f1_ocel = compute_f1_score(individuals={i for i in kb.individuals(pred_ocel.concept)}, - pos=train_lp.pos, - neg=train_lp.neg) + pos=train_lp.pos, + neg=train_lp.neg) # () Quality on test data test_f1_ocel = compute_f1_score(individuals={i for i in kb.individuals(pred_ocel.concept)}, - pos=test_lp.pos, - neg=test_lp.neg) + pos=test_lp.pos, + neg=test_lp.neg) # Reporting data.setdefault("Train-F1-OCEL", []).append(train_f1_ocel) data.setdefault("Test-F1-OCEL", []).append(test_f1_ocel) @@ -107,23 +123,19 @@ def dl_concept_learning(args): print(f"OCEL Test Quality: {test_f1_ocel:.3f}", end="\t") print(f"OCEL Runtime: {rt_ocel:.3f}") - - print("CELOE starts..", end="\t") start_time = time.time() - model = CELOE(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), max_runtime=args.max_runtime) - pred_celoe = model.fit(train_lp).best_hypotheses(n=1) + pred_celoe = celoe.fit(train_lp).best_hypotheses(n=1) rt_celoe = time.time() - start_time print("CELOE ends..", end="\t") - # () Quality on the training data train_f1_celoe = compute_f1_score(individuals={i for i in kb.individuals(pred_celoe.concept)}, - pos=train_lp.pos, - neg=train_lp.neg) + 
pos=train_lp.pos, + neg=train_lp.neg) # () Quality on test data test_f1_celoe = compute_f1_score(individuals={i for i in kb.individuals(pred_celoe.concept)}, - pos=test_lp.pos, - neg=test_lp.neg) + pos=test_lp.pos, + neg=test_lp.neg) # Reporting data.setdefault("Train-F1-CELOE", []).append(train_f1_celoe) data.setdefault("Test-F1-CELOE", []).append(test_f1_celoe) @@ -132,12 +144,14 @@ def dl_concept_learning(args): print(f"CELOE Test Quality: {test_f1_celoe:.3f}", end="\t") print(f"CELOE Runtime: {rt_celoe:.3f}") - - print("Evo starts..", end="\t") start_time = time.time() - model = EvoLearner(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), max_runtime=args.max_runtime) - pred_evo = model.fit(train_lp).best_hypotheses(n=1) + # BUG: Evolearner needs to be intialized for each learning problem + evolearner = EvoLearner(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), + max_runtime=args.max_runtime, + use_data_properties=False, + use_inverse=False, use_card_restrictions=False) + pred_evo = evolearner.fit(train_lp).best_hypotheses(n=1) rt_evo = time.time() - start_time print("Evo ends..", end="\t") @@ -157,22 +171,20 @@ def dl_concept_learning(args): print(f"Evo Test Quality: {test_f1_evo:.3f}", end="\t") print(f"Evo Runtime: {rt_evo:.3f}") - print("DRILL starts..", end="\t") start_time = time.time() - model = Drill(knowledge_base=KnowledgeBase(path=args.kb), path_pretrained_kge=args.path_pretrained_kge,quality_func=F1(), max_runtime=args.max_runtime) - pred_drill = model.fit(train_lp).best_hypotheses(n=1) + pred_drill = drill.fit(train_lp).best_hypotheses(n=1) rt_drill = time.time() - start_time print("DRILL ends..", end="\t") # () Quality on the training data train_f1_drill = compute_f1_score(individuals={i for i in kb.individuals(pred_drill.concept)}, - pos=train_lp.pos, - neg=train_lp.neg) + pos=train_lp.pos, + neg=train_lp.neg) # () Quality on test data test_f1_drill = compute_f1_score(individuals={i for i in 
kb.individuals(pred_drill.concept)}, - pos=test_lp.pos, - neg=test_lp.neg) + pos=test_lp.pos, + neg=test_lp.neg) # Reporting data.setdefault("Train-F1-DRILL", []).append(train_f1_drill) data.setdefault("Test-F1-DRILL", []).append(test_f1_drill) @@ -180,18 +192,13 @@ def dl_concept_learning(args): print(f"DRILL Train Quality: {train_f1_drill:.3f}", end="\t") print(f"DRILL Test Quality: {test_f1_drill:.3f}", end="\t") print(f"DRILL Runtime: {rt_drill:.3f}") - print("TDL starts..", end="\t") start_time = time.time() - model = TDL(knowledge_base=KnowledgeBase(path=args.kb), dataframe_triples=pd.DataFrame( - data=[(str(s), str(p), str(o)) for s, p, o in Graph().parse(args.kb)], - columns=['subject', 'relation', 'object'], dtype=str).sort_values('subject'), - kwargs_classifier={"criterion": "gini", "random_state": 0}, - max_runtime=args.max_runtime) # () Fit model training dataset - pred_tdl = model.fit(train_lp).best_hypotheses(n=1) + pred_tdl = tdl.fit(train_lp).best_hypotheses(n=1) print("TDL ends..", end="\t") rt_tdl = time.time() - start_time + # () Quality on the training data train_f1_tdl = compute_f1_score(individuals={i for i in kb.individuals(pred_tdl)}, pos=train_lp.pos, @@ -216,33 +223,12 @@ def dl_concept_learning(args): if __name__ == '__main__': parser = argparse.ArgumentParser(description='Description Logic Concept Learning') - - parser.add_argument("--max_runtime", type=int, default=1) - parser.add_argument("--lps", type=str, required=True) - parser.add_argument("--kb", type=str, required=True) + parser.add_argument("--max_runtime", type=int, default=10, help="Max runtime") + parser.add_argument("--lps", type=str, required=True, help="Path fto the learning problems") + parser.add_argument("--folds", type=int, default=10, help="Number of folds of cross validation.") + parser.add_argument("--kb", type=str, required=True, + help="Knowledge base") parser.add_argument("--path_pretrained_kge", type=str, default=None) parser.add_argument("--report", type=str, 
default="report.csv") - dl_concept_learning(parser.parse_args()) - - -""" -# Benchmarking: Run a bash script tdl_stratified_kfold_cv_experiments.sh with the followings - -mkdir CVFamilyBenchmarkResults -python examples/concept_learning_cv_evaluation.py --lps LPs/Family/lps.json --kb KGs/Family/family.owl --max_runtime 60 --report cv_family_results.csv && mv cv_family_results.csv CVFamilyBenchmarkResults -mkdir CVMutagenesisBenchmarkResults -python examples/concept_learning_cv_evaluation.py --lps LPs/Mutagenesis/lps.json --kb KGs/Mutagenesis/mutagenesis.owl --max_runtime 60 --report cv_mutagenesis_results.csv && mv cv_mutagenesis_results.csv CVMutagenesisBenchmarkResults -mkdir CVCarcinogenesisBenchmarkResults -python examples/concept_learning_cv_evaluation.py --lps LPs/Carcinogenesis/lps.json --kb KGs/Carcinogenesis/carcinogenesis.owl --max_runtime 60 --report cv_carcinogenesis_results.csv && mv cv_carcinogenesis_results.csv CVCarcinogenesisBenchmarkResults - -#Anaylsing results -import pandas as pd -pd.set_option("display.precision", 3) -pd.set_option('display.max_columns', None) -path="CVCarcinogenesisBenchmarkResults/cv_carcinogenesis_results.csv" -df = pd.read_csv(path, index_col=0) -df_mean_by_lp = df.groupby(by=df.index).mean() -filter_col = [col for col in df if col.startswith('Test-F1') or col.startswith('RT')] -print(df_mean_by_lp[filter_col]) -print(df_mean_by_lp[filter_col].to_latex(index=True, formatters={"name": str.upper}, float_format="{:.1f}".format)) -""" \ No newline at end of file + parser.add_argument("--random_seed", type=int, default=1) + dl_concept_learning(parser.parse_args()) \ No newline at end of file diff --git a/examples/concept_learning_evaluation.py b/examples/concept_learning_evaluation.py index 84013529..64083d3e 100644 --- a/examples/concept_learning_evaluation.py +++ b/examples/concept_learning_evaluation.py @@ -1,3 +1,12 @@ +""" +Fitting DL Concept Learning Algorithms: + +Given E^+ and E^-, a learner finds a concept H and F1 
score is computed w.r.t. E^+, E^-, and R(H) retrieval of H. + +python examples/concept_learning_evaluation.py --lps LPs/Family/lps.json --kb KGs/Family/family.owl --max_runtime 30 --report family.csv + +""" + import json import os import time @@ -44,6 +53,18 @@ def dl_concept_learning(args): kb = KnowledgeBase(path=args.kb) + ocel = OCEL(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime) + celoe = CELOE(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime) + drill = Drill(knowledge_base=KnowledgeBase(path=args.kb), + path_pretrained_kge=args.path_pretrained_kge, + quality_func=F1(), + max_runtime=args.max_runtime) + tdl = TDL(knowledge_base=KnowledgeBase(path=args.kb), + dataframe_triples=pd.DataFrame( + data=sorted([(str(s), str(p), str(o)) for s, p, o in Graph().parse(args.kb)], key=lambda x: len(x)), + columns=['subject', 'relation', 'object'], dtype=str), + kwargs_classifier={"random_state": 0}, + max_runtime=args.max_runtime) # dictionary to store the data data = dict() for str_target_concept, examples in settings['problems'].items(): @@ -59,9 +80,8 @@ def dl_concept_learning(args): lp = PosNegLPStandard(pos=typed_pos, neg=typed_neg) print("OCEL starts..", end="\t") - model = OCEL(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), max_runtime=args.max_runtime) start_time = time.time() - pred_ocel = model.fit(lp).best_hypotheses(n=1) + pred_ocel = ocel.fit(lp).best_hypotheses(n=1) print("OCEL ends..", end="\t") rt_ocel = time.time() - start_time f1_ocel = compute_f1_score(individuals={i for i in kb.individuals(pred_ocel.concept)}, pos=lp.pos, neg=lp.neg) @@ -71,9 +91,8 @@ def dl_concept_learning(args): print(f"OCEL Runtime: {rt_ocel:.3f}") print("CELOE starts..", end="\t") - model = CELOE(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), max_runtime=args.max_runtime) start_time = time.time() - pred_celoe = model.fit(lp).best_hypotheses(n=1) + pred_celoe = celoe.fit(lp).best_hypotheses(n=1) print("CELOE 
Ends..", end="\t") rt_celoe = time.time() - start_time f1_celoe = compute_f1_score(individuals={i for i in kb.individuals(pred_celoe.concept)}, pos=lp.pos, neg=lp.neg) @@ -83,9 +102,10 @@ def dl_concept_learning(args): print(f"CELOE Runtime: {rt_celoe:.3f}") print("Evo starts..", end="\t") - model = EvoLearner(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), max_runtime=args.max_runtime) start_time = time.time() - pred_evo = model.fit(lp).best_hypotheses(n=1) + # Evolearner has a bug and KB needs to be reloaded + evo = EvoLearner(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), max_runtime=args.max_runtime) + pred_evo = evo.fit(lp).best_hypotheses(n=1) print("Evo ends..", end="\t") rt_evo = time.time() - start_time f1_evo = compute_f1_score(individuals={i for i in kb.individuals(pred_evo.concept)}, pos=lp.pos, neg=lp.neg) @@ -96,11 +116,7 @@ def dl_concept_learning(args): print("DRILL starts..", end="\t") start_time = time.time() - model = Drill(knowledge_base=KnowledgeBase(path=args.kb), - path_pretrained_kge=args.path_pretrained_kge, - quality_func=F1(), - max_runtime=args.max_runtime) - pred_drill = model.fit(lp).best_hypotheses(n=1) + pred_drill = drill.fit(lp).best_hypotheses(n=1) print("DRILL ends..", end="\t") rt_drill = time.time() - start_time f1_drill = compute_f1_score(individuals=set(kb.individuals(pred_drill.concept)), pos=lp.pos, neg=lp.neg) @@ -111,21 +127,20 @@ def dl_concept_learning(args): print("TDL starts..", end="\t") start_time = time.time() - model = TDL(knowledge_base=KnowledgeBase(path=args.kb), dataframe_triples=pd.DataFrame( - data=[(str(s), str(p), str(o)) for s, p, o in Graph().parse(args.kb)], - columns=['subject', 'relation', 'object'], dtype=str).sort_values('subject'), - kwargs_classifier={"criterion": "gini", "random_state": 0}, - max_runtime=args.max_runtime) - pred_tdl = model.fit(lp).best_hypotheses(n=1) + # () Fit model training dataset + pred_tdl = tdl.fit(lp).best_hypotheses(n=1) print("TDL 
ends..", end="\t") rt_tdl = time.time() - start_time - # Compute quality of best prediction - f1_tdl = compute_f1_score(individuals={i for i in kb.individuals(pred_tdl)}, pos=lp.pos, neg=lp.neg) + + # () Quality on the training data + f1_tdl = compute_f1_score(individuals={i for i in kb.individuals(pred_tdl)}, + pos=lp.pos, + neg=lp.neg) + data.setdefault("F1-TDL", []).append(f1_tdl) data.setdefault("RT-TDL", []).append(rt_tdl) print(f"TDL Quality: {f1_tdl:.3f}", end="\t") print(f"TDL Runtime: {rt_tdl:.3f}") - df = pd.DataFrame.from_dict(data) df.to_csv(args.report, index=False) print(df) @@ -134,7 +149,6 @@ def dl_concept_learning(args): if __name__ == '__main__': parser = argparse.ArgumentParser(description='Description Logic Concept Learning') - parser.add_argument("--max_runtime", type=int, default=1) parser.add_argument("--lps", type=str, required=True) parser.add_argument("--kb", type=str, required=True) diff --git a/ontolearn/knowledge_base.py b/ontolearn/knowledge_base.py index d314dea3..9bcfb01a 100644 --- a/ontolearn/knowledge_base.py +++ b/ontolearn/knowledge_base.py @@ -482,9 +482,6 @@ def encode_learning_problem(self, lp: PosNegLPStandard): Return: EncodedPosNegLPStandard: The encoded learning problem. 
""" - - assert len(self.class_hierarchy) > 0 - if lp.all is None: kb_all = self.all_individuals_set() else: diff --git a/ontolearn/learners/tree_learner.py b/ontolearn/learners/tree_learner.py index 2e234853..1e4754e2 100644 --- a/ontolearn/learners/tree_learner.py +++ b/ontolearn/learners/tree_learner.py @@ -1,7 +1,17 @@ +import numpy as np import owlapy.model import pandas as pd - +import requests +import json from ontolearn.knowledge_base import KnowledgeBase +from ontolearn.base import OWLOntologyManager_Owlready2 +from owlapy.model import OWLEquivalentClassesAxiom, OWLOntologyManager, OWLOntology, AddImport, OWLImportsDeclaration, \ + IRI, OWLDataOneOf + +# mv best_pred.owl +# (base) demir@demir:~/Desktop/Softwares/Ontolearn/LD2NL/owl2nl$ ./owl2nl.sh -a ./src/test/resources/best_pred.owl -u false -o ./src/test/resources/family.owl -t json -s test_out.json -m rule +# ./owl2nl.sh -a ./home/demir/Desktop/Softwares/Ontolearn/examples/best_pred.owl -u false -o ./home/demir/Desktop/Softwares/Ontolearn/KGs/Family/family.owl -t json -s test_out.json -m rule + from typing import Dict, Set, Tuple, List, Union, TypeVar, Callable from ontolearn.learning_problem import PosNegLPStandard import collections @@ -13,8 +23,13 @@ OWLObjectAllValuesFrom, \ OWLObjectIntersectionOf, OWLClassExpression, OWLNothing, OWLThing, OWLNaryBooleanClassExpression, \ OWLObjectUnionOf, OWLClass, OWLObjectComplementOf, OWLObjectMaxCardinality, OWLObjectMinCardinality, \ - OWLDataSomeValuesFrom, OWLDatatypeRestriction, OWLLiteral, OWLDataHasValue, OWLObjectHasValue -from owlapy.render import DLSyntaxObjectRenderer + OWLDataSomeValuesFrom, OWLDatatypeRestriction, OWLLiteral, OWLDataHasValue, OWLObjectHasValue, OWLNamedIndividual +from owlapy.render import DLSyntaxObjectRenderer, ManchesterOWLSyntaxOWLObjectRenderer +from sklearn.model_selection import GridSearchCV + +import time + +from sklearn.tree import export_text def is_float(value): @@ -39,7 +54,7 @@ def compute_quality(instances, pos, neg, 
conf_matrix=False, quality_func=None): return f1_score -def extract_cbd(dataframe) -> Dict[str, Set[Tuple[str, str]]]: +def extract_cbd(dataframe) -> Dict[str, List[Tuple[str, str]]]: """ Extract concise bounded description for each entity, where the entity is a subject entity. Create a mapping from a node to out-going edges and connected nodes @@ -50,44 +65,10 @@ def extract_cbd(dataframe) -> Dict[str, Set[Tuple[str, str]]]: data = dict() for i in dataframe.values.tolist(): subject_, predicate_, object_ = i - data.setdefault(subject_, set()).add((predicate_, object_)) + data.setdefault(subject_, []).append((predicate_, object_)) return data -def base_construct_second(cbd_entities: Dict[str, Set[Tuple[str, str]]], individuals: List[str], - feature_names: List[Tuple[str, Union[str, None]]]): - """ Construct a tabular representations from fixed features """ - assert cbd_entities is not None, "No cbd entities" - result = [] - # () Iterate over individuals. - for s in individuals: - # () Initialize an empty row. 
- representation_of_s = [False for _ in feature_names] - for (p, o) in cbd_entities[s]: - """ o can be a IRI or a number a boolean""" - # () if (p,o) not in feature_names, o must be a number - if (p, o) in feature_names: - if o is not None: - idx = feature_names.index((p, o)) - value = True - assert representation_of_s[idx] is False - else: - "Ignore information comes as p,o " - print(p, o) - exit(1) - idx = feature_names.index((p, None)) - value = o - - representation_of_s[idx] = value - result.append(representation_of_s) - result = pd.DataFrame(data=result, index=individuals, columns=feature_names, dtype="category") - # result = pd.DataFrame(data=result, index=individuals, columns=feature_names) - # print("Tabular data representing positive and negative examples:", result.shape) - result = result.loc[:, (result != False).any(axis=0)] - # print("Tabular data representing positive and negative examples after removing uninformative features:",result.shape) - return result - - def explain_inference(clf, X_test, features, only_shared): reports = [] n_nodes = clf.tree_.node_count @@ -153,18 +134,58 @@ def concepts_reducer(concepts: List[OWLClassExpression], reduced_cls: Callable) return dl_concept_path +def compute_f1_score(individuals, pos, neg): + tp = len(pos.intersection(individuals)) + tn = len(neg.difference(individuals)) + + fp = len(neg.intersection(individuals)) + fn = len(pos.difference(individuals)) + + try: + recall = tp / (tp + fn) + except ZeroDivisionError: + return 0 + + try: + precision = tp / (tp + fp) + except ZeroDivisionError: + return 0 + + if precision == 0 or recall == 0: + return 0 + + f_1 = 2 * ((precision * recall) / (precision + recall)) + return f_1 + + class TDL: """Tree-based Description Logic Concept Learner""" - def __init__(self, knowledge_base, dataframe_triples: pd.DataFrame, kwargs_classifier, - on_fly_tabular: bool = True, max_runtime=1): + def __init__(self, knowledge_base, + dataframe_triples: pd.DataFrame, + 
kwargs_classifier:dict, + max_runtime: int = 1, + grid_search_over=None, + report_classification: bool = False, + plot_built_tree: bool = False, + plotembeddings: bool = False): + if grid_search_over is None: + grid_search_over = {'criterion': ["entropy", "gini", "log_loss"], + "splitter": ["random", "best"], + "max_features": [None, "sqrt", "log2"], + "min_samples_leaf": [1, 2, 3, 4, 5, 10], + "max_depth": [1, 2, 3, 4, 5, 10, None]} assert isinstance(dataframe_triples, pd.DataFrame), "dataframe_triples must be a Pandas DataFrame" assert isinstance(knowledge_base, KnowledgeBase), "knowledge_base must be a KnowledgeBase instance" assert len(dataframe_triples) > 0, f"length of the dataframe must be greater than 0:{dataframe_triples.shape}" - # print(f"Knowledge Base: {knowledge_base}") - # print(f"Matrix representation of knowledge base: {dataframe_triples.shape}") + print(f"Knowledge Base: {knowledge_base}") + print(f"Matrix representation of knowledge base: {dataframe_triples.shape}") + self.grid_search_over = grid_search_over self.knowledge_base = knowledge_base self.dataframe_triples = dataframe_triples + self.report_classification = report_classification + self.plot_built_tree = plot_built_tree + self.plotembeddings = plotembeddings # Mappings from string of IRI to named concepts. self.owl_classes_dict = {c.get_iri().as_str(): c for c in self.knowledge_base.get_concepts()} # Mappings from string of IRI to object properties. @@ -173,11 +194,17 @@ def __init__(self, knowledge_base, dataframe_triples: pd.DataFrame, kwargs_class self.owl_data_property_dict = {p.get_iri().as_str(): p for p in self.knowledge_base.get_data_properties()} # Mappings from string of IRI to individuals. self.owl_individuals = {i.get_iri().as_str(): i for i in self.knowledge_base.individuals()} + self.dl_render = DLSyntaxObjectRenderer() + self.manchester_render = ManchesterOWLSyntaxOWLObjectRenderer() # Keyword arguments for sklearn Decision tree. 
+ # Initialize classifier + self.clf = None + self.feature_names = None self.kwargs_classifier = kwargs_classifier self.max_runtime = max_runtime - self.on_fly_tabular = on_fly_tabular - self.best_pred = None + # best pred + self.disjunction_of_conjunctive_concepts = None + self.conjunctive_concepts = None # Remove uninformative triples if exists. # print("Removing uninformative triples...") self.dataframe_triples = self.dataframe_triples[ @@ -188,30 +215,148 @@ def __init__(self, knowledge_base, dataframe_triples: pd.DataFrame, kwargs_class # print(f"Matrix representation of knowledge base: {dataframe_triples.shape}") self.cbd_mapping: Dict[str, Set[Tuple[str, str]]] self.cbd_mapping = extract_cbd(self.dataframe_triples) - self.str_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" # Fix an ordering: Not quite sure whether we needed self.str_individuals = list(self.owl_individuals) - - self.cbd_mapping_entities = {k: v for k, v in self.cbd_mapping.items() if k in self.str_individuals} - # Type info - self.features = [(self.str_type, str_c) for str_c in self.owl_classes_dict.keys()] - # Object Info - self.features.extend([(str_r, i) for i in self.str_individuals for str_r in self.owl_object_property_dict]) - # Data Info. None will be filled by object s.t. 
i str_r, object - self.features.extend([(str_r, None) for str_r in self.owl_data_property_dict]) - # Initialize classifier - self.clf = None - - if self.on_fly_tabular: - # Trade-off between runtime at inference and memory - self.Xraw = None - else: - self.Xraw = base_construct_second(cbd_entities=self.cbd_mapping_entities, - individuals=self.str_individuals, - feature_names=self.features) - - def labeling(self, Xraw, pos, neg, apply_dummy=True): + # An entity to a list of tuples of predicate and objects + self.first_hop = {k: v for k, v in self.cbd_mapping.items() if k in self.str_individuals} + self.types_of_individuals = dict() + + for k, v in self.first_hop.items(): + for relation, tail in v: + if relation == self.str_type: + self.types_of_individuals.setdefault(k, set()).add(tail) + + self.Xraw = None + + def built_sparse_training_data(self, entity_infos: Dict[str, Dict], individuals: List[str], + feature_names: List[Tuple[str, Union[str, None]]]): + """ Construct a tabular representations from fixed features """ + assert entity_infos is not None, "No entity_infos" + result = [] + # () Iterate over individuals. + for s in individuals: + # () Initialize an empty row. + representation_of_s = [0.0 for _ in feature_names] + # All info about s should be in the features. 
+ for relation, hop_info in entity_infos[s].items(): + assert isinstance(relation, str), "Relation must be string" + for t in hop_info: + if isinstance(t, str): + if relation == self.str_type: + assert t in self.owl_classes_dict + # Boolean feature : (type, CLASS): + representation_of_s[feature_names.index((relation, t))] = 1.0 + elif relation == self.owl_object_property_dict: + # Boolean feature : (hasChild, Individual) + assert t in self.str_individuals + representation_of_s[feature_names.index((relation, t))] = 1.0 + elif relation == self.owl_object_property_dict: + # Numerical Feature : (hasCharge, None) + assert t not in self.str_individuals + assert is_float(t) + + print("hereee") + print(s, relation, t) + representation_of_s[feature_names.index((relation, None))] = t + exit(1) + elif isinstance(t, tuple): + if len(t) == 2: + rr, oo = t + if rr in self.owl_data_property_dict: + # Feature : hasSibling, hasCharge, NUMBER + assert is_float(oo) + + representation_of_s[feature_names.index((relation, rr, None))] = eval(oo) + else: + assert rr in self.owl_object_property_dict + assert relation in self.owl_object_property_dict + assert oo in self.owl_classes_dict + representation_of_s[feature_names.index((relation, rr, oo))] = 1.0 + + else: + print(t) + print("ASDAD") + exit(1) + representation_of_s[feature_names.index((relation, *t))] = 1.0 + else: + print("asda") + print(s, relation, t) + print(t) + print("BURASI") + exit(1) + result.append(representation_of_s) + result = pd.DataFrame(data=result, index=individuals, columns=feature_names) # , dtype=np.float32) + # result = result.loc[:, (result != False).any(axis=0)] + + return result + + def construct_hop(self, individuals: List[str]) -> Dict[str, Dict]: + assert len(individuals) == len(set(individuals)), "There are duplicate individuals" + + # () Nested dictionary + hop = dict() + # () Unique features/DL concepts. + features = set() + # () Iterate over individuals. 
+ for s in individuals: + temp = dict() + # () iterate over triples of (s,p,o) + for p, o in self.first_hop[s]: + ##### SAVE FEATURE: (type, PERSON) ##### + if p == self.str_type: + # For example, (hasChild Male). + assert o in self.owl_classes_dict + temp.setdefault(p, set()).add(o) + features.add((p, o)) + else: + # o can be an individual, + # a literal or + # blank node + + # If o is an individual + if o in self.str_individuals: + # () iterate over triples of (o,pp,oo) + for (pp, oo) in self.first_hop[o]: + if pp == self.str_type: + # (s, p=hasChild, o) + # (o, pp=TYPE, oo=Person) + ##### SAVE FEATURE: (hasChild, PERSON) ##### + assert oo in self.owl_classes_dict + temp.setdefault(p, set()).add(oo) + features.add((p, oo)) + else: + # (s, p=hasChild, o) + # (o, pp=hasChild, oo=Person) + # if oo is an individual. + if oo in self.str_individuals: + ##### SAVE FEATURE: (hasChild, married, Father) ##### + for c in self.types_of_individuals[oo]: + temp.setdefault(p, set()).add((pp, c)) + features.add((p, pp, c)) + else: + # oo is or literal + # print(s, p, o) + # print(o, pp, oo) + assert isinstance(eval(oo), float) + assert o in self.str_individuals + assert pp in self.owl_data_property_dict + temp.setdefault(p, set()).add((pp, oo)) + features.add((p, pp, None)) + + else: + # given s, p,32.1 + # Feature (hasBond ?) 
+ # p hasBond 32.1 + + temp.setdefault(p, set()).add(o) + features.add((p, None)) + + hop[s] = temp + return hop, features + + @staticmethod + def labeling(Xraw, pos, neg, apply_dummy=False): """ Labelling """ # (5) Labeling: Label each row/node # Drop "label" if exists @@ -233,63 +378,176 @@ def labeling(self, Xraw, pos, neg, apply_dummy=True): # print(f"Train data shape:{X_train_sparse.shape}") return X_train_sparse, y_train_sparse - def decision_to_owl_class_exp(self, reasoning_step: dict, single_positive_indv): + def decision_to_owl_class_exp(self, reasoning_step: dict): """ """ - # print(f"\t{reasoning_step}") # tail can be individual or class - relation, tail = reasoning_step["feature"] - # from numpy.bool_ to python bool - value = bool(reasoning_step["value"]) - if relation == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type": - if value: - owl_class = self.owl_classes_dict[tail] - # assert self.owl_individuals[single_positive_indv] in self.knowledge_base.individuals(owl_class) + feature = reasoning_step["feature"] + # relation, tail_info = reasoning_step["feature"] + if len(feature) == 2: + relation, tail_info = feature + if relation == self.str_type: + assert isinstance(tail_info, str), "Tail must be a string" + assert tail_info in self.owl_classes_dict, "a defined OWL class" + assert reasoning_step["value"] == 0.0 or reasoning_step["value"] == 1.0 + if bool(reasoning_step["value"]): + owl_class = self.owl_classes_dict[tail_info] + else: + owl_class = self.owl_classes_dict[tail_info].get_object_complement_of() + elif relation in self.owl_data_property_dict: + # To capture this ('http://dl-learner.org/mutagenesis#hasThreeOrMoreFusedRings', None) + print("HEREEEE") + print(relation) + raise RuntimeError("UNCLEAR") else: - owl_class = self.owl_classes_dict[tail].get_object_complement_of() - # assert self.owl_individuals[single_positive_indv] in self.knowledge_base.individuals(owl_class) + rel1, tail = feature + if rel1 in self.owl_object_property_dict: + 
owl_class = OWLObjectSomeValuesFrom(property=self.owl_object_property_dict[rel1], + filler=self.owl_classes_dict[tail]) + else: + owl_class = OWLDataHasValue(property=self.owl_data_property_dict[rel1], value=OWLLiteral(tail)) + + print("WHAT SHOULD BE") + print(feature) + print(reasoning_step["value"]) + raise RuntimeError("UNCLEAR") else: - if tail in self.owl_individuals: - owl_class = OWLObjectHasValue(property=self.owl_object_property_dict[relation], - individual=self.owl_individuals[tail]) - else: - owl_class = OWLDataHasValue(property=self.owl_data_property_dict[relation], value=OWLLiteral(tail)) + assert len(feature) == 3 + rel1, rel2, concept = feature + + if concept is None: + assert rel2 in self.owl_data_property_dict + assert is_float(reasoning_step["value"]) + owl_class = OWLObjectSomeValuesFrom(property=self.owl_object_property_dict[rel1], + filler=OWLDataHasValue(property=self.owl_data_property_dict[rel2], + value=OWLLiteral( + float(reasoning_step["value"])))) + elif rel2 in self.owl_object_property_dict: + filler = OWLObjectSomeValuesFrom(property=self.owl_object_property_dict[rel2], + filler=self.owl_classes_dict[concept]) + owl_class = OWLObjectSomeValuesFrom(property=self.owl_object_property_dict[rel1], filler=filler) + + assert reasoning_step["value"] == 0.0 or reasoning_step["value"] == 1.0 + if bool(reasoning_step["value"]): + pass + else: + owl_class = owl_class.get_object_complement_of() - if value: - pass - # assert self.owl_individuals[single_positive_indv] in self.knowledge_base.individuals(owl_class) else: - owl_class = owl_class.get_object_complement_of() - # assert self.owl_individuals[single_positive_indv] in self.knowledge_base.individuals(owl_class) + + raise RuntimeError("UNCLEAR") + assert rel2 in self.owl_data_property_dict + print(reasoning_step) + + owl_class = OWLObjectSomeValuesFrom(property=self.owl_object_property_dict[rel1], + filler=OWLDataSomeValuesFrom( + property=self.owl_data_property_dict[rel2], + 
def feature_pretify(self):
    """Return a human-readable name for every entry of ``self.feature_names``.

    Each feature is a tuple of IRI strings. The two known namespace prefixes
    are stripped from every element and the elements are concatenated, each
    followed by a single space (the trailing space is kept for backward
    compatibility). ``None`` elements, as in features of the form
    ``(property, None)``, are skipped instead of raising ``AttributeError``.

    :return: list of strings, one per feature, in the order of
        ``self.feature_names``.
    """
    prefixes = ("http://www.benchmark.org/family#",
                "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    pretified_feature_names = []
    for feature in self.feature_names:
        parts = []
        for term in feature:
            if term is None:
                continue
            for prefix in prefixes:
                term = term.replace(prefix, "")
            parts.append(term)
        pretified_feature_names.append("".join(part + " " for part in parts))
    return pretified_feature_names


def plot(self, tree_path='Aunt_Tree.pdf', importance_path='feature_importance.pdf'):
    """Plot the fitted decision tree and the non-zero feature importances.

    Both figures are saved and then shown.

    :param tree_path: file the tree figure is saved to.
    :param importance_path: file the feature-importance bar chart is saved to.
    """
    prefixes = ("http://www.benchmark.org/family#",
                "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    pretified_feature_names = []
    for feature in self.feature_names:
        parts = []
        for term in feature:
            if term is None:
                # Features such as (property, None) have no tail to render.
                continue
            for prefix in prefixes:
                term = term.replace(prefix, "")
            parts.append(term)
        pretified_feature_names.append(parts)

    plt.figure(figsize=(10, 10))
    tree.plot_tree(self.clf, fontsize=10, feature_names=pretified_feature_names,
                   class_names=["Negative", "Positive"],
                   filled=True)
    plt.savefig(tree_path)
    plt.show()

    feature_importance = pd.Series(np.array(self.clf.feature_importances_),
                                   index=[",".join(parts) for parts in pretified_feature_names])
    # Only features the tree actually used are worth plotting.
    feature_importance = feature_importance[feature_importance > 0.0]
    fig, ax = plt.subplots()
    feature_importance.plot.bar(ax=ax)
    ax.set_title("Feature Importance")
    fig.tight_layout()
    plt.savefig(importance_path)
    plt.show()
+ + (4) Construct a set of DL concept for each e \in E^+ + (5) Union (4) + :param lp: The learning problem + :param max_runtime:total runtime of the learning + + """ + assert lp is not None, "Learning problem cannot be None." if max_runtime is not None: self.max_runtime = max_runtime str_pos_examples = [i.get_iri().as_str() for i in lp.pos] str_neg_examples = [i.get_iri().as_str() for i in lp.neg] - if self.on_fly_tabular: - # print("Constructing representations on the fly...") - Xraw = base_construct_second(cbd_entities=self.cbd_mapping_entities, - individuals=str_pos_examples + str_neg_examples, - feature_names=self.features) - X, y = self.labeling(Xraw=Xraw, pos=str_pos_examples, neg=str_neg_examples, apply_dummy=False) - else: - X, y = self.labeling(Xraw=self.Xraw, pos=str_pos_examples, neg=str_neg_examples, apply_dummy=False) + """self.features.extend([(str_r, None) for str_r in self.owl_data_property_dict])""" + # Nested dictionary [inv][relation]: => [] Dict[str, Dict] + hop_info, features = self.construct_hop(str_pos_examples + str_neg_examples) + + # list of tuples having length 2 or 3 + features = list(features) + + Xraw = self.built_sparse_training_data(entity_infos=hop_info, + individuals=str_pos_examples + str_neg_examples, + feature_names=features) + X, y = self.labeling(Xraw=Xraw, pos=str_pos_examples, neg=str_neg_examples) + + if self.plotembeddings: + import umap + print("Fitting") + reducer = umap.UMAP(random_state=1) + embedding = reducer.fit_transform(X) + plt.scatter(embedding[:, 0], embedding[:, 1], + c=["r" if x == 1 else "b" for x in y]) + plt.grid() + plt.gca().set_aspect('equal', 'datalim') + plt.savefig("UMAP_AUNT.pdf") + plt.show() + + if self.grid_search_over is not None: + grid_search = GridSearchCV(tree.DecisionTreeClassifier(**self.kwargs_classifier), + param_grid=self.grid_search_over, cv=10).fit(X.values, y.values) + print(grid_search.best_params_) + self.kwargs_classifier.update(grid_search.best_params_) - # Binaries self.clf = 
tree.DecisionTreeClassifier(**self.kwargs_classifier).fit(X=X.values, y=y.values) - # print("Classification Report: Negatives: -1, Unknowns:0, Positives 1 ") - # print(sklearn.metrics.classification_report(y.values, self.clf.predict(X.values), target_names=None)) - # plt.figure(figsize=(30, 30)) - # tree.plot_tree(self.clf, fontsize=10, feature_names=X.columns.to_list()) - # plt.show() + self.feature_names = X.columns.to_list() + if self.report_classification: + print("Classification Report: Negatives: -1 and Positives 1 ") + print(sklearn.metrics.classification_report(y.values, self.clf.predict(X.values), + target_names=["Negative", "Positive"])) + if self.plot_built_tree: + self.plot() prediction_per_example = [] # () Iterate over E^+ @@ -298,17 +556,140 @@ def fit(self, lp: PosNegLPStandard, max_runtime=None): X_test=X.loc[str_pos_examples].values, features=X.columns.to_list(), only_shared=False), str_pos_examples): - # () Ensure that e \in E^+ is classified as positive - # assert 1 == self.clf.predict(X.loc[pos].values.reshape(1, -1)) - # () Reasoning behind of the prediction of a single positive example. 
def best_hypotheses(self, n=1):
    """Return the learned hypothesis.

    :param n: number of hypotheses requested; only ``1`` is supported.
    :return: ``self.disjunction_of_conjunctive_concepts``, as set by ``fit``.
    """
    assert n == 1, "Only one hypothesis is supported"
    return self.disjunction_of_conjunctive_concepts


def predict(self, X: "List[OWLNamedIndividual]", proba=True) -> np.ndarray:
    """Classify individuals with the fitted decision tree.

    :param X: individuals to classify.
    :param proba: if true, return ``self.clf.predict_proba`` (one row per
        individual); otherwise return ``self.clf.predict`` (one label per
        individual).
    :return: the classifier output; for an empty ``X`` an empty array of the
        matching shape, without calling the classifier.
    """
    if len(X) == 0:
        # Nothing to featurise; answer with an empty result of the right
        # shape instead of forwarding zero samples to the classifier.
        classes = np.asarray(self.clf.classes_)
        return np.empty((0, len(classes))) if proba else classes[:0]

    owl_individuals = [i.get_iri().as_str() for i in X]
    hop_info, _ = self.construct_hop(owl_individuals)
    # Reuse the feature names stored on the instance so the columns line up
    # with what the classifier was trained on.
    Xraw = self.built_sparse_training_data(entity_infos=hop_info,
                                           individuals=owl_individuals,
                                           feature_names=self.feature_names)
    Xraw_numpy = Xraw.values

    if proba:
        return self.clf.predict_proba(Xraw_numpy)
    return self.clf.predict(Xraw_numpy)
@staticmethod
def llm(prompt, llm_name: str):
    """Send ``prompt`` to a local Ollama server and return the generated text.

    The answer is streamed as one JSON object per line; the ``response``
    fragments are concatenated until an object with ``done`` set arrives.

    :param prompt: text sent as the ``prompt`` field.
    :param llm_name: model name, either ``"mistral"`` or ``"llama2"``.
    :return: the concatenated generated text.
    :raises Exception: with the server's message when a streamed object
        contains an ``error`` field.
    """
    assert llm_name in ["mistral", "llama2"]
    # Further generation options are listed in
    # https://github.com/jmorganca/ollama/blob/main/docs/api.md#generate-a-completion
    # NOTE(review): "content" does not look like a documented field of
    # /api/generate (the system prompt is presumably "system") - confirm
    # against the Ollama API before relying on it.
    data = {"model": llm_name,
            "prompt": prompt,
            "content": "You are an expert. Be concise in your answers",
            "options": {
                "seed": 1,
                "temperature": 0.0,
                "repeat_penalty": 1.2,
            }}

    chunks = []
    # The context manager releases the streamed connection even when the
    # loop exits early or raises.
    with requests.post("http://localhost:11434/api/generate", json=data, stream=True) as response:
        response.raise_for_status()
        for line in response.iter_lines():
            if not line:
                # iter_lines may yield empty keep-alive lines; they are not JSON.
                continue
            body = json.loads(line)
            if 'error' in body:
                raise Exception(body['error'])
            chunks.append(body.get('response', ''))
            if body.get('done', False):
                break
    return "".join(chunks)


def verbalize(self):
    """Map every learned conjunctive DL concept into natural language.

    Ensure that Ollama is running at http://localhost:11434/
    (API: https://github.com/jmorganca/ollama/blob/main/docs/api.md#generate-a-completion).
    The concepts are first saved via ``save_best_hypothesis`` under the path
    ``best_pred``; each prompt and each response is printed.
    """
    # Save the best prediction
    self.save_best_hypothesis(concepts=self.conjunctive_concepts, path="best_pred")
    for i in self.conjunctive_concepts:
        prompt = f"Translate this description logic concept into english sentences. Provide no explanations: {self.dl_render.render(i)}"
        print(f"PROMPT:{prompt}")
        full_text_mistral = self.llm(prompt, llm_name="mistral")
        print("RESPONSE:", full_text_mistral)
def save_best_hypothesis(self, concepts: List[OWLClassExpression],
                         path: str = 'Predictions',
                         rdf_format: str = 'rdfxml') -> None:
    """Serialise the given class expressions into an ontology file.

    For every concept a named class is created whose IRI remainder is the
    concept's Manchester rendering, and that class is declared equivalent to
    the concept.

    @TODO: This should be a single static function. We need to refactor it.

    Args:
        concepts: class expressions to store.
        path: Filename base (the ``.owl`` extension is appended).
        rdf_format: Serialisation format. Currently supported: "rdfxml".

    Raises:
        NotImplementedError: if ``rdf_format`` is not "rdfxml".
    """
    if rdf_format != 'rdfxml':
        raise NotImplementedError(f'Format {rdf_format} not implemented.')

    namespace: Final = 'https://dice-research.org/predictions#'
    owl_manager: OWLOntologyManager = OWLOntologyManager_Owlready2()
    target_ontology: OWLOntology = owl_manager.create_ontology(IRI.create(namespace))

    for concept in concepts:
        class_iri = IRI.create(namespace, self.manchester_render.render(concept))
        named_class: OWLClass = OWLClass(class_iri)
        owl_manager.add_axiom(target_ontology, OWLEquivalentClassesAxiom([named_class, concept]))

    destination = IRI.create('file:/' + path + '.owl')
    owl_manager.save_ontology(target_ontology, destination)