Commit 0.84.0
FBurkhardt committed May 3, 2024
1 parent 884ef11 commit 6aab034
Showing 28 changed files with 328 additions and 90 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,11 @@
Changelog
=========

Version 0.84.0
--------------
* added SHAP analysis
* started with finetuning

Version 0.83.3
--------------
* fixed a naming error in trill features that prevented storage of the experiment
4 changes: 3 additions & 1 deletion ini_file.md
@@ -330,7 +330,9 @@
* **dist_type**: type of plot for value counts, either histogram or density estimation (kde)
* dist_type = hist
* **spotlight**: open a web-browser window to inspect the data with the [spotlight software](https://github.com/Renumics/spotlight). Needs package *renumics-spotlight* to be installed!
* spotlight = False
* spotlight = False
* **shap**: compute [SHAP](https://shap.readthedocs.io/en/latest/) values (see the sketch below)
* shap = False
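For illustration, here is a minimal sketch of switching the option on from Python. The file name `exp.ini` and the write-back step are assumptions; the `[EXPL]` section, the `shap` option, and the `eval`-based parsing follow `nkululeko/explore.py` in this commit.

```python
# Hypothetical helper: enable SHAP analysis in an existing nkululeko configuration.
# "exp.ini" is an assumed file name; only EXPL -> shap itself comes from this commit.
import configparser

config = configparser.ConfigParser()
config.read("exp.ini")                  # the project configuration file
if not config.has_section("EXPL"):
    config.add_section("EXPL")
config["EXPL"]["shap"] = "True"         # option values are Python literals

with open("exp.ini", "w") as ini_file:
    config.write(ini_file)

# nkululeko reads the flag as eval(config_val("EXPL", "shap", "False")),
# so "True"/"False" strings are expected.
shap_enabled = eval(config.get("EXPL", "shap", fallback="False"))
print(shap_enabled)                     # -> True
```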
### [PREDICT](#predict)
* **targets**: Speaker/speech characteristics to be predicted by some models
* targets = ['gender', 'age', 'snr', 'arousal', 'valence', 'dominance', 'pesq', 'mos']
2 changes: 1 addition & 1 deletion nkululeko/constants.py
@@ -1,2 +1,2 @@
VERSION="0.83.3"
VERSION="0.84.0"
SAMPLING_RATE = 16000
13 changes: 6 additions & 7 deletions nkululeko/demo.py
@@ -2,8 +2,9 @@
# Demonstration code to use the ML-experiment framework
# Test the loading of a previously trained model and demo mode
# needs the project config file to run before
"""
This script is used to test the loading of a previously trained model and run it in demo mode.
"""This script is used to test the loading of a previously trained model.
And run it in demo mode.
It requires the project config file to be run before.
Usage:
@@ -20,17 +21,15 @@
import configparser
import os

import nkululeko.glob_conf as glob_conf
from nkululeko.constants import VERSION
from nkululeko.experiment import Experiment
import nkululeko.glob_conf as glob_conf
from nkululeko.utils.util import Util


def main(src_dir):
parser = argparse.ArgumentParser(
description="Call the nkululeko DEMO framework.")
parser.add_argument("--config", default="exp.ini",
help="The base configuration")
parser = argparse.ArgumentParser(description="Call the nkululeko DEMO framework.")
parser.add_argument("--config", default="exp.ini", help="The base configuration")
parser.add_argument(
"--file", help="A file that should be processed (16kHz mono wav)"
)
7 changes: 4 additions & 3 deletions nkululeko/demo_predictor.py
@@ -1,18 +1,19 @@
# demo_predictor.py
import os

import audformat
import audiofile
import numpy as np
import pandas as pd

import audformat
import audiofile

import nkululeko.glob_conf as glob_conf
from nkululeko.utils.util import Util


class Demo_predictor:
def __init__(self, model, file, is_list, feature_extractor, label_encoder, outfile):
"""Constructor setting up name and configuration"""
"""Constructor setting up name and configuration."""
self.model = model
self.feature_extractor = feature_extractor
self.label_encoder = label_encoder
27 changes: 15 additions & 12 deletions nkululeko/experiment.py
@@ -5,20 +5,22 @@
import random
import time

import audeer
import audformat
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

import nkululeko.glob_conf as glob_conf
import audeer
import audformat

from nkululeko.data.dataset import Dataset
from nkululeko.data.dataset_csv import Dataset_CSV
from nkululeko.demo_predictor import Demo_predictor
from nkululeko.feat_extract.feats_analyser import FeatureAnalyser
from nkululeko.feature_extractor import FeatureExtractor
from nkululeko.file_checker import FileChecker
from nkululeko.filter_data import DataFilter, filter_min_dur
from nkululeko.filter_data import DataFilter
from nkululeko.filter_data import filter_min_dur
import nkululeko.glob_conf as glob_conf
from nkululeko.plots import Plots
from nkululeko.reporting.report import Report
from nkululeko.runmanager import Runmanager
@@ -101,6 +103,7 @@ def load_datasets(self):
self.got_speaker = True
self.datasets.update({d: data})
self.target = self.util.config_val("DATA", "target", "emotion")
glob_conf.set_target(self.target)
# print target via debug
self.util.debug(f"target: {self.target}")
# print keys/column
@@ -487,11 +490,7 @@ def random_splice(self):
return df_ret

def analyse_features(self, needs_feats):
"""
Do a feature exploration
"""

"""Do a feature exploration."""
plot_feats = eval(
self.util.config_val("EXPL", "feature_distributions", "False")
)
@@ -511,7 +510,7 @@ def analyse_features(self, needs_feats):
f"unknown sample selection specifier {sample_selection}, should"
" be [all | train | test]"
)

self.util.debug(f"sampling selection: {sample_selection}")
if self.util.config_val("EXPL", "value_counts", False):
self.plot_distribution(df_labels)

@@ -537,9 +536,13 @@ def analyse_features(self, needs_feats):
f"unknown sample selection specifier {sample_selection}, should"
" be [all | train | test]"
)
feat_analyser = FeatureAnalyser(sample_selection, df_labels, df_feats)
# check if SHAP features should be analysed
shap = eval(self.util.config_val("EXPL", "shap", "False"))
if shap:
feat_analyser.analyse_shap(self.runmgr.get_best_model())

if plot_feats:
feat_analyser = FeatureAnalyser(sample_selection, df_labels, df_feats)
feat_analyser.analyse()

# check if a scatterplot should be done
@@ -692,7 +695,7 @@ def save(self, filename):
if self.runmgr.modelrunner.model.is_ann():
self.runmgr.modelrunner.model = None
self.util.warn(
"Save experiment: Can't pickle the learning model so saving without it."
"Save experiment: Can't pickle the trained model so saving without it. (it should be stored anyway)"
)
try:
f = open(filename, "wb")
52 changes: 29 additions & 23 deletions nkululeko/explore.py
@@ -12,9 +12,9 @@

def main(src_dir):
parser = argparse.ArgumentParser(
description="Call the nkululeko EXPLORE framework.")
parser.add_argument("--config", default="exp.ini",
help="The base configuration")
description="Call the nkululeko EXPLORE framework."
)
parser.add_argument("--config", default="exp.ini", help="The base configuration")
args = parser.parse_args()
if args.config is not None:
config_file = args.config
@@ -43,28 +43,34 @@ def main(src_dir):
import warnings

warnings.filterwarnings("ignore")

# load the data
expr.load_datasets()

# split into train and test
expr.fill_train_and_tests()
util.debug(
f"train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}")

plot_feats = eval(util.config_val(
"EXPL", "feature_distributions", "False"))
tsne = eval(util.config_val("EXPL", "tsne", "False"))
scatter = eval(util.config_val("EXPL", "scatter", "False"))
spotlight = eval(util.config_val("EXPL", "spotlight", "False"))
model_type = util.config_val("EXPL", "model", False)
plot_tree = eval(util.config_val("EXPL", "plot_tree", "False"))
needs_feats = False
if plot_feats or tsne or scatter or model_type or plot_tree:
# these investigations need features to explore
expr.extract_feats()
try:
# load the experiment
expr.load(f"{util.get_save_name()}")
needs_feats = True
# explore
except FileNotFoundError:
# first time: load the data
expr.load_datasets()

# split into train and test
expr.fill_train_and_tests()
util.debug(
f"train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}"
)

plot_feats = eval(util.config_val("EXPL", "feature_distributions", "False"))
tsne = eval(util.config_val("EXPL", "tsne", "False"))
scatter = eval(util.config_val("EXPL", "scatter", "False"))
spotlight = eval(util.config_val("EXPL", "spotlight", "False"))
shap = eval(util.config_val("EXPL", "shap", "False"))
model_type = util.config_val("EXPL", "model", False)
plot_tree = eval(util.config_val("EXPL", "plot_tree", "False"))
needs_feats = False
if plot_feats or tsne or scatter or model_type or plot_tree or shap:
# these investigations need features to explore
expr.extract_feats()
needs_feats = True
# explore
expr.analyse_features(needs_feats)
expr.store_report()
print("DONE")
33 changes: 33 additions & 0 deletions nkululeko/feat_extract/feats_analyser.py
@@ -40,6 +40,39 @@ def _get_importance(self, model, permutation):
importance = model.feature_importances_
return importance

def analyse_shap(self, model):
"""Shap analysis.
Use the best model from a previous run and analyse feature importance with SHAP.
https://m.mage.ai/how-to-interpret-and-explain-your-machine-learning-models-using-shap-values-471c2635b78e.
"""
import shap

name = "my_shap_values"
if not self.util.exist_pickle(name):

explainer = shap.Explainer(
model.predict_shap,
self.features,
output_names=glob_conf.labels,
algorithm="permutation",
npermutations=5,
)
self.util.debug("computing SHAP values...")
shap_values = explainer(self.features)
self.util.to_pickle(shap_values, name)
else:
shap_values = self.util.from_pickle(name)
plt.tight_layout()
shap.plots.bar(shap_values)
fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs
exp_name = self.util.get_exp_name(only_data=True)
format = self.util.config_val("PLOT", "format", "png")
filename = f"_SHAP_{model.name}"
filename = f"{fig_dir}{exp_name}{filename}.{format}"
plt.savefig(filename)
self.util.debug(f"plotted SHAP feature importance tp {filename}")

def analyse(self):
models = ast.literal_eval(self.util.config_val("EXPL", "model", "['log_reg']"))
model_name = "_".join(models)
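Below is a self-contained sketch of the permutation-based SHAP call introduced in `analyse_shap` above. The logistic-regression model and the toy feature frame are assumptions standing in for the best model and the extracted features; the `shap.Explainer` and `shap.plots.bar` calls mirror the method.

```python
# Sketch only: toy data replaces nkululeko's extracted features and best model.
import pandas as pd
import shap
from sklearn.linear_model import LogisticRegression

# stand-ins for the feature dataframe and labels
X = pd.DataFrame({"feat_1": [0.1, 0.4, 0.3, 0.9], "feat_2": [1.0, 0.2, 0.5, 0.7]})
y = [0, 1, 0, 1]
clf = LogisticRegression().fit(X, y)

# permutation explainer over the model's prediction function, as in analyse_shap()
explainer = shap.Explainer(clf.predict, X, algorithm="permutation")
shap_values = explainer(X)

# aggregated mean |SHAP| per feature; analyse_shap() saves this figure
# to the experiment's fig_dir instead of displaying it
shap.plots.bar(shap_values, show=False)
```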
5 changes: 5 additions & 0 deletions nkululeko/glob_conf.py
@@ -29,3 +29,8 @@ def set_report(report_obj):
def set_labels(labels_obj):
global labels
labels = labels_obj


def set_target(target_obj):
global target
target = target_obj
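A tiny usage sketch of the new module-level target (assumes nkululeko is installed; the value "emotion" is just the default target named in `experiment.py`):

```python
# glob_conf keeps experiment-wide state as module globals; set_target() is called
# once in Experiment.load_datasets() and read back elsewhere (e.g. model_cnn.py).
import nkululeko.glob_conf as glob_conf

glob_conf.set_target("emotion")   # "emotion" is an assumed example value
print(glob_conf.target)           # -> "emotion"
```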
1 change: 1 addition & 0 deletions nkululeko/models/model.py
@@ -20,6 +20,7 @@ class Model:

def __init__(self, df_train, df_test, feats_train, feats_test):
"""Constructor taking the configuration and all dataframes."""
self.name = "undefined"
self.df_train, self.df_test, self.feats_train, self.feats_test = (
df_train,
df_test,
1 change: 1 addition & 0 deletions nkululeko/models/model_bayes.py
@@ -12,3 +12,4 @@ class Bayes_model(Model):
def __init__(self, df_train, df_test, feats_train, feats_test):
super().__init__(df_train, df_test, feats_train, feats_test)
self.clf = GaussianNB() # set up the classifier
self.name = "bayes"
15 changes: 6 additions & 9 deletions nkululeko/models/model_cnn.py
@@ -34,7 +34,8 @@ def __init__(self, df_train, df_test, feats_train, feats_test):
"""Constructor taking the configuration and all dataframes"""
super().__init__(df_train, df_test, feats_train, feats_test)
super().set_model_type("ann")
self.target = glob_conf.config["DATA"]["target"]
self.name = "cnn"
self.target = glob_conf.target
labels = glob_conf.labels
self.class_num = len(labels)
# set up loss criterion
Expand Down Expand Up @@ -86,8 +87,7 @@ def __init__(self, df_train, df_test, feats_train, feats_test):
train_set = self.Dataset_image(
feats_train, df_train, self.target, transformations
)
test_set = self.Dataset_image(
feats_test, df_test, self.target, transformations)
test_set = self.Dataset_image(feats_test, df_test, self.target, transformations)
# Define data loaders
self.trainloader = torch.utils.data.DataLoader(
train_set,
Expand Down Expand Up @@ -140,8 +140,7 @@ def train(self):
losses = []
for images, labels in self.trainloader:
logits = self.model(images.to(self.device))
loss = self.criterion(logits, labels.to(
self.device, dtype=torch.int64))
loss = self.criterion(logits, labels.to(self.device, dtype=torch.int64))
losses.append(loss.item())
self.optimizer.zero_grad()
loss.backward()
Expand Down Expand Up @@ -169,16 +168,14 @@ def evaluate_model(self, model, loader, device):

self.loss_eval = (np.asarray(losses)).mean()
predictions = logits.argmax(dim=1)
uar = recall_score(
targets.numpy(), predictions.numpy(), average="macro")
uar = recall_score(targets.numpy(), predictions.numpy(), average="macro")
return uar, targets, predictions

def predict(self):
_, truths, predictions = self.evaluate_model(
self.model, self.testloader, self.device
)
uar, _, _ = self.evaluate_model(
self.model, self.trainloader, self.device)
uar, _, _ = self.evaluate_model(self.model, self.trainloader, self.device)
report = Reporter(truths, predictions, self.run, self.epoch)
try:
report.result.loss = self.loss
5 changes: 2 additions & 3 deletions nkululeko/models/model_gmm.py
@@ -11,10 +11,9 @@ class GMM_model(Model):

def __init__(self, df_train, df_test, feats_train, feats_test):
super().__init__(df_train, df_test, feats_train, feats_test)
self.name = "gmm"
n_components = int(self.util.config_val("MODEL", "GMM_components", "4"))
covariance_type = self.util.config_val(
"MODEL", "GMM_covariance_type", "full"
)
covariance_type = self.util.config_val("MODEL", "GMM_covariance_type", "full")
self.clf = mixture.GaussianMixture(
n_components=n_components, covariance_type=covariance_type
)
1 change: 1 addition & 0 deletions nkululeko/models/model_knn.py
@@ -11,6 +11,7 @@ class KNN_model(Model):

def __init__(self, df_train, df_test, feats_train, feats_test):
super().__init__(df_train, df_test, feats_train, feats_test)
self.name = "knn"
method = self.util.config_val("MODEL", "KNN_weights", "uniform")
k = int(self.util.config_val("MODEL", "K_val", "5"))
self.clf = KNeighborsClassifier(
1 change: 1 addition & 0 deletions nkululeko/models/model_knn_reg.py
@@ -11,6 +11,7 @@ class KNN_reg_model(Model):

def __init__(self, df_train, df_test, feats_train, feats_test):
super().__init__(df_train, df_test, feats_train, feats_test)
self.name = "knn_reg"
method = self.util.config_val("MODEL", "KNN_weights", "uniform")
k = int(self.util.config_val("MODEL", "K_val", "5"))
self.clf = KNeighborsRegressor(
1 change: 1 addition & 0 deletions nkululeko/models/model_lin_reg.py
@@ -11,4 +11,5 @@ class Lin_reg_model(Model):

def __init__(self, df_train, df_test, feats_train, feats_test):
super().__init__(df_train, df_test, feats_train, feats_test)
self.name = "lin_reg"
self.clf = LinearRegression() # set up the classifier