Commit 0.84.0
FBurkhardt committed May 3, 2024
1 parent 884ef11 commit 6aab034
Showing 28 changed files with 328 additions and 90 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,11 @@
Changelog
=========

Version 0.84.0
--------------
* added SHAP analysis
* started with finetuning

Version 0.83.3
--------------
* fixed a naming error in trill features that prevented storage of the experiment
4 changes: 3 additions & 1 deletion ini_file.md
@@ -330,7 +330,9 @@
* **dist_type**: type of plot for value counts, either histogram or density estimation (kde)
* dist_type = hist
* **spotlight**: open a web-browser window to inspect the data with the [spotlight software](https://github.com/Renumics/spotlight). Needs package *renumics-spotlight* to be installed!
* spotlight = False
* spotlight = False
* **shap**: compute [SHAP](https://shap.readthedocs.io/en/latest/) values (see the sketch below)
* shap = False
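For illustration, here is a minimal sketch of switching the option on from Python. The file name `exp.ini` and the write-back step are assumptions; the `[EXPL]` section, the `shap` option, and the `eval`-based parsing follow `nkululeko/explore.py` in this commit.

```python
# Hypothetical helper: enable SHAP analysis in an existing nkululeko configuration.
# "exp.ini" is an assumed file name; only EXPL -> shap itself comes from this commit.
import configparser

config = configparser.ConfigParser()
config.read("exp.ini")                  # the project configuration file
if not config.has_section("EXPL"):
    config.add_section("EXPL")
config["EXPL"]["shap"] = "True"         # option values are Python literals

with open("exp.ini", "w") as ini_file:
    config.write(ini_file)

# nkululeko reads the flag as eval(config_val("EXPL", "shap", "False")),
# so "True"/"False" strings are expected.
shap_enabled = eval(config.get("EXPL", "shap", fallback="False"))
print(shap_enabled)                     # -> True
```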
### [PREDICT](#predict)
* **targets**: Speaker/speech characteristics to be predicted by some models
* targets = ['gender', 'age', 'snr', 'arousal', 'valence', 'dominance', 'pesq', 'mos']
2 changes: 1 addition & 1 deletion nkululeko/constants.py
@@ -1,2 +1,2 @@
VERSION="0.83.3"
VERSION="0.84.0"
SAMPLING_RATE = 16000
13 changes: 6 additions & 7 deletions nkululeko/demo.py
@@ -2,8 +2,9 @@
# Demonstration code to use the ML-experiment framework
# Test the loading of a previously trained model and demo mode
# needs the project config file to run before
"""
This script is used to test the loading of a previously trained model and run it in demo mode.
"""This script is used to test the loading of a previously trained model.
And run it in demo mode.
It requires the project config file to be run before.
Usage:
@@ -20,17 +21,15 @@
import configparser
import os

import nkululeko.glob_conf as glob_conf
from nkululeko.constants import VERSION
from nkululeko.experiment import Experiment
import nkululeko.glob_conf as glob_conf
from nkululeko.utils.util import Util


def main(src_dir):
parser = argparse.ArgumentParser(
description="Call the nkululeko DEMO framework.")
parser.add_argument("--config", default="exp.ini",
help="The base configuration")
parser = argparse.ArgumentParser(description="Call the nkululeko DEMO framework.")
parser.add_argument("--config", default="exp.ini", help="The base configuration")
parser.add_argument(
"--file", help="A file that should be processed (16kHz mono wav)"
)
7 changes: 4 additions & 3 deletions nkululeko/demo_predictor.py
@@ -1,18 +1,19 @@
# demo_predictor.py
import os

import audformat
import audiofile
import numpy as np
import pandas as pd

import audformat
import audiofile

import nkululeko.glob_conf as glob_conf
from nkululeko.utils.util import Util


class Demo_predictor:
def __init__(self, model, file, is_list, feature_extractor, label_encoder, outfile):
"""Constructor setting up name and configuration"""
"""Constructor setting up name and configuration."""
self.model = model
self.feature_extractor = feature_extractor
self.label_encoder = label_encoder
27 changes: 15 additions & 12 deletions nkululeko/experiment.py
@@ -5,20 +5,22 @@
import random
import time

import audeer
import audformat
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

import nkululeko.glob_conf as glob_conf
import audeer
import audformat

from nkululeko.data.dataset import Dataset
from nkululeko.data.dataset_csv import Dataset_CSV
from nkululeko.demo_predictor import Demo_predictor
from nkululeko.feat_extract.feats_analyser import FeatureAnalyser
from nkululeko.feature_extractor import FeatureExtractor
from nkululeko.file_checker import FileChecker
from nkululeko.filter_data import DataFilter, filter_min_dur
from nkululeko.filter_data import DataFilter
from nkululeko.filter_data import filter_min_dur
import nkululeko.glob_conf as glob_conf
from nkululeko.plots import Plots
from nkululeko.reporting.report import Report
from nkululeko.runmanager import Runmanager
@@ -101,6 +103,7 @@ def load_datasets(self):
self.got_speaker = True
self.datasets.update({d: data})
self.target = self.util.config_val("DATA", "target", "emotion")
glob_conf.set_target(self.target)
# print target via debug
self.util.debug(f"target: {self.target}")
# print keys/column
@@ -487,11 +490,7 @@ def random_splice(self):
return df_ret

def analyse_features(self, needs_feats):
"""
Do a feature exploration
"""

"""Do a feature exploration."""
plot_feats = eval(
self.util.config_val("EXPL", "feature_distributions", "False")
)
@@ -511,7 +510,7 @@ def analyse_features(self, needs_feats):
f"unknown sample selection specifier {sample_selection}, should"
" be [all | train | test]"
)

self.util.debug(f"sampling selection: {sample_selection}")
if self.util.config_val("EXPL", "value_counts", False):
self.plot_distribution(df_labels)

@@ -537,9 +536,13 @@ def analyse_features(self, needs_feats):
f"unknown sample selection specifier {sample_selection}, should"
" be [all | train | test]"
)
feat_analyser = FeatureAnalyser(sample_selection, df_labels, df_feats)
# check if SHAP features should be analysed
shap = eval(self.util.config_val("EXPL", "shap", "False"))
if shap:
feat_analyser.analyse_shap(self.runmgr.get_best_model())

if plot_feats:
feat_analyser = FeatureAnalyser(sample_selection, df_labels, df_feats)
feat_analyser.analyse()

# check if a scatterplot should be done
@@ -692,7 +695,7 @@ def save(self, filename):
if self.runmgr.modelrunner.model.is_ann():
self.runmgr.modelrunner.model = None
self.util.warn(
"Save experiment: Can't pickle the learning model so saving without it."
"Save experiment: Can't pickle the trained model so saving without it. (it should be stored anyway)"
)
try:
f = open(filename, "wb")
52 changes: 29 additions & 23 deletions nkululeko/explore.py
@@ -12,9 +12,9 @@

def main(src_dir):
parser = argparse.ArgumentParser(
description="Call the nkululeko EXPLORE framework.")
parser.add_argument("--config", default="exp.ini",
help="The base configuration")
description="Call the nkululeko EXPLORE framework."
)
parser.add_argument("--config", default="exp.ini", help="The base configuration")
args = parser.parse_args()
if args.config is not None:
config_file = args.config
@@ -43,28 +43,34 @@ def main(src_dir):
import warnings

warnings.filterwarnings("ignore")

# load the data
expr.load_datasets()

# split into train and test
expr.fill_train_and_tests()
util.debug(
f"train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}")

plot_feats = eval(util.config_val(
"EXPL", "feature_distributions", "False"))
tsne = eval(util.config_val("EXPL", "tsne", "False"))
scatter = eval(util.config_val("EXPL", "scatter", "False"))
spotlight = eval(util.config_val("EXPL", "spotlight", "False"))
model_type = util.config_val("EXPL", "model", False)
plot_tree = eval(util.config_val("EXPL", "plot_tree", "False"))
needs_feats = False
if plot_feats or tsne or scatter or model_type or plot_tree:
# these investigations need features to explore
expr.extract_feats()
try:
# load the experiment
expr.load(f"{util.get_save_name()}")
needs_feats = True
# explore
except FileNotFoundError:
# first time: load the data
expr.load_datasets()

# split into train and test
expr.fill_train_and_tests()
util.debug(
f"train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}"
)

plot_feats = eval(util.config_val("EXPL", "feature_distributions", "False"))
tsne = eval(util.config_val("EXPL", "tsne", "False"))
scatter = eval(util.config_val("EXPL", "scatter", "False"))
spotlight = eval(util.config_val("EXPL", "spotlight", "False"))
shap = eval(util.config_val("EXPL", "shap", "False"))
model_type = util.config_val("EXPL", "model", False)
plot_tree = eval(util.config_val("EXPL", "plot_tree", "False"))
needs_feats = False
if plot_feats or tsne or scatter or model_type or plot_tree or shap:
# these investigations need features to explore
expr.extract_feats()
needs_feats = True
# explore
expr.analyse_features(needs_feats)
expr.store_report()
print("DONE")
33 changes: 33 additions & 0 deletions nkululeko/feat_extract/feats_analyser.py
@@ -40,6 +40,39 @@ def _get_importance(self, model, permutation):
importance = model.feature_importances_
return importance

def analyse_shap(self, model):
"""Shap analysis.
Use the best model from a previous run and analyse feature importance with SHAP.
https://m.mage.ai/how-to-interpret-and-explain-your-machine-learning-models-using-shap-values-471c2635b78e.
"""
import shap

name = "my_shap_values"
if not self.util.exist_pickle(name):

explainer = shap.Explainer(
model.predict_shap,
self.features,
output_names=glob_conf.labels,
algorithm="permutation",
npermutations=5,
)
self.util.debug("computing SHAP values...")
shap_values = explainer(self.features)
self.util.to_pickle(shap_values, name)
else:
shap_values = self.util.from_pickle(name)
plt.tight_layout()
shap.plots.bar(shap_values)
fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs
exp_name = self.util.get_exp_name(only_data=True)
format = self.util.config_val("PLOT", "format", "png")
filename = f"_SHAP_{model.name}"
filename = f"{fig_dir}{exp_name}{filename}.{format}"
plt.savefig(filename)
self.util.debug(f"plotted SHAP feature importance tp {filename}")

def analyse(self):
models = ast.literal_eval(self.util.config_val("EXPL", "model", "['log_reg']"))
model_name = "_".join(models)
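Below is a self-contained sketch of the permutation-based SHAP call introduced in `analyse_shap` above. The logistic-regression model and the toy feature frame are assumptions standing in for the best model and the extracted features; the `shap.Explainer` and `shap.plots.bar` calls mirror the method.

```python
# Sketch only: toy data replaces nkululeko's extracted features and best model.
import pandas as pd
import shap
from sklearn.linear_model import LogisticRegression

# stand-ins for the feature dataframe and labels
X = pd.DataFrame({"feat_1": [0.1, 0.4, 0.3, 0.9], "feat_2": [1.0, 0.2, 0.5, 0.7]})
y = [0, 1, 0, 1]
clf = LogisticRegression().fit(X, y)

# permutation explainer over the model's prediction function, as in analyse_shap()
explainer = shap.Explainer(clf.predict, X, algorithm="permutation")
shap_values = explainer(X)

# aggregated mean |SHAP| per feature; analyse_shap() saves this figure
# to the experiment's fig_dir instead of displaying it
shap.plots.bar(shap_values, show=False)
```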
5 changes: 5 additions & 0 deletions nkululeko/glob_conf.py
@@ -29,3 +29,8 @@ def set_report(report_obj):
def set_labels(labels_obj):
global labels
labels = labels_obj


def set_target(target_obj):
global target
target = target_obj
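A tiny usage sketch of the new module-level target (assumes nkululeko is installed; the value "emotion" is just the default target named in `experiment.py`):

```python
# glob_conf keeps experiment-wide state as module globals; set_target() is called
# once in Experiment.load_datasets() and read back elsewhere (e.g. model_cnn.py).
import nkululeko.glob_conf as glob_conf

glob_conf.set_target("emotion")   # "emotion" is an assumed example value
print(glob_conf.target)           # -> "emotion"
```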
1 change: 1 addition & 0 deletions nkululeko/models/model.py
@@ -20,6 +20,7 @@ class Model:

def __init__(self, df_train, df_test, feats_train, feats_test):
"""Constructor taking the configuration and all dataframes."""
self.name = "undefined"
self.df_train, self.df_test, self.feats_train, self.feats_test = (
df_train,
df_test,
1 change: 1 addition & 0 deletions nkululeko/models/model_bayes.py
@@ -12,3 +12,4 @@ class Bayes_model(Model):
def __init__(self, df_train, df_test, feats_train, feats_test):
super().__init__(df_train, df_test, feats_train, feats_test)
self.clf = GaussianNB() # set up the classifier
self.name = "bayes"
15 changes: 6 additions & 9 deletions nkululeko/models/model_cnn.py
@@ -34,7 +34,8 @@ def __init__(self, df_train, df_test, feats_train, feats_test):
"""Constructor taking the configuration and all dataframes"""
super().__init__(df_train, df_test, feats_train, feats_test)
super().set_model_type("ann")
self.target = glob_conf.config["DATA"]["target"]
self.name = "cnn"
self.target = glob_conf.target
labels = glob_conf.labels
self.class_num = len(labels)
# set up loss criterion
Expand Down Expand Up @@ -86,8 +87,7 @@ def __init__(self, df_train, df_test, feats_train, feats_test):
train_set = self.Dataset_image(
feats_train, df_train, self.target, transformations
)
test_set = self.Dataset_image(
feats_test, df_test, self.target, transformations)
test_set = self.Dataset_image(feats_test, df_test, self.target, transformations)
# Define data loaders
self.trainloader = torch.utils.data.DataLoader(
train_set,
Expand Down Expand Up @@ -140,8 +140,7 @@ def train(self):
losses = []
for images, labels in self.trainloader:
logits = self.model(images.to(self.device))
loss = self.criterion(logits, labels.to(
self.device, dtype=torch.int64))
loss = self.criterion(logits, labels.to(self.device, dtype=torch.int64))
losses.append(loss.item())
self.optimizer.zero_grad()
loss.backward()
Expand Down Expand Up @@ -169,16 +168,14 @@ def evaluate_model(self, model, loader, device):

self.loss_eval = (np.asarray(losses)).mean()
predictions = logits.argmax(dim=1)
uar = recall_score(
targets.numpy(), predictions.numpy(), average="macro")
uar = recall_score(targets.numpy(), predictions.numpy(), average="macro")
return uar, targets, predictions

def predict(self):
_, truths, predictions = self.evaluate_model(
self.model, self.testloader, self.device
)
uar, _, _ = self.evaluate_model(
self.model, self.trainloader, self.device)
uar, _, _ = self.evaluate_model(self.model, self.trainloader, self.device)
report = Reporter(truths, predictions, self.run, self.epoch)
try:
report.result.loss = self.loss
5 changes: 2 additions & 3 deletions nkululeko/models/model_gmm.py
@@ -11,10 +11,9 @@ class GMM_model(Model):

def __init__(self, df_train, df_test, feats_train, feats_test):
super().__init__(df_train, df_test, feats_train, feats_test)
self.name = "gmm"
n_components = int(self.util.config_val("MODEL", "GMM_components", "4"))
covariance_type = self.util.config_val(
"MODEL", "GMM_covariance_type", "full"
)
covariance_type = self.util.config_val("MODEL", "GMM_covariance_type", "full")
self.clf = mixture.GaussianMixture(
n_components=n_components, covariance_type=covariance_type
)
1 change: 1 addition & 0 deletions nkululeko/models/model_knn.py
@@ -11,6 +11,7 @@ class KNN_model(Model):

def __init__(self, df_train, df_test, feats_train, feats_test):
super().__init__(df_train, df_test, feats_train, feats_test)
self.name = "knn"
method = self.util.config_val("MODEL", "KNN_weights", "uniform")
k = int(self.util.config_val("MODEL", "K_val", "5"))
self.clf = KNeighborsClassifier(
1 change: 1 addition & 0 deletions nkululeko/models/model_knn_reg.py
@@ -11,6 +11,7 @@ class KNN_reg_model(Model):

def __init__(self, df_train, df_test, feats_train, feats_test):
super().__init__(df_train, df_test, feats_train, feats_test)
self.name = "knn_reg"
method = self.util.config_val("MODEL", "KNN_weights", "uniform")
k = int(self.util.config_val("MODEL", "K_val", "5"))
self.clf = KNeighborsRegressor(
1 change: 1 addition & 0 deletions nkululeko/models/model_lin_reg.py
@@ -11,4 +11,5 @@ class Lin_reg_model(Model):

def __init__(self, df_train, df_test, feats_train, feats_test):
super().__init__(df_train, df_test, feats_train, feats_test)
self.name = "lin_reg"
self.clf = LinearRegression() # set up the classifier