Skip to content

Commit

Permalink
0.77.11
Browse files Browse the repository at this point in the history
  • Loading branch information
FBurkhardt committed Jan 15, 2024
1 parent fd90b8f commit 8197dc2
Show file tree
Hide file tree
Showing 5 changed files with 83 additions and 56 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
Changelog
=========

Version 0.77.11
--------------
* added simple target distribution plots

Version 0.77.10
--------------
* show the best and not the last result for multidb
Expand Down
2 changes: 1 addition & 1 deletion nkululeko/constants.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
VERSION="0.77.10"
VERSION="0.77.11"
SAMPLING_RATE = 16000
1 change: 1 addition & 0 deletions nkululeko/multidb.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ def plot_heatmap(results, labels, name, config, datasets):
)
data_s = ", ".join(datasets)
text_file.write(f"{data_s}\n")
colsums = np.array2string(colsums, separator=", ")
text_file.write(f"{colsums}\n")

plt.figure(figsize=(10, 7))
Expand Down
123 changes: 69 additions & 54 deletions nkululeko/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,19 +45,14 @@ def plot_distributions_speaker(self, df):
)
ax.set_ylabel(f"number of speakers")
ax.set_xlabel("number of samples")
fig = ax.figure
# plt.tight_layout()
img_path = f"{fig_dir}{filename}.{self.format}"
plt.savefig(img_path)
plt.close(fig)
glob_conf.report.add_item(
ReportItem(
Header.HEADER_EXPLORE,
"Samples per speaker",
f"Samples per speaker ({df_speakers.shape[0]})",
img_path,
)
self._save_plot(
ax,
"Samples per speaker",
f"Samples per speaker ({df_speakers.shape[0]})",
filename,
"speakers",
)

# fig.clear()
else:
filename = f"samples_value_counts"
Expand All @@ -74,27 +69,47 @@ def plot_distributions_speaker(self, df):
)
ax.set_ylabel(f"number of speakers")
ax.set_xlabel("number of samples")
fig = ax.figure
# plt.tight_layout()
img_path = f"{fig_dir}{filename}.{self.format}"
plt.savefig(img_path)
plt.close(fig)
fig.clear()
glob_conf.report.add_item(
ReportItem(
Header.HEADER_EXPLORE,
"Sample value counts",
f"Samples per speaker ({df_speakers.shape[0]})",
img_path,
)
self._save_plot(
ax,
"Sample value counts",
f"Samples per speaker ({df_speakers.shape[0]})",
filename,
"speakers",
)

self.plot_distributions(df_speakers, type_s="speakers")

def plot_distributions(self, df, type_s="samples"):
fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs
class_label, df = self._check_binning("class_label", df)
attributes = ast.literal_eval(
self.util.config_val("EXPL", "value_counts", False)
)
# always plot the distribution of the main attribute
filename = f"{class_label}_distribution"
if self.util.is_categorical(df[class_label]):
ax = df[class_label].value_counts().plot(kind="bar")
else:
# for continuous variables, also add a discretized version
binned_data = self.util.continuous_to_categorical(df[class_label])
ax = binned_data.value_counts().plot(kind="bar")
filename_binned = f"{class_label}_discreet"
self._save_plot(
ax,
"Sample value counts",
filename_binned,
filename_binned,
type_s,
)
ax = df[class_label].plot(kind="kde")

self._save_plot(
ax,
"Sample value counts",
filename,
filename,
type_s,
)

for att in attributes:
if len(att) == 1:
att1 = att[0]
Expand All @@ -104,7 +119,6 @@ def plot_distributions(self, df, type_s="samples"):
if att1 not in df:
self.util.error(f"unknown feature: {att1}")
att1, df = self._check_binning(att1, df)
class_label, df = self._check_binning("class_label", df)
self.util.debug(f"plotting {att1}")
filename = f"{self.target}-{att1}"
if self.util.is_categorical(df[class_label]):
Expand All @@ -123,18 +137,12 @@ def plot_distributions(self, df, type_s="samples"):
)
else:
ax, caption = self._plot2cont(df, class_label, att1, type_s)
fig = ax.figure
# plt.tight_layout()
img_path = f"{fig_dir}{filename}_{type_s}.{self.format}"
plt.savefig(img_path)
plt.close(fig)
glob_conf.report.add_item(
ReportItem(
Header.HEADER_EXPLORE,
f"Correlation of {self.target} and {att[0]}",
caption,
img_path,
)
self._save_plot(
ax,
caption,
f"Correlation of {self.target} and {att[0]}",
filename,
type_s,
)
# fig.clear() # avoid error
elif len(att) == 2:
Expand Down Expand Up @@ -193,27 +201,34 @@ def plot_distributions(self, df, type_s="samples"):
# class_label = cont, att1 = cont, att2 = cont
ax, caption = self._plot2cont(df, att1, att2, type_s)

fig = ax.figure
# avoid warning
# plt.tight_layout()
img_path = f"{fig_dir}{filename}_{type_s}.{self.format}"
plt.savefig(img_path)
plt.close(fig)
# fig.clear() # avoid error
glob_conf.report.add_item(
ReportItem(
Header.HEADER_EXPLORE,
f"Correlation of {att1} and {att2}",
caption,
img_path,
)
self._save_plot(
ax, caption, f"Correlation of {att1} and {att2}", filename, type_s
)

else:
self.util.error(
"plot value counts: the plot distribution descriptor for"
f" {att} has more than 2 values"
f" {att} has more than 2 values. Perhaps you forgot to state a list of lists?"
)

def _save_plot(self, ax, caption, header, filename, type_s):
    """Save the figure owning ``ax`` to disk and add it to the report.

    The image path is built as
    ``<fig_dir>../<filename>_<type_s>.<self.format>``; the figure is closed
    after saving and a ``ReportItem`` under ``Header.HEADER_EXPLORE`` is
    added to ``glob_conf.report``.

    :param ax: plot object exposing a ``figure`` attribute
    :param caption: passed to ``ReportItem`` as its third argument
    :param header: passed to ``ReportItem`` as its second argument
    :param filename: base name of the image file, without suffix/extension
    :param type_s: suffix appended to the file name (callers in this file
        pass e.g. "samples" or "speakers")
    """
    fig_dir = self.util.get_path("fig_dir") + "../"  # one up because of the runs
    fig = ax.figure
    img_path = f"{fig_dir}{filename}_{type_s}.{self.format}"
    # Save through the figure itself: plt.savefig() writes whatever pyplot
    # regards as the *current* figure, which need not be the one owning ax.
    fig.savefig(img_path)
    # Close explicitly so figures do not accumulate across many plots.
    plt.close(fig)
    glob_conf.report.add_item(
        ReportItem(
            Header.HEADER_EXPLORE,
            header,
            caption,
            img_path,
        )
    )

def _check_binning(self, att, df):
bin_reals_att = eval(self.util.config_val("EXPL", f"{att}.bin_reals", "False"))
if bin_reals_att:
Expand Down
9 changes: 8 additions & 1 deletion nkululeko/utils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def __init__(self, caller=None, has_config=True):
self.error(f"no such file: {self.got_data_roots}")
self.data_roots = configparser.ConfigParser()
self.data_roots.read(self.got_data_roots)
self.debug(f"getting data roots from {self.got_data_roots}")
# self.debug(f"getting data roots from {self.got_data_roots}")

def get_path(self, entry):
"""
Expand Down Expand Up @@ -253,6 +253,13 @@ def config_val_list(self, section, key, default):
return default

def continuous_to_categorical(self, series):
"""
discretize a continuous variable.
uses the labels and bins from the ini if present
:param series: a pandas series
:return: a pandas series with discretized values as categories
"""
try:
bins = ast.literal_eval(self.config["DATA"]["bins"])
labels = ast.literal_eval(self.config["DATA"]["labels"])
Expand Down

0 comments on commit 8197dc2

Please sign in to comment.