0.77.11

bagustris · Jan 15, 2024 · 8197dc2 · 8197dc2
1 parent fd90b8f
commit 8197dc2
Show file tree

Hide file tree

Showing 5 changed files with 83 additions and 56 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,10 @@
 Changelog
 =========
 
+Version 0.77.11
+--------------
+* added simple target distribution plots
+
 Version 0.77.10
 --------------
 * show the best and not the last result for multidb

diff --git a/nkululeko/constants.py b/nkululeko/constants.py
@@ -1,2 +1,2 @@
-VERSION="0.77.10"
+VERSION="0.77.11"
 SAMPLING_RATE = 16000
diff --git a/nkululeko/multidb.py b/nkululeko/multidb.py
@@ -97,6 +97,7 @@ def plot_heatmap(results, labels, name, config, datasets):
         )
         data_s = ", ".join(datasets)
         text_file.write(f"{data_s}\n")
+        colsums = np.array2string(colsums, separator=", ")
         text_file.write(f"{colsums}\n")
 
     plt.figure(figsize=(10, 7))

diff --git a/nkululeko/plots.py b/nkululeko/plots.py
@@ -45,19 +45,14 @@ def plot_distributions_speaker(self, df):
             )
             ax.set_ylabel(f"number of speakers")
             ax.set_xlabel("number of samples")
-            fig = ax.figure
-            # plt.tight_layout()
-            img_path = f"{fig_dir}{filename}.{self.format}"
-            plt.savefig(img_path)
-            plt.close(fig)
-            glob_conf.report.add_item(
-                ReportItem(
-                    Header.HEADER_EXPLORE,
-                    "Samples per speaker",
-                    f"Samples per speaker ({df_speakers.shape[0]})",
-                    img_path,
-                )
+            self._save_plot(
+                ax,
+                "Samples per speaker",
+                f"Samples per speaker ({df_speakers.shape[0]})",
+                filename,
+                "speakers",
             )
+
             # fig.clear()
         else:
             filename = f"samples_value_counts"
@@ -74,27 +69,47 @@ def plot_distributions_speaker(self, df):
             )
             ax.set_ylabel(f"number of speakers")
             ax.set_xlabel("number of samples")
-            fig = ax.figure
-            # plt.tight_layout()
-            img_path = f"{fig_dir}{filename}.{self.format}"
-            plt.savefig(img_path)
-            plt.close(fig)
-            fig.clear()
-            glob_conf.report.add_item(
-                ReportItem(
-                    Header.HEADER_EXPLORE,
-                    "Sample value counts",
-                    f"Samples per speaker ({df_speakers.shape[0]})",
-                    img_path,
-                )
+            self._save_plot(
+                ax,
+                "Sample value counts",
+                f"Samples per speaker ({df_speakers.shape[0]})",
+                filename,
+                "speakers",
             )
+
         self.plot_distributions(df_speakers, type_s="speakers")
 
     def plot_distributions(self, df, type_s="samples"):
-        fig_dir = self.util.get_path("fig_dir") + "../"  # one up because of the runs
+        class_label, df = self._check_binning("class_label", df)
         attributes = ast.literal_eval(
             self.util.config_val("EXPL", "value_counts", False)
         )
+        # always plot the distribution of the main attribute
+        filename = f"{class_label}_distribution"
+        if self.util.is_categorical(df[class_label]):
+            ax = df[class_label].value_counts().plot(kind="bar")
+        else:
+            # for continuous variables, also add a discretized version
+            binned_data = self.util.continuous_to_categorical(df[class_label])
+            ax = binned_data.value_counts().plot(kind="bar")
+            filename_binned = f"{class_label}_discreet"
+            self._save_plot(
+                ax,
+                "Sample value counts",
+                filename_binned,
+                filename_binned,
+                type_s,
+            )
+            ax = df[class_label].plot(kind="kde")
+
+        self._save_plot(
+            ax,
+            "Sample value counts",
+            filename,
+            filename,
+            type_s,
+        )
+
         for att in attributes:
             if len(att) == 1:
                 att1 = att[0]
@@ -104,7 +119,6 @@ def plot_distributions(self, df, type_s="samples"):
                 if att1 not in df:
                     self.util.error(f"unknown feature: {att1}")
                 att1, df = self._check_binning(att1, df)
-                class_label, df = self._check_binning("class_label", df)
                 self.util.debug(f"plotting {att1}")
                 filename = f"{self.target}-{att1}"
                 if self.util.is_categorical(df[class_label]):
@@ -123,18 +137,12 @@ def plot_distributions(self, df, type_s="samples"):
                         )
                     else:
                         ax, caption = self._plot2cont(df, class_label, att1, type_s)
-                fig = ax.figure
-                # plt.tight_layout()
-                img_path = f"{fig_dir}{filename}_{type_s}.{self.format}"
-                plt.savefig(img_path)
-                plt.close(fig)
-                glob_conf.report.add_item(
-                    ReportItem(
-                        Header.HEADER_EXPLORE,
-                        f"Correlation of {self.target} and {att[0]}",
-                        caption,
-                        img_path,
-                    )
+                self._save_plot(
+                    ax,
+                    caption,
+                    f"Correlation of {self.target} and {att[0]}",
+                    filename,
+                    type_s,
                 )
                 # fig.clear()           # avoid error
             elif len(att) == 2:
@@ -193,27 +201,34 @@ def plot_distributions(self, df, type_s="samples"):
                             # class_label = cont, att1 = cont, att2 = cont
                             ax, caption = self._plot2cont(df, att1, att2, type_s)
 
-                fig = ax.figure
-                # avoid warning
-                # plt.tight_layout()
-                img_path = f"{fig_dir}{filename}_{type_s}.{self.format}"
-                plt.savefig(img_path)
-                plt.close(fig)
-                # fig.clear()   # avoid error
-                glob_conf.report.add_item(
-                    ReportItem(
-                        Header.HEADER_EXPLORE,
-                        f"Correlation of {att1} and {att2}",
-                        caption,
-                        img_path,
-                    )
+                self._save_plot(
+                    ax, caption, f"Correlation of {att1} and {att2}", filename, type_s
                 )
+
             else:
                 self.util.error(
                     "plot value counts: the plot distribution descriptor for"
-                    f" {att} has more than 2 values"
+                    f" {att} has more than 2 values. Perhaps you forgot to state a list of lists?"
                 )
 
+    def _save_plot(self, ax, caption, header, filename, type_s):
+        fig_dir = self.util.get_path("fig_dir") + "../"  # one up because of the runs
+        fig = ax.figure
+        # avoid warning
+        # plt.tight_layout()
+        img_path = f"{fig_dir}{filename}_{type_s}.{self.format}"
+        plt.savefig(img_path)
+        plt.close(fig)
+        # fig.clear()   # avoid error
+        glob_conf.report.add_item(
+            ReportItem(
+                Header.HEADER_EXPLORE,
+                header,
+                caption,
+                img_path,
+            )
+        )
+
     def _check_binning(self, att, df):
         bin_reals_att = eval(self.util.config_val("EXPL", f"{att}.bin_reals", "False"))
         if bin_reals_att:

diff --git a/nkululeko/utils/util.py b/nkululeko/utils/util.py
@@ -40,7 +40,7 @@ def __init__(self, caller=None, has_config=True):
                     self.error(f"no such file: {self.got_data_roots}")
                 self.data_roots = configparser.ConfigParser()
                 self.data_roots.read(self.got_data_roots)
-                self.debug(f"getting data roots from {self.got_data_roots}")
+                # self.debug(f"getting data roots from {self.got_data_roots}")
 
     def get_path(self, entry):
         """
@@ -253,6 +253,13 @@ def config_val_list(self, section, key, default):
             return default
 
     def continuous_to_categorical(self, series):
+        """
+        discretize a continuous variable.
+        uses the labels and bins from the ini if present
+
+        :param series: a pandas series
+        :return: a pandas series with discretized values as categories
+        """
         try:
             bins = ast.literal_eval(self.config["DATA"]["bins"])
             labels = ast.literal_eval(self.config["DATA"]["labels"])