diff --git a/analyses/hello-clusters/02_compare-clustering-parameters.Rmd b/analyses/hello-clusters/02_compare-clustering-parameters.Rmd
index e5ffb6c5c..6ba7df341 100644
--- a/analyses/hello-clusters/02_compare-clustering-parameters.Rmd
+++ b/analyses/hello-clusters/02_compare-clustering-parameters.Rmd
@@ -361,7 +361,9 @@ umap_plots <- cluster_results_list |>
theme(
axis.ticks = element_blank(),
axis.text = element_blank(),
- legend.position = "bottom"
+ # Ensure legends fit in the figure
+ legend.position = "bottom",
+ legend.key.size = unit(0.2, "cm")
)
}
)
diff --git a/analyses/hello-clusters/02_compare-clustering-parameters.nb.html b/analyses/hello-clusters/02_compare-clustering-parameters.nb.html
index 212e116c5..76257489e 100644
--- a/analyses/hello-clusters/02_compare-clustering-parameters.nb.html
+++ b/analyses/hello-clusters/02_compare-clustering-parameters.nb.html
@@ -11,7 +11,7 @@
-
+
Comparing clustering parameters with rOpenScPCA
@@ -2901,7 +2901,7 @@
Comparing clustering parameters with
rOpenScPCA
Data Lab
-2025-01-13
+2025-01-14
@@ -3040,7 +3040,7 @@ Varying a single clustering parameter
listed below. Clusters will be calculated for all combinations of
parameters values (where applicable); default values that the function
will use for any unspecified parameter values are shown in
-parentheses
+parentheses.
algorithm
: Which clustering algorithm to use
(Louvain)
@@ -3051,8 +3051,8 @@ Varying a single clustering parameter
objective_function
: The objective function to optimize
clusters (CPM; used only with Leiden clustering)
-rOpenScPCA::sweep_clusters()
does allow you to specify
-values for any other parameters.
+rOpenScPCA::sweep_clusters()
does not allow you to
+specify values for any other parameters.
This function will return a list of data frames of clustering
results. Each data frame will have the following columns:
@@ -3066,9 +3066,9 @@ Varying a single clustering parameter
Louvain algorithm while varying the nn
parameter:
-
+
# Define nn parameter values of interest
-nn_values <- seq(10, 30, 10)
+nn_values <- c(10, 20, 30)
# Calculate clusters varying nn, but leaving other parameters at their default values
cluster_results_list <- rOpenScPCA::sweep_clusters(
@@ -3091,7 +3091,8 @@ Varying a single clustering parameter
It can be helpful (although it is not strictly necessary to keep
-track) to name this list by the varied nn
parameter:
+track) to name this list by the varied nn
parameter. In
+this case, we’ll use these names to label plots.
@@ -3173,7 +3174,7 @@ Visualizing clustering results
the UMAPs.
-
+
umap_plots <- cluster_results_list |>
purrr::imap(
\(cluster_df, clustering_name) {
@@ -3196,7 +3197,7 @@ Visualizing clustering results
)
# Print the plots with patchwork::wrap_plots()
-patchwork::wrap_plots(umap_plots)
+patchwork::wrap_plots(umap_plots, ncol = 3)
@@ -3221,31 +3222,19 @@ Silhouette width and neighborhood purity
cluster_results_list
to calculate these quantities.
-
+
cell_metric_list <- cluster_results_list |>
purrr::map(
\(cluster_df) {
# calculate silhouette width
silhouette_df <- rOpenScPCA::calculate_silhouette(pca_matrix, cluster_df)
- # calculate neighbhorhood purity
- purity_df <- rOpenScPCA::calculate_purity(pca_matrix, cluster_df)
-
- # Combine into a single data frame
- dplyr::left_join(silhouette_df, purity_df)
+ # calculate neighborhood purity, starting from silhouette_df
+ rOpenScPCA::calculate_purity(pca_matrix, silhouette_df)
}
- )
-
-
-Joining with `by = join_by(cell_id, cluster, algorithm, weighting, nn,
-resolution)`
-Joining with `by = join_by(cell_id, cluster, algorithm, weighting, nn,
-resolution)`
-Joining with `by = join_by(cell_id, cluster, algorithm, weighting, nn,
-resolution)`
-
-
-# View the first six rows of each clustering result's cell-level QC metrics
+ )
+
+# View the first six rows of each clustering result's cell-level QC metrics
purrr::map(cell_metric_list, head)
@@ -3304,8 +3293,8 @@ Silhouette width and neighborhood purity
nn
column will distinguish among conditions.
-
-cell_metrics_df <- dplyr::bind_rows(cell_metric_list)
+
+cell_metrics_df <- purrr::list_rbind(cell_metric_list)
@@ -3347,7 +3336,10 @@ Silhouette width and neighborhood purity
While there does not appear to be a salient difference among
silhouette width distributions, it does appear that purity is higher
-with a higher nearest neighbors parameter.
+with a higher nearest neighbors parameter. It’s worth noting that this
+trend in purity values is expected: Higher nearest neighbor parameter
+values lead to fewer clusters, and neighborhood purity tends to be
+higher when there are fewer clusters.
Stability
@@ -3357,11 +3349,11 @@
Stability
iteration.
-
+
stability_list <- cluster_results_list |>
purrr::map(
\(cluster_df) {
- nn <- unique(cluster_df$nn)
+ nn <- cluster_df$nn[1] # all rows have the same `nn` parameter, so we'll take the first
# calculate stability, passing in the parameter value used for this iteration
rOpenScPCA::calculate_stability(pca_matrix, cluster_df, nn = nn)
@@ -3375,8 +3367,8 @@ Stability
nn
parameterizations.
-
-stability_df <- dplyr::bind_rows(stability_list)
+
+stability_df <- purrr::list_rbind(stability_list)
ggplot(stability_df) +
aes(x = as.factor(nn), y = ari, fill = as.factor(nn)) +
@@ -3408,10 +3400,10 @@ Varying multiple clustering parameters
examples) and visualize results.
-
+
# Define vectors of parameters to vary
-nn_values <- seq(10, 30, 10)
-res_values <- seq(5, 15, 5) / 10
+nn_values <- c(10, 20, 30)
+res_values <- c(0.5, 1.0, 1.5)
cluster_results_list <- rOpenScPCA::sweep_clusters(
@@ -3442,7 +3434,7 @@ Visualize clusters
UMAP panel title.
-
+
umap_plots <- cluster_results_list |>
purrr::map(
\(cluster_df) {
@@ -3452,7 +3444,7 @@ Visualize clusters
# Create a title for the UMAP with both parameters
umap_title <- glue::glue(
- "nn: {unique(cluster_df$nn)}; res: {unique(cluster_df$resolution)}"
+ "nn: {cluster_df$nn[1]}; res: {cluster_df$resolution[1]}"
)
# Plot the UMAP, colored by the new cluster variable
@@ -3464,16 +3456,18 @@ Visualize clusters
theme(
axis.ticks = element_blank(),
axis.text = element_blank(),
- legend.position = "bottom"
+ # Ensure legends fit in the figure
+ legend.position = "bottom",
+ legend.key.size = unit(0.2, "cm")
)
}
)
# Print the plots with patchwork::wrap_plots()
-patchwork::wrap_plots(umap_plots)
+patchwork::wrap_plots(umap_plots, ncol = 3)
-
+
@@ -3492,36 +3486,26 @@ Neighborhood purity
single data frame.
-
+
purity_df <- cluster_results_list |>
purrr::map(
\(cluster_df) {
rOpenScPCA::calculate_purity(pca_matrix, cluster_df)
}
) |>
- dplyr::bind_rows()
+ purrr::list_rbind()
-We’ll add a column resolution_label
which we’ll use to
-have informative panel titles in the faceted ggplot we make next.
-
-purity_df <- purity_df |>
- dplyr::mutate(resolution_label = glue::glue("Resolution: {resolution}"))
-
-
-
-
-
-
+
ggplot(purity_df) +
aes(x = as.factor(nn), y = purity, fill = as.factor(nn)) +
geom_boxplot() +
scale_fill_brewer(palette = "Pastel2") +
- # facet by resolution
- facet_wrap(vars(resolution_label)) +
+ # facet by resolution, labeling panels with both the resolution column name and value
+ facet_wrap(vars(resolution), labeller = label_both) +
labs(
x = "Number of nearest neighbors",
y = "Neighborhood purity"
@@ -3529,7 +3513,7 @@ Neighborhood purity
theme(legend.position = "none")
-
+
@@ -3542,7 +3526,7 @@ Stability
plot interpretation, and finally make our plot.
-
+
stability_df <- cluster_results_list |>
purrr::map(
\(cluster_df) {
@@ -3558,22 +3542,18 @@ Stability
)
}
) |>
- dplyr::bind_rows()
-
-stability_df <- stability_df |>
- dplyr::mutate(resolution_label = glue::glue("Resolution: {resolution}"))
+ purrr::list_rbind()
-
+
ggplot(stability_df) +
aes(x = as.factor(nn), y = ari, fill = as.factor(nn)) +
geom_boxplot() +
scale_fill_brewer(palette = "Pastel2") +
- # facet by resolution
- facet_wrap(vars(resolution_label)) +
+ facet_wrap(vars(resolution), labeller = label_both) +
labs(
x = "Number of nearest neighbors",
y = "Adjusted Rand Index"
@@ -3581,7 +3561,7 @@ Stability
theme(legend.position = "none")
-
+
@@ -3651,7 +3631,7 @@ Session Info

