diff --git a/analysis/generate_paper.sh b/analysis/generate_paper.sh index 4b830592..51a55870 100755 --- a/analysis/generate_paper.sh +++ b/analysis/generate_paper.sh @@ -59,7 +59,7 @@ main() { # RQ1 # Rscript analysis/paper/performance_distribution.R "${TMP_DIR}/decomposition_scores.csv" "${PAPER_REPOSITORY}/figures/rq1-performance-distribution.pdf" - Rscript analysis/paper/statistical_analysis_untangling_tool.R "${TMP_DIR}/decomposition_scores.csv" "${PAPER_REPOSITORY}/data/rq1.txt" + Rscript src/r/main/statistical_analysis_untangling_tool.R "${TMP_DIR}/decomposition_scores.csv" "${PAPER_REPOSITORY}/data/" Rscript analysis/paper/compare_models.R "${TMP_DIR}/decomposition_scores.csv" "${PAPER_REPOSITORY}/tables/model-comparison.tex" > "${PAPER_REPOSITORY}"/data/model-comparison.txt # diff --git a/analysis/paper/statistical_analysis_untangling_tool.R b/analysis/paper/statistical_analysis_untangling_tool.R deleted file mode 100755 index ccc510e0..00000000 --- a/analysis/paper/statistical_analysis_untangling_tool.R +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env Rscript -# -# Generates statistical analysis for the untangling tools with p-value, Cohen's d, R^2, and model residuals normality test results. -# -# Arguments: -# - 1: The `decomposition_scores.csv` file generated by `decompose.sh`. -# - 1: The path to the file where the results will be saved. -# -# Output: -# The results are saved as text data. The output file contains -# the output of the summary() and cohen.d() functions. 
- -library(librarian) -library(tidyverse) -library(car) -library(ggpubr) -library(lme4) -library(effsize) -library(lmerTest) -library(flexplot) -library(rsq) -shelf(broom) - -args = commandArgs(trailingOnly=TRUE) - -if (length(args)!=2) { - stop("Please provide an input file and output file.", call.=FALSE) -} -inputFile = args[1] -outputFile = args[2] - -data <- read.csv(inputFile, header = FALSE, col.names = c('Project', 'BugID', 'SmartCommit', 'Flexeme', 'FileUntangling')) -data <- subset(data, select = -c(FileUntangling)) -data$BugID <- as_factor(data$BugID) - -# Convert to long format -data_long = pivot_longer(data, cols = 3:4, names_to = 'Tool', values_to = 'Performance') - -# The summary can be interpreted as follows: - -# Intercept row shows whether the baseline treatment (whichever is first) is significantly different from 0. -# The second row, containing the other treatment, shows whether the other treatment is significantly -# different from the intercept. - -model <- lm(Performance ~ Tool, data=data_long) -# Residuals -# It is recommended to look at the residuals to check for normality rather than apply a statistical test. -visualize(model, "residuals") - -sink(outputFile) -summary(model) -#estimates(model_simple) - -tidy(model) - -cohen.d(data_long$Performance[data_long$Tool == "SmartCommit"], data_long$Performance[data_long$Tool == "Flexeme"]) -sink() - diff --git a/src/r/main/statistical_analysis_untangling_tool.R b/src/r/main/statistical_analysis_untangling_tool.R new file mode 100755 index 00000000..a7ab7d39 --- /dev/null +++ b/src/r/main/statistical_analysis_untangling_tool.R @@ -0,0 +1,64 @@ +#!/usr/bin/env Rscript +# +# Run and export the statistical analysis for the untangling tools with p-value, Cohen's d, R^2, and model residuals normality test results. 
+# The statistical analysis results are saved in two files:
+# - `outputDir/untangling_tool_performance_statistical_analysis.txt` contains the summary of the model, ANOVA, Shapiro-Wilk normality test, and Cohen's d.
+# - `outputDir/untangling_tool_performance_residuals.pdf` contains the residual plots of the model.
+#
+# Arguments:
+# - 1: The untangling performance file `decomposition_scores.csv` containing the performance of the untangling tools for a dataset.
+# - 2: The path to the directory where the analysis results will be saved.
+#
+# Output:
+# The text report holds the printed output of summary(), anova(), shapiro.test(), and cohen.d();
+# the PDF holds the plots drawn by visualize(model, "residuals").
+
+args <- commandArgs(trailingOnly = TRUE)
+
+if (length(args) != 2) {
+  stop("Please provide the untangling performance file and the path where to store the results. Example: 'statistical_analysis_untangling_tool.R tool_performance.csv analysis/'", call. = FALSE)
+}
+untanglingPerformanceFile <- args[1]
+outputDir <- args[2]
+
+residuals_plot_file <- file.path(outputDir, "untangling_tool_performance_residuals.pdf")
+untangling_tool_performance_statistical_analysis_file <- file.path(outputDir, "untangling_tool_performance_statistical_analysis.txt")
+
+library(librarian)
+library(tidyverse)
+library(car)
+library(ggpubr)
+library(lme4)
+library(effsize)
+library(lmerTest)
+library(flexplot)
+library(rsq)
+shelf(broom)
+
+data <- read.csv(untanglingPerformanceFile, header = FALSE, col.names = c('Project', 'BugID', 'SmartCommit', 'Flexeme', 'FileUntangling'))
+data <- subset(data, select = -c(FileUntangling))
+data$BugID <- as_factor(data$BugID)
+
+# Convert to long format (one row per Project/BugID/Tool), keeping only SmartCommit and Flexeme to compare.
+data_long <- pivot_longer(data, cols = c('SmartCommit', 'Flexeme'), names_to = 'Tool', values_to = 'Performance')
+
+# The summary can be interpreted as follows:
+# Intercept row shows whether the baseline treatment (whichever is first) is significantly different from 0.
+# The second row, containing the other treatment, shows whether the other treatment is significantly
+# different from the intercept.
+model <- lm(Performance ~ Tool, data = data_long)
+
+# Residuals
+# Visual inspection of the residuals is the recommended normality check; a Shapiro-Wilk test is also written to the report below.
+pdf(residuals_plot_file)
+visualize(model, "residuals")
+dev.off()
+
+# Analysis results: everything printed between sink(file) and sink() is written to the report file.
+sink(untangling_tool_performance_statistical_analysis_file)
+summary(model)
+anova(model)
+shapiro.test(residuals(model))
+cohen.d(data_long$Performance[data_long$Tool == "SmartCommit"], data_long$Performance[data_long$Tool == "Flexeme"])
+sink()
+