From 2260f381839bff5d6ae32c2df204419bed2b7921 Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Tue, 16 Mar 2021 18:28:46 +0100 Subject: [PATCH 01/26] add TODO comments to all parts that need changing --- .dev/compare_branches.R | 2 +- R/cache.R | 2 +- R/get_source_expressions.R | 4 ++-- tests/testthat/test-cache.R | 2 ++ tests/testthat/test-exclusions.R | 2 ++ tests/testthat/test-get_source_expressions.R | 2 ++ 6 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.dev/compare_branches.R b/.dev/compare_branches.R index 6124308a8..fa5824334 100755 --- a/.dev/compare_branches.R +++ b/.dev/compare_branches.R @@ -146,7 +146,7 @@ test_encoding <- function(dir) { tryCatch({ lapply( list.files(dir, pattern = "(?i)\\.r(?:md)?$", recursive = TRUE, full.names = TRUE), - function(x) nchar(readLines(x, warn = FALSE)) + function(x) nchar(readLines(x, warn = FALSE)) # TODO respect encoding ) FALSE }, error = function(x) TRUE) diff --git a/R/cache.R b/R/cache.R index 7a412be9e..9a517e1d8 100644 --- a/R/cache.R +++ b/R/cache.R @@ -119,7 +119,7 @@ digest_content <- function(linters, obj) { list(linters, obj$content, is.null(obj$parsed_content)) } else { # assume a filename - list(linters, readLines(obj)) + list(linters, readLines(obj)) # TODO respect encoding } digest::digest(content, algo = "sha1") } diff --git a/R/get_source_expressions.R b/R/get_source_expressions.R index 49031f9d3..1ad8d3f5c 100644 --- a/R/get_source_expressions.R +++ b/R/get_source_expressions.R @@ -26,7 +26,7 @@ #' consisting of 6 elements: #' \itemize{ #' \item{`filename` (`character`)} -#' \item{`file_lines` (`character`) the [readLines()] output for this file} +#' \item{`file_lines` (`character`) the [readLines()] output for this file} # TODO document encoding #' \item{`content` (`character`) for .R files, the same as `file_lines`; #' for .Rmd scripts, this is the extracted R source code (as text)} #' \item{`full_parsed_content` (`data.frame`) as given by @@ -38,7 +38,7 @@ #' } #' } #' \item{error}{A `Lint` object describing any parsing error.} -#' \item{lines}{The [readLines()] output for this file.} +#' \item{lines}{The [readLines()] output for this file.} # TODO document encoding #' @export #' @md get_source_expressions <- function(filename, lines = NULL) { diff --git a/tests/testthat/test-cache.R b/tests/testthat/test-cache.R index 9d4693a01..941e7a8cd 100644 --- a/tests/testthat/test-cache.R +++ b/tests/testthat/test-cache.R @@ -480,3 +480,5 @@ test_that("cache = TRUE works with nolint", { writeLines("1+1 # nolint\n", file) expect_length(lint(file, linters, cache = TRUE), 0) }) + +# TODO test encoding-specific behaviour diff --git a/tests/testthat/test-exclusions.R b/tests/testthat/test-exclusions.R index d053dbe1d..f650feca8 100644 --- a/tests/testthat/test-exclusions.R +++ b/tests/testthat/test-exclusions.R @@ -26,4 +26,6 @@ test_that("it excludes properly", { } }) +# TODO test encoding-specific behaviour + options(old_ops) diff --git a/tests/testthat/test-get_source_expressions.R b/tests/testthat/test-get_source_expressions.R index 59d676318..e0ff739b6 100644 --- a/tests/testthat/test-get_source_expressions.R +++ b/tests/testthat/test-get_source_expressions.R @@ -90,3 +90,5 @@ test_that("Multi-byte character truncated by parser is ignored", { expect_equal(error$column_number, 8L) }) }) + +# TODO test encoding-specific behaviour From 643a918ce36003f4a11915df7aaf2350594200ca Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Tue, 16 Mar 2021 18:59:18 +0100 Subject: [PATCH 02/26] add encoding setting --- R/lint.R | 17 +++++++++++++++++ R/settings.R | 43 +++++++++++++++++++++++++++++++++++++++++++ R/zzz.R | 1 + 3 files changed, 61 insertions(+) diff --git a/R/lint.R b/R/lint.R index 59d0e294d..513ace6eb 100644 --- a/R/lint.R +++ b/R/lint.R @@ -376,6 +376,23 @@ find_package <- function(path) { path } +has_rproj <- function(path) { + length(list.files(path = path, pattern = "\\.Rproj$")) > 0L +} + +find_rproj <- function(path) { + path <- normalizePath(path, mustWork = FALSE) + + while (!has_rproj(path)) { + path <- dirname(path) + if (is_root(path)) { + return(NULL) + } + } + + list.files(path = path, pattern = "\\.Rproj$", full.names = TRUE)[1L] +} + is_root <- function(path) { identical(path, dirname(path)) } diff --git a/R/settings.R b/R/settings.R index 113c20bca..555e12121 100644 --- a/R/settings.R +++ b/R/settings.R @@ -17,6 +17,11 @@ read_settings <- function(filename) { clear_settings() config_file <- find_config(filename) + default_encoding <- find_default_encoding(filename) + if (!is.null(default_encoding)) { + # Locally override the default for encoding if we found a smart default + default_settings[["encoding"]] <- default_encoding + } if (!is.null(config_file)) { f <- function(e) { @@ -100,6 +105,44 @@ find_config <- function(filename) { NULL } +find_default_encoding <- function(filename) { + if (is.null(filename)) { + return(NULL) + } + + pkg_path <- find_package(filename) + if (!is.null(pkg_path)) { + # Get Encoding from DESCRIPTION + dcf <- tryCatch( + read.dcf(file.path(pkg_path, "DESCRIPTION")), + error = function(e) NULL + ) + if (!is.null(dcf) && nrow(dcf) >= 1L && "Encoding" %in% colnames(dcf)) { + return(unname(dcf[1L, "Encoding"])) + } + } + + rproj_file <- find_rproj(filename) + if (!is.null(rproj_file)) { + # Get Encoding from .Rproj + dcf <- tryCatch( + read.dcf(rproj_file), + error = function(e) NULL, + warning = function(e) NULL + ) + + if (!is.null(dcf) && nrow(dcf) >= 1L && "Encoding" %in% colnames(dcf)) { + encodings <- dcf[, "Encoding"] + encodings <- encodings[!is.na(encodings)] + if (length(encodings) > 0L) { + return(encodings[1L]) + } + } + } + + NULL +} + is_directory <- function(filename) { is_dir <- file.info(filename)$isdir diff --git a/R/zzz.R b/R/zzz.R index c7a37b1f5..c64baa876 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -200,6 +200,7 @@ settings <- NULL default_settings <<- list( linters = default_linters, + encoding = "UTF-8", exclude = rex::rex("#", any_spaces, "nolint"), exclude_start = rex::rex("#", any_spaces, "nolint start"), exclude_end = rex::rex("#", any_spaces, "nolint end"), From 6aa3dd32c44b45ecde47cbfb6cf65f94422d0fac Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Wed, 17 Mar 2021 19:21:26 +0100 Subject: [PATCH 03/26] add ISO8859-1 tests for Rproj and packages --- R/settings.R | 10 ++- .../dummy_packages/cp1252/.Rbuildignore | 2 + .../dummy_packages/cp1252/DESCRIPTION | 12 ++++ .../testthat/dummy_packages/cp1252/NAMESPACE | 1 + .../cp1252/R/metropolis-hastings-rho.R | 71 +++++++++++++++++++ .../dummy_packages/cp1252/cp1252.Rproj | 20 ++++++ .../project/default_linter_testcode.R | 46 ++++++++++++ .../project/metropolis-hastings-rho.R | 71 +++++++++++++++++++ .../dummy_projects/project/project.Rproj | 16 +++++ tests/testthat/test-settings.R | 11 +++ 10 files changed, 259 insertions(+), 1 deletion(-) create mode 100644 tests/testthat/dummy_packages/cp1252/.Rbuildignore create mode 100644 tests/testthat/dummy_packages/cp1252/DESCRIPTION create mode 100644 tests/testthat/dummy_packages/cp1252/NAMESPACE create mode 100644 tests/testthat/dummy_packages/cp1252/R/metropolis-hastings-rho.R create mode 100644 tests/testthat/dummy_packages/cp1252/cp1252.Rproj create mode 100644 tests/testthat/dummy_projects/project/default_linter_testcode.R create mode 100644 tests/testthat/dummy_projects/project/metropolis-hastings-rho.R create mode 100644 tests/testthat/dummy_projects/project/project.Rproj diff --git a/R/settings.R b/R/settings.R index 555e12121..40ea790bb 100644 --- a/R/settings.R +++ b/R/settings.R @@ -111,6 +111,15 @@ find_default_encoding <- function(filename) { } pkg_path <- find_package(filename) + rproj_file <- find_rproj(filename) + + if (!is.null(rproj_file) && !is.null(pkg_path) && startsWith(rproj_file, pkg_path)) { + # Check precedence via directory hierarchy. + # Both paths are normalized so checking if rproj_file is within pkg_path is sufficient. + # Force reading from Rproj file + pkg_path <- NULL + } + if (!is.null(pkg_path)) { # Get Encoding from DESCRIPTION dcf <- tryCatch( @@ -122,7 +131,6 @@ find_default_encoding <- function(filename) { } } - rproj_file <- find_rproj(filename) if (!is.null(rproj_file)) { # Get Encoding from .Rproj dcf <- tryCatch( diff --git a/tests/testthat/dummy_packages/cp1252/.Rbuildignore b/tests/testthat/dummy_packages/cp1252/.Rbuildignore new file mode 100644 index 000000000..91114bf2f --- /dev/null +++ b/tests/testthat/dummy_packages/cp1252/.Rbuildignore @@ -0,0 +1,2 @@ +^.*\.Rproj$ +^\.Rproj\.user$ diff --git a/tests/testthat/dummy_packages/cp1252/DESCRIPTION b/tests/testthat/dummy_packages/cp1252/DESCRIPTION new file mode 100644 index 000000000..15288c4c4 --- /dev/null +++ b/tests/testthat/dummy_packages/cp1252/DESCRIPTION @@ -0,0 +1,12 @@ +Package: cp1252 +Type: Package +Title: What the Package Does (Title Case) +Version: 0.1.0 +Author: Who wrote it +Maintainer: The package maintainer +Description: More about what it does (maybe more than one line) + Use four spaces when indenting paragraphs within the Description. +License: What license is it under? +Encoding: UTF-8 +LazyData: true +Roxygen: list(markdown = TRUE) diff --git a/tests/testthat/dummy_packages/cp1252/NAMESPACE b/tests/testthat/dummy_packages/cp1252/NAMESPACE new file mode 100644 index 000000000..d75f824ec --- /dev/null +++ b/tests/testthat/dummy_packages/cp1252/NAMESPACE @@ -0,0 +1 @@ +exportPattern("^[[:alpha:]]+") diff --git a/tests/testthat/dummy_packages/cp1252/R/metropolis-hastings-rho.R b/tests/testthat/dummy_packages/cp1252/R/metropolis-hastings-rho.R new file mode 100644 index 000000000..456983eaf --- /dev/null +++ b/tests/testthat/dummy_packages/cp1252/R/metropolis-hastings-rho.R @@ -0,0 +1,71 @@ +# Copied from the spatialprobit package https://raw.githubusercontent.com/cran/spatialprobit/master/R/metropolis-hastings-rho.R +# This file is encoded in Cp-1252 + +# @param beta parameter vektor (k x 1) +# @param z imputed observed variables before truncation (n x 1) +# @param W spatial weight matrix (n x n) +# @param X design matrix (n x k) +density_rho <- function(rho, beta, z, I_n, W, X, type=c("SAR","SEM")) { + A <- (I_n - rho * W) # n x n + if (type == "SAR") { + S <- A %*% z - X %*% beta # n x 1; Residuals of SAR + } else { + S <- as.numeric(A %*% (z - X %*% beta)) # n x 1; Residuals of SEM + } + as.numeric(det(A) * exp(-0.5 * t(S) %*% S)) + #as.vector(exp(log(det(A)) - 0.5 * t(S) %*% S)) +} + +# Metropolis-Hastings-Chain with tuned acceptance rate, see LeSage (2009) +# +# proposal density is normal distribution +# g(rho* | rho_t) = rho_t + c * N(0, 1) +# +# @param type "SEM" or "SAR", +# @param n number of samples to draw +# @param start.value start value for rho +# @param c tuning parameter +draw_rho_metropolis <- function(type="SEM", n, z, I_n, W, X, burn.in=100, start.value=0.2, c=1) { +n <- n + burn.in +u <- runif(n=n, 0, 1) # samples for M-H-ratio +s <- rnorm(n=n, 0, 1) # realisations from proposal density +rho_t <- numeric(n) # vector for rho within the chain +rho_t[1] <- start.value # start value +p_t <- density_rho(rho_t[1], beta=beta, z=z, I_n=I_n, W=W, X=X, type=type) # f(rho_t | beta, z) +i <- 2 # number of accepted proposals / length of chain +acceptance_rate <- numeric(n) # running acceptance rate +num_accepted <- 0 +while (i <= n) { + + # create proposal rho_p from proposal density g(rho_p | rho_{t-1}) ~ N(rho_{t-1}, c^2) + rho_p <- rho_t[i-1] + c * s[i] # proposal rho = letztes akzeptiertes Rho + normalverteilte Zufallsstreuung s + + # Berechnung der Dichte f(rho_p) für das Proposal rho_p + p_p <- density_rho(rho_p, beta=beta, z=z, I_n=I_n, W=W, X=X, type=type) + # SW: Berechnung der Dichte ist teuer, daher besser für ein Grid vorher rechnen? + # Jain, Dichte p(rho|t,beta( hängt von z, beta ab. Aber zumindestens die Log-Determinante kann man vorher berechnen + # Berechnung Dichte f(rho_{t-1}) für letzten Wert der Chain + # p_t <- p(rho_t[i-1], beta=beta, z=z, W=W, X=X, type=type) # SW: Kann ich die alte Dichte nicht merken, dann muss ich die nicht neu berechnen + + # Wegen Symmetrie der Normalverteilung als Proposal-Dichte g(rho_p | rho_t) = g(rho_t | rho_p) + # vereinfacht sich das M-H-Ratio von [ f(rho_p) * g(rho_t | rho_p) ] / [ f(rho_t) * g(rho_p | rho_t) ] + # auf f(rho_p) / f(rho_t)! + # determine M-H-Ratio R(rho_p, rho_t) = min[ 1 ; p(rho_p) / p(rho_t) ] + # see LeSage (2009), eqn (5.27) + R <- min(1, p_p / p_t) + if (u[i] <= R) { # see Givens/Hoeting, p.184, eqn (7.2) + # accept proposal + rho_t[i] <- rho_p + p_t <- p_p # save density + num_accepted <- num_accepted + 1 + } else { + # stay with the current value + rho_t[i] <- rho_t[i-1] + } + acceptance_rate[i] <- num_accepted / i + if (acceptance_rate[i] < 0.4) c <- c / 1.1 # Wenn Akzeptanzrate zu klein, dann verringere (?) Streuungsparameter "c" + if (acceptance_rate[i] > 0.6) c <- c * 1.1 + i <- i + 1 +} +return(list(rho_t=rho_t,acceptance_rate=acceptance_rate)) +} diff --git a/tests/testthat/dummy_packages/cp1252/cp1252.Rproj b/tests/testthat/dummy_packages/cp1252/cp1252.Rproj new file mode 100644 index 000000000..c7aebb9a4 --- /dev/null +++ b/tests/testthat/dummy_packages/cp1252/cp1252.Rproj @@ -0,0 +1,20 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: ISO8859-1 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source diff --git a/tests/testthat/dummy_projects/project/default_linter_testcode.R b/tests/testthat/dummy_projects/project/default_linter_testcode.R new file mode 100644 index 000000000..ba7ba7cca --- /dev/null +++ b/tests/testthat/dummy_projects/project/default_linter_testcode.R @@ -0,0 +1,46 @@ +# Each of the default linters should throw at least one lint on this file + +# assignment +# function_left_parentheses +# closed_curly +# commas +# paren_brace +f = function (x,y = 1){} + +# commented_code +# some <- commented("out code") + +# cyclocomp +# equals_na +# infix_spaces +# line_length +# object_length +# object_name +# object_usage +# open_curly +# T_and_F_symbol +someComplicatedFunctionWithALongCamelCaseName <- function(x) +{ + y <- 1 + if (1 > 2 && 2 > 3 && 3 > 4 && 4 > 5 && 5*10 > 6 && 5 > 6 && 6 > 7 && x == NA) {T} else {F} +} + +# no_tab +# pipe_continuation +# seq_linter +# spaces_inside +x <- 1:10 +x[ 2] +1:length(x) %>% lapply(function(x) x*2) %>% + head() + +# single_quotes +message('single_quotes') + +# spaces_left_parentheses +# trailing_whitespace +# semicolon_terminator +x <- 42; y <- 2 +(1:10) + +# trailing_blank_lines + diff --git a/tests/testthat/dummy_projects/project/metropolis-hastings-rho.R b/tests/testthat/dummy_projects/project/metropolis-hastings-rho.R new file mode 100644 index 000000000..aba63b6ed --- /dev/null +++ b/tests/testthat/dummy_projects/project/metropolis-hastings-rho.R @@ -0,0 +1,71 @@ +# Copied from https://raw.githubusercontent.com/cran/spatialprobit/master/R/metropolis-hastings-rho.R +# This file is encoded in Cp-1252 + +# @param beta parameter vektor (k x 1) +# @param z imputed observed variables before truncation (n x 1) +# @param W spatial weight matrix (n x n) +# @param X design matrix (n x k) +density_rho <- function(rho, beta, z, I_n, W, X, type=c("SAR","SEM")) { + A <- (I_n - rho * W) # n x n + if (type == "SAR") { + S <- A %*% z - X %*% beta # n x 1; Residuals of SAR + } else { + S <- as.numeric(A %*% (z - X %*% beta)) # n x 1; Residuals of SEM + } + as.numeric(det(A) * exp(-0.5 * t(S) %*% S)) + #as.vector(exp(log(det(A)) - 0.5 * t(S) %*% S)) +} + +# Metropolis-Hastings-Chain with tuned acceptance rate, see LeSage (2009) +# +# proposal density is normal distribution +# g(rho* | rho_t) = rho_t + c * N(0, 1) +# +# @param type "SEM" or "SAR", +# @param n number of samples to draw +# @param start.value start value for rho +# @param c tuning parameter +draw_rho_metropolis <- function(type="SEM", n, z, I_n, W, X, burn.in=100, start.value=0.2, c=1) { +n <- n + burn.in +u <- runif(n=n, 0, 1) # samples for M-H-ratio +s <- rnorm(n=n, 0, 1) # realisations from proposal density +rho_t <- numeric(n) # vector for rho within the chain +rho_t[1] <- start.value # start value +p_t <- density_rho(rho_t[1], beta=beta, z=z, I_n=I_n, W=W, X=X, type=type) # f(rho_t | beta, z) +i <- 2 # number of accepted proposals / length of chain +acceptance_rate <- numeric(n) # running acceptance rate +num_accepted <- 0 +while (i <= n) { + + # create proposal rho_p from proposal density g(rho_p | rho_{t-1}) ~ N(rho_{t-1}, c^2) + rho_p <- rho_t[i-1] + c * s[i] # proposal rho = letztes akzeptiertes Rho + normalverteilte Zufallsstreuung s + + # Berechnung der Dichte f(rho_p) für das Proposal rho_p + p_p <- density_rho(rho_p, beta=beta, z=z, I_n=I_n, W=W, X=X, type=type) + # SW: Berechnung der Dichte ist teuer, daher besser für ein Grid vorher rechnen? + # Jain, Dichte p(rho|t,beta( hängt von z, beta ab. Aber zumindestens die Log-Determinante kann man vorher berechnen + # Berechnung Dichte f(rho_{t-1}) für letzten Wert der Chain + # p_t <- p(rho_t[i-1], beta=beta, z=z, W=W, X=X, type=type) # SW: Kann ich die alte Dichte nicht merken, dann muss ich die nicht neu berechnen + + # Wegen Symmetrie der Normalverteilung als Proposal-Dichte g(rho_p | rho_t) = g(rho_t | rho_p) + # vereinfacht sich das M-H-Ratio von [ f(rho_p) * g(rho_t | rho_p) ] / [ f(rho_t) * g(rho_p | rho_t) ] + # auf f(rho_p) / f(rho_t)! + # determine M-H-Ratio R(rho_p, rho_t) = min[ 1 ; p(rho_p) / p(rho_t) ] + # see LeSage (2009), eqn (5.27) + R <- min(1, p_p / p_t) + if (u[i] <= R) { # see Givens/Hoeting, p.184, eqn (7.2) + # accept proposal + rho_t[i] <- rho_p + p_t <- p_p # save density + num_accepted <- num_accepted + 1 + } else { + # stay with the current value + rho_t[i] <- rho_t[i-1] + } + acceptance_rate[i] <- num_accepted / i + if (acceptance_rate[i] < 0.4) c <- c / 1.1 # Wenn Akzeptanzrate zu klein, dann verringere (?) Streuungsparameter "c" + if (acceptance_rate[i] > 0.6) c <- c * 1.1 + i <- i + 1 +} +return(list(rho_t=rho_t,acceptance_rate=acceptance_rate)) +} \ No newline at end of file diff --git a/tests/testthat/dummy_projects/project/project.Rproj b/tests/testthat/dummy_projects/project/project.Rproj new file mode 100644 index 000000000..e856a0a49 --- /dev/null +++ b/tests/testthat/dummy_projects/project/project.Rproj @@ -0,0 +1,16 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: ISO8859-1 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes diff --git a/tests/testthat/test-settings.R b/tests/testthat/test-settings.R index ce23dabee..e653af01e 100644 --- a/tests/testthat/test-settings.R +++ b/tests/testthat/test-settings.R @@ -124,3 +124,14 @@ test_that("with_defaults doesn't break on very long input", { "lintr::undesirable_function_linter" ) }) + +test_that("it has a smart default for encodings", { + read_settings(NULL) + expect_equal(settings$encoding, "UTF-8") + + read_settings("dummy_projects/project/metropolis-hastings-rho.R") + expect_equal(settings$encoding, "ISO8859-1") + + read_settings("dummy_packages/cp1252/R/metropolis-hastings-rho.R") + expect_equal(settings$encoding, "ISO8859-1") +}) \ No newline at end of file From 4ac90b5e953d71aa63a516e503f126da560c232b Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Mon, 14 Jun 2021 23:15:46 +0200 Subject: [PATCH 04/26] handle encoding in read_lines() handle invalid encoding in parse_exclusions() and get_source_file() suppress linting of file if invalid encoding detected also had to fix deprecated warnings caused by R 4.1.0 and testthat::expect_equal() with functions --- R/exclude.R | 6 ++++ R/get_source_expressions.R | 66 ++++++++++++++++++++++++---------- R/utils.R | 4 +-- tests/testthat/test-settings.R | 6 ++-- 4 files changed, 59 insertions(+), 23 deletions(-) diff --git a/R/exclude.R b/R/exclude.R index fff816835..17feceeff 100644 --- a/R/exclude.R +++ b/R/exclude.R @@ -79,6 +79,12 @@ parse_exclusions <- function(file, exclude = settings$exclude, exclusions <- list() + e <- tryCatch(nchar(lines), error = identity) + if (inherits(e, "error")) { + # Invalid encoding. Don't parse exclusions. + return(list()) + } + start_locations <- rex::re_matches(lines, exclude_start, locations = TRUE)[, "end"] + 1L starts <- which(!is.na(start_locations)) ends <- which(rex::re_matches(lines, exclude_end)) diff --git a/R/get_source_expressions.R b/R/get_source_expressions.R index 1ad8d3f5c..9643acf80 100644 --- a/R/get_source_expressions.R +++ b/R/get_source_expressions.R @@ -77,6 +77,20 @@ get_source_expressions <- function(filename, lines = NULL) { # an error that does not use R_ParseErrorMsg if (is.na(message_info$line)) { + if (grepl("invalid multibyte string, element", e$message, fixed = TRUE)) { + # Invalid encoding, will break even re_matches() below, so we need to handle this first. + return( + Lint( + filename = source_file$filename, + line_number = 1L, + column_number = 1L, + type = "error", + message = "Invalid multibyte string. Is the encoding correct?", + line = "" + ) + ) + } + message_info <- re_matches(e$message, rex(single_quotes, capture(name = "name", anything), single_quotes, anything, @@ -185,26 +199,31 @@ get_source_expressions <- function(filename, lines = NULL) { parsed_content <- get_source_file(source_file, error = lint_error) tree <- generate_tree(parsed_content) - expressions <- lapply( - X = top_level_expressions(parsed_content), - FUN = get_single_source_expression, - parsed_content, - source_file, - filename, - tree - ) - - # add global expression - expressions[[length(expressions) + 1L]] <- - list( - filename = filename, - file_lines = source_file$lines, - content = source_file$lines, - full_parsed_content = parsed_content, - full_xml_parsed_content = safe_parse_to_xml(parsed_content), - terminal_newline = terminal_newline + if (inherits(e, "lint") && !nzchar(e$line)) { + # Don't create expression list if it's unreliable (invalid encoding or unhandled parse error) + expressions <- NULL + } else { + expressions <- lapply( + X = top_level_expressions(parsed_content), + FUN = get_single_source_expression, + parsed_content, + source_file, + filename, + tree ) + # add global expression + expressions[[length(expressions) + 1L]] <- + list( + filename = filename, + file_lines = source_file$lines, + content = source_file$lines, + full_parsed_content = parsed_content, + full_xml_parsed_content = safe_parse_to_xml(parsed_content), + terminal_newline = terminal_newline + ) + } + list(expressions = expressions, error = e, lines = source_file$lines) } @@ -252,6 +271,17 @@ get_source_file <- function(source_file, error = identity) { assign("e", e, envir = parent.frame()) } + # Triggers an error if the lines contain invalid characters. + e <- tryCatch( + nchar(source_file$content, type = "chars"), + error = error + ) + + if (inherits(e, "error") || inherits(e, "lint")) { + assign("e", e, envir = parent.frame()) + return() # parsed_content is unreliable if encoding is invalid + } + fix_eq_assigns(fix_column_numbers(fix_tab_indentations(source_file))) } diff --git a/R/utils.R b/R/utils.R index c927f71d0..3c99c3661 100644 --- a/R/utils.R +++ b/R/utils.R @@ -250,10 +250,10 @@ Linter <- function(fun, name = linter_auto_name()) { # nolint: object_name_linte structure(fun, class = "linter", name = name) } -read_lines <- function(file, ...) { +read_lines <- function(file, encoding = settings$encoding, ...) { terminal_newline <- TRUE lines <- withCallingHandlers({ - readLines(file, warn = TRUE, ...) + readLines(file, warn = TRUE, encoding = encoding, ...) }, warning = function(w) { if (grepl("incomplete final line found on", w$message, fixed = TRUE)) { diff --git a/tests/testthat/test-settings.R b/tests/testthat/test-settings.R index e653af01e..dfdc5ca29 100644 --- a/tests/testthat/test-settings.R +++ b/tests/testthat/test-settings.R @@ -3,7 +3,7 @@ test_that("it uses default settings if none provided", { read_settings(NULL) lapply(ls(settings), function(setting) { - expect_equal(settings[[setting]], default_settings[[setting]]) + expect_identical(settings[[setting]], default_settings[[setting]]) }) }) @@ -32,7 +32,7 @@ test_that("it uses config settings in same directory if provided", { read_settings(file) lapply(setdiff(ls(settings), "exclude"), function(setting) { - expect_equal(settings[[setting]], default_settings[[setting]]) + expect_identical(settings[[setting]], default_settings[[setting]]) }) expect_equal(settings$exclude, "test") @@ -58,7 +58,7 @@ test_that("it uses config home directory settings if provided", { withr::with_envvar(c(HOME = home_path), read_settings(file)) lapply(setdiff(ls(settings), "exclude"), function(setting) { - expect_equal(settings[[setting]], default_settings[[setting]]) + expect_identical(settings[[setting]], default_settings[[setting]]) }) expect_equal(settings$exclude, "test") From 893cdb5e6674e77072494e0956aa6ee3e213a5c8 Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Wed, 30 Jun 2021 18:33:29 +0200 Subject: [PATCH 05/26] docment() --- NAMESPACE | 1 + R/get_source_expressions.R | 2 +- man/default_settings.Rd | 2 +- man/get_source_expressions.Rd | 4 ++-- man/linters.Rd | 11 ++++++++--- tests/testthat/test-settings.R | 6 +++--- 6 files changed, 16 insertions(+), 10 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index b977c3bd4..becf3a8eb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -52,6 +52,7 @@ export(object_name_linter) export(object_usage_linter) export(open_curly_linter) export(paren_brace_linter) +export(pipe_call_linter) export(pipe_continuation_linter) export(semicolon_terminator_linter) export(seq_linter) diff --git a/R/get_source_expressions.R b/R/get_source_expressions.R index 9643acf80..a22ae07f6 100644 --- a/R/get_source_expressions.R +++ b/R/get_source_expressions.R @@ -42,7 +42,7 @@ #' @export #' @md get_source_expressions <- function(filename, lines = NULL) { - source_file <- srcfile(filename) + source_file <- srcfile(filename, encoding = settings$encoding) # Ensure English locale for terminal newline and zero-length variable warning messages old_lang <- set_lang("en") diff --git a/man/default_settings.Rd b/man/default_settings.Rd index a9bdc2d1e..032bc5a7d 100644 --- a/man/default_settings.Rd +++ b/man/default_settings.Rd @@ -5,7 +5,7 @@ \alias{default_settings} \title{Default lintr settings} \format{ -An object of class \code{list} of length 11. +An object of class \code{list} of length 12. } \usage{ default_settings diff --git a/man/get_source_expressions.Rd b/man/get_source_expressions.Rd index a7b3950af..f32ed9f78 100644 --- a/man/get_source_expressions.Rd +++ b/man/get_source_expressions.Rd @@ -35,7 +35,7 @@ The final element of \code{expressions} is a list corresponding to the full file consisting of 6 elements: \itemize{ \item{\code{filename} (\code{character})} -\item{\code{file_lines} (\code{character}) the \code{\link[=readLines]{readLines()}} output for this file} +\item{\code{file_lines} (\code{character}) the \code{\link[=readLines]{readLines()}} output for this file} # TODO document encoding \item{\code{content} (\code{character}) for .R files, the same as \code{file_lines}; for .Rmd scripts, this is the extracted R source code (as text)} \item{\code{full_parsed_content} (\code{data.frame}) as given by @@ -47,7 +47,7 @@ newline (as determined by \code{\link[=readLines]{readLines()}} producing a corr } } \item{error}{A \code{Lint} object describing any parsing error.} -\item{lines}{The \code{\link[=readLines]{readLines()}} output for this file.} +\item{lines}{The \code{\link[=readLines]{readLines()}} output for this file.} # TODO document encoding } \description{ This object is given as input to each linter diff --git a/man/linters.Rd b/man/linters.Rd index 793e726cb..cd05b469a 100644 --- a/man/linters.Rd +++ b/man/linters.Rd @@ -6,9 +6,9 @@ % R/extraction_operator_linter.R, R/function_left_parentheses.R, % R/implicit_integer_linter.R, R/infix_spaces_linter.R, % R/line_length_linter.R, R/missing_argument_linter.R, -% R/missing_package_linter.R, R/namespace_linter.R, -% R/no_tab_linter.R, R/object_usage_linter.R, R/open_curly_linter.R, -% R/paren_brace_linter.R, R/path_linters.R, R/pipe_continuation_linter.R, +% R/missing_package_linter.R, R/namespace_linter.R, R/no_tab_linter.R, +% R/object_usage_linter.R, R/open_curly_linter.R, R/paren_brace_linter.R, +% R/path_linters.R, R/pipe_call_linter.R, R/pipe_continuation_linter.R, % R/semicolon_terminator_linter.R, R/seq_linter.R, R/single_quotes_linter.R, % R/spaces_inside_linter.R, R/spaces_left_parentheses_linter.R, % R/sprintf_linter.R, R/trailing_blank_lines_linter.R, @@ -42,6 +42,7 @@ \alias{paren_brace_linter} \alias{absolute_path_linter} \alias{nonportable_path_linter} +\alias{pipe_call_linter} \alias{pipe_continuation_linter} \alias{semicolon_terminator_linter} \alias{seq_linter} @@ -108,6 +109,8 @@ absolute_path_linter(lax = TRUE) nonportable_path_linter(lax = TRUE) +pipe_call_linter() + pipe_continuation_linter() semicolon_terminator_linter(semicolon = c("compound", "trailing")) @@ -261,6 +264,8 @@ parenthesis and an opening curly brace. \item \code{nonportable_path_linter}: Check that file.path() is used to construct safe and portable paths. +\item \code{pipe_call_linter}: that forces explicit calls in magrittr pipes + \item \code{pipe_continuation_linter}: Check that each step in a pipeline is on a new line, or the entire pipe fits on one line. diff --git a/tests/testthat/test-settings.R b/tests/testthat/test-settings.R index dfdc5ca29..207079969 100644 --- a/tests/testthat/test-settings.R +++ b/tests/testthat/test-settings.R @@ -129,9 +129,9 @@ test_that("it has a smart default for encodings", { read_settings(NULL) expect_equal(settings$encoding, "UTF-8") - read_settings("dummy_projects/project/metropolis-hastings-rho.R") + read_settings(file.path("dummy_projects", "project", "metropolis-hastings-rho.R")) expect_equal(settings$encoding, "ISO8859-1") - read_settings("dummy_packages/cp1252/R/metropolis-hastings-rho.R") + read_settings(file.path("dummy_packages", "cp1252", "R", "metropolis-hastings-rho.R")) expect_equal(settings$encoding, "ISO8859-1") -}) \ No newline at end of file +}) From ca777c53e16f22ad9e64e62db6cab8aa3181e5e0 Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Wed, 30 Jun 2021 19:07:42 +0200 Subject: [PATCH 06/26] try disabling fix_column_numbers() and getting more info on test failures --- .lintr | 1 + R/get_source_expressions.R | 27 +++++++++++++++------------ tests/testthat/test-settings.R | 6 ++++++ 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/.lintr b/.lintr index 639fe9f22..7fa51541a 100644 --- a/.lintr +++ b/.lintr @@ -7,6 +7,7 @@ exclusions: list( "inst/example/bad.R", "tests/testthat/default_linter_testcode.R", "tests/testthat/dummy_packages", + "tests/testthat/dummy_projects", "tests/testthat/exclusions-test", "tests/testthat/knitr_formats", "tests/testthat/knitr_malformed" diff --git a/R/get_source_expressions.R b/R/get_source_expressions.R index a22ae07f6..062b9f804 100644 --- a/R/get_source_expressions.R +++ b/R/get_source_expressions.R @@ -324,11 +324,14 @@ fix_column_numbers <- function(content) { return(NULL) } + # Temporary: Apparently this is no longer necessary. + return(content) + text_lengths <- nchar(content$text[content$terminal], "chars") byte_lengths <- nchar(content$text[content$terminal], "bytes") differences <- byte_lengths - text_lengths - to_change <- which(differences > 0L) + to_change <- which(content$terminal[differences > 0L]) adjusted_col1 <- content$col1 adjusted_col2 <- content$col2 @@ -337,17 +340,17 @@ fix_column_numbers <- function(content) { needs_adjustment <- which(content$line1 == content$line1[i] & content$col1 >= content$col1[i]) for (j in needs_adjustment) { - adjusted_col1[j] <- - content$col1[j] - - sum(differences[ - content$line1 == content$line1[j] & - content$col1 < content$col1[j]]) - - adjusted_col2[j] <- - content$col2[j] - - sum(differences[ - content$line1 == content$line1[j] & - content$col2 < content$col2[j]]) + adjusted_col1[j] <- content$col1[j] - + sum(differences[ + content$line1[content$terminal] == content$line1[j] & + content$col1[content$terminal] < content$col1[j] + ]) + + adjusted_col2[j] <- content$col2[j] - + sum(differences[ + content$line1[content$terminal] == content$line1[j] & + content$col2[content$terminal] <= content$col2[j] + ]) } } content$col1 <- adjusted_col1 diff --git a/tests/testthat/test-settings.R b/tests/testthat/test-settings.R index 207079969..c9ee86e97 100644 --- a/tests/testthat/test-settings.R +++ b/tests/testthat/test-settings.R @@ -129,6 +129,12 @@ test_that("it has a smart default for encodings", { read_settings(NULL) expect_equal(settings$encoding, "UTF-8") + ex1 <- file.exists(file.path("dummy_projects", "project", "metropolis-hastings-rho.R")) + ex2 <- file.exists(file.path("dummy_packages", "cp1252", "R", "metropolis-hastings-rho.R")) + if (!ex1 | !ex2) { + stop("Encoding test files do not exist (", ex1, ", ", ex2, "). getwd() = '", getwd(), "'.") + } + read_settings(file.path("dummy_projects", "project", "metropolis-hastings-rho.R")) expect_equal(settings$encoding, "ISO8859-1") From 4b8b9a73c2c547333d821bb49e23074c60865df3 Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Wed, 30 Jun 2021 19:45:36 +0200 Subject: [PATCH 07/26] try explicitly testing find_default_encoding() --- tests/testthat/test-settings.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/testthat/test-settings.R b/tests/testthat/test-settings.R index c9ee86e97..f55aab2b1 100644 --- a/tests/testthat/test-settings.R +++ b/tests/testthat/test-settings.R @@ -129,15 +129,15 @@ test_that("it has a smart default for encodings", { read_settings(NULL) expect_equal(settings$encoding, "UTF-8") - ex1 <- file.exists(file.path("dummy_projects", "project", "metropolis-hastings-rho.R")) - ex2 <- file.exists(file.path("dummy_packages", "cp1252", "R", "metropolis-hastings-rho.R")) - if (!ex1 | !ex2) { - stop("Encoding test files do not exist (", ex1, ", ", ex2, "). getwd() = '", getwd(), "'.") - } + proj_file <- file.path("dummy_projects", "project", "metropolis-hastings-rho.R") + pkg_file <- file.path("dummy_packages", "cp1252", "R", "metropolis-hastings-rho.R") - read_settings(file.path("dummy_projects", "project", "metropolis-hastings-rho.R")) + expect_equal(find_default_encoding(proj_file), "ISO8859-1") + expect_equal(find_default_encoding(pkg_file), "ISO8859-1") + + read_settings(proj_file) expect_equal(settings$encoding, "ISO8859-1") - read_settings(file.path("dummy_packages", "cp1252", "R", "metropolis-hastings-rho.R")) + read_settings(pkg_file) expect_equal(settings$encoding, "ISO8859-1") }) From c3f1388ff4d8610183531dee89b8c2895ac1d9b9 Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Wed, 30 Jun 2021 19:55:48 +0200 Subject: [PATCH 08/26] test even deeper --- tests/testthat/test-settings.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/testthat/test-settings.R b/tests/testthat/test-settings.R index f55aab2b1..1462f78d9 100644 --- a/tests/testthat/test-settings.R +++ b/tests/testthat/test-settings.R @@ -132,6 +132,12 @@ test_that("it has a smart default for encodings", { proj_file <- file.path("dummy_projects", "project", "metropolis-hastings-rho.R") pkg_file <- file.path("dummy_packages", "cp1252", "R", "metropolis-hastings-rho.R") + expect_false("encoding" %in% colnames(read.dcf(find_config(proj_file)))) + expect_false("encoding" %in% colnames(read.dcf(find_config(pkg_file)))) + + expect_type(find_rproj(proj_file), "character") + expect_type(find_package(pkg_file), "character") + expect_equal(find_default_encoding(proj_file), "ISO8859-1") expect_equal(find_default_encoding(pkg_file), "ISO8859-1") From d85219776d6f2d408e8bb4ba6ad1cea74344b9f1 Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Wed, 30 Jun 2021 20:05:08 +0200 Subject: [PATCH 09/26] check actual config file names --- tests/testthat/test-settings.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-settings.R b/tests/testthat/test-settings.R index 1462f78d9..7ea4bf14e 100644 --- a/tests/testthat/test-settings.R +++ b/tests/testthat/test-settings.R @@ -135,8 +135,8 @@ test_that("it has a smart default for encodings", { expect_false("encoding" %in% colnames(read.dcf(find_config(proj_file)))) expect_false("encoding" %in% colnames(read.dcf(find_config(pkg_file)))) - expect_type(find_rproj(proj_file), "character") - expect_type(find_package(pkg_file), "character") + expect_equal(find_rproj(proj_file), normalizePath(file.path("dummy_projects", "project", "project.Rproj"))) + expect_equal(find_package(pkg_file), normalizePath(file.path("dummy_packages", "cp1252"))) expect_equal(find_default_encoding(proj_file), "ISO8859-1") expect_equal(find_default_encoding(pkg_file), "ISO8859-1") From 088d0158eedc96a5125250f81046639de562f3fd Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Wed, 30 Jun 2021 20:20:15 +0200 Subject: [PATCH 10/26] try tweaking .Rbuildignore --- .Rbuildignore | 2 +- tests/testthat/test-settings.R | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 44104f43b..98837c7c5 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -6,7 +6,7 @@ ^travis-tool\.sh$ ^.*\.gz$ ^cran-comments\.md$ -^.*\.Rproj$ +^lintr\.Rproj$ ^\.Rproj\.user$ ^\.idea$ ^\.dev$ diff --git a/tests/testthat/test-settings.R b/tests/testthat/test-settings.R index 7ea4bf14e..ed970134b 100644 --- a/tests/testthat/test-settings.R +++ b/tests/testthat/test-settings.R @@ -132,9 +132,6 @@ test_that("it has a smart default for encodings", { proj_file <- file.path("dummy_projects", "project", "metropolis-hastings-rho.R") pkg_file <- file.path("dummy_packages", "cp1252", "R", "metropolis-hastings-rho.R") - expect_false("encoding" %in% colnames(read.dcf(find_config(proj_file)))) - expect_false("encoding" %in% colnames(read.dcf(find_config(pkg_file)))) - expect_equal(find_rproj(proj_file), normalizePath(file.path("dummy_projects", "project", "project.Rproj"))) expect_equal(find_package(pkg_file), normalizePath(file.path("dummy_packages", "cp1252"))) From 3441e2fe2f5def4f5c5d9283e8bf9b2e41967315 Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Wed, 30 Jun 2021 20:40:10 +0200 Subject: [PATCH 11/26] fix test for windows, fix R CMD check NOTE, fix new regression test (expects parse error instead of encoding error to be reported) --- .Rbuildignore | 1 + R/get_source_expressions.R | 5 ++++- tests/testthat/test-settings.R | 10 ++++++++-- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 98837c7c5..986e8cc38 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -24,3 +24,4 @@ ^bench$ ^tests/testthat/dummy_packages/package/[.]Rbuildignore$ ^tests/testthat/dummy_packages/package/[.]lintr$ +^tests/testthat/dummy_packages/cp1252/[.]Rbuildignore$ diff --git a/R/get_source_expressions.R b/R/get_source_expressions.R index 062b9f804..7c7dbe6c0 100644 --- a/R/get_source_expressions.R +++ b/R/get_source_expressions.R @@ -254,6 +254,7 @@ get_single_source_expression <- function(loc, } get_source_file <- function(source_file, error = identity) { + parse_error <- FALSE e <- tryCatch( source_file$parsed_content <- parse(text = source_file$content, srcfile = source_file, keep.source = TRUE), @@ -269,6 +270,7 @@ get_source_file <- function(source_file, error = identity) { if (inherits(e, "error") || inherits(e, "lint")) { assign("e", e, envir = parent.frame()) + parse_error <- TRUE } # Triggers an error if the lines contain invalid characters. @@ -278,7 +280,8 @@ get_source_file <- function(source_file, error = identity) { ) if (inherits(e, "error") || inherits(e, "lint")) { - assign("e", e, envir = parent.frame()) + # Let parse errors take precedence over encoding problems + if (!parse_error) assign("e", e, envir = parent.frame()) return() # parsed_content is unreliable if encoding is invalid } diff --git a/tests/testthat/test-settings.R b/tests/testthat/test-settings.R index ed970134b..bd6ab51bc 100644 --- a/tests/testthat/test-settings.R +++ b/tests/testthat/test-settings.R @@ -132,8 +132,14 @@ test_that("it has a smart default for encodings", { proj_file <- file.path("dummy_projects", "project", "metropolis-hastings-rho.R") pkg_file <- file.path("dummy_packages", "cp1252", "R", "metropolis-hastings-rho.R") - expect_equal(find_rproj(proj_file), normalizePath(file.path("dummy_projects", "project", "project.Rproj"))) - expect_equal(find_package(pkg_file), normalizePath(file.path("dummy_packages", "cp1252"))) + expect_equal( + find_rproj(proj_file), + normalizePath(file.path("dummy_projects", "project", "project.Rproj"), winslash = "/") + ) + expect_equal( + find_package(pkg_file), + normalizePath(file.path("dummy_packages", "cp1252"), winslash = "/") + ) expect_equal(find_default_encoding(proj_file), "ISO8859-1") expect_equal(find_default_encoding(pkg_file), "ISO8859-1") From 52dca5c4c80993c54b9f5ca120f46f5e5dab794f Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Wed, 30 Jun 2021 22:36:44 +0200 Subject: [PATCH 12/26] properly supply encoding to readLines() --- .dev/compare_branches.R | 6 +++++- R/cache.R | 4 +++- R/get_source_expressions.R | 14 ++++++++++++-- R/utils.R | 4 +++- tests/testthat/test-get_source_expressions.R | 6 +++++- 5 files changed, 28 insertions(+), 6 deletions(-) diff --git a/.dev/compare_branches.R b/.dev/compare_branches.R index fa5824334..a7665661a 100755 --- a/.dev/compare_branches.R +++ b/.dev/compare_branches.R @@ -146,7 +146,11 @@ test_encoding <- function(dir) { tryCatch({ lapply( list.files(dir, pattern = "(?i)\\.r(?:md)?$", recursive = TRUE, full.names = TRUE), - function(x) nchar(readLines(x, warn = FALSE)) # TODO respect encoding + function(x) { + con <- file(x, encoding = lintr:::find_default_encoding(x)) + on.exit(close(con)) + nchar(readLines(con, warn = FALSE)) + } ) FALSE }, error = function(x) TRUE) diff --git a/R/cache.R b/R/cache.R index 9a517e1d8..cf09b0cb5 100644 --- a/R/cache.R +++ b/R/cache.R @@ -119,7 +119,9 @@ digest_content <- function(linters, obj) { list(linters, obj$content, is.null(obj$parsed_content)) } else { # assume a filename - list(linters, readLines(obj)) # TODO respect encoding + con <- file(obj, encoding = settings$encoding) + on.exit(close(con)) + list(linters, readLines(con)) } digest::digest(content, algo = "sha1") } diff --git a/R/get_source_expressions.R b/R/get_source_expressions.R index 7c7dbe6c0..04effb687 100644 --- a/R/get_source_expressions.R +++ b/R/get_source_expressions.R @@ -1,6 +1,16 @@ #' Parsed sourced file from a filename #' #' This object is given as input to each linter +#' +#' @details +#' The file is read in using the `encoding` setting. +#' This setting found by taking the first valid result from the following locations +#' +#' 1. The `encoding` key from the usual lintr configuration settings. +#' 2. The `Encoding` field from a Package `DESCRIPTION` file in a parent directory. +#' 3. The `Encoding` field from an R Project `.Rproj` file in a parent directory. +#' 4. `"UTF-8"` as a fallback. +#' #' @param filename the file to be parsed. #' @param lines a character vector of lines. #' If \code{NULL}, then \code{filename} will be read. @@ -26,7 +36,7 @@ #' consisting of 6 elements: #' \itemize{ #' \item{`filename` (`character`)} -#' \item{`file_lines` (`character`) the [readLines()] output for this file} # TODO document encoding +#' \item{`file_lines` (`character`) the [readLines()] output for this file} #' \item{`content` (`character`) for .R files, the same as `file_lines`; #' for .Rmd scripts, this is the extracted R source code (as text)} #' \item{`full_parsed_content` (`data.frame`) as given by @@ -38,7 +48,7 @@ #' } #' } #' \item{error}{A `Lint` object describing any parsing error.} -#' \item{lines}{The [readLines()] output for this file.} # TODO document encoding +#' \item{lines}{The [readLines()] output for this file.} #' @export #' @md get_source_expressions <- function(filename, lines = NULL) { diff --git a/R/utils.R b/R/utils.R index 3c99c3661..3bfe33ee7 100644 --- a/R/utils.R +++ b/R/utils.R @@ -252,8 +252,10 @@ Linter <- function(fun, name = linter_auto_name()) { # nolint: object_name_linte read_lines <- function(file, encoding = settings$encoding, ...) { terminal_newline <- TRUE + con <- file(file, encoding = encoding) + on.exit(close(con)) lines <- withCallingHandlers({ - readLines(file, warn = TRUE, encoding = encoding, ...) + readLines(con, warn = TRUE, ...) }, warning = function(w) { if (grepl("incomplete final line found on", w$message, fixed = TRUE)) { diff --git a/tests/testthat/test-get_source_expressions.R b/tests/testthat/test-get_source_expressions.R index e0ff739b6..2f9d7350d 100644 --- a/tests/testthat/test-get_source_expressions.R +++ b/tests/testthat/test-get_source_expressions.R @@ -91,4 +91,8 @@ test_that("Multi-byte character truncated by parser is ignored", { }) }) -# TODO test encoding-specific behaviour +test_that("Can read non UTF-8 file", { + file <- "dummy_projects/project/metropolis-hastings-rho.R" + read_settings(file) + expect_null(get_source_expressions(file)$error) +}) From 235daf2ab057618b8beef011b592f5b6bbf4d871 Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Wed, 30 Jun 2021 22:41:25 +0200 Subject: [PATCH 13/26] remove GPLv2 code, make path test more resilient --- man/get_source_expressions.Rd | 14 +++- .../testthat/dummy_packages/cp1252/R/cp1252.R | 4 ++ .../cp1252/R/metropolis-hastings-rho.R | 71 ------------------- .../testthat/dummy_projects/project/cp1252.R | 4 ++ .../project/metropolis-hastings-rho.R | 71 ------------------- tests/testthat/test-get_source_expressions.R | 2 +- tests/testthat/test-settings.R | 4 +- 7 files changed, 23 insertions(+), 147 deletions(-) create mode 100644 tests/testthat/dummy_packages/cp1252/R/cp1252.R delete mode 100644 tests/testthat/dummy_packages/cp1252/R/metropolis-hastings-rho.R create mode 100644 tests/testthat/dummy_projects/project/cp1252.R delete mode 100644 tests/testthat/dummy_projects/project/metropolis-hastings-rho.R diff --git a/man/get_source_expressions.Rd b/man/get_source_expressions.Rd index f32ed9f78..068808770 100644 --- a/man/get_source_expressions.Rd +++ b/man/get_source_expressions.Rd @@ -35,7 +35,7 @@ The final element of \code{expressions} is a list corresponding to the full file consisting of 6 elements: \itemize{ \item{\code{filename} (\code{character})} -\item{\code{file_lines} (\code{character}) the \code{\link[=readLines]{readLines()}} output for this file} # TODO document encoding +\item{\code{file_lines} (\code{character}) the \code{\link[=readLines]{readLines()}} output for this file} \item{\code{content} (\code{character}) for .R files, the same as \code{file_lines}; for .Rmd scripts, this is the extracted R source code (as text)} \item{\code{full_parsed_content} (\code{data.frame}) as given by @@ -47,8 +47,18 @@ newline (as determined by \code{\link[=readLines]{readLines()}} producing a corr } } \item{error}{A \code{Lint} object describing any parsing error.} -\item{lines}{The \code{\link[=readLines]{readLines()}} output for this file.} # TODO document encoding +\item{lines}{The \code{\link[=readLines]{readLines()}} output for this file.} } \description{ This object is given as input to each linter } +\details{ +The file is read in using the \code{encoding} setting. +This setting found by taking the first valid result from the following locations +\enumerate{ +\item The \code{encoding} key from the usual lintr configuration settings. +\item The \code{Encoding} field from a Package \code{DESCRIPTION} file in a parent directory. +\item The \code{Encoding} field from an R Project \code{.Rproj} file in a parent directory. +\item \code{"UTF-8"} as a fallback. +} +} diff --git a/tests/testthat/dummy_packages/cp1252/R/cp1252.R b/tests/testthat/dummy_packages/cp1252/R/cp1252.R new file mode 100644 index 000000000..a2ffbe425 --- /dev/null +++ b/tests/testthat/dummy_packages/cp1252/R/cp1252.R @@ -0,0 +1,4 @@ +# This file is encoded in Cp-1252 + +# Comment containing non-ASCII ä +ü <- 42 \ No newline at end of file diff --git a/tests/testthat/dummy_packages/cp1252/R/metropolis-hastings-rho.R b/tests/testthat/dummy_packages/cp1252/R/metropolis-hastings-rho.R deleted file mode 100644 index 456983eaf..000000000 --- a/tests/testthat/dummy_packages/cp1252/R/metropolis-hastings-rho.R +++ /dev/null @@ -1,71 +0,0 @@ -# Copied from the spatialprobit package https://raw.githubusercontent.com/cran/spatialprobit/master/R/metropolis-hastings-rho.R -# This file is encoded in Cp-1252 - -# @param beta parameter vektor (k x 1) -# @param z imputed observed variables before truncation (n x 1) -# @param W spatial weight matrix (n x n) -# @param X design matrix (n x k) -density_rho <- function(rho, beta, z, I_n, W, X, type=c("SAR","SEM")) { - A <- (I_n - rho * W) # n x n - if (type == "SAR") { - S <- A %*% z - X %*% beta # n x 1; Residuals of SAR - } else { - S <- as.numeric(A %*% (z - X %*% beta)) # n x 1; Residuals of SEM - } - as.numeric(det(A) * exp(-0.5 * t(S) %*% S)) - #as.vector(exp(log(det(A)) - 0.5 * t(S) %*% S)) -} - -# Metropolis-Hastings-Chain with tuned acceptance rate, see LeSage (2009) -# -# proposal density is normal distribution -# g(rho* | rho_t) = rho_t + c * N(0, 1) -# -# @param type "SEM" or "SAR", -# @param n number of samples to draw -# @param start.value start value for rho -# @param c tuning parameter -draw_rho_metropolis <- function(type="SEM", n, z, I_n, W, X, burn.in=100, start.value=0.2, c=1) { -n <- n + burn.in -u <- runif(n=n, 0, 1) # samples for M-H-ratio -s <- rnorm(n=n, 0, 1) # realisations from proposal density -rho_t <- numeric(n) # vector for rho within the chain -rho_t[1] <- start.value # start value -p_t <- density_rho(rho_t[1], beta=beta, z=z, I_n=I_n, W=W, X=X, type=type) # f(rho_t | beta, z) -i <- 2 # number of accepted proposals / length of chain -acceptance_rate <- numeric(n) # running acceptance rate -num_accepted <- 0 -while (i <= n) { - - # create proposal rho_p from proposal density g(rho_p | rho_{t-1}) ~ N(rho_{t-1}, c^2) - rho_p <- rho_t[i-1] + c * s[i] # proposal rho = letztes akzeptiertes Rho + normalverteilte Zufallsstreuung s - - # Berechnung der Dichte f(rho_p) für das Proposal rho_p - p_p <- density_rho(rho_p, beta=beta, z=z, I_n=I_n, W=W, X=X, type=type) - # SW: Berechnung der Dichte ist teuer, daher besser für ein Grid vorher rechnen? - # Jain, Dichte p(rho|t,beta( hängt von z, beta ab. Aber zumindestens die Log-Determinante kann man vorher berechnen - # Berechnung Dichte f(rho_{t-1}) für letzten Wert der Chain - # p_t <- p(rho_t[i-1], beta=beta, z=z, W=W, X=X, type=type) # SW: Kann ich die alte Dichte nicht merken, dann muss ich die nicht neu berechnen - - # Wegen Symmetrie der Normalverteilung als Proposal-Dichte g(rho_p | rho_t) = g(rho_t | rho_p) - # vereinfacht sich das M-H-Ratio von [ f(rho_p) * g(rho_t | rho_p) ] / [ f(rho_t) * g(rho_p | rho_t) ] - # auf f(rho_p) / f(rho_t)! - # determine M-H-Ratio R(rho_p, rho_t) = min[ 1 ; p(rho_p) / p(rho_t) ] - # see LeSage (2009), eqn (5.27) - R <- min(1, p_p / p_t) - if (u[i] <= R) { # see Givens/Hoeting, p.184, eqn (7.2) - # accept proposal - rho_t[i] <- rho_p - p_t <- p_p # save density - num_accepted <- num_accepted + 1 - } else { - # stay with the current value - rho_t[i] <- rho_t[i-1] - } - acceptance_rate[i] <- num_accepted / i - if (acceptance_rate[i] < 0.4) c <- c / 1.1 # Wenn Akzeptanzrate zu klein, dann verringere (?) Streuungsparameter "c" - if (acceptance_rate[i] > 0.6) c <- c * 1.1 - i <- i + 1 -} -return(list(rho_t=rho_t,acceptance_rate=acceptance_rate)) -} diff --git a/tests/testthat/dummy_projects/project/cp1252.R b/tests/testthat/dummy_projects/project/cp1252.R new file mode 100644 index 000000000..a2ffbe425 --- /dev/null +++ b/tests/testthat/dummy_projects/project/cp1252.R @@ -0,0 +1,4 @@ +# This file is encoded in Cp-1252 + +# Comment containing non-ASCII ä +ü <- 42 \ No newline at end of file diff --git a/tests/testthat/dummy_projects/project/metropolis-hastings-rho.R b/tests/testthat/dummy_projects/project/metropolis-hastings-rho.R deleted file mode 100644 index aba63b6ed..000000000 --- a/tests/testthat/dummy_projects/project/metropolis-hastings-rho.R +++ /dev/null @@ -1,71 +0,0 @@ -# Copied from https://raw.githubusercontent.com/cran/spatialprobit/master/R/metropolis-hastings-rho.R -# This file is encoded in Cp-1252 - -# @param beta parameter vektor (k x 1) -# @param z imputed observed variables before truncation (n x 1) -# @param W spatial weight matrix (n x n) -# @param X design matrix (n x k) -density_rho <- function(rho, beta, z, I_n, W, X, type=c("SAR","SEM")) { - A <- (I_n - rho * W) # n x n - if (type == "SAR") { - S <- A %*% z - X %*% beta # n x 1; Residuals of SAR - } else { - S <- as.numeric(A %*% (z - X %*% beta)) # n x 1; Residuals of SEM - } - as.numeric(det(A) * exp(-0.5 * t(S) %*% S)) - #as.vector(exp(log(det(A)) - 0.5 * t(S) %*% S)) -} - -# Metropolis-Hastings-Chain with tuned acceptance rate, see LeSage (2009) -# -# proposal density is normal distribution -# g(rho* | rho_t) = rho_t + c * N(0, 1) -# -# @param type "SEM" or "SAR", -# @param n number of samples to draw -# @param start.value start value for rho -# @param c tuning parameter -draw_rho_metropolis <- function(type="SEM", n, z, I_n, W, X, burn.in=100, start.value=0.2, c=1) { -n <- n + burn.in -u <- runif(n=n, 0, 1) # samples for M-H-ratio -s <- rnorm(n=n, 0, 1) # realisations from proposal density -rho_t <- numeric(n) # vector for rho within the chain -rho_t[1] <- start.value # start value -p_t <- density_rho(rho_t[1], beta=beta, z=z, I_n=I_n, W=W, X=X, type=type) # f(rho_t | beta, z) -i <- 2 # number of accepted proposals / length of chain -acceptance_rate <- numeric(n) # running acceptance rate -num_accepted <- 0 -while (i <= n) { - - # create proposal rho_p from proposal density g(rho_p | rho_{t-1}) ~ N(rho_{t-1}, c^2) - rho_p <- rho_t[i-1] + c * s[i] # proposal rho = letztes akzeptiertes Rho + normalverteilte Zufallsstreuung s - - # Berechnung der Dichte f(rho_p) für das Proposal rho_p - p_p <- density_rho(rho_p, beta=beta, z=z, I_n=I_n, W=W, X=X, type=type) - # SW: Berechnung der Dichte ist teuer, daher besser für ein Grid vorher rechnen? - # Jain, Dichte p(rho|t,beta( hängt von z, beta ab. Aber zumindestens die Log-Determinante kann man vorher berechnen - # Berechnung Dichte f(rho_{t-1}) für letzten Wert der Chain - # p_t <- p(rho_t[i-1], beta=beta, z=z, W=W, X=X, type=type) # SW: Kann ich die alte Dichte nicht merken, dann muss ich die nicht neu berechnen - - # Wegen Symmetrie der Normalverteilung als Proposal-Dichte g(rho_p | rho_t) = g(rho_t | rho_p) - # vereinfacht sich das M-H-Ratio von [ f(rho_p) * g(rho_t | rho_p) ] / [ f(rho_t) * g(rho_p | rho_t) ] - # auf f(rho_p) / f(rho_t)! - # determine M-H-Ratio R(rho_p, rho_t) = min[ 1 ; p(rho_p) / p(rho_t) ] - # see LeSage (2009), eqn (5.27) - R <- min(1, p_p / p_t) - if (u[i] <= R) { # see Givens/Hoeting, p.184, eqn (7.2) - # accept proposal - rho_t[i] <- rho_p - p_t <- p_p # save density - num_accepted <- num_accepted + 1 - } else { - # stay with the current value - rho_t[i] <- rho_t[i-1] - } - acceptance_rate[i] <- num_accepted / i - if (acceptance_rate[i] < 0.4) c <- c / 1.1 # Wenn Akzeptanzrate zu klein, dann verringere (?) Streuungsparameter "c" - if (acceptance_rate[i] > 0.6) c <- c * 1.1 - i <- i + 1 -} -return(list(rho_t=rho_t,acceptance_rate=acceptance_rate)) -} \ No newline at end of file diff --git a/tests/testthat/test-get_source_expressions.R b/tests/testthat/test-get_source_expressions.R index 2f9d7350d..bce962b4b 100644 --- a/tests/testthat/test-get_source_expressions.R +++ b/tests/testthat/test-get_source_expressions.R @@ -92,7 +92,7 @@ test_that("Multi-byte character truncated by parser is ignored", { }) test_that("Can read non UTF-8 file", { - file <- "dummy_projects/project/metropolis-hastings-rho.R" + file <- "dummy_projects/project/cp1252.R" read_settings(file) expect_null(get_source_expressions(file)$error) }) diff --git a/tests/testthat/test-settings.R b/tests/testthat/test-settings.R index bd6ab51bc..5a40e87d7 100644 --- a/tests/testthat/test-settings.R +++ b/tests/testthat/test-settings.R @@ -133,11 +133,11 @@ test_that("it has a smart default for encodings", { pkg_file <- file.path("dummy_packages", "cp1252", "R", "metropolis-hastings-rho.R") expect_equal( - find_rproj(proj_file), + normalizePath(find_rproj(proj_file), winslash = "/"), normalizePath(file.path("dummy_projects", "project", "project.Rproj"), winslash = "/") ) expect_equal( - find_package(pkg_file), + normalizePath(find_package(pkg_file), winslash = "/"), normalizePath(file.path("dummy_packages", "cp1252"), winslash = "/") ) From f1a84a659c00a6c5d73bbe5ab18187b34e0bcf8b Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Wed, 30 Jun 2021 22:57:41 +0200 Subject: [PATCH 14/26] remove fix_column_numbers() as it's not needed when srcfile() gets the correct encoding. --- R/get_source_expressions.R | 43 +------------------------------------- 1 file changed, 1 insertion(+), 42 deletions(-) diff --git a/R/get_source_expressions.R b/R/get_source_expressions.R index 04effb687..e9fbffc71 100644 --- a/R/get_source_expressions.R +++ b/R/get_source_expressions.R @@ -295,7 +295,7 @@ get_source_file <- function(source_file, error = identity) { return() # parsed_content is unreliable if encoding is invalid } - fix_eq_assigns(fix_column_numbers(fix_tab_indentations(source_file))) + fix_eq_assigns(fix_tab_indentations(source_file)) } find_line_fun <- function(content) { @@ -331,47 +331,6 @@ find_column_fun <- function(content) { } } -# Adjust the columns that getParseData reports from bytes to characters. -fix_column_numbers <- function(content) { - if (is.null(content)) { - return(NULL) - } - - # Temporary: Apparently this is no longer necessary. - return(content) - - text_lengths <- nchar(content$text[content$terminal], "chars") - byte_lengths <- nchar(content$text[content$terminal], "bytes") - differences <- byte_lengths - text_lengths - - to_change <- which(content$terminal[differences > 0L]) - - adjusted_col1 <- content$col1 - adjusted_col2 <- content$col2 - - for (i in to_change) { - needs_adjustment <- which(content$line1 == content$line1[i] & content$col1 >= content$col1[i]) - - for (j in needs_adjustment) { - adjusted_col1[j] <- content$col1[j] - - sum(differences[ - content$line1[content$terminal] == content$line1[j] & - content$col1[content$terminal] < content$col1[j] - ]) - - adjusted_col2[j] <- content$col2[j] - - sum(differences[ - content$line1[content$terminal] == content$line1[j] & - content$col2[content$terminal] <= content$col2[j] - ]) - } - } - content$col1 <- adjusted_col1 - content$col2 <- adjusted_col2 - content -} - - # Fix column numbers when there are tabs # getParseData() counts 1 tab as a variable number of spaces instead of one: # https://github.com/wch/r-source/blame/e7401b68ab0e032fce3e376aaca9a5431619b2b4/src/main/gram.y#L512 From 7240edac895abb3d22bba857693a28f18157d231 Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Wed, 30 Jun 2021 23:32:28 +0200 Subject: [PATCH 15/26] write files with proper encoding --- R/expect_lint.R | 6 ++++-- R/lint.R | 4 +++- tests/testthat/test-get_source_expressions.R | 6 ++++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/R/expect_lint.R b/R/expect_lint.R index 3241600cd..5b9c99a10 100644 --- a/R/expect_lint.R +++ b/R/expect_lint.R @@ -45,8 +45,10 @@ expect_lint <- function(content, checks, ..., file = NULL, language = "en") { if (is.null(file)) { file <- tempfile() - on.exit(unlink(file), add = TRUE) - writeLines(content, con = file, sep = "\n") + con <- base::file(file, encoding = "UTF-8") + on.exit({ unlink(file) }, add = TRUE) + writeLines(content, con = con, sep = "\n") + close(con) } lints <- lint(file, ...) diff --git a/R/lint.R b/R/lint.R index 513ace6eb..4fd2673d1 100644 --- a/R/lint.R +++ b/R/lint.R @@ -56,8 +56,10 @@ lint <- function(filename, linters = NULL, cache = FALSE, ..., parse_settings = if (inline_data && no_filename) { filename <- tempfile() + con <- file(filename, open = "w", encoding = settings$encoding) on.exit(unlink(filename), add = TRUE) - writeLines(text = text, con = filename, sep = "\n") + writeLines(text = text, con = con, sep = "\n") + close(con) } lines <- if (is.null(text)) { diff --git a/tests/testthat/test-get_source_expressions.R b/tests/testthat/test-get_source_expressions.R index bce962b4b..3e5b1ad77 100644 --- a/tests/testthat/test-get_source_expressions.R +++ b/tests/testthat/test-get_source_expressions.R @@ -1,7 +1,9 @@ with_content_to_parse <- function(content, code) { f <- tempfile() - on.exit(unlink(f)) - writeLines(content, f) + con <- file(f, open = "w", encoding = "UTF-8") + on.exit({ unlink(f) }) + writeLines(content, con) + close(con) source_expressions <- get_source_expressions(f) content_env <- new.env() content_env$pc <- lapply(source_expressions[["expressions"]], `[[`, "parsed_content") From dcdb01c75ee85644002c59cbcb46024c8efb3cd6 Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Thu, 1 Jul 2021 09:54:53 +0200 Subject: [PATCH 16/26] use iconv() instead of connections to get more control over messages --- R/cache.R | 4 +--- R/get_source_expressions.R | 18 +++++++++++++++++- R/utils.R | 7 ++++--- .../dummy_projects/project/cp1252_parseable.R | 4 ++++ tests/testthat/test-get_source_expressions.R | 16 ++++++++++++++++ 5 files changed, 42 insertions(+), 7 deletions(-) create mode 100644 tests/testthat/dummy_projects/project/cp1252_parseable.R diff --git a/R/cache.R b/R/cache.R index cf09b0cb5..7a412be9e 100644 --- a/R/cache.R +++ b/R/cache.R @@ -119,9 +119,7 @@ digest_content <- function(linters, obj) { list(linters, obj$content, is.null(obj$parsed_content)) } else { # assume a filename - con <- file(obj, encoding = settings$encoding) - on.exit(close(con)) - list(linters, readLines(con)) + list(linters, readLines(obj)) } digest::digest(content, algo = "sha1") } diff --git a/R/get_source_expressions.R b/R/get_source_expressions.R index e9fbffc71..7e0a11696 100644 --- a/R/get_source_expressions.R +++ b/R/get_source_expressions.R @@ -87,7 +87,23 @@ get_source_expressions <- function(filename, lines = NULL) { # an error that does not use R_ParseErrorMsg if (is.na(message_info$line)) { - if (grepl("invalid multibyte string, element", e$message, fixed = TRUE)) { + if (grepl("invalid multibyte character in parser at line", e$message, fixed = TRUE)) { + l <- as.integer(re_matches( + e$message, + rex("invalid multibyte character in parser at line ", capture(name = "line", digits)) + )$line) + # Invalid encoding in source code + return( + Lint( + filename = source_file$filename, + line_number = l, + column_number = 1L, + type = "error", + message = "Invalid multibyte character in parser. Is the encoding correct?", + line = source_file$lines[[l]] + ) + ) + } else if (grepl("invalid multibyte string, element", e$message, fixed = TRUE)) { # Invalid encoding, will break even re_matches() below, so we need to handle this first. return( Lint( diff --git a/R/utils.R b/R/utils.R index 3bfe33ee7..eaa6da636 100644 --- a/R/utils.R +++ b/R/utils.R @@ -252,10 +252,8 @@ Linter <- function(fun, name = linter_auto_name()) { # nolint: object_name_linte read_lines <- function(file, encoding = settings$encoding, ...) { terminal_newline <- TRUE - con <- file(file, encoding = encoding) - on.exit(close(con)) lines <- withCallingHandlers({ - readLines(con, warn = TRUE, ...) + readLines(file, warn = TRUE, ...) }, warning = function(w) { if (grepl("incomplete final line found on", w$message, fixed = TRUE)) { @@ -263,6 +261,9 @@ read_lines <- function(file, encoding = settings$encoding, ...) { invokeRestart("muffleWarning") } }) + lines_conv <- iconv(lines, from = encoding, to = "UTF-8") + lines[!is.na(lines_conv)] <- lines_conv[!is.na(lines_conv)] + Encoding(lines) <- "UTF-8" attr(lines, "terminal_newline") <- terminal_newline lines } diff --git a/tests/testthat/dummy_projects/project/cp1252_parseable.R b/tests/testthat/dummy_projects/project/cp1252_parseable.R new file mode 100644 index 000000000..6027ee98b --- /dev/null +++ b/tests/testthat/dummy_projects/project/cp1252_parseable.R @@ -0,0 +1,4 @@ +# This file is encoded in Cp-1252 + +# Comment containing non-ASCII ä +a <- 42 \ No newline at end of file diff --git a/tests/testthat/test-get_source_expressions.R b/tests/testthat/test-get_source_expressions.R index 3e5b1ad77..9e30447f6 100644 --- a/tests/testthat/test-get_source_expressions.R +++ b/tests/testthat/test-get_source_expressions.R @@ -98,3 +98,19 @@ test_that("Can read non UTF-8 file", { read_settings(file) expect_null(get_source_expressions(file)$error) }) + +test_that("Warns if encoding is misspecified", { + file <- "dummy_projects/project/cp1252.R" + read_settings(NULL) + the_lint <- get_source_expressions(file)$error + expect_s3_class(the_lint, "lint") + expect_equal(the_lint$message, "Invalid multibyte character in parser. Is the encoding correct?") + expect_equal(the_lint$line_number, 4L) + + file <- "dummy_projects/project/cp1252_parseable.R" + read_settings(NULL) + the_lint <- get_source_expressions(file)$error + expect_s3_class(the_lint, "lint") + expect_equal(the_lint$message, "Invalid multibyte string. Is the encoding correct?") + expect_equal(the_lint$line_number, 1L) +}) From b97858d26ffcce5e5f5d51121d15f67e4dc0ef22 Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Thu, 1 Jul 2021 10:15:34 +0200 Subject: [PATCH 17/26] update .lintr_new make lint_package test for warning more resilient (it failed the codecov build) --- .lintr_new | 1 + tests/testthat/test-lint_package.R | 19 +++++++++---------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.lintr_new b/.lintr_new index a8be72455..5d6f47b90 100644 --- a/.lintr_new +++ b/.lintr_new @@ -3,6 +3,7 @@ linters: with_defaults( ) exclusions: list( "tests/testthat/dummy_packages", + "tests/testthat/dummy_projects", "tests/testthat/knitr_formats", "tests/testthat/knitr_malformed", "inst", diff --git a/tests/testthat/test-lint_package.R b/tests/testthat/test-lint_package.R index 79db227ab..cf7ed09f1 100644 --- a/tests/testthat/test-lint_package.R +++ b/tests/testthat/test-lint_package.R @@ -113,20 +113,19 @@ test_that( }) test_that("lint_package returns early if no package is found", { - skip_if( - !is.null(suppressWarnings(lint_package(dirname(tempdir())))), - "temp directory matches a package structure" - ) - expect_warning(l <- lint_package(tempdir()), "Didn't find any R package", fixed = TRUE) + + temp_pkg <- tempfile("dir") + dir.create(temp_pkg) + on.exit(unlink(temp_pkg, recursive = TRUE)) + + expect_warning(l <- lint_package(temp_pkg), "Didn't find any R package", fixed = TRUE) expect_null(l) - skip_if( - !is.null(suppressWarnings(lint_package("."))), - "package test directory matches a package structure" - ) # ignore a folder named DESCRIPTION, #702 + file.copy(file.path("dummy_packages", "desc_dir_pkg"), temp_pkg, recursive = TRUE) + expect_warning( - lint_package(file.path("dummy_packages", "desc_dir_pkg", "DESCRIPTION", "R")), + lint_package(file.path(temp_pkg, "desc_dir_pkg", "DESCRIPTION", "R")), "Didn't find any R package", fixed = TRUE ) }) From 0454a4eafa44bbfc64322f2d5f97517c0187b1b5 Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Thu, 1 Jul 2021 10:51:01 +0200 Subject: [PATCH 18/26] fix test for windows, add test for parse_exclusions --- tests/testthat/test-cache.R | 2 -- tests/testthat/test-exclusions.R | 6 +++++- tests/testthat/test-get_source_expressions.R | 10 +++++++++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-cache.R b/tests/testthat/test-cache.R index 941e7a8cd..9d4693a01 100644 --- a/tests/testthat/test-cache.R +++ b/tests/testthat/test-cache.R @@ -480,5 +480,3 @@ test_that("cache = TRUE works with nolint", { writeLines("1+1 # nolint\n", file) expect_length(lint(file, linters, cache = TRUE), 0) }) - -# TODO test encoding-specific behaviour diff --git a/tests/testthat/test-exclusions.R b/tests/testthat/test-exclusions.R index f650feca8..394927eb7 100644 --- a/tests/testthat/test-exclusions.R +++ b/tests/testthat/test-exclusions.R @@ -26,6 +26,10 @@ test_that("it excludes properly", { } }) -# TODO test encoding-specific behaviour +test_that("it doesn't fail when encountering misspecified encodings", { + read_settings(NULL) + + expect_length(parse_exclusions("dummy_projects/project/cp1252.R"), 0L) +}) options(old_ops) diff --git a/tests/testthat/test-get_source_expressions.R b/tests/testthat/test-get_source_expressions.R index 9e30447f6..e89b292ba 100644 --- a/tests/testthat/test-get_source_expressions.R +++ b/tests/testthat/test-get_source_expressions.R @@ -104,7 +104,15 @@ test_that("Warns if encoding is misspecified", { read_settings(NULL) the_lint <- get_source_expressions(file)$error expect_s3_class(the_lint, "lint") - expect_equal(the_lint$message, "Invalid multibyte character in parser. Is the encoding correct?") + + msg <- "Invalid multibyte character in parser. Is the encoding correct?" + if (.Platform$OS.type == "windows") { + # Windows parser throws a different error message because the source code is converted to native encoding + # This results in line 4 becoming <- 42 before the parser sees it. + msg <- "unexpected '<'" + } + + expect_equal(the_lint$message, msg) expect_equal(the_lint$line_number, 4L) file <- "dummy_projects/project/cp1252_parseable.R" From c5d5ceafdc9a3d38c185290174b93d109e1158a9 Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Thu, 1 Jul 2021 11:23:12 +0200 Subject: [PATCH 19/26] update documentation, add NEWS bullet. --- NEWS.md | 2 ++ README.md | 3 +++ vignettes/using_lintr.Rmd | 3 +++ 3 files changed, 8 insertions(+) diff --git a/NEWS.md b/NEWS.md index 5cc18a17e..a1f3b8bd1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -65,6 +65,8 @@ * `lint()` now has a new optional argument `text` for supplying a string or lines directly, e.g. if the file is already in memory or linting is being done ad hoc. (#503, @renkun-ken) * New `pipe_call_linter()` enforces that all steps of `magrittr` pipelines use explicit calls instead of symbols, e.g. `x %>% mean()` instead of `x %>% mean` (@michaelchirico) * `get_source_expressions()` no longer fails if `getParseData()` returns a truncated (invalid) Unicode character as parsed text (#815, #816, @leogama) +* lintr now supports non-system character Encodings. Auto-detects the correct encoding from .Rproj or DESCRIPTION + files in your project. Override the default in the `encoding` setting of lintr. (#752, #782, @AshesITR) # lintr 2.0.1 diff --git a/README.md b/README.md index 5eb157753..cc9b1665e 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,7 @@ The config file (default file name: `.lintr`) is in [Debian Control Field Format - `exclude` - a regex pattern for lines to exclude from linting. Default is "# nolint" - `exclude_start` - a regex pattern to start exclusion range. Default is "# nolint start" - `exclude_end` - a regex pattern to end exclusion range. Default is "# nolint end" +- `encoding` - the encoding used for source files. Default inferred from .Rproj or DESCRIPTION files, fallback to UTF-8 ### .lintr File Example @@ -120,6 +121,7 @@ Below is an example .lintr file that uses: - Excludes a couple of files - Disables a specific linter, and; - Sets different default exclude regexes +- Specifies the file encoding to be ISO-8859-1 (Latin 1) ``` linters: with_defaults( @@ -130,6 +132,7 @@ exclusions: list("inst/doc/creating_linters.R" = 1, "inst/example/bad.R", "tests exclude: "# Exclude Linting" exclude_start: "# Begin Exclude Linting" exclude_end: "# End Exclude Linting" +encoding: "ISO-8859-1" ``` With the following command, you can create a configuration file for `lintr` that ignores all linters that show at least one error: diff --git a/vignettes/using_lintr.Rmd b/vignettes/using_lintr.Rmd index c731c9001..b65ca8a1d 100644 --- a/vignettes/using_lintr.Rmd +++ b/vignettes/using_lintr.Rmd @@ -115,6 +115,9 @@ defaults_table <- data.frame( knitr::kable(defaults_table) ``` +Note that the default `encoding` setting depends on the file to be linted. +If an Encoding is found in a `.Rproj` file or a `DESCRIPTION` file, that encoding overrides the default of UTF-8. + #### Customizing active linters If you only want to customize some linters, you can use the helper function `with_defaults()`, which will keep all From 75fc2d844d63097f727ad79ec8449cb8ff3fc018 Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Thu, 1 Jul 2021 11:45:57 +0200 Subject: [PATCH 20/26] fix lints, reduce cyclocomp of find_default_encoding() --- R/expect_lint.R | 2 +- R/settings.R | 47 +++++++++---------- .../testthat/dummy_packages/cp1252/R/cp1252.R | 2 +- .../testthat/dummy_projects/project/cp1252.R | 2 +- .../dummy_projects/project/cp1252_parseable.R | 2 +- tests/testthat/test-get_source_expressions.R | 2 +- 6 files changed, 27 insertions(+), 30 deletions(-) diff --git a/R/expect_lint.R b/R/expect_lint.R index 5b9c99a10..b362c0eca 100644 --- a/R/expect_lint.R +++ b/R/expect_lint.R @@ -46,7 +46,7 @@ expect_lint <- function(content, checks, ..., file = NULL, language = "en") { if (is.null(file)) { file <- tempfile() con <- base::file(file, encoding = "UTF-8") - on.exit({ unlink(file) }, add = TRUE) + on.exit(unlink(file), add = TRUE) writeLines(content, con = con, sep = "\n") close(con) } diff --git a/R/settings.R b/R/settings.R index 40ea790bb..d71bfcd01 100644 --- a/R/settings.R +++ b/R/settings.R @@ -112,39 +112,36 @@ find_default_encoding <- function(filename) { pkg_path <- find_package(filename) rproj_file <- find_rproj(filename) + pkg_enc <- get_encoding_from_dcf(file.path(pkg_path, "DESCRIPTION")) + rproj_enc <- get_encoding_from_dcf(rproj_file) if (!is.null(rproj_file) && !is.null(pkg_path) && startsWith(rproj_file, pkg_path)) { # Check precedence via directory hierarchy. # Both paths are normalized so checking if rproj_file is within pkg_path is sufficient. - # Force reading from Rproj file - pkg_path <- NULL + # Let Rproj file take precedence + return(rproj_enc %||% pkg_enc) + } else { + # Let DESCRIPTION file take precedence if .Rproj file is further up the directory hierarchy + return(pkg_enc %||% rproj_enc) } - if (!is.null(pkg_path)) { - # Get Encoding from DESCRIPTION - dcf <- tryCatch( - read.dcf(file.path(pkg_path, "DESCRIPTION")), - error = function(e) NULL - ) - if (!is.null(dcf) && nrow(dcf) >= 1L && "Encoding" %in% colnames(dcf)) { - return(unname(dcf[1L, "Encoding"])) - } - } + NULL +} - if (!is.null(rproj_file)) { - # Get Encoding from .Rproj - dcf <- tryCatch( - read.dcf(rproj_file), - error = function(e) NULL, - warning = function(e) NULL - ) +get_encoding_from_dcf <- function(file) { + if (is.null(file)) return(NULL) - if (!is.null(dcf) && nrow(dcf) >= 1L && "Encoding" %in% colnames(dcf)) { - encodings <- dcf[, "Encoding"] - encodings <- encodings[!is.na(encodings)] - if (length(encodings) > 0L) { - return(encodings[1L]) - } + dcf <- tryCatch( + read.dcf(file), + error = function(e) NULL, + warning = function(e) NULL + ) + + if (!is.null(dcf) && nrow(dcf) >= 1L && "Encoding" %in% colnames(dcf)) { + encodings <- dcf[, "Encoding"] + encodings <- encodings[!is.na(encodings)] + if (length(encodings) > 0L) { + return(encodings[1L]) } } diff --git a/tests/testthat/dummy_packages/cp1252/R/cp1252.R b/tests/testthat/dummy_packages/cp1252/R/cp1252.R index a2ffbe425..968f758cd 100644 --- a/tests/testthat/dummy_packages/cp1252/R/cp1252.R +++ b/tests/testthat/dummy_packages/cp1252/R/cp1252.R @@ -1,4 +1,4 @@ # This file is encoded in Cp-1252 # Comment containing non-ASCII ä -ü <- 42 \ No newline at end of file +ü <- 42 diff --git a/tests/testthat/dummy_projects/project/cp1252.R b/tests/testthat/dummy_projects/project/cp1252.R index a2ffbe425..968f758cd 100644 --- a/tests/testthat/dummy_projects/project/cp1252.R +++ b/tests/testthat/dummy_projects/project/cp1252.R @@ -1,4 +1,4 @@ # This file is encoded in Cp-1252 # Comment containing non-ASCII ä -ü <- 42 \ No newline at end of file +ü <- 42 diff --git a/tests/testthat/dummy_projects/project/cp1252_parseable.R b/tests/testthat/dummy_projects/project/cp1252_parseable.R index 6027ee98b..b9072a050 100644 --- a/tests/testthat/dummy_projects/project/cp1252_parseable.R +++ b/tests/testthat/dummy_projects/project/cp1252_parseable.R @@ -1,4 +1,4 @@ # This file is encoded in Cp-1252 # Comment containing non-ASCII ä -a <- 42 \ No newline at end of file +a <- 42 diff --git a/tests/testthat/test-get_source_expressions.R b/tests/testthat/test-get_source_expressions.R index e89b292ba..f233a3ef2 100644 --- a/tests/testthat/test-get_source_expressions.R +++ b/tests/testthat/test-get_source_expressions.R @@ -1,7 +1,7 @@ with_content_to_parse <- function(content, code) { f <- tempfile() con <- file(f, open = "w", encoding = "UTF-8") - on.exit({ unlink(f) }) + on.exit(unlink(f)) writeLines(content, con) close(con) source_expressions <- get_source_expressions(f) From 7b4a9ae92267947f5762c464c8bf93d9b4009788 Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Fri, 2 Jul 2021 18:06:26 +0200 Subject: [PATCH 21/26] return expressions as an empty list() instead of NULL if bad encodings appear. --- R/get_source_expressions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/get_source_expressions.R b/R/get_source_expressions.R index 7e0a11696..6c0548bb7 100644 --- a/R/get_source_expressions.R +++ b/R/get_source_expressions.R @@ -227,7 +227,7 @@ get_source_expressions <- function(filename, lines = NULL) { if (inherits(e, "lint") && !nzchar(e$line)) { # Don't create expression list if it's unreliable (invalid encoding or unhandled parse error) - expressions <- NULL + expressions <- list() } else { expressions <- lapply( X = top_level_expressions(parsed_content), From f8f5e7f6f03a646e26b34f223bb14e0a64538cc7 Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Fri, 2 Jul 2021 18:09:50 +0200 Subject: [PATCH 22/26] more efficient find_rproj() --- R/lint.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/lint.R b/R/lint.R index 4fd2673d1..a36335d4d 100644 --- a/R/lint.R +++ b/R/lint.R @@ -378,21 +378,21 @@ find_package <- function(path) { path } -has_rproj <- function(path) { - length(list.files(path = path, pattern = "\\.Rproj$")) > 0L +find_rproj_at <- function(path) { + head(list.files(path = path, pattern = "\\.Rproj$"), 1L) } find_rproj <- function(path) { path <- normalizePath(path, mustWork = FALSE) - while (!has_rproj(path)) { + while (length(res <- find_rproj_at(path)) == 0L) { path <- dirname(path) if (is_root(path)) { return(NULL) } } - list.files(path = path, pattern = "\\.Rproj$", full.names = TRUE)[1L] + res } is_root <- function(path) { From 88cb2fbba8cd642dae5050dc2f316e1e29132408 Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Fri, 2 Jul 2021 18:12:43 +0200 Subject: [PATCH 23/26] simplify get_encoding_from_dcf() --- R/settings.R | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/R/settings.R b/R/settings.R index d71bfcd01..62190d130 100644 --- a/R/settings.R +++ b/R/settings.R @@ -131,14 +131,13 @@ find_default_encoding <- function(filename) { get_encoding_from_dcf <- function(file) { if (is.null(file)) return(NULL) - dcf <- tryCatch( - read.dcf(file), + encodings <- tryCatch( + drop(read.dcf(file, "Encoding")), error = function(e) NULL, warning = function(e) NULL ) - if (!is.null(dcf) && nrow(dcf) >= 1L && "Encoding" %in% colnames(dcf)) { - encodings <- dcf[, "Encoding"] + if (!is.null(encodings)) { encodings <- encodings[!is.na(encodings)] if (length(encodings) > 0L) { return(encodings[1L]) From ba7714509f7a6c20a70ef38831fc342cb7af844f Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Fri, 2 Jul 2021 18:22:03 +0200 Subject: [PATCH 24/26] full.names = TRUE D'oh --- R/lint.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/lint.R b/R/lint.R index a36335d4d..bdd86bb23 100644 --- a/R/lint.R +++ b/R/lint.R @@ -379,7 +379,7 @@ find_package <- function(path) { } find_rproj_at <- function(path) { - head(list.files(path = path, pattern = "\\.Rproj$"), 1L) + head(list.files(path = path, pattern = "\\.Rproj$", full.names = TRUE), 1L) } find_rproj <- function(path) { From 761d6259dd5fcbc42854d35a86c36e706bc3a9ba Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Fri, 2 Jul 2021 18:42:39 +0200 Subject: [PATCH 25/26] remove pointless NULL check and make sure result is not named --- R/settings.R | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/R/settings.R b/R/settings.R index 62190d130..2547916df 100644 --- a/R/settings.R +++ b/R/settings.R @@ -132,16 +132,14 @@ get_encoding_from_dcf <- function(file) { if (is.null(file)) return(NULL) encodings <- tryCatch( - drop(read.dcf(file, "Encoding")), + unname(drop(read.dcf(file, "Encoding"))), error = function(e) NULL, warning = function(e) NULL ) - if (!is.null(encodings)) { - encodings <- encodings[!is.na(encodings)] - if (length(encodings) > 0L) { - return(encodings[1L]) - } + encodings <- encodings[!is.na(encodings)] + if (length(encodings) > 0L) { + return(encodings[1L]) } NULL From 5c8f4bf56ec0be47ef3a45c84efccee442bea6c7 Mon Sep 17 00:00:00 2001 From: Alexander Rosenstock Date: Fri, 2 Jul 2021 18:48:48 +0200 Subject: [PATCH 26/26] remove unreachable code --- R/settings.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/R/settings.R b/R/settings.R index 2547916df..fdc4863ae 100644 --- a/R/settings.R +++ b/R/settings.R @@ -124,8 +124,6 @@ find_default_encoding <- function(filename) { # Let DESCRIPTION file take precedence if .Rproj file is further up the directory hierarchy return(pkg_enc %||% rproj_enc) } - - NULL } get_encoding_from_dcf <- function(file) {