r-lib · MichaelChirico · Jul 2, 2021 · Mar 16, 2021 · Mar 16, 2021 · Mar 17, 2021
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -6,7 +6,7 @@
 ^travis-tool\.sh$
 ^.*\.gz$
 ^cran-comments\.md$
-^.*\.Rproj$
+^lintr\.Rproj$
 ^\.Rproj\.user$
 ^\.idea$
 ^\.dev$
@@ -24,3 +24,4 @@
 ^bench$
 ^tests/testthat/dummy_packages/package/[.]Rbuildignore$
 ^tests/testthat/dummy_packages/package/[.]lintr$
+^tests/testthat/dummy_packages/cp1252/[.]Rbuildignore$
diff --git a/.dev/compare_branches.R b/.dev/compare_branches.R
@@ -146,7 +146,11 @@ test_encoding <- function(dir) {
   tryCatch({
     lapply(
       list.files(dir, pattern = "(?i)\\.r(?:md)?$", recursive = TRUE, full.names = TRUE),
-      function(x) nchar(readLines(x, warn = FALSE))
+      function(x) {
+        con <- file(x, encoding = lintr:::find_default_encoding(x))
+        on.exit(close(con))
+        nchar(readLines(con, warn = FALSE))
+      }
     )
     FALSE
   }, error = function(x) TRUE)

diff --git a/.lintr b/.lintr
@@ -7,6 +7,7 @@ exclusions: list(
   "inst/example/bad.R",
   "tests/testthat/default_linter_testcode.R",
   "tests/testthat/dummy_packages",
+  "tests/testthat/dummy_projects",
   "tests/testthat/exclusions-test",
   "tests/testthat/knitr_formats",
   "tests/testthat/knitr_malformed"

diff --git a/.lintr_new b/.lintr_new
@@ -3,6 +3,7 @@ linters: with_defaults(
   )
 exclusions: list(
   "tests/testthat/dummy_packages",
+  "tests/testthat/dummy_projects",
   "tests/testthat/knitr_formats",
   "tests/testthat/knitr_malformed",
   "inst",

diff --git a/NAMESPACE b/NAMESPACE
@@ -52,6 +52,7 @@ export(object_name_linter)
 export(object_usage_linter)
 export(open_curly_linter)
 export(paren_brace_linter)
+export(pipe_call_linter)
 export(pipe_continuation_linter)
 export(semicolon_terminator_linter)
 export(seq_linter)

diff --git a/NEWS.md b/NEWS.md
@@ -65,6 +65,8 @@
 * `lint()` now has a new optional argument `text` for supplying a string or lines directly, e.g. if the file is already in memory or linting is being done ad hoc. (#503, @renkun-ken)
 * New `pipe_call_linter()` enforces that all steps of `magrittr` pipelines use explicit calls instead of symbols, e.g. `x %>% mean()` instead of `x %>% mean` (@michaelchirico)
 * `get_source_expressions()` no longer fails if `getParseData()` returns a truncated (invalid) Unicode character as parsed text (#815, #816, @leogama)
+* lintr now supports non-system character Encodings. Auto-detects the correct encoding from .Rproj or DESCRIPTION 
+  files in your project. Override the default in the `encoding` setting of lintr. (#752, #782, @AshesITR)
 
 # lintr 2.0.1
 

diff --git a/R/exclude.R b/R/exclude.R
@@ -79,6 +79,12 @@ parse_exclusions <- function(file, exclude = settings$exclude,
 
   exclusions <- list()
 
+  e <- tryCatch(nchar(lines), error = identity)
+  if (inherits(e, "error")) {
+    # Invalid encoding. Don't parse exclusions.
+    return(list())
+  }
+
   start_locations <- rex::re_matches(lines, exclude_start, locations = TRUE)[, "end"] + 1L
   starts <- which(!is.na(start_locations))
   ends <- which(rex::re_matches(lines, exclude_end))

diff --git a/R/expect_lint.R b/R/expect_lint.R
@@ -45,8 +45,10 @@ expect_lint <- function(content, checks, ..., file = NULL, language = "en") {
 
   if (is.null(file)) {
     file <- tempfile()
+    con <- base::file(file, encoding = "UTF-8")
     on.exit(unlink(file), add = TRUE)
-    writeLines(content, con = file, sep = "\n")
+    writeLines(content, con = con, sep = "\n")
+    close(con)
   }
 
   lints <- lint(file, ...)

diff --git a/R/get_source_expressions.R b/R/get_source_expressions.R
@@ -1,6 +1,16 @@
 #' Parsed sourced file from a filename
 #'
 #' This object is given as input to each linter
+#'
+#' @details
+#' The file is read in using the `encoding` setting.
+#' This setting found by taking the first valid result from the following locations
+#'
+#' 1. The `encoding` key from the usual lintr configuration settings.
+#' 2. The `Encoding` field from a Package `DESCRIPTION` file in a parent directory.
+#' 3. The `Encoding` field from an R Project `.Rproj` file in a parent directory.
+#' 4. `"UTF-8"` as a fallback.
+#'
 #' @param filename the file to be parsed.
 #' @param lines a character vector of lines.
 #'   If \code{NULL}, then \code{filename} will be read.
@@ -42,7 +52,7 @@
 #' @export
 #' @md
 get_source_expressions <- function(filename, lines = NULL) {
-  source_file <- srcfile(filename)
+  source_file <- srcfile(filename, encoding = settings$encoding)
 
   # Ensure English locale for terminal newline and zero-length variable warning messages
   old_lang <- set_lang("en")
@@ -77,6 +87,36 @@ get_source_expressions <- function(filename, lines = NULL) {
     # an error that does not use R_ParseErrorMsg
     if (is.na(message_info$line)) {
 
+      if (grepl("invalid multibyte character in parser at line", e$message, fixed = TRUE)) {
+        l <- as.integer(re_matches(
+          e$message,
+          rex("invalid multibyte character in parser at line ", capture(name = "line", digits))
+        )$line)
+        # Invalid encoding in source code
+        return(
+          Lint(
+            filename = source_file$filename,
+            line_number = l,
+            column_number = 1L,
+            type = "error",
+            message = "Invalid multibyte character in parser. Is the encoding correct?",
+            line = source_file$lines[[l]]
+          )
+        )
+      } else if (grepl("invalid multibyte string, element", e$message, fixed = TRUE)) {
+        # Invalid encoding, will break even re_matches() below, so we need to handle this first.
+        return(
+          Lint(
+            filename = source_file$filename,
+            line_number = 1L,
+            column_number = 1L,
+            type = "error",
+            message = "Invalid multibyte string. Is the encoding correct?",
+            line = ""
+          )
+        )
+      }
+
       message_info <- re_matches(e$message,
         rex(single_quotes, capture(name = "name", anything), single_quotes,
           anything,
@@ -185,26 +225,31 @@ get_source_expressions <- function(filename, lines = NULL) {
   parsed_content <- get_source_file(source_file, error = lint_error)
   tree <- generate_tree(parsed_content)
 
-  expressions <- lapply(
-    X = top_level_expressions(parsed_content),
-    FUN = get_single_source_expression,
-    parsed_content,
-    source_file,
-    filename,
-    tree
-  )
-
-  # add global expression
-  expressions[[length(expressions) + 1L]] <-
-    list(
-      filename = filename,
-      file_lines = source_file$lines,
-      content = source_file$lines,
-      full_parsed_content = parsed_content,
-      full_xml_parsed_content = safe_parse_to_xml(parsed_content),
-      terminal_newline = terminal_newline
+  if (inherits(e, "lint") && !nzchar(e$line)) {
+    # Don't create expression list if it's unreliable (invalid encoding or unhandled parse error)
+    expressions <- NULL
+  } else {
+    expressions <- lapply(
+      X = top_level_expressions(parsed_content),
+      FUN = get_single_source_expression,
+      parsed_content,
+      source_file,
+      filename,
+      tree
     )
 
+    # add global expression
+    expressions[[length(expressions) + 1L]] <-
+      list(
+        filename = filename,
+        file_lines = source_file$lines,
+        content = source_file$lines,
+        full_parsed_content = parsed_content,
+        full_xml_parsed_content = safe_parse_to_xml(parsed_content),
+        terminal_newline = terminal_newline
+      )
+  }
+
   list(expressions = expressions, error = e, lines = source_file$lines)
 }
 
@@ -235,6 +280,7 @@ get_single_source_expression <- function(loc,
 }
 
 get_source_file <- function(source_file, error = identity) {
+  parse_error <- FALSE
 
   e <- tryCatch(
     source_file$parsed_content <- parse(text = source_file$content, srcfile = source_file, keep.source = TRUE),
@@ -250,9 +296,22 @@ get_source_file <- function(source_file, error = identity) {
 
   if (inherits(e, "error") || inherits(e, "lint")) {
     assign("e", e,  envir = parent.frame())
+    parse_error <- TRUE
   }
 
-  fix_eq_assigns(fix_column_numbers(fix_tab_indentations(source_file)))
+  # Triggers an error if the lines contain invalid characters.
+  e <- tryCatch(
+    nchar(source_file$content, type = "chars"),
+    error = error
+  )
+
+  if (inherits(e, "error") || inherits(e, "lint")) {
+    # Let parse errors take precedence over encoding problems
+    if (!parse_error) assign("e", e,  envir = parent.frame())
+    return() # parsed_content is unreliable if encoding is invalid
+  }
+
+  fix_eq_assigns(fix_tab_indentations(source_file))
 }
 
 find_line_fun <- function(content) {
@@ -288,44 +347,6 @@ find_column_fun <- function(content) {
   }
 }
 
-# Adjust the columns that getParseData reports from bytes to characters.
-fix_column_numbers <- function(content) {
-  if (is.null(content)) {
-    return(NULL)
-  }
-
-  text_lengths <- nchar(content$text[content$terminal], "chars")
-  byte_lengths <- nchar(content$text[content$terminal], "bytes")
-  differences <- byte_lengths - text_lengths
-
-  to_change <- which(differences > 0L)
-
-  adjusted_col1 <- content$col1
-  adjusted_col2 <- content$col2
-
-  for (i in to_change) {
-    needs_adjustment <- which(content$line1 == content$line1[i] & content$col1 >= content$col1[i])
-
-    for (j in needs_adjustment) {
-      adjusted_col1[j] <-
-        content$col1[j] -
-          sum(differences[
-            content$line1 == content$line1[j] &
-            content$col1 < content$col1[j]])
-
-      adjusted_col2[j] <-
-        content$col2[j] -
-          sum(differences[
-            content$line1 == content$line1[j] &
-            content$col2 < content$col2[j]])
-    }
-  }
-  content$col1 <- adjusted_col1
-  content$col2 <- adjusted_col2
-  content
-}
-
-
 # Fix column numbers when there are tabs
 # getParseData() counts 1 tab as a variable number of spaces instead of one:
 # https://github.com/wch/r-source/blame/e7401b68ab0e032fce3e376aaca9a5431619b2b4/src/main/gram.y#L512

diff --git a/R/lint.R b/R/lint.R
@@ -56,8 +56,10 @@ lint <- function(filename, linters = NULL, cache = FALSE, ..., parse_settings =
 
   if (inline_data && no_filename) {
     filename <- tempfile()
+    con <- file(filename, open = "w", encoding = settings$encoding)
     on.exit(unlink(filename), add = TRUE)
-    writeLines(text = text, con = filename, sep = "\n")
+    writeLines(text = text, con = con, sep = "\n")
+    close(con)
   }
 
   lines <- if (is.null(text)) {
@@ -376,6 +378,23 @@ find_package <- function(path) {
   path
 }
 
+has_rproj <- function(path) {
+  length(list.files(path = path, pattern = "\\.Rproj$")) > 0L
+}
+
+find_rproj <- function(path) {
+  path <- normalizePath(path, mustWork = FALSE)
+
+  while (!has_rproj(path)) {
+    path <- dirname(path)
+    if (is_root(path)) {
+      return(NULL)
+    }
+  }
+
+  list.files(path = path, pattern = "\\.Rproj$", full.names = TRUE)[1L]
+}
+
 is_root <- function(path) {
   identical(path, dirname(path))
 }

diff --git a/R/settings.R b/R/settings.R
@@ -17,6 +17,11 @@ read_settings <- function(filename) {
   clear_settings()
 
   config_file <- find_config(filename)
+  default_encoding <- find_default_encoding(filename)
+  if (!is.null(default_encoding)) {
+    # Locally override the default for encoding if we found a smart default
+    default_settings[["encoding"]] <- default_encoding
+  }
 
   if (!is.null(config_file)) {
     f <- function(e) {
@@ -100,6 +105,49 @@ find_config <- function(filename) {
   NULL
 }
 
+find_default_encoding <- function(filename) {
+  if (is.null(filename)) {
+    return(NULL)
+  }
+
+  pkg_path <- find_package(filename)
+  rproj_file <- find_rproj(filename)
+  pkg_enc <- get_encoding_from_dcf(file.path(pkg_path, "DESCRIPTION"))
+  rproj_enc <- get_encoding_from_dcf(rproj_file)
+
+  if (!is.null(rproj_file) && !is.null(pkg_path) && startsWith(rproj_file, pkg_path)) {
+    # Check precedence via directory hierarchy.
+    # Both paths are normalized so checking if rproj_file is within pkg_path is sufficient.
+    # Let Rproj file take precedence
+    return(rproj_enc %||% pkg_enc)
+  } else {
+    # Let DESCRIPTION file take precedence if .Rproj file is further up the directory hierarchy
+    return(pkg_enc %||% rproj_enc)
+  }
+
+  NULL
+}
+
+get_encoding_from_dcf <- function(file) {
+  if (is.null(file)) return(NULL)
+
+  dcf <- tryCatch(
+    read.dcf(file),
+    error = function(e) NULL,
+    warning = function(e) NULL
+  )
+
+  if (!is.null(dcf) && nrow(dcf) >= 1L && "Encoding" %in% colnames(dcf)) {
+    encodings <- dcf[, "Encoding"]
+    encodings <- encodings[!is.na(encodings)]
+    if (length(encodings) > 0L) {
+      return(encodings[1L])
+    }
+  }
+
+  NULL
+}
+
 is_directory <- function(filename) {
   is_dir <- file.info(filename)$isdir
 

diff --git a/R/utils.R b/R/utils.R
@@ -250,7 +250,7 @@ Linter <- function(fun, name = linter_auto_name()) { # nolint: object_name_linte
   structure(fun, class = "linter", name = name)
 }
 
-read_lines <- function(file, ...) {
+read_lines <- function(file, encoding = settings$encoding, ...) {
   terminal_newline <- TRUE
   lines <- withCallingHandlers({
     readLines(file, warn = TRUE, ...)
@@ -261,6 +261,9 @@ read_lines <- function(file, ...) {
       invokeRestart("muffleWarning")
     }
   })
+  lines_conv <- iconv(lines, from = encoding, to = "UTF-8")
+  lines[!is.na(lines_conv)] <- lines_conv[!is.na(lines_conv)]
+  Encoding(lines) <- "UTF-8"
   attr(lines, "terminal_newline") <- terminal_newline
   lines
 }
diff --git a/R/zzz.R b/R/zzz.R
@@ -200,6 +200,7 @@ settings <- NULL
 
   default_settings <<- list(
     linters = default_linters,
+    encoding = "UTF-8",
     exclude = rex::rex("#", any_spaces, "nolint"),
     exclude_start = rex::rex("#", any_spaces, "nolint start"),
     exclude_end = rex::rex("#", any_spaces, "nolint end"),