Tab correction (#237)

* Added proper handling of tab characters (fixes #44) * Test update for tab linter + small doc update * Add link to line of R's source code responsible for tab indentation * Added a test function to remove redundant code * don't pass lines to fix_tab_indentations separately * Function which calculates tab width now ~2x as fast * Small fixes * Small rework * Properly handle multiline expressions ... and halved the time to correct tab indentations * Further tab indentation correction speed optimizations
r-lib · May 11, 2017 · aa70430 · aa70430
1 parent 65b312f
commit aa70430
Show file tree

Hide file tree

Showing 8 changed files with 146 additions and 17 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,4 +1,5 @@
 # lintr 1.0.0.9001 #
+* Added proper handling of tab characters (fixes #44, @fangly)
 * Fix line number sometimes wrongly reported by no_tab_linter() (#134, @fangly)
 * Fix line and column number sometimes wrongly reported by spaces_inside_linter()
   (#203, @fangly)

diff --git a/R/extract.R b/R/extract.R
@@ -34,7 +34,6 @@ extract_r_source <- function(filename, lines) {
       }
     },
     starts, ends)
-  output
   replace_prefix(output, pattern$chunk.code)
 }
 

diff --git a/R/get_source_expressions.R b/R/get_source_expressions.R
@@ -132,7 +132,7 @@ get_source_file <- function(source_file, error = identity) {
     assign("e", e,  envir = parent.frame())
   }
 
-  fix_eq_assign(adjust_columns(getParseData(source_file)))
+  fix_eq_assigns(fix_column_numbers(fix_tab_indentations(source_file)))
 }
 
 find_line_fun <- function(content) {
@@ -168,9 +168,8 @@ find_column_fun <- function(content) {
   }
 }
 
-# This is used to adjust the columns that getParseData reports from bytes to
-# letters.
-adjust_columns <- function(content) {
+# Adjust the columns that getParseData reports from bytes to characters.
+fix_column_numbers <- function(content) {
   if (is.null(content)) {
     return(NULL)
   }
@@ -207,9 +206,74 @@ adjust_columns <- function(content) {
 }
 
 
+# Fix column numbers when there are tabs
+# getParseData() counts 1 tab as a variable number of spaces instead of one:
+# https://github.com/wch/r-source/blame/e7401b68ab0e032fce3e376aaca9a5431619b2b4/src/main/gram.y#L512
+# The number of spaces is so that the code is brought to the next 8-character indentation level e.g:
+#   "1\t;"          -> "1       ;"
+#   "12\t;"         -> "12      ;"
+#   "123\t;"        -> "123     ;"
+#   "1234\t;"       -> "1234    ;"
+#   "12345\t;"      -> "12345   ;"
+#   "123456\t;"     -> "123456  ;"
+#   "1234567\t;"    -> "1234567 ;"
+#   "12345678\t;"   -> "12345678        ;"
+#   "123456789\t;"  -> "123456789       ;"
+#   "1234567890\t;" -> "1234567890      ;"
+fix_tab_indentations <- function(source_file) {
+  pc <- getParseData(source_file)
+
+  if (is.null(pc)) {
+    return(NULL)
+  }
+
+  tab_cols <- gregexpr("\t", source_file[["lines"]], fixed = TRUE)
+  names(tab_cols) <- seq_along(tab_cols)
+  tab_cols <- tab_cols[!is.na(tab_cols)]  # source lines from .Rmd and other files are NA
+  tab_cols <- lapply(tab_cols, function(x) {if (x[[1L]] < 0L) {NA} else {x}})
+  tab_cols <- tab_cols[!is.na(tab_cols)]
+
+  if (!length(tab_cols)) {
+    return(pc)
+  }
+
+  pc_cols <- c("line1", "line2", "col1", "col2")
+  dat <- matrix(data = unlist(pc[, pc_cols], use.names = FALSE), ncol = 2)
+  lines <- as.integer(names(tab_cols))
+  for (i in seq_along(tab_cols)) {
+    is_curr_line <- dat[, 1L] == lines[[i]]
+    if (any(is_curr_line)) {
+      line_tab_offsets <- tab_offsets(tab_cols[[i]])
+      for (j in seq_along(tab_cols[[i]])) {
+        is_line_to_change <- is_curr_line & dat[, 2L] > tab_cols[[i]][[j]]
+        if (any(is_line_to_change)) {
+          dat[is_line_to_change, 2L] <- dat[is_line_to_change, 2L] - line_tab_offsets[[j]]
+        }
+      }
+    }
+  }
+  pc[, pc_cols] <- dat
+  pc
+}
+
+
+tab_offsets <- function(tab_columns) {
+  cum_offset <- 0L
+  vapply(
+    tab_columns - 1L,
+    function(tab_idx) {
+      offset <- 7L - (tab_idx + cum_offset) %% 8L  # using a tab width of 8 characters
+      cum_offset <<- cum_offset + offset
+      offset
+    },
+    integer(1L),
+    USE.NAMES = FALSE
+  )
+}
+
 # This function wraps equal assign expressions in a parent expression so they
 # are the same as the corresponding <- expression
-fix_eq_assign <- function(pc) {
+fix_eq_assigns <- function(pc) {
   if (is.null(pc)) {
     return(NULL)
   }

diff --git a/R/lint.R b/R/lint.R
@@ -212,11 +212,11 @@ pkg_name <- function(path = find_package()) {
 #' Create a \code{Lint} object
 #' @param filename path to the source file that was linted.
 #' @param line_number line number where the lint occurred.
-#' @param column_number column the lint occurred.
+#' @param column_number column number where the lint occurred.
 #' @param type type of lint.
 #' @param message message used to describe the lint error
 #' @param line code source where the lint occured
-#' @param ranges ranges on the line that should be emphasized.
+#' @param ranges a list of ranges on the line that should be emphasized.
 #' @param linter name of linter that created the Lint object.
 #' @export
 Lint <- function(filename, line_number = 1L, column_number = 1L,

diff --git a/R/no_tab_linter.R b/R/no_tab_linter.R
@@ -26,9 +26,6 @@ no_tab_linter <- function(source_file) {
             type = "style",
             message = "Use spaces to indent, not tabs.",
             line = source_file$lines[[as.character(line_number)]],
-
-            # R outputs tabs with 8 spaces
-            # TODO: this is incorrect for embedded tabs, I am not going to fix it.
             ranges = list(c(start, end)),
             linter = "no_tab_linter"
           )

diff --git a/man/Lint.Rd b/man/Lint.Rd
diff --git a/tests/testthat/test-get_source_expressions.R b/tests/testthat/test-get_source_expressions.R
@@ -0,0 +1,62 @@
+context("get_source_expression")
+
+
+with_content_to_parse <- function(content, code) {
+  f <- tempfile()
+  on.exit(unlink(f))
+  writeLines(content, f)
+  pc <- lapply(get_source_expressions(f)[["expressions"]], `[[`, "parsed_content")
+  eval(substitute(code))
+}
+
+
+test_that("tab positions have been corrected", {
+  with_content_to_parse("1\n\t",
+    expect_length(pc, 2L)
+  )
+
+  with_content_to_parse("TRUE",
+    expect_equivalent(pc[[1L]][pc[[1L]][["text"]] == "TRUE", c("col1", "col2")], c(1L, 4L))
+  )
+
+  with_content_to_parse("\tTRUE",
+    expect_equivalent(pc[[1L]][pc[[1L]][["text"]] == "TRUE", c("col1", "col2")], c(2L, 5L))
+  )
+
+  with_content_to_parse("\t\tTRUE",
+    expect_equivalent(pc[[1]][pc[[1L]][["text"]] == "TRUE", c("col1", "col2")], c(3L, 6L))
+  )
+
+  with_content_to_parse("x\t<-\tTRUE", {
+    expect_equivalent(pc[[1L]][pc[[1L]][["text"]] == "x", c("col1", "col2")], c(1L, 1L))
+    expect_equivalent(pc[[1L]][pc[[1L]][["text"]] == "<-", c("col1", "col2")], c(3L, 4L))
+    expect_equivalent(pc[[1L]][pc[[1L]][["text"]] == "TRUE", c("col1", "col2")], c(6L, 9L))
+  })
+
+  with_content_to_parse("\tfunction\t(x)\t{\tprint(pc[\t,1])\t;\t}", {
+    expect_equivalent(pc[[1L]][pc[[1L]][["text"]] == "function", c("col1", "col2")], c(2L, 9L))
+    expect_equivalent(pc[[1L]][pc[[1L]][["text"]] == "x", c("col1", "col2")], c(12L, 12L))
+    expect_equivalent(pc[[1L]][pc[[1L]][["text"]] == "print", c("col1", "col2")], c(17L, 21L))
+    expect_equivalent(pc[[1L]][pc[[1L]][["text"]] == ";", c("col1", "col2")], c(32L, 32L))
+    expect_equivalent(pc[[1L]][pc[[1L]][["text"]] == "}", c("col1", "col2")], c(34L, 34L))
+  })
+
+  with_content_to_parse("# test tab\n\ns <- 'I have \\t a dog'\nrep(\ts, \t3)", {
+    expect_equivalent(
+      pc[[2L]][pc[[2L]][["text"]] == "'I have \\t a dog'", c("line1", "col1", "col2")],
+      c(3L, 6L, 22L)
+    )
+    expect_equivalent(
+      pc[[3L]][pc[[3L]][["text"]] == "3", c("line1", "col1", "col2")],
+      c(4L, 10L, 10L)
+    )
+  })
+
+  with_content_to_parse("function(){\nTRUE\n\t}", {
+    expect_equivalent(
+      pc[[1L]][1L, c("line1", "col1", "line2", "col2")],
+      c(1L, 1L, 3L, 2L),
+      info = "expression that spans several lines"
+    )
+  })
+})
diff --git a/tests/testthat/test-no_tab_linter.R b/tests/testthat/test-no_tab_linter.R
@@ -10,9 +10,15 @@ test_that("returns the correct linting", {
 
   expect_lint("#\tblah", NULL, no_tab_linter)
 
-  expect_lint("\tblah", c(message = msg, line_number = 1L), no_tab_linter)
+  expect_lint(
+    "\tblah",
+    list(c(message = msg, line_number = 1L, column_number = 1L)),
+    no_tab_linter
+  )
 
-  expect_lint("\n\t\tblah", c(message = msg, line_number = 2L), no_tab_linter)
-
-  # Note: no tests of column number since they are currently incorrect (tabs converted to 8 spaces)
+  expect_lint(
+    "\n\t\t\tblah",
+    list(c(message = msg, line_number = 2L, column_number = 1L)),
+    no_tab_linter
+  )
 })
-Original file line number
+Diff line change
@@ Expand Up / @@ -34,7 +34,6 @@ extract_r_source <- function(filename, lines) { @@
           }
         },
         starts, ends)
-      output
       replace_prefix(output, pattern$chunk.code)
     }
@@ Expand Down @@