From adebf4723f9f786d23233048894e28985535cff3 Mon Sep 17 00:00:00 2001
From: Florent Angly <florent.angly@gmail.com>
Date: Fri, 5 May 2017 09:18:23 +0200
Subject: [PATCH] Added proper handling of tab characters (fixes #44)

---
 NEWS.md                                      |  1 +
 R/get_source_expressions.R                   | 67 +++++++++++++++++++-
 R/no_tab_linter.R                            |  3 -
 tests/testthat/test-get_source_expressions.R | 42 ++++++++++++
 4 files changed, 109 insertions(+), 4 deletions(-)
 create mode 100644 tests/testthat/test-get_source_expressions.R

diff --git a/NEWS.md b/NEWS.md
index d87df3414..78f038cc7 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,5 @@
 # lintr 1.0.0.9001 #
+* Added proper handling of tab characters (fixes #44, @fangly)
 * Fix line number sometimes wrongly reported by no_tab_linter() (#134, @fangly)
 * Fix line and column number sometimes wrongly reported by spaces_inside_linter()
   (#203, @fangly)
diff --git a/R/get_source_expressions.R b/R/get_source_expressions.R
index 1f09e9a8b..49441a53c 100644
--- a/R/get_source_expressions.R
+++ b/R/get_source_expressions.R
@@ -132,7 +132,7 @@ get_source_file <- function(source_file, error = identity) {
     assign("e", e,  envir = parent.frame())
   }
 
-  fix_eq_assign(adjust_columns(getParseData(source_file)))
+  fix_eq_assign(fix_tab_indentation(adjust_columns(getParseData(source_file)), source_file$lines))
 }
 
 find_line_fun <- function(content) {
@@ -206,6 +206,71 @@ adjust_columns <- function(content) {
   content
 }
 
+# Restore column numbers without tab indentation
+#
+# parse() and thus getParseData() count 1 tab as a variable number of spaces (see src/main/gram.c).
+# The number of spaces is just so that the code is brought to the next 8-character indentation level
+# e.g.:
+#   "1\t;"          -> "1       ;"
+#   "12\t;"         -> "12      ;"
+#   "123\t;"        -> "123     ;"
+#   "1234\t;"       -> "1234    ;"
+#   "12345\t;"      -> "12345   ;"
+#   "123456\t;"     -> "123456  ;"
+#   "1234567\t;"    -> "1234567 ;"
+#   "12345678\t;"   -> "12345678        ;"
+#   "123456789\t;"  -> "123456789       ;"
+#   "1234567890\t;" -> "1234567890      ;"
+# Fix the column numbers so that each tab counts as a single character, not a tab indentation.
+fix_tab_indentation <- function(pc, lines) {
+  tab_cols <- re_matches(lines, "\t", global = TRUE, locations = TRUE)
+  tab_cols <- lapply(
+    tab_cols,
+    function(cols) {
+      start_cols <- cols[["start"]]
+      if (!is.na(start_cols[[1L]])) {
+        start_cols
+      } else {
+        NA
+      }
+    }
+  )
+  names(tab_cols) <- seq_along(tab_cols)
+  tab_cols <- tab_cols[!is.na(tab_cols)]
+
+  for (line in names(tab_cols)) {
+    tab_widths <- tab_widths(tab_cols[[line]])
+    which_lines <- pc[["line1"]] == as.integer(line)
+    cols <- pc[which_lines, c("col1", "col2")]
+    if (nrow(cols)) {
+      for (tab_col in tab_cols[[line]]) {
+        which_cols <- cols > tab_col
+        cols[which_cols] <- cols[which_cols] - tab_widths[[as.character(tab_col)]] + 1L
+        pc[which_lines, c("col1", "col2")] <- cols
+      }
+    }
+  }
+
+  pc
+}
+
+
+tab_widths <- function(tab_columns, indent_width = 8L) {
+  nms <- as.character(tab_columns)
+  widths <- vapply(
+    seq_along(tab_columns),
+    function(i) {
+      tab_col <- tab_columns[[i]]
+      tab_width <- indent_width - (tab_col - 1L) %% indent_width
+      which_cols <- tab_columns > tab_col
+      tab_columns[which_cols] <<- tab_columns[which_cols] + tab_width - 1L
+      tab_width
+    },
+    integer(1L)
+  )
+  names(widths) <- nms
+  widths
+}
 
 # This function wraps equal assign expressions in a parent expression so they
 # are the same as the corresponding <- expression
diff --git a/R/no_tab_linter.R b/R/no_tab_linter.R
index bd2a2a183..46ad73315 100644
--- a/R/no_tab_linter.R
+++ b/R/no_tab_linter.R
@@ -26,9 +26,6 @@ no_tab_linter <- function(source_file) {
             type = "style",
             message = "Use spaces to indent, not tabs.",
             line = source_file$lines[[as.character(line_number)]],
-
-            # R outputs tabs with 8 spaces
-            # TODO: this is incorrect for embedded tabs, I am not going to fix it.
             ranges = list(c(start, end)),
             linter = "no_tab_linter"
           )
diff --git a/tests/testthat/test-get_source_expressions.R b/tests/testthat/test-get_source_expressions.R
new file mode 100644
index 000000000..9394f13e2
--- /dev/null
+++ b/tests/testthat/test-get_source_expressions.R
@@ -0,0 +1,42 @@
+context("get_source_expression")
+
+test_that("tab positions have been corrected", {
+  f <- tempfile()
+
+  writeLines("1\n\t", f)
+  expect_error(lintr:::get_source_expressions(f), NA, label="empty parsed_content line")
+
+  writeLines("TRUE", f)
+  pc <- lintr:::get_source_expressions(f)[["expressions"]][[1L]][["parsed_content"]]
+  expect_equivalent(pc[pc[["text"]] == "TRUE", c("col1", "col2")], c(1L, 4L))
+
+  writeLines("\tTRUE", f)
+  pc <- lintr:::get_source_expressions(f)[["expressions"]][[1L]][["parsed_content"]]
+  expect_equivalent(pc[pc[["text"]] == "TRUE", c("col1", "col2")], c(2L, 5L))
+
+  writeLines("\t\tTRUE", f)
+  pc <- lintr:::get_source_expressions(f)[["expressions"]][[1L]][["parsed_content"]]
+  expect_equivalent(pc[pc[["text"]] == "TRUE", c("col1", "col2")], c(3L, 6L))
+
+  writeLines("n\t<=\tTRUE", f)
+  pc <- lintr:::get_source_expressions(f)[["expressions"]][[1L]][["parsed_content"]]
+  expect_equivalent(pc[pc[["text"]] == "n", c("col1", "col2")], c(1L, 1L))
+  expect_equivalent(pc[pc[["text"]] == "<=", c("col1", "col2")], c(3L, 4L))
+  expect_equivalent(pc[pc[["text"]] == "TRUE", c("col1", "col2")], c(6L, 9L))
+
+  writeLines("\tfunction\t(x)\t{\tprint(pc[\t,1])\t;\t}", f)
+  pc <- lintr:::get_source_expressions(f)[["expressions"]][[1L]][["parsed_content"]]
+  expect_equivalent(pc[pc[["text"]] == "function", c("col1", "col2")], c(2L, 9L))
+  expect_equivalent(pc[pc[["text"]] == "x", c("col1", "col2")], c(12L, 12L))
+  expect_equivalent(pc[pc[["text"]] == "print", c("col1", "col2")], c(17L, 21L))
+  expect_equivalent(pc[pc[["text"]] == ";", c("col1", "col2")], c(32L, 32L))
+  expect_equivalent(pc[pc[["text"]] == "}", c("col1", "col2")], c(34L, 34L))
+
+  writeLines("# test tab\n\ns <- 'I have \\t a dog'\nrep(\ts, \t3)", f)
+  y <- lintr:::get_source_expressions(f)[["expressions"]]
+  pc <- y[[2]][["parsed_content"]]
+  expect_equivalent(pc[pc[["token"]] == "STR_CONST", c("line1", "col1", "col2")], c(3L, 6L, 22L))
+  pc <- y[[3]][["parsed_content"]]
+  expect_equivalent(pc[pc[["token"]] == "NUM_CONST", c("line1", "col1", "col2")], c(4L, 10L, 10L))
+
+})