Skip to content

Commit

Permalink
[R-package] allow use of categorical_features in Dataset when raw d…
Browse files Browse the repository at this point in the history
…ata does not have column names (fixes #4374) (#5184)

* check for number of columns if data is matrix for categorical indices check

* check for error when using a greater index than the number of columns

* apply suggestion

Co-authored-by: James Lamb <jaylamb20@gmail.com>

* revert whitespace change

* check if is filename instead of matrix

Co-authored-by: James Lamb <jaylamb20@gmail.com>
  • Loading branch information
jmoralez and jameslamb authored Apr 30, 2022
1 parent f53fa69 commit 8359da6
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 2 deletions.
5 changes: 3 additions & 2 deletions R-package/R/lgb.Dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -169,12 +169,13 @@ Dataset <- R6::R6Class(
} else {

# Check if more categorical features were output over the feature space
if (max(private$categorical_feature) > length(private$colnames)) {
data_is_not_filename <- !is.character(private$raw_data)
if (data_is_not_filename && max(private$categorical_feature) > ncol(private$raw_data)) {
stop(
"lgb.self.get.handle: supplied a too large value in categorical_feature: "
, max(private$categorical_feature)
, " but only "
, length(private$colnames)
, ncol(private$raw_data)
, " features"
)
}
Expand Down
15 changes: 15 additions & 0 deletions R-package/tests/testthat/test_dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -548,3 +548,18 @@ test_that("lgb.Dataset$get_feature_num_bin() works", {
actual_num_bins <- sapply(1L:5L, ds$get_feature_num_bin)
expect_identical(actual_num_bins, expected_num_bins)
})

test_that("lgb.Dataset can be constructed with categorical features and without colnames", {
  # Inputs: a single-column integer matrix with no dimnames, plus the same
  # data coerced to a sparse "dgCMatrix".
  dense_input <- matrix(rep(c(0L, 1L), 50L), ncol = 1L)
  sparse_input <- as(dense_input, "dgCMatrix")

  # Construction must succeed for both inputs when the categorical feature is
  # referenced by position rather than by name.
  constructed <- list(
    lgb.Dataset(dense_input, categorical_feature = 1L)$construct()
    , lgb.Dataset(sparse_input, categorical_feature = 1L)$construct()
  )

  # Neither Dataset should have picked up column names along the way.
  for (dataset in constructed) {
    expect_null(dataset$.__enclos_env__$private$colnames)
  }

  # A categorical index beyond the number of columns must be rejected with
  # the "too large value" error.
  expect_error(
    lgb.Dataset(dense_input, categorical_feature = 2L)$construct()
    , regexp = "supplied a too large value in categorical_feature: 2 but only 1 features"
  )
})

0 comments on commit 8359da6

Please sign in to comment.