Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support multiple train data on single machine #3900

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion docs/Parameters.rst
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,9 @@ Core Parameters

- ``data`` :raw-html:`<a id="data" title="Permalink to this parameter" href="#data">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = string, aliases: ``train``, ``train_data``, ``train_data_file``, ``data_filename``

- path of training data, LightGBM will train from this data
- path of training data, LightGBM will train from these data

- supports multiple training data files, separated by ``,``

- **Note**: can be used only in CLI version

Expand Down
8 changes: 5 additions & 3 deletions include/LightGBM/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,11 @@ struct Config {
bool linear_tree = false;

// alias = train, train_data, train_data_file, data_filename
// desc = path of training data, LightGBM will train from this data
// default = ""
// desc = path of training data, LightGBM will train from these data
// desc = supports multiple training data files, separated by ``,``
// desc = **Note**: can be used only in CLI version
std::string data = "";
std::vector<std::string> data;

// alias = test, valid_data, valid_data_file, test_data, test_data_file, valid_filenames
// default = ""
Expand Down Expand Up @@ -1011,7 +1013,7 @@ struct Config {

#pragma endregion

size_t file_load_progress_interval_bytes = size_t(10) * 1024 * 1024 * 1024;
static constexpr size_t file_load_progress_interval_bytes = size_t(1) * 1024 * 1024 * 1024;

bool is_parallel = false;
bool is_data_based_parallel = false;
Expand Down
2 changes: 1 addition & 1 deletion include/LightGBM/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -679,7 +679,7 @@ class Dataset {
}

private:
std::string data_filename_;
std::vector<const char*> data_filename_;
/*! \brief Store used features */
std::vector<std::unique_ptr<FeatureGroup>> feature_groups_;
/*! \brief Mapper from real feature index to used index*/
Expand Down
30 changes: 24 additions & 6 deletions include/LightGBM/dataset_loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,17 @@ namespace LightGBM {

class DatasetLoader {
public:
LIGHTGBM_EXPORT DatasetLoader(const Config& io_config, const PredictFunction& predict_fun, int num_class, const std::vector<const char*>& filenames);

LIGHTGBM_EXPORT DatasetLoader(const Config& io_config, const PredictFunction& predict_fun, int num_class, const char* filename);

LIGHTGBM_EXPORT ~DatasetLoader();

LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename, int rank, int num_machines);
LIGHTGBM_EXPORT Dataset* LoadFromFile(const std::vector<const char*>& filenames, int rank, int num_machines);

LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename, int rank, int num_machines) {
return LoadFromFile(std::vector<const char*>{filename}, rank, num_machines);
}

LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename) {
return LoadFromFile(filename, 0, 1);
Expand All @@ -40,25 +46,37 @@ class DatasetLoader {
const std::unordered_set<int>& categorical_features);

private:
Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);
Dataset* LoadFromBinFile(const std::vector<const char*>& data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);

void SetHeader(const char* filename);
void SetHeader(const char* filenames);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should the parameter name `filenames` be changed back to `filename`? Only the first file is used for SetHeader.


void CheckDataset(const Dataset* dataset, bool is_load_from_binary);

std::vector<std::string> LoadTextDataToMemory(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);
std::vector<std::string> LoadTextDataToMemory(const std::vector<const char*>& filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);

std::vector<std::string> LoadTextDataToMemory(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices) {
return LoadTextDataToMemory(std::vector<const char*>{filename}, metadata, rank, num_machines, num_global_data, used_data_indices);
}

std::vector<std::string> SampleTextDataFromMemory(const std::vector<std::string>& data);

std::vector<std::string> SampleTextDataFromFile(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);
std::vector<std::string> SampleTextDataFromFile(const std::vector<const char*>& filenames, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);

std::vector<std::string> SampleTextDataFromFile(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices) {
Copy link
Collaborator

@shiyu1994 shiyu1994 Feb 20, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that this function (the SampleTextDataFromFile overload that takes a single filename) is now unused. Maybe we can remove it.

return SampleTextDataFromFile(std::vector<const char*>{filename}, metadata, rank, num_machines, num_global_data, used_data_indices);
}

void ConstructBinMappersFromTextData(int rank, int num_machines, const std::vector<std::string>& sample_data, const Parser* parser, Dataset* dataset);

/*! \brief Extract local features from memory */
void ExtractFeaturesFromMemory(std::vector<std::string>* text_data, const Parser* parser, Dataset* dataset);

/*! \brief Extract local features from file */
void ExtractFeaturesFromFile(const char* filename, const Parser* parser, const std::vector<data_size_t>& used_data_indices, Dataset* dataset);
void ExtractFeaturesFromFile(const std::vector<const char*>& filenames, const Parser* parser, const std::vector<data_size_t>& used_data_indices, Dataset* dataset);

void ExtractFeaturesFromFile(const char* filename, const Parser* parser, const std::vector<data_size_t>& used_data_indices, Dataset* dataset) {
ExtractFeaturesFromFile(std::vector<const char*>{filename}, parser, used_data_indices, dataset);
}

/*! \brief Check can load from binary file */
std::string CheckCanLoadFromBin(const char* filename);
Expand Down
1 change: 1 addition & 0 deletions include/LightGBM/utils/pipeline_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class PipelineReader {
if (skip_bytes > 0) {
// skip first k bytes
read_cnt = reader->Read(buffer_process.data(), skip_bytes);
Log::Debug("Skipped header \"%s\" in file %s", std::string(buffer_process.data(), read_cnt).c_str(), filename);
}
// read first block
read_cnt = reader->Read(buffer_process.data(), buffer_size);
Expand Down
Loading