Skip to content

Commit

Permalink
clp-s: Correctly report uncompressed size of archives during archive-…
Browse files Browse the repository at this point in the history
…splitting (fixes y-scope#469). (y-scope#463)
  • Loading branch information
gibber9809 authored and Jack Luo committed Dec 4, 2024
1 parent 589689d commit a9c9abe
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 5 deletions.
10 changes: 10 additions & 0 deletions components/core/src/clp_s/JsonFileIterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,4 +156,14 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i
} while (read_new_json());
return false;
}

size_t JsonFileIterator::get_num_bytes_consumed() {
    // Once the document stream has no further documents, everything read so far counts as
    // consumed; reporting the full read count also captures trailing whitespace.
    bool const documents_remain = m_doc_it != m_stream.end();
    if (false == documents_remain) {
        return m_bytes_read;
    }

    // More documents are left in the current buffer, so discount the part of the occupied
    // buffer that lies beyond the next document position.
    auto const unconsumed_bytes = m_buf_occupied - m_next_document_position;
    return m_bytes_read - unconsumed_bytes;
}
} // namespace clp_s
8 changes: 8 additions & 0 deletions components/core/src/clp_s/JsonFileIterator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,14 @@ class JsonFileIterator {
*/
[[nodiscard]] size_t get_num_bytes_read() const { return m_bytes_read; }

/**
* Reports how many bytes of the file have been consumed through get_json. While documents
* remain in the current buffer, the not-yet-consumed remainder of that buffer is excluded
* from the count; once the document stream is exhausted, the total number of bytes read is
* reported so that trailing whitespace is included.
*
* Note: this method cannot be const because checking whether a simdjson iterator is at the
* end of a document stream is a non-const operation.
*
* @return total number of bytes consumed from the file via get_json
*/
[[nodiscard]] size_t get_num_bytes_consumed();

/**
* @return the last error code encountered when iterating over the json file
*/
Expand Down
12 changes: 7 additions & 5 deletions components/core/src/clp_s/JsonParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,7 @@ bool JsonParser::parse() {
simdjson::ondemand::document_stream::iterator json_it;

m_num_messages = 0;
size_t last_num_bytes_read = 0;
size_t last_num_bytes_consumed = 0;
while (json_file_iterator.get_json(json_it)) {
m_current_schema.clear();

Expand All @@ -463,17 +463,19 @@ bool JsonParser::parse() {
->append_message(current_schema_id, m_current_schema, m_current_parsed_message);

if (m_archive_writer->get_data_size() >= m_target_encoded_size) {
size_t num_bytes_read = json_file_iterator.get_num_bytes_read();
m_archive_writer->increment_uncompressed_size(num_bytes_read - last_num_bytes_read);
last_num_bytes_read = num_bytes_read;
size_t num_bytes_read = json_file_iterator.get_num_bytes_consumed();
m_archive_writer->increment_uncompressed_size(
num_bytes_read - last_num_bytes_consumed
);
last_num_bytes_consumed = num_bytes_read;
split_archive();
}

m_current_parsed_message.clear();
}

m_archive_writer->increment_uncompressed_size(
json_file_iterator.get_num_bytes_read() - last_num_bytes_read
json_file_iterator.get_num_bytes_read() - last_num_bytes_consumed
);

if (simdjson::error_code::SUCCESS != json_file_iterator.get_error()) {
Expand Down

0 comments on commit a9c9abe

Please sign in to comment.