Skip to content

Commit

Permalink
add support for zstd compressed corpora (#542) (#567)
Browse files Browse the repository at this point in the history
* add support for zstd compressed corpora



* revise jinja templating + rename workload param + add new chunks for 1tb workload



---------


(cherry picked from commit a593f0c)

Signed-off-by: Michael Oviedo <mikeovi@amazon.com>
Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
  • Loading branch information
1 parent 964df8c commit df043f5
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 37 deletions.
75 changes: 48 additions & 27 deletions big5/workload.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,34 +22,55 @@
{% endif %}

{% if corpus_size == 100 %}
{
"source-file": "documents-100.json.bz2",
"document-count": 116000000,
"compressed-bytes": 6023614688,
"uncompressed-bytes": 107321418111
}
{% elif corpus_size == 880 %}
{
"source-file": "documents-880.json.bz2",
"document-count": 1020000000,
"compressed-bytes": 53220934846,
"uncompressed-bytes": 943679382267
}
{% elif corpus_size == 1000 %}
{
"source-file": "documents-1000.json.bz2",
"source-file-parts": [ { "name": "documents-1000-part0", "size": 20189061054 }, { "name": "documents-1000-part1", "size": 20189061054 }, { "name": "documents-1000-part2", "size": 20189061055 } ],
"document-count": 1160800000,
"compressed-bytes": 60567183163,
"uncompressed-bytes": 1073936121222
}
{
{% if use_zstd %}
"source-file": "documents-100.json.zst",
"compressed-bytes": 7306225533,
{% else %}
"source-file": "documents-100.json.bz2",
"compressed-bytes": 6023614688,
{% endif %}
"document-count": 116000000,
"uncompressed-bytes": 107321418111
}
{% elif corpus_size == 880 %}
{
{% if use_zstd %}
"source-file": "documents-880.json.zst",
"compressed-bytes": 27685953536,
{% else %}
"source-file": "documents-880.json.bz2",
"compressed-bytes": 53220934846,
{% endif %}
"document-count": 1020000000,
"uncompressed-bytes": 943679382267
}
{% elif corpus_size == 1000 %}
{
{% if use_zstd %}
"source-file": "documents-1000.json.zst",
"source-file-parts": [ { "name": "documents-1000-zstd-part0", "size": 21474836480 }, { "name": "documents-1000-zstd-part1", "size": 21474836480 }, { "name": "documents-1000-zstd-part2", "size": 21474836480 }, { "name": "documents-1000-zstd-part3", "size": 8944159128 } ],
"compressed-bytes": 73368668568,
{% else %}
"source-file": "documents-1000.json.bz2",
"source-file-parts": [ { "name": "documents-1000-part0", "size": 20189061054 }, { "name": "documents-1000-part1", "size": 20189061054 }, { "name": "documents-1000-part2", "size": 20189061055 } ],
"compressed-bytes": 60567183163,
{% endif %}
"document-count": 1160800000,
"uncompressed-bytes": 1073936121222
}
{% elif corpus_size == 60 %}
{
"source-file": "documents-60.json.bz2",
"document-count": 69223950,
"compressed-bytes": 3494648233,
"uncompressed-bytes": 64048001338
}
{
{% if use_zstd %}
"source-file": "documents-60.json.zst",
"compressed-bytes": 4309639180,
{% else %}
"source-file": "documents-60.json.bz2",
"compressed-bytes": 3494648233,
{% endif %}
"document-count": 69223950,
"uncompressed-bytes": 64048001338
}
{% else %}
{
"source-url": "{{ document_url | safe }}",
Expand Down
7 changes: 6 additions & 1 deletion geonames/workload.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,14 @@
"base-url": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/geonames",
"documents": [
{
{% if use_zstd %}
"source-file": "documents-2.json.zst",
"compressed-bytes": 289546908,
{% else %}
"source-file": "documents-2.json.bz2",
"document-count": 11396503,
"compressed-bytes": 265208777,
{% endif %}
"document-count": 11396503,
"uncompressed-bytes": 3547613828
}
]
Expand Down
49 changes: 42 additions & 7 deletions http_logs/workload.json
Original file line number Diff line number Diff line change
Expand Up @@ -116,51 +116,86 @@
"documents": [
{
"target-index": "logs-181998",
{% if use_zstd %}
"source-file": "documents-181998.json.zst",
"compressed-bytes": 16098947,
{% else %}
"source-file": "documents-181998.json.bz2",
"document-count": 2708746,
"compressed-bytes": 13843641,
{% endif %}
"document-count": 2708746,
"uncompressed-bytes": 363512754
},
{
"target-index": "logs-191998",
{% if use_zstd %}
"source-file": "documents-191998.json.zst",
"compressed-bytes": 58306789,
{% else %}
"source-file": "documents-191998.json.bz2",
"document-count": 9697882,
"compressed-bytes": 49546887,
{% endif %}
"document-count": 9697882,
"uncompressed-bytes": 1301732149
},
{
"target-index": "logs-201998",
{% if use_zstd %}
"source-file": "documents-201998.json.zst",
"compressed-bytes": 77690696,
{% else %}
"source-file": "documents-201998.json.bz2",
"document-count": 13053463,
"compressed-bytes": 65759419,
{% endif %}
"document-count": 13053463,
"uncompressed-bytes": 1744012279
},
{
"target-index": "logs-211998",
{% if use_zstd %}
"source-file": "documents-211998.json.zst",
"compressed-bytes": 105769843,
{% else %}
"source-file": "documents-211998.json.bz2",
"document-count": 17647279,
"compressed-bytes": 88445049,
{% endif %}
"document-count": 17647279,
"uncompressed-bytes": 2364230815
},
{
"target-index": "logs-221998",
{% if use_zstd %}
"source-file": "documents-221998.json.zst",
"compressed-bytes": 65227695,
{% else %}
"source-file": "documents-221998.json.bz2",
"document-count": 10716760,
"compressed-bytes": 54274027,
{% endif %}
"document-count": 10716760,
"uncompressed-bytes": 1438320123
},
{
"target-index": "logs-231998",
{% if use_zstd %}
"source-file": "documents-231998.json.zst",
"compressed-bytes": 74590323,
{% else %}
"source-file": "documents-231998.json.bz2",
"document-count": 11961342,
"compressed-bytes": 61043842,
{% endif %}
"document-count": 11961342,
"uncompressed-bytes": 1597530673
},
{
"target-index": "logs-241998",
{% if use_zstd %}
"source-file": "documents-241998.json.zst",
"compressed-bytes": 1111746761,
{% else %}
"source-file": "documents-241998.json.bz2",
"document-count": 181463624,
"compressed-bytes": 907295259,
{% endif %}
"document-count": 181463624,
"uncompressed-bytes": 24555905444
}
]
Expand Down
9 changes: 7 additions & 2 deletions nyc_taxis/workload.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,15 @@
"base-url": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/nyc_taxis",
"documents": [
{
{% if use_zstd %}
"source-file": "documents.json.zst",
"compressed-bytes": 4805742161,
{% else %}
"source-file": "documents.json.bz2",
"#COMMENT": "ML benchmark rely on the fact that the document count stays constant.",
"document-count": 165346692,
"compressed-bytes": 4820107188,
{% endif %}
"#COMMENT": "ML benchmark relies on the fact that the document count stays constant.",
"document-count": 165346692,
"uncompressed-bytes": 79802445255
}
]
Expand Down

0 comments on commit df043f5

Please sign in to comment.