diff --git a/Cargo.lock b/Cargo.lock index cceaf470..ed15017d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "addr2line" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +dependencies = [ + "gimli", +] + [[package]] name = "adler" version = "1.0.2" @@ -10,72 +19,108 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.8.3" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01" dependencies = [ "cfg-if", "getrandom", "once_cell", "version_check", + "zerocopy", ] [[package]] name = "aho-corasick" -version = "1.0.2" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" dependencies = [ "memchr", ] +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + [[package]] name = "anstream" -version = "0.3.2" +version = "0.6.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" +checksum = "4cd2405b3ac1faab2990b74d728624cd9fd115651fcecc7c2d8daf01376275ba" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", - "is-terminal", "utf8parse", ] [[package]] name = "anstyle" -version = "1.0.1" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a30da5c5f2d5e72842e00bcb57657162cdabef0931f40e2deb9b4140440cecd" +checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" [[package]] name = "anstyle-parse" -version = "0.2.1" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333" +checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.0.0" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" +checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] name = "anstyle-wincon" -version = "1.0.1" +version = "3.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" +checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" dependencies = [ "anstyle", - "windows-sys 0.48.0", + "windows-sys 0.52.0", +] + +[[package]] +name = "anyhow" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" + +[[package]] +name = 
"atomic-traits" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b29ec3788e96fb4fdb275ccb9d62811f2fa903d76c5eb4dd6fe7d09a7ed5871f" +dependencies = [ + "cfg-if", + "rustc_version 0.3.3", +] + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi 0.1.19", + "libc", + "winapi", ] [[package]] @@ -102,11 +147,11 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 1.9.0", "hex", "http", "hyper", - "ring", + "ring 0.16.20", "time", "tokio", "tower", @@ -122,7 +167,7 @@ checksum = "1fcdb2f7acbc076ff5ad05e7864bdb191ca70a6fd07668dc3a1a8bcd051de5ae" dependencies = [ "aws-smithy-async", "aws-smithy-types", - "fastrand", + "fastrand 1.9.0", "tokio", "tracing", "zeroize", @@ -325,14 +370,14 @@ dependencies = [ "aws-smithy-http-tower", "aws-smithy-types", "bytes", - "fastrand", + "fastrand 1.9.0", "http", "http-body", "hyper", "hyper-rustls", "lazy_static", "pin-project-lite", - "rustls", + "rustls 0.20.9", "tokio", "tower", "tracing", @@ -441,15 +486,36 @@ dependencies = [ "aws-smithy-http", "aws-smithy-types", "http", - "rustc_version", + "rustc_version 0.4.0", "tracing", ] +[[package]] +name = "backtrace" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" -version = "0.21.2" +version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "604178f6c5c21f02dc555784810edfb88d34ac2c73b2eae109655649ee73ce3d" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" [[package]] name = "base64-simd" @@ -467,6 +533,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" + [[package]] name = "block-buffer" version = "0.10.4" @@ -478,27 +550,27 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.13.0" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" +checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" [[package]] name = "byteorder" -version = "1.4.3" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" +checksum = 
"a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" [[package]] name = "bytes-utils" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e47d3a8076e283f3acd27400535992edb3ba4b5bb72f8891ad8fbe7932a7d4b9" +checksum = "7dafe3a8757b027e2be6e4e5601ed563c55989fcf1546e933c66c8eb3a058d35" dependencies = [ "bytes", "either", @@ -506,9 +578,12 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.79" +version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + "libc", +] [[package]] name = "cfg-if" @@ -518,45 +593,58 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.3.8" +version = "2.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" +dependencies = [ + "ansi_term", + "atty", + "bitflags 1.3.2", + "strsim 0.8.0", + "textwrap", + "unicode-width", + "vec_map", +] + +[[package]] +name = "clap" +version = "4.4.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9394150f5b4273a1763355bd1c2ec54cc5a2593f790587bcd6b2c947cfa9211" +checksum = "58e54881c004cec7895b0068a0a954cd5d62da01aef83fa35b1e594497bf5445" dependencies = [ "clap_builder", "clap_derive", - "once_cell", ] [[package]] name = "clap_builder" -version = "4.3.8" +version = "4.4.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a78fbdd3cc2914ddf37ba444114bc7765bbdcb55ec9cbe6fa054f0137400717" +checksum = "59cb82d7f531603d2fd1f507441cdd35184fa81beff7bd489570de7f773460bb" dependencies = [ "anstream", "anstyle", - "bitflags", "clap_lex", - "strsim", + "strsim 0.10.0", ] [[package]] name = "clap_derive" -version = "4.3.2" +version = "4.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8cd2b2a819ad6eec39e8f1d6b53001af1e5469f8c177579cdaeb313115b825f" +checksum = "cf9804afaaf59a91e75b022a30fb7229a7901f60c755489cc61c9b423b836442" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.48", ] [[package]] name = "clap_lex" -version = "0.5.0" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" +checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" [[package]] name = "cmake" @@ -573,11 +661,34 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +[[package]] +name = "colored" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbf2150cce219b664a8a70df7a1f933836724b503f8a413af9365b4dcc4d90b8" +dependencies = [ + "lazy_static", + "windows-sys 0.48.0", +] + +[[package]] +name = "console" +version = "0.15.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb" +dependencies = [ + "encode_unicode", + "lazy_static", + "libc", + "unicode-width", + "windows-sys 0.52.0", +] + [[package]] name = "core-foundation" -version = "0.9.3" +version = 
"0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" dependencies = [ "core-foundation-sys", "libc", @@ -585,26 +696,26 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.4" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" [[package]] name = "cpufeatures" -version = "0.2.8" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03e69e28e9f7f77debdedbaafa2866e1de9ba56df55a8bd7cfc724c25a09987c" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" dependencies = [ "libc", ] [[package]] name = "crc32c" -version = "0.6.3" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e" +checksum = "d8f48d60e5b4d2c53d5c2b1d8a58c849a70ae5e5509b08a48d047e3b65714a74" dependencies = [ - "rustc_version", + "rustc_version 0.4.0", ] [[package]] @@ -616,48 +727,30 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "crossbeam-channel" -version = "0.5.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" -dependencies = [ - "cfg-if", - "crossbeam-utils", -] - [[package]] name = "crossbeam-deque" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ - "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" -version = "0.9.15" +version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ - "autocfg", - "cfg-if", "crossbeam-utils", - "memoffset", - "scopeguard", ] [[package]] name = "crossbeam-utils" -version = "0.8.16" +version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" -dependencies = [ - "cfg-if", -] +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "crypto-common" @@ -669,6 +762,81 @@ dependencies = [ "typenum", ] +[[package]] +name = "darling" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim 0.10.0", + "syn 1.0.109", +] + +[[package]] +name = "darling_macro" +version = "0.14.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" +dependencies = [ + "darling_core", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "deranged" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" +dependencies = [ + "powerfmt", +] + +[[package]] +name = "derive_builder" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d67778784b508018359cbc8696edb3db78160bab2c2a28ba7f56ef6932997f8" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c11bdc11a0c47bc7d37d582b5285da6849c96681023680b906673c5707af7b0f" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "derive_builder_macro" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e" +dependencies = [ + "derive_builder_core", + "syn 1.0.109", +] + [[package]] name = "digest" version = "0.10.7" @@ -680,27 +848,60 @@ dependencies = [ "subtle", ] +[[package]] +name = "dirs" +version = "5.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c" +dependencies = [ + "libc", + "option-ext", + "redox_users", + "windows-sys 0.48.0", +] + [[package]] name = "dolma" -version = "0.9.4" +version = "1.0.0" dependencies = [ "ahash", + "anyhow", + "atomic-traits", "aws-config", "aws-sdk-s3", "byteorder", - "clap", + "clap 4.4.16", + "console", "env_logger", "flate2", "glob", + "humantime", + "indicatif", "jsonpath-rust", "log", + "num-traits", + "num_cpus", + "parse-size", "pyo3", "rand", "rayon", "regex", "serde", "serde_json", + "simple_logger", + "structopt", + "thousands", "threadpool", + "tokenizers", "tokio", "tokio-util", "unicode-segmentation", @@ -708,15 +909,21 @@ dependencies = [ [[package]] name = "either" -version = "1.8.1" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" + +[[package]] +name = "encode_unicode" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" [[package]] name = "env_logger" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0" +checksum = "95b3f3e67048839cb0d0781f445682a35113da7121f7c949db0e2be96a4fbece" dependencies = [ "humantime", "is-terminal", @@ -733,23 +940,21 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.1" +version = "0.3.8" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ - "errno-dragonfly", "libc", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] -name = "errno-dragonfly" -version = "0.1.2" +name = "esaxx-rs" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" dependencies = [ "cc", - "libc", ] [[package]] @@ -761,6 +966,12 @@ dependencies = [ "instant", ] +[[package]] +name = "fastrand" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" + [[package]] name = "flate2" version = "1.0.28" @@ -778,47 +989,62 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" dependencies = [ "percent-encoding", ] [[package]] name = "futures-channel" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" dependencies = [ "futures-core", ] [[package]] name = "futures-core" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" [[package]] name = "futures-sink" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" [[package]] name = "futures-task" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" [[package]] name = "futures-util" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" +checksum = 
"3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" dependencies = [ "futures-core", "futures-task", @@ -838,15 +1064,21 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.10" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" dependencies = [ "cfg-if", "libc", "wasi", ] +[[package]] +name = "gimli" +version = "0.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" + [[package]] name = "glob" version = "0.3.1" @@ -878,6 +1110,15 @@ version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +[[package]] +name = "heck" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "heck" version = "0.4.1" @@ -886,18 +1127,18 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "hermit-abi" -version = "0.2.6" +version = "0.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" dependencies = [ "libc", ] [[package]] name = "hermit-abi" -version = "0.3.1" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" +checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" [[package]] name = "hex" @@ -905,6 +1146,23 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "hf-hub" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b780635574b3d92f036890d8373433d6f9fc7abb320ee42a5c25897fc8ed732" +dependencies = [ + "dirs", + "indicatif", + "log", + "native-tls", + "rand", + "serde", + "serde_json", + "thiserror", + "ureq", +] + [[package]] name = "hmac" version = "0.12.1" @@ -916,9 +1174,9 @@ dependencies = [ [[package]] name = "http" -version = "0.2.9" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd6effc99afb63425aff9b05836f029929e345a6148a14b7ecd5ab67af944482" +checksum = "8947b1a6fad4393052c7ba1f4cd97bed3e953a95c79c92ad9b051a04611d9fbb" dependencies = [ "bytes", "fnv", @@ -927,9 +1185,9 @@ dependencies = [ [[package]] name = "http-body" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" dependencies = [ "bytes", "http", @@ -944,9 +1202,9 @@ checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" [[package]] name = "httpdate" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" [[package]] name = "humantime" @@ -956,9 +1214,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "0.14.27" +version = "0.14.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffb1cfd654a8219eaef89881fdb3bb3b1cdc5fa75ded05d6933b2b382e395468" +checksum = "bf96e135eb83a2a8ddf766e426a841d8ddd7449d5f00d34ea02b41d2f19eef80" dependencies = [ "bytes", "futures-channel", @@ -987,17 +1245,23 @@ dependencies = [ "http", "hyper", "log", - "rustls", + "rustls 0.20.9", "rustls-native-certs", "tokio", "tokio-rustls", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" -version = "0.4.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" dependencies = [ "unicode-bidi", "unicode-normalization", @@ -1013,6 +1277,19 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "indicatif" +version = "0.17.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb28741c9db9a713d93deb3bb9515c20788cef5815265bee4980e87bde7e0f25" +dependencies = [ + "console", + "instant", + "number_prefix", + "portable-atomic", + "unicode-width", +] + [[package]] name = "indoc" version = "1.0.9" @@ -1029,53 +1306,51 @@ dependencies = [ ] [[package]] -name = "io-lifetimes" -version = "1.0.11" +name = "is-terminal" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" +checksum = "0bad00257d07be169d870ab665980b06cdb366d792ad690bf2e76876dc503455" dependencies = [ - "hermit-abi 0.3.1", - "libc", - "windows-sys 0.48.0", + "hermit-abi 0.3.3", + "rustix", + "windows-sys 0.52.0", ] [[package]] -name = "is-terminal" -version = "0.4.7" +name = "itertools" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" dependencies = [ - "hermit-abi 0.3.1", - "io-lifetimes", - "rustix", - "windows-sys 0.48.0", + "either", ] [[package]] name = "itoa" -version = "1.0.6" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" [[package]] name = "js-sys" -version = "0.3.64" +version = "0.3.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" +checksum = "9a1d36f1235bc969acba30b7f5990b864423a6068a10f7c90ae8f0112e3a59d1" dependencies = [ "wasm-bindgen", ] [[package]] name = "jsonpath-rust" -version = "0.3.0" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7ea2fa3ba7d1404aa6b094aceec1d49106ec0110b40c40b76cedae148837a3b" +checksum = 
"06cc127b7c3d270be504572364f9569761a180b981919dd0d87693a7f5fb7829" dependencies = [ "pest", "pest_derive", "regex", "serde_json", + "thiserror", ] [[package]] @@ -1086,9 +1361,20 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.147" +version = "0.2.152" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" + +[[package]] +name = "libredox" +version = "0.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +checksum = "85c833ca1e66078851dba29046874e38f08b2c883700aa29a03ddd3b23814ee8" +dependencies = [ + "bitflags 2.4.1", + "libc", + "redox_syscall", +] [[package]] name = "libz-ng-sys" @@ -1102,15 +1388,15 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.3.8" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" +checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456" [[package]] name = "lock_api" -version = "0.4.10" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" dependencies = [ "autocfg", "scopeguard", @@ -1118,24 +1404,41 @@ dependencies = [ [[package]] name = "log" -version = "0.4.19" +version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" [[package]] -name = "md-5" -version = "0.10.5" +name = "macro_rules_attribute" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6365506850d44bff6e2fbcb5176cf63650e48bd45ef2fe2665ae1570e0f4b9ca" +checksum = "8a82271f7bc033d84bbca59a3ce3e4159938cb08a9c3aebbe54d215131518a13" dependencies = [ - "digest", + "macro_rules_attribute-proc_macro", + "paste", +] + +[[package]] +name = "macro_rules_attribute-proc_macro" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568" + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", ] [[package]] name = "memchr" -version = "2.5.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] name = "memoffset" @@ -1146,6 +1449,12 @@ dependencies = [ "autocfg", ] +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.7.1" @@ -1157,15 +1466,64 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.8" +version = "0.8.10" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" +checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09" dependencies = [ "libc", "wasi", "windows-sys 0.48.0", ] +[[package]] +name = "monostate" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "878c2a1f1c70e5724fa28f101ca787b6a7e8ad5c5e4ae4ca3b0fa4a419fa9075" +dependencies = [ + "monostate-impl", + "serde", +] + +[[package]] +name = "monostate-impl" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f686d68a09079e63b1d2c64aa305095887ce50565f00a922ebfaeeee0d9ba6ce" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "native-tls" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" +dependencies = [ + "lazy_static", + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "num-integer" version = "0.1.45" @@ -1178,28 +1536,91 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.15" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" +checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" dependencies = [ "autocfg", ] [[package]] name = "num_cpus" -version = "1.15.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi 0.2.6", + "hermit-abi 0.3.3", "libc", ] +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "object" +version = "0.32.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +dependencies = [ + "memchr", +] + [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "onig" +version = "6.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f" +dependencies = [ + "bitflags 1.3.2", + "libc", + "once_cell", + "onig_sys", +] + +[[package]] +name = "onig_sys" +version = "69.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b829e3d7e9cc74c7e315ee8edb185bf4190da5acde74afd7fc59c35b1f086e7" +dependencies = [ + "cc", + "pkg-config", +] + +[[package]] +name = "openssl" +version = "0.10.62" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8cde4d2d9200ad5909f8dac647e29482e07c3a35de8a13fce7c9c7747ad9f671" +dependencies = [ + "bitflags 2.4.1", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] [[package]] name = "openssl-probe" @@ -1207,6 +1628,24 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "openssl-sys" +version = "0.9.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1665caf8ab2dc9aef43d1c0023bd904633a6a05cb30b0ad59bec2ae986e57a7" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + [[package]] name = "outref" version = "0.5.1" @@ -1225,38 +1664,51 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.8" +version = "0.9.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" +checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" dependencies = [ "cfg-if", "libc", "redox_syscall", "smallvec", - "windows-targets", + "windows-targets 0.48.5", ] +[[package]] +name = "parse-size" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "944553dd59c802559559161f9816429058b869003836120e262e8caec061b7ae" + +[[package]] +name = "paste" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" + [[package]] name = "percent-encoding" -version = "2.3.0" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "pest" -version = "2.7.0" +version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f73935e4d55e2abf7f130186537b19e7a4abc886a0252380b59248af473a3fc9" +checksum = "1f200d8d83c44a45b21764d1916299752ca035d15ecd46faca3e9a2a2bf6ad06" dependencies = [ + "memchr", "thiserror", "ucd-trie", ] [[package]] name = "pest_derive" -version = "2.7.0" +version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aef623c9bbfa0eedf5a0efba11a5ee83209c326653ca31ff019bec3a95bfff2b" +checksum = "bcd6ab1236bbdb3a49027e920e693192ebfe8913f6d60e294de57463a493cfde" dependencies = [ "pest", "pest_generator", @@ -1264,22 +1716,22 @@ dependencies = [ [[package]] name = "pest_generator" -version = "2.7.0" +version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3e8cba4ec22bada7fc55ffe51e2deb6a0e0db2d0b7ab0b103acc80d2510c190" +checksum = 
"2a31940305ffc96863a735bef7c7994a00b325a7138fdbc5bda0f1a0476d3275" dependencies = [ "pest", "pest_meta", "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.48", ] [[package]] name = "pest_meta" -version = "2.7.0" +version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a01f71cb40bd8bb94232df14b946909e14660e33fc05db3e50ae2a82d7ea0ca0" +checksum = "a7ff62f5259e53b78d1af898941cdcdccfae7385cf7d793a6e55de5d05bb4b7d" dependencies = [ "once_cell", "pest", @@ -1288,29 +1740,29 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.0" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c95a7476719eab1e366eaf73d0260af3021184f18177925b07f54b30089ceead" +checksum = "fda4ed1c6c173e3fc7a83629421152e01d7b1f9b7f65fb301e490e8cfc656422" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.0" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" +checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" dependencies = [ "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.48", ] [[package]] name = "pin-project-lite" -version = "0.2.9" +version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" +checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" [[package]] name = "pin-utils" @@ -1318,26 +1770,68 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkg-config" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d3587f8a9e599cc7ec2c00e331f71c4e69a5f9a4b8a6efd5b07466b9736f9a" + +[[package]] +name = "portable-atomic" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "ppv-lite86" version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn 1.0.109", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + [[package]] name = "proc-macro2" -version = "1.0.63" +version = "1.0.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b368fba921b0dce7e60f5e04ec15e565b3303972b42bcfde1d0713b881959eb" +checksum = "95fc56cda0b5c3325f5fbbd7ff9fda9e02bb00bb3dac51252d2f1bfa1cb8cc8c" dependencies = 
[ "unicode-ident", ] [[package]] name = "pyo3" -version = "0.19.0" +version = "0.19.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cffef52f74ec3b1a1baf295d9b8fcc3070327aefc39a6d00656b13c1d0b8885c" +checksum = "e681a6cfdc4adcc93b4d3cf993749a4552018ee0a9b65fc0ccfad74352c72a38" dependencies = [ "cfg-if", "indoc", @@ -1352,9 +1846,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.19.0" +version = "0.19.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "713eccf888fb05f1a96eb78c0dbc51907fee42b3377272dc902eb38985f418d5" +checksum = "076c73d0bc438f7a4ef6fdd0c3bb4732149136abd952b110ac93e4edb13a6ba5" dependencies = [ "once_cell", "target-lexicon", @@ -1362,9 +1856,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.19.0" +version = "0.19.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b2ecbdcfb01cbbf56e179ce969a048fd7305a66d4cdf3303e0da09d69afe4c3" +checksum = "e53cee42e77ebe256066ba8aa77eff722b3bb91f3419177cf4cd0f304d3284d9" dependencies = [ "libc", "pyo3-build-config", @@ -1372,9 +1866,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.19.0" +version = "0.19.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b78fdc0899f2ea781c463679b20cb08af9247febc8d052de941951024cd8aea0" +checksum = "dfeb4c99597e136528c6dd7d5e3de5434d1ceaf487436a3f03b2d56b6fc9efd1" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -1384,9 +1878,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.19.0" +version = "0.19.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60da7b84f1227c3e2fe7593505de274dcf4c8928b4e0a1c23d551a14e4e80a0f" +checksum = "947dc12175c254889edc0c02e399476c2f652b4b9ebd123aa655c224de259536" dependencies = [ "proc-macro2", "quote", @@ -1395,9 +1889,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.28" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] @@ -1434,51 +1928,89 @@ dependencies = [ [[package]] name = "rayon" -version = "1.7.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" +checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1" dependencies = [ "either", "rayon-core", ] +[[package]] +name = "rayon-cond" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9" +dependencies = [ + "either", + "itertools", + "rayon", +] + [[package]] name = "rayon-core" -version = "1.11.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" +checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed" dependencies = [ - "crossbeam-channel", "crossbeam-deque", "crossbeam-utils", - "num_cpus", ] [[package]] name = "redox_syscall" -version = "0.3.5" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "redox_users" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +checksum = "a18479200779601e498ada4e8c1e1f50e3ee19deb0259c25825a98b5603b2cb4" dependencies = [ - "bitflags", + "getrandom", + "libredox", + "thiserror", ] [[package]] name = "regex" -version = "1.8.4" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax 0.8.2", +] + +[[package]] +name = "regex-automata" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f" +checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.8.2", ] [[package]] name = "regex-syntax" -version = "0.7.2" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" + +[[package]] +name = "regex-syntax" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" [[package]] name = "ring" @@ -1489,47 +2021,87 @@ dependencies = [ "cc", "libc", "once_cell", - "spin", - "untrusted", + "spin 0.5.2", + "untrusted 0.7.1", "web-sys", "winapi", ] +[[package]] +name = "ring" +version = "0.17.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "688c63d65483050968b2a8937f7995f443e27041a0f7700aa59b0822aedebb74" +dependencies = [ + "cc", + "getrandom", + "libc", + "spin 0.9.8", + "untrusted 0.9.0", + "windows-sys 0.48.0", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" + +[[package]] +name = "rustc_version" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0dfe2087c51c460008730de8b57e6a320782fbfb312e1f4d520e6c6fae155ee" +dependencies = [ + "semver 0.11.0", +] + [[package]] name = "rustc_version" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ - "semver", + "semver 1.0.21", ] [[package]] name = "rustix" -version = "0.37.25" +version = "0.38.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035" +checksum = "322394588aaf33c24007e8bb3238ee3e4c5c09c084ab32bc73890b99ff326bca" dependencies = [ - "bitflags", + "bitflags 2.4.1", "errno", - "io-lifetimes", "libc", "linux-raw-sys", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] name = "rustls" -version = "0.20.8" +version = "0.20.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fff78fc74d175294f4e83b28343315ffcfb114b156f0185e9741cb5570f50e2f" 
+checksum = "1b80e3dec595989ea8510028f30c408a4630db12c9cbb8de34203b89d6577e99" dependencies = [ "log", - "ring", + "ring 0.16.20", "sct", "webpki", ] +[[package]] +name = "rustls" +version = "0.21.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d5a6813c0759e4609cd494e8e725babae6a2ca7b62a5536a13daaec6fcb7ba" +dependencies = [ + "log", + "ring 0.17.7", + "rustls-webpki", + "sct", +] + [[package]] name = "rustls-native-certs" version = "0.6.3" @@ -1544,51 +2116,61 @@ dependencies = [ [[package]] name = "rustls-pemfile" -version = "1.0.2" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" +dependencies = [ + "base64 0.21.7", +] + +[[package]] +name = "rustls-webpki" +version = "0.101.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b" +checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" dependencies = [ - "base64", + "ring 0.17.7", + "untrusted 0.9.0", ] [[package]] name = "ryu" -version = "1.0.13" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" +checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" [[package]] name = "schannel" -version = "0.1.21" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "713cfb06c7059f3588fb8044c0fad1d09e3c01d225e25b9220dbfdcf16dbb1b3" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" dependencies = [ - "windows-sys 0.42.0", + "windows-sys 0.52.0", ] [[package]] name = "scopeguard" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "sct" -version = "0.7.0" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4" +checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" dependencies = [ - "ring", - "untrusted", + "ring 0.17.7", + "untrusted 0.9.0", ] [[package]] name = "security-framework" -version = "2.9.1" +version = "2.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fc758eb7bffce5b308734e9b0c1468893cae9ff70ebf13e7090be8dcbcc83a8" +checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" dependencies = [ - "bitflags", + "bitflags 1.3.2", "core-foundation", "core-foundation-sys", "libc", @@ -1597,9 +2179,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.9.0" +version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f51d0c0d83bec45f16480d0ce0058397a69e48fcdc52d1dc8855fb68acbd31a7" +checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" dependencies = [ "core-foundation-sys", "libc", @@ -1607,35 +2189,53 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.17" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6" +dependencies = [ + "semver-parser", +] + +[[package]] +name = "semver" +version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" +checksum = "b97ed7a9823b74f99c7742f5336af7be5ecd3eeafcb1507d1fa93347b1d589b0" + +[[package]] +name = "semver-parser" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7" +dependencies = [ + "pest", +] [[package]] name = "serde" -version = "1.0.164" +version = "1.0.195" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d" +checksum = "63261df402c67811e9ac6def069e4786148c4563f4b50fd4bf30aa370d626b02" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.164" +version = "1.0.195" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68" +checksum = "46fe8f8603d81ba86327b23a2e9cdf49e1255fb94a4c5f297f6ee0547178ea2c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.48", ] [[package]] name = "serde_json" -version = "1.0.99" +version = "1.0.111" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46266871c240a00b8f503b877622fe33430b3c7d963bdc0f2adc511e54a1eae3" +checksum = "176e46fa42316f18edd598015a5166857fc835ec732f5215eac6b7bdbf0a84f4" dependencies = [ "itoa", "ryu", @@ -1644,9 +2244,9 @@ dependencies = [ [[package]] name = "sha1" -version = "0.10.5" +version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ "cfg-if", "cpufeatures", @@ -1655,9 +2255,9 @@ dependencies = [ [[package]] name = "sha2" -version = "0.10.7" +version = "0.10.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" dependencies = [ "cfg-if", "cpufeatures", @@ -1673,29 +2273,41 @@ dependencies = [ "libc", ] +[[package]] +name = "simple_logger" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc20708d703a44b96b3b700578a85b6fe887fc63ab20315757026bb8a12faaad" +dependencies = [ + "atty", + "colored", + "log", + "winapi", +] + [[package]] name = "slab" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" dependencies = [ "autocfg", ] [[package]] name = "smallvec" -version = "1.10.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" +checksum = "2593d31f82ead8df961d8bd23a64c2ccf2eb5dd34b0a34bfb4dd54011c72009e" [[package]] name = "socket2" -version = "0.4.9" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" +checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9" dependencies = [ "libc", - "winapi", + "windows-sys 0.48.0", ] [[package]] @@ -1704,12 +2316,60 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + +[[package]] +name = "spm_precompiled" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" +dependencies = [ + "base64 0.13.1", + "nom", + "serde", + "unicode-segmentation", +] + +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + [[package]] name = "strsim" version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "structopt" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10" +dependencies = [ + "clap 2.34.0", + "lazy_static", + "structopt-derive", +] + +[[package]] +name = "structopt-derive" +version = "0.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" +dependencies = [ + "heck 0.3.3", + "proc-macro-error", + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "subtle" version = "2.5.0" @@ -1729,9 +2389,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.22" +version = "2.0.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2efbeae7acf4eabd6bcdcbd11c92f45231ddda7539edc7806bd1a04a03b24616" +checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f" dependencies = [ "proc-macro2", "quote", @@ -1740,39 +2400,67 @@ dependencies = [ [[package]] name = "target-lexicon" -version = "0.12.8" +version = "0.12.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b1c7f239eb94671427157bd93b3694320f3668d4e1eff08c7285366fd777fac" +checksum = "69758bda2e78f098e4ccb393021a0963bb3442eac05f135c30f61b7370bbafae" + +[[package]] +name = "tempfile" +version = "3.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +dependencies = [ + "cfg-if", + "fastrand 2.0.1", + "redox_syscall", + "rustix", + "windows-sys 0.52.0", +] [[package]] name = "termcolor" -version = "1.2.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" +checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" dependencies = [ "winapi-util", ] +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + 
"unicode-width", +] + [[package]] name = "thiserror" -version = "1.0.40" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" +checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.40" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" +checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" dependencies = [ "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.48", ] +[[package]] +name = "thousands" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820" + [[package]] name = "threadpool" version = "1.8.1" @@ -1784,10 +2472,12 @@ dependencies = [ [[package]] name = "time" -version = "0.3.22" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea9e1b3cf1243ae005d9e74085d4d542f3125458f3a81af210d901dcd7411efd" +checksum = "f657ba42c3f86e7680e53c8cd3af8abbe56b5491790b46e22e19c0d57463583e" dependencies = [ + "deranged", + "powerfmt", "serde", "time-core", "time-macros", @@ -1795,15 +2485,15 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.9" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b" +checksum = "26197e33420244aeb70c3e8c78376ca46571bc4e701e4791c2cd9f57dcb3a43f" dependencies = [ "time-core", ] @@ -1823,13 +2513,47 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tokenizers" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "062b8a9613d6017633b80fb55fbb33f1aff006c36225a3025630753398034b3c" +dependencies = [ + "aho-corasick", + "clap 4.4.16", + "derive_builder", + "esaxx-rs", + "getrandom", + "hf-hub", + "indicatif", + "itertools", + "lazy_static", + "log", + "macro_rules_attribute", + "monostate", + "onig", + "paste", + "rand", + "rayon", + "rayon-cond", + "regex", + "regex-syntax 0.7.5", + "serde", + "serde_json", + "spm_precompiled", + "thiserror", + "unicode-normalization-alignments", + "unicode-segmentation", + "unicode_categories", +] + [[package]] name = "tokio" -version = "1.28.2" +version = "1.35.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94d7b1cfd2aa4011f2de74c2c4c63665e27a71006b0a192dcd2710272e73dfa2" +checksum = "c89b4efa943be685f629b149f53829423f8f5531ea21249408e8e2f8671ec104" dependencies = [ - "autocfg", + "backtrace", "bytes", "libc", "mio", @@ -1844,13 +2568,13 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.48", ] [[package]] @@ -1859,7 +2583,7 @@ version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" dependencies = [ - "rustls", + "rustls 0.20.9", "tokio", "webpki", ] @@ -1877,9 +2601,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.8" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d" +checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" dependencies = [ "bytes", "futures-core", @@ -1919,11 +2643,10 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" -version = "0.1.37" +version = "0.1.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" dependencies = [ - "cfg-if", "log", "pin-project-lite", "tracing-attributes", @@ -1932,53 +2655,53 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.26" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.48", ] [[package]] name = "tracing-core" -version = "0.1.31" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" dependencies = [ "once_cell", ] [[package]] name = "try-lock" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "typenum" -version = "1.16.0" +version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "ucd-trie" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e79c4d996edb816c91e4308506774452e55e95c3c9de07b6729e17e15a5ef81" +checksum = "ed646292ffc8188ef8ea4d1e0e0150fb15a5c2e12ad9b8fc191ae7a8a7f3c4b9" [[package]] name = "unicode-bidi" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" +checksum = "6f2528f27a9eb2b21e69c95319b30bd0efd85d09c379741b0f78ea1d86be2416" [[package]] name = "unicode-ident" -version = "1.0.9" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" +checksum = 
"3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-normalization" @@ -1989,12 +2712,33 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-normalization-alignments" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de" +dependencies = [ + "smallvec", +] + [[package]] name = "unicode-segmentation" version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" +[[package]] +name = "unicode-width" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" + +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + [[package]] name = "unindent" version = "0.1.11" @@ -2007,11 +2751,36 @@ version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "ureq" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8cdd25c339e200129fe4de81451814e5228c9b771d57378817d6117cc2b3f97" +dependencies = [ + "base64 0.21.7", + "flate2", + "log", + "native-tls", + "once_cell", + "rustls 0.21.10", + "rustls-webpki", + "serde", + "serde_json", + "url", + "webpki-roots", +] + [[package]] name = "url" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50bff7831e19200a85b17131d085c25d7811bc4e186efdaf54bbd132994a88cb" +checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" dependencies = [ "form_urlencoded", "idna", @@ -2020,9 +2789,9 @@ dependencies = [ [[package]] name = "urlencoding" -version = "2.1.2" +version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8db7427f936968176eaa7cdf81b7f98b980b18495ec28f1b5791ac3bfe3eea9" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" [[package]] name = "utf8parse" @@ -2030,6 +2799,18 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + [[package]] name = "version_check" version = "0.9.4" @@ -2059,9 +2840,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.87" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" +checksum = "b1223296a201415c7fad14792dbefaace9bd52b62d33453ade1c5b5f07555406" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -2069,24 +2850,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.87" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" +checksum = "fcdc935b63408d58a32f8cc9738a0bffd8f05cc7c002086c6ef20b7312ad9dcd" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.48", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.87" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" +checksum = "3e4c238561b2d428924c49815533a8b9121c664599558a5d9ec51f8a1740a999" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2094,28 +2875,28 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.87" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" +checksum = "bae1abb6806dc1ad9e560ed242107c0f6c84335f1749dd4e8ddb012ebd5e25a7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.48", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.87" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" +checksum = "4d91413b1c31d7539ba5ef2451af3f0b833a005eb27a631cec32bc0635a8602b" [[package]] name = "web-sys" -version = "0.3.64" +version = "0.3.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" +checksum = "58cd2333b6e0be7a39605f0e255892fd7418a682d8da8fe042fe25128794d2ed" dependencies = [ "js-sys", "wasm-bindgen", @@ -2123,14 +2904,20 @@ dependencies = [ [[package]] name = "webpki" -version = "0.22.2" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07ecc0cd7cac091bf682ec5efa18b1cff79d617b84181f38b3951dbe135f607f" +checksum = "ed63aea5ce73d0ff405984102c42de94fc55a6b75765d621c65262469b3c9b53" dependencies = [ - "ring", - "untrusted", + "ring 0.17.7", + "untrusted 0.9.0", ] +[[package]] +name = "webpki-roots" +version = "0.25.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1778a42e8b3b90bff8d0f5032bf22250792889a5cdc752aa0020c84abe3aaf10" + [[package]] name = "winapi" version = "0.3.9" @@ -2149,9 +2936,9 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" dependencies = [ "winapi", ] @@ -2164,135 +2951,164 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-sys" -version = "0.42.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", + "windows-targets 0.48.5", ] [[package]] name = "windows-sys" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets", + "windows-targets 0.52.0", ] [[package]] name = "windows-targets" -version = "0.48.0" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" +checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" dependencies = [ - "windows_aarch64_gnullvm 0.48.0", - "windows_aarch64_msvc 0.48.0", - "windows_i686_gnu 0.48.0", - "windows_i686_msvc 0.48.0", - "windows_x86_64_gnu 0.48.0", - "windows_x86_64_gnullvm 0.48.0", - "windows_x86_64_msvc 0.48.0", + "windows_aarch64_gnullvm 0.52.0", + "windows_aarch64_msvc 0.52.0", + "windows_i686_gnu 0.52.0", + "windows_i686_msvc 0.52.0", + "windows_x86_64_gnu 0.52.0", + "windows_x86_64_gnullvm 0.52.0", + "windows_x86_64_msvc 0.52.0", ] [[package]] name = "windows_aarch64_gnullvm" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" [[package]] name = "windows_aarch64_msvc" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" [[package]] name = "windows_i686_gnu" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" +checksum = 
"a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" [[package]] name = "windows_i686_msvc" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" [[package]] name = "windows_x86_64_gnu" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" [[package]] name = "windows_x86_64_gnullvm" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" [[package]] name = "windows_x86_64_msvc" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" [[package]] name = "xmlparser" -version = "0.13.5" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" + +[[package]] +name = "zerocopy" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd" 
+checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] [[package]] name = "zeroize" -version = "1.6.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" +checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" diff --git a/Cargo.toml b/Cargo.toml index 0c58087b..b5d4c3d8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "dolma" -version = "0.9.4" +version = "1.0.0" edition = "2021" license = "Apache-2.0" @@ -11,26 +11,37 @@ crate-type = ["cdylib"] [dependencies] ahash = { version = "0.8.1", features = ["runtime-rng"] } +anyhow = "1.0" +atomic-traits = "0.3" aws-config = { version = "0.55.0"} aws-sdk-s3 = "0.25.0" byteorder = "1" clap = { version = "4.1.11", features = ["derive"] } +console = "0.15" env_logger = "0.10.0" flate2 = { version = "1.0.28", features = ["zlib-ng"], default-features = false } +glob = "0.3.1" +humantime = "2.1" +indicatif = "0.17" jsonpath-rust = "0.3.0" log = "0.4.17" -regex = "1.8.4" +num_cpus = "1.0" +num-traits = "0.2" +parse-size = "1.0" pyo3 = { version = "0.19.0", features = ["extension-module"] } rand = "0.8.4" rayon = "1.7.0" -serde = {version = "1.0.160", features = ["derive"]} -serde_json = "1.0" +regex = "1.8.4" +serde = { version = "1.0.160", features = ["derive", "rc"] } +serde_json = "1.0.108" +simple_logger = { version = "3.0", features = ["stderr", "colors"], default-features = false, optional = true } +structopt = { version = "0.3", optional = true } +thousands = "0.2" threadpool = "1.8.1" +tokenizers = {version = "0.15.0", features = ["http"]} tokio = {version = "1.27.0", features = ["full"]} tokio-util = "0.7.7" unicode-segmentation = "1.7" -glob = "0.3.1" - # [target.'cfg(target_arch = "aarch64")'.dependencies] # flate2 = "1.0.28" diff --git a/configs/dolma-v1_5/README.md b/configs/dolma-v1_5/README.md new file mode 100644 index 00000000..aff3e5be --- /dev/null +++ b/configs/dolma-v1_5/README.md @@ -0,0 +1,3 @@ +# Dolma 1.5 + +This directory diff --git a/configs/dolma-v1_5/decontamination/README.md b/configs/dolma-v1_5/decontamination/README.md new file mode 100644 index 00000000..f45c9520 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/README.md @@ -0,0 +1,96 @@ +# Decontamination Runbook + +## Step 1: Create decontamination bloom filter + +> Okay I think every thing is ready for decon testing now. The finalized ppl suite v3 is in `s3://ai2-llm/eval-data/perplexity/v3/`. And here is my proposed plan for decon testing if you agree and it's not too much compute. The following is the sequence of things to try. At each step if the document removal rate is >0.1% or so we back off to the next step and hope the remove rate is lower: +> +> - **Option 1** Decon against PPL Suite v3 (`s3://ai2-llm/eval-data/perplexity/v3/`) + PPL Suite v2 (`s3://ai2-llm/eval-data/perplexity/v2/`) for full backwards compatibility. +> - **Option 2** Decon against PPL Suite v3 (`s3://ai2-llm/eval-data/perplexity/v3/`) + PPL Suite v2-small (`s3://ai2-llm/eval-data/perplexity/v2_small/`) for at least full backwards for the in-loop metrics the model team was using. 
+> - **Option 3** Decon against PPL Suite v3 (`s3://ai2-llm/eval-data/perplexity/v3/`) + a subset of PPL Suite v2-small requested by Dirk and Iz (`s3://ai2-llm/eval-data/perplexity/v2_small/c4_en/`, `s3://ai2-llm/eval-data/perplexity/v2_small/pile/`, `s3://ai2-llm/eval-data/perplexity/v2_small/m2d2_s2orc/`, `s3://ai2-llm/eval-data/perplexity/v2_small/ice/`)
+>
+> Let me know if you disagree with any of this or if there's any thing I can do to help run the decon trials!
+
+
+### Step 1.1: copy data locally
+
+We copy data locally since the directory structure of the eval data in S3 is slightly different from the one we need.
+In particular, we need all documents to be under a `documents/` directory.
+
+```bash
+aws s3 sync s3://ai2-llm/eval-data/perplexity/v2 $HOME/perplexity/v2/documents
+aws s3 sync s3://ai2-llm/eval-data/perplexity/v2_small $HOME/perplexity/v2_small/documents
+aws s3 sync s3://ai2-llm/eval-data/perplexity/v3 $HOME/perplexity/v3/documents
+
+aws s3 sync s3://ai2-llm/eval-data/perplexity/v2_small/c4_en $HOME/perplexity/v2_small_subset/documents/c4_en
+aws s3 sync s3://ai2-llm/eval-data/perplexity/v2_small/pile $HOME/perplexity/v2_small_subset/documents/pile
+aws s3 sync s3://ai2-llm/eval-data/perplexity/v2_small/m2d2_s2orc $HOME/perplexity/v2_small_subset/documents/m2d2_s2orc
+aws s3 sync s3://ai2-llm/eval-data/perplexity/v2_small/ice $HOME/perplexity/v2_small_subset/documents/ice
+```
+
+### Step 1.1b: change type of IDs in v3 subset (TEMPORARY FIX)
+
+v3 accidentally contains ids that are integers instead of strings. Until that's fixed, run:
+
+```bash
+python configs/dolma-v1_5/decontamination/fix_ids_type.py
+```
+
+### Step 1.2: tag out paragraphs by uniseg length
+
+For dolma, we want to decontaminate against paragraphs that are at least 13 uniseg words long,
+so we need to compute their length first.
+
+```bash
+dolma tag --documents "${HOME}/perplexity/v2/documents/*/*/*.gz" --taggers uniseg_length_paragraphs_with_empty_v1 not_alphanum_paragraph_v1 --processes 188
+dolma tag --documents "${HOME}/perplexity/v2_small/documents/*/*/*.gz" --taggers uniseg_length_paragraphs_with_empty_v1 not_alphanum_paragraph_v1 --processes 188
+dolma tag --documents "${HOME}/perplexity/v3/documents/*/*/*.gz" --taggers uniseg_length_paragraphs_with_empty_v1 not_alphanum_paragraph_v1 --processes 188
+dolma tag --documents "${HOME}/perplexity/v2_small_subset/documents/*/*/*.gz" --taggers uniseg_length_paragraphs_with_empty_v1 not_alphanum_paragraph_v1 --processes 188
+```
+
+### Step 1.3: filter out paragraphs that are too short
+
+After tagging, we can filter out the short paragraphs to create the eval sets for options 1, 2, and 3.
+
+```bash
+
+dolma -c configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option1.yaml mix
+dolma -c configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option2.yaml mix
+dolma -c configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option3.yaml mix
+
+```
+
+### Step 1.4: create bloom filter
+
+First, we cat the contents of each dataset to get the number of documents:
+
+```bash
+zcat $HOME/perplexity/option1/documents/* | jq '.text' -cr | wc -l
+>>> 3681169
+zcat $HOME/perplexity/option2/documents/* | jq '.text' -cr | wc -l
+>>> 2336120
+zcat $HOME/perplexity/option3/documents/* | jq '.text' -cr | wc -l
+>>> 2020471
+```
+
+We use these numbers in the config files at `bloom_filter.estimated_doc_count`. For all three options, we set a `bloom_filter.desired_false_positive_rate` of 0.00001.
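+As a sanity check on these two values, the standard Bloom filter formulas give a rough idea of how large the resulting filter will be. The sketch below is purely illustrative: the `bloom_filter_size` helper is not part of dolma, and `dolma dedupe` sizes the filter internally (possibly with a different formula).
+
+```python
+# Illustrative Bloom filter sizing from an estimated document count and a
+# desired false positive rate; the arguments mirror the
+# bloom_filter.estimated_doc_count and bloom_filter.desired_false_positive_rate
+# config keys used above.
+import math
+
+def bloom_filter_size(estimated_doc_count: int, desired_false_positive_rate: float):
+    # optimal number of bits: m = -n * ln(p) / (ln 2)^2
+    bits = -estimated_doc_count * math.log(desired_false_positive_rate) / math.log(2) ** 2
+    # optimal number of hash functions: k = (m / n) * ln 2
+    hashes = (bits / estimated_doc_count) * math.log(2)
+    return math.ceil(bits), math.ceil(hashes)
+
+bits, hashes = bloom_filter_size(3681169, 0.00001)  # option 1 count from above
+print(f"~{bits / 8 / 1024 ** 2:.1f} MiB, {hashes} hash functions")  # ~10.5 MiB, 17 hashes
+```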
+ +```bash +dolma -c configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option1.yaml dedupe +dolma -c configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option2.yaml dedupe +dolma -c configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option3.yaml dedupe +``` + +## Step 2: Run decontamination + +Tag content for Dolma V1.5 for decontamination: + + +```bash +dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/cc.yaml dedupe +dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/c4.yaml dedupe +dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/stack.yaml dedupe +dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/reddit.yaml dedupe +dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/peS2o.yaml dedupe +dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/books.yaml dedupe +dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/wiki.yaml dedupe +``` diff --git a/configs/dolma-v1_5/decontamination/fix_ids_type.py b/configs/dolma-v1_5/decontamination/fix_ids_type.py new file mode 100644 index 00000000..69bd4dbd --- /dev/null +++ b/configs/dolma-v1_5/decontamination/fix_ids_type.py @@ -0,0 +1,33 @@ +import argparse +import json +from dolma.core.paths import glob_path +import tqdm + +import smart_open + + +def fix_path(p: str): + with smart_open.open(p, 'rt') as f: + data = [json.loads(line) for line in f] + + with smart_open.open(p, 'wt') as f: + for d in data: + if 'id' in d: + d['id'] = str(d['id']) + f.write(json.dumps(d) + '\n') + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument('path', nargs='+') + args = ap.parse_args() + + with tqdm.tqdm(desc='Files') as pbar: + for p in args.path: + for sp in glob_path(p): + fix_path(sp) + pbar.update() + + +if __name__ == '__main__': + main() diff --git a/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option1.yaml b/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option1.yaml new file mode 100644 index 00000000..2dcb5c1b --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option1.yaml @@ -0,0 +1,86 @@ +streams: + - name: "v2" + documents: + - ${oc.env:HOME}/perplexity/v2/documents/c4_100_domains/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/c4_100_domains/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/gab/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/gab/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/ice/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/ice/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/m2d2_s2orc/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/m2d2_s2orc/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/m2d2_wiki/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/m2d2_wiki/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/manosphere/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/manosphere/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/mc4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/mc4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/pile/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/ptb/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/ptb/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/twitterAEE/val/*.gz + - 
${oc.env:HOME}/perplexity/v2/documents/twitterAEE/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/wikitext_103/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/wikitext_103/test/*.gz + + output: &output + path: ${oc.env:HOME}/perplexity/option1/documents + max_size_in_bytes: 500000000 + discard_fields: + - attributes + + attributes: &attributes + - uniseg_length_paragraphs_with_empty_v1 + - not_alphanum_paragraph_v1 + + span_replacement: &span_replacement + - span: $.attributes.uniseg_length_paragraphs_with_empty_v1__uniseg_length_paragraphs_with_empty_v1__negative_paragraph + min_score: -12 + replacement: "" + - span: $.attributes.not_alphanum_paragraph_v1__not_alphanum_paragraph_v1__all_punct + min_score: 0.5 + replacement: "" + + - name: "v3" + documents: + - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/gab/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/gab/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/mc4/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/mc4/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/pile/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ptb/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ptb/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/redpajama/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/redpajama/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/test/*.gz + + output: *output + attributes: *attributes + span_replacement: *span_replacement diff --git a/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option2.yaml b/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option2.yaml new file mode 100644 index 00000000..37b7be6d --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option2.yaml @@ -0,0 +1,86 @@ +streams: + - name: "v2_small" + documents: + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_100_domains/val/*.gz + - 
${oc.env:HOME}/perplexity/v2_small/documents/c4_100_domains/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/gab/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/gab/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ice/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ice/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_s2orc/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_s2orc/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_wiki/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_wiki/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/manosphere/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/manosphere/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/mc4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/mc4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/pile/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ptb/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ptb/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/twitterAEE/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/twitterAEE/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/wikitext_103/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/wikitext_103/test/*.gz + + output: &output + path: ${oc.env:HOME}/perplexity/option2/documents + max_size_in_bytes: 500000000 + discard_fields: + - attributes + + attributes: &attributes + - uniseg_length_paragraphs_with_empty_v1 + - not_alphanum_paragraph_v1 + + span_replacement: &span_replacement + - span: $.attributes.uniseg_length_paragraphs_with_empty_v1__uniseg_length_paragraphs_with_empty_v1__negative_paragraph + min_score: -12 + replacement: "" + - span: $.attributes.not_alphanum_paragraph_v1__not_alphanum_paragraph_v1__all_punct + min_score: 0.5 + replacement: "" + + - name: "v3" + documents: + - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/gab/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/gab/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/val/*.gz + - 
${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/mc4/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/mc4/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/pile/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ptb/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ptb/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/redpajama/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/redpajama/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/test/*.gz + + output: *output + attributes: *attributes + span_replacement: *span_replacement diff --git a/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option3.yaml b/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option3.yaml new file mode 100644 index 00000000..4f912a2b --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option3.yaml @@ -0,0 +1,70 @@ +streams: + - name: "v2_small_subset" + documents: + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/ice/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/ice/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/m2d2_s2orc/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/m2d2_s2orc/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/pile/test/*.gz + + output: &output + path: ${oc.env:HOME}/perplexity/option3/documents + max_size_in_bytes: 500000000 + discard_fields: + - attributes + + attributes: &attributes + - uniseg_length_paragraphs_with_empty_v1 + - not_alphanum_paragraph_v1 + + span_replacement: &span_replacement + - span: $.attributes.uniseg_length_paragraphs_with_empty_v1__uniseg_length_paragraphs_with_empty_v1__negative_paragraph + min_score: -12 + replacement: "" + - span: $.attributes.not_alphanum_paragraph_v1__not_alphanum_paragraph_v1__all_punct + min_score: 0.5 + replacement: "" + + - name: "v3" + documents: + - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/gab/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/gab/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/val/*.gz + - 
${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/mc4/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/mc4/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/pile/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ptb/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ptb/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/redpajama/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/redpajama/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/test/*.gz + + output: *output + attributes: *attributes + span_replacement: *span_replacement diff --git a/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/ppl_v2.yaml b/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/ppl_v2.yaml new file mode 100644 index 00000000..9ef386d5 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/ppl_v2.yaml @@ -0,0 +1,41 @@ +streams: + - name: "v2_small" + documents: + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_100_domains/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_100_domains/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/gab/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/gab/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ice/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ice/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_s2orc/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_s2orc/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_wiki/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_wiki/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/manosphere/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/manosphere/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/mc4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/mc4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/pile/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ptb/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ptb/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/twitterAEE/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/twitterAEE/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/wikitext_103/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/wikitext_103/test/*.gz + + output: + path: ${oc.env:HOME}/perplexity/ppl_v2/documents + max_size_in_bytes: 500000000 + discard_fields: + - attributes + + attributes: + - uniseg_length_paragraphs_with_empty_v1 + + span_replacement: &span_replacement + - span: $.attributes.uniseg_length_paragraphs_with_empty_v1__uniseg_length_paragraphs_with_empty_v1__negative_paragraph + min_score: -12 + replacement: "" diff --git 
a/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option1.yaml b/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option1.yaml new file mode 100644 index 00000000..f2b21ea5 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option1.yaml @@ -0,0 +1,17 @@ +documents: + - ${oc.env:HOME}/perplexity/option1/documents/*.gz + +dedupe: + name: perplexity_suite_v3 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: false + estimated_doc_count: 3686676 + # size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + desired_false_positive_rate: 1e-15 + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option1.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option2.yaml b/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option2.yaml new file mode 100644 index 00000000..9936ce75 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option2.yaml @@ -0,0 +1,17 @@ +documents: + - ${oc.env:HOME}/perplexity/option2/documents/*.gz + +dedupe: + name: perplexity_suite_v3 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: false + estimated_doc_count: 2337305 + # size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + desired_false_positive_rate: 1e-15 + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option3.yaml b/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option3.yaml new file mode 100644 index 00000000..9d617645 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option3.yaml @@ -0,0 +1,17 @@ +documents: + - ${oc.env:HOME}/perplexity/option3/documents/*.gz + +dedupe: + name: perplexity_suite_v3 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: false + estimated_doc_count: 2021613 + # size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + desired_false_positive_rate: 1e-15 + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option3.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/ppl_v2.yaml b/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/ppl_v2.yaml new file mode 100644 index 00000000..c5d55bfc --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/ppl_v2.yaml @@ -0,0 +1,19 @@ +documents: + - ${oc.env:HOME}/perplexity/ppl_v2/documents/*.gz + +dedupe: + name: perplexity_suite_v2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + + +bloom_filter: + file: s3://ai2-llm/bloom-filters/perplexity-suite-v2-8M.bin + size_in_bytes: 8388608 + read_only: false + estimated_doc_count: 3898706 + desired_false_positive_rate: 0.001 + + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step2-run-decontamination/books.yaml b/configs/dolma-v1_5/decontamination/step2-run-decontamination/books.yaml new file mode 100644 index 00000000..af0eca22 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step2-run-decontamination/books.yaml @@ -0,0 +1,17 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/gutenberg/v0/documents/*.gz + + +dedupe: + name: perplexity_suite_v3_option2 + paragraphs: + attribute_name: 
bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step2-run-decontamination/c4.yaml b/configs/dolma-v1_5/decontamination/step2-run-decontamination/c4.yaml new file mode 100644 index 00000000..1e2f2848 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step2-run-decontamination/c4.yaml @@ -0,0 +1,17 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/c4/v0/documents/train/*.gz + + +dedupe: + name: perplexity_suite_v3_option2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step2-run-decontamination/cc.yaml b/configs/dolma-v1_5/decontamination/step2-run-decontamination/cc.yaml new file mode 100644 index 00000000..15d88a17 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step2-run-decontamination/cc.yaml @@ -0,0 +1,20 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_head/*.gz + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_middle/*.gz + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_tail/*.gz + # - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_middle/cc_en_middle-0954.json.gz + # - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_tail/cc_en_tail-1690.json.gz + +dedupe: + name: perplexity_suite_v3_option2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step2-run-decontamination/peS2o.yaml b/configs/dolma-v1_5/decontamination/step2-run-decontamination/peS2o.yaml new file mode 100644 index 00000000..6381098c --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step2-run-decontamination/peS2o.yaml @@ -0,0 +1,36 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=0/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=1/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=2/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=3/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=4/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=5/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=6/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=7/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=8/*.gz + - 
s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=9/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=0/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=1/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=2/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=3/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=4/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=5/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=6/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=7/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=8/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=9/*.gz + + +dedupe: + name: perplexity_suite_v3_option2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step2-run-decontamination/reddit.yaml b/configs/dolma-v1_5/decontamination/step2-run-decontamination/reddit.yaml new file mode 100644 index 00000000..bea2e6d8 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step2-run-decontamination/reddit.yaml @@ -0,0 +1,17 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/reddit/v5-dedupe-pii-nsfw-toxic/documents/*.gz + + +dedupe: + name: perplexity_suite_v3_option2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step2-run-decontamination/stack.yaml b/configs/dolma-v1_5/decontamination/step2-run-decontamination/stack.yaml new file mode 100644 index 00000000..2e9291cd --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step2-run-decontamination/stack.yaml @@ -0,0 +1,367 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/abap/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/actionscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ada/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/agda/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ags-script/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/alloy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ampl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/antlr/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/apacheconf/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/api-blueprint/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/apl/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/applescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/arc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/arduino/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/asciidoc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/asp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/aspectj/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ats/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/augeas/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/autohotkey/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/autoit/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/awk/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/batchfile/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/befunge/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bison/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bitbake/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/blitzbasic/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/blitzmax/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bluespec/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/boo/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/brainfuck/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/brightscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bro/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c-sharp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c++/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c2hs-haskell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cap'n-proto/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cartocss/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ceylon/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/chapel/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/chuck/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cirru/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clarion/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clean/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/click/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clips/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clojure/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cmake/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cobol/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coffeescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coldfusion/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coldfusion-cfc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/common-lisp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/component-pascal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coq/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/creole/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/crystal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/csound/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/css/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cucumber/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cuda/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cycript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cython/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/d/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/darcs-patch/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dart/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/desktop/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/diff/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/digital-command-language/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dm/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dns-zone/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dockerfile/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dogescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dylan/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/eagle/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ec/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ecere-projects/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ecl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/edn/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/eiffel/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/elixir/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/elm/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/emacs-lisp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/emberscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/erlang/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/f-sharp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/factor/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fancy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fantom/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fish/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/flux/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/forth/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fortran/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/freemarker/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/g-code/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gams/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gap/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gas/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gdscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/genshi/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gentoo-ebuild/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gentoo-eclass/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gettext-catalog/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/glsl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/glyph/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gnuplot/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/go/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/golo/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gosu/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/grace/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/grammatical-framework/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/graphql/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/graphviz-(dot)/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/groff/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/groovy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/groovy-server-pages/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/haml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/handlebars/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/harbour/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/haskell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/haxe/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/hcl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/hlsl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+django/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+eex/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+erb/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+php/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/http/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/hy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/idl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/idris/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/igor-pro/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/inform-7/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ini/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/inno-setup/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/io/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ioke/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/irc-log/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/isabelle/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/j/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jade/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jasmin/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/java/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/java-server-pages/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/javascript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jflex/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jsx/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/julia/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jupyter-notebook/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/kicad/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/kit/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/kotlin/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/krl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/labview/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lasso/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/latte/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lean/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/less/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lex/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lfe/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lilypond/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/linker-script/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/liquid/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/literate-agda/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/literate-coffeescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/literate-haskell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/livescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/llvm/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/logos/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/logtalk/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lolcode/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lookml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lsl/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lua/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/m/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/m4/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/makefile/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mako/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/maple/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/markdown/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mask/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mathematica/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/matlab/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/max/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/maxscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mediawiki/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/metal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mirah/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/modelica/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/module-management-system/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/monkey/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/moonscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mtml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/muf/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mupad/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/myghty/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nesc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/netlinx/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/netlogo/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nginx/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nimrod/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ninja/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nit/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nix/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nsis/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nu/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/numpy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/objdump/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/objective-c++/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/objective-j/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ocaml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/octave/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/omgrofl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ooc/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/opa/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/opal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/opencl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/openscad/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/org/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ox/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/oxygene/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/oz/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pan/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/papyrus/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/parrot/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/parrot-assembly/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/parrot-internal-representation/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pascal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pawn/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/perl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/perl6/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/php/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/piglatin/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pike/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pod/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pogoscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pony/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/postscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pov-ray-sdl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/powershell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/processing/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/prolog/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/propeller-spin/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/protocol-buffer/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pure-data/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/purebasic/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/purescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/python/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/python-traceback/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/qmake/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/qml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/r/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/racket/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ragel-in-ruby-host/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/raml/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rdoc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/realbasic/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rebol/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/red/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/redcode/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ren'py/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/renderscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/restructuredtext/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rhtml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rmarkdown/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/robotframework/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rouge/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ruby/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rust/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sage/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/saltstack/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sas/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sass/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scala/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scaml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scheme/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scilab/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scss/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/self/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/shell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/shellsession/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/shen/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/slash/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/slim/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smali/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smalltalk/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smarty/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smt/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/solidity/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sourcepawn/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sparql/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sqf/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sql/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/squirrel/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/stan/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/standard-ml/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/stata/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ston/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/stylus/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/supercollider/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/swift/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/systemverilog/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tcl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tcsh/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tea/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tex/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/text/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/textile/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/thrift/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/toml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/turing/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/turtle/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/twig/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/txl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/typescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/unified-parallel-c/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/unity3d-asset/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/uno/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/unrealscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/urweb/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vala/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vcl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/verilog/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vhdl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/viml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/visual-basic/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/volt/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vue/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/web-ontology-language/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/webassembly/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/webidl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/wisp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/x10/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xbase/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xojo/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xpages/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xproc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xquery/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xs/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xslt/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xtend/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/yacc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/yaml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/yang/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/zephir/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/zig/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/zimpl/*.gz + + +dedupe: + name: perplexity_suite_v3_option2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step2-run-decontamination/wikibooks.yaml b/configs/dolma-v1_5/decontamination/step2-run-decontamination/wikibooks.yaml new file mode 100644 index 00000000..5022d512 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step2-run-decontamination/wikibooks.yaml @@ -0,0 +1,132 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=af/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ak/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ang/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ar/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=as/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ast/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=az/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ba/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=be/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=bg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=bm/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=bn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=bo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=bs/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ca/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ch/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=co/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=cs/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=cv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=cy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=da/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=de/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=el/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=en/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=eo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=es/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=et/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=eu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=fa/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=fi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=fr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=fy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=gl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=gn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=got/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=gu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=he/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=hi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=hr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=hu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=hy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ia/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=id/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ie/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=is/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=it/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ja/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ka/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=kk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=km/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=kn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ko/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ku/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ky/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=la/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=lb/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=li/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ln/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=lt/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=lv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=mg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=mi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=mk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ml/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=mn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=mr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ms/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=my/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=na/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=nah/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=nds/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ne/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=nl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=no/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=oc/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=pa/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=pl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ps/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=pt/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=qu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=rm/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ro/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ru/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=sa/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=se/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=shn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=si/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=simple/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=sk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=sl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=sq/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=sr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=su/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=sv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=sw/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ta/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=te/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=tg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=th/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=tk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=tl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=tr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=tt/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ug/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=uk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ur/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=uz/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=vi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=vo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=wa/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=xh/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=yo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=za/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=zh/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=zh_min_nan/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=zu/*.gz + + +dedupe: + name: perplexity_suite_v3_option2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step2-run-decontamination/wikipedia.yaml b/configs/dolma-v1_5/decontamination/step2-run-decontamination/wikipedia.yaml new file mode 100644 index 00000000..937e36ba --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step2-run-decontamination/wikipedia.yaml @@ -0,0 +1,329 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ady/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=af/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ak/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=als/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=am/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ami/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=an/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ang/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ar/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=arc/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ary/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=arz/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=as/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ast/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=atj/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=av/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=avk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=awa/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ay/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=az/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=azb/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ba/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ban/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bar/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bat_smg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bcl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=be/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bh/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bjn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=blk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bm/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bpy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=br/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bs/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bug/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bxr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ca/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=cbk_zam/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=cdo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ce/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ceb/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ch/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=chr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=chy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ckb/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=co/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=cr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=crh/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=cs/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=csb/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=cu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=cv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=cy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=da/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=dag/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=de/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=din/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=diq/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=dsb/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=dty/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=dv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=dz/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ee/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=el/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=eml/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=en/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=eo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=es/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=et/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=eu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ext/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=fa/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ff/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=fi/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=fiu_vro/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=fj/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=fo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=fr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=frp/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=frr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=fur/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=fy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ga/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=gag/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=gan/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=gcr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=gd/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=gl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=glk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=gn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=gom/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=gor/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=got/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=gu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=guw/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=gv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ha/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=hak/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=haw/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=he/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=hi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=hif/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=hr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=hsb/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ht/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=hu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=hy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=hyw/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ia/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=id/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ie/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ig/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ik/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ilo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=inh/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=io/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=is/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=it/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=iu/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ja/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=jam/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=jbo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=jv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ka/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kaa/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kab/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kbd/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kbp/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kcg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ki/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=km/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ko/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=koi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=krc/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ks/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ksh/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ku/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kw/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ky/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=la/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lad/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lb/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lbe/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lez/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lfn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=li/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lij/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lld/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lmo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ln/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lt/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ltg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mad/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mai/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=map_bms/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mdf/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mhr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=min/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ml/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mni/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mnw/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mrj/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ms/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mt/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mwl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=my/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=myv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mzn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=na/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nah/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nap/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nds/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nds_nl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ne/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=new/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nia/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=no/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nov/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nqo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nrm/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nso/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ny/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=oc/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=olo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=om/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=or/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=os/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pa/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pag/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pam/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pap/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pcd/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pcm/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pdc/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pfl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pih/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pms/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pnb/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pnt/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ps/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pt/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pwn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=qu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=rm/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=rmy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=rn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ro/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=roa_tara/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ru/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=rue/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=rw/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sa/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sah/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sat/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sc/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=scn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sco/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sd/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=se/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sh/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=shi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=shn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=si/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=simple/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=skr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sm/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=smn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=so/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sq/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=srn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ss/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=st/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=stq/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=su/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sw/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=szl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=szy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ta/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tay/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tcy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=te/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tet/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=th/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ti/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=to/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tpi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=trv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ts/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tt/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tum/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tw/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ty/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tyv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=udm/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ug/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=uk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ur/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=uz/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ve/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=vec/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=vep/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=vi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=vls/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=vo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=wa/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=war/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=wo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=wuu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=xal/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=xh/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=xmf/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=yi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=yo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=za/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=zea/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=zh/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=zh_classical/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=zh_min_nan/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=zh_yue/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=zu/*.gz + + +dedupe: + name: perplexity_suite_v3_option2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/tokenize_v3.sh b/configs/dolma-v1_5/decontamination/tokenize_v3.sh new file mode 100644 index 00000000..83dbc94e --- /dev/null +++ b/configs/dolma-v1_5/decontamination/tokenize_v3.sh @@ -0,0 +1,35 @@ +#! /usr/bin/env bash + +datasets=( + '4chan_meta_sep' + 'c4_100_domains' + 'c4_en' + 'dolma_100_subreddits' + 'dolma-v1_5' + 'falcon-refinedweb' + 'gab' + 'ice_fixed' + 'm2d2_s2orc_unsplit' + 'm2d2_wikipedia_unsplit' + 'manosphere_meta_sep' + 'mc4' + 'pile' + 'ptb' + 'redpajama' + 'twitterAAE_HELM_fixed' + 'wikitext_103' +) + +splits=( + 'test' + 'val' +) + +for dataset in "${datasets[@]}"; do + for split in "${splits[@]}"; do + dolma tokens \ + --documents "s3://ai2-llm/eval-data/perplexity/v3_small/${dataset}/${split}" \ + --destination "s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/${dataset}/${split}" \ + --tokenizer 'allenai/eleuther-ai-gpt-neox-20b-pii-special' + done +done diff --git a/configs/dolma-v1_5/decontamination/tokenize_v3_small.sh b/configs/dolma-v1_5/decontamination/tokenize_v3_small.sh new file mode 100644 index 00000000..cdcb36d3 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/tokenize_v3_small.sh @@ -0,0 +1,29 @@ +#! /usr/bin/env bash + +datasets=( + 'c4_en' + 'dolma_books' + 'dolma_common-crawl' + 'dolma_pes2o' + 'dolma_reddit' + 'dolma_stack' + 'dolma_wiki' + 'ice' + 'm2d2_s2orc' + 'pile' + 'wikitext_103' +) + +splits=( + 'test' + 'val' +) + +for dataset in "${datasets[@]}"; do + for split in "${splits[@]}"; do + dolma tokens \ + --documents "s3://ai2-llm/eval-data/perplexity/v3_small/${dataset}/${split}" \ + --destination "s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/${dataset}/${split}" \ + --tokenizer 'allenai/eleuther-ai-gpt-neox-20b-pii-special' + done +done diff --git a/configs/dolma-v1_5/eval-set.md b/configs/dolma-v1_5/eval-set.md new file mode 100644 index 00000000..d608a98b --- /dev/null +++ b/configs/dolma-v1_5/eval-set.md @@ -0,0 +1,135 @@ +# Dolma v1.5 Eval set + +We create the eval set by sampling documents in each subset. Some subsets already have an eval set (e.g. C4), so we use that. Also, for some subsets, creation of eval set was done using a different strategy (e.g., reddit; documented below), so we use other approaches. 
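The sampling runs below log which MD5 suffixes were chosen for each probability (e.g. `['ff']` for `-p 0.004`, eight six-character suffixes for `-p 0.0000005`). As a rough illustration of that kind of deterministic hash-suffix sampling — a sketch only, not the actual `scripts/hash_sample.py` implementation — a document is kept when the MD5 of its key ends in one of a small set of hex suffixes whose combined share of the hash space approximates the requested probability:

```python
import hashlib

def md5_suffixes(probability: float) -> list[str]:
    """Pick hex suffixes whose combined share of the MD5 space
    approximates the requested sampling probability (illustrative)."""
    length = 1
    while 16 ** -length > probability:  # shortest suffix length not exceeding p
        length += 1
    count = max(1, round(probability * 16 ** length))
    top = 16 ** length - 1  # 'ff...f'
    return [format(top - i, f"0{length}x") for i in range(count)]

def keep(key: str, suffixes: list[str]) -> bool:
    """Deterministically decide whether a document is sampled."""
    return hashlib.md5(key.encode("utf-8")).hexdigest().endswith(tuple(suffixes))

# p=0.004  -> ['ff']                     (~1/256)
# p=5e-07  -> ['ffffff', ..., 'fffff8']  (8/16^6)
print(md5_suffixes(0.004), md5_suffixes(5e-07))
print(keep("example-doc-id", md5_suffixes(0.004)))  # hypothetical document key
```

Because the keep/drop decision is a pure function of the hash, re-running with the same probability selects the same documents, which is consistent with the suffix lists reported in the logs below.
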
+ +For each subset, we aim for roughly 1M tokens + + +## CommonCrawl + +```bash +python scripts/hash_sample.py \ + -s 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_head/*.gz' 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_middle/*.gz' 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_tail/*.gz' \ + -p 0.0000005 \ + -d s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-eval/documents/common-crawl \ + -n 188 + +``` + +Output: + +```plain-text +{ + "debug": false, + "destination": "s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-eval/documents/common-crawl", + "dryrun": false, + "num_workers": 188, + "probability": 5e-07, + "source": [ + "s3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_head/*.gz", + "s3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_middle/*.gz", + "s3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_tail/*.gz" + ] +} +Sampling with probability 5e-07 using MD5 suffixes ['ffffff', 'fffffe', 'fffffd', 'fffffc', 'fffffb', 'fffffa', 'fffff9', 'fffff8'] +Found 2,878 files to process +uniseg_words: 1.00Mu [19:23, 860u/s] +extracted: 1.91ke [19:23, 1.64e/s]] +documents: 4.60Gd [19:23, 3.95Md/s] +files: 2.88kf [19:23, 2.47f/s]59u/s] +``` + + +## PeS2o + +```bash +python scripts/hash_sample.py \ + -s s3://ai2-llm/pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/*/*.gz \ + s3://ai2-llm/pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/*/*.gz \ + -p 0.004 \ + -d s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-eval/documents/pes2o \ + -n 188 +``` + +Output: +```plain-text +{ + "debug": false, + "destination": "s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-eval/documents/pes2o", + "dryrun": false, + "num_workers": 188, + "probability": 0.004, + "source": [ + "s3://ai2-llm/pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/*/*.gz", + "s3://ai2-llm/pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/*/*.gz" + ] +} +Sampling with probability 0.004 using MD5 suffixes ['ff'] +Found 600 files to process +uniseg_words: 1.21Mu [00:06, 177ku/s] +extracted: 610e [00:06, 89.4e/s]s] +documents: 161kd [00:06, 23.6kd/s] +files: 600f [00:06, 87.9f/s] 77.4ku/s] +``` + +## Books + +```bash +python scripts/hash_sample.py \ + -s 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/books/*.gz'\ + -p 0.00035\ + -d s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-eval/documents/books \ + -n 188 +``` + +Output: + +```plain-text +{ + "debug": false, + "destination": "s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-eval/documents/books", + "dryrun": false, + "num_workers": 188, + "probability": 0.00038, + "source": [ + "s3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/books/*.gz" + ] +} +Sampling with probability 0.00038 using MD5 suffixes ['fff', 'ffe'] +Found 3 files to process +uniseg_words: 1.73Mu [01:12, 23.7ku/s] +extracted: 30.0e [01:12, 2.42s/e] +documents: 52.1kd [01:12, 717d/s] +files: 3.00f [01:12, 24.2s/f]20.2ku/s] +``` + +## Wiki + +```bash +python scripts/hash_sample.py \ + -s 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/wiki/*.gz'\ + -p 0.00038\ + -d s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-eval/documents/wiki \ + -n 188 +``` + +Output: + +```plain-text +{ + "debug": false, + "destination": "s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-eval/documents/wiki", + "dryrun": false, + 
"num_workers": 188, + "probability": 0.00038, + "source": [ + "s3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/wiki/*.gz" + ] +} +Sampling with probability 0.00038 using MD5 suffixes ['fff', 'ffe'] +Found 2 files to process +uniseg_words: 1.43Mu [01:58, 12.0ku/s] +extracted: 2.94ke [01:58, 24.7e/s]] +documents: 6.11Md [01:58, 51.4kd/s] +files: 2.00f [01:58, 59.4s/f]7.85ku/s] +``` diff --git a/configs/dolma-v1_5/mixing/books.yaml b/configs/dolma-v1_5/mixing/books.yaml new file mode 100644 index 00000000..4283e905 --- /dev/null +++ b/configs/dolma-v1_5/mixing/books.yaml @@ -0,0 +1,31 @@ + +streams: +- name: books + + documents: + - s3://ai2-llm/pretraining-data/sources/gutenberg/v0/documents/*.gz + + attributes: + - perplexity_suite_v3_option2 + - olmo_mix_v1_taggers + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/books + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + + filter: + exclude: + - "$.attributes[?(@.olmo_mix_v1_taggers__uniseg_length_paragraphs_with_doc_length_v1__document[0][2] + < 25)]" + - "$.attributes[?(@.olmo_mix_v1_taggers__ft_lang_id_en_paragraph_with_doc_score_v2__doc_en[0][2] + < 0.5)]" + - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] + && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_5/mixing/c4.yaml b/configs/dolma-v1_5/mixing/c4.yaml new file mode 100644 index 00000000..5973929a --- /dev/null +++ b/configs/dolma-v1_5/mixing/c4.yaml @@ -0,0 +1,119 @@ +streams: +- name: c4 + documents: + - s3://ai2-llm/pretraining-data/sources/c4/v0/documents/train/*.gz + + attributes: + - olmo_mix_v1_taggers + - perplexity_suite_v3_option2 + - dedupe_paragraphs + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/c4 + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + + # filter: + # include: [] + # exclude: + # - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] + # && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + # span_replacement: [] + + filter: + include: [] + exclude: + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] + && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character + && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] + < 0.8)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && 
@.gopher_rules__gopher_v1__required_word_count[0] + && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] + > 0.9)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis + && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] + > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] + && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && + @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > + 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] + > 0.2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] + > 0.18)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] + > 0.16)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] + > 0.15)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] + > 0.14)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] + > 0.13)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] + > 0.12)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] + > 0.11)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] + > 0.10)]" + - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && 
@.bff_duplicate_paragraph_spans_decontamination[0] + && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + - "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && + @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] + > 5)]" + span_replacement: + - span: "$.attributes.bff_duplicate_paragraph_spans" + min_score: 0.5 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_hatespeech_sentence_v2____label__toxic" + min_score: 0.4 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_nsfw_sencence_v2____label__nsfw" + min_score: 0.4 + replacement: '' + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS" + min_score: 0.5 + replacement: " |||EMAIL_ADDRESS||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER" + min_score: 0.5 + replacement: " |||PHONE_NUMBER||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS" + min_score: 0.5 + replacement: " |||IP_ADDRESS||| " + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_5/mixing/cc-head.yaml b/configs/dolma-v1_5/mixing/cc-head.yaml new file mode 100644 index 00000000..eaa0e909 --- /dev/null +++ b/configs/dolma-v1_5/mixing/cc-head.yaml @@ -0,0 +1,116 @@ +streams: + +- name: cc_en_head + documents: + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_head/*.json.gz + + attributes: + - perplexity_suite_v3_option2 + - dedupe_paragraphs + - gopher_rules + - hatespeech_nsfw_cc_v3 + - pii_detection + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_head + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + + + filter: + include: [] + exclude: + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] + && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character + && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] + < 0.8)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] + && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] + > 0.9)]" + - 
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis + && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] + > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] + && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && + @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > + 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] + > 0.2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] + > 0.18)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] + > 0.16)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] + > 0.15)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] + > 0.14)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] + > 0.13)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] + > 0.12)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] + > 0.11)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] + > 0.10)]" + - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] + && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + - "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && + @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] + > 5)]" + span_replacement: + - span: 
"$.attributes.bff_duplicate_paragraph_spans" + min_score: 0.5 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_hatespeech_sentence_v2____label__toxic" + min_score: 0.4 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_nsfw_sencence_v2____label__nsfw" + min_score: 0.4 + replacement: '' + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS" + min_score: 0.5 + replacement: " |||EMAIL_ADDRESS||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER" + min_score: 0.5 + replacement: " |||PHONE_NUMBER||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS" + min_score: 0.5 + replacement: " |||IP_ADDRESS||| " + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_5/mixing/cc-middle.yaml b/configs/dolma-v1_5/mixing/cc-middle.yaml new file mode 100644 index 00000000..bae79c93 --- /dev/null +++ b/configs/dolma-v1_5/mixing/cc-middle.yaml @@ -0,0 +1,116 @@ +streams: + +- name: cc_en_middle + documents: + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_middle/*.json.gz + + attributes: + - perplexity_suite_v3_option2 + - dedupe_paragraphs + - gopher_rules + - hatespeech_nsfw_cc_v3 + - pii_detection + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_middle + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + + + filter: + include: [] + exclude: + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] + && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character + && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] + < 0.8)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] + && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] + > 0.9)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis + && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] + > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] + && 
@.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && + @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > + 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] + > 0.2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] + > 0.18)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] + > 0.16)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] + > 0.15)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] + > 0.14)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] + > 0.13)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] + > 0.12)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] + > 0.11)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] + > 0.10)]" + - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] + && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + - "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && + @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] + > 5)]" + span_replacement: + - span: "$.attributes.bff_duplicate_paragraph_spans" + min_score: 0.5 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_hatespeech_sentence_v2____label__toxic" + min_score: 0.4 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_nsfw_sencence_v2____label__nsfw" + min_score: 0.4 + replacement: '' + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS" + 
min_score: 0.5 + replacement: " |||EMAIL_ADDRESS||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER" + min_score: 0.5 + replacement: " |||PHONE_NUMBER||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS" + min_score: 0.5 + replacement: " |||IP_ADDRESS||| " + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_5/mixing/cc-tail.yaml b/configs/dolma-v1_5/mixing/cc-tail.yaml new file mode 100644 index 00000000..ad06d090 --- /dev/null +++ b/configs/dolma-v1_5/mixing/cc-tail.yaml @@ -0,0 +1,116 @@ +streams: + +- name: cc_en_tail + documents: + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_tail/*.json.gz + + attributes: + - perplexity_suite_v3_option2 + - dedupe_paragraphs + - gopher_rules + - hatespeech_nsfw_cc_v3 + - pii_detection + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_tail + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + + + filter: + include: [] + exclude: + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] + && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character + && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] + < 0.8)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] + && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] + > 0.9)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis + && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] + > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] + && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && + @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > + 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram + && 
@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] + > 0.2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] + > 0.18)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] + > 0.16)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] + > 0.15)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] + > 0.14)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] + > 0.13)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] + > 0.12)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] + > 0.11)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] + > 0.10)]" + - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] + && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + - "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && + @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] + > 5)]" + span_replacement: + - span: "$.attributes.bff_duplicate_paragraph_spans" + min_score: 0.5 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_hatespeech_sentence_v2____label__toxic" + min_score: 0.4 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_nsfw_sencence_v2____label__nsfw" + min_score: 0.4 + replacement: '' + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS" + min_score: 0.5 + replacement: " |||EMAIL_ADDRESS||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER" + min_score: 0.5 + replacement: " |||PHONE_NUMBER||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS" + min_score: 0.5 + replacement: " |||IP_ADDRESS||| " + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 
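The cc_en_head / cc_en_middle / cc_en_tail configs above all follow the same pattern: a document is dropped when any `exclude` JSONPath matches its attributes (the Gopher quality rules, a decontamination bloom-filter hit, or more than 5 PII matches), and otherwise the `span_replacement` rules cut or mask individual spans whose score clears `min_score`. A minimal sketch of those semantics, assuming attributes are stored as `[start, end, score]` triples keyed by tagger name (the `$.attributes.` prefix from the YAML is dropped here; this is illustrative only, not the dolma mixer implementation):

```python
from typing import Any

def keep_document(attrs: dict[str, Any]) -> bool:
    """Sketch of two exclude rules above: drop short documents and anything
    flagged by the decontamination bloom filter."""
    wc = attrs.get("gopher_rules__gopher_v1__word_count")
    if wc and wc[0] and wc[0][2] < 50:
        return False
    dup = attrs.get("bff_duplicate_paragraph_spans_decontamination")
    if dup and dup[0] and dup[0][2] >= 1.0:
        return False
    return True

def apply_span_replacements(text: str, attrs: dict[str, Any],
                            rules: list[dict[str, Any]]) -> str:
    """Replace matching spans, working backwards so earlier offsets stay valid."""
    hits = []
    for rule in rules:
        for start, end, score in attrs.get(rule["span"], []):
            if score >= rule["min_score"]:
                hits.append((start, end, rule["replacement"]))
    for start, end, repl in sorted(hits, reverse=True):
        text = text[:start] + repl + text[end:]
    return text

# e.g. the PII masking rules used in every cc_en_* stream:
pii_rules = [
    {"span": "pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS",
     "min_score": 0.5, "replacement": " |||EMAIL_ADDRESS||| "},
    {"span": "pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER",
     "min_score": 0.5, "replacement": " |||PHONE_NUMBER||| "},
]
```

Working backwards through the matched spans keeps earlier character offsets valid when several spans are replaced in one document; overlapping spans would need extra care that this sketch does not attempt.
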
diff --git a/configs/dolma-v1_5/mixing/pes2o.yaml b/configs/dolma-v1_5/mixing/pes2o.yaml new file mode 100644 index 00000000..2208f5cb --- /dev/null +++ b/configs/dolma-v1_5/mixing/pes2o.yaml @@ -0,0 +1,43 @@ +--- +streams: +- name: pes2o_v2 + documents: + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=0/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=1/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=2/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=3/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=4/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=5/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=6/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=7/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=8/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=9/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=0/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=1/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=2/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=3/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=4/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=5/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=6/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=7/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=8/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=9/*.gz + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/pes2o + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + + attributes: + - perplexity_suite_v3_option2 + + filter: + include: [] + exclude: + - "$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] + && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]" + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_5/mixing/reddit.yaml b/configs/dolma-v1_5/mixing/reddit.yaml new file mode 100644 index 00000000..86d0d157 --- /dev/null +++ b/configs/dolma-v1_5/mixing/reddit.yaml @@ -0,0 +1,26 @@ + +streams: +- name: reddit-v5-dedupe-pii-nsfw-toxic + + documents: + - s3://ai2-llm/pretraining-data/sources/reddit/v5-dedupe-pii-nsfw-toxic/documents/*.gz + + attributes: + - perplexity_suite_v3_option2 + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/reddit + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + + filter: + exclude: + - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] + && 
@.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_5/mixing/stack.yaml b/configs/dolma-v1_5/mixing/stack.yaml new file mode 100644 index 00000000..e67a6d6a --- /dev/null +++ b/configs/dolma-v1_5/mixing/stack.yaml @@ -0,0 +1,375 @@ +streams: +- name: stack-v4-train + documents: + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/abap/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/actionscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ada/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/agda/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ags-script/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/alloy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ampl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/antlr/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/apacheconf/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/api-blueprint/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/apl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/applescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/arc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/arduino/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/asciidoc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/asp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/aspectj/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ats/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/augeas/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/autohotkey/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/autoit/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/awk/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/batchfile/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/befunge/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bison/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bitbake/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/blitzbasic/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/blitzmax/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bluespec/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/boo/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/brainfuck/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/brightscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bro/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c-sharp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c++/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c2hs-haskell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cap'n-proto/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cartocss/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ceylon/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/chapel/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/chuck/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cirru/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clarion/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clean/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/click/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clips/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clojure/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cmake/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cobol/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coffeescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coldfusion/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coldfusion-cfc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/common-lisp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/component-pascal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coq/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/creole/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/crystal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/csound/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/css/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cucumber/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cuda/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cycript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cython/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/d/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/darcs-patch/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dart/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/desktop/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/diff/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/digital-command-language/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dm/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dns-zone/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dockerfile/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dogescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dylan/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/eagle/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ec/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ecere-projects/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ecl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/edn/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/eiffel/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/elixir/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/elm/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/emacs-lisp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/emberscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/erlang/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/f-sharp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/factor/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fancy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fantom/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fish/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/flux/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/forth/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fortran/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/freemarker/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/g-code/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gams/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gap/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gas/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gdscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/genshi/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gentoo-ebuild/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gentoo-eclass/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gettext-catalog/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/glsl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/glyph/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gnuplot/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/go/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/golo/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gosu/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/grace/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/grammatical-framework/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/graphql/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/graphviz-(dot)/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/groff/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/groovy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/groovy-server-pages/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/haml/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/handlebars/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/harbour/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/haskell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/haxe/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/hcl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/hlsl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+django/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+eex/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+erb/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+php/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/http/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/hy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/idl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/idris/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/igor-pro/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/inform-7/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ini/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/inno-setup/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/io/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ioke/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/irc-log/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/isabelle/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/j/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jade/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jasmin/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/java/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/java-server-pages/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/javascript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jflex/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jsx/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/julia/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jupyter-notebook/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/kicad/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/kit/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/kotlin/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/krl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/labview/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lasso/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/latte/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lean/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/less/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lex/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lfe/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lilypond/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/linker-script/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/liquid/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/literate-agda/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/literate-coffeescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/literate-haskell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/livescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/llvm/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/logos/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/logtalk/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lolcode/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lookml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lsl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lua/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/m/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/m4/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/makefile/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mako/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/maple/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/markdown/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mask/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mathematica/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/matlab/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/max/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/maxscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mediawiki/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/metal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mirah/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/modelica/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/module-management-system/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/monkey/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/moonscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mtml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/muf/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mupad/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/myghty/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nesc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/netlinx/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/netlogo/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nginx/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nimrod/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ninja/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nit/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nix/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nsis/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nu/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/numpy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/objdump/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/objective-c++/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/objective-j/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ocaml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/octave/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/omgrofl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ooc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/opa/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/opal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/opencl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/openscad/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/org/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ox/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/oxygene/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/oz/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pan/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/papyrus/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/parrot/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/parrot-assembly/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/parrot-internal-representation/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pascal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pawn/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/perl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/perl6/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/php/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/piglatin/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pike/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pod/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pogoscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pony/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/postscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pov-ray-sdl/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/powershell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/processing/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/prolog/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/propeller-spin/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/protocol-buffer/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pure-data/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/purebasic/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/purescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/python/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/python-traceback/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/qmake/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/qml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/r/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/racket/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ragel-in-ruby-host/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/raml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rdoc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/realbasic/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rebol/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/red/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/redcode/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ren'py/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/renderscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/restructuredtext/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rhtml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rmarkdown/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/robotframework/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rouge/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ruby/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rust/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sage/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/saltstack/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sas/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sass/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scala/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scaml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scheme/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scilab/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scss/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/self/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/shell/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/shellsession/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/shen/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/slash/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/slim/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smali/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smalltalk/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smarty/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smt/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/solidity/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sourcepawn/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sparql/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sqf/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sql/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/squirrel/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/stan/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/standard-ml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/stata/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ston/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/stylus/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/supercollider/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/swift/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/systemverilog/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tcl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tcsh/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tea/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tex/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/text/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/textile/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/thrift/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/toml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/turing/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/turtle/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/twig/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/txl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/typescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/unified-parallel-c/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/unity3d-asset/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/uno/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/unrealscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/urweb/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vala/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vcl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/verilog/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vhdl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/viml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/visual-basic/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/volt/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vue/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/web-ontology-language/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/webassembly/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/webidl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/wisp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/x10/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xbase/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xojo/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xpages/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xproc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xquery/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xs/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xslt/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xtend/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/yacc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/yaml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/yang/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/zephir/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/zig/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/zimpl/*.gz + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/stack + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + + attributes: + - perplexity_suite_v3_option2 + + filter: + include: [] + exclude: + - "$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] + && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]" + + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_5/mixing/wiki.yaml b/configs/dolma-v1_5/mixing/wiki.yaml new file mode 100644 index 00000000..fe7ce101 --- /dev/null +++ b/configs/dolma-v1_5/mixing/wiki.yaml @@ -0,0 +1,28 @@ +--- +streams: +- name: en_simple_wiki_v0 + documents: + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=en/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=simple/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=en/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=simple/*.gz + attributes: + - perplexity_suite_v3_option2 + - olmo_mix_v1_taggers + output: + path: 
s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/wiki + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + + filter: + exclude: + - "$.attributes[?(@.olmo_mix_v1_taggers__uniseg_length_paragraphs_with_doc_length_v1__document[0][2] + < 25)]" + - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] + && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_5/para_dedupe/c4.yaml b/configs/dolma-v1_5/para_dedupe/c4.yaml new file mode 100644 index 00000000..e51fe0eb --- /dev/null +++ b/configs/dolma-v1_5/para_dedupe/c4.yaml @@ -0,0 +1,16 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/c4/v0/documents/train/*.gz + +dedupe: + name: dedupe_paragraphs + paragraphs: + attribute_name: bff_duplicate_paragraph_spans + skip_empty: true + +bloom_filter: + file: /tmp/c4.bloom + read_only: false + estimated_doc_count: 30000000000 + desired_false_positive_rate: 1e-06 + +processes: 188 diff --git a/configs/dolma-v1_5/para_dedupe/cc-middle.yaml b/configs/dolma-v1_5/para_dedupe/cc-middle.yaml new file mode 100644 index 00000000..dccd4a52 --- /dev/null +++ b/configs/dolma-v1_5/para_dedupe/cc-middle.yaml @@ -0,0 +1,16 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_middle/*.gz + +dedupe: + name: dedupe_paragraphs_v2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans + skip_empty: true + +bloom_filter: + file: /tmp/cc_en_middle.bloom + read_only: false + estimated_doc_count: 60000000000 + desired_false_positive_rate: 1e-06 + +processes: 188 diff --git a/configs/dolma-v1_5/sample/cc-head.yaml b/configs/dolma-v1_5/sample/cc-head.yaml new file mode 100644 index 00000000..11b40b96 --- /dev/null +++ b/configs/dolma-v1_5/sample/cc-head.yaml @@ -0,0 +1,18 @@ + +streams: +- name: cc_en_head + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_head/*.gz + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-sample/documents/cc_en_head + max_size_in_bytes: 3894967296 + attributes: + - random_number_v1 + filter: + include: + - "$.attributes[?(@.random_number_v1__random_number_v1__random[0][2] < 0.5104606781)]" + +work_dir: + input: "/tmp/cc-head-sample/mixer/input" + output: "/tmp/cc-head-sample/mixer/output" +processes: 188 diff --git a/configs/dolma-v1_5/sample/cc-middle.yaml b/configs/dolma-v1_5/sample/cc-middle.yaml new file mode 100644 index 00000000..0f4d1aff --- /dev/null +++ b/configs/dolma-v1_5/sample/cc-middle.yaml @@ -0,0 +1,18 @@ + +streams: +- name: cc_en_middle + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_middle/*.gz + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-sample/documents/cc_en_middle + max_size_in_bytes: 3894967296 + attributes: + - random_number_v1 + filter: + include: + - "$.attributes[?(@.random_number_v1__random_number_v1__random[0][2] < 0.5104606781)]" + +work_dir: + input: "/tmp/cc-head-sample/mixer/input" + output: "/tmp/cc-head-sample/mixer/output" +processes: 188 diff --git a/configs/dolma-v1_5/sample/cc-tail.yaml b/configs/dolma-v1_5/sample/cc-tail.yaml new file mode 100644 index 00000000..d07547a3 --- /dev/null +++ b/configs/dolma-v1_5/sample/cc-tail.yaml @@ -0,0 +1,18 @@ + +streams: +- name: cc_en_tail + documents: + - 
s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_tail/*.gz + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-sample/documents/cc_en_tail + max_size_in_bytes: 3894967296 + attributes: + - random_number_v1 + filter: + include: + - "$.attributes[?(@.random_number_v1__random_number_v1__random[0][2] < 0.5104606781)]" + +work_dir: + input: "/tmp/cc-head-sample/mixer/input" + output: "/tmp/cc-head-sample/mixer/output" +processes: 188 diff --git a/configs/dolma-v1_5/tagger-r2.yaml b/configs/dolma-v1_5/tagger-r2.yaml new file mode 100644 index 00000000..3355d8b0 --- /dev/null +++ b/configs/dolma-v1_5/tagger-r2.yaml @@ -0,0 +1,13 @@ +taggers: + - tokenizer_repetitions_v1 + - char_length_strip_ws_v1 + +documents: + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/*/*.gz + - s3://ai2-llm/pretraining-data/sources/c4/v0/documents/train/*.gz + - s3://ai2-llm/pretraining-data/sources/reddit/v5-dedupe-pii-nsfw-toxic/documents/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3/documents/*/split=train/*/*.gz + - s3://ai2-llm/pretraining-data/sources/gutenberg/v0/documents/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/*/*.gz + +processes: 188 diff --git a/configs/dolma-v1_5/tokenizer.yaml b/configs/dolma-v1_5/tokenizer.yaml new file mode 100644 index 00000000..9ac383a5 --- /dev/null +++ b/configs/dolma-v1_5/tokenizer.yaml @@ -0,0 +1,8 @@ +destination: s3://ai2-llm/preprocessed/olmo-mix/v1_5_cc_only/gpt-neox-20b-pii-special/ +documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_head/*.json.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_middle/*.json.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_tail/*.json.gz +processes: 168 +seed: 3920 +tokenizer_name_or_path: allenai/eleuther-ai-gpt-neox-20b-pii-special diff --git a/configs/dolma-v1_5/train-set.md b/configs/dolma-v1_5/train-set.md new file mode 100644 index 00000000..1cbdca79 --- /dev/null +++ b/configs/dolma-v1_5/train-set.md @@ -0,0 +1,50 @@ +# Dolma v1.5 + +Files in this directory are used to generate Dolma v1.5. + +## Tagging + +Tagging is largely the same as in v1, but we report it here for completeness.
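+
+Each tagger run writes attribute files next to the documents, where every attribute value is a list of `[start, end, score]` spans; the mixer and sampling configs in this directory index into those triples (e.g. `[0][2]` is the score of the first span). The snippet below is a minimal illustration of reading one of these files, assuming the usual JSONL-gz layout; the path and attribute key are placeholders, not pipeline outputs.
+
+```python
+import gzip
+import json
+
+# Placeholder path and key; real keys follow the <experiment>__<tagger>__<type>
+# pattern referenced by the mixer filters.
+path = "attributes/random_number_v1/example.json.gz"
+key = "random_number_v1__random_number_v1__random"
+
+with gzip.open(path, "rt") as f:
+    for line in f:
+        row = json.loads(line)
+        spans = row["attributes"].get(key, [])
+        # Same test as the sampling filter `[0][2] < 0.5104606781`:
+        if spans and spans[0][2] < 0.5104606781:
+            print(f"{row['id']} kept in the ~50% sample")
+```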
+ +### C4 + +```bash +dolma tag --documents 's3://ai2-llm/pretraining-data/sources/c4/v0/documents/train/*.gz' --taggers pii_regex_with_counts_v2 --processes 188 +``` + +### Common Crawl + +## Filtering + +## Sampling of CC + +```bash +dolma tag --documents 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_head/*.gz' 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_tail/*.gz' --taggers random_number_v1 --processes 188 +``` + +```bash +dolma tag --documents 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_middle/*.gz' --taggers random_number_v1 --processes 188 +``` + +## Tokenization + +```bash +python -m dolma.tokenizer --sources 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/*/*' --destination $HOME/preprocessed/olmo-mix/v1_5/gpt-neox-20b-pii-special --num-writers 188 --max-size 17179869184 +``` + +```bash +dolma tokens \ + --documents 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/books/*' \ + 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/c4/*' \ + 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-sample/documents/cc_en_head/*' \ + 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-sample/documents/cc_en_middle/*' \ + 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-sample/documents/cc_en_tail/*' \ + 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/pes2o/*' \ + 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/reddit/*' \ + 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/stack/*' \ + 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/wiki/*' \ + --tokenizer_name_or_path 'allenai/gpt-neox-20b-pii-special' \ + --destination $HOME/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special \ + --processes 188 \ + --ring_size 8 \ + --batch_size 10000 \ + --max_size 5368709120 +``` diff --git a/configs/dolma-v1_6/decontamination/README.md b/configs/dolma-v1_6/decontamination/README.md new file mode 100644 index 00000000..8896746c --- /dev/null +++ b/configs/dolma-v1_6/decontamination/README.md @@ -0,0 +1,97 @@ +# Decontamination Runbook + +## Step 1: Create decontamination bloom filter + +> Okay I think everything is ready for decon testing now. The finalized ppl suite v3 is in `s3://ai2-llm/eval-data/perplexity/v3/`. And here is my proposed plan for decon testing if you agree and it's not too much compute. The following is the sequence of things to try. At each step, if the document removal rate is >0.1% or so, we back off to the next step and hope the removal rate is lower: +> +> - **Option 1** Decon against PPL Suite v3 (`s3://ai2-llm/eval-data/perplexity/v3/`) + PPL Suite v2 (`s3://ai2-llm/eval-data/perplexity/v2/`) for full backwards compatibility. +> - **Option 2** Decon against PPL Suite v3 (`s3://ai2-llm/eval-data/perplexity/v3/`) + PPL Suite v2-small (`s3://ai2-llm/eval-data/perplexity/v2_small/`) for at least full backwards compatibility for the in-loop metrics the model team was using. +> - **Option 3** Decon against PPL Suite v3 (`s3://ai2-llm/eval-data/perplexity/v3/`) + a subset of PPL Suite v2-small requested by Dirk and Iz (`s3://ai2-llm/eval-data/perplexity/v2_small/c4_en/`, `s3://ai2-llm/eval-data/perplexity/v2_small/pile/`, `s3://ai2-llm/eval-data/perplexity/v2_small/m2d2_s2orc/`, `s3://ai2-llm/eval-data/perplexity/v2_small/ice/`) +> +> Let me know if you disagree with any of this or if there's anything I can do to help run the decon trials!
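+
+The go/no-go criterion above is the document removal rate after each trial. Once the step 2 dedupe attributes exist, the rate can be estimated with a short script like the one below; it is only a sketch, and the glob and attribute name are illustrative (the attribute name is whatever `dedupe.documents.attribute_name` is set to).
+
+```python
+import gzip
+import json
+from glob import glob
+
+# Illustrative inputs: point this at the attribute files produced by the
+# document-level dedupe run in step 2.
+attr_files = glob("attributes/paloma_documents/**/*.json.gz", recursive=True)
+attr_key = "paloma_documents_bff_duplicates"
+
+total = flagged = 0
+for path in attr_files:
+    with gzip.open(path, "rt") as f:
+        for line in f:
+            spans = json.loads(line)["attributes"].get(attr_key, [])
+            total += 1
+            flagged += bool(spans and spans[0][2] >= 1.0)
+
+print(f"document removal rate: {flagged / max(total, 1):.4%}")  # back off if > ~0.1%
+```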
+ + +### Step 1.1: copy data locally + +We copy data locally since the directory structure of the eval data in S3 is slightly different from the one we need. +In particular, we need all documents to be under a `documents/` directory. + +```bash +aws s3 sync s3://ai2-llm/eval-data/perplexity/v2 $HOME/perplexity/v2/documents +aws s3 sync s3://ai2-llm/eval-data/perplexity/v2_small $HOME/perplexity/v2_small/documents +aws s3 sync s3://ai2-llm/eval-data/perplexity/v3 $HOME/perplexity/v3/documents + +aws s3 sync s3://ai2-llm/eval-data/perplexity/v2_small/c4_en $HOME/perplexity/v2_small_subset/documents/c4_en +aws s3 sync s3://ai2-llm/eval-data/perplexity/v2_small/pile $HOME/perplexity/v2_small_subset/documents/pile +aws s3 sync s3://ai2-llm/eval-data/perplexity/v2_small/m2d2_s2orc $HOME/perplexity/v2_small_subset/documents/m2d2_s2orc +aws s3 sync s3://ai2-llm/eval-data/perplexity/v2_small/ice $HOME/perplexity/v2_small_subset/documents/ice +``` + +### Step 1.1b: change type of IDs in v3 subset (TEMPORARY FIX) + +v3 accidentally contains ids that are integers instead of strings. Until that's fixed, run: + +```bash +python configs/dolma-v1_6/decontamination/fix_ids_type.py "${HOME}/perplexity/v3/documents/*/*/*.gz" +``` + +### Step 1.2: tag out paragraphs by uniseg length + +For dolma, we want to decontaminate against paragraphs that are at least 13 uniseg words long, +so we need to compute their length first. + +```bash +dolma tag --documents "${HOME}/perplexity/v2/documents/*/*/*.gz" --taggers uniseg_length_paragraphs_with_empty_v1 not_alphanum_paragraph_v1 --processes 188 +dolma tag --documents "${HOME}/perplexity/v2_small/documents/*/*/*.gz" --taggers uniseg_length_paragraphs_with_empty_v1 not_alphanum_paragraph_v1 --processes 188 +dolma tag --documents "${HOME}/perplexity/v3/documents/*/*/*.gz" --taggers uniseg_length_paragraphs_with_empty_v1 not_alphanum_paragraph_v1 --processes 188 +dolma tag --documents "${HOME}/perplexity/v2_small_subset/documents/*/*/*.gz" --taggers uniseg_length_paragraphs_with_empty_v1 not_alphanum_paragraph_v1 --processes 188 +``` + +### Step 1.3: filter out paragraphs that are too short + +```bash +dolma -c configs/dolma-v1_6/decontamination/step1_3-make-eval-set/option2.yaml mix +``` + +### Step 1.4: create two bloom filters: one for paragraphs and one for documents + +First, we cat the contents of each dataset to get the number of paragraphs and documents: + +```bash +zcat $HOME/perplexity/option2/documents/* | jq '.text' -cr | wc -l +>>> 2336120 # paragraphs +``` + +```bash +zcat $HOME/perplexity/option2/documents/* | wc -l +>>> 188815 # documents +``` + +We use these numbers in the config files at `bloom_filter.estimated_doc_count`. +For both bloom filters, we use the same `bloom_filter.desired_false_positive_rate` of `1e-15`. + +Build both bloom filters: + +```bash +dolma -c configs/dolma-v1_6/decontamination/step1_4-create-bloom-filter/option2_docs.yaml dedupe +dolma -c configs/dolma-v1_6/decontamination/step1_4-create-bloom-filter/option2_para.yaml dedupe +``` + +This will create two bloom filters in `${HOME}/perplexity/filters/` called `paloma_paragraphs.bin` and `paloma_documents.bin`.
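+
+As a sanity check on what these settings imply, the standard Bloom filter sizing formulas can be applied to the two counts above. This is only back-of-the-envelope arithmetic, not the exact computation the dedupe tool performs internally.
+
+```python
+import math
+
+def bloom_size(n_items: int, fp_rate: float) -> tuple[int, int]:
+    """Standard sizing: bits = -n*ln(p)/ln(2)^2, hashes = (bits/n)*ln(2)."""
+    bits = math.ceil(-n_items * math.log(fp_rate) / math.log(2) ** 2)
+    hashes = max(1, round(bits / n_items * math.log(2)))
+    return bits, hashes
+
+for name, n in [("paloma_paragraphs", 2_336_120), ("paloma_documents", 188_815)]:
+    bits, k = bloom_size(n, 1e-15)
+    print(f"{name}: ~{bits / 8 / 2**20:.1f} MiB, {k} hash functions")
+```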
+ +## Step 2: Tag contaminated documents + +Tag content for Dolma V1.6 for decontamination: + + +```bash +dolma -c configs/dolma-v1_6/decontamination/step2-run-decontamination/dolma-v1_6_docs.yaml dedupe + +dolma -c configs/dolma-v1_6/decontamination/step2-run-decontamination/dolma-v1_6_para.yaml dedupe +``` + +## Step 3: Create a version of the dataset that has been decontaminated + +TODO: add this step diff --git a/configs/dolma-v1_6/decontamination/fix_ids_type.py b/configs/dolma-v1_6/decontamination/fix_ids_type.py new file mode 100644 index 00000000..69bd4dbd --- /dev/null +++ b/configs/dolma-v1_6/decontamination/fix_ids_type.py @@ -0,0 +1,33 @@ +import argparse +import json +from dolma.core.paths import glob_path +import tqdm + +import smart_open + + +def fix_path(p: str): + with smart_open.open(p, 'rt') as f: + data = [json.loads(line) for line in f] + + with smart_open.open(p, 'wt') as f: + for d in data: + if 'id' in d: + d['id'] = str(d['id']) + f.write(json.dumps(d) + '\n') + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument('path', nargs='+') + args = ap.parse_args() + + with tqdm.tqdm(desc='Files') as pbar: + for p in args.path: + for sp in glob_path(p): + fix_path(sp) + pbar.update() + + +if __name__ == '__main__': + main() diff --git a/configs/dolma-v1_6/decontamination/step1_3-make-eval-set/option1.yaml b/configs/dolma-v1_6/decontamination/step1_3-make-eval-set/option1.yaml new file mode 100644 index 00000000..2dcb5c1b --- /dev/null +++ b/configs/dolma-v1_6/decontamination/step1_3-make-eval-set/option1.yaml @@ -0,0 +1,86 @@ +streams: + - name: "v2" + documents: + - ${oc.env:HOME}/perplexity/v2/documents/c4_100_domains/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/c4_100_domains/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/gab/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/gab/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/ice/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/ice/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/m2d2_s2orc/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/m2d2_s2orc/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/m2d2_wiki/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/m2d2_wiki/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/manosphere/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/manosphere/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/mc4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/mc4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/pile/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/ptb/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/ptb/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/twitterAEE/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/twitterAEE/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/wikitext_103/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/wikitext_103/test/*.gz + + output: &output + path: ${oc.env:HOME}/perplexity/option1/documents + max_size_in_bytes: 500000000 + discard_fields: + - attributes + + attributes: &attributes + - uniseg_length_paragraphs_with_empty_v1 + - not_alphanum_paragraph_v1 + + span_replacement: &span_replacement + - span: $.attributes.uniseg_length_paragraphs_with_empty_v1__uniseg_length_paragraphs_with_empty_v1__negative_paragraph + min_score: -12 + replacement: "" + - span: 
$.attributes.not_alphanum_paragraph_v1__not_alphanum_paragraph_v1__all_punct + min_score: 0.5 + replacement: "" + + - name: "v3" + documents: + - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/gab/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/gab/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/mc4/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/mc4/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/pile/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ptb/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ptb/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/redpajama/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/redpajama/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/test/*.gz + + output: *output + attributes: *attributes + span_replacement: *span_replacement diff --git a/configs/dolma-v1_6/decontamination/step1_3-make-eval-set/option2.yaml b/configs/dolma-v1_6/decontamination/step1_3-make-eval-set/option2.yaml new file mode 100644 index 00000000..37b7be6d --- /dev/null +++ b/configs/dolma-v1_6/decontamination/step1_3-make-eval-set/option2.yaml @@ -0,0 +1,86 @@ +streams: + - name: "v2_small" + documents: + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_100_domains/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_100_domains/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/gab/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/gab/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ice/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ice/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_s2orc/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_s2orc/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_wiki/val/*.gz + - 
${oc.env:HOME}/perplexity/v2_small/documents/m2d2_wiki/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/manosphere/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/manosphere/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/mc4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/mc4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/pile/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ptb/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ptb/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/twitterAEE/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/twitterAEE/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/wikitext_103/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/wikitext_103/test/*.gz + + output: &output + path: ${oc.env:HOME}/perplexity/option2/documents + max_size_in_bytes: 500000000 + discard_fields: + - attributes + + attributes: &attributes + - uniseg_length_paragraphs_with_empty_v1 + - not_alphanum_paragraph_v1 + + span_replacement: &span_replacement + - span: $.attributes.uniseg_length_paragraphs_with_empty_v1__uniseg_length_paragraphs_with_empty_v1__negative_paragraph + min_score: -12 + replacement: "" + - span: $.attributes.not_alphanum_paragraph_v1__not_alphanum_paragraph_v1__all_punct + min_score: 0.5 + replacement: "" + + - name: "v3" + documents: + - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/gab/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/gab/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/mc4/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/mc4/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/pile/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ptb/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ptb/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/redpajama/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/redpajama/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/test/*.gz + - 
${oc.env:HOME}/perplexity/v3/documents/wikitext_103/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/test/*.gz + + output: *output + attributes: *attributes + span_replacement: *span_replacement diff --git a/configs/dolma-v1_6/decontamination/step1_3-make-eval-set/option3.yaml b/configs/dolma-v1_6/decontamination/step1_3-make-eval-set/option3.yaml new file mode 100644 index 00000000..4f912a2b --- /dev/null +++ b/configs/dolma-v1_6/decontamination/step1_3-make-eval-set/option3.yaml @@ -0,0 +1,70 @@ +streams: + - name: "v2_small_subset" + documents: + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/ice/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/ice/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/m2d2_s2orc/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/m2d2_s2orc/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/pile/test/*.gz + + output: &output + path: ${oc.env:HOME}/perplexity/option3/documents + max_size_in_bytes: 500000000 + discard_fields: + - attributes + + attributes: &attributes + - uniseg_length_paragraphs_with_empty_v1 + - not_alphanum_paragraph_v1 + + span_replacement: &span_replacement + - span: $.attributes.uniseg_length_paragraphs_with_empty_v1__uniseg_length_paragraphs_with_empty_v1__negative_paragraph + min_score: -12 + replacement: "" + - span: $.attributes.not_alphanum_paragraph_v1__not_alphanum_paragraph_v1__all_punct + min_score: 0.5 + replacement: "" + + - name: "v3" + documents: + - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/gab/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/gab/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/mc4/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/mc4/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/pile/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ptb/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ptb/test/*.gz + 
- ${oc.env:HOME}/perplexity/v3/documents/redpajama/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/redpajama/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/test/*.gz + + output: *output + attributes: *attributes + span_replacement: *span_replacement diff --git a/configs/dolma-v1_6/decontamination/step1_3-make-eval-set/ppl_v2.yaml b/configs/dolma-v1_6/decontamination/step1_3-make-eval-set/ppl_v2.yaml new file mode 100644 index 00000000..9ef386d5 --- /dev/null +++ b/configs/dolma-v1_6/decontamination/step1_3-make-eval-set/ppl_v2.yaml @@ -0,0 +1,41 @@ +streams: + - name: "v2_small" + documents: + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_100_domains/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_100_domains/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/gab/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/gab/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ice/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ice/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_s2orc/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_s2orc/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_wiki/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_wiki/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/manosphere/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/manosphere/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/mc4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/mc4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/pile/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ptb/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ptb/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/twitterAEE/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/twitterAEE/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/wikitext_103/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/wikitext_103/test/*.gz + + output: + path: ${oc.env:HOME}/perplexity/ppl_v2/documents + max_size_in_bytes: 500000000 + discard_fields: + - attributes + + attributes: + - uniseg_length_paragraphs_with_empty_v1 + + span_replacement: &span_replacement + - span: $.attributes.uniseg_length_paragraphs_with_empty_v1__uniseg_length_paragraphs_with_empty_v1__negative_paragraph + min_score: -12 + replacement: "" diff --git a/configs/dolma-v1_6/decontamination/step1_4-create-bloom-filter/option2_docs.yaml b/configs/dolma-v1_6/decontamination/step1_4-create-bloom-filter/option2_docs.yaml new file mode 100644 index 00000000..687fd7c5 --- /dev/null +++ b/configs/dolma-v1_6/decontamination/step1_4-create-bloom-filter/option2_docs.yaml @@ -0,0 +1,76 @@ +documents: + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_100_domains/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_100_domains/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/gab/val/*.gz + - 
${oc.env:HOME}/perplexity/v2_small/documents/gab/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ice/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ice/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_s2orc/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_s2orc/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_wiki/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_wiki/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/manosphere/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/manosphere/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/mc4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/mc4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/pile/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ptb/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ptb/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/twitterAEE/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/twitterAEE/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/wikitext_103/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/wikitext_103/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/gab/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/gab/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/mc4/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/mc4/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/pile/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ptb/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ptb/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/redpajama/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/redpajama/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/test/*.gz + + +dedupe: + name: paloma_documents + documents: + 
attribute_name: bff_duplicates + key: $.text + skip_empty: true + +bloom_filter: + read_only: false + estimated_doc_count: 188815 + # size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + desired_false_positive_rate: 1e-15 + file: ${oc.env:HOME}/perplexity/filters/paloma_documents.bin + +processes: 188 diff --git a/configs/dolma-v1_6/decontamination/step1_4-create-bloom-filter/option2_para.yaml b/configs/dolma-v1_6/decontamination/step1_4-create-bloom-filter/option2_para.yaml new file mode 100644 index 00000000..1559b2d0 --- /dev/null +++ b/configs/dolma-v1_6/decontamination/step1_4-create-bloom-filter/option2_para.yaml @@ -0,0 +1,18 @@ +documents: + - ${oc.env:HOME}/perplexity/option2/documents/*.gz + +dedupe: + name: paloma_paragraphs + paragraphs: + attribute_name: bff_duplicates + skip_empty: true + +bloom_filter: + read_only: false + estimated_doc_count: 2336120 + # size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + desired_false_positive_rate: 1e-15 + # file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + file: ${oc.env:HOME}/perplexity/filters/paloma_paragraphs.bin + +processes: 188 diff --git a/configs/dolma-v1_6/decontamination/step2-run-decontamination/dolma-v1_6_docs.yaml b/configs/dolma-v1_6/decontamination/step2-run-decontamination/dolma-v1_6_docs.yaml new file mode 100644 index 00000000..a153887d --- /dev/null +++ b/configs/dolma-v1_6/decontamination/step2-run-decontamination/dolma-v1_6_docs.yaml @@ -0,0 +1,26 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/books/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/c4/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_head/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_middle/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_tail/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/pes2o/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/reddit/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/stack/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/wiki/*.gz + + +dedupe: + name: paloma_documents + documents: + attribute_name: paloma_documents_bff_duplicates + key: $.text + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 188815 + desired_false_positive_rate: 1e-15 + file: ${oc.env:HOME}/perplexity/filters/paloma_documents.bin + +processes: 94 diff --git a/configs/dolma-v1_6/decontamination/step2-run-decontamination/dolma-v1_6_para.yaml b/configs/dolma-v1_6/decontamination/step2-run-decontamination/dolma-v1_6_para.yaml new file mode 100644 index 00000000..1921a2ec --- /dev/null +++ b/configs/dolma-v1_6/decontamination/step2-run-decontamination/dolma-v1_6_para.yaml @@ -0,0 +1,29 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/books/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/c4/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_head/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_middle/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_tail/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/pes2o/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/reddit/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/stack/*.gz + - 
s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/wiki/*.gz + # - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/books/*.gz + # - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/wiki/*.gz + + +dedupe: + name: paloma_paragraphs + paragraphs: + attribute_name: paloma_paragraphs_bff_duplicates + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 2336120 + # size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + desired_false_positive_rate: 1e-15 + # file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + file: ${oc.env:HOME}/perplexity/filters/paloma_paragraphs.bin + +processes: 94 diff --git a/configs/dolma-v1_6/decontamination/step3_mixing/dolma-v1_6_decon.yaml b/configs/dolma-v1_6/decontamination/step3_mixing/dolma-v1_6_decon.yaml new file mode 100644 index 00000000..9d5bc626 --- /dev/null +++ b/configs/dolma-v1_6/decontamination/step3_mixing/dolma-v1_6_decon.yaml @@ -0,0 +1,94 @@ + +streams: + - name: books + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/books/*.gz + attributes: &attributes + - paloma_paragraphs + - paloma_documents + output: &output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-decontaminated/documents/books + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + filter: &filter + exclude: + - "$@.attributes[?(@.paloma_documents_bff_duplicates && @.paloma_documents_bff_duplicates[0] && @.paloma_documents_bff_duplicates[0][2] >= 1.0)]" + - "$@.attributes[?(@.paloma_paragraphs_bff_duplicates && @.paloma_paragraphs_bff_duplicates[0] && @.paloma_paragraphs_bff_duplicates[0][2] >= 1.0)]" + + - name: c4 + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/c4/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-decontaminated/documents/c4 + filter: *filter + + - name: cc_en_head + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_head/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-decontaminated/documents/cc_en_head + filter: *filter + + - name: cc_en_middle + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_middle/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-decontaminated/documents/cc_en_middle + filter: *filter + + - name: cc_en_tail + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_tail/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-decontaminated/documents/cc_en_tail + filter: *filter + + - name: pes2o + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/pes2o/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-decontaminated/documents/pes2o + filter: *filter + + - name: reddit + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/reddit/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-decontaminated/documents/reddit + filter: *filter + + - name: stack + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/stack/*.gz + attributes: 
*attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-decontaminated/documents/stack + filter: *filter + + - name: wiki + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/wiki/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-decontaminated/documents/wiki + filter: *filter + +work_dir: + input: "/tmp/olmo-mix-v1_6/input" + output: "/tmp/olmo-mix-v1_6/output" +processes: 188 diff --git a/configs/dolma-v1_6/decontamination/tokenize_v3.sh b/configs/dolma-v1_6/decontamination/tokenize_v3.sh new file mode 100644 index 00000000..83dbc94e --- /dev/null +++ b/configs/dolma-v1_6/decontamination/tokenize_v3.sh @@ -0,0 +1,35 @@ +#! /usr/bin/env bash + +datasets=( + '4chan_meta_sep' + 'c4_100_domains' + 'c4_en' + 'dolma_100_subreddits' + 'dolma-v1_5' + 'falcon-refinedweb' + 'gab' + 'ice_fixed' + 'm2d2_s2orc_unsplit' + 'm2d2_wikipedia_unsplit' + 'manosphere_meta_sep' + 'mc4' + 'pile' + 'ptb' + 'redpajama' + 'twitterAAE_HELM_fixed' + 'wikitext_103' +) + +splits=( + 'test' + 'val' +) + +for dataset in "${datasets[@]}"; do + for split in "${splits[@]}"; do + dolma tokens \ + --documents "s3://ai2-llm/eval-data/perplexity/v3_small/${dataset}/${split}" \ + --destination "s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/${dataset}/${split}" \ + --tokenizer 'allenai/eleuther-ai-gpt-neox-20b-pii-special' + done +done diff --git a/configs/dolma-v1_6/decontamination/tokenize_v3_small.sh b/configs/dolma-v1_6/decontamination/tokenize_v3_small.sh new file mode 100644 index 00000000..cdcb36d3 --- /dev/null +++ b/configs/dolma-v1_6/decontamination/tokenize_v3_small.sh @@ -0,0 +1,29 @@ +#! /usr/bin/env bash + +datasets=( + 'c4_en' + 'dolma_books' + 'dolma_common-crawl' + 'dolma_pes2o' + 'dolma_reddit' + 'dolma_stack' + 'dolma_wiki' + 'ice' + 'm2d2_s2orc' + 'pile' + 'wikitext_103' +) + +splits=( + 'test' + 'val' +) + +for dataset in "${datasets[@]}"; do + for split in "${splits[@]}"; do + dolma tokens \ + --documents "s3://ai2-llm/eval-data/perplexity/v3_small/${dataset}/${split}" \ + --destination "s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/${dataset}/${split}" \ + --tokenizer 'allenai/eleuther-ai-gpt-neox-20b-pii-special' + done +done diff --git a/configs/dolma-v1_6/doc_dedupe/cc_en_head.yaml b/configs/dolma-v1_6/doc_dedupe/cc_en_head.yaml new file mode 100644 index 00000000..f2060630 --- /dev/null +++ b/configs/dolma-v1_6/doc_dedupe/cc_en_head.yaml @@ -0,0 +1,17 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_head/*.gz + +dedupe: + name: dedupe_docs_v2 + documents: + attribute_name: bff_duplicate_docs + key: $.text + skip_empty: false + +bloom_filter: + file: /tmp/cc_en_head_dedupe_docs.bloom + read_only: false + estimated_doc_count: 60000000000 + desired_false_positive_rate: 1e-06 + +processes: 188 diff --git a/configs/dolma-v1_6/doc_dedupe/cc_en_middle.yaml b/configs/dolma-v1_6/doc_dedupe/cc_en_middle.yaml new file mode 100644 index 00000000..a23c6b33 --- /dev/null +++ b/configs/dolma-v1_6/doc_dedupe/cc_en_middle.yaml @@ -0,0 +1,17 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_middle/*.gz + +dedupe: + name: dedupe_docs_v2 + documents: + attribute_name: bff_duplicate_docs + key: $.text + skip_empty: true + +bloom_filter: + file: /tmp/cc_en_middle_dedupe_docs.bloom + read_only: false + estimated_doc_count: 30000000000 + 
desired_false_positive_rate: 1e-06 + +processes: 188 diff --git a/configs/dolma-v1_6/doc_dedupe/cc_en_tail_part1.yaml b/configs/dolma-v1_6/doc_dedupe/cc_en_tail_part1.yaml new file mode 100644 index 00000000..2763a5a0 --- /dev/null +++ b/configs/dolma-v1_6/doc_dedupe/cc_en_tail_part1.yaml @@ -0,0 +1,17 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_tail/cc_en_tail-0*.json.gz + +dedupe: + name: dedupe_docs_v2 + documents: + attribute_name: bff_duplicate_docs + key: $.text + skip_empty: true + +bloom_filter: + file: /tmp/cc_en_tail_dedupe_docs.bloom + read_only: false + estimated_doc_count: 30000000000 + desired_false_positive_rate: 1e-06 + +processes: 188 diff --git a/configs/dolma-v1_6/doc_dedupe/cc_en_tail_part2.yaml b/configs/dolma-v1_6/doc_dedupe/cc_en_tail_part2.yaml new file mode 100644 index 00000000..41969673 --- /dev/null +++ b/configs/dolma-v1_6/doc_dedupe/cc_en_tail_part2.yaml @@ -0,0 +1,17 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_tail/cc_en_tail-1*.json.gz + +dedupe: + name: dedupe_docs_v2 + documents: + attribute_name: bff_duplicate_docs + key: $.text + skip_empty: true + +bloom_filter: + file: /tmp/cc_en_tail_dedupe_docs.bloom + read_only: false + estimated_doc_count: 30000000000 + desired_false_positive_rate: 1e-06 + +processes: 188 diff --git a/configs/dolma-v1_6/doc_dedupe/cc_en_tail_part3.yaml b/configs/dolma-v1_6/doc_dedupe/cc_en_tail_part3.yaml new file mode 100644 index 00000000..555589ee --- /dev/null +++ b/configs/dolma-v1_6/doc_dedupe/cc_en_tail_part3.yaml @@ -0,0 +1,17 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_tail/cc_en_tail-2*.json.gz + +dedupe: + name: dedupe_docs_v2 + documents: + attribute_name: bff_duplicate_docs + key: $.text + skip_empty: true + +bloom_filter: + file: /tmp/cc_en_tail_dedupe_docs.bloom + read_only: false + estimated_doc_count: 30000000000 + desired_false_positive_rate: 1e-06 + +processes: 188 diff --git a/configs/dolma-v1_6/mixing/books.yaml b/configs/dolma-v1_6/mixing/books.yaml new file mode 100644 index 00000000..82df4650 --- /dev/null +++ b/configs/dolma-v1_6/mixing/books.yaml @@ -0,0 +1,33 @@ + +streams: +- name: books + + documents: + - s3://ai2-llm/pretraining-data/sources/gutenberg/v0/documents/*.gz + + attributes: + # - perplexity_suite_v3_option2 + - olmo_mix_v1_taggers + - tokenizer_repetitions_v2r2 + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/books + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + + filter: + exclude: + - "$.attributes[?(@.olmo_mix_v1_taggers__uniseg_length_paragraphs_with_doc_length_v1__document[0][2] + < 25)]" + - "$.attributes[?(@.olmo_mix_v1_taggers__ft_lang_id_en_paragraph_with_doc_score_v2__doc_en[0][2] + < 0.5)]" + - "$.attributes[?(@.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0] && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0][2] >= 100)]" + + # - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_6/mixing/c4.yaml b/configs/dolma-v1_6/mixing/c4.yaml new file 
mode 100644 index 00000000..ccf5d6ff --- /dev/null +++ b/configs/dolma-v1_6/mixing/c4.yaml @@ -0,0 +1,132 @@ +streams: +- name: c4 + documents: + - s3://ai2-llm/pretraining-data/sources/c4/v0/documents/train/*.gz + + attributes: + - olmo_mix_v1_taggers + # - perplexity_suite_v3_option2 + - dedupe_paragraphs + - dedupe_docs + - tokenizer_repetitions_v2r2 + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/c4 + max_size_in_bytes: 4294967296 + min_text_length: 1 + discard_fields: + - attributes + + # filter: + # include: [] + # exclude: + # - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] + # && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + # span_replacement: [] + + filter: + include: [] + exclude: + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] + && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character + && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] + < 0.8)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] + && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] + > 0.9)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis + && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] + > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] + && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && + @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > + 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] + > 0.2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] + 
&& @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] + > 0.18)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] + > 0.16)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] + > 0.15)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] + > 0.14)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] + > 0.13)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] + > 0.12)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] + > 0.11)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] + > 0.10)]" + - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] + && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + - "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && + @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] + > 5)]" + + # 100+ repetitions + - "$.attributes[?(@.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition + && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0] + && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0][2] + >= 100)]" + + # remove duplicate docs + - "$@.attributes[?(@.bff_duplicate_docs && @.bff_duplicate_docs[0] + && @.bff_duplicate_docs[0][2] >= 1.0)]" + span_replacement: + - span: "$.attributes.bff_duplicate_paragraph_spans" + min_score: 0.5 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_hatespeech_sentence_v2____label__toxic" + min_score: 0.4 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_nsfw_sencence_v2____label__nsfw" + min_score: 0.4 + replacement: '' + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS" + min_score: 0.5 + replacement: " |||EMAIL_ADDRESS||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER" + min_score: 0.5 + replacement: " |||PHONE_NUMBER||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS" + min_score: 0.5 + replacement: 
" |||IP_ADDRESS||| " + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_6/mixing/cc-head.yaml b/configs/dolma-v1_6/mixing/cc-head.yaml new file mode 100644 index 00000000..1d7a1f40 --- /dev/null +++ b/configs/dolma-v1_6/mixing/cc-head.yaml @@ -0,0 +1,127 @@ +streams: + +- name: cc_en_head + documents: + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_head/*.json.gz + + attributes: + - dedupe_paragraphs + - gopher_rules + - hatespeech_nsfw_cc_v3 + - pii_detection + - tokenizer_repetitions_v2r2 + - dedupe_docs_v2 + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_head + max_size_in_bytes: 4294967296 + min_text_length: 1 + discard_fields: + - attributes + + + filter: + include: [] + exclude: + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] + && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character + && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] + < 0.8)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] + && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] + > 0.9)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis + && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] + > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] + && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && + @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > + 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] + > 0.2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram + && 
@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] + > 0.18)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] + > 0.16)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] + > 0.15)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] + > 0.14)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] + > 0.13)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] + > 0.12)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] + > 0.11)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] + > 0.10)]" + + - "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && + @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] + > 5)]" + + # 100+ repetitions + - "$.attributes[?(@.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition + && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0] + && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0][2] + >= 100)]" + + # remove duplicate docs + - "$@.attributes[?(@.bff_duplicate_docs && @.bff_duplicate_docs[0] + && @.bff_duplicate_docs[0][2] >= 1.0)]" + span_replacement: + - span: "$.attributes.bff_duplicate_paragraph_spans" + min_score: 0.5 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_hatespeech_sentence_v2____label__toxic" + min_score: 0.4 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_nsfw_sencence_v2____label__nsfw" + min_score: 0.4 + replacement: '' + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS" + min_score: 0.5 + replacement: " |||EMAIL_ADDRESS||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER" + min_score: 0.5 + replacement: " |||PHONE_NUMBER||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS" + min_score: 0.5 + replacement: " |||IP_ADDRESS||| " + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" 
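The span_replacement rules in this cc_en_head stream (and the analogous C4/CC streams) rewrite sub-document spans rather than dropping whole documents: duplicate-paragraph and toxic/NSFW spans are blanked out, and PII spans are masked with ||| tokens, whenever a span's score meets the rule's min_score. Below is a minimal sketch of that mechanic, assuming attributes are stored as [start, end, score] triples keyed by attribute name (the "$.attributes." prefix from the config is dropped here); it illustrates the semantics and is not dolma's actual mixer code.

```python
# Illustrative re-implementation of the span_replacement step, not dolma's own
# code: each attribute maps to a list of [start, end, score] spans over
# doc["text"], and a span is rewritten only when score >= min_score.
def apply_span_replacements(doc: dict, rules: list[dict]) -> str:
    text = doc["text"]
    edits = []
    for rule in rules:
        for start, end, score in doc["attributes"].get(rule["span"], []):
            if score >= rule["min_score"]:
                edits.append((start, end, rule["replacement"]))
    # apply right-to-left so earlier character offsets stay valid
    for start, end, replacement in sorted(edits, reverse=True):
        text = text[:start] + replacement + text[end:]
    return text

doc = {
    "text": "Contact me at jane@example.com for details.",
    "attributes": {  # hypothetical tagger output for one document
        "pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS": [[14, 30, 1.0]],
    },
}
rules = [{
    "span": "pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS",
    "min_score": 0.5,
    "replacement": " |||EMAIL_ADDRESS||| ",
}]
print(apply_span_replacements(doc, rules))
# -> "Contact me at  |||EMAIL_ADDRESS|||  for details."
```

Applying the edits right-to-left keeps earlier character offsets valid as the text shrinks or grows.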
+processes: 188 diff --git a/configs/dolma-v1_6/mixing/cc-middle.yaml b/configs/dolma-v1_6/mixing/cc-middle.yaml new file mode 100644 index 00000000..d6822ca9 --- /dev/null +++ b/configs/dolma-v1_6/mixing/cc-middle.yaml @@ -0,0 +1,127 @@ +streams: + +- name: cc_en_middle + documents: + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_middle/*.json.gz + + attributes: + - dedupe_paragraphs_v2 + - gopher_rules + - hatespeech_nsfw_cc_v3 + - pii_detection + - tokenizer_repetitions_v2r2 + - dedupe_docs_v2 + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_middle + max_size_in_bytes: 4294967296 + min_text_length: 1 + discard_fields: + - attributes + + + filter: + include: [] + exclude: + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] + && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character + && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] + < 0.8)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] + && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] + > 0.9)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis + && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] + > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] + && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && + @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > + 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] + > 0.2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] + && 
@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] + > 0.18)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] + > 0.16)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] + > 0.15)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] + > 0.14)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] + > 0.13)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] + > 0.12)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] + > 0.11)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] + > 0.10)]" + + - "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && + @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] + > 5)]" + + # 100+ repetitions + - "$.attributes[?(@.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition + && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0] + && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0][2] + >= 100)]" + + # remove duplicate docs + - "$@.attributes[?(@.bff_duplicate_docs && @.bff_duplicate_docs[0] + && @.bff_duplicate_docs[0][2] >= 1.0)]" + span_replacement: + - span: "$.attributes.bff_duplicate_paragraph_spans" + min_score: 0.5 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_hatespeech_sentence_v2____label__toxic" + min_score: 0.4 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_nsfw_sencence_v2____label__nsfw" + min_score: 0.4 + replacement: '' + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS" + min_score: 0.5 + replacement: " |||EMAIL_ADDRESS||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER" + min_score: 0.5 + replacement: " |||PHONE_NUMBER||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS" + min_score: 0.5 + replacement: " |||IP_ADDRESS||| " + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_6/mixing/cc-tail.yaml 
b/configs/dolma-v1_6/mixing/cc-tail.yaml new file mode 100644 index 00000000..f652dfeb --- /dev/null +++ b/configs/dolma-v1_6/mixing/cc-tail.yaml @@ -0,0 +1,127 @@ +streams: + +- name: cc_en_tail + documents: + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_tail/*.json.gz + + attributes: + - dedupe_paragraphs + - gopher_rules + - hatespeech_nsfw_cc_v3 + - pii_detection + - tokenizer_repetitions_v2r2 + - dedupe_docs_v2 + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_tail + max_size_in_bytes: 4294967296 + min_text_length: 1 + discard_fields: + - attributes + + + filter: + include: [] + exclude: + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] + && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character + && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] + < 0.8)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] + && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] + > 0.9)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis + && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] + > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] + && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && + @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > + 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] + > 0.2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] + > 0.18)]" + - 
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] + > 0.16)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] + > 0.15)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] + > 0.14)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] + > 0.13)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] + > 0.12)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] + > 0.11)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] + > 0.10)]" + + - "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && + @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] + > 5)]" + + # 100+ repetitions + - "$.attributes[?(@.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition + && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0] + && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0][2] + >= 100)]" + + # remove duplicate docs + - "$@.attributes[?(@.bff_duplicate_docs && @.bff_duplicate_docs[0] + && @.bff_duplicate_docs[0][2] >= 1.0)]" + span_replacement: + - span: "$.attributes.bff_duplicate_paragraph_spans" + min_score: 0.5 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_hatespeech_sentence_v2____label__toxic" + min_score: 0.4 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_nsfw_sencence_v2____label__nsfw" + min_score: 0.4 + replacement: '' + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS" + min_score: 0.5 + replacement: " |||EMAIL_ADDRESS||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER" + min_score: 0.5 + replacement: " |||PHONE_NUMBER||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS" + min_score: 0.5 + replacement: " |||IP_ADDRESS||| " + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_6/mixing/pes2o.yaml b/configs/dolma-v1_6/mixing/pes2o.yaml new file mode 100644 index 00000000..221937f5 --- /dev/null +++ 
b/configs/dolma-v1_6/mixing/pes2o.yaml @@ -0,0 +1,46 @@ +--- +streams: +- name: pes2o_v2 + documents: + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=0/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=1/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=2/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=3/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=4/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=5/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=6/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=7/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=8/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=9/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=0/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=1/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=2/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=3/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=4/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=5/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=6/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=7/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=8/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=9/*.gz + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/pes2o + max_size_in_bytes: 4294967296 + min_text_length: 1 + discard_fields: + - attributes + + attributes: + - tokenizer_repetitions_v2r2 + + filter: + include: [] + exclude: + # 100+ repetitions + - "$.attributes[?(@.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0] && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0][2]>= 100)]" + # - "$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] + # && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]" + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_6/mixing/reddit.yaml b/configs/dolma-v1_6/mixing/reddit.yaml new file mode 100644 index 00000000..9c85f4b0 --- /dev/null +++ b/configs/dolma-v1_6/mixing/reddit.yaml @@ -0,0 +1,29 @@ + +streams: +- name: reddit-v5-dedupe-pii-nsfw-toxic + + documents: + - s3://ai2-llm/pretraining-data/sources/reddit/v5-dedupe-pii-nsfw-toxic/documents/*.gz + + attributes: + - perplexity_suite_v3_option2 + - tokenizer_repetitions_v2r2 + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/reddit + max_size_in_bytes: 4294967296 + 
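Every filter.exclude entry in these mixing configs follows the same defensive pattern: check that the attribute exists and has a first span before comparing the span's score at index [0][2], so documents that were never tagged are not excluded by accident. A hedged re-implementation of that guard-then-compare logic in plain Python (the real mixer evaluates these as JSONPath expressions; the [start, end, score] span layout is assumed as in the sketch above):

```python
# Simplified stand-in for the mixer's exclude logic (the real mixer evaluates
# JSONPath expressions): attributes are [start, end, score] spans, and every
# rule above guards @.name and @.name[0] before reading the score at [0][2].
def first_span_score(attributes: dict, name: str):
    spans = attributes.get(name)      # "@.name" guard
    if spans and spans[0]:            # "@.name[0]" guard
        return spans[0][2]            # "...[0][2]"
    return None

def is_excluded(attributes: dict) -> bool:
    score = first_span_score(
        attributes,
        "tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition",
    )
    # mirrors the ">= 100 repetitions" rule used in pes2o.yaml and the other streams
    return score is not None and score >= 100

print(is_excluded({"tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2"
                   "__doc_max_score_repetition": [[0, 2048, 250.0]]}))  # True
print(is_excluded({}))  # False: untagged documents never match an exclude rule
```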
min_text_length: 1 + discard_fields: + - attributes + + filter: + exclude: + # - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + # 100+ repetitions + - "$.attributes[?(@.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0] && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0][2] >= 100)]" + + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_6/mixing/stack.yaml b/configs/dolma-v1_6/mixing/stack.yaml new file mode 100644 index 00000000..cba68e62 --- /dev/null +++ b/configs/dolma-v1_6/mixing/stack.yaml @@ -0,0 +1,381 @@ +streams: +- name: stack-v4-train + documents: + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/abap/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/actionscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ada/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/agda/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ags-script/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/alloy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ampl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/antlr/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/apacheconf/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/api-blueprint/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/apl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/applescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/arc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/arduino/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/asciidoc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/asp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/aspectj/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ats/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/augeas/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/autohotkey/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/autoit/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/awk/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/batchfile/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/befunge/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bison/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bitbake/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/blitzbasic/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/blitzmax/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bluespec/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/boo/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/brainfuck/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/brightscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bro/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c-sharp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c++/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c2hs-haskell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cap'n-proto/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cartocss/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ceylon/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/chapel/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/chuck/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cirru/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clarion/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clean/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/click/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clips/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clojure/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cmake/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cobol/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coffeescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coldfusion/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coldfusion-cfc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/common-lisp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/component-pascal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coq/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/creole/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/crystal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/csound/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/css/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cucumber/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cuda/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cycript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cython/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/d/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/darcs-patch/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dart/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/desktop/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/diff/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/digital-command-language/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dm/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dns-zone/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dockerfile/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dogescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dylan/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/eagle/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ec/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ecere-projects/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ecl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/edn/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/eiffel/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/elixir/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/elm/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/emacs-lisp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/emberscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/erlang/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/f-sharp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/factor/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fancy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fantom/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fish/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/flux/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/forth/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fortran/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/freemarker/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/g-code/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gams/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gap/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gas/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gdscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/genshi/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gentoo-ebuild/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gentoo-eclass/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gettext-catalog/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/glsl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/glyph/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gnuplot/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/go/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/golo/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gosu/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/grace/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/grammatical-framework/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/graphql/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/graphviz-(dot)/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/groff/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/groovy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/groovy-server-pages/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/haml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/handlebars/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/harbour/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/haskell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/haxe/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/hcl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/hlsl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+django/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+eex/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+erb/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+php/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/http/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/hy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/idl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/idris/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/igor-pro/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/inform-7/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ini/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/inno-setup/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/io/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ioke/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/irc-log/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/isabelle/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/j/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jade/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jasmin/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/java/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/java-server-pages/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/javascript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jflex/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jsx/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/julia/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jupyter-notebook/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/kicad/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/kit/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/kotlin/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/krl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/labview/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lasso/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/latte/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lean/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/less/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lex/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lfe/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lilypond/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/linker-script/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/liquid/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/literate-agda/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/literate-coffeescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/literate-haskell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/livescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/llvm/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/logos/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/logtalk/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lolcode/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lookml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lsl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lua/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/m/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/m4/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/makefile/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mako/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/maple/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/markdown/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mask/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mathematica/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/matlab/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/max/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/maxscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mediawiki/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/metal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mirah/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/modelica/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/module-management-system/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/monkey/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/moonscript/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mtml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/muf/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mupad/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/myghty/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nesc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/netlinx/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/netlogo/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nginx/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nimrod/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ninja/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nit/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nix/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nsis/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nu/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/numpy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/objdump/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/objective-c++/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/objective-j/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ocaml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/octave/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/omgrofl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ooc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/opa/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/opal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/opencl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/openscad/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/org/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ox/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/oxygene/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/oz/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pan/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/papyrus/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/parrot/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/parrot-assembly/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/parrot-internal-representation/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pascal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pawn/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/perl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/perl6/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/php/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/piglatin/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pike/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pod/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pogoscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pony/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/postscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pov-ray-sdl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/powershell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/processing/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/prolog/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/propeller-spin/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/protocol-buffer/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pure-data/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/purebasic/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/purescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/python/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/python-traceback/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/qmake/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/qml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/r/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/racket/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ragel-in-ruby-host/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/raml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rdoc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/realbasic/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rebol/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/red/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/redcode/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ren'py/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/renderscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/restructuredtext/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rhtml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rmarkdown/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/robotframework/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rouge/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ruby/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rust/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sage/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/saltstack/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sas/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sass/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scala/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scaml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scheme/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scilab/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scss/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/self/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/shell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/shellsession/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/shen/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/slash/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/slim/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smali/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smalltalk/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smarty/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smt/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/solidity/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sourcepawn/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sparql/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sqf/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sql/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/squirrel/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/stan/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/standard-ml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/stata/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ston/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/stylus/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/supercollider/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/swift/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/systemverilog/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tcl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tcsh/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tea/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tex/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/text/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/textile/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/thrift/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/toml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/turing/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/turtle/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/twig/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/txl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/typescript/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/unified-parallel-c/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/unity3d-asset/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/uno/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/unrealscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/urweb/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vala/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vcl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/verilog/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vhdl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/viml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/visual-basic/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/volt/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vue/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/web-ontology-language/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/webassembly/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/webidl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/wisp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/x10/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xbase/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xojo/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xpages/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xproc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xquery/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xs/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xslt/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xtend/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/yacc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/yaml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/yang/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/zephir/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/zig/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/zimpl/*.gz + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/stack + max_size_in_bytes: 4294967296 + min_text_length: 1 + discard_fields: + - attributes + + attributes: + # - perplexity_suite_v3_option2 + - dedupe_docs + - tokenizer_repetitions_v2r2 + + filter: + include: [] + exclude: + # 100+ repetitions + - "$.attributes[?(@.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0] && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0][2]>= 100)]" + + # remove duplicate docs + - 
"$@.attributes[?(@.bff_duplicate_docs && @.bff_duplicate_docs[0] && @.bff_duplicate_docs[0][2] >= 1.0)]" + + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_6/mixing/wiki.yaml b/configs/dolma-v1_6/mixing/wiki.yaml new file mode 100644 index 00000000..5c9eb7b4 --- /dev/null +++ b/configs/dolma-v1_6/mixing/wiki.yaml @@ -0,0 +1,36 @@ +--- +streams: +- name: en_simple_wiki_v0 + documents: + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=en/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=simple/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=en/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=simple/*.gz + attributes: + # - perplexity_suite_v3_option2 + - olmo_mix_v1_taggers + - tokenizer_repetitions_v2r2 + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/wiki + max_size_in_bytes: 4294967296 + min_text_length: 1 + discard_fields: + - attributes + + filter: + exclude: + - "$.attributes[?(@.olmo_mix_v1_taggers__uniseg_length_paragraphs_with_doc_length_v1__document[0][2] < 25)]" + + # 100+ repetitions + - "$.attributes[?(@.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0] && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0][2]>= 100)]" + + # remove duplicate docs + # - "$@.attributes[?(@.bff_duplicate_docs && @.bff_duplicate_docs[0] && @.bff_duplicate_docs[0][2] >= 1.0)]" + + # - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_6/sample.yaml b/configs/dolma-v1_6/sample.yaml new file mode 100644 index 00000000..3ad0a66c --- /dev/null +++ b/configs/dolma-v1_6/sample.yaml @@ -0,0 +1,31 @@ +--- +streams: +- name: v1_5r2_sample + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/books/*.json.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/c4/*.json.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_head/*.json.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_middle/*.json.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_tail/*.json.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/pes2o/*.json.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/reddit/*.json.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/stack/*.json.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/wiki/*.json.gz + attributes: + - random_number_v1 + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2_03p_sample/documents/ + max_size_in_bytes: 53_687_091_200 + min_text_length: 1 + discard_fields: + - attributes + + filter: + exclude: + - "$.attributes[?(@.random_number_v1__random_number_v1__random[0][2] >= 0.003)]" + +work_dir: + input: "/tmp/olmo-mix-v1_5r2/input" + output: "/tmp/olmo-mix-v1_5r2/output" +processes: 188 diff --git a/configs/dolma-v1_6/sample/cc-head.yaml b/configs/dolma-v1_6/sample/cc-head.yaml new file mode 100644 index 00000000..fbe474b4 --- /dev/null 
+++ b/configs/dolma-v1_6/sample/cc-head.yaml @@ -0,0 +1,18 @@ + +streams: +- name: cc_en_head + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_head/*.gz + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2-sample/documents/cc_en_head + max_size_in_bytes: 3894967296 + attributes: + - random_number_v1 + filter: + include: + - "$.attributes[?(@.random_number_v1__random_number_v1__random[0][2] < 0.5104606781)]" + +work_dir: + input: "/tmp/cc-head-sample/mixer/input" + output: "/tmp/cc-head-sample/mixer/output" +processes: 188 diff --git a/configs/dolma-v1_6/sample/cc-middle.yaml b/configs/dolma-v1_6/sample/cc-middle.yaml new file mode 100644 index 00000000..0f4d1aff --- /dev/null +++ b/configs/dolma-v1_6/sample/cc-middle.yaml @@ -0,0 +1,18 @@ + +streams: +- name: cc_en_middle + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_middle/*.gz + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-sample/documents/cc_en_middle + max_size_in_bytes: 3894967296 + attributes: + - random_number_v1 + filter: + include: + - "$.attributes[?(@.random_number_v1__random_number_v1__random[0][2] < 0.5104606781)]" + +work_dir: + input: "/tmp/cc-head-sample/mixer/input" + output: "/tmp/cc-head-sample/mixer/output" +processes: 188 diff --git a/configs/dolma-v1_6/sample/cc-tail.yaml b/configs/dolma-v1_6/sample/cc-tail.yaml new file mode 100644 index 00000000..d07547a3 --- /dev/null +++ b/configs/dolma-v1_6/sample/cc-tail.yaml @@ -0,0 +1,18 @@ + +streams: +- name: cc_en_tail + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_tail/*.gz + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-sample/documents/cc_en_tail + max_size_in_bytes: 3894967296 + attributes: + - random_number_v1 + filter: + include: + - "$.attributes[?(@.random_number_v1__random_number_v1__random[0][2] < 0.5104606781)]" + +work_dir: + input: "/tmp/cc-head-sample/mixer/input" + output: "/tmp/cc-head-sample/mixer/output" +processes: 188 diff --git a/configs/dolma-v1_6/tokenizer.yaml b/configs/dolma-v1_6/tokenizer.yaml new file mode 100644 index 00000000..25ebedd7 --- /dev/null +++ b/configs/dolma-v1_6/tokenizer.yaml @@ -0,0 +1,7 @@ +destination: s3://ai2-llm/preprocessed/olmo-mix/v1_5r2/gpt-neox-olmo-dolma-v1_5 +documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/*/*.json.gz +processes: 188 +seed: 3920 +max_size: 20_000_000_000 +tokenizer_name_or_path: allenai/gpt-neox-olmo-dolma-v1_5 diff --git a/configs/dolma-v1_6/tokenizer_v16_sc.yaml b/configs/dolma-v1_6/tokenizer_v16_sc.yaml new file mode 100644 index 00000000..573da56c --- /dev/null +++ b/configs/dolma-v1_6/tokenizer_v16_sc.yaml @@ -0,0 +1,23 @@ +destination: ${oc.env:HOME}/ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated-sc/gpt-neox-olmo-dolma-v1_6 +documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-decontaminated/documents/books + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-decontaminated/documents/c4 + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-decontaminated/documents/cc_en_head + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-decontaminated/documents/cc_en_middle + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-decontaminated/documents/cc_en_tail + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-decontaminated/documents/pes2o + - 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-decontaminated/documents/reddit + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-decontaminated/documents/stack + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-decontaminated/documents/wiki + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/starcoder/v0/documents/*/*.json.gz + +processes: 168 +seed: 3920 +max_size: 21_474_836_480 + +tokenizer: + name_or_path: allenai/gpt-neox-olmo-dolma-v1_5 + bos_token_id: null + eos_token_id: 50279 + pad_token_id: 1 + segment_before_tokenization: false diff --git a/docs/assets/dolma-datasheet-v0.1.pdf b/docs/assets/dolma-v0_1-20230819.pdf similarity index 100% rename from docs/assets/dolma-datasheet-v0.1.pdf rename to docs/assets/dolma-v0_1-20230819.pdf diff --git a/docs/assets/dolma-v1_6-20240131.pdf b/docs/assets/dolma-v1_6-20240131.pdf new file mode 100644 index 00000000..e60b2a66 Binary files /dev/null and b/docs/assets/dolma-v1_6-20240131.pdf differ diff --git a/docs/deduplication.md b/docs/deduplication.md index 945363ef..1b4177fe 100644 --- a/docs/deduplication.md +++ b/docs/deduplication.md @@ -25,7 +25,9 @@ The following parameters are supported either via CLI (e.g. `dolma dedupe --para |`dedupe.documents.key`| Mutually exclusive with `dedupe.paragraphs.attribute_name` | Use the json-path-specified field as the key for deduping. The value of the key must be a string. | |`dedupe.documents.attribute_name`|Mutually exclusive with `dedupe.paragraphs.attribute_name`| Name of the attribute to set if the document is a duplicate. | |`dedupe.paragraphs.attribute_name`|Mutually exclusive with `dedupe.documents.key` and `dedupe.documents.attribute_name` | Name of the attribute that will contain spans of duplicate paragraphs. Paragraphs are identified by splitting the `text` field by newline characters. | -|`dedupe.skip_empty`|No| If true, empty documents/paragraphs will be skipped. | +|`dedupe.skip_empty`|No| If true, empty documents/paragraphs will be skipped.| +|`dedupe.min_length`|No| Minimum length of documents/paragraphs to be deduplicated. Defaults to 0.| +|`dedupe.min_words`|No| Minimum number of uniseg word units in documents/paragraphs to be deduplicated. Defaults to 0.| |`bloom_filter.file`|Yes| Save the Bloom filter to this file after processing. If present at startup, the Bloom filter will be loaded from this file. | |`bloom_filter.size_in_bytes`| Mutually exclusive with `bloom_filter.estimated_doc_count` and `bloom_filter.desired_false_positive_rate`| Used to set the size of the Bloom filter (in bytes). | |`bloom_filter.read_only`|No| If true, do not write to the Bloom filter. Useful for things like deduping against a precomputed list of blocked attributes (e.g. URLs) or for decontamination against test data. 
| diff --git a/pyproject.toml b/pyproject.toml index a8d6dc84..49ad7c6e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dolma" -version = "0.9.4" +version = "1.0.0" description = "Data filters" license = {text = "Apache-2.0"} readme = "README.md" @@ -16,7 +16,8 @@ dependencies = [ "msgspec>=0.14.2", "nltk==3.8.1", "omegaconf>=2.3.0", - "pycld2==0.41", + "LTpycld2==0.42", # fork of pycld2 that works on Apple Silicon + # "pycld2==0.41", # "pycld3==0.22", # does not install correctly "pyyaml", "requests", @@ -32,10 +33,14 @@ dependencies = [ "charset-normalizer>=3.2.0" ] classifiers = [ - "Development Status :: 4 - Beta", - "Typing :: Typed", - "Programming Language :: Rust", + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Rust", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Text Processing", + "Typing :: Typed", ] [[project.authors]] diff --git a/python/dolma/cli/__init__.py b/python/dolma/cli/__init__.py index 86167196..443c2ad2 100644 --- a/python/dolma/cli/__init__.py +++ b/python/dolma/cli/__init__.py @@ -31,7 +31,7 @@ from rich.console import Console from rich.syntax import Syntax -from dolma.core.errors import DolmaConfigError +from ..core.errors import DolmaConfigError __all__ = [ "BaseCli", @@ -148,7 +148,17 @@ def namespace_to_nested_omegaconf(args: Namespace, structured: Type[T], config: base_structured_config: DictConfig = om.structured(structured) merged_config = om.merge(base_structured_config, untyped_config) - assert isinstance(merged_config, DictConfig) + + # check for type + if not isinstance(merged_config, DictConfig): + raise DolmaConfigError(f"Expected a DictConfig, got {type(merged_config).__name__}") + + # try resolving all cross references in the config, raise a DolmaConfigError if it fails + try: + om.resolve(merged_config) + except OmegaConfBaseException as ex: + raise DolmaConfigError(f"Invalid error while parsing key `{ex.full_key}`: {type(ex).__name__}") from ex + return merged_config # pyright: ignore diff --git a/python/dolma/cli/__main__.py b/python/dolma/cli/__main__.py index 1e6a62dd..f40ea85f 100644 --- a/python/dolma/cli/__main__.py +++ b/python/dolma/cli/__main__.py @@ -1,10 +1,12 @@ import multiprocessing from argparse import ArgumentParser from pathlib import Path -from typing import List, Optional +from typing import Any, Dict, List, Optional, Union +import smart_open from yaml import safe_load +from ..core.paths import exists from .analyzer import AnalyzerCli from .deduper import DeduperCli from .mixer import MixerCli @@ -27,19 +29,33 @@ } +def read_config(path: Union[None, str]) -> Dict[str, Any]: + """Read a configuration file if it exists""" + if path is None: + return {} + + if not exists(path): + raise FileNotFoundError(f"Config file {path} does not exist") + + with smart_open.open(path, mode="rt") as f: + return dict(safe_load(f)) + + def main(argv: Optional[List[str]] = None): + """Main entry point for the CLI""" + try: # attempting to set start method to spawn in case it is not set multiprocessing.set_start_method("spawn") - except RuntimeError: + except RuntimeError as ex: # method already set, check if it is set to spawn if multiprocessing.get_start_method() != "spawn": - raise RuntimeError("Multiprocessing start method must be set to spawn") + raise RuntimeError("Multiprocessing start method 
must be set to spawn") from ex parser = ArgumentParser( prog="dolma", - usage="dolma [command] [options]", - description="Command line interface for the DOLMa dataset processing toolkit", + usage="dolma {global options} [command] {command options}", + description="Command line interface for the Dolma processing toolkit", ) parser.add_argument( "-c", @@ -48,20 +64,28 @@ def main(argv: Optional[List[str]] = None): type=Path, default=None, ) + + # Continue by adding subparsers and parsing the arguments subparsers = parser.add_subparsers(dest="command") subparsers.required = True subparsers.choices = AVAILABLE_COMMANDS.keys() # type: ignore - for command, cli in AVAILABLE_COMMANDS.items(): cli.make_parser(subparsers.add_parser(command, help=cli.DESCRIPTION)) + # parse the arguments args = parser.parse_args(argv) - # try parsing the config file - config: Optional[dict] = None - if config_path := args.__dict__.pop("config"): - assert config_path.exists(), f"Config file {config_path} does not exist" - with open(config_path) as f: - config = dict(safe_load(f)) + # first, get the command and config path to run + command = args.__dict__.pop("command") + config_path = args.__dict__.pop("config", None) or None + + # remove the other optional arguments from the top level parser + args.__dict__.pop("dolma_version", None) + args.__dict__.pop("dolma_commands", None) + + # read the config file if one was provided + config = read_config(config_path) - AVAILABLE_COMMANDS[args.__dict__.pop("command")].run_from_args(args=args, config=config) + # get the cli for the command and run it with the config we just loaded + the args + cli = AVAILABLE_COMMANDS[command] + return cli.run_from_args(args=args, config=config) diff --git a/python/dolma/cli/deduper.py b/python/dolma/cli/deduper.py index 5d48bed7..ef5ae3ca 100644 --- a/python/dolma/cli/deduper.py +++ b/python/dolma/cli/deduper.py @@ -62,6 +62,10 @@ class DedupeConfig: default=None, help="Configuration for paragraph deduplication" ) skip_empty: Optional[bool] = field(default=False, help="If true, empty documents/paragraphs will be skipped") + min_length: Optional[int] = field(default=0, help="Minimum length of documents/paragraphs to be deduplicated") + min_words: Optional[int] = field( + default=0, help="Minimum number of uniseg word units in documents/paragraphs to be deduplicated" + ) @dataclass @@ -93,9 +97,19 @@ def run(cls, parsed_config: DeduperConfig): work_dirs = stack.enter_context(make_workdirs(parsed_config.work_dir)) # create a dedupe config to populate - dedupe_dict_config: Dict[str, Any] = {"skip_empty": parsed_config.dedupe.skip_empty} + dedupe_dict_config: Dict[str, Any] = { + "skip_empty": parsed_config.dedupe.skip_empty, + "min_length": parsed_config.dedupe.min_length, + "min_words": parsed_config.dedupe.min_words, + } try_name = parsed_config.dedupe.name if not om.is_missing(parsed_config.dedupe, "name") else None + if dedupe_dict_config["min_length"] < 0: + raise ValueError("min_length must be >= 0") + + if dedupe_dict_config["min_words"] < 0: + raise ValueError("min_words must be >= 0") + # add either the document or paragraph dedupe config if not ( om.is_missing(parsed_config.dedupe.documents, "attribute_name") diff --git a/python/dolma/cli/mixer.py b/python/dolma/cli/mixer.py index 814c7a49..60c531b9 100644 --- a/python/dolma/cli/mixer.py +++ b/python/dolma/cli/mixer.py @@ -16,6 +16,7 @@ class StreamOutputConfig: default=2 * 2**30, help="Maximum size of the output file in bytes. Defaults to 2GB." 
) discard_fields: List[str] = field(default=[], help="List of fields to discard from the output documents.") + min_text_length: Optional[int] = field(default=0, help="Minimum length of the text in the output documents.") @dataclass @@ -120,6 +121,11 @@ def run(cls, parsed_config: MixerConfig): "max_size_in_bytes": int(stream_config.output.max_size_in_bytes), } + if stream_config.output.min_text_length: + stream_config_dict["output"]["min_text_length"] = int(stream_config.output.min_text_length) + if stream_config.output.min_text_length < 0: + raise ValueError("min_text_length must be >= 0") + if stream_config.output.discard_fields: stream_config_dict["output"]["discard_fields"] = [ str(f) for f in stream_config.output.discard_fields diff --git a/python/dolma/core/paths.py b/python/dolma/core/paths.py index 5d6ec97e..24d648d2 100644 --- a/python/dolma/core/paths.py +++ b/python/dolma/core/paths.py @@ -35,6 +35,7 @@ RE_GLOB_CLOSE_ESCAPE = re.compile(r"(? AbstractFileSystem: @@ -45,10 +46,15 @@ def _get_fs(path: Union[Path, str]) -> AbstractFileSystem: protocol = urlparse(path).scheme fs = get_filesystem_class(protocol)(**FS_KWARGS.get(protocol, {})) + global PATCHED_GLOB # pylint: disable=global-statement + # patch glob method to support recursive globbing - if protocol == "": + if protocol == "" and not PATCHED_GLOB: fs.glob = partial(glob.glob, recursive=True) + # only patch once + PATCHED_GLOB = True + return fs @@ -246,6 +252,13 @@ def add_suffix(a: str, b: str) -> str: return join_path(prot_a, str(path_a / path_b)) +def exists(path: str) -> bool: + """Check if a path exists.""" + + fs = _get_fs(path) + return fs.exists(path) + + def mkdir_p(path: str) -> None: """ Create a directory if it does not exist. diff --git a/python/dolma/core/runtime.py b/python/dolma/core/runtime.py index a181e3e2..f4256f98 100644 --- a/python/dolma/core/runtime.py +++ b/python/dolma/core/runtime.py @@ -194,11 +194,17 @@ def _write_sample_to_streams( attributes_by_stream: Dict[str, TaggerOutputDictType] = {} for tagger_name, tagger_data in samples_collectors.items(): tagger_output = taggers_paths[tagger_name] + + # if not set; it will potentially not write to the output stream + # in case a tagger emits no spans + attributes_by_stream[tagger_output.path] = {} + for tagger_key, tagger_value in tagger_data.items(): tagger_key = f"{tagger_output.exp}__{tagger_output.name}__{make_variable_name(tagger_key)}" - attributes_by_stream.setdefault(tagger_output.path, {})[tagger_key] = tagger_value + attributes_by_stream[tagger_output.path][tagger_key] = tagger_value for stream_path, attributes in attributes_by_stream.items(): + # actually write output = OutputSpec(source=row.source, id=row.id, attributes=attributes) output_streams[stream_path].write(output) diff --git a/python/dolma/taggers/language.py b/python/dolma/taggers/language.py index 504fc2ab..3a518fd4 100644 --- a/python/dolma/taggers/language.py +++ b/python/dolma/taggers/language.py @@ -101,6 +101,27 @@ def predict(self, doc: Document) -> DocResult: return DocResult(doc=doc, spans=spans) +@TaggerRegistry.add("ft_lang_id_doc_v1") +class FastTextAllLanguagesDocumentTagger(BaseFastTextTagger): + MODEL_PATH = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin" + + def __init__(self): + super().__init__(model_path=self.MODEL_PATH, model_mode=self.DOCUMENT_LEVEL_TAGGER) + + def predict_slice(self, text_slice: TextSlice) -> Iterable[Prediction]: + preds = self.classifier.predict(text_slice.text.lower().replace("\n", " ").strip(), k=-1) + return [ 
+ Prediction(label=label.replace("__label__", ""), score=score) + for label, score in sorted(zip(*preds), key=lambda x: x[1], reverse=True) + ] + + +@TaggerRegistry.add("ft_lang_id_paragraph_v1") +class FastTextAllLanguageParagraphTagger(FastTextAllLanguagesDocumentTagger): + def __init__(self): + BaseFastTextTagger.__init__(self, model_path=self.MODEL_PATH, model_mode=self.PARAGRAPH_LEVEL_TAGGER) + + @TaggerRegistry.add("ft_lang_id_en_doc_v2") class FastTextEnglishLanguageDocumentTagger(BaseFastTextTagger): MODEL_PATH = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin" diff --git a/python/dolma/taggers/repetitions/__init__.py b/python/dolma/taggers/repetitions/__init__.py index 3d941851..9f4e08ac 100644 --- a/python/dolma/taggers/repetitions/__init__.py +++ b/python/dolma/taggers/repetitions/__init__.py @@ -1,7 +1,13 @@ from .repetitions_taggers import ( ParagraphRepetitionsTagger, RepetitionsTagger, + TokenizerRepetitionsSkipEmptyTagger, TokenizerRepetitionsTagger, ) -__all__ = ["RepetitionsTagger", "ParagraphRepetitionsTagger", "TokenizerRepetitionsTagger"] +__all__ = [ + "RepetitionsTagger", + "ParagraphRepetitionsTagger", + "TokenizerRepetitionsTagger", + "TokenizerRepetitionsSkipEmptyTagger", +] diff --git a/python/dolma/taggers/repetitions/repetitions_taggers.py b/python/dolma/taggers/repetitions/repetitions_taggers.py index 44bc2bbe..6145458f 100644 --- a/python/dolma/taggers/repetitions/repetitions_taggers.py +++ b/python/dolma/taggers/repetitions/repetitions_taggers.py @@ -19,6 +19,8 @@ class BaseRepetitionsTagger(BaseTagger): + keep_stats_when_empty: bool = True + @abstractmethod def _extract_from_text(self, text: str) -> Generator[Span, None, None]: raise NotImplementedError() @@ -30,28 +32,30 @@ def _compute_document_stats(self, spans: List[Span], doc: Document) -> List[Span doc_max_span = Span( start=0, end=len(doc.text), - type="doc_max_repetition", + type="doc_max_score_repetition", score=max(spans, key=lambda s: s.score).score if spans else 0.0, ) doc_mean_reps_span = Span( start=0, end=len(doc.text), - type="doc_mean_repetition", - score=float(np.mean([s.score for s in spans]) if spans else 0), + type="doc_max_length_repetition", + score=max(s.end - s.start for s in spans) if spans else 0, ) doc_frac_reps_span = Span( start=0, end=len(doc.text), type="doc_frac_repetition", - score=float(sum([s.score for s in spans]) / len(doc.text) if spans else 0), + score=float(sum(s.end - s.start for s in spans) / len(doc.text) if spans else 0), ) return [doc_max_span, doc_mean_reps_span, doc_frac_reps_span] def predict(self, doc: Document) -> DocResult: """Predict method for the tagger.""" - reps_spans = list(self._extract_from_doc(doc)) - document_stats_spans = self._compute_document_stats(spans=reps_spans, doc=doc) - return DocResult(doc=doc, spans=reps_spans + document_stats_spans) + span_reps = list(self._extract_from_doc(doc)) + if self.keep_stats_when_empty or span_reps: + span_reps += self._compute_document_stats(spans=span_reps, doc=doc) + + return DocResult(doc=doc, spans=span_reps) @TaggerRegistry.add("repetitions_v1") @@ -66,12 +70,15 @@ def __init__(self) -> None: def _extract_from_text(self, text: str) -> Generator[Span, None, None]: """Extract repetitions of characters in the text.""" for match in self.re_char_repetitions.finditer(text): - yield Span( - start=(start := match.start()), - end=(end := match.end()), + repeated_text = match.group(1) + span = Span( + start=match.start(), + end=match.end(), type="repetition", - score=float(end - start), + 
# score=float(end - start) // len(repeated_text), + score=match.group(0).count(repeated_text), ) + yield span @TaggerRegistry.add("paragraph_repetitions_v1") @@ -110,19 +117,20 @@ def _extract_from_text(self, text: str) -> Generator[Span, None, None]: arr=np.array(tokens.ids), min_period=self.MIN_PERIOD, max_period=self.MAX_PERIOD ) for seq in sequences_iter: - yield Span( - start=(s := tokens.offsets[seq.start][0]), - end=(e := tokens.offsets[seq.end - 1][1]), + out = Span( + start=tokens.offsets[seq.start][0], + end=tokens.offsets[seq.end - 1][1], type="repetition", - score=float(e - s), + score=seq.times, ) + yield out @TaggerRegistry.add("paragraph_tokenizer_repetitions_v1") class ParagraphTokenizerRepetitionsTagger(TokenizerRepetitionsTagger): """Tagger to detect repetitions of tokens in paragraphs. - It's faster than the tokenizer repetition tagger, but it does not account for - repetitions of tokens that span multiple paragraphs.""" + It's faster than the tokenizer repetition tagger, but it does not account + for repetitions of tokens that span multiple paragraphs.""" def _extract_from_doc(self, doc: Document) -> Generator[Span, None, None]: offset = 0 @@ -134,3 +142,33 @@ def _extract_from_doc(self, doc: Document) -> Generator[Span, None, None]: span.end += offset - 1 yield span offset += len(paragraph.text) + + +@TaggerRegistry.add("tokenizer_repetitions_v2r2") +class TokenizerRepetitionsSkipEmptyTagger(TokenizerRepetitionsTagger): + keep_stats_when_empty: bool = False + max_length: int = 100_000 + + def _extract_from_text(self, text: str) -> Generator[Span, None, None]: + sorted_spans = sorted( + super()._extract_from_text(text), key=lambda span: (span.start, -span.end, -span.score) + ) + prev_start = prev_end = -1 + for span in sorted_spans: + if span.start >= prev_start and span.end <= prev_end: + # avoid overlapping spans + continue + + prev_start = span.start + prev_end = span.end + yield span + + def _extract_from_doc(self, doc: Document) -> Generator[Span, None, None]: + offset = 0 + for i in range(0, len(doc.text), self.max_length): + text = doc.text[i : i + self.max_length] + for span in self._extract_from_text(text): + span.start += offset + span.end += offset + yield span + offset += len(text) diff --git a/python/dolma/tokenizer/executor.py b/python/dolma/tokenizer/executor.py index 9dff7b3b..c5673c47 100644 --- a/python/dolma/tokenizer/executor.py +++ b/python/dolma/tokenizer/executor.py @@ -80,6 +80,9 @@ def process_single(cls, source_path: str, destination_path: str, queue: QueueTyp update_interval = 1 mm_cnt = 0 + # def test(**kwargs): + # breakpoint() + # create the tokenizer from file if it exists, otherwise from pretrained if os.path.exists(tokenizer_name_or_path) and os.path.isfile(tokenizer_name_or_path): tokenizer = Tokenizer.from_file(tokenizer_name_or_path, **tokenizer_kwargs) @@ -229,7 +232,12 @@ def tokenize_in_parallel( # do it once so it gets cached (unless it's local path, so no need) if not os.path.exists(tokenizer_name_or_path): - Tokenizer.from_pretrained(tokenizer_name_or_path) + Tokenizer.from_pretrained( + identifier=tokenizer_name_or_path, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + pad_token_id=pad_token_id, + ) # get a run hash run_hash = hashlib.sha256(("".join(sources) + tokenizer_name_or_path).encode("utf-8")).hexdigest()[:8] diff --git a/python/dolma/tokenizer/tokenizer.py b/python/dolma/tokenizer/tokenizer.py index 9922ac0f..0809d0b8 100644 --- a/python/dolma/tokenizer/tokenizer.py +++ b/python/dolma/tokenizer/tokenizer.py 
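
The `tokenizer_repetitions_v2r2` tagger added above de-overlaps its spans by sorting on `(start, -end, -score)` and dropping any span fully contained in the previously kept one. A minimal standalone sketch of that containment check, using plain tuples instead of dolma's `Span` type (the numeric values below are made up for illustration):

```python
# Sketch of the containment-skip used by TokenizerRepetitionsSkipEmptyTagger:
# spans are visited in (start, -end, -score) order, so a span that starts at or
# after the previous kept span and ends at or before it is fully contained and
# gets dropped; partially overlapping spans are still kept, as in the tagger above.
from typing import List, Tuple

Span = Tuple[int, int, float]  # (start, end, score) -- illustrative stand-in

def drop_contained(spans: List[Span]) -> List[Span]:
    kept: List[Span] = []
    prev_start = prev_end = -1
    for start, end, score in sorted(spans, key=lambda s: (s[0], -s[1], -s[2])):
        if start >= prev_start and end <= prev_end:
            continue  # fully contained in the previously kept span
        prev_start, prev_end = start, end
        kept.append((start, end, score))
    return kept

# toy data: the (2, 5) span lies inside (0, 10) and is removed
assert drop_contained([(0, 10, 3.0), (2, 5, 2.0), (8, 20, 4.0)]) == [(0, 10, 3.0), (8, 20, 4.0)]
```
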
@@ -1,7 +1,6 @@ from __future__ import annotations import json -import os import re from enum import Enum from functools import cached_property @@ -14,7 +13,6 @@ import msgspec import smart_open from omegaconf import DictConfig -from omegaconf.omegaconf import OmegaConf as om from tokenizers import Tokenizer as BaseTokenizer from ..core.errors import DolmaConfigError @@ -73,11 +71,11 @@ def __init__( self.base_tokenizer.no_truncation() self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id if pad_token_id is not None else eos_token_id + self.pad_token_id = pad_token_id - if self.pad_token_id: - logger.warning("No pad token ID provided; using 0.") - self.pad_token_id = 0 + if self.pad_token_id is None: + logger.warning(f"No pad token ID provided; using EOS token ID {eos_token_id}.") + self.pad_token_id = eos_token_id self.truncate_to = truncate_to self.truncate_direction = TruncationDirection(truncate_direction) @@ -173,27 +171,27 @@ def from_file(cls, filename: PathOrStr, **kwargs) -> "Tokenizer": base_tokenizer = BaseTokenizer.from_file(filename) return cls(base_tokenizer=base_tokenizer, **kwargs) - @classmethod - def from_checkpoint(cls, checkpoint_dir: PathOrStr) -> "Tokenizer": - """ - Load a tokenizer from a checkpoint. - """ - from cached_path import cached_path - - # Load configs. - config_path = cached_path(os.path.join(checkpoint_dir, "config.yaml")) - tokenizer_config = om.load(config_path).tokenizer - model_config = om.load(config_path).model - - # Initialize tokenizer and validate vocab size. - tokenizer = cls.from_pretrained( - tokenizer_config.identifier, - eos_token_id=model_config.eos_token_id, - pad_token_id=model_config.pad_token_id, - ) - if model_config.vocab_size != tokenizer.vocab_size: - raise DolmaConfigError("vocab size mismatch between config and tokenizer") - return tokenizer + # @classmethod + # def from_checkpoint(cls, checkpoint_dir: PathOrStr) -> "Tokenizer": + # """ + # Load a tokenizer from a checkpoint. + # """ + # from cached_path import cached_path + + # # Load configs. + # config_path = cached_path(os.path.join(checkpoint_dir, "config.yaml")) + # tokenizer_config = om.load(config_path).tokenizer + # model_config = om.load(config_path).model + + # # Initialize tokenizer and validate vocab size. + # tokenizer = cls.from_pretrained( + # tokenizer_config.identifier, + # eos_token_id=model_config.eos_token_id, + # pad_token_id=model_config.pad_token_id, + # ) + # if model_config.vocab_size != tokenizer.vocab_size: + # raise DolmaConfigError("vocab size mismatch between config and tokenizer") + # return tokenizer def add_special_tokens(self, input_ids: List[int]) -> List[int]: """ diff --git a/scripts/attributes_heatmap.py b/scripts/attributes_heatmap.py index c97e0b8c..2f417551 100644 --- a/scripts/attributes_heatmap.py +++ b/scripts/attributes_heatmap.py @@ -17,16 +17,17 @@ if os.path.exists("corr.csv"): corr = pd.read_csv("corr.csv", index_col=0) else: - # A line is e.g. + # A line is e.g. 
# {"gopher_span": [], "decontamination_span": [], "hatespeech_span": [], "pii_span": [], "dedupe_paragraphs_span": [[0, 615, 1.0], [615, 1214, 1.0], [1214, 1853, 1.0], [1853, 2417, 1.0], [2417, 2849, 1.0]]} df = pd.read_json( - #"/home/niklas/dolma/tmp.jsonl/cc_en_head-0000.json", lines=True - "cc_en_head_stats10.jsonl", lines=True + # "/home/niklas/dolma/tmp.jsonl/cc_en_head-0000.json", lines=True + "cc_en_head_stats10.jsonl", + lines=True, ) ### Matching based on the entire doc ### # Where the span is not empty turn it into True, elsewhere into False # Compute correlations between the attributes to later turn it into a heatmap - corr = df.map(lambda x: bool(x)).corr(method='pearson') + corr = df.map(lambda x: bool(x)).corr(method="pearson") ### Matching based on spans ### """ @@ -51,7 +52,7 @@ corr = matrix / len(df) corr *= 100 # Add the column names - corr = pd.DataFrame(corr, columns=columns, index=columns) + corr = pd.DataFrame(corr, columns=columns, index=columns) """ # Plot the heatmap @@ -60,19 +61,19 @@ mask = np.triu(np.ones_like(corr, dtype=bool)) heatmap = sns.heatmap( corr.rename(columns=COLNAME_TO_LABEL, index=COLNAME_TO_LABEL), - mask=mask, - vmin=corr.values.min(), - vmax=corr.values[~mask].max(), # Max ignoring the ones in corr - annot=True, - cmap='Blues', + mask=mask, + vmin=corr.values.min(), + vmax=corr.values[~mask].max(), # Max ignoring the ones in corr + annot=True, + cmap="Blues", linewidths=0.5, annot_kws={"fontsize": 32}, - cbar=False, # No legend + cbar=False, # No legend ) -heatmap.set_xticklabels(heatmap.get_xmajorticklabels(), fontsize=32)#, fontweight="bold") -heatmap.set_yticklabels(heatmap.get_ymajorticklabels(), fontsize=32)#, fontweight="bold") +heatmap.set_xticklabels(heatmap.get_xmajorticklabels(), fontsize=32) # , fontweight="bold") +heatmap.set_yticklabels(heatmap.get_ymajorticklabels(), fontsize=32) # , fontweight="bold") corr.to_csv("corr.csv") -plt.savefig('attributes_heatmap_docbased_9mdocs.pdf', dpi=450, bbox_inches='tight') -plt.savefig('attributes_heatmap_docbased_9mdocs.png', dpi=450, bbox_inches='tight') +plt.savefig("attributes_heatmap_docbased_9mdocs.pdf", dpi=450, bbox_inches="tight") +plt.savefig("attributes_heatmap_docbased_9mdocs.png", dpi=450, bbox_inches="tight") diff --git a/scripts/code_reasoning_ablations.py b/scripts/code_reasoning_ablations.py index 72204804..9f9d4070 100644 --- a/scripts/code_reasoning_ablations.py +++ b/scripts/code_reasoning_ablations.py @@ -5,20 +5,17 @@ import argparse import os import random +from abc import ABC import numpy as np import pandas as pd import torch - -from abc import ABC - -from torch.utils.data import DataLoader from accelerate import Accelerator -from tqdm import tqdm - from datasets import load_dataset from hf_olmo import OLMoForCausalLM, OLMoTokenizerFast from rouge_score import rouge_scorer +from torch.utils.data import DataLoader +from tqdm import tqdm from transformers import DataCollatorForSeq2Seq, get_scheduler models_checkpoints = { @@ -43,9 +40,13 @@ def get_args(): args.add_argument("--per_device_train_batch_size", type=int, default=8, help="Batch size") args.add_argument("--test_per_device_train_batch_size", type=int, default=16, help="Test Batch size") args.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate") - args.add_argument("--n-solutions-sampled", default=20, type=int, help="Number of solutions to sample per example") + args.add_argument( + "--n-solutions-sampled", default=20, type=int, help="Number of solutions to sample per example" + ) 
args.add_argument("--num-train-epochs", type=int, default=3, help="Number of training epochs") - args.add_argument("--warmup_ratio", type=float, default=0.03, help="Ratio of total training steps used for warmup.") + args.add_argument( + "--warmup_ratio", type=float, default=0.03, help="Ratio of total training steps used for warmup." + ) return args.parse_args() @@ -75,7 +76,9 @@ def build_prompt(self, test_example, demonstrations): return prompt def get_num_new_tokens(self, test_example, tokenizer): - # generate a bit more tokens than gold (otherwise we might think prediction is correct even though the actual prediction is longer, thus wrong) - this can be replaced with a stop token + # generate a bit more tokens than gold (otherwise we might think + # prediction is correct even though the actual prediction is longer, thus + # wrong) - this can be replaced with a stop token return len(tokenizer.encode(test_example["answer"], add_special_tokens=False)) + 5 def compute_metrics(self, prediction, test_example): @@ -84,7 +87,7 @@ def compute_metrics(self, prediction, test_example): class WebNLGEval(Eval): def __init__(self): - self.scorer = rouge_scorer.RougeScorer(['rouge2'], use_stemmer=True) + self.scorer = rouge_scorer.RougeScorer(["rouge2"], use_stemmer=True) def build_prompt(self, test_example, demonstrations): prompt = """I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n""" @@ -100,7 +103,8 @@ def get_num_new_tokens(self, test_example, tokenizer): return 200 # this is enough for the test set def compute_metrics(self, prediction, test_example): - return {"rouge2_f1": self.scorer.score(test_example["target"], prediction)['rouge2'].fmeasure} + return {"rouge2_f1": self.scorer.score(test_example["target"], prediction)["rouge2"].fmeasure} + class GSM8KEval(Eval): def build_prompt(self, test_example, demonstrations): @@ -110,7 +114,6 @@ def get_num_new_tokens(self, test_example, tokenizer): return 300 # this is enough for the test set def compute_metrics(self, prediction, test_example): - def run_program(code): """Important: executing code outside a secure docker container is potentially dangerous""" if "import" in code: @@ -155,7 +158,7 @@ def complete_prompt(model, tokenizer, prompt: str, new_tokens=100): input_ids = torch.Tensor([input_ids]).long().to(model.device) attention_mask = torch.ones_like(input_ids).to(model.device) entire_output = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length)[0] - output_new_tokens = entire_output[len(input_ids[0]):] + output_new_tokens = entire_output[len(input_ids[0]) :] decoded = tokenizer.decode(output_new_tokens, skip_special_tokens=True).strip() return decoded @@ -186,8 +189,9 @@ def eval_only(evaluator, model, model_name, tokenizer, train_dataset, test_datas context = evaluator.build_prompt(ex, demonstrations) n_new_tokens = evaluator.get_num_new_tokens(ex, tokenizer) - prediction = complete_prompt(model=model, tokenizer=tokenizer, prompt=context, - new_tokens=n_new_tokens) + prediction = complete_prompt( + model=model, tokenizer=tokenizer, prompt=context, new_tokens=n_new_tokens + ) prediction = prediction.split("\n")[0].strip() ex_metrics = evaluator.compute_metrics(prediction, ex) @@ -195,30 +199,28 @@ def eval_only(evaluator, model, model_name, tokenizer, train_dataset, test_datas all_metrics.append(ex_metrics) avg_metrics = {key: 
np.mean([ex[key] for ex in all_metrics]) for key in all_metrics[0].keys()} - results.append({ - "model": model_name, - "few_shot": few_shot, - "seed": seed, - "context": context, - "prediction": prediction, - "answer": ex.get("answer") or ex.get("target"), - **ex_metrics - }) + results.append( + { + "model": model_name, + "few_shot": few_shot, + "seed": seed, + "context": context, + "prediction": prediction, + "answer": ex.get("answer") or ex.get("target"), + **ex_metrics, + } + ) tqdm_loop.set_description( - f"model: {model_name}, few_shots: {few_shot}, seed: {seed}, " + ", ".join( - [f"{key}: {value:.3f}" for key, value in avg_metrics.items()])) + f"model: {model_name}, few_shots: {few_shot}, seed: {seed}, " + + ", ".join([f"{key}: {value:.3f}" for key, value in avg_metrics.items()]) + ) except Exception as e: print(e) print(f"Skipping...") continue avg_metrics = {key: np.mean([ex[key] for ex in all_metrics]) for key in all_metrics[0].keys()} - agg_results.append({ - "model": model_name, - "seed": seed, - "few_shot": few_shot, - **avg_metrics - }) + agg_results.append({"model": model_name, "seed": seed, "few_shot": few_shot, **avg_metrics}) return results, agg_results @@ -243,16 +245,22 @@ def evaluate(model, tokenizer, dataset, dataset_loader): i = 0 for eval_batch in eval_loop: max_length = eval_batch["input_ids"].size(1) + evaluator.get_num_new_tokens(None, tokenizer) - entire_output = model.generate(eval_batch["input_ids"], attention_mask=eval_batch["attention_mask"], - max_length=max_length, do_sample=True, temperature=0.7, top_p=0.6, - num_return_sequences=args.n_solutions_sampled) - output_new_tokens = entire_output[:, eval_batch["input_ids"].size(1):] + entire_output = model.generate( + eval_batch["input_ids"], + attention_mask=eval_batch["attention_mask"], + max_length=max_length, + do_sample=True, + temperature=0.7, + top_p=0.6, + num_return_sequences=args.n_solutions_sampled, + ) + output_new_tokens = entire_output[:, eval_batch["input_ids"].size(1) :] decoded = tokenizer.batch_decode(output_new_tokens, skip_special_tokens=True) - for j, ex in enumerate(examples[i:i + args.test_per_device_train_batch_size]): + for j, ex in enumerate(examples[i : i + args.test_per_device_train_batch_size]): predictions = [] predictions_accuracies = [] - for prediction in decoded[j * args.n_solutions_sampled:(j + 1) * args.n_solutions_sampled]: + for prediction in decoded[j * args.n_solutions_sampled : (j + 1) * args.n_solutions_sampled]: prediction = prediction.strip() predictions.append(prediction) @@ -260,14 +268,16 @@ def evaluate(model, tokenizer, dataset, dataset_loader): predictions_accuracies.append(accuracy) - eval_result.append({ - "model": model_name, - "input": ex["question"], - "gold": ex["answer"].split("####")[1].strip(), - "prediction_0": predictions[0], - "pass@k": any(predictions_accuracies), - "pass_rate": np.mean(predictions_accuracies), - }) + eval_result.append( + { + "model": model_name, + "input": ex["question"], + "gold": ex["answer"].split("####")[1].strip(), + "prediction_0": predictions[0], + "pass@k": any(predictions_accuracies), + "pass_rate": np.mean(predictions_accuracies), + } + ) accuracies.append(any(predictions_accuracies)) eval_loop.set_description(f"Evaluating, accuracy: {np.mean(accuracies):.3f}") i += args.test_per_device_train_batch_size @@ -280,7 +290,9 @@ def evaluate(model, tokenizer, dataset, dataset_loader): np.random.seed(seed) random.seed(seed) - train_dataset = load_dataset("json", data_files={"train": 
"code_reasoning_ablations_gsm8k_code.jsonl"})["train"].map(lambda ex: {"answer": ex["python"]}) + train_dataset = load_dataset("json", data_files={"train": "code_reasoning_ablations_gsm8k_code.jsonl"})[ + "train" + ].map(lambda ex: {"answer": ex["python"]}) train_dataset = train_dataset.shuffle(seed=seed).select(range(args.n_train_samples)) accelerator = Accelerator() @@ -289,14 +301,18 @@ def evaluate(model, tokenizer, dataset, dataset_loader): "train": train_dataset.map( tokenize_function_train, batched=False, - remove_columns=[name for name in train_dataset.column_names if - name not in ["input_ids", "labels", "attention_mask"]], + remove_columns=[ + name + for name in train_dataset.column_names + if name not in ["input_ids", "labels", "attention_mask"] + ], ), "test": test_dataset.map( tokenize_function_eval, batched=False, - remove_columns=[name for name in test_dataset.column_names if - name not in ["input_ids", "labels", "attention_mask"]], + remove_columns=[ + name for name in test_dataset.column_names if name not in ["input_ids", "labels", "attention_mask"] + ], ), } @@ -304,14 +320,14 @@ def evaluate(model, tokenizer, dataset, dataset_loader): tokenized_datasets["train"], shuffle=True, collate_fn=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding="longest"), - batch_size=args.per_device_train_batch_size + batch_size=args.per_device_train_batch_size, ) test_dataloader = DataLoader( tokenized_datasets["test"], shuffle=False, collate_fn=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding="longest"), - batch_size=args.test_per_device_train_batch_size + batch_size=args.test_per_device_train_batch_size, ) optimizer = torch.optim.AdamW([p for n, p in model.named_parameters()], lr=args.learning_rate) @@ -363,10 +379,7 @@ def evaluate(model, tokenizer, dataset, dataset_loader): best_acc = max(acc_per_epoch) best_epoch = acc_per_epoch.index(best_acc) - agg_results = [{ - "model": model_name, - "seed": seed, - "pass@k": max(acc_per_epoch)}] + agg_results = [{"model": model_name, "seed": seed, "pass@k": max(acc_per_epoch)}] return results_per_epoch[best_epoch], agg_results @@ -375,7 +388,9 @@ def main(): random.seed(args.seed) - hf_dataset = {"babi": ("Muennighoff/babi", ), "web_nlg": ("GEM/web_nlg", "en"), "gsm8k": ("gsm8k", "main")}[args.dataset] + hf_dataset = {"babi": ("Muennighoff/babi",), "web_nlg": ("GEM/web_nlg", "en"), "gsm8k": ("gsm8k", "main")}[ + args.dataset + ] print("Loading dataset...") train_dataset = load_dataset(*hf_dataset)["train"] @@ -392,10 +407,15 @@ def main(): evaluator = {"babi": BabiEval, "web_nlg": WebNLGEval, "gsm8k": GSM8KEval}[args.dataset]() - checkpoints_of_evaluated_models = models_checkpoints if args.models is None else {model_name: checkpoint for - model_name, checkpoint in - models_checkpoints.items() if - model_name in args.models} + checkpoints_of_evaluated_models = ( + models_checkpoints + if args.models is None + else { + model_name: checkpoint + for model_name, checkpoint in models_checkpoints.items() + if model_name in args.models + } + ) results, agg_results = [], [] for model_name, checkpoint in checkpoints_of_evaluated_models.items(): @@ -409,9 +429,13 @@ def main(): for seed in range(args.number_seeds): if args.dataset in ["babi", "web_nlg"]: - seed_results, seed_agg_results = eval_only(evaluator, model, model_name, tokenizer, train_dataset, test_dataset, seed, args) + seed_results, seed_agg_results = eval_only( + evaluator, model, model_name, tokenizer, train_dataset, test_dataset, seed, args + ) else: - 
seed_results, seed_agg_results = train_and_eval_gsm8k(evaluator, model, model_name, tokenizer, test_dataset, seed, args) + seed_results, seed_agg_results = train_and_eval_gsm8k( + evaluator, model, model_name, tokenizer, test_dataset, seed, args + ) results += seed_results agg_results += seed_agg_results @@ -432,7 +456,8 @@ def main(): # final results metric_keys = [key for key in df_agg.keys() if key not in ["model", "seed", "few_shot"]] - print(df.groupby(['model']).mean(numeric_only=True)[metric_keys]) + print(df.groupby(["model"]).mean(numeric_only=True)[metric_keys]) + -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/scripts/dolma_paper_plots.sh b/scripts/dolma_paper_plots.sh index fdeb777c..cbde0173 100644 --- a/scripts/dolma_paper_plots.sh +++ b/scripts/dolma_paper_plots.sh @@ -96,8 +96,10 @@ V2_V3_PERPLEXITY_SUITE="$(printf "%s " "${v2_v3_perplexity_suite[@]}" | sed 's/ runs_up_to_150b=( 'olmo-small-rpj-*' 'olmo-small-pile-fixed-*' - 'olmo-small-dolma-*' + 'olmo-small-c4-*' + 'olmo-small-mc4-*' 'olmo-small-falcon-*' + 'olmo-small-dolma-*' ) RUNS_UP_TO_150B="$(printf "%s " "${runs_up_to_150b[@]}" | sed 's/ $//')" @@ -235,10 +237,13 @@ ablations_runs=( 'reddit-v1-ablation-base_* reddit-v1-ablation-pii-nsfw-toxic_filtered_* reddit-v1-ablation-toxic-filtered_*' 'olmo-mix-v1-sample_* olmo-mix-v1-sample-all-cc* olmo-mix-v1-sample-mix2_* olmo-mix-v1-gopher-like_*' 'stack-v2* stack-v4*' - 'c4-stack-15p* c4_p85-stack_v4_p15* c4_p85-starcoder_p15*' + 'GPT-Neox-20B* c4-stack-15p* c4_p85-stack_v4_p15* c4_p85-starcoder_p15*' 'v1-small-hatespeech-filtered-low* v1-small-nsfw-filtered-low* v1-small-hatespeech-filtered-high* v1-small-nsfw-filtered-high* abl-cc-v1-small-dedup_*' 'abl-cc-v1-small-dedup_* abl-cc-v2-small-dedup*' - 'abl-cc-v1-small-dedup_* v1-small-c4-cleaned_\d+ v1-small-c4-filtered_\d+ v1-small-gopher-filtered_\d+ v1-small-c4-cleaned-gopher-filtered_\d+ v1-small-c4-cleaned-gopher-filtered-deduped_\d+ olmo-mix-v1-sample-all-cc*' + 'abl-cc-v1-small-dedup_* v1-small-c4-cleaned_* v1-small-c4-filtered_* v1-small-gopher-filtered_* v1-small-c4-cleaned-gopher-filtered_* v1-small-c4-cleaned-gopher-filtered-deduped_* olmo-mix-v1-sample-all-cc*' + 'abl-cc-v1-small-dedup_* v1-small-c4-cleaned_* v1-small-c4-filtered_* v1-small-gopher-filtered_* v1-small-c4-cleaned-gopher-filtered_*' + 'abl-cc-v1-small-dedup_* v1-small-c4-cleaned-gopher-filtered_* v1-small-c4-cleaned-gopher-filtered-deduped_* olmo-mix-v1-sample-all-cc*' + 'reddit-v5-ablation-filtered-gen-2_* reddit-v3-ablation-base-* reddit-v2-ablation-base-* reddit-v4-ablation-base-* reddit-v1-ablation-base_*' ) ablations_names=( 'cc_pii_filtering' @@ -249,17 +254,23 @@ ablations_names=( 'cc_toxic_filtering' 'cc_dedupe' 'cc_quality' + 'cc_quality_only' + 'cc_to_quality_plus_content' + 'reddit_selection' ) limits=( '150e9' - '150e9' + '60e9' '150e9' '50e9' '50e9' '150e9' '150e9' '150e9' + '150e9' + '150e9' + '60e9' ) # Loop through the indices of the array. 
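
Back in `scripts/code_reasoning_ablations.py` above, each GSM8K test example receives `n_solutions_sampled` generations, and two numbers are recorded per example: `pass@k` (whether any sample passed) and `pass_rate` (the fraction of passing samples). A toy illustration of that bookkeeping with hypothetical values:

```python
# Toy illustration of the pass@k / pass_rate bookkeeping; accuracies are made up.
import numpy as np

predictions_accuracies = [False, True, False, True]  # 4 sampled solutions, 2 correct
record = {
    "pass@k": any(predictions_accuracies),                # True: at least one sample passed
    "pass_rate": float(np.mean(predictions_accuracies)),  # 0.5: half of the samples passed
}
assert record == {"pass@k": True, "pass_rate": 0.5}
```
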
@@ -288,7 +299,7 @@ for index in "${!ablations_names[@]}"; do -v ${SCRIPT_DIR}/wandb_run_vocab.yaml \ --plotly-font-size 9 \ --plotly-figure-width 400 \ - --plotly-figure-height 400 + --plotly-figure-height 250 set +ex fi @@ -309,7 +320,7 @@ for index in "${!ablations_names[@]}"; do -v ${SCRIPT_DIR}/wandb_run_vocab.yaml \ --plotly-font-size 9 \ --plotly-figure-width 400 \ - --plotly-figure-height 400 + --plotly-figure-height 250 set +ex @@ -333,7 +344,7 @@ for index in "${!ablations_names[@]}"; do -v ${SCRIPT_DIR}/wandb_run_vocab.yaml \ --plotly-font-size 9 \ --plotly-figure-width 400 \ - --plotly-figure-height 400 + --plotly-figure-height 250 set +ex fi @@ -356,7 +367,7 @@ for index in "${!ablations_names[@]}"; do -v ${SCRIPT_DIR}/wandb_run_vocab.yaml \ --plotly-font-size 9 \ --plotly-figure-width 400 \ - --plotly-figure-height 400 + --plotly-figure-height 250 set +ex fi diff --git a/scripts/dolma_single_digit_tokenizer.py b/scripts/dolma_single_digit_tokenizer.py new file mode 100644 index 00000000..03fae38c --- /dev/null +++ b/scripts/dolma_single_digit_tokenizer.py @@ -0,0 +1,58 @@ +import argparse +import json +from tempfile import TemporaryDirectory + +from transformers import AutoTokenizer + +OLD_TOKENIZER_NAME = "allenai/gpt-neox-olmo-dolma-v1_5" +NEW_TOKENIZER_NAME = "allenai/gpt-neox-olmo-dolma-v1_5-digits" + + +def main(push_to_hub=False): + old_tok = AutoTokenizer.from_pretrained(OLD_TOKENIZER_NAME) + + with TemporaryDirectory() as tmp_dir: + old_tok.save_pretrained(tmp_dir) + + with open(f"{tmp_dir}/tokenizer.json", "r") as f: + tokenizer_config = json.load(f) + + tokenizer_config["pre_tokenizer"] = { + "type": "Sequence", + "pretokenizers": [ + {"type": "Digits", "individual_digits": True}, + tokenizer_config["pre_tokenizer"], + ], + } + + with open(f"{tmp_dir}/tokenizer.json", "w") as f: + json.dump(tokenizer_config, f) + + new_tok = AutoTokenizer.from_pretrained(tmp_dir) + + hello_world = "Hello world<|endoftext|>" + new_enc = new_tok.encode(hello_world) + old_enc = old_tok.encode(hello_world) + assert len(new_enc) == len(old_enc) + assert new_enc == old_enc + + hello_digits = "Hello *1234* world<|endoftext|>" + new_enc = new_tok.encode(hello_digits) + old_enc = old_tok.encode(hello_digits) + assert len(new_enc) == len(old_enc) + 3 + assert new_enc[:2] == old_enc[:2] + assert new_enc[2:6] == [old_tok.vocab[d] for d in "1234"] + assert old_enc[2:3] == [old_tok.vocab["1234"]] + assert new_enc[6:] == old_enc[3:] + + if push_to_hub: + print("Pushing to hub...") + new_tok.push_to_hub(NEW_TOKENIZER_NAME) + print(f"tokenizer available at: https://huggingface.co/{NEW_TOKENIZER_NAME}") + + +if __name__ == "__main__": + ap = argparse.ArgumentParser() + ap.add_argument("--push-to-hub", action="store_true") + args = ap.parse_args() + main(args.push_to_hub) diff --git a/scripts/dolma_stats.py b/scripts/dolma_stats.py index 971b6092..cb45e2cc 100644 --- a/scripts/dolma_stats.py +++ b/scripts/dolma_stats.py @@ -1,6 +1,7 @@ import argparse import bisect import copy +import gzip import hashlib import json import multiprocessing @@ -20,10 +21,10 @@ import smart_open import tldextract import tqdm - from dolma.core.data_types import InputSpec, OutputSpec from dolma.core.parallel import BaseParallelProcessor -from dolma.core.paths import glob_path +from dolma.core.paths import glob_path, make_relative, split_path +from dolma.tokenizer import Tokenizer T = TypeVar("T", bound=Type["BaseStatsProcessor"]) @@ -206,6 +207,7 @@ def all(cls) -> Generator[Tuple[str, Type["BaseStatsProcessor"]], None, 
None]: class BaseStatsProcessor(BaseParallelProcessor): documents: Union[str, List[str]] stats: str + skip_parallel: bool = False @classmethod def increment_progressbar( @@ -244,7 +246,19 @@ def _run_parallel_processor(cls, stats_root: str, num_workers: int, debug: bool, num_processes=num_workers, debug=debug, ) - processor(**process_single_kwargs) + if not cls.skip_parallel: + processor(**process_single_kwargs) + + @staticmethod + def _group_by_subset(paths: List[str]) -> Dict[str, List[str]]: + shared, _ = make_relative(paths) + shared = shared.rstrip("/") + "/" + + grouped_paths: Dict[str, List[str]] = {} + for path in sorted(paths): + _, parts = split_path(path.replace(shared, "")) + grouped_paths.setdefault("/".join(parts[:-1]), []).append(path) + return grouped_paths @classmethod def cli(cls, num_workers: int = 1, debug: bool = False, **process_single_kwargs: Any) -> None: @@ -258,17 +272,31 @@ def cli(cls, num_workers: int = 1, debug: bool = False, **process_single_kwargs: ) paths = list(glob_path(cls.stats)) - counts: dict = {} + grouped_paths = cls._group_by_subset(paths) - with multiprocessing.Pool(num_workers) as pool: - data = (cls._read_json(path) for path in paths) if debug else pool.imap(cls._read_json, paths) + grouped_counts: Dict[str, dict] = defaultdict(dict) + + with tqdm.tqdm(desc=f"Merging {cls.__name__} stats", unit=" files", total=len(paths)) as pbar: + for subset, sub_paths in grouped_paths.items(): + with multiprocessing.Pool(num_workers) as pool: + if debug: + data = (cls._read_json(path) for path in sub_paths) + else: + data = (e for e in pool.imap(cls._read_json, sub_paths)) + + for content in data: + pbar.update(1) + grouped_counts[subset] = cls._merge_dicts(grouped_counts[subset], content) - for content in tqdm.tqdm(data, desc=f"Merging {cls.__name__} stats", unit=" files", total=len(paths)): - counts = cls._merge_dicts(counts, content) + global_counts: dict = {} + for subset_count in grouped_counts.values(): + for k, v in cls._merge_dicts(global_counts, subset_count).items(): + global_counts[k] = v + grouped_counts["__GLOBAL__"] = global_counts summary_dest = f"{stats_root}/summary.json" with smart_open.open(summary_dest, "wt") as destination_file: - destination_file.write(json.dumps(counts, indent=2, sort_keys=True)) + destination_file.write(json.dumps(grouped_counts, indent=2, sort_keys=True)) @Registry.add @@ -411,11 +439,109 @@ def process_single( destination_file.write(json.dumps(stats, indent=2)) +@Registry.add +class dolma_v15r2_counts(BaseStatsProcessor): + documents = "s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/*/*.gz" + stats = "s3://ai2-llm/stats/olmo-mix/dolma-v1_5r2/counts/*/*.gz" + skip_parallel = True + + @classmethod + def process_single( + cls, source_path: str, destination_path: str, queue: "Queue[Union[Tuple[int, ...], None]]", **kwargs: Any + ): + os.environ["TOKENIZERS_PARALLELISM"] = "false" + + # for the data sheet, what statistics you think we should include? I could + # do # of docs, # tokens, distribution of URLs, pronouns, s2 FOS, stack + # languages? 
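
The `_group_by_subset` helper added above buckets the per-file stats paths by their subset directory (via dolma's `make_relative` and `split_path`) so counts can be merged per subset before the `__GLOBAL__` roll-up. A simplified standalone sketch of the same idea using only the standard library; the bucket names and S3 paths below are invented for illustration:

```python
# Simplified stand-in for BaseStatsProcessor._group_by_subset: strip the shared
# prefix from the stats paths, then bucket them by the remaining directory part.
from os.path import commonprefix, dirname
from typing import Dict, List

def group_by_subset(paths: List[str]) -> Dict[str, List[str]]:
    shared = commonprefix(paths).rsplit("/", 1)[0] + "/"
    grouped: Dict[str, List[str]] = {}
    for path in sorted(paths):
        subset = dirname(path.replace(shared, "", 1))
        grouped.setdefault(subset, []).append(path)
    return grouped

paths = [
    "s3://bucket/stats/books/0000.gz",
    "s3://bucket/stats/books/0001.gz",
    "s3://bucket/stats/wiki/0000.gz",
]
assert group_by_subset(paths) == {
    "books": ["s3://bucket/stats/books/0000.gz", "s3://bucket/stats/books/0001.gz"],
    "wiki": ["s3://bucket/stats/wiki/0000.gz"],
}
```
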
+ decoder = msgspec.json.Decoder(InputSpec) + documents = words = 0 + olmo_tokens = llama_tokens = 0 + interval = 10_000 + + olmo_tokenizer = Tokenizer.from_pretrained("allenai/gpt-neox-olmo-dolma-v1_5") + llama_tokenizer = Tokenizer.from_pretrained("NousResearch/Llama-2-7b-hf") + + with smart_open.open(source_path, "rb") as source_file: + for line in source_file: + document = decoder.decode(line) + documents += 1 + words += len(blingfire.text_to_words(document.text).split()) + olmo_tokens += len(olmo_tokenizer.encode(document.text, add_special_tokens=False)) + llama_tokens += len(llama_tokenizer.encode(document.text, add_special_tokens=False)) + + if documents % interval == 0: + cls.increment_progressbar(queue, documents=interval) + + cls.increment_progressbar(queue, files=1, documents=documents % interval) + + counters = { + "documents": documents, + "words": words, + "olmo_tokens": olmo_tokens, + "llama_tokens": llama_tokens, + } + + with smart_open.open(destination_path, "wt") as destination_file: + destination_file.write(json.dumps(counters, indent=2, sort_keys=True)) + + +@Registry.add +class dolma_v15r2_olmo(BaseStatsProcessor): + documents = "s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/*/*.gz" + stats = "s3://ai2-llm/stats/olmo-mix/dolma-v1_5r2/counts_with_bytes/*/*.gz" + skip_parallel = False + + @classmethod + def process_single( + cls, source_path: str, destination_path: str, queue: "Queue[Union[Tuple[int, ...], None]]", **kwargs: Any + ): + os.environ["TOKENIZERS_PARALLELISM"] = "false" + + decoder = msgspec.json.Decoder(InputSpec) + documents = words = 0 + olmo_tokens = 0 + utf8_length = 0 + bytes_length = 0 + gzip_bytes_length = 0 + interval = 10_000 + + olmo_tokenizer = Tokenizer.from_pretrained("allenai/gpt-neox-olmo-dolma-v1_5") + + with smart_open.open(source_path, "rb") as source_file: + for line in source_file: + document = decoder.decode(line) + documents += 1 + words += len(blingfire.text_to_words(document.text).split()) + olmo_tokens += len(olmo_tokenizer.encode(document.text, add_special_tokens=False)) + bytes_length += len(d := document.text.encode("utf-8")) + utf8_length += len(d.decode("utf-8")) + gzip_bytes_length += gzip.compress(document.text.encode("utf-8")).__sizeof__() + + if documents % interval == 0: + cls.increment_progressbar(queue, documents=interval) + + cls.increment_progressbar(queue, files=1, documents=documents % interval) + + counters = { + "documents": documents, + "words": words, + "olmo_tokens": olmo_tokens, + "bytes_length": bytes_length, + "gzip_bytes_length": gzip_bytes_length, + "utf8_length": utf8_length, + } + + with smart_open.open(destination_path, "wt") as destination_file: + destination_file.write(json.dumps(counters, indent=2, sort_keys=True)) + + @Registry.add class cc_v1_c4_cleaned(BaseStatsProcessor): documents = "s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_*/*.gz" stats = "s3://ai2-llm/stats/olmo-mix/v1/cc/v1_c4_cleaned/**/*.gz" decontamination_key: str = "decontamination" + repetitions_threshold = 100 @classmethod def gopher_rules(cls, attrs: Dict[str, List[Tuple[int, int, float]]]) -> List[Tuple[int, int, float]]: @@ -527,6 +653,8 @@ def process_single( source_path.replace("/documents/", "/attributes/hatespeech_nsfw_cc_v3/"), source_path.replace("/documents/", "/attributes/pii_detection/"), source_path.replace("/documents/", "/attributes/dedupe_paragraphs/"), + source_path.replace("/documents/", "/attributes/dedupe_docs_v2/"), + source_path.replace("/documents/", 
"/attributes/tokenizer_repetitions_v2r2/"), ] doc_decoder = msgspec.json.Decoder(InputSpec) @@ -543,6 +671,12 @@ def process_single( "dedupe_paragraphs_count": 0, "dedupe_paragraphs_length": 0, "dedupe_paragraphs_matches": 0, + "dedupe_docs_count": 0, + "dedupe_docs_length": 0, + "dedupe_docs_matches": 0, + "repetitions_count": 0, + "repetitions_length": 0, + "repetitions_matches": 0, "hatespeech_nsfw_count": 0, "hatespeech_nsfw_length": 0, "hatespeech_nsfw_matches": 0, @@ -618,6 +752,27 @@ def process_single( stats["dedupe_paragraphs_length"] += sum(s[1] - s[0] for s in dups) stats["dedupe_paragraphs_matches"] += 1 if dups else 0 + docs_dups = [p for p in attrs.get("bff_duplicate_docs", []) if p[1] - p[0] > 0] + stats["dedupe_docs_count"] += len(docs_dups) + stats["dedupe_docs_length"] += sum(s[1] - s[0] for s in docs_dups) + stats["dedupe_docs_matches"] += 1 if docs_dups else 0 + + # Repetitions stats + (_, _, max_reps), *_ = attrs.get( + "tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition", [[0, 0, 0]] + ) + if max_reps >= cls.repetitions_threshold: + reps = [ + r + for r in attrs.get( + "tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__repetition", [] + ) + if r[-1] >= cls.repetitions_threshold + ] + stats["repetitions_count"] += len(reps) + stats["repetitions_length"] += len(doc.text) + stats["repetitions_matches"] += 1 + documents += 1 if documents % interval == 0: @@ -635,6 +790,76 @@ class v15_cc_c4_cleaned(cc_v1_c4_cleaned): stats = "s3://ai2-llm/stats/olmo-mix/v15/cc/v1_c4_cleaned/**/*.gz" decontamination_key: str = "perplexity_suite_v3_option2" + +@Registry.add +class v15r2_cc_c4_cleaned_dup(cc_v1_c4_cleaned): + documents = "s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_*/*.gz" + stats = "s3://ai2-llm/stats/olmo-mix/v15/cc/v15r2_cc_c4_cleaned_dup/**/*.gz" + decontamination_key: str = "perplexity_suite_v3_option2" + + @classmethod + def process_single( + cls, source_path: str, destination_path: str, queue: "Queue[Union[Tuple[int, ...], None]]", **kwargs: Any + ): + attributes = [ + source_path.replace("/documents/", "/attributes/tokenizer_repetitions_v2r2/"), + source_path.replace("/documents/", "/attributes/dedupe_paragraphs/"), + # source_path.replace("/documents/", "/attributes/dedupe_docs/"), + ] + + doc_decoder = msgspec.json.Decoder(InputSpec) + attr_decoder = msgspec.json.Decoder(OutputSpec) + + stats = { + "doc_length": 0, + "doc_count": 0, + "repetitions_count": defaultdict(int), + "repetitions_length": defaultdict(int), + "repetitions_period": defaultdict(int), + } + interval = 10_000 + + with ExitStack() as stack: + doc_file = stack.enter_context(smart_open.open(source_path, "rb")) + + try: + atts_files = [stack.enter_context(smart_open.open(path, "rb")) for path in attributes] + except Exception: + return + + for doc_line, *attr_lines in zip(doc_file, *atts_files): + doc = doc_decoder.decode(doc_line) + stats["doc_length"] += len(doc.text) + stats["doc_count"] += 1 + + attrs = {} + for line in attr_lines: + attrs.update(attr_decoder.decode(line).attributes) + + repetitions = attrs.get("tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__repetition", []) + stats["repetitions_count"][len(repetitions)] += 1 + + repetition_max_length = attrs.get( + "tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_length_repetition", + [[0, 0, 0]], + )[0][-1] + stats["repetitions_length"][repetition_max_length] += 1 + + repetitions_period = attrs.get( + 
"tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition", + [[0, 0, 0]], + )[0][-1] + stats["repetitions_period"][repetitions_period] += 1 + + if stats["doc_count"] % interval == 0: + cls.increment_progressbar(queue, documents=interval) + + cls.increment_progressbar(queue, files=1, documents=stats["doc_count"] % interval) + + with smart_open.open(destination_path, "wt") as destination_file: + destination_file.write(json.dumps(stats, indent=2)) + + @Registry.add class LineStatsCC(cc_v1_c4_cleaned): # Selection of documents: @@ -663,7 +888,7 @@ class LineStatsCC(cc_v1_c4_cleaned): "./cc_en_head-0786-stats.json.gz", "./cc_en_head-0857-stats.json.gz", ] - decontamination_key: str = 'decontamination' + decontamination_key: str = "decontamination" @classmethod def cli(cls, num_workers: int = 1, debug: bool = False, **process_single_kwargs: Any) -> None: @@ -673,7 +898,7 @@ def cli(cls, num_workers: int = 1, debug: bool = False, **process_single_kwargs: debug=debug, **process_single_kwargs, ) - + @classmethod def process_single( cls, source_path: str, destination_path: str, queue: "Queue[Union[Tuple[int, ...], None]]", **kwargs: Any @@ -751,6 +976,7 @@ def process_single( cls.increment_progressbar(queue, files=1, documents=documents % interval) + class C4InputSpec(InputSpec): metadata: Dict[str, Any] = msgspec.field(default_factory=dict) @@ -887,6 +1113,75 @@ def cli(cls, num_workers: int = 1, debug: bool = False, **process_single_kwargs: processor(**process_single_kwargs) +@Registry.add +class reddit(BaseStatsProcessor): + repetitions_threshold = 100 + documents = "s3://ai2-llm/pretraining-data/sources/reddit/v5-dedupe-pii-nsfw-toxic/documents/*.gz" + stats = "s3://ai2-llm/stats/olmo-mix/v1_5/forums/reddit/grouped/*.gz" + + @classmethod + def process_single( + cls, source_path: str, destination_path: str, queue: "Queue[Union[Tuple[int, ...], None]]", **kwargs: Any + ): + attrs_path = source_path.replace( + "/documents/", + "/attributes/tokenizer_repetitions_v2r2/", + ) + + documents_decoder = msgspec.json.Decoder(C4InputSpec) + attributes_decoder = msgspec.json.Decoder(OutputSpec) + + interval = 10_000 + + stats = { + "length": 0, + "count": 0, + "tokens": 0, + "repetitions_count": 0, + "repetitions_length": 0, + "repetitions_matches": 0, + } + cnt = 0 + + with smart_open.open(source_path, "rb") as doc_file, smart_open.open(attrs_path, "rb") as attrs_file: + for source_line, attributes_line in zip(doc_file, attrs_file): + cnt += 1 + + document = documents_decoder.decode(source_line) + attributes = attributes_decoder.decode(attributes_line) + text = document.text + + if not (text := text.strip()): + continue + + stats["count"] += 1 + stats["tokens"] += len(blingfire.text_to_words(text).split()) + stats["length"] += len(text) + + (_, _, max_reps), *_ = attributes.attributes.get( + "tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition", [[0, 0, 0]] + ) + if max_reps >= cls.repetitions_threshold: + reps = [ + r + for r in attributes.attributes.get( + "tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__repetition", [] + ) + if r[-1] >= cls.repetitions_threshold + ] + stats["repetitions_count"] += len(reps) + stats["repetitions_length"] += sum(s[1] - s[0] for s in reps) + stats["repetitions_matches"] += 1 + + if cnt % interval == 0: + cls.increment_progressbar(queue, documents=interval) + + cls.increment_progressbar(queue, files=1, documents=cnt % interval) + + with smart_open.open(destination_path, "wt") as destination_file: + 
destination_file.write(json.dumps(stats, indent=2)) + + class StackInputSpec(InputSpec): metadata: Dict[str, Any] = msgspec.field(default_factory=dict) attributes: Dict[str, Any] = msgspec.field(default_factory=dict) @@ -1137,8 +1432,8 @@ def process_single( @Registry.add class LineStatsStack(BaseStatsProcessor): # Testing - #documents = "s3://ai2-llm/pretraining-data/sources/stack-dedup/v0/documents/abap/data_0000.jsonl.gz" - #stats = "/data/niklas/dolma/abap/data_0000.json.gz" + # documents = "s3://ai2-llm/pretraining-data/sources/stack-dedup/v0/documents/abap/data_0000.jsonl.gz" + # stats = "/data/niklas/dolma/abap/data_0000.json.gz" documents = "s3://ai2-llm/pretraining-data/sources/stack-dedup/v0/documents/*/*.gz" stats = "/data/niklas/dolma/stack" @@ -1174,28 +1469,37 @@ def process_single( except Exception as e: print(e) return - + for doc_line, attrs in zip(doc_file, attr_file): doc = doc_decoder.decode(doc_line) attrs = attr_decoder.decode(attrs).attributes out_line = {} ## RPJ ## - if (attrs["paper_analysis__code_redpajama_taggers_v1__max_line_length_doc"][0][2] > 1000) or \ - (attrs["paper_analysis__code_redpajama_taggers_v1__avg_line_length_doc"][0][2] > 100) or \ - (attrs["paper_analysis__code_redpajama_taggers_v1__alnum_prop_doc"][0][2] < 0.25) or \ - (attrs["paper_analysis__code_redpajama_taggers_v1__alpha_token_prop_doc"][0][2] < 1.5): + if ( + (attrs["paper_analysis__code_redpajama_taggers_v1__max_line_length_doc"][0][2] > 1000) + or (attrs["paper_analysis__code_redpajama_taggers_v1__avg_line_length_doc"][0][2] > 100) + or (attrs["paper_analysis__code_redpajama_taggers_v1__alnum_prop_doc"][0][2] < 0.25) + or (attrs["paper_analysis__code_redpajama_taggers_v1__alpha_token_prop_doc"][0][2] < 1.5) + ): out_line["rpj"] = 1 else: out_line["rpj"] = 0 ## StarCoder ## - if (attrs["paper_analysis__code_starcoder_taggers_v2__has_xml_template_doc"][0][2] > 0.0) or \ - (attrs["paper_analysis__code_starcoder_taggers_v2__code_to_comment_ratio_doc"][0][2] > 0.8) or \ - (attrs["paper_analysis__code_starcoder_taggers_v2__code_to_comment_ratio_doc"][0][2] <= 0.01) or \ - ( - any(x in source_path for x in ["python", "java", "javascript"]) and \ - (attrs["paper_analysis__code_starcoder_taggers_v2__code_to_text_ratio_html_doc"][0][2] <= 0.1) + if ( + (attrs["paper_analysis__code_starcoder_taggers_v2__has_xml_template_doc"][0][2] > 0.0) + or (attrs["paper_analysis__code_starcoder_taggers_v2__code_to_comment_ratio_doc"][0][2] > 0.8) + or ( + attrs["paper_analysis__code_starcoder_taggers_v2__code_to_comment_ratio_doc"][0][2] <= 0.01 + ) + or ( + any(x in source_path for x in ["python", "java", "javascript"]) + and ( + attrs["paper_analysis__code_starcoder_taggers_v2__code_to_text_ratio_html_doc"][0][2] + <= 0.1 + ) + ) ): out_line["starcoder"] = 1 else: diff --git a/scripts/find_missing_attributes.py b/scripts/find_missing_attributes.py new file mode 100644 index 00000000..9890d0e8 --- /dev/null +++ b/scripts/find_missing_attributes.py @@ -0,0 +1,68 @@ +""" +Script to identify which documents files have missing attributes +files. + +Author: Luca Soldaini (@soldni) +""" + +import sys +from typing import Generator + +try: + import click + from dolma.core.paths import _get_fs, _pathify, glob_path, join_path, sub_prefix +except ImportError as e: + raise ImportError("Missing dependency; plese run `pip install dolma click`.") from e + + +def find_missing(prefix: str, attribute_name: str) -> Generator[str, None, None]: + """ + Find all files in the given prefix that are missing the given attribute. 
+ """ + fs = _get_fs(prefix) + protocol, _ = _pathify(prefix) + document_prefix = join_path(protocol, prefix, "documents") + + count_all_ = count_miss = 0 + + for root, directories, filenames in fs.walk(document_prefix): + if directories: + # ignore directories + continue + + subpath = sub_prefix(join_path(protocol, root), document_prefix) + + for fn in filenames: + attribute_fn = join_path(protocol, prefix, "attributes", attribute_name, subpath, fn) + documents_fn = join_path(protocol, root, fn) + count_all_ += 1 + if not fs.exists(attribute_fn): + count_miss += 1 + yield documents_fn + + print(f"Total documents: {count_all_:,}", file=sys.stderr) + print(f"Missing attrs: {count_miss:,}", file=sys.stderr) + + +@click.command() +@click.argument("attribute-path", type=str, required=True) +@click.option("--separator", type=str, default="\n", help="Separator to use between paths") +def main(attribute_path: str, separator: str) -> None: + """ + Find all files in the given prefix that are missing the given attribute. + """ + + if "/attributes/" not in attribute_path: + raise ValueError("Attribute path must contain 'attributes'") + + prefix, attribute = attribute_path.split("/attributes/", 1) + + if not attribute.strip(): + raise ValueError("Attribute name must not be empty") + + for missing in find_missing(prefix, attribute): + print(missing, end=separator, flush=True, file=sys.stdout) + + +if __name__ == "__main__": + main() diff --git a/scripts/fix_dolma_v15_tokenizer.py b/scripts/fix_dolma_v15_tokenizer.py new file mode 100644 index 00000000..b09157d7 --- /dev/null +++ b/scripts/fix_dolma_v15_tokenizer.py @@ -0,0 +1,134 @@ +import argparse +import json +from tempfile import TemporaryDirectory + +from transformers import AutoTokenizer + +OG_TOKENIZER_NAME = "eleutherai/gpt-neox-20b" +OLD_TOKENIZER_NAME = "allenai/eleuther-ai-gpt-neox-20b-pii-special" +NEW_TOKENIZER_NAME = "allenai/gpt-neox-olmo-dolma-v1_5" +EMAIL_SPECIAL_TOKEN = "|||EMAIL_ADDRESS|||" +EMAIL_SPECIAL_TOKEN_ID = 50277 +PHONE_SPECIAL_TOKEN = "|||PHONE_NUMBER|||" +PHONE_SPECIAL_TOKEN_ID = 50278 +IP_SPECIAL_TOKEN = "|||IP_ADDRESS|||" +IP_SPECIAL_TOKEN_ID = 50279 +EOS_TOKEN = "<|endoftext|>" +EOS_TOKEN_ID = 0 + + +def main(push_to_hub=False): + og_tok = AutoTokenizer.from_pretrained(OG_TOKENIZER_NAME) + old_tok = AutoTokenizer.from_pretrained(OLD_TOKENIZER_NAME) + + assert old_tok.eos_token == EOS_TOKEN + assert old_tok.bos_token == EOS_TOKEN + assert old_tok.unk_token == EOS_TOKEN + + vocab = old_tok.get_vocab() + + assert len(vocab) == 50280 + assert vocab[old_tok.eos_token] == old_tok.eos_token_id == EOS_TOKEN_ID + assert vocab[old_tok.bos_token] == old_tok.bos_token_id == EOS_TOKEN_ID + assert vocab[old_tok.unk_token] == old_tok.unk_token_id == EOS_TOKEN_ID + assert vocab[IP_SPECIAL_TOKEN] == IP_SPECIAL_TOKEN_ID + assert vocab[EMAIL_SPECIAL_TOKEN] == 50277 + assert vocab[PHONE_SPECIAL_TOKEN] == 50278 + + with TemporaryDirectory() as tmp_dir: + old_tok.save_pretrained(tmp_dir) + + with open(f"{tmp_dir}/tokenizer.json", "r") as f: + tokenizer_config = json.load(f) + + for token_config in tokenizer_config["added_tokens"]: + if token_config["content"] == EOS_TOKEN: + token_config["id"] = IP_SPECIAL_TOKEN_ID + elif token_config["content"] == IP_SPECIAL_TOKEN: + token_config["id"] = EOS_TOKEN_ID + tokenizer_config["model"]["vocab"][token_config["content"]] = token_config["id"] + tokenizer_config["added_tokens"] = sorted(tokenizer_config["added_tokens"], key=lambda x: x["id"]) + tokenizer_config["model"]["vocab"] = { + k: v for k, v in 
sorted(tokenizer_config["model"]["vocab"].items(), key=lambda x: x[1]) + } + + with open(f"{tmp_dir}/tokenizer.json", "w") as f: + json.dump(tokenizer_config, f) + + new_tok = AutoTokenizer.from_pretrained(tmp_dir) + + # check if the swap worked + new_vocab = new_tok.get_vocab() + assert new_vocab[new_tok.eos_token] == new_tok.eos_token_id == IP_SPECIAL_TOKEN_ID + assert new_vocab[new_tok.bos_token] == new_tok.bos_token_id == IP_SPECIAL_TOKEN_ID + assert new_vocab[new_tok.unk_token] == new_tok.unk_token_id == IP_SPECIAL_TOKEN_ID + assert new_vocab[IP_SPECIAL_TOKEN] == EOS_TOKEN_ID + + assert new_tok.encode("|||IP_ADDRESS|||") == [EOS_TOKEN_ID] + assert new_tok.encode("|||EMAIL_ADDRESS|||") == [EMAIL_SPECIAL_TOKEN_ID] + assert new_tok.encode("|||PHONE_NUMBER|||") == [PHONE_SPECIAL_TOKEN_ID] + + hello_world = "Hello world<|endoftext|>" + new_enc = new_tok.encode(hello_world) + old_enc = old_tok.encode(hello_world) + og_enc = og_tok.encode(hello_world) + assert len(new_enc) == len(old_enc) + assert new_enc[:-1] == old_enc[:-1] + assert new_enc[:-1] == og_enc[:-1] + assert new_enc[-1] == IP_SPECIAL_TOKEN_ID + assert old_enc[-1] == EOS_TOKEN_ID + assert og_enc[-1] == EOS_TOKEN_ID + + text_with_split = "Heeeeellllooooo subwords!!!<|endoftext|>" + new_enc = new_tok.encode(text_with_split) + old_enc = old_tok.encode(text_with_split) + og_enc = og_tok.encode(text_with_split) + assert len(new_enc) == len(old_enc) + assert new_enc[:-1] == old_enc[:-1] + assert new_enc[:-1] == og_enc[:-1] + assert new_enc[-1] == IP_SPECIAL_TOKEN_ID + assert old_enc[-1] == EOS_TOKEN_ID + assert og_enc[-1] == EOS_TOKEN_ID + + for token_id in range(0, len(og_tok.get_vocab())): + og_vocab_entry = og_tok.convert_ids_to_tokens(token_id) + new_vocab_entry = new_tok.convert_ids_to_tokens(token_id) + if token_id == EOS_TOKEN_ID: + assert new_vocab_entry == IP_SPECIAL_TOKEN + assert og_vocab_entry == EOS_TOKEN + else: + err = f"{token_id}: `{og_vocab_entry}` != `{new_vocab_entry}`" + assert og_vocab_entry == new_vocab_entry, err + + for token_id in range(0, len(old_tok.get_vocab())): + old_vocab_entry = old_tok.convert_ids_to_tokens(token_id) + new_vocab_entry = new_tok.convert_ids_to_tokens(token_id) + if token_id == EOS_TOKEN_ID: + assert new_vocab_entry == IP_SPECIAL_TOKEN + assert old_vocab_entry == EOS_TOKEN + elif token_id == IP_SPECIAL_TOKEN_ID: + assert new_vocab_entry == EOS_TOKEN + assert old_vocab_entry == IP_SPECIAL_TOKEN + else: + err = f"{token_id}: `{old_vocab_entry}` != `{new_vocab_entry}`" + assert old_vocab_entry == new_vocab_entry, err + + masked_text = "<|padding|>Hello my phone number is |||PHONE_NUMBER||| bye <|endoftext|>" + new_enc = new_tok.encode(masked_text) + old_enc = old_tok.encode(masked_text) + assert len(new_enc) == len(old_enc) + assert new_enc[:-1] == old_enc[:-1] + assert new_enc[-1] == IP_SPECIAL_TOKEN_ID + assert old_enc[-1] == EOS_TOKEN_ID + + if push_to_hub: + print("Pushing to hub...") + new_tok.push_to_hub(NEW_TOKENIZER_NAME) + print(f"tokenizer available at: https://huggingface.co/{NEW_TOKENIZER_NAME}") + + +if __name__ == "__main__": + ap = argparse.ArgumentParser() + ap.add_argument("--push-to-hub", action="store_true") + args = ap.parse_args() + main(args.push_to_hub) diff --git a/scripts/hash_sample.py b/scripts/hash_sample.py index 50e13db9..52ab46e4 100644 --- a/scripts/hash_sample.py +++ b/scripts/hash_sample.py @@ -12,7 +12,6 @@ import msgspec import smart_open import uniseg.wordbreak - from dolma.core.data_types import InputSpec from dolma.core.parallel import 
BaseParallelProcessor from dolma.core.paths import join_path diff --git a/scripts/install_blingfire_macos.py b/scripts/install_blingfire_macos.py new file mode 100644 index 00000000..d5eca18b --- /dev/null +++ b/scripts/install_blingfire_macos.py @@ -0,0 +1,60 @@ +#! /usr/bin/env python3 + +# From https://github.com/allenai/smashed/blob/main/src/smashed/utils/install_blingfire_macos.py + +import platform +from subprocess import call +from warnings import warn + +BASH_SCRIPT = """ +#! /usr/bin/env bash + +# path to the current directory +CURRENT_DIR="$(pwd)" + +# remove any existing blingfire installation +pip uninstall -y blingfire 2>/dev/null + +# clone blingfire repo to a temp directory +TMP_DIR=$(mktemp -d) +cd $TMP_DIR +git clone "https://github.com/microsoft/BlingFire" +cd blingfire + +# build blingfire +mkdir Release +cd Release +cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_BUILD_TYPE=Release .. +make -j 4 +cd .. + +# copy freshly compiled blingfire to the python bindings directory +cp -rf Release/* dist-pypi/blingfire/ + +# build & install the python bindings +cd dist-pypi +python setup.py sdist bdist_wheel +pip install --force-reinstall dist/blingfire-*-py3-none-any.whl + +# cleanup +cd $CURRENT_DIR +rm -rf $TMP_DIR +""" + + +def main(): + # check if we are on MacOS + if platform.system() != "Darwin": + warn("This script is only meant to be run on MacOS; skipping...") + return + + # check that architecture is arm64 + if platform.machine() != "arm64": + warn("This script is only meant to be run on arm64; skipping...") + return + + return call(BASH_SCRIPT.strip(), shell=True) + + +if __name__ == "__main__": + main() diff --git a/scripts/make_latex_fig_table.py b/scripts/make_latex_fig_table.py new file mode 100644 index 00000000..a3ffb01e --- /dev/null +++ b/scripts/make_latex_fig_table.py @@ -0,0 +1,201 @@ +# import argparse +# import bisect +# import fnmatch +# import re +# from collections import defaultdict +# from functools import lru_cache, partial +# from math import ceil, floor, log10 +# from pathlib import Path +# from statistics import stdev +# from typing import List, Optional, Sequence, Tuple + +from math import ceil +from pathlib import Path +from typing import Dict, List + +from necessary import necessary + +with necessary( + modules=["click"], + message="Please run `pip install click`", +): + import click + + # \item \textbf{AI2 Reasoning Challenge}~\citep{arc}: A science question-answering dataset broken into \emph{easy} and \emph{challenge} subsets. Only the easy subset was used in online evaluations. The challenge subset was, however, included in offline evaluations. + # %Only the easy subset is included in online evaluations. The challenge subset additionally included in offline evaluations. + # \item \textbf{BoolQ}~\citep{clark2019boolq}: A reading comprehension dataset consisting of naturally occurring yes/no boolean questions and background contexts. + # % \item \textbf{Choice Of Plausible Alternatives (COPA)}~\citep{copa}: A commonsense reasoning dataset that involves selecting plausible sentences given input premises. + # \item \textbf{HellaSwag}~\citep{zellers2019hellaswag}: A multiple-choice question-answering dataset that tests situational understanding and commonsense. + # \item \textbf{OpenBookQA}~\citep{openbookQA}: A multiple-choice question-answering dataset modeled on open-book science exams. 
+ # \item \textbf{Physical Interaction: Question Answering (PIQA)}~\citep{piqa}: A multiple-choice question-answering dataset that focuses on physical commonsense and naive physics. + # \item \textbf{SciQ}~\citep{sciq}: A crowdsourced multiple-choice question-answering dataset consisting of everyday questions about physics, chemistry and biology, among other areas of science. + # \item \textbf{WinoGrande}~\citep{winogrande}: A dataset of pronoun resolution problems involving various forms of commonsense. Modeled after the Winograd challenge of \cite{wsc}. + + +metrics_names = { + "wikitext_103.pdf": "WikiText 103~\\citep{merity2016pointer}", + "4chan.pdf": "4chan~\\citep{papasavva2020raiders}", + "c4_100_domains.pdf": "C4 100 dom~\\citep{chronopoulou-etal-2022-efficient}", + "c4.pdf": "C4~\\citep{raffel2020exploring,dodge-etal-2021-documenting}", + "dolma_books.pdf": "Dolma Books Subset", + "dolma_common_crawl.pdf": "Dolma Web Subset", + "dolma_pes2o_v2.pdf": "Dolma Papers Subset", + "dolma_reddit.pdf": "Dolma Reddit Subset", + "dolma_stack_v5.pdf": "Dolma Code Subset", + "dolma_wiki.pdf": "Dolma Wikipedia Subset", + "gab.pdf": "Gab~\\citep{zannettou2018gab}", + "ice.pdf": "ICE~\\citep{greenbaum1991ice}", + "m2d2_s2orc.pdf": "M2D2~\\citep{reid-etal-2022-m2d2} (S2ORC)", + "m2d2_wiki.pdf": "M2D2~\\citep{reid-etal-2022-m2d2} (Wiki)", + "manosphere.pdf": "Manosphere~\\citep{ribeiroevolution2021}", + "mc4_english.pdf": "mC4~\\citep{mc4} (English)", + "penn_treebank.pdf": "Penn Tree Bank~\\citep{marcus-etal-1994-penn}", + "pile.pdf": "Pile~\\citep{Gao2020ThePA} (Val)", + "twitteraee.pdf": "Twitter AAE~\\citep{blodgett-etal-2016-demographic}", + "winogrande.pdf": "WinoGrande~\\citep{winogrande}", + "sciq.pdf": "SciQ~\\citep{sciq}", + "openbookqa.pdf": "OpenBookQA~\\citep{openbookQA}", + "hellaswag.pdf": "HellaSwag~\\citep{zellers2019hellaswag}", + "piqa.pdf": "PIQA~\\citep{piqa}", + "arc_easy.pdf": "ARC-E~\\citep{arc}", + "train_cross_entropy.pdf": "Training Cross Entropy", + "train_ppl.pdf": "Training Perplexity", +} + +subsets = { + "train": "Training Metrics", + "ppl": "Perplexity on Paloma", + "downstream": "Downstream Evaluation", +} + +abl_names = { + # "ablations_code_15p_stack_v2_v4_starcoder": "" + "150b_runs": "Comparing \\dolma With Other Corpora", + "ablations_cc_dedupe": "Deduping Strategy", + "ablations_cc_pii_filtering": "Filtering of Personal Identifiable Information", + # "ablations_cc_quality": "" + "ablations_cc_quality_only": "Comparing Quality Filters for Web Pipeline", + "ablations_cc_to_quality_plus_content": "Full Comparison of Web Pipeline", + "ablations_cc_toxic_filtering": "Toxicity Filtering in Web Pipeline", + "ablations_code_stack_v2_vs_v4": "Comparing Code Processing Pipeline", + "ablations_dolma_mix": "Studying \\dolma Mixture", + "ablations_reddit_selection": "Strategies to Format Conversational Forums Pipeline", + "ablations_reddit_toxic_filtering": "Evaluating Toxicity Filtering in Conversational Forums Pipeline", + "long_1b_run": "Training \\OlmoTiny", +} + + +@click.command() +@click.option("-d", "--paper-directory", type=click.Path(exists=False, path_type=Path), required=True) +@click.option("-i", "--input-prefix", type=str, default="experiments", show_default=True) +@click.option("-o", "--output-prefix", type=str, default="appendix/results", show_default=True) +@click.option("-w", "--num-columns", type=int, default=3, show_default=True) +def main( + paper_directory: Path, + input_prefix: str, + output_prefix: str, + num_columns: int, +): + (destination := 
paper_directory / output_prefix).mkdir(parents=True, exist_ok=True) + + group_by_path: Dict[str, List[str]] = {} + for fn in (source := paper_directory / input_prefix).glob("**/*.pdf"): + group = str(fn.relative_to(source).parent) + group_by_path.setdefault(group, []).append(str(input_prefix / fn.relative_to(source))) + + grouped_sections = {} + + for figure_group, paths in group_by_path.items(): + figure_group, subgroup = figure_group.rsplit("/", 1) + + if figure_group not in abl_names: + continue + + subsection_name_cands = [s for s in subsets if s in subgroup] + if subsection_name_cands: + subsection_name = subsets[subsection_name_cands[0]] + else: + continue + + group_num_columns = num_columns # min(num_columns, len(paths)) + + paths = [p for p in paths if p.rsplit("/", 1)[1] in metrics_names] + grouped_paths = [paths[i : i + group_num_columns] for i in range(0, len(paths), group_num_columns)] + + current_section_components = [] + + for i, paths in enumerate(sorted(grouped_paths)): + width = round(1 / group_num_columns - (0.01 * (group_num_columns - 1)), 2) + + group_subfigures = [] + captions = [] + for path in paths: + metric_name = metrics_names.get(path.rsplit("/", 1)[1], None) + if metric_name is None: + continue + + captions.append(metric_name.replace("\\\\", "\\")) + + group_subfigures.append( + f"\t\\begin{{subfigure}}{{{width}\\textwidth}}\n" + f"\t\t\\includegraphics[width=\\linewidth]{{{path}}}\n" + # f"\t\t\\caption{{{metric_name}}}\n" + # f"\t\t\\label{{fig:{path.replace('/', ':').rsplit(':', -1)[0]}}}\n" + f"\t\\end{{subfigure}}" + ) + + subfigures = "\n\t\\quad\n".join(group_subfigures) + + if len(captions) > 1: + caption_text = ", ".join(captions[:-1]) + " and " + captions[-1] + if len(captions) > 2: + # oxford comma + caption_text = caption_text.replace(" and ", ", and ") + elif len(captions) == 1: + caption_text = captions[0] + else: + breakpoint() + + if "train" in caption_text.lower(): + caption = caption_text + elif "perplexity" in subsection_name.lower(): + caption = f"Perplexity results on Paloma~\\citep{{paloma}}; subsets {caption_text}" + else: + caption = f"Results downstream tasks {caption_text}" + + figure = ( + f"\\begin{{figure}}[h!]\n" + f"\t\\centering\n" + f"{subfigures}\n" + f"\t\\caption{{{caption}}}\n" + f"\\end{{figure}}" + ) + current_section_components.append(figure) + + all_section_figures = "\n\n".join(current_section_components) + "\n" + # grouped_sections.setdefault(figure_group, []).append(all_section_figures) + current_section = ( + # f"\\subsection{{{subsection_name}}}\n" + f"\\label{{sec:{figure_group.replace('/', ':')}:{subgroup}}}\n\n" + f"{all_section_figures}\n" + ) + grouped_sections.setdefault(figure_group, []).append(current_section) + + for figure_group, sections in sorted(grouped_sections.items()): + sections = "\n\n".join(sections) + appendix_name = figure_group.replace("/", "_") + figure_group_title = abl_names[figure_group] + full_group = ( + f"\\subsection{{{figure_group_title}}}\n" + f"\\label{{sec:{figure_group.replace('/', ':')}}}\n\n" + f"{sections}\n" + f"\\clearpage\n" + ) + figure_group_path = destination / (appendix_name + ".tex") + figure_group_path.write_text(full_group) + + print(f"\\input{{{output_prefix}/{appendix_name}}}") + + +if __name__ == "__main__": + main() diff --git a/scripts/make_wikipedia.py b/scripts/make_wikipedia.py index 6001d138..a7d2c7c0 100644 --- a/scripts/make_wikipedia.py +++ b/scripts/make_wikipedia.py @@ -21,9 +21,8 @@ import requests import smart_open -from uniseg.wordbreak import words as 
uniseg_get_words - from dolma.core.parallel import BaseParallelProcessor, QueueType +from uniseg.wordbreak import words as uniseg_get_words CMD_INSTALL = "pip install git+https://github.com/santhoshtr/wikiextractor.git requests smart_open tqdm" diff --git a/scripts/remove_empty_docs.py b/scripts/remove_empty_docs.py index 3e12b786..c1da8e5e 100644 --- a/scripts/remove_empty_docs.py +++ b/scripts/remove_empty_docs.py @@ -7,7 +7,6 @@ from typing import Any, Tuple, Union import smart_open - from dolma.core.parallel import BaseParallelProcessor diff --git a/scripts/stack_correlation_table.py b/scripts/stack_correlation_table.py index f6f5dac1..f5711e32 100644 --- a/scripts/stack_correlation_table.py +++ b/scripts/stack_correlation_table.py @@ -12,14 +12,13 @@ halfway = len(paths) // 2 for i, lang in enumerate(paths): - all_data = [] for dataset in os.listdir(os.path.join(PATH, lang)): # Load json.gz data = pd.read_json(os.path.join(PATH, lang, dataset), lines=True) all_data.append(data) - + # Concatenate all dataframes all_data = pd.concat(all_data) # Cast to int @@ -33,7 +32,7 @@ corr = round(all_data["rpj"].corr(all_data["starcoder"]), 3) if i >= halfway: - TABLE_ROWS[i-halfway] += f" & {lang} & {rpj_flagged} & {sc_flagged} & {corr} \\\\" + TABLE_ROWS[i - halfway] += f" & {lang} & {rpj_flagged} & {sc_flagged} & {corr} \\\\" else: TABLE_ROWS.append(f"{lang} & {rpj_flagged} & {sc_flagged} & {corr}") # TABLE_ROWS.append(f"{lang} & {rpj_flagged} & {sc_flagged} & {corr} \\\\") diff --git a/scripts/tokenize_eval.sh b/scripts/tokenize_eval.sh new file mode 100644 index 00000000..cf8e31a6 --- /dev/null +++ b/scripts/tokenize_eval.sh @@ -0,0 +1,53 @@ +tokenizer="allenai/gpt-neox-olmo-dolma-v1_5-digits" +path="s3://ai2-llm/eval-data/perplexity" +suffix="gpt-neox-digits" + +v2_eval=( + "4chan" + "c4_100_domains" + "c4_en" + "gab" + "ice" + "m2d2_s2orc" + "m2d2_wiki" + "manosphere" + "mc4_en" + "pile" + "ptb" + "twitterAEE" + "wikitext_103" +) + +v3_eval=( + "c4_en" + "dolma_books" + "dolma_common-crawl" + "dolma_pes2o" + "dolma_reddit" + "dolma_stack" + "dolma_wiki" + "ice" + "m2d2_s2orc" + "pile" + "wikitext_103" +) + +set -ex + +for dataset in "${v2_eval[@]}"; do + for split in "val" "test"; do + dolma tokens \ + --tokenizer_name_or_path $tokenizer \ + --destination "${path}/v2_small_${suffix}/${dataset}/${split}" \ + --documents "${path}/v2_small/${dataset}/${split}/*.gz" & + done +done + +for dataset in "${v3_eval[@]}"; do + for split in "val" "test"; do + dolma tokens \ + --tokenizer_name_or_path $tokenizer \ + --destination "${path}/v3_small_${suffix}/${dataset}/${split}" \ + --documents "${path}/v3_small/${dataset}/${split}/*.gz" & + done +done diff --git a/scripts/wandb_run_vocab.yaml b/scripts/wandb_run_vocab.yaml index 666e2c8d..0cd89859 100644 --- a/scripts/wandb_run_vocab.yaml +++ b/scripts/wandb_run_vocab.yaml @@ -1,10 +1,14 @@ # Run Names -olmo-small-pile-fixed-*: The Pile -olmo-small-rpj-*: Red Pajama -olmo-small-dolma-*: Dolma +olmo-small-pile-fixed-*: Pile +olmo-small-rpj-*: Red Pajama v1 +olmo-small-dolma-*: Dolma v1.5 olmo-small-falcon-*: RefinedWeb +olmo-small-c4-*: C4 +olmo-small-mc4-*: mC4 (English) olmo-small-3T-lower-lr-tie_*: 1B Dolma Model +GPT-Neox-20B*: C4 Baseline + # Ablation for PII masking v1-small-pi-less-than-5-anonymize_*: PII Remove (>=5) + Mask (<5) v1-small-all-pi-removed_*: PII Remove All @@ -16,24 +20,43 @@ reddit-v1-ablation-pii-nsfw-toxic_filtered_*: PII + NSFW + Hate Filter reddit-v1-ablation-toxic-filtered_*: NSFW + Hate Filter # Data mixture 
-olmo-mix-v1-sample_*: Equal Data Mix -olmo-mix-v1-sample-mix2_*: "200% Wiki, 200% peS2o" -olmo-mix-v1-sample-all-cc*: "Only Common Crawl (CC)" -olmo-mix-v1-gopher-like_*: "20% Code, 60% CC, 200% peS2o" +olmo-mix-v1-sample_*: Naïve Mix +olmo-mix-v1-sample-mix2_*: "Reference+ Mix" +olmo-mix-v1-sample-all-cc*: "Web Only Mix" +olmo-mix-v1-gopher-like_*: "Gopher-like Mix" # Dedupe abl-cc-v2-small-dedup*: Paragraph Deduplication # Path to final cleanup -v1-small-c4-cleaned_\d+: C4 Rules (EoS) -v1-small-c4-filtered_\d+: C4 Rules (All) -v1-small-gopher-filtered_\d+: Gopher Rules (All) -v1-small-c4-cleaned-gopher-filtered_\d+: C4 (EoS), Gopher (All) -v1-small-c4-cleaned-gopher-filtered-deduped_\d+: C4 (EoS), Gopher (All), Dedup - +v1-small-c4-cleaned_*: C4 Rules (EoS) +v1-small-c4-filtered_*: C4 Rules (All) +v1-small-gopher-filtered_*: Gopher Rules (All) +v1-small-c4-cleaned-gopher-filtered_*: C4 (EoS), Gopher (All) +v1-small-c4-cleaned-gopher-filtered-deduped_*: C4 (EoS), Gopher (All), Dedup cc_quality: olmo-mix-v1-sample-all-cc*: C4 (EoS), Gopher (All), Dedup, PII, Toxic +cc_quality_only: + v1-small-c4-filtered_*: C4 All + v1-small-c4-cleaned_*: C4 NoPunc + v1-small-gopher-filtered_*: Gopher All + v1-small-c4-cleaned-gopher-filtered_*: C4 NoPunc + Gopher All + +cc_to_quality_plus_content: + v1-small-c4-cleaned-gopher-filtered_*: Quality Filters + v1-small-c4-cleaned-gopher-filtered-deduped_*: Quality Filters + Dedup + olmo-mix-v1-sample-all-cc*: Quality Filters + Dedup + Content Filters + +# Reddit ablations +reddit_selection: + + reddit-v5-ablation-filtered-gen-2_*: Atomic Content, Dedup, PII, Toxic + reddit-v3-ablation-base-*: Atomic Content + reddit-v2-ablation-base-*: Partial Threads, Dedup + reddit-v4-ablation-base-*: Complete Threads + reddit-v1-ablation-base_*: Partial Threads + # Code Experiments stack-v2*: Dolma (Stack v2) stack-v4*: Dolma (Stack v5) @@ -55,8 +78,8 @@ v1-small-nsfw-filtered-high*: NSFW Filter (High Threshold) # Training Metrics throughput/total_tokens: Total Tokens -train/Perplexity: Perplexity (Train) -train/CrossEntropyLoss: CE Loss (Train) +train/Perplexity: Train@@@Perplexity +train/CrossEntropyLoss: Train@@@CE # Downstream Metrics @@ -89,7 +112,7 @@ eval/v3-small-dolma_common-crawl-validation/Perplexity: Dolma (Common Crawl) eval/v2-small-gab-validation/Perplexity: GAB eval/v2-small-ice-validation/Perplexity: ICE eval/v2-small-ptb-validation/Perplexity: Penn Treebank -eval/v2-small-pile-validation/Perplexity: The Pile +eval/v2-small-pile-validation/Perplexity: Pile eval/v2-small-4chan-validation/Perplexity: 4chan eval/v2-small-c4_en-validation/Perplexity: C4 eval/v2-small-mc4_en-validation/Perplexity: mC4 (English) @@ -105,7 +128,7 @@ eval/v2-small-c4_100_domains-validation/Perplexity: C4 (100 Domains) eval/gab-validation/Perplexity: GAB eval/ice-validation/Perplexity: ICE eval/ptb-validation/Perplexity: Penn Treebank -eval/pile-validation/Perplexity: The Pile +eval/pile-validation/Perplexity: Pile eval/4chan-validation/Perplexity: 4chan eval/c4_en-validation/Perplexity: C4 eval/mc4_en-validation/Perplexity: mC4 (English) diff --git a/scripts/wandb_to_plot.py b/scripts/wandb_to_plot.py index e0248772..3b914919 100644 --- a/scripts/wandb_to_plot.py +++ b/scripts/wandb_to_plot.py @@ -3,7 +3,8 @@ import fnmatch import re from collections import defaultdict -from functools import partial +from functools import lru_cache, partial +from math import ceil, floor, log10 from pathlib import Path from statistics import stdev from typing import List, Optional, Sequence, Tuple 
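(Aside, not part of the patch: the second wandb_to_plot.py hunk below swaps the fnmatch.filter/re.search combination for a cached fnmatch.translate-based matcher. The snippet here is a minimal illustration of how a translated glob from wandb_run_vocab.yaml behaves against run names; the concrete run-name strings are invented for the example, only the glob pattern comes from the vocab file.)

import fnmatch
import re

# fnmatch.translate turns the shell-style glob into a regex anchored at the
# end of the string, so .search() accepts any run name in which the glob can
# match a trailing substring.
pattern = re.compile(fnmatch.translate("olmo-small-dolma-*"))
print(bool(pattern.search("olmo-small-dolma-2T-run")))   # True
print(bool(pattern.search("olmo-small-pile-fixed-1")))   # False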
@@ -16,9 +17,8 @@ ): import plotly.graph_objs as go import plotly.io as pio - import yaml - import wandb + import yaml pio.kaleido.scope.mathjax = None @@ -42,15 +42,19 @@ def parse_args(): ap.add_argument("-N", "--experiment-nickname", type=str, default=None, help="Experiment nickname") ap.add_argument("--plotly-theme", type=str, default="none", help="Plotly theme to use") ap.add_argument("--plotly-font-size", type=int, default=10, help="Plotly font size") - ap.add_argument("--plotly-show-title", action="store_true", help="Show plot title") ap.add_argument("--plotly-figure-width", type=int, default=800, help="Plotly figure width") ap.add_argument("--plotly-figure-height", type=int, default=500, help="Plotly figure height") return ap.parse_args() +@lru_cache(maxsize=1) +def translate_to_regex(s: str) -> re.Pattern: + return re.compile(fnmatch.translate(s)) + + def match_run_name(name: str, run_names: List[str]) -> Optional[str]: for run_name in run_names: - if fnmatch.filter([name], run_name) or re.search(run_name, name): + if translate_to_regex(run_name).search(name): return run_name return None @@ -93,16 +97,18 @@ def main(): **{k: v for k, v in vocabulary.get(opts.experiment_nickname, {}).items() if isinstance(v, str)}, } - metrics = defaultdict(lambda: {n: {"x": [], "y": []} for n in opts.wandb_names}) + metrics_values = defaultdict(lambda: {n: {"x": [], "y": []} for n in opts.wandb_names}) + metrics_names = defaultdict(lambda: {n: "" for n in opts.wandb_names}) run_name_matcher = partial(match_run_name, run_names=opts.wandb_names) print(f"Found {len(wb_runs)} matching runs in {wb_path}") for wb_run in wb_runs: plot_group_name = run_name_matcher(wb_run.name) - if plot_group_name is None: - raise ValueError(f"Could not find a name match for {wb_run.name}") + if plot_group_name is None: + print(f"WARNING: could not find a name match for {wb_run.name}") + continue print(f"Processing run {wb_run.name} into group {plot_group_name}") if opts.samples > 0: @@ -124,21 +130,42 @@ def main(): for y_axis in opts.y_axis: yaxis_pretty_name = vocabulary.get(y_axis, y_axis) + + inferred_metric_name = "" + if "perplexity" in y_axis.lower(): + inferred_metric_name = "Perplexity" + elif "_f1" in y_axis.lower(): + inferred_metric_name = "F1 Score" + elif "crossentropyloss" in y_axis.lower(): + inferred_metric_name = "Cross Entropy" + elif "downstream" in y_axis.lower(): + inferred_metric_name = "Accuracy" + + metric_name = vocabulary.get("metrics", {}).get(y_axis, inferred_metric_name) + metrics_names[yaxis_pretty_name][plot_group_name] = metric_name + for wb_step in history: loc = min(bisect.bisect_left(steps, wb_step["_step"]), len(x_axis) - 1) - metrics[yaxis_pretty_name][plot_group_name]["x"].append(x_axis[loc]) - metrics[yaxis_pretty_name][plot_group_name]["y"].append(wb_step[y_axis]) + metrics_values[yaxis_pretty_name][plot_group_name]["x"].append(x_axis[loc]) + metrics_values[yaxis_pretty_name][plot_group_name]["y"].append(wb_step[y_axis]) xaxis_pretty_name = vocabulary.get(opts.x_axis, opts.x_axis) - for y_axis, plot_groups in metrics.items(): + for y_axis, plot_groups in metrics_values.items(): fig = go.Figure() # these we figure out as we go use_y_log = opts.y_log_scale top_right_legend = False + metric_name = None + global_min_y = float("inf") + global_max_y = float("-inf") for run_name, run_data in plot_groups.items(): + if metric_name is not None: + assert metrics_names[y_axis][run_name] == metric_name, "Inconsistent metric names" + metric_name = metrics_names[y_axis][run_name] + if 
len(run_data["y"]) == 0: print(f"WARNING: skipping {run_name} because it has no data for {y_axis}") continue @@ -164,6 +191,10 @@ def main(): min_y = min([y for y in y if y > 0] or [1e-3]) # avoid diving by zero use_y_log = use_y_log or (max(y) / min_y > 100) + # keep track of global min and max + global_min_y = min(global_min_y, min(y)) + global_max_y = max(global_max_y, max(y)) + figure_run_name = vocabulary.get(run_name, run_name) fig.add_trace(go.Scatter(name=figure_run_name, x=x, y=y, mode="lines")) @@ -178,13 +209,12 @@ def main(): } fig.update_layout(legend=legend_config) - title_text = vocabulary.get(opts.experiment_nickname, opts.experiment_nickname) fig.update_layout( template=opts.plotly_theme, xaxis_title=xaxis_pretty_name, - yaxis_title=y_axis, + yaxis_title=metric_name, legend_title=opts.legend_title, - title_text=title_text if opts.plotly_show_title else None, + title_text=y_axis.split('@@@')[0], font=dict(size=opts.plotly_font_size), width=opts.plotly_figure_width, height=opts.plotly_figure_height, @@ -192,10 +222,20 @@ def main(): l=4 * opts.plotly_font_size, r=opts.plotly_font_size, b=4 * opts.plotly_font_size, - t=opts.plotly_font_size, + t=3 * opts.plotly_font_size, ), ) - fig.update_yaxes(type="log" if use_y_log else "linear") + if use_y_log: + steps = [] + for decade in range(ceil(global_max_y / global_min_y if global_min_y > 0 else log10(global_max_y))): + unit = 10**decade + start = max(unit, floor(global_min_y / unit) * unit) + end = min(10 ** (decade + 1), ceil(global_max_y / unit) * unit) + steps.extend(range(int(start), int(end) + unit, unit)) + + fig.update_yaxes(type="log") + fig.update_layout(yaxis={"tickmode": "array", "tickvals": steps}) + fig.update_xaxes(range=([0, opts.max_x_axis] if opts.max_x_axis is not None else None)) file_name = re.sub(r"\W+", "_", y_axis).lower().strip("_") diff --git a/scripts/wimbd_to_dolma.py b/scripts/wimbd_to_dolma.py index 6bd0d553..aa5d40ae 100644 --- a/scripts/wimbd_to_dolma.py +++ b/scripts/wimbd_to_dolma.py @@ -16,7 +16,6 @@ import msgspec import smart_open - from dolma.core.parallel import BaseParallelProcessor from dolma.core.paths import join_path from dolma.core.runtime import _make_paths_from_prefix diff --git a/src/deduper.rs b/src/deduper.rs index c026379d..968e388f 100644 --- a/src/deduper.rs +++ b/src/deduper.rs @@ -16,6 +16,7 @@ use crate::bloom_filter::BloomFilter; use crate::s3_util; use crate::shard::shard_config::WorkDirConfig; use crate::shard::{find_objects_matching_patterns, FileCache}; +use crate::wimbd::tokens::tokenize; use deduper_config::*; @@ -129,6 +130,9 @@ fn write_attributes( GzEncoder::new(tmp_output, Compression::default()), ); + let min_content_length = dedupe_config.min_length.unwrap_or(0); + let min_word_length = dedupe_config.min_words.unwrap_or(0); + for (line_number, line) in reader.lines().enumerate() { let line = match line { Ok(line) => line, @@ -162,14 +166,35 @@ fn write_attributes( .to_string() }; - if dedupe_config.skip_empty.unwrap_or(false) && document_key.trim().is_empty() { + if min_word_length > 0 { + // Split the text into words and check the number of words. 
+ let words = tokenize(&document_key); + if words.count() < min_word_length { + // skip documents with fewer than min_words words + attributes[&cfg.attribute_name] = Value::Array(Vec::new()); + } + } else if document_key.len() < min_content_length { + // skip length 0 documents + attributes[&cfg.attribute_name] = Value::Array(Vec::new()); + } else if dedupe_config.skip_empty.unwrap_or(false) + && document_key.trim().is_empty() + { // skip empty documents if dedupe_config.skip_empty is true // and the document key is empty after trimming (i.e., removing whitespace) - continue; + attributes[&cfg.attribute_name] = Value::Array(Vec::new()); } else { let dedupe_key = VecDeque::from([document_key.as_str()]); if bloom_filter.contains(&dedupe_key) { - attributes[&cfg.attribute_name] = Value::Bool(true); + // attributes[&cfg.attribute_name] = Value::Bool(true); + + let mut duplicate_docs_array = Vec::new(); + let attr = vec![ + Value::from(0), + Value::Number(document_key.len().into()), + Value::from(1), + ]; + duplicate_docs_array.push(Value::Array(attr)); + attributes[&cfg.attribute_name] = Value::Array(duplicate_docs_array); } else if !bloom_filter.read_only { bloom_filter.insert(&dedupe_key); } @@ -187,7 +212,6 @@ fn write_attributes( if text_length > 0 { // skip empty documents if text_length is 0 - for p in paragraphs { let par_start = offset; offset += p.chars().count(); @@ -196,7 +220,21 @@ fn write_attributes( } let par_end = offset; - if dedupe_config.skip_empty.unwrap_or(false) && p.trim().is_empty() { + if offset < min_content_length { + // skip length 0 paragraphs + continue; + } + if min_word_length > 0 { + // Split the text into words and check the number of words. + let words = tokenize(&p); + + if words.count() < min_word_length { + // skip documents with fewer than min_words words + continue; + } + } else if dedupe_config.skip_empty.unwrap_or(false) + && p.trim().is_empty() + { // skip empty paragraphs if dedupe_config.skip_empty is true // and the paragraph is empty after trimming (i.e., removing whitespace) continue; @@ -276,7 +314,8 @@ pub mod deduper_config { pub name: String, pub documents: Option, pub paragraphs: Option, - + pub min_length: Option, + pub min_words: Option, pub skip_empty: Option, } diff --git a/src/lib.rs b/src/lib.rs index 3add8972..f3fdb974 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,6 +6,7 @@ pub mod deduper; pub mod mixer; pub mod s3_util; pub mod shard; +pub mod wimbd; use crate::deduper::deduper_config::DeduperConfig; use crate::mixer::mixer_config::MixerConfig; diff --git a/src/s3_util.rs b/src/s3_util.rs index 6f97b9d1..492547d3 100644 --- a/src/s3_util.rs +++ b/src/s3_util.rs @@ -6,6 +6,7 @@ use aws_sdk_s3::error::ProvideErrorMetadata; use aws_sdk_s3::primitives::ByteStream; use aws_sdk_s3::Client as S3Client; use tokio::fs::File as TokioFile; +use tokio::time::Duration; // Split an s3:// url into a bucket and key pub fn split_url(s3_url: &str) -> Result<(&str, &str), &'static str> { @@ -35,31 +36,70 @@ pub async fn download_to_file( bucket: &str, key: &str, path: &Path, + max_attempts: Option, ) -> Result<(), io::Error> { - let result = s3_client - .get_object() - .bucket(bucket) - // the type `str` does not implement `Clone`, so calling `clone` on `&str` copies the reference, which does not do anything and can be removed - .key(key) - .send() - .await - .map_err(|e| { - io::Error::new( - io::ErrorKind::Other, - format!( - "Error downloading {}: {}", - key, - e.message().unwrap_or_default() - ), - ) - })?; - - 
std::fs::create_dir_all(path.parent().unwrap())?; - let mut file = TokioFile::create(path).await?; - let mut body = result.body.into_async_read(); - tokio::io::copy(&mut body, &mut file).await?; - - Ok(()) + // Default to no retries if max_attempts is not provided + let max_attempts: u8 = max_attempts.unwrap_or_else(|| 1); + + // Check that max_attempts is greater than 0 + if max_attempts == 0 { + return Err(io::Error::new( + io::ErrorKind::Other, + "max_attempts must be greater than 0", + )); + } + + let remote_path = format!("s3://{}/{}", bucket, key); + let local_path = path.to_str().unwrap_or_default(); + + for _attempt in 1..(max_attempts + 1) { + match s3_client.get_object().bucket(bucket).key(key).send().await { + Ok(response) => { + std::fs::create_dir_all(path.parent().unwrap())?; + let mut file = TokioFile::create(path).await?; + let mut body = response.body.into_async_read(); + tokio::io::copy(&mut body, &mut file).await?; + return Ok(()); + } + Err(error) => { + let error_message = error.message().unwrap_or_default(); + if _attempt == max_attempts { + log::error!( + "Failed LAST attempt {}/{} to download '{}' to '{}': {} ('{}')", + _attempt, + max_attempts, + remote_path, + local_path, + error, + error_message + ); + // This was the last attempt + break; + } else { + // short wait (1s) before retrying + log::warn!( + "Failed attempt {}/{} to download '{}' to '{}': {} ('{}'); will retry...", + _attempt, + max_attempts, + remote_path, + local_path, + error, + error_message + ); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + } + } + + // If we got here, all attempts failed + return Err(io::Error::new( + io::ErrorKind::Other, + format!( + "All {} attempts to download '{}' to '{}' failed", + max_attempts, remote_path, local_path + ), + )); } pub async fn upload_file( @@ -67,27 +107,71 @@ pub async fn upload_file( path: &Path, bucket: &str, key: &str, + max_attempts: Option, ) -> Result<(), io::Error> { - s3_client - .put_object() - .bucket(bucket) - // note: the type `str` does not implement `Clone`, so calling `clone` on `&str` copies the reference, which does not do anything and can be removed - .key(key) - .body(ByteStream::from_path(path).await?) - .send() - .await - .map_err(|e| { - io::Error::new( - io::ErrorKind::Other, - format!( - "Error uploading {}: {}", - key, - e.message().unwrap_or_default() - ), - ) - })?; - - Ok(()) + // Default to no retries if max_attempts is not provided + let max_attempts: u8 = max_attempts.unwrap_or_else(|| 1); + + // Check that max_attempts is greater than 0 + if max_attempts == 0 { + return Err(io::Error::new( + io::ErrorKind::Other, + "max_attempts must be greater than 0", + )); + } + + let remote_path = format!("s3://{}/{}", bucket, key); + let local_path = path.to_str().unwrap_or_default(); + + for _attempt in 1..(max_attempts + 1) { + match s3_client + .put_object() + .bucket(bucket) + .key(key) + .body(ByteStream::from_path(path).await?) 
+ .send() + .await + { + Ok(_) => return Ok(()), + Err(error) => { + let error_message = error.message().unwrap_or_default(); + if _attempt == max_attempts { + log::error!( + "Failed LAST attempt {}/{} to upload '{}' to '{}': {} ('{}')", + _attempt, + max_attempts, + local_path, + remote_path, + error, + error_message + ); + // This was the last attempt + break; + } else { + // short wait (1s) before retrying + log::warn!( + "Failed attempt {}/{} to upload '{}' to '{}': {} ('{}'); will retry...", + _attempt, + max_attempts, + local_path, + remote_path, + error, + error_message + ); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + } + } + + // If we got here, all attempts failed + return Err(io::Error::new( + io::ErrorKind::Other, + format!( + "All {} attempts to upload '{}' to '{}' failed", + max_attempts, remote_path, local_path + ), + )); } pub async fn object_size( @@ -246,6 +330,15 @@ mod test { false } + fn get_dolma_test_prefix() -> String { + let prefix = std::env::var_os("DOLMA_TESTS_S3_PREFIX") + .map(|var| var.to_str().unwrap().to_string()) + .unwrap_or_else(|| "s3://dolma-tests".to_string()); + + // remove any trailing slashes + return prefix.strip_suffix("/").unwrap_or(&prefix).to_string(); + } + fn compare_contents(expected: &str, actual: &str) { let expected_lines = BufReader::new(MultiGzDecoder::new( OpenOptions::new() @@ -328,39 +421,147 @@ mod test { .unwrap(); let s3_client = new_client(None)?; + let s3_prefix = get_dolma_test_prefix(); + let s3_dest = "/pretraining-data/tests/mixer/inputs/v0/documents/head/0000.json.gz"; + let s3_path = s3_prefix + s3_dest; + let (s3_bucket, s3_key) = split_url(s3_path.as_str()).unwrap(); + + // upload a file to s3 + let local_source_file = "tests/data/provided/documents/000.json.gz"; + rt.block_on(upload_file( + &s3_client, + Path::new(local_source_file), + s3_bucket, + s3_key, + Some(3), // number of attempts + ))?; + + // download the file back from s3 let local_output_file = "tests/work/output/pretraining-data/tests/mixer/inputs/v0/documents/head/0000.json.gz"; - let s3_path: &str = "pretraining-data/tests/mixer/inputs/v0/documents/head/0000.json.gz"; rt.block_on(download_to_file( &s3_client, - "ai2-llm", - s3_path, + s3_bucket, + // s3_path, + s3_key, Path::new(local_output_file), + Some(3), // number of attempts ))?; - compare_contents("tests/data/documents.json.gz", local_output_file); + // compare the contents of the two files + compare_contents(local_source_file, local_output_file); Ok(()) } + #[test] + fn test_failed_download_file() -> Result<(), io::Error> { + if skip_dolma_aws_tests() { + return Ok(()); + } + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + let s3_client = new_client(None)?; + + let s3_prefix = get_dolma_test_prefix(); + let s3_dest = "/foo/bar/baz.json.gz"; + let s3_path = s3_prefix + s3_dest; + let (s3_bucket, s3_key) = split_url(s3_path.as_str()).unwrap(); + + // download the file back from s3 + let local_output_file = "tests/work/foo/bar/bz.json.gz"; + + let resp_too_few_attempts: Result<(), io::Error> = rt.block_on(download_to_file( + &s3_client, + s3_bucket, + s3_key, + Path::new(local_output_file), + Some(0), // number of attempts + )); + assert!(resp_too_few_attempts.is_err()); + assert_eq!( + resp_too_few_attempts.unwrap_err().to_string(), + "max_attempts must be greater than 0" + ); + + let resp_no_such_location: Result<(), io::Error> = rt.block_on(download_to_file( + &s3_client, + s3_bucket, + s3_key, + Path::new(local_output_file), + 
Some(3), // number of attempts + )); + + assert!(resp_no_such_location.is_err()); + let exp_msg = format!( + "All 3 attempts to download '{}' to '{}' failed", + s3_path, local_output_file + ); + assert_eq!(resp_no_such_location.unwrap_err().to_string(), exp_msg); + Ok(()) + } + #[test] fn test_find_objects_matching_patterns() -> Result<(), io::Error> { if skip_dolma_aws_tests() { return Ok(()); } + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + let s3_client = new_client(None)?; + let s3_prefix = get_dolma_test_prefix(); + + let local_source_dir = "tests/data/expected"; + // iterate over the files in `tests/data/expected` and upload them to s3 + let entries = read_dir(local_source_dir)?; + for entry in entries { + let local_source_file = entry?.path(); + + // skip files not ending with .json.gz + if !local_source_file.to_str().unwrap().ends_with(".json.gz") { + continue; + } + + let s3_url = format!( + "{}/pretraining-data/tests/mixer/expected/{}", + s3_prefix, + local_source_file.file_name().unwrap().to_str().unwrap() + ); + let (s3_bucket, s3_key) = split_url(s3_url.as_str()).unwrap(); + rt.block_on(upload_file( + &s3_client, + Path::new(local_source_file.to_str().unwrap()), + s3_bucket, + s3_key, + Some(3), // number of attempts + ))?; + } + + // If we don't shutdown the runtime, the test will hang when running + // find_objects_matching_patterns. + // I'm not sure why this is the case. Need to read more. -@soldni + rt.shutdown_background(); - let patterns = - vec!["s3://ai2-llm/pretraining-data/tests/mixer/expected/*.json.gz".to_string()]; + let patterns = vec![format!( + "{}/{}", + s3_prefix, "pretraining-data/tests/mixer/expected/*.json.gz" + )]; let resp = find_objects_matching_patterns(&s3_client, &patterns).unwrap(); let mut matches: HashSet = HashSet::from_iter(resp.iter().map(|s| s.to_owned())); // list the contents of `tests/data/expected` and check that they match - let entries = read_dir("tests/data/expected")?; + let entries = read_dir(local_source_dir)?; for entry in entries { let remote_path = format!( - "s3://ai2-llm/pretraining-data/tests/mixer/expected/{}", + "{}/pretraining-data/tests/mixer/expected/{}", + s3_prefix, entry?.file_name().to_str().unwrap() ); matches.remove(&remote_path); diff --git a/src/shard.rs b/src/shard.rs index 4bff4f4d..5647cedb 100644 --- a/src/shard.rs +++ b/src/shard.rs @@ -23,6 +23,7 @@ pub struct Shard { pub filter: Option, pub span_replacements: Option>, pub discard_fields: Option>, + pub min_text_length: Option, } // A collection of paths to a document file and corresponding attribute files. 
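(Aside, not part of the diff: the shard.rs hunks around this point thread a new min_text_length option through the mixer output and attach a provenance string to every document that is written out. The rough Python sketch below mirrors that per-line behavior; the process_line helper and its arguments are hypothetical, while the "text"/"metadata"/"provenance" field names and the length check follow the Rust changes.)

import json

def process_line(raw_line: str, doc_file_name: str, line_number: int, min_text_length: int = 0):
    data = json.loads(raw_line)
    # Documents whose trimmed text is shorter than min_text_length are dropped
    # from the mixed shard (min_text_length defaults to 0, i.e. keep everything).
    if len(data["text"].strip()) < min_text_length:
        return None
    # Record where the document came from as "<input file name>:<line number>",
    # creating the metadata object if the document does not already have one.
    if not isinstance(data.get("metadata"), dict):
        data["metadata"] = {}
    data["metadata"]["provenance"] = f"{doc_file_name}:{line_number}"
    return json.dumps(data)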
@@ -78,12 +79,13 @@ impl Shard { "{}/{}-{:04}.json.gz", stream_config.output.path, stream_config.name, stream_shard_count ); - let shard = Shard { + let shard: Shard = Shard { inputs: shard_inputs.clone(), output: output.clone(), filter: stream_config.filter.clone(), span_replacements: stream_config.span_replacement.clone(), discard_fields: stream_config.output.discard_fields.clone(), + min_text_length: stream_config.output.min_text_length.clone(), }; shards.push(shard); stream_shard_count += 1; @@ -103,6 +105,7 @@ impl Shard { filter: stream_config.filter.clone(), span_replacements: stream_config.span_replacement.clone(), discard_fields: stream_config.output.discard_fields.clone(), + min_text_length: stream_config.output.min_text_length.clone(), }; shards.push(shard); stream_shard_count += 1; @@ -129,8 +132,9 @@ impl Shard { s3_client: Box::new(s3_util::new_client(None)?), work: work_dirs.clone(), }; + let min_text_length = self.min_text_length.clone().unwrap_or(0); - let output_path = cache.prepare_output(&self.output)?; + let output_path: PathBuf = cache.prepare_output(&self.output)?; { let output_file = OpenOptions::new() .read(false) @@ -350,12 +354,33 @@ impl Shard { data.as_object_mut().unwrap().remove(f); } - // TODO: add check to make sure that the text field is not empty. Something like - // if !data["text"].as_str().unwrap().is_empty() || skip_empty - // make it configurable and off by default - lines_written += 1; - serde_json::to_writer(&mut writer, &data)?; - writer.write_all(b"\n")?; + // length of text after cleanup + let curr_text_length: usize = data["text"].as_str().unwrap().trim().len(); + + // If min_text_length is not set, default to 0 + if curr_text_length >= min_text_length { + let provenance_string = Value::String(format!( + "{}:{}", + Path::new(&input_path.doc_path) + .file_name() + .unwrap() + .to_str() + .unwrap(), + line_number + )); + + // provenance string is assigned to a key of data["metadata"] + // if "metadata" is a key in data; otherwise, create "metadata" + // and add provenance to it + if !data["metadata"].is_object() { + data["metadata"] = Value::Object(serde_json::Map::new()); + } + data["metadata"]["provenance"] = provenance_string; + + lines_written += 1; + serde_json::to_writer(&mut writer, &data)?; + writer.write_all(b"\n")?; + } } } cache.finalize_input(&input_path.doc_path)?; @@ -407,6 +432,7 @@ pub mod shard_config { pub path: String, pub max_size_in_bytes: usize, pub discard_fields: Option>, + pub min_text_length: Option, } #[derive(Serialize, Deserialize, Clone)] @@ -528,6 +554,7 @@ impl FileCache { bucket, key, &path, + Some(3), // retry twice if fail ))?; Ok(path.clone()) } else { @@ -580,7 +607,13 @@ impl FileCache { .enable_all() .build() .unwrap(); - rt.block_on(s3_util::upload_file(&self.s3_client, &path, bucket, key))?; + rt.block_on(s3_util::upload_file( + &self.s3_client, + &path, + bucket, + key, + Some(3), // retry twice if fail + ))?; std::fs::remove_file(&path)?; { // Create empty file to indicate that the shard is done. diff --git a/src/wimbd/io.rs b/src/wimbd/io.rs new file mode 100644 index 00000000..fcd32314 --- /dev/null +++ b/src/wimbd/io.rs @@ -0,0 +1,64 @@ +//! Code imported from github.com/allenai/wimbd/blob/main/src/io.rs +//! and modified by @soldni to integrate in dolma. +//! +//! IO helpers. + +use std::{ + fs::File, + io::{self, prelude::*}, + rc::Rc, +}; + +use anyhow::Result; +use flate2::read::MultiGzDecoder; + +/// A buffered reader for gzip files. 
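+/// Lines are yielded as reference-counted strings so the internal buffer can be
+/// reused across iterations whenever the caller has already dropped its handle
+/// before the next `read_line` call.
+///
+/// Illustrative usage sketch (not part of the original wimbd code; assumes the
+/// gzip file below exists, as it does in the test data):
+///
+/// ```ignore
+/// let reader = GzBufReader::open("tests/data/provided/documents/000.json.gz")?;
+/// for line in reader {
+///     let line = line?; // one line per item, trailing newline included
+///     println!("{}", line.trim_end());
+/// }
+/// ```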
+pub struct GzBufReader { + reader: io::BufReader>, + buf: Rc, +} + +fn new_buf() -> Rc { + Rc::new(String::with_capacity(2048)) +} + +impl GzBufReader { + // TODO: remove once open is used. + #[allow(dead_code)] + pub fn open(path: impl AsRef) -> Result { + let reader = io::BufReader::new(MultiGzDecoder::new(File::open(path)?)); + let buf = new_buf(); + + Ok(Self { reader, buf }) + } +} + +type DataIteratorItem = io::Result>; + +impl Iterator for GzBufReader { + type Item = DataIteratorItem; + + fn next(&mut self) -> Option { + let buf = match Rc::get_mut(&mut self.buf) { + Some(buf) => { + buf.clear(); + buf + } + None => { + self.buf = new_buf(); + Rc::make_mut(&mut self.buf) + } + }; + + self.reader + .read_line(buf) + .map(|u| { + if u == 0 { + None + } else { + Some(Rc::clone(&self.buf)) + } + }) + .transpose() + } +} diff --git a/src/wimbd/mod.rs b/src/wimbd/mod.rs new file mode 100644 index 00000000..8a4e0d3e --- /dev/null +++ b/src/wimbd/mod.rs @@ -0,0 +1,6 @@ +//! Code imported from github.com/allenai/wimbd/blob/main/src/io.rs +//! and modified by @soldni to integrate in dolma. + +mod io; +pub mod ngrams; +pub mod tokens; diff --git a/src/wimbd/ngrams/counter.rs b/src/wimbd/ngrams/counter.rs new file mode 100644 index 00000000..e1902758 --- /dev/null +++ b/src/wimbd/ngrams/counter.rs @@ -0,0 +1,213 @@ +//! Code imported from github.com/allenai/wimbd/blob/main/src/ngrams/counter.rs +//! and modified by @soldni to integrate in dolma. + +use std::hash::{BuildHasher, Hash, Hasher}; +use std::sync::atomic::Ordering; + +use ahash::RandomState; +use anyhow::{Context, Result}; +use atomic_traits::{Atomic, NumOps}; +use num_traits::{Bounded, NumCast, One, SaturatingSub, Zero}; + +pub trait AsIterator<'a, T: 'a> { + type Iterator: Iterator; + + fn as_iter(&'a self) -> Self::Iterator; +} + +impl<'a, T: 'a> AsIterator<'a, T> for [T] { + type Iterator = std::slice::Iter<'a, T>; + + fn as_iter(&'a self) -> Self::Iterator { + self.iter() + } +} + +// NOTE: this implementation conflicts with the below for VecDeque. +// impl<'a, U, T: 'a> AsIterator<'a, T> for U +// where +// U: AsRef<[T]>, +// { +// type Iterator = std::slice::Iter<'a, T>; + +// fn as_iter(&'a self) -> Self::Iterator { +// self.as_ref().iter() +// } +// } + +impl<'a, T: 'a> AsIterator<'a, T> for std::collections::VecDeque { + type Iterator = std::collections::vec_deque::Iter<'a, T>; + + fn as_iter(&'a self) -> Self::Iterator { + self.iter() + } +} + +/// A thread-safe counting Bloom filter for ngrams. +pub struct NgramCounter +where + A: Atomic + NumOps, + ::Type: Zero + One + Bounded + NumCast + Ord + SaturatingSub + Clone, +{ + size: usize, + num_hash_functions: usize, + hash_builders: Vec, + count_array: Vec, +} + +impl NgramCounter +where + A: Atomic + NumOps, + ::Type: Zero + One + Bounded + NumCast + Ord + SaturatingSub + Clone, +{ + /// Create a new counter with a hash table of `size` elements, initialized to `initial_value`. + pub fn new( + size: usize, + num_hash_functions: usize, + seed: Option, + initial_value: ::Type, + ) -> Result { + // Initialize count table + let mut count_array = Vec::new(); + count_array.try_reserve_exact(size).with_context(|| { + "Failed to allocate counts array. 
You may not have enough available memory.".to_string() + })?; + for _ in 0..size { + count_array.push(A::new(initial_value.clone())); + } + + // Initialize hash builders + let mut hash_builders = Vec::with_capacity(num_hash_functions); + for i in 0..num_hash_functions { + let hash_builder = match seed { + Some(seed) => RandomState::with_seed((seed as usize) + i), + None => RandomState::new(), + }; + hash_builders.push(hash_builder); + } + + Ok(Self { + size, + num_hash_functions, + hash_builders, + count_array, + }) + } + + /// Returns the number of non-zero elements in the hash table. + pub fn nonzero(&self) -> u64 { + let mut nonzero_count: u64 = 0; + let zero = ::Type::zero(); + for item in &self.count_array { + if item.load(Ordering::Relaxed) > zero { + nonzero_count += 1; + } + } + nonzero_count + } + + /// Increment the count for an ngram. + pub fn increment<'a, N, I, T>( + &self, + ngram: &'a N, + by: ::Type, + ) -> ::Type + where + N: AsIterator<'a, T, Iterator = I> + ?Sized, + I: Iterator, + T: 'a + Hash, + { + let mut min_count = ::Type::max_value(); + for i in 0..self.num_hash_functions { + let hash = self.hash(&mut ngram.as_iter(), i); + let index = self.index_for_hash(hash); + let old_count = self.count_array[index].fetch_add(by.clone(), Ordering::Relaxed); + let count = if old_count > ::Type::max_value() - by.clone() { + // Catch overflows and just keep as MAX. + self.count_array[index].store(::Type::max_value(), Ordering::Relaxed); + ::Type::max_value() + } else { + old_count + by.clone() + }; + min_count = std::cmp::min(min_count, count); + } + min_count + } + + /// Decrement the count for an ngram. + pub fn decrement<'a, N, I, T>( + &self, + ngram: &'a N, + by: ::Type, + ) -> ::Type + where + N: AsIterator<'a, T, Iterator = I> + ?Sized, + I: Iterator, + T: 'a + Hash, + { + let mut max_count = ::Type::zero(); + for i in 0..self.num_hash_functions { + let hash = self.hash(&mut ngram.as_iter(), i); + let index = self.index_for_hash(hash); + let old_count = self.count_array[index].fetch_sub(by.clone(), Ordering::Relaxed); + let count = if old_count < by { + // Catch underflows and just keep as 0. + self.count_array[index].store(::Type::zero(), Ordering::Relaxed); + ::Type::zero() + } else { + old_count - by.clone() + }; + max_count = std::cmp::max(max_count, count); + } + max_count + } + + /// Get the max count for an ngram across all hash functions. 
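+    /// Each cell in the table may be shared with other ngrams that hash to the
+    /// same index, so the value returned here can over-count; as long as counts
+    /// only ever increase it is an upper bound on the true count. For example,
+    /// after `counter.increment(&["hi", "there"][..], 1)` on a counter created
+    /// with `initial_value = 0`, this returns at least 1 for that ngram.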
+ pub fn max_count<'a, N, I, T>(&self, ngram: &'a N) -> ::Type + where + N: AsIterator<'a, T, Iterator = I> + ?Sized, + I: Iterator, + T: 'a + Hash, + { + let mut max_count = ::Type::zero(); + for i in 0..self.num_hash_functions { + let hash = self.hash(&mut ngram.as_iter(), i); + let index = self.index_for_hash(hash); + let count = self.count_array[index].load(Ordering::Relaxed); + max_count = std::cmp::max(max_count, count); + } + max_count + } + + fn hash(&self, ngram: &mut I, hasher: usize) -> usize + where + I: Iterator + ?Sized, + T: Hash, + { + let mut hasher = self.hash_builders[hasher].build_hasher(); + for token in ngram { + token.hash(&mut hasher); + } + hasher.finish().try_into().unwrap() + } + + fn index_for_hash(&self, hash: usize) -> usize { + hash % self.size + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::VecDeque; + use std::sync::atomic::AtomicU32; + + #[test] + fn test_counter() { + let counter = NgramCounter::::new(64, 4, Some(1), 0).unwrap(); + counter.increment(&["hi", "there"][..], 1); + + let deque = VecDeque::from(["hello", "world"]); + counter.increment(&deque, 1); + } +} diff --git a/src/wimbd/ngrams/mod.rs b/src/wimbd/ngrams/mod.rs new file mode 100644 index 00000000..30364d5b --- /dev/null +++ b/src/wimbd/ngrams/mod.rs @@ -0,0 +1,176 @@ +//! Code imported from github.com/allenai/wimbd/blob/main/src/ngrams/mod.rs +//! and modified by @soldni to integrate in dolma. +//! Utilities for working with and counting ngrams. + +use std::collections::VecDeque; +use std::fmt; + +use anyhow::Result; + +mod counter; +mod topk; + +pub use counter::NgramCounter; +pub use topk::TopKNgrams; + +use crate::wimbd::tokens::{tokenize, PretrainedTokenizer}; + +/// A helper function to quickly create an [`Ngram`] iterator given some text and a tokenizer. +pub fn ngrams<'a>( + text: &'a str, + num: usize, + tokenizer: &Option, +) -> Result> { + if let Some(tokenizer) = tokenizer { + Ok(tokenizer.tokenize(text)?.into_iter().ngrams(num)) + } else { + Ok(tokenize(text).map(|s| s.to_string()).ngrams(num)) + } +} + +// Ngram code here adapted from https://docs.rs/ngrams/latest/ngrams/index.html, which has a bug. + +/// A trait for iterators that gives an [`ngrams`] method. +pub trait Ngram<'a, T: 'a + fmt::Debug + Clone>: Iterator +where + Self: Sized, +{ + fn ngrams(self, n: usize) -> Ngrams<'a, T>; +} + +impl<'a, T: 'a + fmt::Debug + Clone, U: 'a + Iterator> Ngram<'a, T> for U { + fn ngrams(self, n: usize) -> Ngrams<'a, T> { + Ngrams::new(self, n) + } +} + +/// The iterator type created from [`Ngram::ngrams`]. +pub struct Ngrams<'a, T: 'a + fmt::Debug + Clone> { + source: Box + 'a>, + num: usize, + memsize: usize, + memory: VecDeque, +} + +impl<'a, T: 'a + fmt::Debug + Clone> fmt::Debug for Ngrams<'a, T> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Ngrams(tokens, N)") + } +} + +impl<'a, T: 'a + fmt::Debug + Clone + Sized> Ngrams<'a, T> { + /// The source for the `Ngrams` is expected to be pre-tokenized, this library + /// does not make any decisions regarding how your input should be tokenized. 
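+    /// The iterator keeps the previous `n - 1` tokens in a `VecDeque`; each call
+    /// to `next()` copies that window, appends the next token from the source,
+    /// and slides the window forward by one. `n` must be at least 1, since
+    /// `n - 1` is computed on a `usize`.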
+ pub(crate) fn new>(source: V, n: usize) -> Ngrams<'a, T> { + let memsize = n - 1; + Ngrams { + source: Box::new(source), + num: n, + memsize, + memory: VecDeque::with_capacity(memsize), + } + } + + fn fill_memory(&mut self) { + while self.memory.len() < self.memsize { + if let Some(a) = self.source.next() { + self.memory.push_back(a); + } else { + break; + }; + } + } +} + +impl<'a, T: 'a + fmt::Debug + Clone> Iterator for Ngrams<'a, T> { + type Item = Vec; + + fn next(&mut self) -> Option { + if self.num > 1 { + self.fill_memory(); + + self.source.next().map(|n| { + let mut result = Vec::with_capacity(self.num); + + for elem in &self.memory { + result.push(elem.clone()); + } + + result.push(n.clone()); + + let _ = self.memory.pop_front(); + self.memory.push_back(n); + + result + }) + } else { + self.source.next().map(|n| { + let mut result = Vec::with_capacity(self.num); + result.push(n); + result + }) + } + } +} + +#[cfg(test)] +mod tests { + + use super::{Ngram, Ngrams}; + use std::string::ToString; + + #[test] + fn test_words_iter_adaptor() { + let result: Vec<_> = "one two three four five".split(' ').ngrams(4).collect(); + assert_eq!( + result, + vec![ + vec!["one", "two", "three", "four"], + vec!["two", "three", "four", "five"], + ] + ); + } + + #[test] + fn test_words() { + let seq = "one two three four".split(' '); + let result: Vec<_> = Ngrams::new(seq, 2).collect(); + assert_eq!( + result, + vec![ + vec!["one", "two"], + vec!["two", "three"], + vec!["three", "four"], + ] + ); + } + + #[test] + fn test_unigrams() { + let seq = "one two three four".split(' '); + let result: Vec<_> = Ngrams::new(seq, 1).collect(); + assert_eq!( + result, + vec![vec!["one"], vec!["two"], vec!["three"], vec!["four"],] + ); + } + + #[test] + fn test_chars() { + let seq = "test string".chars().map(|c| c.to_string()); + let result: Vec<_> = Ngrams::new(seq, 4).collect(); + assert_eq!( + result, + vec![ + vec!["t", "e", "s", "t"], + vec!["e", "s", "t", " "], + vec!["s", "t", " ", "s"], + vec!["t", " ", "s", "t"], + vec![" ", "s", "t", "r"], + vec!["s", "t", "r", "i"], + vec!["t", "r", "i", "n"], + vec!["r", "i", "n", "g"], + ] + ); + } +} diff --git a/src/wimbd/ngrams/topk.rs b/src/wimbd/ngrams/topk.rs new file mode 100644 index 00000000..56212fbc --- /dev/null +++ b/src/wimbd/ngrams/topk.rs @@ -0,0 +1,137 @@ +//! Code imported from github.com/allenai/wimbd/blob/main/src/ngrams/topk.rs +//! and modified by @soldni to integrate in dolma. + +use std::collections::{BTreeSet, HashMap}; +use std::hash::Hash; +use std::rc::Rc; +use std::sync::atomic::Ordering; +use std::sync::Arc; + +use ahash::RandomState; +use atomic_traits::{Atomic, NumOps}; +use num_traits::One; + +/// A collection for tracking the top-k ngrams in a corpus. 
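+/// Internally this pairs a `BTreeSet` ordered by `(count, ngram)` with a map
+/// from ngram to its latest count, so once more than `k` entries are held the
+/// smallest one can be evicted cheaply with `pop_first`. The current cutoff is
+/// kept in `min_count` and mirrored into an atomic exposed by `min_count()`, so
+/// callers can check the threshold without borrowing the collection.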
+pub struct TopKNgrams +where + T: Ord + Clone + Sized, + A: Atomic + NumOps, + ::Type: One + Ord + Clone + Copy, +{ + k: usize, + topk: BTreeSet<(::Type, Rc>)>, + ngrams: HashMap>, ::Type, RandomState>, + pub(crate) min_count: ::Type, + min_count_atomic: Arc, +} + +impl TopKNgrams +where + T: Ord + Clone + Sized + Hash, + A: Atomic + NumOps, + ::Type: One + Ord + Clone + Copy, +{ + pub fn new(k: usize) -> Self { + Self { + k, + topk: BTreeSet::new(), + ngrams: HashMap::with_capacity_and_hasher(k + 1, RandomState::new()), + min_count: ::Type::one(), + min_count_atomic: Arc::new(::new(::Type::one())), + } + } + + pub fn min_count(&self) -> Arc { + self.min_count_atomic.clone() + } + + pub fn insert(&mut self, ngram: Vec, count: ::Type) { + if count >= self.min_count { + let ngram = Rc::new(ngram); + + if let Some(old_count) = self.ngrams.get_mut(&ngram) { + if count <= *old_count { + // Nothing to do, return early + return; + } + + // Update existing count for ngram. + self.topk.remove(&(*old_count, ngram.clone())); + *old_count = count; + } else { + self.ngrams.insert(ngram.clone(), count); + } + + self.topk.insert((count, ngram.clone())); + } + + // Update min count if needed. + let mut update_min_count = false; + while self.topk.len() > self.k { + let (_, ngram) = self.topk.pop_first().unwrap(); + self.ngrams.remove(&ngram); + update_min_count = true; + } + if update_min_count { + if let Some((new_min_count, _)) = self.topk.first() { + if *new_min_count != self.min_count { + self.min_count = *new_min_count; + self.min_count_atomic + .store(*new_min_count, Ordering::Relaxed); + } + } + } + } + + pub fn drain(&mut self) -> Vec<(Rc>, ::Type)> { + let mut out: Vec<(Rc>, ::Type)> = Vec::with_capacity(self.k); + while let Some((count, ngram)) = self.topk.pop_last() { + self.ngrams.remove(&ngram); + out.push((ngram, count)) + } + self.min_count = ::Type::one(); + self.min_count_atomic + .store(::Type::one(), Ordering::Relaxed); + out + } +} + +#[cfg(test)] +mod tests { + use std::rc::Rc; + use std::sync::atomic::AtomicU32; + + use super::TopKNgrams; + + #[test] + fn test_adding_same_ngram_multiple_times() { + let mut topk: TopKNgrams = TopKNgrams::new(3); + + let ngram1 = vec!["foo".into(), "bar".into()]; + let ngram2 = vec!["bar".into(), "baz".into()]; + let ngram3 = vec!["baz".into(), "foo".into()]; + + // Insert 3 unique ngrams. + topk.insert(ngram1.clone(), 3); + topk.insert(ngram2, 2); + topk.insert(ngram3, 1); + + // Now try inserting a duplicate. + topk.insert(ngram1.clone(), 3); + assert_eq!(topk.ngrams.len(), 3); + assert_eq!(topk.topk.len(), 3); + assert_eq!(topk.ngrams.get(&Rc::new(ngram1.clone())), Some(&3)); + + // And insert the same ngram with a new count. + topk.insert(ngram1.clone(), 4); + assert_eq!(topk.ngrams.len(), 3); + assert_eq!(topk.topk.len(), 3); + assert_eq!(topk.ngrams.get(&Rc::new(ngram1.clone())), Some(&4)); + + // And insert the same ngram with a lower count. + topk.insert(ngram1.clone(), 2); + assert_eq!(topk.ngrams.len(), 3); + assert_eq!(topk.topk.len(), 3); + assert_eq!(topk.ngrams.get(&Rc::new(ngram1)), Some(&4)); + } +} diff --git a/src/wimbd/progress.rs b/src/wimbd/progress.rs new file mode 100644 index 00000000..8afc141f --- /dev/null +++ b/src/wimbd/progress.rs @@ -0,0 +1,69 @@ +//! Code imported from github.com/allenai/wimbd/blob/main/src/io.rs +//! and modified by @soldni to integrate in dolma. 
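+//!
+//! Thin wrappers around `indicatif`: a `MultiProgress` draw target, a per-file
+//! progress bar, and a per-path bar (or spinner when no limit is known), all of
+//! which render to stderr or can be hidden entirely.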
+ + +use anyhow::Result; +use indicatif::{ProgressDrawTarget, ProgressStyle}; + +pub(crate) use indicatif::{MultiProgress, ProgressBar, ProgressIterator}; + +pub(crate) fn get_multi_progress_bar(hidden: bool) -> MultiProgress { + if !hidden { + MultiProgress::with_draw_target(ProgressDrawTarget::stderr_with_hz(2)) + } else { + MultiProgress::with_draw_target(ProgressDrawTarget::hidden()) + } +} + +pub(crate) fn get_file_progress_bar( + msg: &'static str, + n_files: usize, + hidden: bool, +) -> Result { + let progress = ProgressBar::new(n_files.try_into()?) + .with_style( + ProgressStyle::with_template( + "{msg}: files {human_pos}/{human_len} [{elapsed_precise}] [{wide_bar:.cyan/blue}]", + )? + .progress_chars("#>-"), + ) + .with_message(msg); + if hidden { + progress.set_draw_target(ProgressDrawTarget::hidden()); + } else { + progress.set_draw_target(ProgressDrawTarget::stderr_with_hz(1)); + progress.enable_steady_tick(std::time::Duration::from_secs(1)); + } + Ok(progress) +} + +pub(crate) fn get_progress_bar( + path: impl AsRef, + limit: Option, + hidden: bool, +) -> Result { + let progress: ProgressBar = if let Some(limit) = limit { + ProgressBar::new(limit.try_into()?).with_style( + ProgressStyle::with_template( + "{msg:<35!} {human_pos} [{wide_bar:.cyan/blue}] {per_sec:>12}, <{eta:<3} ", + )? + .progress_chars("#>-"), + ) + } else { + ProgressBar::new_spinner().with_style(ProgressStyle::with_template( + "{msg:<35!} {spinner:.green} {human_pos} {per_sec:12}", + )?) + } + .with_message(format!( + "{}:", + path.as_ref().file_name().unwrap().to_string_lossy() + )); + + if hidden { + progress.set_draw_target(ProgressDrawTarget::hidden()); + } else { + progress.set_draw_target(ProgressDrawTarget::stderr_with_hz(1)); + } + + Ok(progress) +} diff --git a/src/wimbd/tokens.rs b/src/wimbd/tokens.rs new file mode 100644 index 00000000..9677272d --- /dev/null +++ b/src/wimbd/tokens.rs @@ -0,0 +1,136 @@ +//! Code imported from github.com/allenai/wimbd/blob/main/src/io.rs +//! and modified by @soldni to integrate in dolma. +//! +//! Tokenizer classes and functions. + +use anyhow::{anyhow, Result}; +use tokenizers::tokenizer::Tokenizer; +use unicode_segmentation::UnicodeSegmentation; + +/// Tokenize a string using a basic unicode tokenizer. +pub fn tokenize(s: &str) -> impl Iterator { + s.split_word_bounds().filter(|w| { + for c in w.chars() { + if !c.is_whitespace() { + return true; + } + } + false + }) +} + +/// A wrapper class for HuggingFace tokenizers. +#[derive(Debug, Clone)] +pub struct PretrainedTokenizer(Tokenizer); + +impl PretrainedTokenizer { + pub fn tokenize(&self, text: &str) -> Result> { + Ok(self + .0 + .encode(text, false) + .map_err(|err| anyhow!("{}", err))? + .get_tokens() + .to_vec()) + } + + /// Initialize a new pretrained tokenizer from a path or identifier on HuggingFace. 
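+    ///
+    /// Illustrative sketch (not from the original wimbd code; assumes network
+    /// access and that the `gpt2` tokenizer is available on the HuggingFace hub):
+    ///
+    /// ```ignore
+    /// let tokenizer = PretrainedTokenizer::new("gpt2")?;
+    /// let tokens = tokenizer.tokenize("Hello world")?;
+    /// assert!(!tokens.is_empty());
+    /// ```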
+ pub fn new(name: &str) -> Result { + Ok(PretrainedTokenizer( + Tokenizer::from_pretrained(name, None) + .map_err(|err| anyhow!("Failed to load pretrained tokenizer {} - {}", name, err))?, + )) + } + + pub fn decode(&self, tokens: &[String]) -> Result { + let ids: Vec = tokens + .iter() + .filter_map(|t| self.0.token_to_id(t)) + .collect(); + self.0.decode(&ids, true).map_err(|err| anyhow!("{}", err)) + } +} + +#[cfg(test)] +mod tests { + use super::tokenize; + use crate::wimbd::ngrams::Ngram; + + #[test] + fn test_tokenize_and_ngrams() { + let s = "You can follow any responses to this entry through the RSS 2.0 feed"; + let tokens = tokenize(s).collect::>(); + assert_eq!( + tokens, + vec![ + "You", + "can", + "follow", + "any", + "responses", + "to", + "this", + "entry", + "through", + "the", + "RSS", + "2.0", + "feed" + ] + ); + + let ngrams = tokenize(s).ngrams(10).collect::>>(); + assert_eq!( + ngrams, + vec![ + vec![ + "You", + "can", + "follow", + "any", + "responses", + "to", + "this", + "entry", + "through", + "the", + ], + vec![ + "can", + "follow", + "any", + "responses", + "to", + "this", + "entry", + "through", + "the", + "RSS", + ], + vec![ + "follow", + "any", + "responses", + "to", + "this", + "entry", + "through", + "the", + "RSS", + "2.0", + ], + vec![ + "any", + "responses", + "to", + "this", + "entry", + "through", + "the", + "RSS", + "2.0", + "feed", + ], + ] + ); + } +} diff --git a/src/wimbd/util.rs b/src/wimbd/util.rs new file mode 100644 index 00000000..284bb7e0 --- /dev/null +++ b/src/wimbd/util.rs @@ -0,0 +1,28 @@ +//! Code imported from github.com/allenai/wimbd/blob/main/src/io.rs +//! and modified by @soldni to integrate in dolma. + +use anyhow::{bail, Result}; + +use std::fs::{self, File}; +use std::path::{Path, PathBuf}; + +pub(crate) fn get_output_file(path: impl AsRef, force: bool) -> Result<(File, PathBuf)> { + let path = path.as_ref(); + + if path.is_file() { + if force { + log::warn!("Overwriting output file {:?}", path); + } else { + bail!( + "Output file {:?} already exists, use --force to overwrite", + path + ); + } + Ok((File::options().write(true).open(path)?, path.into())) + } else { + if let Some(parent) = path.parent() { + fs::create_dir_all(parent)?; + } + Ok((File::create(path)?, path.into())) + } +} diff --git a/tests/data/expected/dedupe-by-url.json.gz b/tests/data/expected/dedupe-by-url.json.gz index b650304c..50903087 100644 Binary files a/tests/data/expected/dedupe-by-url.json.gz and b/tests/data/expected/dedupe-by-url.json.gz differ diff --git a/tests/python/__init__.py b/tests/python/__init__.py index 447cef2f..eeba8de3 100644 --- a/tests/python/__init__.py +++ b/tests/python/__init__.py @@ -4,3 +4,11 @@ warnings.filterwarnings("ignore", message=r".*declare_namespace\(\'.*google.*", category=DeprecationWarning) # base warning raised when warning above are raised warnings.filterwarnings("ignore", message=r".*pkg_resources is deprecated.*", category=DeprecationWarning) + +# ignore warning from packages that have not updated to use utcfromtimestamp +for module in ("botocore", "tqdm", "dateutil"): + warnings.filterwarnings("ignore", module=module, message=r".*utcfromtimestamp\(\) is deprecated.*") + warnings.filterwarnings("ignore", module=module, message=r".*utcnow\(\) is deprecated.*") + +# ignore type annotation errors in this package +warnings.filterwarnings("ignore", message=r".*google\._upb\._message.*", category=DeprecationWarning) diff --git a/tests/python/test_deduper.py b/tests/python/test_deduper.py index 243198cc..cf91c1d4 100644 
--- a/tests/python/test_deduper.py +++ b/tests/python/test_deduper.py @@ -8,6 +8,7 @@ from dolma.cli.__main__ import main from .utils import ( + TestCasePipeline, clean_test_data, download_s3_prefix, get_test_prefix, @@ -104,3 +105,69 @@ def test_dedupe_by_url_remote_input(self): expected = load_jsonl("tests/data/expected/dedupe-by-url.json.gz") computed = load_jsonl(f"{self.local_temp_dir}/tests/data/provided/attributes/dedupe_by_url/000.json.gz") self.assertEqual(expected, computed) + + +class TestDeduperPipeline(TestCasePipeline): + def test_skip_empty(self): + duplicate_text = "More text" + documents = [ + self.combineIntoDoc("Short document", "", duplicate_text), + self.combineIntoDoc("Short document #2", "", duplicate_text), + ] + + docs_fp = self.writeDocs(documents) + key_name = "dedupe_paragraphs" + attribute_name = "bff_duplicate_paragraph_spans" + + config = { + "documents": docs_fp, + "dedupe": { + "name": key_name, + "paragraphs": {"attribute_name": attribute_name}, + "skip_empty": True, + }, + "bloom_filter": { + "file": self.makeUniquePath(), + "read_only": False, + "estimated_doc_count": 100, + "desired_false_positive_rate": 1e-06, + }, + "processes": 1, + } + + config_path = self.writeConfig(config) + + main(argv=["-c", config_path, "dedupe"]) + + expected = self.readUnits([p.replace("documents", f"attributes/{key_name}") for p in docs_fp]) + self.assertEqual(len(expected), 2) + + # no duplicate on first doc + self.assertIn("attributes", expected[0]) + self.assertIn(attribute_name, expected[0]["attributes"]) + self.assertEqual(expected[0]["attributes"][attribute_name], []) + + # duplicate on second doc + self.assertIn("attributes", expected[1]) + self.assertIn(attribute_name, expected[1]["attributes"]) + self.assertEqual(len(expected[1]["attributes"][attribute_name]), 1) + (start, end, score), *_ = expected[1]["attributes"][attribute_name] + self.assertEqual(documents[1][start:end], duplicate_text) + self.assertEqual(score, 1.0) + + # now let's not skip empty docs + config["dedupe"]["skip_empty"] = False + config["bloom_filter"]["file"] = self.makeUniquePath() # new filter + config_path = self.writeConfig(config) + + main(argv=["-c", config_path, "dedupe"]) + + expected = self.readUnits([p.replace("documents", f"attributes/{key_name}") for p in docs_fp]) + + # two duplicates on second doc + self.assertEqual(len(expected[1]["attributes"][attribute_name]), 2) + (s1, e1, v1), (s2, e2, v2) = expected[1]["attributes"][attribute_name] + self.assertEqual(documents[1][s1:e1], "\n") + self.assertEqual(v1, 1.0) + self.assertEqual(documents[1][s2:e2], duplicate_text) + self.assertEqual(v2, 1.0) diff --git a/tests/python/test_mixer.py b/tests/python/test_mixer.py index 8f739a49..8fdb5940 100644 --- a/tests/python/test_mixer.py +++ b/tests/python/test_mixer.py @@ -1,11 +1,13 @@ import json from pathlib import Path from tempfile import NamedTemporaryFile +from typing import List from unittest import TestCase from dolma.cli.__main__ import main from .utils import ( + TestCasePipeline, clean_test_data, download_s3_prefix, get_test_prefix, @@ -35,37 +37,52 @@ def tearDown(self) -> None: if self.remote_test_prefix is not None: clean_test_data(self.remote_test_prefix) + def checkAndRemoveProvenance(self, provided: List[dict]) -> List[dict]: + prev_id = 0 + for row in provided: + self.assertIn("metadata", row) + self.assertIn("provenance", row["metadata"]) + provenance = row["metadata"].pop("provenance") + path, lid = provenance.rsplit(":", 1) + self.assertGreater(int(lid), prev_id) + prev_id 
= int(lid) + + # remove metadata if empty + len(row["metadata"]) == 0 and row.pop("metadata") + + return provided + def test_email_spans(self): main(argv=["-c", str(EMAIL_SPANS), "mix"]) - self.assertEqual( - load_jsonl("tests/data/expected/email-spans.json.gz"), - load_jsonl("tests/work/output/email-spans/email-spans-0000.json.gz"), - ) + expected = load_jsonl("tests/data/expected/email-spans.json.gz") + provided = load_jsonl("tests/work/output/email-spans/email-spans-0000.json.gz") + provided = self.checkAndRemoveProvenance(provided) + self.assertEqual(expected, provided) def test_filter_by_spans(self): main(argv=["-c", str(FILTER_BY_SPANS), "mix"]) - self.assertEqual( - load_jsonl("tests/data/expected/filter-by-spans.json.gz"), - load_jsonl("tests/work/output/filter-by-spans/filter-by-spans-test-0000.json.gz"), - ) + expected = load_jsonl("tests/data/expected/filter-by-spans.json.gz") + provided = load_jsonl("tests/work/output/filter-by-spans/filter-by-spans-test-0000.json.gz") + provided = self.checkAndRemoveProvenance(provided) + self.assertEqual(expected, provided) def test_mixer(self): main(argv=["-c", str(MIXER), "mix"]) - self.assertEqual( - load_jsonl("tests/data/expected/mixer.json.gz"), - load_jsonl("tests/work/output/mixer/mixer-test-0000.json.gz"), - ) + expected = load_jsonl("tests/data/expected/mixer.json.gz") + provided = load_jsonl("tests/work/output/mixer/mixer-test-0000.json.gz") + provided = self.checkAndRemoveProvenance(provided) + self.assertEqual(expected, provided) def test_paragraph_spans(self): main(argv=["-c", str(PARAGRAPH_SPANS), "mix"]) - self.assertEqual( - load_jsonl("tests/data/expected/remove-paragraphs.json.gz"), - load_jsonl("tests/work/output/paragraph-spans/paragraph-spans-test-0000.json.gz"), - ) + expected = load_jsonl("tests/data/expected/remove-paragraphs.json.gz") + provided = load_jsonl("tests/work/output/paragraph-spans/paragraph-spans-test-0000.json.gz") + provided = self.checkAndRemoveProvenance(provided) + self.assertEqual(expected, provided) def test_local_input_remote_output(self): if self.remote_test_prefix is None: @@ -88,10 +105,10 @@ def test_local_input_remote_output(self): download_s3_prefix(f"{self.remote_test_prefix}/tests/work", "tests/work/remote") - self.assertEqual( - load_jsonl("tests/data/expected/mixer.json.gz"), - load_jsonl("tests/work/remote/output/mixer/mixer-test-0000.json.gz"), - ) + expected = load_jsonl("tests/data/expected/mixer.json.gz") + provided = load_jsonl("tests/work/remote/output/mixer/mixer-test-0000.json.gz") + provided = self.checkAndRemoveProvenance(provided) + self.assertEqual(expected, provided) def test_remote_input_remote_output(self): if self.remote_test_prefix is None: @@ -117,11 +134,10 @@ def test_remote_input_remote_output(self): main(argv=["-c", f.name, "mix"]) download_s3_prefix(f"{self.remote_test_prefix}/tests/work", "tests/work/remote") - - self.assertEqual( - load_jsonl("tests/data/expected/mixer.json.gz"), - load_jsonl("tests/work/remote/output/mixer/mixer-test-0000.json.gz"), - ) + expected = load_jsonl("tests/data/expected/mixer.json.gz") + provided = load_jsonl("tests/work/remote/output/mixer/mixer-test-0000.json.gz") + provided = self.checkAndRemoveProvenance(provided) + self.assertEqual(expected, provided) def test_remote_input_local_output(self): if self.remote_test_prefix is None: @@ -142,7 +158,57 @@ def test_remote_input_local_output(self): main(argv=["-c", f.name, "mix"]) - self.assertEqual( - load_jsonl("tests/data/expected/mixer.json.gz"), - 
load_jsonl("tests/work/output/mixer/mixer-test-0000.json.gz"), - ) + expected = load_jsonl("tests/data/expected/mixer.json.gz") + provided = load_jsonl("tests/work/output/mixer/mixer-test-0000.json.gz") + provided = self.checkAndRemoveProvenance(provided) + self.assertEqual(expected, provided) + + +class TestMixerPipeline(TestCasePipeline): + def test_min_length(self): + source_dir = Path(self.makeUniquePath()) + output_dir = Path(self.makeUniquePath()) + + to_remove = "remove second sentence" + to_keep_head = "This is a test" + to_keep_tail = "do not touch" + documents = [ + "doc", + self.combineIntoDoc(to_keep_head, to_remove), + self.combineIntoDoc("A", to_remove), + self.combineIntoDoc(to_keep_head, to_keep_tail), + self.combineIntoDoc("", "", "", "p", "", "", ""), + ] + docs_path = self.writeDocs(docs=documents, ext_dir=source_dir) + + attributes = [ + [], + [((start := documents[1].find(to_remove)), start + len(to_remove), 1)], + [((start := documents[2].find(to_remove)), start + len(to_remove), 1)], + [], + [], + ] + self.writeAttributes(attributes=attributes, attribute_name="test", ext_dir=source_dir) + + config = { + "streams": [ + { + "name": "test", + "documents": docs_path, + "attributes": ["test"], + "output": {"path": str(output_dir), "max_size_in_bytes": 10000000, "min_text_length": 4}, + "span_replacement": [{"span": "$.attributes.test", "min_score": 0.5, "replacement": ""}], + } + ], + "processes": 1, + } + + config_path = self.writeConfig(config=config) + + main(argv=["-c", config_path, "mix"]) + + new_docs = self.readUnits(list(output_dir.iterdir())) + + self.assertEqual(len(new_docs), 2) + self.assertEqual(new_docs[0]["text"], self.combineIntoDoc(to_keep_head, "")) + self.assertEqual(new_docs[1]["text"], self.combineIntoDoc(to_keep_head, to_keep_tail)) diff --git a/tests/python/test_repetitions.py b/tests/python/test_repetitions.py index 03077a1f..eb3f9c75 100644 --- a/tests/python/test_repetitions.py +++ b/tests/python/test_repetitions.py @@ -4,6 +4,7 @@ from dolma.taggers.repetitions import ( ParagraphRepetitionsTagger, RepetitionsTagger, + TokenizerRepetitionsSkipEmptyTagger, TokenizerRepetitionsTagger, ) @@ -52,33 +53,33 @@ def test_doc_with_repetitions(self): self.assertEqual(all_result.spans[0].type, "repetition") self.assertEqual(all_result.spans[0].select(self.doc_with_reps), D0M0) - self.assertEqual(all_result.spans[0].score, len(D0M0)) + self.assertEqual(all_result.spans[0].score, D0M0.count("repetition")) self.assertEqual(all_result.spans[0], par_result.spans[0]) self.assertEqual(all_result.spans[1].type, "repetition") self.assertEqual(all_result.spans[1].select(self.doc_with_reps), D0M1) - self.assertEqual(all_result.spans[1].score, len(D0M1)) + self.assertEqual(all_result.spans[1].score, D0M1.count("blah")) self.assertEqual(all_result.spans[1], par_result.spans[1]) self.assertEqual(all_result.spans[2].type, "repetition") self.assertEqual(all_result.spans[2].select(self.doc_with_reps), D0M2) - self.assertEqual(all_result.spans[2].score, len(D0M2)) + self.assertEqual(all_result.spans[2].score, D0M2.count("M")) self.assertEqual(all_result.spans[2], par_result.spans[2]) self.assertEqual(all_result.spans[3].type, "repetition") self.assertEqual(all_result.spans[3].select(self.doc_with_reps), D0M3) - self.assertEqual(all_result.spans[3].score, len(D0M3)) + self.assertEqual(all_result.spans[3].score, D0M3.count("bass")) self.assertEqual(all_result.spans[3], par_result.spans[3]) - self.assertEqual(all_result.spans[4].type, "doc_max_repetition") - 
self.assertEqual(all_result.spans[4].score, len(D0M0)) + self.assertEqual(all_result.spans[4].type, "doc_max_score_repetition") + self.assertEqual(all_result.spans[4].score, D0M2.count("M")) self.assertEqual(all_result.spans[4], par_result.spans[4]) - matches_length = len(D0M0) + len(D0M1) + len(D0M2) + len(D0M3) - self.assertEqual(all_result.spans[5].type, "doc_mean_repetition") - self.assertEqual(all_result.spans[5].score, matches_length / 4) + self.assertEqual(all_result.spans[5].type, "doc_max_length_repetition") + self.assertEqual(all_result.spans[5].score, len(D0M0)) self.assertEqual(all_result.spans[5], par_result.spans[5]) + matches_length = len(D0M0) + len(D0M1) + len(D0M2) + len(D0M3) self.assertEqual(all_result.spans[6].type, "doc_frac_repetition") self.assertEqual(all_result.spans[6].score, matches_length / len(self.doc_with_reps.text)) self.assertEqual(all_result.spans[6], par_result.spans[6]) @@ -89,11 +90,11 @@ def test_doc_without_repetitions(self): self.assertEqual(len(all_result.spans), 3) self.assertEqual(len(par_result.spans), 3) - self.assertEqual(all_result.spans[0].type, "doc_max_repetition") + self.assertEqual(all_result.spans[0].type, "doc_max_score_repetition") self.assertEqual(all_result.spans[0].score, 0) self.assertEqual(all_result.spans[0], par_result.spans[0]) - self.assertEqual(all_result.spans[1].type, "doc_mean_repetition") + self.assertEqual(all_result.spans[1].type, "doc_max_length_repetition") self.assertEqual(all_result.spans[1].score, 0) self.assertEqual(all_result.spans[1], par_result.spans[1]) @@ -112,31 +113,69 @@ def setUp(self) -> None: def test_doc_with_repetitions(self): repeated_strings = [ - "repetitions repetitions repetitions", - "repetitions repetitions repetitions repetitions", - "blah blah blah blah", # missing a blah bc the first element in this seq has diff token id - "MMMMMMMM", # shorter bc sequence is tokenized as 'ĠM', 'MM', 'MM', 'MM', 'MM', 'M' - "bass banana bass banana bass banana bass banana", + ("repetitions repetitions repetitions", 3), + ("repetitions repetitions repetitions repetitions", 4), + ("blah blah blah blah", 4), # missing a blah bc the first element in this seq has diff token id + ("MMMMMMMM", 4), # shorter bc sequence is tokenized as 'ĠM', 'MM', 'MM', 'MM', 'MM', 'M' + ("bass banana bass banana bass banana bass banana", 4), ] all_results = self.repetitions_tagger.predict(self.doc_with_reps) self.assertEqual(len(all_results.spans), len(repeated_strings) + 3) i = 0 - for string in repeated_strings: + for string, score in repeated_strings: self.assertEqual(all_results.spans[i].type, "repetition") self.assertEqual(string, all_results.spans[i].select(self.doc_with_reps)) - self.assertEqual(len(string), all_results.spans[i].score) + self.assertEqual(all_results.spans[i].score, score) i += 1 - self.assertEqual(all_results.spans[i].type, "doc_max_repetition") - self.assertEqual(all_results.spans[i].score, max(len(s) for s in repeated_strings)) + self.assertEqual(all_results.spans[i].type, "doc_max_score_repetition") + self.assertEqual(all_results.spans[i].score, max(v for _, v in repeated_strings)) i += 1 - matches_length = sum(map(len, repeated_strings)) - self.assertEqual(all_results.spans[i].type, "doc_mean_repetition") - self.assertEqual(all_results.spans[i].score, matches_length / len(repeated_strings)) + matches_length = sum(len(s) for s, _ in repeated_strings) + self.assertEqual(all_results.spans[i].type, "doc_max_length_repetition") + self.assertEqual(all_results.spans[i].score, max(len(s) for s, _ in 
repeated_strings)) i += 1 self.assertEqual(all_results.spans[i].type, "doc_frac_repetition") self.assertEqual(all_results.spans[i].score, matches_length / len(self.doc_with_reps.text)) + + def test_multiple_matches(self): + text = "NOOOOOOOOOOOOOO If it is a Pizza Oven, then it's first meal MUST be PIZZA!!!!!! otherwise it will fall apart!!!!!" + doc = Document(source=__file__, id="0", text=text) + + rep_tagger = TokenizerRepetitionsTagger() + rep_tagger_uniq = TokenizerRepetitionsSkipEmptyTagger() + + all_results = rep_tagger.predict(doc) + uniq_results = rep_tagger_uniq.predict(doc) + + self.assertEqual(len(all_results.spans), 5) + self.assertEqual(len(uniq_results.spans), 4) + + self.assertEqual(all_results.spans[0].start, 1) + self.assertEqual(all_results.spans[0].end, 15) + self.assertEqual(all_results.spans[0].score, 7) + + self.assertEqual(all_results.spans[1].start, 1) + self.assertEqual(all_results.spans[1].end, 15) + self.assertEqual(all_results.spans[1].score, 3) + + self.assertEqual(uniq_results.spans[0].start, 1) + self.assertEqual(uniq_results.spans[0].end, 15) + self.assertEqual(uniq_results.spans[0].score, 7) + + def test_skip_empty(self): + text = "Nothing to note." + doc = Document(source=__file__, id="0", text=text) + + rep_tagger = TokenizerRepetitionsTagger() + rep_tagger_uniq = TokenizerRepetitionsSkipEmptyTagger() + + all_results = rep_tagger.predict(doc) + uniq_results = rep_tagger_uniq.predict(doc) + + self.assertEqual(len(all_results.spans), 3) + self.assertEqual(len(uniq_results.spans), 0) diff --git a/tests/python/utils.py b/tests/python/utils.py index ce3aa87e..47ddcd18 100644 --- a/tests/python/utils.py +++ b/tests/python/utils.py @@ -1,8 +1,13 @@ import json +import logging import os import re import uuid -from typing import List, Tuple +from itertools import chain +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import List, Optional, Tuple, Union +from unittest import TestCase from urllib.parse import urlparse import boto3 @@ -15,6 +20,8 @@ DOLMA_TESTS_SKIP_AWS_ENV_VAR = "DOLMA_TESTS_SKIP_AWS" DOLMA_TESTS_S3_PREFIX_DEFAULT = "s3://dolma-tests" +LOGGER = logging.getLogger(__name__) + def parse_s3_path(s3_path: str) -> Tuple[str, str]: """ @@ -58,7 +65,7 @@ def get_test_prefix() -> str: def skip_aws_tests() -> bool: dolma_tests_skip = os.environ.get(DOLMA_TESTS_SKIP_AWS_ENV_VAR) - print(f"{DOLMA_TESTS_SKIP_AWS_ENV_VAR}: {dolma_tests_skip}") + LOGGER.info(f"{DOLMA_TESTS_SKIP_AWS_ENV_VAR}: {dolma_tests_skip}") return (dolma_tests_skip or "false").lower() == "true" @@ -123,6 +130,68 @@ def upload_s3_prefix(s3_prefix: str, local_prefix: str): s3.upload_file(Bucket=bucket_name, Key=f"{prefix}/{name}", Filename=local_fp) -def load_jsonl(fp: str) -> List[dict]: +def load_jsonl(fp: Union[str, Path]) -> List[dict]: with smart_open.open(fp, "r") as f: return [json.loads(ln) for ln in f] + + +class TestCasePipeline(TestCase): + def setUp(self) -> None: + self.temp_dir = TemporaryDirectory() + + def tearDown(self) -> None: + self.temp_dir.cleanup() + + def readUnits(self, paths: List[Union[Path, str]]) -> List[dict]: + units = chain.from_iterable(load_jsonl(fp) for fp in paths) + return sorted(units, key=lambda x: int(x["id"])) + + def writeUnits( + self, units: List[dict], unit_type: str, partitions: int = 1, ext_dir: Optional[Path] = None + ) -> List[str]: + if len(units) < partitions: + raise ValueError(f"Cannot partition {len(units)} {unit_type} over {partitions} partitions") + + dir_path = ext_dir or Path(self.makeUniquePath()) + 
file_paths = [] + for i in range(partitions): + fp = dir_path / f"{unit_type}/{i}.jsonl.gz" + fp.parent.mkdir(parents=True, exist_ok=True) + with smart_open.open(fp, "w") as f: + for doc in units[i::partitions]: + f.write(json.dumps(doc) + "\n") + file_paths.append(fp) + + return [str(p) for p in file_paths] + + def writeDocs(self, docs: List[str], partitions: int = 1, ext_dir: Optional[Path] = None) -> List[str]: + encoded_docs = [{"id": str(i), "text": d, "source": __file__} for i, d in enumerate(docs)] + return self.writeUnits(units=encoded_docs, unit_type="documents", partitions=partitions, ext_dir=ext_dir) + + def writeAttributes( + self, + attributes: List[List[Tuple[int, int, float]]], + attribute_name: str, + partitions: int = 1, + ext_dir: Optional[Path] = None, + ) -> List[str]: + encoded_attributes = [{"id": str(i), "attributes": {attribute_name: d}} for i, d in enumerate(attributes)] + return self.writeUnits( + units=encoded_attributes, + unit_type=f"attributes/{attribute_name}", + partitions=partitions, + ext_dir=ext_dir, + ) + + def makeUniquePath(self, ext_dir: Optional[Path] = None, ext: str = "") -> str: + ext = f".{ext.lstrip('.')}" if ext else ext + return f"{ext_dir or self.temp_dir.name}/{uuid.uuid4()}{ext}" + + def writeConfig(self, config: dict, ext_dir: Optional[Path] = None) -> str: + fp = Path(self.makeUniquePath(ext="json", ext_dir=ext_dir)) + with smart_open.open(fp, "wt") as f: + json.dump(config, f, indent=2, sort_keys=True) + return str(fp) + + def combineIntoDoc(self, *lines: str, join: str = "\n") -> str: + return join.join(lines)