forked from oscar-project/ungoliant
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCargo.toml
76 lines (66 loc) · 1.91 KB
/
Cargo.toml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
[package]
name = "ungoliant"
version = "3.0.0"
authors = ["Julien Abadji <julien.e.abadji@gmail.com>, Pedro J. Ortiz <pedro@pjortiz.com>, Amir Hossein Kargaran <amir@cis.lmu.de>"]
edition = "2021"
description = "The pipeline for the GlotCC/OSCAR corpus."
license = "Apache-2.0"
homepage = "https://github.com/cisnlp/ungoliant"
repository = "https://github.com/cisnlp/ungoliant"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
reqwest = { version = "0.11", default-features=false, features = ["rustls-tls", "blocking", "stream"] }
flate2 = { version = "1.0.20"}
futures-core = "0.3"
futures-util = "0.3"
futures = "0.3"
structopt = "0.3.21"
env_logger = "0.8.3"
log = "0.4.14"
itertools = "0.10.0"
tokio = { version = "1", features = ["full"] }
tokio-util = {version="0.6.6", features=["compat"]}
warc = {version="0.3.0", features=["with_serde"]}
ut1_blocklist = "0.3.0"
fasttext = "0.7.6"
bytes = "1"
rayon = "1"
twox-hash = "1.6"
glob = "0.3.0"
sha2 = "0.9.5"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
schemars = "0.8.3"
runiq-lib = "1.2.2"
rand = "0.8.4"
url = "2.2.2"
avro-rs = { version = "0.13.0", features = ["snappy"] }
unicode-script = "0.5.4"
unicode-segmentation = "1.8.0"
csv = "1.1.6"
unic-ucd = "0.9.0"
oxilangtag = {version="0.1.3", features=["serde"]}
language-tags = "0.3.2"
lazy_static = "1.4.0"
oscar-io = { git = "https://github.com/cisnlp/oscar-io.git", branch = "main" }
#tlsh = {git="https://github.com/Uinelj/tlsh-rs", branch="fix-q3-panic"}
tlsh-fixed = "0.1.1"
ctclib-pp = {version="0.2.0", optional=true}
[features]
kenlm = ["dep:ctclib-pp"]
[dev-dependencies]
rand_distr = "0.4.2"
sha-1 = "0.9"
criterion = "0.3"
serial_test = "0.5.1"
tempfile="3.2.0"
test-log = "0.2.11"
[[bench]]
name = "fasttext_bench"
harness = false
[[bench]]
name = "pipeline_bench_rayon"
harness = false
[[bench]]
name = "annotate_noisy"
harness = false