Skip to content

Commit

Permalink
Download JSON tool supports parts
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael Hansen committed Feb 21, 2020
1 parent bcb6414 commit 081f7b3
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 8 deletions.
103 changes: 96 additions & 7 deletions bin/make_download_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import argparse
import json
import os
import subprocess
import sys
from pathlib import Path

Expand All @@ -16,30 +17,118 @@ def main():
# url_base = args.url_base
profile_dir = Path(args.profile_dir).absolute()
files = {}
file_paths = {}
for file_name in args.files:
file_path = Path(file_name).absolute()
if not file_path.is_file():
# Skip directories
continue

file_suffix = file_path.suffix
unzip = False
has_parts = False

if file_suffix.startswith(".part-"):
file_path = file_path.with_suffix("")
file_suffix = file_path.suffix
has_parts = True

if file_suffix == ".gz":
file_path = file_path.with_suffix("")
unzip = True

file_key = str(file_path.relative_to(profile_dir))
files[file_key] = file_path
file_details = files.get(file_key)
if not file_details:
file_details = {
"unzip": unzip,
"url": f"{profile_dir.name}/raw/master/{file_key}",
}

if has_parts:
file_parent_key = str(file_path.relative_to(profile_dir).parent)
if not file_parent_key.endswith("/"):
file_parent_key += "/"

file_details["url"] = f"{profile_dir.name}/raw/master/{file_parent_key}"
parts = file_details.get("parts", [])
parts.append(
{
"fragment": Path(file_name).name,
"bytes_expected": os.path.getsize(file_name),
}
)
file_details["parts"] = parts
elif unzip:
file_details["url"] += ".gz"

# Record zipped and unzipped sizes
file_details["zip_bytes_expected"] = os.path.getsize(file_name)
file_details["bytes_expected"] = int(
subprocess.check_output(f"zcat '{file_path}' | wc -c", shell=True)
.strip()
.decode()
)
else:
# Record size
file_details["bytes_expected"] = os.path.getsize(file_name)

files[file_key] = file_details
file_paths[file_key] = file_path

# Add sizes for files with parts
for file_key, file_details in files.items():
parts = file_details.get("parts")
if parts:
file_path = file_paths[file_key]
sum_size = sum(part["bytes_expected"] for part in parts)
if file_details["unzip"]:
file_details["zip_bytes_expected"] = sum_size
file_details["bytes_expected"] = int(
subprocess.check_output(
f"cat '{file_path}.gz'.part-* | zcat | wc -c", shell=True
)
.strip()
.decode()
)
else:
file_details["bytes_expected"] = sum_size

json.dump(
{
"conditions": {
file_key: f"{profile_dir.name}/{file_key}" for file_key in files
},
"files": {
f"{profile_dir.name}/{file_key}": {
"url": f"{profile_dir.name}/raw/master/{file_key}",
"bytes_expected": os.path.getsize(file_path),
"unzip": file_path.suffix == ".gz",
}
for file_key, file_path in files.items()
f"{profile_dir.name}/{file_key}": file_details
for file_key, file_details in files.items()
},
},
sys.stdout,
indent=4,
)


# -----------------------------------------------------------------------------


def make_file(file_dict, file_details):
    """Record expected download sizes for one file into file_dict.

    Args:
        file_dict: dict updated in place with "unzip", "bytes_expected",
            and (for gzipped files) "zip_bytes_expected".
        file_details: dict with "path" (file on disk) and "unzip" (True if
            the file is gzipped and must be decompressed after download).

    Returns:
        The updated file_dict (same object, for convenience).
    """
    import gzip  # stdlib; local import keeps module-level imports untouched

    file_path = file_details["path"]
    file_dict["unzip"] = file_details["unzip"]

    if file_dict["unzip"]:
        # Compressed size is the on-disk size.
        file_dict["zip_bytes_expected"] = os.path.getsize(file_path)

        # Decompressed size: stream through gzip instead of shelling out to
        # "zcat ... | wc -c" (shell=True was unportable and broke/injected
        # on paths containing quotes). gzip handles multi-member archives,
        # matching zcat's byte count.
        bytes_expected = 0
        with gzip.open(file_path, "rb") as gz_file:
            while True:
                chunk = gz_file.read(1024 * 1024)
                if not chunk:
                    break
                bytes_expected += len(chunk)

        file_dict["bytes_expected"] = bytes_expected
    else:
        # Plain file: expected size is just the on-disk size.
        file_dict["bytes_expected"] = os.path.getsize(file_path)

    return file_dict


# -----------------------------------------------------------------------------

if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion english/en-us_kaldi-zamia

0 comments on commit 081f7b3

Please sign in to comment.