From 89f97676604b0eb997afd61680d54a13f4d88cc4 Mon Sep 17 00:00:00 2001
From: Yichen LI <70130867+YichenLi00@users.noreply.github.com>
Date: Mon, 18 Mar 2024 18:51:04 +0800
Subject: [PATCH 1/2] Update download.py

Feat: solve the problem of repeatedly downloading dummy files
---
 src/download.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/download.py b/src/download.py
index c081e55..77d1d1b 100644
--- a/src/download.py
+++ b/src/download.py
@@ -5,6 +5,10 @@
 from functools import partial
 
 
+class RequestLimitReached(Exception):
+    """Exception raised when the request limit is reached."""
+    pass
+
 def dict_parse(dic, pre=None):
     pre = pre[:] if pre else []
     if isinstance(dic, dict):
@@ -89,16 +93,20 @@ def req_url(dl_file, max_retry=5, headers=None, proxies=None):
     for i in range(max_retry):
         try:
             r = requests.get(url, headers=headers, proxies=proxies)
+            if r.text.startswith("You can only make 350 requests every 15min"):
+                raise RequestLimitReached("Request limit reached")
             with open(save_path, "wb") as f:
                 f.write(r.content)
             return 'Downloaded: ' + str(save_path)
+        except RequestLimitReached as e:
+            return str(e)
         except Exception as e:
             exception = e
-            # print('file request exception (retry {}): {} - {}'.format(i, e, save_path))
             sleep(0.4)
     return 'File request exception (retry {}): {} - {}'.format(i, exception, save_path)
 
+
 def download_repo(config):
     """Download Anonymous Github repo"""
 
@@ -136,11 +144,17 @@ def download_repo(config):
         files.append((file_url, save_path))
 
     partial_req = partial(req_url, max_retry=max_retry, headers=headers, proxies=proxies)
+    limit_reached = False
     with concurrent.futures.ThreadPoolExecutor(max_workers=max_conns) as executor:
         future_to_url = (executor.submit(partial_req, dl_file) for dl_file in files)
         for future in concurrent.futures.as_completed(future_to_url):
+            if limit_reached:
+                break  # Stop the download process if the limit is reached
             try:
                 data = future.result()
+                if "Request limit reached" in data:
+                    print(data)
+                    limit_reached = True
             except Exception as exc:
                 data = str(type(exc))
             finally:
@@ -162,4 +176,4 @@ def download_repo(config):
     parser.add_argument('--proxies', type=str, default='', help='Proxies used for connection')
     parser.add_argument('--verbose', type=bool, default=False, help='Display skipped files or not')
     args = parser.parse_args()
-    download_repo(args.__dict__)
\ No newline at end of file
+    download_repo(args.__dict__)
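Note on PATCH 1/2: Anonymous Github apparently serves its rate-limit notice as an ordinary response body rather than an error status (otherwise checking r.status_code would have sufficed), so the patch sniffs the body prefix before writing anything to disk; that is what stops the notice text from being saved as the dummy files the commit message describes. A minimal standalone sketch of the detection idea, assuming the sentinel text from the patch; the fetch helper and RATE_LIMIT_PREFIX constant are illustrative names, not part of the repo:

    import requests

    # Sentinel copied verbatim from the patch; assumed to arrive with a
    # successful status code, which is why the body must be inspected.
    RATE_LIMIT_PREFIX = "You can only make 350 requests every 15min"

    class RequestLimitReached(Exception):
        """Raised when the server's rate-limit sentinel is detected."""

    def fetch(url, save_path, headers=None, proxies=None):
        """Download one file, refusing to save a rate-limit placeholder."""
        r = requests.get(url, headers=headers, proxies=proxies)
        if r.text.startswith(RATE_LIMIT_PREFIX):
            # Without this check the notice text itself would be written
            # to disk as a dummy file.
            raise RequestLimitReached("Request limit reached: " + url)
        with open(save_path, "wb") as f:
            f.write(r.content)
        return save_path

Returning str(e) from req_url instead of re-raising keeps the result-handling path in download_repo uniform: every future resolves to a status string, which the executor loop then matches against "Request limit reached".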
+ print(f"Waiting due to rate limit, attempt {attempt}") + wait_event.wait() # Wait until the event is cleared try: r = requests.get(url, headers=headers, proxies=proxies) if r.text.startswith("You can only make 350 requests every 15min"): - raise RequestLimitReached("Request limit reached") + wait_event.set() + print(f"Request limit reached, waiting for 15 minutes. Attempt {attempt}") + sleep(15 * 60) + wait_event.clear() # Clear the event to resume all threads + attempt += 1 + continue with open(save_path, "wb") as f: f.write(r.content) return 'Downloaded: ' + str(save_path) - except RequestLimitReached as e: - return str(e) except Exception as e: exception = e sleep(0.4) - return 'File request exception (retry {}): {} - {}'.format(i, exception, save_path) + attempt += 1 + return 'File request exception (retry {}): {} - {}'.format(attempt, exception, save_path) + @@ -144,17 +155,13 @@ def download_repo(config): files.append((file_url, save_path)) partial_req = partial(req_url, max_retry=max_retry, headers=headers, proxies=proxies) - limit_reached = False + wait_event = threading.Event() with concurrent.futures.ThreadPoolExecutor(max_workers=max_conns) as executor: - future_to_url = (executor.submit(partial_req, dl_file) for dl_file in files) + partial_req = partial(req_url, max_retry=max_retry, headers=headers, proxies=proxies, wait_event=wait_event) + future_to_url = {executor.submit(partial_req, dl_file): dl_file for dl_file in files} for future in concurrent.futures.as_completed(future_to_url): - if limit_reached: - break # Stop the download process if the limit is reached try: data = future.result() - if "Request limit reached" in data: - print(data) - limit_reached = True except Exception as exc: data = str(type(exc)) finally: