From 89f97676604b0eb997afd61680d54a13f4d88cc4 Mon Sep 17 00:00:00 2001
From: Yichen LI <70130867+YichenLi00@users.noreply.github.com>
Date: Mon, 18 Mar 2024 18:51:04 +0800
Subject: [PATCH 1/2] Update download.py

Feat: solve the problem of repeatedly downloading dummy files
---
 src/download.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/download.py b/src/download.py
index c081e55..77d1d1b 100644
--- a/src/download.py
+++ b/src/download.py
@@ -5,6 +5,10 @@
 from functools import partial
 
 
+class RequestLimitReached(Exception):
+    """Exception raised when the request limit is reached."""
+    pass
+
 def dict_parse(dic, pre=None):
     pre = pre[:] if pre else []
     if isinstance(dic, dict):
@@ -89,16 +93,20 @@ def req_url(dl_file, max_retry=5, headers=None, proxies=None):
     for i in range(max_retry):
         try:
             r = requests.get(url, headers=headers, proxies=proxies)
+            if r.text.startswith("You can only make 350 requests every 15min"):
+                raise RequestLimitReached("Request limit reached")
             with open(save_path, "wb") as f:
                 f.write(r.content)
             return 'Downloaded: ' + str(save_path)
+        except RequestLimitReached as e:
+            return str(e)
         except Exception as e:
             exception = e
-            # print('file request exception (retry {}): {} - {}'.format(i, e, save_path))
             sleep(0.4)
     return 'File request exception (retry {}): {} - {}'.format(i, exception, save_path)
 
+
 def download_repo(config):
     """Download Anonymous Github repo"""
 
@@ -136,11 +144,17 @@ def download_repo(config):
         files.append((file_url, save_path))
 
     partial_req = partial(req_url, max_retry=max_retry, headers=headers, proxies=proxies)
+    limit_reached = False
     with concurrent.futures.ThreadPoolExecutor(max_workers=max_conns) as executor:
         future_to_url = (executor.submit(partial_req, dl_file) for dl_file in files)
         for future in concurrent.futures.as_completed(future_to_url):
+            if limit_reached:
+                break  # Stop the download process if the limit is reached
             try:
                 data = future.result()
+                if "Request limit reached" in data:
+                    print(data)
+                    limit_reached = True
             except Exception as exc:
                 data = str(type(exc))
             finally:
@@ -162,4 +176,4 @@ def download_repo(config):
     parser.add_argument('--proxies', type=str, default='', help='Proxies used for connection')
     parser.add_argument('--verbose', type=bool, default=False, help='Display skipped files or not')
     args = parser.parse_args()
-    download_repo(args.__dict__)
\ No newline at end of file
+    download_repo(args.__dict__)
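Note on PATCH 1/2: Anonymous Github apparently serves its rate-limit notice as an ordinary response body rather than an error status (otherwise checking r.status_code would have sufficed), so the patch sniffs the body prefix before writing anything to disk; that is what stops the notice text from being saved as the dummy files the commit message describes. A minimal standalone sketch of the detection idea, assuming the sentinel text from the patch; the fetch helper and RATE_LIMIT_PREFIX constant are illustrative names, not part of the repo:

    import requests

    # Sentinel copied verbatim from the patch; assumed to arrive with a
    # successful status code, which is why the body must be inspected.
    RATE_LIMIT_PREFIX = "You can only make 350 requests every 15min"

    class RequestLimitReached(Exception):
        """Raised when the server's rate-limit sentinel is detected."""

    def fetch(url, save_path, headers=None, proxies=None):
        """Download one file, refusing to save a rate-limit placeholder."""
        r = requests.get(url, headers=headers, proxies=proxies)
        if r.text.startswith(RATE_LIMIT_PREFIX):
            # Without this check the notice text itself would be written
            # to disk as a dummy file.
            raise RequestLimitReached("Request limit reached: " + url)
        with open(save_path, "wb") as f:
            f.write(r.content)
        return save_path

Returning str(e) from req_url instead of re-raising keeps the result-handling path in download_repo uniform: every future resolves to a status string, which the executor loop then matches against "Request limit reached".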
+ print(f"Waiting due to rate limit, attempt {attempt}") + wait_event.wait() # Wait until the event is cleared try: r = requests.get(url, headers=headers, proxies=proxies) if r.text.startswith("You can only make 350 requests every 15min"): - raise RequestLimitReached("Request limit reached") + wait_event.set() + print(f"Request limit reached, waiting for 15 minutes. Attempt {attempt}") + sleep(15 * 60) + wait_event.clear() # Clear the event to resume all threads + attempt += 1 + continue with open(save_path, "wb") as f: f.write(r.content) return 'Downloaded: ' + str(save_path) - except RequestLimitReached as e: - return str(e) except Exception as e: exception = e sleep(0.4) - return 'File request exception (retry {}): {} - {}'.format(i, exception, save_path) + attempt += 1 + return 'File request exception (retry {}): {} - {}'.format(attempt, exception, save_path) + @@ -144,17 +155,13 @@ def download_repo(config): files.append((file_url, save_path)) partial_req = partial(req_url, max_retry=max_retry, headers=headers, proxies=proxies) - limit_reached = False + wait_event = threading.Event() with concurrent.futures.ThreadPoolExecutor(max_workers=max_conns) as executor: - future_to_url = (executor.submit(partial_req, dl_file) for dl_file in files) + partial_req = partial(req_url, max_retry=max_retry, headers=headers, proxies=proxies, wait_event=wait_event) + future_to_url = {executor.submit(partial_req, dl_file): dl_file for dl_file in files} for future in concurrent.futures.as_completed(future_to_url): - if limit_reached: - break # Stop the download process if the limit is reached try: data = future.result() - if "Request limit reached" in data: - print(data) - limit_reached = True except Exception as exc: data = str(type(exc)) finally: