-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscan.py
executable file
·145 lines (121 loc) · 4.6 KB
/
scan.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python3
import os
import pickle
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
import json
from multiprocessing import Pool
from tqdm import tqdm
import argparse as ap
# Command-line interface: where the records are written and how many worker
# processes the scan uses (None lets multiprocessing pick its own default).
parser = ap.ArgumentParser()
parser.add_argument(
    "-o",
    "--output",
    default="out/records.json",
    help="Output file",
)
parser.add_argument(
    "-j",
    "--jobs",
    default=None,
    type=int,
    help="Number of parallel jobs",
)
options = parser.parse_args()
# Authenticate and build the Google Drive service
def authenticate_google_drive():
    """Return a Drive v3 service built from cached or freshly obtained credentials.

    Credentials are read from ``token.pickle`` in the current working
    directory when that file exists. If they are missing or not valid they
    are refreshed (when expired and a refresh token is present) or obtained
    through a new installed-app flow based on ``credentials.json``; in both
    cases the resulting credentials are pickled back to ``token.pickle``.

    Returns:
        The object returned by ``build("drive", "v3", credentials=creds)``.
    """
    SCOPES = ["https://www.googleapis.com/auth/drive"]
    token_path = "token.pickle"

    creds = None
    if os.path.exists(token_path):
        # NOTE: unpickling executes whatever the file encodes, so
        # token.pickle must only ever come from a trusted location.
        with open(token_path, "rb") as token:
            creds = pickle.load(token)

    # Fast path: the cached credentials can be used as they are.
    if creds and creds.valid:
        return build("drive", "v3", credentials=creds)

    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file("credentials.json", SCOPES)
        creds = flow.run_local_server(port=0)

    # Persist the new or refreshed credentials for the next run.
    with open(token_path, "wb") as token:
        pickle.dump(creds, token)

    return build("drive", "v3", credentials=creds)
# List folders directly under a specific parent folder
def list_folders_one_level(drive_service, parent_folder_id):
    """Return every non-trashed folder directly under ``parent_folder_id``.

    Args:
        drive_service: Drive v3 service object; only
            ``files().list(...).execute()`` is called on it.
        parent_folder_id: ID of the folder whose immediate subfolders are
            listed.

    Returns:
        A list of ``{"id": ..., "name": ...}`` dicts accumulated across all
        result pages, in the order the API returned them.
    """
    query = f"'{parent_folder_id}' in parents and mimeType = 'application/vnd.google-apps.folder' and trashed = false"

    folders = []
    page_token = None
    while True:
        request_args = {
            "q": query,
            # The field mask limits the response to what is listed here, so
            # nextPageToken has to be requested explicitly; without it the
            # token is never returned and only the first page is ever read.
            "fields": "nextPageToken, files(id, name)",
            "pageSize": 1000,
        }
        if page_token:
            request_args["pageToken"] = page_token

        results = drive_service.files().list(**request_args).execute()
        folders.extend(results.get("files", []))

        page_token = results.get("nextPageToken")
        if not page_token:
            return folders
# List files in a specific folder
def list_files_in_folder(drive_service, folder_id):
    """Return every non-trashed item directly inside ``folder_id``.

    All result pages are followed, so folders with more entries than fit in
    one page are listed completely instead of being cut off after the first
    page.

    The listing is best-effort: it is attempted up to three times, each
    attempt restarting from the first page. Any exception is printed and
    triggers the next attempt; if all three fail a message is printed and an
    empty list is returned instead of raising.

    Args:
        drive_service: Drive v3 service object; only
            ``files().list(...).execute()`` is called on it.
        folder_id: ID of the folder to list.

    Returns:
        A list of ``{"id": ..., "name": ..., "mimeType": ...}`` dicts, or
        ``[]`` when every attempt failed.
    """
    query = f"'{folder_id}' in parents and trashed = false"

    for _ in range(3):
        files = []
        page_token = None
        try:
            while True:
                request_args = {
                    "q": query,
                    # nextPageToken must be part of the field mask, otherwise
                    # the response never carries it and paging cannot work.
                    "fields": "nextPageToken, files(id, name, mimeType)",
                    "pageSize": 300,
                }
                if page_token:
                    request_args["pageToken"] = page_token

                results = drive_service.files().list(**request_args).execute()
                files.extend(results.get("files", []))

                page_token = results.get("nextPageToken")
                if not page_token:
                    return files
        except Exception as e:
            # Deliberately broad: any failure just means "try again".
            print(e)

    print(f"Failed to list files in folder {folder_id}")
    return []
def get_author_designs(args):
    """Collect the designs found in one author's folder.

    Args:
        args: A ``(drive_service, author)`` pair, packed into one value so the
            function can be mapped over a pool. ``author`` is a dict with
            ``"id"`` and ``"name"`` keys.

    Returns:
        A single-key dict ``{author_id: {"name": ..., "designs": [...]}}``.
        ``designs`` is sorted by ``"id"``; every design that is itself a
        folder additionally carries a ``"files"`` list, also sorted by
        ``"id"``.
    """
    drive_service, author = args
    folder_mime = "application/vnd.google-apps.folder"

    def by_id(entry):
        return entry["id"]

    designs = list_files_in_folder(drive_service, author["id"])
    for design in designs:
        if design["mimeType"] != folder_mime:
            continue
        # Only folder-type designs are expanded one level further.
        contents = list_files_in_folder(drive_service, design["id"])
        design["files"] = sorted(contents, key=by_id)

    author_entry = {
        "name": author["name"],
        "designs": sorted(designs, key=by_id),
    }
    return {author["id"]: author_entry}
# Main function to scan a Google Drive folder
def scan_google_drive(parent_folder_id):
    """Scan the author folders under ``parent_folder_id`` and write the records.

    Each first-level subfolder is treated as one author. The authors are
    processed by a ``multiprocessing.Pool`` of ``options.jobs`` workers, each
    result being a single-key dict keyed by the author's folder ID. The
    records are sorted by that key and written to ``options.output`` as one
    compact JSON object per line.

    Args:
        parent_folder_id: ID of the Drive folder containing the author
            folders.
    """
    # Create the output directory before the slow scan: the default output
    # lives in "out/", and discovering that the directory is missing only at
    # write time would throw away a finished scan.
    output_dir = os.path.dirname(options.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    drive_service = authenticate_google_drive()

    # List folders at the first level
    authors = list_folders_one_level(drive_service, parent_folder_id)
    print(f"Found {len(authors)} subfolders in the parent folder:\n")

    args = [(drive_service, author) for author in authors]
    records = []
    with tqdm(total=len(authors)) as bar, Pool(options.jobs) as pool:
        # Results arrive in completion order; the order is fixed by the sort below.
        for result in pool.imap_unordered(get_author_designs, args):
            records.append(result)
            bar.update()

    # Every record has exactly one key (the author's folder ID); sort by it.
    records.sort(key=lambda record: next(iter(record)))

    # Write the records to the output file, one JSON object per line
    with open(options.output, "w") as file:
        file.writelines(
            json.dumps(record, separators=(",", ":")) + "\n" for record in records
        )
# Replace this with your shared Google Drive folder's ID
if __name__ == "__main__":
    # ID of the shared Drive folder whose subfolders are scanned; put your
    # own folder's ID here.
    parent_folder_id = "1MHjPTELf05Yop-nodriYR7z97hCw1kpB"

    # Run the scan for that folder.
    scan_google_drive(parent_folder_id)