Create new shared module and use it in scripts
Signed-off-by: Priyanshi Gaur <noxdot1134@gmail.com>
nox1134 committed Mar 17, 2024
1 parent df16988 commit f75365b
Showing 11 changed files with 127 additions and 79 deletions.
21 changes: 11 additions & 10 deletions deviantart/deviantart_scratcher.py
@@ -5,7 +5,6 @@
 """
 
 # Standard library
-import datetime as dt
 import os
 import sys
 import traceback
@@ -17,20 +16,20 @@
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
-# Set up current working directory
-CWD = os.path.dirname(os.path.abspath(__file__))
-# Load environment variables
-dotenv_path = os.path.join(os.path.dirname(CWD), ".env")
-load_dotenv(dotenv_path)
+# First-party/Local
+import quantify
+
+PATH_REPO_ROOT, PATH_WORK_DIR, PATH_DOTENV, DATETIME_TODAY = quantify.setup()
+load_dotenv(PATH_DOTENV)
 
-# Get the current date
-today = dt.datetime.today()
 # Retrieve API keys
 API_KEYS = os.getenv("GOOGLE_API_KEYS").split(",")
 API_KEYS_IND = 0
 # Set up file path for CSV report
 DATA_WRITE_FILE = (
-    f"{CWD}" f"/data_deviantart_{today.year}_{today.month}_{today.day}.csv"
+    f"{PATH_WORK_DIR}"
+    f"/data_deviantart_"
+    f"{DATETIME_TODAY.year}_{DATETIME_TODAY.month}_{DATETIME_TODAY.day}.csv"
 )
 # Retrieve Programmable Search Engine key from environment variables
 PSE_KEY = os.getenv("PSE_KEY")
@@ -45,7 +44,9 @@ def get_license_list():
         searched via Programmable Search Engine.
     """
     # Read license data from file
-    cc_license_data = pd.read_csv(f"{CWD}/legal-tool-paths.txt", header=None)
+    cc_license_data = pd.read_csv(
+        f"{PATH_WORK_DIR}/legal-tool-paths.txt", header=None
+    )
     # Define regex pattern to extract license types
     license_pattern = r"((?:[^/]+/){2}(?:[^/]+)).*"
     license_list = (
12 changes: 6 additions & 6 deletions flickr/photos.py
@@ -14,11 +14,11 @@
 import flickrapi
 from dotenv import load_dotenv
 
-# Get the current working directory
-CWD = os.path.dirname(os.path.abspath(__file__))
-# Load environment variables
-dotenv_path = os.path.join(os.path.dirname(CWD), ".env")
-load_dotenv(dotenv_path)
+# First-party/Local
+import quantify
+
+PATH_REPO_ROOT, PATH_WORK_DIR, PATH_DOTENV, DATETIME_TODAY = quantify.setup()
+load_dotenv(PATH_DOTENV)
 
 
 def main():
@@ -37,7 +37,7 @@ def main():
         photosJson = flickr.photos.search(license=i, per_page=500)
         dic[i] = [json.loads(photosJson.decode("utf-8"))]
     # Save the dictionary containing photo data to a JSON file
-    with open(os.path.join(CWD, "photos.json"), "w") as json_file:
+    with open(os.path.join(PATH_WORK_DIR, "photos.json"), "w") as json_file:
         json.dump(dic, json_file)
 
 
18 changes: 9 additions & 9 deletions flickr/photos_detail.py
@@ -20,11 +20,11 @@
 import pandas as pd
 from dotenv import load_dotenv
 
-# Set up current working directory
-CWD = os.path.dirname(os.path.abspath(__file__))
-# Load environment variables
-dotenv_path = os.path.join(os.path.dirname(CWD), ".env")
-load_dotenv(dotenv_path)
+# First-party/Local
+import quantify
+
+PATH_REPO_ROOT, PATH_WORK_DIR, PATH_DOTENV, DATETIME_TODAY = quantify.setup()
+load_dotenv(PATH_DOTENV)
 
 # Global variable: Number of retries for error handling
 RETRIES = 0
@@ -188,9 +188,9 @@ def page1_reset(final_csv, raw_data):
 
 
 def main():
-    final_csv_path = os.path.join(CWD, "final.csv")
-    record_txt_path = os.path.join(CWD, "rec.txt")
-    hs_csv_path = os.path.join(CWD, "hs.csv")
+    final_csv_path = os.path.join(PATH_WORK_DIR, "final.csv")
+    record_txt_path = os.path.join(PATH_WORK_DIR, "rec.txt")
+    hs_csv_path = os.path.join(PATH_WORK_DIR, "hs.csv")
 
     # Initialize Flickr API instance
     flickr = flickrapi.FlickrAPI(
@@ -290,7 +290,7 @@ def main():
         # If reached max limit of pages, reset j to 1 and
         # update i to the license in the dictionary
         if j == total + 1 or j > total:
-            license_i_path = os.path.join(CWD, f"license{i}.csv")
+            license_i_path = os.path.join(PATH_WORK_DIR, f"license{i}.csv")
             clean_saveas_csv(final_csv_path, license_i_path)
             i += 1
             j = 1
36 changes: 22 additions & 14 deletions google_custom_search/google_scratcher.py
@@ -5,7 +5,6 @@
 """
 
 # Standard library
-import datetime as dt
 import os
 import sys
 import traceback
@@ -17,26 +16,30 @@
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
-CWD = os.path.dirname(os.path.abspath(__file__))
-dotenv_path = os.path.join(os.path.dirname(CWD), ".env")
-load_dotenv(dotenv_path)
+# First-party/Local
+import quantify
+
+PATH_REPO_ROOT, PATH_WORK_DIR, PATH_DOTENV, DATETIME_TODAY = quantify.setup()
+load_dotenv(PATH_DOTENV)
 
-today = dt.datetime.today()
+# Retrieve API keys
 API_KEYS = os.getenv("GOOGLE_API_KEYS").split(",")
 API_KEYS_IND = 0
+# Set up file path for CSV report
 DATA_WRITE_FILE = (
-    f"{CWD}"
-    f"/data_google_custom_search_{today.year}_{today.month}_{today.day}.csv"
+    f"{PATH_WORK_DIR}"
+    f"/data_google_custom_search_"
+    f"{DATETIME_TODAY.year}_{DATETIME_TODAY.month}_{DATETIME_TODAY.day}.csv"
 )
 DATA_WRITE_FILE_TIME = (
-    f"{CWD}"
+    f"{PATH_WORK_DIR}"
     f"/data_google_custom_search_time_"
-    f"{today.year}_{today.month}_{today.day}.csv"
+    f"{DATETIME_TODAY.year}_{DATETIME_TODAY.month}_{DATETIME_TODAY.day}.csv"
 )
 DATA_WRITE_FILE_COUNTRY = (
-    f"{CWD}"
+    f"{PATH_WORK_DIR}"
     f"/data_google_custom_search_country_"
-    f"{today.year}_{today.month}_{today.day}.csv"
+    f"{DATETIME_TODAY.year}_{DATETIME_TODAY.month}_{DATETIME_TODAY.day}.csv"
 )
 SEARCH_HALFYEAR_SPAN = 20
 PSE_KEY = os.getenv("PSE_KEY")
@@ -49,7 +52,9 @@ def get_license_list():
         np.array: An np array containing all license types that should be
         searched via Programmable Search Engine.
     """
-    cc_license_data = pd.read_csv(f"{CWD}/legal-tool-paths.txt", header=None)
+    cc_license_data = pd.read_csv(
+        f"{PATH_WORK_DIR}/legal-tool-paths.txt", header=None
+    )
     license_pattern = r"((?:[^/]+/){2}(?:[^/]+)).*"
     license_list = (
         cc_license_data[0]
@@ -68,7 +73,10 @@ def get_lang_list():
         for the corresponding language code.
     """
     languages = pd.read_csv(
-        f"{CWD}/google_lang.txt", sep=": ", header=None, engine="python"
+        f"{PATH_WORK_DIR}/google_lang.txt",
+        sep=": ",
+        header=None,
+        engine="python",
     )
     languages[0] = languages[0].str.extract(r'"([^"]+)"')
     languages = languages.set_index(1)
@@ -101,7 +109,7 @@ def get_country_list(select_all=False):
         pd.DataFrame: A Dataframe whose index is country name and has a column
         for the corresponding country code.
     """
-    countries = pd.read_csv(CWD + "/google_countries.tsv", sep="\t")
+    countries = pd.read_csv(PATH_WORK_DIR + "/google_countries.tsv", sep="\t")
     countries["Country"] = countries["Country"].str.replace(",", " ")
     countries = countries.set_index("Country").sort_index()
     if select_all:
15 changes: 8 additions & 7 deletions internetarchive/internetarchive_scratcher.py
@@ -4,8 +4,6 @@
 """
 
 # Standard library
-import datetime as dt
-import os
 import sys
 import traceback
 
@@ -15,14 +13,15 @@
 from urllib3.util.retry import Retry
 
 # First-party/Local
+import quantify
 from internetarchive.search import Search
 from internetarchive.session import ArchiveSession
 
-today = dt.datetime.today()
-CWD = os.path.dirname(os.path.abspath(__file__))
+PATH_REPO_ROOT, PATH_WORK_DIR, PATH_DOTENV, DATETIME_TODAY = quantify.setup()
 DATA_WRITE_FILE = (
-    f"{CWD}"
-    f"/data_internetarchive_{today.year}_{today.month}_{today.day}.csv"
+    f"{PATH_WORK_DIR}"
+    f"/data_internetarchive_"
+    f"{DATETIME_TODAY.year}_{DATETIME_TODAY.month}_{DATETIME_TODAY.day}.csv"
 )
 
 
@@ -32,7 +31,9 @@ def get_license_list():
         np.array: An np array containing all license types that should be
         searched via Programmable Search Engine.
     """
-    cc_license_data = pd.read_csv(f"{CWD}/legal-tool-paths.txt", header=None)
+    cc_license_data = pd.read_csv(
+        f"{PATH_WORK_DIR}/legal-tool-paths.txt", header=None
+    )
     license_list = cc_license_data[0].unique()
     return license_list
 
13 changes: 8 additions & 5 deletions metmuseum/metmuseum_scratcher.py
@@ -5,8 +5,6 @@
 """
 
 # Standard library
-import datetime as dt
-import os
 import sys
 import traceback
 
@@ -15,10 +13,15 @@
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
-today = dt.datetime.today()
-CWD = os.path.dirname(os.path.abspath(__file__))
+# First-party/Local
+import quantify
+
+PATH_REPO_ROOT, PATH_WORK_DIR, PATH_DOTENV, DATETIME_TODAY = quantify.setup()
+
 DATA_WRITE_FILE = (
-    f"{CWD}" f"/data_metmuseum_{today.year}_{today.month}_{today.day}.csv"
+    f"{PATH_WORK_DIR}"
+    f"/data_metmuseum_"
+    f"{DATETIME_TODAY.year}_{DATETIME_TODAY.month}_{DATETIME_TODAY.day}.csv"
 )
 
 
23 changes: 23 additions & 0 deletions quantify.py
@@ -0,0 +1,23 @@
+# Shared module for common functionalities across scripts
+
+# Standard library
+import datetime
+import os.path
+
+
+def setup():
+
+    # Datetime
+    datetime_today = datetime.datetime.today()
+
+    # Paths
+    path_work_dir = os.path.dirname(
+        os.path.abspath(os.path.realpath(__file__))
+    )
+    path_repo_root = os.path.dirname(
+        os.path.abspath(os.path.realpath(path_work_dir))
+    )
+    path_dotenv = os.path.abspath(
+        os.path.realpath(os.path.join(path_work_dir, ".env"))
+    )
+    return path_repo_root, path_work_dir, path_dotenv, datetime_today
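Every scratcher in this commit consumes the new module through the same few lines, so the calling pattern is worth spelling out once. The sketch below is illustrative rather than a file from this commit: it assumes quantify.py is importable from the running script, and "data_example" stands in for the per-source filenames used above.

# First-party/Local
import quantify
from dotenv import load_dotenv

# setup() returns, in order: the repository root, the directory that
# contains quantify.py, the path of the .env file beside quantify.py,
# and today's datetime.
PATH_REPO_ROOT, PATH_WORK_DIR, PATH_DOTENV, DATETIME_TODAY = quantify.setup()
load_dotenv(PATH_DOTENV)  # only scripts that read API keys from .env need this

# Date-stamped CSV report path ("data_example" is a placeholder name)
DATA_WRITE_FILE = (
    f"{PATH_WORK_DIR}"
    f"/data_example_"
    f"{DATETIME_TODAY.year}_{DATETIME_TODAY.month}_{DATETIME_TODAY.day}.csv"
)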
14 changes: 8 additions & 6 deletions vimeo/vimeo_scratcher.py
@@ -8,7 +8,6 @@
 """
 
 # Standard library
-import datetime as dt
 import os
 import sys
 import traceback
@@ -19,15 +18,18 @@
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
-CWD = os.path.dirname(os.path.abspath(__file__))
-dotenv_path = os.path.join(os.path.dirname(CWD), ".env")
-load_dotenv(dotenv_path)
+# First-party/Local
+import quantify
+
+PATH_REPO_ROOT, PATH_WORK_DIR, PATH_DOTENV, DATETIME_TODAY = quantify.setup()
+load_dotenv(PATH_DOTENV)
 
-today = dt.datetime.today()
 ACCESS_TOKEN = os.getenv("VIMEO_ACCESS_TOKEN")
 CLIENT_ID = os.getenv("VIMEO_CLIENT_ID")
 DATA_WRITE_FILE = (
-    f"{CWD}" f"/data_vimeo_{today.year}_{today.month}_{today.day}.csv"
+    f"{PATH_WORK_DIR}"
+    f"/data_vimeo_"
+    f"{DATETIME_TODAY.year}_{DATETIME_TODAY.month}_{DATETIME_TODAY.day}.csv"
 )
 
 
13 changes: 8 additions & 5 deletions wikicommons/wikicommons_scratcher.py
@@ -5,8 +5,6 @@
 """
 
 # Standard library
-import datetime as dt
-import os
 import sys
 import traceback
 
@@ -15,10 +13,15 @@
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
-today = dt.datetime.today()
-CWD = os.path.dirname(os.path.abspath(__file__))
+# First-party/Local
+import quantify
+
+PATH_REPO_ROOT, PATH_WORK_DIR, PATH_DOTENV, DATETIME_TODAY = quantify.setup()
+
 DATA_WRITE_FILE = (
-    f"{CWD}" f"/data_wikicommons_{today.year}_{today.month}_{today.day}.csv"
+    f"{PATH_WORK_DIR}"
+    f"/data_wikicommons_"
+    f"{DATETIME_TODAY.year}_{DATETIME_TODAY.month}_{DATETIME_TODAY.day}.csv"
 )
 
 
15 changes: 9 additions & 6 deletions wikipedia/wikipedia_scratcher.py
@@ -4,8 +4,6 @@
 """
 
 # Standard library
-import datetime as dt
-import os
 import sys
 import traceback
 
@@ -15,10 +13,15 @@
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
-today = dt.datetime.today()
-CWD = os.path.dirname(os.path.abspath(__file__))
+# First-party/Local
+import quantify
+
+PATH_REPO_ROOT, PATH_WORK_DIR, PATH_DOTENV, DATETIME_TODAY = quantify.setup()
+
 DATA_WRITE_FILE = (
-    f"{CWD}" f"/data_wikipedia_{today.year}_{today.month}_{today.day}.csv"
+    f"{PATH_WORK_DIR}"
+    f"/data_wikipedia_"
+    f"{DATETIME_TODAY.year}_{DATETIME_TODAY.month}_{DATETIME_TODAY.day}.csv"
 )
 
 
@@ -35,7 +38,7 @@ def get_wiki_langs():
    - pd.DataFrame: A Dataframe containing information of each Wikipedia
    language and its respective encoding on web address.
    """
-    return pd.read_csv(f"{CWD}/language-codes_csv.csv")
+    return pd.read_csv(f"{PATH_WORK_DIR}/language-codes_csv.csv")
 
 
 def get_request_url(lang="en"):
