Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes Making Code and Comments consistent in codebase for Better Readability and Understanding #89

Closed
wants to merge 14 commits into from
Closed
40 changes: 23 additions & 17 deletions deviantart/deviantart_scratcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,24 @@
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Set up current working directory
# Set up current working directory (CWD) and root_path
CWD = os.path.dirname(os.path.abspath(__file__))
root_path = os.path.dirname(CWD)
# Load environment variables
dotenv_path = os.path.join(os.path.dirname(CWD), ".env")
dotenv_path = os.path.join(root_path, ".env")
load_dotenv(dotenv_path)

# Get the current date
# Get today's date, which is embedded in the CSV report file name in CWD
today = dt.datetime.today()
# Retrieve API keys
API_KEYS = os.getenv("GOOGLE_API_KEYS").split(",")
API_KEYS_IND = 0
# Set up file path for CSV report
DATA_WRITE_FILE = (
f"{CWD}" f"/data_deviantart_{today.year}_{today.month}_{today.day}.csv"
)
# Retrieve Programmable Search Engine key from environment variables

# Global Variable for API_KEYS indexing
API_KEYS_IND = 0

# Gets API_KEYS and PSE_KEY from .env file
API_KEYS = os.getenv("GOOGLE_API_KEYS").split(",")
PSE_KEY = os.getenv("PSE_KEY")


Expand All @@ -41,11 +43,14 @@ def get_license_list():
Provides the list of licenses from 2018's record of Creative Commons.

Returns:
- np.array: An array containing all license types that should be
searched via Programmable Search Engine.
- np.array:
An np array containing all license types that should be searched
via Programmable Search Engine (PSE).
"""
# Read license data from file
cc_license_data = pd.read_csv(f"{CWD}/legal-tool-paths.txt", header=None)
cc_license_data = pd.read_csv(
f"{root_path}/legal-tool-paths.txt", header=None
IamMQaisar marked this conversation as resolved.
Show resolved Hide resolved
)
# Define regex pattern to extract license types
license_pattern = r"((?:[^/]+/){2}(?:[^/]+)).*"
license_list = (
Expand Down Expand Up @@ -126,7 +131,7 @@ def get_response_elems(license):


def set_up_data_file():
"""Writes the header row to the file to contain DeviantArt data."""
# Writes the header row to the file to contain DeviantArt data.
header_title = "LICENSE TYPE,Document Count"
with open(DATA_WRITE_FILE, "w") as f:
f.write(f"{header_title}\n")
Expand All @@ -135,9 +140,11 @@ def set_up_data_file():
def record_license_data(license_type):
"""Writes the row for LICENSE_TYPE to the file to contain DeviantArt data.
Args:
- license_type(str): A string representing the type of license.
It's a segment of the URL towards the license description. If not provided,
it defaults to None, indicating no assumption about the license type.
- license_type:
A string representing the type of license, and should be a segment
of its URL towards the license description. Alternatively, the
default None value stands for having no assumption about license
type.
"""
data_log = (
f"{license_type},"
Expand All @@ -153,9 +160,8 @@ def record_all_licenses():
list and writes this data into the DATA_WRITE_FILE, as specified by the
constant.
"""
# Get the list of license types
# Get the list of license types and record data for each license type
license_list = get_license_list()
# Record data for each license types
for license_type in license_list:
record_license_data(license_type)

Expand Down
113 changes: 71 additions & 42 deletions google_custom_search/google_scratcher.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python
"""
This file is dedicated to obtain a .csv record report for Google Custom Search
Data.
This file is dedicated to obtaining a .csv record report for
Google Custom Search Data.
"""

# Standard library
Expand All @@ -17,13 +17,16 @@
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Set up current working directory (CWD) and root_path
CWD = os.path.dirname(os.path.abspath(__file__))
dotenv_path = os.path.join(os.path.dirname(CWD), ".env")
root_path = os.path.dirname(CWD)
# Load environment variables
dotenv_path = os.path.join(root_path, ".env")
load_dotenv(dotenv_path)


# Get today's date, which is embedded in the CSV report file names in CWD
today = dt.datetime.today()
API_KEYS = os.getenv("GOOGLE_API_KEYS").split(",")
API_KEYS_IND = 0
DATA_WRITE_FILE = (
f"{CWD}"
f"/data_google_custom_search_{today.year}_{today.month}_{today.day}.csv"
Expand All @@ -38,18 +41,30 @@
f"/data_google_custom_search_country_"
f"{today.year}_{today.month}_{today.day}.csv"
)
SEARCH_HALFYEAR_SPAN = 20

# Gets API_KEYS and PSE_KEY from .env file
API_KEYS = os.getenv("GOOGLE_API_KEYS").split(",")
PSE_KEY = os.getenv("PSE_KEY")

# Global Variables for API_KEYS indexing and Search Halfyear Span
API_KEYS_IND = 0
SEARCH_HALFYEAR_SPAN = 20


def get_license_list():
"""Provides the list of license from 2018's record of Creative Commons.
"""
Provides the list of licenses from 2018's record of Creative Commons.

Returns:
np.array: An np array containing all license types that should be
searched via Programmable Search Engine.
- np.array:
An np array containing all license types that should be searched
via Programmable Search Engine (PSE).
"""
cc_license_data = pd.read_csv(f"{CWD}/legal-tool-paths.txt", header=None)
# Read license data from file
cc_license_data = pd.read_csv(
f"{root_path}/legal-tool-paths.txt", header=None
IamMQaisar marked this conversation as resolved.
Show resolved Hide resolved
)
# Define regex pattern to extract license types
license_pattern = r"((?:[^/]+/){2}(?:[^/]+)).*"
license_list = (
cc_license_data[0]
Expand All @@ -61,11 +76,13 @@ def get_license_list():


def get_lang_list():
"""Provides the list of language to find Creative Commons usage data on.
"""
Provides the list of languages to find Creative Commons usage data on.

Returns:
pd.DataFrame: A Dataframe whose index is language name and has a column
for the corresponding language code.
- pd.DataFrame:
A Dataframe whose index is language name and has a column for
the corresponding language code.
"""
languages = pd.read_csv(
f"{CWD}/google_lang.txt", sep=": ", header=None, engine="python"
Expand All @@ -90,16 +107,18 @@ def get_lang_list():


def get_country_list(select_all=False):
"""Provides the list of countries to find Creative Commons usage data on.
"""
Provides the list of countries to find Creative Commons usage data on.

Args:
select_all:
A boolean indicating whether the returned list will have all
countries.
- select_all:
A boolean indicating whether the returned list will have all
countries.

Returns:
pd.DataFrame: A Dataframe whose index is country name and has a column
for the corresponding country code.
- pd.DataFrame:
A Dataframe whose index is country name and has a column for
the corresponding country code.
"""
countries = pd.read_csv(CWD + "/google_countries.tsv", sep="\t")
countries["Country"] = countries["Country"].str.replace(",", " ")
Expand All @@ -125,30 +144,32 @@ def get_country_list(select_all=False):


def get_request_url(license=None, country=None, language=None, time=False):
"""Provides the API Endpoint URL for specified parameter combinations.
"""
Provides the API Endpoint URL for specified parameter combinations.

Args:
license:
- license:
A string representing the type of license, and should be a segment
of its URL towards the license description. Alternatively, the
default None value stands for having no assumption about license
type.
country:
- country:
A string representing the country code of country that the search
results would be originating from. Alternatively, the default None
value or "all" stands for having no assumption about country of
origin.
language:
- language:
A string representing the language that the search results are
presented in. Alternatively, the default None value or "all" stands
for having no assumption about language of document.
time:
- time:
A boolean indicating whether this query is related to video time
occurrence.

Returns:
string: A string representing the API Endpoint URL for the query
specified by this function's parameters.
- string:
A string representing the API Endpoint URL for the query specified
by this function's parameters.
"""
try:
api_key = API_KEYS[API_KEYS_IND]
Expand Down Expand Up @@ -177,32 +198,35 @@ def get_request_url(license=None, country=None, language=None, time=False):


def get_response_elems(license=None, country=None, language=None, time=False):
"""Provides the metadata for query of specified parameters
"""
Provides the metadata for a query with the specified parameters.

Args:
license:
- license:
A string representing the type of license, and should be a segment
of its URL towards the license description. Alternatively, the
default None value stands for having no assumption about license
type.
country:
- country:
A string representing the country code of country that the search
results would be originating from. Alternatively, the default None
value or "all" stands for having no assumption about country of
origin.
lang:
- language:
A string representing the language that the search results are
presented in. Alternatively, the default None value or "all" stands
for having no assumption about language of document.
time:
- time:
A boolean indicating whether this query is related to video time
occurrence.

Returns:
dict: A dictionary mapping metadata to its value provided from the API
query of specified parameters.
- dict:
A dictionary mapping metadata to its value provided from the API
query of specified parameters.
"""
try:
# Make a request to the API and handle potential retries
request_url = get_request_url(license, country, language, time)
max_retries = Retry(
total=5,
Expand All @@ -221,6 +245,7 @@ def get_response_elems(license=None, country=None, language=None, time=False):
return search_data_dict
except Exception as e:
if isinstance(e, requests.exceptions.HTTPError):
# If quota limit exceeded, switch to the next API key
global API_KEYS_IND
API_KEYS_IND += 1
print(
Expand All @@ -233,7 +258,7 @@ def get_response_elems(license=None, country=None, language=None, time=False):


def set_up_data_file():
"""Writes the header row to file to contain Google Query data."""
# Write header rows in files to contain Google Query data.
header_title = "LICENSE TYPE,No Priori,"
selected_countries = get_country_list()
all_countries = get_country_list(select_all=True)
Expand All @@ -257,18 +282,19 @@ def set_up_data_file():


def record_license_data(license_type=None, time=False, country=False):
"""Writes the row for LICENSE_TYPE to file to contain Google Query data.
"""
Writes the row for LICENSE_TYPE to file to contain Google Query data.

Args:
license:
- license_type:
A string representing the type of license, and should be a segment
of its URL towards the license description. Alternatively, the
default None value stands for having no assumption about license
type.
time:
- time:
A boolean indicating whether this query is related to video time
occurrence.
country:
- country:
A boolean indicating whether this query is related to country
occurrence.
"""
Expand Down Expand Up @@ -317,16 +343,19 @@ def record_license_data(license_type=None, time=False, country=False):


def record_all_licenses():
"""Records the data of all license types findable in the license list and
"""
Records the data of all license types findable in the license list and
records these data into the DATA_WRITE_FILE and DATA_WRITE_FILE_TIME as
specified in those constants.
"""
license_list = get_license_list()
record_license_data(time=False)
# Record license data with no assumption about license type
record_license_data()
record_license_data(time=True)
record_license_data(country=True)
# Get the list of license types and record data for each license type
license_list = get_license_list()
for license_type in license_list:
record_license_data(license_type, time=False)
record_license_data(license_type)
record_license_data(license_type, time=True)


Expand Down
Loading