Github scraper initial #38

Open · wants to merge 16 commits into base: main
Changes from 4 commits
111 changes: 104 additions & 7 deletions score/cli.py
@@ -5,10 +5,22 @@
from .logger import setup_logger
from .data_retrieval.json_scraper import scrape_json
from .data_retrieval.web_scraper import scrape_web
from .utils.github_aggregator import aggregate
from .data_retrieval.github_scraper import scrape_github_data

OUTPUT_ROOT = Path(os.environ.get("OUTPUT_ROOT", "."))


def validate_input(ctx, param, value):
if not (
value.isdigit() or (len(value) == 1 and value.isalpha() and value.islower())
):
raise click.BadParameter(
f"{value} is not a valid input. Please enter a single letter (a-z) or number (0-9)."
)
return value


def get_letter_range(start: int, end: int):
"""
Generates a list of characters from start to end inclusive, supporting both numbers and letters.
@@ -21,7 +33,7 @@ def get_letter_range(start: int, end: int):
list: A list of characters from start to end.
"""
all_chars = string.digits + string.ascii_lowercase
return list(all_chars[start:end])
return list(all_chars[start : end + 1])


@click.group()
@@ -38,17 +50,23 @@ def cli():
@click.option(
"--start",
required=True,
type=int,
type=str,
callback=validate_input,
help="Enter the starting letter or number to scrape (e.g., 'a' or '0').",
)
@click.option(
"--end",
required=True,
type=int,
type=str,
callback=validate_input,
help="Enter the ending letter or number to scrape (e.g., 'c' or '9').",
)
def scrape_pypi(start, end, output):
letters_to_scrape = get_letter_range(start, end)
all_chars = string.digits + string.ascii_lowercase
start_index = all_chars.index(start)
end_index = all_chars.index(end)

letters_to_scrape = get_letter_range(start_index, end_index)
click.echo(
f"Will process all packages starting with characters {letters_to_scrape}."
)
@@ -65,17 +83,23 @@ def scrape_pypi(start, end, output):
@click.option(
"--start",
required=True,
type=int,
type=str,
callback=validate_input,
help="Enter the starting letter or number to scrape (e.g., 'a' or '0').",
)
@click.option(
"--end",
required=True,
type=int,
type=str,
callback=validate_input,
help="Enter the ending letter or number to scrape (e.g., 'c' or '9').",
)
def scrape_pypi_web(start, end, output):
letters_to_scrape = get_letter_range(start, end)
all_chars = string.digits + string.ascii_lowercase
start_index = all_chars.index(start)
end_index = all_chars.index(end)

letters_to_scrape = get_letter_range(start_index, end_index)
click.echo(
f"Will process all packages starting with characters {letters_to_scrape}."
)
@@ -84,5 +108,78 @@ def scrape_pypi_web(start, end, output):
click.echo("Scraping completed.")


@cli.command()
@click.option(
"--output",
default=OUTPUT_ROOT / "output" / "github-urls",
help="The output directory to save the aggregated data",
)
@click.option(
"--input",
default=OUTPUT_ROOT / "output" / "pypi-json",
help="The input directory to read the data from",
)
@click.option(
"--start",
required=True,
type=str,
callback=validate_input,
help="Enter the starting letter or number to scrape (e.g., 'a' or '0').",
)
@click.option(
"--end",
required=True,
type=str,
callback=validate_input,
help="Enter the ending letter or number to scrape (e.g., 'c' or '9').",
)
def github_aggregate(start, end, input, output):
Contributor comment: Should this one have a start and end? This will not affect performance, right?

all_chars = string.digits + string.ascii_lowercase
start_index = all_chars.index(start)
end_index = all_chars.index(end)

letters_to_scrape = get_letter_range(start_index, end_index)

click.echo(f"Aggregating all data starting with characters {letters_to_scrape}.")

aggregate(input, output, letters_to_scrape)
click.echo("Aggregation completed.")


@cli.command()
@click.option(
"--start",
required=True,
type=str,
callback=validate_input,
help="Enter the starting letter or number to scrape (e.g., 'a' or '0').",
)
@click.option(
"--end",
required=True,
type=str,
callback=validate_input,
help="Enter the ending letter or number to scrape (e.g., 'c' or '9').",
)
@click.option(
"--output",
default=OUTPUT_ROOT / "output" / "github-detailed",
help="The output directory to save the detailed GitHub data",
)
def scrape_github(start, end, output):
all_chars = string.digits + string.ascii_lowercase
start_index = all_chars.index(start)
end_index = all_chars.index(end)

letters_to_scrape = get_letter_range(start_index, end_index)

click.echo(f"Scraping GitHub data for characters {letters_to_scrape}.")

input_dir = OUTPUT_ROOT / "output" / "github-urls"
scrape_github_data(input_dir, output, letters_to_scrape)

click.echo("Scraping completed.")


if __name__ == "__main__":
cli()
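
For reference, a minimal standalone sketch of the shared range logic above — the end + 1 slice is the inclusive-range fix in this PR, and the all_chars.index(...) lookups mirror what each command now does before calling get_letter_range:

import string

# Standalone illustration of the range logic in cli.py (mirrors the PR code).
ALL_CHARS = string.digits + string.ascii_lowercase  # "0123456789abcdefghijklmnopqrstuvwxyz"


def get_letter_range(start: int, end: int) -> list:
    # end + 1 makes the slice inclusive; this is the off-by-one fix above.
    return list(ALL_CHARS[start : end + 1])


# Equivalent of passing --start a --end c once validate_input has accepted both values:
print(get_letter_range(ALL_CHARS.index("a"), ALL_CHARS.index("c")))  # ['a', 'b', 'c']
print(get_letter_range(ALL_CHARS.index("8"), ALL_CHARS.index("b")))  # ['8', '9', 'a', 'b']
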
116 changes: 116 additions & 0 deletions score/data_retrieval/github_scraper.py
@@ -0,0 +1,116 @@
import os
import pandas as pd
import requests
from tqdm import tqdm
import logging
from typing import List

log = logging.getLogger(__name__)

# Constants
GITHUB_API_URL = "https://api.github.com/repos/"
AUTH_HEADER = {"Authorization": f"token {os.getenv('GITHUB_TOKEN', '')}"}

# Fields to extract from the GitHub API response
FIELDS_TO_EXTRACT = {
"created_at": "created_at",
"updated_at": "updated_at",
"pushed_at": "pushed_at",
"stargazers_count": "stargazers_count",
"forks_count": "forks_count",
"open_issues_count": "open_issues_count",
"subscribers_count": "subscribers_count",
"watchers_count": "watchers_count",
"releases_url": "releases_url",
"commits_url": "commits_url",
"collaborators_url": "collaborators_url",
"contributors_url": "contributors_url",
Contributor comment: Why store this URL? Can we fetch a list of collaborators instead?

"license.name": "license",
}
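
Regarding the comment above: a rough sketch of fetching the contributor list directly from the standard GitHub REST endpoint rather than storing contributors_url (the collaborators endpoint is similar but requires push access to the repo). The helper name and the choice to keep only logins are assumptions, not part of the PR:

import requests


def fetch_contributors(repo_name: str, auth_header: dict) -> list:
    # Sketch: list contributor logins for "owner/repo" via the public
    # /repos/{owner}/{repo}/contributors endpoint instead of storing its URL.
    url = f"https://api.github.com/repos/{repo_name}/contributors"
    response = requests.get(url, headers=auth_header, params={"per_page": 100})
    response.raise_for_status()
    return [contributor["login"] for contributor in response.json()]
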


def fetch_github_data(repo_url):
"""
Fetches data from the GitHub API for a given repository URL and extracts specified fields.

Args:
repo_url (str): The GitHub repository URL.

Returns:
dict: A dictionary containing the extracted data fields.
"""
repo_name = "/".join(repo_url.split("/")[-2:])
response = requests.get(GITHUB_API_URL + repo_name, headers=AUTH_HEADER)
if response.status_code == 404:
log.debug(f"Skipping repository not found for URL {repo_url}")
return None
Contributor comment: Do we want to return None? We probably want to include this in the score, e.g.:
  • health: unknown
  • notes: the included source URL does not exist

Contributor comment: Can you comment on how we will record this data?

response.raise_for_status() # Raise an error for bad status codes
data = response.json()

extracted_data = {}
for key, field in FIELDS_TO_EXTRACT.items():
if "." in key:
top_level_key, nested_key = key.split(".")
top_level_data = data.get(top_level_key, {})
if isinstance(top_level_data, dict):
extracted_data[field] = top_level_data.get(nested_key, None)
else:
extracted_data[field] = None
else:
extracted_data[field] = data.get(key, None)
return extracted_data


def scrape_github_data(input_dir: str, output_dir: str, letters: List[str]):
"""
Initiates the scraping process using the GitHub API based on the given configuration.

Args:
input_dir (str): Directory to read the input files from.
output_dir (str): Directory to save the output files.
letters (List[str]): List of letters to process.
"""
# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
os.makedirs(output_dir)

for letter in letters:
process_repos_by_letter(input_dir, output_dir, letter)


def process_repos_by_letter(input_dir: str, output_dir: str, letter: str):
"""
Processes repositories by their first letter and saves the data to the specified output format.

Args:
input_dir (str): Directory to read the input files from.
output_dir (str): Directory to save the output files.
letter (str): The starting letter of the repositories to process.
"""
input_path = os.path.join(input_dir, f"first_letter={letter}")
if not os.path.exists(input_path):
log.debug(f"No data for letter {letter}")
return

df = pd.read_parquet(input_path)
all_repo_data = []

for _, row in tqdm(
df.iterrows(), total=len(df), desc=f"Processing letter {letter}"
):
package_name = row["name"]
source_url = row["source_url"]

data = fetch_github_data(source_url)
if data:
data["first_letter"] = letter
data["name"] = package_name
all_repo_data.append(data)

if all_repo_data:
output_df = pd.DataFrame(all_repo_data)
output_path = os.path.join(output_dir, f"first_letter={letter}")
output_df.to_parquet(output_path, partition_cols=["first_letter"])
log.info(f"Data saved for letter {letter} to {output_path}")
else:
log.info(f"No valid GitHub data found for letter {letter}")
1 change: 1 addition & 0 deletions score/data_retrieval/json_scraper.py
@@ -90,6 +90,7 @@ def process_packages_by_letter(letter, package_names, output_dir):
):
package_data = get_package_data(package_name)
if package_data:
df = pd.json_normalize(package_data)
all_package_data.append(package_data)

df = pd.DataFrame(all_package_data)
1 change: 0 additions & 1 deletion score/data_retrieval/web_scraper.py
@@ -123,7 +123,6 @@ def scrape_web(output_dir: str, letters: List[str]):
"""
package_names = get_all_package_names()

output_dir = os.path.join("output", "web")
if not os.path.exists(output_dir):
os.makedirs(output_dir)

31 changes: 0 additions & 31 deletions score/utils/common.py
@@ -2,37 +2,6 @@
import re


def input_formatter(letters_str):
"""
Formats an input string into a sorted set of letters and digits.
Allows for the scraper to scrape specific ranges of letters.

Args:
letters_str (str): A string containing ranges or individual characters (e.g., "a-d,0-3").

Returns:
str: A sorted string of individual characters representing the input ranges.
"""
letters = set()
if not letters_str:
letters_str = "0-9,a-z" # Default range if no input is provided
for part in letters_str.split(","):
part = part.strip()
if "-" in part:
start, end = part.split("-")
start, end = start.strip(), end.strip()
if start.isdigit() and end.isdigit():
# Add all digits in the specified range to the set
letters.update(str(i) for i in range(int(start), int(end) + 1))
elif start.isalpha() and end.isalpha():
# Add all letters in the specified range to the set
letters.update(chr(i) for i in range(ord(start), ord(end) + 1))
else:
# Add individual characters to the set
letters.add(part)
return "".join(sorted(letters))


def get_all_package_names():
"""
Fetches the list of all package names from the PyPI Simple API.
34 changes: 34 additions & 0 deletions score/utils/github_aggregator.py
@@ -0,0 +1,34 @@
import os
import pandas as pd


def aggregate(input_dir, output_dir, letters_to_scrape):
"""
Read parquet files from the input directory, filter based on the first letter,
extract required fields, and write to the output directory with partitioning.
"""
aggregated_df = pd.DataFrame()

for letter in letters_to_scrape:
dir_path = os.path.join(input_dir, f"first_letter={letter}")
if os.path.exists(dir_path):
Contributor comment: This can be done natively with pandas or DuckDB. I would use DuckDB here because it is simple, and I would expect it to perform much better than pandas.

df = pd.read_parquet(dir_path)
df["first_letter"] = letter # Add the first_letter column manually
Contributor comment: You should not need this if you read it as a hive partition.

if "name" in df.columns and "source_url" in df.columns:
df_filtered = df[["name", "source_url", "first_letter"]]
df_filtered = df_filtered.dropna(
subset=["source_url"]
) # Drop rows where source_url is null
aggregated_df = pd.concat(
[aggregated_df, df_filtered], ignore_index=True
)
else:
print(
f"The required columns are not present in the data for letter {letter}."
)

# Write the aggregated data to the output directory with partitioning by 'first_letter'
if not aggregated_df.empty:
aggregated_df.to_parquet(output_dir, partition_cols=["first_letter"])
else:
print("No data to write after aggregation.")