
Github scraper initial #38

Open · wants to merge 16 commits into main · showing changes from 13 commits
24 changes: 23 additions & 1 deletion score/cli.py
@@ -2,11 +2,11 @@

import click
import duckdb

from .conda.get_conda_package_names import get_conda_package_names
from .conda.scrape_conda import scrape_conda
from .data_retrieval.json_scraper import scrape_json
from .data_retrieval.web_scraper import scrape_web
from .data_retrieval.github_scraper import scrape_github_data
from .logger import setup_logger
from .utils.get_pypi_package_list import get_pypi_package_names
from .vulnerabilities.scrape_vulnerabilities import scrape_vulnerabilities
@@ -82,6 +82,28 @@ def scrape_pypi_web(num_partitions, partition, output):
click.echo("Scraping completed.")


@cli.command()
@click.option(
    "-i",
    "--input",
    default=os.path.join(OUTPUT_ROOT, "source-urls.parquet"),
    help="The input file containing the GitHub URLs",
)
@click.option(
    "-o",
    "--output",
    default=os.path.join(OUTPUT_ROOT, "github-details.parquet"),
    help="The output file to save the detailed GitHub data",
)
def scrape_github(input, output):
    click.echo("Scraping GitHub data.")

    df = scrape_github_data(input_file=input)
Contributor: please add a partition on the source_url and do the file read/writes in the cli.py (a sketch of this follows the cli.py diff below)


click.echo(f"Saving data to {output}")
df.to_parquet(output)


@cli.command()
@click.option(
"--output",
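The review comment above asks for a partition on source_url and for the file read/writes to live in cli.py. Below is a minimal sketch of one way to do that, not the PR's implementation: the --num-partitions/--partition option names are assumptions that mirror the parameters of the existing scrape_pypi_web command, the hash-based bucket is just one way to split the URLs, and it presumes scrape_github_data is refactored to accept a pre-filtered DataFrame instead of a file path.

# Sketch for cli.py; reuses the module's existing imports (os, click, duckdb)
# and the OUTPUT_ROOT constant.
@cli.command()
@click.option("--num-partitions", default=1, help="Total number of partitions")
@click.option("--partition", default=0, help="The partition index to process")
@click.option(
    "-i",
    "--input",
    default=os.path.join(OUTPUT_ROOT, "source-urls.parquet"),
    help="The input file containing the GitHub URLs",
)
@click.option(
    "-o",
    "--output",
    default=os.path.join(OUTPUT_ROOT, "github-details.parquet"),
    help="The output file to save the detailed GitHub data",
)
def scrape_github(num_partitions, partition, input, output):
    click.echo("Scraping GitHub data.")

    # The file read happens here in cli.py; rows are bucketed on source_url so
    # each partition index sees a disjoint slice of the URLs.
    query = f"""
        SELECT *
        FROM read_parquet('{input}')
        WHERE source_url IS NOT NULL
        AND source_url LIKE '%github.com%'
        AND hash(source_url) % {num_partitions} = {partition}
    """
    df = duckdb.query(query).to_df()

    # Hypothetical refactor: scrape_github_data takes the pre-filtered frame
    # rather than reading the parquet file itself.
    result = scrape_github_data(df)

    click.echo(f"Saving data to {output}")
    result.to_parquet(output)

With the read and the source_url filter moved here, github_scraper.py would no longer need its own duckdb import.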
113 changes: 113 additions & 0 deletions score/data_retrieval/github_scraper.py
@@ -0,0 +1,113 @@
import os
import pandas as pd
import requests
from tqdm import tqdm
import logging
import duckdb

log = logging.getLogger(__name__)

# Constants
GITHUB_API_URL = "https://api.github.com/repos/"
AUTH_HEADER = {"Authorization": f"token {os.getenv('GITHUB_TOKEN', '')}"}

# Fields to extract from the GitHub API response
FIELDS_TO_EXTRACT = {
"created_at": "created_at",
"updated_at": "updated_at",
"pushed_at": "pushed_at",
"stargazers_count": "stargazers_count",
"forks_count": "forks_count",
"open_issues_count": "open_issues_count",
"subscribers_count": "subscribers_count",
"watchers_count": "watchers_count",
"releases_url": "releases_url",
"commits_url": "commits_url",
"collaborators_url": "collaborators_url",
"contributors_url": "contributors_url",
Contributor: why store this url? can we fetch a list of collaborators instead? (a sketch of fetching collaborators follows this field map)

"license.name": "license",
}
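On the comment above about fetching a list of collaborators rather than storing collaborators_url: a hedged sketch of such a helper is below. fetch_collaborators is a hypothetical name, not part of this PR, and the /collaborators endpoint requires a token with push access to the repository, so for arbitrary public repos it will often return 403 (the public /contributors endpoint, already fetched further down, is the one that generally succeeds).

def fetch_collaborators(repo_name):
    """
    Return collaborator logins for 'owner/repo', or None if the call is not permitted.
    Reuses the module's GITHUB_API_URL, AUTH_HEADER, and log defined above.
    """
    response = requests.get(
        f"{GITHUB_API_URL}{repo_name}/collaborators", headers=AUTH_HEADER
    )
    if response.status_code != 200:
        log.debug(
            f"Could not fetch collaborators for {repo_name}: {response.status_code}"
        )
        return None
    return [user["login"] for user in response.json()]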


def fetch_github_data(repo_url):
"""
Fetches data from the GitHub API for a given repository URL and extracts specified fields.
Additionally fetches details from 'collaborators_url' and 'contributors_url'.

Args:
repo_url (str): The GitHub repository URL.

Returns:
dict: A dictionary containing the extracted data fields and additional details.
"""
repo_name = "/".join(repo_url.split("/")[-2:])
response = requests.get(GITHUB_API_URL + repo_name, headers=AUTH_HEADER)

# Handle non-existent repositories gracefully
if response.status_code == 404:
log.debug(f"Skipping repository not found for URL {repo_url}")
return None
Contributor: Do we want to return None? We probably want to include this in the score, like:
  • health: unknown
  • notes: the included source URL does not exist
(a sketch of recording it this way follows fetch_github_data below)

Contributor: can you comment on how we will record this data?

    response.raise_for_status()  # Raise an error for bad status codes
    data = response.json()

    # Extract required fields from the GitHub API response
    extracted_data = {}
    for key, field in FIELDS_TO_EXTRACT.items():
        if "." in key:
            top_level_key, nested_key = key.split(".")
            top_level_data = data.get(top_level_key, {})
            extracted_data[field] = (
                top_level_data.get(nested_key)
                if isinstance(top_level_data, dict)
                else None
            )
        else:
            extracted_data[field] = data.get(key)

    # Fetch additional details for contributors
    if contributors_url := data.get("contributors_url"):
        contributors_response = requests.get(contributors_url, headers=AUTH_HEADER)
        if contributors_response.status_code == 200:
            contributors = contributors_response.json()
            extracted_data["contributors"] = contributors
            extracted_data["contributors_count"] = len(contributors)
        else:
            log.debug(f"Failed to fetch contributors for URL {repo_url}")

    return extracted_data
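On the two comments above about not dropping missing repositories: one way to record them, sketched below, is a thin wrapper that keeps a placeholder row per URL instead of returning None. The health and notes field names are taken from the comment's wording and are not part of the existing schema; fetch_github_data_or_placeholder is a hypothetical variant, not the PR's implementation.

def fetch_github_data_or_placeholder(repo_url):
    """
    Like fetch_github_data, but records repositories that do not exist so they
    still show up in the score instead of being silently skipped.
    """
    extracted_data = fetch_github_data(repo_url)

    if extracted_data is None:
        # fetch_github_data returns None on a 404; keep a placeholder row.
        return {
            "source_url": repo_url,
            "health": "unknown",
            "notes": "the included source URL does not exist",
        }

    extracted_data["source_url"] = repo_url
    extracted_data["health"] = "ok"
    return extracted_data

scrape_github_data could then append every result unconditionally instead of guarding with "if data:".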


def scrape_github_data(input_file: str):
"""
Initiates the scraping process using the GitHub API for a given input file.

Args:
input_file (str): Path to the input file (github-urls.parquet).

Returns:
pd.DataFrame: A DataFrame containing the scraped data.
"""

query = f"""
SELECT *
FROM read_parquet('{input_file}')
WHERE source_url IS NOT NULL
AND source_url LIKE '%github.com%'
"""
df = duckdb.query(query).to_df()

if df.empty:
log.debug("No valid GitHub URLs found in the input file")
return pd.DataFrame()

all_repo_data = []

# Iterate over the DataFrame rows and fetch data from GitHub API
for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing GitHub URLs"):
source_url = row["source_url"]
data = fetch_github_data(source_url)
if data:
data["source_url"] = source_url # Use source_url as the unique key
all_repo_data.append(data)

return pd.DataFrame(all_repo_data)
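For completeness, a usage sketch of the new command via click's test runner. It assumes the click group in cli.py is importable as score.cli and uses hypothetical parquet paths; on recent click versions the command name is derived from the function name as scrape-github. A real GITHUB_TOKEN should be exported first so AUTH_HEADER carries a usable token.

from click.testing import CliRunner

from score.cli import cli  # assumes the package is importable as `score`

runner = CliRunner()
result = runner.invoke(
    cli,
    [
        "scrape-github",
        "--input", "data/source-urls.parquet",      # hypothetical path
        "--output", "data/github-details.parquet",  # hypothetical path
    ],
)
print(result.output)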