Github scraper initial #38

Open · wants to merge 16 commits into base: main
Changes from 4 commits
111 changes: 104 additions & 7 deletions score/cli.py
@@ -5,10 +5,22 @@
from .logger import setup_logger
from .data_retrieval.json_scraper import scrape_json
from .data_retrieval.web_scraper import scrape_web
from .utils.github_aggregator import aggregate
from .data_retrieval.github_scraper import scrape_github_data

OUTPUT_ROOT = Path(os.environ.get("OUTPUT_ROOT", "."))


def validate_input(ctx, param, value):
if not (
value.isdigit() or (len(value) == 1 and value.isalpha() and value.islower())
):
raise click.BadParameter(
f"{value} is not a valid input. Please enter a single letter (a-z) or number (0-9)."
)
return value


def get_letter_range(start: int, end: int):
"""
Generates a list of characters from start to end inclusive, supporting both numbers and letters.
@@ -21,7 +33,7 @@ def get_letter_range(start: int, end: int):
list: A list of characters from start to end.
"""
all_chars = string.digits + string.ascii_lowercase
return list(all_chars[start:end])
return list(all_chars[start : end + 1])


@click.group()
@@ -38,17 +50,23 @@ def cli():
@click.option(
"--start",
required=True,
type=int,
type=str,
callback=validate_input,
help="Enter the starting letter or number to scrape (e.g., 'a' or '0').",
)
@click.option(
"--end",
required=True,
type=int,
type=str,
callback=validate_input,
help="Enter the ending letter or number to scrape (e.g., 'c' or '9').",
)
def scrape_pypi(start, end, output):
letters_to_scrape = get_letter_range(start, end)
all_chars = string.digits + string.ascii_lowercase
start_index = all_chars.index(start)
end_index = all_chars.index(end)

letters_to_scrape = get_letter_range(start_index, end_index)
click.echo(
f"Will process all packages starting with characters {letters_to_scrape}."
)
@@ -65,17 +83,23 @@ def scrape_pypi(start, end, output):
@click.option(
"--start",
required=True,
type=int,
type=str,
callback=validate_input,
help="Enter the starting letter or number to scrape (e.g., 'a' or '0').",
)
@click.option(
"--end",
required=True,
type=int,
type=str,
callback=validate_input,
help="Enter the ending letter or number to scrape (e.g., 'c' or '9').",
)
def scrape_pypi_web(start, end, output):
letters_to_scrape = get_letter_range(start, end)
all_chars = string.digits + string.ascii_lowercase
start_index = all_chars.index(start)
end_index = all_chars.index(end)

letters_to_scrape = get_letter_range(start_index, end_index)
click.echo(
f"Will process all packages starting with characters {letters_to_scrape}."
)
@@ -84,5 +108,78 @@ def scrape_pypi_web(start, end, output):
click.echo("Scraping completed.")


@cli.command()
@click.option(
"--output",
default=OUTPUT_ROOT / "output" / "github-urls",
help="The output directory to save the aggregated data",
)
@click.option(
"--input",
default=OUTPUT_ROOT / "output" / "pypi-json",
help="The input directory to read the data from",
)
@click.option(
"--start",
required=True,
type=str,
callback=validate_input,
help="Enter the starting letter or number to scrape (e.g., 'a' or '0').",
)
@click.option(
"--end",
required=True,
type=str,
callback=validate_input,
help="Enter the ending letter or number to scrape (e.g., 'c' or '9').",
)
def github_aggregate(start, end, input, output):
Contributor comment: Should this one have a start and end? This will not affect performance, right?

all_chars = string.digits + string.ascii_lowercase
start_index = all_chars.index(start)
end_index = all_chars.index(end)

letters_to_scrape = get_letter_range(start_index, end_index)

click.echo(f"Aggregating all data starting with characters {letters_to_scrape}.")

aggregate(input, output, letters_to_scrape)
click.echo("Aggregation completed.")


@cli.command()
@click.option(
"--start",
required=True,
type=str,
callback=validate_input,
help="Enter the starting letter or number to scrape (e.g., 'a' or '0').",
)
@click.option(
"--end",
required=True,
type=str,
callback=validate_input,
help="Enter the ending letter or number to scrape (e.g., 'c' or '9').",
)
@click.option(
"--output",
default=OUTPUT_ROOT / "output" / "github-detailed",
help="The output directory to save the detailed GitHub data",
)
def scrape_github(start, end, output):
all_chars = string.digits + string.ascii_lowercase
start_index = all_chars.index(start)
end_index = all_chars.index(end)

letters_to_scrape = get_letter_range(start_index, end_index)

click.echo(f"Scraping GitHub data for characters {letters_to_scrape}.")

input_dir = OUTPUT_ROOT / "output" / "github-urls"
scrape_github_data(input_dir, output, letters_to_scrape)

click.echo("Scraping completed.")


if __name__ == "__main__":
cli()
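
For reference, a minimal standalone sketch of the shared range logic above — the end + 1 slice is the inclusive-range fix in this PR, and the all_chars.index(...) lookups mirror what each command now does before calling get_letter_range:

import string

# Standalone illustration of the range logic in cli.py (mirrors the PR code).
ALL_CHARS = string.digits + string.ascii_lowercase  # "0123456789abcdefghijklmnopqrstuvwxyz"


def get_letter_range(start: int, end: int) -> list:
    # end + 1 makes the slice inclusive; this is the off-by-one fix above.
    return list(ALL_CHARS[start : end + 1])


# Equivalent of passing --start a --end c once validate_input has accepted both values:
print(get_letter_range(ALL_CHARS.index("a"), ALL_CHARS.index("c")))  # ['a', 'b', 'c']
print(get_letter_range(ALL_CHARS.index("8"), ALL_CHARS.index("b")))  # ['8', '9', 'a', 'b']
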
116 changes: 116 additions & 0 deletions score/data_retrieval/github_scraper.py
@@ -0,0 +1,116 @@
import os
import pandas as pd
import requests
from tqdm import tqdm
import logging
from typing import List

log = logging.getLogger(__name__)

# Constants
GITHUB_API_URL = "https://api.github.com/repos/"
AUTH_HEADER = {"Authorization": f"token {os.getenv('GITHUB_TOKEN', '')}"}

# Fields to extract from the GitHub API response
FIELDS_TO_EXTRACT = {
"created_at": "created_at",
"updated_at": "updated_at",
"pushed_at": "pushed_at",
"stargazers_count": "stargazers_count",
"forks_count": "forks_count",
"open_issues_count": "open_issues_count",
"subscribers_count": "subscribers_count",
"watchers_count": "watchers_count",
"releases_url": "releases_url",
"commits_url": "commits_url",
"collaborators_url": "collaborators_url",
"contributors_url": "contributors_url",
Contributor comment: Why store this URL? Can we fetch a list of collaborators instead?

"license.name": "license",
}
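
Regarding the comment above: a rough sketch of fetching the contributor list directly from the standard GitHub REST endpoint rather than storing contributors_url (the collaborators endpoint is similar but requires push access to the repo). The helper name and the choice to keep only logins are assumptions, not part of the PR:

import requests


def fetch_contributors(repo_name: str, auth_header: dict) -> list:
    # Sketch: list contributor logins for "owner/repo" via the public
    # /repos/{owner}/{repo}/contributors endpoint instead of storing its URL.
    url = f"https://api.github.com/repos/{repo_name}/contributors"
    response = requests.get(url, headers=auth_header, params={"per_page": 100})
    response.raise_for_status()
    return [contributor["login"] for contributor in response.json()]
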


def fetch_github_data(repo_url):
"""
Fetches data from the GitHub API for a given repository URL and extracts specified fields.

Args:
repo_url (str): The GitHub repository URL.

Returns:
dict: A dictionary containing the extracted data fields.
"""
repo_name = "/".join(repo_url.split("/")[-2:])
response = requests.get(GITHUB_API_URL + repo_name, headers=AUTH_HEADER)
if response.status_code == 404:
log.debug(f"Skipping repository not found for URL {repo_url}")
return None
Contributor comment: Do we want to return None? We probably want to include this in the score, e.g.:
  • health: unknown
  • notes: the included source URL does not exist

Contributor comment: Can you comment on how we will record this data?

response.raise_for_status() # Raise an error for bad status codes
data = response.json()

extracted_data = {}
for key, field in FIELDS_TO_EXTRACT.items():
if "." in key:
top_level_key, nested_key = key.split(".")
top_level_data = data.get(top_level_key, {})
if isinstance(top_level_data, dict):
extracted_data[field] = top_level_data.get(nested_key, None)
else:
extracted_data[field] = None
else:
extracted_data[field] = data.get(key, None)
return extracted_data


def scrape_github_data(input_dir: str, output_dir: str, letters: List[str]):
"""
Initiates the scraping process using the GitHub API based on the given configuration.

Args:
input_dir (str): Directory to read the input files from.
output_dir (str): Directory to save the output files.
letters (List[str]): List of letters to process.
"""
# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
os.makedirs(output_dir)

for letter in letters:
process_repos_by_letter(input_dir, output_dir, letter)


def process_repos_by_letter(input_dir: str, output_dir: str, letter: str):
"""
Processes repositories by their first letter and saves the data to the specified output format.

Args:
input_dir (str): Directory to read the input files from.
output_dir (str): Directory to save the output files.
letter (str): The starting letter of the repositories to process.
"""
input_path = os.path.join(input_dir, f"first_letter={letter}")
if not os.path.exists(input_path):
log.debug(f"No data for letter {letter}")
return

df = pd.read_parquet(input_path)
all_repo_data = []

for _, row in tqdm(
df.iterrows(), total=len(df), desc=f"Processing letter {letter}"
):
package_name = row["name"]
source_url = row["source_url"]

data = fetch_github_data(source_url)
if data:
data["first_letter"] = letter
data["name"] = package_name
all_repo_data.append(data)

if all_repo_data:
output_df = pd.DataFrame(all_repo_data)
output_path = os.path.join(output_dir, f"first_letter={letter}")
output_df.to_parquet(output_path, partition_cols=["first_letter"])
log.info(f"Data saved for letter {letter} to {output_path}")
else:
log.info(f"No valid GitHub data found for letter {letter}")
1 change: 1 addition & 0 deletions score/data_retrieval/json_scraper.py
@@ -90,6 +90,7 @@ def process_packages_by_letter(letter, package_names, output_dir):
):
package_data = get_package_data(package_name)
if package_data:
df = pd.json_normalize(package_data)
all_package_data.append(package_data)

df = pd.DataFrame(all_package_data)
1 change: 0 additions & 1 deletion score/data_retrieval/web_scraper.py
@@ -123,7 +123,6 @@ def scrape_web(output_dir: str, letters: List[str]):
"""
package_names = get_all_package_names()

output_dir = os.path.join("output", "web")
if not os.path.exists(output_dir):
os.makedirs(output_dir)

31 changes: 0 additions & 31 deletions score/utils/common.py
@@ -2,37 +2,6 @@
import re


def input_formatter(letters_str):
"""
Formats an input string into a sorted set of letters and digits.
Allows for the scraper to scrape specific ranges of letters.

Args:
letters_str (str): A string containing ranges or individual characters (e.g., "a-d,0-3").

Returns:
str: A sorted string of individual characters representing the input ranges.
"""
letters = set()
if not letters_str:
letters_str = "0-9,a-z" # Default range if no input is provided
for part in letters_str.split(","):
part = part.strip()
if "-" in part:
start, end = part.split("-")
start, end = start.strip(), end.strip()
if start.isdigit() and end.isdigit():
# Add all digits in the specified range to the set
letters.update(str(i) for i in range(int(start), int(end) + 1))
elif start.isalpha() and end.isalpha():
# Add all letters in the specified range to the set
letters.update(chr(i) for i in range(ord(start), ord(end) + 1))
else:
# Add individual characters to the set
letters.add(part)
return "".join(sorted(letters))


def get_all_package_names():
"""
Fetches the list of all package names from the PyPI Simple API.
34 changes: 34 additions & 0 deletions score/utils/github_aggregator.py
@@ -0,0 +1,34 @@
import os
import pandas as pd


def aggregate(input_dir, output_dir, letters_to_scrape):
"""
Read parquet files from the input directory, filter based on the first letter,
extract required fields, and write to the output directory with partitioning.
"""
aggregated_df = pd.DataFrame()

for letter in letters_to_scrape:
dir_path = os.path.join(input_dir, f"first_letter={letter}")
if os.path.exists(dir_path):
Contributor comment: This can be done natively with pandas or DuckDB. I would use DuckDB here because it is simple, and I would expect it to perform much better than pandas.

df = pd.read_parquet(dir_path)
df["first_letter"] = letter # Add the first_letter column manually
Contributor comment: You should not need this if you read it as a hive partition.

if "name" in df.columns and "source_url" in df.columns:
df_filtered = df[["name", "source_url", "first_letter"]]
df_filtered = df_filtered.dropna(
subset=["source_url"]
) # Drop rows where source_url is null
aggregated_df = pd.concat(
[aggregated_df, df_filtered], ignore_index=True
)
else:
print(
f"The required columns are not present in the data for letter {letter}."
)

# Write the aggregated data to the output directory with partitioning by 'first_letter'
if not aggregated_df.empty:
aggregated_df.to_parquet(output_dir, partition_cols=["first_letter"])
else:
print("No data to write after aggregation.")