# cumulative_grants_by_award_date.py

#!/usr/bin/env python3
"""
This script extracts NIH RePORTER grant data and plots cumulative counts and award amounts (YTD)
using data through today's date (even though RePORTER updates only weekly – see README for details).
Monthly queries are run (with caching) and if any month reaches the API limit (15,000 results) a warning is issued.
At the end, a CSV is generated (compressed with zstd) listing each grant's award date and grant number.
"""

import argparse
import datetime
import time
import json
from pathlib import Path
import requests
import numpy as np
import plotly.graph_objects as go
import colorsys

# Endpoint that fetch_grants_by_award_date() POSTs its JSON search queries to.
API_URL = "https://api.reporter.nih.gov/v2/projects/search"


class NIHReporterCache:
    """
    On-disk JSON cache of monthly grant query results.

    Each (year, month) pair is stored in its own file named
    ``grants_<year>_<month>.json`` that holds the date the data was fetched
    and the list of grants.
    """

    def __init__(self, cache_dir="cache"):
        """Initialize cache in the specified directory, creating it (and any parents) if missing."""
        self.cache_dir = Path(cache_dir)
        # parents=True so a nested location such as "data/cache" works on first run.
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def get_cache_path(self, year, month):
        """Get the cache file path for a specific year and month."""
        return self.cache_dir / f"grants_{year}_{month:02d}.json"

    def get_cached_data(self, year, month):
        """
        Retrieve cached data for a specific year and month.

        The cache is bypassed for the current and immediately previous month.
        Returns the cached list of grants, or None when there is no usable
        entry: the file is missing, unreadable, malformed, or was fetched
        more than 7 days ago.
        """
        today = datetime.date.today()
        if today.month == 1:
            previous = (today.year - 1, 12)
        else:
            previous = (today.year, today.month - 1)
        if (year, month) in ((today.year, today.month), previous):
            return None

        cache_path = self.get_cache_path(year, month)
        try:
            with open(cache_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            fetch_date = datetime.datetime.strptime(data["fetch_date"], "%Y-%m-%d").date()
            grants = data["grants"]
        except (OSError, ValueError, KeyError, TypeError):
            # Missing/unreadable file, invalid JSON, wrong structure or a bad
            # fetch_date: treat every such case as a cache miss, never a crash.
            return None

        if (today - fetch_date).days > 7:
            return None
        return grants

    def save_to_cache(self, year, month, grants):
        """Save grant data to cache along with the current fetch date."""
        cache_path = self.get_cache_path(year, month)
        data = {
            "fetch_date": datetime.date.today().strftime("%Y-%m-%d"),
            "grants": grants,
        }
        with open(cache_path, "w", encoding="utf-8") as f:
            json.dump(data, f)


def get_pastel_color(i, total):
    """
    Return a pastel color as an uppercase ``#RRGGBB`` hex string.

    The hue is ``i / total``; lightness (0.8) and saturation (0.5) are fixed.
    """
    channels = colorsys.hls_to_rgb(i / total, 0.8, 0.5)
    return "#" + "".join(f"{int(channel * 255):02X}" for channel in channels)


def fetch_grants_by_award_date(start_date, timeout=60):
    """
    Query the NIH RePORTER API for projects with award_notice_date between start_date and the last
    day of start_date's month.

    Args:
        start_date: datetime.date marking the start of the range (normally the 1st of a month).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of result dicts as returned by the API. Results are fetched in pages of 500 and at
        most 15,000 are retrieved. If a request fails, an error is printed and the results
        collected so far (possibly an incomplete list) are returned.
    """
    page_size = 500
    max_results = 15000

    if start_date.month == 12:
        next_month = start_date.replace(year=start_date.year + 1, month=1, day=1)
    else:
        next_month = start_date.replace(month=start_date.month + 1, day=1)
    # End on the last day of the month rather than the 1st of the next one: with an inclusive
    # date range, awards dated on the 1st would otherwise be returned by two consecutive
    # monthly queries and counted twice.
    end_date = next_month - datetime.timedelta(days=1)

    results = []
    offset = 0

    while True:
        query = {
            "criteria": {
                "award_notice_date": {
                    "from_date": start_date.strftime("%Y-%m-%d"),
                    "to_date": end_date.strftime("%Y-%m-%d"),
                }
            },
            "offset": offset,
            "limit": page_size,
            "fields": [
                "award_notice_date",
                "award_amount",
                "agency_ic_admin",
                "fiscal_year",
                "project_num",
                "contact_pi_name",
                "project_title",
                "organization_name",
            ],
        }
        print(f"Query payload (award date): {query}")
        try:
            # Without a timeout a stalled connection would block the script indefinitely.
            response = requests.post(API_URL, json=query, timeout=timeout)
            print(f"Response status: {response.status_code}")
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # Network/HTTP failure or a body that is not valid JSON: keep what we have.
            print(f"Error fetching data for {start_date} to {end_date}: {e}")
            break

        batch = data.get("results") or []
        total = (data.get("meta") or {}).get("total") or 0

        # Warn if the total exceeds the API offset limit.
        if offset == 0 and total >= max_results:
            print(
                f"WARNING: Query for {start_date} returned {total} awards. This may exceed the API limit (15,000)."
            )
        results.extend(batch)
        offset += page_size
        # An empty page means there is nothing further to read, whatever `total` claims.
        if not batch or offset >= min(total, max_results):
            break
        time.sleep(0.1)  # brief pause between pages to go easy on the API
    return results


def fetch_grants_with_cache(start_date, cache):
    """
    Return one month's grants, preferring the cache over the API.

    Looks up (start_date.year, start_date.month) in `cache`. On a hit the cached list is
    returned with the status "hit"; otherwise the month is fetched by award_notice_date,
    stored in the cache, and returned with the status "miss".
    """
    year, month = start_date.year, start_date.month
    stored = cache.get_cached_data(year, month)
    if stored is None:
        fresh = fetch_grants_by_award_date(start_date)
        cache.save_to_cache(year, month, fresh)
        return fresh, "miss"
    return stored, "hit"


def _parse_award_timestamp(raw):
    """Parse an award_notice_date string of the form ``YYYY-MM-DDTHH:MM:SSZ`` into a datetime."""
    return datetime.datetime.strptime(raw, "%Y-%m-%dT%H:%M:%SZ")


def fetch_all_grants_by_month(start_year, current_year, cutoff_date):
    """
    For each year from start_year to current_year, fetch monthly grant data (using award_notice_date)
    up to cutoff_date.month. Only awards with a date on or before cutoff_date are kept.

    Awards without an award_notice_date, with an unparseable one (a warning is printed), or dated
    outside the month being queried are left out of the aggregates. The last rule keeps an award
    that a query returns for a neighbouring month from being counted twice.

    Args:
        start_year: First calendar year to fetch (inclusive).
        current_year: Last calendar year to fetch (inclusive).
        cutoff_date: datetime.date; its month bounds the months fetched and its (month, day)
            bounds the awards kept in every year.

    Returns:
      - data_by_year_counts: For plotting cumulative counts ({year: [day_of_year, ...]}).
      - data_by_year_amounts: For plotting cumulative award amounts ({year: [(day_of_year, amount), ...]}).
      - ic_data_by_year: Per-day Institute/Center data
        ({year: {day_of_year: {"counts": {ic: n}, "amounts": {ic: total}}}}).
      - current_ics: Set of all encountered IC abbreviations.
      - validation_info: Warnings (if any) when monthly queries hit the API limit.
      - all_award_date_grants: Raw list of grants per year (unfiltered).
    """
    api_limit = 15000

    cache = NIHReporterCache()
    data_by_year_counts = {}
    data_by_year_amounts = {}
    ic_data_by_year = {}
    all_award_date_grants = {}
    monthly_warnings = {}
    current_ics = set()

    month_limit = cutoff_date.month
    cutoff_key = (cutoff_date.month, cutoff_date.day)
    for year in range(start_year, current_year + 1):
        year_ic_data = ic_data_by_year.setdefault(year, {})
        year_grants = all_award_date_grants[year] = []
        for month in range(1, month_limit + 1):
            start_date = datetime.date(year, month, 1)
            print(f"Fetching grants for {year}-{month:02d}...", end=" ")
            grants, cache_status = fetch_grants_with_cache(start_date, cache)
            print(f"Fetched {len(grants)} grants ({cache_status}).")
            if len(grants) >= api_limit:
                warning_msg = (
                    f"WARNING: {year}-{month:02d} reached the API limit of 15000 results; data may be incomplete."
                )
                print(warning_msg)
                monthly_warnings[f"{year}-{month:02d}"] = len(grants)
            year_grants.extend(grants)

            # Parse every date exactly once, *before* sorting, so that a single malformed
            # value is reported and skipped instead of aborting the sort.
            dated_grants = []
            for grant in grants:
                award_date_str = grant.get("award_notice_date")
                if not award_date_str:
                    continue
                try:
                    stamp = _parse_award_timestamp(award_date_str)
                except (TypeError, ValueError) as e:
                    print(f"Warning: Could not parse award_notice_date '{award_date_str}': {e}")
                    continue
                dated_grants.append((stamp, grant))
            dated_grants.sort(key=lambda pair: pair[0])

            for stamp, grant in dated_grants:
                dt = stamp.date()
                # An award dated in another month belongs to that month's query; counting
                # it here as well would double count it (and file it under the wrong year
                # across a December/January boundary).
                if (dt.year, dt.month) != (year, month):
                    continue
                # Only include awards on or before the cutoff_date.
                if (dt.month, dt.day) > cutoff_key:
                    continue
                day_of_year = dt.timetuple().tm_yday
                data_by_year_counts.setdefault(year, []).append(day_of_year)
                try:
                    # `or 0` covers an explicit null amount in the API response.
                    amount = float(grant.get("award_amount") or 0)
                except (TypeError, ValueError):
                    amount = 0.0
                data_by_year_amounts.setdefault(year, []).append((day_of_year, amount))
                # The key may be present with a null value, so a .get() default is not enough.
                ic_info = grant.get("agency_ic_admin") or {}
                ic = ic_info.get("abbreviation") or "Other"
                current_ics.add(ic)
                # Update IC data for this day.
                day_entry = year_ic_data.setdefault(day_of_year, {"counts": {}, "amounts": {}})
                day_entry["counts"][ic] = day_entry["counts"].get(ic, 0) + 1
                day_entry["amounts"][ic] = day_entry["amounts"].get(ic, 0) + amount

    validation_info = {"monthly_warnings": monthly_warnings}
    return data_by_year_counts, data_by_year_amounts, ic_data_by_year, current_ics, validation_info, all_award_date_grants


def create_cumulative_counts(year_days, cutoff):
    """
    Turn per-year lists of day-of-year values into running award counts.

    Days outside 1..cutoff are ignored. The result maps each year to a tuple
    (date_labels, cumulative_counts), where date_labels holds `cutoff` "Mon DD" strings
    starting at Jan 01 (generated from the year 2000) and is shared by all years.
    """
    jan_first = datetime.date(2000, 1, 1)
    labels = []
    for offset in range(cutoff):
        labels.append((jan_first + datetime.timedelta(days=offset)).strftime("%b %d"))

    result = {}
    for year in year_days:
        # Zero-based slots of the days that fall inside the plotted window.
        slots = np.asarray(
            [day - 1 for day in year_days[year] if 1 <= day <= cutoff], dtype=np.intp
        )
        per_day = np.zeros(cutoff)
        # ufunc.at accumulates repeated indices, unlike plain fancy-index assignment.
        np.add.at(per_day, slots, 1)
        result[year] = (labels, per_day.cumsum())
    return result


def create_cumulative_amounts(year_awards, cutoff):
    """
    Turn per-year lists of (day_of_year, amount) pairs into running award totals.

    Pairs whose day lies outside 1..cutoff are ignored. The result maps each year to a tuple
    (date_labels, cumulative_amounts), where date_labels holds `cutoff` "Mon DD" strings
    starting at Jan 01 (generated from the year 2000) and is shared by all years.
    """
    jan_first = datetime.date(2000, 1, 1)
    one_day = datetime.timedelta(days=1)
    labels = [(jan_first + offset * one_day).strftime("%b %d") for offset in range(cutoff)]

    result = {}
    for year in year_awards:
        kept = [(day - 1, amt) for day, amt in year_awards[year] if 1 <= day <= cutoff]
        slots = np.asarray([slot for slot, _ in kept], dtype=np.intp)
        values = np.asarray([amt for _, amt in kept], dtype=float)
        per_day = np.zeros(cutoff)
        # ufunc.at accumulates repeated indices, unlike plain fancy-index assignment.
        np.add.at(per_day, slots, values)
        result[year] = (labels, per_day.cumsum())
    return result


def _build_cumulative_figure(cum_data, current_year, tick_interval, colors, title, yaxis_title):
    """Build a line figure with one cumulative trace per year and labelled axes."""
    palette = colors or {}
    fig = go.Figure()
    for year in sorted(cum_data):
        x, y = cum_data[year]
        # Attach [year, day_of_year] to every point; the labels are "Mon DD" strings, so
        # they are resolved against the year 2000 to recover the day number.
        custom_data = [
            [year, datetime.datetime.strptime(f"{label} 2000", "%b %d %Y").timetuple().tm_yday]
            for label in x
        ]
        if year == current_year:
            # The current year stands out as a thick solid red line.
            line = dict(color="#FF0000", width=3, dash="solid")
        else:
            # A year without an entry in `colors` gets None, i.e. Plotly's default color.
            line = dict(color=palette.get(year), width=2, dash="dash")
        fig.add_trace(
            go.Scatter(
                x=x,
                y=y,
                mode="lines",
                name=str(year),
                line=line,
                customdata=custom_data,
            )
        )
    if cum_data:
        # Take the label list from the first series for the tick positions.
        full_x = next(iter(cum_data.values()))[0]
        # max(1, ...) guards against a zero slice step, which would raise ValueError.
        fig.update_xaxes(tickmode="array", tickvals=full_x[:: max(1, tick_interval)])
    fig.update_layout(
        title=title,
        xaxis_title="Date (Month-Day)",
        yaxis_title=yaxis_title,
        clickmode="event",
        margin=dict(t=100, r=20, b=70, l=20),
    )
    return fig


def _save_figure(fig, output_filename):
    """Write `fig` to <output_filename>.html and <output_filename>.png; return both file names."""
    html_file = f"{output_filename}.html"
    fig.write_html(html_file, full_html=True, include_plotlyjs="cdn")
    png_file = f"{output_filename}.png"
    fig.write_image(png_file, width=1200, height=800, scale=2)
    return html_file, png_file


def plot_cumulative_data(cum_data, ic_data, current_ics, current_year, tick_interval=7, colors=None, output_filename="nih_awards", validation_info=None):
    """
    Plot cumulative NIH awards (YTD) by award notice date.
    The X-axis is set to show tick labels every `tick_interval` days (weekly by default).

    Args:
        cum_data: Dict mapping year -> (date_labels, cumulative_counts).
        ic_data, current_ics, validation_info: Accepted for interface compatibility; not used
            by the plot.
        current_year: Year drawn as a solid red line; other years are dashed.
        tick_interval: Spacing, in days, between X-axis tick labels.
        colors: Optional dict mapping year -> color for the non-current years.
        output_filename: Base name (without extension) of the HTML and PNG files written.
    """
    fig = _build_cumulative_figure(
        cum_data,
        current_year,
        tick_interval,
        colors,
        title="Cumulative NIH Awards (YTD) by Award Notice Date",
        yaxis_title="Cumulative Number of Awards",
    )
    html_file, png_file = _save_figure(fig, output_filename)
    print(f"Count plots saved as {html_file} and {png_file}")


def plot_cumulative_amounts(cum_data, ic_data, current_ics, current_year, tick_interval=7, colors=None, output_filename="nih_award_amounts", validation_info=None):
    """
    Plot cumulative NIH award amounts (YTD) by award notice date.
    The X-axis shows tick labels every `tick_interval` days (weekly by default).

    Args:
        cum_data: Dict mapping year -> (date_labels, cumulative_amounts).
        ic_data, current_ics, validation_info: Accepted for interface compatibility; not used
            by the plot.
        current_year: Year drawn as a solid red line; other years are dashed.
        tick_interval: Spacing, in days, between X-axis tick labels.
        colors: Optional dict mapping year -> color for the non-current years.
        output_filename: Base name (without extension) of the HTML and PNG files written.
    """
    fig = _build_cumulative_figure(
        cum_data,
        current_year,
        tick_interval,
        colors,
        title="Cumulative NIH Award Amounts (YTD) by Award Notice Date",
        yaxis_title="Cumulative Award Amount ($)",
    )
    html_file, png_file = _save_figure(fig, output_filename)
    print(f"Award amount plots saved as {html_file} and {png_file}")


def save_grants_list(all_award_date_grants, output_filename="nih_awards_all"):
    """
    Create a CSV containing a list of all grants (one row per grant) with two columns:
    award_date and grant_number. Then compress the CSV using zstd.

    Args:
        all_award_date_grants: Dict mapping year -> list of grant dicts; the
            "award_notice_date" and "project_num" keys of each grant are used.
        output_filename: Base name; writes <output_filename>.csv and <output_filename>.csv.zst.

    The header row is written even when there are no grants, so the CSV always has the same
    two columns.
    """
    import pandas as pd
    import zstandard as zstd

    columns = ["award_date", "grant_number"]
    records = []
    for grants in all_award_date_grants.values():
        for grant in grants:
            award_date = grant.get("award_notice_date", "")
            grant_number = grant.get("project_num", "")
            # Format award_date to YYYY-MM-DD if possible.
            if award_date:
                try:
                    dt = datetime.datetime.strptime(award_date, "%Y-%m-%dT%H:%M:%SZ")
                except (TypeError, ValueError):
                    # Not in the expected timestamp format: keep the raw value in the CSV
                    # rather than dropping the grant.
                    pass
                else:
                    award_date = dt.strftime("%Y-%m-%d")
            records.append({"award_date": award_date, "grant_number": grant_number})
    # Explicit columns: an empty record list would otherwise give a CSV with no header.
    df = pd.DataFrame(records, columns=columns)
    csv_file = f"{output_filename}.csv"
    df.to_csv(csv_file, index=False)
    compressed_file = f"{csv_file}.zst"
    with open(csv_file, "rb") as f_in:
        data = f_in.read()
    cctx = zstd.ZstdCompressor(level=19)
    compressed = cctx.compress(data)
    with open(compressed_file, "wb") as f_out:
        f_out.write(compressed)
    print(f"Grants list saved and compressed as {compressed_file}")


def main():
    """
    Command-line entry point: fetch ten years of grants, plot them, and export the list.

    Steps: parse the (option-less) command line, fetch monthly data from nine years ago
    through the current year up to today's month/day, print per-year totals, build the
    cumulative series, draw the count and amount plots, and write the compressed grants CSV.
    Returns early when no count data was retrieved.
    """
    description = (
        "Extract NIH RePORTER grant data (last 10 years, by day) and plot cumulative counts and award amounts (YTD) "
        "using data through today's date. A compressed list of grants is also saved."
    )
    # No options are defined; parsing still provides --help and rejects stray arguments.
    argparse.ArgumentParser(description=description).parse_args()

    # Today's date is the cutoff (RePORTER itself updates weekly – note this in the README).
    today = datetime.date.today()
    current_year = today.year
    cutoff_day = today.timetuple().tm_yday
    print(f"Using data up to {today.strftime('%b %d, %Y')}.")

    start_year = current_year - 9
    print(f"Fetching grant data from {start_year} to {current_year} for awards up to {today.month:02d}-{today.day:02d}...")
    fetched = fetch_all_grants_by_month(start_year, current_year, today)
    data_counts, data_amounts, ic_data, current_ics, validation_info, all_award_date_grants = fetched

    if not data_counts:
        print("No grant count data retrieved. Exiting.")
        return

    for year, days in sorted(data_counts.items()):
        print(f"Year {year}: {len(days)} awards processed (counts).")
    for year, entries in sorted(data_amounts.items()):
        print(f"Year {year}: {len(entries)} awards processed (amounts).")

    print(f"Found {len(current_ics)} current ICs: {', '.join(sorted(current_ics))}")

    cum_counts = create_cumulative_counts(data_counts, cutoff_day)
    cum_amounts = create_cumulative_amounts(data_amounts, cutoff_day)

    # Pastel shades for past years, spread evenly over the hue circle; red for this year.
    past_years = sorted(y for y in data_counts if y != current_year)
    colors = {
        year: get_pastel_color(position, len(past_years))
        for position, year in enumerate(past_years)
    }
    colors[current_year] = "#FF0000"

    # Settings common to both plots.
    shared_plot_options = dict(tick_interval=7, colors=colors, validation_info=validation_info)

    print("Plotting cumulative count results...")
    plot_cumulative_data(
        cum_counts, ic_data, current_ics, current_year,
        output_filename="nih_awards",
        **shared_plot_options,
    )

    print("Plotting cumulative award amount results...")
    plot_cumulative_amounts(
        cum_amounts, ic_data, current_ics, current_year,
        output_filename="nih_award_amounts",
        **shared_plot_options,
    )

    print("Saving grants list (award_date and grant_number) and compressing...")
    save_grants_list(all_award_date_grants, output_filename="nih_awards_all")


# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()