Commit

Merge pull request #68 from ihmeuw-msca/refactor/globals

Refactor some of the core functions

saalUW authored Aug 26, 2024
2 parents d4dc800 + 26aaed6 commit 5bfb720
Showing 18 changed files with 350 additions and 319 deletions.
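Most of the diff below is a rename of the model classes (LMO_model to LMOModel, LogOdds_model to LogOddsModel) together with a move from typing.Optional/Union annotations to the `X | None` syntax. In sketch form, assuming nothing beyond what the hunks themselves show:

```python
# Before this commit
from pydisagg.models import LMO_model, LogOdds_model

# After this commit
from pydisagg.models import LMOModel, LogOddsModel
```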
4 changes: 2 additions & 2 deletions README.md
@@ -29,8 +29,8 @@ $$D_i = \hat f_i \cdot p_i $$
For the current models in use, T is just a logarithm, and this assumes that each rate is some constant muliplied by the overall rate pattern level. Allowing a more general transformation T, such as a log-odds transformation, assumes multiplicativity in the associated odds, rather than the rate, and can produce better estimates statistically (potentially being a more realistic assumption in some cases) and practically, restricting the estimated rates to lie within a reasonable interval.

## Current Package Capabilities and Models
-Currently, the multiplicative-in-rate model RateMultiplicativeModel with $T(x)=\log(x)$ and the Log Modified Odds model LMO_model(m) with $T(x)=\log(\frac{x}{1-x^{m}})$ are implemented. Note that the LMO_model with m=1 gives a multiplicative in odds model.
+Currently, the multiplicative-in-rate model RateMultiplicativeModel with $T(x)=\log(x)$ and the Log Modified Odds model LMOModel(m) with $T(x)=\log(\frac{x}{1-x^{m}})$ are implemented. Note that the LMOModel with m=1 gives a multiplicative in odds model.

A useful (but slightly wrong) analogy is that the multiplicative-in-rate is to the multiplicative-in-odds model as ordinary least squares is to logistic regression in terms of the relationship between covariates and output (not in terms of anything like the likelihood)

-Increasing m in the model LMO_model(m) gives results that are more similar to the multiplicative-in-rate model currently in use, while preserving the property that rate estimates are bounded by 1.
+Increasing m in the model LMOModel(m) gives results that are more similar to the multiplicative-in-rate model currently in use, while preserving the property that rate estimates are bounded by 1.
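For reference, the two transformations named in this README excerpt can be written out directly. This is a standalone sketch of the math only (not code from the package), showing that LMOModel with m=1 reduces to the log-odds transformation:

```python
import numpy as np

def t_rate(x):
    # RateMultiplicativeModel: T(x) = log(x)
    return np.log(x)

def t_lmo(x, m):
    # LMOModel(m): T(x) = log(x / (1 - x**m))
    return np.log(x / (1 - x**m))

x = 0.2
print(t_rate(x))            # log(0.2) ≈ -1.609
print(t_lmo(x, m=1))        # log(0.2 / 0.8) ≈ -1.386
print(np.log(x / (1 - x)))  # the log-odds of 0.2, identical to the line above
```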
12 changes: 6 additions & 6 deletions examples/Basic Splitting Examples.ipynb
@@ -9,8 +9,8 @@
"from pydisagg.disaggregate import split_datapoint\n",
"import numpy as np\n",
"from pydisagg.models import RateMultiplicativeModel\n",
"from pydisagg.models import LMO_model\n",
"from pydisagg.models import LogOdds_model\n",
"from pydisagg.models import LMOModel\n",
"from pydisagg.models import LogOddsModel\n",
"import pandas as pd"
]
},
@@ -40,7 +40,7 @@
"outputs": [],
"source": [
"rmm=RateMultiplicativeModel()\n",
"oddm=LogOdds_model()"
"oddm=LogOddsModel()"
]
},
{
@@ -163,7 +163,7 @@
" populations,\n",
" rate_pattern,\n",
" observed_total_SE,\n",
" model=LogOdds_model()\n",
" model=LogOddsModel()\n",
")\n",
"print(\"Estimated incidence in each group\")\n",
"print(estimate)\n",
@@ -208,7 +208,7 @@
" observed_total,\n",
" populations,\n",
" pattern,\n",
" model=LogOdds_model()\n",
" model=LogOddsModel()\n",
"))"
]
},
@@ -295,7 +295,7 @@
"metadata": {},
"outputs": [],
"source": [
"age_splitting_model=LMO_model(1)\n",
"age_splitting_model=LMOModel(1)\n",
"sex_splitting_model=RateMultiplicativeModel()\n",
"\n",
"age_groups=np.array([0,1,2,3])\n",
4 changes: 2 additions & 2 deletions examples/Dataframe Splitting Demo.ipynb
@@ -12,8 +12,8 @@
"\n",
"import numpy as np\n",
"from pydisagg.models import RateMultiplicativeModel\n",
"from pydisagg.models import LMO_model\n",
"from pydisagg.models import LogOdds_model\n",
"from pydisagg.models import LMOModel\n",
"from pydisagg.models import LogOddsModel\n",
"import pandas as pd\n",
"from example_df_dataset import data_df,population_sizes,baseline_patterns,groups_to_split_into"
]
6 changes: 3 additions & 3 deletions examples/test_notebook.ipynb
@@ -9,8 +9,8 @@
"from pydisagg.disaggregate import split_datapoint\n",
"import numpy as np\n",
"from pydisagg.models import RateMultiplicativeModel\n",
"from pydisagg.models import LMO_model\n",
"from pydisagg.models import LogOdds_model\n",
"from pydisagg.models import LMOModel\n",
"from pydisagg.models import LogOddsModel\n",
"import pandas as pd\n",
"from tqdm.auto import tqdm"
]
@@ -35,7 +35,7 @@
"metadata": {},
"outputs": [],
"source": [
"oddm=LogOdds_model()"
"oddm=LogOddsModel()"
]
},
{
34 changes: 0 additions & 34 deletions src/pydisagg/ParameterTransformation.py

This file was deleted.

127 changes: 67 additions & 60 deletions src/pydisagg/disaggregate.py
@@ -1,62 +1,62 @@
"""Module containing high level api for splitting"""

from typing import Literal, Optional, Union
from typing import Literal

import numpy as np
import pandas as pd
from numpy.typing import NDArray
from pandas import DataFrame

from pydisagg.DisaggModel import DisaggModel
from pydisagg.models import LogOdds_model
from pydisagg.models import DisaggModel, LogOddsModel
from pydisagg.typing import DataFrame, NDArray


def split_datapoint(
observed_total: float,
bucket_populations: NDArray,
rate_pattern: NDArray,
observed_total_se: Optional[float] = None,
model: Optional[DisaggModel] = LogOdds_model(),
observed_total_se: float | None = None,
model: DisaggModel = LogOddsModel(),
output_type: Literal["count", "rate"] = "count",
normalize_pop_for_average_type_obs: bool = False,
pattern_covariance: Optional[NDArray] = None,
) -> Union[tuple, NDArray]:
pattern_covariance: NDArray | None = None,
) -> tuple | NDArray:
"""Disaggregate a datapoint using the model given as input.
Defaults to assuming multiplicativity in the odds ratio
If output_type=='total', then this outputs estimates for the observed amount in each group
such that the sum of the point estimates equals the original total
If output_type=='rate', then this estimates rates for each group
(and doesn't multiply the rates out by the population)
* If output_type=='total', then this outputs estimates for the observed
amount in each group such that the sum of the point estimates equals the
original total
* If output_type=='rate', then this estimates rates for each group
(and doesn't multiply the rates out by the population)
Parameters
----------
observed_total : float
observed_total
aggregated observed_total across all buckets, value to be split
bucket_populations : NDArray
bucket_populations
population size in each bucket
rate_pattern : NDArray
rate_pattern
Rate Pattern to use, should be an estimate of the rates in each bucket
that we want to rescale
observed_total_se : Optional[float], optional
observed_total_se
standard error of observed_total, by default None
output_type: Literal['total','rate'], optional
output_type
One of 'total' or 'rate'
Type of splitting to perform, whether to disaggregate and return the estimated total
in each group, or estimate the rate per population unit.
model : Optional[DisaggModel], optional
DisaggModel to use, by default LMO_model(1)
normalize_pop_for_average_type_obs: bool = True
Whether or not to normalize populations to sum to 1, this is appropriate when the output_type is rate
and when the aggregated observation is an average--whether an aggregated rate
or a mean of a continuous measure over different groups
pattern_covariance: Optional[NDArray], optional
Type of splitting to perform, whether to disaggregate and return the
estimated total in each group, or estimate the rate per population unit.
model
DisaggModel to use, by default LMOModel(1)
normalize_pop_for_average_type_obs
Whether or not to normalize populations to sum to 1, this is appropriate
when the output_type is rate and when the aggregated observation is an
average--whether an aggregated rate or a mean of a continuous measure
over different groups
pattern_covariance
2d Numpy array with covariance matrix of pattern.
Returns
-------
Union[Tuple,NDArray]
tuple | NDArray
If standard errors are available, this will return the tuple
(
estimate_in_each_bucket,
@@ -70,6 +70,7 @@ def split_datapoint(
If no observed_total_se is given, returns point estimates
If observed_total_se is given, then returns a tuple
(point_estimate,standard_error)
"""
if output_type not in ["count", "rate"]:
raise ValueError("output_type must be one of either 'total' or 'rate'")
@@ -158,58 +159,63 @@ def split_dataframe(
observation_group_membership_df: DataFrame,
population_sizes: DataFrame,
rate_patterns: DataFrame,
use_se: Optional[bool] = False,
model: Optional[DisaggModel] = LogOdds_model(),
use_se: bool = False,
model: DisaggModel = LogOddsModel(),
output_type: Literal["count", "rate"] = "count",
demographic_id_columns: Optional[list] = None,
demographic_id_columns: list | None = None,
normalize_pop_for_average_type_obs: bool = False,
) -> DataFrame:
"""Disaggregate datapoints and pivots observations into estimates for each group per demographic id
"""Disaggregate datapoints and pivots observations into estimates for each
group per demographic id
If output_type=='total', then this outputs estimates for the observed amount in each group
such that the sum of the point estimates equals the original total
If output_type=='rate', then this estimates rates for each group
(and doesn't multiply the rates out by the population)
* If output_type=='total', then this outputs estimates for the observed
amount in each group such that the sum of the point estimates equals the
original total
* If output_type=='rate', then this estimates rates for each group
(and doesn't multiply the rates out by the population)
Parameters
----------
groups_to_split_into : list
groups_to_split_into
list of groups to disaggregate observations into
observation_group_membership_df : DataFrame
observation_group_membership_df
Dataframe with columns demographic_id, pattern_id, obs,
and columns for each of the groups_to_split_into
with dummy variables that represent whether or not
each group is included in the observations for that row.
This also optionally contains a obs_se column which will be used if use_se is True
demographic_id represents the population that the observation comes from
pattern_id gives the baseline that should be used for splitting
population_sizes : DataFrame
This also optionally contains a obs_se column which will be used if
use_se is True. demographic_id represents the population that the
observation comes from pattern_id gives the baseline that should be used
for splitting
population_sizes
Dataframe with demographic_id as the index containing the
size of each group within each population (given the demographic_id)
INDEX FOR THIS DATAFRAME MUST BE DEMOGRAPHIC ID(PANDAS MULTIINDEX OK)
rate_patterns : DataFrame
rate_patterns
dataframe with pattern_id as the index, and columns
for each of the groups_to_split where the entries represent the rate pattern
in the given group to use for pydisagg.
use_se : Optional[bool], optional
for each of the groups_to_split where the entries represent the rate
pattern in the given group to use for pydisagg.
use_se
whether or not to report standard errors along with estimates
if set to True, then observation_group_membership_df must have an obs_se column
, by default False
model : Optional[DisaggModel], optional
DisaggModel to use for splitting, by default LogOdds_model()
output_type: Literal['total','rate'], optional
if set to True, then observation_group_membership_df must have an obs_se
column, by default False
model
DisaggModel to use for splitting, by default LogOddsModel()
output_type
One of 'total' or 'rate'
Type of splitting to perform, whether to disaggregate and return the estimated total
in each group, or estimate the rate per population unit.
demographic_id_columns : Optional[list]
Type of splitting to perform, whether to disaggregate and return the
estimated total in each group, or estimate the rate per population unit.
demographic_id_columns
Columns to use as demographic_id
Defaults to None. If None is given, then we assume
that there is a already a demographic id column that matches the index in population_sizes.
Otherwise, we create a new demographic_id column, zipping the columns chosen into tuples
normalize_pop_for_average_type_obs: bool = True
Whether or not to normalize populations to sum to 1, this is appropriate when the output_type is rate
and when the aggregated observation is an average--whether an aggregated rate
or a mean of a continuous measure over different groups
that there is a already a demographic id column that matches the index
in population_sizes. Otherwise, we create a new demographic_id column,
zipping the columns chosen into tuples
normalize_pop_for_average_type_obs
Whether or not to normalize populations to sum to 1, this is appropriate
when the output_type is rate and when the aggregated observation is an
average--whether an aggregated rate or a mean of a continuous measure
over different groups
Returns
-------
@@ -218,6 +224,7 @@
two columns for each of the groups_to_split_into, giving the estimate
If use_se==True, then has a nested column indexing, where both the
point estimate and standard error for the estimate for each group is given.
"""
if (normalize_pop_for_average_type_obs is True) and (
output_type == "count"
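As a usage reference for the docstrings above, here is a minimal sketch of calling split_datapoint with the refactored signature. The input numbers are invented for illustration only and are not from the repository:

```python
import numpy as np
from pydisagg.disaggregate import split_datapoint
from pydisagg.models import LMOModel, RateMultiplicativeModel

observed_total = 100.0                        # aggregated value to split
bucket_populations = np.array([1000.0, 2000.0, 3000.0])
rate_pattern = np.array([0.02, 0.01, 0.005])  # assumed rate in each bucket

# Without observed_total_se: returns point estimates only.
estimate = split_datapoint(
    observed_total,
    bucket_populations,
    rate_pattern,
    model=RateMultiplicativeModel(),
)

# With observed_total_se: returns a (point_estimate, standard_error) tuple.
estimate, standard_error = split_datapoint(
    observed_total,
    bucket_populations,
    rate_pattern,
    observed_total_se=5.0,
    model=LMOModel(1),
)
```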
4 changes: 2 additions & 2 deletions src/pydisagg/ihme/splitter/age_splitter.py
@@ -17,7 +17,7 @@
validate_positive,
validate_realnumber,
)
-from pydisagg.models import LogOdds_model, RateMultiplicativeModel
+from pydisagg.models import LogOddsModel, RateMultiplicativeModel


class AgeDataConfig(BaseModel):
@@ -364,7 +364,7 @@ def split(
"""
model_mapping = {
"rate": RateMultiplicativeModel(),
"logodds": LogOdds_model(),
"logodds": LogOddsModel(),
}

if model not in model_mapping:
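The model argument of AgeSplitter.split shown in the last hunk is a plain string resolved through a small mapping. A standalone sketch of that lookup follows; the helper name resolve_model and the exact error message are hypothetical, not package code:

```python
from pydisagg.models import LogOddsModel, RateMultiplicativeModel

def resolve_model(model: str):
    # Mirrors the string-to-model mapping used inside AgeSplitter.split.
    model_mapping = {
        "rate": RateMultiplicativeModel(),
        "logodds": LogOddsModel(),
    }
    if model not in model_mapping:
        raise ValueError(f"model must be one of {list(model_mapping)}, got {model!r}")
    return model_mapping[model]

# resolve_model("logodds") -> LogOddsModel instance
# resolve_model("probit")  -> ValueError
```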