Commit

Merge pull request #68 from ihmeuw-msca/refactor/globals

Refactor some of the core functions

saalUW authored Aug 26, 2024
2 parents d4dc800 + 26aaed6 commit 5bfb720
Showing 18 changed files with 350 additions and 319 deletions.
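Most of the diff below is a rename of the model classes (LMO_model to LMOModel, LogOdds_model to LogOddsModel) together with a move from typing.Optional/Union annotations to the `X | None` syntax. In sketch form, assuming nothing beyond what the hunks themselves show:

```python
# Before this commit
from pydisagg.models import LMO_model, LogOdds_model

# After this commit
from pydisagg.models import LMOModel, LogOddsModel
```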
4 changes: 2 additions & 2 deletions README.md
@@ -29,8 +29,8 @@ $$D_i = \hat f_i \cdot p_i $$
For the current models in use, T is just a logarithm, and this assumes that each rate is some constant muliplied by the overall rate pattern level. Allowing a more general transformation T, such as a log-odds transformation, assumes multiplicativity in the associated odds, rather than the rate, and can produce better estimates statistically (potentially being a more realistic assumption in some cases) and practically, restricting the estimated rates to lie within a reasonable interval.

## Current Package Capabilities and Models
-Currently, the multiplicative-in-rate model RateMultiplicativeModel with $T(x)=\log(x)$ and the Log Modified Odds model LMO_model(m) with $T(x)=\log(\frac{x}{1-x^{m}})$ are implemented. Note that the LMO_model with m=1 gives a multiplicative in odds model.
+Currently, the multiplicative-in-rate model RateMultiplicativeModel with $T(x)=\log(x)$ and the Log Modified Odds model LMOModel(m) with $T(x)=\log(\frac{x}{1-x^{m}})$ are implemented. Note that the LMOModel with m=1 gives a multiplicative in odds model.

A useful (but slightly wrong) analogy is that the multiplicative-in-rate is to the multiplicative-in-odds model as ordinary least squares is to logistic regression in terms of the relationship between covariates and output (not in terms of anything like the likelihood)

-Increasing m in the model LMO_model(m) gives results that are more similar to the multiplicative-in-rate model currently in use, while preserving the property that rate estimates are bounded by 1.
+Increasing m in the model LMOModel(m) gives results that are more similar to the multiplicative-in-rate model currently in use, while preserving the property that rate estimates are bounded by 1.
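For reference, the two transformations named in this README excerpt can be written out directly. This is a standalone sketch of the math only (not code from the package), showing that LMOModel with m=1 reduces to the log-odds transformation:

```python
import numpy as np

def t_rate(x):
    # RateMultiplicativeModel: T(x) = log(x)
    return np.log(x)

def t_lmo(x, m):
    # LMOModel(m): T(x) = log(x / (1 - x**m))
    return np.log(x / (1 - x**m))

x = 0.2
print(t_rate(x))            # log(0.2) ≈ -1.609
print(t_lmo(x, m=1))        # log(0.2 / 0.8) ≈ -1.386
print(np.log(x / (1 - x)))  # the log-odds of 0.2, identical to the line above
```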
12 changes: 6 additions & 6 deletions examples/Basic Splitting Examples.ipynb
@@ -9,8 +9,8 @@
"from pydisagg.disaggregate import split_datapoint\n",
"import numpy as np\n",
"from pydisagg.models import RateMultiplicativeModel\n",
"from pydisagg.models import LMO_model\n",
"from pydisagg.models import LogOdds_model\n",
"from pydisagg.models import LMOModel\n",
"from pydisagg.models import LogOddsModel\n",
"import pandas as pd"
]
},
@@ -40,7 +40,7 @@
"outputs": [],
"source": [
"rmm=RateMultiplicativeModel()\n",
"oddm=LogOdds_model()"
"oddm=LogOddsModel()"
]
},
{
@@ -163,7 +163,7 @@
" populations,\n",
" rate_pattern,\n",
" observed_total_SE,\n",
" model=LogOdds_model()\n",
" model=LogOddsModel()\n",
")\n",
"print(\"Estimated incidence in each group\")\n",
"print(estimate)\n",
@@ -208,7 +208,7 @@
" observed_total,\n",
" populations,\n",
" pattern,\n",
" model=LogOdds_model()\n",
" model=LogOddsModel()\n",
"))"
]
},
@@ -295,7 +295,7 @@
"metadata": {},
"outputs": [],
"source": [
"age_splitting_model=LMO_model(1)\n",
"age_splitting_model=LMOModel(1)\n",
"sex_splitting_model=RateMultiplicativeModel()\n",
"\n",
"age_groups=np.array([0,1,2,3])\n",
4 changes: 2 additions & 2 deletions examples/Dataframe Splitting Demo.ipynb
@@ -12,8 +12,8 @@
"\n",
"import numpy as np\n",
"from pydisagg.models import RateMultiplicativeModel\n",
"from pydisagg.models import LMO_model\n",
"from pydisagg.models import LogOdds_model\n",
"from pydisagg.models import LMOModel\n",
"from pydisagg.models import LogOddsModel\n",
"import pandas as pd\n",
"from example_df_dataset import data_df,population_sizes,baseline_patterns,groups_to_split_into"
]
6 changes: 3 additions & 3 deletions examples/test_notebook.ipynb
@@ -9,8 +9,8 @@
"from pydisagg.disaggregate import split_datapoint\n",
"import numpy as np\n",
"from pydisagg.models import RateMultiplicativeModel\n",
"from pydisagg.models import LMO_model\n",
"from pydisagg.models import LogOdds_model\n",
"from pydisagg.models import LMOModel\n",
"from pydisagg.models import LogOddsModel\n",
"import pandas as pd\n",
"from tqdm.auto import tqdm"
]
@@ -35,7 +35,7 @@
"metadata": {},
"outputs": [],
"source": [
"oddm=LogOdds_model()"
"oddm=LogOddsModel()"
]
},
{
34 changes: 0 additions & 34 deletions src/pydisagg/ParameterTransformation.py

This file was deleted.

127 changes: 67 additions & 60 deletions src/pydisagg/disaggregate.py
@@ -1,62 +1,62 @@
"""Module containing high level api for splitting"""

from typing import Literal, Optional, Union
from typing import Literal

import numpy as np
import pandas as pd
from numpy.typing import NDArray
from pandas import DataFrame

from pydisagg.DisaggModel import DisaggModel
from pydisagg.models import LogOdds_model
from pydisagg.models import DisaggModel, LogOddsModel
from pydisagg.typing import DataFrame, NDArray


def split_datapoint(
observed_total: float,
bucket_populations: NDArray,
rate_pattern: NDArray,
observed_total_se: Optional[float] = None,
model: Optional[DisaggModel] = LogOdds_model(),
observed_total_se: float | None = None,
model: DisaggModel = LogOddsModel(),
output_type: Literal["count", "rate"] = "count",
normalize_pop_for_average_type_obs: bool = False,
pattern_covariance: Optional[NDArray] = None,
) -> Union[tuple, NDArray]:
pattern_covariance: NDArray | None = None,
) -> tuple | NDArray:
"""Disaggregate a datapoint using the model given as input.
Defaults to assuming multiplicativity in the odds ratio
If output_type=='total', then this outputs estimates for the observed amount in each group
such that the sum of the point estimates equals the original total
If output_type=='rate', then this estimates rates for each group
(and doesn't multiply the rates out by the population)
* If output_type=='total', then this outputs estimates for the observed
amount in each group such that the sum of the point estimates equals the
original total
* If output_type=='rate', then this estimates rates for each group
(and doesn't multiply the rates out by the population)
Parameters
----------
observed_total : float
observed_total
aggregated observed_total across all buckets, value to be split
bucket_populations : NDArray
bucket_populations
population size in each bucket
rate_pattern : NDArray
rate_pattern
Rate Pattern to use, should be an estimate of the rates in each bucket
that we want to rescale
observed_total_se : Optional[float], optional
observed_total_se
standard error of observed_total, by default None
output_type: Literal['total','rate'], optional
output_type
One of 'total' or 'rate'
Type of splitting to perform, whether to disaggregate and return the estimated total
in each group, or estimate the rate per population unit.
model : Optional[DisaggModel], optional
DisaggModel to use, by default LMO_model(1)
normalize_pop_for_average_type_obs: bool = True
Whether or not to normalize populations to sum to 1, this is appropriate when the output_type is rate
and when the aggregated observation is an average--whether an aggregated rate
or a mean of a continuous measure over different groups
pattern_covariance: Optional[NDArray], optional
Type of splitting to perform, whether to disaggregate and return the
estimated total in each group, or estimate the rate per population unit.
model
DisaggModel to use, by default LMOModel(1)
normalize_pop_for_average_type_obs
Whether or not to normalize populations to sum to 1, this is appropriate
when the output_type is rate and when the aggregated observation is an
average--whether an aggregated rate or a mean of a continuous measure
over different groups
pattern_covariance
2d Numpy array with covariance matrix of pattern.
Returns
-------
Union[Tuple,NDArray]
tuple | NDArray
If standard errors are available, this will return the tuple
(
estimate_in_each_bucket,
@@ -70,6 +70,7 @@ def split_datapoint(
If no observed_total_se is given, returns point estimates
If observed_total_se is given, then returns a tuple
(point_estimate,standard_error)
"""
if output_type not in ["count", "rate"]:
raise ValueError("output_type must be one of either 'total' or 'rate'")
@@ -158,58 +159,63 @@ def split_dataframe(
observation_group_membership_df: DataFrame,
population_sizes: DataFrame,
rate_patterns: DataFrame,
use_se: Optional[bool] = False,
model: Optional[DisaggModel] = LogOdds_model(),
use_se: bool = False,
model: DisaggModel = LogOddsModel(),
output_type: Literal["count", "rate"] = "count",
demographic_id_columns: Optional[list] = None,
demographic_id_columns: list | None = None,
normalize_pop_for_average_type_obs: bool = False,
) -> DataFrame:
"""Disaggregate datapoints and pivots observations into estimates for each group per demographic id
"""Disaggregate datapoints and pivots observations into estimates for each
group per demographic id
If output_type=='total', then this outputs estimates for the observed amount in each group
such that the sum of the point estimates equals the original total
If output_type=='rate', then this estimates rates for each group
(and doesn't multiply the rates out by the population)
* If output_type=='total', then this outputs estimates for the observed
amount in each group such that the sum of the point estimates equals the
original total
* If output_type=='rate', then this estimates rates for each group
(and doesn't multiply the rates out by the population)
Parameters
----------
groups_to_split_into : list
groups_to_split_into
list of groups to disaggregate observations into
observation_group_membership_df : DataFrame
observation_group_membership_df
Dataframe with columns demographic_id, pattern_id, obs,
and columns for each of the groups_to_split_into
with dummy variables that represent whether or not
each group is included in the observations for that row.
This also optionally contains a obs_se column which will be used if use_se is True
demographic_id represents the population that the observation comes from
pattern_id gives the baseline that should be used for splitting
population_sizes : DataFrame
This also optionally contains a obs_se column which will be used if
use_se is True. demographic_id represents the population that the
observation comes from pattern_id gives the baseline that should be used
for splitting
population_sizes
Dataframe with demographic_id as the index containing the
size of each group within each population (given the demographic_id)
INDEX FOR THIS DATAFRAME MUST BE DEMOGRAPHIC ID(PANDAS MULTIINDEX OK)
rate_patterns : DataFrame
rate_patterns
dataframe with pattern_id as the index, and columns
for each of the groups_to_split where the entries represent the rate pattern
in the given group to use for pydisagg.
use_se : Optional[bool], optional
for each of the groups_to_split where the entries represent the rate
pattern in the given group to use for pydisagg.
use_se
whether or not to report standard errors along with estimates
if set to True, then observation_group_membership_df must have an obs_se column
, by default False
model : Optional[DisaggModel], optional
DisaggModel to use for splitting, by default LogOdds_model()
output_type: Literal['total','rate'], optional
if set to True, then observation_group_membership_df must have an obs_se
column, by default False
model
DisaggModel to use for splitting, by default LogOddsModel()
output_type
One of 'total' or 'rate'
Type of splitting to perform, whether to disaggregate and return the estimated total
in each group, or estimate the rate per population unit.
demographic_id_columns : Optional[list]
Type of splitting to perform, whether to disaggregate and return the
estimated total in each group, or estimate the rate per population unit.
demographic_id_columns
Columns to use as demographic_id
Defaults to None. If None is given, then we assume
that there is a already a demographic id column that matches the index in population_sizes.
Otherwise, we create a new demographic_id column, zipping the columns chosen into tuples
normalize_pop_for_average_type_obs: bool = True
Whether or not to normalize populations to sum to 1, this is appropriate when the output_type is rate
and when the aggregated observation is an average--whether an aggregated rate
or a mean of a continuous measure over different groups
that there is a already a demographic id column that matches the index
in population_sizes. Otherwise, we create a new demographic_id column,
zipping the columns chosen into tuples
normalize_pop_for_average_type_obs
Whether or not to normalize populations to sum to 1, this is appropriate
when the output_type is rate and when the aggregated observation is an
average--whether an aggregated rate or a mean of a continuous measure
over different groups
Returns
-------
@@ -218,6 +224,7 @@
two columns for each of the groups_to_split_into, giving the estimate
If use_se==True, then has a nested column indexing, where both the
point estimate and standard error for the estimate for each group is given.
"""
if (normalize_pop_for_average_type_obs is True) and (
output_type == "count"
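As a usage reference for the docstrings above, here is a minimal sketch of calling split_datapoint with the refactored signature. The input numbers are invented for illustration only and are not from the repository:

```python
import numpy as np
from pydisagg.disaggregate import split_datapoint
from pydisagg.models import LMOModel, RateMultiplicativeModel

observed_total = 100.0                        # aggregated value to split
bucket_populations = np.array([1000.0, 2000.0, 3000.0])
rate_pattern = np.array([0.02, 0.01, 0.005])  # assumed rate in each bucket

# Without observed_total_se: returns point estimates only.
estimate = split_datapoint(
    observed_total,
    bucket_populations,
    rate_pattern,
    model=RateMultiplicativeModel(),
)

# With observed_total_se: returns a (point_estimate, standard_error) tuple.
estimate, standard_error = split_datapoint(
    observed_total,
    bucket_populations,
    rate_pattern,
    observed_total_se=5.0,
    model=LMOModel(1),
)
```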
4 changes: 2 additions & 2 deletions src/pydisagg/ihme/splitter/age_splitter.py
@@ -17,7 +17,7 @@
validate_positive,
validate_realnumber,
)
-from pydisagg.models import LogOdds_model, RateMultiplicativeModel
+from pydisagg.models import LogOddsModel, RateMultiplicativeModel


class AgeDataConfig(BaseModel):
@@ -364,7 +364,7 @@ def split(
"""
model_mapping = {
"rate": RateMultiplicativeModel(),
"logodds": LogOdds_model(),
"logodds": LogOddsModel(),
}

if model not in model_mapping:
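The model argument of AgeSplitter.split shown in the last hunk is a plain string resolved through a small mapping. A standalone sketch of that lookup follows; the helper name resolve_model and the exact error message are hypothetical, not package code:

```python
from pydisagg.models import LogOddsModel, RateMultiplicativeModel

def resolve_model(model: str):
    # Mirrors the string-to-model mapping used inside AgeSplitter.split.
    model_mapping = {
        "rate": RateMultiplicativeModel(),
        "logodds": LogOddsModel(),
    }
    if model not in model_mapping:
        raise ValueError(f"model must be one of {list(model_mapping)}, got {model!r}")
    return model_mapping[model]

# resolve_model("logodds") -> LogOddsModel instance
# resolve_model("probit")  -> ValueError
```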