diff --git a/experiments/arxiv/compare_trigger_policies/pipeline_config.py b/experiments/arxiv/compare_trigger_policies/pipeline_config.py
index 06a16dd05..c029f5648 100644
--- a/experiments/arxiv/compare_trigger_policies/pipeline_config.py
+++ b/experiments/arxiv/compare_trigger_policies/pipeline_config.py
@@ -18,12 +18,11 @@
 )
 from modyn.config.schema.pipeline.evaluation.metrics import AccuracyMetricConfig, F1ScoreMetricConfig
 
-
 arxiv_bytes_parser_function = (
     "import torch\n"
     "import numpy as np\n"
-    "def bytes_parser_function(data: bytes) -> str:\n"
-    "    return str(data, 'utf8')"
+    "def bytes_parser_function(data: bytes) -> str:\n"
+    "    return str(data, 'utf8')"
 )
 arxiv_evaluation_transformer_function = (
     "import torch\n"
@@ -31,6 +30,7 @@
     "    return torch.argmax(model_output, dim=-1)\n"
 )
 
+
 def gen_pipeline_config(
     config_ref: str,
     trigger_config: TriggerConfig,
@@ -40,7 +40,9 @@ def gen_pipeline_config(
 ) -> ModynPipelineConfig:
     num_classes = 172
     return ModynPipelineConfig(
-        pipeline=Pipeline(name=config_ref, description="Arxiv pipeline for comparing trigger policies", version="0.0.1"),
+        pipeline=Pipeline(
+            name=config_ref, description="Arxiv pipeline for comparing trigger policies", version="0.0.1"
+        ),
         model=ModelConfig(id="ArticleNet", config={"num_classes": num_classes}),
         model_storage=PipelineModelStorageConfig(full_model_strategy=FullModelStrategy(name="PyTorchFullModel")),
         training=TrainingConfig(
@@ -87,7 +89,9 @@ def gen_pipeline_config(
                     dataloader_workers=1,
                     tokenizer="DistilBertTokenizerTransform",
                     metrics=[
-                        AccuracyMetricConfig(evaluation_transformer_function=arxiv_evaluation_transformer_function, topn=1),
+                        AccuracyMetricConfig(
+                            evaluation_transformer_function=arxiv_evaluation_transformer_function, topn=1
+                        ),
                         AccuracyMetricConfig(evaluation_transformer_function="", topn=2),
                         AccuracyMetricConfig(evaluation_transformer_function="", topn=5),
                         AccuracyMetricConfig(evaluation_transformer_function="", topn=10),
diff --git a/experiments/arxiv/compare_trigger_policies/run.py b/experiments/arxiv/compare_trigger_policies/run.py
index 61ea0a3dd..5df4797f2 100644
--- a/experiments/arxiv/compare_trigger_policies/run.py
+++ b/experiments/arxiv/compare_trigger_policies/run.py
@@ -3,8 +3,8 @@
 import pandas as pd
 
 from experiments.arxiv.compare_trigger_policies.pipeline_config import gen_pipeline_config
-from experiments.utils.models import Experiment
 from experiments.utils.experiment_runner import run_multiple_pipelines
+from experiments.utils.models import Experiment
 from modyn.config.schema.pipeline import (
     EvalHandlerConfig,
     ModynPipelineConfig,
@@ -27,9 +27,8 @@
 _FIRST_TIMESTAMP = int(pd.to_datetime("1995-01-01").timestamp())
 _LAST_TIMESTAMP = int(pd.to_datetime("2024-07-01").timestamp())
 
-def construct_slicing_eval_handler(
-    execution_time: EvalHandlerExecutionTime = "manual"
-) -> EvalHandlerConfig:
+
+def construct_slicing_eval_handler(execution_time: EvalHandlerExecutionTime = "manual") -> EvalHandlerConfig:
     return EvalHandlerConfig(
         name="slidingmatrix",
         execution_time=execution_time,
@@ -95,9 +94,10 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
         )
     ]
 
+
 PERIODIC_EVAL_INTERVAL = [("current", "13w")]  # total: 1/2y
 
-# pretrain/cold start can be chosen post fuction by just dropping evaluation info before a certain date
+# pretrain/cold start can be chosen post function by just dropping evaluation info before a certain date
 _EXPERIMENT_REFS: dict[int, Experiment] = {
     # -------------------------------------------------------------------------------- #
     #         1X: Baselines with PERIODIC_EVAL_INTERVAL, executed with cautious         #
@@ -109,8 +109,8 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
     10: Experiment(
         name="arxiv-baseline-time",
         eval_handlers=(
-            construct_periodic_eval_handlers(intervals=PERIODIC_EVAL_INTERVAL, execution_time="manual") +
-            construct_between_trigger_eval_handler("manual")
+            construct_periodic_eval_handlers(intervals=PERIODIC_EVAL_INTERVAL, execution_time="manual")
+            + construct_between_trigger_eval_handler("manual")
         ),
         time_triggers={
             schedule: TimeTriggerConfig(every=schedule, start_timestamp=_FIRST_TIMESTAMP)
@@ -122,8 +122,8 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
     11: Experiment(
         name="arxiv-baseline-dataamount",
         eval_handlers=(
-            construct_periodic_eval_handlers(intervals=PERIODIC_EVAL_INTERVAL, execution_time="manual") +
-            construct_between_trigger_eval_handler("manual")
+            construct_periodic_eval_handlers(intervals=PERIODIC_EVAL_INTERVAL, execution_time="manual")
+            + construct_between_trigger_eval_handler("manual")
         ),
         data_amount_triggers={
             f"{num_samples}": DataAmountTriggerConfig(num_samples=num_samples)
diff --git a/experiments/huffpost/compare_trigger_policies/pipeline_config.py b/experiments/huffpost/compare_trigger_policies/pipeline_config.py
index a6a9995c8..c1d09694d 100644
--- a/experiments/huffpost/compare_trigger_policies/pipeline_config.py
+++ b/experiments/huffpost/compare_trigger_policies/pipeline_config.py
@@ -21,8 +21,8 @@
 hp_bytes_parser_function = (
     "import torch\n"
     "import numpy as np\n"
-    "def bytes_parser_function(data: bytes) -> str:\n"
-    "    return str(data, 'utf8')"
+    "def bytes_parser_function(data: bytes) -> str:\n"
+    "    return str(data, 'utf8')"
 )
 hp_evaluation_transformer_function = (
     "import torch\n"
@@ -40,7 +40,9 @@ def gen_pipeline_config(
 ) -> ModynPipelineConfig:
     num_classes = 42
     return ModynPipelineConfig(
-        pipeline=Pipeline(name=config_ref, description="Huffpost pipeline for comparing trigger policies", version="0.0.1"),
+        pipeline=Pipeline(
+            name=config_ref, description="Huffpost pipeline for comparing trigger policies", version="0.0.1"
+        ),
         model=ModelConfig(id="ArticleNet", config={"num_classes": num_classes}),
         model_storage=PipelineModelStorageConfig(full_model_strategy=FullModelStrategy(name="PyTorchFullModel")),
         training=TrainingConfig(
@@ -61,7 +63,9 @@ def gen_pipeline_config(
                     name="default",
                     algorithm="AdamW",
                     source="PyTorch",
-                    param_groups=[OptimizerParamGroup(module="model", config={"lr": 0.00002, "weight_decay": 0.01})],
+                    param_groups=[
+                        OptimizerParamGroup(module="model", config={"lr": 0.00002, "weight_decay": 0.01})
+                    ],
                 )
             ],
         )
@@ -94,7 +98,9 @@ def gen_pipeline_config(
                     dataloader_workers=1,
                     tokenizer="DistilBertTokenizerTransform",
                     metrics=[
-                        AccuracyMetricConfig(evaluation_transformer_function=hp_evaluation_transformer_function, topn=1),
+                        AccuracyMetricConfig(
+                            evaluation_transformer_function=hp_evaluation_transformer_function, topn=1
+                        ),
                         AccuracyMetricConfig(evaluation_transformer_function="", topn=2),
                         AccuracyMetricConfig(evaluation_transformer_function="", topn=5),
                         AccuracyMetricConfig(evaluation_transformer_function="", topn=10),
diff --git a/experiments/huffpost/compare_trigger_policies/run.py b/experiments/huffpost/compare_trigger_policies/run.py
index 7ddfb9244..41b240b07 100644
--- a/experiments/huffpost/compare_trigger_policies/run.py
+++ b/experiments/huffpost/compare_trigger_policies/run.py
@@ -2,8 +2,8 @@
 
 import pandas as pd
 
-from experiments.utils.models import Experiment
 from experiments.utils.experiment_runner import run_multiple_pipelines
+from experiments.utils.models import Experiment
 from modyn.config.schema.pipeline import (
     EvalHandlerConfig,
     ModynPipelineConfig,
@@ -21,15 +21,14 @@
 from modyn.config.schema.pipeline.trigger.simple.data_amount import DataAmountTriggerConfig
 from modyn.config.schema.pipeline.trigger.simple.time import TimeTriggerConfig
 from modynclient.config.schema.client_config import ModynClientConfig, Supervisor
+
 from .pipeline_config import gen_pipeline_config
 
 _FIRST_TIMESTAMP = int(pd.to_datetime("2012-01-28").timestamp())
 _LAST_TIMESTAMP = int(pd.to_datetime("2022-09-24").timestamp())  # last: dummy
 
 
-def construct_slicing_eval_handler(
-    execution_time: EvalHandlerExecutionTime = "manual"
-) -> EvalHandlerConfig:
+def construct_slicing_eval_handler(execution_time: EvalHandlerExecutionTime = "manual") -> EvalHandlerConfig:
     return EvalHandlerConfig(
         name="slidingmatrix",
         execution_time=execution_time,
@@ -66,6 +65,7 @@ def construct_periodic_eval_handlers(
         for (interval, fake_interval) in intervals
     ]
 
+
 def construct_between_trigger_eval_handler(execution_time: EvalHandlerExecutionTime = "manual") -> EvalHandlerConfig:
     return EvalHandlerConfig(
         name="full",
@@ -75,6 +75,7 @@ def construct_between_trigger_eval_handler(execution_time: EvalHandlerExecutionT
         datasets=["huffpost_kaggle_all"],  # train and test
     )
 
+
 def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
     return [
         gen_pipeline_config(
@@ -93,6 +94,7 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
         )
     ]
 
+
 # total: 14weeks -> ~4mths (with quarterly evaluations the intervals slightly overlap by 1 month)
 PERIODIC_EVAL_INTERVAL = [("current", "7w")]
 
@@ -107,8 +109,8 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
     10: Experiment(
         name="hp-baseline-time",
         eval_handlers=(
-            construct_periodic_eval_handlers(intervals=PERIODIC_EVAL_INTERVAL, execution_time="manual") +
-            construct_between_trigger_eval_handler("manual")
+            construct_periodic_eval_handlers(intervals=PERIODIC_EVAL_INTERVAL, execution_time="manual")
+            + construct_between_trigger_eval_handler("manual")
         ),
         time_triggers={
             schedule: TimeTriggerConfig(every=schedule, start_timestamp=_FIRST_TIMESTAMP)
@@ -120,8 +122,8 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
     11: Experiment(
         name="hp-baseline-dataamount",
         eval_handlers=(
-            construct_periodic_eval_handlers(intervals=PERIODIC_EVAL_INTERVAL, execution_time="manual") +
-            construct_between_trigger_eval_handler("manual")
+            construct_periodic_eval_handlers(intervals=PERIODIC_EVAL_INTERVAL, execution_time="manual")
+            + construct_between_trigger_eval_handler("manual")
         ),
         data_amount_triggers={
             f"{num_samples}": DataAmountTriggerConfig(num_samples=num_samples)
diff --git a/experiments/yearbook/compare_trigger_policies/pipeline_config.py b/experiments/yearbook/compare_trigger_policies/pipeline_config.py
index 80827debc..da5a0337d 100644
--- a/experiments/yearbook/compare_trigger_policies/pipeline_config.py
+++ b/experiments/yearbook/compare_trigger_policies/pipeline_config.py
@@ -16,7 +16,11 @@
 )
 from modyn.config.schema.pipeline.evaluation.config import EvalDataConfig
 from modyn.config.schema.pipeline.evaluation.handler import EvalHandlerConfig
-from modyn.config.schema.pipeline.evaluation.metrics import AccuracyMetricConfig, F1ScoreMetricConfig, RocAucMetricConfig
+from modyn.config.schema.pipeline.evaluation.metrics import (
+    AccuracyMetricConfig,
+    F1ScoreMetricConfig,
+    RocAucMetricConfig,
+)
 from modyn.config.schema.pipeline.model_storage import FullModelStrategy
 from modyn.config.schema.pipeline.sampling.config import NewDataStrategyConfig
@@ -113,9 +117,7 @@ def gen_pipeline_config(
                             num_classes=2,
                             average="micro",
                         ),
-                        RocAucMetricConfig(
-                            evaluation_transformer_function=yb_evaluation_transformer_function_rocauc
-                        )
+                        RocAucMetricConfig(evaluation_transformer_function=yb_evaluation_transformer_function_rocauc),
                     ],
                 )
             for yb_dataset_name in ["yearbook_all", "yearbook_train", "yearbook_test"]
diff --git a/experiments/yearbook/compare_trigger_policies/run.py b/experiments/yearbook/compare_trigger_policies/run.py
index 7581f38f1..a17380205 100644
--- a/experiments/yearbook/compare_trigger_policies/run.py
+++ b/experiments/yearbook/compare_trigger_policies/run.py
@@ -1,12 +1,11 @@
 import os
 
-from experiments.utils.models import Experiment
 from experiments.utils.experiment_runner import run_multiple_pipelines
+from experiments.utils.models import Experiment
 from experiments.yearbook.compare_trigger_policies.pipeline_config import (
     gen_pipeline_config,
 )
 from modyn.config.schema.pipeline import (
-    DataAmountTriggerConfig,
     ModynPipelineConfig,
     TimeTriggerConfig,
 )
@@ -31,9 +30,7 @@
     AlibiDetectMmdDriftMetric,
 )
 from modyn.config.schema.pipeline.trigger.drift.criterion import (
-    DynamicPercentileThresholdCriterion,
     DynamicRollingAverageThresholdCriterion,
-    ThresholdDecisionCriterion,
 )
 from modyn.config.schema.pipeline.trigger.drift.detection_window.time_ import (
     TimeWindowingStrategy,
@@ -46,13 +43,11 @@
     DynamicPercentilePerformanceThresholdCriterion,
     StaticNumberAvoidableMisclassificationCriterion,
     StaticPerformanceThresholdCriterion,
-    _DynamicPerformanceThresholdCriterion,
 )
 from modyn.config.schema.pipeline.trigger.performance.performance import (
     PerformanceTriggerConfig,
     PerformanceTriggerEvaluationConfig,
 )
-from modyn.supervisor.internal.eval.strategies import periodic
 from modyn.utils.utils import SECONDS_PER_UNIT
 from modynclient.config.schema.client_config import ModynClientConfig, Supervisor
@@ -105,15 +100,17 @@ def construct_periodic_eval_handlers(
     ]
 
 
-def construct_between_trigger_eval_handler(execution_time: EvalHandlerExecutionTime = "manual") -> list[EvalHandlerConfig]:
+def construct_between_trigger_eval_handler(
+    execution_time: EvalHandlerExecutionTime = "manual",
+) -> list[EvalHandlerConfig]:
     return [
         EvalHandlerConfig(
-        name="full",
-        execution_time=execution_time,
-        models="active",
-        strategy=BetweenTwoTriggersEvalStrategyConfig(),
-        datasets=["yearbook_all"],  # train and test
-    )
+            name="full",
+            execution_time=execution_time,
+            models="active",
+            strategy=BetweenTwoTriggersEvalStrategyConfig(),
+            datasets=["yearbook_all"],  # train and test
+        )
     ]
 
 
@@ -147,9 +144,7 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
     ("delta+-15y", f"{15*24+3}h"),
 ]
 
-BEST_PERIODIC_EVAL_INTERVAL = [
-    ("delta+-1y", f"{1*24+3}h")  # total: 3 years
-]
+BEST_PERIODIC_EVAL_INTERVAL = [("delta+-1y", f"{1*24+3}h")]  # total: 3 years
 
 _EXPERIMENT_REFS = {
     # 0: Experiment(
@@ -264,12 +259,10 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
     #     # 1: 0.4, 0.03, 0.09
     #     # 2: 0.2, 0.05, 0.07
     #     # 3: 0.15, 0.12
-    #     # rerun failed
     #     # for threshold, detection_interval, window_size in [
     #     #     # (0.03, 250, "10d"),
     #     # ]
-
     #     for criterion_name, criterion in {
     #         f"mmd-{threshold}": ThresholdDecisionCriterion(threshold=threshold)
     #     }.items()
@@ -280,15 +273,17 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
     21: Experiment(
         name="yb-datadrift-dynamic",
         eval_handlers=(
-            construct_periodic_eval_handlers(intervals=BEST_PERIODIC_EVAL_INTERVAL, execution_time="manual") +
-            construct_between_trigger_eval_handler("manual")
+            construct_periodic_eval_handlers(intervals=BEST_PERIODIC_EVAL_INTERVAL, execution_time="manual")
+            + construct_between_trigger_eval_handler("manual")
         ),
         drift_detection_triggers={
             f"{criterion_name}_int{detection_interval}_win{window_size}": DataDriftTriggerConfig(
                 evaluation_interval_data_points=detection_interval,
                 windowing_strategy=TimeWindowingStrategy(
                     # overlap has no affect acc. to offline exploration
-                    limit_ref=window_size, limit_cur=window_size, allow_overlap=False
+                    limit_ref=window_size,
+                    limit_cur=window_size,
+                    allow_overlap=False,
                 ),
                 # with 30k samples and 84 years, 10y are roughly 30000/84*10=3500 samples
                 # hence, if we want ~10 years of warmup, to 3500/detection_interval warmup intervals
@@ -297,11 +292,7 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
                 warmup_policy=TimeTriggerConfig(every="3d", start_timestamp=_FIRST_TIMESTAMP),
                 # 5k samples are enough for drift detection, in yearbook we won't accumulate that many anyway
                 sample_size=5_000,
-                metrics={
-                    "mmd": AlibiDetectMmdDriftMetric(
-                        decision_criterion=criterion, device="gpu"
-                    )
-                },
+                metrics={"mmd": AlibiDetectMmdDriftMetric(decision_criterion=criterion, device="gpu")},
             )
             # multiprocessing across gpus
             # TODO: 0: 100
@@ -309,7 +300,7 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
             # TODO: 2: 500
             for detection_interval in [100]  # 100, 250, 500
             for window_size in ["4d"]  # dataset specific, best acc. to offline exploraion and static drift experiments
-            for decision_window_size in [30]  # 10, 20,
+            for decision_window_size in [30]  # 10, 20,
             for criterion_name, criterion in (
                 # {
                 #     f"mmd-perc-{percentile}-{decision_window_size}": DynamicPercentileThresholdCriterion(
@@ -317,7 +308,7 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
                 #     )
                 #     for percentile in [0.05, 0.15, 0.3]
                 # }
-                # |
+                # |
                 {
                     f"mmd-rollavg-{deviation}-{decision_window_size}": DynamicRollingAverageThresholdCriterion(
                         window_size=decision_window_size, deviation=deviation, absolute=False
@@ -334,8 +325,8 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
     30: Experiment(
         name="yb-performancetrigger",
         eval_handlers=(
-            construct_periodic_eval_handlers(intervals=BEST_PERIODIC_EVAL_INTERVAL, execution_time="manual") +
-            construct_between_trigger_eval_handler("manual")
+            construct_periodic_eval_handlers(intervals=BEST_PERIODIC_EVAL_INTERVAL, execution_time="manual")
+            + construct_between_trigger_eval_handler("manual")
         ),
         performance_triggers={
             f"{criterion_name}-int{detection_interval}y": PerformanceTriggerConfig(
@@ -397,8 +388,8 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
     40: Experiment(
         name="yb-costtrigger-dataincorporation",
         eval_handlers=(
-            construct_periodic_eval_handlers(intervals=BEST_PERIODIC_EVAL_INTERVAL, execution_time="manual") +
-            construct_between_trigger_eval_handler("manual")
+            construct_periodic_eval_handlers(intervals=BEST_PERIODIC_EVAL_INTERVAL, execution_time="manual")
+            + construct_between_trigger_eval_handler("manual")
         ),
         cost_triggers={
             f"int{interval}_exch{exchange_rate}": DataIncorporationLatencyCostTriggerConfig(
@@ -415,8 +406,8 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
     41: Experiment(
         name="yb-costtrigger-avoidablemisclassification",
         eval_handlers=(
-            construct_periodic_eval_handlers(intervals=BEST_PERIODIC_EVAL_INTERVAL, execution_time="manual") +
-            construct_between_trigger_eval_handler("manual")
+            construct_periodic_eval_handlers(intervals=BEST_PERIODIC_EVAL_INTERVAL, execution_time="manual")
+            + construct_between_trigger_eval_handler("manual")
         ),
         cost_triggers={
             f"int{interval}_exch{exchange_rate}_red{allow_reduction}": AvoidableMisclassificationCostTriggerConfig(
@@ -458,8 +449,8 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
     51: Experiment(
         name="yb-ensemble",
         eval_handlers=(
-            construct_periodic_eval_handlers(intervals=BEST_PERIODIC_EVAL_INTERVAL, execution_time="manual") +
-            construct_between_trigger_eval_handler("manual")
+            construct_periodic_eval_handlers(intervals=BEST_PERIODIC_EVAL_INTERVAL, execution_time="manual")
+            + construct_between_trigger_eval_handler("manual")
         ),
         ensemble_triggers={
             "ensemble1": EnsembleTriggerConfig(