Skip to content

Commit

Permalink
Fix IntegrityError in DagFileProcessor.manage_slas (apache#19553)
Browse files Browse the repository at this point in the history
DagFileProcessor.manage_slas does not check whether an SlaMiss record already
exists in the DB before inserting SLA misses.

If an SLA for a task is missed and recorded, then on the next SLA check the same
task comes up again when there has been no more recent run of the task. We then
try to insert the same record into the SlaMiss table again, which results in an
IntegrityError.

This PR fixes that by skipping the insert if the record already exists.

Co-authored-by: Tzu-ping Chung <uranusjr@gmail.com>
Co-authored-by: Kaxil Naik <kaxilnaik@apache.org>
  • Loading branch information
3 people authored Nov 13, 2021
1 parent 1c2dfde commit 9519bf6
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 0 deletions.
11 changes: 11 additions & 0 deletions airflow/dag_processing/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,12 @@ def manage_slas(self, dag: DAG, session: Session = None) -> None:
.group_by(TI.task_id)
.subquery('sq')
)
# get recorded SlaMiss
recorded_slas_query = set(
session.query(SlaMiss.dag_id, SlaMiss.task_id, SlaMiss.execution_date).filter(
SlaMiss.dag_id == dag.dag_id, SlaMiss.task_id.in_(dag.task_ids)
)
)

max_tis: Iterator[TI] = (
session.query(TI)
Expand All @@ -401,6 +407,7 @@ def manage_slas(self, dag: DAG, session: Session = None) -> None:
)

ts = timezone.utcnow()

for ti in max_tis:
task = dag.get_task(ti.task_id)
if not task.sla:
Expand All @@ -419,9 +426,13 @@ def manage_slas(self, dag: DAG, session: Session = None) -> None:
else:
while next_info.logical_date < ts:
next_info = dag.next_dagrun_info(next_info.data_interval, restricted=False)

if next_info is None:
break
if (ti.dag_id, ti.task_id, next_info.logical_date) in recorded_slas_query:
break
if next_info.logical_date + task.sla < ts:

sla_miss = SlaMiss(
task_id=ti.task_id,
dag_id=ti.dag_id,
Expand Down
39 changes: 39 additions & 0 deletions tests/dag_processing/test_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,45 @@ def test_dag_file_processor_sla_miss_callback_sent_notification(self, create_dum

sla_callback.assert_not_called()

def test_dag_file_processor_sla_miss_doesnot_raise_integrity_error(self, dag_maker):
    """
    Test that the dag file processor does not try to insert an already existing
    SlaMiss record into the database.

    ``manage_slas`` is invoked twice for the same DAG. The second invocation must
    neither raise nor leave a second SlaMiss row behind for the same task.
    """
    session = settings.Session()

    # Create dag with a start of 2 days ago, but an sla of 1 day
    # ago so we'll already have an sla_miss on the books
    test_start_date = days_ago(2)
    with dag_maker(
        dag_id='test_sla_miss',
        default_args={'start_date': test_start_date, 'sla': datetime.timedelta(days=1)},
    ) as dag:
        task = DummyOperator(task_id='dummy')

    dag_maker.create_dagrun(execution_date=test_start_date, state=State.SUCCESS)

    # Create a TaskInstance for two days ago
    ti = TaskInstance(task=task, execution_date=test_start_date, state='success')
    session.merge(ti)
    session.flush()

    def count_sla_misses():
        # Number of SlaMiss rows currently stored for this dag/task pair.
        return (
            session.query(SlaMiss)
            .filter(
                SlaMiss.dag_id == dag.dag_id,
                SlaMiss.task_id == task.task_id,
            )
            .count()
        )

    dag_file_processor = DagFileProcessor(dag_ids=[], log=mock.MagicMock())
    dag_file_processor.manage_slas(dag=dag, session=session)
    assert count_sla_misses() == 1

    # Now call manage_slas again and see that it runs without errors even
    # though the SlaMiss above already exists.
    # Since this is run often, it's possible that it runs before another
    # ti is successful, thereby trying to insert a duplicate record.
    dag_file_processor.manage_slas(dag=dag, session=session)

    # Not raising is not enough: the repeated check must not have added a
    # duplicate row either.
    assert count_sla_misses() == 1

def test_dag_file_processor_sla_miss_callback_exception(self, create_dummy_dag):
"""
Test that the dag file processor gracefully logs an exception if there is a problem
Expand Down

0 comments on commit 9519bf6

Please sign in to comment.