Skip to content

Commit

Permalink
Reproducer and test for #139 - py scheduler not handling DB outage (#144
Browse files Browse the repository at this point in the history
)
  • Loading branch information
chuck-dbos authored Nov 1, 2024
1 parent d7cbb1d commit b064b3f
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 1 deletion.
5 changes: 4 additions & 1 deletion dbos/_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,10 @@ def scheduler_loop(
if stop_event.wait(timeout=sleepTime.total_seconds()):
return
with SetWorkflowID(f"sched-{func.__qualname__}-{nextExecTime.isoformat()}"):
scheduler_queue.enqueue(func, nextExecTime, datetime.now(timezone.utc))
try:
scheduler_queue.enqueue(func, nextExecTime, datetime.now(timezone.utc))
except Exception as e:
dbos_logger.warning(f"Error scheduling workflow: ", e)


def scheduled(
Expand Down
122 changes: 122 additions & 0 deletions tests/test_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,95 @@
from datetime import datetime

import pytest
from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine

# Public API
from dbos import DBOS


def simulate_db_restart(engine: Engine, downtime: float) -> None:
# Get DB name
with engine.connect() as connection:
current_db = connection.execute(text("SELECT current_database()")).scalar()

# Goal here is to disable connections to the DB for a while.
# Need a temp DB to do that and recover connectivity...
temp_db_name = "temp_database_for_maintenance"

# Retrieve the URL of the current engine
main_db_url = engine.url

# Modify the URL to point to the temporary database
temp_db_url = main_db_url.set(database=temp_db_name)

try:
with engine.connect().execution_options(
isolation_level="AUTOCOMMIT"
) as connection:
# Create a temporary database
connection.execute(text(f"CREATE DATABASE {temp_db_name};"))
print(f"Temporary database '{temp_db_name}' created.")
except Exception as e:
print("Could not create temp db: ", e)

temp_engine = create_engine(temp_db_url)
with temp_engine.connect().execution_options(
isolation_level="AUTOCOMMIT"
) as temp_connection:
try:
# Disable new connections to the database
temp_connection.execute(
text(f"ALTER DATABASE {current_db} WITH ALLOW_CONNECTIONS false;")
)

# Terminate all connections except the current one
temp_connection.execute(
text(
f"""
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE pid <> pg_backend_pid()
AND datname = '{current_db}';
"""
)
)
except Exception as e:
print(f"Could not disable db {current_db}: ", e)

time.sleep(downtime)

# Re-enable new connections
try:
temp_connection.execute(
text(f"ALTER DATABASE {current_db} WITH ALLOW_CONNECTIONS true;")
)
except Exception as e:
print(f"Could not reenable db {current_db}: ", e)
temp_engine.dispose()

try:
with engine.connect().execution_options(
isolation_level="AUTOCOMMIT"
) as connection:
# Clean up the temp DB
connection.execute(
text(
f"""
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE pid <> pg_backend_pid()
AND datname = '{temp_db_name}';
"""
)
)

# Drop temporary database
connection.execute(text(f"DROP DATABASE {temp_db_name};"))
except Exception as e:
print(f"Could not clean up temp db {temp_db_name}: ", e)


def test_scheduled_workflow(dbos: DBOS) -> None:
wf_counter: int = 0

Expand All @@ -20,6 +104,44 @@ def test_workflow(scheduled: datetime, actual: datetime) -> None:
assert wf_counter > 2 and wf_counter <= 4


def test_appdb_downtime(dbos: DBOS) -> None:
wf_counter: int = 0

@DBOS.transaction()
def test_transaction(var2: str) -> str:
rows = DBOS.sql_session.execute(text("SELECT 1")).fetchall()
return "ran"

@DBOS.scheduled("* * * * * *")
@DBOS.workflow()
def test_workflow(scheduled: datetime, actual: datetime) -> None:
nonlocal wf_counter
test_transaction("x")
wf_counter += 1

time.sleep(2)
simulate_db_restart(dbos._app_db.engine, 2)
time.sleep(2)
assert wf_counter > 2


def test_sysdb_downtime(dbos: DBOS) -> None:
wf_counter: int = 0

@DBOS.scheduled("* * * * * *")
@DBOS.workflow()
def test_workflow(scheduled: datetime, actual: datetime) -> None:
nonlocal wf_counter
wf_counter += 1

time.sleep(2)
simulate_db_restart(dbos._sys_db.engine, 2)
time.sleep(2)
# We know there should be at least 3 occurrences from the 4 seconds when the DB was up.
# There could be more than 4, depending on the pace the machine...
assert wf_counter > 2


def test_scheduled_transaction(dbos: DBOS) -> None:
txn_counter: int = 0

Expand Down

0 comments on commit b064b3f

Please sign in to comment.