Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NAS-134249 / 25.10 / convert failover.* to new API #15718

Merged
merged 9 commits into from
Feb 19, 2025
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/middlewared/middlewared/api/v25_10_0/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from .docker_network import * # noqa
from .enclosure2 import * # noqa
from .enclosure_label import * # noqa
from .failover import * # noqa
from .failover_reboot import * # noqa
from .fc import * # noqa
from .fc_host import * # noqa
Expand Down
111 changes: 111 additions & 0 deletions src/middlewared/middlewared/api/v25_10_0/failover.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
from middlewared.api.base import (
BaseModel,
Excluded,
excluded_field,
ForUpdateMetaclass,
NonEmptyString,
NotRequired,
single_argument_args
)

from pydantic import HttpUrl


class FailoverEntry(BaseModel):
    # Entry model of the failover config service (`entry = FailoverEntry`).
    # `master` is not stored as-is: `failover_extend` derives it by comparing
    # `failover.node` with the stored `master_node` value.
    id: int
    disabled: bool
    """When true HA will be administratively disabled."""
    master: bool
    """Marks the particular node in the chassis as the master node.
    The standby node will have the opposite value."""
    timeout: int
    """The time to WAIT (in seconds) until a failover occurs when a network
    event occurs on an interface that is marked critical for failover AND
    HA is enabled and working appropriately. The default time to wait is
    2 seconds.

    **NOTE**
    This setting does NOT affect the `disabled` or `master` parameters."""


class FailoverUpdateArgs(FailoverEntry, metaclass=ForUpdateMetaclass):
    # Payload model for `failover.update` (see `do_update`).
    # NOTE(review): ForUpdateMetaclass presumably makes the inherited fields
    # optional for partial updates -- confirm in middlewared.api.base.

    # `id` is not accepted on update; the legacy schema removed it as well.
    id: Excluded = excluded_field()
    # Nullable, matching the legacy schema. `do_update` pops this key and
    # handles it separately from the other fields.
    master: bool | None


class FailoverUpdateResult(BaseModel):
    # Return model for `failover.update`: a failover configuration entry.
    result: FailoverEntry


class FailoverGetIpsArgs(BaseModel):
    # `failover.get_ips` takes no arguments.
    pass


class FailoverGetIpsResult(BaseModel):
    # Return model for `failover.get_ips`, which forwards the value of
    # `system.general.get_ui_urls` (addresses where the web UI is reachable).
    # NOTE(review): the legacy schema declared plain strings here. HttpUrl
    # validates and may normalize each value (e.g. trailing slash) -- confirm
    # the returned URLs round-trip unchanged for existing API consumers.
    result: list[HttpUrl]


class FailoverBecomePassiveArgs(BaseModel):
    # `failover.become_passive` takes no arguments.
    pass


class FailoverBecomePassiveResult(BaseModel):
    # `failover.become_passive` returns nothing.
    result: None


class FailoverLicensedArgs(BaseModel):
    # `failover.licensed` takes no arguments.
    pass


class FailoverLicensedResult(BaseModel):
    # Return model for `failover.licensed`: whether this instance is licensed
    # as an HA unit.
    result: bool


class FailoverNodeArgs(BaseModel):
    # `failover.node` takes no arguments.
    pass


class FailoverNodeResult(BaseModel):
    # Return model for `failover.node`: the chassis slot position reported by
    # that method, as a string.
    result: str


class FailoverStatusArgs(BaseModel):
    # `failover.status` takes no caller-supplied arguments.
    pass


class FailoverStatusResult(BaseModel):
    # Return model for `failover.status`: the current HA status as a string.
    # `failover.upgrade` only proceeds when this value is 'MASTER'.
    result: str


class FailoverSyncFromPeerArgs(BaseModel):
    # `failover.sync_from_peer` takes no arguments.
    pass


class FailoverSyncFromPeerResult(BaseModel):
    # `failover.sync_from_peer` returns nothing.
    result: None


@single_argument_args("sync_to_peer")
class FailoverSyncToPeerArgs(BaseModel):
    # Options object for `failover.sync_to_peer`; the method receives it as a
    # single `options` argument.
    # When `reboot` is set, the method calls `system.reboot` on the remote
    # controller (with a 2 second delay) once the sync is done.
    reboot: bool = False
    """If set to True, will reboot the other controller."""


class FailoverSyncToPeerResult(BaseModel):
    # `failover.sync_to_peer` returns nothing.
    result: None


@single_argument_args("failover_upgrade")
class FailoverUpgradeArgs(BaseModel):
    # Options object for the `failover.upgrade` job; the method receives it as
    # a single `options` argument.
    train: NonEmptyString = NotRequired
    """Update train to switch to before upgrading. When omitted, the train
    is not changed by this call."""
    resume: bool = False
    """Should be set to true if a previous call to this method returned a
    `CallError` with `errno=EAGAIN` meaning that an upgrade can be performed
    with a warning and that warning is accepted. In that case, you also have
    to set `resume_manual` to `true` if a previous call to this method was
    performed using update file upload."""
    resume_manual: bool = False
    """Should be set to true, together with `resume`, if the previous call to
    this method was performed using update file upload."""


class FailoverUpgradeResult(BaseModel):
    # Return model for the `failover.upgrade` job: a boolean result.
    result: bool
168 changes: 87 additions & 81 deletions src/middlewared/middlewared/plugins/failover.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,37 @@
import time
from functools import partial

from middlewared.api import api_method
from middlewared.api.current import (
FailoverBecomePassiveArgs,
FailoverBecomePassiveResult,
FailoverEntry,
FailoverGetIpsArgs,
FailoverGetIpsResult,
FailoverLicensedArgs,
FailoverLicensedResult,
FailoverNodeArgs,
FailoverNodeResult,
FailoverStatusArgs,
FailoverStatusResult,
FailoverSyncFromPeerArgs,
FailoverSyncFromPeerResult,
FailoverSyncToPeerArgs,
FailoverSyncToPeerResult,
FailoverUpdateArgs,
FailoverUpdateResult,
FailoverUpgradeArgs,
FailoverUpgradeResult,
)
from middlewared.auth import TruenasNodeSessionManagerCredentials
from middlewared.schema import accepts, Bool, Dict, Int, List, NOT_PROVIDED, Str, returns, Patch
from middlewared.schema import NOT_PROVIDED
from middlewared.service import (
job, no_authz_required, pass_app, private, CallError, ConfigService,
ValidationError, ValidationErrors
job,
private,
CallError,
ConfigService,
ValidationError,
ValidationErrors
)
import middlewared.sqlalchemy as sa
from middlewared.plugins.auth import AuthService
Expand Down Expand Up @@ -53,49 +79,24 @@ class Config:
datastore_extend = 'failover.failover_extend'
cli_private = True
role_prefix = 'FAILOVER'

ENTRY = Dict(
'failover_entry',
Int('id', required=True),
Bool('disabled', required=True),
Int('timeout', required=True),
Bool('master', required=True),
)
entry = FailoverEntry

@private
async def failover_extend(self, data):
data['master'] = await self.middleware.call('failover.node') == data.pop('master_node')
return data

@accepts(Patch(
'failover_entry', 'failover_update',
('edit', {'name': 'master', 'method': lambda x: setattr(x, 'null', True)}),
('rm', {'name': 'id'}),
('attr', {'update': True}),
), audit='Failover config update')
@api_method(
FailoverUpdateArgs,
FailoverUpdateResult,
audit='Failover config update',
)
async def do_update(self, data):
"""
Update failover state.

`disabled` When true indicates that HA will be disabled.
`master` Marks the particular node in the chassis as the master node.
The standby node will have the opposite value.

`timeout` is the time to WAIT until a failover occurs when a network
event occurs on an interface that is marked critical for failover AND
HA is enabled and working appropriately.

The default time to wait is 2 seconds.
**NOTE**
This setting does NOT effect the `disabled` or `master` parameters.
"""
"""Update failover configuration."""
master = data.pop('master', NOT_PROVIDED)

old = await self.middleware.call('datastore.config', 'system.failover')

new = old.copy()
new.update(data)

if master is not NOT_PROVIDED:
# The node making the call is the one we want to make MASTER by default
new['master_node'] = await self.middleware.call('failover.node')
Expand Down Expand Up @@ -136,9 +137,11 @@ async def _master_node(self, master):
else:
raise CallError('Unable to change node state in MANUAL mode')

@no_authz_required
@accepts()
@returns(Bool())
@api_method(
FailoverLicensedArgs,
FailoverLicensedResult,
authorization_required=False,
)
def licensed(self):
"""Checks whether this instance is licensed as a HA unit"""
try:
Expand Down Expand Up @@ -171,8 +174,11 @@ async def hardware(self):
"""
return (await self.ha_mode())[0]

@accepts(roles=['FAILOVER_READ'])
@returns(Str())
@api_method(
FailoverNodeArgs,
FailoverNodeResult,
roles=['FAILOVER_READ']
)
async def node(self):
"""
Returns the slot position in the chassis that
Expand All @@ -191,9 +197,12 @@ async def internal_interfaces(self):
ints = await self.middleware.call('failover.internal_interface.detect')
return list(ints)

@accepts(roles=['FAILOVER_READ'])
@returns(Str())
@pass_app(rest=True)
@api_method(
FailoverStatusArgs,
FailoverStatusResult,
pass_app=True,
roles=['FAILOVER_READ']
)
async def status(self, app):
"""
Get the current HA status.
Expand Down Expand Up @@ -268,15 +277,21 @@ def in_progress(self):
)
return bool(event)

@accepts(roles=['FAILOVER_READ'])
@returns(List('ips', items=[Str('ip')]))
@pass_app(rest=True)
async def get_ips(self, app):
@api_method(
FailoverGetIpsArgs,
FailoverGetIpsResult,
roles=['FAILOVER_READ']
)
async def get_ips(self):
"""Get a list of IPs for which the webUI can be accessed."""
return await self.middleware.call('system.general.get_ui_urls')

@accepts(audit='Failover become passive', roles=['FAILOVER_WRITE'])
@returns()
@api_method(
FailoverBecomePassiveArgs,
FailoverBecomePassiveResult,
audit='Failover become passive',
roles=['FAILOVER_WRITE']
)
def become_passive(self):
"""
This method is only called manually by the end-user so we fully expect that they
Expand Down Expand Up @@ -333,11 +348,11 @@ async def force_master(self):
rc = await self.middleware.call('failover.fenced.start', True)
return not rc if rc != 6 else bool(rc) # 6 means already running

@accepts(Dict(
'options',
Bool('reboot', default=False),
), roles=['FAILOVER_WRITE'])
@returns()
@api_method(
FailoverSyncToPeerArgs,
FailoverSyncToPeerResult,
roles=['FAILOVER_WRITE'],
)
def sync_to_peer(self, options):
"""
Sync database and files to the other controller.
Expand Down Expand Up @@ -379,8 +394,11 @@ def sync_to_peer(self, options):
if options['reboot']:
self.middleware.call_sync('failover.call_remote', 'system.reboot', ['Failover sync to peer', {'delay': 2}])

@accepts(roles=['FAILOVER_WRITE'])
@returns()
@api_method(
FailoverSyncFromPeerArgs,
FailoverSyncFromPeerResult,
roles=['FAILOVER_WRITE'],
)
def sync_from_peer(self):
"""
Sync database and files from the other controller.
Expand Down Expand Up @@ -577,31 +595,20 @@ async def is_single_master_node(self):
def upgrade_version(self):
return 1

@accepts(Dict(
'failover_upgrade',
Str('train', empty=False),
Bool('resume', default=False),
Bool('resume_manual', default=False),
), roles=['FAILOVER_WRITE'], audit='Failover upgrade')
@returns(Bool())
@api_method(
FailoverUpgradeArgs,
FailoverUpgradeResult,
roles=['FAILOVER_WRITE'],
audit='Failover upgrade',
)
@job(lock='failover_upgrade', pipes=['input'], check_pipes=False)
def upgrade(self, job, options):
"""Upgrades both controllers. Files will be downloaded to the
Active Controller and then transferred to the Standby Controller.
Upgrade process will start concurrently on both nodes. Once both
upgrades are applied, the Standby Controller will reboot. This
job will wait for that job to complete before finalizing.
"""
Upgrades both controllers.

Files will be downloaded to the Active Controller and then transferred to the Standby
Controller.

Upgrade process will start concurrently on both nodes.

Once both upgrades are applied, the Standby Controller will reboot. This job will wait for
that job to complete before finalizing.

`resume` should be set to `true` if a previous call to this method returned a `CallError` with `errno=EAGAIN`
meaning that an upgrade can be performed with a warning and that warning is accepted. In that case, you also
have to set `resume_manual` to `true` if a previous call to this method was performed using update file upload.
"""

if self.middleware.call_sync('failover.status') != 'MASTER':
raise CallError('Upgrade can only run on Active Controller.')

Expand All @@ -615,12 +622,11 @@ def upgrade(self, job, options):
else:
updatefile = options['resume_manual']

train = options.get('train')
if train:
train = options.get('train', NOT_PROVIDED)
if train is not NOT_PROVIDED:
self.middleware.call_sync('update.set_train', train)

local_path = self.middleware.call_sync('update.get_update_location')

updatefile_name = 'updatefile.sqsh'
updatefile_localpath = os.path.join(local_path, updatefile_name)
if not options['resume'] and updatefile:
Expand Down