Fix intermittent Redfish firmware update failures with BMC validation

Resolves a bug where firmware updates fail intermittently on some
hardware models due to invalid or unstable BMC responses immediately
after firmware update completion. The BMC may return inconsistent
responses for a period after firmware updates, causing the update
process to fail prematurely.

This change adds comprehensive BMC state validation that requires
multiple consecutive successful responses from System, Manager, and
NetworkAdapters resources before considering the firmware update
complete. This ensures the BMC has fully stabilized before proceeding.

Generated-By: Claude Code Sonnet 4
Change-Id: I5cb72f62d3fc62c3ad750c62924842cef59e79b8
Signed-off-by: Jacob Anders <janders@redhat.com>
This commit is contained in:
Jacob Anders 2025-09-09 14:41:58 +10:00
parent 7681e2216d
commit 85ec9d655f
4 changed files with 391 additions and 6 deletions

View file

@ -95,6 +95,27 @@ opts = [
default=300,
help=_('Number of seconds to wait before proceeding with the '
'reboot to finish the BMC firmware update step')),
cfg.IntOpt('firmware_update_required_successes',
min=0,
default=3,
help=_('Number of successful responses required to '
'consider post-upgrade BMC validation a success. '
'Set to 0 to disable post-upgrade validation '
'entirely.')),
cfg.IntOpt('firmware_update_validation_interval',
min=0,
default=30,
help=_('Timeout (in seconds) to wait between validation '
'attempts. Set to 0 for rapid succession retries '
'with no delay.')),
cfg.IntOpt('firmware_update_resource_validation_timeout',
min=0,
default=300,
help=_('Timeout (in seconds) to wait for BMC resources '
'(System, Manager, NetworkAdapters) to become stable '
'and consistently available after firmware update. '
'Set to 0 to disable post-upgrade validation '
'entirely.')),
cfg.StrOpt('firmware_source',
choices=[('http', _('If firmware source URL is also HTTP, then '
'serve from original location, otherwise '

View file

@ -181,7 +181,6 @@ class RedfishFirmware(base.FirmwareInterface):
@base.service_step(priority=0, abortable=False,
argsinfo=_FW_SETTINGS_ARGSINFO,
requires_ramdisk=False)
@base.cache_firmware_components
def update(self, task, settings):
"""Update the Firmware on the node using the settings for components.
@ -280,6 +279,7 @@ class RedfishFirmware(base.FirmwareInterface):
'seconds before proceeding to reboot the node %(node_uuid)s '
'to complete the step', {'node_uuid': node.uuid,
'wait_time': wait_unres_bmc})
# Store task monitor URI for periodic task polling
# TODO(iurygregory): Improve the logic here to identify if the BMC
# is back, so we don't have to unconditionally wait.
# The wait_unres_bmc will be the maximum time to wait.
@ -299,10 +299,103 @@ class RedfishFirmware(base.FirmwareInterface):
fw_clean.append(cleanup)
node.set_driver_internal_info('firmware_cleanup', fw_clean)
def _validate_resources_stability(self, node):
"""Validate that BMC resources are consistently available.
Requires consecutive successful responses from System, Manager,
and NetworkAdapters resources before considering them stable.
The number of required successes is configured via
CONF.redfish.firmware_update_required_successes.
Timeout is configured via
CONF.redfish.firmware_update_resource_validation_timeout.
:param node: the Ironic node object
:raises: RedfishError if resources don't stabilize within timeout
"""
timeout = CONF.redfish.firmware_update_resource_validation_timeout
required_successes = CONF.redfish.firmware_update_required_successes
validation_interval = CONF.redfish.firmware_update_validation_interval
# Skip validation if validation is disabled via configuration
if required_successes == 0 or timeout == 0:
reasons = []
if required_successes == 0:
reasons.append('required_successes=0')
if timeout == 0:
reasons.append('validation_timeout=0')
LOG.info('BMC resource validation disabled (%s) for node %(node)s',
', '.join(reasons), {'node': node.uuid})
return
LOG.debug('Starting resource stability validation for node %(node)s '
'(timeout: %(timeout)s seconds, '
'required_successes: %(required)s, '
'validation_interval: %(interval)s seconds)',
{'node': node.uuid, 'timeout': timeout,
'required': required_successes,
'interval': validation_interval})
start_time = time.time()
end_time = start_time + timeout
consecutive_successes = 0
last_exc = None
while time.time() < end_time:
try:
# Test System resource
system = redfish_utils.get_system(node)
# Test Manager resource
redfish_utils.get_manager(node, system)
# Test NetworkAdapters resource
chassis = redfish_utils.get_chassis(node, system)
chassis.network_adapters.get_members()
# All resources successful
consecutive_successes += 1
LOG.debug('Resource validation success %(count)d/%(required)d '
'for node %(node)s',
{'count': consecutive_successes,
'required': required_successes,
'node': node.uuid})
if consecutive_successes >= required_successes:
LOG.info('All tested Redfish resources stable and '
' available for node %(node)s',
{'node': node.uuid})
return
except (exception.RedfishError,
exception.RedfishConnectionError,
sushy.exceptions.BadRequestError) as e:
# Resource not available yet, reset counter
if consecutive_successes > 0:
LOG.debug('Resource validation interrupted for node '
'%(node)s, resetting success counter '
'(error: %(error)s)',
{'node': node.uuid, 'error': e})
consecutive_successes = 0
last_exc = e
# Wait before next validation attempt
time.sleep(validation_interval)
# Timeout reached without achieving stability
error_msg = _('BMC resources failed to stabilize within '
'%(timeout)s seconds for node %(node)s') % {
'timeout': timeout, 'node': node.uuid}
if last_exc:
error_msg += _(', last error: %(error)s') % {'error': last_exc}
LOG.error(error_msg)
raise exception.RedfishError(error=error_msg)
def _continue_updates(self, task, update_service, settings):
"""Continues processing the firmware updates
Continues to process the firmware updates on the node.
First monitors the current task completion, then validates resource
stability before proceeding to next update or completion.
Note that the caller must have an exclusive lock on the node.
@ -312,6 +405,7 @@ class RedfishFirmware(base.FirmwareInterface):
"""
node = task.node
fw_upd = settings[0]
wait_interval = fw_upd.get('wait')
if wait_interval:
time_now = str(timeutils.utcnow().isoformat())
@ -450,6 +544,14 @@ class RedfishFirmware(base.FirmwareInterface):
try:
task_monitor = redfish_utils.get_task_monitor(
node, current_update['task_monitor'])
except exception.RedfishConnectionError as e:
# If the BMC firmware is being updated, the BMC will be
# unavailable for some amount of time.
LOG.warning('Unable to communicate with task monitor service '
'on node %(node)s. Will try again on the next poll. '
'Error: %(error)s',
{'node': node.uuid, 'error': e})
return
except exception.RedfishError:
# The BMC deleted the Task before we could query it
LOG.warning('Firmware update completed for node %(node)s, '
@ -478,12 +580,15 @@ class RedfishFirmware(base.FirmwareInterface):
if (sushy_task.task_state == sushy.TASK_STATE_COMPLETED
and sushy_task.task_status in
[sushy.HEALTH_OK, sushy.HEALTH_WARNING]):
LOG.info('Firmware update succeeded for node %(node)s, '
'firmware %(firmware_image)s: %(messages)s',
LOG.info('Firmware update task completed for node %(node)s, '
'firmware %(firmware_image)s: %(messages)s. '
'Starting BMC response validation.',
{'node': node.uuid,
'firmware_image': current_update['url'],
'messages': ", ".join(messages)})
# Validate BMC resources are consistently available
self._validate_resources_stability(node)
self._continue_updates(task, update_service, settings)
else:
error_msg = (_('Firmware update failed for node %(node)s, '

View file

@ -692,11 +692,13 @@ class RedfishFirmwareTestCase(db_base.DbTestCase):
servicing_error_handler_mock.assert_called_once()
interface._continue_updates.assert_not_called()
@mock.patch.object(redfish_fw.RedfishFirmware,
'_validate_resources_stability', autospec=True)
@mock.patch.object(redfish_fw, 'LOG', autospec=True)
@mock.patch.object(redfish_utils, 'get_update_service', autospec=True)
@mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True)
def test__check_node_firmware_update_done(self, tm_mock, get_us_mock,
log_mock):
log_mock, validate_mock):
task_mock = mock.Mock()
task_mock.task_state = sushy.TASK_STATE_COMPLETED
task_mock.task_status = sushy.HEALTH_OK
@ -712,13 +714,15 @@ class RedfishFirmwareTestCase(db_base.DbTestCase):
task, interface = self._test__check_node_redfish_firmware_update()
task.upgrade_lock.assert_called_once_with()
info_calls = [
mock.call('Firmware update succeeded for node %(node)s, '
'firmware %(firmware_image)s: %(messages)s',
mock.call('Firmware update task completed for node %(node)s, '
'firmware %(firmware_image)s: %(messages)s. '
'Starting BMC response validation.',
{'node': self.node.uuid,
'firmware_image': 'https://bmc/v1.0.1',
'messages': 'Firmware update done'})]
log_mock.info.assert_has_calls(info_calls)
validate_mock.assert_called_once()
interface._continue_updates.assert_called_once_with(
task, get_us_mock.return_value,
@ -1016,3 +1020,234 @@ class RedfishFirmwareTestCase(db_base.DbTestCase):
sleep_mock.assert_called_once_with(
self.node.driver_info.get('firmware_update_unresponsive_bmc_wait')
)
def test__validate_resources_stability_success(self):
"""Test successful BMC resource validation with consecutive success."""
firmware = redfish_fw.RedfishFirmware()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
with mock.patch.object(redfish_utils, 'get_system',
autospec=True) as system_mock, \
mock.patch.object(redfish_utils, 'get_manager',
autospec=True) as manager_mock, \
mock.patch.object(redfish_utils, 'get_chassis',
autospec=True) as chassis_mock, \
mock.patch.object(time, 'time', autospec=True) as time_mock, \
mock.patch.object(time, 'sleep',
autospec=True) as sleep_mock:
# Mock successful resource responses
system_mock.return_value = mock.Mock()
manager_mock.return_value = mock.Mock()
net_adapters = chassis_mock.return_value.network_adapters
net_adapters.get_members.return_value = []
# Mock time progression to simulate consecutive successes
time_mock.side_effect = [0, 1, 2, 3] # 3 successful attempts
# Should complete successfully after 3 consecutive successes
firmware._validate_resources_stability(task.node)
# Verify all resources were checked 3 times (required success)
self.assertEqual(system_mock.call_count, 3)
self.assertEqual(manager_mock.call_count, 3)
self.assertEqual(chassis_mock.call_count, 3)
# Verify sleep was called between validation attempts
expected_calls = [mock.call(
CONF.redfish.firmware_update_validation_interval)] * 2
sleep_mock.assert_has_calls(expected_calls)
def test__validate_resources_stability_timeout(self):
"""Test BMC resource validation timeout when not achieved."""
firmware = redfish_fw.RedfishFirmware()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
with mock.patch.object(redfish_utils, 'get_system',
autospec=True) as system_mock, \
mock.patch.object(redfish_utils, 'get_manager',
autospec=True), \
mock.patch.object(time, 'time', autospec=True) as time_mock, \
mock.patch.object(time, 'sleep', autospec=True):
# Mock system always failing
system_mock.side_effect = exception.RedfishConnectionError(
'timeout')
# Mock time progression to exceed timeout
time_mock.side_effect = [0, 350] # Exceeds 300 second timeout
# Should raise RedfishError due to timeout
self.assertRaises(exception.RedfishError,
firmware._validate_resources_stability,
task.node)
def test__validate_resources_stability_intermittent_failures(self):
"""Test BMC resource validation with intermittent failures."""
cfg.CONF.set_override('firmware_update_required_successes', 3,
'redfish')
cfg.CONF.set_override('firmware_update_validation_interval', 10,
'redfish')
firmware = redfish_fw.RedfishFirmware()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
with mock.patch.object(redfish_utils, 'get_system',
autospec=True) as system_mock, \
mock.patch.object(redfish_utils, 'get_manager',
autospec=True) as manager_mock, \
mock.patch.object(redfish_utils, 'get_chassis',
autospec=True) as chassis_mock, \
mock.patch.object(time, 'time', autospec=True) as time_mock, \
mock.patch.object(time, 'sleep', autospec=True):
# Mock intermittent failures: success, success, fail,
# success, success, success
# When system_mock raises exception, other calls are not made
call_count = 0
def system_side_effect(*args):
nonlocal call_count
call_count += 1
if call_count == 3: # Third call fails
raise exception.RedfishConnectionError('error')
return mock.Mock()
system_mock.side_effect = system_side_effect
manager_mock.return_value = mock.Mock()
net_adapters = chassis_mock.return_value.network_adapters
net_adapters.get_members.return_value = []
# Mock time progression (6 attempts total)
time_mock.side_effect = [0, 10, 20, 30, 40, 50, 60]
# Should eventually succeed after counter reset
firmware._validate_resources_stability(task.node)
# Verify all 6 attempts were made for system
self.assertEqual(system_mock.call_count, 6)
# Manager and chassis called only 5 times (not on failed)
self.assertEqual(manager_mock.call_count, 5)
self.assertEqual(chassis_mock.call_count, 5)
def test__validate_resources_stability_manager_failure(self):
"""Test BMC resource validation when Manager resource fails."""
firmware = redfish_fw.RedfishFirmware()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
with mock.patch.object(redfish_utils, 'get_system',
autospec=True) as system_mock, \
mock.patch.object(redfish_utils, 'get_manager',
autospec=True) as manager_mock, \
mock.patch.object(time, 'time', autospec=True) as time_mock:
# Mock system success, manager failure
system_mock.return_value = mock.Mock()
manager_mock.side_effect = exception.RedfishError(
'manager error')
# Mock time progression to exceed timeout
time_mock.side_effect = [0, 350]
# Should raise RedfishError due to timeout
self.assertRaises(exception.RedfishError,
firmware._validate_resources_stability,
task.node)
def test__validate_resources_stability_network_adapters_failure(self):
"""Test validation when NetworkAdapters resource fails."""
firmware = redfish_fw.RedfishFirmware()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
with mock.patch.object(redfish_utils, 'get_system',
autospec=True) as system_mock, \
mock.patch.object(redfish_utils, 'get_manager',
autospec=True) as manager_mock, \
mock.patch.object(redfish_utils, 'get_chassis',
autospec=True) as chassis_mock, \
mock.patch.object(time, 'time', autospec=True) as time_mock:
# Mock system and manager success, NetworkAdapters failure
system_mock.return_value = mock.Mock()
manager_mock.return_value = mock.Mock()
chassis_mock.side_effect = exception.RedfishError(
'chassis error')
# Mock time progression to exceed timeout
time_mock.side_effect = [0, 350]
# Should raise RedfishError due to timeout
self.assertRaises(exception.RedfishError,
firmware._validate_resources_stability,
task.node)
def test__validate_resources_stability_custom_config(self):
"""Test BMC resource validation with custom configuration values."""
cfg.CONF.set_override('firmware_update_required_successes', 5,
'redfish')
cfg.CONF.set_override('firmware_update_validation_interval', 5,
'redfish')
firmware = redfish_fw.RedfishFirmware()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
with mock.patch.object(redfish_utils, 'get_system',
autospec=True) as system_mock, \
mock.patch.object(redfish_utils, 'get_manager',
autospec=True) as manager_mock, \
mock.patch.object(redfish_utils, 'get_chassis',
autospec=True) as chassis_mock, \
mock.patch.object(time, 'time', autospec=True) as time_mock, \
mock.patch.object(time, 'sleep',
autospec=True) as sleep_mock:
# Mock successful resource responses
system_mock.return_value = mock.Mock()
manager_mock.return_value = mock.Mock()
net_adapters = chassis_mock.return_value.network_adapters
net_adapters.get_members.return_value = []
# Mock time progression (5 successful attempts)
time_mock.side_effect = [0, 5, 10, 15, 20, 25]
# Should complete successfully after 5 consecutive successes
firmware._validate_resources_stability(task.node)
# Verify all resources checked 5 times (custom required)
self.assertEqual(system_mock.call_count, 5)
self.assertEqual(manager_mock.call_count, 5)
self.assertEqual(chassis_mock.call_count, 5)
# Verify sleep was called with custom interval
expected_calls = [mock.call(5)] * 4 # 4 sleeps between 5
sleep_mock.assert_has_calls(expected_calls)
def test__validate_resources_stability_badrequest_error(self):
"""Test BMC resource validation handles BadRequestError correctly."""
firmware = redfish_fw.RedfishFirmware()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
with mock.patch.object(redfish_utils, 'get_system',
autospec=True) as system_mock, \
mock.patch.object(time, 'time', autospec=True) as time_mock:
# Mock BadRequestError from sushy with proper arguments
mock_response = mock.Mock()
mock_response.status_code = 400
system_mock.side_effect = sushy.exceptions.BadRequestError(
'http://test', mock_response, mock_response)
# Mock time progression to exceed timeout
time_mock.side_effect = [0, 350]
# Should raise RedfishError due to timeout
self.assertRaises(exception.RedfishError,
firmware._validate_resources_stability,
task.node)

View file

@ -0,0 +1,24 @@
---
fixes:
- |
Reduces likelihood of intermittent firmware upgrade failures by adding
comprehensive BMC state validation after firmware upgrades for the
Redfish driver. After a firmware update task completes successfully, Ironic
now validates that BMC resources (System, Manager, and NetworkAdapters) are
consistently available before proceeding with subsequent operations.
This change improves firmware update reliability by requiring a
configurable number of consecutive successful responses from the BMC
resources. The validation process helps ensure the BMC has fully stabilized
after firmware updates before marking the operation as complete.
New configuration options in the ``[redfish]`` section:
* ``firmware_update_required_successes`` - Number of consecutive successful
responses required to consider BMC validation successful (default: 3),
0 disables validation
* ``firmware_update_validation_interval`` - Time in seconds to wait between
validation attempts (default: 30)
* ``firmware_update_resource_validation_timeout`` - Maximum time in seconds
to wait for BMC resources to stabilize (default: 300), setting it to 0
disables validation.