From 85ec9d655f7e8ade0a611ceb8d202c841123f05c Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Tue, 9 Sep 2025 14:41:58 +1000 Subject: [PATCH] Fix intermittent Redfish firmware update failures with BMC validation Resolves a bug where firmware updates fail intermittently on some hardware models due to invalid or unstable BMC responses immediately after firmware update completion. The BMC may return inconsistent responses for a period after firmware updates, causing the update process to fail prematurely. This change adds comprehensive BMC state validation that requires multiple consecutive successful responses from System, Manager, and NetworkAdapters resources before considering the firmware update complete. This ensures the BMC has fully stabilized before proceeding. Generated-By: Claude Code Sonnet 4 Change-Id: I5cb72f62d3fc62c3ad750c62924842cef59e79b8 Signed-off-by: Jacob Anders --- ironic/conf/redfish.py | 21 ++ ironic/drivers/modules/redfish/firmware.py | 111 +++++++- .../drivers/modules/redfish/test_firmware.py | 241 +++++++++++++++++- ...fter-firmware-update-3d5f8a9e76c24d1b.yaml | 24 ++ 4 files changed, 391 insertions(+), 6 deletions(-) create mode 100644 releasenotes/notes/bmc-validation-after-firmware-update-3d5f8a9e76c24d1b.yaml diff --git a/ironic/conf/redfish.py b/ironic/conf/redfish.py index 26db778e7c..00bc919924 100644 --- a/ironic/conf/redfish.py +++ b/ironic/conf/redfish.py @@ -95,6 +95,27 @@ opts = [ default=300, help=_('Number of seconds to wait before proceeding with the ' 'reboot to finish the BMC firmware update step')), + cfg.IntOpt('firmware_update_required_successes', + min=0, + default=3, + help=_('Number of successful responses required to ' + 'consider post-upgrade BMC validation a success. ' + 'Set to 0 to disable post-upgrade validation ' + 'entirely.')), + cfg.IntOpt('firmware_update_validation_interval', + min=0, + default=30, + help=_('Timeout (in seconds) to wait between validation ' + 'attempts. Set to 0 for rapid succession retries ' + 'with no delay.')), + cfg.IntOpt('firmware_update_resource_validation_timeout', + min=0, + default=300, + help=_('Timeout (in seconds) to wait for BMC resources ' + '(System, Manager, NetworkAdapters) to become stable ' + 'and consistently available after firmware update. ' + 'Set to 0 to disable post-upgrade validation ' + 'entirely.')), cfg.StrOpt('firmware_source', choices=[('http', _('If firmware source URL is also HTTP, then ' 'serve from original location, otherwise ' diff --git a/ironic/drivers/modules/redfish/firmware.py b/ironic/drivers/modules/redfish/firmware.py index 12c23a509b..9facb99451 100644 --- a/ironic/drivers/modules/redfish/firmware.py +++ b/ironic/drivers/modules/redfish/firmware.py @@ -181,7 +181,6 @@ class RedfishFirmware(base.FirmwareInterface): @base.service_step(priority=0, abortable=False, argsinfo=_FW_SETTINGS_ARGSINFO, requires_ramdisk=False) - @base.cache_firmware_components def update(self, task, settings): """Update the Firmware on the node using the settings for components. @@ -280,6 +279,7 @@ class RedfishFirmware(base.FirmwareInterface): 'seconds before proceeding to reboot the node %(node_uuid)s ' 'to complete the step', {'node_uuid': node.uuid, 'wait_time': wait_unres_bmc}) + # Store task monitor URI for periodic task polling # TODO(iurygregory): Improve the logic here to identify if the BMC # is back, so we don't have to unconditionally wait. # The wait_unres_bmc will be the maximum time to wait. @@ -299,10 +299,103 @@ class RedfishFirmware(base.FirmwareInterface): fw_clean.append(cleanup) node.set_driver_internal_info('firmware_cleanup', fw_clean) + def _validate_resources_stability(self, node): + """Validate that BMC resources are consistently available. + + Requires consecutive successful responses from System, Manager, + and NetworkAdapters resources before considering them stable. + The number of required successes is configured via + CONF.redfish.firmware_update_required_successes. + Timeout is configured via + CONF.redfish.firmware_update_resource_validation_timeout. + + :param node: the Ironic node object + :raises: RedfishError if resources don't stabilize within timeout + """ + timeout = CONF.redfish.firmware_update_resource_validation_timeout + required_successes = CONF.redfish.firmware_update_required_successes + validation_interval = CONF.redfish.firmware_update_validation_interval + + # Skip validation if validation is disabled via configuration + if required_successes == 0 or timeout == 0: + reasons = [] + if required_successes == 0: + reasons.append('required_successes=0') + if timeout == 0: + reasons.append('validation_timeout=0') + + LOG.info('BMC resource validation disabled (%s) for node %(node)s', + ', '.join(reasons), {'node': node.uuid}) + return + + LOG.debug('Starting resource stability validation for node %(node)s ' + '(timeout: %(timeout)s seconds, ' + 'required_successes: %(required)s, ' + 'validation_interval: %(interval)s seconds)', + {'node': node.uuid, 'timeout': timeout, + 'required': required_successes, + 'interval': validation_interval}) + + start_time = time.time() + end_time = start_time + timeout + consecutive_successes = 0 + last_exc = None + + while time.time() < end_time: + try: + # Test System resource + system = redfish_utils.get_system(node) + + # Test Manager resource + redfish_utils.get_manager(node, system) + + # Test NetworkAdapters resource + chassis = redfish_utils.get_chassis(node, system) + chassis.network_adapters.get_members() + + # All resources successful + consecutive_successes += 1 + LOG.debug('Resource validation success %(count)d/%(required)d ' + 'for node %(node)s', + {'count': consecutive_successes, + 'required': required_successes, + 'node': node.uuid}) + + if consecutive_successes >= required_successes: + LOG.info('All tested Redfish resources stable and ' + ' available for node %(node)s', + {'node': node.uuid}) + return + + except (exception.RedfishError, + exception.RedfishConnectionError, + sushy.exceptions.BadRequestError) as e: + # Resource not available yet, reset counter + if consecutive_successes > 0: + LOG.debug('Resource validation interrupted for node ' + '%(node)s, resetting success counter ' + '(error: %(error)s)', + {'node': node.uuid, 'error': e}) + consecutive_successes = 0 + last_exc = e + + # Wait before next validation attempt + time.sleep(validation_interval) + # Timeout reached without achieving stability + error_msg = _('BMC resources failed to stabilize within ' + '%(timeout)s seconds for node %(node)s') % { + 'timeout': timeout, 'node': node.uuid} + if last_exc: + error_msg += _(', last error: %(error)s') % {'error': last_exc} + LOG.error(error_msg) + raise exception.RedfishError(error=error_msg) + def _continue_updates(self, task, update_service, settings): """Continues processing the firmware updates Continues to process the firmware updates on the node. + First monitors the current task completion, then validates resource + stability before proceeding to next update or completion. Note that the caller must have an exclusive lock on the node. @@ -312,6 +405,7 @@ class RedfishFirmware(base.FirmwareInterface): """ node = task.node fw_upd = settings[0] + wait_interval = fw_upd.get('wait') if wait_interval: time_now = str(timeutils.utcnow().isoformat()) @@ -450,6 +544,14 @@ class RedfishFirmware(base.FirmwareInterface): try: task_monitor = redfish_utils.get_task_monitor( node, current_update['task_monitor']) + except exception.RedfishConnectionError as e: + # If the BMC firmware is being updated, the BMC will be + # unavailable for some amount of time. + LOG.warning('Unable to communicate with task monitor service ' + 'on node %(node)s. Will try again on the next poll. ' + 'Error: %(error)s', + {'node': node.uuid, 'error': e}) + return except exception.RedfishError: # The BMC deleted the Task before we could query it LOG.warning('Firmware update completed for node %(node)s, ' @@ -478,12 +580,15 @@ class RedfishFirmware(base.FirmwareInterface): if (sushy_task.task_state == sushy.TASK_STATE_COMPLETED and sushy_task.task_status in [sushy.HEALTH_OK, sushy.HEALTH_WARNING]): - LOG.info('Firmware update succeeded for node %(node)s, ' - 'firmware %(firmware_image)s: %(messages)s', + LOG.info('Firmware update task completed for node %(node)s, ' + 'firmware %(firmware_image)s: %(messages)s. ' + 'Starting BMC response validation.', {'node': node.uuid, 'firmware_image': current_update['url'], 'messages': ", ".join(messages)}) + # Validate BMC resources are consistently available + self._validate_resources_stability(node) self._continue_updates(task, update_service, settings) else: error_msg = (_('Firmware update failed for node %(node)s, ' diff --git a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py index 883a4a2ece..4e468381eb 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py @@ -692,11 +692,13 @@ class RedfishFirmwareTestCase(db_base.DbTestCase): servicing_error_handler_mock.assert_called_once() interface._continue_updates.assert_not_called() + @mock.patch.object(redfish_fw.RedfishFirmware, + '_validate_resources_stability', autospec=True) @mock.patch.object(redfish_fw, 'LOG', autospec=True) @mock.patch.object(redfish_utils, 'get_update_service', autospec=True) @mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True) def test__check_node_firmware_update_done(self, tm_mock, get_us_mock, - log_mock): + log_mock, validate_mock): task_mock = mock.Mock() task_mock.task_state = sushy.TASK_STATE_COMPLETED task_mock.task_status = sushy.HEALTH_OK @@ -712,13 +714,15 @@ class RedfishFirmwareTestCase(db_base.DbTestCase): task, interface = self._test__check_node_redfish_firmware_update() task.upgrade_lock.assert_called_once_with() info_calls = [ - mock.call('Firmware update succeeded for node %(node)s, ' - 'firmware %(firmware_image)s: %(messages)s', + mock.call('Firmware update task completed for node %(node)s, ' + 'firmware %(firmware_image)s: %(messages)s. ' + 'Starting BMC response validation.', {'node': self.node.uuid, 'firmware_image': 'https://bmc/v1.0.1', 'messages': 'Firmware update done'})] log_mock.info.assert_has_calls(info_calls) + validate_mock.assert_called_once() interface._continue_updates.assert_called_once_with( task, get_us_mock.return_value, @@ -1016,3 +1020,234 @@ class RedfishFirmwareTestCase(db_base.DbTestCase): sleep_mock.assert_called_once_with( self.node.driver_info.get('firmware_update_unresponsive_bmc_wait') ) + + def test__validate_resources_stability_success(self): + """Test successful BMC resource validation with consecutive success.""" + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True) as manager_mock, \ + mock.patch.object(redfish_utils, 'get_chassis', + autospec=True) as chassis_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock, \ + mock.patch.object(time, 'sleep', + autospec=True) as sleep_mock: + + # Mock successful resource responses + system_mock.return_value = mock.Mock() + manager_mock.return_value = mock.Mock() + net_adapters = chassis_mock.return_value.network_adapters + net_adapters.get_members.return_value = [] + + # Mock time progression to simulate consecutive successes + time_mock.side_effect = [0, 1, 2, 3] # 3 successful attempts + + # Should complete successfully after 3 consecutive successes + firmware._validate_resources_stability(task.node) + + # Verify all resources were checked 3 times (required success) + self.assertEqual(system_mock.call_count, 3) + self.assertEqual(manager_mock.call_count, 3) + self.assertEqual(chassis_mock.call_count, 3) + + # Verify sleep was called between validation attempts + expected_calls = [mock.call( + CONF.redfish.firmware_update_validation_interval)] * 2 + sleep_mock.assert_has_calls(expected_calls) + + def test__validate_resources_stability_timeout(self): + """Test BMC resource validation timeout when not achieved.""" + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True), \ + mock.patch.object(time, 'time', autospec=True) as time_mock, \ + mock.patch.object(time, 'sleep', autospec=True): + + # Mock system always failing + system_mock.side_effect = exception.RedfishConnectionError( + 'timeout') + + # Mock time progression to exceed timeout + time_mock.side_effect = [0, 350] # Exceeds 300 second timeout + + # Should raise RedfishError due to timeout + self.assertRaises(exception.RedfishError, + firmware._validate_resources_stability, + task.node) + + def test__validate_resources_stability_intermittent_failures(self): + """Test BMC resource validation with intermittent failures.""" + cfg.CONF.set_override('firmware_update_required_successes', 3, + 'redfish') + cfg.CONF.set_override('firmware_update_validation_interval', 10, + 'redfish') + + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True) as manager_mock, \ + mock.patch.object(redfish_utils, 'get_chassis', + autospec=True) as chassis_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock, \ + mock.patch.object(time, 'sleep', autospec=True): + + # Mock intermittent failures: success, success, fail, + # success, success, success + # When system_mock raises exception, other calls are not made + call_count = 0 + + def system_side_effect(*args): + nonlocal call_count + call_count += 1 + if call_count == 3: # Third call fails + raise exception.RedfishConnectionError('error') + return mock.Mock() + + system_mock.side_effect = system_side_effect + manager_mock.return_value = mock.Mock() + net_adapters = chassis_mock.return_value.network_adapters + net_adapters.get_members.return_value = [] + + # Mock time progression (6 attempts total) + time_mock.side_effect = [0, 10, 20, 30, 40, 50, 60] + + # Should eventually succeed after counter reset + firmware._validate_resources_stability(task.node) + + # Verify all 6 attempts were made for system + self.assertEqual(system_mock.call_count, 6) + # Manager and chassis called only 5 times (not on failed) + self.assertEqual(manager_mock.call_count, 5) + self.assertEqual(chassis_mock.call_count, 5) + + def test__validate_resources_stability_manager_failure(self): + """Test BMC resource validation when Manager resource fails.""" + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True) as manager_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock: + + # Mock system success, manager failure + system_mock.return_value = mock.Mock() + manager_mock.side_effect = exception.RedfishError( + 'manager error') + + # Mock time progression to exceed timeout + time_mock.side_effect = [0, 350] + + # Should raise RedfishError due to timeout + self.assertRaises(exception.RedfishError, + firmware._validate_resources_stability, + task.node) + + def test__validate_resources_stability_network_adapters_failure(self): + """Test validation when NetworkAdapters resource fails.""" + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True) as manager_mock, \ + mock.patch.object(redfish_utils, 'get_chassis', + autospec=True) as chassis_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock: + + # Mock system and manager success, NetworkAdapters failure + system_mock.return_value = mock.Mock() + manager_mock.return_value = mock.Mock() + chassis_mock.side_effect = exception.RedfishError( + 'chassis error') + + # Mock time progression to exceed timeout + time_mock.side_effect = [0, 350] + + # Should raise RedfishError due to timeout + self.assertRaises(exception.RedfishError, + firmware._validate_resources_stability, + task.node) + + def test__validate_resources_stability_custom_config(self): + """Test BMC resource validation with custom configuration values.""" + cfg.CONF.set_override('firmware_update_required_successes', 5, + 'redfish') + cfg.CONF.set_override('firmware_update_validation_interval', 5, + 'redfish') + + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(redfish_utils, 'get_manager', + autospec=True) as manager_mock, \ + mock.patch.object(redfish_utils, 'get_chassis', + autospec=True) as chassis_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock, \ + mock.patch.object(time, 'sleep', + autospec=True) as sleep_mock: + + # Mock successful resource responses + system_mock.return_value = mock.Mock() + manager_mock.return_value = mock.Mock() + net_adapters = chassis_mock.return_value.network_adapters + net_adapters.get_members.return_value = [] + + # Mock time progression (5 successful attempts) + time_mock.side_effect = [0, 5, 10, 15, 20, 25] + + # Should complete successfully after 5 consecutive successes + firmware._validate_resources_stability(task.node) + + # Verify all resources checked 5 times (custom required) + self.assertEqual(system_mock.call_count, 5) + self.assertEqual(manager_mock.call_count, 5) + self.assertEqual(chassis_mock.call_count, 5) + + # Verify sleep was called with custom interval + expected_calls = [mock.call(5)] * 4 # 4 sleeps between 5 + sleep_mock.assert_has_calls(expected_calls) + + def test__validate_resources_stability_badrequest_error(self): + """Test BMC resource validation handles BadRequestError correctly.""" + firmware = redfish_fw.RedfishFirmware() + + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + with mock.patch.object(redfish_utils, 'get_system', + autospec=True) as system_mock, \ + mock.patch.object(time, 'time', autospec=True) as time_mock: + + # Mock BadRequestError from sushy with proper arguments + mock_response = mock.Mock() + mock_response.status_code = 400 + system_mock.side_effect = sushy.exceptions.BadRequestError( + 'http://test', mock_response, mock_response) + + # Mock time progression to exceed timeout + time_mock.side_effect = [0, 350] + + # Should raise RedfishError due to timeout + self.assertRaises(exception.RedfishError, + firmware._validate_resources_stability, + task.node) diff --git a/releasenotes/notes/bmc-validation-after-firmware-update-3d5f8a9e76c24d1b.yaml b/releasenotes/notes/bmc-validation-after-firmware-update-3d5f8a9e76c24d1b.yaml new file mode 100644 index 0000000000..49a66b657b --- /dev/null +++ b/releasenotes/notes/bmc-validation-after-firmware-update-3d5f8a9e76c24d1b.yaml @@ -0,0 +1,24 @@ +--- +fixes: + - | + Reduces likelihood of intermittent firmware upgrade failures by adding + comprehensive BMC state validation after firmware upgrades for the + Redfish driver. After a firmware update task completes successfully, Ironic + now validates that BMC resources (System, Manager, and NetworkAdapters) are + consistently available before proceeding with subsequent operations. + + This change improves firmware update reliability by requiring a + configurable number of consecutive successful responses from the BMC + resources. The validation process helps ensure the BMC has fully stabilized + after firmware updates before marking the operation as complete. + + New configuration options in the ``[redfish]`` section: + + * ``firmware_update_required_successes`` - Number of consecutive successful + responses required to consider BMC validation successful (default: 3), + 0 disables validation + * ``firmware_update_validation_interval`` - Time in seconds to wait between + validation attempts (default: 30) + * ``firmware_update_resource_validation_timeout`` - Maximum time in seconds + to wait for BMC resources to stabilize (default: 300), setting it to 0 + disables validation.