zfsd: don't try to fix an OFFLINE condition

If the system administrator runs "zpool offline", they are doing it for a
reason.  zfsd shouldn't treat an offline disk as a condition that
requires automatic healing.  Don't online it in response to a GEOM
event, and don't try to activate a hotspare to take over from it.

MFC after:	2 weeks
Sponsored by:	ConnectWise
This commit is contained in:
Alan Somers 2025-05-02 06:45:32 -06:00
parent 9a1b330335
commit a13ddd6210
6 changed files with 215 additions and 2 deletions

View file

@ -299,6 +299,15 @@ CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev)
PoolGUIDString().c_str(), VdevGUIDString().c_str());
return (/*consumed*/false);
}
if (VdevState() == VDEV_STATE_OFFLINE) {
/*
* OFFLINE is an administrative decision. No need for zfsd to
* do anything.
*/
syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored",
PoolGUIDString().c_str(), VdevGUIDString().c_str());
return (/*consumed*/false);
}
if (vdev != NULL
&& ( vdev->PoolGUID() == m_poolGUID
@ -401,7 +410,8 @@ CaseFile::ReEvaluate(const ZfsEvent &event)
return (/*consumed*/true);
} else if (event.Value("type") == "sysevent.fs.zfs.config_sync") {
RefreshVdevState();
if (VdevState() < VDEV_STATE_HEALTHY)
if (VdevState() < VDEV_STATE_HEALTHY &&
VdevState() != VDEV_STATE_OFFLINE)
consumed = ActivateSpare();
}
@ -694,6 +704,11 @@ CaseFile::CloseIfSolved()
switch (VdevState()) {
case VDEV_STATE_HEALTHY:
/* No need to keep cases for healthy vdevs */
case VDEV_STATE_OFFLINE:
/*
* Offline is a deliberate administrative action. zfsd
* doesn't need to do anything for this state.
*/
Close();
return (true);
case VDEV_STATE_REMOVED:
@ -710,7 +725,6 @@ CaseFile::CloseIfSolved()
*/
case VDEV_STATE_UNKNOWN:
case VDEV_STATE_CLOSED:
case VDEV_STATE_OFFLINE:
/*
* Keep open? This may not be the correct behavior,
* but it's what we've always done

View file

@ -355,6 +355,13 @@ ZfsEvent::Process() const
Vdev vdev(zpl.front(), vdevConfig);
caseFile = &CaseFile::Create(vdev);
if (caseFile->VdevState() == VDEV_STATE_OFFLINE) {
/*
* An administrator did this deliberately. It's not considered
* an error that zfsd must fix.
*/
return (false);
}
if (caseFile->ReEvaluate(*this) == false) {
stringstream msg;
int priority = LOG_INFO;

View file

@ -30,6 +30,8 @@ ${PACKAGE}FILES+= zfsd_hotspare_006_pos.ksh
${PACKAGE}FILES+= zfsd_hotspare_007_pos.ksh
${PACKAGE}FILES+= zfsd_hotspare_008_neg.ksh
${PACKAGE}FILES+= zfsd_import_001_pos.ksh
${PACKAGE}FILES+= zfsd_offline_001_neg.ksh
${PACKAGE}FILES+= zfsd_offline_002_neg.ksh
${PACKAGE}FILES+= zfsd_replace_001_pos.ksh
${PACKAGE}FILES+= zfsd_replace_002_pos.ksh
${PACKAGE}FILES+= zfsd_replace_003_pos.ksh

View file

@ -0,0 +1,64 @@
#!/usr/local/bin/ksh93 -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2025 ConnectWise. All rights reserved.
# Use is subject to license terms.
. $STF_SUITE/tests/hotspare/hotspare.kshlib
verify_runnable "global"
#
# Exit handler: print the pool status, destroy the test pool if it is
# still present, then run partition_cleanup.
#
function cleanup
{
	# Show the final pool state before anything is torn down.
	$ZPOOL status $TESTPOOL

	# Only attempt the destroy when the pool actually exists.
	poolexists $TESTPOOL && destroy_pool $TESTPOOL

	partition_cleanup
}
#
# Offline $FAULT_DISK in $TESTPOOL, wait, and then require that the disk
# is still reported as OFFLINE.
#
function verify_assertion
{
	# Seconds to wait between offlining the disk and checking its state.
	typeset -i settle_time=10

	log_must $ZPOOL offline $TESTPOOL $FAULT_DISK

	# Give zfsd time to react (it must not) before looking at the state.
	$SLEEP $settle_time

	log_must check_state $TESTPOOL "$FAULT_DISK" "OFFLINE"
}
# Test driver: for each pool layout, create a pool from three disks,
# offline one of them, and check that it stays OFFLINE.
#
# presumably log_onexit arranges for cleanup to run when the script
# exits -- helper comes from the sourced test library; confirm there.
log_onexit cleanup
log_assert "ZFSD will not automatically reactivate a disk which has been administratively offlined"
ensure_zfsd_running
# FAULT_DISK is the disk that verify_assertion offlines.
typeset FAULT_DISK=$DISK0
typeset POOLDEVS="$DISK0 $DISK1 $DISK2"
# Each keyword is passed to create_pool ahead of the device list, so the
# assertion is verified once for a mirror and once for a raidz1 pool.
set -A MY_KEYWORDS mirror raidz1
for keyword in "${MY_KEYWORDS[@]}" ; do
log_must create_pool $TESTPOOL $keyword $POOLDEVS
verify_assertion
# Destroy the pool so the next iteration can reuse the same disks.
destroy_pool "$TESTPOOL"
done

View file

@ -0,0 +1,66 @@
#!/usr/local/bin/ksh93 -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2025 ConnectWise. All rights reserved.
# Use is subject to license terms.
. $STF_SUITE/tests/hotspare/hotspare.kshlib
verify_runnable "global"
#
# Exit handler: print the pool status, destroy the test pool if it is
# still present, then run partition_cleanup.
#
function cleanup
{
	# Show the final pool state before anything is torn down.
	$ZPOOL status $TESTPOOL

	# Only attempt the destroy when the pool actually exists.
	poolexists $TESTPOOL && destroy_pool $TESTPOOL

	partition_cleanup
}
#
# Offline $FAULT_DISK in $TESTPOOL, wait, and then require that the disk
# is still OFFLINE and that $SPARE_DISK is still AVAIL.
#
function verify_assertion
{
	# Seconds to wait between offlining the disk and checking the states.
	typeset -i settle_time=10

	log_must $ZPOOL offline $TESTPOOL $FAULT_DISK

	# Give zfsd time to react (it must not) before looking at the states.
	$SLEEP $settle_time

	log_must check_state $TESTPOOL "$FAULT_DISK" "OFFLINE"
	# The spare must not have been pulled in to replace the offline disk.
	log_must check_state $TESTPOOL "$SPARE_DISK" "AVAIL"
}
# Test driver: for each pool layout, create a pool from three disks plus
# one hot spare, offline one data disk, and check that the disk stays
# OFFLINE while the spare stays AVAIL.
#
# presumably log_onexit arranges for cleanup to run when the script
# exits -- helper comes from the sourced test library; confirm there.
log_onexit cleanup
log_assert "ZFSD will not automatically activate a spare when a disk has been administratively offlined"
ensure_zfsd_running
# FAULT_DISK is the disk that verify_assertion offlines; SPARE_DISK is
# added to the pool as a spare and must remain unused.
typeset FAULT_DISK=$DISK0
typeset SPARE_DISK=$DISK3
typeset POOLDEVS="$DISK0 $DISK1 $DISK2"
# Each keyword is passed to create_pool ahead of the device list, so the
# assertion is verified once for a mirror and once for a raidz1 pool.
set -A MY_KEYWORDS mirror raidz1
for keyword in "${MY_KEYWORDS[@]}" ; do
log_must create_pool $TESTPOOL $keyword $POOLDEVS spare $SPARE_DISK
verify_assertion
# Destroy the pool so the next iteration can reuse the same disks.
destroy_pool "$TESTPOOL"
done

View file

@ -483,6 +483,64 @@ zfsd_autoreplace_003_pos_cleanup()
ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed"
}
# Declare the zfsd_offline_001_neg test case; the trailing "cleanup"
# tells ATF that a zfsd_offline_001_neg_cleanup routine exists.
atf_test_case zfsd_offline_001_neg cleanup
# Metadata for zfsd_offline_001_neg: the human-readable description and
# the programs ATF requires to be present before running the test.
zfsd_offline_001_neg_head()
{
atf_set "descr" "ZFSD will not automatically reactivate a disk which has been administratively offlined"
atf_set "require.progs" "ksh93 zpool zfs"
}
# Body of zfsd_offline_001_neg: load the configuration files, check the
# prerequisites, run setup.ksh, then run the ksh93 test script.
zfsd_offline_001_neg_body()
{
# Configuration is sourced from the shared include directory, the
# hotspare test directory and this test directory, in that order.
. $(atf_get_srcdir)/../../include/default.cfg
. $(atf_get_srcdir)/../hotspare/hotspare.cfg
. $(atf_get_srcdir)/zfsd.cfg
# presumably requires at least 3 disks in $DISKS -- helper is defined
# elsewhere; confirm its exact semantics.
verify_disk_count "$DISKS" 3
verify_zfsd_running
ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed"
ksh93 $(atf_get_srcdir)/zfsd_offline_001_neg.ksh
# On a non-zero exit status from the test script, call save_artifacts
# (helper defined elsewhere) and then fail the test case.
if [[ $? != 0 ]]; then
save_artifacts
atf_fail "Testcase failed"
fi
}
# Cleanup routine for zfsd_offline_001_neg: load the shared and zfsd
# configuration, then run cleanup.ksh, failing if it exits non-zero.
zfsd_offline_001_neg_cleanup()
{
. $(atf_get_srcdir)/../../include/default.cfg
. $(atf_get_srcdir)/zfsd.cfg
ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed"
}
# Declare the zfsd_offline_002_neg test case; the trailing "cleanup"
# tells ATF that a zfsd_offline_002_neg_cleanup routine exists.
atf_test_case zfsd_offline_002_neg cleanup
# Metadata for zfsd_offline_002_neg: the human-readable description and
# the programs ATF requires to be present before running the test.
zfsd_offline_002_neg_head()
{
atf_set "descr" "ZFSD will not automatically activate a spare when a disk has been administratively offlined"
atf_set "require.progs" "ksh93 zpool zfs"
}
# Body of zfsd_offline_002_neg: load the configuration files, check the
# prerequisites, run setup.ksh, then run the ksh93 test script.
zfsd_offline_002_neg_body()
{
# Configuration is sourced from the shared include directory, the
# hotspare test directory and this test directory, in that order.
. $(atf_get_srcdir)/../../include/default.cfg
. $(atf_get_srcdir)/../hotspare/hotspare.cfg
. $(atf_get_srcdir)/zfsd.cfg
# presumably requires at least 4 disks in $DISKS (three pool members
# plus one spare) -- helper is defined elsewhere; confirm its semantics.
verify_disk_count "$DISKS" 4
verify_zfsd_running
ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed"
ksh93 $(atf_get_srcdir)/zfsd_offline_002_neg.ksh
# On a non-zero exit status from the test script, call save_artifacts
# (helper defined elsewhere) and then fail the test case.
if [[ $? != 0 ]]; then
save_artifacts
atf_fail "Testcase failed"
fi
}
# Cleanup routine for zfsd_offline_002_neg: load the shared and zfsd
# configuration, then run cleanup.ksh, failing if it exits non-zero.
zfsd_offline_002_neg_cleanup()
{
. $(atf_get_srcdir)/../../include/default.cfg
. $(atf_get_srcdir)/zfsd.cfg
ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed"
}
atf_test_case zfsd_replace_001_pos cleanup
zfsd_replace_001_pos_head()
{
@ -676,6 +734,8 @@ atf_init_test_cases()
atf_add_test_case zfsd_autoreplace_001_neg
atf_add_test_case zfsd_autoreplace_002_pos
atf_add_test_case zfsd_autoreplace_003_pos
atf_add_test_case zfsd_offline_001_neg
atf_add_test_case zfsd_offline_002_neg
atf_add_test_case zfsd_replace_001_pos
atf_add_test_case zfsd_replace_002_pos
atf_add_test_case zfsd_replace_003_pos