libsa/zfs: prefer a vdev with newer txg to a stale one

Before this change, the first probed member of a pool would initialize the
vdev tree for the pool.  Now, imagine a situation where a machine has a
disk that has been removed from the pool, but the ZFS label was not
erased.  That's a typical scenario: the disk goes offline, it is replaced
with a spare, and no data changes are written to the gone disk.  Then, the
disk appears back at boot time and it is the first one to be probed by the
loader.  It has the same pool GUID as all other members, so a naive loader
would not see a conflict.  Then the disk would be used as the source of
truth to read the bootenv.

To fix that, provide vdev_free(), which allows rolling back an already
built vdev tree, so that a new one can be built from scratch.  Upon
encountering a newer configuration for an already known top-level part of
a pool, call vdev_free() and let vdev_probe() build a new one.

The change has been tested with loader_lua and userboot.so, but it should
have the same effect on the legacy boot1 loader.

Reviewed by:		tsoome, mav, imp
Differential Revision:	https://reviews.freebsd.org/D51219
This commit is contained in:
Gleb Smirnoff 2025-07-16 11:15:12 -07:00
parent cc61a0793a
commit 6dd0803ffd
2 changed files with 44 additions and 10 deletions

View file

@ -1106,7 +1106,8 @@ vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
}
static int
vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const nvlist_t *nvlist)
vdev_from_nvlist(spa_t *spa, uint64_t top_guid, uint64_t txg,
const nvlist_t *nvlist)
{
vdev_t *top_vdev, *vdev;
nvlist_t **kids = NULL;
@ -1120,6 +1121,7 @@ vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const nvlist_t *nvlist)
return (rc);
top_vdev->v_spa = spa;
top_vdev->v_top = top_vdev;
top_vdev->v_txg = txg;
vdev_insert(spa->spa_root_vdev, top_vdev);
}
@ -1163,7 +1165,7 @@ done:
static int
vdev_init_from_label(spa_t *spa, const nvlist_t *nvlist)
{
uint64_t pool_guid, top_guid;
uint64_t pool_guid, top_guid, txg;
nvlist_t *vdevs;
int rc;
@ -1171,13 +1173,15 @@ vdev_init_from_label(spa_t *spa, const nvlist_t *nvlist)
NULL, &pool_guid, NULL) ||
nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
NULL, &top_guid, NULL) ||
nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
NULL, &txg, NULL) != 0 ||
nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
NULL, &vdevs, NULL)) {
printf("ZFS: can't find vdev details\n");
return (ENOENT);
}
rc = vdev_from_nvlist(spa, top_guid, vdevs);
rc = vdev_from_nvlist(spa, top_guid, txg, vdevs);
nvlist_destroy(vdevs);
return (rc);
}
@ -1267,6 +1271,21 @@ vdev_update_from_nvlist(uint64_t top_guid, const nvlist_t *nvlist)
return (rc);
}
/*
 * Recursively free a vdev and all of its descendants, unlinking each
 * one from the global zfs_vdevs list before releasing its memory.
 *
 * Must not be called on the root vdev, which is not linked into
 * zfs_vdevs.  See comment in vdev_create().
 */
static void
vdev_free(struct vdev *vdev)
{
struct vdev *kid, *safe;
/*
 * Depth-first: free all children before the parent.  The _SAFE
 * iterator is required because each kid is freed mid-walk.
 */
STAILQ_FOREACH_SAFE(kid, &vdev->v_children, v_childlink, safe)
vdev_free(kid);
STAILQ_REMOVE(&zfs_vdevs, vdev, vdev, v_alllink);
free(vdev);
}
static int
vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist)
{
@ -1313,9 +1332,10 @@ vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist)
vdev = vdev_find(guid);
/*
* Top level vdev is missing, create it.
* XXXGL: how can this happen?
*/
if (vdev == NULL)
rc = vdev_from_nvlist(spa, guid, kids[i]);
rc = vdev_from_nvlist(spa, guid, 0, kids[i]);
else
rc = vdev_update_from_nvlist(guid, kids[i]);
if (rc != 0)
@ -2006,8 +2026,7 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv,
vdev_t *vdev;
nvlist_t *nvl;
uint64_t val;
uint64_t guid;
uint64_t pool_txg, pool_guid;
uint64_t guid, pool_guid, top_guid, txg;
const char *pool_name;
int rc, namelen;
@ -2063,7 +2082,9 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv,
}
if (nvlist_find(nvl, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
NULL, &pool_txg, NULL) != 0 ||
NULL, &txg, NULL) != 0 ||
nvlist_find(nvl, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
NULL, &top_guid, NULL) != 0 ||
nvlist_find(nvl, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
NULL, &pool_guid, NULL) != 0 ||
nvlist_find(nvl, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING,
@ -2098,9 +2119,22 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv,
nvlist_destroy(nvl);
return (ENOMEM);
}
} else {
struct vdev *kid;
STAILQ_FOREACH(kid, &spa->spa_root_vdev->v_children,
v_childlink)
if (kid->v_guid == top_guid && kid->v_txg < txg) {
printf("ZFS: pool %s vdev %s ignoring stale "
"label from txg 0x%jx, using 0x%jx@0x%jx\n",
spa->spa_name, kid->v_name,
kid->v_txg, guid, txg);
STAILQ_REMOVE(&spa->spa_root_vdev->v_children,
kid, vdev, v_childlink);
vdev_free(kid);
break;
}
}
if (pool_txg > spa->spa_txg)
spa->spa_txg = pool_txg;
/*
* Get the vdev tree and create our in-core copy of it.

View file

@ -2019,6 +2019,7 @@ typedef struct vdev {
vdev_list_t v_children; /* children of this vdev */
const char *v_name; /* vdev name */
uint64_t v_guid; /* vdev guid */
uint64_t v_txg; /* most recent transaction */
uint64_t v_id; /* index in parent */
uint64_t v_psize; /* physical device capacity */
int v_ashift; /* offset to block shift */
@ -2048,7 +2049,6 @@ typedef struct spa {
STAILQ_ENTRY(spa) spa_link; /* link in global pool list */
char *spa_name; /* pool name */
uint64_t spa_guid; /* pool guid */
uint64_t spa_txg; /* most recent transaction */
struct uberblock *spa_uberblock; /* best uberblock so far */
vdev_t *spa_root_vdev; /* toplevel vdev container */
objset_phys_t *spa_mos; /* MOS for this pool */