Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/rebuild/rebuild_iv.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2017-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -129,8 +129,13 @@ rebuild_iv_ent_update(struct ds_iv_entry *entry, struct ds_iv_key *key,
/* Gathering the rebuild status here */
rgt = rebuild_global_pool_tracker_lookup(src_iv->riv_pool_uuid,
src_iv->riv_ver, src_iv->riv_rebuild_gen);
if (rgt == NULL)
if (rgt == NULL) {
D_WARN(DF_UUID " rgt not found ver %d gen %u from rank %d term " DF_U64
" on rank %d, possibly stale IV after PS leader switch\n",
DP_UUID(src_iv->riv_pool_uuid), src_iv->riv_ver, src_iv->riv_rebuild_gen,
src_iv->riv_rank, src_iv->riv_leader_term, rank);
D_GOTO(out, rc);
}

if (rgt->rgt_leader_term == src_iv->riv_leader_term) {
/* update the rebuild global status */
Expand Down
47 changes: 44 additions & 3 deletions src/rebuild/srv.c
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,38 @@ rpt_stale(struct rebuild_tgt_pool_tracker *rpt)
return !found;
}

/* Verdicts produced by rpt_should_abort() for a target-side rebuild tracker. */
enum {
	/* No abort condition matched. */
	RPT_ABORT_NONE = 0,
	/* Reclaim-type tracker whose leader term is older than the IV master term. */
	RPT_ABORT_ORPHANED_RECLAIM,
	/* Pull phase is done and rpt_stale() reports the tracker as stale. */
	RPT_ABORT_GENERAL_STALE,
};

/**
 * Decide whether a target-side rebuild tracker should be abandoned.
 *
 * Two conditions are checked, in this order:
 *  - the tracker's leader term is older than the IV namespace master term,
 *    its scan is done, and its operation is RB_OP_RECLAIM or
 *    RB_OP_FAIL_RECLAIM;
 *  - the IV reports the pull as done and rpt_stale() says the tracker is
 *    stale.
 *
 * Each matched condition is logged with D_ERROR.
 *
 * \param[in] rpt	Rebuild target pool tracker being examined.
 * \param[in] ns	IV namespace supplying the current master term.
 * \param[in] iv	Rebuild IV whose riv_pull_done flag is consulted.
 *
 * \return		RPT_ABORT_ORPHANED_RECLAIM, RPT_ABORT_GENERAL_STALE, or
 *			RPT_ABORT_NONE when neither condition holds.
 */
static int
rpt_should_abort(struct rebuild_tgt_pool_tracker *rpt, struct ds_iv_ns *ns, struct rebuild_iv *iv)
{
	int verdict       = RPT_ABORT_NONE;
	int term_outdated = rpt->rt_leader_term < ns->iv_master_term;

	/*
	 * Orphaned reclaim: after a PS leader switch the reclaim tasks are not
	 * regenerated (UPIN not in DOWN/UP/DRAIN), so the new leader holds no
	 * matching rgt for this rpt and its IV updates are silently dropped.
	 */
	if (term_outdated && rpt->rt_scan_done &&
	    (rpt->rt_rebuild_op == RB_OP_FAIL_RECLAIM || rpt->rt_rebuild_op == RB_OP_RECLAIM)) {
		D_ERROR(DF_UUID " ver %d gen %u op %s: stale term " DF_U64 " < " DF_U64
				", abort orphaned rpt\n",
			DP_UUID(rpt->rt_pool_uuid), rpt->rt_rebuild_ver, rpt->rt_rebuild_gen,
			RB_OP_STR(rpt->rt_rebuild_op), rpt->rt_leader_term, ns->iv_master_term);
		verdict = RPT_ABORT_ORPHANED_RECLAIM;
	} else if (iv->riv_pull_done && rpt_stale(rpt)) {
		/* rpt_stale() is only consulted once the pull phase has finished. */
		D_ERROR(DF_RB " is stale, exit the ULT.\n", DP_RB_RPT(rpt));
		verdict = RPT_ABORT_GENERAL_STALE;
	}

	return verdict;
}

struct rebuild_pool_tls *
rebuild_pool_tls_lookup(uuid_t pool_uuid, unsigned int ver, uint32_t gen)
{
Expand Down Expand Up @@ -1082,6 +1114,17 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op,
ABT_rwlock_unlock(pool->sp_lock);
map_ranks_fini(&rank_list);

/* Abort orphaned rgt if the node is no longer the leader.
* After PS leader switch, this rgt becomes orphaned and should be aborted.
*/
if (rgt->rgt_leader_term < pool->sp_iv_ns->iv_master_term &&
(rgt->rgt_opc == RB_OP_FAIL_RECLAIM || rgt->rgt_opc == RB_OP_RECLAIM)) {
D_INFO(DF_RB " op %s: stale term " DF_U64 " < " DF_U64", abort orphaned rgt\n",
DP_RB_RGT(rgt), RB_OP_STR(rgt->rgt_opc),
rgt->rgt_leader_term, pool->sp_iv_ns->iv_master_term);
rebuild_abort = true;
}

if (rebuild_abort) {
rgt->rgt_abort = 1;
rgt->rgt_status.rs_errno = -DER_STALE;
Expand Down Expand Up @@ -3030,10 +3073,8 @@ rebuild_tgt_status_check_ult(void *arg)
break;

sched_req_sleep(rpt->rt_ult, RBLD_CHECK_INTV);
if (iv.riv_pull_done && rpt_stale(rpt)) {
D_ERROR(DF_RB " is stale, exit the ULT.\n", DP_RB_RPT(rpt));
if (rpt_should_abort(rpt, rpt->rt_pool->sp_iv_ns, &iv) != RPT_ABORT_NONE)
break;
}
}

sched_req_put(rpt->rt_ult);
Expand Down
Loading