Skip to content

Commit c474fe4

Browse files
committed
DAOS-18368 rebuild: fix a REINT bug
The rebuild leader could treat a reintegrating (REINT) engine as completed before its pull phase was DONE. Signed-off-by: Xuezhao Liu <xuezhao.liu@hpe.com>
1 parent 4405393 commit c474fe4

File tree

2 files changed

+20
-11
lines changed

2 files changed

+20
-11
lines changed

src/object/srv_obj_migrate.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3070,8 +3070,8 @@ migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls,
30703070

30713071
/* Each object enumeration RPC will return at least one OID */
30723072
if (num < minimum_nr && (enum_flags & DIOF_TO_SPEC_GROUP)) {
3073-
D_DEBUG(DB_REBUILD, DF_RB ": enumeration buffer %u empty" DF_UOID "\n",
3074-
DP_RB_MPT(tls), num, DP_UOID(arg->oid));
3073+
D_INFO(DF_RB ": enumeration buffer %u empty" DF_UOID, DP_RB_MPT(tls), num,
3074+
DP_UOID(arg->oid));
30753075
break;
30763076
}
30773077

src/rebuild/srv.c

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/**
22
* (C) Copyright 2016-2024 Intel Corporation.
3-
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
3+
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
44
*
55
* SPDX-License-Identifier: BSD-2-Clause-Patent
66
*/
@@ -982,26 +982,35 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op,
982982
if (rgt->rgt_opc == RB_OP_REBUILD) {
983983
if (dom->do_comp.co_status == PO_COMP_ST_UP) {
984984
if (dom->do_comp.co_in_ver > rgt->rgt_rebuild_ver) {
985-
D_INFO(DF_RB ": cancel rebuild co_in_ver=%u\n",
986-
DP_RB_RGT(rgt), dom->do_comp.co_in_ver);
985+
D_INFO(DF_RB ": cancel rebuild due to new REINT, "
986+
"co_rank %d, co_in_ver %u\n",
987+
DP_RB_RGT(rgt), dom->do_comp.co_rank,
988+
dom->do_comp.co_in_ver);
987989
rebuild_abort = true;
988990
break;
989991
} else {
990992
continue;
991993
}
992994
} else if (dom->do_comp.co_status == PO_COMP_ST_DOWN) {
993995
if (dom->do_comp.co_fseq > rgt->rgt_rebuild_ver) {
994-
D_INFO(DF_RB ": cancel rebuild co_fseq=%u\n",
995-
DP_RB_RGT(rgt), dom->do_comp.co_fseq);
996+
D_INFO(DF_RB ": cancel rebuild due to new DOWN, "
997+
"co_rank %d, co_fseq %u\n",
998+
DP_RB_RGT(rgt), dom->do_comp.co_rank,
999+
dom->do_comp.co_fseq);
9961000
rebuild_abort = true;
9971001
break;
9981002
}
9991003
}
10001004
}
1001-
D_INFO(DF_RB " exclude rank %d/%x.\n", DP_RB_RGT(rgt), dom->do_comp.co_rank,
1002-
dom->do_comp.co_status);
1003-
rebuild_leader_set_status(rgt, dom->do_comp.co_rank,
1004-
-1, SCAN_DONE | PULL_DONE);
1005+
D_INFO(DF_RB " rank %d, status 0x%x.\n", DP_RB_RGT(rgt),
1006+
dom->do_comp.co_rank, dom->do_comp.co_status);
1007+
1008+
/* for PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT | PO_COMP_ST_NEW ranks
1009+
* set the completion as no progress/completion will be reported from them.
1010+
*/
1011+
if (dom->do_comp.co_status != PO_COMP_ST_UP) /* compare the component status; co_rank is a rank id, not a state */
1012+
rebuild_leader_set_status(rgt, dom->do_comp.co_rank, -1,
1013+
SCAN_DONE | PULL_DONE);
10051014
}
10061015
ABT_rwlock_unlock(pool->sp_lock);
10071016
map_ranks_fini(&excluded);

0 commit comments

Comments
 (0)