diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 147e4bb3fc1..3fbaae93810 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -88,13 +88,14 @@ struct ds_pool { */ uuid_t sp_srv_cont_hdl; uuid_t sp_srv_pool_hdl; - uint32_t sp_stopping : 1, sp_cr_checked : 1, sp_immutable : 1, sp_need_discard : 1, - sp_disable_rebuild : 1, sp_disable_dtx_resync : 1, sp_incr_reint : 1; + uint32_t sp_stopping : 1, sp_cr_checked : 1, sp_immutable : 1, sp_disable_rebuild : 1, + sp_disable_dtx_resync : 1, sp_incr_reint : 1; /* pool_uuid + map version + leader term + rebuild generation define a * rebuild job. */ uint32_t sp_rebuild_gen; ATOMIC int sp_rebuilding; + ATOMIC int sp_discarding; /** * someone has already messaged this pool to for rebuild scan, * NB: all xstreams can do lockless-write on it but it's OK @@ -191,8 +192,7 @@ struct ds_pool_child { int spc_ref; ABT_eventual spc_ref_eventual; - uint64_t spc_discard_done:1, - spc_no_storage:1; /* The pool shard has no storage. */ + uint64_t spc_no_storage : 1; /* The pool shard has no storage. */ uint32_t spc_reint_mode; uint32_t *spc_state; /* Pointer to ds_pool->sp_states[i] */ diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 2ecd52f1a0f..c41c8f0731e 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2450,12 +2450,14 @@ static int obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc, uint32_t rpc_map_ver, uint32_t flags) { + struct ds_pool *pool = child->sc_pool->spc_pool; + if (opc == DAOS_OBJ_RPC_ENUMERATE && flags & ORF_FOR_MIGRATION) { /* EC aggregation is still inflight, rebuild should wait until it's paused */ if (ds_cont_child_ec_aggregating(child)) { D_ERROR(DF_CONT " ec aggregate still active, rebuilding %d\n", - DP_CONT(child->sc_pool->spc_uuid, child->sc_uuid), - atomic_load(&child->sc_pool->spc_pool->sp_rebuilding)); + DP_CONT(pool->sp_uuid, child->sc_uuid), + atomic_load(&pool->sp_rebuilding)); return -DER_UPDATE_AGAIN; } } @@ -2463,7 +2465,7 @@ obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc, if (!obj_is_modification_opc(opc) && (opc != DAOS_OBJ_RPC_CPD || flags & ORF_CPD_RDONLY)) return 0; - if (atomic_load(&child->sc_pool->spc_pool->sp_rebuilding)) { + if (atomic_load(&pool->sp_rebuilding)) { uint32_t version; ds_rebuild_running_query(child->sc_pool_uuid, RB_OP_REBUILD, @@ -2480,10 +2482,8 @@ obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc, * vos discard to finish, which otherwise might discard these new in-flight * I/O update. */ - if ((flags & ORF_REINTEGRATING_IO) && - (child->sc_pool->spc_pool->sp_need_discard && - child->sc_pool->spc_discard_done == 0)) { - D_ERROR("reintegrating "DF_UUID" retry.\n", DP_UUID(child->sc_pool->spc_uuid)); + if ((flags & ORF_REINTEGRATING_IO) && atomic_load(&pool->sp_discarding) > 0) { + D_ERROR("reintegrating " DF_UUID " retry.\n", DP_UUID(pool->sp_uuid)); return -DER_UPDATE_AGAIN; } diff --git a/src/object/srv_obj_migrate.c b/src/object/srv_obj_migrate.c index 2ccda79e4c2..4ff217a823a 100644 --- a/src/object/srv_obj_migrate.c +++ b/src/object/srv_obj_migrate.c @@ -3230,6 +3230,7 @@ migrate_obj_ult(void *data) { struct iter_obj_arg *arg = data; struct migrate_pool_tls *tls = NULL; + struct ds_pool *pool; daos_epoch_range_t epr; daos_epoch_t stable_epoch = 0; daos_handle_t coh = DAOS_HDL_INVAL; @@ -3249,20 +3250,22 @@ migrate_obj_ult(void *data) * discard, or discard has been done. spc_discard_done means * discarding has been done in the current VOS target. */ - if (tls->mpt_pool->spc_pool->sp_need_discard) { - while(!tls->mpt_pool->spc_discard_done) { - D_DEBUG(DB_REBUILD, DF_RB ": wait for discard to finish.\n", - DP_RB_MPT(tls)); - dss_sleep(2 * 1000); - if (tls->mpt_fini) - D_GOTO(free_notls, rc); - } - if (tls->mpt_pool->spc_pool->sp_discard_status) { - rc = tls->mpt_pool->spc_pool->sp_discard_status; + pool = tls->mpt_pool->spc_pool; + while (atomic_load(&pool->sp_discarding) != 0) { + D_DEBUG(DB_REBUILD, DF_RB ": wait for discard to finish.\n", DP_RB_MPT(tls)); + dss_sleep(2 * 1000); + if (tls->mpt_fini) + D_GOTO(free_notls, rc); + + ABT_mutex_lock(pool->sp_mutex); + if (pool->sp_discard_status) { + rc = pool->sp_discard_status; + ABT_mutex_unlock(pool->sp_mutex); D_DEBUG(DB_REBUILD, DF_RB ": discard failure: " DF_RC "\n", DP_RB_MPT(tls), DP_RC(rc)); D_GOTO(out, rc); } + ABT_mutex_unlock(pool->sp_mutex); } if (tls->mpt_reintegrating) { diff --git a/src/placement/jump_map.c b/src/placement/jump_map.c index 56f0643555e..a8c6d52153c 100644 --- a/src/placement/jump_map.c +++ b/src/placement/jump_map.c @@ -1,7 +1,7 @@ /** * * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -734,6 +734,13 @@ get_object_layout(struct pl_jump_map *jmap, uint32_t layout_ver, struct pl_obj_l setbit(dom_cur_grp_real, domain - root); if (pool_target_down(target)) layout->ol_shards[k].po_rebuilding = 1; + + if (pool_target_is_down2up(target)) { + if (gen_mode == PRE_REBUILD) + layout->ol_shards[k].po_rebuilding = 1; + else + layout->ol_shards[k].po_reintegrating = 1; + } } if (is_extending != NULL && pool_target_is_up_or_drain(target)) diff --git a/src/placement/pl_map_common.c b/src/placement/pl_map_common.c index 5afe0691a37..52e9ee93371 100644 --- a/src/placement/pl_map_common.c +++ b/src/placement/pl_map_common.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -255,13 +256,7 @@ is_comp_avaible(struct pool_component *comp, uint32_t allow_version, status = PO_COMP_ST_UPIN; } else if (status == PO_COMP_ST_UP) { if (comp->co_flags & PO_COMPF_DOWN2UP) { - /* PO_COMP_ST_UP status with PO_COMPF_DOWN2UP flag - * is the case of delay_rebuild exclude+reint. - * Cannot mark it as UPIN to avoid it be used for - * rebuild enumerate/fetch, as the data will be - * discarded in reintegrate. - */ - /* status = PO_COMP_ST_UPIN; */ + status = PO_COMP_ST_UPIN; } else { if (comp->co_fseq <= 1) status = PO_COMP_ST_NEW; @@ -394,9 +389,14 @@ determine_valid_spares(struct pool_target *spare_tgt, struct daos_obj_md *md, if (spare_avail) { /* The selected spare target is up and ready */ l_shard->po_target = spare_tgt->ta_comp.co_id; - l_shard->po_fseq = f_shard->fs_fseq; - l_shard->po_rank = spare_tgt->ta_comp.co_rank; - l_shard->po_index = spare_tgt->ta_comp.co_index; + l_shard->po_fseq = f_shard->fs_fseq; + l_shard->po_rank = spare_tgt->ta_comp.co_rank; + l_shard->po_index = spare_tgt->ta_comp.co_index; + + if (pool_target_is_down2up(spare_tgt)) + f_shard->fs_down2up = 1; + else + f_shard->fs_down2up = 0; /* * Mark the shard as 'rebuilding' so that read will skip this shard. @@ -406,6 +406,10 @@ determine_valid_spares(struct pool_target *spare_tgt, struct daos_obj_md *md, f_shard->fs_status == PO_COMP_ST_DRAIN || f_shard->fs_down2up || pool_target_down(spare_tgt)) l_shard->po_rebuilding = 1; + + if (f_shard->fs_down2up && gen_mode != PRE_REBUILD) + l_shard->po_reintegrating = 1; + } else { l_shard->po_shard = -1; l_shard->po_target = -1; diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index b7fe7dfe2c8..7140d18b687 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -1,7 +1,7 @@ /* * (C) Copyright 2016-2025 Intel Corporation. * (C) Copyright 2025 Google LLC - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -2438,8 +2438,9 @@ struct tgt_discard_arg { }; struct child_discard_arg { - struct tgt_discard_arg *tgt_discard; - uuid_t cont_uuid; + uint64_t ca_epoch; + uuid_t ca_po_uuid; + uuid_t ca_co_uuid; }; static struct tgt_discard_arg* @@ -2488,7 +2489,7 @@ obj_discard_cb(daos_handle_t ch, vos_iter_entry_t *ent, 1 << 10 /* max (ms) */); D_ASSERTF(rc == 0, "d_backoff_seq_init: "DF_RC"\n", DP_RC(rc)); - epr.epr_hi = arg->tgt_discard->epoch; + epr.epr_hi = arg->ca_epoch; epr.epr_lo = 0; do { /* Inform the iterator and delete the object */ @@ -2505,9 +2506,8 @@ obj_discard_cb(daos_handle_t ch, vos_iter_entry_t *ent, d_backoff_seq_fini(&backoff_seq); if (rc != 0) - D_ERROR("discard object pool/object "DF_UUID"/"DF_UOID" rc: "DF_RC"\n", - DP_UUID(arg->tgt_discard->pool_uuid), DP_UOID(ent->ie_oid), - DP_RC(rc)); + D_ERROR("discard object pool/object " DF_UUID "/" DF_UOID " rc: " DF_RC "\n", + DP_UUID(arg->ca_po_uuid), DP_UOID(ent->ie_oid), DP_RC(rc)); return rc; } @@ -2526,14 +2526,12 @@ cont_discard_cb(daos_handle_t ih, vos_iter_entry_t *entry, int rc; D_ASSERT(type == VOS_ITER_COUUID); - if (uuid_compare(arg->cont_uuid, entry->ie_couuid) == 0) { - D_DEBUG(DB_REBUILD, DF_UUID" already discard\n", - DP_UUID(arg->cont_uuid)); + if (uuid_compare(arg->ca_co_uuid, entry->ie_couuid) == 0) { + D_DEBUG(DB_REBUILD, DF_UUID " already discard\n", DP_UUID(arg->ca_co_uuid)); return 0; } - rc = ds_cont_child_lookup(arg->tgt_discard->pool_uuid, entry->ie_couuid, - &cont); + rc = ds_cont_child_lookup(arg->ca_po_uuid, entry->ie_couuid, &cont); if (rc != DER_SUCCESS) { D_ERROR("Lookup container '"DF_UUIDF"' failed: "DF_RC"\n", DP_UUID(entry->ie_couuid), DP_RC(rc)); @@ -2553,8 +2551,8 @@ cont_discard_cb(daos_handle_t ih, vos_iter_entry_t *entry, param.ip_hdl = coh; param.ip_epr.epr_lo = 0; - param.ip_epr.epr_hi = arg->tgt_discard->epoch; - uuid_copy(arg->cont_uuid, entry->ie_couuid); + param.ip_epr.epr_hi = arg->ca_epoch; + uuid_copy(arg->ca_co_uuid, entry->ie_couuid); do { /* Inform the iterator and delete the object */ *acts |= VOS_ITER_CB_DELETE; @@ -2570,9 +2568,8 @@ cont_discard_cb(daos_handle_t ih, vos_iter_entry_t *entry, d_backoff_seq_fini(&backoff_seq); vos_cont_close(coh); - D_DEBUG(DB_TRACE, DF_UUID"/"DF_UUID" discard cont done: "DF_RC"\n", - DP_UUID(arg->tgt_discard->pool_uuid), DP_UUID(entry->ie_couuid), - DP_RC(rc)); + D_DEBUG(DB_TRACE, DF_UUID "/" DF_UUID " discard cont done: " DF_RC "\n", + DP_UUID(arg->ca_po_uuid), DP_UUID(entry->ie_couuid), DP_RC(rc)); put: ds_cont_child_put(cont); @@ -2589,28 +2586,27 @@ cont_discard_cb(daos_handle_t ih, vos_iter_entry_t *entry, static int pool_child_discard(void *data) { - struct tgt_discard_arg *arg = data; + struct tgt_discard_arg *arg = data; struct child_discard_arg cont_arg; - struct ds_pool_child *child; + struct ds_pool_child *child; vos_iter_param_t param = { 0 }; - struct vos_iter_anchors anchor = { 0 }; - struct pool_target_addr addr; - uint32_t myrank; + struct vos_iter_anchors anchor = {0}; + struct pool_target_addr addr; + uint32_t myrank; struct d_backoff_seq backoff_seq; int rc; - myrank = dss_self_rank(); - addr.pta_rank = myrank; + myrank = dss_self_rank(); + addr.pta_rank = myrank; addr.pta_target = dss_get_module_info()->dmi_tgt_id; if (!pool_target_addr_found(&arg->tgt_list, &addr)) { - D_DEBUG(DB_TRACE, "skip discard %u/%u.\n", addr.pta_rank, - addr.pta_target); + D_INFO(DF_UUID "discard skipped rank/target=%u/%u.\n", DP_UUID(arg->pool_uuid), + addr.pta_rank, addr.pta_target); return 0; } - D_DEBUG(DB_MD, DF_UUID" discard %u/%u\n", DP_UUID(arg->pool_uuid), - myrank, addr.pta_target); - + D_INFO(DF_UUID " discard started rank/target=%u/%d\n", DP_UUID(arg->pool_uuid), + addr.pta_rank, addr.pta_target); /** * When a faulty device is replaced with a new one using the * “dmg storage replace nvme” command, the reintegration of @@ -2626,33 +2622,33 @@ pool_child_discard(void *data) */ child = ds_pool_child_lookup(arg->pool_uuid); if (child == NULL) - return -DER_AGAIN; + D_GOTO(out, rc = -DER_AGAIN); - param.ip_hdl = child->spc_hdl; + cont_arg.ca_epoch = arg->epoch; + uuid_copy(cont_arg.ca_po_uuid, arg->pool_uuid); rc = d_backoff_seq_init(&backoff_seq, 0 /* nzeros */, 16 /* factor */, 8 /* next (ms) */, 1 << 10 /* max (ms) */); - D_ASSERTF(rc == 0, "d_backoff_seq_init: "DF_RC"\n", DP_RC(rc)); + D_ASSERTF(rc == 0, "d_backoff_seq_init: " DF_RC "\n", DP_RC(rc)); - cont_arg.tgt_discard = arg; - child->spc_discard_done = 0; + param.ip_hdl = child->spc_hdl; do { - rc = vos_iterate(¶m, VOS_ITER_COUUID, false, &anchor, - cont_discard_cb, NULL, &cont_arg, NULL); + rc = vos_iterate(¶m, VOS_ITER_COUUID, false, &anchor, cont_discard_cb, NULL, + &cont_arg, NULL); if (rc != -DER_BUSY && rc != -DER_INPROGRESS) break; - D_DEBUG(DB_REBUILD, "retry by "DF_RC"/"DF_UUID"\n", - DP_RC(rc), DP_UUID(arg->pool_uuid)); + D_DEBUG(DB_REBUILD, "retry by " DF_RC "/" DF_UUID "\n", DP_RC(rc), + DP_UUID(arg->pool_uuid)); dss_sleep(d_backoff_seq_next(&backoff_seq)); } while (1); - child->spc_discard_done = 1; - d_backoff_seq_fini(&backoff_seq); - - ds_pool_child_put(child); - +out: + D_INFO(DF_UUID " discard completed rank/target=%u/%d, rc=%d\n", DP_UUID(arg->pool_uuid), + addr.pta_rank, addr.pta_target, rc); + if (child) + ds_pool_child_put(child); return rc; } @@ -2743,13 +2739,14 @@ ds_pool_task_collective(uuid_t pool_uuid, uint32_t ex_status, int (*coll_func)(v } /* Discard the objects by epoch in this pool */ -static int +static void ds_pool_tgt_discard_ult(void *data) { - struct ds_pool *pool; - struct tgt_discard_arg *arg = data; - uint32_t ex_status; - int rc; + struct ds_pool *pool; + struct tgt_discard_arg *arg = data; + uint32_t ex_status; + int discarding; + int rc; /* If discard failed, let's still go ahead, since reintegration might * still succeed, though it might leave some garbage on the reintegration @@ -2757,20 +2754,23 @@ ds_pool_tgt_discard_ult(void *data) */ rc = ds_pool_lookup(arg->pool_uuid, &pool); if (pool == NULL) { - D_INFO(DF_UUID" can not be found: %d\n", DP_UUID(arg->pool_uuid), rc); + D_INFO(DF_UUID " can not be found: %d\n", DP_UUID(arg->pool_uuid), rc); D_GOTO(free, rc = 0); } ex_status = PO_COMP_ST_UP | PO_COMP_ST_UPIN | PO_COMP_ST_DRAIN; - ds_pool_thread_collective(arg->pool_uuid, ex_status, pool_child_discard, arg, - DSS_ULT_DEEP_STACK); + rc = ds_pool_thread_collective(arg->pool_uuid, ex_status, pool_child_discard, arg, + DSS_ULT_DEEP_STACK); - pool->sp_need_discard = 0; + ABT_mutex_lock(pool->sp_mutex); pool->sp_discard_status = rc; + discarding = atomic_fetch_sub(&pool->sp_discarding, 1); + D_ASSERT(discarding == 1); + ABT_mutex_unlock(pool->sp_mutex); + ds_pool_put(pool); free: tgt_discard_arg_free(arg); - return rc; } void @@ -2781,6 +2781,7 @@ ds_pool_tgt_discard_handler(crt_rpc_t *rpc) struct pool_target_addr_list pta_list; struct tgt_discard_arg *arg = NULL; struct ds_pool *pool; + uint32_t discarding = 0; int rc; pta_list.pta_number = in->ptdi_addrs.ca_count; @@ -2801,19 +2802,41 @@ ds_pool_tgt_discard_handler(crt_rpc_t *rpc) D_GOTO(out, rc = 0); } - pool->sp_need_discard = 1; + if (ds_pool_is_rebuilding(pool)) { + D_INFO(DF_UUID " is already being reintegrated!\n", DP_UUID(arg->pool_uuid)); + D_GOTO(out_put, rc = -DER_BUSY); + } + + ABT_mutex_lock(pool->sp_mutex); + if (!atomic_compare_exchange(&pool->sp_discarding, discarding, 1)) { + D_INFO(DF_UUID " XXX: discard(%d) is already in progress\n", + DP_UUID(arg->pool_uuid), discarding); + ABT_mutex_unlock(pool->sp_mutex); + D_GOTO(out_put, rc = -DER_BUSY); + } pool->sp_discard_status = 0; - rc = dss_ult_execute(ds_pool_tgt_discard_ult, arg, NULL, NULL, DSS_XS_SYS, 0, 0); - if (rc == 0) - rc = ds_iv_ns_reint_prep(pool->sp_iv_ns); /* cleanup IV cache */ + ABT_mutex_unlock(pool->sp_mutex); + D_INFO(DF_UUID " discard is scheduled\n", DP_UUID(arg->pool_uuid)); + + rc = dss_ult_create(ds_pool_tgt_discard_ult, arg, DSS_XS_SYS, 0, 0, NULL); + if (rc == 0) { + arg = NULL; /* taken over by ds_pool_tgt_discard_ult */ + rc = ds_iv_ns_reint_prep(pool->sp_iv_ns); /* cleanup IV cache */ + } else { + ABT_mutex_lock(pool->sp_mutex); + pool->sp_discard_status = rc; + atomic_fetch_sub(&pool->sp_discarding, 1); + ABT_mutex_unlock(pool->sp_mutex); + } +out_put: ds_pool_put(pool); out: out->ptdo_rc = rc; D_DEBUG(DB_MD, DF_UUID": replying rpc "DF_RC"\n", DP_UUID(in->ptdi_uuid), DP_RC(rc)); crt_reply_send(rpc); - if (rc != 0 && arg != NULL) + if (arg != NULL) tgt_discard_arg_free(arg); } diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 91185bb5cc3..e95de55dbfd 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -2047,16 +2047,18 @@ rebuild_task_ult(void *arg) uint32_t map_dist_ver = 0; struct rebuild_global_pool_tracker *rgt = NULL; d_rank_t myrank; - uint64_t cur_ts = 0; + // uint64_t cur_ts = 0; uint32_t obj_reclaim_ver = 0; int rc; +#if 0 cur_ts = daos_gettime_coarse(); D_ASSERT(task->dst_schedule_time != (uint64_t)-1); if (cur_ts < task->dst_schedule_time) { D_INFO("rebuild task sleep " DF_U64 " second\n", task->dst_schedule_time - cur_ts); dss_sleep((task->dst_schedule_time - cur_ts) * 1000); } +#endif rc = ds_pool_lookup(task->dst_pool_uuid, &pool); if (pool == NULL) { @@ -2258,6 +2260,8 @@ rebuild_ults(void *arg) while (!d_list_empty(&rebuild_gst.rg_queue_list) || !d_list_empty(&rebuild_gst.rg_running_list)) { + uint64_t now; + if (rebuild_gst.rg_abort) { D_INFO("abort rebuilds\n"); break; @@ -2271,6 +2275,7 @@ rebuild_ults(void *arg) continue; } + now = daos_gettime_coarse(); task = d_list_entry(rebuild_gst.rg_queue_list.next, struct rebuild_task, dst_list); while (&rebuild_gst.rg_queue_list != &task->dst_list) { /* If a pool is already handling a rebuild operation, @@ -2278,6 +2283,7 @@ rebuild_ults(void *arg) * one completes */ if (pool_is_rebuilding(task->dst_pool_uuid) || + task->dst_schedule_time > now || task->dst_schedule_time == (uint64_t)-1) { struct rebuild_task *head_task = task; @@ -2310,7 +2316,7 @@ rebuild_ults(void *arg) } } - dss_sleep(0); + dss_sleep(100); } /* If there are still rebuild task in queue and running list, then