struct obd_device *obd = lut->lut_obd;
int rc = 0;
struct target_recovery_data *trd = &obd->obd_recovery_data;
+ int index;
memset(trd, 0, sizeof(*trd));
init_completion(&trd->trd_starting);
init_completion(&trd->trd_finishing);
trd->trd_recovery_handler = handler;
+ rc = server_name2index(obd->obd_name, &index, NULL);
+ if (rc < 0)
+ return rc;
+
if (!IS_ERR(kthread_run(target_recovery_thread,
- lut, "tgt_recov"))) {
+ lut, "tgt_recover_%d", index))) {
wait_for_completion(&trd->trd_starting);
LASSERT(obd->obd_recovering != 0);
} else {
GOTO(out, rc);
}
+ /* If OSP wants to cancel a cross-MDT lock, do not block it
+ * during recovery; otherwise the lock will not be released,
+ * and if the remote target is also in recovery and also needs
+ * this lock, it might cause a deadlock. */
+ if (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS &&
+ exp->exp_obd->obd_lu_dev != NULL &&
+ exp->exp_obd->obd_lu_dev->ld_site != NULL) {
+ struct lu_device *top_dev;
+
+ top_dev = exp->exp_obd->obd_lu_dev->ld_site->ls_top_dev;
+ if (top_dev != NULL &&
+ top_dev->ld_obd->obd_recovering)
+ req->rq_allow_replay = 1;
+ }
+
req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
ptlrpc_at_set_req_timeout(req);
struct ldlm_res_id *res_id;
struct dt_device *dt_dev = lu2dt_dev(dt->do_lu.lo_dev);
struct osp_device *osp = dt2osp_dev(dt_dev);
+ struct lu_device *top_device;
struct ptlrpc_request *req;
int rc = 0;
__u64 flags = 0;
if (IS_ERR(req))
RETURN(PTR_ERR(req));
+ /* During recovery, let OSP send the enqueue without
+ * checking the recovery status: the other target may be
+ * recovering at the same time, and if we wait here for
+ * the import to be recovered, it might cause a
+ * deadlock. */
+ top_device = dt_dev->dd_lu_dev.ld_site->ls_top_dev;
+ if (top_device->ld_obd->obd_recovering)
+ req->rq_allow_replay = 1;
+
rc = ldlm_cli_enqueue(osp->opd_exp, &req, einfo, res_id,
(const union ldlm_policy_data *)policy,
&flags, NULL, 0, LVB_T_NONE, lh, 0);
struct lu_device *dev = it->ooi_obj->do_lu.lo_dev;
struct osp_device *osp = lu2osp_dev(dev);
struct page **pages;
+ struct lu_device *top_device;
struct ptlrpc_request *req = NULL;
struct ptlrpc_bulk_desc *desc;
struct idx_info *ii;
RETURN(rc);
}
+ /* Allow this request during recovery; otherwise, if
+ * the remote target is also in recovery, it might
+ * cause a deadlock. */
+ top_device = dev->ld_site->ls_top_dev;
+ if (top_device->ld_obd->obd_recovering)
+ req->rq_allow_replay = 1;
+
req->rq_request_portal = OUT_PORTAL;
ii = req_capsule_client_get(&req->rq_pill, &RMF_IDX_INFO);
memset(ii, 0, sizeof(*ii));
}
run_test 116b "large update log slave MDT recovery"
+test_117() {
+ [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
+ ([ $FAILURE_MODE == "HARD" ] &&
+ [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
+ skip "MDTs needs to be on diff hosts for HARD fail mode" &&
+ return 0
+ local index
+ local mds_indexs
+
+ mkdir -p $DIR/$tdir
+ $LFS setdirstripe -i0 -c$MDSCOUNT $DIR/$tdir/remote_dir
+ $LFS setdirstripe -i1 -c$MDSCOUNT $DIR/$tdir/remote_dir_1
+ sleep 2
+
+ # Let's set rdonly on all MDTs, so client will send
+ # replay requests on all MDTs and replay these requests
+ # at the same time. This test will verify the recovery
+ # will not be deadlock in this case, LU-7531.
+ for ((index = 0; index < $((MDSCOUNT)); index++)); do
+ replay_barrier mds$((index + 1))
+ if [ -z $mds_indexs ]; then
+ mds_indexs="${mds_indexs}mds$((index+1))"
+ else
+ mds_indexs="${mds_indexs},mds$((index+1))"
+ fi
+ done
+
+ rm -rf $DIR/$tdir/remote_dir
+ rm -rf $DIR/$tdir/remote_dir_1
+
+ fail $mds_indexs
+
+ rm -rf $DIR/$tdir || error "rmdir failed"
+}
+run_test 117 "DNE: cross MDT unlink, fail MDT1 and MDT2"
complete $SECONDS
check_and_cleanup_lustre