From 9e338ee25609d9bbcc359c37012e6868dd6be5d3 Mon Sep 17 00:00:00 2001 From: Mikhal Pershin Date: Tue, 7 Nov 2017 19:29:32 +0300 Subject: [PATCH] LU-10181 mdt: high-priority request handling for DOM Implement high-priority request handling and lock prolongation for Data-on-MDT BRW requests to avoid incorrect timeouts and client eviction under heavy MDS load. Test-Parameters: mdssizegb=20 testlist=sanity-dom,dom-performance Signed-off-by: Mikhal Pershin Change-Id: I589efd2774d739f3a0b471d7a6e4d6be7c6a7c2c Reviewed-on: https://review.whamcloud.com/29968 Tested-by: Jenkins Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin --- lustre/mdc/mdc_dev.c | 71 +++++++----- lustre/mdt/mdt_handler.c | 11 +- lustre/mdt/mdt_internal.h | 3 + lustre/mdt/mdt_io.c | 286 +++++++++++++++++++++++++++++++++++++++++++++- lustre/mdt/mdt_mds.c | 1 + 5 files changed, 340 insertions(+), 32 deletions(-) diff --git a/lustre/mdc/mdc_dev.c b/lustre/mdc/mdc_dev.c index 8f304f7..444f919 100644 --- a/lustre/mdc/mdc_dev.c +++ b/lustre/mdc/mdc_dev.c @@ -959,6 +959,33 @@ static int mdc_async_upcall(void *a, int rc) return 0; } +static int mdc_get_lock_handle(const struct lu_env *env, struct osc_object *osc, + pgoff_t index, struct lustre_handle *lh) +{ + struct ldlm_lock *lock; + + /* find DOM lock protecting object */ + lock = mdc_dlmlock_at_pgoff(env, osc, index, + OSC_DAP_FL_TEST_LOCK | + OSC_DAP_FL_CANCELING); + if (lock == NULL) { + struct ldlm_resource *res; + struct ldlm_res_id *resname; + + resname = &osc_env_info(env)->oti_resname; + fid_build_reg_res_name(lu_object_fid(osc2lu(osc)), resname); + res = ldlm_resource_get(osc_export(osc)->exp_obd->obd_namespace, + NULL, resname, LDLM_IBITS, 0); + ldlm_resource_dump(D_ERROR, res); + libcfs_debug_dumpstack(NULL); + return -ENOENT; + } else { + *lh = lock->l_remote_handle; + LDLM_LOCK_PUT(lock); + } + return 0; +} + static int mdc_io_setattr_start(const struct lu_env *env, const struct cl_io_slice *slice) { @@ -1029,6 +1056,11 @@ static int mdc_io_setattr_start(const struct lu_env *env, if (oio->oi_lockless) { oa->o_flags = OBD_FL_SRVLOCK; oa->o_valid |= OBD_MD_FLFLAGS; + } else { + rc = mdc_get_lock_handle(env, cl2osc(obj), CL_PAGE_EOF, + &oa->o_handle); + if (!rc) + oa->o_valid |= OBD_MD_FLHANDLE; } init_completion(&cbargs->opc_sync); @@ -1182,35 +1214,22 @@ static void mdc_req_attr_set(const struct lu_env *env, struct cl_object *obj, attr->cra_oa->o_valid |= OBD_MD_FLID; if (flags & OBD_MD_FLHANDLE) { - struct ldlm_lock *lock; /* _some_ lock protecting @apage */ struct osc_page *opg; opg = osc_cl_page_osc(attr->cra_page, cl2osc(obj)); - lock = mdc_dlmlock_at_pgoff(env, cl2osc(obj), osc_index(opg), - OSC_DAP_FL_TEST_LOCK | OSC_DAP_FL_CANCELING); - if (lock == NULL && !opg->ops_srvlock) { - struct ldlm_resource *res; - struct ldlm_res_id *resname; - - CL_PAGE_DEBUG(D_ERROR, env, attr->cra_page, - "uncovered page!\n"); - - resname = &osc_env_info(env)->oti_resname; - mdc_build_res_name(cl2osc(obj), resname); - res = ldlm_resource_get( - osc_export(cl2osc(obj))->exp_obd->obd_namespace, - NULL, resname, LDLM_IBITS, 0); - ldlm_resource_dump(D_ERROR, res); - - libcfs_debug_dumpstack(NULL); - LBUG(); - } - - /* check for lockless io. 
*/ - if (lock != NULL) { - attr->cra_oa->o_handle = lock->l_remote_handle; - attr->cra_oa->o_valid |= OBD_MD_FLHANDLE; - LDLM_LOCK_PUT(lock); + if (!opg->ops_srvlock) { + int rc; + + rc = mdc_get_lock_handle(env, cl2osc(obj), + osc_index(opg), + &attr->cra_oa->o_handle); + if (rc) { + CL_PAGE_DEBUG(D_ERROR, env, attr->cra_page, + "uncovered page!\n"); + LBUG(); + } else { + attr->cra_oa->o_valid |= OBD_MD_FLHANDLE; + } } } } diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 826dd04..8ff002c 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -4900,10 +4900,13 @@ TGT_MDT_HDL(HABEO_CLAVIS | HABEO_CORPUS | HABEO_REFERO | MUTABOR, }; static struct tgt_handler mdt_io_ops[] = { -TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO, OST_BRW_READ, tgt_brw_read), -TGT_OST_HDL(HABEO_CORPUS | MUTABOR, OST_BRW_WRITE, tgt_brw_write), -TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO | MUTABOR, - OST_PUNCH, mdt_punch_hdl), +TGT_OST_HDL_HP(HABEO_CORPUS | HABEO_REFERO, OST_BRW_READ, tgt_brw_read, + mdt_hp_brw), +TGT_OST_HDL_HP(HABEO_CORPUS | MUTABOR, OST_BRW_WRITE, tgt_brw_write, + mdt_hp_brw), +TGT_OST_HDL_HP(HABEO_CORPUS | HABEO_REFERO | MUTABOR, + OST_PUNCH, mdt_punch_hdl, + mdt_hp_punch), TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO, OST_SYNC, mdt_data_sync), }; diff --git a/lustre/mdt/mdt_internal.h b/lustre/mdt/mdt_internal.h index 414cc81..f12fb79 100644 --- a/lustre/mdt/mdt_internal.h +++ b/lustre/mdt/mdt_internal.h @@ -1251,6 +1251,9 @@ int mdt_dom_object_size(const struct lu_env *env, struct mdt_device *mdt, bool dom_lock); bool mdt_dom_client_has_lock(struct mdt_thread_info *info, const struct lu_fid *fid); +void mdt_hp_brw(struct tgt_session_info *tsi); +void mdt_hp_punch(struct tgt_session_info *tsi); + /* grants */ long mdt_grant_connect(const struct lu_env *env, struct obd_export *exp, u64 want, bool conservative); diff --git a/lustre/mdt/mdt_io.c b/lustre/mdt/mdt_io.c index bdbb388..637f5ee 100644 --- a/lustre/mdt/mdt_io.c +++ b/lustre/mdt/mdt_io.c @@ -61,6 +61,288 @@ static inline void mdt_dom_write_unlock(struct mdt_object *mo) up_write(&mo->mot_dom_sem); } +/** + * Lock prolongation for Data-on-MDT. + * This is similar to OFD code but for DOM ibits lock. + */ +static inline time64_t prolong_timeout(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + time64_t req_timeout; + + if (AT_OFF) + return obd_timeout / 2; + + req_timeout = req->rq_deadline - req->rq_arrival_time.tv_sec; + return max_t(time64_t, at_est2timeout(at_get(&svcpt->scp_at_estimate)), + req_timeout); +} + +static void mdt_prolong_dom_lock(struct tgt_session_info *tsi, + struct ldlm_prolong_args *data) +{ + struct obdo *oa = &tsi->tsi_ost_body->oa; + struct ldlm_lock *lock; + + ENTRY; + + data->lpa_timeout = prolong_timeout(tgt_ses_req(tsi)); + data->lpa_export = tsi->tsi_exp; + data->lpa_resid = tsi->tsi_resid; + + CDEBUG(D_RPCTRACE, "Prolong DOM lock for req %p with x%llu\n", + tgt_ses_req(tsi), tgt_ses_req(tsi)->rq_xid); + + if (oa->o_valid & OBD_MD_FLHANDLE) { + /* mostly a request should be covered by only one lock, try + * fast path. 
*/ + lock = ldlm_handle2lock(&oa->o_handle); + if (lock != NULL) { + LASSERT(lock->l_export == data->lpa_export); + ldlm_lock_prolong_one(lock, data); + lock->l_last_used = ktime_get(); + LDLM_LOCK_PUT(lock); + RETURN_EXIT; + } + } + EXIT; +} + +static int mdt_rw_hpreq_lock_match(struct ptlrpc_request *req, + struct ldlm_lock *lock) +{ + struct obd_ioobj *ioo; + enum ldlm_mode mode; + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + + ENTRY; + + if (!(lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_DOM)) + RETURN(0); + + ioo = req_capsule_client_get(&req->rq_pill, &RMF_OBD_IOOBJ); + LASSERT(ioo != NULL); + + LASSERT(lock->l_resource != NULL); + if (!fid_res_name_eq(&ioo->ioo_oid.oi_fid, &lock->l_resource->lr_name)) + RETURN(0); + + /* a bulk write can only hold a reference on a PW extent lock. */ + mode = LCK_PW; + if (opc == OST_READ) + /* whereas a bulk read can be protected by either a PR or PW + * extent lock */ + mode |= LCK_PR; + + if (!(lock->l_granted_mode & mode)) + RETURN(0); + + RETURN(1); +} + +static int mdt_rw_hpreq_check(struct ptlrpc_request *req) +{ + struct tgt_session_info *tsi; + struct obd_ioobj *ioo; + struct niobuf_remote *rnb; + int opc; + struct ldlm_prolong_args pa = { 0 }; + + ENTRY; + + /* Don't use tgt_ses_info() to get session info, because lock_match() + * can be called while request has no processing thread yet. */ + tsi = lu_context_key_get(&req->rq_session, &tgt_session_key); + + /* + * Use LASSERT below because malformed RPCs should have + * been filtered out in tgt_hpreq_handler(). + */ + opc = lustre_msg_get_opc(req->rq_reqmsg); + LASSERT(opc == OST_READ || opc == OST_WRITE); + + ioo = req_capsule_client_get(&req->rq_pill, &RMF_OBD_IOOBJ); + LASSERT(ioo != NULL); + + rnb = req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE); + LASSERT(rnb != NULL); + LASSERT(!(rnb->rnb_flags & OBD_BRW_SRVLOCK)); + + pa.lpa_mode = LCK_PW; + if (opc == OST_READ) + pa.lpa_mode |= LCK_PR; + + DEBUG_REQ(D_RPCTRACE, req, "%s %s: refresh rw locks: "DFID"\n", + tgt_name(tsi->tsi_tgt), current->comm, PFID(&tsi->tsi_fid)); + + mdt_prolong_dom_lock(tsi, &pa); + + if (pa.lpa_blocks_cnt > 0) { + CDEBUG(D_DLMTRACE, + "%s: refreshed %u locks timeout for req %p.\n", + tgt_name(tsi->tsi_tgt), pa.lpa_blocks_cnt, req); + RETURN(1); + } + + RETURN(pa.lpa_locks_cnt > 0 ? 0 : -ESTALE); +} + +static void mdt_rw_hpreq_fini(struct ptlrpc_request *req) +{ + mdt_rw_hpreq_check(req); +} + +static struct ptlrpc_hpreq_ops mdt_hpreq_rw = { + .hpreq_lock_match = mdt_rw_hpreq_lock_match, + .hpreq_check = mdt_rw_hpreq_check, + .hpreq_fini = mdt_rw_hpreq_fini +}; + +/** + * Assign high priority operations to an IO request. + * + * Check if the incoming request is a candidate for + * high-priority processing. If it is, assign it a high + * priority operations table. 
+ * + * \param[in] tsi target session environment for this request + */ +void mdt_hp_brw(struct tgt_session_info *tsi) +{ + struct niobuf_remote *rnb; + struct obd_ioobj *ioo; + + ENTRY; + + ioo = req_capsule_client_get(tsi->tsi_pill, &RMF_OBD_IOOBJ); + LASSERT(ioo != NULL); /* must exist after request preprocessing */ + if (ioo->ioo_bufcnt > 0) { + rnb = req_capsule_client_get(tsi->tsi_pill, &RMF_NIOBUF_REMOTE); + LASSERT(rnb != NULL); /* must exist after preprocessing */ + + /* no high priority if server lock is needed */ + if (rnb->rnb_flags & OBD_BRW_SRVLOCK || + (lustre_msg_get_flags(tgt_ses_req(tsi)->rq_reqmsg) & + MSG_REPLAY)) + return; + } + tgt_ses_req(tsi)->rq_ops = &mdt_hpreq_rw; +} + +static int mdt_punch_hpreq_lock_match(struct ptlrpc_request *req, + struct ldlm_lock *lock) +{ + struct tgt_session_info *tsi; + struct obdo *oa; + + ENTRY; + + /* Don't use tgt_ses_info() to get session info, because lock_match() + * can be called while request has no processing thread yet. */ + tsi = lu_context_key_get(&req->rq_session, &tgt_session_key); + + /* + * Use LASSERT below because malformed RPCs should have + * been filtered out in tgt_hpreq_handler(). + */ + LASSERT(tsi->tsi_ost_body != NULL); + if (tsi->tsi_ost_body->oa.o_valid & OBD_MD_FLHANDLE && + tsi->tsi_ost_body->oa.o_handle.cookie == lock->l_handle.h_cookie) + RETURN(1); + + oa = &tsi->tsi_ost_body->oa; + + LASSERT(lock->l_resource != NULL); + if (!fid_res_name_eq(&oa->o_oi.oi_fid, &lock->l_resource->lr_name)) + RETURN(0); + + if (!(lock->l_granted_mode & LCK_PW)) + RETURN(0); + + RETURN(1); +} + +/** + * Implementation of ptlrpc_hpreq_ops::hpreq_lock_check for OST_PUNCH request. + * + * High-priority queue request check for whether the given punch request + * (\a req) is blocking an LDLM lock cancel. Also checks whether the request is + * covered by an LDLM lock. + * + + * + * \param[in] req the incoming request + * + * \retval 1 if \a req is blocking an LDLM lock cancel + * \retval 0 if it is not + * \retval -ESTALE if lock is not found + */ +static int mdt_punch_hpreq_check(struct ptlrpc_request *req) +{ + struct tgt_session_info *tsi; + struct obdo *oa; + struct ldlm_prolong_args pa = { 0 }; + + ENTRY; + + /* Don't use tgt_ses_info() to get session info, because lock_match() + * can be called while request has no processing thread yet. */ + tsi = lu_context_key_get(&req->rq_session, &tgt_session_key); + LASSERT(tsi != NULL); + oa = &tsi->tsi_ost_body->oa; + + LASSERT(!(oa->o_valid & OBD_MD_FLFLAGS && + oa->o_flags & OBD_FL_SRVLOCK)); + + pa.lpa_mode = LCK_PW; + + CDEBUG(D_DLMTRACE, "%s: refresh DOM lock for "DFID"\n", + tgt_name(tsi->tsi_tgt), PFID(&tsi->tsi_fid)); + + mdt_prolong_dom_lock(tsi, &pa); + + + if (pa.lpa_blocks_cnt > 0) { + CDEBUG(D_DLMTRACE, + "%s: refreshed %u locks timeout for req %p.\n", + tgt_name(tsi->tsi_tgt), pa.lpa_blocks_cnt, req); + RETURN(1); + } + + RETURN(pa.lpa_locks_cnt > 0 ? 0 : -ESTALE); +} + +/** + * Implementation of ptlrpc_hpreq_ops::hpreq_lock_fini for OST_PUNCH request. + * + * Called after the request has been handled. It refreshes lock timeout again + * so that client has more time to send lock cancel RPC. + * + * \param[in] req request which is being processed. 
+ */ +static void mdt_punch_hpreq_fini(struct ptlrpc_request *req) +{ + mdt_punch_hpreq_check(req); +} + +static struct ptlrpc_hpreq_ops mdt_hpreq_punch = { + .hpreq_lock_match = mdt_punch_hpreq_lock_match, + .hpreq_check = mdt_punch_hpreq_check, + .hpreq_fini = mdt_punch_hpreq_fini +}; + +void mdt_hp_punch(struct tgt_session_info *tsi) +{ + LASSERT(tsi->tsi_ost_body != NULL); /* must exists if we are here */ + /* no high-priority if server lock is needed */ + if ((tsi->tsi_ost_body->oa.o_valid & OBD_MD_FLFLAGS && + tsi->tsi_ost_body->oa.o_flags & OBD_FL_SRVLOCK) || + tgt_conn_flags(tsi) & OBD_CONNECT_MDS || + lustre_msg_get_flags(tgt_ses_req(tsi)->rq_reqmsg) & MSG_REPLAY) + return; + tgt_ses_req(tsi)->rq_ops = &mdt_hpreq_punch; +} + static int mdt_preprw_read(const struct lu_env *env, struct obd_export *exp, struct mdt_device *mdt, struct mdt_object *mo, struct lu_attr *la, int niocount, @@ -582,7 +864,7 @@ out_put: lu_object_put(tsi->tsi_env, &mo->mot_obj); out_unlock: if (srvlock) - mdt_save_lock(info, &lh, LCK_PW, rc); + tgt_extent_unlock(&lh, LCK_PW); out: mdt_thread_info_fini(info); return rc; @@ -610,7 +892,7 @@ int mdt_do_glimpse(const struct lu_env *env, struct ldlm_namespace *ns, /* There can be only one write lock covering data, try to match it. */ policy.l_inodebits.bits = MDS_INODELOCK_DOM; - mode = ldlm_lock_match(ns, LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK, + mode = ldlm_lock_match(ns, LDLM_FL_TEST_LOCK, &res->lr_name, LDLM_IBITS, &policy, LCK_PW, &lockh, 0); diff --git a/lustre/mdt/mdt_mds.c b/lustre/mdt/mdt_mds.c index 79afe4f..e69ea49 100644 --- a/lustre/mdt/mdt_mds.c +++ b/lustre/mdt/mdt_mds.c @@ -474,6 +474,7 @@ static int mds_start_ptlrpc_service(struct mds_device *m) .so_thr_done = tgt_io_thread_done, .so_req_handler = tgt_request_handle, .so_req_printer = target_print_req, + .so_hpreq_handler = tgt_hpreq_handler, }, }; m->mds_io_service = ptlrpc_register_service(&conf, &obd->obd_kset, -- 1.8.3.1