X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fosc%2Fosc_request.c;h=2aa6b30c0909bfe440c98bfe540068e877c3f01b;hp=0c4084da0704f7046a7611f463108a719e49c642;hb=27815a0611a2e315a9a7696a20c2f257d48aeb7e;hpb=e2af7fb3c91dfb13d34d8e1b2f2df8c09621f768 diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 0c4084d..2aa6b30 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -27,7 +27,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2012, Whamcloud, Inc. + * Copyright (c) 2011, 2013, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -58,108 +58,106 @@ #include #include #include +#include #include "osc_internal.h" +#include "osc_cl_internal.h" static void osc_release_ppga(struct brw_page **ppga, obd_count count); static int brw_interpret(const struct lu_env *env, struct ptlrpc_request *req, void *data, int rc); -static void osc_check_rpcs0(const struct lu_env *env, struct client_obd *cli, - int ptlrpc); int osc_cleanup(struct obd_device *obd); /* Pack OSC object metadata for disk storage (LE byte order). 
*/ static int osc_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, - struct lov_stripe_md *lsm) + struct lov_stripe_md *lsm) { - int lmm_size; - ENTRY; + int lmm_size; + ENTRY; - lmm_size = sizeof(**lmmp); - if (!lmmp) - RETURN(lmm_size); + lmm_size = sizeof(**lmmp); + if (lmmp == NULL) + RETURN(lmm_size); - if (*lmmp && !lsm) { - OBD_FREE(*lmmp, lmm_size); - *lmmp = NULL; - RETURN(0); - } + if (*lmmp != NULL && lsm == NULL) { + OBD_FREE(*lmmp, lmm_size); + *lmmp = NULL; + RETURN(0); + } else if (unlikely(lsm != NULL && ostid_id(&lsm->lsm_oi) == 0)) { + RETURN(-EBADF); + } - if (!*lmmp) { - OBD_ALLOC(*lmmp, lmm_size); - if (!*lmmp) - RETURN(-ENOMEM); - } + if (*lmmp == NULL) { + OBD_ALLOC(*lmmp, lmm_size); + if (*lmmp == NULL) + RETURN(-ENOMEM); + } - if (lsm) { - LASSERT(lsm->lsm_object_id); - LASSERT_SEQ_IS_MDT(lsm->lsm_object_seq); - (*lmmp)->lmm_object_id = cpu_to_le64(lsm->lsm_object_id); - (*lmmp)->lmm_object_seq = cpu_to_le64(lsm->lsm_object_seq); - } + if (lsm) + ostid_cpu_to_le(&lsm->lsm_oi, &(*lmmp)->lmm_oi); - RETURN(lmm_size); + RETURN(lmm_size); } /* Unpack OSC object metadata from disk storage (LE byte order). */ static int osc_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, - struct lov_mds_md *lmm, int lmm_bytes) -{ - int lsm_size; - struct obd_import *imp = class_exp2cliimp(exp); - ENTRY; - - if (lmm != NULL) { - if (lmm_bytes < sizeof (*lmm)) { - CERROR("lov_mds_md too small: %d, need %d\n", - lmm_bytes, (int)sizeof(*lmm)); - RETURN(-EINVAL); - } - /* XXX LOV_MAGIC etc check? 
*/ - - if (lmm->lmm_object_id == 0) { - CERROR("lov_mds_md: zero lmm_object_id\n"); - RETURN(-EINVAL); - } - } - - lsm_size = lov_stripe_md_size(1); - if (lsmp == NULL) - RETURN(lsm_size); - - if (*lsmp != NULL && lmm == NULL) { - OBD_FREE((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo)); - OBD_FREE(*lsmp, lsm_size); - *lsmp = NULL; - RETURN(0); - } - - if (*lsmp == NULL) { - OBD_ALLOC(*lsmp, lsm_size); - if (*lsmp == NULL) - RETURN(-ENOMEM); - OBD_ALLOC((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo)); - if ((*lsmp)->lsm_oinfo[0] == NULL) { - OBD_FREE(*lsmp, lsm_size); - RETURN(-ENOMEM); - } - loi_init((*lsmp)->lsm_oinfo[0]); - } - - if (lmm != NULL) { - /* XXX zero *lsmp? */ - (*lsmp)->lsm_object_id = le64_to_cpu (lmm->lmm_object_id); - (*lsmp)->lsm_object_seq = le64_to_cpu (lmm->lmm_object_seq); - LASSERT((*lsmp)->lsm_object_id); - LASSERT_SEQ_IS_MDT((*lsmp)->lsm_object_seq); - } - - if (imp != NULL && - (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES)) - (*lsmp)->lsm_maxbytes = imp->imp_connect_data.ocd_maxbytes; - else - (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES; - - RETURN(lsm_size); + struct lov_mds_md *lmm, int lmm_bytes) +{ + int lsm_size; + struct obd_import *imp = class_exp2cliimp(exp); + ENTRY; + + if (lmm != NULL) { + if (lmm_bytes < sizeof(*lmm)) { + CERROR("%s: lov_mds_md too small: %d, need %d\n", + exp->exp_obd->obd_name, lmm_bytes, + (int)sizeof(*lmm)); + RETURN(-EINVAL); + } + /* XXX LOV_MAGIC etc check? 
*/ + + if (unlikely(ostid_id(&lmm->lmm_oi) == 0)) { + CERROR("%s: zero lmm_object_id: rc = %d\n", + exp->exp_obd->obd_name, -EINVAL); + RETURN(-EINVAL); + } + } + + lsm_size = lov_stripe_md_size(1); + if (lsmp == NULL) + RETURN(lsm_size); + + if (*lsmp != NULL && lmm == NULL) { + OBD_FREE((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo)); + OBD_FREE(*lsmp, lsm_size); + *lsmp = NULL; + RETURN(0); + } + + if (*lsmp == NULL) { + OBD_ALLOC(*lsmp, lsm_size); + if (unlikely(*lsmp == NULL)) + RETURN(-ENOMEM); + OBD_ALLOC((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo)); + if (unlikely((*lsmp)->lsm_oinfo[0] == NULL)) { + OBD_FREE(*lsmp, lsm_size); + RETURN(-ENOMEM); + } + loi_init((*lsmp)->lsm_oinfo[0]); + } else if (unlikely(ostid_id(&(*lsmp)->lsm_oi) == 0)) { + RETURN(-EBADF); + } + + if (lmm != NULL) + /* XXX zero *lsmp? */ + ostid_le_to_cpu(&lmm->lmm_oi, &(*lsmp)->lsm_oi); + + if (imp != NULL && + (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES)) + (*lsmp)->lsm_maxbytes = imp->imp_connect_data.ocd_maxbytes; + else + (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES; + + RETURN(lsm_size); } static inline void osc_pack_capa(struct ptlrpc_request *req, @@ -181,13 +179,14 @@ static inline void osc_pack_capa(struct ptlrpc_request *req, static inline void osc_pack_req_body(struct ptlrpc_request *req, struct obd_info *oinfo) { - struct ost_body *body; + struct ost_body *body; - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); - lustre_set_wire_obdo(&body->oa, oinfo->oi_oa); - osc_pack_capa(req, body, oinfo->oi_capa); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, + oinfo->oi_oa); + osc_pack_capa(req, body, oinfo->oi_capa); } static inline void osc_set_capa_size(struct ptlrpc_request *req, @@ -213,12 +212,13 @@ static int osc_getattr_interpret(const struct lu_env *env, body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); if 
(body) { - CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); - lustre_get_wire_obdo(aa->aa_oi->oi_oa, &body->oa); + CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, + aa->aa_oi->oi_oa, &body->oa); - /* This should really be sent by the OST */ - aa->aa_oi->oi_oa->o_blksize = PTLRPC_MAX_BRW_SIZE; - aa->aa_oi->oi_oa->o_valid |= OBD_MD_FLBLKSZ; + /* This should really be sent by the OST */ + aa->aa_oi->oi_oa->o_blksize = DT_MAX_BRW_SIZE; + aa->aa_oi->oi_oa->o_valid |= OBD_MD_FLBLKSZ; } else { CDEBUG(D_INFO, "can't unpack ost_body\n"); rc = -EPROTO; @@ -292,12 +292,12 @@ static int osc_getattr(const struct lu_env *env, struct obd_export *exp, if (body == NULL) GOTO(out, rc = -EPROTO); - CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); - lustre_get_wire_obdo(oinfo->oi_oa, &body->oa); + CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oinfo->oi_oa, + &body->oa); - /* This should really be sent by the OST */ - oinfo->oi_oa->o_blksize = PTLRPC_MAX_BRW_SIZE; - oinfo->oi_oa->o_valid |= OBD_MD_FLBLKSZ; + oinfo->oi_oa->o_blksize = cli_brw_size(exp->exp_obd); + oinfo->oi_oa->o_valid |= OBD_MD_FLBLKSZ; EXIT; out: @@ -338,7 +338,8 @@ static int osc_setattr(const struct lu_env *env, struct obd_export *exp, if (body == NULL) GOTO(out, rc = -EPROTO); - lustre_get_wire_obdo(oinfo->oi_oa, &body->oa); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oinfo->oi_oa, + &body->oa); EXIT; out: @@ -360,7 +361,8 @@ static int osc_setattr_interpret(const struct lu_env *env, if (body == NULL) GOTO(out, rc = -EPROTO); - lustre_get_wire_obdo(sa->sa_oa, &body->oa); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, sa->sa_oa, + &body->oa); out: rc = sa->sa_upcall(sa->sa_cookie, rc); RETURN(rc); @@ -456,7 +458,8 @@ int osc_real_create(struct obd_export *exp, struct obdo *oa, body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); LASSERT(body); - 
lustre_set_wire_obdo(&body->oa, oa); + + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); ptlrpc_request_set_replen(req); @@ -476,19 +479,18 @@ int osc_real_create(struct obd_export *exp, struct obdo *oa, if (body == NULL) GOTO(out_req, rc = -EPROTO); - lustre_get_wire_obdo(oa, &body->oa); + CDEBUG(D_INFO, "oa flags %x\n", oa->o_flags); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa); - /* This should really be sent by the OST */ - oa->o_blksize = PTLRPC_MAX_BRW_SIZE; - oa->o_valid |= OBD_MD_FLBLKSZ; + oa->o_blksize = cli_brw_size(exp->exp_obd); + oa->o_valid |= OBD_MD_FLBLKSZ; - /* XXX LOV STACKING: the lsm that is passed to us from LOV does not - * have valid lsm_oinfo data structs, so don't go touching that. - * This needs to be fixed in a big way. - */ - lsm->lsm_object_id = oa->o_id; - lsm->lsm_object_seq = oa->o_seq; - *ea = lsm; + /* XXX LOV STACKING: the lsm that is passed to us from LOV does not + * have valid lsm_oinfo data structs, so don't go touching that. + * This needs to be fixed in a big way. 
+ */ + lsm->lsm_oi = oa->o_oi; + *ea = lsm; if (oti != NULL) { oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg); @@ -533,10 +535,11 @@ int osc_punch_base(struct obd_export *exp, struct obd_info *oinfo, req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ ptlrpc_at_set_req_timeout(req); - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - lustre_set_wire_obdo(&body->oa, oinfo->oi_oa); - osc_pack_capa(req, body, oinfo->oi_capa); + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, + oinfo->oi_oa); + osc_pack_capa(req, body, oinfo->oi_capa); ptlrpc_request_set_replen(req); @@ -569,7 +572,7 @@ static int osc_sync_interpret(const struct lu_env *env, struct ptlrpc_request *req, void *arg, int rc) { - struct osc_async_args *aa = arg; + struct osc_fsync_args *fa = arg; struct ost_body *body; ENTRY; @@ -582,27 +585,22 @@ static int osc_sync_interpret(const struct lu_env *env, GOTO(out, rc = -EPROTO); } - *aa->aa_oi->oi_oa = body->oa; + *fa->fa_oi->oi_oa = body->oa; out: - rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc); - RETURN(rc); + rc = fa->fa_upcall(fa->fa_cookie, rc); + RETURN(rc); } -static int osc_sync(const struct lu_env *env, struct obd_export *exp, - struct obd_info *oinfo, obd_size start, obd_size end, - struct ptlrpc_request_set *set) +int osc_sync_base(struct obd_export *exp, struct obd_info *oinfo, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) { - struct ptlrpc_request *req; - struct ost_body *body; - struct osc_async_args *aa; + struct ptlrpc_request *req; + struct ost_body *body; + struct osc_fsync_args *fa; int rc; ENTRY; - if (!oinfo->oi_oa) { - CDEBUG(D_INFO, "oa NULL\n"); - RETURN(-EINVAL); - } - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC); if (req == NULL) RETURN(-ENOMEM); @@ -614,32 +612,54 @@ static int osc_sync(const struct lu_env *env, struct obd_export *exp, 
RETURN(rc); } - /* overload the size and blocks fields in the oa with start/end */ - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - lustre_set_wire_obdo(&body->oa, oinfo->oi_oa); - body->oa.o_size = start; - body->oa.o_blocks = end; - body->oa.o_valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); - osc_pack_capa(req, body, oinfo->oi_capa); + /* overload the size and blocks fields in the oa with start/end */ + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, + oinfo->oi_oa); + osc_pack_capa(req, body, oinfo->oi_capa); ptlrpc_request_set_replen(req); req->rq_interpret_reply = osc_sync_interpret; - CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - aa->aa_oi = oinfo; + CLASSERT(sizeof(*fa) <= sizeof(req->rq_async_args)); + fa = ptlrpc_req_async_args(req); + fa->fa_oi = oinfo; + fa->fa_upcall = upcall; + fa->fa_cookie = cookie; - ptlrpc_set_add_req(set, req); - RETURN (0); + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + else + ptlrpc_set_add_req(rqset, req); + + RETURN (0); +} + +static int osc_sync(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, obd_size start, obd_size end, + struct ptlrpc_request_set *set) +{ + ENTRY; + + if (!oinfo->oi_oa) { + CDEBUG(D_INFO, "oa NULL\n"); + RETURN(-EINVAL); + } + + oinfo->oi_oa->o_size = start; + oinfo->oi_oa->o_blocks = end; + oinfo->oi_oa->o_valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); + + RETURN(osc_sync_base(exp, oinfo, oinfo->oi_cb_up, oinfo, set)); } /* Find and cancel locally locks matched by @mode in the resource found by * @objid. Found locks are added into @cancel list. Returns the amount of * locks added to @cancels list. 
*/ static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa, - cfs_list_t *cancels, - ldlm_mode_t mode, int lock_flags) + cfs_list_t *cancels, + ldlm_mode_t mode, __u64 lock_flags) { struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; struct ldlm_res_id res_id; @@ -647,10 +667,19 @@ static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa, int count; ENTRY; - osc_build_res_name(oa->o_id, oa->o_seq, &res_id); - res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); - if (res == NULL) - RETURN(0); + /* Return, i.e. cancel nothing, only if ELC is supported (flag in + * export) but disabled through procfs (flag in NS). + * + * This distinguishes from a case when ELC is not supported originally, + * when we still want to cancel locks in advance and just cancel them + * locally, without sending any RPC. */ + if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns)) + RETURN(0); + + ostid_build_res_name(&oa->o_oi, &res_id); + res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); + if (res == NULL) + RETURN(0); LDLM_RESOURCE_ADDREF(res); count = ldlm_cancel_resource_local(res, cancels, NULL, mode, @@ -661,32 +690,57 @@ static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa, } static int osc_destroy_interpret(const struct lu_env *env, - struct ptlrpc_request *req, void *data, - int rc) + struct ptlrpc_request *req, void *data, + int rc) { - struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - cfs_atomic_dec(&cli->cl_destroy_in_flight); - cfs_waitq_signal(&cli->cl_destroy_waitq); - return 0; + cfs_atomic_dec(&cli->cl_destroy_in_flight); + wake_up(&cli->cl_destroy_waitq); + return 0; } static int osc_can_send_destroy(struct client_obd *cli) { - if (cfs_atomic_inc_return(&cli->cl_destroy_in_flight) <= - cli->cl_max_rpcs_in_flight) { - /* The destroy request can be sent */ - return 1; - } - if (cfs_atomic_dec_return(&cli->cl_destroy_in_flight) < - 
cli->cl_max_rpcs_in_flight) { - /* - * The counter has been modified between the two atomic - * operations. - */ - cfs_waitq_signal(&cli->cl_destroy_waitq); - } - return 0; + if (cfs_atomic_inc_return(&cli->cl_destroy_in_flight) <= + cli->cl_max_rpcs_in_flight) { + /* The destroy request can be sent */ + return 1; + } + if (cfs_atomic_dec_return(&cli->cl_destroy_in_flight) < + cli->cl_max_rpcs_in_flight) { + /* + * The counter has been modified between the two atomic + * operations. + */ + wake_up(&cli->cl_destroy_waitq); + } + return 0; +} + +int osc_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md **ea, + struct obd_trans_info *oti) +{ + int rc = 0; + ENTRY; + + LASSERT(oa); + LASSERT(ea); + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + + if ((oa->o_valid & OBD_MD_FLFLAGS) && + oa->o_flags == OBD_FL_RECREATE_OBJS) { + RETURN(osc_real_create(exp, oa, ea, oti)); + } + + if (!fid_seq_is_mdt(ostid_seq(&oa->o_oi))) + RETURN(osc_real_create(exp, oa, ea, oti)); + + /* we should not get here anymore */ + LBUG(); + + RETURN(rc); } /* Destroy requests can be async always on the client, and we don't even really @@ -736,17 +790,20 @@ static int osc_destroy(const struct lu_env *env, struct obd_export *exp, req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ ptlrpc_at_set_req_timeout(req); - if (oti != NULL && oa->o_valid & OBD_MD_FLCOOKIE) - oa->o_lcookie = *oti->oti_logcookies; - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - lustre_set_wire_obdo(&body->oa, oa); + if (oti != NULL && oa->o_valid & OBD_MD_FLCOOKIE) + oa->o_lcookie = *oti->oti_logcookies; + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); osc_pack_capa(req, body, (struct obd_capa *)capa); ptlrpc_request_set_replen(req); - /* don't throttle destroy RPCs for the MDT */ - if (!(cli->cl_import->imp_connect_flags_orig & OBD_CONNECT_MDS)) 
{ + /* If osc_destory is for destroying the unlink orphan, + * sent from MDT to OST, which should not be blocked here, + * because the process might be triggered by ptlrpcd, and + * it is not good to block ptlrpcd thread (b=16006)*/ + if (!(oa->o_flags & OBD_FL_DELORPHAN)) { req->rq_interpret_reply = osc_destroy_interpret; if (!osc_can_send_destroy(cli)) { struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, @@ -776,31 +833,36 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, oa->o_valid |= bits; client_obd_list_lock(&cli->cl_loi_list_lock); oa->o_dirty = cli->cl_dirty; - if (cli->cl_dirty - cli->cl_dirty_transit > cli->cl_dirty_max) { - CERROR("dirty %lu - %lu > dirty_max %lu\n", - cli->cl_dirty, cli->cl_dirty_transit, cli->cl_dirty_max); - oa->o_undirty = 0; - } else if (cfs_atomic_read(&obd_dirty_pages) - - cfs_atomic_read(&obd_dirty_transit_pages) > - obd_max_dirty_pages + 1){ - /* The cfs_atomic_read() allowing the cfs_atomic_inc() are - * not covered by a lock thus they may safely race and trip - * this CERROR() unless we add in a small fudge factor (+1). 
*/ - CERROR("dirty %d - %d > system dirty_max %d\n", - cfs_atomic_read(&obd_dirty_pages), - cfs_atomic_read(&obd_dirty_transit_pages), - obd_max_dirty_pages); - oa->o_undirty = 0; - } else if (cli->cl_dirty_max - cli->cl_dirty > 0x7fffffff) { - CERROR("dirty %lu - dirty_max %lu too big???\n", - cli->cl_dirty, cli->cl_dirty_max); - oa->o_undirty = 0; - } else { - long max_in_flight = (cli->cl_max_pages_per_rpc << CFS_PAGE_SHIFT)* - (cli->cl_max_rpcs_in_flight + 1); + if (unlikely(cli->cl_dirty - cli->cl_dirty_transit > + cli->cl_dirty_max)) { + CERROR("dirty %lu - %lu > dirty_max %lu\n", + cli->cl_dirty, cli->cl_dirty_transit, cli->cl_dirty_max); + oa->o_undirty = 0; + } else if (unlikely(cfs_atomic_read(&obd_unstable_pages) + + cfs_atomic_read(&obd_dirty_pages) - + cfs_atomic_read(&obd_dirty_transit_pages) > + (long)(obd_max_dirty_pages + 1))) { + /* The cfs_atomic_read() allowing the cfs_atomic_inc() are + * not covered by a lock thus they may safely race and trip + * this CERROR() unless we add in a small fudge factor (+1). 
*/ + CERROR("%s: dirty %d + %d - %d > system dirty_max %d\n", + cli->cl_import->imp_obd->obd_name, + cfs_atomic_read(&obd_unstable_pages), + cfs_atomic_read(&obd_dirty_pages), + cfs_atomic_read(&obd_dirty_transit_pages), + obd_max_dirty_pages); + oa->o_undirty = 0; + } else if (unlikely(cli->cl_dirty_max - cli->cl_dirty > 0x7fffffff)) { + CERROR("dirty %lu - dirty_max %lu too big???\n", + cli->cl_dirty, cli->cl_dirty_max); + oa->o_undirty = 0; + } else { + long max_in_flight = (cli->cl_max_pages_per_rpc << + PAGE_CACHE_SHIFT) * + (cli->cl_max_rpcs_in_flight + 1); oa->o_undirty = max(cli->cl_dirty_max, max_in_flight); } - oa->o_grant = cli->cl_avail_grant; + oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant; oa->o_dropped = cli->cl_lost_grant; cli->cl_lost_grant = 0; client_obd_list_unlock(&cli->cl_loi_list_lock); @@ -809,7 +871,7 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, } -static void osc_update_next_shrink(struct client_obd *cli) +void osc_update_next_shrink(struct client_obd *cli) { cli->cl_next_shrink_grant = cfs_time_shift(cli->cl_grant_shrink_interval); @@ -817,127 +879,6 @@ static void osc_update_next_shrink(struct client_obd *cli) cli->cl_next_shrink_grant); } -/* caller must hold loi_list_lock */ -static void osc_consume_write_grant(struct client_obd *cli, - struct brw_page *pga) -{ - LASSERT_SPIN_LOCKED(&cli->cl_loi_list_lock.lock); - LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT)); - cfs_atomic_inc(&obd_dirty_pages); - cli->cl_dirty += CFS_PAGE_SIZE; - cli->cl_avail_grant -= CFS_PAGE_SIZE; - pga->flag |= OBD_BRW_FROM_GRANT; - CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n", - CFS_PAGE_SIZE, pga, pga->pg); - LASSERT(cli->cl_avail_grant >= 0); - osc_update_next_shrink(cli); -} - -/* the companion to osc_consume_write_grant, called when a brw has completed. - * must be called with the loi lock held. 
*/ -static void osc_release_write_grant(struct client_obd *cli, - struct brw_page *pga, int sent) -{ - int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ? : 4096; - ENTRY; - - LASSERT_SPIN_LOCKED(&cli->cl_loi_list_lock.lock); - if (!(pga->flag & OBD_BRW_FROM_GRANT)) { - EXIT; - return; - } - - pga->flag &= ~OBD_BRW_FROM_GRANT; - cfs_atomic_dec(&obd_dirty_pages); - cli->cl_dirty -= CFS_PAGE_SIZE; - if (pga->flag & OBD_BRW_NOCACHE) { - pga->flag &= ~OBD_BRW_NOCACHE; - cfs_atomic_dec(&obd_dirty_transit_pages); - cli->cl_dirty_transit -= CFS_PAGE_SIZE; - } - if (!sent) { - /* Reclaim grant from truncated pages. This is used to solve - * write-truncate and grant all gone(to lost_grant) problem. - * For a vfs write this problem can be easily solved by a sync - * write, however, this is not an option for page_mkwrite() - * because grant has to be allocated before a page becomes - * dirty. */ - if (cli->cl_avail_grant < PTLRPC_MAX_BRW_SIZE) - cli->cl_avail_grant += CFS_PAGE_SIZE; - else - cli->cl_lost_grant += CFS_PAGE_SIZE; - CDEBUG(D_CACHE, "lost grant: %lu avail grant: %lu dirty: %lu\n", - cli->cl_lost_grant, cli->cl_avail_grant, cli->cl_dirty); - } else if (CFS_PAGE_SIZE != blocksize && pga->count != CFS_PAGE_SIZE) { - /* For short writes we shouldn't count parts of pages that - * span a whole block on the OST side, or our accounting goes - * wrong. Should match the code in filter_grant_check. 
*/ - int offset = pga->off & ~CFS_PAGE_MASK; - int count = pga->count + (offset & (blocksize - 1)); - int end = (offset + pga->count) & (blocksize - 1); - if (end) - count += blocksize - end; - - cli->cl_lost_grant += CFS_PAGE_SIZE - count; - CDEBUG(D_CACHE, "lost %lu grant: %lu avail: %lu dirty: %lu\n", - CFS_PAGE_SIZE - count, cli->cl_lost_grant, - cli->cl_avail_grant, cli->cl_dirty); - } - - EXIT; -} - -static unsigned long rpcs_in_flight(struct client_obd *cli) -{ - return cli->cl_r_in_flight + cli->cl_w_in_flight; -} - -/* caller must hold loi_list_lock */ -void osc_wake_cache_waiters(struct client_obd *cli) -{ - cfs_list_t *l, *tmp; - struct osc_cache_waiter *ocw; - - ENTRY; - cfs_list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { - /* if we can't dirty more, we must wait until some is written */ - if ((cli->cl_dirty + CFS_PAGE_SIZE > cli->cl_dirty_max) || - (cfs_atomic_read(&obd_dirty_pages) + 1 > - obd_max_dirty_pages)) { - CDEBUG(D_CACHE, "no dirty room: dirty: %ld " - "osc max %ld, sys max %d\n", cli->cl_dirty, - cli->cl_dirty_max, obd_max_dirty_pages); - return; - } - - /* if still dirty cache but no grant wait for pending RPCs that - * may yet return us some grant before doing sync writes */ - if (cli->cl_w_in_flight && cli->cl_avail_grant < CFS_PAGE_SIZE) { - CDEBUG(D_CACHE, "%u BRW writes in flight, no grant\n", - cli->cl_w_in_flight); - return; - } - - ocw = cfs_list_entry(l, struct osc_cache_waiter, ocw_entry); - cfs_list_del_init(&ocw->ocw_entry); - if (cli->cl_avail_grant < CFS_PAGE_SIZE) { - /* no more RPCs in flight to return grant, do sync IO */ - ocw->ocw_rc = -EDQUOT; - CDEBUG(D_INODE, "wake oap %p for sync\n", ocw->ocw_oap); - } else { - osc_consume_write_grant(cli, - &ocw->ocw_oap->oap_brw_page); - } - - CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld\n", - ocw, ocw->ocw_oap, cli->cl_avail_grant); - - cfs_waitq_signal(&ocw->ocw_waitq); - } - - EXIT; -} - static void __osc_update_grant(struct client_obd *cli, obd_size grant) { 
client_obd_list_lock(&cli->cl_loi_list_lock); @@ -998,45 +939,45 @@ static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa) * needed, and avoids shrinking the grant piecemeal. */ static int osc_shrink_grant(struct client_obd *cli) { - long target = (cli->cl_max_rpcs_in_flight + 1) * - cli->cl_max_pages_per_rpc; + __u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) * + (cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT); - client_obd_list_lock(&cli->cl_loi_list_lock); - if (cli->cl_avail_grant <= target) - target = cli->cl_max_pages_per_rpc; - client_obd_list_unlock(&cli->cl_loi_list_lock); + client_obd_list_lock(&cli->cl_loi_list_lock); + if (cli->cl_avail_grant <= target_bytes) + target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT; + client_obd_list_unlock(&cli->cl_loi_list_lock); - return osc_shrink_grant_to_target(cli, target); + return osc_shrink_grant_to_target(cli, target_bytes); } -int osc_shrink_grant_to_target(struct client_obd *cli, long target) +int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes) { - int rc = 0; - struct ost_body *body; - ENTRY; + int rc = 0; + struct ost_body *body; + ENTRY; - client_obd_list_lock(&cli->cl_loi_list_lock); - /* Don't shrink if we are already above or below the desired limit - * We don't want to shrink below a single RPC, as that will negatively - * impact block allocation and long-term performance. */ - if (target < cli->cl_max_pages_per_rpc) - target = cli->cl_max_pages_per_rpc; + client_obd_list_lock(&cli->cl_loi_list_lock); + /* Don't shrink if we are already above or below the desired limit + * We don't want to shrink below a single RPC, as that will negatively + * impact block allocation and long-term performance. 
*/ + if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT) + target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT; - if (target >= cli->cl_avail_grant) { - client_obd_list_unlock(&cli->cl_loi_list_lock); - RETURN(0); - } - client_obd_list_unlock(&cli->cl_loi_list_lock); + if (target_bytes >= cli->cl_avail_grant) { + client_obd_list_unlock(&cli->cl_loi_list_lock); + RETURN(0); + } + client_obd_list_unlock(&cli->cl_loi_list_lock); - OBD_ALLOC_PTR(body); - if (!body) - RETURN(-ENOMEM); + OBD_ALLOC_PTR(body); + if (!body) + RETURN(-ENOMEM); - osc_announce_cached(cli, &body->oa, 0); + osc_announce_cached(cli, &body->oa, 0); - client_obd_list_lock(&cli->cl_loi_list_lock); - body->oa.o_grant = cli->cl_avail_grant - target; - cli->cl_avail_grant = target; + client_obd_list_lock(&cli->cl_loi_list_lock); + body->oa.o_grant = cli->cl_avail_grant - target_bytes; + cli->cl_avail_grant = target_bytes; client_obd_list_unlock(&cli->cl_loi_list_lock); if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) { body->oa.o_valid |= OBD_MD_FLFLAGS; @@ -1054,7 +995,6 @@ int osc_shrink_grant_to_target(struct client_obd *cli, long target) RETURN(rc); } -#define GRANT_SHRINK_LIMIT PTLRPC_MAX_BRW_SIZE static int osc_should_shrink_grant(struct client_obd *client) { cfs_time_t time = cfs_time_current(); @@ -1064,13 +1004,18 @@ static int osc_should_shrink_grant(struct client_obd *client) OBD_CONNECT_GRANT_SHRINK) == 0) return 0; - if (cfs_time_aftereq(time, next_shrink - 5 * CFS_TICK)) { - if (client->cl_import->imp_state == LUSTRE_IMP_FULL && - client->cl_avail_grant > GRANT_SHRINK_LIMIT) - return 1; - else - osc_update_next_shrink(client); - } + if (cfs_time_aftereq(time, next_shrink - 5 * CFS_TICK)) { + /* Get the current RPC size directly, instead of going via: + * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export) + * Keep comment here so that it can be found by searching. 
*/ + int brw_size = client->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT; + + if (client->cl_import->imp_state == LUSTRE_IMP_FULL && + client->cl_avail_grant > brw_size) + return 1; + else + osc_update_next_shrink(client); + } return 0; } @@ -1128,23 +1073,25 @@ static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) cli->cl_avail_grant = ocd->ocd_grant - cli->cl_dirty; if (cli->cl_avail_grant < 0) { - CWARN("%s: available grant < 0, the OSS is probably not running" - " with patch from bug20278 (%ld) \n", - cli->cl_import->imp_obd->obd_name, cli->cl_avail_grant); - /* workaround for 1.6 servers which do not have - * the patch from bug20278 */ - cli->cl_avail_grant = ocd->ocd_grant; + CWARN("%s: available grant < 0: avail/ocd/dirty %ld/%u/%ld\n", + cli->cl_import->imp_obd->obd_name, cli->cl_avail_grant, + ocd->ocd_grant, cli->cl_dirty); + /* workaround for servers which do not have the patch from + * LU-2679 */ + cli->cl_avail_grant = ocd->ocd_grant; } - client_obd_list_unlock(&cli->cl_loi_list_lock); + /* determine the appropriate chunk size used by osc_extent. */ + cli->cl_chunkbits = max_t(int, PAGE_CACHE_SHIFT, ocd->ocd_blocksize); + client_obd_list_unlock(&cli->cl_loi_list_lock); - CDEBUG(D_CACHE, "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld \n", - cli->cl_import->imp_obd->obd_name, - cli->cl_avail_grant, cli->cl_lost_grant); + CDEBUG(D_CACHE, "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld." 
+ "chunk bits: %d.\n", cli->cl_import->imp_obd->obd_name, + cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits); - if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK && - cfs_list_empty(&cli->cl_grant_shrink_list)) - osc_add_shrink_grant(cli); + if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK && + cfs_list_empty(&cli->cl_grant_shrink_list)) + osc_add_shrink_grant(cli); } /* We assume that the reason this OSC got a short read is because it read @@ -1161,29 +1108,29 @@ static void handle_short_read(int nob_read, obd_count page_count, while (nob_read > 0) { LASSERT (page_count > 0); - if (pga[i]->count > nob_read) { - /* EOF inside this page */ - ptr = cfs_kmap(pga[i]->pg) + - (pga[i]->off & ~CFS_PAGE_MASK); - memset(ptr + nob_read, 0, pga[i]->count - nob_read); - cfs_kunmap(pga[i]->pg); - page_count--; - i++; - break; - } + if (pga[i]->count > nob_read) { + /* EOF inside this page */ + ptr = kmap(pga[i]->pg) + + (pga[i]->off & ~CFS_PAGE_MASK); + memset(ptr + nob_read, 0, pga[i]->count - nob_read); + kunmap(pga[i]->pg); + page_count--; + i++; + break; + } nob_read -= pga[i]->count; page_count--; i++; } - /* zero remaining pages */ - while (page_count-- > 0) { - ptr = cfs_kmap(pga[i]->pg) + (pga[i]->off & ~CFS_PAGE_MASK); - memset(ptr, 0, pga[i]->count); - cfs_kunmap(pga[i]->pg); - i++; - } + /* zero remaining pages */ + while (page_count-- > 0) { + ptr = kmap(pga[i]->pg) + (pga[i]->off & ~CFS_PAGE_MASK); + memset(ptr, 0, pga[i]->count); + kunmap(pga[i]->pg); + i++; + } } static int check_write_rcs(struct ptlrpc_request *req, @@ -1225,8 +1172,9 @@ static int check_write_rcs(struct ptlrpc_request *req, static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) { if (p1->flag != p2->flag) { - unsigned mask = ~(OBD_BRW_FROM_GRANT| OBD_BRW_NOCACHE| - OBD_BRW_SYNC|OBD_BRW_ASYNC|OBD_BRW_NOQUOTA); + unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE | + OBD_BRW_SYNC | OBD_BRW_ASYNC | + OBD_BRW_NOQUOTA | OBD_BRW_SOFT_SYNC); /* warn if we 
try to combine flags that we don't know to be * safe to combine */ @@ -1242,39 +1190,60 @@ static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) } static obd_count osc_checksum_bulk(int nob, obd_count pg_count, - struct brw_page **pga, int opc, - cksum_type_t cksum_type) -{ - __u32 cksum; - int i = 0; - - LASSERT (pg_count > 0); - cksum = init_checksum(cksum_type); - while (nob > 0 && pg_count > 0) { - unsigned char *ptr = cfs_kmap(pga[i]->pg); - int off = pga[i]->off & ~CFS_PAGE_MASK; - int count = pga[i]->count > nob ? nob : pga[i]->count; - - /* corrupt the data before we compute the checksum, to - * simulate an OST->client data error */ - if (i == 0 && opc == OST_READ && - OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) - memcpy(ptr + off, "bad1", min(4, nob)); - cksum = compute_checksum(cksum, ptr + off, count, cksum_type); - cfs_kunmap(pga[i]->pg); - LL_CDEBUG_PAGE(D_PAGE, pga[i]->pg, "off %d checksum %x\n", - off, cksum); - - nob -= pga[i]->count; - pg_count--; - i++; - } - /* For sending we only compute the wrong checksum instead - * of corrupting the data so it is still correct on a redo */ - if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) - cksum++; - - return fini_checksum(cksum, cksum_type); + struct brw_page **pga, int opc, + cksum_type_t cksum_type) +{ + __u32 cksum; + int i = 0; + struct cfs_crypto_hash_desc *hdesc; + unsigned int bufsize; + int err; + unsigned char cfs_alg = cksum_obd2cfs(cksum_type); + + LASSERT(pg_count > 0); + + hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(hdesc)) { + CERROR("Unable to initialize checksum hash %s\n", + cfs_crypto_hash_name(cfs_alg)); + return PTR_ERR(hdesc); + } + + while (nob > 0 && pg_count > 0) { + int count = pga[i]->count > nob ? 
nob : pga[i]->count; + + /* corrupt the data before we compute the checksum, to + * simulate an OST->client data error */ + if (i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) { + unsigned char *ptr = kmap(pga[i]->pg); + int off = pga[i]->off & ~CFS_PAGE_MASK; + memcpy(ptr + off, "bad1", min(4, nob)); + kunmap(pga[i]->pg); + } + cfs_crypto_hash_update_page(hdesc, pga[i]->pg, + pga[i]->off & ~CFS_PAGE_MASK, + count); + LL_CDEBUG_PAGE(D_PAGE, pga[i]->pg, "off %d\n", + (int)(pga[i]->off & ~CFS_PAGE_MASK)); + + nob -= pga[i]->count; + pg_count--; + i++; + } + + bufsize = 4; + err = cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize); + + if (err) + cfs_crypto_hash_final(hdesc, NULL, NULL); + + /* For sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo */ + if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) + cksum++; + + return cksum; } static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, @@ -1331,13 +1300,14 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, } req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ ptlrpc_at_set_req_timeout(req); + /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own + * retry logic */ + req->rq_no_retry_einprogress = 1; - if (opc == OST_WRITE) - desc = ptlrpc_prep_bulk_imp(req, page_count, - BULK_GET_SOURCE, OST_BULK_PORTAL); - else - desc = ptlrpc_prep_bulk_imp(req, page_count, - BULK_PUT_SINK, OST_BULK_PORTAL); + desc = ptlrpc_prep_bulk_imp(req, page_count, + cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS, + opc == OST_WRITE ? 
BULK_GET_SOURCE : BULK_PUT_SINK, + OST_BULK_PORTAL); if (desc == NULL) GOTO(out, rc = -ENOMEM); @@ -1348,26 +1318,32 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); LASSERT(body != NULL && ioobj != NULL && niobuf != NULL); - lustre_set_wire_obdo(&body->oa, oa); - - obdo_to_ioobj(oa, ioobj); - ioobj->ioo_bufcnt = niocount; - osc_pack_capa(req, body, ocapa); - LASSERT (page_count > 0); - pg_prev = pga[0]; + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + obdo_to_ioobj(oa, ioobj); + ioobj->ioo_bufcnt = niocount; + /* The high bits of ioo_max_brw tell the server the _maximum_ number of bulks + * that might be sent for this request. The actual number is decided + * when the RPC is finally sent in ptlrpc_register_bulk(). It sends + * "max - 1" for old client compatibility sending "0", and also so the + * actual maximum is a power-of-two number, not one less. LU-1431 */ + ioobj_max_brw_set(ioobj, desc->bd_md_max_brw); + osc_pack_capa(req, body, ocapa); + LASSERT(page_count > 0); + pg_prev = pga[0]; for (requested_nob = i = 0; i < page_count; i++, niobuf++) { struct brw_page *pg = pga[i]; int poff = pg->off & ~CFS_PAGE_MASK; LASSERT(pg->count > 0); /* make sure there is no gap in the middle of page array */ - LASSERTF(page_count == 1 || - (ergo(i == 0, poff + pg->count == CFS_PAGE_SIZE) && - ergo(i > 0 && i < page_count - 1, - poff == 0 && pg->count == CFS_PAGE_SIZE) && - ergo(i == page_count - 1, poff == 0)), - "i: %d/%d pg: %p off: "LPU64", count: %u\n", - i, page_count, pg, pg->off, pg->count); + LASSERTF(page_count == 1 || + (ergo(i == 0, poff + pg->count == PAGE_CACHE_SIZE) && + ergo(i > 0 && i < page_count - 1, + poff == 0 && pg->count == PAGE_CACHE_SIZE) && + ergo(i == page_count - 1, poff == 0)), + "i: %d/%d pg: %p off: "LPU64", count: %u\n", + i, page_count, pg, pg->off, pg->count); #ifdef __linux__ LASSERTF(i == 0 || pg->off > pg_prev->off, "i 
%d p_c %u pg %p [pri %lu ind %lu] off "LPU64 @@ -1383,7 +1359,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) == (pg->flag & OBD_BRW_SRVLOCK)); - ptlrpc_prep_bulk_page(desc, pg->pg, poff, pg->count); + ptlrpc_prep_bulk_page_pin(desc, pg->pg, poff, pg->count); requested_nob += pg->count; if (i > 0 && can_merge_pages(pg_prev, pg)) { @@ -1509,20 +1485,18 @@ static int check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer, msg = "changed in transit AND doesn't match the original - " "likely false positive due to mmap IO (bug 11742)"; - LCONSOLE_ERROR_MSG(0x132, "BAD WRITE CHECKSUM: %s: from %s inode "DFID - " object "LPU64"/"LPU64" extent ["LPU64"-"LPU64"]\n", - msg, libcfs_nid2str(peer->nid), - oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, - oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, - oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0, - oa->o_id, - oa->o_valid & OBD_MD_FLGROUP ? oa->o_seq : (__u64)0, - pga[0]->off, - pga[page_count-1]->off + pga[page_count-1]->count - 1); - CERROR("original client csum %x (type %x), server csum %x (type %x), " - "client csum now %x\n", client_cksum, client_cksum_type, - server_cksum, cksum_type, new_cksum); - return 1; + LCONSOLE_ERROR_MSG(0x132, "BAD WRITE CHECKSUM: %s: from %s inode "DFID + " object "DOSTID" extent ["LPU64"-"LPU64"]\n", + msg, libcfs_nid2str(peer->nid), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? 
oa->o_parent_ver : 0, + POSTID(&oa->o_oi), pga[0]->off, + pga[page_count-1]->off + pga[page_count-1]->count - 1); + CERROR("original client csum %x (type %x), server csum %x (type %x), " + "client csum now %x\n", client_cksum, client_cksum_type, + server_cksum, cksum_type, new_cksum); + return 1; } /* Note rc enters this function as number of bytes transferred */ @@ -1631,42 +1605,39 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) router = libcfs_nid2str(req->rq_bulk->bd_sender); } - if (server_cksum == ~0 && rc > 0) { - CERROR("Protocol error: server %s set the 'checksum' " - "bit, but didn't send a checksum. Not fatal, " - "but please notify on http://bugs.whamcloud.com/\n", - libcfs_nid2str(peer->nid)); - } else if (server_cksum != client_cksum) { - LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from " - "%s%s%s inode "DFID" object " - LPU64"/"LPU64" extent " - "["LPU64"-"LPU64"]\n", - req->rq_import->imp_obd->obd_name, - libcfs_nid2str(peer->nid), - via, router, - body->oa.o_valid & OBD_MD_FLFID ? - body->oa.o_parent_seq : (__u64)0, - body->oa.o_valid & OBD_MD_FLFID ? - body->oa.o_parent_oid : 0, - body->oa.o_valid & OBD_MD_FLFID ? - body->oa.o_parent_ver : 0, - body->oa.o_id, - body->oa.o_valid & OBD_MD_FLGROUP ? - body->oa.o_seq : (__u64)0, - aa->aa_ppga[0]->off, - aa->aa_ppga[aa->aa_page_count-1]->off + - aa->aa_ppga[aa->aa_page_count-1]->count - - 1); - CERROR("client %x, server %x, cksum_type %x\n", - client_cksum, server_cksum, cksum_type); - cksum_counter = 0; - aa->aa_oa->o_cksum = client_cksum; - rc = -EAGAIN; - } else { - cksum_counter++; - CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); - rc = 0; - } + if (server_cksum == ~0 && rc > 0) { + CERROR("Protocol error: server %s set the 'checksum' " + "bit, but didn't send a checksum. 
Not fatal, " + "but please notify on http://bugs.whamcloud.com/\n", + libcfs_nid2str(peer->nid)); + } else if (server_cksum != client_cksum) { + LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from " + "%s%s%s inode "DFID" object "DOSTID + " extent ["LPU64"-"LPU64"]\n", + req->rq_import->imp_obd->obd_name, + libcfs_nid2str(peer->nid), + via, router, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_seq : (__u64)0, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_oid : 0, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_ver : 0, + POSTID(&body->oa.o_oi), + aa->aa_ppga[0]->off, + aa->aa_ppga[aa->aa_page_count-1]->off + + aa->aa_ppga[aa->aa_page_count-1]->count - + 1); + CERROR("client %x, server %x, cksum_type %x\n", + client_cksum, server_cksum, cksum_type); + cksum_counter = 0; + aa->aa_oa->o_cksum = client_cksum; + rc = -EAGAIN; + } else { + cksum_counter++; + CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); + rc = 0; + } } else if (unlikely(client_cksum)) { static int cksum_missed; @@ -1678,8 +1649,9 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) rc = 0; } out: - if (rc >= 0) - lustre_get_wire_obdo(aa->aa_oa, &body->oa); + if (rc >= 0) + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, + aa->aa_oa, &body->oa); RETURN(rc); } @@ -1689,16 +1661,16 @@ static int osc_brw_internal(int cmd, struct obd_export *exp, struct obdo *oa, obd_count page_count, struct brw_page **pga, struct obd_capa *ocapa) { - struct ptlrpc_request *req; - int rc; - cfs_waitq_t waitq; - int generation, resends = 0; - struct l_wait_info lwi; + struct ptlrpc_request *req; + int rc; + wait_queue_head_t waitq; + int generation, resends = 0; + struct l_wait_info lwi; - ENTRY; + ENTRY; - cfs_waitq_init(&waitq); - generation = exp->exp_obd->u.cli.cl_import->imp_generation; + init_waitqueue_head(&waitq); + generation = exp->exp_obd->u.cli.cl_import->imp_generation; restart_bulk: rc = osc_brw_prep_request(cmd, &exp->exp_obd->u.cli, oa, lsm, @@ 
-1730,17 +1702,17 @@ restart_bulk: if (rc != -EINPROGRESS && !client_should_resend(resends, &exp->exp_obd->u.cli)) { CERROR("%s: too many resend retries for object: " - ""LPU64":"LPU64", rc = %d.\n", - exp->exp_obd->obd_name, oa->o_id, oa->o_seq, rc); - goto out; - } - if (generation != - exp->exp_obd->u.cli.cl_import->imp_generation) { - CDEBUG(D_HA, "%s: resend cross eviction for object: " - ""LPU64":"LPU64", rc = %d.\n", - exp->exp_obd->obd_name, oa->o_id, oa->o_seq, rc); + ""DOSTID", rc = %d.\n", exp->exp_obd->obd_name, + POSTID(&oa->o_oi), rc); goto out; } + if (generation != + exp->exp_obd->u.cli.cl_import->imp_generation) { + CDEBUG(D_HA, "%s: resend cross eviction for object: " + ""DOSTID", rc = %d.\n", exp->exp_obd->obd_name, + POSTID(&oa->o_oi), rc); + goto out; + } lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(resends), NULL, NULL, NULL); @@ -1754,17 +1726,16 @@ out: RETURN (rc); } -int osc_brw_redo_request(struct ptlrpc_request *request, - struct osc_brw_async_args *aa) +static int osc_brw_redo_request(struct ptlrpc_request *request, + struct osc_brw_async_args *aa, int rc) { struct ptlrpc_request *new_req; - struct ptlrpc_request_set *set = request->rq_set; struct osc_brw_async_args *new_aa; struct osc_async_page *oap; - int rc = 0; ENTRY; - DEBUG_REQ(D_ERROR, request, "redo for recoverable error"); + DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request, + "redo for recoverable error %d", rc); rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) == OST_WRITE ? 
OBD_BRW_WRITE :OBD_BRW_READ, @@ -1775,15 +1746,12 @@ int osc_brw_redo_request(struct ptlrpc_request *request, if (rc) RETURN(rc); - client_obd_list_lock(&aa->aa_cli->cl_loi_list_lock); - cfs_list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) { if (oap->oap_request != NULL) { LASSERTF(request == oap->oap_request, "request %p != oap_request %p\n", request, oap->oap_request); if (oap->oap_interrupted) { - client_obd_list_unlock(&aa->aa_cli->cl_loi_list_lock); ptlrpc_req_finished(new_req); RETURN(-EINTR); } @@ -1794,15 +1762,23 @@ int osc_brw_redo_request(struct ptlrpc_request *request, aa->aa_resends++; new_req->rq_interpret_reply = request->rq_interpret_reply; new_req->rq_async_args = request->rq_async_args; - new_req->rq_sent = cfs_time_current_sec() + aa->aa_resends; + new_req->rq_commit_cb = request->rq_commit_cb; + /* cap resend delay to the current request timeout, this is similar to + * what ptlrpc does (see after_reply()) */ + if (aa->aa_resends > new_req->rq_timeout) + new_req->rq_sent = cfs_time_current_sec() + new_req->rq_timeout; + else + new_req->rq_sent = cfs_time_current_sec() + aa->aa_resends; new_req->rq_generation_set = 1; new_req->rq_import_generation = request->rq_import_generation; new_aa = ptlrpc_req_async_args(new_req); CFS_INIT_LIST_HEAD(&new_aa->aa_oaps); - cfs_list_splice(&aa->aa_oaps, &new_aa->aa_oaps); - CFS_INIT_LIST_HEAD(&aa->aa_oaps); + cfs_list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps); + CFS_INIT_LIST_HEAD(&new_aa->aa_exts); + cfs_list_splice_init(&aa->aa_exts, &new_aa->aa_exts); + new_aa->aa_resends = aa->aa_resends; cfs_list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) { if (oap->oap_request) { @@ -1814,16 +1790,14 @@ int osc_brw_redo_request(struct ptlrpc_request *request, new_aa->aa_ocapa = aa->aa_ocapa; aa->aa_ocapa = NULL; - /* use ptlrpc_set_add_req is safe because interpret functions work - * in check_set context. 
only one way exist with access to request - * from different thread got -EINTR - this way protected with - * cl_loi_list_lock */ - ptlrpc_set_add_req(set, new_req); + /* XXX: This code will run into problems if we ever support + * adding a series of BRW RPCs to a self-defined ptlrpc_request_set + * and waiting for all of them to finish. We should inherit the request + * set from the old request. */ + ptlrpcd_add_req(new_req, PDL_POLICY_SAME, -1); - client_obd_list_unlock(&aa->aa_cli->cl_loi_list_lock); - - DEBUG_REQ(D_INFO, new_req, "new request"); - RETURN(0); + DEBUG_REQ(D_INFO, new_req, "new request"); + RETURN(0); } /* @@ -1871,7 +1845,7 @@ static obd_count max_unfragmented_pages(struct brw_page **pg, obd_count pages) if (pages == 0) /* that's all */ return count; - if (offset + pg[i]->count < CFS_PAGE_SIZE) + if (offset + pg[i]->count < PAGE_CACHE_SIZE) return count; /* doesn't end on page boundary */ i++; @@ -1977,1278 +1951,438 @@ out: RETURN(rc); } -/* The companion to osc_enter_cache(), called when @oap is no longer part of - * the dirty accounting. Writeback completes or truncate happens before - * writing starts. Must be called with the loi lock held. */ -static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap, - int sent) -{ - osc_release_write_grant(cli, &oap->oap_brw_page, sent); -} - - -/* This maintains the lists of pending pages to read/write for a given object - * (lop). This is used by osc_check_rpcs->osc_next_loi() and loi_list_maint() - * to quickly find objects that are ready to send an RPC. 
*/ -static int lop_makes_rpc(struct client_obd *cli, struct loi_oap_pages *lop, - int cmd) +static int brw_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *data, int rc) { + struct osc_brw_async_args *aa = data; + struct osc_extent *ext; + struct osc_extent *tmp; + struct cl_object *obj = NULL; + struct client_obd *cli = aa->aa_cli; ENTRY; - if (lop->lop_num_pending == 0) - RETURN(0); - - /* if we have an invalid import we want to drain the queued pages - * by forcing them through rpcs that immediately fail and complete - * the pages. recovery relies on this to empty the queued pages - * before canceling the locks and evicting down the llite pages */ - if (cli->cl_import == NULL || cli->cl_import->imp_invalid) - RETURN(1); + rc = osc_brw_fini_request(req, rc); + CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc); + /* When server return -EINPROGRESS, client should always retry + * regardless of the number of times the bulk was resent already. */ + if (osc_recoverable_error(rc)) { + if (req->rq_import_generation != + req->rq_import->imp_generation) { + CDEBUG(D_HA, "%s: resend cross eviction for object: " + ""DOSTID", rc = %d.\n", + req->rq_import->imp_obd->obd_name, + POSTID(&aa->aa_oa->o_oi), rc); + } else if (rc == -EINPROGRESS || + client_should_resend(aa->aa_resends, aa->aa_cli)) { + rc = osc_brw_redo_request(req, aa, rc); + } else { + CERROR("%s: too many resent retries for object: " + ""LPU64":"LPU64", rc = %d.\n", + req->rq_import->imp_obd->obd_name, + POSTID(&aa->aa_oa->o_oi), rc); + } + + if (rc == 0) + RETURN(0); + else if (rc == -EAGAIN || rc == -EINPROGRESS) + rc = -EIO; + } - /* stream rpcs in queue order as long as as there is an urgent page - * queued. 
this is our cheap solution for good batching in the case - * where writepage marks some random page in the middle of the file - * as urgent because of, say, memory pressure */ - if (!cfs_list_empty(&lop->lop_urgent)) { - CDEBUG(D_CACHE, "urgent request forcing RPC\n"); - RETURN(1); + if (aa->aa_ocapa) { + capa_put(aa->aa_ocapa); + aa->aa_ocapa = NULL; } - if (cmd & OBD_BRW_WRITE) { - /* trigger a write rpc stream as long as there are dirtiers - * waiting for space. as they're waiting, they're not going to - * create more pages to coalesce with what's waiting.. */ - if (!cfs_list_empty(&cli->cl_cache_waiters)) { - CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); - RETURN(1); - } - } - if (lop->lop_num_pending >= cli->cl_max_pages_per_rpc) - RETURN(1); + cfs_list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) { + if (obj == NULL && rc == 0) { + obj = osc2cl(ext->oe_obj); + cl_object_get(obj); + } + + cfs_list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 1, rc); + } + LASSERT(cfs_list_empty(&aa->aa_exts)); + LASSERT(cfs_list_empty(&aa->aa_oaps)); + + if (obj != NULL) { + struct obdo *oa = aa->aa_oa; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned long valid = 0; + + LASSERT(rc == 0); + if (oa->o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = oa->o_blocks; + valid |= CAT_BLOCKS; + } + if (oa->o_valid & OBD_MD_FLMTIME) { + attr->cat_mtime = oa->o_mtime; + valid |= CAT_MTIME; + } + if (oa->o_valid & OBD_MD_FLATIME) { + attr->cat_atime = oa->o_atime; + valid |= CAT_ATIME; + } + if (oa->o_valid & OBD_MD_FLCTIME) { + attr->cat_ctime = oa->o_ctime; + valid |= CAT_CTIME; + } + if (valid != 0) { + cl_object_attr_lock(obj); + cl_object_attr_set(env, obj, attr, valid); + cl_object_attr_unlock(obj); + } + cl_object_put(env, obj); + } + OBDO_FREE(aa->aa_oa); + + cl_req_completion(env, aa->aa_clerq, rc < 0 ? 
rc : + req->rq_bulk->bd_nob_transferred); + osc_release_ppga(aa->aa_ppga, aa->aa_page_count); + ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred); + + client_obd_list_lock(&cli->cl_loi_list_lock); + /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters + * is called so we know whether to go to sync BRWs or wait for more + * RPCs to complete */ + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) + cli->cl_w_in_flight--; + else + cli->cl_r_in_flight--; + osc_wake_cache_waiters(cli); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + osc_io_unplug(env, cli, NULL, PDL_POLICY_SAME); + RETURN(rc); +} + +static void brw_commit(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_lock); + /* If osc_inc_unstable_pages (via osc_extent_finish) races with + * this called via the rq_commit_cb, I need to ensure + * osc_dec_unstable_pages is still called. Otherwise unstable + * pages may be leaked. */ + if (req->rq_unstable) { + spin_unlock(&req->rq_lock); + osc_dec_unstable_pages(req); + spin_lock(&req->rq_lock); + } else { + req->rq_committed = 1; + } + spin_unlock(&req->rq_lock); +} - RETURN(0); +/** + * Build an RPC by the list of extent @ext_list. The caller must ensure + * that the total pages in this list are NOT over max pages per RPC. + * Extents in the list must be in OES_RPC state. + */ +int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, + cfs_list_t *ext_list, int cmd, pdl_policy_t pol) +{ + struct ptlrpc_request *req = NULL; + struct osc_extent *ext; + struct brw_page **pga = NULL; + struct osc_brw_async_args *aa = NULL; + struct obdo *oa = NULL; + struct osc_async_page *oap; + struct osc_async_page *tmp; + struct cl_req *clerq = NULL; + enum cl_req_type crt = (cmd & OBD_BRW_WRITE) ? 
CRT_WRITE : + CRT_READ; + struct ldlm_lock *lock = NULL; + struct cl_req_attr *crattr = NULL; + obd_off starting_offset = OBD_OBJECT_EOF; + obd_off ending_offset = 0; + int mpflag = 0; + int mem_tight = 0; + int page_count = 0; + int i; + int rc; + CFS_LIST_HEAD(rpc_list); + + ENTRY; + LASSERT(!cfs_list_empty(ext_list)); + + /* add pages into rpc_list to build BRW rpc */ + cfs_list_for_each_entry(ext, ext_list, oe_link) { + LASSERT(ext->oe_state == OES_RPC); + mem_tight |= ext->oe_memalloc; + cfs_list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + ++page_count; + cfs_list_add_tail(&oap->oap_rpc_item, &rpc_list); + if (starting_offset > oap->oap_obj_off) + starting_offset = oap->oap_obj_off; + else + LASSERT(oap->oap_page_off == 0); + if (ending_offset < oap->oap_obj_off + oap->oap_count) + ending_offset = oap->oap_obj_off + + oap->oap_count; + else + LASSERT(oap->oap_page_off + oap->oap_count == + PAGE_CACHE_SIZE); + } + } + + if (mem_tight) + mpflag = cfs_memory_pressure_get_and_set(); + + OBD_ALLOC(crattr, sizeof(*crattr)); + if (crattr == NULL) + GOTO(out, rc = -ENOMEM); + + OBD_ALLOC(pga, sizeof(*pga) * page_count); + if (pga == NULL) + GOTO(out, rc = -ENOMEM); + + OBDO_ALLOC(oa); + if (oa == NULL) + GOTO(out, rc = -ENOMEM); + + i = 0; + cfs_list_for_each_entry(oap, &rpc_list, oap_rpc_item) { + struct cl_page *page = oap2cl_page(oap); + if (clerq == NULL) { + clerq = cl_req_alloc(env, page, crt, + 1 /* only 1-object rpcs for now */); + if (IS_ERR(clerq)) + GOTO(out, rc = PTR_ERR(clerq)); + lock = oap->oap_ldlm_lock; + } + if (mem_tight) + oap->oap_brw_flags |= OBD_BRW_MEMALLOC; + pga[i] = &oap->oap_brw_page; + pga[i]->off = oap->oap_obj_off + oap->oap_page_off; + CDEBUG(0, "put page %p index %lu oap %p flg %x to pga\n", + pga[i]->pg, page_index(oap->oap_page), oap, + pga[i]->flag); + i++; + cl_req_page_add(env, clerq, page); + } + + /* always get the data for the obdo for the rpc */ + LASSERT(clerq != NULL); + crattr->cra_oa = oa; + 
cl_req_attr_set(env, clerq, crattr, ~0ULL); + if (lock) { + oa->o_handle = lock->l_remote_handle; + oa->o_valid |= OBD_MD_FLHANDLE; + } + + rc = cl_req_prep(env, clerq); + if (rc != 0) { + CERROR("cl_req_prep failed: %d\n", rc); + GOTO(out, rc); + } + + sort_brw_pages(pga, page_count); + rc = osc_brw_prep_request(cmd, cli, oa, NULL, page_count, + pga, &req, crattr->cra_capa, 1, 0); + if (rc != 0) { + CERROR("prep_req failed: %d\n", rc); + GOTO(out, rc); + } + + req->rq_commit_cb = brw_commit; + req->rq_interpret_reply = brw_interpret; + + if (mem_tight != 0) + req->rq_memalloc = 1; + + /* Need to update the timestamps after the request is built in case + * we race with setattr (locally or in queue at OST). If OST gets + * later setattr before earlier BRW (as determined by the request xid), + * the OST will not use BRW timestamps. Sadly, there is no obvious + * way to do this in a single call. bug 10150 */ + cl_req_attr_set(env, clerq, crattr, + OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME); + + lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid); + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + CFS_INIT_LIST_HEAD(&aa->aa_oaps); + cfs_list_splice_init(&rpc_list, &aa->aa_oaps); + CFS_INIT_LIST_HEAD(&aa->aa_exts); + cfs_list_splice_init(ext_list, &aa->aa_exts); + aa->aa_clerq = clerq; + + /* queued sync pages can be torn down while the pages + * were between the pending list and the rpc */ + tmp = NULL; + cfs_list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) { + /* only one oap gets a request reference */ + if (tmp == NULL) + tmp = oap; + if (oap->oap_interrupted && !req->rq_intr) { + CDEBUG(D_INODE, "oap %p in req %p interrupted\n", + oap, req); + ptlrpc_mark_interrupted(req); + } + } + if (tmp != NULL) + tmp->oap_request = ptlrpc_request_addref(req); + + client_obd_list_lock(&cli->cl_loi_list_lock); + starting_offset >>= PAGE_CACHE_SHIFT; + if (cmd == OBD_BRW_READ) { + cli->cl_r_in_flight++; + 
lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count); + lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight); + lprocfs_oh_tally_log2(&cli->cl_read_offset_hist, + starting_offset + 1); + } else { + cli->cl_w_in_flight++; + lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count); + lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight); + lprocfs_oh_tally_log2(&cli->cl_write_offset_hist, + starting_offset + 1); + } + client_obd_list_unlock(&cli->cl_loi_list_lock); + + DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %dr/%dw in flight", + page_count, aa, cli->cl_r_in_flight, + cli->cl_w_in_flight); + + /* XXX: Maybe the caller can check the RPC bulk descriptor to + * see which CPU/NUMA node the majority of pages were allocated + * on, and try to assign the async RPC to the CPU core + * (PDL_POLICY_PREFERRED) to reduce cross-CPU memory traffic. + * + * But on the other hand, we expect that multiple ptlrpcd + * threads and the initial write sponsor can run in parallel, + * especially when data checksum is enabled, which is CPU-bound + * operation and single ptlrpcd thread cannot process in time. + * So more ptlrpcd threads sharing BRW load + * (with PDL_POLICY_ROUND) seems better. 
+ */ + ptlrpcd_add_req(req, pol, -1); + rc = 0; + EXIT; + +out: + if (mem_tight != 0) + cfs_memory_pressure_restore(mpflag); + + if (crattr != NULL) { + capa_put(crattr->cra_capa); + OBD_FREE(crattr, sizeof(*crattr)); + } + + if (rc != 0) { + LASSERT(req == NULL); + + if (oa) + OBDO_FREE(oa); + if (pga) + OBD_FREE(pga, sizeof(*pga) * page_count); + /* this should happen rarely and is pretty bad, it makes the + * pending list not follow the dirty order */ + while (!cfs_list_empty(ext_list)) { + ext = cfs_list_entry(ext_list->next, struct osc_extent, + oe_link); + cfs_list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 0, rc); + } + if (clerq && !IS_ERR(clerq)) + cl_req_completion(env, clerq, rc); + } + RETURN(rc); } -static int lop_makes_hprpc(struct loi_oap_pages *lop) +static int osc_set_lock_data_with_check(struct ldlm_lock *lock, + struct ldlm_enqueue_info *einfo) { - struct osc_async_page *oap; - ENTRY; + void *data = einfo->ei_cbdata; + int set = 0; - if (cfs_list_empty(&lop->lop_urgent)) - RETURN(0); + LASSERT(lock != NULL); + LASSERT(lock->l_blocking_ast == einfo->ei_cb_bl); + LASSERT(lock->l_resource->lr_type == einfo->ei_type); + LASSERT(lock->l_completion_ast == einfo->ei_cb_cp); + LASSERT(lock->l_glimpse_ast == einfo->ei_cb_gl); - oap = cfs_list_entry(lop->lop_urgent.next, - struct osc_async_page, oap_urgent_item); + lock_res_and_lock(lock); + spin_lock(&osc_ast_guard); - if (oap->oap_async_flags & ASYNC_HP) { - CDEBUG(D_CACHE, "hp request forcing RPC\n"); - RETURN(1); - } + if (lock->l_ast_data == NULL) + lock->l_ast_data = data; + if (lock->l_ast_data == data) + set = 1; - RETURN(0); + spin_unlock(&osc_ast_guard); + unlock_res_and_lock(lock); + + return set; } -static void on_list(cfs_list_t *item, cfs_list_t *list, - int should_be_on) +static int osc_set_data_with_check(struct lustre_handle *lockh, + struct ldlm_enqueue_info *einfo) { - if (cfs_list_empty(item) && should_be_on) - cfs_list_add_tail(item, list); - else if (!cfs_list_empty(item) 
&& !should_be_on) - cfs_list_del_init(item); + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + int set = 0; + + if (lock != NULL) { + set = osc_set_lock_data_with_check(lock, einfo); + LDLM_LOCK_PUT(lock); + } else + CERROR("lockh %p, data %p - client evicted?\n", + lockh, einfo->ei_cbdata); + return set; } -/* maintain the loi's cli list membership invariants so that osc_send_oap_rpc - * can find pages to build into rpcs quickly */ -void loi_list_maint(struct client_obd *cli, struct lov_oinfo *loi) +static int osc_change_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm, + ldlm_iterator_t replace, void *data) { - if (lop_makes_hprpc(&loi->loi_write_lop) || - lop_makes_hprpc(&loi->loi_read_lop)) { - /* HP rpc */ - on_list(&loi->loi_ready_item, &cli->cl_loi_ready_list, 0); - on_list(&loi->loi_hp_ready_item, &cli->cl_loi_hp_ready_list, 1); - } else { - on_list(&loi->loi_hp_ready_item, &cli->cl_loi_hp_ready_list, 0); - on_list(&loi->loi_ready_item, &cli->cl_loi_ready_list, - lop_makes_rpc(cli, &loi->loi_write_lop, OBD_BRW_WRITE)|| - lop_makes_rpc(cli, &loi->loi_read_lop, OBD_BRW_READ)); - } - - on_list(&loi->loi_write_item, &cli->cl_loi_write_list, - loi->loi_write_lop.lop_num_pending); + struct ldlm_res_id res_id; + struct obd_device *obd = class_exp2obd(exp); - on_list(&loi->loi_read_item, &cli->cl_loi_read_list, - loi->loi_read_lop.lop_num_pending); + ostid_build_res_name(&lsm->lsm_oi, &res_id); + ldlm_resource_iterate(obd->obd_namespace, &res_id, replace, data); + return 0; } -static void lop_update_pending(struct client_obd *cli, - struct loi_oap_pages *lop, int cmd, int delta) +/* find any ldlm lock of the inode in osc + * return 0 if no lock is found + * 1 if one is found + * < 0 on error */ +static int osc_find_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm, + ldlm_iterator_t replace, void *data) { - lop->lop_num_pending += delta; - if (cmd & OBD_BRW_WRITE) - cli->cl_pending_w_pages += delta; - else - cli->cl_pending_r_pages += delta; + struct ldlm_res_id 
res_id; + struct obd_device *obd = class_exp2obd(exp); + int rc = 0; + + ostid_build_res_name(&lsm->lsm_oi, &res_id); + rc = ldlm_resource_iterate(obd->obd_namespace, &res_id, replace, data); + if (rc == LDLM_ITER_STOP) + return(1); + if (rc == LDLM_ITER_CONTINUE) + return(0); + return(rc); } -/** - * this is called when a sync waiter receives an interruption. Its job is to - * get the caller woken as soon as possible. If its page hasn't been put in an - * rpc yet it can dequeue immediately. Otherwise it has to mark the rpc as - * desiring interruption which will forcefully complete the rpc once the rpc - * has timed out. - */ -int osc_oap_interrupted(const struct lu_env *env, struct osc_async_page *oap) +static int osc_enqueue_fini(struct ptlrpc_request *req, struct ost_lvb *lvb, + obd_enqueue_update_f upcall, void *cookie, + __u64 *flags, int agl, int rc) { - struct loi_oap_pages *lop; - struct lov_oinfo *loi; - int rc = -EBUSY; + int intent = *flags & LDLM_FL_HAS_INTENT; ENTRY; - LASSERT(!oap->oap_interrupted); - oap->oap_interrupted = 1; - - /* ok, it's been put in an rpc. only one oap gets a request reference */ - if (oap->oap_request != NULL) { - ptlrpc_mark_interrupted(oap->oap_request); - ptlrpcd_wake(oap->oap_request); - ptlrpc_req_finished(oap->oap_request); - oap->oap_request = NULL; - } + if (intent) { + /* The request was created before ldlm_cli_enqueue call. */ + if (rc == ELDLM_LOCK_ABORTED) { + struct ldlm_reply *rep; + rep = req_capsule_server_get(&req->rq_pill, + &RMF_DLM_REP); - /* - * page completion may be called only if ->cpo_prep() method was - * executed by osc_io_submit(), that also adds page the to pending list - */ - if (!cfs_list_empty(&oap->oap_pending_item)) { - cfs_list_del_init(&oap->oap_pending_item); - cfs_list_del_init(&oap->oap_urgent_item); - - loi = oap->oap_loi; - lop = (oap->oap_cmd & OBD_BRW_WRITE) ? 
- &loi->loi_write_lop : &loi->loi_read_lop; - lop_update_pending(oap->oap_cli, lop, oap->oap_cmd, -1); - loi_list_maint(oap->oap_cli, oap->oap_loi); - rc = oap->oap_caller_ops->ap_completion(env, - oap->oap_caller_data, - oap->oap_cmd, NULL, -EINTR); - } - - RETURN(rc); -} - -/* this is trying to propogate async writeback errors back up to the - * application. As an async write fails we record the error code for later if - * the app does an fsync. As long as errors persist we force future rpcs to be - * sync so that the app can get a sync error and break the cycle of queueing - * pages for which writeback will fail. */ -static void osc_process_ar(struct osc_async_rc *ar, __u64 xid, - int rc) -{ - if (rc) { - if (!ar->ar_rc) - ar->ar_rc = rc; - - ar->ar_force_sync = 1; - ar->ar_min_xid = ptlrpc_sample_next_xid(); - return; - - } - - if (ar->ar_force_sync && (xid >= ar->ar_min_xid)) - ar->ar_force_sync = 0; -} - -void osc_oap_to_pending(struct osc_async_page *oap) -{ - struct loi_oap_pages *lop; - - if (oap->oap_cmd & OBD_BRW_WRITE) - lop = &oap->oap_loi->loi_write_lop; - else - lop = &oap->oap_loi->loi_read_lop; - - if (oap->oap_async_flags & ASYNC_HP) - cfs_list_add(&oap->oap_urgent_item, &lop->lop_urgent); - else if (oap->oap_async_flags & ASYNC_URGENT) - cfs_list_add_tail(&oap->oap_urgent_item, &lop->lop_urgent); - cfs_list_add_tail(&oap->oap_pending_item, &lop->lop_pending); - lop_update_pending(oap->oap_cli, lop, oap->oap_cmd, 1); -} - -/* this must be called holding the loi list lock to give coverage to exit_cache, - * async_flag maintenance, and oap_request */ -static void osc_ap_completion(const struct lu_env *env, - struct client_obd *cli, struct obdo *oa, - struct osc_async_page *oap, int sent, int rc) -{ - __u64 xid = 0; - - ENTRY; - if (oap->oap_request != NULL) { - xid = ptlrpc_req_xid(oap->oap_request); - ptlrpc_req_finished(oap->oap_request); - oap->oap_request = NULL; - } - - cfs_spin_lock(&oap->oap_lock); - oap->oap_async_flags = 0; - 
cfs_spin_unlock(&oap->oap_lock); - oap->oap_interrupted = 0; - - if (oap->oap_cmd & OBD_BRW_WRITE) { - osc_process_ar(&cli->cl_ar, xid, rc); - osc_process_ar(&oap->oap_loi->loi_ar, xid, rc); - } - - if (rc == 0 && oa != NULL) { - if (oa->o_valid & OBD_MD_FLBLOCKS) - oap->oap_loi->loi_lvb.lvb_blocks = oa->o_blocks; - if (oa->o_valid & OBD_MD_FLMTIME) - oap->oap_loi->loi_lvb.lvb_mtime = oa->o_mtime; - if (oa->o_valid & OBD_MD_FLATIME) - oap->oap_loi->loi_lvb.lvb_atime = oa->o_atime; - if (oa->o_valid & OBD_MD_FLCTIME) - oap->oap_loi->loi_lvb.lvb_ctime = oa->o_ctime; - } - - rc = oap->oap_caller_ops->ap_completion(env, oap->oap_caller_data, - oap->oap_cmd, oa, rc); - - /* cl_page_completion() drops PG_locked. so, a new I/O on the page could - * start, but OSC calls it under lock and thus we can add oap back to - * pending safely */ - if (rc) - /* upper layer wants to leave the page on pending queue */ - osc_oap_to_pending(oap); - else - osc_exit_cache(cli, oap, sent); - EXIT; -} - -static int brw_queue_work(const struct lu_env *env, void *data) -{ - struct client_obd *cli = data; - - CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli); - - client_obd_list_lock(&cli->cl_loi_list_lock); - osc_check_rpcs0(env, cli, 1); - client_obd_list_unlock(&cli->cl_loi_list_lock); - RETURN(0); -} - -static int brw_interpret(const struct lu_env *env, - struct ptlrpc_request *req, void *data, int rc) -{ - struct osc_brw_async_args *aa = data; - struct client_obd *cli; - int async; - ENTRY; - - rc = osc_brw_fini_request(req, rc); - CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc); - /* When server return -EINPROGRESS, client should always retry - * regardless of the number of times the bulk was resent already. 
*/ - if (osc_recoverable_error(rc)) { - if (req->rq_import_generation != - req->rq_import->imp_generation) { - CDEBUG(D_HA, "%s: resend cross eviction for object: " - ""LPU64":"LPU64", rc = %d.\n", - req->rq_import->imp_obd->obd_name, - aa->aa_oa->o_id, aa->aa_oa->o_seq, rc); - } else if (rc == -EINPROGRESS || - client_should_resend(aa->aa_resends, aa->aa_cli)) { - rc = osc_brw_redo_request(req, aa); - } else { - CERROR("%s: too many resent retries for object: " - ""LPU64":"LPU64", rc = %d.\n", - req->rq_import->imp_obd->obd_name, - aa->aa_oa->o_id, aa->aa_oa->o_seq, rc); - } - - if (rc == 0) - RETURN(0); - else if (rc == -EAGAIN || rc == -EINPROGRESS) - rc = -EIO; - } - - if (aa->aa_ocapa) { - capa_put(aa->aa_ocapa); - aa->aa_ocapa = NULL; - } - - cli = aa->aa_cli; - client_obd_list_lock(&cli->cl_loi_list_lock); - - /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters - * is called so we know whether to go to sync BRWs or wait for more - * RPCs to complete */ - if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) - cli->cl_w_in_flight--; - else - cli->cl_r_in_flight--; - - async = cfs_list_empty(&aa->aa_oaps); - if (!async) { /* from osc_send_oap_rpc() */ - struct osc_async_page *oap, *tmp; - /* the caller may re-use the oap after the completion call so - * we need to clean it up a little */ - cfs_list_for_each_entry_safe(oap, tmp, &aa->aa_oaps, - oap_rpc_item) { - cfs_list_del_init(&oap->oap_rpc_item); - osc_ap_completion(env, cli, aa->aa_oa, oap, 1, rc); - } - OBDO_FREE(aa->aa_oa); - } else { /* from async_internal() */ - obd_count i; - for (i = 0; i < aa->aa_page_count; i++) - osc_release_write_grant(aa->aa_cli, aa->aa_ppga[i], 1); - } - osc_wake_cache_waiters(cli); - osc_check_rpcs0(env, cli, 1); - client_obd_list_unlock(&cli->cl_loi_list_lock); - - if (!async) - cl_req_completion(env, aa->aa_clerq, rc < 0 ? 
rc : - req->rq_bulk->bd_nob_transferred); - osc_release_ppga(aa->aa_ppga, aa->aa_page_count); - ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred); - - RETURN(rc); -} - -static struct ptlrpc_request *osc_build_req(const struct lu_env *env, - struct client_obd *cli, - cfs_list_t *rpc_list, - int page_count, int cmd) -{ - struct ptlrpc_request *req; - struct brw_page **pga = NULL; - struct osc_brw_async_args *aa; - struct obdo *oa = NULL; - const struct obd_async_page_ops *ops = NULL; - struct osc_async_page *oap; - struct osc_async_page *tmp; - struct cl_req *clerq = NULL; - enum cl_req_type crt = (cmd & OBD_BRW_WRITE) ? CRT_WRITE : CRT_READ; - struct ldlm_lock *lock = NULL; - struct cl_req_attr crattr; - int i, rc, mpflag = 0; - - ENTRY; - LASSERT(!cfs_list_empty(rpc_list)); - - if (cmd & OBD_BRW_MEMALLOC) - mpflag = cfs_memory_pressure_get_and_set(); - - memset(&crattr, 0, sizeof crattr); - OBD_ALLOC(pga, sizeof(*pga) * page_count); - if (pga == NULL) - GOTO(out, req = ERR_PTR(-ENOMEM)); - - OBDO_ALLOC(oa); - if (oa == NULL) - GOTO(out, req = ERR_PTR(-ENOMEM)); - - i = 0; - cfs_list_for_each_entry(oap, rpc_list, oap_rpc_item) { - struct cl_page *page = osc_oap2cl_page(oap); - if (ops == NULL) { - ops = oap->oap_caller_ops; - - clerq = cl_req_alloc(env, page, crt, - 1 /* only 1-object rpcs for - * now */); - if (IS_ERR(clerq)) - GOTO(out, req = (void *)clerq); - lock = oap->oap_ldlm_lock; - } - pga[i] = &oap->oap_brw_page; - pga[i]->off = oap->oap_obj_off + oap->oap_page_off; - CDEBUG(0, "put page %p index %lu oap %p flg %x to pga\n", - pga[i]->pg, cfs_page_index(oap->oap_page), oap, pga[i]->flag); - i++; - cl_req_page_add(env, clerq, page); - } - - /* always get the data for the obdo for the rpc */ - LASSERT(ops != NULL); - crattr.cra_oa = oa; - crattr.cra_capa = NULL; - memset(crattr.cra_jobid, 0, JOBSTATS_JOBID_SIZE); - cl_req_attr_set(env, clerq, &crattr, ~0ULL); - if (lock) { - oa->o_handle = lock->l_remote_handle; - oa->o_valid |= OBD_MD_FLHANDLE; - } 
- - rc = cl_req_prep(env, clerq); - if (rc != 0) { - CERROR("cl_req_prep failed: %d\n", rc); - GOTO(out, req = ERR_PTR(rc)); - } - - sort_brw_pages(pga, page_count); - rc = osc_brw_prep_request(cmd, cli, oa, NULL, page_count, - pga, &req, crattr.cra_capa, 1, 0); - if (rc != 0) { - CERROR("prep_req failed: %d\n", rc); - GOTO(out, req = ERR_PTR(rc)); - } - - if (cmd & OBD_BRW_MEMALLOC) - req->rq_memalloc = 1; - - /* Need to update the timestamps after the request is built in case - * we race with setattr (locally or in queue at OST). If OST gets - * later setattr before earlier BRW (as determined by the request xid), - * the OST will not use BRW timestamps. Sadly, there is no obvious - * way to do this in a single call. bug 10150 */ - cl_req_attr_set(env, clerq, &crattr, - OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME); - - lustre_msg_set_jobid(req->rq_reqmsg, crattr.cra_jobid); - - CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - CFS_INIT_LIST_HEAD(&aa->aa_oaps); - cfs_list_splice(rpc_list, &aa->aa_oaps); - CFS_INIT_LIST_HEAD(rpc_list); - aa->aa_clerq = clerq; -out: - if (cmd & OBD_BRW_MEMALLOC) - cfs_memory_pressure_restore(mpflag); - - capa_put(crattr.cra_capa); - if (IS_ERR(req)) { - if (oa) - OBDO_FREE(oa); - if (pga) - OBD_FREE(pga, sizeof(*pga) * page_count); - /* this should happen rarely and is pretty bad, it makes the - * pending list not follow the dirty order */ - client_obd_list_lock(&cli->cl_loi_list_lock); - cfs_list_for_each_entry_safe(oap, tmp, rpc_list, oap_rpc_item) { - cfs_list_del_init(&oap->oap_rpc_item); - - /* queued sync pages can be torn down while the pages - * were between the pending list and the rpc */ - if (oap->oap_interrupted) { - CDEBUG(D_INODE, "oap %p interrupted\n", oap); - osc_ap_completion(env, cli, NULL, oap, 0, - oap->oap_count); - continue; - } - osc_ap_completion(env, cli, NULL, oap, 0, PTR_ERR(req)); - } - if (clerq && !IS_ERR(clerq)) - cl_req_completion(env, clerq, PTR_ERR(req)); - } 
- RETURN(req); -} - -/** - * prepare pages for ASYNC io and put pages in send queue. - * - * \param cmd OBD_BRW_* macroses - * \param lop pending pages - * - * \return zero if no page added to send queue. - * \return 1 if pages successfully added to send queue. - * \return negative on errors. - */ -static int -osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli, - struct lov_oinfo *loi, int cmd, - struct loi_oap_pages *lop, pdl_policy_t pol) -{ - struct ptlrpc_request *req; - obd_count page_count = 0; - struct osc_async_page *oap = NULL, *tmp; - struct osc_brw_async_args *aa; - const struct obd_async_page_ops *ops; - CFS_LIST_HEAD(rpc_list); - int srvlock = 0, mem_tight = 0; - struct cl_object *clob = NULL; - obd_off starting_offset = OBD_OBJECT_EOF; - unsigned int ending_offset; - int starting_page_off = 0; - ENTRY; - - /* ASYNC_HP pages first. At present, when the lock the pages is - * to be canceled, the pages covered by the lock will be sent out - * with ASYNC_HP. We have to send out them as soon as possible. */ - cfs_list_for_each_entry_safe(oap, tmp, &lop->lop_urgent, oap_urgent_item) { - if (oap->oap_async_flags & ASYNC_HP) - cfs_list_move(&oap->oap_pending_item, &rpc_list); - else if (!(oap->oap_brw_flags & OBD_BRW_SYNC)) - /* only do this for writeback pages. */ - cfs_list_move_tail(&oap->oap_pending_item, &rpc_list); - if (++page_count >= cli->cl_max_pages_per_rpc) - break; - } - cfs_list_splice_init(&rpc_list, &lop->lop_pending); - page_count = 0; - - /* first we find the pages we're allowed to work with */ - cfs_list_for_each_entry_safe(oap, tmp, &lop->lop_pending, - oap_pending_item) { - ops = oap->oap_caller_ops; - - LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, " - "magic 0x%x\n", oap, oap->oap_magic); - - if (clob == NULL) { - /* pin object in memory, so that completion call-backs - * can be safely called under client_obd_list lock. 
*/ - clob = osc_oap2cl_page(oap)->cp_obj; - cl_object_get(clob); - } - - if (page_count != 0 && - srvlock != !!(oap->oap_brw_flags & OBD_BRW_SRVLOCK)) { - CDEBUG(D_PAGE, "SRVLOCK flag mismatch," - " oap %p, page %p, srvlock %u\n", - oap, oap->oap_brw_page.pg, (unsigned)!srvlock); - break; - } - - /* If there is a gap at the start of this page, it can't merge - * with any previous page, so we'll hand the network a - * "fragmented" page array that it can't transfer in 1 RDMA */ - if (oap->oap_obj_off < starting_offset) { - if (starting_page_off != 0) - break; - - starting_page_off = oap->oap_page_off; - starting_offset = oap->oap_obj_off + starting_page_off; - } else if (oap->oap_page_off != 0) - break; - - /* in llite being 'ready' equates to the page being locked - * until completion unlocks it. commit_write submits a page - * as not ready because its unlock will happen unconditionally - * as the call returns. if we race with commit_write giving - * us that page we don't want to create a hole in the page - * stream, so we stop and leave the rpc to be fired by - * another dirtier or kupdated interval (the not ready page - * will still be on the dirty list). we could call in - * at the end of ll_file_write to process the queue again. */ - if (!(oap->oap_async_flags & ASYNC_READY)) { - int rc = ops->ap_make_ready(env, oap->oap_caller_data, - cmd); - if (rc < 0) - CDEBUG(D_INODE, "oap %p page %p returned %d " - "instead of ready\n", oap, - oap->oap_page, rc); - switch (rc) { - case -EAGAIN: - /* llite is telling us that the page is still - * in commit_write and that we should try - * and put it in an rpc again later. we - * break out of the loop so we don't create - * a hole in the sequence of pages in the rpc - * stream.*/ - oap = NULL; - break; - case -EINTR: - /* the io isn't needed.. 
tell the checks - * below to complete the rpc with EINTR */ - cfs_spin_lock(&oap->oap_lock); - oap->oap_async_flags |= ASYNC_COUNT_STABLE; - cfs_spin_unlock(&oap->oap_lock); - oap->oap_count = -EINTR; - break; - case 0: - cfs_spin_lock(&oap->oap_lock); - oap->oap_async_flags |= ASYNC_READY; - cfs_spin_unlock(&oap->oap_lock); - break; - default: - LASSERTF(0, "oap %p page %p returned %d " - "from make_ready\n", oap, - oap->oap_page, rc); - break; - } - } - if (oap == NULL) - break; - - /* take the page out of our book-keeping */ - cfs_list_del_init(&oap->oap_pending_item); - lop_update_pending(cli, lop, cmd, -1); - cfs_list_del_init(&oap->oap_urgent_item); - - /* ask the caller for the size of the io as the rpc leaves. */ - if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) { - oap->oap_count = - ops->ap_refresh_count(env, oap->oap_caller_data, - cmd); - LASSERT(oap->oap_page_off + oap->oap_count <= CFS_PAGE_SIZE); - } - if (oap->oap_count <= 0) { - CDEBUG(D_CACHE, "oap %p count %d, completing\n", oap, - oap->oap_count); - osc_ap_completion(env, cli, NULL, - oap, 0, oap->oap_count); - continue; - } - - /* now put the page back in our accounting */ - cfs_list_add_tail(&oap->oap_rpc_item, &rpc_list); - if (page_count++ == 0) - srvlock = !!(oap->oap_brw_flags & OBD_BRW_SRVLOCK); - - if (oap->oap_brw_flags & OBD_BRW_MEMALLOC) - mem_tight = 1; - - /* End on a PTLRPC_MAX_BRW_SIZE boundary. We want full-sized - * RPCs aligned on PTLRPC_MAX_BRW_SIZE boundaries to help reads - * have the same alignment as the initial writes that allocated - * extents on the server. 
*/ - ending_offset = oap->oap_obj_off + oap->oap_page_off + - oap->oap_count; - if (!(ending_offset & (PTLRPC_MAX_BRW_SIZE - 1))) - break; - - if (page_count >= cli->cl_max_pages_per_rpc) - break; - - /* If there is a gap at the end of this page, it can't merge - * with any subsequent pages, so we'll hand the network a - * "fragmented" page array that it can't transfer in 1 RDMA */ - if (oap->oap_page_off + oap->oap_count < CFS_PAGE_SIZE) - break; - } - - loi_list_maint(cli, loi); - - client_obd_list_unlock(&cli->cl_loi_list_lock); - - if (clob != NULL) - cl_object_put(env, clob); - - if (page_count == 0) { - client_obd_list_lock(&cli->cl_loi_list_lock); - RETURN(0); - } - - req = osc_build_req(env, cli, &rpc_list, page_count, - mem_tight ? (cmd | OBD_BRW_MEMALLOC) : cmd); - if (IS_ERR(req)) { - LASSERT(cfs_list_empty(&rpc_list)); - loi_list_maint(cli, loi); - RETURN(PTR_ERR(req)); - } - - aa = ptlrpc_req_async_args(req); - - starting_offset &= PTLRPC_MAX_BRW_SIZE - 1; - if (cmd == OBD_BRW_READ) { - lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count); - lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight); - lprocfs_oh_tally_log2(&cli->cl_read_offset_hist, - (starting_offset >> CFS_PAGE_SHIFT) + 1); - } else { - lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count); - lprocfs_oh_tally(&cli->cl_write_rpc_hist, - cli->cl_w_in_flight); - lprocfs_oh_tally_log2(&cli->cl_write_offset_hist, - (starting_offset >> CFS_PAGE_SHIFT) + 1); - } - - client_obd_list_lock(&cli->cl_loi_list_lock); - - if (cmd == OBD_BRW_READ) - cli->cl_r_in_flight++; - else - cli->cl_w_in_flight++; - - /* queued sync pages can be torn down while the pages - * were between the pending list and the rpc */ - tmp = NULL; - cfs_list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) { - /* only one oap gets a request reference */ - if (tmp == NULL) - tmp = oap; - if (oap->oap_interrupted && !req->rq_intr) { - CDEBUG(D_INODE, "oap %p in req %p interrupted\n", - oap, req); - 
ptlrpc_mark_interrupted(req); - } - } - if (tmp != NULL) - tmp->oap_request = ptlrpc_request_addref(req); - - DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %dr/%dw in flight", - page_count, aa, cli->cl_r_in_flight, cli->cl_w_in_flight); - - req->rq_interpret_reply = brw_interpret; - - /* XXX: Maybe the caller can check the RPC bulk descriptor to see which - * CPU/NUMA node the majority of pages were allocated on, and try - * to assign the async RPC to the CPU core (PDL_POLICY_PREFERRED) - * to reduce cross-CPU memory traffic. - * - * But on the other hand, we expect that multiple ptlrpcd threads - * and the initial write sponsor can run in parallel, especially - * when data checksum is enabled, which is CPU-bound operation and - * single ptlrpcd thread cannot process in time. So more ptlrpcd - * threads sharing BRW load (with PDL_POLICY_ROUND) seems better. - */ - ptlrpcd_add_req(req, pol, -1); - RETURN(1); -} - -#define LOI_DEBUG(LOI, STR, args...) \ - CDEBUG(D_INODE, "loi ready %d wr %d:%d rd %d:%d " STR, \ - !cfs_list_empty(&(LOI)->loi_ready_item) || \ - !cfs_list_empty(&(LOI)->loi_hp_ready_item), \ - (LOI)->loi_write_lop.lop_num_pending, \ - !cfs_list_empty(&(LOI)->loi_write_lop.lop_urgent), \ - (LOI)->loi_read_lop.lop_num_pending, \ - !cfs_list_empty(&(LOI)->loi_read_lop.lop_urgent), \ - args) \ - -/* This is called by osc_check_rpcs() to find which objects have pages that - * we could be sending. These lists are maintained by lop_makes_rpc(). 
*/ -struct lov_oinfo *osc_next_loi(struct client_obd *cli) -{ - ENTRY; - - /* First return objects that have blocked locks so that they - * will be flushed quickly and other clients can get the lock, - * then objects which have pages ready to be stuffed into RPCs */ - if (!cfs_list_empty(&cli->cl_loi_hp_ready_list)) - RETURN(cfs_list_entry(cli->cl_loi_hp_ready_list.next, - struct lov_oinfo, loi_hp_ready_item)); - if (!cfs_list_empty(&cli->cl_loi_ready_list)) - RETURN(cfs_list_entry(cli->cl_loi_ready_list.next, - struct lov_oinfo, loi_ready_item)); - - /* then if we have cache waiters, return all objects with queued - * writes. This is especially important when many small files - * have filled up the cache and not been fired into rpcs because - * they don't pass the nr_pending/object threshhold */ - if (!cfs_list_empty(&cli->cl_cache_waiters) && - !cfs_list_empty(&cli->cl_loi_write_list)) - RETURN(cfs_list_entry(cli->cl_loi_write_list.next, - struct lov_oinfo, loi_write_item)); - - /* then return all queued objects when we have an invalid import - * so that they get flushed */ - if (cli->cl_import == NULL || cli->cl_import->imp_invalid) { - if (!cfs_list_empty(&cli->cl_loi_write_list)) - RETURN(cfs_list_entry(cli->cl_loi_write_list.next, - struct lov_oinfo, - loi_write_item)); - if (!cfs_list_empty(&cli->cl_loi_read_list)) - RETURN(cfs_list_entry(cli->cl_loi_read_list.next, - struct lov_oinfo, loi_read_item)); - } - RETURN(NULL); -} - -static int osc_max_rpc_in_flight(struct client_obd *cli, struct lov_oinfo *loi) -{ - struct osc_async_page *oap; - int hprpc = 0; - - if (!cfs_list_empty(&loi->loi_write_lop.lop_urgent)) { - oap = cfs_list_entry(loi->loi_write_lop.lop_urgent.next, - struct osc_async_page, oap_urgent_item); - hprpc = !!(oap->oap_async_flags & ASYNC_HP); - } - - if (!hprpc && !cfs_list_empty(&loi->loi_read_lop.lop_urgent)) { - oap = cfs_list_entry(loi->loi_read_lop.lop_urgent.next, - struct osc_async_page, oap_urgent_item); - hprpc = 
!!(oap->oap_async_flags & ASYNC_HP); - } - - return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + hprpc; -} - -/* called with the loi list lock held */ -static void osc_check_rpcs0(const struct lu_env *env, struct client_obd *cli, int ptlrpc) -{ - struct lov_oinfo *loi; - int rc = 0, race_counter = 0; - pdl_policy_t pol; - ENTRY; - - pol = ptlrpc ? PDL_POLICY_SAME : PDL_POLICY_ROUND; - - while ((loi = osc_next_loi(cli)) != NULL) { - LOI_DEBUG(loi, "%lu in flight\n", rpcs_in_flight(cli)); - - if (osc_max_rpc_in_flight(cli, loi)) - break; - - /* attempt some read/write balancing by alternating between - * reads and writes in an object. The makes_rpc checks here - * would be redundant if we were getting read/write work items - * instead of objects. we don't want send_oap_rpc to drain a - * partial read pending queue when we're given this object to - * do io on writes while there are cache waiters */ - if (lop_makes_rpc(cli, &loi->loi_write_lop, OBD_BRW_WRITE)) { - rc = osc_send_oap_rpc(env, cli, loi, OBD_BRW_WRITE, - &loi->loi_write_lop, pol); - if (rc < 0) { - CERROR("Write request failed with %d\n", rc); - - /* osc_send_oap_rpc failed, mostly because of - * memory pressure. - * - * It can't break here, because if: - * - a page was submitted by osc_io_submit, so - * page locked; - * - no request in flight - * - no subsequent request - * The system will be in live-lock state, - * because there is no chance to call - * osc_io_unplug() and osc_check_rpcs() any - * more. pdflush can't help in this case, - * because it might be blocked at grabbing - * the page lock as we mentioned. - * - * Anyway, continue to drain pages. 
*/ - /* break; */ - } - - if (rc > 0) - race_counter = 0; - else if (rc == 0) - race_counter++; - } - if (lop_makes_rpc(cli, &loi->loi_read_lop, OBD_BRW_READ)) { - rc = osc_send_oap_rpc(env, cli, loi, OBD_BRW_READ, - &loi->loi_read_lop, pol); - if (rc < 0) - CERROR("Read request failed with %d\n", rc); - - if (rc > 0) - race_counter = 0; - else if (rc == 0) - race_counter++; - } - - /* attempt some inter-object balancing by issuing rpcs - * for each object in turn */ - if (!cfs_list_empty(&loi->loi_hp_ready_item)) - cfs_list_del_init(&loi->loi_hp_ready_item); - if (!cfs_list_empty(&loi->loi_ready_item)) - cfs_list_del_init(&loi->loi_ready_item); - if (!cfs_list_empty(&loi->loi_write_item)) - cfs_list_del_init(&loi->loi_write_item); - if (!cfs_list_empty(&loi->loi_read_item)) - cfs_list_del_init(&loi->loi_read_item); - - loi_list_maint(cli, loi); - - /* send_oap_rpc fails with 0 when make_ready tells it to - * back off. llite's make_ready does this when it tries - * to lock a page queued for write that is already locked. - * we want to try sending rpcs from many objects, but we - * don't want to spin failing with 0. */ - if (race_counter == 10) - break; - } -} - -void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli) -{ - osc_check_rpcs0(env, cli, 0); -} - -/** - * Non-blocking version of osc_enter_cache() that consumes grant only when it - * is available. - */ -int osc_enter_cache_try(const struct lu_env *env, - struct client_obd *cli, struct lov_oinfo *loi, - struct osc_async_page *oap, int transient) -{ - int has_grant; - - has_grant = cli->cl_avail_grant >= CFS_PAGE_SIZE; - if (has_grant) { - osc_consume_write_grant(cli, &oap->oap_brw_page); - if (transient) { - cli->cl_dirty_transit += CFS_PAGE_SIZE; - cfs_atomic_inc(&obd_dirty_transit_pages); - oap->oap_brw_flags |= OBD_BRW_NOCACHE; - } - } - return has_grant; -} - -/* Caller must hold loi_list_lock - we drop/regain it if we need to wait for - * grant or cache space. 
*/ -static int osc_enter_cache(const struct lu_env *env, - struct client_obd *cli, struct lov_oinfo *loi, - struct osc_async_page *oap) -{ - struct osc_cache_waiter ocw; - struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); - int rc = -EDQUOT; - ENTRY; - - CDEBUG(D_CACHE, "dirty: %ld/%d dirty_max: %ld/%d dropped: %lu " - "grant: %lu\n", cli->cl_dirty, cfs_atomic_read(&obd_dirty_pages), - cli->cl_dirty_max, obd_max_dirty_pages, - cli->cl_lost_grant, cli->cl_avail_grant); - - /* force the caller to try sync io. this can jump the list - * of queued writes and create a discontiguous rpc stream */ - if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) || - cli->cl_dirty_max < CFS_PAGE_SIZE || - cli->cl_ar.ar_force_sync || loi->loi_ar.ar_force_sync) - RETURN(-EDQUOT); - - /* Hopefully normal case - cache space and write credits available */ - if (cli->cl_dirty + CFS_PAGE_SIZE <= cli->cl_dirty_max && - cfs_atomic_read(&obd_dirty_pages) + 1 <= obd_max_dirty_pages && - osc_enter_cache_try(env, cli, loi, oap, 0)) - RETURN(0); - - /* We can get here for two reasons: too many dirty pages in cache, or - * run out of grants. In both cases we should write dirty pages out. - * Adding a cache waiter will trigger urgent write-out no matter what - * RPC size will be. - * The exiting condition is no avail grants and no dirty pages caching, - * that really means there is no space on the OST. 
*/ - cfs_waitq_init(&ocw.ocw_waitq); - ocw.ocw_oap = oap; - while (cli->cl_dirty > 0) { - cfs_list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters); - ocw.ocw_rc = 0; - - loi_list_maint(cli, loi); - osc_check_rpcs(env, cli); - client_obd_list_unlock(&cli->cl_loi_list_lock); - - CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n", - cli->cl_import->imp_obd->obd_name, &ocw, oap); - - rc = l_wait_event(ocw.ocw_waitq, cfs_list_empty(&ocw.ocw_entry), &lwi); - - client_obd_list_lock(&cli->cl_loi_list_lock); - cfs_list_del_init(&ocw.ocw_entry); - if (rc < 0) - break; - - rc = ocw.ocw_rc; - if (rc != -EDQUOT) - break; - } - - RETURN(rc); -} - - -int osc_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm, - struct lov_oinfo *loi, cfs_page_t *page, - obd_off offset, const struct obd_async_page_ops *ops, - void *data, void **res, int nocache, - struct lustre_handle *lockh) -{ - struct osc_async_page *oap; - - ENTRY; - - if (!page) - return cfs_size_round(sizeof(*oap)); - - oap = *res; - oap->oap_magic = OAP_MAGIC; - oap->oap_cli = &exp->exp_obd->u.cli; - oap->oap_loi = loi; - - oap->oap_caller_ops = ops; - oap->oap_caller_data = data; - - oap->oap_page = page; - oap->oap_obj_off = offset; - if (!client_is_remote(exp) && - cfs_capable(CFS_CAP_SYS_RESOURCE)) - oap->oap_brw_flags = OBD_BRW_NOQUOTA; - - LASSERT(!(offset & ~CFS_PAGE_MASK)); - - CFS_INIT_LIST_HEAD(&oap->oap_pending_item); - CFS_INIT_LIST_HEAD(&oap->oap_urgent_item); - CFS_INIT_LIST_HEAD(&oap->oap_rpc_item); - CFS_INIT_LIST_HEAD(&oap->oap_page_list); - - cfs_spin_lock_init(&oap->oap_lock); - CDEBUG(D_CACHE, "oap %p page %p obj off "LPU64"\n", oap, page, offset); - RETURN(0); -} - -int osc_queue_async_io(const struct lu_env *env, struct obd_export *exp, - struct lov_stripe_md *lsm, struct lov_oinfo *loi, - struct osc_async_page *oap, int cmd, int off, - int count, obd_flag brw_flags, enum async_flags async_flags) -{ - struct client_obd *cli = &exp->exp_obd->u.cli; - int rc = 0; - ENTRY; - - if 
(oap->oap_magic != OAP_MAGIC) - RETURN(-EINVAL); - - if (cli->cl_import == NULL || cli->cl_import->imp_invalid) - RETURN(-EIO); - - if (!cfs_list_empty(&oap->oap_pending_item) || - !cfs_list_empty(&oap->oap_urgent_item) || - !cfs_list_empty(&oap->oap_rpc_item)) - RETURN(-EBUSY); - - /* check if the file's owner/group is over quota */ - if ((cmd & OBD_BRW_WRITE) && !(cmd & OBD_BRW_NOQUOTA)) { - struct cl_object *obj; - struct cl_attr attr; /* XXX put attr into thread info */ - unsigned int qid[MAXQUOTAS]; - - obj = cl_object_top(osc_oap2cl_page(oap)->cp_obj); - - cl_object_attr_lock(obj); - rc = cl_object_attr_get(env, obj, &attr); - cl_object_attr_unlock(obj); - - qid[USRQUOTA] = attr.cat_uid; - qid[GRPQUOTA] = attr.cat_gid; - if (rc == 0 && - osc_quota_chkdq(cli, qid) == NO_QUOTA) - rc = -EDQUOT; - if (rc) - RETURN(rc); - } - - if (loi == NULL) - loi = lsm->lsm_oinfo[0]; - - client_obd_list_lock(&cli->cl_loi_list_lock); - - LASSERT(off + count <= CFS_PAGE_SIZE); - oap->oap_cmd = cmd; - oap->oap_page_off = off; - oap->oap_count = count; - oap->oap_brw_flags = brw_flags; - /* Give a hint to OST that requests are coming from kswapd - bug19529 */ - if (cfs_memory_pressure_get()) - oap->oap_brw_flags |= OBD_BRW_MEMALLOC; - cfs_spin_lock(&oap->oap_lock); - oap->oap_async_flags = async_flags; - cfs_spin_unlock(&oap->oap_lock); - - if (cmd & OBD_BRW_WRITE) { - rc = osc_enter_cache(env, cli, loi, oap); - if (rc) { - client_obd_list_unlock(&cli->cl_loi_list_lock); - RETURN(rc); - } - } - - LOI_DEBUG(loi, "oap %p page %p added for cmd %d\n", oap, oap->oap_page, - cmd); - - osc_oap_to_pending(oap); - loi_list_maint(cli, loi); - if (!osc_max_rpc_in_flight(cli, loi) && - lop_makes_rpc(cli, &loi->loi_write_lop, OBD_BRW_WRITE)) { - LASSERT(cli->cl_writeback_work != NULL); - rc = ptlrpcd_queue_work(cli->cl_writeback_work); - - CDEBUG(D_CACHE, "Queued writeback work for client obd %p/%d.\n", - cli, rc); - } - client_obd_list_unlock(&cli->cl_loi_list_lock); - - RETURN(0); -} - -/* 
aka (~was & now & flag), but this is more clear :) */ -#define SETTING(was, now, flag) (!(was & flag) && (now & flag)) - -int osc_set_async_flags_base(struct client_obd *cli, - struct lov_oinfo *loi, struct osc_async_page *oap, - obd_flag async_flags) -{ - struct loi_oap_pages *lop; - int flags = 0; - ENTRY; - - LASSERT(!cfs_list_empty(&oap->oap_pending_item)); - - if (oap->oap_cmd & OBD_BRW_WRITE) { - lop = &loi->loi_write_lop; - } else { - lop = &loi->loi_read_lop; - } - - if ((oap->oap_async_flags & async_flags) == async_flags) - RETURN(0); - - if (SETTING(oap->oap_async_flags, async_flags, ASYNC_READY)) - flags |= ASYNC_READY; - - if (SETTING(oap->oap_async_flags, async_flags, ASYNC_URGENT) && - cfs_list_empty(&oap->oap_rpc_item)) { - if (oap->oap_async_flags & ASYNC_HP) - cfs_list_add(&oap->oap_urgent_item, &lop->lop_urgent); - else - cfs_list_add_tail(&oap->oap_urgent_item, - &lop->lop_urgent); - flags |= ASYNC_URGENT; - loi_list_maint(cli, loi); - } - cfs_spin_lock(&oap->oap_lock); - oap->oap_async_flags |= flags; - cfs_spin_unlock(&oap->oap_lock); - - LOI_DEBUG(loi, "oap %p page %p has flags %x\n", oap, oap->oap_page, - oap->oap_async_flags); - RETURN(0); -} - -int osc_teardown_async_page(struct obd_export *exp, struct lov_stripe_md *lsm, - struct lov_oinfo *loi, struct osc_async_page *oap) -{ - struct client_obd *cli = &exp->exp_obd->u.cli; - struct loi_oap_pages *lop; - int rc = 0; - ENTRY; - - if (oap->oap_magic != OAP_MAGIC) - RETURN(-EINVAL); - - if (loi == NULL) - loi = lsm->lsm_oinfo[0]; - - if (oap->oap_cmd & OBD_BRW_WRITE) { - lop = &loi->loi_write_lop; - } else { - lop = &loi->loi_read_lop; - } - - client_obd_list_lock(&cli->cl_loi_list_lock); - - if (!cfs_list_empty(&oap->oap_rpc_item)) - GOTO(out, rc = -EBUSY); - - osc_exit_cache(cli, oap, 0); - osc_wake_cache_waiters(cli); - - if (!cfs_list_empty(&oap->oap_urgent_item)) { - cfs_list_del_init(&oap->oap_urgent_item); - cfs_spin_lock(&oap->oap_lock); - oap->oap_async_flags &= ~(ASYNC_URGENT | 
ASYNC_HP); - cfs_spin_unlock(&oap->oap_lock); - } - if (!cfs_list_empty(&oap->oap_pending_item)) { - cfs_list_del_init(&oap->oap_pending_item); - lop_update_pending(cli, lop, oap->oap_cmd, -1); - } - loi_list_maint(cli, loi); - LOI_DEBUG(loi, "oap %p page %p torn down\n", oap, oap->oap_page); -out: - client_obd_list_unlock(&cli->cl_loi_list_lock); - RETURN(rc); -} - -static int osc_set_lock_data_with_check(struct ldlm_lock *lock, - struct ldlm_enqueue_info *einfo) -{ - void *data = einfo->ei_cbdata; - int set = 0; - - LASSERT(lock != NULL); - LASSERT(lock->l_blocking_ast == einfo->ei_cb_bl); - LASSERT(lock->l_resource->lr_type == einfo->ei_type); - LASSERT(lock->l_completion_ast == einfo->ei_cb_cp); - LASSERT(lock->l_glimpse_ast == einfo->ei_cb_gl); - - lock_res_and_lock(lock); - cfs_spin_lock(&osc_ast_guard); - - if (lock->l_ast_data == NULL) - lock->l_ast_data = data; - if (lock->l_ast_data == data) - set = 1; - - cfs_spin_unlock(&osc_ast_guard); - unlock_res_and_lock(lock); - - return set; -} - -static int osc_set_data_with_check(struct lustre_handle *lockh, - struct ldlm_enqueue_info *einfo) -{ - struct ldlm_lock *lock = ldlm_handle2lock(lockh); - int set = 0; - - if (lock != NULL) { - set = osc_set_lock_data_with_check(lock, einfo); - LDLM_LOCK_PUT(lock); - } else - CERROR("lockh %p, data %p - client evicted?\n", - lockh, einfo->ei_cbdata); - return set; -} - -static int osc_change_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm, - ldlm_iterator_t replace, void *data) -{ - struct ldlm_res_id res_id; - struct obd_device *obd = class_exp2obd(exp); - - osc_build_res_name(lsm->lsm_object_id, lsm->lsm_object_seq, &res_id); - ldlm_resource_iterate(obd->obd_namespace, &res_id, replace, data); - return 0; -} - -/* find any ldlm lock of the inode in osc - * return 0 not find - * 1 find one - * < 0 error */ -static int osc_find_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm, - ldlm_iterator_t replace, void *data) -{ - struct ldlm_res_id res_id; - 
struct obd_device *obd = class_exp2obd(exp); - int rc = 0; - - osc_build_res_name(lsm->lsm_object_id, lsm->lsm_object_seq, &res_id); - rc = ldlm_resource_iterate(obd->obd_namespace, &res_id, replace, data); - if (rc == LDLM_ITER_STOP) - return(1); - if (rc == LDLM_ITER_CONTINUE) - return(0); - return(rc); -} - -static int osc_enqueue_fini(struct ptlrpc_request *req, struct ost_lvb *lvb, - obd_enqueue_update_f upcall, void *cookie, - int *flags, int agl, int rc) -{ - int intent = *flags & LDLM_FL_HAS_INTENT; - ENTRY; - - if (intent) { - /* The request was created before ldlm_cli_enqueue call. */ - if (rc == ELDLM_LOCK_ABORTED) { - struct ldlm_reply *rep; - rep = req_capsule_server_get(&req->rq_pill, - &RMF_DLM_REP); - - LASSERT(rep != NULL); - if (rep->lock_policy_res1) - rc = rep->lock_policy_res1; - } + LASSERT(rep != NULL); + rep->lock_policy_res1 = + ptlrpc_status_ntoh(rep->lock_policy_res1); + if (rep->lock_policy_res1) + rc = rep->lock_policy_res1; + } } if ((intent != 0 && rc == ELDLM_LOCK_ABORTED && agl == 0) || @@ -3272,7 +2406,7 @@ static int osc_enqueue_interpret(const struct lu_env *env, __u32 mode; struct ost_lvb *lvb; __u32 lvb_len; - int *flags = aa->oa_flags; + __u64 *flags = aa->oa_flags; /* Make a local copy of a lock handle and a mode, because aa->oa_* * might be freed anytime after lock upcall has been called. */ @@ -3326,8 +2460,8 @@ static int osc_enqueue_interpret(const struct lu_env *env, } void osc_update_enqueue(struct lustre_handle *lov_lockhp, - struct lov_oinfo *loi, int flags, - struct ost_lvb *lvb, __u32 mode, int rc) + struct lov_oinfo *loi, __u64 flags, + struct ost_lvb *lvb, __u32 mode, int rc) { struct ldlm_lock *lock = ldlm_handle2lock(lov_lockhp); @@ -3380,20 +2514,20 @@ struct ptlrpc_request_set *PTLRPCD_SET = (void *)1; * is excluded from the cluster -- such scenarious make the life difficult, so * release locks just after they are obtained. 
*/ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, - int *flags, ldlm_policy_data_t *policy, - struct ost_lvb *lvb, int kms_valid, - obd_enqueue_update_f upcall, void *cookie, - struct ldlm_enqueue_info *einfo, - struct lustre_handle *lockh, - struct ptlrpc_request_set *rqset, int async, int agl) -{ - struct obd_device *obd = exp->exp_obd; - struct ptlrpc_request *req = NULL; - int intent = *flags & LDLM_FL_HAS_INTENT; - int match_lvb = (agl != 0 ? 0 : LDLM_FL_LVB_READY); - ldlm_mode_t mode; - int rc; - ENTRY; + __u64 *flags, ldlm_policy_data_t *policy, + struct ost_lvb *lvb, int kms_valid, + obd_enqueue_update_f upcall, void *cookie, + struct ldlm_enqueue_info *einfo, + struct lustre_handle *lockh, + struct ptlrpc_request_set *rqset, int async, int agl) +{ + struct obd_device *obd = exp->exp_obd; + struct ptlrpc_request *req = NULL; + int intent = *flags & LDLM_FL_HAS_INTENT; + __u64 match_lvb = (agl != 0 ? 0 : LDLM_FL_LVB_READY); + ldlm_mode_t mode; + int rc; + ENTRY; /* Filesystem lock extents are extended to page boundaries so that * dealing with the page cache is a little smoother. */ @@ -3429,7 +2563,7 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, if (mode) { struct ldlm_lock *matched = ldlm_handle2lock(lockh); - if ((agl != 0) && !(matched->l_flags & LDLM_FL_LVB_READY)) { + if ((agl != 0) && !ldlm_is_lvb_ready(matched)) { /* For AGL, if enqueue RPC is sent but the lock is not * granted, then skip to process this strpe. * Return -ECANCELED to tell the caller. */ @@ -3448,7 +2582,10 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, * are explained in lov_enqueue() */ } - /* We already have a lock, and it's referenced */ + /* We already have a lock, and it's referenced. + * + * At this point, the cl_lock::cll_state is CLS_QUEUING, + * AGL upcall may change it to CLS_HELD directly. 
*/ (*upcall)(cookie, ELDLM_OK); if (einfo->ei_mode != mode) @@ -3487,7 +2624,7 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, *flags &= ~LDLM_FL_BLOCK_GRANTED; rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb, - sizeof(*lvb), lockh, async); + sizeof(*lvb), LVB_T_OST, lockh, async); if (rqset) { if (!rc) { struct osc_enqueue_args *aa; @@ -3529,9 +2666,7 @@ static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo, int rc; ENTRY; - osc_build_res_name(oinfo->oi_md->lsm_object_id, - oinfo->oi_md->lsm_object_seq, &res_id); - + ostid_build_res_name(&oinfo->oi_md->lsm_oi, &res_id); rc = osc_enqueue_base(exp, &res_id, &oinfo->oi_flags, &oinfo->oi_policy, &oinfo->oi_md->lsm_oinfo[0]->loi_lvb, oinfo->oi_md->lsm_oinfo[0]->loi_kms_valid, @@ -3541,14 +2676,14 @@ static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo, } int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, - __u32 type, ldlm_policy_data_t *policy, __u32 mode, - int *flags, void *data, struct lustre_handle *lockh, - int unref) + __u32 type, ldlm_policy_data_t *policy, __u32 mode, + __u64 *flags, void *data, struct lustre_handle *lockh, + int unref) { - struct obd_device *obd = exp->exp_obd; - int lflags = *flags; - ldlm_mode_t rc; - ENTRY; + struct obd_device *obd = exp->exp_obd; + __u64 lflags = *flags; + ldlm_mode_t rc; + ENTRY; if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH)) RETURN(-EIO); @@ -3611,10 +2746,10 @@ static int osc_cancel_unused(struct obd_export *exp, struct obd_device *obd = class_exp2obd(exp); struct ldlm_res_id res_id, *resp = NULL; - if (lsm != NULL) { - resp = osc_build_res_name(lsm->lsm_object_id, - lsm->lsm_object_seq, &res_id); - } + if (lsm != NULL) { + ostid_build_res_name(&lsm->lsm_oi, &res_id); + resp = &res_id; + } return ldlm_cli_cancel_unused(obd->obd_namespace, resp, flags, opaque); } @@ -3623,9 +2758,7 @@ static int osc_statfs_interpret(const struct lu_env *env, struct ptlrpc_request *req, struct 
osc_async_args *aa, int rc) { - struct client_obd *cli = &req->rq_import->imp_obd->u.cli; struct obd_statfs *msfs; - __u64 used; ENTRY; if (rc == -EBADR) @@ -3648,51 +2781,6 @@ static int osc_statfs_interpret(const struct lu_env *env, GOTO(out, rc = -EPROTO); } - /* Reinitialize the RDONLY and DEGRADED flags at the client - * on each statfs, so they don't stay set permanently. */ - cfs_spin_lock(&cli->cl_oscc.oscc_lock); - - if (unlikely(msfs->os_state & OS_STATE_DEGRADED)) - cli->cl_oscc.oscc_flags |= OSCC_FLAG_DEGRADED; - else if (unlikely(cli->cl_oscc.oscc_flags & OSCC_FLAG_DEGRADED)) - cli->cl_oscc.oscc_flags &= ~OSCC_FLAG_DEGRADED; - - if (unlikely(msfs->os_state & OS_STATE_READONLY)) - cli->cl_oscc.oscc_flags |= OSCC_FLAG_RDONLY; - else if (unlikely(cli->cl_oscc.oscc_flags & OSCC_FLAG_RDONLY)) - cli->cl_oscc.oscc_flags &= ~OSCC_FLAG_RDONLY; - - /* Add a bit of hysteresis so this flag isn't continually flapping, - * and ensure that new files don't get extremely fragmented due to - * only a small amount of available space in the filesystem. - * We want to set the NOSPC flag when there is less than ~0.1% free - * and clear it when there is at least ~0.2% free space, so: - * avail < ~0.1% max max = avail + used - * 1025 * avail < avail + used used = blocks - free - * 1024 * avail < used - * 1024 * avail < blocks - free - * avail < ((blocks - free) >> 10) - * - * On very large disk, say 16TB 0.1% will be 16 GB. We don't want to - * lose that amount of space so in those cases we report no space left - * if their is less than 1 GB left. 
*/ - used = min_t(__u64,(msfs->os_blocks - msfs->os_bfree) >> 10, 1 << 30); - if (unlikely(((cli->cl_oscc.oscc_flags & OSCC_FLAG_NOSPC) == 0) && - ((msfs->os_ffree < 32) || (msfs->os_bavail < used)))) - cli->cl_oscc.oscc_flags |= OSCC_FLAG_NOSPC; - else if (unlikely(((cli->cl_oscc.oscc_flags & OSCC_FLAG_NOSPC) != 0) && - (msfs->os_ffree > 64) && - (msfs->os_bavail > (used << 1)))) { - cli->cl_oscc.oscc_flags &= ~(OSCC_FLAG_NOSPC | - OSCC_FLAG_NOSPC_BLK); - } - - if (unlikely(((cli->cl_oscc.oscc_flags & OSCC_FLAG_NOSPC) != 0) && - (msfs->os_bavail < used))) - cli->cl_oscc.oscc_flags |= OSCC_FLAG_NOSPC_BLK; - - cfs_spin_unlock(&cli->cl_oscc.oscc_lock); - *aa->aa_oi->oi_osfs = *msfs; out: rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc); @@ -3755,10 +2843,10 @@ static int osc_statfs(const struct lu_env *env, struct obd_export *exp, /*Since the request might also come from lprocfs, so we need *sync this with client_disconnect_export Bug15684*/ - cfs_down_read(&obd->u.cli.cl_sem); + down_read(&obd->u.cli.cl_sem); if (obd->u.cli.cl_import) imp = class_import_get(obd->u.cli.cl_import); - cfs_up_read(&obd->u.cli.cl_sem); + up_read(&obd->u.cli.cl_sem); if (!imp) RETURN(-ENODEV); @@ -3827,7 +2915,7 @@ static int osc_getstripe(struct lov_stripe_md *lsm, struct lov_user_md *lump) /* we only need the header part from user space to get lmm_magic and * lmm_stripe_count, (the header part is common to v1 and v3) */ lum_size = sizeof(struct lov_user_md_v1); - if (cfs_copy_from_user(&lum, lump, lum_size)) + if (copy_from_user(&lum, lump, lum_size)) RETURN(-EFAULT); if ((lum.lmm_magic != LOV_USER_MAGIC_V1) && @@ -3847,27 +2935,27 @@ static int osc_getstripe(struct lov_stripe_md *lsm, struct lov_user_md *lump) if (!lumk) RETURN(-ENOMEM); - if (lum.lmm_magic == LOV_USER_MAGIC_V1) - lmm_objects = &(((struct lov_user_md_v1 *)lumk)->lmm_objects[0]); - else - lmm_objects = &(lumk->lmm_objects[0]); - lmm_objects->l_object_id = lsm->lsm_object_id; - } else { - lum_size = lov_mds_md_size(0, 
lum.lmm_magic); - lumk = &lum; - } + if (lum.lmm_magic == LOV_USER_MAGIC_V1) + lmm_objects = + &(((struct lov_user_md_v1 *)lumk)->lmm_objects[0]); + else + lmm_objects = &(lumk->lmm_objects[0]); + lmm_objects->l_ost_oi = lsm->lsm_oi; + } else { + lum_size = lov_mds_md_size(0, lum.lmm_magic); + lumk = &lum; + } - lumk->lmm_object_id = lsm->lsm_object_id; - lumk->lmm_object_seq = lsm->lsm_object_seq; - lumk->lmm_stripe_count = 1; + lumk->lmm_oi = lsm->lsm_oi; + lumk->lmm_stripe_count = 1; - if (cfs_copy_to_user(lump, lumk, lum_size)) - rc = -EFAULT; + if (copy_to_user(lump, lumk, lum_size)) + rc = -EFAULT; - if (lumk != &lum) - OBD_FREE(lumk, lum_size); + if (lumk != &lum) + OBD_FREE(lumk, lum_size); - RETURN(rc); + RETURN(rc); } @@ -3879,10 +2967,10 @@ static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, int err = 0; ENTRY; - if (!cfs_try_module_get(THIS_MODULE)) { - CERROR("Can't get module. Is it alive?"); - return -EINVAL; - } + if (!try_module_get(THIS_MODULE)) { + CERROR("Can't get module. 
Is it alive?"); + return -EINVAL; + } switch (cmd) { case OBD_IOC_LOV_GET_CONFIG: { char *buf; @@ -3917,7 +3005,7 @@ static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, memcpy(data->ioc_inlbuf2, &obd->obd_uuid, sizeof(uuid)); - err = cfs_copy_to_user((void *)uarg, buf, len); + err = copy_to_user((void *)uarg, buf, len); if (err) err = -EFAULT; obd_ioctl_freedata(buf, len); @@ -3947,14 +3035,14 @@ static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, case OBD_IOC_PING_TARGET: err = ptlrpc_obd_ping(obd); GOTO(out, err); - default: - CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n", - cmd, cfs_curproc_comm()); - GOTO(out, err = -ENOTTY); - } + default: + CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n", + cmd, current_comm()); + GOTO(out, err = -ENOTTY); + } out: - cfs_module_put(THIS_MODULE); - return err; + module_put(THIS_MODULE); + return err; } static int osc_get_info(const struct lu_env *env, struct obd_export *exp, @@ -4007,15 +3095,52 @@ static int osc_get_info(const struct lu_env *env, struct obd_export *exp, ptlrpc_req_finished(req); RETURN(rc); } else if (KEY_IS(KEY_FIEMAP)) { - struct ptlrpc_request *req; - struct ll_user_fiemap *reply; - char *tmp; - int rc; - + struct ll_fiemap_info_key *fm_key = + (struct ll_fiemap_info_key *)key; + struct ldlm_res_id res_id; + ldlm_policy_data_t policy; + struct lustre_handle lockh; + ldlm_mode_t mode = 0; + struct ptlrpc_request *req; + struct ll_user_fiemap *reply; + char *tmp; + int rc; + + if (!(fm_key->fiemap.fm_flags & FIEMAP_FLAG_SYNC)) + goto skip_locking; + + policy.l_extent.start = fm_key->fiemap.fm_start & + CFS_PAGE_MASK; + + if (OBD_OBJECT_EOF - fm_key->fiemap.fm_length <= + fm_key->fiemap.fm_start + PAGE_CACHE_SIZE - 1) + policy.l_extent.end = OBD_OBJECT_EOF; + else + policy.l_extent.end = (fm_key->fiemap.fm_start + + fm_key->fiemap.fm_length + + PAGE_CACHE_SIZE - 1) & CFS_PAGE_MASK; + + ostid_build_res_name(&fm_key->oa.o_oi, &res_id); + mode = 
ldlm_lock_match(exp->exp_obd->obd_namespace, + LDLM_FL_BLOCK_GRANTED | + LDLM_FL_LVB_READY, + &res_id, LDLM_EXTENT, &policy, + LCK_PR | LCK_PW, &lockh, 0); + if (mode) { /* lock is cached on client */ + if (mode != LCK_PR) { + ldlm_lock_addref(&lockh, LCK_PR); + ldlm_lock_decref(&lockh, LCK_PW); + } + } else { /* no cached lock, needs acquire lock on server side */ + fm_key->oa.o_valid |= OBD_MD_FLFLAGS; + fm_key->oa.o_flags |= OBD_FL_SRVLOCK; + } + +skip_locking: req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GET_INFO_FIEMAP); if (req == NULL) - RETURN(-ENOMEM); + GOTO(drop_lock, rc = -ENOMEM); req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_KEY, RCL_CLIENT, keylen); @@ -4027,7 +3152,7 @@ static int osc_get_info(const struct lu_env *env, struct obd_export *exp, rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO); if (rc) { ptlrpc_request_free(req); - RETURN(rc); + GOTO(drop_lock, rc); } tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_KEY); @@ -4038,56 +3163,24 @@ static int osc_get_info(const struct lu_env *env, struct obd_export *exp, ptlrpc_request_set_replen(req); rc = ptlrpc_queue_wait(req); if (rc) - GOTO(out1, rc); + GOTO(fini_req, rc); reply = req_capsule_server_get(&req->rq_pill, &RMF_FIEMAP_VAL); if (reply == NULL) - GOTO(out1, rc = -EPROTO); + GOTO(fini_req, rc = -EPROTO); memcpy(val, reply, *vallen); - out1: +fini_req: ptlrpc_req_finished(req); - +drop_lock: + if (mode) + ldlm_lock_decref(&lockh, LCK_PR); RETURN(rc); } RETURN(-EINVAL); } -static int osc_setinfo_mds_connect_import(struct obd_import *imp) -{ - struct llog_ctxt *ctxt; - int rc = 0; - ENTRY; - - ctxt = llog_get_context(imp->imp_obd, LLOG_MDS_OST_ORIG_CTXT); - if (ctxt) { - rc = llog_initiator_connect(ctxt); - llog_ctxt_put(ctxt); - } else { - /* XXX return an error? skip setting below flags? 
*/ - } - - cfs_spin_lock(&imp->imp_lock); - imp->imp_server_timeout = 1; - imp->imp_pingable = 1; - cfs_spin_unlock(&imp->imp_lock); - CDEBUG(D_RPCTRACE, "pinging OST %s\n", obd2cli_tgt(imp->imp_obd)); - - RETURN(rc); -} - -static int osc_setinfo_mds_conn_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - void *aa, int rc) -{ - ENTRY; - if (rc != 0) - RETURN(rc); - - RETURN(osc_setinfo_mds_connect_import(req->rq_import)); -} - static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, obd_count keylen, void *key, obd_count vallen, void *val, struct ptlrpc_request_set *set) @@ -4101,32 +3194,6 @@ static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10); - if (KEY_IS(KEY_NEXT_ID)) { - obd_id new_val; - struct osc_creator *oscc = &obd->u.cli.cl_oscc; - - if (vallen != sizeof(obd_id)) - RETURN(-ERANGE); - if (val == NULL) - RETURN(-EINVAL); - - if (vallen != sizeof(obd_id)) - RETURN(-EINVAL); - - /* avoid race between allocate new object and set next id - * from ll_sync thread */ - cfs_spin_lock(&oscc->oscc_lock); - new_val = *((obd_id*)val) + 1; - if (new_val > oscc->oscc_next_id) - oscc->oscc_next_id = new_val; - cfs_spin_unlock(&oscc->oscc_lock); - CDEBUG(D_HA, "%s: set oscc_next_id = "LPU64"\n", - exp->exp_obd->obd_name, - obd->u.cli.cl_oscc.oscc_next_id); - - RETURN(0); - } - if (KEY_IS(KEY_CHECKSUM)) { if (vallen != sizeof(int)) RETURN(-EINVAL); @@ -4144,6 +3211,33 @@ static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, RETURN(0); } + if (KEY_IS(KEY_CACHE_SET)) { + struct client_obd *cli = &obd->u.cli; + + LASSERT(cli->cl_cache == NULL); /* only once */ + cli->cl_cache = (struct cl_client_cache *)val; + cfs_atomic_inc(&cli->cl_cache->ccc_users); + cli->cl_lru_left = &cli->cl_cache->ccc_lru_left; + + /* add this osc into entity list */ + LASSERT(cfs_list_empty(&cli->cl_lru_osc)); + spin_lock(&cli->cl_cache->ccc_lru_lock); + 
cfs_list_add(&cli->cl_lru_osc, &cli->cl_cache->ccc_lru); + spin_unlock(&cli->cl_cache->ccc_lru_lock); + + RETURN(0); + } + + if (KEY_IS(KEY_CACHE_LRU_SHRINK)) { + struct client_obd *cli = &obd->u.cli; + int nr = cfs_atomic_read(&cli->cl_lru_in_list) >> 1; + int target = *(int *)val; + + nr = osc_lru_shrink(cli, min(nr, target)); + *(int *)val -= nr; + RETURN(0); + } + if (!set && !KEY_IS(KEY_GRANT_SHRINK)) RETURN(-EINVAL); @@ -4154,38 +3248,31 @@ static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, Even if something bad goes through, we'd get a -EINVAL from OST anyway. */ - if (KEY_IS(KEY_GRANT_SHRINK)) - req = ptlrpc_request_alloc(imp, &RQF_OST_SET_GRANT_INFO); - else - req = ptlrpc_request_alloc(imp, &RQF_OBD_SET_INFO); - - if (req == NULL) - RETURN(-ENOMEM); - - req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, - RCL_CLIENT, keylen); - req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL, - RCL_CLIENT, vallen); - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO); - if (rc) { - ptlrpc_request_free(req); - RETURN(rc); - } - - tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); - memcpy(tmp, key, keylen); - tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL); + req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ? + &RQF_OST_SET_GRANT_INFO : + &RQF_OBD_SET_INFO); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT, keylen); + if (!KEY_IS(KEY_GRANT_SHRINK)) + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT, vallen); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ? 
+ &RMF_OST_BODY : + &RMF_SETINFO_VAL); memcpy(tmp, val, vallen); - if (KEY_IS(KEY_MDS_CONN)) { - struct osc_creator *oscc = &obd->u.cli.cl_oscc; - - oscc->oscc_oa.o_seq = (*(__u32 *)val); - oscc->oscc_oa.o_valid |= OBD_MD_FLGROUP; - LASSERT_SEQ_IS_MDT(oscc->oscc_oa.o_seq); - req->rq_no_delay = req->rq_no_resend = 1; - req->rq_interpret_reply = osc_setinfo_mds_conn_interpret; - } else if (KEY_IS(KEY_GRANT_SHRINK)) { + if (KEY_IS(KEY_GRANT_SHRINK)) { struct osc_grant_args *aa; struct obdo *oa; @@ -4213,101 +3300,31 @@ static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, } -static struct llog_operations osc_size_repl_logops = { - lop_cancel: llog_obd_repl_cancel -}; - -static struct llog_operations osc_mds_ost_orig_logops; - -static int __osc_llog_init(struct obd_device *obd, struct obd_llog_group *olg, - struct obd_device *tgt, struct llog_catid *catid) -{ - int rc; - ENTRY; - - rc = llog_setup(obd, &obd->obd_olg, LLOG_MDS_OST_ORIG_CTXT, tgt, 1, - &catid->lci_logid, &osc_mds_ost_orig_logops); - if (rc) { - CERROR("failed LLOG_MDS_OST_ORIG_CTXT\n"); - GOTO(out, rc); - } - - rc = llog_setup(obd, &obd->obd_olg, LLOG_SIZE_REPL_CTXT, tgt, 1, - NULL, &osc_size_repl_logops); - if (rc) { - struct llog_ctxt *ctxt = - llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT); - if (ctxt) - llog_cleanup(ctxt); - CERROR("failed LLOG_SIZE_REPL_CTXT\n"); - } - GOTO(out, rc); -out: - if (rc) { - CERROR("osc '%s' tgt '%s' catid %p rc=%d\n", - obd->obd_name, tgt->obd_name, catid, rc); - CERROR("logid "LPX64":0x%x\n", - catid->lci_logid.lgl_oid, catid->lci_logid.lgl_ogen); - } - return rc; -} - static int osc_llog_init(struct obd_device *obd, struct obd_llog_group *olg, struct obd_device *disk_obd, int *index) { - struct llog_catid catid; - static char name[32] = CATLIST; - int rc; - ENTRY; - - LASSERT(olg == &obd->obd_olg); - - cfs_mutex_lock(&olg->olg_cat_processing); - rc = llog_get_cat_list(disk_obd, name, *index, 1, &catid); - if (rc) { - CERROR("rc: %d\n", rc); - 
GOTO(out, rc); - } - - CDEBUG(D_INFO, "%s: Init llog for %d - catid "LPX64"/"LPX64":%x\n", - obd->obd_name, *index, catid.lci_logid.lgl_oid, - catid.lci_logid.lgl_oseq, catid.lci_logid.lgl_ogen); - - rc = __osc_llog_init(obd, olg, disk_obd, &catid); - if (rc) { - CERROR("rc: %d\n", rc); - GOTO(out, rc); - } - - rc = llog_put_cat_list(disk_obd, name, *index, 1, &catid); - if (rc) { - CERROR("rc: %d\n", rc); - GOTO(out, rc); - } - - out: - cfs_mutex_unlock(&olg->olg_cat_processing); - - return rc; + /* this code is not supposed to be used with LOD/OSP + * to be removed soon */ + LBUG(); + return 0; } static int osc_llog_finish(struct obd_device *obd, int count) { - struct llog_ctxt *ctxt; - int rc = 0, rc2 = 0; - ENTRY; + struct llog_ctxt *ctxt; - ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT); - if (ctxt) - rc = llog_cleanup(ctxt); + ENTRY; - ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT); - if (ctxt) - rc2 = llog_cleanup(ctxt); - if (!rc) - rc = rc2; + ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT); + if (ctxt) { + llog_cat_close(NULL, ctxt->loc_handle); + llog_cleanup(NULL, ctxt); + } - RETURN(rc); + ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT); + if (ctxt) + llog_cleanup(NULL, ctxt); + RETURN(0); } static int osc_reconnect(const struct lu_env *env, @@ -4323,20 +3340,17 @@ static int osc_reconnect(const struct lu_env *env, client_obd_list_lock(&cli->cl_loi_list_lock); data->ocd_grant = (cli->cl_avail_grant + cli->cl_dirty) ?: - 2 * cli->cl_max_pages_per_rpc << CFS_PAGE_SHIFT; + 2 * cli_brw_size(obd); lost_grant = cli->cl_lost_grant; cli->cl_lost_grant = 0; client_obd_list_unlock(&cli->cl_loi_list_lock); - CDEBUG(D_CACHE, "request ocd_grant: %d cl_avail_grant: %ld " - "cl_dirty: %ld cl_lost_grant: %ld\n", data->ocd_grant, - cli->cl_avail_grant, cli->cl_dirty, lost_grant); CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d" - " ocd_grant: %d\n", data->ocd_connect_flags, - data->ocd_version, data->ocd_grant); - } + " ocd_grant: %d, 
lost: %ld.\n", data->ocd_connect_flags, + data->ocd_version, data->ocd_grant, lost_grant); + } - RETURN(0); + RETURN(0); } static int osc_disconnect(struct obd_export *exp) @@ -4350,7 +3364,7 @@ static int osc_disconnect(struct obd_export *exp) if (obd->u.cli.cl_conn_count == 1) { /* Flush any remaining cancel messages out to the * target */ - llog_sync(ctxt, exp); + llog_sync(ctxt, exp, 0); } llog_ctxt_put(ctxt); } else { @@ -4393,14 +3407,6 @@ static int osc_import_event(struct obd_device *obd, switch (event) { case IMP_EVENT_DISCON: { - /* Only do this on the MDS OSC's */ - if (imp->imp_server_timeout) { - struct osc_creator *oscc = &obd->u.cli.cl_oscc; - - cfs_spin_lock(&oscc->oscc_lock); - oscc->oscc_flags |= OSCC_FLAG_RECOVERING; - cfs_spin_unlock(&oscc->oscc_lock); - } cli = &obd->u.cli; client_obd_list_lock(&cli->cl_loi_list_lock); cli->cl_avail_grant = 0; @@ -4421,11 +3427,9 @@ static int osc_import_event(struct obd_device *obd, if (!IS_ERR(env)) { /* Reset grants */ cli = &obd->u.cli; - client_obd_list_lock(&cli->cl_loi_list_lock); /* all pages go to failing rpcs due to the invalid * import */ - osc_check_rpcs(env, cli); - client_obd_list_unlock(&cli->cl_loi_list_lock); + osc_io_unplug(env, cli, NULL, PDL_POLICY_ROUND); ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); cl_env_put(env, &refcheck); @@ -4434,15 +3438,6 @@ static int osc_import_event(struct obd_device *obd, break; } case IMP_EVENT_ACTIVE: { - /* Only do this on the MDS OSC's */ - if (imp->imp_server_timeout) { - struct osc_creator *oscc = &obd->u.cli.cl_oscc; - - cfs_spin_lock(&oscc->oscc_lock); - oscc->oscc_flags &= ~(OSCC_FLAG_NOSPC | - OSCC_FLAG_NOSPC_BLK); - cfs_spin_unlock(&oscc->oscc_lock); - } rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL); break; } @@ -4500,58 +3495,70 @@ static int osc_cancel_for_recovery(struct ldlm_lock *lock) RETURN(0); } -int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +static int brw_queue_work(const struct lu_env *env, void *data) { - 
struct client_obd *cli = &obd->u.cli; - int rc; - ENTRY; - - ENTRY; - rc = ptlrpcd_addref(); - if (rc) - RETURN(rc); - - rc = client_obd_setup(obd, lcfg); - if (rc == 0) { - void *handler; - handler = ptlrpcd_alloc_work(cli->cl_import, - brw_queue_work, cli); - if (!IS_ERR(handler)) - cli->cl_writeback_work = handler; - else - rc = PTR_ERR(handler); - } + struct client_obd *cli = data; - if (rc == 0) { - struct lprocfs_static_vars lvars = { 0 }; + CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli); - cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL; - lprocfs_osc_init_vars(&lvars); - if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0) { - lproc_osc_attach_seqstat(obd); - sptlrpc_lprocfs_cliobd_attach(obd); - ptlrpc_lprocfs_register_obd(obd); - } - - oscc_init(obd); - /* We need to allocate a few requests more, because - brw_interpret tries to create new requests before freeing - previous ones. Ideally we want to have 2x max_rpcs_in_flight - reserved, but I afraid that might be too much wasted RAM - in fact, so 2 is just my guess and still should work. 
*/ - cli->cl_import->imp_rq_pool = - ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2, - OST_MAXREQSIZE, - ptlrpc_add_rqs_to_pool); - - CFS_INIT_LIST_HEAD(&cli->cl_grant_shrink_list); - - ns_register_cancel(obd->obd_namespace, osc_cancel_for_recovery); - } + osc_io_unplug(env, cli, NULL, PDL_POLICY_SAME); + RETURN(0); +} - if (rc) - ptlrpcd_decref(); - RETURN(rc); +int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lprocfs_static_vars lvars = { 0 }; + struct client_obd *cli = &obd->u.cli; + void *handler; + int rc; + ENTRY; + + rc = ptlrpcd_addref(); + if (rc) + RETURN(rc); + + rc = client_obd_setup(obd, lcfg); + if (rc) + GOTO(out_ptlrpcd, rc); + + handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli); + if (IS_ERR(handler)) + GOTO(out_client_setup, rc = PTR_ERR(handler)); + cli->cl_writeback_work = handler; + + rc = osc_quota_setup(obd); + if (rc) + GOTO(out_ptlrpcd_work, rc); + + cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL; + lprocfs_osc_init_vars(&lvars); + if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0) { + lproc_osc_attach_seqstat(obd); + sptlrpc_lprocfs_cliobd_attach(obd); + ptlrpc_lprocfs_register_obd(obd); + } + + /* We need to allocate a few requests more, because + * brw_interpret tries to create new requests before freeing + * previous ones, Ideally we want to have 2x max_rpcs_in_flight + * reserved, but I'm afraid that might be too much wasted RAM + * in fact, so 2 is just my guess and still should work. 
*/ + cli->cl_import->imp_rq_pool = + ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2, + OST_MAXREQSIZE, + ptlrpc_add_rqs_to_pool); + + CFS_INIT_LIST_HEAD(&cli->cl_grant_shrink_list); + ns_register_cancel(obd->obd_namespace, osc_cancel_for_recovery); + RETURN(rc); + +out_ptlrpcd_work: + ptlrpcd_destroy_work(handler); +out_client_setup: + client_obd_cleanup(obd); +out_ptlrpcd: + ptlrpcd_decref(); + RETURN(rc); } static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) @@ -4566,9 +3573,9 @@ static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) CDEBUG(D_HA, "Deactivating import %s\n", obd->obd_name); /* ptlrpc_abort_inflight to stop an mds_lov_synchronize */ ptlrpc_deactivate_import(imp); - cfs_spin_lock(&imp->imp_lock); - imp->imp_pingable = 0; - cfs_spin_unlock(&imp->imp_lock); + spin_lock(&imp->imp_lock); + imp->imp_pingable = 0; + spin_unlock(&imp->imp_lock); break; } case OBD_CLEANUP_EXPORTS: { @@ -4601,9 +3608,21 @@ static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) int osc_cleanup(struct obd_device *obd) { - int rc; + struct client_obd *cli = &obd->u.cli; + int rc; - ENTRY; + ENTRY; + + /* lru cleanup */ + if (cli->cl_cache != NULL) { + LASSERT(cfs_atomic_read(&cli->cl_cache->ccc_users) > 0); + spin_lock(&cli->cl_cache->ccc_lru_lock); + cfs_list_del_init(&cli->cl_lru_osc); + spin_unlock(&cli->cl_cache->ccc_lru_lock); + cli->cl_lru_left = NULL; + cfs_atomic_dec(&cli->cl_cache->ccc_users); + cli->cl_cache = NULL; + } /* free memory of osc quota cache */ osc_quota_cleanup(obd); @@ -4652,9 +3671,7 @@ struct obd_ops osc_obd_ops = { .o_statfs_async = osc_statfs_async, .o_packmd = osc_packmd, .o_unpackmd = osc_unpackmd, - .o_precreate = osc_precreate, .o_create = osc_create, - .o_create_async = osc_create_async, .o_destroy = osc_destroy, .o_getattr = osc_getattr, .o_getattr_async = osc_getattr_async, @@ -4677,12 +3694,11 @@ struct obd_ops osc_obd_ops = { .o_process_config = 
osc_process_config, .o_quotactl = osc_quotactl, .o_quotacheck = osc_quotacheck, - .o_quota_adjust_qunit = osc_quota_adjust_qunit, }; extern struct lu_kmem_descr osc_caches[]; -extern cfs_spinlock_t osc_ast_guard; -extern cfs_lock_class_key_t osc_ast_guard_class; +extern spinlock_t osc_ast_guard; +extern struct lock_class_key osc_ast_guard_class; int __init osc_init(void) { @@ -4696,10 +3712,11 @@ int __init osc_init(void) CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches); rc = lu_kmem_init(osc_caches); + if (rc) + RETURN(rc); lprocfs_osc_init_vars(&lvars); - osc_quota_init(); rc = class_register_type(&osc_obd_ops, NULL, lvars.module_vars, LUSTRE_OSC_NAME, &osc_device_type); if (rc) { @@ -4707,26 +3724,17 @@ int __init osc_init(void) RETURN(rc); } - cfs_spin_lock_init(&osc_ast_guard); - cfs_lockdep_set_class(&osc_ast_guard, &osc_ast_guard_class); - - osc_mds_ost_orig_logops = llog_lvfs_ops; - osc_mds_ost_orig_logops.lop_setup = llog_obd_origin_setup; - osc_mds_ost_orig_logops.lop_cleanup = llog_obd_origin_cleanup; - osc_mds_ost_orig_logops.lop_add = llog_obd_origin_add; - osc_mds_ost_orig_logops.lop_connect = llog_origin_connect; + spin_lock_init(&osc_ast_guard); + lockdep_set_class(&osc_ast_guard, &osc_ast_guard_class); - RETURN(rc); + RETURN(rc); } #ifdef __KERNEL__ static void /*__exit*/ osc_exit(void) { - lu_device_type_fini(&osc_device_type); - - osc_quota_exit(); - class_unregister_type(LUSTRE_OSC_NAME); - lu_kmem_fini(osc_caches); + class_unregister_type(LUSTRE_OSC_NAME); + lu_kmem_fini(osc_caches); } MODULE_AUTHOR("Sun Microsystems, Inc. ");