X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fosc%2Fosc_request.c;h=93567bc54bf439f04f25a77c772f2352a8144331;hp=f79314f624b2407369b875793706cef86d636c22;hb=d6f104f056a5ec6e82e19f12f6faefa0d3ca10a9;hpb=0204171fd3e1b393c53bd374aff228e80080a55a diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index f79314f..93567bc 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -1,6 +1,4 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * +/* * GPL HEADER START * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -28,18 +26,14 @@ /* * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. - */ -/* - * Copyright (c) 2011 Whamcloud, Inc. + * + * Copyright (c) 2011, 2013, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. */ -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif #define DEBUG_SUBSYSTEM S_OSC #include @@ -52,8 +46,6 @@ #include #include #include -#include -#include #ifdef __CYGWIN__ # include @@ -61,111 +53,121 @@ #include #include +#include #include #include #include +#include #include "osc_internal.h" +#include "osc_cl_internal.h" + +struct osc_brw_async_args { + struct obdo *aa_oa; + int aa_requested_nob; + int aa_nio_count; + obd_count aa_page_count; + int aa_resends; + struct brw_page **aa_ppga; + struct client_obd *aa_cli; + struct list_head aa_oaps; + struct list_head aa_exts; + struct obd_capa *aa_ocapa; + struct cl_req *aa_clerq; +}; -static void osc_release_ppga(struct brw_page **ppga, obd_count count); -static int brw_interpret(const struct lu_env *env, - struct ptlrpc_request *req, void *data, int rc); -static void osc_check_rpcs0(const struct lu_env *env, struct client_obd *cli, - int ptlrpc); -int osc_cleanup(struct obd_device *obd); - -/* Pack OSC object metadata for disk storage (LE byte order). */ -static int osc_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, - struct lov_stripe_md *lsm) -{ - int lmm_size; - ENTRY; +#define osc_grant_args osc_brw_async_args - lmm_size = sizeof(**lmmp); - if (!lmmp) - RETURN(lmm_size); +struct osc_async_args { + struct obd_info *aa_oi; +}; - if (*lmmp && !lsm) { - OBD_FREE(*lmmp, lmm_size); - *lmmp = NULL; - RETURN(0); - } +struct osc_setattr_args { + struct obdo *sa_oa; + obd_enqueue_update_f sa_upcall; + void *sa_cookie; +}; - if (!*lmmp) { - OBD_ALLOC(*lmmp, lmm_size); - if (!*lmmp) - RETURN(-ENOMEM); - } +struct osc_fsync_args { + struct obd_info *fa_oi; + obd_enqueue_update_f fa_upcall; + void *fa_cookie; +}; - if (lsm) { - LASSERT(lsm->lsm_object_id); - LASSERT_SEQ_IS_MDT(lsm->lsm_object_seq); - (*lmmp)->lmm_object_id = cpu_to_le64(lsm->lsm_object_id); - (*lmmp)->lmm_object_seq = cpu_to_le64(lsm->lsm_object_seq); - } +struct osc_enqueue_args { + struct obd_export *oa_exp; + __u64 *oa_flags; + obd_enqueue_update_f oa_upcall; + void *oa_cookie; + struct ost_lvb *oa_lvb; + struct lustre_handle *oa_lockh; + struct ldlm_enqueue_info *oa_ei; + unsigned int oa_agl:1; +}; - RETURN(lmm_size); -} +static void osc_release_ppga(struct brw_page **ppga, obd_count count); +static int brw_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *data, int rc); /* Unpack OSC object metadata from disk storage (LE byte order). 
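+ *
+ * Passing a NULL lsmp only probes the required allocation size: the
+ * function validates lmm (if given) and returns lov_stripe_md_size(1).
+ * A minimal caller sketch, via the generic obd_unpackmd() wrapper:
+ *
+ *	lsm_size = obd_unpackmd(exp, NULL, lmm, lmm_bytes);
+ *	if (lsm_size < 0)
+ *		RETURN(lsm_size);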
*/ static int osc_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, - struct lov_mds_md *lmm, int lmm_bytes) -{ - int lsm_size; - struct obd_import *imp = class_exp2cliimp(exp); - ENTRY; - - if (lmm != NULL) { - if (lmm_bytes < sizeof (*lmm)) { - CERROR("lov_mds_md too small: %d, need %d\n", - lmm_bytes, (int)sizeof(*lmm)); - RETURN(-EINVAL); - } - /* XXX LOV_MAGIC etc check? */ - - if (lmm->lmm_object_id == 0) { - CERROR("lov_mds_md: zero lmm_object_id\n"); - RETURN(-EINVAL); - } - } - - lsm_size = lov_stripe_md_size(1); - if (lsmp == NULL) - RETURN(lsm_size); - - if (*lsmp != NULL && lmm == NULL) { - OBD_FREE((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo)); - OBD_FREE(*lsmp, lsm_size); - *lsmp = NULL; - RETURN(0); - } - - if (*lsmp == NULL) { - OBD_ALLOC(*lsmp, lsm_size); - if (*lsmp == NULL) - RETURN(-ENOMEM); - OBD_ALLOC((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo)); - if ((*lsmp)->lsm_oinfo[0] == NULL) { - OBD_FREE(*lsmp, lsm_size); - RETURN(-ENOMEM); - } - loi_init((*lsmp)->lsm_oinfo[0]); - } - - if (lmm != NULL) { - /* XXX zero *lsmp? */ - (*lsmp)->lsm_object_id = le64_to_cpu (lmm->lmm_object_id); - (*lsmp)->lsm_object_seq = le64_to_cpu (lmm->lmm_object_seq); - LASSERT((*lsmp)->lsm_object_id); - LASSERT_SEQ_IS_MDT((*lsmp)->lsm_object_seq); - } - - if (imp != NULL && - (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES)) - (*lsmp)->lsm_maxbytes = imp->imp_connect_data.ocd_maxbytes; - else - (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES; - - RETURN(lsm_size); + struct lov_mds_md *lmm, int lmm_bytes) +{ + int lsm_size; + struct obd_import *imp = class_exp2cliimp(exp); + ENTRY; + + if (lmm != NULL) { + if (lmm_bytes < sizeof(*lmm)) { + CERROR("%s: lov_mds_md too small: %d, need %d\n", + exp->exp_obd->obd_name, lmm_bytes, + (int)sizeof(*lmm)); + RETURN(-EINVAL); + } + /* XXX LOV_MAGIC etc check? */ + + if (unlikely(ostid_id(&lmm->lmm_oi) == 0)) { + CERROR("%s: zero lmm_object_id: rc = %d\n", + exp->exp_obd->obd_name, -EINVAL); + RETURN(-EINVAL); + } + } + + lsm_size = lov_stripe_md_size(1); + if (lsmp == NULL) + RETURN(lsm_size); + + if (*lsmp != NULL && lmm == NULL) { + OBD_FREE((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo)); + OBD_FREE(*lsmp, lsm_size); + *lsmp = NULL; + RETURN(0); + } + + if (*lsmp == NULL) { + OBD_ALLOC(*lsmp, lsm_size); + if (unlikely(*lsmp == NULL)) + RETURN(-ENOMEM); + OBD_ALLOC((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo)); + if (unlikely((*lsmp)->lsm_oinfo[0] == NULL)) { + OBD_FREE(*lsmp, lsm_size); + RETURN(-ENOMEM); + } + loi_init((*lsmp)->lsm_oinfo[0]); + } else if (unlikely(ostid_id(&(*lsmp)->lsm_oi) == 0)) { + RETURN(-EBADF); + } + + if (lmm != NULL) + /* XXX zero *lsmp? 
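+	 * ostid_le_to_cpu() byte-swaps both halves of lmm_oi at once,
+	 * replacing the old explicit pair:
+	 *	(*lsmp)->lsm_object_id  = le64_to_cpu(lmm->lmm_object_id);
+	 *	(*lsmp)->lsm_object_seq = le64_to_cpu(lmm->lmm_object_seq);
+	 * while the zero-id validation moved to the top of the function.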
*/ + ostid_le_to_cpu(&lmm->lmm_oi, &(*lsmp)->lsm_oi); + + if (imp != NULL && + (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES)) + (*lsmp)->lsm_maxbytes = imp->imp_connect_data.ocd_maxbytes; + else + (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES; + + RETURN(lsm_size); } static inline void osc_pack_capa(struct ptlrpc_request *req, @@ -187,13 +189,14 @@ static inline void osc_pack_capa(struct ptlrpc_request *req, static inline void osc_pack_req_body(struct ptlrpc_request *req, struct obd_info *oinfo) { - struct ost_body *body; + struct ost_body *body; - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); - lustre_set_wire_obdo(&body->oa, oinfo->oi_oa); - osc_pack_capa(req, body, oinfo->oi_capa); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, + oinfo->oi_oa); + osc_pack_capa(req, body, oinfo->oi_capa); } static inline void osc_set_capa_size(struct ptlrpc_request *req, @@ -219,12 +222,13 @@ static int osc_getattr_interpret(const struct lu_env *env, body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); if (body) { - CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); - lustre_get_wire_obdo(aa->aa_oi->oi_oa, &body->oa); + CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, + aa->aa_oi->oi_oa, &body->oa); - /* This should really be sent by the OST */ - aa->aa_oi->oi_oa->o_blksize = PTLRPC_MAX_BRW_SIZE; - aa->aa_oi->oi_oa->o_valid |= OBD_MD_FLBLKSZ; + /* This should really be sent by the OST */ + aa->aa_oi->oi_oa->o_blksize = DT_MAX_BRW_SIZE; + aa->aa_oi->oi_oa->o_valid |= OBD_MD_FLBLKSZ; } else { CDEBUG(D_INFO, "can't unpack ost_body\n"); rc = -EPROTO; @@ -267,7 +271,8 @@ static int osc_getattr_async(struct obd_export *exp, struct obd_info *oinfo, RETURN(0); } -static int osc_getattr(struct obd_export *exp, struct obd_info *oinfo) +static int osc_getattr(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo) { struct ptlrpc_request *req; struct ost_body *body; @@ -297,12 +302,12 @@ static int osc_getattr(struct obd_export *exp, struct obd_info *oinfo) if (body == NULL) GOTO(out, rc = -EPROTO); - CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); - lustre_get_wire_obdo(oinfo->oi_oa, &body->oa); + CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oinfo->oi_oa, + &body->oa); - /* This should really be sent by the OST */ - oinfo->oi_oa->o_blksize = PTLRPC_MAX_BRW_SIZE; - oinfo->oi_oa->o_valid |= OBD_MD_FLBLKSZ; + oinfo->oi_oa->o_blksize = cli_brw_size(exp->exp_obd); + oinfo->oi_oa->o_valid |= OBD_MD_FLBLKSZ; EXIT; out: @@ -310,8 +315,8 @@ static int osc_getattr(struct obd_export *exp, struct obd_info *oinfo) return rc; } -static int osc_setattr(struct obd_export *exp, struct obd_info *oinfo, - struct obd_trans_info *oti) +static int osc_setattr(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, struct obd_trans_info *oti) { struct ptlrpc_request *req; struct ost_body *body; @@ -343,7 +348,8 @@ static int osc_setattr(struct obd_export *exp, struct obd_info *oinfo, if (body == NULL) GOTO(out, rc = -EPROTO); - lustre_get_wire_obdo(oinfo->oi_oa, &body->oa); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oinfo->oi_oa, + &body->oa); EXIT; out: @@ -365,7 +371,8 @@ static int osc_setattr_interpret(const struct lu_env *env, if (body == NULL) GOTO(out, rc = -EPROTO); - lustre_get_wire_obdo(sa->sa_oa, 
&body->oa); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, sa->sa_oa, + &body->oa); out: rc = sa->sa_upcall(sa->sa_cookie, rc); RETURN(rc); @@ -461,7 +468,8 @@ int osc_real_create(struct obd_export *exp, struct obdo *oa, body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); LASSERT(body); - lustre_set_wire_obdo(&body->oa, oa); + + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); ptlrpc_request_set_replen(req); @@ -481,26 +489,24 @@ int osc_real_create(struct obd_export *exp, struct obdo *oa, if (body == NULL) GOTO(out_req, rc = -EPROTO); - lustre_get_wire_obdo(oa, &body->oa); + CDEBUG(D_INFO, "oa flags %x\n", oa->o_flags); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa); - /* This should really be sent by the OST */ - oa->o_blksize = PTLRPC_MAX_BRW_SIZE; - oa->o_valid |= OBD_MD_FLBLKSZ; + oa->o_blksize = cli_brw_size(exp->exp_obd); + oa->o_valid |= OBD_MD_FLBLKSZ; - /* XXX LOV STACKING: the lsm that is passed to us from LOV does not - * have valid lsm_oinfo data structs, so don't go touching that. - * This needs to be fixed in a big way. - */ - lsm->lsm_object_id = oa->o_id; - lsm->lsm_object_seq = oa->o_seq; - *ea = lsm; + /* XXX LOV STACKING: the lsm that is passed to us from LOV does not + * have valid lsm_oinfo data structs, so don't go touching that. + * This needs to be fixed in a big way. + */ + lsm->lsm_oi = oa->o_oi; + *ea = lsm; if (oti != NULL) { - oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg); - if (oa->o_valid & OBD_MD_FLCOOKIE) { - if (!oti->oti_logcookies) - oti_alloc_cookies(oti, 1); + if (oti->oti_logcookies == NULL) + oti->oti_logcookies = &oti->oti_onecookie; + *oti->oti_logcookies = oa->o_lcookie; } } @@ -538,14 +544,14 @@ int osc_punch_base(struct obd_export *exp, struct obd_info *oinfo, req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ ptlrpc_at_set_req_timeout(req); - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - lustre_set_wire_obdo(&body->oa, oinfo->oi_oa); - osc_pack_capa(req, body, oinfo->oi_capa); + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, + oinfo->oi_oa); + osc_pack_capa(req, body, oinfo->oi_capa); ptlrpc_request_set_replen(req); - req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_setattr_interpret; CLASSERT (sizeof(*sa) <= sizeof(req->rq_async_args)); sa = ptlrpc_req_async_args(req); @@ -560,22 +566,11 @@ int osc_punch_base(struct obd_export *exp, struct obd_info *oinfo, RETURN(0); } -static int osc_punch(struct obd_export *exp, struct obd_info *oinfo, - struct obd_trans_info *oti, - struct ptlrpc_request_set *rqset) -{ - oinfo->oi_oa->o_size = oinfo->oi_policy.l_extent.start; - oinfo->oi_oa->o_blocks = oinfo->oi_policy.l_extent.end; - oinfo->oi_oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; - return osc_punch_base(exp, oinfo, - oinfo->oi_cb_up, oinfo, rqset); -} - static int osc_sync_interpret(const struct lu_env *env, struct ptlrpc_request *req, void *arg, int rc) { - struct osc_async_args *aa = arg; + struct osc_fsync_args *fa = arg; struct ost_body *body; ENTRY; @@ -588,27 +583,22 @@ static int osc_sync_interpret(const struct lu_env *env, GOTO(out, rc = -EPROTO); } - *aa->aa_oi->oi_oa = body->oa; + *fa->fa_oi->oi_oa = body->oa; out: - rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc); - RETURN(rc); + rc = fa->fa_upcall(fa->fa_cookie, rc); + RETURN(rc); } -static int osc_sync(struct obd_export *exp, struct obd_info *oinfo, - obd_size start, 
obd_size end, - struct ptlrpc_request_set *set) +int osc_sync_base(struct obd_export *exp, struct obd_info *oinfo, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) { - struct ptlrpc_request *req; - struct ost_body *body; - struct osc_async_args *aa; + struct ptlrpc_request *req; + struct ost_body *body; + struct osc_fsync_args *fa; int rc; ENTRY; - if (!oinfo->oi_oa) { - CDEBUG(D_INFO, "oa NULL\n"); - RETURN(-EINVAL); - } - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC); if (req == NULL) RETURN(-ENOMEM); @@ -620,32 +610,36 @@ static int osc_sync(struct obd_export *exp, struct obd_info *oinfo, RETURN(rc); } - /* overload the size and blocks fields in the oa with start/end */ - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - lustre_set_wire_obdo(&body->oa, oinfo->oi_oa); - body->oa.o_size = start; - body->oa.o_blocks = end; - body->oa.o_valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); - osc_pack_capa(req, body, oinfo->oi_capa); + /* overload the size and blocks fields in the oa with start/end */ + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, + oinfo->oi_oa); + osc_pack_capa(req, body, oinfo->oi_capa); ptlrpc_request_set_replen(req); req->rq_interpret_reply = osc_sync_interpret; - CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - aa->aa_oi = oinfo; + CLASSERT(sizeof(*fa) <= sizeof(req->rq_async_args)); + fa = ptlrpc_req_async_args(req); + fa->fa_oi = oinfo; + fa->fa_upcall = upcall; + fa->fa_cookie = cookie; - ptlrpc_set_add_req(set, req); - RETURN (0); + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + else + ptlrpc_set_add_req(rqset, req); + + RETURN (0); } /* Find and cancel locally locks matched by @mode in the resource found by * @objid. Found locks are added into @cancel list. Returns the amount of * locks added to @cancels list. */ static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa, - cfs_list_t *cancels, - ldlm_mode_t mode, int lock_flags) + struct list_head *cancels, + ldlm_mode_t mode, __u64 lock_flags) { struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; struct ldlm_res_id res_id; @@ -653,10 +647,19 @@ static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa, int count; ENTRY; - osc_build_res_name(oa->o_id, oa->o_seq, &res_id); - res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); - if (res == NULL) - RETURN(0); + /* Return, i.e. cancel nothing, only if ELC is supported (flag in + * export) but disabled through procfs (flag in NS). + * + * This distinguishes from a case when ELC is not supported originally, + * when we still want to cancel locks in advance and just cancel them + * locally, without sending any RPC. 
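+	 *
+	 * In short, with both flags taken from the same connection:
+	 *	ELC supported and enabled   -> cancel below, pack into RPC
+	 *	ELC supported but disabled  -> return 0, cancel nothing
+	 *	ELC not supported           -> cancel below, locally only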
*/ + if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns)) + RETURN(0); + + ostid_build_res_name(&oa->o_oi, &res_id); + res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); + if (IS_ERR(res)) + RETURN(0); LDLM_RESOURCE_ADDREF(res); count = ldlm_cancel_resource_local(res, cancels, NULL, mode, @@ -667,32 +670,57 @@ static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa, } static int osc_destroy_interpret(const struct lu_env *env, - struct ptlrpc_request *req, void *data, - int rc) + struct ptlrpc_request *req, void *data, + int rc) { - struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - cfs_atomic_dec(&cli->cl_destroy_in_flight); - cfs_waitq_signal(&cli->cl_destroy_waitq); - return 0; + atomic_dec(&cli->cl_destroy_in_flight); + wake_up(&cli->cl_destroy_waitq); + return 0; } static int osc_can_send_destroy(struct client_obd *cli) { - if (cfs_atomic_inc_return(&cli->cl_destroy_in_flight) <= - cli->cl_max_rpcs_in_flight) { - /* The destroy request can be sent */ - return 1; - } - if (cfs_atomic_dec_return(&cli->cl_destroy_in_flight) < - cli->cl_max_rpcs_in_flight) { - /* - * The counter has been modified between the two atomic - * operations. - */ - cfs_waitq_signal(&cli->cl_destroy_waitq); - } - return 0; + if (atomic_inc_return(&cli->cl_destroy_in_flight) <= + cli->cl_max_rpcs_in_flight) { + /* The destroy request can be sent */ + return 1; + } + if (atomic_dec_return(&cli->cl_destroy_in_flight) < + cli->cl_max_rpcs_in_flight) { + /* + * The counter has been modified between the two atomic + * operations. + */ + wake_up(&cli->cl_destroy_waitq); + } + return 0; +} + +int osc_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md **ea, + struct obd_trans_info *oti) +{ + int rc = 0; + ENTRY; + + LASSERT(oa); + LASSERT(ea); + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + + if ((oa->o_valid & OBD_MD_FLFLAGS) && + oa->o_flags == OBD_FL_RECREATE_OBJS) { + RETURN(osc_real_create(exp, oa, ea, oti)); + } + + if (!fid_seq_is_mdt(ostid_seq(&oa->o_oi))) + RETURN(osc_real_create(exp, oa, ea, oti)); + + /* we should not get here anymore */ + LBUG(); + + RETURN(rc); } /* Destroy requests can be async always on the client, and we don't even really @@ -705,14 +733,15 @@ static int osc_can_send_destroy(struct client_obd *cli) * the records are not cancelled, and when the OST reconnects to the MDS next, * it will retrieve the llog unlink logs and then sends the log cancellation * cookies to the MDS after committing destroy transactions. 
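+ *
+ * Client-side destroys are additionally throttled to
+ * cl_max_rpcs_in_flight via the cl_destroy_in_flight counter (see
+ * osc_can_send_destroy() above); a sender that loses the race parks on
+ * cl_destroy_waitq, roughly:
+ *
+ *	struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+ *	l_wait_event(cli->cl_destroy_waitq,
+ *		     osc_can_send_destroy(cli), &lwi);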
*/ -static int osc_destroy(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *ea, struct obd_trans_info *oti, - struct obd_export *md_export, void *capa) +static int osc_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md *ea, + struct obd_trans_info *oti, struct obd_export *md_export, + void *capa) { struct client_obd *cli = &exp->exp_obd->u.cli; struct ptlrpc_request *req; struct ost_body *body; - CFS_LIST_HEAD(cancels); + struct list_head cancels = LIST_HEAD_INIT(cancels); int rc, count; ENTRY; @@ -741,17 +770,20 @@ static int osc_destroy(struct obd_export *exp, struct obdo *oa, req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ ptlrpc_at_set_req_timeout(req); - if (oti != NULL && oa->o_valid & OBD_MD_FLCOOKIE) - oa->o_lcookie = *oti->oti_logcookies; - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - lustre_set_wire_obdo(&body->oa, oa); + if (oti != NULL && oa->o_valid & OBD_MD_FLCOOKIE) + oa->o_lcookie = *oti->oti_logcookies; + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); osc_pack_capa(req, body, (struct obd_capa *)capa); ptlrpc_request_set_replen(req); - /* don't throttle destroy RPCs for the MDT */ - if (!(cli->cl_import->imp_connect_flags_orig & OBD_CONNECT_MDS)) { + /* If osc_destory is for destroying the unlink orphan, + * sent from MDT to OST, which should not be blocked here, + * because the process might be triggered by ptlrpcd, and + * it is not good to block ptlrpcd thread (b=16006)*/ + if (!(oa->o_flags & OBD_FL_DELORPHAN)) { req->rq_interpret_reply = osc_destroy_interpret; if (!osc_can_send_destroy(cli)) { struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, @@ -778,34 +810,40 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, LASSERT(!(oa->o_valid & bits)); - oa->o_valid |= bits; - client_obd_list_lock(&cli->cl_loi_list_lock); - oa->o_dirty = cli->cl_dirty; - if (cli->cl_dirty - cli->cl_dirty_transit > cli->cl_dirty_max) { - CERROR("dirty %lu - %lu > dirty_max %lu\n", - cli->cl_dirty, cli->cl_dirty_transit, cli->cl_dirty_max); - oa->o_undirty = 0; - } else if (cfs_atomic_read(&obd_dirty_pages) - - cfs_atomic_read(&obd_dirty_transit_pages) > - obd_max_dirty_pages + 1){ - /* The cfs_atomic_read() allowing the cfs_atomic_inc() are - * not covered by a lock thus they may safely race and trip - * this CERROR() unless we add in a small fudge factor (+1). 
*/ - CERROR("dirty %d - %d > system dirty_max %d\n", - cfs_atomic_read(&obd_dirty_pages), - cfs_atomic_read(&obd_dirty_transit_pages), - obd_max_dirty_pages); - oa->o_undirty = 0; - } else if (cli->cl_dirty_max - cli->cl_dirty > 0x7fffffff) { - CERROR("dirty %lu - dirty_max %lu too big???\n", - cli->cl_dirty, cli->cl_dirty_max); - oa->o_undirty = 0; - } else { - long max_in_flight = (cli->cl_max_pages_per_rpc << CFS_PAGE_SHIFT)* - (cli->cl_max_rpcs_in_flight + 1); - oa->o_undirty = max(cli->cl_dirty_max, max_in_flight); - } - oa->o_grant = cli->cl_avail_grant; + oa->o_valid |= bits; + client_obd_list_lock(&cli->cl_loi_list_lock); + oa->o_dirty = cli->cl_dirty_pages << PAGE_CACHE_SHIFT; + if (unlikely(cli->cl_dirty_pages - cli->cl_dirty_transit > + cli->cl_dirty_max_pages)) { + CERROR("dirty %lu - %lu > dirty_max %lu\n", + cli->cl_dirty_pages, cli->cl_dirty_transit, + cli->cl_dirty_max_pages); + oa->o_undirty = 0; + } else if (unlikely(atomic_read(&obd_dirty_pages) - + atomic_read(&obd_dirty_transit_pages) > + (long)(obd_max_dirty_pages + 1))) { + /* The atomic_read() allowing the atomic_inc() are + * not covered by a lock thus they may safely race and trip + * this CERROR() unless we add in a small fudge factor (+1). */ + CERROR("%s: dirty %d - %d > system dirty_max %d\n", + cli->cl_import->imp_obd->obd_name, + atomic_read(&obd_dirty_pages), + atomic_read(&obd_dirty_transit_pages), + obd_max_dirty_pages); + oa->o_undirty = 0; + } else if (unlikely(cli->cl_dirty_max_pages - cli->cl_dirty_pages > + 0x7fffffff)) { + CERROR("dirty %lu - dirty_max %lu too big???\n", + cli->cl_dirty_pages, cli->cl_dirty_max_pages); + oa->o_undirty = 0; + } else { + long max_in_flight = (cli->cl_max_pages_per_rpc << + PAGE_CACHE_SHIFT) * + (cli->cl_max_rpcs_in_flight + 1); + oa->o_undirty = max(cli->cl_dirty_max_pages << PAGE_CACHE_SHIFT, + max_in_flight); + } + oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant; oa->o_dropped = cli->cl_lost_grant; cli->cl_lost_grant = 0; client_obd_list_unlock(&cli->cl_loi_list_lock); @@ -814,7 +852,7 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, } -static void osc_update_next_shrink(struct client_obd *cli) +void osc_update_next_shrink(struct client_obd *cli) { cli->cl_next_shrink_grant = cfs_time_shift(cli->cl_grant_shrink_interval); @@ -822,127 +860,6 @@ static void osc_update_next_shrink(struct client_obd *cli) cli->cl_next_shrink_grant); } -/* caller must hold loi_list_lock */ -static void osc_consume_write_grant(struct client_obd *cli, - struct brw_page *pga) -{ - LASSERT_SPIN_LOCKED(&cli->cl_loi_list_lock.lock); - LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT)); - cfs_atomic_inc(&obd_dirty_pages); - cli->cl_dirty += CFS_PAGE_SIZE; - cli->cl_avail_grant -= CFS_PAGE_SIZE; - pga->flag |= OBD_BRW_FROM_GRANT; - CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n", - CFS_PAGE_SIZE, pga, pga->pg); - LASSERT(cli->cl_avail_grant >= 0); - osc_update_next_shrink(cli); -} - -/* the companion to osc_consume_write_grant, called when a brw has completed. - * must be called with the loi lock held. */ -static void osc_release_write_grant(struct client_obd *cli, - struct brw_page *pga, int sent) -{ - int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ? 
: 4096; - ENTRY; - - LASSERT_SPIN_LOCKED(&cli->cl_loi_list_lock.lock); - if (!(pga->flag & OBD_BRW_FROM_GRANT)) { - EXIT; - return; - } - - pga->flag &= ~OBD_BRW_FROM_GRANT; - cfs_atomic_dec(&obd_dirty_pages); - cli->cl_dirty -= CFS_PAGE_SIZE; - if (pga->flag & OBD_BRW_NOCACHE) { - pga->flag &= ~OBD_BRW_NOCACHE; - cfs_atomic_dec(&obd_dirty_transit_pages); - cli->cl_dirty_transit -= CFS_PAGE_SIZE; - } - if (!sent) { - /* Reclaim grant from truncated pages. This is used to solve - * write-truncate and grant all gone(to lost_grant) problem. - * For a vfs write this problem can be easily solved by a sync - * write, however, this is not an option for page_mkwrite() - * because grant has to be allocated before a page becomes - * dirty. */ - if (cli->cl_avail_grant < PTLRPC_MAX_BRW_SIZE) - cli->cl_avail_grant += CFS_PAGE_SIZE; - else - cli->cl_lost_grant += CFS_PAGE_SIZE; - CDEBUG(D_CACHE, "lost grant: %lu avail grant: %lu dirty: %lu\n", - cli->cl_lost_grant, cli->cl_avail_grant, cli->cl_dirty); - } else if (CFS_PAGE_SIZE != blocksize && pga->count != CFS_PAGE_SIZE) { - /* For short writes we shouldn't count parts of pages that - * span a whole block on the OST side, or our accounting goes - * wrong. Should match the code in filter_grant_check. */ - int offset = pga->off & ~CFS_PAGE_MASK; - int count = pga->count + (offset & (blocksize - 1)); - int end = (offset + pga->count) & (blocksize - 1); - if (end) - count += blocksize - end; - - cli->cl_lost_grant += CFS_PAGE_SIZE - count; - CDEBUG(D_CACHE, "lost %lu grant: %lu avail: %lu dirty: %lu\n", - CFS_PAGE_SIZE - count, cli->cl_lost_grant, - cli->cl_avail_grant, cli->cl_dirty); - } - - EXIT; -} - -static unsigned long rpcs_in_flight(struct client_obd *cli) -{ - return cli->cl_r_in_flight + cli->cl_w_in_flight; -} - -/* caller must hold loi_list_lock */ -void osc_wake_cache_waiters(struct client_obd *cli) -{ - cfs_list_t *l, *tmp; - struct osc_cache_waiter *ocw; - - ENTRY; - cfs_list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { - /* if we can't dirty more, we must wait until some is written */ - if ((cli->cl_dirty + CFS_PAGE_SIZE > cli->cl_dirty_max) || - (cfs_atomic_read(&obd_dirty_pages) + 1 > - obd_max_dirty_pages)) { - CDEBUG(D_CACHE, "no dirty room: dirty: %ld " - "osc max %ld, sys max %d\n", cli->cl_dirty, - cli->cl_dirty_max, obd_max_dirty_pages); - return; - } - - /* if still dirty cache but no grant wait for pending RPCs that - * may yet return us some grant before doing sync writes */ - if (cli->cl_w_in_flight && cli->cl_avail_grant < CFS_PAGE_SIZE) { - CDEBUG(D_CACHE, "%u BRW writes in flight, no grant\n", - cli->cl_w_in_flight); - return; - } - - ocw = cfs_list_entry(l, struct osc_cache_waiter, ocw_entry); - cfs_list_del_init(&ocw->ocw_entry); - if (cli->cl_avail_grant < CFS_PAGE_SIZE) { - /* no more RPCs in flight to return grant, do sync IO */ - ocw->ocw_rc = -EDQUOT; - CDEBUG(D_INODE, "wake oap %p for sync\n", ocw->ocw_oap); - } else { - osc_consume_write_grant(cli, - &ocw->ocw_oap->oap_brw_page); - } - - CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld\n", - ocw, ocw->ocw_oap, cli->cl_avail_grant); - - cfs_waitq_signal(&ocw->ocw_waitq); - } - - EXIT; -} - static void __osc_update_grant(struct client_obd *cli, obd_size grant) { client_obd_list_lock(&cli->cl_loi_list_lock); @@ -958,9 +875,9 @@ static void osc_update_grant(struct client_obd *cli, struct ost_body *body) } } -static int osc_set_info_async(struct obd_export *exp, obd_count keylen, - void *key, obd_count vallen, void *val, - struct ptlrpc_request_set *set); 
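+/* Like the other OBD methods in this patch, osc_set_info_async() gains
+ * a struct lu_env parameter so the caller's environment reaches the
+ * request layer; NULL remains acceptable, see the
+ * osc_shrink_grant_to_target() call site below which passes NULL. */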
+static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, + obd_count keylen, void *key, obd_count vallen, + void *val, struct ptlrpc_request_set *set); static int osc_shrink_grant_interpret(const struct lu_env *env, struct ptlrpc_request *req, @@ -1003,45 +920,45 @@ static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa) * needed, and avoids shrinking the grant piecemeal. */ static int osc_shrink_grant(struct client_obd *cli) { - long target = (cli->cl_max_rpcs_in_flight + 1) * - cli->cl_max_pages_per_rpc; + __u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) * + (cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT); - client_obd_list_lock(&cli->cl_loi_list_lock); - if (cli->cl_avail_grant <= target) - target = cli->cl_max_pages_per_rpc; - client_obd_list_unlock(&cli->cl_loi_list_lock); + client_obd_list_lock(&cli->cl_loi_list_lock); + if (cli->cl_avail_grant <= target_bytes) + target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT; + client_obd_list_unlock(&cli->cl_loi_list_lock); - return osc_shrink_grant_to_target(cli, target); + return osc_shrink_grant_to_target(cli, target_bytes); } -int osc_shrink_grant_to_target(struct client_obd *cli, long target) +int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes) { - int rc = 0; - struct ost_body *body; - ENTRY; + int rc = 0; + struct ost_body *body; + ENTRY; - client_obd_list_lock(&cli->cl_loi_list_lock); - /* Don't shrink if we are already above or below the desired limit - * We don't want to shrink below a single RPC, as that will negatively - * impact block allocation and long-term performance. */ - if (target < cli->cl_max_pages_per_rpc) - target = cli->cl_max_pages_per_rpc; + client_obd_list_lock(&cli->cl_loi_list_lock); + /* Don't shrink if we are already above or below the desired limit + * We don't want to shrink below a single RPC, as that will negatively + * impact block allocation and long-term performance. 
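+	 *
+	 * Note target_bytes is now expressed in bytes, not pages; the
+	 * floor enforced below is one full RPC worth of grant:
+	 *	floor = cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;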
*/ + if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT) + target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT; - if (target >= cli->cl_avail_grant) { - client_obd_list_unlock(&cli->cl_loi_list_lock); - RETURN(0); - } - client_obd_list_unlock(&cli->cl_loi_list_lock); + if (target_bytes >= cli->cl_avail_grant) { + client_obd_list_unlock(&cli->cl_loi_list_lock); + RETURN(0); + } + client_obd_list_unlock(&cli->cl_loi_list_lock); - OBD_ALLOC_PTR(body); - if (!body) - RETURN(-ENOMEM); + OBD_ALLOC_PTR(body); + if (!body) + RETURN(-ENOMEM); - osc_announce_cached(cli, &body->oa, 0); + osc_announce_cached(cli, &body->oa, 0); - client_obd_list_lock(&cli->cl_loi_list_lock); - body->oa.o_grant = cli->cl_avail_grant - target; - cli->cl_avail_grant = target; + client_obd_list_lock(&cli->cl_loi_list_lock); + body->oa.o_grant = cli->cl_avail_grant - target_bytes; + cli->cl_avail_grant = target_bytes; client_obd_list_unlock(&cli->cl_loi_list_lock); if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) { body->oa.o_valid |= OBD_MD_FLFLAGS; @@ -1050,7 +967,7 @@ int osc_shrink_grant_to_target(struct client_obd *cli, long target) body->oa.o_flags |= OBD_FL_SHRINK_GRANT; osc_update_next_shrink(cli); - rc = osc_set_info_async(cli->cl_import->imp_obd->obd_self_export, + rc = osc_set_info_async(NULL, cli->cl_import->imp_obd->obd_self_export, sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK, sizeof(*body), body, NULL); if (rc != 0) @@ -1059,7 +976,6 @@ int osc_shrink_grant_to_target(struct client_obd *cli, long target) RETURN(rc); } -#define GRANT_SHRINK_LIMIT PTLRPC_MAX_BRW_SIZE static int osc_should_shrink_grant(struct client_obd *client) { cfs_time_t time = cfs_time_current(); @@ -1069,26 +985,30 @@ static int osc_should_shrink_grant(struct client_obd *client) OBD_CONNECT_GRANT_SHRINK) == 0) return 0; - if (cfs_time_aftereq(time, next_shrink - 5 * CFS_TICK)) { - if (client->cl_import->imp_state == LUSTRE_IMP_FULL && - client->cl_avail_grant > GRANT_SHRINK_LIMIT) - return 1; - else - osc_update_next_shrink(client); - } + if (cfs_time_aftereq(time, next_shrink - 5 * CFS_TICK)) { + /* Get the current RPC size directly, instead of going via: + * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export) + * Keep comment here so that it can be found by searching. */ + int brw_size = client->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT; + + if (client->cl_import->imp_state == LUSTRE_IMP_FULL && + client->cl_avail_grant > brw_size) + return 1; + else + osc_update_next_shrink(client); + } return 0; } static int osc_grant_shrink_grant_cb(struct timeout_item *item, void *data) { - struct client_obd *client; + struct client_obd *client; - cfs_list_for_each_entry(client, &item->ti_obd_list, - cl_grant_shrink_list) { - if (osc_should_shrink_grant(client)) - osc_shrink_grant(client); - } - return 0; + list_for_each_entry(client, &item->ti_obd_list, cl_grant_shrink_list) { + if (osc_should_shrink_grant(client)) + osc_shrink_grant(client); + } + return 0; } static int osc_add_shrink_grant(struct client_obd *client) @@ -1118,38 +1038,42 @@ static int osc_del_shrink_grant(struct client_obd *client) static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) { - /* - * ocd_grant is the total grant amount we're expect to hold: if we've - * been evicted, it's the new avail_grant amount, cl_dirty will drop - * to 0 as inflight RPCs fail out; otherwise, it's avail_grant + dirty. - * - * race is tolerable here: if we're evicted, but imp_state already - * left EVICTED state, then cl_dirty must be 0 already. 
- */ - client_obd_list_lock(&cli->cl_loi_list_lock); - if (cli->cl_import->imp_state == LUSTRE_IMP_EVICTED) - cli->cl_avail_grant = ocd->ocd_grant; - else - cli->cl_avail_grant = ocd->ocd_grant - cli->cl_dirty; + /* + * ocd_grant is the total grant amount we're expect to hold: if we've + * been evicted, it's the new avail_grant amount, cl_dirty_pages will + * drop to 0 as inflight RPCs fail out; otherwise, it's avail_grant + + * dirty. + * + * race is tolerable here: if we're evicted, but imp_state already + * left EVICTED state, then cl_dirty_pages must be 0 already. + */ + client_obd_list_lock(&cli->cl_loi_list_lock); + if (cli->cl_import->imp_state == LUSTRE_IMP_EVICTED) + cli->cl_avail_grant = ocd->ocd_grant; + else + cli->cl_avail_grant = ocd->ocd_grant - + (cli->cl_dirty_pages << PAGE_CACHE_SHIFT); if (cli->cl_avail_grant < 0) { - CWARN("%s: available grant < 0, the OSS is probably not running" - " with patch from bug20278 (%ld) \n", - cli->cl_import->imp_obd->obd_name, cli->cl_avail_grant); - /* workaround for 1.6 servers which do not have - * the patch from bug20278 */ - cli->cl_avail_grant = ocd->ocd_grant; + CWARN("%s: available grant < 0: avail/ocd/dirty %ld/%u/%ld\n", + cli->cl_import->imp_obd->obd_name, cli->cl_avail_grant, + ocd->ocd_grant, cli->cl_dirty_pages << PAGE_CACHE_SHIFT); + /* workaround for servers which do not have the patch from + * LU-2679 */ + cli->cl_avail_grant = ocd->ocd_grant; } - client_obd_list_unlock(&cli->cl_loi_list_lock); + /* determine the appropriate chunk size used by osc_extent. */ + cli->cl_chunkbits = max_t(int, PAGE_CACHE_SHIFT, ocd->ocd_blocksize); + client_obd_list_unlock(&cli->cl_loi_list_lock); - CDEBUG(D_CACHE, "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld \n", - cli->cl_import->imp_obd->obd_name, - cli->cl_avail_grant, cli->cl_lost_grant); + CDEBUG(D_CACHE, "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld." 
+ "chunk bits: %d.\n", cli->cl_import->imp_obd->obd_name, + cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits); - if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK && - cfs_list_empty(&cli->cl_grant_shrink_list)) - osc_add_shrink_grant(cli); + if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK && + list_empty(&cli->cl_grant_shrink_list)) + osc_add_shrink_grant(cli); } /* We assume that the reason this OSC got a short read is because it read @@ -1166,29 +1090,29 @@ static void handle_short_read(int nob_read, obd_count page_count, while (nob_read > 0) { LASSERT (page_count > 0); - if (pga[i]->count > nob_read) { - /* EOF inside this page */ - ptr = cfs_kmap(pga[i]->pg) + - (pga[i]->off & ~CFS_PAGE_MASK); - memset(ptr + nob_read, 0, pga[i]->count - nob_read); - cfs_kunmap(pga[i]->pg); - page_count--; - i++; - break; - } + if (pga[i]->count > nob_read) { + /* EOF inside this page */ + ptr = kmap(pga[i]->pg) + + (pga[i]->off & ~CFS_PAGE_MASK); + memset(ptr + nob_read, 0, pga[i]->count - nob_read); + kunmap(pga[i]->pg); + page_count--; + i++; + break; + } nob_read -= pga[i]->count; page_count--; i++; } - /* zero remaining pages */ - while (page_count-- > 0) { - ptr = cfs_kmap(pga[i]->pg) + (pga[i]->off & ~CFS_PAGE_MASK); - memset(ptr, 0, pga[i]->count); - cfs_kunmap(pga[i]->pg); - i++; - } + /* zero remaining pages */ + while (page_count-- > 0) { + ptr = kmap(pga[i]->pg) + (pga[i]->off & ~CFS_PAGE_MASK); + memset(ptr, 0, pga[i]->count); + kunmap(pga[i]->pg); + i++; + } } static int check_write_rcs(struct ptlrpc_request *req, @@ -1230,14 +1154,15 @@ static int check_write_rcs(struct ptlrpc_request *req, static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) { if (p1->flag != p2->flag) { - unsigned mask = ~(OBD_BRW_FROM_GRANT| OBD_BRW_NOCACHE| - OBD_BRW_SYNC|OBD_BRW_ASYNC|OBD_BRW_NOQUOTA); + unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE | + OBD_BRW_SYNC | OBD_BRW_ASYNC | + OBD_BRW_NOQUOTA | OBD_BRW_SOFT_SYNC); /* warn if we try to combine flags that we don't know to be * safe to combine */ if (unlikely((p1->flag & mask) != (p2->flag & mask))) { CWARN("Saw flags 0x%x and 0x%x in the same brw, please " - "report this at http://bugs.whamcloud.com/\n", + "report this at https://jira.hpdd.intel.com/\n", p1->flag, p2->flag); } return 0; @@ -1247,39 +1172,58 @@ static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) } static obd_count osc_checksum_bulk(int nob, obd_count pg_count, - struct brw_page **pga, int opc, - cksum_type_t cksum_type) -{ - __u32 cksum; - int i = 0; - - LASSERT (pg_count > 0); - cksum = init_checksum(cksum_type); - while (nob > 0 && pg_count > 0) { - unsigned char *ptr = cfs_kmap(pga[i]->pg); - int off = pga[i]->off & ~CFS_PAGE_MASK; - int count = pga[i]->count > nob ? 
nob : pga[i]->count; - - /* corrupt the data before we compute the checksum, to - * simulate an OST->client data error */ - if (i == 0 && opc == OST_READ && - OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) - memcpy(ptr + off, "bad1", min(4, nob)); - cksum = compute_checksum(cksum, ptr + off, count, cksum_type); - cfs_kunmap(pga[i]->pg); - LL_CDEBUG_PAGE(D_PAGE, pga[i]->pg, "off %d checksum %x\n", - off, cksum); - - nob -= pga[i]->count; - pg_count--; - i++; - } - /* For sending we only compute the wrong checksum instead - * of corrupting the data so it is still correct on a redo */ - if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) - cksum++; - - return fini_checksum(cksum, cksum_type); + struct brw_page **pga, int opc, + cksum_type_t cksum_type) +{ + __u32 cksum; + int i = 0; + struct cfs_crypto_hash_desc *hdesc; + unsigned int bufsize; + int err; + unsigned char cfs_alg = cksum_obd2cfs(cksum_type); + + LASSERT(pg_count > 0); + + hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(hdesc)) { + CERROR("Unable to initialize checksum hash %s\n", + cfs_crypto_hash_name(cfs_alg)); + return PTR_ERR(hdesc); + } + + while (nob > 0 && pg_count > 0) { + int count = pga[i]->count > nob ? nob : pga[i]->count; + + /* corrupt the data before we compute the checksum, to + * simulate an OST->client data error */ + if (i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) { + unsigned char *ptr = kmap(pga[i]->pg); + int off = pga[i]->off & ~CFS_PAGE_MASK; + + memcpy(ptr + off, "bad1", min(4, nob)); + kunmap(pga[i]->pg); + } + cfs_crypto_hash_update_page(hdesc, pga[i]->pg, + pga[i]->off & ~CFS_PAGE_MASK, + count); + LL_CDEBUG_PAGE(D_PAGE, pga[i]->pg, "off %d\n", + (int)(pga[i]->off & ~CFS_PAGE_MASK)); + + nob -= pga[i]->count; + pg_count--; + i++; + } + + bufsize = sizeof(cksum); + err = cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize); + + /* For sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo */ + if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) + cksum++; + + return cksum; } static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, @@ -1336,13 +1280,14 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, } req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ ptlrpc_at_set_req_timeout(req); + /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own + * retry logic */ + req->rq_no_retry_einprogress = 1; - if (opc == OST_WRITE) - desc = ptlrpc_prep_bulk_imp(req, page_count, - BULK_GET_SOURCE, OST_BULK_PORTAL); - else - desc = ptlrpc_prep_bulk_imp(req, page_count, - BULK_PUT_SINK, OST_BULK_PORTAL); + desc = ptlrpc_prep_bulk_imp(req, page_count, + cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS, + opc == OST_WRITE ? 
BULK_GET_SOURCE : BULK_PUT_SINK, + OST_BULK_PORTAL); if (desc == NULL) GOTO(out, rc = -ENOMEM); @@ -1353,26 +1298,32 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); LASSERT(body != NULL && ioobj != NULL && niobuf != NULL); - lustre_set_wire_obdo(&body->oa, oa); - - obdo_to_ioobj(oa, ioobj); - ioobj->ioo_bufcnt = niocount; - osc_pack_capa(req, body, ocapa); - LASSERT (page_count > 0); - pg_prev = pga[0]; + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + obdo_to_ioobj(oa, ioobj); + ioobj->ioo_bufcnt = niocount; + /* The high bits of ioo_max_brw tells server _maximum_ number of bulks + * that might be send for this request. The actual number is decided + * when the RPC is finally sent in ptlrpc_register_bulk(). It sends + * "max - 1" for old client compatibility sending "0", and also so the + * the actual maximum is a power-of-two number, not one less. LU-1431 */ + ioobj_max_brw_set(ioobj, desc->bd_md_max_brw); + osc_pack_capa(req, body, ocapa); + LASSERT(page_count > 0); + pg_prev = pga[0]; for (requested_nob = i = 0; i < page_count; i++, niobuf++) { struct brw_page *pg = pga[i]; int poff = pg->off & ~CFS_PAGE_MASK; LASSERT(pg->count > 0); /* make sure there is no gap in the middle of page array */ - LASSERTF(page_count == 1 || - (ergo(i == 0, poff + pg->count == CFS_PAGE_SIZE) && - ergo(i > 0 && i < page_count - 1, - poff == 0 && pg->count == CFS_PAGE_SIZE) && - ergo(i == page_count - 1, poff == 0)), - "i: %d/%d pg: %p off: "LPU64", count: %u\n", - i, page_count, pg, pg->off, pg->count); + LASSERTF(page_count == 1 || + (ergo(i == 0, poff + pg->count == PAGE_CACHE_SIZE) && + ergo(i > 0 && i < page_count - 1, + poff == 0 && pg->count == PAGE_CACHE_SIZE) && + ergo(i == page_count - 1, poff == 0)), + "i: %d/%d pg: %p off: "LPU64", count: %u\n", + i, page_count, pg, pg->off, pg->count); #ifdef __linux__ LASSERTF(i == 0 || pg->off > pg_prev->off, "i %d p_c %u pg %p [pri %lu ind %lu] off "LPU64 @@ -1388,16 +1339,16 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) == (pg->flag & OBD_BRW_SRVLOCK)); - ptlrpc_prep_bulk_page(desc, pg->pg, poff, pg->count); + ptlrpc_prep_bulk_page_pin(desc, pg->pg, poff, pg->count); requested_nob += pg->count; if (i > 0 && can_merge_pages(pg_prev, pg)) { niobuf--; - niobuf->len += pg->count; - } else { - niobuf->offset = pg->off; - niobuf->len = pg->count; - niobuf->flags = pg->flag; + niobuf->rnb_len += pg->count; + } else { + niobuf->rnb_offset = pg->off; + niobuf->rnb_len = pg->count; + niobuf->rnb_flags = pg->flag; } pg_prev = pg; } @@ -1421,7 +1372,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, /* size[REQ_REC_OFF] still sizeof (*body) */ if (opc == OST_WRITE) { - if (unlikely(cli->cl_checksum) && + if (cli->cl_checksum && !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { /* store cl_cksum_type in a local variable since * it can be changed via lprocfs */ @@ -1452,7 +1403,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER, sizeof(__u32) * niocount); } else { - if (unlikely(cli->cl_checksum) && + if (cli->cl_checksum && !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) body->oa.o_flags = 0; @@ -1471,7 +1422,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, aa->aa_resends = 0; 
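+	/* aa aliases req->rq_async_args, hence the
+	 * CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)) guard used
+	 * alongside ptlrpc_req_async_args() throughout this file. */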
aa->aa_ppga = pga; aa->aa_cli = cli; - CFS_INIT_LIST_HEAD(&aa->aa_oaps); + INIT_LIST_HEAD(&aa->aa_oaps); if (ocapa && reserve) aa->aa_ocapa = capa_get(ocapa); @@ -1514,20 +1465,18 @@ static int check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer, msg = "changed in transit AND doesn't match the original - " "likely false positive due to mmap IO (bug 11742)"; - LCONSOLE_ERROR_MSG(0x132, "BAD WRITE CHECKSUM: %s: from %s inode "DFID - " object "LPU64"/"LPU64" extent ["LPU64"-"LPU64"]\n", - msg, libcfs_nid2str(peer->nid), - oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, - oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, - oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0, - oa->o_id, - oa->o_valid & OBD_MD_FLGROUP ? oa->o_seq : (__u64)0, - pga[0]->off, - pga[page_count-1]->off + pga[page_count-1]->count - 1); - CERROR("original client csum %x (type %x), server csum %x (type %x), " - "client csum now %x\n", client_cksum, client_cksum_type, - server_cksum, cksum_type, new_cksum); - return 1; + LCONSOLE_ERROR_MSG(0x132, "BAD WRITE CHECKSUM: %s: from %s inode "DFID + " object "DOSTID" extent ["LPU64"-"LPU64"]\n", + msg, libcfs_nid2str(peer->nid), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0, + POSTID(&oa->o_oi), pga[0]->off, + pga[page_count-1]->off + pga[page_count-1]->count - 1); + CERROR("original client csum %x (type %x), server csum %x (type %x), " + "client csum now %x\n", client_cksum, client_cksum_type, + server_cksum, cksum_type, new_cksum); + return 1; } /* Note rc enters this function as number of bytes transferred */ @@ -1636,42 +1585,34 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) router = libcfs_nid2str(req->rq_bulk->bd_sender); } - if (server_cksum == ~0 && rc > 0) { - CERROR("Protocol error: server %s set the 'checksum' " - "bit, but didn't send a checksum. Not fatal, " - "but please notify on http://bugs.whamcloud.com/\n", - libcfs_nid2str(peer->nid)); - } else if (server_cksum != client_cksum) { - LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from " - "%s%s%s inode "DFID" object " - LPU64"/"LPU64" extent " - "["LPU64"-"LPU64"]\n", - req->rq_import->imp_obd->obd_name, - libcfs_nid2str(peer->nid), - via, router, - body->oa.o_valid & OBD_MD_FLFID ? - body->oa.o_parent_seq : (__u64)0, - body->oa.o_valid & OBD_MD_FLFID ? - body->oa.o_parent_oid : 0, - body->oa.o_valid & OBD_MD_FLFID ? - body->oa.o_parent_ver : 0, - body->oa.o_id, - body->oa.o_valid & OBD_MD_FLGROUP ? - body->oa.o_seq : (__u64)0, - aa->aa_ppga[0]->off, - aa->aa_ppga[aa->aa_page_count-1]->off + - aa->aa_ppga[aa->aa_page_count-1]->count - - 1); - CERROR("client %x, server %x, cksum_type %x\n", - client_cksum, server_cksum, cksum_type); - cksum_counter = 0; - aa->aa_oa->o_cksum = client_cksum; - rc = -EAGAIN; - } else { - cksum_counter++; - CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); - rc = 0; - } + if (server_cksum != client_cksum) { + LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from " + "%s%s%s inode "DFID" object "DOSTID + " extent ["LPU64"-"LPU64"]\n", + req->rq_import->imp_obd->obd_name, + libcfs_nid2str(peer->nid), + via, router, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_seq : (__u64)0, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_oid : 0, + body->oa.o_valid & OBD_MD_FLFID ? 
+ body->oa.o_parent_ver : 0, + POSTID(&body->oa.o_oi), + aa->aa_ppga[0]->off, + aa->aa_ppga[aa->aa_page_count-1]->off + + aa->aa_ppga[aa->aa_page_count-1]->count - + 1); + CERROR("client %x, server %x, cksum_type %x\n", + client_cksum, server_cksum, cksum_type); + cksum_counter = 0; + aa->aa_oa->o_cksum = client_cksum; + rc = -EAGAIN; + } else { + cksum_counter++; + CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); + rc = 0; + } } else if (unlikely(client_cksum)) { static int cksum_missed; @@ -1683,76 +1624,23 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) rc = 0; } out: - if (rc >= 0) - lustre_get_wire_obdo(aa->aa_oa, &body->oa); + if (rc >= 0) + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, + aa->aa_oa, &body->oa); RETURN(rc); } -static int osc_brw_internal(int cmd, struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *lsm, - obd_count page_count, struct brw_page **pga, - struct obd_capa *ocapa) -{ - struct ptlrpc_request *req; - int rc; - cfs_waitq_t waitq; - int resends = 0; - struct l_wait_info lwi; - - ENTRY; - - cfs_waitq_init(&waitq); - -restart_bulk: - rc = osc_brw_prep_request(cmd, &exp->exp_obd->u.cli, oa, lsm, - page_count, pga, &req, ocapa, 0, resends); - if (rc != 0) - return (rc); - - rc = ptlrpc_queue_wait(req); - - if (rc == -ETIMEDOUT && req->rq_resend) { - DEBUG_REQ(D_HA, req, "BULK TIMEOUT"); - ptlrpc_req_finished(req); - goto restart_bulk; - } - - rc = osc_brw_fini_request(req, rc); - - ptlrpc_req_finished(req); - if (osc_recoverable_error(rc)) { - resends++; - if (!client_should_resend(resends, &exp->exp_obd->u.cli)) { - CERROR("too many resend retries, returning error\n"); - RETURN(-EIO); - } - - lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(resends), NULL, NULL, NULL); - l_wait_event(waitq, 0, &lwi); - - goto restart_bulk; - } - - RETURN (rc); -} - -int osc_brw_redo_request(struct ptlrpc_request *request, - struct osc_brw_async_args *aa) +static int osc_brw_redo_request(struct ptlrpc_request *request, + struct osc_brw_async_args *aa, int rc) { struct ptlrpc_request *new_req; - struct ptlrpc_request_set *set = request->rq_set; struct osc_brw_async_args *new_aa; struct osc_async_page *oap; - int rc = 0; ENTRY; - if (!client_should_resend(aa->aa_resends, aa->aa_cli)) { - CERROR("too many resent retries, returning error\n"); - RETURN(-EIO); - } - - DEBUG_REQ(D_ERROR, request, "redo for recoverable error"); + DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request, + "redo for recoverable error %d", rc); rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) == OST_WRITE ? 
OBD_BRW_WRITE :OBD_BRW_READ, @@ -1763,15 +1651,12 @@ int osc_brw_redo_request(struct ptlrpc_request *request, if (rc) RETURN(rc); - client_obd_list_lock(&aa->aa_cli->cl_loi_list_lock); - - cfs_list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) { + list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) { if (oap->oap_request != NULL) { LASSERTF(request == oap->oap_request, "request %p != oap_request %p\n", request, oap->oap_request); if (oap->oap_interrupted) { - client_obd_list_unlock(&aa->aa_cli->cl_loi_list_lock); ptlrpc_req_finished(new_req); RETURN(-EINTR); } @@ -1782,15 +1667,25 @@ int osc_brw_redo_request(struct ptlrpc_request *request, aa->aa_resends++; new_req->rq_interpret_reply = request->rq_interpret_reply; new_req->rq_async_args = request->rq_async_args; - new_req->rq_sent = cfs_time_current_sec() + aa->aa_resends; + new_req->rq_commit_cb = request->rq_commit_cb; + /* cap resend delay to the current request timeout, this is similar to + * what ptlrpc does (see after_reply()) */ + if (aa->aa_resends > new_req->rq_timeout) + new_req->rq_sent = cfs_time_current_sec() + new_req->rq_timeout; + else + new_req->rq_sent = cfs_time_current_sec() + aa->aa_resends; + new_req->rq_generation_set = 1; + new_req->rq_import_generation = request->rq_import_generation; new_aa = ptlrpc_req_async_args(new_req); - CFS_INIT_LIST_HEAD(&new_aa->aa_oaps); - cfs_list_splice(&aa->aa_oaps, &new_aa->aa_oaps); - CFS_INIT_LIST_HEAD(&aa->aa_oaps); + INIT_LIST_HEAD(&new_aa->aa_oaps); + list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps); + INIT_LIST_HEAD(&new_aa->aa_exts); + list_splice_init(&aa->aa_exts, &new_aa->aa_exts); + new_aa->aa_resends = aa->aa_resends; - cfs_list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) { + list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) { if (oap->oap_request) { ptlrpc_req_finished(oap->oap_request); oap->oap_request = ptlrpc_request_addref(new_req); @@ -1800,16 +1695,14 @@ int osc_brw_redo_request(struct ptlrpc_request *request, new_aa->aa_ocapa = aa->aa_ocapa; aa->aa_ocapa = NULL; - /* use ptlrpc_set_add_req is safe because interpret functions work - * in check_set context. only one way exist with access to request - * from different thread got -EINTR - this way protected with - * cl_loi_list_lock */ - ptlrpc_set_add_req(set, new_req); - - client_obd_list_unlock(&aa->aa_cli->cl_loi_list_lock); + /* XXX: This code will run into problem if we're going to support + * to add a series of BRW RPCs into a self-defined ptlrpc_request_set + * and wait for all of them to be finished. We should inherit request + * set from old request. 
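+	 *
+	 * Note the resend is not dispatched immediately: rq_sent above is
+	 * pushed into the future by min(aa_resends, rq_timeout) seconds,
+	 * the same backoff cap that ptlrpc applies in after_reply().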
*/ + ptlrpcd_add_req(new_req, PDL_POLICY_SAME, -1); - DEBUG_REQ(D_INFO, new_req, "new request"); - RETURN(0); + DEBUG_REQ(D_INFO, new_req, "new request"); + RETURN(0); } /* @@ -1843,1317 +1736,400 @@ static void sort_brw_pages(struct brw_page **array, int num) } while (stride > 1); } -static obd_count max_unfragmented_pages(struct brw_page **pg, obd_count pages) -{ - int count = 1; - int offset; - int i = 0; - - LASSERT (pages > 0); - offset = pg[i]->off & ~CFS_PAGE_MASK; - - for (;;) { - pages--; - if (pages == 0) /* that's all */ - return count; - - if (offset + pg[i]->count < CFS_PAGE_SIZE) - return count; /* doesn't end on page boundary */ - - i++; - offset = pg[i]->off & ~CFS_PAGE_MASK; - if (offset != 0) /* doesn't start on page boundary */ - return count; - - count++; - } -} - -static struct brw_page **osc_build_ppga(struct brw_page *pga, obd_count count) -{ - struct brw_page **ppga; - int i; - - OBD_ALLOC(ppga, sizeof(*ppga) * count); - if (ppga == NULL) - return NULL; - - for (i = 0; i < count; i++) - ppga[i] = pga + i; - return ppga; -} - static void osc_release_ppga(struct brw_page **ppga, obd_count count) { LASSERT(ppga != NULL); OBD_FREE(ppga, sizeof(*ppga) * count); } -static int osc_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo, - obd_count page_count, struct brw_page *pga, - struct obd_trans_info *oti) +static int brw_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *data, int rc) { - struct obdo *saved_oa = NULL; - struct brw_page **ppga, **orig; - struct obd_import *imp = class_exp2cliimp(exp); - struct client_obd *cli; - int rc, page_count_orig; + struct osc_brw_async_args *aa = data; + struct osc_extent *ext; + struct osc_extent *tmp; + struct client_obd *cli = aa->aa_cli; ENTRY; - LASSERT((imp != NULL) && (imp->imp_obd != NULL)); - cli = &imp->imp_obd->u.cli; - - if (cmd & OBD_BRW_CHECK) { - /* The caller just wants to know if there's a chance that this - * I/O can succeed */ + rc = osc_brw_fini_request(req, rc); + CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc); + /* When server return -EINPROGRESS, client should always retry + * regardless of the number of times the bulk was resent already. */ + if (osc_recoverable_error(rc)) { + if (req->rq_import_generation != + req->rq_import->imp_generation) { + CDEBUG(D_HA, "%s: resend cross eviction for object: " + ""DOSTID", rc = %d.\n", + req->rq_import->imp_obd->obd_name, + POSTID(&aa->aa_oa->o_oi), rc); + } else if (rc == -EINPROGRESS || + client_should_resend(aa->aa_resends, aa->aa_cli)) { + rc = osc_brw_redo_request(req, aa, rc); + } else { + CERROR("%s: too many resent retries for object: " + ""LPU64":"LPU64", rc = %d.\n", + req->rq_import->imp_obd->obd_name, + POSTID(&aa->aa_oa->o_oi), rc); + } + + if (rc == 0) + RETURN(0); + else if (rc == -EAGAIN || rc == -EINPROGRESS) + rc = -EIO; + } - if (imp->imp_invalid) - RETURN(-EIO); - RETURN(0); + if (aa->aa_ocapa) { + capa_put(aa->aa_ocapa); + aa->aa_ocapa = NULL; } - /* test_brw with a failed create can trip this, maybe others. 
*/ - LASSERT(cli->cl_max_pages_per_rpc); - - rc = 0; - - orig = ppga = osc_build_ppga(pga, page_count); - if (ppga == NULL) - RETURN(-ENOMEM); - page_count_orig = page_count; + if (rc == 0) { + struct obdo *oa = aa->aa_oa; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned long valid = 0; + struct cl_object *obj; + struct osc_async_page *last; + + last = brw_page2oap(aa->aa_ppga[aa->aa_page_count - 1]); + obj = osc2cl(last->oap_obj); + + cl_object_attr_lock(obj); + if (oa->o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = oa->o_blocks; + valid |= CAT_BLOCKS; + } + if (oa->o_valid & OBD_MD_FLMTIME) { + attr->cat_mtime = oa->o_mtime; + valid |= CAT_MTIME; + } + if (oa->o_valid & OBD_MD_FLATIME) { + attr->cat_atime = oa->o_atime; + valid |= CAT_ATIME; + } + if (oa->o_valid & OBD_MD_FLCTIME) { + attr->cat_ctime = oa->o_ctime; + valid |= CAT_CTIME; + } + + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) { + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + loff_t last_off = last->oap_count + last->oap_obj_off; + + /* Change file size if this is an out of quota or + * direct IO write and it extends the file size */ + if (loi->loi_lvb.lvb_size < last_off) { + attr->cat_size = last_off; + valid |= CAT_SIZE; + } + /* Extend KMS if it's not a lockless write */ + if (loi->loi_kms < last_off && + oap2osc_page(last)->ops_srvlock == 0) { + attr->cat_kms = last_off; + valid |= CAT_KMS; + } + } + + if (valid != 0) + cl_object_attr_set(env, obj, attr, valid); + cl_object_attr_unlock(obj); + } + OBDO_FREE(aa->aa_oa); + + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && rc == 0) + osc_inc_unstable_pages(req); + + list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) { + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 1, rc); + } + LASSERT(list_empty(&aa->aa_exts)); + LASSERT(list_empty(&aa->aa_oaps)); + + cl_req_completion(env, aa->aa_clerq, rc < 0 ? rc : + req->rq_bulk->bd_nob_transferred); + osc_release_ppga(aa->aa_ppga, aa->aa_page_count); + ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred); + + client_obd_list_lock(&cli->cl_loi_list_lock); + /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters + * is called so we know whether to go to sync BRWs or wait for more + * RPCs to complete */ + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) + cli->cl_w_in_flight--; + else + cli->cl_r_in_flight--; + osc_wake_cache_waiters(cli); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + osc_io_unplug(env, cli, NULL, PDL_POLICY_SAME); + RETURN(rc); +} + +static void brw_commit(struct ptlrpc_request *req) +{ + /* If osc_inc_unstable_pages (via osc_extent_finish) races with + * this called via the rq_commit_cb, I need to ensure + * osc_dec_unstable_pages is still called. Otherwise unstable + * pages may be leaked. */ + spin_lock(&req->rq_lock); + if (likely(req->rq_unstable)) { + req->rq_unstable = 0; + spin_unlock(&req->rq_lock); + + osc_dec_unstable_pages(req); + } else { + req->rq_committed = 1; + spin_unlock(&req->rq_lock); + } +} - sort_brw_pages(ppga, page_count); - while (page_count) { - obd_count pages_per_brw; +/** + * Build an RPC by the list of extent @ext_list. The caller must ensure + * that the total pages in this list are NOT over max pages per RPC. + * Extents in the list must be in OES_RPC state. 
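+ *
+ * A caller sketch, modelled on the osc_cache.c write path (the names
+ * here are illustrative):
+ *
+ *	struct list_head rpclist = LIST_HEAD_INIT(rpclist);
+ *
+ *	list_move_tail(&ext->oe_link, &rpclist);	/* ext in OES_RPC */
+ *	rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_WRITE, pol);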
+ */
+int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
+                  struct list_head *ext_list, int cmd, pdl_policy_t pol)
+{
+        struct ptlrpc_request *req = NULL;
+        struct osc_extent *ext;
+        struct brw_page **pga = NULL;
+        struct osc_brw_async_args *aa = NULL;
+        struct obdo *oa = NULL;
+        struct osc_async_page *oap;
+        struct osc_async_page *tmp;
+        struct cl_req *clerq = NULL;
+        enum cl_req_type crt = (cmd & OBD_BRW_WRITE) ? CRT_WRITE :
+                CRT_READ;
+        struct cl_req_attr *crattr = NULL;
+        obd_off starting_offset = OBD_OBJECT_EOF;
+        obd_off ending_offset = 0;
+        int mpflag = 0;
+        int mem_tight = 0;
+        int page_count = 0;
+        bool soft_sync = false;
+        int i;
+        int rc;
+        struct list_head rpc_list = LIST_HEAD_INIT(rpc_list);
+
+        ENTRY;
+        LASSERT(!list_empty(ext_list));
+
+        /* add pages into rpc_list to build BRW rpc */
+        list_for_each_entry(ext, ext_list, oe_link) {
+                LASSERT(ext->oe_state == OES_RPC);
+                mem_tight |= ext->oe_memalloc;
+                list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+                        ++page_count;
+                        list_add_tail(&oap->oap_rpc_item, &rpc_list);
+                        if (starting_offset > oap->oap_obj_off)
+                                starting_offset = oap->oap_obj_off;
+                        else
+                                LASSERT(oap->oap_page_off == 0);
+                        if (ending_offset < oap->oap_obj_off + oap->oap_count)
+                                ending_offset = oap->oap_obj_off +
+                                                oap->oap_count;
+                        else
+                                LASSERT(oap->oap_page_off + oap->oap_count ==
+                                        PAGE_CACHE_SIZE);
+                }
+        }
+
+        soft_sync = osc_over_unstable_soft_limit(cli);
+        if (mem_tight)
+                mpflag = cfs_memory_pressure_get_and_set();
+
+        OBD_ALLOC(crattr, sizeof(*crattr));
+        if (crattr == NULL)
+                GOTO(out, rc = -ENOMEM);
+
+        OBD_ALLOC(pga, sizeof(*pga) * page_count);
+        if (pga == NULL)
+                GOTO(out, rc = -ENOMEM);
+
+        OBDO_ALLOC(oa);
+        if (oa == NULL)
+                GOTO(out, rc = -ENOMEM);
+
+        i = 0;
+        list_for_each_entry(oap, &rpc_list, oap_rpc_item) {
+                struct cl_page *page = oap2cl_page(oap);
+                if (clerq == NULL) {
+                        clerq = cl_req_alloc(env, page, crt,
+                                             1 /* only 1-object rpcs for now */);
+                        if (IS_ERR(clerq))
+                                GOTO(out, rc = PTR_ERR(clerq));
+                }
+                if (mem_tight)
+                        oap->oap_brw_flags |= OBD_BRW_MEMALLOC;
+                if (soft_sync)
+                        oap->oap_brw_flags |= OBD_BRW_SOFT_SYNC;
+                pga[i] = &oap->oap_brw_page;
+                pga[i]->off = oap->oap_obj_off + oap->oap_page_off;
+                CDEBUG(0, "put page %p index %lu oap %p flg %x to pga\n",
+                       pga[i]->pg, page_index(oap->oap_page), oap,
+                       pga[i]->flag);
+                i++;
+                cl_req_page_add(env, clerq, page);
+        }
+
+        /* always get the data for the obdo for the rpc */
+        LASSERT(clerq != NULL);
+        crattr->cra_oa = oa;
+        cl_req_attr_set(env, clerq, crattr, ~0ULL);
+
+        rc = cl_req_prep(env, clerq);
+        if (rc != 0) {
+                CERROR("cl_req_prep failed: %d\n", rc);
+                GOTO(out, rc);
+        }
+
+        sort_brw_pages(pga, page_count);
+        rc = osc_brw_prep_request(cmd, cli, oa, NULL, page_count,
+                                  pga, &req, crattr->cra_capa, 1, 0);
+        if (rc != 0) {
+                CERROR("prep_req failed: %d\n", rc);
+                GOTO(out, rc);
+        }
+
+        req->rq_commit_cb = brw_commit;
+        req->rq_interpret_reply = brw_interpret;
+
+        if (mem_tight != 0)
+                req->rq_memalloc = 1;
+
+        /* Need to update the timestamps after the request is built in case
+         * we race with setattr (locally or in queue at OST). If OST gets
+         * later setattr before earlier BRW (as determined by the request xid),
+         * the OST will not use BRW timestamps. Sadly, there is no obvious
+         * way to do this in a single call. bug 10150 */
+        cl_req_attr_set(env, clerq, crattr,
+                        OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME);
+
+        lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid);
+
+        CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+        aa = ptlrpc_req_async_args(req);
+        INIT_LIST_HEAD(&aa->aa_oaps);
+        list_splice_init(&rpc_list, &aa->aa_oaps);
+        INIT_LIST_HEAD(&aa->aa_exts);
+        list_splice_init(ext_list, &aa->aa_exts);
+        aa->aa_clerq = clerq;
+
+        /* queued sync pages can be torn down while the pages
+         * were between the pending list and the rpc */
+        tmp = NULL;
+        list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) {
+                /* only one oap gets a request reference */
+                if (tmp == NULL)
+                        tmp = oap;
+                if (oap->oap_interrupted && !req->rq_intr) {
+                        CDEBUG(D_INODE, "oap %p in req %p interrupted\n",
+                               oap, req);
+                        ptlrpc_mark_interrupted(req);
+                }
+        }
+        if (tmp != NULL)
+                tmp->oap_request = ptlrpc_request_addref(req);
+
+        client_obd_list_lock(&cli->cl_loi_list_lock);
+        starting_offset >>= PAGE_CACHE_SHIFT;
+        if (cmd == OBD_BRW_READ) {
+                cli->cl_r_in_flight++;
+                lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count);
+                lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight);
+                lprocfs_oh_tally_log2(&cli->cl_read_offset_hist,
+                                      starting_offset + 1);
+        } else {
+                cli->cl_w_in_flight++;
+                lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count);
+                lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight);
+                lprocfs_oh_tally_log2(&cli->cl_write_offset_hist,
+                                      starting_offset + 1);
+        }
+        client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+        DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %dr/%dw in flight",
+                  page_count, aa, cli->cl_r_in_flight,
+                  cli->cl_w_in_flight);
+
+        /* XXX: Maybe the caller can check the RPC bulk descriptor to
+         * see which CPU/NUMA node the majority of pages were allocated
+         * on, and try to assign the async RPC to the CPU core
+         * (PDL_POLICY_PREFERRED) to reduce cross-CPU memory traffic.
+         *
+         * But on the other hand, we expect that multiple ptlrpcd
+         * threads and the initial write sponsor can run in parallel,
+         * especially when data checksum is enabled, which is CPU-bound
+         * operation and single ptlrpcd thread cannot process in time.
+         * So more ptlrpcd threads sharing BRW load
+         * (with PDL_POLICY_ROUND) seems better.
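+         *
+         * For now the policy chosen by the caller (@pol) is used
+         * unchanged; smarter CPU affinity is left as future work.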
+ */ + ptlrpcd_add_req(req, pol, -1); + rc = 0; + EXIT; - if (page_count > cli->cl_max_pages_per_rpc) - pages_per_brw = cli->cl_max_pages_per_rpc; - else - pages_per_brw = page_count; - - pages_per_brw = max_unfragmented_pages(ppga, pages_per_brw); - - if (saved_oa != NULL) { - /* restore previously saved oa */ - *oinfo->oi_oa = *saved_oa; - } else if (page_count > pages_per_brw) { - /* save a copy of oa (brw will clobber it) */ - OBDO_ALLOC(saved_oa); - if (saved_oa == NULL) - GOTO(out, rc = -ENOMEM); - *saved_oa = *oinfo->oi_oa; - } +out: + if (mem_tight != 0) + cfs_memory_pressure_restore(mpflag); + + if (crattr != NULL) { + capa_put(crattr->cra_capa); + OBD_FREE(crattr, sizeof(*crattr)); + } + + if (rc != 0) { + LASSERT(req == NULL); + + if (oa) + OBDO_FREE(oa); + if (pga) + OBD_FREE(pga, sizeof(*pga) * page_count); + /* this should happen rarely and is pretty bad, it makes the + * pending list not follow the dirty order */ + while (!list_empty(ext_list)) { + ext = list_entry(ext_list->next, struct osc_extent, + oe_link); + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 0, rc); + } + if (clerq && !IS_ERR(clerq)) + cl_req_completion(env, clerq, rc); + } + RETURN(rc); +} - rc = osc_brw_internal(cmd, exp, oinfo->oi_oa, oinfo->oi_md, - pages_per_brw, ppga, oinfo->oi_capa); +static int osc_set_lock_data_with_check(struct ldlm_lock *lock, + struct ldlm_enqueue_info *einfo) +{ + void *data = einfo->ei_cbdata; + int set = 0; - if (rc != 0) - break; + LASSERT(lock != NULL); + LASSERT(lock->l_blocking_ast == einfo->ei_cb_bl); + LASSERT(lock->l_resource->lr_type == einfo->ei_type); + LASSERT(lock->l_completion_ast == einfo->ei_cb_cp); + LASSERT(lock->l_glimpse_ast == einfo->ei_cb_gl); - page_count -= pages_per_brw; - ppga += pages_per_brw; - } + lock_res_and_lock(lock); + spin_lock(&osc_ast_guard); -out: - osc_release_ppga(orig, page_count_orig); + if (lock->l_ast_data == NULL) + lock->l_ast_data = data; + if (lock->l_ast_data == data) + set = 1; - if (saved_oa != NULL) - OBDO_FREE(saved_oa); + spin_unlock(&osc_ast_guard); + unlock_res_and_lock(lock); - RETURN(rc); + return set; } -/* The companion to osc_enter_cache(), called when @oap is no longer part of - * the dirty accounting. Writeback completes or truncate happens before - * writing starts. Must be called with the loi lock held. */ -static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap, - int sent) +static int osc_set_data_with_check(struct lustre_handle *lockh, + struct ldlm_enqueue_info *einfo) { - osc_release_write_grant(cli, &oap->oap_brw_page, sent); -} - - -/* This maintains the lists of pending pages to read/write for a given object - * (lop). This is used by osc_check_rpcs->osc_next_loi() and loi_list_maint() - * to quickly find objects that are ready to send an RPC. */ -static int lop_makes_rpc(struct client_obd *cli, struct loi_oap_pages *lop, - int cmd) -{ - ENTRY; - - if (lop->lop_num_pending == 0) - RETURN(0); - - /* if we have an invalid import we want to drain the queued pages - * by forcing them through rpcs that immediately fail and complete - * the pages. recovery relies on this to empty the queued pages - * before canceling the locks and evicting down the llite pages */ - if (cli->cl_import == NULL || cli->cl_import->imp_invalid) - RETURN(1); - - /* stream rpcs in queue order as long as as there is an urgent page - * queued. 
this is our cheap solution for good batching in the case - * where writepage marks some random page in the middle of the file - * as urgent because of, say, memory pressure */ - if (!cfs_list_empty(&lop->lop_urgent)) { - CDEBUG(D_CACHE, "urgent request forcing RPC\n"); - RETURN(1); - } - - if (cmd & OBD_BRW_WRITE) { - /* trigger a write rpc stream as long as there are dirtiers - * waiting for space. as they're waiting, they're not going to - * create more pages to coalesce with what's waiting.. */ - if (!cfs_list_empty(&cli->cl_cache_waiters)) { - CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); - RETURN(1); - } - } - if (lop->lop_num_pending >= cli->cl_max_pages_per_rpc) - RETURN(1); - - RETURN(0); -} - -static int lop_makes_hprpc(struct loi_oap_pages *lop) -{ - struct osc_async_page *oap; - ENTRY; - - if (cfs_list_empty(&lop->lop_urgent)) - RETURN(0); - - oap = cfs_list_entry(lop->lop_urgent.next, - struct osc_async_page, oap_urgent_item); - - if (oap->oap_async_flags & ASYNC_HP) { - CDEBUG(D_CACHE, "hp request forcing RPC\n"); - RETURN(1); - } - - RETURN(0); -} - -static void on_list(cfs_list_t *item, cfs_list_t *list, - int should_be_on) -{ - if (cfs_list_empty(item) && should_be_on) - cfs_list_add_tail(item, list); - else if (!cfs_list_empty(item) && !should_be_on) - cfs_list_del_init(item); -} - -/* maintain the loi's cli list membership invariants so that osc_send_oap_rpc - * can find pages to build into rpcs quickly */ -void loi_list_maint(struct client_obd *cli, struct lov_oinfo *loi) -{ - if (lop_makes_hprpc(&loi->loi_write_lop) || - lop_makes_hprpc(&loi->loi_read_lop)) { - /* HP rpc */ - on_list(&loi->loi_ready_item, &cli->cl_loi_ready_list, 0); - on_list(&loi->loi_hp_ready_item, &cli->cl_loi_hp_ready_list, 1); - } else { - on_list(&loi->loi_hp_ready_item, &cli->cl_loi_hp_ready_list, 0); - on_list(&loi->loi_ready_item, &cli->cl_loi_ready_list, - lop_makes_rpc(cli, &loi->loi_write_lop, OBD_BRW_WRITE)|| - lop_makes_rpc(cli, &loi->loi_read_lop, OBD_BRW_READ)); - } - - on_list(&loi->loi_write_item, &cli->cl_loi_write_list, - loi->loi_write_lop.lop_num_pending); - - on_list(&loi->loi_read_item, &cli->cl_loi_read_list, - loi->loi_read_lop.lop_num_pending); -} - -static void lop_update_pending(struct client_obd *cli, - struct loi_oap_pages *lop, int cmd, int delta) -{ - lop->lop_num_pending += delta; - if (cmd & OBD_BRW_WRITE) - cli->cl_pending_w_pages += delta; - else - cli->cl_pending_r_pages += delta; -} - -/** - * this is called when a sync waiter receives an interruption. Its job is to - * get the caller woken as soon as possible. If its page hasn't been put in an - * rpc yet it can dequeue immediately. Otherwise it has to mark the rpc as - * desiring interruption which will forcefully complete the rpc once the rpc - * has timed out. - */ -int osc_oap_interrupted(const struct lu_env *env, struct osc_async_page *oap) -{ - struct loi_oap_pages *lop; - struct lov_oinfo *loi; - int rc = -EBUSY; - ENTRY; - - LASSERT(!oap->oap_interrupted); - oap->oap_interrupted = 1; - - /* ok, it's been put in an rpc. 
only one oap gets a request reference */ - if (oap->oap_request != NULL) { - ptlrpc_mark_interrupted(oap->oap_request); - ptlrpcd_wake(oap->oap_request); - ptlrpc_req_finished(oap->oap_request); - oap->oap_request = NULL; - } - - /* - * page completion may be called only if ->cpo_prep() method was - * executed by osc_io_submit(), that also adds page the to pending list - */ - if (!cfs_list_empty(&oap->oap_pending_item)) { - cfs_list_del_init(&oap->oap_pending_item); - cfs_list_del_init(&oap->oap_urgent_item); - - loi = oap->oap_loi; - lop = (oap->oap_cmd & OBD_BRW_WRITE) ? - &loi->loi_write_lop : &loi->loi_read_lop; - lop_update_pending(oap->oap_cli, lop, oap->oap_cmd, -1); - loi_list_maint(oap->oap_cli, oap->oap_loi); - rc = oap->oap_caller_ops->ap_completion(env, - oap->oap_caller_data, - oap->oap_cmd, NULL, -EINTR); - } - - RETURN(rc); -} - -/* this is trying to propogate async writeback errors back up to the - * application. As an async write fails we record the error code for later if - * the app does an fsync. As long as errors persist we force future rpcs to be - * sync so that the app can get a sync error and break the cycle of queueing - * pages for which writeback will fail. */ -static void osc_process_ar(struct osc_async_rc *ar, __u64 xid, - int rc) -{ - if (rc) { - if (!ar->ar_rc) - ar->ar_rc = rc; - - ar->ar_force_sync = 1; - ar->ar_min_xid = ptlrpc_sample_next_xid(); - return; - - } - - if (ar->ar_force_sync && (xid >= ar->ar_min_xid)) - ar->ar_force_sync = 0; -} - -void osc_oap_to_pending(struct osc_async_page *oap) -{ - struct loi_oap_pages *lop; - - if (oap->oap_cmd & OBD_BRW_WRITE) - lop = &oap->oap_loi->loi_write_lop; - else - lop = &oap->oap_loi->loi_read_lop; - - if (oap->oap_async_flags & ASYNC_HP) - cfs_list_add(&oap->oap_urgent_item, &lop->lop_urgent); - else if (oap->oap_async_flags & ASYNC_URGENT) - cfs_list_add_tail(&oap->oap_urgent_item, &lop->lop_urgent); - cfs_list_add_tail(&oap->oap_pending_item, &lop->lop_pending); - lop_update_pending(oap->oap_cli, lop, oap->oap_cmd, 1); -} - -/* this must be called holding the loi list lock to give coverage to exit_cache, - * async_flag maintenance, and oap_request */ -static void osc_ap_completion(const struct lu_env *env, - struct client_obd *cli, struct obdo *oa, - struct osc_async_page *oap, int sent, int rc) -{ - __u64 xid = 0; - - ENTRY; - if (oap->oap_request != NULL) { - xid = ptlrpc_req_xid(oap->oap_request); - ptlrpc_req_finished(oap->oap_request); - oap->oap_request = NULL; - } - - cfs_spin_lock(&oap->oap_lock); - oap->oap_async_flags = 0; - cfs_spin_unlock(&oap->oap_lock); - oap->oap_interrupted = 0; - - if (oap->oap_cmd & OBD_BRW_WRITE) { - osc_process_ar(&cli->cl_ar, xid, rc); - osc_process_ar(&oap->oap_loi->loi_ar, xid, rc); - } - - if (rc == 0 && oa != NULL) { - if (oa->o_valid & OBD_MD_FLBLOCKS) - oap->oap_loi->loi_lvb.lvb_blocks = oa->o_blocks; - if (oa->o_valid & OBD_MD_FLMTIME) - oap->oap_loi->loi_lvb.lvb_mtime = oa->o_mtime; - if (oa->o_valid & OBD_MD_FLATIME) - oap->oap_loi->loi_lvb.lvb_atime = oa->o_atime; - if (oa->o_valid & OBD_MD_FLCTIME) - oap->oap_loi->loi_lvb.lvb_ctime = oa->o_ctime; - } - - rc = oap->oap_caller_ops->ap_completion(env, oap->oap_caller_data, - oap->oap_cmd, oa, rc); - - /* cl_page_completion() drops PG_locked. 
so, a new I/O on the page could - * start, but OSC calls it under lock and thus we can add oap back to - * pending safely */ - if (rc) - /* upper layer wants to leave the page on pending queue */ - osc_oap_to_pending(oap); - else - osc_exit_cache(cli, oap, sent); - EXIT; -} - -static int brw_queue_work(const struct lu_env *env, void *data) -{ - struct client_obd *cli = data; - - CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli); - - client_obd_list_lock(&cli->cl_loi_list_lock); - osc_check_rpcs0(env, cli, 1); - client_obd_list_unlock(&cli->cl_loi_list_lock); - RETURN(0); -} - -static int brw_interpret(const struct lu_env *env, - struct ptlrpc_request *req, void *data, int rc) -{ - struct osc_brw_async_args *aa = data; - struct client_obd *cli; - int async; - ENTRY; - - rc = osc_brw_fini_request(req, rc); - CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc); - if (osc_recoverable_error(rc)) { - rc = osc_brw_redo_request(req, aa); - if (rc == 0) - RETURN(0); - } - - if (aa->aa_ocapa) { - capa_put(aa->aa_ocapa); - aa->aa_ocapa = NULL; - } - - cli = aa->aa_cli; - client_obd_list_lock(&cli->cl_loi_list_lock); - - /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters - * is called so we know whether to go to sync BRWs or wait for more - * RPCs to complete */ - if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) - cli->cl_w_in_flight--; - else - cli->cl_r_in_flight--; - - async = cfs_list_empty(&aa->aa_oaps); - if (!async) { /* from osc_send_oap_rpc() */ - struct osc_async_page *oap, *tmp; - /* the caller may re-use the oap after the completion call so - * we need to clean it up a little */ - cfs_list_for_each_entry_safe(oap, tmp, &aa->aa_oaps, - oap_rpc_item) { - cfs_list_del_init(&oap->oap_rpc_item); - osc_ap_completion(env, cli, aa->aa_oa, oap, 1, rc); - } - OBDO_FREE(aa->aa_oa); - } else { /* from async_internal() */ - obd_count i; - for (i = 0; i < aa->aa_page_count; i++) - osc_release_write_grant(aa->aa_cli, aa->aa_ppga[i], 1); - } - osc_wake_cache_waiters(cli); - osc_check_rpcs0(env, cli, 1); - client_obd_list_unlock(&cli->cl_loi_list_lock); - - if (!async) - cl_req_completion(env, aa->aa_clerq, rc < 0 ? rc : - req->rq_bulk->bd_nob_transferred); - osc_release_ppga(aa->aa_ppga, aa->aa_page_count); - ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred); - - RETURN(rc); -} - -static struct ptlrpc_request *osc_build_req(const struct lu_env *env, - struct client_obd *cli, - cfs_list_t *rpc_list, - int page_count, int cmd) -{ - struct ptlrpc_request *req; - struct brw_page **pga = NULL; - struct osc_brw_async_args *aa; - struct obdo *oa = NULL; - const struct obd_async_page_ops *ops = NULL; - struct osc_async_page *oap; - struct osc_async_page *tmp; - struct cl_req *clerq = NULL; - enum cl_req_type crt = (cmd & OBD_BRW_WRITE) ? 
CRT_WRITE : CRT_READ; - struct ldlm_lock *lock = NULL; - struct cl_req_attr crattr; - int i, rc, mpflag = 0; - - ENTRY; - LASSERT(!cfs_list_empty(rpc_list)); - - if (cmd & OBD_BRW_MEMALLOC) - mpflag = cfs_memory_pressure_get_and_set(); - - memset(&crattr, 0, sizeof crattr); - OBD_ALLOC(pga, sizeof(*pga) * page_count); - if (pga == NULL) - GOTO(out, req = ERR_PTR(-ENOMEM)); - - OBDO_ALLOC(oa); - if (oa == NULL) - GOTO(out, req = ERR_PTR(-ENOMEM)); - - i = 0; - cfs_list_for_each_entry(oap, rpc_list, oap_rpc_item) { - struct cl_page *page = osc_oap2cl_page(oap); - if (ops == NULL) { - ops = oap->oap_caller_ops; - - clerq = cl_req_alloc(env, page, crt, - 1 /* only 1-object rpcs for - * now */); - if (IS_ERR(clerq)) - GOTO(out, req = (void *)clerq); - lock = oap->oap_ldlm_lock; - } - pga[i] = &oap->oap_brw_page; - pga[i]->off = oap->oap_obj_off + oap->oap_page_off; - CDEBUG(0, "put page %p index %lu oap %p flg %x to pga\n", - pga[i]->pg, cfs_page_index(oap->oap_page), oap, pga[i]->flag); - i++; - cl_req_page_add(env, clerq, page); - } - - /* always get the data for the obdo for the rpc */ - LASSERT(ops != NULL); - crattr.cra_oa = oa; - crattr.cra_capa = NULL; - cl_req_attr_set(env, clerq, &crattr, ~0ULL); - if (lock) { - oa->o_handle = lock->l_remote_handle; - oa->o_valid |= OBD_MD_FLHANDLE; - } - - rc = cl_req_prep(env, clerq); - if (rc != 0) { - CERROR("cl_req_prep failed: %d\n", rc); - GOTO(out, req = ERR_PTR(rc)); - } - - sort_brw_pages(pga, page_count); - rc = osc_brw_prep_request(cmd, cli, oa, NULL, page_count, - pga, &req, crattr.cra_capa, 1, 0); - if (rc != 0) { - CERROR("prep_req failed: %d\n", rc); - GOTO(out, req = ERR_PTR(rc)); - } - - if (cmd & OBD_BRW_MEMALLOC) - req->rq_memalloc = 1; - - /* Need to update the timestamps after the request is built in case - * we race with setattr (locally or in queue at OST). If OST gets - * later setattr before earlier BRW (as determined by the request xid), - * the OST will not use BRW timestamps. Sadly, there is no obvious - * way to do this in a single call. bug 10150 */ - cl_req_attr_set(env, clerq, &crattr, - OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME); - - CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - CFS_INIT_LIST_HEAD(&aa->aa_oaps); - cfs_list_splice(rpc_list, &aa->aa_oaps); - CFS_INIT_LIST_HEAD(rpc_list); - aa->aa_clerq = clerq; -out: - if (cmd & OBD_BRW_MEMALLOC) - cfs_memory_pressure_restore(mpflag); - - capa_put(crattr.cra_capa); - if (IS_ERR(req)) { - if (oa) - OBDO_FREE(oa); - if (pga) - OBD_FREE(pga, sizeof(*pga) * page_count); - /* this should happen rarely and is pretty bad, it makes the - * pending list not follow the dirty order */ - client_obd_list_lock(&cli->cl_loi_list_lock); - cfs_list_for_each_entry_safe(oap, tmp, rpc_list, oap_rpc_item) { - cfs_list_del_init(&oap->oap_rpc_item); - - /* queued sync pages can be torn down while the pages - * were between the pending list and the rpc */ - if (oap->oap_interrupted) { - CDEBUG(D_INODE, "oap %p interrupted\n", oap); - osc_ap_completion(env, cli, NULL, oap, 0, - oap->oap_count); - continue; - } - osc_ap_completion(env, cli, NULL, oap, 0, PTR_ERR(req)); - } - if (clerq && !IS_ERR(clerq)) - cl_req_completion(env, clerq, PTR_ERR(req)); - } - RETURN(req); -} - -/** - * prepare pages for ASYNC io and put pages in send queue. - * - * \param cmd OBD_BRW_* macroses - * \param lop pending pages - * - * \return zero if no page added to send queue. - * \return 1 if pages successfully added to send queue. - * \return negative on errors. 
- */ -static int -osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli, - struct lov_oinfo *loi, int cmd, - struct loi_oap_pages *lop, pdl_policy_t pol) -{ - struct ptlrpc_request *req; - obd_count page_count = 0; - struct osc_async_page *oap = NULL, *tmp; - struct osc_brw_async_args *aa; - const struct obd_async_page_ops *ops; - CFS_LIST_HEAD(rpc_list); - int srvlock = 0, mem_tight = 0; - struct cl_object *clob = NULL; - obd_off starting_offset = OBD_OBJECT_EOF; - unsigned int ending_offset; - int starting_page_off = 0; - ENTRY; - - /* ASYNC_HP pages first. At present, when the lock the pages is - * to be canceled, the pages covered by the lock will be sent out - * with ASYNC_HP. We have to send out them as soon as possible. */ - cfs_list_for_each_entry_safe(oap, tmp, &lop->lop_urgent, oap_urgent_item) { - if (oap->oap_async_flags & ASYNC_HP) - cfs_list_move(&oap->oap_pending_item, &rpc_list); - else if (!(oap->oap_brw_flags & OBD_BRW_SYNC)) - /* only do this for writeback pages. */ - cfs_list_move_tail(&oap->oap_pending_item, &rpc_list); - if (++page_count >= cli->cl_max_pages_per_rpc) - break; - } - cfs_list_splice_init(&rpc_list, &lop->lop_pending); - page_count = 0; - - /* first we find the pages we're allowed to work with */ - cfs_list_for_each_entry_safe(oap, tmp, &lop->lop_pending, - oap_pending_item) { - ops = oap->oap_caller_ops; - - LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, " - "magic 0x%x\n", oap, oap->oap_magic); - - if (clob == NULL) { - /* pin object in memory, so that completion call-backs - * can be safely called under client_obd_list lock. */ - clob = osc_oap2cl_page(oap)->cp_obj; - cl_object_get(clob); - } - - if (page_count != 0 && - srvlock != !!(oap->oap_brw_flags & OBD_BRW_SRVLOCK)) { - CDEBUG(D_PAGE, "SRVLOCK flag mismatch," - " oap %p, page %p, srvlock %u\n", - oap, oap->oap_brw_page.pg, (unsigned)!srvlock); - break; - } - - /* If there is a gap at the start of this page, it can't merge - * with any previous page, so we'll hand the network a - * "fragmented" page array that it can't transfer in 1 RDMA */ - if (oap->oap_obj_off < starting_offset) { - if (starting_page_off != 0) - break; - - starting_page_off = oap->oap_page_off; - starting_offset = oap->oap_obj_off + starting_page_off; - } else if (oap->oap_page_off != 0) - break; - - /* in llite being 'ready' equates to the page being locked - * until completion unlocks it. commit_write submits a page - * as not ready because its unlock will happen unconditionally - * as the call returns. if we race with commit_write giving - * us that page we don't want to create a hole in the page - * stream, so we stop and leave the rpc to be fired by - * another dirtier or kupdated interval (the not ready page - * will still be on the dirty list). we could call in - * at the end of ll_file_write to process the queue again. */ - if (!(oap->oap_async_flags & ASYNC_READY)) { - int rc = ops->ap_make_ready(env, oap->oap_caller_data, - cmd); - if (rc < 0) - CDEBUG(D_INODE, "oap %p page %p returned %d " - "instead of ready\n", oap, - oap->oap_page, rc); - switch (rc) { - case -EAGAIN: - /* llite is telling us that the page is still - * in commit_write and that we should try - * and put it in an rpc again later. we - * break out of the loop so we don't create - * a hole in the sequence of pages in the rpc - * stream.*/ - oap = NULL; - break; - case -EINTR: - /* the io isn't needed.. 
tell the checks - * below to complete the rpc with EINTR */ - cfs_spin_lock(&oap->oap_lock); - oap->oap_async_flags |= ASYNC_COUNT_STABLE; - cfs_spin_unlock(&oap->oap_lock); - oap->oap_count = -EINTR; - break; - case 0: - cfs_spin_lock(&oap->oap_lock); - oap->oap_async_flags |= ASYNC_READY; - cfs_spin_unlock(&oap->oap_lock); - break; - default: - LASSERTF(0, "oap %p page %p returned %d " - "from make_ready\n", oap, - oap->oap_page, rc); - break; - } - } - if (oap == NULL) - break; - - /* take the page out of our book-keeping */ - cfs_list_del_init(&oap->oap_pending_item); - lop_update_pending(cli, lop, cmd, -1); - cfs_list_del_init(&oap->oap_urgent_item); - - /* ask the caller for the size of the io as the rpc leaves. */ - if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) { - oap->oap_count = - ops->ap_refresh_count(env, oap->oap_caller_data, - cmd); - LASSERT(oap->oap_page_off + oap->oap_count <= CFS_PAGE_SIZE); - } - if (oap->oap_count <= 0) { - CDEBUG(D_CACHE, "oap %p count %d, completing\n", oap, - oap->oap_count); - osc_ap_completion(env, cli, NULL, - oap, 0, oap->oap_count); - continue; - } - - /* now put the page back in our accounting */ - cfs_list_add_tail(&oap->oap_rpc_item, &rpc_list); - if (page_count++ == 0) - srvlock = !!(oap->oap_brw_flags & OBD_BRW_SRVLOCK); - - if (oap->oap_brw_flags & OBD_BRW_MEMALLOC) - mem_tight = 1; - - /* End on a PTLRPC_MAX_BRW_SIZE boundary. We want full-sized - * RPCs aligned on PTLRPC_MAX_BRW_SIZE boundaries to help reads - * have the same alignment as the initial writes that allocated - * extents on the server. */ - ending_offset = oap->oap_obj_off + oap->oap_page_off + - oap->oap_count; - if (!(ending_offset & (PTLRPC_MAX_BRW_SIZE - 1))) - break; - - if (page_count >= cli->cl_max_pages_per_rpc) - break; - - /* If there is a gap at the end of this page, it can't merge - * with any subsequent pages, so we'll hand the network a - * "fragmented" page array that it can't transfer in 1 RDMA */ - if (oap->oap_page_off + oap->oap_count < CFS_PAGE_SIZE) - break; - } - - loi_list_maint(cli, loi); - - client_obd_list_unlock(&cli->cl_loi_list_lock); - - if (clob != NULL) - cl_object_put(env, clob); - - if (page_count == 0) { - client_obd_list_lock(&cli->cl_loi_list_lock); - RETURN(0); - } - - req = osc_build_req(env, cli, &rpc_list, page_count, - mem_tight ? 
(cmd | OBD_BRW_MEMALLOC) : cmd); - if (IS_ERR(req)) { - LASSERT(cfs_list_empty(&rpc_list)); - loi_list_maint(cli, loi); - RETURN(PTR_ERR(req)); - } - - aa = ptlrpc_req_async_args(req); - - starting_offset &= PTLRPC_MAX_BRW_SIZE - 1; - if (cmd == OBD_BRW_READ) { - lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count); - lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight); - lprocfs_oh_tally_log2(&cli->cl_read_offset_hist, - (starting_offset >> CFS_PAGE_SHIFT) + 1); - } else { - lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count); - lprocfs_oh_tally(&cli->cl_write_rpc_hist, - cli->cl_w_in_flight); - lprocfs_oh_tally_log2(&cli->cl_write_offset_hist, - (starting_offset >> CFS_PAGE_SHIFT) + 1); - } - - client_obd_list_lock(&cli->cl_loi_list_lock); - - if (cmd == OBD_BRW_READ) - cli->cl_r_in_flight++; - else - cli->cl_w_in_flight++; - - /* queued sync pages can be torn down while the pages - * were between the pending list and the rpc */ - tmp = NULL; - cfs_list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) { - /* only one oap gets a request reference */ - if (tmp == NULL) - tmp = oap; - if (oap->oap_interrupted && !req->rq_intr) { - CDEBUG(D_INODE, "oap %p in req %p interrupted\n", - oap, req); - ptlrpc_mark_interrupted(req); - } - } - if (tmp != NULL) - tmp->oap_request = ptlrpc_request_addref(req); - - DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %dr/%dw in flight", - page_count, aa, cli->cl_r_in_flight, cli->cl_w_in_flight); - - req->rq_interpret_reply = brw_interpret; - - /* XXX: Maybe the caller can check the RPC bulk descriptor to see which - * CPU/NUMA node the majority of pages were allocated on, and try - * to assign the async RPC to the CPU core (PDL_POLICY_PREFERRED) - * to reduce cross-CPU memory traffic. - * - * But on the other hand, we expect that multiple ptlrpcd threads - * and the initial write sponsor can run in parallel, especially - * when data checksum is enabled, which is CPU-bound operation and - * single ptlrpcd thread cannot process in time. So more ptlrpcd - * threads sharing BRW load (with PDL_POLICY_ROUND) seems better. - */ - ptlrpcd_add_req(req, pol, -1); - RETURN(1); -} - -#define LOI_DEBUG(LOI, STR, args...) \ - CDEBUG(D_INODE, "loi ready %d wr %d:%d rd %d:%d " STR, \ - !cfs_list_empty(&(LOI)->loi_ready_item) || \ - !cfs_list_empty(&(LOI)->loi_hp_ready_item), \ - (LOI)->loi_write_lop.lop_num_pending, \ - !cfs_list_empty(&(LOI)->loi_write_lop.lop_urgent), \ - (LOI)->loi_read_lop.lop_num_pending, \ - !cfs_list_empty(&(LOI)->loi_read_lop.lop_urgent), \ - args) \ - -/* This is called by osc_check_rpcs() to find which objects have pages that - * we could be sending. These lists are maintained by lop_makes_rpc(). */ -struct lov_oinfo *osc_next_loi(struct client_obd *cli) -{ - ENTRY; - - /* First return objects that have blocked locks so that they - * will be flushed quickly and other clients can get the lock, - * then objects which have pages ready to be stuffed into RPCs */ - if (!cfs_list_empty(&cli->cl_loi_hp_ready_list)) - RETURN(cfs_list_entry(cli->cl_loi_hp_ready_list.next, - struct lov_oinfo, loi_hp_ready_item)); - if (!cfs_list_empty(&cli->cl_loi_ready_list)) - RETURN(cfs_list_entry(cli->cl_loi_ready_list.next, - struct lov_oinfo, loi_ready_item)); - - /* then if we have cache waiters, return all objects with queued - * writes. 
This is especially important when many small files - * have filled up the cache and not been fired into rpcs because - * they don't pass the nr_pending/object threshhold */ - if (!cfs_list_empty(&cli->cl_cache_waiters) && - !cfs_list_empty(&cli->cl_loi_write_list)) - RETURN(cfs_list_entry(cli->cl_loi_write_list.next, - struct lov_oinfo, loi_write_item)); - - /* then return all queued objects when we have an invalid import - * so that they get flushed */ - if (cli->cl_import == NULL || cli->cl_import->imp_invalid) { - if (!cfs_list_empty(&cli->cl_loi_write_list)) - RETURN(cfs_list_entry(cli->cl_loi_write_list.next, - struct lov_oinfo, - loi_write_item)); - if (!cfs_list_empty(&cli->cl_loi_read_list)) - RETURN(cfs_list_entry(cli->cl_loi_read_list.next, - struct lov_oinfo, loi_read_item)); - } - RETURN(NULL); -} - -static int osc_max_rpc_in_flight(struct client_obd *cli, struct lov_oinfo *loi) -{ - struct osc_async_page *oap; - int hprpc = 0; - - if (!cfs_list_empty(&loi->loi_write_lop.lop_urgent)) { - oap = cfs_list_entry(loi->loi_write_lop.lop_urgent.next, - struct osc_async_page, oap_urgent_item); - hprpc = !!(oap->oap_async_flags & ASYNC_HP); - } - - if (!hprpc && !cfs_list_empty(&loi->loi_read_lop.lop_urgent)) { - oap = cfs_list_entry(loi->loi_read_lop.lop_urgent.next, - struct osc_async_page, oap_urgent_item); - hprpc = !!(oap->oap_async_flags & ASYNC_HP); - } - - return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + hprpc; -} - -/* called with the loi list lock held */ -static void osc_check_rpcs0(const struct lu_env *env, struct client_obd *cli, int ptlrpc) -{ - struct lov_oinfo *loi; - int rc = 0, race_counter = 0; - pdl_policy_t pol; - ENTRY; - - pol = ptlrpc ? PDL_POLICY_SAME : PDL_POLICY_ROUND; - - while ((loi = osc_next_loi(cli)) != NULL) { - LOI_DEBUG(loi, "%lu in flight\n", rpcs_in_flight(cli)); - - if (osc_max_rpc_in_flight(cli, loi)) - break; - - /* attempt some read/write balancing by alternating between - * reads and writes in an object. The makes_rpc checks here - * would be redundant if we were getting read/write work items - * instead of objects. we don't want send_oap_rpc to drain a - * partial read pending queue when we're given this object to - * do io on writes while there are cache waiters */ - if (lop_makes_rpc(cli, &loi->loi_write_lop, OBD_BRW_WRITE)) { - rc = osc_send_oap_rpc(env, cli, loi, OBD_BRW_WRITE, - &loi->loi_write_lop, pol); - if (rc < 0) { - CERROR("Write request failed with %d\n", rc); - - /* osc_send_oap_rpc failed, mostly because of - * memory pressure. - * - * It can't break here, because if: - * - a page was submitted by osc_io_submit, so - * page locked; - * - no request in flight - * - no subsequent request - * The system will be in live-lock state, - * because there is no chance to call - * osc_io_unplug() and osc_check_rpcs() any - * more. pdflush can't help in this case, - * because it might be blocked at grabbing - * the page lock as we mentioned. - * - * Anyway, continue to drain pages. 
*/ - /* break; */ - } - - if (rc > 0) - race_counter = 0; - else if (rc == 0) - race_counter++; - } - if (lop_makes_rpc(cli, &loi->loi_read_lop, OBD_BRW_READ)) { - rc = osc_send_oap_rpc(env, cli, loi, OBD_BRW_READ, - &loi->loi_read_lop, pol); - if (rc < 0) - CERROR("Read request failed with %d\n", rc); - - if (rc > 0) - race_counter = 0; - else if (rc == 0) - race_counter++; - } - - /* attempt some inter-object balancing by issuing rpcs - * for each object in turn */ - if (!cfs_list_empty(&loi->loi_hp_ready_item)) - cfs_list_del_init(&loi->loi_hp_ready_item); - if (!cfs_list_empty(&loi->loi_ready_item)) - cfs_list_del_init(&loi->loi_ready_item); - if (!cfs_list_empty(&loi->loi_write_item)) - cfs_list_del_init(&loi->loi_write_item); - if (!cfs_list_empty(&loi->loi_read_item)) - cfs_list_del_init(&loi->loi_read_item); - - loi_list_maint(cli, loi); - - /* send_oap_rpc fails with 0 when make_ready tells it to - * back off. llite's make_ready does this when it tries - * to lock a page queued for write that is already locked. - * we want to try sending rpcs from many objects, but we - * don't want to spin failing with 0. */ - if (race_counter == 10) - break; - } -} - -void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli) -{ - osc_check_rpcs0(env, cli, 0); -} - -/** - * Non-blocking version of osc_enter_cache() that consumes grant only when it - * is available. - */ -int osc_enter_cache_try(const struct lu_env *env, - struct client_obd *cli, struct lov_oinfo *loi, - struct osc_async_page *oap, int transient) -{ - int has_grant; - - has_grant = cli->cl_avail_grant >= CFS_PAGE_SIZE; - if (has_grant) { - osc_consume_write_grant(cli, &oap->oap_brw_page); - if (transient) { - cli->cl_dirty_transit += CFS_PAGE_SIZE; - cfs_atomic_inc(&obd_dirty_transit_pages); - oap->oap_brw_flags |= OBD_BRW_NOCACHE; - } - } - return has_grant; -} - -/* Caller must hold loi_list_lock - we drop/regain it if we need to wait for - * grant or cache space. */ -static int osc_enter_cache(const struct lu_env *env, - struct client_obd *cli, struct lov_oinfo *loi, - struct osc_async_page *oap) -{ - struct osc_cache_waiter ocw; - struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); - int rc = -EDQUOT; - ENTRY; - - CDEBUG(D_CACHE, "dirty: %ld/%d dirty_max: %ld/%d dropped: %lu " - "grant: %lu\n", cli->cl_dirty, cfs_atomic_read(&obd_dirty_pages), - cli->cl_dirty_max, obd_max_dirty_pages, - cli->cl_lost_grant, cli->cl_avail_grant); - - /* force the caller to try sync io. this can jump the list - * of queued writes and create a discontiguous rpc stream */ - if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) || - cli->cl_dirty_max < CFS_PAGE_SIZE || - cli->cl_ar.ar_force_sync || loi->loi_ar.ar_force_sync) - RETURN(-EDQUOT); - - /* Hopefully normal case - cache space and write credits available */ - if (cli->cl_dirty + CFS_PAGE_SIZE <= cli->cl_dirty_max && - cfs_atomic_read(&obd_dirty_pages) + 1 <= obd_max_dirty_pages && - osc_enter_cache_try(env, cli, loi, oap, 0)) - RETURN(0); - - /* We can get here for two reasons: too many dirty pages in cache, or - * run out of grants. In both cases we should write dirty pages out. - * Adding a cache waiter will trigger urgent write-out no matter what - * RPC size will be. - * The exiting condition is no avail grants and no dirty pages caching, - * that really means there is no space on the OST. 
*/ - cfs_waitq_init(&ocw.ocw_waitq); - ocw.ocw_oap = oap; - while (cli->cl_dirty > 0) { - cfs_list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters); - ocw.ocw_rc = 0; - - loi_list_maint(cli, loi); - osc_check_rpcs(env, cli); - client_obd_list_unlock(&cli->cl_loi_list_lock); - - CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n", - cli->cl_import->imp_obd->obd_name, &ocw, oap); - - rc = l_wait_event(ocw.ocw_waitq, cfs_list_empty(&ocw.ocw_entry), &lwi); - - client_obd_list_lock(&cli->cl_loi_list_lock); - cfs_list_del_init(&ocw.ocw_entry); - if (rc < 0) - break; - - rc = ocw.ocw_rc; - if (rc != -EDQUOT) - break; - } - - RETURN(rc); -} - - -int osc_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm, - struct lov_oinfo *loi, cfs_page_t *page, - obd_off offset, const struct obd_async_page_ops *ops, - void *data, void **res, int nocache, - struct lustre_handle *lockh) -{ - struct osc_async_page *oap; - - ENTRY; - - if (!page) - return cfs_size_round(sizeof(*oap)); - - oap = *res; - oap->oap_magic = OAP_MAGIC; - oap->oap_cli = &exp->exp_obd->u.cli; - oap->oap_loi = loi; - - oap->oap_caller_ops = ops; - oap->oap_caller_data = data; - - oap->oap_page = page; - oap->oap_obj_off = offset; - if (!client_is_remote(exp) && - cfs_capable(CFS_CAP_SYS_RESOURCE)) - oap->oap_brw_flags = OBD_BRW_NOQUOTA; - - LASSERT(!(offset & ~CFS_PAGE_MASK)); - - CFS_INIT_LIST_HEAD(&oap->oap_pending_item); - CFS_INIT_LIST_HEAD(&oap->oap_urgent_item); - CFS_INIT_LIST_HEAD(&oap->oap_rpc_item); - CFS_INIT_LIST_HEAD(&oap->oap_page_list); - - cfs_spin_lock_init(&oap->oap_lock); - CDEBUG(D_CACHE, "oap %p page %p obj off "LPU64"\n", oap, page, offset); - RETURN(0); -} - -int osc_queue_async_io(const struct lu_env *env, struct obd_export *exp, - struct lov_stripe_md *lsm, struct lov_oinfo *loi, - struct osc_async_page *oap, int cmd, int off, - int count, obd_flag brw_flags, enum async_flags async_flags) -{ - struct client_obd *cli = &exp->exp_obd->u.cli; - int rc = 0; - ENTRY; - - if (oap->oap_magic != OAP_MAGIC) - RETURN(-EINVAL); - - if (cli->cl_import == NULL || cli->cl_import->imp_invalid) - RETURN(-EIO); - - if (!cfs_list_empty(&oap->oap_pending_item) || - !cfs_list_empty(&oap->oap_urgent_item) || - !cfs_list_empty(&oap->oap_rpc_item)) - RETURN(-EBUSY); - - /* check if the file's owner/group is over quota */ - if ((cmd & OBD_BRW_WRITE) && !(cmd & OBD_BRW_NOQUOTA)) { - struct cl_object *obj; - struct cl_attr attr; /* XXX put attr into thread info */ - unsigned int qid[MAXQUOTAS]; - - obj = cl_object_top(osc_oap2cl_page(oap)->cp_obj); - - cl_object_attr_lock(obj); - rc = cl_object_attr_get(env, obj, &attr); - cl_object_attr_unlock(obj); - - qid[USRQUOTA] = attr.cat_uid; - qid[GRPQUOTA] = attr.cat_gid; - if (rc == 0 && - osc_quota_chkdq(cli, qid) == NO_QUOTA) - rc = -EDQUOT; - if (rc) - RETURN(rc); - } - - if (loi == NULL) - loi = lsm->lsm_oinfo[0]; - - client_obd_list_lock(&cli->cl_loi_list_lock); - - LASSERT(off + count <= CFS_PAGE_SIZE); - oap->oap_cmd = cmd; - oap->oap_page_off = off; - oap->oap_count = count; - oap->oap_brw_flags = brw_flags; - /* Give a hint to OST that requests are coming from kswapd - bug19529 */ - if (cfs_memory_pressure_get()) - oap->oap_brw_flags |= OBD_BRW_MEMALLOC; - cfs_spin_lock(&oap->oap_lock); - oap->oap_async_flags = async_flags; - cfs_spin_unlock(&oap->oap_lock); - - if (cmd & OBD_BRW_WRITE) { - rc = osc_enter_cache(env, cli, loi, oap); - if (rc) { - client_obd_list_unlock(&cli->cl_loi_list_lock); - RETURN(rc); - } - } - - LOI_DEBUG(loi, "oap %p page %p added for cmd 
%d\n", oap, oap->oap_page, - cmd); - - osc_oap_to_pending(oap); - loi_list_maint(cli, loi); - if (!osc_max_rpc_in_flight(cli, loi) && - lop_makes_rpc(cli, &loi->loi_write_lop, OBD_BRW_WRITE)) { - LASSERT(cli->cl_writeback_work != NULL); - rc = ptlrpcd_queue_work(cli->cl_writeback_work); - - CDEBUG(D_CACHE, "Queued writeback work for client obd %p/%d.\n", - cli, rc); - } - client_obd_list_unlock(&cli->cl_loi_list_lock); - - RETURN(0); -} - -/* aka (~was & now & flag), but this is more clear :) */ -#define SETTING(was, now, flag) (!(was & flag) && (now & flag)) - -int osc_set_async_flags_base(struct client_obd *cli, - struct lov_oinfo *loi, struct osc_async_page *oap, - obd_flag async_flags) -{ - struct loi_oap_pages *lop; - int flags = 0; - ENTRY; - - LASSERT(!cfs_list_empty(&oap->oap_pending_item)); - - if (oap->oap_cmd & OBD_BRW_WRITE) { - lop = &loi->loi_write_lop; - } else { - lop = &loi->loi_read_lop; - } - - if ((oap->oap_async_flags & async_flags) == async_flags) - RETURN(0); - - if (SETTING(oap->oap_async_flags, async_flags, ASYNC_READY)) - flags |= ASYNC_READY; - - if (SETTING(oap->oap_async_flags, async_flags, ASYNC_URGENT) && - cfs_list_empty(&oap->oap_rpc_item)) { - if (oap->oap_async_flags & ASYNC_HP) - cfs_list_add(&oap->oap_urgent_item, &lop->lop_urgent); - else - cfs_list_add_tail(&oap->oap_urgent_item, - &lop->lop_urgent); - flags |= ASYNC_URGENT; - loi_list_maint(cli, loi); - } - cfs_spin_lock(&oap->oap_lock); - oap->oap_async_flags |= flags; - cfs_spin_unlock(&oap->oap_lock); - - LOI_DEBUG(loi, "oap %p page %p has flags %x\n", oap, oap->oap_page, - oap->oap_async_flags); - RETURN(0); -} - -int osc_teardown_async_page(struct obd_export *exp, struct lov_stripe_md *lsm, - struct lov_oinfo *loi, struct osc_async_page *oap) -{ - struct client_obd *cli = &exp->exp_obd->u.cli; - struct loi_oap_pages *lop; - int rc = 0; - ENTRY; - - if (oap->oap_magic != OAP_MAGIC) - RETURN(-EINVAL); - - if (loi == NULL) - loi = lsm->lsm_oinfo[0]; - - if (oap->oap_cmd & OBD_BRW_WRITE) { - lop = &loi->loi_write_lop; - } else { - lop = &loi->loi_read_lop; - } - - client_obd_list_lock(&cli->cl_loi_list_lock); - - if (!cfs_list_empty(&oap->oap_rpc_item)) - GOTO(out, rc = -EBUSY); - - osc_exit_cache(cli, oap, 0); - osc_wake_cache_waiters(cli); - - if (!cfs_list_empty(&oap->oap_urgent_item)) { - cfs_list_del_init(&oap->oap_urgent_item); - cfs_spin_lock(&oap->oap_lock); - oap->oap_async_flags &= ~(ASYNC_URGENT | ASYNC_HP); - cfs_spin_unlock(&oap->oap_lock); - } - if (!cfs_list_empty(&oap->oap_pending_item)) { - cfs_list_del_init(&oap->oap_pending_item); - lop_update_pending(cli, lop, oap->oap_cmd, -1); - } - loi_list_maint(cli, loi); - LOI_DEBUG(loi, "oap %p page %p torn down\n", oap, oap->oap_page); -out: - client_obd_list_unlock(&cli->cl_loi_list_lock); - RETURN(rc); -} - -static int osc_set_lock_data_with_check(struct ldlm_lock *lock, - struct ldlm_enqueue_info *einfo) -{ - void *data = einfo->ei_cbdata; - int set = 0; - - LASSERT(lock != NULL); - LASSERT(lock->l_blocking_ast == einfo->ei_cb_bl); - LASSERT(lock->l_resource->lr_type == einfo->ei_type); - LASSERT(lock->l_completion_ast == einfo->ei_cb_cp); - LASSERT(lock->l_glimpse_ast == einfo->ei_cb_gl); - - lock_res_and_lock(lock); - cfs_spin_lock(&osc_ast_guard); - - if (lock->l_ast_data == NULL) - lock->l_ast_data = data; - if (lock->l_ast_data == data) - set = 1; - - cfs_spin_unlock(&osc_ast_guard); - unlock_res_and_lock(lock); - - return set; -} - -static int osc_set_data_with_check(struct lustre_handle *lockh, - struct ldlm_enqueue_info *einfo) 
-{ - struct ldlm_lock *lock = ldlm_handle2lock(lockh); - int set = 0; + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + int set = 0; if (lock != NULL) { set = osc_set_lock_data_with_check(lock, einfo); @@ -3170,7 +2146,7 @@ static int osc_change_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm, struct ldlm_res_id res_id; struct obd_device *obd = class_exp2obd(exp); - osc_build_res_name(lsm->lsm_object_id, lsm->lsm_object_seq, &res_id); + ostid_build_res_name(&lsm->lsm_oi, &res_id); ldlm_resource_iterate(obd->obd_namespace, &res_id, replace, data); return 0; } @@ -3186,7 +2162,7 @@ static int osc_find_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm, struct obd_device *obd = class_exp2obd(exp); int rc = 0; - osc_build_res_name(lsm->lsm_object_id, lsm->lsm_object_seq, &res_id); + ostid_build_res_name(&lsm->lsm_oi, &res_id); rc = ldlm_resource_iterate(obd->obd_namespace, &res_id, replace, data); if (rc == LDLM_ITER_STOP) return(1); @@ -3197,7 +2173,7 @@ static int osc_find_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm, static int osc_enqueue_fini(struct ptlrpc_request *req, struct ost_lvb *lvb, obd_enqueue_update_f upcall, void *cookie, - int *flags, int agl, int rc) + __u64 *flags, int agl, int rc) { int intent = *flags & LDLM_FL_HAS_INTENT; ENTRY; @@ -3210,6 +2186,8 @@ static int osc_enqueue_fini(struct ptlrpc_request *req, struct ost_lvb *lvb, &RMF_DLM_REP); LASSERT(rep != NULL); + rep->lock_policy_res1 = + ptlrpc_status_ntoh(rep->lock_policy_res1); if (rep->lock_policy_res1) rc = rep->lock_policy_res1; } @@ -3236,7 +2214,7 @@ static int osc_enqueue_interpret(const struct lu_env *env, __u32 mode; struct ost_lvb *lvb; __u32 lvb_len; - int *flags = aa->oa_flags; + __u64 *flags = aa->oa_flags; /* Make a local copy of a lock handle and a mode, because aa->oa_* * might be freed anytime after lock upcall has been called. */ @@ -3253,6 +2231,9 @@ static int osc_enqueue_interpret(const struct lu_env *env, * osc_enqueue_fini(). */ ldlm_lock_addref(&handle, mode); + /* Let cl_lock_state_wait fail with -ERESTARTSYS to unuse sublocks. */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_HANG, 2); + /* Let CP AST to grant the lock first. */ OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1); @@ -3289,51 +2270,6 @@ static int osc_enqueue_interpret(const struct lu_env *env, return rc; } -void osc_update_enqueue(struct lustre_handle *lov_lockhp, - struct lov_oinfo *loi, int flags, - struct ost_lvb *lvb, __u32 mode, int rc) -{ - struct ldlm_lock *lock = ldlm_handle2lock(lov_lockhp); - - if (rc == ELDLM_OK) { - __u64 tmp; - - LASSERT(lock != NULL); - loi->loi_lvb = *lvb; - tmp = loi->loi_lvb.lvb_size; - /* Extend KMS up to the end of this lock and no further - * A lock on [x,y] means a KMS of up to y + 1 bytes! 
*/ - if (tmp > lock->l_policy_data.l_extent.end) - tmp = lock->l_policy_data.l_extent.end + 1; - if (tmp >= loi->loi_kms) { - LDLM_DEBUG(lock, "lock acquired, setting rss="LPU64 - ", kms="LPU64, loi->loi_lvb.lvb_size, tmp); - loi_kms_set(loi, tmp); - } else { - LDLM_DEBUG(lock, "lock acquired, setting rss=" - LPU64"; leaving kms="LPU64", end="LPU64, - loi->loi_lvb.lvb_size, loi->loi_kms, - lock->l_policy_data.l_extent.end); - } - ldlm_lock_allow_match(lock); - } else if (rc == ELDLM_LOCK_ABORTED && (flags & LDLM_FL_HAS_INTENT)) { - LASSERT(lock != NULL); - loi->loi_lvb = *lvb; - ldlm_lock_allow_match(lock); - CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving" - " kms="LPU64"\n", loi->loi_lvb.lvb_size, loi->loi_kms); - rc = ELDLM_OK; - } - - if (lock != NULL) { - if (rc != ELDLM_OK) - ldlm_lock_fail_match(lock); - - LDLM_LOCK_PUT(lock); - } -} -EXPORT_SYMBOL(osc_update_enqueue); - struct ptlrpc_request_set *PTLRPCD_SET = (void *)1; /* When enqueuing asynchronously, locks are not ordered, we can obtain a lock @@ -3344,20 +2280,20 @@ struct ptlrpc_request_set *PTLRPCD_SET = (void *)1; * is excluded from the cluster -- such scenarious make the life difficult, so * release locks just after they are obtained. */ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, - int *flags, ldlm_policy_data_t *policy, - struct ost_lvb *lvb, int kms_valid, - obd_enqueue_update_f upcall, void *cookie, - struct ldlm_enqueue_info *einfo, - struct lustre_handle *lockh, - struct ptlrpc_request_set *rqset, int async, int agl) -{ - struct obd_device *obd = exp->exp_obd; - struct ptlrpc_request *req = NULL; - int intent = *flags & LDLM_FL_HAS_INTENT; - int match_lvb = (agl != 0 ? 0 : LDLM_FL_LVB_READY); - ldlm_mode_t mode; - int rc; - ENTRY; + __u64 *flags, ldlm_policy_data_t *policy, + struct ost_lvb *lvb, int kms_valid, + obd_enqueue_update_f upcall, void *cookie, + struct ldlm_enqueue_info *einfo, + struct lustre_handle *lockh, + struct ptlrpc_request_set *rqset, int async, int agl) +{ + struct obd_device *obd = exp->exp_obd; + struct ptlrpc_request *req = NULL; + int intent = *flags & LDLM_FL_HAS_INTENT; + __u64 match_lvb = (agl != 0 ? 0 : LDLM_FL_LVB_READY); + ldlm_mode_t mode; + int rc; + ENTRY; /* Filesystem lock extents are extended to page boundaries so that * dealing with the page cache is a little smoother. */ @@ -3393,7 +2329,7 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, if (mode) { struct ldlm_lock *matched = ldlm_handle2lock(lockh); - if ((agl != 0) && !(matched->l_flags & LDLM_FL_LVB_READY)) { + if ((agl != 0) && !ldlm_is_lvb_ready(matched)) { /* For AGL, if enqueue RPC is sent but the lock is not * granted, then skip to process this strpe. * Return -ECANCELED to tell the caller. */ @@ -3412,7 +2348,10 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, * are explained in lov_enqueue() */ } - /* We already have a lock, and it's referenced */ + /* We already have a lock, and it's referenced. + * + * At this point, the cl_lock::cll_state is CLS_QUEUING, + * AGL upcall may change it to CLS_HELD directly. 
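+         * No enqueue RPC is sent on this path; the upcall below runs
+         * immediately with ELDLM_OK.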
*/ (*upcall)(cookie, ELDLM_OK); if (einfo->ei_mode != mode) @@ -3430,14 +2369,13 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, no_match: if (intent) { - CFS_LIST_HEAD(cancels); - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_LDLM_ENQUEUE_LVB); - if (req == NULL) - RETURN(-ENOMEM); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_ENQUEUE_LVB); + if (req == NULL) + RETURN(-ENOMEM); - rc = ldlm_prep_enqueue_req(exp, req, &cancels, 0); - if (rc) { + rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_ENQUEUE); + if (rc < 0) { ptlrpc_request_free(req); RETURN(rc); } @@ -3451,7 +2389,7 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, *flags &= ~LDLM_FL_BLOCK_GRANTED; rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb, - sizeof(*lvb), lockh, async); + sizeof(*lvb), LVB_T_OST, lockh, async); if (rqset) { if (!rc) { struct osc_enqueue_args *aa; @@ -3485,34 +2423,15 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, RETURN(rc); } -static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo, - struct ldlm_enqueue_info *einfo, - struct ptlrpc_request_set *rqset) -{ - struct ldlm_res_id res_id; - int rc; - ENTRY; - - osc_build_res_name(oinfo->oi_md->lsm_object_id, - oinfo->oi_md->lsm_object_seq, &res_id); - - rc = osc_enqueue_base(exp, &res_id, &oinfo->oi_flags, &oinfo->oi_policy, - &oinfo->oi_md->lsm_oinfo[0]->loi_lvb, - oinfo->oi_md->lsm_oinfo[0]->loi_kms_valid, - oinfo->oi_cb_up, oinfo, einfo, oinfo->oi_lockh, - rqset, rqset != NULL, 0); - RETURN(rc); -} - int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, - __u32 type, ldlm_policy_data_t *policy, __u32 mode, - int *flags, void *data, struct lustre_handle *lockh, - int unref) + __u32 type, ldlm_policy_data_t *policy, __u32 mode, + __u64 *flags, void *data, struct lustre_handle *lockh, + int unref) { - struct obd_device *obd = exp->exp_obd; - int lflags = *flags; - ldlm_mode_t rc; - ENTRY; + struct obd_device *obd = exp->exp_obd; + __u64 lflags = *flags; + ldlm_mode_t rc; + ENTRY; if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH)) RETURN(-EIO); @@ -3560,36 +2479,11 @@ int osc_cancel_base(struct lustre_handle *lockh, __u32 mode) RETURN(0); } -static int osc_cancel(struct obd_export *exp, struct lov_stripe_md *md, - __u32 mode, struct lustre_handle *lockh) -{ - ENTRY; - RETURN(osc_cancel_base(lockh, mode)); -} - -static int osc_cancel_unused(struct obd_export *exp, - struct lov_stripe_md *lsm, - ldlm_cancel_flags_t flags, - void *opaque) -{ - struct obd_device *obd = class_exp2obd(exp); - struct ldlm_res_id res_id, *resp = NULL; - - if (lsm != NULL) { - resp = osc_build_res_name(lsm->lsm_object_id, - lsm->lsm_object_seq, &res_id); - } - - return ldlm_cli_cancel_unused(obd->obd_namespace, resp, flags, opaque); -} - static int osc_statfs_interpret(const struct lu_env *env, struct ptlrpc_request *req, struct osc_async_args *aa, int rc) { - struct client_obd *cli = &req->rq_import->imp_obd->u.cli; struct obd_statfs *msfs; - __u64 used; ENTRY; if (rc == -EBADR) @@ -3612,60 +2506,17 @@ static int osc_statfs_interpret(const struct lu_env *env, GOTO(out, rc = -EPROTO); } - /* Reinitialize the RDONLY and DEGRADED flags at the client - * on each statfs, so they don't stay set permanently. 
*/ - cfs_spin_lock(&cli->cl_oscc.oscc_lock); - - if (unlikely(msfs->os_state & OS_STATE_DEGRADED)) - cli->cl_oscc.oscc_flags |= OSCC_FLAG_DEGRADED; - else if (unlikely(cli->cl_oscc.oscc_flags & OSCC_FLAG_DEGRADED)) - cli->cl_oscc.oscc_flags &= ~OSCC_FLAG_DEGRADED; - - if (unlikely(msfs->os_state & OS_STATE_READONLY)) - cli->cl_oscc.oscc_flags |= OSCC_FLAG_RDONLY; - else if (unlikely(cli->cl_oscc.oscc_flags & OSCC_FLAG_RDONLY)) - cli->cl_oscc.oscc_flags &= ~OSCC_FLAG_RDONLY; - - /* Add a bit of hysteresis so this flag isn't continually flapping, - * and ensure that new files don't get extremely fragmented due to - * only a small amount of available space in the filesystem. - * We want to set the NOSPC flag when there is less than ~0.1% free - * and clear it when there is at least ~0.2% free space, so: - * avail < ~0.1% max max = avail + used - * 1025 * avail < avail + used used = blocks - free - * 1024 * avail < used - * 1024 * avail < blocks - free - * avail < ((blocks - free) >> 10) - * - * On very large disk, say 16TB 0.1% will be 16 GB. We don't want to - * lose that amount of space so in those cases we report no space left - * if their is less than 1 GB left. */ - used = min_t(__u64,(msfs->os_blocks - msfs->os_bfree) >> 10, 1 << 30); - if (unlikely(((cli->cl_oscc.oscc_flags & OSCC_FLAG_NOSPC) == 0) && - ((msfs->os_ffree < 32) || (msfs->os_bavail < used)))) - cli->cl_oscc.oscc_flags |= OSCC_FLAG_NOSPC; - else if (unlikely(((cli->cl_oscc.oscc_flags & OSCC_FLAG_NOSPC) != 0) && - (msfs->os_ffree > 64) && - (msfs->os_bavail > (used << 1)))) { - cli->cl_oscc.oscc_flags &= ~(OSCC_FLAG_NOSPC | - OSCC_FLAG_NOSPC_BLK); - } - - if (unlikely(((cli->cl_oscc.oscc_flags & OSCC_FLAG_NOSPC) != 0) && - (msfs->os_bavail < used))) - cli->cl_oscc.oscc_flags |= OSCC_FLAG_NOSPC_BLK; - - cfs_spin_unlock(&cli->cl_oscc.oscc_lock); - *aa->aa_oi->oi_osfs = *msfs; out: rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc); RETURN(rc); } -static int osc_statfs_async(struct obd_device *obd, struct obd_info *oinfo, - __u64 max_age, struct ptlrpc_request_set *rqset) +static int osc_statfs_async(struct obd_export *exp, + struct obd_info *oinfo, __u64 max_age, + struct ptlrpc_request_set *rqset) { + struct obd_device *obd = class_exp2obd(exp); struct ptlrpc_request *req; struct osc_async_args *aa; int rc; @@ -3705,9 +2556,10 @@ static int osc_statfs_async(struct obd_device *obd, struct obd_info *oinfo, RETURN(0); } -static int osc_statfs(struct obd_device *obd, struct obd_statfs *osfs, - __u64 max_age, __u32 flags) +static int osc_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, __u32 flags) { + struct obd_device *obd = class_exp2obd(exp); struct obd_statfs *msfs; struct ptlrpc_request *req; struct obd_import *imp = NULL; @@ -3716,10 +2568,10 @@ static int osc_statfs(struct obd_device *obd, struct obd_statfs *osfs, /*Since the request might also come from lprocfs, so we need *sync this with client_disconnect_export Bug15684*/ - cfs_down_read(&obd->u.cli.cl_sem); + down_read(&obd->u.cli.cl_sem); if (obd->u.cli.cl_import) imp = class_import_get(obd->u.cli.cl_import); - cfs_up_read(&obd->u.cli.cl_sem); + up_read(&obd->u.cli.cl_sem); if (!imp) RETURN(-ENODEV); @@ -3768,70 +2620,6 @@ static int osc_statfs(struct obd_device *obd, struct obd_statfs *osfs, return rc; } -/* Retrieve object striping information. - * - * @lmmu is a pointer to an in-core struct with lmm_ost_count indicating - * the maximum number of OST indices which will fit in the user buffer. 
- * lmm_magic must be LOV_MAGIC (we only use 1 slot here).
- */
-static int osc_getstripe(struct lov_stripe_md *lsm, struct lov_user_md *lump)
-{
-       /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */
-       struct lov_user_md_v3 lum, *lumk;
-       struct lov_user_ost_data_v1 *lmm_objects;
-       int rc = 0, lum_size;
-       ENTRY;
-
-       if (!lsm)
-               RETURN(-ENODATA);
-
-       /* we only need the header part from user space to get lmm_magic and
-        * lmm_stripe_count, (the header part is common to v1 and v3) */
-       lum_size = sizeof(struct lov_user_md_v1);
-       if (cfs_copy_from_user(&lum, lump, lum_size))
-               RETURN(-EFAULT);
-
-       if ((lum.lmm_magic != LOV_USER_MAGIC_V1) &&
-           (lum.lmm_magic != LOV_USER_MAGIC_V3))
-               RETURN(-EINVAL);
-
-       /* lov_user_md_vX and lov_mds_md_vX must have the same size */
-       LASSERT(sizeof(struct lov_user_md_v1) == sizeof(struct lov_mds_md_v1));
-       LASSERT(sizeof(struct lov_user_md_v3) == sizeof(struct lov_mds_md_v3));
-       LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lumk->lmm_objects[0]));
-
-       /* we can use lov_mds_md_size() to compute lum_size
-        * because lov_user_md_vX and lov_mds_md_vX have the same size */
-       if (lum.lmm_stripe_count > 0) {
-               lum_size = lov_mds_md_size(lum.lmm_stripe_count, lum.lmm_magic);
-               OBD_ALLOC(lumk, lum_size);
-               if (!lumk)
-                       RETURN(-ENOMEM);
-
-               if (lum.lmm_magic == LOV_USER_MAGIC_V1)
-                       lmm_objects = &(((struct lov_user_md_v1 *)lumk)->lmm_objects[0]);
-               else
-                       lmm_objects = &(lumk->lmm_objects[0]);
-               lmm_objects->l_object_id = lsm->lsm_object_id;
-       } else {
-               lum_size = lov_mds_md_size(0, lum.lmm_magic);
-               lumk = &lum;
-       }
-
-       lumk->lmm_object_id = lsm->lsm_object_id;
-       lumk->lmm_object_seq = lsm->lsm_object_seq;
-       lumk->lmm_stripe_count = 1;
-
-       if (cfs_copy_to_user(lump, lumk, lum_size))
-               rc = -EFAULT;
-
-       if (lumk != &lum)
-               OBD_FREE(lumk, lum_size);
-
-       RETURN(rc);
-}
-
-
 static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                          void *karg, void *uarg)
 {
@@ -3840,58 +2628,12 @@ static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
        struct obd_device *obd = exp->exp_obd;
        struct obd_ioctl_data *data = karg;
        int err = 0;
        ENTRY;

-       if (!cfs_try_module_get(THIS_MODULE)) {
-               CERROR("Can't get module. Is it alive?");
-               return -EINVAL;
-       }
+       if (!try_module_get(THIS_MODULE)) {
+               CERROR("%s: cannot get module '%s'\n", obd->obd_name,
+                      module_name(THIS_MODULE));
+               return -EINVAL;
+       }
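The rewritten guard uses the stock kernel pair try_module_get()/module_put() to pin the module across the whole ioctl; a minimal sketch of the pattern, with the dispatch body elided and an illustrative function name:

/* Minimal sketch of the module-pinning pattern above; the real handler
 * dispatches on cmd between the get and the put. */
#include <linux/module.h>

static int example_iocontrol(unsigned int cmd)
{
        int err = 0;

        if (!try_module_get(THIS_MODULE))       /* module is unloading */
                return -EINVAL;
        /* ... handle cmd ... */
        module_put(THIS_MODULE);
        return err;
}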
        switch (cmd) {
-       case OBD_IOC_LOV_GET_CONFIG: {
-               char *buf;
-               struct lov_desc *desc;
-               struct obd_uuid uuid;
-
-               buf = NULL;
-               len = 0;
-               if (obd_ioctl_getdata(&buf, &len, (void *)uarg))
-                       GOTO(out, err = -EINVAL);
-
-               data = (struct obd_ioctl_data *)buf;
-
-               if (sizeof(*desc) > data->ioc_inllen1) {
-                       obd_ioctl_freedata(buf, len);
-                       GOTO(out, err = -EINVAL);
-               }
-
-               if (data->ioc_inllen2 < sizeof(uuid)) {
-                       obd_ioctl_freedata(buf, len);
-                       GOTO(out, err = -EINVAL);
-               }
-
-               desc = (struct lov_desc *)data->ioc_inlbuf1;
-               desc->ld_tgt_count = 1;
-               desc->ld_active_tgt_count = 1;
-               desc->ld_default_stripe_count = 1;
-               desc->ld_default_stripe_size = 0;
-               desc->ld_default_stripe_offset = 0;
-               desc->ld_pattern = 0;
-               memcpy(&desc->ld_uuid, &obd->obd_uuid, sizeof(uuid));
-
-               memcpy(data->ioc_inlbuf2, &obd->obd_uuid, sizeof(uuid));
-
-               err = cfs_copy_to_user((void *)uarg, buf, len);
-               if (err)
-                       err = -EFAULT;
-               obd_ioctl_freedata(buf, len);
-               GOTO(out, err);
-       }
-       case LL_IOC_LOV_SETSTRIPE:
-               err = obd_alloc_memmd(exp, karg);
-               if (err > 0)
-                       err = 0;
-               GOTO(out, err);
-       case LL_IOC_LOV_GETSTRIPE:
-               err = osc_getstripe(karg, uarg);
-               GOTO(out, err);
        case OBD_IOC_CLIENT_RECOVER:
                err = ptlrpc_recover_import(obd->u.cli.cl_import,
                                            data->ioc_inlbuf1, 0);
@@ -3908,18 +2650,18 @@ static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
        case OBD_IOC_PING_TARGET:
                err = ptlrpc_obd_ping(obd);
                GOTO(out, err);
-       default:
-               CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n",
-                      cmd, cfs_curproc_comm());
-               GOTO(out, err = -ENOTTY);
-       }
+       default:
+               CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n",
+                      cmd, current_comm());
+               GOTO(out, err = -ENOTTY);
+       }
 out:
-       cfs_module_put(THIS_MODULE);
-       return err;
+       module_put(THIS_MODULE);
+       return err;
 }

-static int osc_get_info(struct obd_export *exp, obd_count keylen,
-                        void *key, __u32 *vallen, void *val,
+static int osc_get_info(const struct lu_env *env, struct obd_export *exp,
+                        obd_count keylen, void *key, __u32 *vallen, void *val,
                         struct lov_stripe_md *lsm)
 {
        ENTRY;
@@ -3968,15 +2710,52 @@ static int osc_get_info(struct obd_export *exp, obd_count keylen,
                ptlrpc_req_finished(req);
                RETURN(rc);
        } else if (KEY_IS(KEY_FIEMAP)) {
-               struct ptlrpc_request *req;
-               struct ll_user_fiemap *reply;
-               char *tmp;
-               int rc;
-
+               struct ll_fiemap_info_key *fm_key =
+                               (struct ll_fiemap_info_key *)key;
+               struct ldlm_res_id       res_id;
+               ldlm_policy_data_t       policy;
+               struct lustre_handle     lockh;
+               ldlm_mode_t              mode = 0;
+               struct ptlrpc_request   *req;
+               struct ll_user_fiemap   *reply;
+               char                    *tmp;
+               int                      rc;
+
+               if (!(fm_key->fiemap.fm_flags & FIEMAP_FLAG_SYNC))
+                       goto skip_locking;
+
+               policy.l_extent.start = fm_key->fiemap.fm_start &
+                                               CFS_PAGE_MASK;
+
+               if (OBD_OBJECT_EOF - fm_key->fiemap.fm_length <=
+                   fm_key->fiemap.fm_start + PAGE_CACHE_SIZE - 1)
+                       policy.l_extent.end = OBD_OBJECT_EOF;
+               else
+                       policy.l_extent.end = (fm_key->fiemap.fm_start +
+                               fm_key->fiemap.fm_length +
+                               PAGE_CACHE_SIZE - 1) & CFS_PAGE_MASK;
+
+               ostid_build_res_name(&fm_key->oa.o_oi, &res_id);
+               mode = ldlm_lock_match(exp->exp_obd->obd_namespace,
+                                      LDLM_FL_BLOCK_GRANTED |
+                                      LDLM_FL_LVB_READY,
+                                      &res_id, LDLM_EXTENT, &policy,
+                                      LCK_PR | LCK_PW, &lockh, 0);
+               if (mode) { /* lock is cached on client */
+                       if (mode != LCK_PR) {
+                               ldlm_lock_addref(&lockh, LCK_PR);
+                               ldlm_lock_decref(&lockh, LCK_PW);
+                       }
+               } else { /* no cached lock, need to acquire lock on server side */
+                       fm_key->oa.o_valid |= OBD_MD_FLFLAGS;
+                       fm_key->oa.o_flags |= OBD_FL_SRVLOCK;
+               }
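The added sync path widens the requested FIEMAP range to page boundaries before matching a DLM extent lock, guarding the end computation so it cannot wrap past OBD_OBJECT_EOF. A worked userspace example of the same rounding, assuming 4 KB pages (so CFS_PAGE_MASK clears the low 12 bits) and illustrative values:

/* Worked example of the page rounding above; eof stands in for
 * OBD_OBJECT_EOF and the constants are illustrative. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const uint64_t page_size = 4096, page_mask = ~(page_size - 1);
        const uint64_t eof = ~0ULL;
        uint64_t fm_start = 5000, fm_length = 100, start, end;

        start = fm_start & page_mask;                   /* 4096 */
        if (eof - fm_length <= fm_start + page_size - 1)
                end = eof;                              /* would overflow */
        else
                end = (fm_start + fm_length + page_size - 1) & page_mask;
                                                        /* 8192 */
        printf("lock extent [%llu, %llu] covers bytes 5000..5100\n",
               (unsigned long long)start, (unsigned long long)end);
        return 0;
}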
+
+skip_locking:
                req = ptlrpc_request_alloc(class_exp2cliimp(exp),
                                           &RQF_OST_GET_INFO_FIEMAP);
                if (req == NULL)
-                       RETURN(-ENOMEM);
+                       GOTO(drop_lock, rc = -ENOMEM);

                req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_KEY,
                                     RCL_CLIENT, keylen);
@@ -3988,7 +2767,7 @@ static int osc_get_info(struct obd_export *exp, obd_count keylen,
                rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO);
                if (rc) {
                        ptlrpc_request_free(req);
-                       RETURN(rc);
+                       GOTO(drop_lock, rc);
                }

                tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_KEY);
@@ -3999,59 +2778,27 @@ static int osc_get_info(struct obd_export *exp, obd_count keylen,
                ptlrpc_request_set_replen(req);
                rc = ptlrpc_queue_wait(req);
                if (rc)
-                       GOTO(out1, rc);
+                       GOTO(fini_req, rc);

                reply = req_capsule_server_get(&req->rq_pill, &RMF_FIEMAP_VAL);
                if (reply == NULL)
-                       GOTO(out1, rc = -EPROTO);
+                       GOTO(fini_req, rc = -EPROTO);

                memcpy(val, reply, *vallen);
- out1:
+fini_req:
                ptlrpc_req_finished(req);
-
+drop_lock:
+               if (mode)
+                       ldlm_lock_decref(&lockh, LCK_PR);
                RETURN(rc);
        }

        RETURN(-EINVAL);
 }

-static int osc_setinfo_mds_connect_import(struct obd_import *imp)
-{
-       struct llog_ctxt *ctxt;
-       int rc = 0;
-       ENTRY;
-
-       ctxt = llog_get_context(imp->imp_obd, LLOG_MDS_OST_ORIG_CTXT);
-       if (ctxt) {
-               rc = llog_initiator_connect(ctxt);
-               llog_ctxt_put(ctxt);
-       } else {
-               /* XXX return an error? skip setting below flags? */
-       }
-
-       cfs_spin_lock(&imp->imp_lock);
-       imp->imp_server_timeout = 1;
-       imp->imp_pingable = 1;
-       cfs_spin_unlock(&imp->imp_lock);
-       CDEBUG(D_RPCTRACE, "pinging OST %s\n", obd2cli_tgt(imp->imp_obd));
-
-       RETURN(rc);
-}
-
-static int osc_setinfo_mds_conn_interpret(const struct lu_env *env,
-                                          struct ptlrpc_request *req,
-                                          void *aa, int rc)
-{
-       ENTRY;
-       if (rc != 0)
-               RETURN(rc);
-
-       RETURN(osc_setinfo_mds_connect_import(req->rq_import));
-}
-
-static int osc_set_info_async(struct obd_export *exp, obd_count keylen,
-                              void *key, obd_count vallen, void *val,
-                              struct ptlrpc_request_set *set)
+static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+                              obd_count keylen, void *key, obd_count vallen,
+                              void *val, struct ptlrpc_request_set *set)
 {
        struct ptlrpc_request *req;
        struct obd_device     *obd = exp->exp_obd;
@@ -4062,32 +2809,6 @@ static int osc_set_info_async(struct obd_export *exp, obd_count keylen,

        OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10);

-       if (KEY_IS(KEY_NEXT_ID)) {
-               obd_id new_val;
-               struct osc_creator *oscc = &obd->u.cli.cl_oscc;
-
-               if (vallen != sizeof(obd_id))
-                       RETURN(-ERANGE);
-               if (val == NULL)
-                       RETURN(-EINVAL);
-
-               if (vallen != sizeof(obd_id))
-                       RETURN(-EINVAL);
-
-               /* avoid race between allocate new object and set next id
-                * from ll_sync thread */
-               cfs_spin_lock(&oscc->oscc_lock);
-               new_val = *((obd_id*)val) + 1;
-               if (new_val > oscc->oscc_next_id)
-                       oscc->oscc_next_id = new_val;
-               cfs_spin_unlock(&oscc->oscc_lock);
-               CDEBUG(D_HA, "%s: set oscc_next_id = "LPU64"\n",
-                      exp->exp_obd->obd_name,
-                      obd->u.cli.cl_oscc.oscc_next_id);
-
-               RETURN(0);
-       }
-
        if (KEY_IS(KEY_CHECKSUM)) {
                if (vallen != sizeof(int))
                        RETURN(-EINVAL);
@@ -4105,6 +2826,33 @@ static int osc_set_info_async(struct obd_export *exp, obd_count keylen,
                RETURN(0);
        }

+       if (KEY_IS(KEY_CACHE_SET)) {
+               struct client_obd *cli = &obd->u.cli;
+
+               LASSERT(cli->cl_cache == NULL); /* only once */
+               cli->cl_cache = (struct cl_client_cache *)val;
+               atomic_inc(&cli->cl_cache->ccc_users);
+               cli->cl_lru_left = &cli->cl_cache->ccc_lru_left;
+
+               /* add this osc into entity list */
+               LASSERT(list_empty(&cli->cl_lru_osc));
+               spin_lock(&cli->cl_cache->ccc_lru_lock);
+               list_add(&cli->cl_lru_osc, &cli->cl_cache->ccc_lru);
+               spin_unlock(&cli->cl_cache->ccc_lru_lock);
+
+               RETURN(0);
+       }
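KEY_CACHE_SET wires this OSC into a cache shared by the whole mount: take a user reference first, then link into the cache's LRU list under the cache's own spinlock. A kernel-style sketch of just that attach pattern, with illustrative types rather than the real cl_client_cache:

/* Illustrative shared-cache attach: pin with a user count, then link
 * under the cache's own lock (placeholder types, not cl_client_cache). */
#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/spinlock.h>

struct shared_cache {
        atomic_t                ccc_users;      /* clients pinning us */
        spinlock_t              ccc_lru_lock;   /* protects ccc_lru */
        struct list_head        ccc_lru;        /* attached clients */
};

static void cache_attach(struct shared_cache *cache, struct list_head *link)
{
        atomic_inc(&cache->ccc_users);          /* pin before publishing */
        spin_lock(&cache->ccc_lru_lock);
        list_add(link, &cache->ccc_lru);
        spin_unlock(&cache->ccc_lru_lock);
}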
+
+       if (KEY_IS(KEY_CACHE_LRU_SHRINK)) {
+               struct client_obd *cli = &obd->u.cli;
+               int nr = atomic_read(&cli->cl_lru_in_list) >> 1;
+               int target = *(int *)val;
+
+               nr = osc_lru_shrink(env, cli, min(nr, target), true);
+               *(int *)val -= nr;
+               RETURN(0);
+       }
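The shrink handler offers at most half of this client's cached LRU pages, never exceeds the caller's target, and decrements the target in place so the remainder can be requested from other OSCs. A userspace rendering of that bookkeeping (hypothetical helper, assuming the shrink frees everything asked of it):

/* Userspace rendering of the KEY_CACHE_LRU_SHRINK arithmetic. */
#include <stdio.h>

static int lru_shrink_one(int lru_in_list, int *target)
{
        int nr = lru_in_list >> 1;      /* offer at most half */

        if (nr > *target)
                nr = *target;
        *target -= nr;                  /* leave the rest to other OSCs */
        return nr;
}

int main(void)
{
        int target = 300;

        printf("freed %d\n", lru_shrink_one(400, &target)); /* 200 */
        printf("freed %d\n", lru_shrink_one(100, &target)); /* 50 */
        printf("still wanted: %d\n", target);               /* 50 */
        return 0;
}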
+
        if (!set && !KEY_IS(KEY_GRANT_SHRINK))
                RETURN(-EINVAL);

@@ -4115,38 +2863,31 @@ static int osc_set_info_async(struct obd_export *exp, obd_count keylen,
           Even if something bad goes through, we'd get a -EINVAL
           from OST anyway. */

-       if (KEY_IS(KEY_GRANT_SHRINK))
-               req = ptlrpc_request_alloc(imp, &RQF_OST_SET_GRANT_INFO);
-       else
-               req = ptlrpc_request_alloc(imp, &RQF_OBD_SET_INFO);
-
-       if (req == NULL)
-               RETURN(-ENOMEM);
-
-       req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
-                            RCL_CLIENT, keylen);
-       req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL,
-                            RCL_CLIENT, vallen);
-       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO);
-       if (rc) {
-               ptlrpc_request_free(req);
-               RETURN(rc);
-       }
-
-       tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
-       memcpy(tmp, key, keylen);
-       tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL);
+       req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ?
+                                               &RQF_OST_SET_GRANT_INFO :
+                                               &RQF_OBD_SET_INFO);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
+                            RCL_CLIENT, keylen);
+       if (!KEY_IS(KEY_GRANT_SHRINK))
+               req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL,
+                                    RCL_CLIENT, vallen);
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+       memcpy(tmp, key, keylen);
+       tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ?
+                                                       &RMF_OST_BODY :
+                                                       &RMF_SETINFO_VAL);
        memcpy(tmp, val, vallen);

-       if (KEY_IS(KEY_MDS_CONN)) {
-               struct osc_creator *oscc = &obd->u.cli.cl_oscc;
-
-               oscc->oscc_oa.o_seq = (*(__u32 *)val);
-               oscc->oscc_oa.o_valid |= OBD_MD_FLGROUP;
-               LASSERT_SEQ_IS_MDT(oscc->oscc_oa.o_seq);
-               req->rq_no_delay = req->rq_no_resend = 1;
-               req->rq_interpret_reply = osc_setinfo_mds_conn_interpret;
-       } else if (KEY_IS(KEY_GRANT_SHRINK)) {
+       if (KEY_IS(KEY_GRANT_SHRINK)) {
                struct osc_grant_args *aa;
                struct obdo *oa;
@@ -4173,104 +2914,6 @@ static int osc_set_info_async(struct obd_export *exp, obd_count keylen,
        RETURN(0);
 }

-
-static struct llog_operations osc_size_repl_logops = {
-       lop_cancel: llog_obd_repl_cancel
-};
-
-static struct llog_operations osc_mds_ost_orig_logops;
-
-static int __osc_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
-                           struct obd_device *tgt, struct llog_catid *catid)
-{
-       int rc;
-       ENTRY;
-
-       rc = llog_setup(obd, &obd->obd_olg, LLOG_MDS_OST_ORIG_CTXT, tgt, 1,
-                       &catid->lci_logid, &osc_mds_ost_orig_logops);
-       if (rc) {
-               CERROR("failed LLOG_MDS_OST_ORIG_CTXT\n");
-               GOTO(out, rc);
-       }
-
-       rc = llog_setup(obd, &obd->obd_olg, LLOG_SIZE_REPL_CTXT, tgt, 1,
-                       NULL, &osc_size_repl_logops);
-       if (rc) {
-               struct llog_ctxt *ctxt =
-                       llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
-               if (ctxt)
-                       llog_cleanup(ctxt);
-               CERROR("failed LLOG_SIZE_REPL_CTXT\n");
-       }
-       GOTO(out, rc);
-out:
-       if (rc) {
-               CERROR("osc '%s' tgt '%s' catid %p rc=%d\n",
-                      obd->obd_name, tgt->obd_name, catid, rc);
-               CERROR("logid "LPX64":0x%x\n",
-                      catid->lci_logid.lgl_oid, catid->lci_logid.lgl_ogen);
-       }
-       return rc;
-}
-
-static int osc_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
-                         struct obd_device *disk_obd, int *index)
-{
-       struct llog_catid catid;
-       static char name[32] = CATLIST;
-       int rc;
-       ENTRY;
-
-       LASSERT(olg == &obd->obd_olg);
-
-       cfs_mutex_down(&olg->olg_cat_processing);
-       rc = llog_get_cat_list(disk_obd, name, *index, 1, &catid);
-       if (rc) {
-               CERROR("rc: %d\n", rc);
-               GOTO(out, rc);
-       }
-
-       CDEBUG(D_INFO, "%s: Init llog for %d - catid "LPX64"/"LPX64":%x\n",
-              obd->obd_name, *index, catid.lci_logid.lgl_oid,
-              catid.lci_logid.lgl_oseq, catid.lci_logid.lgl_ogen);
-
-       rc = __osc_llog_init(obd, olg, disk_obd, &catid);
-       if (rc) {
-               CERROR("rc: %d\n", rc);
-               GOTO(out, rc);
-       }
-
-       rc = llog_put_cat_list(disk_obd, name, *index, 1, &catid);
-       if (rc) {
-               CERROR("rc: %d\n", rc);
-               GOTO(out, rc);
-       }
-
- out:
-       cfs_mutex_up(&olg->olg_cat_processing);
-
-       return rc;
-}
-
-static int osc_llog_finish(struct obd_device *obd, int count)
-{
-       struct llog_ctxt *ctxt;
-       int rc = 0, rc2 = 0;
-       ENTRY;
-
-       ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
-       if (ctxt)
-               rc = llog_cleanup(ctxt);
-
-       ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT);
-       if (ctxt)
-               rc2 = llog_cleanup(ctxt);
-       if (!rc)
-               rc = rc2;
-
-       RETURN(rc);
-}
-
 static int osc_reconnect(const struct lu_env *env,
                          struct obd_export *exp,
                          struct obd_device *obd,
                          struct obd_uuid *cluuid,
@@ -4282,22 +2925,20 @@ static int osc_reconnect(const struct lu_env *env,
        if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
                long lost_grant;

-               client_obd_list_lock(&cli->cl_loi_list_lock);
-               data->ocd_grant = (cli->cl_avail_grant + cli->cl_dirty) ?:
-                               2 * cli->cl_max_pages_per_rpc << CFS_PAGE_SHIFT;
+               client_obd_list_lock(&cli->cl_loi_list_lock);
+               data->ocd_grant = (cli->cl_avail_grant +
+                                 (cli->cl_dirty_pages << PAGE_CACHE_SHIFT)) ?:
+                                 2 * cli_brw_size(obd);
                lost_grant = cli->cl_lost_grant;
                cli->cl_lost_grant = 0;
                client_obd_list_unlock(&cli->cl_loi_list_lock);

-               CDEBUG(D_CACHE, "request ocd_grant: %d cl_avail_grant: %ld "
-                      "cl_dirty: %ld cl_lost_grant: %ld\n", data->ocd_grant,
-                      cli->cl_avail_grant, cli->cl_dirty, lost_grant);
                CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d"
-                      " ocd_grant: %d\n", data->ocd_connect_flags,
-                      data->ocd_version, data->ocd_grant);
-       }
+                      " ocd_grant: %d, lost: %ld.\n", data->ocd_connect_flags,
+                      data->ocd_version, data->ocd_grant, lost_grant);
+       }

-       RETURN(0);
+       RETURN(0);
 }
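On reconnect the client asks the server for the grant it still accounts for, the sum of unconsumed grant and the space its dirty pages will need, falling back to two full RPCs worth when it holds nothing. A worked example with illustrative constants (4 KB pages and 1 MB RPCs standing in for PAGE_CACHE_SHIFT and cli_brw_size()):

/* Worked example of the reconnect grant request above. */
#include <stdio.h>

int main(void)
{
        const long page_shift = 12;             /* 4 KB pages */
        const long brw_size = 1 << 20;          /* 1 MB per RPC */
        long avail_grant = 65536, dirty_pages = 24, grant;

        grant = avail_grant + (dirty_pages << page_shift);
        if (grant == 0)                         /* the ?: fallback */
                grant = 2 * brw_size;
        printf("ocd_grant = %ld\n", grant);     /* 65536 + 98304 = 163840 */
        return 0;
}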
 static int osc_disconnect(struct obd_export *exp)
@@ -4311,7 +2952,7 @@ static int osc_disconnect(struct obd_export *exp)
                if (obd->u.cli.cl_conn_count == 1) {
                        /* Flush any remaining cancel messages out to the
                         * target */
-                       llog_sync(ctxt, exp);
+                       llog_sync(ctxt, exp, 0);
                }
                llog_ctxt_put(ctxt);
        } else {
@@ -4354,14 +2995,6 @@ static int osc_import_event(struct obd_device *obd,

        switch (event) {
        case IMP_EVENT_DISCON: {
-               /* Only do this on the MDS OSC's */
-               if (imp->imp_server_timeout) {
-                       struct osc_creator *oscc = &obd->u.cli.cl_oscc;
-
-                       cfs_spin_lock(&oscc->oscc_lock);
-                       oscc->oscc_flags |= OSCC_FLAG_RECOVERING;
-                       cfs_spin_unlock(&oscc->oscc_lock);
-               }
                cli = &obd->u.cli;
                client_obd_list_lock(&cli->cl_loi_list_lock);
                cli->cl_avail_grant = 0;
@@ -4382,11 +3015,9 @@ static int osc_import_event(struct obd_device *obd,
                if (!IS_ERR(env)) {
                        /* Reset grants */
                        cli = &obd->u.cli;
-                       client_obd_list_lock(&cli->cl_loi_list_lock);
                        /* all pages go to failing rpcs due to
                         * the invalid import */
-                       osc_check_rpcs(env, cli);
-                       client_obd_list_unlock(&cli->cl_loi_list_lock);
+                       osc_io_unplug(env, cli, NULL, PDL_POLICY_ROUND);

                        ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
                        cl_env_put(env, &refcheck);
@@ -4395,15 +3026,6 @@ static int osc_import_event(struct obd_device *obd,
                break;
        }
        case IMP_EVENT_ACTIVE: {
-               /* Only do this on the MDS OSC's */
-               if (imp->imp_server_timeout) {
-                       struct osc_creator *oscc = &obd->u.cli.cl_oscc;
-
-                       cfs_spin_lock(&oscc->oscc_lock);
-                       oscc->oscc_flags &= ~(OSCC_FLAG_NOSPC |
-                                             OSCC_FLAG_NOSPC_BLK);
-                       cfs_spin_unlock(&oscc->oscc_lock);
-               }
                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL);
                break;
        }
@@ -4442,78 +3064,119 @@ static int osc_import_event(struct obd_device *obd,
  * \retval zero the lock can't be canceled
  * \retval other ok to cancel
  */
-static int osc_cancel_for_recovery(struct ldlm_lock *lock)
+static int osc_cancel_weight(struct ldlm_lock *lock)
 {
-       check_res_locked(lock->l_resource);
+       /*
+        * Cancel all unused and granted extent locks.
+        */
+       if (lock->l_resource->lr_type == LDLM_EXTENT &&
+           lock->l_granted_mode == lock->l_req_mode &&
+           osc_ldlm_weigh_ast(lock) == 0)
+               RETURN(1);

-       /*
-        * Cancel all unused extent lock in granted mode LCK_PR or LCK_CR.
-        *
-        * XXX as a future improvement, we can also cancel unused write lock
-        * if it doesn't have dirty data and active mmaps.
-        */
-       if (lock->l_resource->lr_type == LDLM_EXTENT &&
-           (lock->l_granted_mode == LCK_PR ||
-            lock->l_granted_mode == LCK_CR) &&
-           (osc_dlm_lock_pageref(lock) == 0))
-               RETURN(1);
+       RETURN(0);
+}

-       RETURN(0);
+static int brw_queue_work(const struct lu_env *env, void *data)
+{
+       struct client_obd *cli = data;
+
+       CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli);
+
+       osc_io_unplug(env, cli, NULL, PDL_POLICY_SAME);
+       RETURN(0);
 }

 int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
 {
-       struct client_obd *cli = &obd->u.cli;
-       int rc;
-       ENTRY;
+       struct client_obd *cli = &obd->u.cli;
+       struct obd_type   *type;
+       void              *handler;
+       int                rc;
+       ENTRY;

-       ENTRY;
-       rc = ptlrpcd_addref();
-       if (rc)
-               RETURN(rc);
+       rc = ptlrpcd_addref();
+       if (rc)
+               RETURN(rc);

-       rc = client_obd_setup(obd, lcfg);
-       if (rc == 0) {
-               void *handler;
-               handler = ptlrpcd_alloc_work(cli->cl_import,
-                                            brw_queue_work, cli);
-               if (!IS_ERR(handler))
-                       cli->cl_writeback_work = handler;
-               else
-                       rc = PTR_ERR(handler);
-       }
+       rc = client_obd_setup(obd, lcfg);
+       if (rc)
+               GOTO(out_ptlrpcd, rc);

-       if (rc == 0) {
-               struct lprocfs_static_vars lvars = { 0 };
+       handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli);
+       if (IS_ERR(handler))
+               GOTO(out_client_setup, rc = PTR_ERR(handler));
+       cli->cl_writeback_work = handler;

-               cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL;
-               lprocfs_osc_init_vars(&lvars);
-               if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0) {
-                       lproc_osc_attach_seqstat(obd);
-                       sptlrpc_lprocfs_cliobd_attach(obd);
-                       ptlrpc_lprocfs_register_obd(obd);
-               }
+       handler = ptlrpcd_alloc_work(cli->cl_import, lru_queue_work, cli);
+       if (IS_ERR(handler))
+               GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler));
+       cli->cl_lru_work = handler;

-               oscc_init(obd);
-               /* We need to allocate a few requests more, because
-                  brw_interpret tries to create new requests before freeing
-                  previous ones. Ideally we want to have 2x max_rpcs_in_flight
-                  reserved, but I afraid that might be too much wasted RAM
-                  in fact, so 2 is just my guess and still should work. */
-               cli->cl_import->imp_rq_pool =
-                       ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2,
-                                           OST_MAXREQSIZE,
-                                           ptlrpc_add_rqs_to_pool);
-
-               CFS_INIT_LIST_HEAD(&cli->cl_grant_shrink_list);
-               cfs_sema_init(&cli->cl_grant_sem, 1);
-
-               ns_register_cancel(obd->obd_namespace, osc_cancel_for_recovery);
-       }
+       rc = osc_quota_setup(obd);
+       if (rc)
+               GOTO(out_ptlrpcd_work, rc);

-       if (rc)
-               ptlrpcd_decref();
-       RETURN(rc);
+       cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL;
+
+#ifdef LPROCFS
+       obd->obd_vars = lprocfs_osc_obd_vars;
+#endif
+       /* If this is true then both client (osc) and server (osp) are on the
+        * same node. The osp layer, if loaded first, will register the osc
+        * proc directory. In that case this obd_device will attach its proc
+        * tree to type->typ_procsym instead of obd->obd_type->typ_procroot. */
+       type = class_search_type(LUSTRE_OSP_NAME);
+       if (type && type->typ_procsym) {
+               obd->obd_proc_entry = lprocfs_seq_register(obd->obd_name,
+                                                          type->typ_procsym,
+                                                          obd->obd_vars, obd);
+               if (IS_ERR(obd->obd_proc_entry)) {
+                       rc = PTR_ERR(obd->obd_proc_entry);
+                       CERROR("error %d setting up lprocfs for %s\n", rc,
+                              obd->obd_name);
+                       obd->obd_proc_entry = NULL;
+               }
+       } else {
+               rc = lprocfs_seq_obd_setup(obd);
+       }
+
+       /* If the basic OSC proc tree construction succeeded then
+        * let's do the rest. */
+       if (rc == 0) {
+               lproc_osc_attach_seqstat(obd);
+               sptlrpc_lprocfs_cliobd_attach(obd);
+               ptlrpc_lprocfs_register_obd(obd);
+       }
+
+       /* We need to allocate a few requests more, because
+        * brw_interpret tries to create new requests before freeing
+        * previous ones. Ideally we want to have 2x max_rpcs_in_flight
+        * reserved, but I'm afraid that might be too much wasted RAM
+        * in fact, so 2 is just my guess and still should work. */
+       cli->cl_import->imp_rq_pool =
+               ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2,
+                                   OST_MAXREQSIZE,
+                                   ptlrpc_add_rqs_to_pool);
+
+       INIT_LIST_HEAD(&cli->cl_grant_shrink_list);
+       ns_register_cancel(obd->obd_namespace, osc_cancel_weight);
+       RETURN(0);
+
+out_ptlrpcd_work:
+       if (cli->cl_writeback_work != NULL) {
+               ptlrpcd_destroy_work(cli->cl_writeback_work);
+               cli->cl_writeback_work = NULL;
+       }
+       if (cli->cl_lru_work != NULL) {
+               ptlrpcd_destroy_work(cli->cl_lru_work);
+               cli->cl_lru_work = NULL;
+       }
+out_client_setup:
+       client_obd_cleanup(obd);
+out_ptlrpcd:
+       ptlrpcd_decref();
+       RETURN(rc);
 }
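osc_setup() now unwinds with the classic goto ladder: each acquired resource gets an out_* label that releases it, and a failure jumps to the deepest label covering what has been acquired so far. A generic sketch of the shape, with illustrative names:

/* Generic sketch of the unwind ladder used by osc_setup() above. */
static int  acquire_first(void)  { return 0; }  /* e.g. ptlrpcd_addref() */
static int  acquire_second(void) { return 0; }  /* e.g. client_obd_setup() */
static void release_first(void)  { }

static int example_setup(void)
{
        int rc;

        rc = acquire_first();
        if (rc)
                return rc;              /* nothing to unwind yet */

        rc = acquire_second();
        if (rc)
                goto out_first;         /* undo only what succeeded */

        return 0;

out_first:
        release_first();
        return rc;
}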
 static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
@@ -4528,9 +3191,9 @@ static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
                CDEBUG(D_HA, "Deactivating import %s\n", obd->obd_name);
                /* ptlrpc_abort_inflight to stop an mds_lov_synchronize */
                ptlrpc_deactivate_import(imp);
-               cfs_spin_lock(&imp->imp_lock);
-               imp->imp_pingable = 0;
-               cfs_spin_unlock(&imp->imp_lock);
+               spin_lock(&imp->imp_lock);
+               imp->imp_pingable = 0;
+               spin_unlock(&imp->imp_lock);
                break;
        }
        case OBD_CLEANUP_EXPORTS: {
@@ -4549,6 +3212,10 @@ static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
                        ptlrpcd_destroy_work(cli->cl_writeback_work);
                        cli->cl_writeback_work = NULL;
                }
+               if (cli->cl_lru_work) {
+                       ptlrpcd_destroy_work(cli->cl_lru_work);
+                       cli->cl_lru_work = NULL;
+               }
                obd_cleanup_client_import(obd);
                ptlrpc_lprocfs_unregister_obd(obd);
                lprocfs_obd_cleanup(obd);
@@ -4563,9 +3230,21 @@ static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
 int osc_cleanup(struct obd_device *obd)
 {
-       int rc;
+       struct client_obd *cli = &obd->u.cli;
+       int rc;

-       ENTRY;
+       ENTRY;
+
+       /* lru cleanup */
+       if (cli->cl_cache != NULL) {
+               LASSERT(atomic_read(&cli->cl_cache->ccc_users) > 0);
+               spin_lock(&cli->cl_cache->ccc_lru_lock);
+               list_del_init(&cli->cl_lru_osc);
+               spin_unlock(&cli->cl_cache->ccc_lru_lock);
+               cli->cl_lru_left = NULL;
+               atomic_dec(&cli->cl_cache->ccc_users);
+               cli->cl_cache = NULL;
+       }
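The LRU cleanup here is the mirror image of the KEY_CACHE_SET attach shown earlier: unlink under the cache lock first, so no shrinker can still reach this client, then drop the user reference. A sketch using the same illustrative types as the attach example:

/* Detach mirror of cache_attach(): unlink first, then unpin. */
static void cache_detach(struct shared_cache *cache, struct list_head *link)
{
        spin_lock(&cache->ccc_lru_lock);
        list_del_init(link);
        spin_unlock(&cache->ccc_lru_lock);
        atomic_dec(&cache->ccc_users);          /* drop the pin */
}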
        /* free memory of osc quota cache */
        osc_quota_cleanup(obd);
@@ -4578,21 +3257,9 @@ int osc_cleanup(struct obd_device *obd)

 int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg)
 {
-       struct lprocfs_static_vars lvars = { 0 };
-       int rc = 0;
-
-       lprocfs_osc_init_vars(&lvars);
-
-       switch (lcfg->lcfg_command) {
-       default:
-               rc = class_process_proc_param(PARAM_OSC, lvars.obd_vars,
-                                             lcfg, obd);
-               if (rc > 0)
-                       rc = 0;
-               break;
-       }
-
-       return(rc);
+       int rc = class_process_proc_seq_param(PARAM_OSC, obd->obd_vars,
+                                             lcfg, obd);
+       return rc > 0 ? 0 : rc;
 }

 static int osc_process_config(struct obd_device *obd, obd_count len, void *buf)
@@ -4612,45 +3279,34 @@
 struct obd_ops osc_obd_ops = {
        .o_disconnect           = osc_disconnect,
        .o_statfs               = osc_statfs,
        .o_statfs_async         = osc_statfs_async,
-       .o_packmd               = osc_packmd,
        .o_unpackmd             = osc_unpackmd,
-       .o_precreate            = osc_precreate,
        .o_create               = osc_create,
-       .o_create_async         = osc_create_async,
        .o_destroy              = osc_destroy,
        .o_getattr              = osc_getattr,
        .o_getattr_async        = osc_getattr_async,
        .o_setattr              = osc_setattr,
        .o_setattr_async        = osc_setattr_async,
-       .o_brw                  = osc_brw,
-       .o_punch                = osc_punch,
-       .o_sync                 = osc_sync,
-       .o_enqueue              = osc_enqueue,
        .o_change_cbdata        = osc_change_cbdata,
        .o_find_cbdata          = osc_find_cbdata,
-       .o_cancel               = osc_cancel,
-       .o_cancel_unused        = osc_cancel_unused,
        .o_iocontrol            = osc_iocontrol,
        .o_get_info             = osc_get_info,
        .o_set_info_async       = osc_set_info_async,
        .o_import_event         = osc_import_event,
-       .o_llog_init            = osc_llog_init,
-       .o_llog_finish          = osc_llog_finish,
        .o_process_config       = osc_process_config,
        .o_quotactl             = osc_quotactl,
        .o_quotacheck           = osc_quotacheck,
-       .o_quota_adjust_qunit   = osc_quota_adjust_qunit,
 };

 extern struct lu_kmem_descr osc_caches[];
-extern cfs_spinlock_t osc_ast_guard;
-extern cfs_lock_class_key_t osc_ast_guard_class;
+extern spinlock_t osc_ast_guard;
+extern struct lock_class_key osc_ast_guard_class;

 int __init osc_init(void)
 {
-       struct lprocfs_static_vars lvars = { 0 };
-       int rc;
-       ENTRY;
+       bool enable_proc = true;
+       struct obd_type *type;
+       int rc;
+       ENTRY;

        /* print an address of _any_ initialized kernel symbol from this
         * module, to allow debugging with gdb that doesn't support data
@@ -4658,37 +3314,34 @@ int __init osc_init(void)
        CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches);

        rc = lu_kmem_init(osc_caches);
+       if (rc)
+               RETURN(rc);

-       lprocfs_osc_init_vars(&lvars);
+       type = class_search_type(LUSTRE_OSP_NAME);
+       if (type != NULL && type->typ_procsym != NULL)
+               enable_proc = false;

-       osc_quota_init();
-       rc = class_register_type(&osc_obd_ops, NULL, lvars.module_vars,
-                                LUSTRE_OSC_NAME, &osc_device_type);
+       rc = class_register_type(&osc_obd_ops, NULL, enable_proc, NULL,
+#ifndef HAVE_ONLY_PROCFS_SEQ
+                                NULL,
+#endif
+                                LUSTRE_OSC_NAME, &osc_device_type);
        if (rc) {
                lu_kmem_fini(osc_caches);
                RETURN(rc);
        }

-       cfs_spin_lock_init(&osc_ast_guard);
-       cfs_lockdep_set_class(&osc_ast_guard, &osc_ast_guard_class);
-
-       osc_mds_ost_orig_logops = llog_lvfs_ops;
-       osc_mds_ost_orig_logops.lop_setup = llog_obd_origin_setup;
-       osc_mds_ost_orig_logops.lop_cleanup = llog_obd_origin_cleanup;
-       osc_mds_ost_orig_logops.lop_add = llog_obd_origin_add;
-       osc_mds_ost_orig_logops.lop_connect = llog_origin_connect;
+       spin_lock_init(&osc_ast_guard);
+       lockdep_set_class(&osc_ast_guard, &osc_ast_guard_class);

-       RETURN(rc);
+       RETURN(rc);
 }

 #ifdef __KERNEL__
 static void /*__exit*/ osc_exit(void)
 {
-       lu_device_type_fini(&osc_device_type);
-
-       osc_quota_exit();
-       class_unregister_type(LUSTRE_OSC_NAME);
-       lu_kmem_fini(osc_caches);
+       class_unregister_type(LUSTRE_OSC_NAME);
+       lu_kmem_fini(osc_caches);
 }

 MODULE_AUTHOR("Sun Microsystems, Inc. ");