From c8e3992acf3039b2824725d41f90d9a3be3be921 Mon Sep 17 00:00:00 2001
From: Fan Yong
Date: Wed, 14 Jun 2017 17:03:46 +0800
Subject: [PATCH] LU-9514 ptlrpc: free reply buffer earlier for open RPC

It is unnecessary to keep the reply buffer for an open RPC: the
replay-related data has already been saved in the request buffer by
the time the reply arrives. If the open really needs to be replayed,
the replay logic will allocate a reply buffer as needed.

On the other hand, the client always tries to allocate a buffer large
enough to hold the open RPC reply, since it does not know exactly how
much data the server will send back. The reply buffer may therefore be
much larger than what is actually needed. In that case, keeping the
large reply buffer for the open RPC occupies a lot of RAM and can drive
the client to OOM if there are too many open RPCs to be replayed.

This patch frees the reply buffer for the open RPC once the replay
logic holds the last reference on the RPC.

Test-Parameters: envdefinitions=ONLY=51f testlist=sanity ostfilesystemtype=ldiskfs mdtfilesystemtype=ldiskfs
Test-Parameters: envdefinitions=ONLY=51f testlist=sanity ostfilesystemtype=zfs mdtfilesystemtype=zfs
Signed-off-by: Fan Yong
Change-Id: I1bea2456b8aa4e53a0b65143a48e617f836181a0
Reviewed-on: https://review.whamcloud.com/27208
Tested-by: Jenkins
Tested-by: Maloo
Reviewed-by: Niu Yawei
Reviewed-by: Jinshan Xiong
Reviewed-by: Oleg Drokin
---
 lustre/include/lustre_net.h     |  2 ++
 lustre/mdc/mdc_request.c        | 39 ++++++++++++++++++---------
 lustre/ptlrpc/client.c          | 58 ++++++++++++++++++++++++++++-----------
 lustre/ptlrpc/pack_generic.c    | 60 ++++++++++++++++++++++++-----------------
 lustre/ptlrpc/ptlrpc_internal.h |  1 +
 lustre/tests/sanity.sh          |  4 +--
 6 files changed, 110 insertions(+), 54 deletions(-)

diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h
index 14da6e2..8c918af 100644
--- a/lustre/include/lustre_net.h
+++ b/lustre/include/lustre_net.h
@@ -967,6 +967,7 @@ struct ptlrpc_request {
 	 * rq_list
 	 */
 	spinlock_t rq_lock;
+	spinlock_t rq_early_free_lock;
 	/** client-side flags are serialized by rq_lock @{ */
 	unsigned int rq_intr:1, rq_replied:1, rq_err:1,
 		rq_timedout:1, rq_resend:1, rq_restart:1,
@@ -998,6 +999,7 @@ struct ptlrpc_request {
 		rq_allow_replay:1,
 		/* bulk request, sent to server, but uncommitted */
 		rq_unstable:1,
+		rq_early_free_repbuf:1, /* free reply buffer in advance */
 		rq_allow_intr:1;
 	/** @} */

diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c
index 349ba27..b02866e 100644
--- a/lustre/mdc/mdc_request.c
+++ b/lustre/mdc/mdc_request.c
@@ -569,29 +569,37 @@ void mdc_replay_open(struct ptlrpc_request *req)
 	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 	LASSERT(body != NULL);

-	och = mod->mod_och;
-	if (och != NULL) {
-		struct lustre_handle *file_fh;
+	spin_lock(&req->rq_lock);
+	och = mod->mod_och;
+	if (och && och->och_fh.cookie)
+		req->rq_early_free_repbuf = 1;
+	else
+		req->rq_early_free_repbuf = 0;
+	spin_unlock(&req->rq_lock);
+
+	if (req->rq_early_free_repbuf) {
+		struct lustre_handle *file_fh;

-		LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC);
+		LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC);

-		file_fh = &och->och_fh;
+		file_fh = &och->och_fh;
 		CDEBUG(D_HA, "updating handle from %#llx to %#llx\n",
 		       file_fh->cookie, body->mbo_handle.cookie);
 		old = *file_fh;
 		*file_fh = body->mbo_handle;
-	}
-	close_req = mod->mod_close_req;
-	if (close_req != NULL) {
-		__u32 opc = lustre_msg_get_opc(close_req->rq_reqmsg);
-		struct mdt_ioepoch *epoch;
+	}
+
+	close_req = mod->mod_close_req;
+	if (close_req) {
+		__u32 opc = lustre_msg_get_opc(close_req->rq_reqmsg);
+		struct mdt_ioepoch *epoch;

 		LASSERT(opc == MDS_CLOSE);
 		epoch = req_capsule_client_get(&close_req->rq_pill,
 					       &RMF_MDT_EPOCH);
 		LASSERT(epoch);

-		if (och != NULL)
+		if (req->rq_early_free_repbuf)
 			LASSERT(!memcmp(&old, &epoch->mio_handle, sizeof(old)));

 		DEBUG_REQ(D_HA, close_req, "updating close body with new fh");
@@ -675,6 +683,7 @@ int mdc_set_open_replay_data(struct obd_export *exp,
 		mod->mod_open_req = open_req;
 		open_req->rq_cb_data = mod;
 		open_req->rq_commit_cb = mdc_commit_open;
+		open_req->rq_early_free_repbuf = 1;
 		spin_unlock(&open_req->rq_lock);
 	}
@@ -728,8 +737,14 @@ int mdc_clear_open_replay_data(struct obd_export *exp,
 	if (mod == NULL)
 		RETURN(0);

-	LASSERT(mod != LP_POISON);
+	LASSERT(mod != LP_POISON);
 	LASSERT(mod->mod_open_req != NULL);
+
+	spin_lock(&mod->mod_open_req->rq_lock);
+	if (mod->mod_och)
+		mod->mod_och->och_fh.cookie = 0;
+	mod->mod_open_req->rq_early_free_repbuf = 0;
+	spin_unlock(&mod->mod_open_req->rq_lock);
 	mdc_free_open(mod);

 	mod->mod_och = NULL;
diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c
index e31010c..ad994fb 100644
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -2506,29 +2506,54 @@ void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request)
 * Drops one reference count for request \a request.
 * \a locked set indicates that caller holds import imp_lock.
 * Frees the request when the reference count reaches zero.
+ *
+ * \retval 1	the request is freed
+ * \retval 0	some others still hold references on the request
 */
 static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked)
 {
-	ENTRY;
-	if (request == NULL)
-		RETURN(1);
+	int count;
+	ENTRY;

-	if (request == LP_POISON ||
-	    request->rq_reqmsg == LP_POISON) {
-		CERROR("dereferencing freed request (bug 575)\n");
-		LBUG();
-		RETURN(1);
-	}
+	if (!request)
+		RETURN(1);

-	DEBUG_REQ(D_INFO, request, "refcount now %u",
+	LASSERT(request != LP_POISON);
+	LASSERT(request->rq_reqmsg != LP_POISON);
+
+	DEBUG_REQ(D_INFO, request, "refcount now %u",
		  atomic_read(&request->rq_refcount) - 1);

-	if (atomic_dec_and_test(&request->rq_refcount)) {
-		__ptlrpc_free_req(request, locked);
-		RETURN(1);
-	}
+	spin_lock(&request->rq_lock);
+	count = atomic_dec_return(&request->rq_refcount);
+	LASSERTF(count >= 0, "Invalid ref count %d\n", count);

-	RETURN(0);
+	/* For an open RPC, the client does not know the EA size (LOV, ACL,
+	 * and so on) before the reply arrives, so it has to reserve a very
+	 * large reply buffer. Such a buffer will not be released until the
+	 * RPC is freed. Since the open RPC is replayable, we need to keep it
+	 * in the replay list until close. If there are a lot of files opened
+	 * concurrently, the client may run out of memory.
+	 *
+	 * In fact, it is unnecessary to keep the reply buffer for open
+	 * replay: the related EAs have already been saved via
+	 * mdc_save_lovea() before coming here. So it is safe to free the
+	 * reply buffer earlier, before releasing the RPC, to avoid client
+	 * OOM. LU-9514 */
+	if (count == 1 && request->rq_early_free_repbuf && request->rq_repbuf) {
+		spin_lock(&request->rq_early_free_lock);
+		sptlrpc_cli_free_repbuf(request);
+		request->rq_repbuf = NULL;
+		request->rq_repbuf_len = 0;
+		request->rq_repdata = NULL;
+		request->rq_reqdata_len = 0;
+		spin_unlock(&request->rq_early_free_lock);
+	}
+	spin_unlock(&request->rq_lock);
+
+	if (!count)
+		__ptlrpc_free_req(request, locked);
+
+	RETURN(!count);
 }

 /**
@@ -3082,6 +3107,9 @@ int ptlrpc_replay_req(struct ptlrpc_request *req)

 	DEBUG_REQ(D_HA, req, "REPLAY");
 	atomic_inc(&req->rq_import->imp_replay_inflight);
+	spin_lock(&req->rq_lock);
+	req->rq_early_free_repbuf = 0;
+	spin_unlock(&req->rq_lock);
 	ptlrpc_request_addref(req); /* ptlrpcd needs a ref */

 	ptlrpcd_add_req(req);
diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c
index aa6552e..0b2a137 100644
--- a/lustre/ptlrpc/pack_generic.c
+++ b/lustre/ptlrpc/pack_generic.c
@@ -2511,35 +2511,47 @@ static inline int req_ptlrpc_body_swabbed(struct ptlrpc_request *req)

 static inline int rep_ptlrpc_body_swabbed(struct ptlrpc_request *req)
 {
-	LASSERT(req->rq_repmsg);
+	if (unlikely(!req->rq_repmsg))
+		return 0;

-	switch (req->rq_repmsg->lm_magic) {
-	case LUSTRE_MSG_MAGIC_V2:
-		return lustre_rep_swabbed(req, MSG_PTLRPC_BODY_OFF);
-	default:
-		/* uninitialized yet */
-		return 0;
-	}
+	switch (req->rq_repmsg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2:
+		return lustre_rep_swabbed(req, MSG_PTLRPC_BODY_OFF);
+	default:
+		/* uninitialized yet */
+		return 0;
+	}
 }

 void _debug_req(struct ptlrpc_request *req,
-		struct libcfs_debug_msg_data *msgdata,
-		const char *fmt, ... )
+		struct libcfs_debug_msg_data *msgdata, const char *fmt, ...)
 {
-	int req_ok = req->rq_reqmsg != NULL;
-	int rep_ok = req->rq_repmsg != NULL;
-	lnet_nid_t nid = LNET_NID_ANY;
-	va_list args;
+	bool req_ok = req->rq_reqmsg != NULL;
+	bool rep_ok = false;
+	lnet_nid_t nid = LNET_NID_ANY;
+	va_list args;
+	int rep_flags = -1;
+	int rep_status = -1;

-	if (ptlrpc_req_need_swab(req)) {
-		req_ok = req_ok && req_ptlrpc_body_swabbed(req);
-		rep_ok = rep_ok && rep_ptlrpc_body_swabbed(req);
-	}
+	spin_lock(&req->rq_early_free_lock);
+	if (req->rq_repmsg)
+		rep_ok = true;
+
+	if (ptlrpc_req_need_swab(req)) {
+		req_ok = req_ok && req_ptlrpc_body_swabbed(req);
+		rep_ok = rep_ok && rep_ptlrpc_body_swabbed(req);
+	}
+
+	if (rep_ok) {
+		rep_flags = lustre_msg_get_flags(req->rq_repmsg);
+		rep_status = lustre_msg_get_status(req->rq_repmsg);
+	}
+	spin_unlock(&req->rq_early_free_lock);

-	if (req->rq_import && req->rq_import->imp_connection)
-		nid = req->rq_import->imp_connection->c_peer.nid;
-	else if (req->rq_export && req->rq_export->exp_connection)
-		nid = req->rq_export->exp_connection->c_peer.nid;
+	if (req->rq_import && req->rq_import->imp_connection)
+		nid = req->rq_import->imp_connection->c_peer.nid;
+	else if (req->rq_export && req->rq_export->exp_connection)
+		nid = req->rq_export->exp_connection->c_peer.nid;

 	va_start(args, fmt);
 	libcfs_debug_vmsg2(msgdata, fmt, args,
@@ -2560,9 +2572,7 @@ void _debug_req(struct ptlrpc_request *req,
		       atomic_read(&req->rq_refcount),
		       DEBUG_REQ_FLAGS(req),
		       req_ok ? lustre_msg_get_flags(req->rq_reqmsg) : -1,
-		       rep_ok ? lustre_msg_get_flags(req->rq_repmsg) : -1,
-		       req->rq_status,
-		       rep_ok ? lustre_msg_get_status(req->rq_repmsg) : -1);
+		       rep_flags, req->rq_status, rep_status);
 	va_end(args);
 }
 EXPORT_SYMBOL(_debug_req);
diff --git a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h
index a37be57..cfd1de5 100644
--- a/lustre/ptlrpc/ptlrpc_internal.h
+++ b/lustre/ptlrpc/ptlrpc_internal.h
@@ -356,6 +356,7 @@ static inline void ptlrpc_reqset_put(struct ptlrpc_request_set *set)
 static inline void ptlrpc_req_comm_init(struct ptlrpc_request *req)
 {
 	spin_lock_init(&req->rq_lock);
+	spin_lock_init(&req->rq_early_free_lock);
 	atomic_set(&req->rq_refcount, 1);
 	INIT_LIST_HEAD(&req->rq_list);
 	INIT_LIST_HEAD(&req->rq_replay_list);
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index 19cb29f..ec6f11e 100755
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -81,8 +81,8 @@ init_logging
 [ "$SLOW" = "no" ] && EXCEPT_SLOW="27m 64b 68 71 115 300o"

 if [ $(facet_fstype $SINGLEMDS) = "zfs" ]; then
-	# bug number for skipped test: LU-9514 LU-4536 LU-1957
-	ALWAYS_EXCEPT="$ALWAYS_EXCEPT 51f 65ic 180"
+	# bug number for skipped test: LU-4536 LU-1957
+	ALWAYS_EXCEPT="$ALWAYS_EXCEPT 65ic 180"
	#                              13    (min)"
 	[ "$SLOW" = "no" ] && EXCEPT_SLOW="$EXCEPT_SLOW 51b"
 fi
--
1.8.3.1