From 6732fea64acb3966d2181920b5239082205ba679 Mon Sep 17 00:00:00 2001 From: Isaac Huang Date: Thu, 27 Jan 2011 14:49:27 +0100 Subject: [PATCH] b=21776 Set PF_MEMALLOC on outgoing path to prevent deadlock on memory allocation under pressure i=johann i=dmitry i=maxim --- lnet/ChangeLog | 9 ++++++++- lnet/include/libcfs/linux/linux-mem.h | 19 +++++++++++++++++++ lnet/include/lnet/lib-types.h | 1 + lnet/klnds/ptllnd/ptllnd_cb.c | 19 +++++++++++++------ lnet/klnds/socklnd/socklnd_cb.c | 7 +++++++ lnet/libcfs/tracefile.c | 4 ++++ lnet/lnet/lib-move.c | 1 + lustre/include/liblustre.h | 2 ++ lustre/include/lustre_net.h | 1 + lustre/osc/osc_request.c | 29 +++++++++++++++++++++-------- lustre/ptlrpc/niobuf.c | 11 +++++++++-- 11 files changed, 86 insertions(+), 17 deletions(-) diff --git a/lnet/ChangeLog b/lnet/ChangeLog index bd52cb2..5583576 100644 --- a/lnet/ChangeLog +++ b/lnet/ChangeLog @@ -13,8 +13,15 @@ xxxx-xx-xx Oracle, Inc. ptllnd - Portals 3.3 / UNICOS/lc 1.5.x, 2.0.x Severity : normal +Bugzilla : 21776 +Description: ptlrpcd stuck in lnet allocations under memory pressure +Details : Set PF_MEMALLOC on outgoing path to prevent deadlock on memory + allocation + +Severity : normal Bugzilla : 23575 -Description: fix o2iblnd v2 regression of credit deadlock with v1 peers (bug 14425). +Description: fix o2iblnd v2 regression of credit deadlock with v1 peers + (bug 14425). 
Severity : normal Bugzilla : 21456 diff --git a/lnet/include/libcfs/linux/linux-mem.h b/lnet/include/libcfs/linux/linux-mem.h index 3a21707..94e73a9 100644 --- a/lnet/include/libcfs/linux/linux-mem.h +++ b/lnet/include/libcfs/linux/linux-mem.h @@ -122,6 +122,25 @@ extern void __cfs_free_pages(cfs_page_t *page, unsigned int order); #define CFS_NUM_CACHEPAGES num_physpages #endif +static inline int libcfs_memory_pressure_get_and_set(void) +{ + int old = libcfs_memory_pressure_get(); + + if (!old) + libcfs_memory_pressure_set(); + return old; +} + +static inline void libcfs_memory_pressure_restore(int old) +{ + if (old) + libcfs_memory_pressure_set(); + else + libcfs_memory_pressure_clr(); + return; +} + + /* * In Linux there is no way to determine whether current execution context is * blockable. diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index d7835a0..9698532 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -194,6 +194,7 @@ typedef struct lnet_msg { lnet_process_id_t msg_target; __u32 msg_type; + unsigned int msg_vmflush:1; /* VM trying to free memory */ unsigned int msg_target_is_router:1; /* sending to a router */ unsigned int msg_routing:1; /* being forwarded */ unsigned int msg_ack:1; /* ack on finalize (PUT) */ diff --git a/lnet/klnds/ptllnd/ptllnd_cb.c b/lnet/klnds/ptllnd/ptllnd_cb.c index 8a0d67a..645512a 100644 --- a/lnet/klnds/ptllnd/ptllnd_cb.c +++ b/lnet/klnds/ptllnd/ptllnd_cb.c @@ -321,7 +321,8 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) unsigned int payload_offset = lntmsg->msg_offset; unsigned int payload_nob = lntmsg->msg_len; kptl_net_t *net = ni->ni_data; - kptl_peer_t *peer; + kptl_peer_t *peer = NULL; + int mpflag = 0; kptl_tx_t *tx; int nob; int nfrag; @@ -335,10 +336,13 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); LASSERT (!in_interrupt()); + if (lntmsg->msg_vmflush) + mpflag = 
libcfs_memory_pressure_get_and_set(); + rc = kptllnd_find_target(net, target, &peer); if (rc != 0) - return rc; - + goto out; + /* NB peer->peer_id does NOT always equal target, be careful with * which one to use */ switch (type) { @@ -416,7 +420,7 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) kptllnd_init_rdma_md(tx, lntmsg->msg_md->md_niov, NULL, lntmsg->msg_md->md_iov.kiov, 0, lntmsg->msg_md->md_length); - + tx->tx_lnet_msg = lntmsg; tx->tx_msg->ptlm_u.rdma.kptlrm_hdr = *hdr; kptllnd_init_msg (tx->tx_msg, PTLLND_MSG_TYPE_GET, @@ -470,7 +474,7 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) payload_offset, payload_nob); #endif } - + nob = offsetof(kptl_immediate_msg_t, kptlim_payload[payload_nob]); kptllnd_init_msg(tx->tx_msg, PTLLND_MSG_TYPE_IMMEDIATE, target, nob); @@ -486,7 +490,10 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) kptllnd_tx_launch(peer, tx, nfrag); out: - kptllnd_peer_decref(peer); + if (lntmsg->msg_vmflush) + libcfs_memory_pressure_restore(mpflag); + if (peer) + kptllnd_peer_decref(peer); return rc; } diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 29a71d8..53576de 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -926,6 +926,7 @@ ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id) int ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) { + int mpflag = 0; int type = lntmsg->msg_type; lnet_process_id_t target = lntmsg->msg_target; unsigned int payload_niov = lntmsg->msg_niov; @@ -956,10 +957,14 @@ ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) desc_size = offsetof(ksock_tx_t, tx_frags.paged.kiov[payload_niov]); + if (lntmsg->msg_vmflush) + mpflag = libcfs_memory_pressure_get_and_set(); tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size); if (tx == NULL) { CERROR("Can't allocate tx desc type %d size %d\n", type, desc_size); + if (lntmsg->msg_vmflush) + 
libcfs_memory_pressure_restore(mpflag); return (-ENOMEM); } @@ -990,6 +995,8 @@ ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) /* The first fragment will be set later in pro_pack */ rc = ksocknal_launch_packet(ni, tx, target); + if (lntmsg->msg_vmflush) + libcfs_memory_pressure_restore(mpflag); if (rc == 0) return (0); diff --git a/lnet/libcfs/tracefile.c b/lnet/libcfs/tracefile.c index 841d4bd..b5123cb 100644 --- a/lnet/libcfs/tracefile.c +++ b/lnet/libcfs/tracefile.c @@ -71,6 +71,10 @@ static struct trace_page *tage_alloc(int gfp) cfs_page_t *page; struct trace_page *tage; + /* My caller is trying to free memory */ + if (!cfs_in_interrupt() && libcfs_memory_pressure_get()) + return NULL; + /* * Don't spam console with allocation failures: they will be reported * by upper layer anyway. diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 2f9544a..eb4f286 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -2380,6 +2380,7 @@ LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, libcfs_id2str(target)); return -ENOMEM; } + msg->msg_vmflush = !!libcfs_memory_pressure_get(); LNET_LOCK(); diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h index 29d725a..1fb9110 100644 --- a/lustre/include/liblustre.h +++ b/lustre/include/liblustre.h @@ -751,6 +751,8 @@ typedef struct { volatile int counter; } atomic_t; #define libcfs_memory_pressure_get() (0) #define libcfs_memory_pressure_set() do {} while (0) #define libcfs_memory_pressure_clr() do {} while (0) +#define libcfs_memory_pressure_get_and_set() (0) +#define libcfs_memory_pressure_restore(old) do {} while (0) /* FIXME sys/capability will finally included linux/fs.h thus * cause numerous trouble on x86-64. 
as temporary solution for diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index 20196cd..8195fbb 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -319,6 +319,7 @@ struct ptlrpc_request { rq_replay:1, rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1, rq_no_delay:1, rq_net_err:1, rq_early:1, rq_must_unlink:1, + rq_memalloc:1, /* req originated from "kswapd" */ /* server-side flags */ rq_packed_final:1, /* packed final reply */ rq_hp:1, /* high priority RPC */ diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 434090e..7dd8667 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -2036,7 +2036,7 @@ static int lop_makes_rpc(struct client_obd *cli, struct loi_oap_pages *lop, if (cmd & OBD_BRW_WRITE) { /* trigger a write rpc stream as long as there are dirtiers * waiting for space. as they're waiting, they're not going to - * create more pages to coallesce with what's waiting.. */ + * create more pages to coalesce with what's waiting.. */ if (!list_empty(&cli->cl_cache_waiters)) { CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); RETURN(1); @@ -2339,11 +2339,14 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli, struct osc_async_page *oap; struct ldlm_lock *lock = NULL; obd_valid valid; - int i, rc; + int i, rc, mpflag = 0; ENTRY; LASSERT(!list_empty(rpc_list)); + if (cmd & OBD_BRW_MEMALLOC) + mpflag = libcfs_memory_pressure_get_and_set(); + OBD_ALLOC(pga, sizeof(*pga) * page_count); if (pga == NULL) RETURN(ERR_PTR(-ENOMEM)); @@ -2384,6 +2387,9 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli, oa = &((struct ost_body *)lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(struct ost_body)))->oa; + if (cmd & OBD_BRW_MEMALLOC) + req->rq_memalloc = 1; + /* Need to update the timestamps after the request is built in case * we race with setattr (locally or in queue at OST). 
If OST gets * later setattr before earlier BRW (as determined by the request xid), @@ -2415,6 +2421,9 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli, CFS_INIT_LIST_HEAD(rpc_list); out: + if (cmd & OBD_BRW_MEMALLOC) + libcfs_memory_pressure_restore(mpflag); + if (IS_ERR(req)) { if (oa) OBDO_FREE(oa); @@ -2434,8 +2443,9 @@ out: * \param cmd - OBD_BRW_* macroses * \param lop - pending pages * - * \return zero if pages successfully add to send queue. - * \return not zere if error occurring. + * \return zero if no page added to send queue. + * \return 1 if pages successfully added to send queue. + * \return negative on errors. */ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, int cmd, struct loi_oap_pages *lop) @@ -2448,7 +2458,7 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, CFS_LIST_HEAD(rpc_list); unsigned int ending_offset; unsigned starting_offset = 0; - int srvlock = 0; + int srvlock = 0, mem_tight = 0; ENTRY; /* If there are HP OAPs we need to handle at least 1 of them, @@ -2478,7 +2488,7 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, * until completion unlocks it. commit_write submits a page * as not ready because its unlock will happen unconditionally * as the call returns. if we race with commit_write giving - * us that page we dont' want to create a hole in the page + * us that page we don't want to create a hole in the page * stream, so we stop and leave the rpc to be fired by * another dirtier or kupdated interval (the not ready page * will still be on the dirty list). 
we could call in @@ -2561,6 +2571,8 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, /* now put the page back in our accounting */ list_add_tail(&oap->oap_rpc_item, &rpc_list); + if (oap->oap_brw_flags & OBD_BRW_MEMALLOC) + mem_tight = 1; if (page_count == 0) srvlock = !!(oap->oap_brw_flags & OBD_BRW_SRVLOCK); if (++page_count >= cli->cl_max_pages_per_rpc) @@ -2591,7 +2603,8 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, client_obd_list_unlock(&cli->cl_loi_list_lock); - req = osc_build_req(cli, &rpc_list, page_count, cmd); + req = osc_build_req(cli, &rpc_list, page_count, + mem_tight ? (cmd | OBD_BRW_MEMALLOC) : cmd); if (IS_ERR(req)) { /* this should happen rarely and is pretty bad, it makes the * pending list not follow the dirty order */ @@ -2766,7 +2779,7 @@ static void osc_check_rpcs(struct client_obd *cli) race_counter++; } - /* attempt some inter-object balancing by issueing rpcs + /* attempt some inter-object balancing by issuing rpcs * for each object in turn */ if (!list_empty(&loi->loi_hp_ready_item)) list_del_init(&loi->loi_hp_ready_item); diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 44b5711..25c1df5 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -488,6 +488,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) { int rc; int rc2; + int mpflag = 0; struct ptlrpc_connection *connection; lnet_handle_me_t reply_me_h; lnet_md_t reply_md; @@ -534,6 +535,9 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) if (request->rq_resend) lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT); + if (request->rq_memalloc) + mpflag = libcfs_memory_pressure_get_and_set(); + if (!noreply) { LASSERT (request->rq_replen != 0); if (request->rq_repbuf == NULL) @@ -633,11 +637,11 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) request->rq_request_portal, request->rq_xid, 0); if (rc == 0) - RETURN(rc); + GOTO(out, rc); 
ptlrpc_req_finished(request); if (noreply) - RETURN(rc); + GOTO(out, rc); cleanup_me: /* MEUnlink is safe; the PUT didn't even get off the ground, and @@ -657,6 +661,9 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) /* We do sync unlink here as there was no real transfer here so * the chance to have long unlink to sluggish net is smaller here. */ ptlrpc_unregister_bulk(request, 0); + out: + if (request->rq_memalloc) + libcfs_memory_pressure_restore(mpflag); return rc; } -- 1.8.3.1