From 6732fea64acb3966d2181920b5239082205ba679 Mon Sep 17 00:00:00 2001 From: Isaac Huang Date: Thu, 27 Jan 2011 14:49:27 +0100 Subject: [PATCH] b=21776 Set PF_MEMALLOC on outgoing path to prevent deadlock on memory allocation under pressure i=johann i=dmitry i=maxim --- lnet/ChangeLog | 9 ++++++++- lnet/include/libcfs/linux/linux-mem.h | 19 +++++++++++++++++++ lnet/include/lnet/lib-types.h | 1 + lnet/klnds/ptllnd/ptllnd_cb.c | 19 +++++++++++++------ lnet/klnds/socklnd/socklnd_cb.c | 7 +++++++ lnet/libcfs/tracefile.c | 4 ++++ lnet/lnet/lib-move.c | 1 + lustre/include/liblustre.h | 2 ++ lustre/include/lustre_net.h | 1 + lustre/osc/osc_request.c | 29 +++++++++++++++++++++-------- lustre/ptlrpc/niobuf.c | 11 +++++++++-- 11 files changed, 86 insertions(+), 17 deletions(-) diff --git a/lnet/ChangeLog b/lnet/ChangeLog index bd52cb2..5583576 100644 --- a/lnet/ChangeLog +++ b/lnet/ChangeLog @@ -13,8 +13,15 @@ xxxx-xx-xx Oracle, Inc. ptllnd - Portals 3.3 / UNICOS/lc 1.5.x, 2.0.x Severity : normal +Bugzilla : 21776 +Description: ptlrpcd stuck in lnet allocations under memory pressure +Details : Set PF_MEMALLOC on outgoing path to prevent deadlock on memory + allocation + +Severity : normal Bugzilla : 23575 -Description: fix o2iblnd v2 regression of credit deadlock with v1 peers (bug 14425). +Description: fix o2iblnd v2 regression of credit deadlock with v1 peers + (bug 14425). 
Severity : normal Bugzilla : 21456 diff --git a/lnet/include/libcfs/linux/linux-mem.h b/lnet/include/libcfs/linux/linux-mem.h index 3a21707..94e73a9 100644 --- a/lnet/include/libcfs/linux/linux-mem.h +++ b/lnet/include/libcfs/linux/linux-mem.h @@ -122,6 +122,25 @@ extern void __cfs_free_pages(cfs_page_t *page, unsigned int order); #define CFS_NUM_CACHEPAGES num_physpages #endif +static inline int libcfs_memory_pressure_get_and_set(void) +{ + int old = libcfs_memory_pressure_get(); + + if (!old) + libcfs_memory_pressure_set(); + return old; +} + +static inline void libcfs_memory_pressure_restore(int old) +{ + if (old) + libcfs_memory_pressure_set(); + else + libcfs_memory_pressure_clr(); + return; +} + + /* * In Linux there is no way to determine whether current execution context is * blockable. diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index d7835a0..9698532 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -194,6 +194,7 @@ typedef struct lnet_msg { lnet_process_id_t msg_target; __u32 msg_type; + unsigned int msg_vmflush:1; /* VM trying to free memory */ unsigned int msg_target_is_router:1; /* sending to a router */ unsigned int msg_routing:1; /* being forwarded */ unsigned int msg_ack:1; /* ack on finalize (PUT) */ diff --git a/lnet/klnds/ptllnd/ptllnd_cb.c b/lnet/klnds/ptllnd/ptllnd_cb.c index 8a0d67a..645512a 100644 --- a/lnet/klnds/ptllnd/ptllnd_cb.c +++ b/lnet/klnds/ptllnd/ptllnd_cb.c @@ -321,7 +321,8 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) unsigned int payload_offset = lntmsg->msg_offset; unsigned int payload_nob = lntmsg->msg_len; kptl_net_t *net = ni->ni_data; - kptl_peer_t *peer; + kptl_peer_t *peer = NULL; + int mpflag = 0; kptl_tx_t *tx; int nob; int nfrag; @@ -335,10 +336,13 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); LASSERT (!in_interrupt()); + if (lntmsg->msg_vmflush) + mpflag = 
libcfs_memory_pressure_get_and_set(); + rc = kptllnd_find_target(net, target, &peer); if (rc != 0) - return rc; - + goto out; + /* NB peer->peer_id does NOT always equal target, be careful with * which one to use */ switch (type) { @@ -416,7 +420,7 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) kptllnd_init_rdma_md(tx, lntmsg->msg_md->md_niov, NULL, lntmsg->msg_md->md_iov.kiov, 0, lntmsg->msg_md->md_length); - + tx->tx_lnet_msg = lntmsg; tx->tx_msg->ptlm_u.rdma.kptlrm_hdr = *hdr; kptllnd_init_msg (tx->tx_msg, PTLLND_MSG_TYPE_GET, @@ -470,7 +474,7 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) payload_offset, payload_nob); #endif } - + nob = offsetof(kptl_immediate_msg_t, kptlim_payload[payload_nob]); kptllnd_init_msg(tx->tx_msg, PTLLND_MSG_TYPE_IMMEDIATE, target, nob); @@ -486,7 +490,10 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) kptllnd_tx_launch(peer, tx, nfrag); out: - kptllnd_peer_decref(peer); + if (lntmsg->msg_vmflush) + libcfs_memory_pressure_restore(mpflag); + if (peer) + kptllnd_peer_decref(peer); return rc; } diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 29a71d8..53576de 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -926,6 +926,7 @@ ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id) int ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) { + int mpflag = 0; int type = lntmsg->msg_type; lnet_process_id_t target = lntmsg->msg_target; unsigned int payload_niov = lntmsg->msg_niov; @@ -956,10 +957,14 @@ ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) desc_size = offsetof(ksock_tx_t, tx_frags.paged.kiov[payload_niov]); + if (lntmsg->msg_vmflush) + mpflag = libcfs_memory_pressure_get_and_set(); tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size); if (tx == NULL) { CERROR("Can't allocate tx desc type %d size %d\n", type, desc_size); + if (lntmsg->msg_vmflush) + 
libcfs_memory_pressure_restore(mpflag); return (-ENOMEM); } @@ -990,6 +995,8 @@ ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) /* The first fragment will be set later in pro_pack */ rc = ksocknal_launch_packet(ni, tx, target); + if (lntmsg->msg_vmflush) + libcfs_memory_pressure_restore(mpflag); if (rc == 0) return (0); diff --git a/lnet/libcfs/tracefile.c b/lnet/libcfs/tracefile.c index 841d4bd..b5123cb 100644 --- a/lnet/libcfs/tracefile.c +++ b/lnet/libcfs/tracefile.c @@ -71,6 +71,10 @@ static struct trace_page *tage_alloc(int gfp) cfs_page_t *page; struct trace_page *tage; + /* My caller is trying to free memory */ + if (!cfs_in_interrupt() && libcfs_memory_pressure_get()) + return NULL; + /* * Don't spam console with allocation failures: they will be reported * by upper layer anyway. diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 2f9544a..eb4f286 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -2380,6 +2380,7 @@ LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, libcfs_id2str(target)); return -ENOMEM; } + msg->msg_vmflush = !!libcfs_memory_pressure_get(); LNET_LOCK(); diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h index 29d725a..1fb9110 100644 --- a/lustre/include/liblustre.h +++ b/lustre/include/liblustre.h @@ -751,6 +751,8 @@ typedef struct { volatile int counter; } atomic_t; #define libcfs_memory_pressure_get() (0) #define libcfs_memory_pressure_set() do {} while (0) #define libcfs_memory_pressure_clr() do {} while (0) +#define libcfs_memory_pressure_get_and_set() (0) +#define libcfs_memory_pressure_restore(old) do {} while (0) /* FIXME sys/capability will finally included linux/fs.h thus * cause numerous trouble on x86-64. 
as temporary solution for diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index 20196cd..8195fbb 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -319,6 +319,7 @@ struct ptlrpc_request { rq_replay:1, rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1, rq_no_delay:1, rq_net_err:1, rq_early:1, rq_must_unlink:1, + rq_memalloc:1, /* req originated from "kswapd" */ /* server-side flags */ rq_packed_final:1, /* packed final reply */ rq_hp:1, /* high priority RPC */ diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 434090e..7dd8667 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -2036,7 +2036,7 @@ static int lop_makes_rpc(struct client_obd *cli, struct loi_oap_pages *lop, if (cmd & OBD_BRW_WRITE) { /* trigger a write rpc stream as long as there are dirtiers * waiting for space. as they're waiting, they're not going to - * create more pages to coallesce with what's waiting.. */ + * create more pages to coalesce with what's waiting.. */ if (!list_empty(&cli->cl_cache_waiters)) { CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); RETURN(1); @@ -2339,11 +2339,14 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli, struct osc_async_page *oap; struct ldlm_lock *lock = NULL; obd_valid valid; - int i, rc; + int i, rc, mpflag = 0; ENTRY; LASSERT(!list_empty(rpc_list)); + if (cmd & OBD_BRW_MEMALLOC) + mpflag = libcfs_memory_pressure_get_and_set(); + OBD_ALLOC(pga, sizeof(*pga) * page_count); if (pga == NULL) RETURN(ERR_PTR(-ENOMEM)); @@ -2384,6 +2387,9 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli, oa = &((struct ost_body *)lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(struct ost_body)))->oa; + if (cmd & OBD_BRW_MEMALLOC) + req->rq_memalloc = 1; + /* Need to update the timestamps after the request is built in case * we race with setattr (locally or in queue at OST). 
If OST gets * later setattr before earlier BRW (as determined by the request xid), @@ -2415,6 +2421,9 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli, CFS_INIT_LIST_HEAD(rpc_list); out: + if (cmd & OBD_BRW_MEMALLOC) + libcfs_memory_pressure_restore(mpflag); + if (IS_ERR(req)) { if (oa) OBDO_FREE(oa); @@ -2434,8 +2443,9 @@ out: * \param cmd - OBD_BRW_* macroses * \param lop - pending pages * - * \return zero if pages successfully add to send queue. - * \return not zere if error occurring. + * \return zero if no page added to send queue. + * \return 1 if pages successfully added to send queue. + * \return negative on errors. */ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, int cmd, struct loi_oap_pages *lop) @@ -2448,7 +2458,7 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, CFS_LIST_HEAD(rpc_list); unsigned int ending_offset; unsigned starting_offset = 0; - int srvlock = 0; + int srvlock = 0, mem_tight = 0; ENTRY; /* If there are HP OAPs we need to handle at least 1 of them, @@ -2478,7 +2488,7 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, * until completion unlocks it. commit_write submits a page * as not ready because its unlock will happen unconditionally * as the call returns. if we race with commit_write giving - * us that page we dont' want to create a hole in the page + * us that page we don't want to create a hole in the page * stream, so we stop and leave the rpc to be fired by * another dirtier or kupdated interval (the not ready page * will still be on the dirty list). 
we could call in @@ -2561,6 +2571,8 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, /* now put the page back in our accounting */ list_add_tail(&oap->oap_rpc_item, &rpc_list); + if (oap->oap_brw_flags & OBD_BRW_MEMALLOC) + mem_tight = 1; if (page_count == 0) srvlock = !!(oap->oap_brw_flags & OBD_BRW_SRVLOCK); if (++page_count >= cli->cl_max_pages_per_rpc) @@ -2591,7 +2603,8 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, client_obd_list_unlock(&cli->cl_loi_list_lock); - req = osc_build_req(cli, &rpc_list, page_count, cmd); + req = osc_build_req(cli, &rpc_list, page_count, + mem_tight ? (cmd | OBD_BRW_MEMALLOC) : cmd); if (IS_ERR(req)) { /* this should happen rarely and is pretty bad, it makes the * pending list not follow the dirty order */ @@ -2766,7 +2779,7 @@ static void osc_check_rpcs(struct client_obd *cli) race_counter++; } - /* attempt some inter-object balancing by issueing rpcs + /* attempt some inter-object balancing by issuing rpcs * for each object in turn */ if (!list_empty(&loi->loi_hp_ready_item)) list_del_init(&loi->loi_hp_ready_item); diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 44b5711..25c1df5 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -488,6 +488,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) { int rc; int rc2; + int mpflag = 0; struct ptlrpc_connection *connection; lnet_handle_me_t reply_me_h; lnet_md_t reply_md; @@ -534,6 +535,9 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) if (request->rq_resend) lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT); + if (request->rq_memalloc) + mpflag = libcfs_memory_pressure_get_and_set(); + if (!noreply) { LASSERT (request->rq_replen != 0); if (request->rq_repbuf == NULL) @@ -633,11 +637,11 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) request->rq_request_portal, request->rq_xid, 0); if (rc == 0) - RETURN(rc); + GOTO(out, rc); 
ptlrpc_req_finished(request); if (noreply) - RETURN(rc); + GOTO(out, rc); cleanup_me: /* MEUnlink is safe; the PUT didn't even get off the ground, and @@ -657,6 +661,9 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) /* We do sync unlink here as there was no real transfer here so * the chance to have long unlink to sluggish net is smaller here. */ ptlrpc_unregister_bulk(request, 0); + out: + if (request->rq_memalloc) + libcfs_memory_pressure_restore(mpflag); return rc; } -- 1.8.3.1