b=21776 Set PF_MEMALLOC on outgoing path to prevent deadlock on memory allocation...

author Isaac Huang <he.h.huang@oracle.com>

Thu, 27 Jan 2011 13:49:27 +0000 (14:49 +0100)

committer Johann Lombardi <johann.lombardi@oracle.com>

Thu, 27 Jan 2011 13:49:27 +0000 (14:49 +0100)
author Isaac Huang <he.h.huang@oracle.com>
Thu, 27 Jan 2011 13:49:27 +0000 (14:49 +0100)
committer Johann Lombardi <johann.lombardi@oracle.com>
Thu, 27 Jan 2011 13:49:27 +0000 (14:49 +0100)
diff --git a/lnet/ChangeLog b/lnet/ChangeLog

index bd52cb2..5583576 100644 (file)
--- a/lnet/ChangeLog
+++ b/lnet/ChangeLog
@@ -13,8 +13,15 @@ xxxx-xx-xx Oracle, Inc.
           ptllnd    - Portals 3.3 / UNICOS/lc 1.5.x, 2.0.x
  
  Severity   : normal
+Bugzilla   : 21776
+Description: ptlrpcd stuck in lnet allocations under memory pressure
+Details    : Set PF_MEMALLOC on outgoing path to prevent deadlock on memory
+             allocation
+
+Severity   : normal
  Bugzilla   : 23575
-Description: fix o2iblnd v2 regression of credit deadlock with v1 peers (bug 14425).
+Description: fix o2iblnd v2 regression of credit deadlock with v1 peers
+             (bug 14425).
  
  Severity   : normal
  Bugzilla   : 21456
diff --git a/lnet/include/libcfs/linux/linux-mem.h b/lnet/include/libcfs/linux/linux-mem.h

index 3a21707..94e73a9 100644 (file)
--- a/lnet/include/libcfs/linux/linux-mem.h
+++ b/lnet/include/libcfs/linux/linux-mem.h
@@ -122,6 +122,25 @@ extern void __cfs_free_pages(cfs_page_t *page, unsigned int order);
  #define CFS_NUM_CACHEPAGES num_physpages
  #endif
  
+/*
+ * Make sure the memory-pressure flag is set for the caller and report what
+ * libcfs_memory_pressure_get() returned beforehand, so that the caller can
+ * hand that value to libcfs_memory_pressure_restore() once it is done.
+ */
+static inline int libcfs_memory_pressure_get_and_set(void)
+{
+        int prev = libcfs_memory_pressure_get();
+
+        /* Already set: leave it alone and just report the prior value. */
+        if (prev != 0)
+                return prev;
+
+        libcfs_memory_pressure_set();
+        return prev;
+}
+
+/*
+ * Put the memory-pressure flag back into the state described by 'old' (the
+ * value previously returned by libcfs_memory_pressure_get_and_set()):
+ * the flag is set when 'old' is non-zero and cleared otherwise.
+ */
+static inline void libcfs_memory_pressure_restore(int old)
+{
+        if (old == 0) {
+                libcfs_memory_pressure_clr();
+                return;
+        }
+
+        libcfs_memory_pressure_set();
+}
+
+
  /*
   * In Linux there is no way to determine whether current execution context is
   * blockable.
diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h

index d7835a0..9698532 100644 (file)
--- a/lnet/include/lnet/lib-types.h
+++ b/lnet/include/lnet/lib-types.h
@@ -194,6 +194,7 @@ typedef struct lnet_msg {
          lnet_process_id_t   msg_target;
          __u32               msg_type;
  
+        unsigned int        msg_vmflush:1;      /* VM trying to free memory */
          unsigned int        msg_target_is_router:1; /* sending to a router */
          unsigned int        msg_routing:1;      /* being forwarded */
          unsigned int        msg_ack:1;          /* ack on finalize (PUT) */
diff --git a/lnet/klnds/ptllnd/ptllnd_cb.c b/lnet/klnds/ptllnd/ptllnd_cb.c

index 8a0d67a..645512a 100644 (file)
--- a/lnet/klnds/ptllnd/ptllnd_cb.c
+++ b/lnet/klnds/ptllnd/ptllnd_cb.c
@@ -321,7 +321,8 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
          unsigned int      payload_offset = lntmsg->msg_offset;
          unsigned int      payload_nob = lntmsg->msg_len;
          kptl_net_t       *net = ni->ni_data;
-        kptl_peer_t      *peer;
+        kptl_peer_t      *peer = NULL;
+        int               mpflag = 0;
          kptl_tx_t        *tx;
          int               nob;
          int               nfrag;
@@ -335,10 +336,13 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
          LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
          LASSERT (!in_interrupt());
  
+        if (lntmsg->msg_vmflush)
+                mpflag = libcfs_memory_pressure_get_and_set();
+
          rc = kptllnd_find_target(net, target, &peer);
          if (rc != 0)
-                return rc;
-        
+                goto out;
+
          /* NB peer->peer_id does NOT always equal target, be careful with
           * which one to use */
          switch (type) {
@@ -416,7 +420,7 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
                          kptllnd_init_rdma_md(tx, lntmsg->msg_md->md_niov,
                                               NULL, lntmsg->msg_md->md_iov.kiov,
                                               0, lntmsg->msg_md->md_length);
-                
+
                  tx->tx_lnet_msg = lntmsg;
                  tx->tx_msg->ptlm_u.rdma.kptlrm_hdr = *hdr;
                  kptllnd_init_msg (tx->tx_msg, PTLLND_MSG_TYPE_GET,
@@ -470,7 +474,7 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
                                                  payload_offset, payload_nob);
  #endif
          }
-        
+
          nob = offsetof(kptl_immediate_msg_t, kptlim_payload[payload_nob]);
          kptllnd_init_msg(tx->tx_msg, PTLLND_MSG_TYPE_IMMEDIATE, target, nob);
  
@@ -486,7 +490,10 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
          kptllnd_tx_launch(peer, tx, nfrag);
  
   out:
-        kptllnd_peer_decref(peer);
+        if (lntmsg->msg_vmflush)
+                libcfs_memory_pressure_restore(mpflag);
+        if (peer)
+                kptllnd_peer_decref(peer);
          return rc;
  }
  
diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c

index 29a71d8..53576de 100644 (file)
--- a/lnet/klnds/socklnd/socklnd_cb.c
+++ b/lnet/klnds/socklnd/socklnd_cb.c
@@ -926,6 +926,7 @@ ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id)
  int
  ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
  {
+        int               mpflag = 0;
          int               type = lntmsg->msg_type;
          lnet_process_id_t target = lntmsg->msg_target;
          unsigned int      payload_niov = lntmsg->msg_niov;
@@ -956,10 +957,14 @@ ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
                  desc_size = offsetof(ksock_tx_t,
                                       tx_frags.paged.kiov[payload_niov]);
  
+        if (lntmsg->msg_vmflush)
+                mpflag = libcfs_memory_pressure_get_and_set();
          tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size);
          if (tx == NULL) {
                  CERROR("Can't allocate tx desc type %d size %d\n",
                         type, desc_size);
+                if (lntmsg->msg_vmflush)
+                        libcfs_memory_pressure_restore(mpflag);
                  return (-ENOMEM);
          }
  
@@ -990,6 +995,8 @@ ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
  
          /* The first fragment will be set later in pro_pack */
          rc = ksocknal_launch_packet(ni, tx, target);
+        if (lntmsg->msg_vmflush)
+                libcfs_memory_pressure_restore(mpflag);
          if (rc == 0)
                  return (0);
  
diff --git a/lnet/libcfs/tracefile.c b/lnet/libcfs/tracefile.c

index 841d4bd..b5123cb 100644 (file)
--- a/lnet/libcfs/tracefile.c
+++ b/lnet/libcfs/tracefile.c
@@ -71,6 +71,10 @@ static struct trace_page *tage_alloc(int gfp)
          cfs_page_t        *page;
          struct trace_page *tage;
  
+        /* My caller is trying to free memory */
+        if (!cfs_in_interrupt() && libcfs_memory_pressure_get())
+                return NULL;
+
          /*
           * Don't spam console with allocation failures: they will be reported
           * by upper layer anyway.
diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c

index 2f9544a..eb4f286 100644 (file)
--- a/lnet/lnet/lib-move.c
+++ b/lnet/lnet/lib-move.c
@@ -2380,6 +2380,7 @@ LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack,
                         libcfs_id2str(target));
                  return -ENOMEM;
          }
+        msg->msg_vmflush = !!libcfs_memory_pressure_get();
  
          LNET_LOCK();
  
diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h

index 29d725a..1fb9110 100644 (file)
--- a/lustre/include/liblustre.h
+++ b/lustre/include/liblustre.h
@@ -751,6 +751,8 @@ typedef struct { volatile int counter; } atomic_t;
  #define libcfs_memory_pressure_get() (0) 
  #define libcfs_memory_pressure_set() do {} while (0) 
  #define libcfs_memory_pressure_clr() do {} while (0)
+/*
+ * liblustre keeps no memory-pressure state, so these are stubs.  They must
+ * still match how callers use them: get_and_set() is used as an expression
+ * ("mpflag = libcfs_memory_pressure_get_and_set();") and therefore has to
+ * yield a value, and restore() is invoked with the saved flag as its single
+ * argument.
+ */
+#define libcfs_memory_pressure_get_and_set() (0)
+#define libcfs_memory_pressure_restore(old) do { (void)(old); } while (0)
  
  /* FIXME sys/capability will finally included linux/fs.h thus
   * cause numerous trouble on x86-64. as temporary solution for
diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h

index 20196cd..8195fbb 100644 (file)
--- a/lustre/include/lustre_net.h
+++ b/lustre/include/lustre_net.h
@@ -319,6 +319,7 @@ struct ptlrpc_request {
                  rq_replay:1,
                  rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1,
                  rq_no_delay:1, rq_net_err:1, rq_early:1, rq_must_unlink:1,
+                rq_memalloc:1,      /* req originated from "kswapd" */
                  /* server-side flags */
                  rq_packed_final:1,  /* packed final reply */
                  rq_hp:1,            /* high priority RPC */
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c

index 434090e..7dd8667 100644 (file)
--- a/lustre/osc/osc_request.c
+++ b/lustre/osc/osc_request.c
@@ -2036,7 +2036,7 @@ static int lop_makes_rpc(struct client_obd *cli, struct loi_oap_pages *lop,
          if (cmd & OBD_BRW_WRITE) {
                  /* trigger a write rpc stream as long as there are dirtiers
                   * waiting for space.  as they're waiting, they're not going to
-                 * create more pages to coallesce with what's waiting.. */
+                 * create more pages to coalesce with what's waiting.. */
                  if (!list_empty(&cli->cl_cache_waiters)) {
                          CDEBUG(D_CACHE, "cache waiters forcing RPC\n");
                          RETURN(1);
@@ -2339,11 +2339,14 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli,
          struct osc_async_page *oap;
          struct ldlm_lock *lock = NULL;
          obd_valid valid;
-        int i, rc;
+        int i, rc, mpflag = 0;
  
          ENTRY;
          LASSERT(!list_empty(rpc_list));
  
+        if (cmd & OBD_BRW_MEMALLOC)
+                mpflag = libcfs_memory_pressure_get_and_set();
+
          OBD_ALLOC(pga, sizeof(*pga) * page_count);
          if (pga == NULL)
                  RETURN(ERR_PTR(-ENOMEM));
@@ -2384,6 +2387,9 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli,
          oa = &((struct ost_body *)lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF,
                                                   sizeof(struct ost_body)))->oa;
  
+        if (cmd & OBD_BRW_MEMALLOC)
+                req->rq_memalloc = 1;
+
          /* Need to update the timestamps after the request is built in case
           * we race with setattr (locally or in queue at OST).  If OST gets
           * later setattr before earlier BRW (as determined by the request xid),
@@ -2415,6 +2421,9 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli,
          CFS_INIT_LIST_HEAD(rpc_list);
  
  out:
+        if (cmd & OBD_BRW_MEMALLOC)
+                libcfs_memory_pressure_restore(mpflag);
+
          if (IS_ERR(req)) {
                  if (oa)
                          OBDO_FREE(oa);
@@ -2434,8 +2443,9 @@ out:
   * \param cmd - OBD_BRW_* macroses
   * \param lop - pending pages
   *
- * \return zero if pages successfully add to send queue.
- * \return not zere if error occurring.
+ * \return zero if no page added to send queue.
+ * \return 1 if pages successfully added to send queue.
+ * \return negative on errors.
   */
  static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
                              int cmd, struct loi_oap_pages *lop)
@@ -2448,7 +2458,7 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
          CFS_LIST_HEAD(rpc_list);
          unsigned int ending_offset;
          unsigned  starting_offset = 0;
-        int srvlock = 0;
+        int srvlock = 0, mem_tight = 0;
          ENTRY;
  
          /* If there are HP OAPs we need to handle at least 1 of them,
@@ -2478,7 +2488,7 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
                   * until completion unlocks it.  commit_write submits a page
                   * as not ready because its unlock will happen unconditionally
                   * as the call returns.  if we race with commit_write giving
-                 * us that page we dont' want to create a hole in the page
+                 * us that page we don't want to create a hole in the page
                   * stream, so we stop and leave the rpc to be fired by
                   * another dirtier or kupdated interval (the not ready page
                   * will still be on the dirty list).  we could call in
@@ -2561,6 +2571,8 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
  
                  /* now put the page back in our accounting */
                  list_add_tail(&oap->oap_rpc_item, &rpc_list);
+                if (oap->oap_brw_flags & OBD_BRW_MEMALLOC)
+                        mem_tight = 1;
                  if (page_count == 0)
                          srvlock = !!(oap->oap_brw_flags & OBD_BRW_SRVLOCK);
                  if (++page_count >= cli->cl_max_pages_per_rpc)
@@ -2591,7 +2603,8 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
  
          client_obd_list_unlock(&cli->cl_loi_list_lock);
  
-        req = osc_build_req(cli, &rpc_list, page_count, cmd);
+        req = osc_build_req(cli, &rpc_list, page_count,
+                            mem_tight ? (cmd | OBD_BRW_MEMALLOC) : cmd);
          if (IS_ERR(req)) {
                  /* this should happen rarely and is pretty bad, it makes the
                   * pending list not follow the dirty order */
@@ -2766,7 +2779,7 @@ static void osc_check_rpcs(struct client_obd *cli)
                                  race_counter++;
                  }
  
-                /* attempt some inter-object balancing by issueing rpcs
+                /* attempt some inter-object balancing by issuing rpcs
                   * for each object in turn */
                  if (!list_empty(&loi->loi_hp_ready_item))
                          list_del_init(&loi->loi_hp_ready_item);
diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c

index 44b5711..25c1df5 100644 (file)
--- a/lustre/ptlrpc/niobuf.c
+++ b/lustre/ptlrpc/niobuf.c
@@ -488,6 +488,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
  {
          int rc;
          int rc2;
+        int mpflag = 0;
          struct ptlrpc_connection *connection;
          lnet_handle_me_t  reply_me_h;
          lnet_md_t         reply_md;
@@ -534,6 +535,9 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
          if (request->rq_resend)
                  lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT);
  
+        if (request->rq_memalloc)
+                mpflag = libcfs_memory_pressure_get_and_set();
+
          if (!noreply) {
                  LASSERT (request->rq_replen != 0);
                  if (request->rq_repbuf == NULL)
@@ -633,11 +637,11 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
                            request->rq_request_portal,
                            request->rq_xid, 0);
          if (rc == 0)
-                RETURN(rc);
+                GOTO(out, rc);
  
          ptlrpc_req_finished(request);
          if (noreply)
-                RETURN(rc);
+                GOTO(out, rc);
  
   cleanup_me:
          /* MEUnlink is safe; the PUT didn't even get off the ground, and
@@ -657,6 +661,9 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
          /* We do sync unlink here as there was no real transfer here so
           * the chance to have long unlink to sluggish net is smaller here. */
          ptlrpc_unregister_bulk(request, 0);
+ out:
+        if (request->rq_memalloc)
+                libcfs_memory_pressure_restore(mpflag);
          return rc;
  }
author	Isaac Huang <he.h.huang@oracle.com>
	Thu, 27 Jan 2011 13:49:27 +0000 (14:49 +0100)
committer	Johann Lombardi <johann.lombardi@oracle.com>
	Thu, 27 Jan 2011 13:49:27 +0000 (14:49 +0100)
lnet/ChangeLog		patch \| blob \| history
lnet/include/libcfs/linux/linux-mem.h		patch \| blob \| history
lnet/include/lnet/lib-types.h		patch \| blob \| history
lnet/klnds/ptllnd/ptllnd_cb.c		patch \| blob \| history
lnet/klnds/socklnd/socklnd_cb.c		patch \| blob \| history
lnet/libcfs/tracefile.c		patch \| blob \| history
lnet/lnet/lib-move.c		patch \| blob \| history
lustre/include/liblustre.h		patch \| blob \| history
lustre/include/lustre_net.h		patch \| blob \| history
lustre/osc/osc_request.c		patch \| blob \| history
lustre/ptlrpc/niobuf.c		patch \| blob \| history