ptllnd - Portals 3.3 / UNICOS/lc 1.5.x, 2.0.x
Severity : normal
+Bugzilla : 21776
+Description: ptlrpcd stuck in lnet allocations under memory pressure
+Details : Set PF_MEMALLOC on outgoing path to prevent deadlock on memory
+ allocation
+
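For context, the libcfs memory-pressure helpers used throughout this patch presumably wrap the Linux PF_MEMALLOC task flag. PF_MEMALLOC lets the current task dip into the kernel's emergency reserves, so allocations made while flushing dirty pages cannot deadlock waiting for the very memory they are trying to free. A minimal sketch, assuming the stock Linux definitions (the exact libcfs wrappers may differ):

    #include <linux/sched.h>        /* current, PF_MEMALLOC */

    static inline int libcfs_memory_pressure_get(void)
    {
            /* nonzero if this task is already flagged as freeing memory */
            return current->flags & PF_MEMALLOC;
    }

    static inline void libcfs_memory_pressure_set(void)
    {
            /* allow allocations on this path to use emergency reserves */
            current->flags |= PF_MEMALLOC;
    }

    static inline void libcfs_memory_pressure_clr(void)
    {
            current->flags &= ~PF_MEMALLOC;
    }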
+Severity : normal
Bugzilla : 23575
-Description: fix o2iblnd v2 regression of credit deadlock with v1 peers (bug 14425).
+Description: fix o2iblnd v2 regression of credit deadlock with v1 peers
+ (bug 14425).
Severity : normal
Bugzilla : 21456
#define CFS_NUM_CACHEPAGES num_physpages
#endif
+static inline int libcfs_memory_pressure_get_and_set(void)
+{
+ int old = libcfs_memory_pressure_get();
+
+ if (!old)
+ libcfs_memory_pressure_set();
+ return old;
+}
+
+static inline void libcfs_memory_pressure_restore(int old)
+{
+ if (old)
+ libcfs_memory_pressure_set();
+ else
+ libcfs_memory_pressure_clr();
+ return;
+}
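The pair above is nesting-safe by design: get_and_set() returns whether the flag was already set, and restore() puts it back to exactly that prior state rather than clearing a bit an outer caller may still depend on. The LND send paths below follow this idiom; a minimal usage sketch, where msg_is_vmflush is a hypothetical stand-in for the per-message flag added in this patch:

    int mpflag = 0;

    if (msg_is_vmflush)     /* message sent on behalf of VM writeout */
            mpflag = libcfs_memory_pressure_get_and_set();

    /* ... allocate descriptors and launch the message ... */

    if (msg_is_vmflush)     /* restore PF_MEMALLOC exactly as found */
            libcfs_memory_pressure_restore(mpflag);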
+
/*
* In Linux there is no way to determine whether current execution context is
* blockable.
lnet_process_id_t msg_target;
__u32 msg_type;
+ unsigned int msg_vmflush:1; /* VM trying to free memory */
unsigned int msg_target_is_router:1; /* sending to a router */
unsigned int msg_routing:1; /* being forwarded */
unsigned int msg_ack:1; /* ack on finalize (PUT) */
unsigned int payload_offset = lntmsg->msg_offset;
unsigned int payload_nob = lntmsg->msg_len;
kptl_net_t *net = ni->ni_data;
- kptl_peer_t *peer;
+ kptl_peer_t *peer = NULL;
+ int mpflag = 0;
kptl_tx_t *tx;
int nob;
int nfrag;
LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
LASSERT (!in_interrupt());
+ if (lntmsg->msg_vmflush)
+ mpflag = libcfs_memory_pressure_get_and_set();
+
rc = kptllnd_find_target(net, target, &peer);
if (rc != 0)
- return rc;
-
+ goto out;
+
/* NB peer->peer_id does NOT always equal target, be careful with
* which one to use */
switch (type) {
kptllnd_init_rdma_md(tx, lntmsg->msg_md->md_niov,
NULL, lntmsg->msg_md->md_iov.kiov,
0, lntmsg->msg_md->md_length);
-
+
tx->tx_lnet_msg = lntmsg;
tx->tx_msg->ptlm_u.rdma.kptlrm_hdr = *hdr;
kptllnd_init_msg (tx->tx_msg, PTLLND_MSG_TYPE_GET,
payload_offset, payload_nob);
#endif
}
-
+
nob = offsetof(kptl_immediate_msg_t, kptlim_payload[payload_nob]);
kptllnd_init_msg(tx->tx_msg, PTLLND_MSG_TYPE_IMMEDIATE, target, nob);
kptllnd_tx_launch(peer, tx, nfrag);
out:
- kptllnd_peer_decref(peer);
+ if (lntmsg->msg_vmflush)
+ libcfs_memory_pressure_restore(mpflag);
+ if (peer)
+ kptllnd_peer_decref(peer);
return rc;
}
int
ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
{
+ int mpflag = 0;
int type = lntmsg->msg_type;
lnet_process_id_t target = lntmsg->msg_target;
unsigned int payload_niov = lntmsg->msg_niov;
desc_size = offsetof(ksock_tx_t,
tx_frags.paged.kiov[payload_niov]);
+ if (lntmsg->msg_vmflush)
+ mpflag = libcfs_memory_pressure_get_and_set();
tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size);
if (tx == NULL) {
CERROR("Can't allocate tx desc type %d size %d\n",
type, desc_size);
+ if (lntmsg->msg_vmflush)
+ libcfs_memory_pressure_restore(mpflag);
return (-ENOMEM);
}
/* The first fragment will be set later in pro_pack */
rc = ksocknal_launch_packet(ni, tx, target);
+ if (lntmsg->msg_vmflush)
+ libcfs_memory_pressure_restore(mpflag);
if (rc == 0)
return (0);
cfs_page_t *page;
struct trace_page *tage;
+ /* My caller is trying to free memory */
+ if (!cfs_in_interrupt() && libcfs_memory_pressure_get())
+ return NULL;
+
/*
* Don't spam console with allocation failures: they will be reported
* by upper layer anyway.
libcfs_id2str(target));
return -ENOMEM;
}
+ msg->msg_vmflush = !!libcfs_memory_pressure_get();
LNET_LOCK();
#define libcfs_memory_pressure_get() (0)
#define libcfs_memory_pressure_set() do {} while (0)
#define libcfs_memory_pressure_clr() do {} while (0)
+#define libcfs_memory_pressure_get_and_set() (0)
+#define libcfs_memory_pressure_restore(old) do {} while (0)
/* FIXME sys/capability.h will eventually include linux/fs.h and thus
 * cause numerous troubles on x86-64. As a temporary solution for
rq_replay:1,
rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1,
rq_no_delay:1, rq_net_err:1, rq_early:1, rq_must_unlink:1,
+ rq_memalloc:1, /* req originated from "kswapd" */
/* server-side flags */
rq_packed_final:1, /* packed final reply */
rq_hp:1, /* high priority RPC */
if (cmd & OBD_BRW_WRITE) {
/* trigger a write rpc stream as long as there are dirtiers
* waiting for space. as they're waiting, they're not going to
- * create more pages to coallesce with what's waiting.. */
+ * create more pages to coalesce with what's waiting.. */
if (!list_empty(&cli->cl_cache_waiters)) {
CDEBUG(D_CACHE, "cache waiters forcing RPC\n");
RETURN(1);
struct osc_async_page *oap;
struct ldlm_lock *lock = NULL;
obd_valid valid;
- int i, rc;
+ int i, rc, mpflag = 0;
ENTRY;
LASSERT(!list_empty(rpc_list));
+ if (cmd & OBD_BRW_MEMALLOC)
+ mpflag = libcfs_memory_pressure_get_and_set();
+
OBD_ALLOC(pga, sizeof(*pga) * page_count);
if (pga == NULL)
RETURN(ERR_PTR(-ENOMEM));
oa = &((struct ost_body *)lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF,
sizeof(struct ost_body)))->oa;
+ if (cmd & OBD_BRW_MEMALLOC)
+ req->rq_memalloc = 1;
+
/* Need to update the timestamps after the request is built in case
* we race with setattr (locally or in queue at OST). If OST gets
* later setattr before earlier BRW (as determined by the request xid),
CFS_INIT_LIST_HEAD(rpc_list);
out:
+ if (cmd & OBD_BRW_MEMALLOC)
+ libcfs_memory_pressure_restore(mpflag);
+
if (IS_ERR(req)) {
if (oa)
OBDO_FREE(oa);
 * \param cmd - OBD_BRW_* macros
* \param lop - pending pages
*
- * \return zero if pages successfully add to send queue.
- * \return not zere if error occurring.
+ * \return zero if no pages were added to the send queue.
+ * \return 1 if pages were successfully added to the send queue.
+ * \return negative on error.
*/
static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
int cmd, struct loi_oap_pages *lop)
CFS_LIST_HEAD(rpc_list);
unsigned int ending_offset;
unsigned starting_offset = 0;
- int srvlock = 0;
+ int srvlock = 0, mem_tight = 0;
ENTRY;
/* If there are HP OAPs we need to handle at least 1 of them,
* until completion unlocks it. commit_write submits a page
* as not ready because its unlock will happen unconditionally
* as the call returns. if we race with commit_write giving
- * us that page we dont' want to create a hole in the page
+ * us that page we don't want to create a hole in the page
* stream, so we stop and leave the rpc to be fired by
* another dirtier or kupdated interval (the not ready page
* will still be on the dirty list). we could call in
/* now put the page back in our accounting */
list_add_tail(&oap->oap_rpc_item, &rpc_list);
+ if (oap->oap_brw_flags & OBD_BRW_MEMALLOC)
+ mem_tight = 1;
if (page_count == 0)
srvlock = !!(oap->oap_brw_flags & OBD_BRW_SRVLOCK);
if (++page_count >= cli->cl_max_pages_per_rpc)
client_obd_list_unlock(&cli->cl_loi_list_lock);
- req = osc_build_req(cli, &rpc_list, page_count, cmd);
+ req = osc_build_req(cli, &rpc_list, page_count,
+ mem_tight ? (cmd | OBD_BRW_MEMALLOC) : cmd);
if (IS_ERR(req)) {
/* this should happen rarely and is pretty bad, it makes the
* pending list not follow the dirty order */
race_counter++;
}
- /* attempt some inter-object balancing by issueing rpcs
+ /* attempt some inter-object balancing by issuing rpcs
* for each object in turn */
if (!list_empty(&loi->loi_hp_ready_item))
list_del_init(&loi->loi_hp_ready_item);
{
int rc;
int rc2;
+ int mpflag = 0;
struct ptlrpc_connection *connection;
lnet_handle_me_t reply_me_h;
lnet_md_t reply_md;
if (request->rq_resend)
lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT);
+ if (request->rq_memalloc)
+ mpflag = libcfs_memory_pressure_get_and_set();
+
if (!noreply) {
LASSERT (request->rq_replen != 0);
if (request->rq_repbuf == NULL)
request->rq_request_portal,
request->rq_xid, 0);
if (rc == 0)
- RETURN(rc);
+ GOTO(out, rc);
ptlrpc_req_finished(request);
if (noreply)
- RETURN(rc);
+ GOTO(out, rc);
cleanup_me:
/* MEUnlink is safe; the PUT didn't even get off the ground, and
/* We do a sync unlink here as there was no real transfer, so the
 * chance of a long unlink over a sluggish net is smaller. */
ptlrpc_unregister_bulk(request, 0);
+ out:
+ if (request->rq_memalloc)
+ libcfs_memory_pressure_restore(mpflag);
return rc;
}