X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lnet%2Flnet%2Flib-move.c;h=932591cc7c9272aece60172ac96bf84270bd6ea0;hb=ddd2b0e4c4170071948e27bf4e4dcd81ef24291e;hp=156a09a8289e9efc6616f13791b745f0bcfde6f4;hpb=14a611ad680e89523abbcab0a3310511ab808ba8;p=fs%2Flustre-release.git diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 156a09a..932591c 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -26,7 +26,7 @@ * GPL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. */ /* @@ -155,6 +155,7 @@ lnet_match_md(int index, int op_mask, lnet_process_id_t src, lnet_libmd_t **md_out) { lnet_portal_t *ptl = &the_lnet.ln_portals[index]; + cfs_list_t *head; lnet_me_t *me; lnet_me_t *tmp; lnet_libmd_t *md; @@ -169,7 +170,11 @@ lnet_match_md(int index, int op_mask, lnet_process_id_t src, return LNET_MATCHMD_DROP; } - cfs_list_for_each_entry_safe_typed (me, tmp, &ptl->ptl_ml, + head = lnet_portal_me_head(index, src, match_bits); + if (head == NULL) /* nobody posted anything on this portal */ + goto out; + + cfs_list_for_each_entry_safe_typed (me, tmp, head, lnet_me_t, me_list) { md = me->me_md; @@ -199,8 +204,9 @@ lnet_match_md(int index, int op_mask, lnet_process_id_t src, /* not reached */ } + out: if (op_mask == LNET_MD_OP_GET || - (ptl->ptl_options & LNET_PTL_LAZY) == 0) + !lnet_portal_is_lazy(ptl)) return LNET_MATCHMD_DROP; return LNET_MATCHMD_NONE; @@ -209,10 +215,10 @@ lnet_match_md(int index, int op_mask, lnet_process_id_t src, int lnet_fail_nid (lnet_nid_t nid, unsigned int threshold) { - lnet_test_peer_t *tp; - struct list_head *el; - struct list_head *next; - struct list_head cull; + lnet_test_peer_t *tp; + cfs_list_t *el; + cfs_list_t *next; + cfs_list_t cull; LASSERT (the_lnet.ln_init); @@ -226,7 +232,7 @@ lnet_fail_nid (lnet_nid_t nid, unsigned int threshold) tp->tp_threshold = threshold; 
LNET_LOCK(); - list_add_tail (&tp->tp_list, &the_lnet.ln_test_peers); + cfs_list_add_tail (&tp->tp_list, &the_lnet.ln_test_peers); LNET_UNLOCK(); return 0; } @@ -236,24 +242,24 @@ lnet_fail_nid (lnet_nid_t nid, unsigned int threshold) LNET_LOCK(); - list_for_each_safe (el, next, &the_lnet.ln_test_peers) { - tp = list_entry (el, lnet_test_peer_t, tp_list); + cfs_list_for_each_safe (el, next, &the_lnet.ln_test_peers) { + tp = cfs_list_entry (el, lnet_test_peer_t, tp_list); if (tp->tp_threshold == 0 || /* needs culling anyway */ nid == LNET_NID_ANY || /* removing all entries */ tp->tp_nid == nid) /* matched this one */ { - list_del (&tp->tp_list); - list_add (&tp->tp_list, &cull); + cfs_list_del (&tp->tp_list); + cfs_list_add (&tp->tp_list, &cull); } } LNET_UNLOCK(); - while (!list_empty (&cull)) { - tp = list_entry (cull.next, lnet_test_peer_t, tp_list); + while (!cfs_list_empty (&cull)) { + tp = cfs_list_entry (cull.next, lnet_test_peer_t, tp_list); - list_del (&tp->tp_list); + cfs_list_del (&tp->tp_list); LIBCFS_FREE(tp, sizeof (*tp)); } return 0; @@ -262,18 +268,18 @@ lnet_fail_nid (lnet_nid_t nid, unsigned int threshold) static int fail_peer (lnet_nid_t nid, int outgoing) { - lnet_test_peer_t *tp; - struct list_head *el; - struct list_head *next; - struct list_head cull; + lnet_test_peer_t *tp; + cfs_list_t *el; + cfs_list_t *next; + cfs_list_t cull; int fail = 0; CFS_INIT_LIST_HEAD (&cull); LNET_LOCK(); - list_for_each_safe (el, next, &the_lnet.ln_test_peers) { - tp = list_entry (el, lnet_test_peer_t, tp_list); + cfs_list_for_each_safe (el, next, &the_lnet.ln_test_peers) { + tp = cfs_list_entry (el, lnet_test_peer_t, tp_list); if (tp->tp_threshold == 0) { /* zombie entry */ @@ -281,8 +287,8 @@ fail_peer (lnet_nid_t nid, int outgoing) /* only cull zombies on outgoing tests, * since we may be at interrupt priority on * incoming messages. 
*/ - list_del (&tp->tp_list); - list_add (&tp->tp_list, &cull); + cfs_list_del (&tp->tp_list); + cfs_list_add (&tp->tp_list, &cull); } continue; } @@ -296,8 +302,8 @@ fail_peer (lnet_nid_t nid, int outgoing) if (outgoing && tp->tp_threshold == 0) { /* see above */ - list_del (&tp->tp_list); - list_add (&tp->tp_list, &cull); + cfs_list_del (&tp->tp_list); + cfs_list_add (&tp->tp_list, &cull); } } break; @@ -306,9 +312,9 @@ fail_peer (lnet_nid_t nid, int outgoing) LNET_UNLOCK (); - while (!list_empty (&cull)) { - tp = list_entry (cull.next, lnet_test_peer_t, tp_list); - list_del (&tp->tp_list); + while (!cfs_list_empty (&cull)) { + tp = cfs_list_entry (cull.next, lnet_test_peer_t, tp_list); + cfs_list_del (&tp->tp_list); LIBCFS_FREE(tp, sizeof (*tp)); } @@ -497,7 +503,7 @@ lnet_copy_kiov2kiov (unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset if (nob == 0) return; - LASSERT (!in_interrupt ()); + LASSERT (!cfs_in_interrupt ()); LASSERT (ndiov > 0); while (doffset >= diov->kiov_len) { @@ -577,7 +583,7 @@ lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, unsigned int iovoffset if (nob == 0) return; - LASSERT (!in_interrupt ()); + LASSERT (!cfs_in_interrupt ()); LASSERT (niov > 0); while (iovoffset >= iov->iov_len) { @@ -646,7 +652,7 @@ lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffs if (nob == 0) return; - LASSERT (!in_interrupt ()); + LASSERT (!cfs_in_interrupt ()); LASSERT (nkiov > 0); while (kiovoffset >= kiov->kiov_len) { @@ -761,7 +767,7 @@ lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, lnet_kiov_t *kiov = NULL; int rc; - LASSERT (!in_interrupt ()); + LASSERT (!cfs_in_interrupt ()); LASSERT (mlen == 0 || msg != NULL); if (msg != NULL) { @@ -864,7 +870,7 @@ lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg) void *priv = msg->msg_private; int rc; - LASSERT (!in_interrupt ()); + LASSERT (!cfs_in_interrupt ()); LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND || (msg->msg_txcredit && 
msg->msg_peertxcredit)); @@ -915,8 +921,7 @@ lnet_ni_peer_alive(lnet_peer_t *lp) cfs_time_t last_alive = 0; lnet_ni_t *ni = lp->lp_ni; - LASSERT (ni != NULL); - LASSERT (ni->ni_peertimeout > 0); + LASSERT (lnet_peer_aliveness_enabled(lp)); LASSERT (ni->ni_lnd->lnd_query != NULL); LNET_UNLOCK(); @@ -934,12 +939,10 @@ lnet_ni_peer_alive(lnet_peer_t *lp) static inline int lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now) { - lnet_ni_t *ni = lp->lp_ni; - cfs_time_t deadline; - int alive; + int alive; + cfs_time_t deadline; - LASSERT (ni != NULL); - LASSERT (ni->ni_peertimeout > 0); + LASSERT (lnet_peer_aliveness_enabled(lp)); /* Trust lnet_notify() if it has more recent aliveness news, but * ignore the initial assumed death (see lnet_peers_start_down()). @@ -949,12 +952,15 @@ lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now) return 0; deadline = cfs_time_add(lp->lp_last_alive, - cfs_time_seconds(ni->ni_peertimeout)); + cfs_time_seconds(lp->lp_ni->ni_peertimeout)); alive = cfs_time_after(deadline, now); - /* Update obsolete lp_alive */ - if (alive && !lp->lp_alive && lp->lp_timestamp != 0 && - cfs_time_before(lp->lp_timestamp, lp->lp_last_alive)) + /* Update obsolete lp_alive except for routers assumed to be dead + * initially, because router checker would update aliveness in this + * case, and moreover lp_last_alive at peer creation is assumed. 
+ */ + if (alive && !lp->lp_alive && + !(lnet_isrouter(lp) && lp->lp_alive_count == 0)) lnet_notify_locked(lp, 0, 1, lp->lp_last_alive); return alive; @@ -966,12 +972,9 @@ lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now) int lnet_peer_alive_locked (lnet_peer_t *lp) { - lnet_ni_t *ni = lp->lp_ni; - cfs_time_t now = cfs_time_current(); + cfs_time_t now = cfs_time_current(); - LASSERT (ni != NULL); - - if (ni->ni_peertimeout <= 0) /* disabled */ + if (!lnet_peer_aliveness_enabled(lp)) return -ENODEV; if (lnet_peer_is_alive(lp, now)) @@ -992,7 +995,8 @@ lnet_peer_alive_locked (lnet_peer_t *lp) "%d < %d (%d/%d)\n", libcfs_nid2str(lp->lp_nid), (int)now, (int)next_query, - lnet_queryinterval, ni->ni_peertimeout); + lnet_queryinterval, + lp->lp_ni->ni_peertimeout); return 0; } } @@ -1024,10 +1028,12 @@ lnet_post_send_locked (lnet_msg_t *msg, int do_send) /* NB 'lp' is always the next hop */ if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 && lnet_peer_alive_locked(lp) == 0) { + the_lnet.ln_counters.drop_count++; + the_lnet.ln_counters.drop_length += msg->msg_len; LNET_UNLOCK(); - CDEBUG(D_NETERROR, "Dropping message for %s: peer not alive\n", - libcfs_id2str(msg->msg_target)); + CNETERR("Dropping message for %s: peer not alive\n", + libcfs_id2str(msg->msg_target)); if (do_send) lnet_finalize(ni, msg, -EHOSTUNREACH); @@ -1036,7 +1042,8 @@ lnet_post_send_locked (lnet_msg_t *msg, int do_send) } if (!msg->msg_peertxcredit) { - LASSERT ((lp->lp_txcredits < 0) == !list_empty(&lp->lp_txq)); + LASSERT ((lp->lp_txcredits < 0) == + !cfs_list_empty(&lp->lp_txq)); msg->msg_peertxcredit = 1; lp->lp_txqnob += msg->msg_len + sizeof(lnet_hdr_t); @@ -1047,13 +1054,14 @@ lnet_post_send_locked (lnet_msg_t *msg, int do_send) if (lp->lp_txcredits < 0) { msg->msg_delayed = 1; - list_add_tail (&msg->msg_list, &lp->lp_txq); + cfs_list_add_tail(&msg->msg_list, &lp->lp_txq); return EAGAIN; } } if (!msg->msg_txcredit) { - LASSERT ((ni->ni_txcredits < 0) == !list_empty(&ni->ni_txq)); + LASSERT 
((ni->ni_txcredits < 0) == + !cfs_list_empty(&ni->ni_txq)); msg->msg_txcredit = 1; ni->ni_txcredits--; @@ -1063,7 +1071,7 @@ lnet_post_send_locked (lnet_msg_t *msg, int do_send) if (ni->ni_txcredits < 0) { msg->msg_delayed = 1; - list_add_tail (&msg->msg_list, &ni->ni_txq); + cfs_list_add_tail(&msg->msg_list, &ni->ni_txq); return EAGAIN; } } @@ -1094,7 +1102,7 @@ lnet_commit_routedmsg (lnet_msg_t *msg) LASSERT (!msg->msg_onactivelist); msg->msg_onactivelist = 1; - list_add (&msg->msg_activelist, &the_lnet.ln_active_msgs); + cfs_list_add (&msg->msg_activelist, &the_lnet.ln_active_msgs); } lnet_rtrbufpool_t * @@ -1132,7 +1140,8 @@ lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv) LASSERT (!do_recv || msg->msg_delayed); if (!msg->msg_peerrtrcredit) { - LASSERT ((lp->lp_rtrcredits < 0) == !list_empty(&lp->lp_rtrq)); + LASSERT ((lp->lp_rtrcredits < 0) == + !cfs_list_empty(&lp->lp_rtrq)); msg->msg_peerrtrcredit = 1; lp->lp_rtrcredits--; @@ -1142,7 +1151,7 @@ lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv) if (lp->lp_rtrcredits < 0) { /* must have checked eager_recv before here */ LASSERT (msg->msg_delayed); - list_add_tail(&msg->msg_list, &lp->lp_rtrq); + cfs_list_add_tail(&msg->msg_list, &lp->lp_rtrq); return EAGAIN; } } @@ -1150,7 +1159,8 @@ lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv) rbp = lnet_msg2bufpool(msg); if (!msg->msg_rtrcredit) { - LASSERT ((rbp->rbp_credits < 0) == !list_empty(&rbp->rbp_msgs)); + LASSERT ((rbp->rbp_credits < 0) == + !cfs_list_empty(&rbp->rbp_msgs)); msg->msg_rtrcredit = 1; rbp->rbp_credits--; @@ -1160,14 +1170,14 @@ lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv) if (rbp->rbp_credits < 0) { /* must have checked eager_recv before here */ LASSERT (msg->msg_delayed); - list_add_tail(&msg->msg_list, &rbp->rbp_msgs); + cfs_list_add_tail(&msg->msg_list, &rbp->rbp_msgs); return EAGAIN; } } - LASSERT (!list_empty(&rbp->rbp_bufs)); - rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list); - 
list_del(&rb->rb_list); + LASSERT (!cfs_list_empty(&rbp->rbp_bufs)); + rb = cfs_list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list); + cfs_list_del(&rb->rb_list); msg->msg_niov = rbp->rbp_npages; msg->msg_kiov = &rb->rb_kiov[0]; @@ -1195,12 +1205,13 @@ lnet_return_credits_locked (lnet_msg_t *msg) msg->msg_txcredit = 0; ni = txpeer->lp_ni; - LASSERT((ni->ni_txcredits < 0) == !list_empty(&ni->ni_txq)); + LASSERT((ni->ni_txcredits < 0) == !cfs_list_empty(&ni->ni_txq)); ni->ni_txcredits++; if (ni->ni_txcredits <= 0) { - msg2 = list_entry(ni->ni_txq.next, lnet_msg_t, msg_list); - list_del(&msg2->msg_list); + msg2 = cfs_list_entry(ni->ni_txq.next, lnet_msg_t, + msg_list); + cfs_list_del(&msg2->msg_list); LASSERT(msg2->msg_txpeer->lp_ni == ni); LASSERT(msg2->msg_delayed); @@ -1213,16 +1224,17 @@ lnet_return_credits_locked (lnet_msg_t *msg) /* give back peer txcredits */ msg->msg_peertxcredit = 0; - LASSERT((txpeer->lp_txcredits < 0) == !list_empty(&txpeer->lp_txq)); + LASSERT((txpeer->lp_txcredits < 0) == + !cfs_list_empty(&txpeer->lp_txq)); txpeer->lp_txqnob -= msg->msg_len + sizeof(lnet_hdr_t); LASSERT (txpeer->lp_txqnob >= 0); txpeer->lp_txcredits++; if (txpeer->lp_txcredits <= 0) { - msg2 = list_entry(txpeer->lp_txq.next, - lnet_msg_t, msg_list); - list_del(&msg2->msg_list); + msg2 = cfs_list_entry(txpeer->lp_txq.next, + lnet_msg_t, msg_list); + cfs_list_del(&msg2->msg_list); LASSERT (msg2->msg_txpeer == txpeer); LASSERT (msg2->msg_delayed); @@ -1247,22 +1259,24 @@ lnet_return_credits_locked (lnet_msg_t *msg) * itself */ LASSERT (msg->msg_kiov != NULL); - rb = list_entry(msg->msg_kiov, lnet_rtrbuf_t, rb_kiov[0]); + rb = cfs_list_entry(msg->msg_kiov, lnet_rtrbuf_t, rb_kiov[0]); rbp = rb->rb_pool; LASSERT (rbp == lnet_msg2bufpool(msg)); msg->msg_kiov = NULL; msg->msg_rtrcredit = 0; - LASSERT((rbp->rbp_credits < 0) == !list_empty(&rbp->rbp_msgs)); - LASSERT((rbp->rbp_credits > 0) == !list_empty(&rbp->rbp_bufs)); + LASSERT((rbp->rbp_credits < 0) == + 
!cfs_list_empty(&rbp->rbp_msgs)); + LASSERT((rbp->rbp_credits > 0) == + !cfs_list_empty(&rbp->rbp_bufs)); - list_add(&rb->rb_list, &rbp->rbp_bufs); + cfs_list_add(&rb->rb_list, &rbp->rbp_bufs); rbp->rbp_credits++; if (rbp->rbp_credits <= 0) { - msg2 = list_entry(rbp->rbp_msgs.next, - lnet_msg_t, msg_list); - list_del(&msg2->msg_list); + msg2 = cfs_list_entry(rbp->rbp_msgs.next, + lnet_msg_t, msg_list); + cfs_list_del(&msg2->msg_list); (void) lnet_post_routed_recv_locked(msg2, 1); } @@ -1272,13 +1286,14 @@ lnet_return_credits_locked (lnet_msg_t *msg) /* give back peer router credits */ msg->msg_peerrtrcredit = 0; - LASSERT((rxpeer->lp_rtrcredits < 0) == !list_empty(&rxpeer->lp_rtrq)); + LASSERT((rxpeer->lp_rtrcredits < 0) == + !cfs_list_empty(&rxpeer->lp_rtrq)); rxpeer->lp_rtrcredits++; if (rxpeer->lp_rtrcredits <= 0) { - msg2 = list_entry(rxpeer->lp_rtrq.next, - lnet_msg_t, msg_list); - list_del(&msg2->msg_list); + msg2 = cfs_list_entry(rxpeer->lp_rtrq.next, + lnet_msg_t, msg_list); + cfs_list_del(&msg2->msg_list); (void) lnet_post_routed_recv_locked(msg2, 1); } @@ -1302,7 +1317,7 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg) lnet_remotenet_t *rnet; lnet_route_t *route; lnet_route_t *best_route; - struct list_head *tmp; + cfs_list_t *tmp; lnet_peer_t *lp; lnet_peer_t *lp2; int rc; @@ -1329,8 +1344,9 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg) src_ni = lnet_nid2ni_locked(src_nid); if (src_ni == NULL) { LNET_UNLOCK(); - CERROR("Can't send to %s: src %s is not a local nid\n", - libcfs_nid2str(dst_nid), libcfs_nid2str(src_nid)); + LCONSOLE_WARN("Can't send to %s: src %s is not a " + "local nid\n", libcfs_nid2str(dst_nid), + libcfs_nid2str(src_nid)); return -EINVAL; } LASSERT (!msg->msg_routing); @@ -1349,8 +1365,9 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg) lnet_ni_decref_locked(local_ni); lnet_ni_decref_locked(src_ni); LNET_UNLOCK(); - CERROR("No route to %s via from %s\n", - libcfs_nid2str(dst_nid), libcfs_nid2str(src_nid)); + LCONSOLE_WARN("No route 
to %s via from %s\n", + libcfs_nid2str(dst_nid), + libcfs_nid2str(src_nid)); return -EINVAL; } @@ -1371,8 +1388,8 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg) lnet_ni_decref_locked(src_ni); /* lp has ref on src_ni; lose mine */ if (rc != 0) { LNET_UNLOCK(); - CERROR("Error %d finding peer %s\n", rc, - libcfs_nid2str(dst_nid)); + LCONSOLE_WARN("Error %d finding peer %s\n", rc, + libcfs_nid2str(dst_nid)); /* ENOMEM or shutting down */ return rc; } @@ -1397,15 +1414,16 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg) if (src_ni != NULL) lnet_ni_decref_locked(src_ni); LNET_UNLOCK(); - CERROR("No route to %s\n", libcfs_id2str(msg->msg_target)); + LCONSOLE_WARN("No route to %s\n", + libcfs_id2str(msg->msg_target)); return -EHOSTUNREACH; } /* Find the best gateway I can use */ lp = NULL; best_route = NULL; - list_for_each(tmp, &rnet->lrn_routes) { - route = list_entry(tmp, lnet_route_t, lr_list); + cfs_list_for_each(tmp, &rnet->lrn_routes) { + route = cfs_list_entry(tmp, lnet_route_t, lr_list); lp2 = route->lr_gateway; if (lp2->lp_alive && @@ -1423,16 +1441,17 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg) lnet_ni_decref_locked(src_ni); LNET_UNLOCK(); - CERROR("No route to %s via %s (all routers down)\n", - libcfs_id2str(msg->msg_target), - libcfs_nid2str(src_nid)); + LCONSOLE_WARN("No route to %s via %s " + "(all routers down)\n", + libcfs_id2str(msg->msg_target), + libcfs_nid2str(src_nid)); return -EHOSTUNREACH; } /* Place selected route at the end of the route list to ensure * fairness; everything else being equal... 
*/ - list_del(&best_route->lr_list); - list_add_tail(&best_route->lr_list, &rnet->lrn_routes); + cfs_list_del(&best_route->lr_list); + cfs_list_add_tail(&best_route->lr_list, &rnet->lrn_routes); if (src_ni == NULL) { src_ni = lp->lp_ni; @@ -1501,7 +1520,7 @@ lnet_commit_md (lnet_libmd_t *md, lnet_msg_t *msg) LASSERT (!msg->msg_onactivelist); msg->msg_onactivelist = 1; - list_add (&msg->msg_activelist, &the_lnet.ln_active_msgs); + cfs_list_add (&msg->msg_activelist, &the_lnet.ln_active_msgs); } static void @@ -1554,6 +1573,34 @@ lnet_drop_delayed_put(lnet_msg_t *msg, char *reason) LNET_UNLOCK(); } +/** + * Turn on the lazy portal attribute. Use with caution! + * + * This portal attribute only affects incoming PUT requests to the portal, + * and is off by default. By default, if there's no matching MD for an + * incoming PUT request, it is simply dropped. With the lazy attribute on, + * such requests are queued indefinitely until either a matching MD is + * posted to the portal or the lazy attribute is turned off. + * + * This prevents dropped requests, but it should be regarded as the last + * line of defense - i.e. users must keep a close watch on the number of + * active buffers on a lazy portal and, once it becomes too low, post more + * buffers as soon as possible. This is because delayed requests usually have + * detrimental effects on underlying network connections. A few delayed + * requests often suffice to bring an underlying connection to a complete + * halt, due to flow control mechanisms. + * + * There's also a DoS attack risk. If users don't post match-all MDs on a + * lazy portal, a malicious peer can easily stop a service by sending some + * PUT requests with match bits that won't match any MD. A routed server is + * especially vulnerable since the connections to its neighbor routers are + * shared among all clients. + * + * \param portal Index of the portal to enable the lazy attribute on. + * + * \retval 0 On success.
+ * \retval -EINVAL If \a portal is not a valid index. + */ int LNetSetLazyPortal(int portal) { @@ -1565,18 +1612,25 @@ LNetSetLazyPortal(int portal) CDEBUG(D_NET, "Setting portal %d lazy\n", portal); LNET_LOCK(); - - ptl->ptl_options |= LNET_PTL_LAZY; - + lnet_portal_setopt(ptl, LNET_PTL_LAZY); LNET_UNLOCK(); return 0; } +/** + * Turn off the lazy portal attribute. Delayed requests on the portal, + * if any, will be all dropped when this function returns. + * + * \param portal Index of the portal to disable the lazy attribute on. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is not a valid index. + */ int LNetClearLazyPortal(int portal) { - struct list_head zombies; + cfs_list_t zombies; lnet_portal_t *ptl = &the_lnet.ln_portals[portal]; lnet_msg_t *msg; @@ -1585,7 +1639,7 @@ LNetClearLazyPortal(int portal) LNET_LOCK(); - if ((ptl->ptl_options & LNET_PTL_LAZY) == 0) { + if (!lnet_portal_is_lazy(ptl)) { LNET_UNLOCK(); return 0; } @@ -1596,17 +1650,17 @@ LNetClearLazyPortal(int portal) CDEBUG (D_NET, "clearing portal %d lazy\n", portal); /* grab all the blocked messages atomically */ - list_add(&zombies, &ptl->ptl_msgq); - list_del_init(&ptl->ptl_msgq); + cfs_list_add(&zombies, &ptl->ptl_msgq); + cfs_list_del_init(&ptl->ptl_msgq); ptl->ptl_msgq_version++; - ptl->ptl_options &= ~LNET_PTL_LAZY; + lnet_portal_unsetopt(ptl, LNET_PTL_LAZY); LNET_UNLOCK(); - while (!list_empty(&zombies)) { - msg = list_entry(zombies.next, lnet_msg_t, msg_list); - list_del(&msg->msg_list); + while (!cfs_list_empty(&zombies)) { + msg = cfs_list_entry(zombies.next, lnet_msg_t, msg_list); + cfs_list_del(&msg->msg_list); lnet_drop_delayed_put(msg, "Clearing lazy portal attr"); } @@ -1652,22 +1706,23 @@ lnet_match_blocked_msg(lnet_libmd_t *md) { CFS_LIST_HEAD (drops); CFS_LIST_HEAD (matches); - struct list_head *tmp; - struct list_head *entry; + cfs_list_t *tmp; + cfs_list_t *entry; lnet_msg_t *msg; + lnet_portal_t *ptl; lnet_me_t *me = md->md_me; - lnet_portal_t *ptl = 
&the_lnet.ln_portals[me->me_portal]; LASSERT (me->me_portal < (unsigned int)the_lnet.ln_nportals); - if ((ptl->ptl_options & LNET_PTL_LAZY) == 0) { - LASSERT (list_empty(&ptl->ptl_msgq)); + ptl = &the_lnet.ln_portals[me->me_portal]; + if (!lnet_portal_is_lazy(ptl)) { + LASSERT (cfs_list_empty(&ptl->ptl_msgq)); return; } LASSERT (md->md_refcount == 0); /* a brand new MD */ - list_for_each_safe (entry, tmp, &ptl->ptl_msgq) { + cfs_list_for_each_safe (entry, tmp, &ptl->ptl_msgq) { int rc; int index; unsigned int mlength; @@ -1675,7 +1730,7 @@ lnet_match_blocked_msg(lnet_libmd_t *md) lnet_hdr_t *hdr; lnet_process_id_t src; - msg = list_entry(entry, lnet_msg_t, msg_list); + msg = cfs_list_entry(entry, lnet_msg_t, msg_list); LASSERT (msg->msg_delayed); @@ -1695,11 +1750,11 @@ lnet_match_blocked_msg(lnet_libmd_t *md) continue; /* Hurrah! This _is_ a match */ - list_del(&msg->msg_list); + cfs_list_del(&msg->msg_list); ptl->ptl_msgq_version++; if (rc == LNET_MATCHMD_OK) { - list_add_tail(&msg->msg_list, &matches); + cfs_list_add_tail(&msg->msg_list, &matches); CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d " "match "LPU64" offset %d length %d.\n", @@ -1711,7 +1766,7 @@ lnet_match_blocked_msg(lnet_libmd_t *md) } else { LASSERT (rc == LNET_MATCHMD_DROP); - list_add_tail(&msg->msg_list, &drops); + cfs_list_add_tail(&msg->msg_list, &drops); } if (lnet_md_exhausted(md)) @@ -1720,18 +1775,18 @@ lnet_match_blocked_msg(lnet_libmd_t *md) LNET_UNLOCK(); - list_for_each_safe (entry, tmp, &drops) { - msg = list_entry(entry, lnet_msg_t, msg_list); + cfs_list_for_each_safe (entry, tmp, &drops) { + msg = cfs_list_entry(entry, lnet_msg_t, msg_list); - list_del(&msg->msg_list); + cfs_list_del(&msg->msg_list); lnet_drop_delayed_put(msg, "Bad match"); } - list_for_each_safe (entry, tmp, &matches) { - msg = list_entry(entry, lnet_msg_t, msg_list); + cfs_list_for_each_safe (entry, tmp, &matches) { + msg = cfs_list_entry(entry, lnet_msg_t, msg_list); - list_del(&msg->msg_list); + 
cfs_list_del(&msg->msg_list); /* md won't disappear under me, since each msg * holds a ref on it */ @@ -1766,7 +1821,6 @@ lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg) hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset); index = hdr->msg.put.ptl_index; - ptl = &the_lnet.ln_portals[index]; LNET_LOCK(); @@ -1785,6 +1839,7 @@ lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg) return 0; case LNET_MATCHMD_NONE: + ptl = &the_lnet.ln_portals[index]; version = ptl->ptl_ml_version; rc = 0; @@ -1793,11 +1848,11 @@ lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg) if (rc == 0 && !the_lnet.ln_shutdown && - ((ptl->ptl_options & LNET_PTL_LAZY) != 0)) { + lnet_portal_is_lazy(ptl)) { if (version != ptl->ptl_ml_version) goto again; - list_add_tail(&msg->msg_list, &ptl->ptl_msgq); + cfs_list_add_tail(&msg->msg_list, &ptl->ptl_msgq); ptl->ptl_msgq_version++; LNET_UNLOCK(); @@ -1811,12 +1866,11 @@ lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg) /* fall through */ case LNET_MATCHMD_DROP: - CDEBUG(D_NETERROR, - "Dropping PUT from %s portal %d match "LPU64 - " offset %d length %d: %d\n", - libcfs_id2str(src), index, - hdr->msg.put.match_bits, - hdr->msg.put.offset, rlength, rc); + CNETERR("Dropping PUT from %s portal %d match "LPU64 + " offset %d length %d: %d\n", + libcfs_id2str(src), index, + hdr->msg.put.match_bits, + hdr->msg.put.offset, rlength, rc); LNET_UNLOCK(); return ENOENT; /* +ve: OK but no match */ @@ -1850,14 +1904,13 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) hdr->msg.get.match_bits, msg, &mlength, &offset, &md); if (rc == LNET_MATCHMD_DROP) { - CDEBUG(D_NETERROR, - "Dropping GET from %s portal %d match "LPU64 - " offset %d length %d\n", - libcfs_id2str(src), - hdr->msg.get.ptl_index, - hdr->msg.get.match_bits, - hdr->msg.get.src_offset, - hdr->msg.get.sink_length); + CNETERR("Dropping GET from %s portal %d match "LPU64 + " offset %d length %d\n", + libcfs_id2str(src), + hdr->msg.get.ptl_index, + hdr->msg.get.match_bits, + hdr->msg.get.src_offset, + 
hdr->msg.get.sink_length); LNET_UNLOCK(); return ENOENT; /* +ve: OK but no match */ } @@ -1920,12 +1973,12 @@ lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg) /* NB handles only looked up by creator (no flips) */ md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd); if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { - CDEBUG(D_NETERROR, "%s: Dropping REPLY from %s for %s " - "MD "LPX64"."LPX64"\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), - (md == NULL) ? "invalid" : "inactive", - hdr->msg.reply.dst_wmd.wh_interface_cookie, - hdr->msg.reply.dst_wmd.wh_object_cookie); + CNETERR("%s: Dropping REPLY from %s for %s " + "MD "LPX64"."LPX64"\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + (md == NULL) ? "invalid" : "inactive", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie); if (md != NULL && md->md_me != NULL) CERROR("REPLY MD also attached to portal %d\n", md->md_me->me_portal); @@ -1941,9 +1994,9 @@ lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg) if (mlength < rlength && (md->md_options & LNET_MD_TRUNCATE) == 0) { - CDEBUG(D_NETERROR, "%s: Dropping REPLY from %s length %d " - "for MD "LPX64" would overflow (%d)\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + CNETERR("%s: Dropping REPLY from %s length %d " + "for MD "LPX64" would overflow (%d)\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), rlength, hdr->msg.reply.dst_wmd.wh_object_cookie, mlength); LNET_UNLOCK(); @@ -2131,7 +2184,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, __u32 payload_length; __u32 type; - LASSERT (!in_interrupt ()); + LASSERT (!cfs_in_interrupt ()); type = le32_to_cpu(hdr->type); src_nid = le64_to_cpu(hdr->src_nid); @@ -2235,7 +2288,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, /* Message looks OK; we're not going to return an error, so we MUST * call back lnd_recv() come what may... 
*/ - if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */ + if (!cfs_list_empty (&the_lnet.ln_test_peers) && /* normally we don't */ fail_peer (src_nid, 0)) /* shall we now? */ { CERROR("%s, src %s: Dropping %s to simulate failure\n", @@ -2349,6 +2402,50 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, return 0; } +/** + * Initiate an asynchronous PUT operation. + * + * There are several events associated with a PUT: completion of the send on + * the initiator node (LNET_EVENT_SEND), and when the send completes + * successfully, the receipt of an acknowledgment (LNET_EVENT_ACK) indicating + * that the operation was accepted by the target. The event LNET_EVENT_PUT is + * used at the target node to indicate the completion of incoming data + * delivery. + * + * The local events will be logged in the EQ associated with the MD pointed to + * by \a mdh handle. Using a MD without an associated EQ results in these + * events being discarded. In this case, the caller must have another + * mechanism (e.g., a higher level protocol) for determining when it is safe + * to modify the memory region associated with the MD. + * + * Note that LNet does not guarantee the order of LNET_EVENT_SEND and + * LNET_EVENT_ACK, though intuitively ACK should happen after SEND. + * + * \param self Indicates the NID of a local interface through which to send + * the PUT request. Use LNET_NID_ANY to let LNet choose one by itself. + * \param mdh A handle for the MD that describes the memory to be sent. The MD + * must be "free floating" (See LNetMDBind()). + * \param ack Controls whether an acknowledgment is requested. + * Acknowledgments are only sent when they are requested by the initiating + * process and the target MD enables them. + * \param target A process identifier for the target process. + * \param portal The index in the \a target's portal table. + * \param match_bits The match bits to use for MD selection at the target + * process. 
+ * \param offset The offset into the target MD (only used when the target + * MD has the LNET_MD_MANAGE_REMOTE option set). + * \param hdr_data 64 bits of user data that can be included in the message + * header. This data is written to an event queue entry at the target if an + * EQ is present on the matching MD. + * + * \retval 0 Success, and only in this case events will be generated + * and logged to EQ (if it exists). + * \retval -EIO Simulated failure. + * \retval -ENOMEM Memory allocation failure. + * \retval -ENOENT Invalid MD object. + * + * \see lnet_event_t::hdr_data and lnet_event_kind_t. + */ int LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, lnet_process_id_t target, unsigned int portal, @@ -2362,7 +2459,7 @@ LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, LASSERT (the_lnet.ln_init); LASSERT (the_lnet.ln_refcount > 0); - if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */ + if (!cfs_list_empty (&the_lnet.ln_test_peers) && /* normally we don't */ fail_peer (target.nid, 1)) /* shall we now? */ { CERROR("Dropping PUT to %s: simulated failure\n", @@ -2376,6 +2473,7 @@ LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, libcfs_id2str(target)); return -ENOMEM; } + msg->msg_vmflush = !!cfs_memory_pressure_get(); LNET_LOCK(); @@ -2440,7 +2538,7 @@ LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, rc = lnet_send(self, msg); if (rc != 0) { - CERROR("Error sending PUT to %s: %d\n", + CNETERR( "Error sending PUT to %s: %d\n", libcfs_id2str(target), rc); lnet_finalize (NULL, msg, rc); } @@ -2535,6 +2633,26 @@ lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *reply, unsigned int len) reply->msg_ev.mlength = len; } +/** + * Initiate an asynchronous GET operation. + * + * On the initiator node, an LNET_EVENT_SEND is logged when the GET request + * is sent, and an LNET_EVENT_REPLY is logged when the data returned from + * the target node in the REPLY has been written to local MD. 
+ * + * On the target node, an LNET_EVENT_GET is logged when the GET request + * arrives and is accepted into a MD. + * + * \param self,target,portal,match_bits,offset See the discussion in LNetPut(). + * \param mdh A handle for the MD that describes the memory into which the + * requested data will be received. The MD must be "free floating" (See LNetMDBind()). + * + * \retval 0 Success, and only in this case events will be generated + * and logged to EQ (if it exists) of the MD. + * \retval -EIO Simulated failure. + * \retval -ENOMEM Memory allocation failure. + * \retval -ENOENT Invalid MD object. + */ int LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, lnet_process_id_t target, unsigned int portal, @@ -2547,7 +2665,7 @@ LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, LASSERT (the_lnet.ln_init); LASSERT (the_lnet.ln_refcount > 0); - if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */ + if (!cfs_list_empty (&the_lnet.ln_test_peers) && /* normally we don't */ fail_peer (target.nid, 1)) /* shall we now? */ { CERROR("Dropping GET to %s: simulated failure\n", @@ -2617,7 +2735,7 @@ LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, rc = lnet_send(self, msg); if (rc < 0) { - CERROR("error sending GET to %s: %d\n", + CNETERR( "Error sending GET to %s: %d\n", libcfs_id2str(target), rc); lnet_finalize (NULL, msg, rc); } @@ -2626,10 +2744,24 @@ LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, return 0; } +/** + * Calculate distance to node at \a dstnid. + * + * \param dstnid Target NID. + * \param srcnidp If not NULL, NID of the local interface to reach \a dstnid + * is saved here. + * \param orderp If not NULL, order of the route to reach \a dstnid is saved + * here. + * + * \retval 0 If \a dstnid belongs to a local interface, and reserved option + * local_nid_dist_zero is set, which is the default. + * \retval positives Distance to target NID, i.e. number of hops plus one. + * \retval -EHOSTUNREACH If \a dstnid is not reachable. 
+ */ int LNetDist (lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) { - struct list_head *e; + cfs_list_t *e; lnet_ni_t *ni; lnet_remotenet_t *rnet; __u32 dstnet = LNET_NIDNET(dstnid); @@ -2646,8 +2778,8 @@ LNetDist (lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) LNET_LOCK(); - list_for_each (e, &the_lnet.ln_nis) { - ni = list_entry(e, lnet_ni_t, ni_list); + cfs_list_for_each (e, &the_lnet.ln_nis) { + ni = cfs_list_entry(e, lnet_ni_t, ni_list); if (ni->ni_nid == dstnid) { if (srcnidp != NULL) @@ -2675,16 +2807,17 @@ LNetDist (lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) order++; } - list_for_each (e, &the_lnet.ln_remote_nets) { - rnet = list_entry(e, lnet_remotenet_t, lrn_list); + cfs_list_for_each (e, &the_lnet.ln_remote_nets) { + rnet = cfs_list_entry(e, lnet_remotenet_t, lrn_list); if (rnet->lrn_net == dstnet) { lnet_route_t *route; lnet_route_t *shortest = NULL; - LASSERT (!list_empty(&rnet->lrn_routes)); + LASSERT (!cfs_list_empty(&rnet->lrn_routes)); - list_for_each_entry(route, &rnet->lrn_routes, lr_list) { + cfs_list_for_each_entry(route, &rnet->lrn_routes, + lr_list) { if (shortest == NULL || route->lr_hops < shortest->lr_hops) shortest = route; @@ -2706,6 +2839,23 @@ LNetDist (lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) return -EHOSTUNREACH; } +/** + * Set the number of asynchronous messages expected from a target process. + * + * This function is only meaningful for userspace callers. It's a no-op when + * called from kernel. + * + * Asynchronous messages are those that can come from a target when the + * userspace process is not waiting for IO to complete; e.g., AST callbacks + * from Lustre servers. Specifying the expected number of such messages + * allows them to be eagerly received when user process is not running in + * LNet; otherwise network errors may occur. + * + * \param id Process ID of the target process. + * \param nasync Number of asynchronous messages expected from the target. 
+ * + * \return 0 on success, and an error code otherwise. + */ int LNetSetAsync(lnet_process_id_t id, int nasync) { @@ -2714,7 +2864,7 @@ LNetSetAsync(lnet_process_id_t id, int nasync) #else lnet_ni_t *ni; lnet_remotenet_t *rnet; - struct list_head *tmp; + cfs_list_t *tmp; lnet_route_t *route; lnet_nid_t *nids; int nnids; @@ -2722,8 +2872,7 @@ LNetSetAsync(lnet_process_id_t id, int nasync) int rc = 0; int rc2; - /* Target on a local network? */ - + /* Target on a local network? */ ni = lnet_net2ni(LNET_NIDNET(id.nid)); if (ni != NULL) { if (ni->ni_lnd->lnd_setasync != NULL) @@ -2743,7 +2892,7 @@ LNetSetAsync(lnet_process_id_t id, int nasync) LNET_LOCK(); rnet = lnet_find_net_locked(LNET_NIDNET(id.nid)); if (rnet != NULL) { - list_for_each(tmp, &rnet->lrn_routes) { + cfs_list_for_each(tmp, &rnet->lrn_routes) { if (nnids == maxnids) { LNET_UNLOCK(); LIBCFS_FREE(nids, maxnids * sizeof(*nids)); @@ -2751,7 +2900,7 @@ LNetSetAsync(lnet_process_id_t id, int nasync) goto again; } - route = list_entry(tmp, lnet_route_t, lr_list); + route = cfs_list_entry(tmp, lnet_route_t, lr_list); nids[nnids++] = route->lr_gateway->lp_nid; } }