1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved.
5 * Author: PJ Kirner <pjkirner@clusterfs.com>
7 * This file is part of the Lustre file system, http://www.lustre.org
8 * Lustre is a trademark of Cluster File Systems, Inc.
10 * This file is confidential source code owned by Cluster File Systems.
11 * No viewing, modification, compilation, redistribution, or any other
12 * form of use is permitted except through a signed license agreement.
14 * If you have not signed such an agreement, then you have no rights to
15 * this file. Please destroy it immediately and contact CFS. */
// Bring an rx buffer pool to its initial state.
// Every field of *rxbp is zeroed first (so rxbp_shutdown, rxbp_reserved and
// rxbp_count, which other functions in this file read, all start at 0), then
// the pool spinlock and the buffer list head are initialised.
// NOTE(review): the embedded line numbers skip values, so this listing omits
// some source lines (the return type and braces are not visible here).
22 kptllnd_rx_buffer_pool_init(kptl_rx_buffer_pool_t *rxbp)
24 memset(rxbp, 0, sizeof(*rxbp));
25 spin_lock_init(&rxbp->rxbp_lock);
26 INIT_LIST_HEAD(&rxbp->rxbp_list);
// Tear down one rx buffer: take it off the list it is linked on through
// rxb_list, then free its data buffer and the descriptor itself.
// The buffer must be completely quiescent; the four assertions below spell
// out what that means.
30 kptllnd_rx_buffer_destroy(kptl_rx_buffer_t *rxb)
// NOTE(review): rxbp is not used on any line visible here; its use is
// presumably on a line missing from this listing (numbering skips 40-41).
32 kptl_rx_buffer_pool_t *rxbp = rxb->rxb_pool;
// No outstanding references, no live MD handle, not posted, and marked idle.
34 LASSERT(rxb->rxb_refcount == 0);
35 LASSERT(PtlHandleIsEqual(rxb->rxb_mdh, PTL_INVALID_HANDLE));
36 LASSERT(!rxb->rxb_posted);
37 LASSERT(rxb->rxb_idle);
// NOTE(review): no lock is taken on the lines visible here, and the caller in
// kptllnd_rx_buffer_pool_fini releases rxbp_lock before calling - confirm
// against the full source how this list_del is serialised.
39 list_del(&rxb->rxb_list);
// The data buffer is freed with the same size expression
// (kptllnd_rx_buffer_size()) that kptllnd_rx_buffer_pool_reserve uses when
// allocating it.
42 LIBCFS_FREE(rxb->rxb_buffer, kptllnd_rx_buffer_size());
43 LIBCFS_FREE(rxb, sizeof(*rxb));
// Reserve room in the pool for `count` more incoming messages, adding rx
// buffers when the existing ones cannot cover the new total.
// Capacity is counted in messages: each buffer is taken to hold
// bufsize / kptl_max_msg_size of them.
// NOTE(review): the loop construct, the return statements and several local
// declarations are on lines missing from this listing (the embedded numbering
// skips), so the notes below describe only what the visible lines establish.
47 kptllnd_rx_buffer_pool_reserve(kptl_rx_buffer_pool_t *rxbp, int count)
52 kptl_rx_buffer_t *rxb;
56 bufsize = kptllnd_rx_buffer_size();
57 msgs_per_buffer = bufsize / (*kptllnd_tunables.kptl_max_msg_size);
59 CDEBUG(D_NET, "kptllnd_rx_buffer_pool_reserve(%d)\n", count);
// Pool state is examined under rxbp_lock.
61 spin_lock_irqsave(&rxbp->rxbp_lock, flags);
// A pool that is shutting down takes a separate branch; its body is not
// visible in this listing.
64 if (rxbp->rxbp_shutdown) {
// Enough buffers already exist when the current reservation plus this
// request fits within rxbp_count buffers' worth of messages.
69 if (rxbp->rxbp_reserved + count <=
70 rxbp->rxbp_count * msgs_per_buffer) {
// The lock is released before the two allocations below.
75 spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
77 LIBCFS_ALLOC(rxb, sizeof(*rxb));
78 LIBCFS_ALLOC(buffer, bufsize);
// If either allocation failed: log it, release what was obtained and re-take
// the lock.
// NOTE(review): the two frees are presumably guarded by NULL checks on lines
// missing here (numbering skips 82-83 and 85) - confirm.
80 if (rxb == NULL || buffer == NULL) {
81 CERROR("Failed to allocate rx buffer\n");
84 LIBCFS_FREE(rxb, sizeof(*rxb));
86 LIBCFS_FREE(buffer, bufsize);
88 spin_lock_irqsave(&rxbp->rxbp_lock, flags);
// Start from a zeroed descriptor, then set the fields that need an explicit
// value: event-arg type, zero refcount, the data buffer, and an invalid MD
// handle (kptllnd_rx_buffer_post asserts the handle is invalid on entry).
93 memset(rxb, 0, sizeof(*rxb));
95 rxb->rxb_eventarg.eva_type = PTLLND_EVENTARG_TYPE_BUF;
96 rxb->rxb_refcount = 0;
100 rxb->rxb_buffer = buffer;
101 rxb->rxb_mdh = PTL_INVALID_HANDLE;
// The lock was dropped for the allocation, so shutdown is checked again
// before the new buffer is linked onto the pool list. On shutdown the new
// buffer is freed with the lock released, and the lock is then re-taken.
103 spin_lock_irqsave(&rxbp->rxbp_lock, flags);
105 if (rxbp->rxbp_shutdown) {
106 spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
108 LIBCFS_FREE(rxb, sizeof(*rxb));
109 LIBCFS_FREE(buffer, bufsize);
111 spin_lock_irqsave(&rxbp->rxbp_lock, flags);
116 list_add_tail(&rxb->rxb_list, &rxbp->rxbp_list);
// kptllnd_rx_buffer_post acquires rxbp_lock itself, so it is called with the
// lock released; the lock is taken again afterwards.
119 spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
121 kptllnd_rx_buffer_post(rxb);
123 spin_lock_irqsave(&rxbp->rxbp_lock, flags);
// Record the reservation.
127 rxbp->rxbp_reserved += count;
129 spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
// Give back a reservation of `count` messages by subtracting it from
// rxbp_reserved under rxbp_lock. No buffer is freed on the lines visible here.
// NOTE(review): the rest of the parameter list (where `count` is declared) is
// on a line missing from this listing.
135 kptllnd_rx_buffer_pool_unreserve(kptl_rx_buffer_pool_t *rxbp,
140 spin_lock_irqsave(&rxbp->rxbp_lock, flags);
142 CDEBUG(D_NET, "kptllnd_rx_buffer_pool_unreserve(%d)\n", count);
143 rxbp->rxbp_reserved -= count;
145 spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
// Shut the pool down and wait for it to drain.
// Sets rxbp_shutdown, then repeatedly walks rxbp_list: some buffers are
// destroyed outright, others have their MD unlinked via PtlMDUnlink, and the
// walk is repeated after a one-second pause until the list is empty.
// NOTE(review): the outer loop header, the conditions selecting which buffers
// are destroyed, and the handling of the PtlMDUnlink result are on lines
// missing from this listing; confirm them against the full source.
149 kptllnd_rx_buffer_pool_fini(kptl_rx_buffer_pool_t *rxbp)
151 kptl_rx_buffer_t *rxb;
155 struct list_head *tmp;
156 struct list_head *nxt;
159 /* CAVEAT EMPTOR: I'm racing with everything here!!!
161 * Buffers can still be posted after I set rxbp_shutdown because I
162 * can't hold rxbp_lock while I'm posting them.
164 * Calling PtlMDUnlink() here races with auto-unlinks; i.e. a buffer's
165 * MD handle could become invalid under me. I am vulnerable to portals
166 * re-using handles (i.e. make the same handle valid again, but for a
167 * different MD) from when the MD is actually unlinked, to when the
168 * event callback tells me it has been unlinked. */
170 spin_lock_irqsave(&rxbp->rxbp_lock, flags);
// From here on kptllnd_rx_buffer_post and kptllnd_rx_buffer_pool_reserve
// see the pool as shutting down.
172 rxbp->rxbp_shutdown = 1;
// The _safe iterator is used because entries are removed during the walk
// (kptllnd_rx_buffer_destroy calls list_del on the buffer).
175 list_for_each_safe(tmp, nxt, &rxbp->rxbp_list) {
176 rxb = list_entry (tmp, kptl_rx_buffer_t, rxb_list);
// The lock is released around the destroy call and re-taken afterwards.
179 spin_unlock_irqrestore(&rxbp->rxbp_lock,
181 kptllnd_rx_buffer_destroy(rxb);
182 spin_lock_irqsave(&rxbp->rxbp_lock,
// A buffer with no valid MD handle has nothing to unlink.
188 if (PtlHandleIsEqual(mdh, PTL_INVALID_HANDLE))
// PtlMDUnlink is called with the lock released.
191 spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
193 rc = PtlMDUnlink(mdh);
195 spin_lock_irqsave(&rxbp->rxbp_lock, flags);
// NOTE(review): in this listing the block comment that opens just below has
// no visible terminator and the matching preprocessor lines are missing, so
// no notes are inserted until after the "Wait a bit" comment.
197 #ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS
198 /* callback clears rxb_mdh and drops net's ref
199 * (which causes repost, but since I set
200 * shutdown, it will just set the buffer
205 rxb->rxb_mdh = PTL_INVALID_HANDLE;
206 kptllnd_rx_buffer_decref_locked(rxb);
211 if (list_empty(&rxbp->rxbp_list))
214 spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
216 /* Wait a bit for references to be dropped */
// (i & (-i)) == i holds when i has at most one bit set, so the message is
// logged at D_WARNING on those iterations and at D_NET on all others.
217 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
218 "Waiting for %d Busy RX Buffers\n",
// Sleep for one second before re-taking the lock for the next pass.
221 cfs_pause(cfs_time_seconds(1));
223 spin_lock_irqsave(&rxbp->rxbp_lock, flags);
226 spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
// Post one rx buffer to Portals so it can receive incoming PUTs.
// A match entry is attached on the configured portal for any NID and PID,
// then a memory descriptor covering the buffer is attached to it.
// Preconditions (asserted): not in interrupt context, refcount 0, not idle,
// not already posted, and no valid MD handle.
// NOTE(review): the result checks after each Portals call and the early
// returns are on lines missing from this listing.
230 kptllnd_rx_buffer_post(kptl_rx_buffer_t *rxb)
236 ptl_process_id_t any;
237 kptl_rx_buffer_pool_t *rxbp = rxb->rxb_pool;
240 LASSERT (!in_interrupt());
241 LASSERT (rxb->rxb_refcount == 0);
242 LASSERT (!rxb->rxb_idle);
243 LASSERT (!rxb->rxb_posted);
244 LASSERT (PtlHandleIsEqual(rxb->rxb_mdh, PTL_INVALID_HANDLE));
// Wildcard process id: match any NID and any PID.
246 any.nid = PTL_NID_ANY;
247 any.pid = PTL_PID_ANY;
249 spin_lock_irqsave(&rxbp->rxbp_lock, flags);
// During shutdown the lock is simply released again.
// NOTE(review): presumably the buffer is marked idle and the function returns
// on the lines missing here - confirm.
251 if (rxbp->rxbp_shutdown) {
253 spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
// Both fields are set under the lock, before any Portals call is made.
257 rxb->rxb_refcount = 1; /* net's ref */
258 rxb->rxb_posted = 1; /* I'm posting */
260 spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
262 rc = PtlMEAttach(kptllnd_data.kptl_nih,
263 *kptllnd_tunables.kptl_portal,
266 0, /* all matchbits are valid - ignore none */
271 CERROR("PtlMeAttach rxb failed %d\n", rc);
// Describe the buffer: kptl_rxb_npages pages long, unlimited threshold,
// accepting PUTs, each message limited to kptl_max_msg_size, with events
// delivered to kptl_eqh and rxb_eventarg passed back as the user pointer.
278 md.start = rxb->rxb_buffer;
279 md.length = PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages;
280 md.threshold = PTL_MD_THRESH_INF;
281 md.options = PTL_MD_OP_PUT |
282 PTL_MD_LUSTRE_COMPLETION_SEMANTICS |
283 PTL_MD_EVENT_START_DISABLE |
286 md.user_ptr = &rxb->rxb_eventarg;
287 md.max_size = *kptllnd_tunables.kptl_max_msg_size;
288 md.eq_handle = kptllnd_data.kptl_eqh;
290 rc = PtlMDAttach(meh, md, PTL_UNLINK, &mdh);
// rxb_posted is re-examined under the lock after the attach.
// NOTE(review): presumably the new handle is stored in rxb_mdh on the line
// missing after this test - confirm.
292 spin_lock_irqsave(&rxbp->rxbp_lock, flags);
293 if (rxb->rxb_posted) /* Not auto-unlinked yet!!! */
295 spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
// MD attach failed: the match entry attached above is unlinked again, and
// that unlink is asserted to succeed.
299 CERROR("PtlMDAttach rxb failed %d\n", rc);
300 rc = PtlMEUnlink(meh);
301 LASSERT(rc == PTL_OK);
// Failure path: drop a reference on the buffer under the lock.
304 spin_lock_irqsave(&rxbp->rxbp_lock, flags);
306 /* XXX this will just try again immediately */
307 kptllnd_rx_buffer_decref_locked(rxb);
308 spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
// Allocate a receive descriptor from kptl_rx_cache and zero it.
// The allocation uses CFS_ALLOC_ATOMIC. A FAIL_RX_ALLOC simulation hook can
// force the failure path for testing.
// NOTE(review): the return statements are on lines missing from this listing;
// the caller in kptllnd_rx_buffer_callback reports a failed allocation as
// ENOMEM, so presumably NULL is returned on failure - confirm.
312 kptllnd_rx_alloc(void)
316 if (IS_SIMULATION_ENABLED(FAIL_RX_ALLOC)) {
317 CERROR ("FAIL_RX_ALLOC SIMULATION triggered\n");
321 rx = cfs_mem_cache_alloc(kptllnd_data.kptl_rx_cache, CFS_ALLOC_ATOMIC);
323 CERROR("Failed to allocate rx\n");
327 memset(rx, 0, sizeof(*rx));
// Finish with a receive descriptor: drop its reference on the rx buffer,
// return one credit to the peer, let the peer send anything now possible,
// drop the peer reference, and free the descriptor back to kptl_rx_cache.
// NOTE(review): the buffer and peer steps are presumably guarded by NULL
// checks on lines missing from this listing (numbering skips before the
// decref and before the credit update) - confirm.
332 kptllnd_rx_done(kptl_rx_t *rx)
334 kptl_rx_buffer_t *rxb = rx->rx_rxb;
335 kptl_peer_t *peer = rx->rx_peer;
338 CDEBUG(D_NET, "rx=%p rxb %p peer %p\n", rx, rxb, peer);
341 kptllnd_rx_buffer_decref(rxb);
344 /* Update credits (after I've decref-ed the buffer) */
345 spin_lock_irqsave(&peer->peer_lock, flags);
// One more credit is owed back to the peer; the total is asserted not to
// exceed the kptl_peercredits tunable.
347 peer->peer_outstanding_credits++;
348 LASSERT (peer->peer_outstanding_credits <=
349 *kptllnd_tunables.kptl_peercredits);
351 CDEBUG(D_NETTRACE, "%s[%d/%d]: rx %p done\n",
352 libcfs_id2str(peer->peer_id),
353 peer->peer_credits, peer->peer_outstanding_credits, rx);
355 spin_unlock_irqrestore(&peer->peer_lock, flags);
357 /* I might have to send back credits */
358 kptllnd_peer_check_sends(peer);
359 kptllnd_peer_decref(peer);
362 cfs_mem_cache_free(kptllnd_data.kptl_rx_cache, rx);
// Event callback for rx buffer memory descriptors.
// For a successful PUT_END received while the pool is not shutting down, the
// message is wrapped in a kptl_rx_t and queued on kptl_sched_rxq for the
// scheduler. The tail of the function clears rxb_mdh and drops a buffer
// reference under rxbp_lock.
// NOTE(review): several branch headers (else arms, the test on `unlinked`,
// the NULL test after allocation) are on lines missing from this listing.
366 kptllnd_rx_buffer_callback (ptl_event_t *ev)
// The rx buffer is recovered from the event's MD user pointer, which
// kptllnd_rx_buffer_post sets to the buffer's rxb_eventarg.
368 kptl_eventarg_t *eva = ev->md.user_ptr;
369 kptl_rx_buffer_t *rxb = kptllnd_eventarg2obj(eva);
370 kptl_rx_buffer_pool_t *rxbp = rxb->rxb_pool;
// `unlinked` comes from the event's own flag when
// LUSTRE_PORTALS_UNLINK_SEMANTICS is defined, and from the event type
// otherwise.
375 #ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS
376 unlinked = ev->unlinked;
378 unlinked = ev->type == PTL_EVENT_UNLINK;
381 CDEBUG(D_NET, "RXB Callback %s(%d) rxb=%p id=%s unlink=%d rc %d\n",
382 kptllnd_evtype2str(ev->type), ev->type, rxb,
383 kptllnd_ptlid2str(ev->initiator),
384 unlinked, ev->ni_fail_type);
// Sanity checks: the buffer is in use, the event refers to this buffer, the
// message lies inside it, and only PUT_END or UNLINK events arrive here
// (PUT_END always carrying LNET_MSG_MATCHBITS).
386 LASSERT (!rxb->rxb_idle);
387 LASSERT (ev->md.start == rxb->rxb_buffer);
388 LASSERT (ev->offset + ev->mlength <=
389 PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages);
390 LASSERT (ev->type == PTL_EVENT_PUT_END ||
391 ev->type == PTL_EVENT_UNLINK);
392 LASSERT (ev->type == PTL_EVENT_UNLINK ||
393 ev->match_bits == LNET_MSG_MATCHBITS);
// Failed events are logged; the condition below then skips message handling
// for them.
395 if (ev->ni_fail_type != PTL_NI_OK)
396 CERROR("event type %d, status %d from %s\n",
397 ev->type, ev->ni_fail_type,
398 kptllnd_ptlid2str(ev->initiator));
400 if (ev->type == PTL_EVENT_PUT_END &&
401 ev->ni_fail_type == PTL_NI_OK &&
402 !rxbp->rxbp_shutdown) {
404 /* rxbp_shutdown sampled without locking! I only treat it as a
405 * hint since shutdown can start while rx's are queued on
407 #if (PTL_MD_LOCAL_ALIGN8 == 0)
408 /* Portals can't force message alignment - someone sending an
409 * odd-length message will misalign subsequent messages and
410 * force the fixup below... */
// "odd length" in the warning means a length that is not a multiple of 8:
// the test masks the low three bits of mlength.
411 if ((ev->mlength & 7) != 0)
412 CWARN("Message from %s has odd length %d: "
413 "probable version incompatibility\n",
414 kptllnd_ptlid2str(ev->initiator),
417 rx = kptllnd_rx_alloc();
// NOTE(review): unlike the other CERROR calls in this file, this format
// string has no trailing newline - worth fixing in the full source.
419 CERROR("Message from %s dropped: ENOMEM",
420 kptllnd_ptlid2str(ev->initiator));
// Message starts on an 8-byte boundary: use it in place inside the rx
// buffer, taking a buffer reference first.
422 if ((ev->offset & 7) == 0) {
423 kptllnd_rx_buffer_addref(rxb);
425 rx->rx_nob = ev->mlength;
426 rx->rx_msg = (kptl_msg_t *)
427 (rxb->rxb_buffer + ev->offset);
429 #if (PTL_MD_LOCAL_ALIGN8 == 0)
430 /* Portals can't force alignment - copy into
431 * rx_space (avoiding overflow) to fix */
// The copied length is capped at kptl_max_msg_size.
432 int maxlen = *kptllnd_tunables.kptl_max_msg_size;
435 rx->rx_nob = MIN(maxlen, ev->mlength);
436 rx->rx_msg = (kptl_msg_t *)rx->rx_space;
437 memcpy(rx->rx_msg, rxb->rxb_buffer + ev->offset,
440 /* Portals should have forced the alignment */
// Record who sent the message; kptllnd_rx_parse and kptllnd_nak read
// rx_initiator later.
445 rx->rx_initiator = ev->initiator;
447 rx->rx_uid = ev->uid;
449 /* Queue for attention */
450 spin_lock_irqsave(&kptllnd_data.kptl_sched_lock,
453 list_add_tail(&rx->rx_list,
454 &kptllnd_data.kptl_sched_rxq);
455 wake_up(&kptllnd_data.kptl_sched_waitq);
457 spin_unlock_irqrestore(&kptllnd_data.kptl_sched_lock,
// Under rxbp_lock: forget the MD handle and drop a reference on the buffer.
// NOTE(review): presumably this runs only when `unlinked` is set, with the
// test on a line missing from this listing - confirm.
463 spin_lock_irqsave(&rxbp->rxbp_lock, flags);
466 rxb->rxb_mdh = PTL_INVALID_HANDLE;
467 kptllnd_rx_buffer_decref_locked(rxb);
469 spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
// Send a NAK to the initiator of `rx`.
// The prebuilt kptl_nak_msg is bound to a fresh MD with no event queue and
// PTL_UNLINK, then PUT to rx->rx_initiator on the configured portal with
// LNET_MSG_MATCHBITS and no ACK requested. A failure at either step produces
// a CWARN on the lines visible here.
// NOTE(review): the declarations, the remaining MD initialiser fields and the
// result checks are on lines missing from this listing.
474 kptllnd_nak (kptl_rx_t *rx)
476 /* Fire-and-forget a stub message that will let the peer know my
477 * protocol magic/version and make her drop/refresh any peer state she
478 * might have with me. */
480 .start = kptllnd_data.kptl_nak_msg,
481 .length = kptllnd_data.kptl_nak_msg->ptlm_nob,
485 .eq_handle = PTL_EQ_NONE};
489 rc = PtlMDBind(kptllnd_data.kptl_nih, md, PTL_UNLINK, &mdh);
491 CWARN("Can't NAK %s: bind failed %d\n",
492 kptllnd_ptlid2str(rx->rx_initiator), rc);
496 rc = PtlPut(mdh, PTL_NOACK_REQ, rx->rx_initiator,
497 *kptllnd_tunables.kptl_portal, 0,
498 LNET_MSG_MATCHBITS, 0, 0);
501 CWARN("Can't NAK %s: put failed %d\n",
502 kptllnd_ptlid2str(rx->rx_initiator), rc);
// Validate a received message and dispatch it by type.
// Checks run in this order: protocol magic and version, unpack, source NID
// against the Portals initiator, NAK handling, destination id, peer lookup
// (or HELLO handshake), incarnation stamps, then credit accounting. Messages
// that pass are handed to lnet_parse for the IMMEDIATE, PUT and GET types.
// NOTE(review): the goto/return statements, labels and several branch headers
// are on lines missing from this listing, and the function may continue past
// the last visible line; the notes below cover only what is visible.
506 kptllnd_rx_parse(kptl_rx_t *rx)
508 kptl_msg_t *msg = rx->rx_msg;
513 lnet_process_id_t srcid;
// A fresh rx must not have a peer attached yet.
515 LASSERT (rx->rx_peer == NULL);
// Reject messages that carry the generic LNET magic (either byte order), or
// the ptllnd magic with a version other than PTLLND_MSG_VERSION.
// NOTE(review): one line of this condition is missing from the listing,
// presumably a minimum-length test for reading the version - confirm.
517 if ((rx->rx_nob >= 4 &&
518 (msg->ptlm_magic == LNET_PROTO_MAGIC ||
519 msg->ptlm_magic == __swab32(LNET_PROTO_MAGIC))) ||
521 ((msg->ptlm_magic == PTLLND_MSG_MAGIC &&
522 msg->ptlm_version != PTLLND_MSG_VERSION) ||
523 (msg->ptlm_magic == __swab32(PTLLND_MSG_MAGIC) &&
524 msg->ptlm_version != __swab16(PTLLND_MSG_VERSION))))) {
525 /* NAK incompatible versions
526 * See other LNDs for how to handle this if/when ptllnd begins
527 * to allow different versions to co-exist */
528 CERROR("Bad version: got %04x expected %04x from %s\n",
529 (__u32)(msg->ptlm_magic == PTLLND_MSG_MAGIC ?
530 msg->ptlm_version : __swab16(msg->ptlm_version)),
531 PTLLND_MSG_VERSION, kptllnd_ptlid2str(rx->rx_initiator));
536 rc = kptllnd_msg_unpack(msg, rx->rx_nob);
538 CERROR ("Error %d unpacking rx from %s\n",
539 rc, kptllnd_ptlid2str(rx->rx_initiator));
// Source id as claimed inside the message.
543 srcid.nid = msg->ptlm_srcnid;
544 srcid.pid = msg->ptlm_srcpid;
546 CDEBUG(D_NETTRACE, "%s: RX %s c %d %p\n", libcfs_id2str(srcid),
547 kptllnd_msgtype2str(msg->ptlm_type), msg->ptlm_credits, rx);
// The claimed source NID must agree with the NID Portals reported for the
// sender.
549 if (srcid.nid != kptllnd_ptl2lnetnid(rx->rx_initiator.nid)) {
550 CERROR("Bad source id %s from %s\n",
551 libcfs_id2str(srcid),
552 kptllnd_ptlid2str(rx->rx_initiator));
// NAKs are handled before the destination check: the peer is looked up by
// the claimed source id and the NAK is logged.
556 if (msg->ptlm_type == PTLLND_MSG_TYPE_NAK) {
557 peer = kptllnd_id2peer(srcid);
561 CWARN("NAK from %s (%s)\n",
562 libcfs_id2str(srcid),
563 kptllnd_ptlid2str(rx->rx_initiator));
// The message must be addressed to this node's NID and PID.
569 if (msg->ptlm_dstnid != kptllnd_data.kptl_ni->ni_nid ||
570 msg->ptlm_dstpid != the_lnet.ln_pid) {
571 CERROR("Bad dstid %s (expected %s) from %s\n",
572 libcfs_id2str((lnet_process_id_t) {
573 .nid = msg->ptlm_dstnid,
574 .pid = msg->ptlm_dstpid}),
575 libcfs_id2str((lnet_process_id_t) {
576 .nid = kptllnd_data.kptl_ni->ni_nid,
577 .pid = the_lnet.ln_pid}),
578 kptllnd_ptlid2str(rx->rx_initiator));
// HELLO goes through the handshake handler to obtain a peer; every other
// type requires an existing peer found by source id.
582 if (msg->ptlm_type == PTLLND_MSG_TYPE_HELLO) {
583 peer = kptllnd_peer_handle_hello(rx->rx_initiator, msg);
585 CWARN("No peer for %s\n",
586 kptllnd_ptlid2str(rx->rx_initiator));
590 peer = kptllnd_id2peer(srcid);
592 CWARN("NAK %s: no connection; peer must reconnect\n",
593 libcfs_id2str(srcid));
594 /* NAK to make the peer reconnect */
599 /* Ignore anything else while I'm waiting for HELLO */
600 if (peer->peer_state == PEER_STATE_WAITING_HELLO) {
601 kptllnd_peer_decref(peer);
606 LASSERT (msg->ptlm_srcnid == peer->peer_id.nid &&
607 msg->ptlm_srcpid == peer->peer_id.pid);
// The sender's incarnation stamp must match the one recorded for the peer.
609 if (msg->ptlm_srcstamp != peer->peer_incarnation) {
610 CERROR("Stale rx from %s srcstamp "LPX64" expected "LPX64"\n",
611 libcfs_id2str(peer->peer_id),
613 peer->peer_incarnation);
// The destination stamp must match this node's incarnation, except that a
// HELLO may carry a zero stamp.
618 if (msg->ptlm_dststamp != kptllnd_data.kptl_incarnation &&
619 (msg->ptlm_type != PTLLND_MSG_TYPE_HELLO || /* HELLO sends a */
620 msg->ptlm_dststamp != 0)) { /* zero dststamp */
621 CERROR("Stale rx from %s dststamp "LPX64" expected "LPX64"\n",
622 libcfs_id2str(peer->peer_id), msg->ptlm_dststamp,
623 kptllnd_data.kptl_incarnation);
// Credits returned by the peer are added under peer_lock. A total above the
// kptl_peercredits tunable is logged as an overflow, after copying the
// current value and dropping the lock so the log call runs unlocked.
628 if (msg->ptlm_credits != 0) {
629 spin_lock_irqsave(&peer->peer_lock, flags);
631 if (peer->peer_credits + msg->ptlm_credits >
632 *kptllnd_tunables.kptl_peercredits) {
633 credits = peer->peer_credits;
634 spin_unlock_irqrestore(&peer->peer_lock, flags);
636 CERROR("Credit overflow from %s: %d + %d > %d\n",
637 libcfs_id2str(peer->peer_id),
638 credits, msg->ptlm_credits,
639 *kptllnd_tunables.kptl_peercredits);
644 peer->peer_credits += msg->ptlm_credits;
646 spin_unlock_irqrestore(&peer->peer_lock, flags);
// New credits may allow queued sends to go out.
648 kptllnd_peer_check_sends(peer);
651 /* ptllnd-level protocol correct - rx takes my ref on peer and increments
652 * peer_outstanding_credits when it completes */
654 kptllnd_peer_alive(peer);
656 switch (msg->ptlm_type) {
658 /* already checked by kptllnd_msg_unpack() */
661 case PTLLND_MSG_TYPE_HELLO:
662 CDEBUG(D_NET, "PTLLND_MSG_TYPE_HELLO\n");
665 case PTLLND_MSG_TYPE_NOOP:
666 CDEBUG(D_NET, "PTLLND_MSG_TYPE_NOOP\n");
669 case PTLLND_MSG_TYPE_IMMEDIATE:
670 CDEBUG(D_NET, "PTLLND_MSG_TYPE_IMMEDIATE\n");
671 rc = lnet_parse(kptllnd_data.kptl_ni,
672 &msg->ptlm_u.immediate.kptlim_hdr,
675 if (rc >= 0) /* kptllnd_recv owns 'rx' now */
679 case PTLLND_MSG_TYPE_PUT:
680 case PTLLND_MSG_TYPE_GET:
681 CDEBUG(D_NET, "PTLLND_MSG_TYPE_%s\n",
682 msg->ptlm_type == PTLLND_MSG_TYPE_PUT ?
685 /* checked in kptllnd_msg_unpack() */
686 LASSERT (msg->ptlm_u.rdma.kptlrm_matchbits >=
687 PTL_RESERVED_MATCHBITS);
689 /* Update last match bits seen */
// NOTE(review): the lock is taken through peer->peer_lock but released
// through rx->rx_peer->peer_lock. These are the same lock only if rx->rx_peer
// equals peer here; that assignment is not visible in this listing. Confirm,
// and prefer one expression for both calls.
690 spin_lock_irqsave(&peer->peer_lock, flags);
692 if (msg->ptlm_u.rdma.kptlrm_matchbits >
693 rx->rx_peer->peer_last_matchbits_seen)
694 rx->rx_peer->peer_last_matchbits_seen =
695 msg->ptlm_u.rdma.kptlrm_matchbits;
697 spin_unlock_irqrestore(&rx->rx_peer->peer_lock, flags);
699 rc = lnet_parse(kptllnd_data.kptl_ni,
700 &msg->ptlm_u.rdma.kptlrm_hdr,
703 if (rc >= 0) /* kptllnd_recv owns 'rx' now */
// Failure handling: close the peer with the error code, then drop this
// function's peer reference only if the rx has not taken it over.
709 kptllnd_peer_close(peer, rc);
710 if (rx->rx_peer == NULL) /* drop ref on peer */
711 kptllnd_peer_decref(peer); /* unless rx_done will */