1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2004 Cluster File Systems, Inc.
5 * Author: Eric Barton <eric@bartonsoftware.com>
7 * This file is part of Lustre, http://www.lustre.org.
9 * Lustre is free software; you can redistribute it and/or
10 * modify it under the terms of version 2 of the GNU General Public
11 * License as published by the Free Software Foundation.
13 * Lustre is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with Lustre; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 #include "openibnal.h"
/* Network interface handle filled in by PtlNIInit() at module load and
 * released with PtlNIFini() at module unload. */
27 ptl_handle_ni_t koibnal_ni;
/* State shared by the whole NAL (NID, peer table, locks, counters);
 * zeroed at the start of koibnal_api_startup(). */
28 koib_data_t koibnal_data;
/* Run-time tunables: the I/O timeout and the registered sysctl handle. */
29 koib_tunables_t koibnal_tunables;
/* Identifier used for the openibnal sysctl directory entry. */
32 #define OPENIBNAL_SYSCTL 202
/* Identifiers for entries below that directory. */
34 #define OPENIBNAL_SYSCTL_TIMEOUT 1
/* NOTE(review): no table entry using this id is visible in this listing. */
35 #define OPENIBNAL_SYSCTL_ZERO_COPY 2
/* Sysctl entries for this NAL.  The visible entry exposes
 * koibnal_tunables.koib_io_timeout as an int called timeout with mode
 * 0644, handled by proc_dointvec.
 * NOTE(review): the table terminator is not visible in this listing. */
37 static ctl_table koibnal_ctl_table[] = {
38 {OPENIBNAL_SYSCTL_TIMEOUT, "timeout",
39 &koibnal_tunables.koib_io_timeout, sizeof (int),
40 0644, NULL, &proc_dointvec},
/* Top-level sysctl table: one openibnal directory (mode 0555) whose
 * child table is koibnal_ctl_table.  This is the table handed to
 * register_sysctl_table() in koibnal_module_init(). */
44 static ctl_table koibnal_top_ctl_table[] = {
45 {OPENIBNAL_SYSCTL, "openibnal", NULL, 0, 0555, koibnal_ctl_table},
/* Debug helper: print a service record labelled with tag and status rc.
 * A separate message is used for the NULL case.  The service name is
 * copied into a local buffer and explicitly NUL-terminated before it is
 * printed; the NID shown is service->service_data64[0].
 * NOTE(review): the NULL test and the print calls themselves are not
 * visible in this listing. */
51 print_service(struct ib_common_attrib_service *service, char *tag, int rc)
58 "status : %d (NULL)\n", tag, rc);
/* strncpy does not terminate on truncation, hence the explicit store. */
61 strncpy (name, service->service_name, sizeof(name)-1);
62 name[sizeof(name)-1] = 0;
66 "service id: "LPX64"\n"
68 "NID : "LPX64"\n", tag, rc,
69 service->service_id, name, service->service_data64[0]);
/* Completion callback handed to ib_service_set(), ib_service_delete() and
 * ib_service_get().  It releases koib_nid_signal so that the caller
 * blocked in down() can continue.
 * NOTE(review): callers pass the address of their rc2 as arg, so the
 * status is presumably stored through arg on a line that is not visible
 * here -- confirm. */
73 koibnal_service_setunset_done (tTS_IB_CLIENT_QUERY_TID tid, int status,
74 struct ib_common_attrib_service *service, void *arg)
77 up (&koibnal_data.koib_nid_signal);
/* Advertise this node: fill koibnal_data.koib_service with the CM service
 * id, the GID and PKEY of the chosen port, a lease of 0xffffffff and the
 * keys derived from the current NID, then submit it with ib_service_set()
 * and block on koib_nid_signal until the completion callback fires.
 * A NID must already be set (asserted).
 * NOTE(review): the error checks, the handling of rc2 and the return
 * statements are not visible in this listing. */
81 koibnal_advertise (void)
87 LASSERT (koibnal_data.koib_nid != PTL_NID_ANY);
/* Start from a clean record. */
89 memset (&koibnal_data.koib_service, 0,
90 sizeof (koibnal_data.koib_service));
92 koibnal_data.koib_service.service_id
93 = koibnal_data.koib_cm_service_id;
95 rc = ib_cached_gid_get(koibnal_data.koib_device,
96 koibnal_data.koib_port,
98 koibnal_data.koib_service.service_gid);
100 CERROR ("Can't get port %d GID: %d\n",
101 koibnal_data.koib_port, rc);
105 rc = ib_cached_pkey_get(koibnal_data.koib_device,
106 koibnal_data.koib_port,
108 &koibnal_data.koib_service.service_pkey);
110 CERROR ("Can't get port %d PKEY: %d\n",
111 koibnal_data.koib_port, rc);
115 koibnal_data.koib_service.service_lease = 0xffffffff;
/* Encode the NID into the service record keys. */
117 koibnal_set_service_keys(&koibnal_data.koib_service, koibnal_data.koib_nid);
119 CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n",
120 koibnal_data.koib_service.service_id,
121 koibnal_data.koib_service.service_name,
122 *koibnal_service_nid_field(&koibnal_data.koib_service));
/* The timeout passed is the io_timeout tunable multiplied by HZ. */
124 rc = ib_service_set (koibnal_data.koib_device,
125 koibnal_data.koib_port,
126 &koibnal_data.koib_service,
127 IB_SA_SERVICE_COMP_MASK_ID |
128 IB_SA_SERVICE_COMP_MASK_GID |
129 IB_SA_SERVICE_COMP_MASK_PKEY |
130 IB_SA_SERVICE_COMP_MASK_LEASE |
131 KOIBNAL_SERVICE_KEY_MASK,
132 koibnal_tunables.koib_io_timeout * HZ,
133 koibnal_service_setunset_done, &rc2, &tid);
/* Wait for koibnal_service_setunset_done(). */
136 down (&koibnal_data.koib_nid_signal);
141 CERROR ("Error %d advertising SM service\n", rc);
/* Remove the service record keyed by the current NID using
 * ib_service_delete(), then block on koib_nid_signal for completion.
 * expect_success says whether a record is expected to exist: the result
 * is treated as fine when (rc2 == 0) agrees with expect_success.
 * koibnal_set_mynid() passes 1 when withdrawing its own advert and 0 when
 * clearing a possibly stale one.
 * NOTE(review): the message after the wait prints rc while the test is
 * on rc2; unless rc is reassigned on a line missing from this listing
 * this reports the wrong value -- confirm and print rc2 if so. */
147 koibnal_unadvertise (int expect_success)
153 LASSERT (koibnal_data.koib_nid != PTL_NID_ANY);
155 memset (&koibnal_data.koib_service, 0,
156 sizeof (koibnal_data.koib_service));
/* Only the NID-derived keys are used to select the record. */
158 koibnal_set_service_keys(&koibnal_data.koib_service, koibnal_data.koib_nid);
160 CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n",
161 koibnal_data.koib_service.service_name,
162 *koibnal_service_nid_field(&koibnal_data.koib_service));
164 rc = ib_service_delete (koibnal_data.koib_device,
165 koibnal_data.koib_port,
166 &koibnal_data.koib_service,
167 KOIBNAL_SERVICE_KEY_MASK,
168 koibnal_tunables.koib_io_timeout * HZ,
169 koibnal_service_setunset_done, &rc2, &tid);
171 CERROR ("Immediate error %d unadvertising NID "LPX64"\n",
172 rc, koibnal_data.koib_nid);
/* Wait for koibnal_service_setunset_done(). */
176 down (&koibnal_data.koib_nid_signal);
178 if ((rc2 == 0) == !!expect_success)
182 CERROR("Error %d unadvertising NID "LPX64"\n",
183 rc, koibnal_data.koib_nid);
185 CWARN("Removed conflicting NID "LPX64"\n",
186 koibnal_data.koib_nid);
/* Query the service record keyed by the current NID with
 * ib_service_get() and wait on koib_nid_signal for completion; errors are
 * logged.
 * NOTE(review): srv is a function-local static, so this is not
 * reentrant; the visible caller, koibnal_set_mynid(), holds
 * koib_nid_mutex -- confirm there are no other callers. */
192 koibnal_check_advert (void)
198 static struct ib_common_attrib_service srv;
200 memset (&srv, 0, sizeof (srv));
202 koibnal_set_service_keys(&srv, koibnal_data.koib_nid);
204 rc = ib_service_get (koibnal_data.koib_device,
205 koibnal_data.koib_port,
207 KOIBNAL_SERVICE_KEY_MASK,
208 koibnal_tunables.koib_io_timeout * HZ,
209 koibnal_service_setunset_done, &rc2,
213 CERROR ("Immediate error %d checking SM service\n", rc);
215 down (&koibnal_data.koib_nid_signal);
219 CERROR ("Error %d checking SM service\n", rc);
/* Install nid as the NID of this node; PTL_NID_ANY clears it (used by
 * koibnal_api_shutdown()).  Everything runs under koib_nid_mutex:
 *  - nothing is done if the NID is unchanged;
 *  - if a NID was set before, its advert is withdrawn and the CM listener
 *    is stopped;
 *  - the new NID is recorded together with an incarnation stamp built
 *    from the current time of day in microseconds, and all peers are
 *    deleted;
 *  - if the new NID is not PTL_NID_ANY, any previous advert is removed, a
 *    CM service id is assigned, a listener is started and the node is
 *    advertised and the advert checked;
 *  - the tail resets the NID to PTL_NID_ANY and deletes peers again.
 * NOTE(review): the conditions selecting the failure tail and the return
 * statements are not visible in this listing. */
226 koibnal_set_mynid(ptl_nid_t nid)
229 lib_ni_t *ni = &koibnal_lib.libnal_ni;
232 CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
233 nid, ni->ni_pid.nid);
/* Timestamp taken before the mutex; it feeds the incarnation below. */
235 do_gettimeofday(&tv);
237 down (&koibnal_data.koib_nid_mutex);
239 if (nid == koibnal_data.koib_nid) {
240 /* no change of NID */
241 up (&koibnal_data.koib_nid_mutex);
245 CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
246 koibnal_data.koib_nid, nid);
/* Tear down the advert and listener of the previous NID. */
248 if (koibnal_data.koib_nid != PTL_NID_ANY) {
250 koibnal_unadvertise (1);
252 rc = ib_cm_listen_stop (koibnal_data.koib_listen_handle);
254 CERROR ("Error %d stopping listener\n", rc);
257 koibnal_data.koib_nid = ni->ni_pid.nid = nid;
258 koibnal_data.koib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
260 /* Delete all existing peers and their connections after new
261 * NID/incarnation set to ensure no old connections in our brave
263 koibnal_del_peer (PTL_NID_ANY, 0);
266 if (koibnal_data.koib_nid != PTL_NID_ANY) {
267 /* New NID installed */
269 /* remove any previous advert (crashed node etc) */
270 koibnal_unadvertise(0);
272 /* Assign new service number */
273 koibnal_data.koib_cm_service_id = ib_cm_service_assign();
274 CDEBUG(D_NET, "service_id "LPX64"\n", koibnal_data.koib_cm_service_id);
/* Accept incoming connections on exactly this service id. */
276 rc = ib_cm_listen(koibnal_data.koib_cm_service_id,
277 TS_IB_CM_SERVICE_EXACT_MASK,
278 koibnal_passive_conn_callback, NULL,
279 &koibnal_data.koib_listen_handle);
281 CERROR ("ib_cm_listen error: %d\n", rc);
285 rc = koibnal_advertise();
287 koibnal_check_advert();
292 koibnal_data.koib_nid = PTL_NID_ANY;
293 /* remove any peers that sprung up while I failed to
294 * advertise myself */
295 koibnal_del_peer (PTL_NID_ANY, 0);
298 up (&koibnal_data.koib_nid_mutex);
/* Allocate and initialise a peer descriptor for nid (which must not be
 * PTL_NID_ANY).  The new peer is zeroed, carries one reference for the
 * caller, is not yet linked into the peer table, and starts with the
 * minimum reconnect interval.  koib_npeers is incremented.
 * NOTE(review): the allocation-failure path, the store of the NID into
 * the peer and the return statement are not visible in this listing. */
303 koibnal_create_peer (ptl_nid_t nid)
307 LASSERT (nid != PTL_NID_ANY);
309 PORTAL_ALLOC (peer, sizeof (*peer));
313 memset(peer, 0, sizeof(*peer)); /* zero flags etc */
316 atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */
318 INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */
319 INIT_LIST_HEAD (&peer->ibp_conns);
320 INIT_LIST_HEAD (&peer->ibp_tx_queue);
/* The first connection attempt may start immediately. */
322 peer->ibp_reconnect_time = jiffies;
323 peer->ibp_reconnect_interval = OPENIBNAL_MIN_RECONNECT_INTERVAL;
325 atomic_inc (&koibnal_data.koib_npeers);
/* Free a peer whose last reference has gone (called from
 * koibnal_put_peer()).  Asserts that the peer has no references, no
 * persistence, is not in the peer table, is not connecting, and has no
 * connections or queued transmits.  The memory is released first and
 * koib_npeers is decremented afterwards; koibnal_api_shutdown() polls
 * that counter while waiting for peers to go away. */
330 koibnal_destroy_peer (koib_peer_t *peer)
332 CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ibp_nid, peer);
334 LASSERT (atomic_read (&peer->ibp_refcount) == 0);
335 LASSERT (peer->ibp_persistence == 0);
336 LASSERT (!koibnal_peer_active(peer));
337 LASSERT (peer->ibp_connecting == 0);
338 LASSERT (list_empty (&peer->ibp_conns));
339 LASSERT (list_empty (&peer->ibp_tx_queue));
341 PORTAL_FREE (peer, sizeof (*peer));
343 /* NB a peer's connections keep a reference on their peer until
344 * they are destroyed, so we can be assured that _all_ state to do
345 * with this peer has been cleaned up when its refcount drops to
347 atomic_dec (&koibnal_data.koib_npeers);
/* Drop one reference on peer.  The count must be positive on entry; when
 * it reaches zero the peer is destroyed with koibnal_destroy_peer(). */
351 koibnal_put_peer (koib_peer_t *peer)
353 CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n",
355 atomic_read (&peer->ibp_refcount));
357 LASSERT (atomic_read (&peer->ibp_refcount) > 0);
/* Not the last reference: nothing more to do. */
358 if (!atomic_dec_and_test (&peer->ibp_refcount))
361 koibnal_destroy_peer (peer);
/* Search the hash chain selected by koibnal_nid2peerlist() for a peer
 * with the given NID.  Callers in this file hold koib_global_lock (read
 * or write) around the call.  No reference is taken here;
 * koibnal_get_peer() adds its own.  Every peer on the chain is asserted
 * to be persistent, connecting, or to own at least one connection.
 * NOTE(review): the return statements are not visible in this listing. */
365 koibnal_find_peer_locked (ptl_nid_t nid)
367 struct list_head *peer_list = koibnal_nid2peerlist (nid);
368 struct list_head *tmp;
371 list_for_each (tmp, peer_list) {
373 peer = list_entry (tmp, koib_peer_t, ibp_list);
375 LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
376 peer->ibp_connecting != 0 || /* creating conns */
377 !list_empty (&peer->ibp_conns)); /* active conn */
/* Different NID hashed to the same chain. */
379 if (peer->ibp_nid != nid)
382 CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
383 peer, nid, atomic_read (&peer->ibp_refcount));
/* Look up the peer for nid under the read side of koib_global_lock and,
 * if one is found, take an extra reference on it for the caller (to be
 * dropped with koibnal_put_peer()). */
390 koibnal_get_peer (ptl_nid_t nid)
394 read_lock (&koibnal_data.koib_global_lock);
395 peer = koibnal_find_peer_locked (nid);
396 if (peer != NULL) /* +1 ref for caller? */
397 atomic_inc (&peer->ibp_refcount);
398 read_unlock (&koibnal_data.koib_global_lock);
/* Remove peer from the peer table and drop the reference the table held.
 * The peer must be non-persistent, have no connections and currently be
 * in the table (all asserted).
 * NOTE(review): the _locked suffix suggests the caller holds
 * koib_global_lock for writing -- confirm against callers. */
404 koibnal_unlink_peer_locked (koib_peer_t *peer)
406 LASSERT (peer->ibp_persistence == 0);
407 LASSERT (list_empty(&peer->ibp_conns));
409 LASSERT (koibnal_peer_active(peer));
410 list_del_init (&peer->ibp_list);
411 /* lose peerlist's ref */
412 koibnal_put_peer (peer);
/* Walk every hash chain of the peer table under the read lock and report
 * the NID and persistence count of one peer through nidp and
 * persistencep.  The lock is released both on the reporting path and at
 * the end of the walk.
 * NOTE(review): the lines that use index to pick the peer and the return
 * values are not visible in this listing; presumably index selects the
 * index-th peer in walk order -- confirm. */
416 koibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
419 struct list_head *ptmp;
422 read_lock (&koibnal_data.koib_global_lock);
424 for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) {
426 list_for_each (ptmp, &koibnal_data.koib_peers[i]) {
428 peer = list_entry (ptmp, koib_peer_t, ibp_list);
429 LASSERT (peer->ibp_persistence != 0 ||
430 peer->ibp_connecting != 0 ||
431 !list_empty (&peer->ibp_conns));
436 *nidp = peer->ibp_nid;
437 *persistencep = peer->ibp_persistence;
439 read_unlock (&koibnal_data.koib_global_lock);
444 read_unlock (&koibnal_data.koib_global_lock);
/* Add one share of persistence for nid.  A candidate peer is created
 * before the write lock is taken; under the lock the table is searched
 * again, and either the candidate is dropped in favour of the existing
 * peer or the candidate is linked into the table, which then owns the
 * reference obtained at creation.  The persistence count of the peer is
 * then incremented.  PTL_NID_ANY is tested for up front.
 * NOTE(review): the branch structure around peer2 and the return values
 * are not visible in this listing. */
449 koibnal_add_persistent_peer (ptl_nid_t nid)
455 if (nid == PTL_NID_ANY)
458 peer = koibnal_create_peer (nid);
462 write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
/* Somebody may have added the same NID while we were allocating. */
464 peer2 = koibnal_find_peer_locked (nid);
466 koibnal_put_peer (peer);
469 /* peer table takes existing ref on peer */
470 list_add_tail (&peer->ibp_list,
471 koibnal_nid2peerlist (nid));
474 peer->ibp_persistence++;
476 write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
/* Reduce the persistence of peer and, once it reaches zero, close all of
 * its connections with koibnal_close_conn_locked().  The persistence is
 * either cleared outright or decremented by one; the choice is
 * presumably driven by single_share on a line not visible here --
 * confirm.  Called from koibnal_del_peer() with koib_global_lock held for
 * writing. */
481 koibnal_del_peer_locked (koib_peer_t *peer, int single_share)
483 struct list_head *ctmp;
484 struct list_head *cnxt;
488 peer->ibp_persistence = 0;
489 else if (peer->ibp_persistence > 0)
490 peer->ibp_persistence--;
/* Other shares remain: keep the peer and its connections. */
492 if (peer->ibp_persistence != 0)
/* The safe iterator is used because closing may unlink the entry. */
495 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
496 conn = list_entry(ctmp, koib_conn_t, ibc_list);
498 koibnal_close_conn_locked (conn, 0);
501 /* NB peer unlinks itself when last conn is closed */
/* Delete the peer with the given NID, or every peer when nid is
 * PTL_NID_ANY.  Under the write lock the relevant hash chains are walked
 * (a single chain for a specific NID, all chains otherwise) and each
 * matching peer is passed to koibnal_del_peer_locked() together with
 * single_share.  rc is set to 0 as soon as anything matched.
 * NOTE(review): the initial value of rc and the return statement are not
 * visible in this listing. */
505 koibnal_del_peer (ptl_nid_t nid, int single_share)
508 struct list_head *ptmp;
509 struct list_head *pnxt;
516 write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
/* Restrict the walk to one chain when a specific NID was given. */
518 if (nid != PTL_NID_ANY)
519 lo = hi = koibnal_nid2peerlist(nid) - koibnal_data.koib_peers;
522 hi = koibnal_data.koib_peer_hash_size - 1;
525 for (i = lo; i <= hi; i++) {
526 list_for_each_safe (ptmp, pnxt, &koibnal_data.koib_peers[i]) {
527 peer = list_entry (ptmp, koib_peer_t, ibp_list);
528 LASSERT (peer->ibp_persistence != 0 ||
529 peer->ibp_connecting != 0 ||
530 !list_empty (&peer->ibp_conns));
532 if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
535 koibnal_del_peer_locked (peer, single_share);
536 rc = 0; /* matched something */
543 write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
/* Walk all peers and their connection lists under the read lock and hand
 * back one connection with an extra reference taken on it; the caller
 * releases it with koibnal_put_conn() (see koibnal_cmd()).  The lock is
 * dropped both on the success path and after an unsuccessful walk.
 * NOTE(review): the lines that use index to pick the connection and the
 * return statements are not visible in this listing. */
549 koibnal_get_conn_by_idx (int index)
552 struct list_head *ptmp;
554 struct list_head *ctmp;
557 read_lock (&koibnal_data.koib_global_lock);
559 for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) {
560 list_for_each (ptmp, &koibnal_data.koib_peers[i]) {
562 peer = list_entry (ptmp, koib_peer_t, ibp_list);
563 LASSERT (peer->ibp_persistence > 0 ||
564 peer->ibp_connecting != 0 ||
565 !list_empty (&peer->ibp_conns));
567 list_for_each (ctmp, &peer->ibp_conns) {
571 conn = list_entry (ctmp, koib_conn_t, ibc_list);
572 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
573 conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
574 atomic_read (&conn->ibc_refcount));
/* Reference for the caller, taken while still under the lock. */
575 atomic_inc (&conn->ibc_refcount);
576 read_unlock (&koibnal_data.koib_global_lock);
582 read_unlock (&koibnal_data.koib_global_lock);
/* Allocate and set up a new connection:
 *  - the zeroed connection descriptor with its tx and rdma queues and lock;
 *  - the array of OPENIBNAL_RX_MSGS receive descriptors;
 *  - the registered receive pages (local write access), carved into
 *    OPENIBNAL_MSG_SIZE slots, one per receive descriptor;
 *  - a reliable-connected queue pair bound to the shared tx and rx
 *    completion queues, moved to the INIT state with RDMA read and write
 *    enabled.
 * On success the connection carries one reference for the caller.  The
 * tail of the function calls koibnal_destroy_conn(), which is why
 * koib_nconns is incremented early.
 * The two queue pair calls take the address of the params members; the
 * ampersand and the first letters of params had been mis-decoded into a
 * pilcrow character and are restored here.
 * NOTE(review): the failure checks, the declaration of params and the
 * return statements are not visible in this listing. */
587 koibnal_create_conn (void)
597 struct ib_qp_create_param qp_create;
598 struct ib_qp_attribute qp_attr;
601 PORTAL_ALLOC (conn, sizeof (*conn));
603 CERROR ("Can't allocate connection\n");
607 /* zero flags, NULL pointers etc... */
608 memset (conn, 0, sizeof (*conn));
610 INIT_LIST_HEAD (&conn->ibc_tx_queue);
611 INIT_LIST_HEAD (&conn->ibc_rdma_queue);
612 spin_lock_init (&conn->ibc_lock);
614 atomic_inc (&koibnal_data.koib_nconns);
615 /* well not really, but I call destroy() on failure, which decrements */
617 PORTAL_ALLOC (conn->ibc_rxs, OPENIBNAL_RX_MSGS * sizeof (koib_rx_t));
618 if (conn->ibc_rxs == NULL)
620 memset (conn->ibc_rxs, 0, OPENIBNAL_RX_MSGS * sizeof(koib_rx_t));
622 rc = koibnal_alloc_pages(&conn->ibc_rx_pages,
623 OPENIBNAL_RX_MSG_PAGES,
624 IB_ACCESS_LOCAL_WRITE);
628 vaddr_base = vaddr = conn->ibc_rx_pages->oibp_vaddr;
/* Give every receive descriptor its slot: rx_vaddr is the address within
 * the registered region, rx_msg the kernel address of the same slot. */
630 for (i = ipage = page_offset = 0; i < OPENIBNAL_RX_MSGS; i++) {
631 struct page *page = conn->ibc_rx_pages->oibp_pages[ipage];
632 koib_rx_t *rx = &conn->ibc_rxs[i];
635 rx->rx_vaddr = vaddr;
636 rx->rx_msg = (koib_msg_t *)(((char *)page_address(page)) + page_offset);
638 vaddr += OPENIBNAL_MSG_SIZE;
639 LASSERT (vaddr <= vaddr_base + OPENIBNAL_RX_MSG_BYTES);
641 page_offset += OPENIBNAL_MSG_SIZE;
642 LASSERT (page_offset <= PAGE_SIZE);
/* Page full: continue on the next one. */
644 if (page_offset == PAGE_SIZE) {
647 LASSERT (ipage <= OPENIBNAL_RX_MSG_PAGES);
651 params.qp_create = (struct ib_qp_create_param) {
653 /* Sends have an optional RDMA */
654 .max_outstanding_send_request = 2 * OPENIBNAL_MSG_QUEUE_SIZE,
655 .max_outstanding_receive_request = OPENIBNAL_MSG_QUEUE_SIZE,
656 .max_send_gather_element = 1,
657 .max_receive_scatter_element = 1,
659 .pd = koibnal_data.koib_pd,
660 .send_queue = koibnal_data.koib_tx_cq,
661 .receive_queue = koibnal_data.koib_rx_cq,
662 .send_policy = IB_WQ_SIGNAL_SELECTABLE,
663 .receive_policy = IB_WQ_SIGNAL_SELECTABLE,
665 .transport = IB_TRANSPORT_RC,
666 .device_specific = NULL,
669 rc = ib_qp_create (&params.qp_create, &conn->ibc_qp, &conn->ibc_qpn);
671 CERROR ("Failed to create queue pair: %d\n", rc);
675 /* Mark QP created */
676 conn->ibc_state = OPENIBNAL_CONN_INIT_QP;
678 params.qp_attr = (struct ib_qp_attribute) {
679 .state = IB_QP_STATE_INIT,
680 .port = koibnal_data.koib_port,
681 .enable_rdma_read = 1,
682 .enable_rdma_write = 1,
683 .valid_fields = (IB_QP_ATTRIBUTE_STATE |
684 IB_QP_ATTRIBUTE_PORT |
685 IB_QP_ATTRIBUTE_PKEY_INDEX |
686 IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE),
688 rc = ib_qp_modify(conn->ibc_qp, &params.qp_attr);
690 CERROR ("Failed to modify queue pair: %d\n", rc);
694 /* 1 ref for caller */
695 atomic_set (&conn->ibc_refcount, 1);
699 koibnal_destroy_conn (conn);
/* Release everything owned by a connection that has no references, no
 * queued or posted sends and no pending connection request (asserted).
 * The switch on ibc_state decides how much has to be undone; the queue
 * pair is destroyed in the INIT_QP case.  Afterwards the receive pages,
 * the receive descriptor array, the reference on the peer (if any) and
 * the connection itself are released, and koib_nconns is decremented.
 * If that was the last connection during shutdown, the scheduler and
 * connd wait queues are woken.
 * NOTE(review): presumably the ZOMBIE case falls through to INIT_QP; the
 * fall-through or break lines are not visible in this listing. */
704 koibnal_destroy_conn (koib_conn_t *conn)
708 CDEBUG (D_NET, "connection %p\n", conn);
710 LASSERT (atomic_read (&conn->ibc_refcount) == 0);
711 LASSERT (list_empty(&conn->ibc_tx_queue));
712 LASSERT (list_empty(&conn->ibc_rdma_queue));
713 LASSERT (conn->ibc_nsends_posted == 0);
714 LASSERT (conn->ibc_connreq == NULL);
716 switch (conn->ibc_state) {
717 case OPENIBNAL_CONN_ZOMBIE:
718 /* called after connection sequence initiated */
720 case OPENIBNAL_CONN_INIT_QP:
721 rc = ib_qp_destroy(conn->ibc_qp);
723 CERROR("Can't destroy QP: %d\n", rc);
726 case OPENIBNAL_CONN_INIT_NOTHING:
/* Each resource may be absent when creation failed part way. */
733 if (conn->ibc_rx_pages != NULL)
734 koibnal_free_pages(conn->ibc_rx_pages);
736 if (conn->ibc_rxs != NULL)
737 PORTAL_FREE(conn->ibc_rxs,
738 OPENIBNAL_RX_MSGS * sizeof(koib_rx_t));
740 if (conn->ibc_peer != NULL)
741 koibnal_put_peer(conn->ibc_peer);
743 PORTAL_FREE(conn, sizeof (*conn));
745 atomic_dec(&koibnal_data.koib_nconns);
747 if (atomic_read (&koibnal_data.koib_nconns) == 0 &&
748 koibnal_data.koib_shutdown) {
749 /* I just nuked the last connection on shutdown; wake up
750 * everyone so they can exit. */
751 wake_up_all(&koibnal_data.koib_sched_waitq);
752 wake_up_all(&koibnal_data.koib_connd_waitq);
/* Drop one reference on conn.  When the last reference goes, the
 * connection must already be a ZOMBIE (asserted); it is then queued on
 * koib_connd_conns under koib_connd_lock and the connd wait queue is
 * woken, presumably so the connd thread can destroy it -- confirm.
 * NOTE(review): the debug print dereferences conn->ibc_peer without a
 * NULL check, whereas koibnal_destroy_conn() allows ibc_peer to be NULL;
 * confirm no caller can put a connection that has no peer yet. */
757 koibnal_put_conn (koib_conn_t *conn)
761 CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n",
762 conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
763 atomic_read (&conn->ibc_refcount));
765 LASSERT (atomic_read (&conn->ibc_refcount) > 0);
766 if (!atomic_dec_and_test (&conn->ibc_refcount))
769 /* last ref only goes on zombies */
770 LASSERT (conn->ibc_state == OPENIBNAL_CONN_ZOMBIE);
772 spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
774 list_add (&conn->ibc_list, &koibnal_data.koib_connd_conns);
775 wake_up (&koibnal_data.koib_connd_waitq);
777 spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
/* Close every connection of peer, passing why to
 * koibnal_close_conn_locked().  koibnal_close_matching_conns() calls this
 * with the write lock held and adds the return value to its count of
 * closed connections.
 * NOTE(review): the counter and the return statement are not visible in
 * this listing. */
781 koibnal_close_peer_conns_locked (koib_peer_t *peer, int why)
784 struct list_head *ctmp;
785 struct list_head *cnxt;
788 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
789 conn = list_entry (ctmp, koib_conn_t, ibc_list);
792 koibnal_close_conn_locked (conn, why);
/* Close, with -ESTALE, the connections of peer whose incarnation differs
 * from the one given; connections with a matching incarnation are
 * tested for first and, presumably, skipped (the statement after the
 * comparison is not visible in this listing).
 * NOTE(review): the return value is not visible either. */
799 koibnal_close_stale_conns_locked (koib_peer_t *peer, __u64 incarnation)
802 struct list_head *ctmp;
803 struct list_head *cnxt;
806 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
807 conn = list_entry (ctmp, koib_conn_t, ibc_list);
809 if (conn->ibc_incarnation == incarnation)
812 CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
813 peer->ibp_nid, conn->ibc_incarnation, incarnation);
816 koibnal_close_conn_locked (conn, -ESTALE);
/* Close all connections to the peer with the given NID, or to every peer
 * when nid is PTL_NID_ANY.  The walk mirrors koibnal_del_peer(): one hash
 * chain for a specific NID, all chains for the wildcard, under the write
 * lock.  For a specific NID the result is -ENOENT when nothing was
 * closed and 0 otherwise; the wildcard case is handled before that (the
 * comment says it always succeeds; its return line is not visible). */
823 koibnal_close_matching_conns (ptl_nid_t nid)
827 struct list_head *ptmp;
828 struct list_head *pnxt;
834 write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
836 if (nid != PTL_NID_ANY)
837 lo = hi = koibnal_nid2peerlist(nid) - koibnal_data.koib_peers;
840 hi = koibnal_data.koib_peer_hash_size - 1;
843 for (i = lo; i <= hi; i++) {
844 list_for_each_safe (ptmp, pnxt, &koibnal_data.koib_peers[i]) {
846 peer = list_entry (ptmp, koib_peer_t, ibp_list);
847 LASSERT (peer->ibp_persistence != 0 ||
848 peer->ibp_connecting != 0 ||
849 !list_empty (&peer->ibp_conns));
851 if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
854 count += koibnal_close_peer_conns_locked (peer, 0);
858 write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
860 /* wildcards always succeed */
861 if (nid == PTL_NID_ANY)
864 return (count == 0 ? -ENOENT : 0);
/* Command entry point registered with libcfs_nal_cmd_register().
 * Dispatches on pcfg->pcfg_command:
 *  - GET_PEER: report a peer selected by pcfg_count; fills pcfg_nid and
 *    pcfg_wait and clears pcfg_count;
 *  - ADD_PEER: add a persistent peer for pcfg_nid;
 *  - DEL_PEER: delete pcfg_nid, a non-zero pcfg_flags meaning
 *    single_share;
 *  - GET_CONN: report the NID of a connection selected by pcfg_count and
 *    drop the reference taken by the lookup;
 *  - CLOSE_CONNECTION: close the connections matching pcfg_nid;
 *  - REGISTER_MYNID: set the local NID; PTL_NID_ANY is tested for first.
 * NOTE(review): the default rc, the error branches, the break statements
 * and the return are not visible in this listing; private is not used in
 * the visible lines. */
868 koibnal_cmd(struct portals_cfg *pcfg, void * private)
872 LASSERT (pcfg != NULL);
874 switch(pcfg->pcfg_command) {
875 case NAL_CMD_GET_PEER: {
879 rc = koibnal_get_peer_info(pcfg->pcfg_count,
881 pcfg->pcfg_nid = nid;
885 pcfg->pcfg_count = 0;
886 pcfg->pcfg_wait = share_count;
889 case NAL_CMD_ADD_PEER: {
890 rc = koibnal_add_persistent_peer (pcfg->pcfg_nid);
893 case NAL_CMD_DEL_PEER: {
894 rc = koibnal_del_peer (pcfg->pcfg_nid,
895 /* flags == single_share */
896 pcfg->pcfg_flags != 0);
899 case NAL_CMD_GET_CONN: {
900 koib_conn_t *conn = koibnal_get_conn_by_idx (pcfg->pcfg_count);
906 pcfg->pcfg_nid = conn->ibc_peer->ibp_nid;
909 pcfg->pcfg_flags = 0;
/* Release the reference taken by koibnal_get_conn_by_idx(). */
910 koibnal_put_conn (conn);
914 case NAL_CMD_CLOSE_CONNECTION: {
915 rc = koibnal_close_matching_conns (pcfg->pcfg_nid);
918 case NAL_CMD_REGISTER_MYNID: {
919 if (pcfg->pcfg_nid == PTL_NID_ANY)
922 rc = koibnal_set_mynid (pcfg->pcfg_nid);
/* Undo koibnal_alloc_pages(): deregister the memory region if it was
 * mapped, free every page that was actually allocated (slots may be NULL
 * after a partial allocation) and release the descriptor, whose size
 * depends on the number of page slots. */
931 koibnal_free_pages (koib_pages_t *p)
933 int npages = p->oibp_npages;
937 if (p->oibp_mapped) {
938 rc = ib_memory_deregister(p->oibp_handle);
940 CERROR ("Deregister error: %d\n", rc);
943 for (i = 0; i < npages; i++)
944 if (p->oibp_pages[i] != NULL)
945 __free_page(p->oibp_pages[i]);
/* Same size expression as used at allocation time. */
947 PORTAL_FREE (p, offsetof(koib_pages_t, oibp_pages[npages]));
/* Allocate npages kernel pages and register them with the protection
 * domain as one region of npages * PAGE_SIZE bytes using
 * ib_memory_register_physical().  A temporary array describing the
 * physical address and size of each page is built for the registration
 * and freed right after it.  Every failure path releases what was
 * allocated so far through koibnal_free_pages().
 * NOTE(review): the use of access, the stores through pp and into the
 * descriptor after registration, and the return statements are not
 * visible in this listing. */
951 koibnal_alloc_pages (koib_pages_t **pp, int npages, int access)
954 struct ib_physical_buffer *phys_pages;
958 PORTAL_ALLOC(p, offsetof(koib_pages_t, oibp_pages[npages]));
960 CERROR ("Can't allocate buffer %d\n", npages);
/* Zeroing leaves all page slots NULL and the mapped flag clear, which is
 * what koibnal_free_pages() relies on after a partial failure. */
964 memset (p, 0, offsetof(koib_pages_t, oibp_pages[npages]));
965 p->oibp_npages = npages;
967 for (i = 0; i < npages; i++) {
968 p->oibp_pages[i] = alloc_page (GFP_KERNEL);
969 if (p->oibp_pages[i] == NULL) {
970 CERROR ("Can't allocate page %d of %d\n", i, npages);
971 koibnal_free_pages(p);
976 PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
977 if (phys_pages == NULL) {
978 CERROR ("Can't allocate physarray for %d pages\n", npages);
979 koibnal_free_pages(p);
983 for (i = 0; i < npages; i++) {
984 phys_pages[i].size = PAGE_SIZE;
985 phys_pages[i].address =
986 koibnal_page2phys(p->oibp_pages[i]);
990 rc = ib_memory_register_physical(koibnal_data.koib_pd,
993 npages * PAGE_SIZE, 0,
/* The description array is only needed for the registration call. */
999 PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
1002 CERROR ("Error %d mapping %d pages\n", rc, npages);
1003 koibnal_free_pages(p);
/* Allocate and register the pages backing pre-mapped transmit messages
 * and initialise all OPENIBNAL_TX_MSGS transmit descriptors.  Each
 * descriptor is zeroed and given its message slot (kernel address in
 * tx_msg, address within the registered region in tx_vaddr).
 * Descriptors with index OPENIBNAL_NTX and above are flagged tx_isnblk.
 * Each descriptor goes onto one of the two idle lists; presumably the
 * choice follows tx_isnblk, but the selecting condition is not visible
 * in this listing.  The assertions require a message to fit in a page
 * and a page to hold a whole number of messages.
 * NOTE(review): the error check after koibnal_alloc_pages() and the
 * return value are not visible either. */
1013 koibnal_setup_tx_descs (void)
1016 int page_offset = 0;
1024 /* pre-mapped messages are not bigger than 1 page */
1025 LASSERT (OPENIBNAL_MSG_SIZE <= PAGE_SIZE);
1027 /* No fancy arithmetic when we do the buffer calculations */
1028 LASSERT (PAGE_SIZE % OPENIBNAL_MSG_SIZE == 0);
1030 rc = koibnal_alloc_pages(&koibnal_data.koib_tx_pages,
1031 OPENIBNAL_TX_MSG_PAGES,
1032 0); /* local read access only */
1036 vaddr = vaddr_base = koibnal_data.koib_tx_pages->oibp_vaddr;
1038 for (i = 0; i < OPENIBNAL_TX_MSGS; i++) {
1039 page = koibnal_data.koib_tx_pages->oibp_pages[ipage];
1040 tx = &koibnal_data.koib_tx_descs[i];
1042 memset (tx, 0, sizeof(*tx)); /* zero flags etc */
1044 tx->tx_msg = (koib_msg_t *)(((char *)page_address(page)) + page_offset);
1045 tx->tx_vaddr = vaddr;
1046 tx->tx_isnblk = (i >= OPENIBNAL_NTX);
1047 tx->tx_mapped = KOIB_TX_UNMAPPED;
1049 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n",
1050 i, tx, tx->tx_msg, tx->tx_vaddr);
1053 list_add (&tx->tx_list,
1054 &koibnal_data.koib_idle_nblk_txs);
1056 list_add (&tx->tx_list,
1057 &koibnal_data.koib_idle_txs);
/* Advance to the next message slot. */
1059 vaddr += OPENIBNAL_MSG_SIZE;
1060 LASSERT (vaddr <= vaddr_base + OPENIBNAL_TX_MSG_BYTES);
1062 page_offset += OPENIBNAL_MSG_SIZE;
1063 LASSERT (page_offset <= PAGE_SIZE);
1065 if (page_offset == PAGE_SIZE) {
1068 LASSERT (ipage <= OPENIBNAL_TX_MSG_PAGES);
/* NAL shutdown hook (installed as nal_ni_fini) and failure path of
 * koibnal_api_startup().  The switch on koib_init lists the teardown
 * steps in the reverse order of startup: command interface and NID,
 * TX CQ, RX CQ, transmit pages, FMR pool, protection domain, lib
 * interface and finally the data structures and threads.  Afterwards the
 * transmit descriptor array and the peer hash table are freed and
 * koib_init is reset to OPENIBNAL_INIT_NOTHING.
 * NOTE(review): presumably each case falls through to the next so that a
 * partially initialised NAL is unwound from wherever startup stopped;
 * the fall-through or break lines are not visible in this listing.
 * NOTE(review): both wait loops sleep in TASK_INTERRUPTIBLE, so a pending
 * signal would make schedule_timeout() return at once -- confirm this is
 * acceptable. */
1076 koibnal_api_shutdown (nal_t *nal)
1081 if (nal->nal_refct != 0) {
1082 /* This module got the first ref */
1083 PORTAL_MODULE_UNUSE;
1087 CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
1088 atomic_read (&portal_kmemory));
1090 LASSERT(nal == &koibnal_api);
1092 switch (koibnal_data.koib_init) {
1094 CERROR ("Unexpected state %d\n", koibnal_data.koib_init);
1097 case OPENIBNAL_INIT_ALL:
1098 /* stop calls to nal_cmd */
1099 libcfs_nal_cmd_unregister(OPENIBNAL);
1102 /* resetting my NID unadvertises me, removes my
1103 * listener and nukes all current peers */
1104 koibnal_set_mynid (PTL_NID_ANY);
1106 /* Wait for all peer state to clean up */
1108 while (atomic_read (&koibnal_data.koib_npeers) != 0) {
1110 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1111 "waiting for %d peers to close down\n",
1112 atomic_read (&koibnal_data.koib_npeers));
1113 set_current_state (TASK_INTERRUPTIBLE);
1114 schedule_timeout (HZ);
1118 case OPENIBNAL_INIT_TX_CQ:
1119 rc = ib_cq_destroy (koibnal_data.koib_tx_cq);
1121 CERROR ("Destroy tx CQ error: %d\n", rc);
1124 case OPENIBNAL_INIT_RX_CQ:
1125 rc = ib_cq_destroy (koibnal_data.koib_rx_cq);
1127 CERROR ("Destroy rx CQ error: %d\n", rc);
1130 case OPENIBNAL_INIT_TXD:
1131 koibnal_free_pages (koibnal_data.koib_tx_pages);
1134 case OPENIBNAL_INIT_FMR:
1135 rc = ib_fmr_pool_destroy (koibnal_data.koib_fmr_pool);
1137 CERROR ("Destroy FMR pool error: %d\n", rc);
1140 case OPENIBNAL_INIT_PD:
1141 rc = ib_pd_destroy(koibnal_data.koib_pd);
1143 CERROR ("Destroy PD error: %d\n", rc);
1146 case OPENIBNAL_INIT_LIB:
1147 lib_fini(&koibnal_lib);
1150 case OPENIBNAL_INIT_DATA:
1151 /* Module refcount only gets to zero when all peers
1152 * have been closed so all lists must be empty */
1153 LASSERT (atomic_read (&koibnal_data.koib_npeers) == 0);
1154 LASSERT (koibnal_data.koib_peers != NULL);
1155 for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) {
1156 LASSERT (list_empty (&koibnal_data.koib_peers[i]));
1158 LASSERT (atomic_read (&koibnal_data.koib_nconns) == 0);
1159 LASSERT (list_empty (&koibnal_data.koib_sched_rxq));
1160 LASSERT (list_empty (&koibnal_data.koib_sched_txq));
1161 LASSERT (list_empty (&koibnal_data.koib_connd_conns));
1162 LASSERT (list_empty (&koibnal_data.koib_connd_peers));
1164 /* flag threads to terminate; wake and wait for them to die */
1165 koibnal_data.koib_shutdown = 1;
1166 wake_up_all (&koibnal_data.koib_sched_waitq);
1167 wake_up_all (&koibnal_data.koib_connd_waitq);
1170 while (atomic_read (&koibnal_data.koib_nthreads) != 0) {
1172 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1173 "Waiting for %d threads to terminate\n",
1174 atomic_read (&koibnal_data.koib_nthreads));
1175 set_current_state (TASK_INTERRUPTIBLE);
1176 schedule_timeout (HZ);
1180 case OPENIBNAL_INIT_NOTHING:
/* These two arrays are allocated before koib_init leaves NOTHING, so
 * they are released regardless of the state reached. */
1184 if (koibnal_data.koib_tx_descs != NULL)
1185 PORTAL_FREE (koibnal_data.koib_tx_descs,
1186 OPENIBNAL_TX_MSGS * sizeof(koib_tx_t));
1188 if (koibnal_data.koib_peers != NULL)
1189 PORTAL_FREE (koibnal_data.koib_peers,
1190 sizeof (struct list_head) *
1191 koibnal_data.koib_peer_hash_size);
1193 CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
1194 atomic_read (&portal_kmemory));
1195 printk(KERN_INFO "Lustre: OpenIB NAL unloaded (final mem %d)\n",
1196 atomic_read(&portal_kmemory));
1198 koibnal_data.koib_init = OPENIBNAL_INIT_NOTHING;
/* NAL startup hook (installed as nal_ni_init).  If the NAL is already
 * referenced only the actual limits are reported.  Otherwise the NAL is
 * brought up in stages, and koib_init records the last stage completed
 * so that koibnal_api_shutdown() (called at the end of this function on
 * the failure path) knows how much to undo:
 *   DATA   - global state zeroed, locks, lists, wait queues, peer hash
 *            table and transmit descriptor array set up;
 *   LIB    - lib_init() done; scheduler and connd threads started, IB
 *            device 0 opened and a usable port (1 or 2) chosen;
 *   PD     - protection domain created;
 *   FMR    - FMR pool created;
 *   TXD    - transmit descriptors registered;
 *   RX_CQ  - receive completion queue created and armed;
 *   TX_CQ  - transmit completion queue created and armed;
 *   ALL    - command interface registered.
 * Two defects visible in the listing are fixed here: the argument to
 * ib_fmr_pool_create() had its ampersand and the first letters of params
 * mis-decoded into a pilcrow character, and the failure message for the
 * transmit completion queue named the receive queue.
 * NOTE(review): the failure checks, goto or return statements and several
 * continuation lines are not visible in this listing. */
1202 koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
1203 ptl_ni_limits_t *requested_limits,
1204 ptl_ni_limits_t *actual_limits)
1206 ptl_process_id_t process_id;
/* Memory in use before startup; reported in the load message. */
1207 int pkmem = atomic_read(&portal_kmemory);
1211 LASSERT (nal == &koibnal_api);
1213 if (nal->nal_refct != 0) {
1214 if (actual_limits != NULL)
1215 *actual_limits = koibnal_lib.libnal_ni.ni_actual_limits;
1216 /* This module got the first ref */
1221 LASSERT (koibnal_data.koib_init == OPENIBNAL_INIT_NOTHING);
1223 memset (&koibnal_data, 0, sizeof (koibnal_data)); /* zero pointers, flags etc */
/* koib_nid_signal starts locked: it is only released by the service
 * completion callback. */
1225 init_MUTEX (&koibnal_data.koib_nid_mutex);
1226 init_MUTEX_LOCKED (&koibnal_data.koib_nid_signal);
1227 koibnal_data.koib_nid = PTL_NID_ANY;
1229 rwlock_init(&koibnal_data.koib_global_lock);
1231 koibnal_data.koib_peer_hash_size = OPENIBNAL_PEER_HASH_SIZE;
1232 PORTAL_ALLOC (koibnal_data.koib_peers,
1233 sizeof (struct list_head) * koibnal_data.koib_peer_hash_size);
1234 if (koibnal_data.koib_peers == NULL) {
1237 for (i = 0; i < koibnal_data.koib_peer_hash_size; i++)
1238 INIT_LIST_HEAD(&koibnal_data.koib_peers[i]);
1240 spin_lock_init (&koibnal_data.koib_connd_lock);
1241 INIT_LIST_HEAD (&koibnal_data.koib_connd_peers);
1242 INIT_LIST_HEAD (&koibnal_data.koib_connd_conns);
1243 init_waitqueue_head (&koibnal_data.koib_connd_waitq);
1245 spin_lock_init (&koibnal_data.koib_sched_lock);
1246 INIT_LIST_HEAD (&koibnal_data.koib_sched_txq);
1247 INIT_LIST_HEAD (&koibnal_data.koib_sched_rxq);
1248 init_waitqueue_head (&koibnal_data.koib_sched_waitq);
1250 spin_lock_init (&koibnal_data.koib_tx_lock);
1251 INIT_LIST_HEAD (&koibnal_data.koib_idle_txs);
1252 INIT_LIST_HEAD (&koibnal_data.koib_idle_nblk_txs);
1253 init_waitqueue_head(&koibnal_data.koib_idle_tx_waitq);
1255 PORTAL_ALLOC (koibnal_data.koib_tx_descs,
1256 OPENIBNAL_TX_MSGS * sizeof(koib_tx_t));
1257 if (koibnal_data.koib_tx_descs == NULL) {
1258 CERROR ("Can't allocate tx descs\n");
1262 /* lists/ptrs/locks initialised */
1263 koibnal_data.koib_init = OPENIBNAL_INIT_DATA;
1264 /*****************************************************/
1266 process_id.pid = requested_pid;
1267 process_id.nid = koibnal_data.koib_nid;
1269 rc = lib_init(&koibnal_lib, nal, process_id,
1270 requested_limits, actual_limits);
1272 CERROR("lib_init failed: error %d\n", rc);
1276 /* lib interface initialised */
1277 koibnal_data.koib_init = OPENIBNAL_INIT_LIB;
1278 /*****************************************************/
/* Scheduler threads receive their index as the thread argument. */
1280 for (i = 0; i < OPENIBNAL_N_SCHED; i++) {
1281 rc = koibnal_thread_start (koibnal_scheduler, (void *)i);
1283 CERROR("Can't spawn openibnal scheduler[%d]: %d\n",
1289 rc = koibnal_thread_start (koibnal_connd, NULL);
1291 CERROR ("Can't spawn openibnal connd: %d\n", rc);
1295 koibnal_data.koib_device = ib_device_get_by_index(0);
1296 if (koibnal_data.koib_device == NULL) {
1297 CERROR ("Can't open ib device 0\n");
1301 rc = ib_device_properties_get(koibnal_data.koib_device,
1302 &koibnal_data.koib_device_props);
1304 CERROR ("Can't get device props: %d\n", rc);
1308 CDEBUG(D_NET, "Max Initiator: %d Max Responder %d\n",
1309 koibnal_data.koib_device_props.max_initiator_per_qp,
1310 koibnal_data.koib_device_props.max_responder_per_qp);
/* Probe ports 1 and 2; koib_port stays 0 if neither can be queried. */
1312 koibnal_data.koib_port = 0;
1313 for (i = 1; i <= 2; i++) {
1314 rc = ib_port_properties_get(koibnal_data.koib_device, i,
1315 &koibnal_data.koib_port_props);
1317 koibnal_data.koib_port = i;
1321 if (koibnal_data.koib_port == 0) {
1322 CERROR ("Can't find a port\n");
1326 rc = ib_pd_create(koibnal_data.koib_device,
1327 NULL, &koibnal_data.koib_pd);
1329 CERROR ("Can't create PD: %d\n", rc);
1333 /* flag PD initialised */
1334 koibnal_data.koib_init = OPENIBNAL_INIT_PD;
1335 /*****************************************************/
/* One FMR per transmit descriptor, blocking and non-blocking alike. */
1338 const int pool_size = OPENIBNAL_NTX + OPENIBNAL_NTX_NBLK;
1339 struct ib_fmr_pool_param params = {
1340 .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
1341 .access = (IB_ACCESS_LOCAL_WRITE |
1342 IB_ACCESS_REMOTE_WRITE |
1343 IB_ACCESS_REMOTE_READ),
1344 .pool_size = pool_size,
1345 .dirty_watermark = (pool_size * 3)/4,
1346 .flush_function = NULL,
1350 rc = ib_fmr_pool_create(koibnal_data.koib_pd, &params,
1351 &koibnal_data.koib_fmr_pool);
1353 CERROR ("Can't create FMR pool size %d: %d\n",
1359 /* flag FMR pool initialised */
1360 koibnal_data.koib_init = OPENIBNAL_INIT_FMR;
1362 /*****************************************************/
1364 rc = koibnal_setup_tx_descs();
1366 CERROR ("Can't register tx descs: %d\n", rc);
1370 /* flag TX descs initialised */
1371 koibnal_data.koib_init = OPENIBNAL_INIT_TXD;
1372 /*****************************************************/
1375 struct ib_cq_callback callback = {
1376 .context = OPENIBNAL_CALLBACK_CTXT,
1377 .policy = IB_CQ_PROVIDER_REARM,
1379 .entry = koibnal_rx_callback,
1383 int nentries = OPENIBNAL_RX_CQ_ENTRIES;
1385 rc = ib_cq_create (koibnal_data.koib_device,
1386 &nentries, &callback, NULL,
1387 &koibnal_data.koib_rx_cq);
1389 CERROR ("Can't create RX CQ: %d\n", rc);
1393 /* I only want solicited events */
1394 rc = ib_cq_request_notification(koibnal_data.koib_rx_cq, 1);
1398 /* flag RX CQ initialised */
1399 koibnal_data.koib_init = OPENIBNAL_INIT_RX_CQ;
1400 /*****************************************************/
1403 struct ib_cq_callback callback = {
1404 .context = OPENIBNAL_CALLBACK_CTXT,
1405 .policy = IB_CQ_PROVIDER_REARM,
1407 .entry = koibnal_tx_callback,
1411 int nentries = OPENIBNAL_TX_CQ_ENTRIES;
1413 rc = ib_cq_create (koibnal_data.koib_device,
1414 &nentries, &callback, NULL,
1415 &koibnal_data.koib_tx_cq);
1417 CERROR ("Can't create TX CQ: %d\n", rc);
1421 /* I only want solicited events */
1422 rc = ib_cq_request_notification(koibnal_data.koib_tx_cq, 1);
1426 /* flag TX CQ initialised */
1427 koibnal_data.koib_init = OPENIBNAL_INIT_TX_CQ;
1428 /*****************************************************/
1430 rc = libcfs_nal_cmd_register(OPENIBNAL, &koibnal_cmd, NULL);
1432 CERROR ("Can't initialise command interface (rc = %d)\n", rc);
1436 /* flag everything initialised */
1437 koibnal_data.koib_init = OPENIBNAL_INIT_ALL;
1438 /*****************************************************/
1440 printk(KERN_INFO "Lustre: OpenIB NAL loaded "
1441 "(initial mem %d)\n", pkmem);
/* Failure path: unwind whatever stage was reached. */
1446 koibnal_api_shutdown (&koibnal_api);
/* Module exit: unregister the sysctl table if one was registered (only
 * when CONFIG_SYSCTL is defined), release the network interface obtained
 * in koibnal_module_init() and unregister the NAL.
 * NOTE(review): the matching endif is not visible in this listing. */
1451 koibnal_module_fini (void)
1453 #ifdef CONFIG_SYSCTL
1454 if (koibnal_tunables.koib_sysctl != NULL)
1455 unregister_sysctl_table (koibnal_tunables.koib_sysctl);
1457 PtlNIFini(koibnal_ni);
1459 ptl_unregister_nal(OPENIBNAL);
/* Module entry: install the startup and shutdown hooks in koibnal_api,
 * set the I/O timeout tunable to its default, register the NAL and then
 * start it immediately through PtlNIInit().  PTL_IFACE_DUP from
 * PtlNIInit() is tolerated; any other failure unregisters the NAL again.
 * Finally the sysctl table is registered when CONFIG_SYSCTL is defined;
 * a failure there is deliberately not treated as fatal.
 * NOTE(review): the return statements after the PtlNIInit() failure and
 * at the end, and the matching endif, are not visible in this listing. */
1463 koibnal_module_init (void)
1467 /* the following must be sizeof(int) for proc_dointvec() */
1468 LASSERT(sizeof (koibnal_tunables.koib_io_timeout) == sizeof (int));
1470 koibnal_api.nal_ni_init = koibnal_api_startup;
1471 koibnal_api.nal_ni_fini = koibnal_api_shutdown;
1473 /* Initialise dynamic tunables to defaults once only */
1474 koibnal_tunables.koib_io_timeout = OPENIBNAL_IO_TIMEOUT;
1476 rc = ptl_register_nal(OPENIBNAL, &koibnal_api);
1478 CERROR("Can't register OPENIBNAL: %d\n", rc);
1479 return (-ENOMEM); /* or something... */
1482 /* Pure gateways want the NAL started up at module load time... */
1483 rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &koibnal_ni);
1484 if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
1485 ptl_unregister_nal(OPENIBNAL);
1489 #ifdef CONFIG_SYSCTL
1490 /* Press on regardless even if registering sysctl doesn't work */
1491 koibnal_tunables.koib_sysctl =
1492 register_sysctl_table (koibnal_top_ctl_table, 0);
/* Module metadata and the load and unload entry points. */
1497 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
1498 MODULE_DESCRIPTION("Kernel OpenIB NAL v0.01");
1499 MODULE_LICENSE("GPL");
1501 module_init(koibnal_module_init);
1502 module_exit(koibnal_module_fini);