1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2004 Cluster File Systems, Inc.
5 * Author: Eric Barton <eric@bartonsoftware.com>
7 * This file is part of Lustre, http://www.lustre.org.
9 * Lustre is free software; you can redistribute it and/or
10 * modify it under the terms of version 2 of the GNU General Public
11 * License as published by the Free Software Foundation.
13 * Lustre is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with Lustre; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 #include "openibnal.h"
/* Module-wide state for the OpenIB NAL: the Portals NI handle, the big
 * shared data blob, and runtime tunables (exposed via sysctl below).
 * NOTE(review): this listing has elided lines (original numbering skips);
 * code kept verbatim. */
27 ptl_handle_ni_t kibnal_ni;
28 kib_data_t kibnal_data;
29 kib_tunables_t kibnal_tunables;
/* sysctl root id and child id for the "timeout" tunable. */
32 #define IBNAL_SYSCTL 202
34 #define IBNAL_SYSCTL_TIMEOUT 1
/* sysctl leaf table: exposes kib_io_timeout as a writable (0644) int
 * through proc_dointvec.
 * NOTE(review): terminator entry appears elided from this listing. */
36 static ctl_table kibnal_ctl_table[] = {
37 {IBNAL_SYSCTL_TIMEOUT, "timeout",
38 &kibnal_tunables.kib_io_timeout, sizeof (int),
39 0644, NULL, &proc_dointvec},
/* sysctl root: read-only (0555) "openibnal" directory holding the leaf
 * table above. NOTE(review): terminator entry elided in this listing. */
43 static ctl_table kibnal_top_ctl_table[] = {
44 {IBNAL_SYSCTL, "openibnal", NULL, 0, 0555, kibnal_ctl_table},
/* Debug helper: dumps an IB SA service record (id, NUL-terminated copy of
 * service_name, embedded NID) together with a status code.
 * NOTE(review): interior lines (buffer decl, CDEBUG/CWARN call sites,
 * braces) are elided from this listing; code kept verbatim. */
50 print_service(struct ib_common_attrib_service *service, char *tag, int rc)
57 "status : %d (NULL)\n", tag, rc);
/* strncpy does not guarantee termination; the explicit NUL below covers it */
60 strncpy (name, service->service_name, sizeof(name)-1);
61 name[sizeof(name)-1] = 0;
65 "service id: "LPX64"\n"
67 "NID : "LPX64"\n", tag, rc,
68 service->service_id, name,
69 *kibnal_service_nid_field(service));
/* Completion callback for async ib_service_set/get/delete queries: wakes
 * the waiter blocked on kib_nid_signal. Presumably *arg receives the
 * status — TODO confirm; that line is elided from this listing. */
73 kibnal_service_setunset_done (tTS_IB_CLIENT_QUERY_TID tid, int status,
74 struct ib_common_attrib_service *service, void *arg)
77 up (&kibnal_data.kib_nid_signal);
80 #if IBNAL_CHECK_ADVERT
/* Sanity check (compiled only with IBNAL_CHECK_ADVERT): query the subnet
 * manager for our own service record and log errors. Blocks on
 * kib_nid_signal until kibnal_service_setunset_done() fires.
 * NOTE(review): allocation-failure branch, rc2 decl and closing lines are
 * elided from this listing; code kept verbatim. */
82 kibnal_check_advert (void)
84 struct ib_common_attrib_service *svc;
89 PORTAL_ALLOC(svc, sizeof(*svc));
93 memset (svc, 0, sizeof (*svc));
94 kibnal_set_service_keys(svc, kibnal_data.kib_nid);
96 rc = ib_service_get (kibnal_data.kib_device,
99 KIBNAL_SERVICE_KEY_MASK,
100 kibnal_tunables.kib_io_timeout * HZ,
101 kibnal_service_setunset_done, &rc2,
105 CERROR ("Immediate error %d checking SM service\n", rc);
/* wait for the async query callback */
107 down (&kibnal_data.kib_nid_signal);
111 CERROR ("Error %d checking SM service\n", rc);
114 PORTAL_FREE(svc, sizeof(*svc));
/* Publish this node's NID as an IB SA service record so peers can find us:
 * fills in service id, port GID, PKEY, an infinite lease and the NID keys,
 * then issues an async ib_service_set() and blocks on kib_nid_signal for
 * the result. Requires a real NID (asserted below).
 * NOTE(review): return statements, error-path gotos and closing braces are
 * elided from this listing; code kept verbatim. */
119 kibnal_advertise (void)
121 struct ib_common_attrib_service *svc;
126 LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
128 PORTAL_ALLOC(svc, sizeof(*svc));
132 memset (svc, 0, sizeof (*svc));
134 svc->service_id = kibnal_data.kib_service_id;
/* service GID comes from the local port's cached GID table */
136 rc = ib_cached_gid_get(kibnal_data.kib_device,
137 kibnal_data.kib_port,
141 CERROR ("Can't get port %d GID: %d\n",
142 kibnal_data.kib_port, rc);
146 rc = ib_cached_pkey_get(kibnal_data.kib_device,
147 kibnal_data.kib_port,
151 CERROR ("Can't get port %d PKEY: %d\n",
152 kibnal_data.kib_port, rc);
/* 0xffffffff == indefinite lease */
156 svc->service_lease = 0xffffffff;
158 kibnal_set_service_keys(svc, kibnal_data.kib_nid);
160 CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n",
162 svc->service_name, *kibnal_service_nid_field(svc));
164 rc = ib_service_set (kibnal_data.kib_device,
165 kibnal_data.kib_port,
167 IB_SA_SERVICE_COMP_MASK_ID |
168 IB_SA_SERVICE_COMP_MASK_GID |
169 IB_SA_SERVICE_COMP_MASK_PKEY |
170 IB_SA_SERVICE_COMP_MASK_LEASE |
171 KIBNAL_SERVICE_KEY_MASK,
172 kibnal_tunables.kib_io_timeout * HZ,
173 kibnal_service_setunset_done, &rc2, &tid);
176 CERROR ("Immediate error %d advertising NID "LPX64"\n",
177 rc, kibnal_data.kib_nid);
/* wait for the async set to complete (callback ups this semaphore) */
181 down (&kibnal_data.kib_nid_signal);
185 CERROR ("Error %d advertising NID "LPX64"\n",
186 rc, kibnal_data.kib_nid);
188 PORTAL_FREE(svc, sizeof(*svc));
/* Remove this NID's service record from the subnet manager via an async
 * ib_service_delete(), blocking on kib_nid_signal for the result.
 * expect_success selects which outcome is "ok": deleting our own advert
 * (expect_success != 0) vs. best-effort removal of a stale/conflicting
 * record (expect_success == 0, where "not found" is fine).
 * NOTE(review): alloc-failure branch, return and closing lines are elided
 * from this listing; code kept verbatim. */
193 kibnal_unadvertise (int expect_success)
195 struct ib_common_attrib_service *svc;
200 LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
202 PORTAL_ALLOC(svc, sizeof(*svc));
206 memset (svc, 0, sizeof(*svc));
208 kibnal_set_service_keys(svc, kibnal_data.kib_nid);
210 CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n",
211 svc->service_name, *kibnal_service_nid_field(svc));
213 rc = ib_service_delete (kibnal_data.kib_device,
214 kibnal_data.kib_port,
216 KIBNAL_SERVICE_KEY_MASK,
217 kibnal_tunables.kib_io_timeout * HZ,
218 kibnal_service_setunset_done, &rc2, &tid);
220 CERROR ("Immediate error %d unadvertising NID "LPX64"\n",
221 rc, kibnal_data.kib_nid);
225 down (&kibnal_data.kib_nid_signal);
/* success when the delete outcome matches the caller's expectation */
227 if ((rc2 == 0) == !!expect_success)
228 goto out; /* success: rc == 0 */
231 CERROR("Error %d unadvertising NID "LPX64"\n",
232 rc, kibnal_data.kib_nid);
234 CWARN("Removed conflicting NID "LPX64"\n",
235 kibnal_data.kib_nid);
237 PORTAL_FREE(svc, sizeof(*svc));
/* Install a new NID (or PTL_NID_ANY to shut down): under kib_nid_mutex,
 * unadvertise the old NID and stop the CM listener, bump the incarnation
 * from gettimeofday, nuke all peers, then (for a real NID) clear any stale
 * advert, assign a fresh CM service id, start listening and re-advertise.
 * On failure, tears the listener back down and resets the NID to ANY.
 * NOTE(review): several error checks, returns and braces are elided from
 * this listing; code kept verbatim. */
241 kibnal_set_mynid(ptl_nid_t nid)
244 lib_ni_t *ni = &kibnal_lib.libnal_ni;
247 CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
248 nid, ni->ni_pid.nid);
250 do_gettimeofday(&tv);
252 down (&kibnal_data.kib_nid_mutex);
254 if (nid == kibnal_data.kib_nid) {
255 /* no change of NID */
256 up (&kibnal_data.kib_nid_mutex);
260 CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
261 kibnal_data.kib_nid, nid);
263 if (kibnal_data.kib_nid != PTL_NID_ANY) {
265 kibnal_unadvertise (1);
267 rc = ib_cm_listen_stop (kibnal_data.kib_listen_handle);
269 CERROR ("Error %d stopping listener\n", rc);
272 kibnal_data.kib_nid = ni->ni_pid.nid = nid;
/* incarnation = microseconds since epoch; lets peers spot a restart */
273 kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
275 /* Delete all existing peers and their connections after new
276 * NID/incarnation set to ensure no old connections in our brave
278 kibnal_del_peer (PTL_NID_ANY, 0);
280 if (kibnal_data.kib_nid == PTL_NID_ANY) {
281 /* No new NID to install */
282 up (&kibnal_data.kib_nid_mutex);
286 /* remove any previous advert (crashed node etc) */
287 kibnal_unadvertise(0);
289 /* Assign new service number */
290 kibnal_data.kib_service_id = ib_cm_service_assign();
291 CDEBUG(D_NET, "service_id "LPX64"\n", kibnal_data.kib_service_id);
293 rc = ib_cm_listen(kibnal_data.kib_service_id,
294 TS_IB_CM_SERVICE_EXACT_MASK,
295 kibnal_passive_conn_callback, NULL,
296 &kibnal_data.kib_listen_handle);
298 rc = kibnal_advertise();
300 #if IBNAL_CHECK_ADVERT
301 kibnal_check_advert();
303 up (&kibnal_data.kib_nid_mutex);
/* failure path: undo the listener and any peers created meanwhile */
307 ib_cm_listen_stop(kibnal_data.kib_listen_handle);
308 /* remove any peers that sprung up while I failed to
309 * advertise myself */
310 kibnal_del_peer (PTL_NID_ANY, 0);
313 kibnal_data.kib_nid = PTL_NID_ANY;
314 up (&kibnal_data.kib_nid_mutex);
/* Allocate and initialise a peer descriptor with one reference for the
 * caller; bumps the global peer count. Not inserted into the peer table
 * here. NOTE(review): alloc-failure branch, nid assignment and return are
 * elided from this listing; code kept verbatim. */
319 kibnal_create_peer (ptl_nid_t nid)
323 LASSERT (nid != PTL_NID_ANY);
325 PORTAL_ALLOC (peer, sizeof (*peer));
329 memset(peer, 0, sizeof(*peer)); /* zero flags etc */
332 atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */
334 INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */
335 INIT_LIST_HEAD (&peer->ibp_conns);
336 INIT_LIST_HEAD (&peer->ibp_tx_queue);
/* throttle state for reconnect attempts */
338 peer->ibp_reconnect_time = jiffies;
339 peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
341 atomic_inc (&kibnal_data.kib_npeers);
342 CDEBUG(D_NET, "peer %p "LPX64"\n", peer, nid);
/* Final teardown of a peer whose refcount has hit zero: asserts it is
 * fully quiescent (no persistence, not in the table, no conns/txs),
 * frees it and decrements the global peer count last — connections hold
 * peer refs, so npeers reaching 0 means all peer state is gone. */
348 kibnal_destroy_peer (kib_peer_t *peer)
350 CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ibp_nid, peer);
352 LASSERT (atomic_read (&peer->ibp_refcount) == 0);
353 LASSERT (peer->ibp_persistence == 0);
354 LASSERT (!kibnal_peer_active(peer));
355 LASSERT (peer->ibp_connecting == 0);
356 LASSERT (list_empty (&peer->ibp_conns));
357 LASSERT (list_empty (&peer->ibp_tx_queue));
359 PORTAL_FREE (peer, sizeof (*peer));
361 /* NB a peer's connections keep a reference on their peer until
362 * they are destroyed, so we can be assured that _all_ state to do
363 * with this peer has been cleaned up when its refcount drops to
365 atomic_dec (&kibnal_data.kib_npeers);
/* Drop one peer reference; destroys the peer when the count hits zero.
 * NOTE(review): the early-return after the dec-and-test is elided from
 * this listing; code kept verbatim. */
369 kibnal_put_peer (kib_peer_t *peer)
371 CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n",
373 atomic_read (&peer->ibp_refcount));
375 LASSERT (atomic_read (&peer->ibp_refcount) > 0);
376 if (!atomic_dec_and_test (&peer->ibp_refcount))
379 kibnal_destroy_peer (peer);
/* Look up a peer by NID in its hash chain; caller must hold
 * kib_global_lock. Every peer in the table must be persistent, connecting
 * or connected (asserted). Returns without taking an extra ref.
 * NOTE(review): the 'continue'/return lines are elided from this listing;
 * code kept verbatim. */
383 kibnal_find_peer_locked (ptl_nid_t nid)
385 struct list_head *peer_list = kibnal_nid2peerlist (nid);
386 struct list_head *tmp;
389 list_for_each (tmp, peer_list) {
391 peer = list_entry (tmp, kib_peer_t, ibp_list);
393 LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
394 peer->ibp_connecting != 0 || /* creating conns */
395 !list_empty (&peer->ibp_conns)); /* active conn */
397 if (peer->ibp_nid != nid)
400 CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
401 peer, nid, atomic_read (&peer->ibp_refcount));
/* Locked lookup wrapper: finds the peer under the global read lock and
 * takes a reference for the caller (caller must kibnal_put_peer it). */
408 kibnal_get_peer (ptl_nid_t nid)
412 read_lock (&kibnal_data.kib_global_lock);
413 peer = kibnal_find_peer_locked (nid);
414 if (peer != NULL) /* +1 ref for caller? */
415 atomic_inc (&peer->ibp_refcount);
416 read_unlock (&kibnal_data.kib_global_lock);
/* Remove a quiescent peer (no persistence, no conns) from the peer table
 * and drop the table's reference. Caller holds kib_global_lock for write
 * — presumably; TODO confirm against callers. */
422 kibnal_unlink_peer_locked (kib_peer_t *peer)
424 LASSERT (peer->ibp_persistence == 0);
425 LASSERT (list_empty(&peer->ibp_conns));
427 LASSERT (kibnal_peer_active(peer));
428 list_del_init (&peer->ibp_list);
429 /* lose peerlist's ref */
430 kibnal_put_peer (peer);
/* Walk the whole peer hash until the index'th peer is reached, returning
 * its NID and persistence count (used by the GET_PEER ioctl).
 * NOTE(review): the index-decrement test and return values are elided from
 * this listing; code kept verbatim. */
434 kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
437 struct list_head *ptmp;
440 read_lock (&kibnal_data.kib_global_lock);
442 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
444 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
446 peer = list_entry (ptmp, kib_peer_t, ibp_list);
447 LASSERT (peer->ibp_persistence != 0 ||
448 peer->ibp_connecting != 0 ||
449 !list_empty (&peer->ibp_conns));
454 *nidp = peer->ibp_nid;
455 *persistencep = peer->ibp_persistence;
457 read_unlock (&kibnal_data.kib_global_lock);
/* fell off the end: index beyond last peer */
462 read_unlock (&kibnal_data.kib_global_lock);
/* Create (or find) the peer for 'nid' and bump its persistence count so
 * it survives connection loss. Races are handled by re-looking-up under
 * the write lock and discarding the new peer if one already exists.
 * NOTE(review): the peer/peer2 selection after the race check is elided
 * from this listing; code kept verbatim. */
467 kibnal_add_persistent_peer (ptl_nid_t nid)
473 if (nid == PTL_NID_ANY)
476 peer = kibnal_create_peer (nid);
480 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
482 peer2 = kibnal_find_peer_locked (nid);
/* lost the race: drop our fresh peer, use the existing one */
484 kibnal_put_peer (peer);
487 /* peer table takes existing ref on peer */
488 list_add_tail (&peer->ibp_list,
489 kibnal_nid2peerlist (nid));
492 peer->ibp_persistence++;
494 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
/* Drop persistence (all of it, or one share when single_share) and, if
 * none remains, either unlink the idle peer or close every connection —
 * closing the last conn implicitly unlinks the peer. Caller holds the
 * global write lock.
 * NOTE(review): the single_share condition line and returns are elided
 * from this listing; code kept verbatim. */
499 kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
501 struct list_head *ctmp;
502 struct list_head *cnxt;
506 peer->ibp_persistence = 0;
507 else if (peer->ibp_persistence > 0)
508 peer->ibp_persistence--;
510 if (peer->ibp_persistence != 0)
513 if (list_empty(&peer->ibp_conns)) {
514 kibnal_unlink_peer_locked(peer);
516 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
517 conn = list_entry(ctmp, kib_conn_t, ibc_list);
519 kibnal_close_conn_locked (conn, 0);
521 /* NB peer is no longer persistent; closing its last conn
524 /* NB peer now unlinked; might even be freed if the peer table had the
/* Delete the peer matching 'nid', or every peer when PTL_NID_ANY: picks
 * the hash bucket range, walks it safely (entries may be unlinked), and
 * calls kibnal_del_peer_locked on each match. rc records whether anything
 * matched. NOTE(review): lo initialisation for the wildcard case and the
 * return are elided from this listing; code kept verbatim. */
529 kibnal_del_peer (ptl_nid_t nid, int single_share)
532 struct list_head *ptmp;
533 struct list_head *pnxt;
540 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
542 if (nid != PTL_NID_ANY)
543 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
546 hi = kibnal_data.kib_peer_hash_size - 1;
549 for (i = lo; i <= hi; i++) {
550 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
551 peer = list_entry (ptmp, kib_peer_t, ibp_list);
552 LASSERT (peer->ibp_persistence != 0 ||
553 peer->ibp_connecting != 0 ||
554 !list_empty (&peer->ibp_conns));
556 if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
559 kibnal_del_peer_locked (peer, single_share);
560 rc = 0; /* matched something */
567 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
/* Return the index'th connection across the whole peer table, with a
 * reference taken for the caller (used by the GET_CONN ioctl). NULL-ish
 * fall-through when index is out of range.
 * NOTE(review): the index-decrement test and return statements are elided
 * from this listing; code kept verbatim. */
573 kibnal_get_conn_by_idx (int index)
576 struct list_head *ptmp;
578 struct list_head *ctmp;
581 read_lock (&kibnal_data.kib_global_lock);
583 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
584 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
586 peer = list_entry (ptmp, kib_peer_t, ibp_list);
587 LASSERT (peer->ibp_persistence > 0 ||
588 peer->ibp_connecting != 0 ||
589 !list_empty (&peer->ibp_conns));
591 list_for_each (ctmp, &peer->ibp_conns) {
595 conn = list_entry (ctmp, kib_conn_t, ibc_list);
596 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
597 conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
598 atomic_read (&conn->ibc_refcount));
/* +1 ref for the caller before dropping the lock */
599 atomic_inc (&conn->ibc_refcount);
600 read_unlock (&kibnal_data.kib_global_lock);
606 read_unlock (&kibnal_data.kib_global_lock);
/* Allocate and initialise a connection: rx descriptor array, pre-mapped
 * rx message pages (one IBNAL_MSG_SIZE slot per message, packed into
 * pages), then an RC queue pair moved to the INIT state. On any failure
 * it falls through to kibnal_destroy_conn (which expects the nconns
 * increment below). Returns the conn with 1 ref for the caller.
 * NOTE(review): failure gotos, rx_page advance logic and the return are
 * elided from this listing; code kept verbatim. */
611 kibnal_create_conn (void)
621 struct ib_qp_create_param qp_create;
622 struct ib_qp_attribute qp_attr;
625 PORTAL_ALLOC (conn, sizeof (*conn));
627 CERROR ("Can't allocate connection\n");
631 /* zero flags, NULL pointers etc... */
632 memset (conn, 0, sizeof (*conn));
634 INIT_LIST_HEAD (&conn->ibc_tx_queue);
635 INIT_LIST_HEAD (&conn->ibc_active_txs);
636 spin_lock_init (&conn->ibc_lock);
638 atomic_inc (&kibnal_data.kib_nconns);
639 /* well not really, but I call destroy() on failure, which decrements */
641 PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
642 if (conn->ibc_rxs == NULL)
644 memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
646 rc = kibnal_alloc_pages(&conn->ibc_rx_pages,
648 IB_ACCESS_LOCAL_WRITE);
652 vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
/* carve the registered pages into per-message rx slots */
654 for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
655 struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
656 kib_rx_t *rx = &conn->ibc_rxs[i];
659 rx->rx_vaddr = vaddr;
660 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
662 vaddr += IBNAL_MSG_SIZE;
663 LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
665 page_offset += IBNAL_MSG_SIZE;
666 LASSERT (page_offset <= PAGE_SIZE);
668 if (page_offset == PAGE_SIZE) {
671 LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
675 params.qp_create = (struct ib_qp_create_param) {
677 /* Sends have an optional RDMA */
678 .max_outstanding_send_request = 2 * IBNAL_MSG_QUEUE_SIZE,
679 .max_outstanding_receive_request = IBNAL_MSG_QUEUE_SIZE,
680 .max_send_gather_element = 1,
681 .max_receive_scatter_element = 1,
683 .pd = kibnal_data.kib_pd,
684 .send_queue = kibnal_data.kib_cq,
685 .receive_queue = kibnal_data.kib_cq,
686 .send_policy = IB_WQ_SIGNAL_SELECTABLE,
687 .receive_policy = IB_WQ_SIGNAL_SELECTABLE,
689 .transport = IB_TRANSPORT_RC,
690 .device_specific = NULL,
693 rc = ib_qp_create (&params.qp_create, &conn->ibc_qp, &conn->ibc_qpn);
695 CERROR ("Failed to create queue pair: %d\n", rc);
699 /* Mark QP created */
700 conn->ibc_state = IBNAL_CONN_INIT_QP;
702 params.qp_attr = (struct ib_qp_attribute) {
703 .state = IB_QP_STATE_INIT,
704 .port = kibnal_data.kib_port,
705 .enable_rdma_read = 1,
706 .enable_rdma_write = 1,
707 .valid_fields = (IB_QP_ATTRIBUTE_STATE |
708 IB_QP_ATTRIBUTE_PORT |
709 IB_QP_ATTRIBUTE_PKEY_INDEX |
710 IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE),
712 rc = ib_qp_modify(conn->ibc_qp, &params.qp_attr);
714 CERROR ("Failed to modify queue pair: %d\n", rc);
718 /* 1 ref for caller */
719 atomic_set (&conn->ibc_refcount, 1);
/* failure path */
723 kibnal_destroy_conn (conn);
/* Free a fully-quiesced connection (refcount 0, no queued/active txs):
 * destroys the QP if it got created (fall-through switch on ibc_state),
 * frees rx pages and descriptors, drops the peer ref, then decrements
 * nconns and wakes everyone if this was the last conn during shutdown.
 * NOTE(review): default case / break statements in the switch are elided
 * from this listing; code kept verbatim. */
728 kibnal_destroy_conn (kib_conn_t *conn)
732 CDEBUG (D_NET, "connection %p\n", conn);
734 LASSERT (atomic_read (&conn->ibc_refcount) == 0);
735 LASSERT (list_empty(&conn->ibc_tx_queue));
736 LASSERT (list_empty(&conn->ibc_active_txs));
737 LASSERT (conn->ibc_nsends_posted == 0);
738 LASSERT (conn->ibc_connreq == NULL);
740 switch (conn->ibc_state) {
741 case IBNAL_CONN_ZOMBIE:
742 /* called after connection sequence initiated */
744 case IBNAL_CONN_INIT_QP:
745 rc = ib_qp_destroy(conn->ibc_qp);
747 CERROR("Can't destroy QP: %d\n", rc);
750 case IBNAL_CONN_INIT_NOTHING:
757 if (conn->ibc_rx_pages != NULL)
758 kibnal_free_pages(conn->ibc_rx_pages);
760 if (conn->ibc_rxs != NULL)
761 PORTAL_FREE(conn->ibc_rxs,
762 IBNAL_RX_MSGS * sizeof(kib_rx_t));
764 if (conn->ibc_peer != NULL)
765 kibnal_put_peer(conn->ibc_peer);
767 PORTAL_FREE(conn, sizeof (*conn));
769 atomic_dec(&kibnal_data.kib_nconns);
771 if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
772 kibnal_data.kib_shutdown) {
773 /* I just nuked the last connection on shutdown; wake up
774 * everyone so they can exit. */
775 wake_up_all(&kibnal_data.kib_sched_waitq);
776 wake_up_all(&kibnal_data.kib_connd_waitq);
/* Drop one connection reference. The final ref may only be dropped on a
 * ZOMBIE conn (asserted); it is then handed to connd for destruction
 * rather than destroyed in this context.
 * NOTE(review): the early return after dec-and-test is elided from this
 * listing; code kept verbatim. */
781 kibnal_put_conn (kib_conn_t *conn)
785 CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n",
786 conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
787 atomic_read (&conn->ibc_refcount));
789 LASSERT (atomic_read (&conn->ibc_refcount) > 0);
790 if (!atomic_dec_and_test (&conn->ibc_refcount))
793 /* last ref only goes on zombies */
794 LASSERT (conn->ibc_state == IBNAL_CONN_ZOMBIE);
796 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
798 list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
799 wake_up (&kibnal_data.kib_connd_waitq);
801 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
/* Close every connection on 'peer' with reason 'why'; caller holds the
 * global write lock. Presumably returns the number closed (used as
 * 'count +=' by kibnal_close_matching_conns) — the counting line is
 * elided from this listing. */
805 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
808 struct list_head *ctmp;
809 struct list_head *cnxt;
812 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
813 conn = list_entry (ctmp, kib_conn_t, ibc_list);
816 kibnal_close_conn_locked (conn, why);
/* Close every connection on 'peer' whose incarnation differs from the
 * given one (i.e. established before the peer rebooted), with -ESTALE.
 * Caller holds the global write lock. */
823 kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
826 struct list_head *ctmp;
827 struct list_head *cnxt;
830 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
831 conn = list_entry (ctmp, kib_conn_t, ibc_list);
/* current-incarnation conns are kept */
833 if (conn->ibc_incarnation == incarnation)
836 CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
837 peer->ibp_nid, conn->ibc_incarnation, incarnation);
840 kibnal_close_conn_locked (conn, -ESTALE);
/* Close all connections to 'nid' (or to everyone for PTL_NID_ANY).
 * Returns 0 on wildcard or when something was closed, -ENOENT otherwise.
 * NOTE(review): lo initialisation for the wildcard case and the wildcard
 * return are elided from this listing; code kept verbatim. */
847 kibnal_close_matching_conns (ptl_nid_t nid)
851 struct list_head *ptmp;
852 struct list_head *pnxt;
858 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
860 if (nid != PTL_NID_ANY)
861 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
864 hi = kibnal_data.kib_peer_hash_size - 1;
867 for (i = lo; i <= hi; i++) {
868 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
870 peer = list_entry (ptmp, kib_peer_t, ibp_list);
871 LASSERT (peer->ibp_persistence != 0 ||
872 peer->ibp_connecting != 0 ||
873 !list_empty (&peer->ibp_conns));
875 if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
878 count += kibnal_close_peer_conns_locked (peer, 0);
882 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
884 /* wildcards always succeed */
885 if (nid == PTL_NID_ANY)
888 return (count == 0 ? -ENOENT : 0);
/* NAL command dispatcher for the portals configuration ioctls: get/add/
 * delete peers, enumerate/close connections, and register the local NID.
 * NOTE(review): break statements, the default case and the final return
 * are elided from this listing; code kept verbatim. */
892 kibnal_cmd(struct portals_cfg *pcfg, void * private)
896 LASSERT (pcfg != NULL);
898 switch(pcfg->pcfg_command) {
899 case NAL_CMD_GET_PEER: {
903 rc = kibnal_get_peer_info(pcfg->pcfg_count,
905 pcfg->pcfg_nid = nid;
909 pcfg->pcfg_count = 0;
910 pcfg->pcfg_wait = share_count;
913 case NAL_CMD_ADD_PEER: {
914 rc = kibnal_add_persistent_peer (pcfg->pcfg_nid);
917 case NAL_CMD_DEL_PEER: {
918 rc = kibnal_del_peer (pcfg->pcfg_nid,
919 /* flags == single_share */
920 pcfg->pcfg_flags != 0);
923 case NAL_CMD_GET_CONN: {
924 kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
930 pcfg->pcfg_nid = conn->ibc_peer->ibp_nid;
933 pcfg->pcfg_flags = 0;
/* drop the ref taken by kibnal_get_conn_by_idx */
934 kibnal_put_conn (conn);
938 case NAL_CMD_CLOSE_CONNECTION: {
939 rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
942 case NAL_CMD_REGISTER_MYNID: {
943 if (pcfg->pcfg_nid == PTL_NID_ANY)
946 rc = kibnal_set_mynid (pcfg->pcfg_nid);
/* Undo kibnal_alloc_pages: deregister the IB memory region (if it was
 * registered — the guard is elided from this listing), free each page,
 * then the descriptor itself (sized with its flexible page array). */
955 kibnal_free_pages (kib_pages_t *p)
957 int npages = p->ibp_npages;
962 rc = ib_memory_deregister(p->ibp_handle);
964 CERROR ("Deregister error: %d\n", rc);
967 for (i = 0; i < npages; i++)
968 if (p->ibp_pages[i] != NULL)
969 __free_page(p->ibp_pages[i]);
971 PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
/* Allocate npages kernel pages, register them with the IB HCA as one
 * physical region (given access flags), and return the descriptor via
 * *pp. Cleans up via kibnal_free_pages on every failure path.
 * NOTE(review): error returns, the registered-flag set and final success
 * return are elided from this listing; code kept verbatim. */
975 kibnal_alloc_pages (kib_pages_t **pp, int npages, int access)
978 struct ib_physical_buffer *phys_pages;
982 PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
984 CERROR ("Can't allocate buffer %d\n", npages);
988 memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
989 p->ibp_npages = npages;
991 for (i = 0; i < npages; i++) {
992 p->ibp_pages[i] = alloc_page (GFP_KERNEL);
993 if (p->ibp_pages[i] == NULL) {
994 CERROR ("Can't allocate page %d of %d\n", i, npages);
995 kibnal_free_pages(p);
/* temporary scatter list describing each page for registration */
1000 PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
1001 if (phys_pages == NULL) {
1002 CERROR ("Can't allocate physarray for %d pages\n", npages);
1003 kibnal_free_pages(p);
1007 for (i = 0; i < npages; i++) {
1008 phys_pages[i].size = PAGE_SIZE;
1009 phys_pages[i].address =
1010 kibnal_page2phys(p->ibp_pages[i]);
1014 rc = ib_memory_register_physical(kibnal_data.kib_pd,
1017 npages * PAGE_SIZE, 0,
1023 PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
1026 CERROR ("Error %d mapping %d pages\n", rc, npages);
1027 kibnal_free_pages(p);
/* Allocate and register the tx message pages, then initialise every tx
 * descriptor: point it at its IBNAL_MSG_SIZE slot within the registered
 * pages and park it on the idle (or idle-nblk, for i >= IBNAL_NTX) list.
 * Relies on IBNAL_MSG_SIZE dividing PAGE_SIZE exactly (asserted).
 * NOTE(review): rc check after alloc, the page-advance branch and the
 * return are elided from this listing; code kept verbatim. */
1037 kibnal_setup_tx_descs (void)
1040 int page_offset = 0;
1048 /* pre-mapped messages are not bigger than 1 page */
1049 LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
1051 /* No fancy arithmetic when we do the buffer calculations */
1052 LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
1054 rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
1056 0); /* local read access only */
1060 vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
1062 for (i = 0; i < IBNAL_TX_MSGS; i++) {
1063 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
1064 tx = &kibnal_data.kib_tx_descs[i];
1066 memset (tx, 0, sizeof(*tx)); /* zero flags etc */
1068 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
1069 tx->tx_vaddr = vaddr;
1070 tx->tx_isnblk = (i >= IBNAL_NTX);
1071 tx->tx_mapped = KIB_TX_UNMAPPED;
1073 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n",
1074 i, tx, tx->tx_msg, tx->tx_vaddr);
1077 list_add (&tx->tx_list,
1078 &kibnal_data.kib_idle_nblk_txs);
1080 list_add (&tx->tx_list,
1081 &kibnal_data.kib_idle_txs);
1083 vaddr += IBNAL_MSG_SIZE;
1084 LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
1086 page_offset += IBNAL_MSG_SIZE;
1087 LASSERT (page_offset <= PAGE_SIZE);
1089 if (page_offset == PAGE_SIZE) {
1092 LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
/* Staged teardown driven by kib_init (fall-through switch, reverse order
 * of startup): unregister the NAL command handler, reset the NID (which
 * unadvertises and kills peers), wait for peers to drain, destroy CQ/tx
 * descs/FMR pool/PD, finalise lib, flag shutdown and wait for threads,
 * then free the tx and peer tables. Early-out while other modules still
 * hold refs. NOTE(review): several case labels, returns and the LBUG/
 * break lines are elided from this listing; code kept verbatim. */
1100 kibnal_api_shutdown (nal_t *nal)
1105 if (nal->nal_refct != 0) {
1106 /* This module got the first ref */
1107 PORTAL_MODULE_UNUSE;
1111 CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
1112 atomic_read (&portal_kmemory));
1114 LASSERT(nal == &kibnal_api);
1116 switch (kibnal_data.kib_init) {
1118 CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
1121 case IBNAL_INIT_ALL:
1122 /* stop calls to nal_cmd */
1123 libcfs_nal_cmd_unregister(OPENIBNAL);
1126 /* resetting my NID unadvertises me, removes my
1127 * listener and nukes all current peers */
1128 kibnal_set_mynid (PTL_NID_ANY);
1130 /* Wait for all peer state to clean up */
1132 while (atomic_read (&kibnal_data.kib_npeers) != 0) {
1134 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1135 "waiting for %d peers to close down\n",
1136 atomic_read (&kibnal_data.kib_npeers));
1137 set_current_state (TASK_INTERRUPTIBLE);
1138 schedule_timeout (HZ);
1143 rc = ib_cq_destroy (kibnal_data.kib_cq);
1145 CERROR ("Destroy CQ error: %d\n", rc);
1148 case IBNAL_INIT_TXD:
1149 kibnal_free_pages (kibnal_data.kib_tx_pages);
1152 case IBNAL_INIT_FMR:
1153 rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
1155 CERROR ("Destroy FMR pool error: %d\n", rc);
1159 rc = ib_pd_destroy(kibnal_data.kib_pd);
1161 CERROR ("Destroy PD error: %d\n", rc);
1164 case IBNAL_INIT_LIB:
1165 lib_fini(&kibnal_lib);
1168 case IBNAL_INIT_DATA:
1169 /* Module refcount only gets to zero when all peers
1170 * have been closed so all lists must be empty */
1171 LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
1172 LASSERT (kibnal_data.kib_peers != NULL);
1173 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
1174 LASSERT (list_empty (&kibnal_data.kib_peers[i]));
1176 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
1177 LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
1178 LASSERT (list_empty (&kibnal_data.kib_sched_txq));
1179 LASSERT (list_empty (&kibnal_data.kib_connd_conns));
1180 LASSERT (list_empty (&kibnal_data.kib_connd_peers));
1182 /* flag threads to terminate; wake and wait for them to die */
1183 kibnal_data.kib_shutdown = 1;
1184 wake_up_all (&kibnal_data.kib_sched_waitq);
1185 wake_up_all (&kibnal_data.kib_connd_waitq);
1188 while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
1190 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1191 "Waiting for %d threads to terminate\n",
1192 atomic_read (&kibnal_data.kib_nthreads));
1193 set_current_state (TASK_INTERRUPTIBLE);
1194 schedule_timeout (HZ);
1198 case IBNAL_INIT_NOTHING:
1202 if (kibnal_data.kib_tx_descs != NULL)
1203 PORTAL_FREE (kibnal_data.kib_tx_descs,
1204 IBNAL_TX_MSGS * sizeof(kib_tx_t));
1206 if (kibnal_data.kib_peers != NULL)
1207 PORTAL_FREE (kibnal_data.kib_peers,
1208 sizeof (struct list_head) *
1209 kibnal_data.kib_peer_hash_size);
1211 CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
1212 atomic_read (&portal_kmemory));
1213 printk(KERN_INFO "Lustre: OpenIB NAL unloaded (final mem %d)\n",
1214 atomic_read(&portal_kmemory));
1216 kibnal_data.kib_init = IBNAL_INIT_NOTHING;
/* Bring the NAL up, advancing kib_init through each stage so that
 * kibnal_api_shutdown can unwind a partial startup: zero globals and init
 * locks/lists (DATA), lib_init (LIB), spawn scheduler+connd threads, open
 * IB device/port and create the PD (PD), FMR pool (FMR), tx descriptors
 * (TXD), completion queue (CQ), then register the NAL command handler
 * (ALL). Any failure falls through to kibnal_api_shutdown.
 * NOTE(review): many rc checks, gotos and returns are elided from this
 * listing; code kept verbatim. */
1220 kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
1221 ptl_ni_limits_t *requested_limits,
1222 ptl_ni_limits_t *actual_limits)
1224 ptl_process_id_t process_id;
1225 int pkmem = atomic_read(&portal_kmemory);
1229 LASSERT (nal == &kibnal_api);
1231 if (nal->nal_refct != 0) {
1232 if (actual_limits != NULL)
1233 *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
1234 /* This module got the first ref */
1239 LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
1241 memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */
/* kib_nid_signal starts locked: down() waits until a callback up()s it */
1243 init_MUTEX (&kibnal_data.kib_nid_mutex);
1244 init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal);
1245 kibnal_data.kib_nid = PTL_NID_ANY;
1247 rwlock_init(&kibnal_data.kib_global_lock);
1249 kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
1250 PORTAL_ALLOC (kibnal_data.kib_peers,
1251 sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
1252 if (kibnal_data.kib_peers == NULL) {
1255 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
1256 INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
1258 spin_lock_init (&kibnal_data.kib_connd_lock);
1259 INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
1260 INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
1261 init_waitqueue_head (&kibnal_data.kib_connd_waitq);
1263 spin_lock_init (&kibnal_data.kib_sched_lock);
1264 INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
1265 INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
1266 init_waitqueue_head (&kibnal_data.kib_sched_waitq);
1268 spin_lock_init (&kibnal_data.kib_tx_lock);
1269 INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
1270 INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
1271 init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
1273 PORTAL_ALLOC (kibnal_data.kib_tx_descs,
1274 IBNAL_TX_MSGS * sizeof(kib_tx_t));
1275 if (kibnal_data.kib_tx_descs == NULL) {
1276 CERROR ("Can't allocate tx descs\n");
1280 /* lists/ptrs/locks initialised */
1281 kibnal_data.kib_init = IBNAL_INIT_DATA;
1282 /*****************************************************/
1285 process_id.pid = requested_pid;
1286 process_id.nid = kibnal_data.kib_nid;
1288 rc = lib_init(&kibnal_lib, nal, process_id,
1289 requested_limits, actual_limits);
1291 CERROR("lib_init failed: error %d\n", rc);
1295 /* lib interface initialised */
1296 kibnal_data.kib_init = IBNAL_INIT_LIB;
1297 /*****************************************************/
1299 for (i = 0; i < IBNAL_N_SCHED; i++) {
1300 rc = kibnal_thread_start (kibnal_scheduler,
1301 (void *)((unsigned long)i));
1303 CERROR("Can't spawn openibnal scheduler[%d]: %d\n",
1309 rc = kibnal_thread_start (kibnal_connd, NULL);
1311 CERROR ("Can't spawn openibnal connd: %d\n", rc);
1315 kibnal_data.kib_device = ib_device_get_by_index(0);
1316 if (kibnal_data.kib_device == NULL) {
1317 CERROR ("Can't open ib device 0\n");
1321 rc = ib_device_properties_get(kibnal_data.kib_device,
1322 &kibnal_data.kib_device_props);
1324 CERROR ("Can't get device props: %d\n", rc);
1328 CDEBUG(D_NET, "Max Initiator: %d Max Responder %d\n",
1329 kibnal_data.kib_device_props.max_initiator_per_qp,
1330 kibnal_data.kib_device_props.max_responder_per_qp);
/* probe ports 1 and 2; use the first that reports properties */
1332 kibnal_data.kib_port = 0;
1333 for (i = 1; i <= 2; i++) {
1334 rc = ib_port_properties_get(kibnal_data.kib_device, i,
1335 &kibnal_data.kib_port_props);
1337 kibnal_data.kib_port = i;
1341 if (kibnal_data.kib_port == 0) {
1342 CERROR ("Can't find a port\n");
1346 rc = ib_pd_create(kibnal_data.kib_device,
1347 NULL, &kibnal_data.kib_pd);
1349 CERROR ("Can't create PD: %d\n", rc);
1353 /* flag PD initialised */
1354 kibnal_data.kib_init = IBNAL_INIT_PD;
1355 /*****************************************************/
1358 const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
1359 struct ib_fmr_pool_param params = {
1360 .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
1361 .access = (IB_ACCESS_LOCAL_WRITE |
1362 IB_ACCESS_REMOTE_WRITE |
1363 IB_ACCESS_REMOTE_READ),
1364 .pool_size = pool_size,
1365 .dirty_watermark = (pool_size * 3)/4,
1366 .flush_function = NULL,
1370 rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params,
1371 &kibnal_data.kib_fmr_pool);
1373 CERROR ("Can't create FMR pool size %d: %d\n",
1379 /* flag FMR pool initialised */
1380 kibnal_data.kib_init = IBNAL_INIT_FMR;
1382 /*****************************************************/
1384 rc = kibnal_setup_tx_descs();
1386 CERROR ("Can't register tx descs: %d\n", rc);
1390 /* flag TX descs initialised */
1391 kibnal_data.kib_init = IBNAL_INIT_TXD;
1392 /*****************************************************/
1395 struct ib_cq_callback callback = {
1396 .context = IBNAL_CALLBACK_CTXT,
1397 .policy = IB_CQ_PROVIDER_REARM,
1399 .entry = kibnal_callback,
1403 int nentries = IBNAL_CQ_ENTRIES;
1405 rc = ib_cq_create (kibnal_data.kib_device,
1406 &nentries, &callback, NULL,
1407 &kibnal_data.kib_cq);
1409 CERROR ("Can't create CQ: %d\n", rc);
1413 /* I only want solicited events */
1414 rc = ib_cq_request_notification(kibnal_data.kib_cq, 1);
1418 /* flag CQ initialised */
1419 kibnal_data.kib_init = IBNAL_INIT_CQ;
1420 /*****************************************************/
1422 rc = libcfs_nal_cmd_register(OPENIBNAL, &kibnal_cmd, NULL);
1424 CERROR ("Can't initialise command interface (rc = %d)\n", rc);
1428 /* flag everything initialised */
1429 kibnal_data.kib_init = IBNAL_INIT_ALL;
1430 /*****************************************************/
1432 printk(KERN_INFO "Lustre: OpenIB NAL loaded "
1433 "(initial mem %d)\n", pkmem);
/* failure path: unwind whatever stage we reached */
1438 kibnal_api_shutdown (&kibnal_api);
/* Module unload: drop the sysctl table (if registered), tear down the
 * Portals NI taken at load time, and unregister the NAL. */
1443 kibnal_module_fini (void)
1445 #ifdef CONFIG_SYSCTL
1446 if (kibnal_tunables.kib_sysctl != NULL)
1447 unregister_sysctl_table (kibnal_tunables.kib_sysctl);
1449 PtlNIFini(kibnal_ni);
1451 ptl_unregister_nal(OPENIBNAL);
/* Module load: wire startup/shutdown into the NAL API, set tunable
 * defaults, register the NAL, bring up a Portals NI immediately (gateways
 * need it at load time; PTL_IFACE_DUP is tolerated), and best-effort
 * register the sysctl table.
 * NOTE(review): success return and some error unwinding are elided from
 * this listing; code kept verbatim. */
1455 kibnal_module_init (void)
1459 /* the following must be sizeof(int) for proc_dointvec() */
1460 LASSERT(sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int));
1462 kibnal_api.nal_ni_init = kibnal_api_startup;
1463 kibnal_api.nal_ni_fini = kibnal_api_shutdown;
1465 /* Initialise dynamic tunables to defaults once only */
1466 kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
1468 rc = ptl_register_nal(OPENIBNAL, &kibnal_api);
1470 CERROR("Can't register IBNAL: %d\n", rc);
1471 return (-ENOMEM); /* or something... */
1474 /* Pure gateways want the NAL started up at module load time... */
1475 rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
1476 if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
1477 ptl_unregister_nal(OPENIBNAL);
1481 #ifdef CONFIG_SYSCTL
1482 /* Press on regardless even if registering sysctl doesn't work */
1483 kibnal_tunables.kib_sysctl =
1484 register_sysctl_table (kibnal_top_ctl_table, 0);
/* Standard kernel-module metadata and entry/exit point registration. */
1489 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
1490 MODULE_DESCRIPTION("Kernel OpenIB NAL v0.01");
1491 MODULE_LICENSE("GPL");
1493 module_init(kibnal_module_init);
1494 module_exit(kibnal_module_fini);