1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2004 Cluster File Systems, Inc.
5 * Author: Eric Barton <eric@bartonsoftware.com>
7 * This file is part of Lustre, http://www.lustre.org.
9 * Lustre is free software; you can redistribute it and/or
10 * modify it under the terms of version 2 of the GNU General Public
11 * License as published by the Free Software Foundation.
13 * Lustre is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with Lustre; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 #include "openibnal.h"
/* Module-global NAL state: the Portals network-interface handle, the
 * per-module data blob, and the run-time tunables exposed via the
 * sysctl table below. */
27 ptl_handle_ni_t kibnal_ni;
28 kib_data_t kibnal_data;
29 kib_tunables_t kibnal_tunables;
/* Root sysctl id for this NAL, plus enumerated ids for the individual
 * entries in kibnal_ctl_table below (timeout, listener_timeout, ...). */
31 #define IBNAL_SYSCTL 202
34 IBNAL_SYSCTL_TIMEOUT=1,
35 IBNAL_SYSCTL_LISTENER_TIMEOUT,
/* sysctl entries for the tunables.  timeout/listener_timeout use the
 * stock proc_dointvec handler; port/backlog use kibnal_listener_procint,
 * which restarts the IP listener when the value changes. */
40 static ctl_table kibnal_ctl_table[] = {
41 {IBNAL_SYSCTL_TIMEOUT, "timeout",
42 &kibnal_tunables.kib_io_timeout, sizeof (int),
43 0644, NULL, &proc_dointvec},
44 {IBNAL_SYSCTL_LISTENER_TIMEOUT, "listener_timeout",
45 &kibnal_tunables.kib_listener_timeout, sizeof(int),
46 0644, NULL, &proc_dointvec},
47 {IBNAL_SYSCTL_BACKLOG, "backlog",
48 &kibnal_tunables.kib_backlog, sizeof(int),
49 0644, NULL, kibnal_listener_procint},
50 {IBNAL_SYSCTL_PORT, "port",
51 &kibnal_tunables.kib_port, sizeof(int),
52 0644, NULL, kibnal_listener_procint},
/* Top-level sysctl directory ("openibnal") containing the table above. */
56 static ctl_table kibnal_top_ctl_table[] = {
57 {IBNAL_SYSCTL, "openibnal", NULL, 0, 0555, kibnal_ctl_table},
/* Rotate-left-and-add checksum over 'nob' bytes at 'ptr'.  A zero sum
 * is mapped to 1 because 0 on the wire means "no checksum". */
62 kibnal_cksum (void *ptr, int nob)
68 sum = ((sum << 1) | (sum >> 31)) + *c++;
70 /* ensure I don't return 0 (== no checksum) */
71 return (sum == 0) ? 1 : sum;
/* Initialise a message of the given type; total wire size is the fixed
 * header (up to the ibm_u union) plus 'body_nob' payload bytes. */
75 kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
78 msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob;
/* Stamp the wire header (magic, version, credits, src/dst NID and
 * incarnation stamps) and compute the checksum last — the checksum is
 * taken over the whole message with ibm_cksum still zero. */
82 kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid, __u64 dststamp)
84 /* CAVEAT EMPTOR! all message fields not set here should have been
85 * initialised previously. */
86 msg->ibm_magic = IBNAL_MSG_MAGIC;
87 msg->ibm_version = IBNAL_MSG_VERSION;
89 msg->ibm_credits = credits;
92 msg->ibm_srcnid = kibnal_lib.libnal_ni.ni_pid.nid;
93 msg->ibm_srcstamp = kibnal_data.kib_incarnation;
94 msg->ibm_dstnid = dstnid;
95 msg->ibm_dststamp = dststamp;
97 /* NB ibm_cksum zero while computing cksum */
98 msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
/* Validate and (if the peer has opposite endianness) byte-swap an
 * incoming message of 'nob' bytes: check magic/version/length/checksum,
 * swab the common header fields, then length-check and swab the
 * type-specific payload.  NB the magic itself is left unflipped as a
 * clue to the peer's endianness. */
103 kibnal_unpack_msg(kib_msg_t *msg, int nob)
105 const int hdr_size = offsetof(kib_msg_t, ibm_u);
111 CERROR("Short message: %d\n", nob);
115 if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
117 } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
120 CERROR("Bad magic: %08x\n", msg->ibm_magic);
124 if (msg->ibm_version !=
125 (flip ? __swab16(IBNAL_MSG_VERSION) : IBNAL_MSG_VERSION)) {
126 CERROR("Bad version: %d\n", msg->ibm_version);
130 if (nob < hdr_size) {
131 CERROR("Short message: %d\n", nob);
135 msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
137 CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
/* A zero checksum means the sender didn't compute one; only verify a
 * non-zero checksum, and do so before any fields are swabbed. */
141 /* checksum must be computed with ibm_cksum zero and BEFORE anything
143 msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
145 if (msg_cksum != 0 &&
146 msg_cksum != kibnal_cksum(msg, msg_nob)) {
147 CERROR("Bad checksum\n");
150 msg->ibm_cksum = msg_cksum;
153 /* leave magic unflipped as a clue to peer endianness */
154 __swab16s(&msg->ibm_version);
155 LASSERT (sizeof(msg->ibm_type) == 1);
156 LASSERT (sizeof(msg->ibm_credits) == 1);
157 msg->ibm_nob = msg_nob;
158 __swab64s(&msg->ibm_srcnid);
159 __swab64s(&msg->ibm_srcstamp);
160 __swab64s(&msg->ibm_dstnid);
161 __swab64s(&msg->ibm_dststamp);
164 if (msg->ibm_srcnid == PTL_NID_ANY) {
165 CERROR("Bad src nid: "LPX64"\n", msg->ibm_srcnid);
/* Per-type payload validation/swabbing below. */
169 switch (msg->ibm_type) {
171 CERROR("Unknown message type %x\n", msg->ibm_type);
174 case IBNAL_MSG_SVCQRY:
178 case IBNAL_MSG_SVCRSP:
179 if (msg_nob < hdr_size + sizeof(msg->ibm_u.svcrsp)) {
180 CERROR("Short SVCRSP: %d(%d)\n", msg_nob,
181 (int)(hdr_size + sizeof(msg->ibm_u.svcrsp)));
185 __swab64s(&msg->ibm_u.svcrsp.ibsr_svc_id);
186 __swab16s(&msg->ibm_u.svcrsp.ibsr_svc_pkey);
190 case IBNAL_MSG_CONNREQ:
191 case IBNAL_MSG_CONNACK:
192 if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
193 CERROR("Short CONNREQ: %d(%d)\n", msg_nob,
194 (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
198 __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth);
201 case IBNAL_MSG_IMMEDIATE:
202 if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
203 CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
204 (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
209 case IBNAL_MSG_PUT_RDMA:
210 case IBNAL_MSG_GET_RDMA:
211 if (msg_nob < hdr_size + sizeof(msg->ibm_u.rdma)) {
212 CERROR("Short RDMA req: %d(%d)\n", msg_nob,
213 (int)(hdr_size + sizeof(msg->ibm_u.rdma)));
217 __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_key);
218 __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_nob);
219 __swab64s(&msg->ibm_u.rdma.ibrm_desc.rd_addr);
223 case IBNAL_MSG_PUT_DONE:
224 case IBNAL_MSG_GET_DONE:
225 if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
226 CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
227 (int)(hdr_size + sizeof(msg->ibm_u.completion)));
231 __swab32s(&msg->ibm_u.completion.ibcm_status);
/* Write 'nob' bytes to a kernel socket with a single non-blocking
 * sendmsg.  The socket's send buffer was sized (in kibnal_create_sock)
 * to hold anything we send, so one call should complete fully. */
238 kibnal_sock_write (struct socket *sock, void *buffer, int nob)
241 mm_segment_t oldmm = get_fs();
246 struct msghdr msg = {
253 .msg_flags = MSG_DONTWAIT
256 /* We've set up the socket's send buffer to be large enough for
257 * everything we send, so a single non-blocking send should
258 * complete without error. */
261 rc = sock_sendmsg(sock, &msg, iov.iov_len);
/* Read exactly 'nob' bytes from a kernel socket, giving up after
 * 'timeout' seconds.  The remaining time budget ('ticks') is converted
 * to an SO_RCVTIMEO before each recvmsg and decremented by the elapsed
 * jiffies afterwards; a zero-byte read (EOF) aborts the connection. */
274 kibnal_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
277 mm_segment_t oldmm = get_fs();
278 long ticks = timeout * HZ;
290 struct msghdr msg = {
300 /* Set receive timeout to remaining time */
301 tv = (struct timeval) {
302 .tv_sec = ticks / HZ,
303 .tv_usec = ((ticks % HZ) * 1000000) / HZ
306 rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
307 (char *)&tv, sizeof(tv));
310 CERROR("Can't set socket recv timeout %d: %d\n",
317 rc = sock_recvmsg(sock, &msg, iov.iov_len, 0);
318 ticks -= jiffies - then;
325 return -ECONNABORTED;
327 buffer = ((char *)buffer) + rc;
/* Create a TCP socket with a send buffer big enough for two messages
 * (so kibnal_sock_write never blocks) and SO_REUSEADDR set. */
339 kibnal_create_sock(struct socket **sockp)
344 mm_segment_t oldmm = get_fs();
346 rc = sock_create(PF_INET, SOCK_STREAM, 0, &sock);
348 CERROR("Can't create socket: %d\n", rc);
352 /* Ensure sends will not block */
353 option = 2 * sizeof(kib_msg_t);
355 rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
356 (char *)&option, sizeof(option));
359 CERROR("Can't set send buffer %d: %d\n", option, rc);
365 rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
366 (char *)&option, sizeof(option));
369 CERROR("Can't set SO_REUSEADDR: %d\n", rc);
/* Sleep (uninterruptibly) for 'ticks' jiffies. */
382 kibnal_pause(int ticks)
384 set_current_state(TASK_UNINTERRUPTIBLE);
385 schedule_timeout(ticks);
/* Actively connect to 'peer', binding the local end to a privileged
 * (reserved) port, trying 1023 down to 512.  -EADDRINUSE on bind or
 * -EADDRNOTAVAIL on connect move on to the next port; any other error
 * aborts.  Returns -EHOSTUNREACH when all reserved ports are exhausted. */
389 kibnal_connect_sock(kib_peer_t *peer, struct socket **sockp)
391 struct sockaddr_in locaddr;
392 struct sockaddr_in srvaddr;
397 for (port = 1023; port >= 512; port--) {
399 memset(&locaddr, 0, sizeof(locaddr));
400 locaddr.sin_family = AF_INET;
401 locaddr.sin_port = htons(port);
402 locaddr.sin_addr.s_addr = htonl(INADDR_ANY);
404 memset (&srvaddr, 0, sizeof (srvaddr));
405 srvaddr.sin_family = AF_INET;
406 srvaddr.sin_port = htons (peer->ibp_port);
407 srvaddr.sin_addr.s_addr = htonl (peer->ibp_ip);
409 rc = kibnal_create_sock(&sock);
413 rc = sock->ops->bind(sock,
414 (struct sockaddr *)&locaddr, sizeof(locaddr));
418 if (rc == -EADDRINUSE) {
419 CDEBUG(D_NET, "Port %d already in use\n", port);
423 CERROR("Can't bind to reserved port %d: %d\n", port, rc);
427 rc = sock->ops->connect(sock,
428 (struct sockaddr *)&srvaddr, sizeof(srvaddr),
437 if (rc != -EADDRNOTAVAIL) {
438 CERROR("Can't connect port %d to %u.%u.%u.%u/%d: %d\n",
439 port, HIPQUAD(peer->ibp_ip), peer->ibp_port, rc);
443 CDEBUG(D_NET, "Port %d not available for %u.%u.%u.%u/%d\n",
444 port, HIPQUAD(peer->ibp_ip), peer->ibp_port);
448 return -EHOSTUNREACH;
/* Active side of the TCP handshake: connect to the peer, send a SVCQRY
 * message and read back the SVCRSP.  The response is validated (type,
 * our NID/incarnation as destination, peer's NID as source) before the
 * peer's incarnation and service parameters are recorded on the
 * connection request. */
452 kibnal_make_svcqry (kib_conn_t *conn)
454 kib_peer_t *peer = conn->ibc_peer;
460 LASSERT (conn->ibc_connreq != NULL);
461 msg = &conn->ibc_connreq->cr_msg;
463 kibnal_init_msg(msg, IBNAL_MSG_SVCQRY, 0);
464 kibnal_pack_msg(msg, 0, peer->ibp_nid, 0);
466 rc = kibnal_connect_sock(peer, &sock);
470 rc = kibnal_sock_write(sock, msg, msg->ibm_nob);
472 CERROR("Error %d sending svcqry to "
473 LPX64"@%u.%u.%u.%u/%d\n", rc,
474 peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
478 nob = offsetof(kib_msg_t, ibm_u) + sizeof(msg->ibm_u.svcrsp);
479 rc = kibnal_sock_read(sock, msg, nob, kibnal_tunables.kib_io_timeout);
481 CERROR("Error %d receiving svcrsp from "
482 LPX64"@%u.%u.%u.%u/%d\n", rc,
483 peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
487 rc = kibnal_unpack_msg(msg, nob);
489 CERROR("Error %d unpacking svcrsp from "
490 LPX64"@%u.%u.%u.%u/%d\n", rc,
491 peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
495 if (msg->ibm_type != IBNAL_MSG_SVCRSP) {
496 CERROR("Unexpected response type %d from "
497 LPX64"@%u.%u.%u.%u/%d\n", msg->ibm_type,
498 peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
503 if (msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
504 msg->ibm_dststamp != kibnal_data.kib_incarnation) {
505 CERROR("Unexpected dst NID/stamp "LPX64"/"LPX64" from "
506 LPX64"@%u.%u.%u.%u/%d\n",
507 msg->ibm_dstnid, msg->ibm_dststamp,
508 peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
513 if (msg->ibm_srcnid != peer->ibp_nid) {
514 CERROR("Unexpected src NID "LPX64" from "
515 LPX64"@%u.%u.%u.%u/%d\n", msg->ibm_srcnid,
516 peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
521 conn->ibc_incarnation = msg->ibm_srcstamp;
522 conn->ibc_connreq->cr_svcrsp = msg->ibm_u.svcrsp;
/* Passive side of the TCP handshake: on an accepted socket, verify the
 * peer connected from a privileged port (< 1024), read and validate a
 * SVCQRY addressed to us, then reply with a SVCRSP carrying our IB
 * service id, GID and PKEY, packed with the querier's NID/stamp as
 * destination. */
529 kibnal_handle_svcqry (struct socket *sock)
531 struct sockaddr_in addr;
533 unsigned int peer_port;
541 rc = sock->ops->getname(sock, (struct sockaddr *)&addr, &len, 2);
543 CERROR("Can't get peer's IP: %d\n", rc);
547 peer_ip = ntohl(addr.sin_addr.s_addr);
548 peer_port = ntohs(addr.sin_port);
550 if (peer_port >= 1024) {
551 CERROR("Refusing unprivileged connection from %u.%u.%u.%u/%d\n",
552 HIPQUAD(peer_ip), peer_port);
556 PORTAL_ALLOC(msg, sizeof(*msg));
558 CERROR("Can't allocate msgs for %u.%u.%u.%u/%d\n",
559 HIPQUAD(peer_ip), peer_port);
563 rc = kibnal_sock_read(sock, msg, offsetof(kib_msg_t, ibm_u),
564 kibnal_tunables.kib_listener_timeout);
566 CERROR("Error %d receiving svcqry from %u.%u.%u.%u/%d\n",
567 rc, HIPQUAD(peer_ip), peer_port);
571 rc = kibnal_unpack_msg(msg, offsetof(kib_msg_t, ibm_u));
573 CERROR("Error %d unpacking svcqry from %u.%u.%u.%u/%d\n",
574 rc, HIPQUAD(peer_ip), peer_port);
578 if (msg->ibm_type != IBNAL_MSG_SVCQRY) {
579 CERROR("Unexpected message %d from %u.%u.%u.%u/%d\n",
580 msg->ibm_type, HIPQUAD(peer_ip), peer_port);
584 if (msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid) {
585 CERROR("Unexpected dstnid "LPX64"(expected "LPX64" "
586 "from %u.%u.%u.%u/%d\n", msg->ibm_dstnid,
587 kibnal_lib.libnal_ni.ni_pid.nid,
588 HIPQUAD(peer_ip), peer_port);
/* Save the querier's identity before the buffer is reused for the reply. */
592 srcnid = msg->ibm_srcnid;
593 srcstamp = msg->ibm_srcstamp;
595 kibnal_init_msg(msg, IBNAL_MSG_SVCRSP, sizeof(msg->ibm_u.svcrsp));
597 msg->ibm_u.svcrsp.ibsr_svc_id = kibnal_data.kib_svc_id;
598 memcpy(msg->ibm_u.svcrsp.ibsr_svc_gid, kibnal_data.kib_svc_gid,
599 sizeof(kibnal_data.kib_svc_gid));
600 msg->ibm_u.svcrsp.ibsr_svc_pkey = kibnal_data.kib_svc_pkey;
602 kibnal_pack_msg(msg, 0, srcnid, srcstamp);
604 rc = kibnal_sock_write (sock, msg, msg->ibm_nob);
606 CERROR("Error %d replying to svcqry from %u.%u.%u.%u/%d\n",
607 rc, HIPQUAD(peer_ip), peer_port);
612 PORTAL_FREE(msg, sizeof(*msg));
/* Release an accepted socket and free its tracking structure. */
616 kibnal_free_acceptsock (kib_acceptsock_t *as)
618 sock_release(as->ibas_sock);
619 PORTAL_FREE(as, sizeof(*as));
/* Listener kernel thread: create/bind/listen a TCP socket on the
 * configured port, signal the waiting parent via kib_listener_signal,
 * then loop accepting connections (non-blocking accept, sleeping on the
 * socket's wait queue between attempts) until kib_listener_shutdown is
 * set.  Each accepted socket is queued on kib_connd_acceptq for the
 * connd thread.  On exit the completion status is stored back in
 * kib_listener_shutdown and the waiter is released. */
623 kibnal_ip_listener(void *arg)
625 struct sockaddr_in addr;
628 kib_acceptsock_t *as;
634 /* Parent thread holds kib_nid_mutex, and is, or is about to
635 * block on kib_listener_signal */
637 port = kibnal_tunables.kib_port;
638 snprintf(name, sizeof(name), "kibnal_lstn%03d", port);
639 kportal_daemonize(name);
640 kportal_blockallsigs();
642 init_waitqueue_entry(&wait, current);
644 rc = kibnal_create_sock(&sock);
648 memset(&addr, 0, sizeof(addr));
649 addr.sin_family = AF_INET;
650 addr.sin_port = htons(port);
651 addr.sin_addr.s_addr = INADDR_ANY;
653 rc = sock->ops->bind(sock, (struct sockaddr *)&addr, sizeof(addr));
655 CERROR("Can't bind to port %d\n", port);
659 rc = sock->ops->listen(sock, kibnal_tunables.kib_backlog);
661 CERROR("Can't set listen backlog %d: %d\n",
662 kibnal_tunables.kib_backlog, rc);
666 LASSERT (kibnal_data.kib_listener_sock == NULL);
667 kibnal_data.kib_listener_sock = sock;
669 /* unblock waiting parent */
670 LASSERT (kibnal_data.kib_listener_shutdown == 0);
671 up(&kibnal_data.kib_listener_signal);
673 /* Wake me any time something happens on my socket */
674 add_wait_queue(sock->sk->sk_sleep, &wait);
677 while (kibnal_data.kib_listener_shutdown == 0) {
680 PORTAL_ALLOC(as, sizeof(*as));
682 CERROR("Out of Memory: pausing...\n");
686 as->ibas_sock = NULL;
689 if (as->ibas_sock == NULL) {
690 as->ibas_sock = sock_alloc();
691 if (as->ibas_sock == NULL) {
692 CERROR("Can't allocate socket: pausing...\n");
696 /* XXX this should add a ref to sock->ops->owner, if
697 * TCP could be a module */
698 as->ibas_sock->type = sock->type;
699 as->ibas_sock->ops = sock->ops;
702 set_current_state(TASK_INTERRUPTIBLE);
704 rc = sock->ops->accept(sock, as->ibas_sock, O_NONBLOCK);
706 /* Sleep for socket activity? */
708 kibnal_data.kib_listener_shutdown == 0)
711 set_current_state(TASK_RUNNING);
714 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
716 list_add_tail(&as->ibas_list,
717 &kibnal_data.kib_connd_acceptq);
719 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
720 wake_up(&kibnal_data.kib_connd_waitq);
727 CERROR("Accept failed: %d, pausing...\n", rc);
733 if (as->ibas_sock != NULL)
734 sock_release(as->ibas_sock);
735 PORTAL_FREE(as, sizeof(*as));
739 remove_wait_queue(sock->sk->sk_sleep, &wait);
742 kibnal_data.kib_listener_sock = NULL;
744 /* set completion status and unblock thread waiting for me
745 * (parent on startup failure, executioner on normal shutdown) */
746 kibnal_data.kib_listener_shutdown = rc;
747 up(&kibnal_data.kib_listener_signal);
/* Spawn the IP listener thread and block on kib_listener_signal until
 * it has either started (listener_sock set) or failed (non-zero status
 * left in kib_listener_shutdown).  Called holding kib_nid_mutex. */
753 kibnal_start_ip_listener (void)
758 CDEBUG(D_NET, "Starting listener\n");
760 /* Called holding kib_nid_mutex: listener stopped */
761 LASSERT (kibnal_data.kib_listener_sock == NULL);
763 kibnal_data.kib_listener_shutdown = 0;
764 pid = kernel_thread(kibnal_ip_listener, NULL, 0);
766 CERROR("Can't spawn listener: %ld\n", pid);
770 /* Block until listener has started up. */
771 down(&kibnal_data.kib_listener_signal);
773 rc = kibnal_data.kib_listener_shutdown;
774 LASSERT ((rc != 0) == (kibnal_data.kib_listener_sock == NULL));
776 CDEBUG((rc == 0) ? D_WARNING : D_ERROR,
777 "Listener %s: pid:%ld port:%d backlog:%d\n",
778 (rc == 0) ? "started OK" : "startup failed",
779 pid, kibnal_tunables.kib_port, kibnal_tunables.kib_backlog);
/* Stop the IP listener: set the shutdown flag, wake any sleeper on the
 * listening socket, and wait for the thread to finish.  When
 * 'clear_acceptq' is set, also drain and free all not-yet-handled
 * accepted sockets from kib_connd_acceptq.  Called holding
 * kib_nid_mutex. */
785 kibnal_stop_ip_listener(int clear_acceptq)
787 struct list_head zombie_accepts;
788 kib_acceptsock_t *as;
791 CDEBUG(D_NET, "Stopping listener\n");
793 /* Called holding kib_nid_mutex: listener running */
794 LASSERT (kibnal_data.kib_listener_sock != NULL);
796 kibnal_data.kib_listener_shutdown = 1;
797 wake_up_all(kibnal_data.kib_listener_sock->sk->sk_sleep);
799 /* Block until listener has torn down. */
800 down(&kibnal_data.kib_listener_signal);
802 LASSERT (kibnal_data.kib_listener_sock == NULL);
803 CWARN("Listener stopped\n");
808 /* Close any unhandled accepts */
809 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
811 list_add(&zombie_accepts, &kibnal_data.kib_connd_acceptq);
812 list_del_init(&kibnal_data.kib_connd_acceptq);
814 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
816 while (!list_empty(&zombie_accepts)) {
817 as = list_entry(zombie_accepts.next,
818 kib_acceptsock_t, ibas_list);
819 list_del(&as->ibas_list);
820 kibnal_free_acceptsock(as);
/* sysctl write handler for the "port" and "backlog" tunables.  Under
 * kib_nid_mutex: run proc_dointvec, and if the value changed (or the
 * listener was down) restart the IP listener; on failure revert to the
 * old value and restart with it.  The two #if branches cover the
 * proc-handler signature change in kernel 2.6.8 (extra loff_t *ppos). */
824 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8)
826 kibnal_listener_procint(ctl_table *table, int write, struct file *filp,
827 void *buffer, size_t *lenp)
830 kibnal_listener_procint(ctl_table *table, int write, struct file *filp,
831 void *buffer, size_t *lenp, loff_t *ppos)
834 int *tunable = (int *)table->data;
838 /* No race with nal initialisation since the nal is setup all the time
839 * it's loaded. When that changes, change this! */
840 LASSERT (kibnal_data.kib_init == IBNAL_INIT_ALL);
842 down(&kibnal_data.kib_nid_mutex);
844 LASSERT (tunable == &kibnal_tunables.kib_port ||
845 tunable == &kibnal_tunables.kib_backlog);
848 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8)
849 rc = proc_dointvec(table, write, filp, buffer, lenp);
851 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
854 (*tunable != old_val ||
855 kibnal_data.kib_listener_sock == NULL)) {
857 if (kibnal_data.kib_listener_sock != NULL)
858 kibnal_stop_ip_listener(0);
860 rc = kibnal_start_ip_listener();
862 CERROR("Unable to restart listener with new tunable:"
863 " reverting to old value\n");
865 kibnal_start_ip_listener();
869 up(&kibnal_data.kib_nid_mutex);
871 LASSERT (kibnal_data.kib_init == IBNAL_INIT_ALL);
/* Start the InfiniBand CM listener: assign a service id, look up our
 * port's GID and PKEY from the cached tables, then listen for
 * connection requests via kibnal_passive_conn_callback. */
876 kibnal_start_ib_listener (void)
880 LASSERT (kibnal_data.kib_listen_handle == NULL);
882 kibnal_data.kib_svc_id = ib_cm_service_assign();
883 CDEBUG(D_NET, "svc id "LPX64"\n", kibnal_data.kib_svc_id);
885 rc = ib_cached_gid_get(kibnal_data.kib_device,
886 kibnal_data.kib_port, 0,
887 kibnal_data.kib_svc_gid);
889 CERROR("Can't get port %d GID: %d\n",
890 kibnal_data.kib_port, rc);
894 rc = ib_cached_pkey_get(kibnal_data.kib_device,
895 kibnal_data.kib_port, 0,
896 &kibnal_data.kib_svc_pkey);
898 CERROR ("Can't get port %d PKEY: %d\n",
899 kibnal_data.kib_port, rc);
903 rc = ib_cm_listen(kibnal_data.kib_svc_id,
904 TS_IB_CM_SERVICE_EXACT_MASK,
905 kibnal_passive_conn_callback, NULL,
906 &kibnal_data.kib_listen_handle);
908 kibnal_data.kib_listen_handle = NULL;
909 CERROR ("Can't create IB listener: %d\n", rc);
913 LASSERT (kibnal_data.kib_listen_handle != NULL);
/* Stop the IB CM listener and clear its handle. */
918 kibnal_stop_ib_listener (void)
922 LASSERT (kibnal_data.kib_listen_handle != NULL);
924 rc = ib_cm_listen_stop (kibnal_data.kib_listen_handle);
926 CERROR("Error stopping IB listener: %d\n", rc);
928 kibnal_data.kib_listen_handle = NULL;
/* Install a new local NID.  Under kib_nid_mutex: no-op if unchanged;
 * otherwise stop both listeners, set the new NID, bump the incarnation,
 * delete all peers (so no stale connections survive), and — if the new
 * NID is valid — restart the IB and IP listeners.  On restart failure
 * the NID is reset to PTL_NID_ANY and peers are purged again. */
932 kibnal_set_mynid (ptl_nid_t nid)
934 lib_ni_t *ni = &kibnal_lib.libnal_ni;
937 CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
938 nid, ni->ni_pid.nid);
940 down (&kibnal_data.kib_nid_mutex);
942 if (nid == kibnal_data.kib_nid) {
943 /* no change of NID */
944 up (&kibnal_data.kib_nid_mutex);
948 CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
949 kibnal_data.kib_nid, nid);
951 if (kibnal_data.kib_listener_sock != NULL)
952 kibnal_stop_ip_listener(1);
954 if (kibnal_data.kib_listen_handle != NULL)
955 kibnal_stop_ib_listener();
957 ni->ni_pid.nid = nid;
958 kibnal_data.kib_incarnation++;
960 /* Delete all existing peers and their connections after new
961 * NID/incarnation set to ensure no old connections in our brave new
963 kibnal_del_peer (PTL_NID_ANY, 0);
965 if (ni->ni_pid.nid != PTL_NID_ANY) {
966 /* got a new NID to install */
967 rc = kibnal_start_ib_listener();
969 CERROR("Can't start IB listener: %d\n", rc);
973 rc = kibnal_start_ip_listener();
975 CERROR("Can't start IP listener: %d\n", rc);
980 up(&kibnal_data.kib_nid_mutex);
/* Failure path: undo listener startup and reset our identity. */
984 kibnal_stop_ib_listener();
986 ni->ni_pid.nid = PTL_NID_ANY;
987 kibnal_data.kib_incarnation++;
989 kibnal_del_peer (PTL_NID_ANY, 0);
990 up(&kibnal_data.kib_nid_mutex);
/* Allocate and initialise a peer for 'nid' with one reference for the
 * caller; not yet in the peer table.  Bumps kib_npeers. */
995 kibnal_create_peer (ptl_nid_t nid)
999 LASSERT (nid != PTL_NID_ANY);
1001 PORTAL_ALLOC (peer, sizeof (*peer));
1005 memset(peer, 0, sizeof(*peer)); /* zero flags etc */
1007 peer->ibp_nid = nid;
1008 atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */
1010 INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */
1011 INIT_LIST_HEAD (&peer->ibp_conns);
1012 INIT_LIST_HEAD (&peer->ibp_tx_queue);
1013 INIT_LIST_HEAD (&peer->ibp_connd_list); /* not queued for connecting */
1015 peer->ibp_reconnect_time = jiffies;
1016 peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
1018 atomic_inc (&kibnal_data.kib_npeers);
1019 CDEBUG(D_NET, "peer %p "LPX64"\n", peer, nid);
/* Free a peer whose refcount has reached zero.  Asserts it is fully
 * quiescent (unhashed, no conns, not connecting, empty queues) before
 * freeing; kib_npeers is decremented only after the free. */
1025 kibnal_destroy_peer (kib_peer_t *peer)
1027 CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ibp_nid, peer);
1029 LASSERT (atomic_read (&peer->ibp_refcount) == 0);
1030 LASSERT (peer->ibp_persistence == 0);
1031 LASSERT (!kibnal_peer_active(peer));
1032 LASSERT (peer->ibp_connecting == 0);
1033 LASSERT (list_empty (&peer->ibp_connd_list));
1034 LASSERT (list_empty (&peer->ibp_conns));
1035 LASSERT (list_empty (&peer->ibp_tx_queue));
1037 PORTAL_FREE (peer, sizeof (*peer));
1039 /* NB a peer's connections keep a reference on their peer until
1040 * they are destroyed, so we can be assured that _all_ state to do
1041 * with this peer has been cleaned up when its refcount drops to
1043 atomic_dec (&kibnal_data.kib_npeers);
/* Drop one reference on 'peer'; destroy it when the count hits zero. */
1047 kibnal_put_peer (kib_peer_t *peer)
1049 CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n",
1050 peer, peer->ibp_nid,
1051 atomic_read (&peer->ibp_refcount));
1053 LASSERT (atomic_read (&peer->ibp_refcount) > 0);
1054 if (!atomic_dec_and_test (&peer->ibp_refcount))
1057 kibnal_destroy_peer (peer);
/* Scan the NID's hash bucket for a matching peer.  Caller holds
 * kib_global_lock.  No reference is taken here. */
1061 kibnal_find_peer_locked (ptl_nid_t nid)
1063 struct list_head *peer_list = kibnal_nid2peerlist (nid);
1064 struct list_head *tmp;
1067 list_for_each (tmp, peer_list) {
1069 peer = list_entry (tmp, kib_peer_t, ibp_list);
1071 LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
1072 peer->ibp_connecting != 0 || /* creating conns */
1073 !list_empty (&peer->ibp_conns)); /* active conn */
1075 if (peer->ibp_nid != nid)
1078 CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
1079 peer, nid, atomic_read (&peer->ibp_refcount));
/* Look up 'nid' under the global read lock, taking a reference on the
 * peer for the caller if found. */
1086 kibnal_get_peer (ptl_nid_t nid)
1089 unsigned long flags;
1091 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1092 peer = kibnal_find_peer_locked (nid);
1093 if (peer != NULL) /* +1 ref for caller? */
1094 atomic_inc (&peer->ibp_refcount);
1095 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Remove a non-persistent, connection-less peer from the hash table and
 * drop the table's reference.  Caller holds kib_global_lock (write). */
1101 kibnal_unlink_peer_locked (kib_peer_t *peer)
1103 LASSERT (peer->ibp_persistence == 0);
1104 LASSERT (list_empty(&peer->ibp_conns));
1106 LASSERT (kibnal_peer_active(peer));
1107 list_del_init (&peer->ibp_list);
1108 /* lose peerlist's ref */
1109 kibnal_put_peer (peer);
/* Report the index-th peer in hash order: its NID, IP, port and share
 * (persistence) count, copied out under the global read lock. */
1113 kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp, int *portp,
1117 struct list_head *ptmp;
1118 unsigned long flags;
1121 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1123 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
1125 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
1127 peer = list_entry (ptmp, kib_peer_t, ibp_list);
1128 LASSERT (peer->ibp_persistence != 0 ||
1129 peer->ibp_connecting != 0 ||
1130 !list_empty (&peer->ibp_conns));
1135 *nidp = peer->ibp_nid;
1136 *ipp = peer->ibp_ip;
1137 *portp = peer->ibp_port;
1138 *persistencep = peer->ibp_persistence;
1140 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
1146 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Add (or re-reference) a persistent peer at nid/ip/port.  A freshly
 * created peer is inserted unless one already exists, in which case the
 * new one is dropped; either way the surviving peer's persistence count
 * is bumped under the global write lock. */
1151 kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip, int port)
1153 unsigned long flags;
1157 if (nid == PTL_NID_ANY)
1160 peer = kibnal_create_peer (nid);
1164 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
1166 peer2 = kibnal_find_peer_locked (nid);
1167 if (peer2 != NULL) {
1168 kibnal_put_peer (peer);
1171 /* peer table takes existing ref on peer */
1172 list_add_tail (&peer->ibp_list,
1173 kibnal_nid2peerlist (nid));
1177 peer->ibp_port = port;
1178 peer->ibp_persistence++;
1180 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
/* Delete one peer under the global write lock: drop one share (or all,
 * when !single_share); once persistence reaches zero, either unlink it
 * immediately (no conns) or close every connection, which releases the
 * peer as the conns die. */
1185 kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
1187 struct list_head *ctmp;
1188 struct list_head *cnxt;
1192 peer->ibp_persistence = 0;
1193 else if (peer->ibp_persistence > 0)
1194 peer->ibp_persistence--;
1196 if (peer->ibp_persistence != 0)
1199 if (list_empty(&peer->ibp_conns)) {
1200 kibnal_unlink_peer_locked(peer);
1202 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1203 conn = list_entry(ctmp, kib_conn_t, ibc_list);
1205 kibnal_close_conn_locked (conn, 0);
1207 /* NB peer is no longer persistent; closing its last conn
1210 /* NB peer now unlinked; might even be freed if the peer table had the
1211 * last ref on it. */
/* Delete the peer matching 'nid', or every peer when nid is
 * PTL_NID_ANY; scans either the one hash bucket or the whole table
 * under the global write lock.  rc is 0 iff something matched. */
1215 kibnal_del_peer (ptl_nid_t nid, int single_share)
1217 unsigned long flags;
1218 struct list_head *ptmp;
1219 struct list_head *pnxt;
1226 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
1228 if (nid != PTL_NID_ANY)
1229 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
1232 hi = kibnal_data.kib_peer_hash_size - 1;
1235 for (i = lo; i <= hi; i++) {
1236 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
1237 peer = list_entry (ptmp, kib_peer_t, ibp_list);
1238 LASSERT (peer->ibp_persistence != 0 ||
1239 peer->ibp_connecting != 0 ||
1240 !list_empty (&peer->ibp_conns));
1242 if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
1245 kibnal_del_peer_locked (peer, single_share);
1246 rc = 0; /* matched something */
1253 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
/* Return the index-th connection across all peers (hash order), with a
 * reference taken for the caller, under the global read lock. */
1259 kibnal_get_conn_by_idx (int index)
1262 struct list_head *ptmp;
1264 struct list_head *ctmp;
1265 unsigned long flags;
1268 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1270 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
1271 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
1273 peer = list_entry (ptmp, kib_peer_t, ibp_list);
1274 LASSERT (peer->ibp_persistence > 0 ||
1275 peer->ibp_connecting != 0 ||
1276 !list_empty (&peer->ibp_conns));
1278 list_for_each (ctmp, &peer->ibp_conns) {
1282 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1283 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
1284 conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
1285 atomic_read (&conn->ibc_refcount));
1286 atomic_inc (&conn->ibc_refcount);
1287 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
1294 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Allocate and set up a connection: rx descriptor array, registered rx
 * message pages (each rx gets a kernel virtual address and mapped
 * vaddr, IBNAL_MSG_SIZE apart, advancing a page at a time), then an RC
 * queue pair transitioned to the INIT state.  Returns the conn with one
 * reference for the caller; on any failure kibnal_destroy_conn() tears
 * down whatever was built. */
1299 kibnal_create_conn (void)
1309 struct ib_qp_create_param qp_create;
1310 struct ib_qp_attribute qp_attr;
1313 PORTAL_ALLOC (conn, sizeof (*conn));
1315 CERROR ("Can't allocate connection\n");
1319 /* zero flags, NULL pointers etc... */
1320 memset (conn, 0, sizeof (*conn));
1322 INIT_LIST_HEAD (&conn->ibc_tx_queue);
1323 INIT_LIST_HEAD (&conn->ibc_active_txs);
1324 spin_lock_init (&conn->ibc_lock);
1326 atomic_inc (&kibnal_data.kib_nconns);
1327 /* well not really, but I call destroy() on failure, which decrements */
1329 PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
1330 if (conn->ibc_rxs == NULL)
1332 memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
1334 rc = kibnal_alloc_pages(&conn->ibc_rx_pages,
1336 IB_ACCESS_LOCAL_WRITE);
1340 vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
1342 for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
1343 struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
1344 kib_rx_t *rx = &conn->ibc_rxs[i];
1347 rx->rx_vaddr = vaddr;
1348 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
1350 vaddr += IBNAL_MSG_SIZE;
1351 LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
1353 page_offset += IBNAL_MSG_SIZE;
1354 LASSERT (page_offset <= PAGE_SIZE);
1356 if (page_offset == PAGE_SIZE) {
1359 LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
1363 params.qp_create = (struct ib_qp_create_param) {
1365 /* Sends have an optional RDMA */
1366 .max_outstanding_send_request = 2 * IBNAL_MSG_QUEUE_SIZE,
1367 .max_outstanding_receive_request = IBNAL_MSG_QUEUE_SIZE,
1368 .max_send_gather_element = 1,
1369 .max_receive_scatter_element = 1,
1371 .pd = kibnal_data.kib_pd,
1372 .send_queue = kibnal_data.kib_cq,
1373 .receive_queue = kibnal_data.kib_cq,
1374 .send_policy = IB_WQ_SIGNAL_SELECTABLE,
1375 .receive_policy = IB_WQ_SIGNAL_SELECTABLE,
1377 .transport = IB_TRANSPORT_RC,
1378 .device_specific = NULL,
1381 rc = ib_qp_create (&params.qp_create, &conn->ibc_qp, &conn->ibc_qpn);
1383 CERROR ("Failed to create queue pair: %d\n", rc);
1387 /* Mark QP created */
1388 conn->ibc_state = IBNAL_CONN_INIT_QP;
1390 params.qp_attr = (struct ib_qp_attribute) {
1391 .state = IB_QP_STATE_INIT,
1392 .port = kibnal_data.kib_port,
1393 .enable_rdma_read = 1,
1394 .enable_rdma_write = 1,
1395 .valid_fields = (IB_QP_ATTRIBUTE_STATE |
1396 IB_QP_ATTRIBUTE_PORT |
1397 IB_QP_ATTRIBUTE_PKEY_INDEX |
1398 IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE),
1400 rc = ib_qp_modify(conn->ibc_qp, &params.qp_attr);
1402 CERROR ("Failed to modify queue pair: %d\n", rc);
1406 /* 1 ref for caller */
1407 atomic_set (&conn->ibc_refcount, 1);
1411 kibnal_destroy_conn (conn);
/* Free a zero-ref connection.  The state switch falls through so each
 * setup stage is undone: ZOMBIE/INIT_QP destroy the QP, then rx pages,
 * rx array and the peer ref are released unconditionally.  If this was
 * the last conn during shutdown, wake the scheduler/reaper so they can
 * exit. */
1416 kibnal_destroy_conn (kib_conn_t *conn)
1420 CDEBUG (D_NET, "connection %p\n", conn);
1422 LASSERT (atomic_read (&conn->ibc_refcount) == 0);
1423 LASSERT (list_empty(&conn->ibc_tx_queue));
1424 LASSERT (list_empty(&conn->ibc_active_txs));
1425 LASSERT (conn->ibc_nsends_posted == 0);
1426 LASSERT (conn->ibc_connreq == NULL);
1428 switch (conn->ibc_state) {
1429 case IBNAL_CONN_ZOMBIE:
1430 /* called after connection sequence initiated */
1432 case IBNAL_CONN_INIT_QP:
1433 rc = ib_qp_destroy(conn->ibc_qp);
1435 CERROR("Can't destroy QP: %d\n", rc);
1438 case IBNAL_CONN_INIT_NOTHING:
1445 if (conn->ibc_rx_pages != NULL)
1446 kibnal_free_pages(conn->ibc_rx_pages);
1448 if (conn->ibc_rxs != NULL)
1449 PORTAL_FREE(conn->ibc_rxs,
1450 IBNAL_RX_MSGS * sizeof(kib_rx_t));
1452 if (conn->ibc_peer != NULL)
1453 kibnal_put_peer(conn->ibc_peer);
1455 PORTAL_FREE(conn, sizeof (*conn));
1457 atomic_dec(&kibnal_data.kib_nconns);
1459 if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
1460 kibnal_data.kib_shutdown) {
1461 /* I just nuked the last connection on shutdown; wake up
1462 * everyone so they can exit. */
1463 wake_up_all(&kibnal_data.kib_sched_waitq);
1464 wake_up_all(&kibnal_data.kib_reaper_waitq);
/* Drop one connection reference.  The last ref may only be dropped on a
 * ZOMBIE conn; it is then queued for the reaper thread to destroy. */
1469 kibnal_put_conn (kib_conn_t *conn)
1471 unsigned long flags;
1473 CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n",
1474 conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
1475 atomic_read (&conn->ibc_refcount));
1477 LASSERT (atomic_read (&conn->ibc_refcount) > 0);
1478 if (!atomic_dec_and_test (&conn->ibc_refcount))
1481 /* last ref only goes on zombies */
1482 LASSERT (conn->ibc_state == IBNAL_CONN_ZOMBIE);
1484 spin_lock_irqsave (&kibnal_data.kib_reaper_lock, flags);
1486 list_add (&conn->ibc_list, &kibnal_data.kib_reaper_conns);
1487 wake_up (&kibnal_data.kib_reaper_waitq);
1489 spin_unlock_irqrestore (&kibnal_data.kib_reaper_lock, flags);
/* Close every connection of 'peer' with the given reason.  Caller holds
 * the global write lock. */
1493 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
1496 struct list_head *ctmp;
1497 struct list_head *cnxt;
1500 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1501 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1504 kibnal_close_conn_locked (conn, why);
/* Close every connection of 'peer' whose incarnation differs from the
 * given one (i.e. pre-dates the peer's reboot), with -ESTALE.  Caller
 * holds the global write lock. */
1511 kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
1514 struct list_head *ctmp;
1515 struct list_head *cnxt;
1518 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1519 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1521 if (conn->ibc_incarnation == incarnation)
1524 CDEBUG(D_NET, "Closing stale conn %p nid:"LPX64
1525 " incarnation:"LPX64"("LPX64")\n", conn,
1526 peer->ibp_nid, conn->ibc_incarnation, incarnation);
1529 kibnal_close_conn_locked (conn, -ESTALE);
/* Close all connections to 'nid' (or to everyone when PTL_NID_ANY),
 * scanning the relevant hash bucket(s) under the global write lock.
 * Wildcard requests always succeed; a specific NID returns -ENOENT if
 * no connection was closed. */
1536 kibnal_close_matching_conns (ptl_nid_t nid)
1538 unsigned long flags;
1540 struct list_head *ptmp;
1541 struct list_head *pnxt;
1547 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
1549 if (nid != PTL_NID_ANY)
1550 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
1553 hi = kibnal_data.kib_peer_hash_size - 1;
1556 for (i = lo; i <= hi; i++) {
1557 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
1559 peer = list_entry (ptmp, kib_peer_t, ibp_list);
1560 LASSERT (peer->ibp_persistence != 0 ||
1561 peer->ibp_connecting != 0 ||
1562 !list_empty (&peer->ibp_conns));
1564 if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
1567 count += kibnal_close_peer_conns_locked (peer, 0);
1571 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1573 /* wildcards always succeed */
1574 if (nid == PTL_NID_ANY)
1577 return (count == 0 ? -ENOENT : 0);
1581 kibnal_cmd(struct portals_cfg *pcfg, void * private)
1585 LASSERT (pcfg != NULL);
1587 switch(pcfg->pcfg_command) {
1588 case NAL_CMD_GET_PEER: {
1592 int share_count = 0;
1594 rc = kibnal_get_peer_info(pcfg->pcfg_count,
1595 &nid, &ip, &port, &share_count);
1596 pcfg->pcfg_nid = nid;
1597 pcfg->pcfg_size = 0;
1599 pcfg->pcfg_misc = port;
1600 pcfg->pcfg_count = 0;
1601 pcfg->pcfg_wait = share_count;
1604 case NAL_CMD_ADD_PEER: {
1605 rc = kibnal_add_persistent_peer (pcfg->pcfg_nid,
1606 pcfg->pcfg_id, /* IP */
1607 pcfg->pcfg_misc); /* port */
1610 case NAL_CMD_DEL_PEER: {
1611 rc = kibnal_del_peer (pcfg->pcfg_nid,
1612 /* flags == single_share */
1613 pcfg->pcfg_flags != 0);
1616 case NAL_CMD_GET_CONN: {
1617 kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
1623 pcfg->pcfg_nid = conn->ibc_peer->ibp_nid;
1625 pcfg->pcfg_misc = 0;
1626 pcfg->pcfg_flags = 0;
1627 kibnal_put_conn (conn);
1631 case NAL_CMD_CLOSE_CONNECTION: {
1632 rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
1635 case NAL_CMD_REGISTER_MYNID: {
1636 if (pcfg->pcfg_nid == PTL_NID_ANY)
1639 rc = kibnal_set_mynid (pcfg->pcfg_nid);
1648 kibnal_free_pages (kib_pages_t *p)
1650 int npages = p->ibp_npages;
1654 if (p->ibp_mapped) {
1655 rc = ib_memory_deregister(p->ibp_handle);
1657 CERROR ("Deregister error: %d\n", rc);
1660 for (i = 0; i < npages; i++)
1661 if (p->ibp_pages[i] != NULL)
1662 __free_page(p->ibp_pages[i]);
1664 PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
1668 kibnal_alloc_pages (kib_pages_t **pp, int npages, int access)
1671 struct ib_physical_buffer *phys_pages;
1675 PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
1677 CERROR ("Can't allocate buffer %d\n", npages);
1681 memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
1682 p->ibp_npages = npages;
1684 for (i = 0; i < npages; i++) {
1685 p->ibp_pages[i] = alloc_page (GFP_KERNEL);
1686 if (p->ibp_pages[i] == NULL) {
1687 CERROR ("Can't allocate page %d of %d\n", i, npages);
1688 kibnal_free_pages(p);
1693 PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
1694 if (phys_pages == NULL) {
1695 CERROR ("Can't allocate physarray for %d pages\n", npages);
1696 kibnal_free_pages(p);
1700 for (i = 0; i < npages; i++) {
1701 phys_pages[i].size = PAGE_SIZE;
1702 phys_pages[i].address =
1703 kibnal_page2phys(p->ibp_pages[i]);
1707 rc = ib_memory_register_physical(kibnal_data.kib_pd,
1710 npages * PAGE_SIZE, 0,
1716 PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
1719 CERROR ("Error %d mapping %d pages\n", rc, npages);
1720 kibnal_free_pages(p);
1730 kibnal_setup_tx_descs (void)
1733 int page_offset = 0;
1741 /* pre-mapped messages are not bigger than 1 page */
1742 LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
1744 /* No fancy arithmetic when we do the buffer calculations */
1745 LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
1747 rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
1749 0); /* local read access only */
1753 vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
1755 for (i = 0; i < IBNAL_TX_MSGS; i++) {
1756 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
1757 tx = &kibnal_data.kib_tx_descs[i];
1759 memset (tx, 0, sizeof(*tx)); /* zero flags etc */
1761 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
1762 tx->tx_vaddr = vaddr;
1763 tx->tx_isnblk = (i >= IBNAL_NTX);
1764 tx->tx_mapped = KIB_TX_UNMAPPED;
1766 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n",
1767 i, tx, tx->tx_msg, tx->tx_vaddr);
1770 list_add (&tx->tx_list,
1771 &kibnal_data.kib_idle_nblk_txs);
1773 list_add (&tx->tx_list,
1774 &kibnal_data.kib_idle_txs);
1776 vaddr += IBNAL_MSG_SIZE;
1777 LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
1779 page_offset += IBNAL_MSG_SIZE;
1780 LASSERT (page_offset <= PAGE_SIZE);
1782 if (page_offset == PAGE_SIZE) {
1785 LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
1793 kibnal_api_shutdown (nal_t *nal)
1798 if (nal->nal_refct != 0) {
1799 /* This module got the first ref */
1800 PORTAL_MODULE_UNUSE;
1804 CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
1805 atomic_read (&portal_kmemory));
1807 LASSERT(nal == &kibnal_api);
1809 switch (kibnal_data.kib_init) {
1811 CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
1814 case IBNAL_INIT_ALL:
1815 /* stop calls to nal_cmd */
1816 libcfs_nal_cmd_unregister(OPENIBNAL);
1819 /* resetting my NID unadvertises me, removes my
1820 * listener and nukes all current peers */
1821 kibnal_set_mynid (PTL_NID_ANY);
1823 /* Wait for all peer state to clean up */
1825 while (atomic_read (&kibnal_data.kib_npeers) != 0) {
1827 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1828 "waiting for %d peers to close down\n",
1829 atomic_read (&kibnal_data.kib_npeers));
1830 set_current_state (TASK_INTERRUPTIBLE);
1831 schedule_timeout (HZ);
1836 rc = ib_cq_destroy (kibnal_data.kib_cq);
1838 CERROR ("Destroy CQ error: %d\n", rc);
1841 case IBNAL_INIT_TXD:
1842 kibnal_free_pages (kibnal_data.kib_tx_pages);
1845 case IBNAL_INIT_FMR:
1846 rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
1848 CERROR ("Destroy FMR pool error: %d\n", rc);
1852 rc = ib_pd_destroy(kibnal_data.kib_pd);
1854 CERROR ("Destroy PD error: %d\n", rc);
1857 case IBNAL_INIT_LIB:
1858 lib_fini(&kibnal_lib);
1861 case IBNAL_INIT_DATA:
1862 /* Module refcount only gets to zero when all peers
1863 * have been closed so all lists must be empty */
1864 LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
1865 LASSERT (kibnal_data.kib_peers != NULL);
1866 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
1867 LASSERT (list_empty (&kibnal_data.kib_peers[i]));
1869 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
1870 LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
1871 LASSERT (list_empty (&kibnal_data.kib_sched_txq));
1872 LASSERT (list_empty (&kibnal_data.kib_reaper_conns));
1873 LASSERT (list_empty (&kibnal_data.kib_connd_peers));
1874 LASSERT (list_empty (&kibnal_data.kib_connd_acceptq));
1876 /* flag threads to terminate; wake and wait for them to die */
1877 kibnal_data.kib_shutdown = 1;
1878 wake_up_all (&kibnal_data.kib_sched_waitq);
1879 wake_up_all (&kibnal_data.kib_reaper_waitq);
1880 wake_up_all (&kibnal_data.kib_connd_waitq);
1883 while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
1885 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1886 "Waiting for %d threads to terminate\n",
1887 atomic_read (&kibnal_data.kib_nthreads));
1888 set_current_state (TASK_INTERRUPTIBLE);
1889 schedule_timeout (HZ);
1893 case IBNAL_INIT_NOTHING:
1897 if (kibnal_data.kib_tx_descs != NULL)
1898 PORTAL_FREE (kibnal_data.kib_tx_descs,
1899 IBNAL_TX_MSGS * sizeof(kib_tx_t));
1901 if (kibnal_data.kib_peers != NULL)
1902 PORTAL_FREE (kibnal_data.kib_peers,
1903 sizeof (struct list_head) *
1904 kibnal_data.kib_peer_hash_size);
1906 CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
1907 atomic_read (&portal_kmemory));
1908 printk(KERN_INFO "Lustre: OpenIB NAL unloaded (final mem %d)\n",
1909 atomic_read(&portal_kmemory));
1911 kibnal_data.kib_init = IBNAL_INIT_NOTHING;
1915 kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
1916 ptl_ni_limits_t *requested_limits,
1917 ptl_ni_limits_t *actual_limits)
1920 ptl_process_id_t process_id;
1921 int pkmem = atomic_read(&portal_kmemory);
1925 LASSERT (nal == &kibnal_api);
1927 if (nal->nal_refct != 0) {
1928 if (actual_limits != NULL)
1929 *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
1930 /* This module got the first ref */
1935 LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
1937 memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */
1939 do_gettimeofday(&tv);
1940 kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
1942 init_MUTEX (&kibnal_data.kib_nid_mutex);
1943 init_MUTEX_LOCKED (&kibnal_data.kib_listener_signal);
1945 rwlock_init(&kibnal_data.kib_global_lock);
1947 kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
1948 PORTAL_ALLOC (kibnal_data.kib_peers,
1949 sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
1950 if (kibnal_data.kib_peers == NULL) {
1953 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
1954 INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
1956 spin_lock_init (&kibnal_data.kib_reaper_lock);
1957 INIT_LIST_HEAD (&kibnal_data.kib_reaper_conns);
1958 init_waitqueue_head (&kibnal_data.kib_reaper_waitq);
1960 spin_lock_init (&kibnal_data.kib_connd_lock);
1961 INIT_LIST_HEAD (&kibnal_data.kib_connd_acceptq);
1962 INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
1963 init_waitqueue_head (&kibnal_data.kib_connd_waitq);
1965 spin_lock_init (&kibnal_data.kib_sched_lock);
1966 INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
1967 INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
1968 init_waitqueue_head (&kibnal_data.kib_sched_waitq);
1970 spin_lock_init (&kibnal_data.kib_tx_lock);
1971 INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
1972 INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
1973 init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
1975 PORTAL_ALLOC (kibnal_data.kib_tx_descs,
1976 IBNAL_TX_MSGS * sizeof(kib_tx_t));
1977 if (kibnal_data.kib_tx_descs == NULL) {
1978 CERROR ("Can't allocate tx descs\n");
1982 /* lists/ptrs/locks initialised */
1983 kibnal_data.kib_init = IBNAL_INIT_DATA;
1984 /*****************************************************/
1987 process_id.pid = requested_pid;
1988 process_id.nid = PTL_NID_ANY; /* don't know my NID yet */
1990 rc = lib_init(&kibnal_lib, nal, process_id,
1991 requested_limits, actual_limits);
1993 CERROR("lib_init failed: error %d\n", rc);
1997 /* lib interface initialised */
1998 kibnal_data.kib_init = IBNAL_INIT_LIB;
1999 /*****************************************************/
2001 for (i = 0; i < IBNAL_N_SCHED; i++) {
2002 rc = kibnal_thread_start (kibnal_scheduler,
2003 (void *)((unsigned long)i));
2005 CERROR("Can't spawn openibnal scheduler[%d]: %d\n",
2011 for (i = 0; i < IBNAL_N_CONND; i++) {
2012 rc = kibnal_thread_start (kibnal_connd,
2013 (void *)((unsigned long)i));
2015 CERROR("Can't spawn openibnal connd[%d]: %d\n",
2021 rc = kibnal_thread_start (kibnal_reaper, NULL);
2023 CERROR ("Can't spawn openibnal reaper: %d\n", rc);
2027 kibnal_data.kib_device = ib_device_get_by_index(0);
2028 if (kibnal_data.kib_device == NULL) {
2029 CERROR ("Can't open ib device 0\n");
2033 rc = ib_device_properties_get(kibnal_data.kib_device,
2034 &kibnal_data.kib_device_props);
2036 CERROR ("Can't get device props: %d\n", rc);
2040 CDEBUG(D_NET, "Max Initiator: %d Max Responder %d\n",
2041 kibnal_data.kib_device_props.max_initiator_per_qp,
2042 kibnal_data.kib_device_props.max_responder_per_qp);
2044 kibnal_data.kib_port = 0;
2045 for (i = 1; i <= 2; i++) {
2046 rc = ib_port_properties_get(kibnal_data.kib_device, i,
2047 &kibnal_data.kib_port_props);
2049 kibnal_data.kib_port = i;
2053 if (kibnal_data.kib_port == 0) {
2054 CERROR ("Can't find a port\n");
2058 rc = ib_pd_create(kibnal_data.kib_device,
2059 NULL, &kibnal_data.kib_pd);
2061 CERROR ("Can't create PD: %d\n", rc);
2065 /* flag PD initialised */
2066 kibnal_data.kib_init = IBNAL_INIT_PD;
2067 /*****************************************************/
2070 const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
2071 struct ib_fmr_pool_param params = {
2072 .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
2073 .access = (IB_ACCESS_LOCAL_WRITE |
2074 IB_ACCESS_REMOTE_WRITE |
2075 IB_ACCESS_REMOTE_READ),
2076 .pool_size = pool_size,
2077 .dirty_watermark = (pool_size * 3)/4,
2078 .flush_function = NULL,
2082 rc = ib_fmr_pool_create(kibnal_data.kib_pd, ¶ms,
2083 &kibnal_data.kib_fmr_pool);
2085 CERROR ("Can't create FMR pool size %d: %d\n",
2091 /* flag FMR pool initialised */
2092 kibnal_data.kib_init = IBNAL_INIT_FMR;
2094 /*****************************************************/
2096 rc = kibnal_setup_tx_descs();
2098 CERROR ("Can't register tx descs: %d\n", rc);
2102 /* flag TX descs initialised */
2103 kibnal_data.kib_init = IBNAL_INIT_TXD;
2104 /*****************************************************/
2107 struct ib_cq_callback callback = {
2108 .context = IBNAL_CALLBACK_CTXT,
2109 .policy = IB_CQ_PROVIDER_REARM,
2111 .entry = kibnal_callback,
2115 int nentries = IBNAL_CQ_ENTRIES;
2117 rc = ib_cq_create (kibnal_data.kib_device,
2118 &nentries, &callback, NULL,
2119 &kibnal_data.kib_cq);
2121 CERROR ("Can't create CQ: %d\n", rc);
2125 /* I only want solicited events */
2126 rc = ib_cq_request_notification(kibnal_data.kib_cq, 1);
2130 /* flag CQ initialised */
2131 kibnal_data.kib_init = IBNAL_INIT_CQ;
2132 /*****************************************************/
2134 rc = libcfs_nal_cmd_register(OPENIBNAL, &kibnal_cmd, NULL);
2136 CERROR ("Can't initialise command interface (rc = %d)\n", rc);
2140 /* flag everything initialised */
2141 kibnal_data.kib_init = IBNAL_INIT_ALL;
2142 /*****************************************************/
2144 printk(KERN_INFO "Lustre: OpenIB NAL loaded "
2145 "(initial mem %d)\n", pkmem);
2150 kibnal_api_shutdown (&kibnal_api);
2155 kibnal_module_fini (void)
2157 if (kibnal_tunables.kib_sysctl != NULL)
2158 unregister_sysctl_table (kibnal_tunables.kib_sysctl);
2159 PtlNIFini(kibnal_ni);
2161 ptl_unregister_nal(OPENIBNAL);
2165 kibnal_module_init (void)
2169 /* the following must be sizeof(int) for proc_dointvec() */
2170 LASSERT (sizeof(kibnal_tunables.kib_io_timeout) == sizeof(int));
2171 LASSERT (sizeof(kibnal_tunables.kib_listener_timeout) == sizeof(int));
2172 LASSERT (sizeof(kibnal_tunables.kib_backlog) == sizeof(int));
2173 LASSERT (sizeof(kibnal_tunables.kib_port) == sizeof(int));
2175 kibnal_api.nal_ni_init = kibnal_api_startup;
2176 kibnal_api.nal_ni_fini = kibnal_api_shutdown;
2178 /* Initialise dynamic tunables to defaults once only */
2179 kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
2180 kibnal_tunables.kib_listener_timeout = IBNAL_LISTENER_TIMEOUT;
2181 kibnal_tunables.kib_backlog = IBNAL_BACKLOG;
2182 kibnal_tunables.kib_port = IBNAL_PORT;
2184 rc = ptl_register_nal(OPENIBNAL, &kibnal_api);
2186 CERROR("Can't register IBNAL: %d\n", rc);
2187 return (-ENOMEM); /* or something... */
2190 /* Pure gateways want the NAL started up at module load time... */
2191 rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
2192 if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
2193 ptl_unregister_nal(OPENIBNAL);
2197 kibnal_tunables.kib_sysctl =
2198 register_sysctl_table (kibnal_top_ctl_table, 0);
2199 if (kibnal_tunables.kib_sysctl == NULL) {
2200 CERROR("Can't register sysctl table\n");
2201 PtlNIFini(kibnal_ni);
2202 ptl_unregister_nal(OPENIBNAL);
2209 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
2210 MODULE_DESCRIPTION("Kernel OpenIB NAL v0.01");
2211 MODULE_LICENSE("GPL");
2213 module_init(kibnal_module_init);
2214 module_exit(kibnal_module_fini);