X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Fklnds%2Fsocklnd%2Fsocklnd.c;h=32bbbec5262d2597ea91ee5c23c062a65b91a743;hp=e7232a05a91b9eec42339d1039a394f6cb7273c8;hb=2dc9c16e770415d56839e1996015fec5fab93f29;hpb=c065f52531e335044388b2759712eeecbb1e78e9 diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index e7232a0..32bbbec 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -25,20 +25,53 @@ #include "socknal.h" +nal_t ksocknal_api; +ksock_nal_data_t ksocknal_data; ptl_handle_ni_t ksocknal_ni; -static nal_t ksocknal_api; -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) -ksock_nal_data_t ksocknal_data; -#else -static ksock_nal_data_t ksocknal_data; -#endif +ksock_tunables_t ksocknal_tunables; kpr_nal_interface_t ksocknal_router_interface = { kprni_nalid: SOCKNAL, kprni_arg: &ksocknal_data, kprni_fwd: ksocknal_fwd_packet, + kprni_notify: ksocknal_notify, +}; + +#ifdef CONFIG_SYSCTL +#define SOCKNAL_SYSCTL 200 + +#define SOCKNAL_SYSCTL_TIMEOUT 1 +#define SOCKNAL_SYSCTL_EAGER_ACK 2 +#define SOCKNAL_SYSCTL_ZERO_COPY 3 +#define SOCKNAL_SYSCTL_TYPED 4 +#define SOCKNAL_SYSCTL_MIN_BULK 5 + +static ctl_table ksocknal_ctl_table[] = { + {SOCKNAL_SYSCTL_TIMEOUT, "timeout", + &ksocknal_tunables.ksnd_io_timeout, sizeof (int), + 0644, NULL, &proc_dointvec}, + {SOCKNAL_SYSCTL_EAGER_ACK, "eager_ack", + &ksocknal_tunables.ksnd_eager_ack, sizeof (int), + 0644, NULL, &proc_dointvec}, +#if SOCKNAL_ZC + {SOCKNAL_SYSCTL_ZERO_COPY, "zero_copy", + &ksocknal_tunables.ksnd_zc_min_frag, sizeof (int), + 0644, NULL, &proc_dointvec}, +#endif + {SOCKNAL_SYSCTL_TYPED, "typed", + &ksocknal_tunables.ksnd_typed_conns, sizeof (int), + 0644, NULL, &proc_dointvec}, + {SOCKNAL_SYSCTL_MIN_BULK, "min_bulk", + &ksocknal_tunables.ksnd_min_bulk, sizeof (int), + 0644, NULL, &proc_dointvec}, + { 0 } }; +static ctl_table ksocknal_top_ctl_table[] = { + {SOCKNAL_SYSCTL, "socknal", NULL, 0, 0555, ksocknal_ctl_table}, + { 0 } +}; +#endif int ksocknal_api_forward(nal_t *nal, int id, void *args, size_t args_len, @@ -54,23 +87,6 @@ ksocknal_api_forward(nal_t *nal, int id, void *args, size_t args_len, return PTL_OK; } -int -ksocknal_api_shutdown(nal_t *nal, int ni) -{ - CDEBUG (D_NET, "closing all connections\n"); - - ksocknal_del_route (PTL_NID_ANY, 0, 0, 0); - ksocknal_close_conn (PTL_NID_ANY, 0); - return PTL_OK; -} - -void -ksocknal_api_yield(nal_t *nal) -{ - our_cond_resched(); - return; -} - void ksocknal_api_lock(nal_t *nal, unsigned long *flags) { @@ -93,19 +109,44 @@ ksocknal_api_unlock(nal_t *nal, unsigned long *flags) nal_cb->cb_sti(nal_cb,flags); } -nal_t * -ksocknal_init(int interface, ptl_pt_index_t ptl_size, - ptl_ac_index_t ac_size, ptl_pid_t requested_pid) +int +ksocknal_api_yield(nal_t *nal, unsigned long *flags, int milliseconds) { - CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n", (ptl_nid_t)0); - lib_init(&ksocknal_lib, (ptl_nid_t)0, 0, 10, ptl_size, ac_size); - return (&ksocknal_api); + /* NB called holding statelock */ + wait_queue_t wait; + unsigned long now = jiffies; + + CDEBUG (D_NET, "yield\n"); + + if (milliseconds == 0) { + our_cond_resched(); + return 0; + } + + init_waitqueue_entry(&wait, current); + set_current_state (TASK_INTERRUPTIBLE); + add_wait_queue (&ksocknal_data.ksnd_yield_waitq, &wait); + + ksocknal_api_unlock(nal, flags); + + if (milliseconds < 0) + schedule (); + else + schedule_timeout((milliseconds * HZ) / 1000); + + ksocknal_api_lock(nal, flags); + + remove_wait_queue (&ksocknal_data.ksnd_yield_waitq, 
&wait); + + if (milliseconds > 0) { + milliseconds -= ((jiffies - now) * 1000) / HZ; + if (milliseconds < 0) + milliseconds = 0; + } + + return (milliseconds); } -/* - * EXTRA functions follow - */ - int ksocknal_set_mynid(ptl_nid_t nid) { @@ -160,19 +201,19 @@ ksocknal_bind_irq (unsigned int irq) snprintf (cmdline, sizeof (cmdline), "echo %d > /proc/irq/%u/smp_affinity", 1 << info->ksni_sched, irq); - printk (KERN_INFO "Binding irq %u to CPU %d with cmd: %s\n", + printk (KERN_INFO "Lustre: Binding irq %u to CPU %d with cmd: %s\n", irq, info->ksni_sched, cmdline); /* FIXME: Find a better method of setting IRQ affinity... */ - call_usermodehelper (argv[0], argv, envp); + USERMODEHELPER(argv[0], argv, envp); #endif } ksock_route_t * ksocknal_create_route (__u32 ipaddr, int port, int buffer_size, - int irq_affinity, int xchange_nids, int nonagel) + int irq_affinity, int eager) { ksock_route_t *route; @@ -183,18 +224,17 @@ ksocknal_create_route (__u32 ipaddr, int port, int buffer_size, atomic_set (&route->ksnr_refcount, 1); route->ksnr_sharecount = 0; route->ksnr_peer = NULL; - route->ksnr_timeout = jiffies_64; + route->ksnr_timeout = jiffies; route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL; route->ksnr_ipaddr = ipaddr; route->ksnr_port = port; route->ksnr_buffer_size = buffer_size; route->ksnr_irq_affinity = irq_affinity; - route->ksnr_xchange_nids = xchange_nids; - route->ksnr_nonagel = nonagel; + route->ksnr_eager = eager; route->ksnr_connecting = 0; + route->ksnr_connected = 0; route->ksnr_deleted = 0; - route->ksnr_generation = 0; - route->ksnr_conn = NULL; + route->ksnr_conn_count = 0; return (route); } @@ -203,7 +243,6 @@ void ksocknal_destroy_route (ksock_route_t *route) { LASSERT (route->ksnr_sharecount == 0); - LASSERT (route->ksnr_conn == NULL); if (route->ksnr_peer != NULL) ksocknal_put_peer (route->ksnr_peer); @@ -214,9 +253,8 @@ ksocknal_destroy_route (ksock_route_t *route) void ksocknal_put_route (ksock_route_t *route) { - CDEBUG (D_OTHER, "putting route[%p] -> "LPX64" (%d)\n", - route, route->ksnr_peer->ksnp_nid, - atomic_read (&route->ksnr_refcount)); + CDEBUG (D_OTHER, "putting route[%p] (%d)\n", + route, atomic_read (&route->ksnr_refcount)); LASSERT (atomic_read (&route->ksnr_refcount) > 0); if (!atomic_dec_and_test (&route->ksnr_refcount)) @@ -245,9 +283,6 @@ ksocknal_create_peer (ptl_nid_t nid) INIT_LIST_HEAD (&peer->ksnp_routes); INIT_LIST_HEAD (&peer->ksnp_tx_queue); - /* Can't unload while peers exist; ensures all I/O has terminated - * before unload attempts */ - PORTAL_MODULE_USE; atomic_inc (&ksocknal_data.ksnd_npeers); return (peer); } @@ -269,7 +304,6 @@ ksocknal_destroy_peer (ksock_peer_t *peer) * that _all_ state to do with this peer has been cleaned up when * its refcount drops to zero. 
*/ atomic_dec (&ksocknal_data.ksnd_npeers); - PORTAL_MODULE_UNUSE; } void @@ -371,7 +405,7 @@ ksocknal_get_route_by_idx (int index) int ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob, - int nonagle, int xchange_nids, int bind_irq, int share) + int bind_irq, int share, int eager) { unsigned long flags; ksock_peer_t *peer; @@ -388,8 +422,8 @@ ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob, if (peer == NULL) return (-ENOMEM); - route = ksocknal_create_route (ipaddr, port, bufnob, - nonagle, xchange_nids, bind_irq); + route = ksocknal_create_route (ipaddr, port, bufnob, + bind_irq, eager); if (route == NULL) { ksocknal_put_peer (peer); return (-ENOMEM); @@ -428,7 +462,7 @@ ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob, route->ksnr_peer = peer; atomic_inc (&peer->ksnp_refcount); /* peer's route list takes existing ref on route */ - list_add (&route->ksnr_list, &peer->ksnp_routes); + list_add_tail (&route->ksnr_list, &peer->ksnp_routes); } route->ksnr_sharecount++; @@ -441,8 +475,10 @@ ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob, void ksocknal_del_route_locked (ksock_route_t *route, int share, int keep_conn) { - ksock_peer_t *peer = route->ksnr_peer; - ksock_conn_t *conn = route->ksnr_conn; + ksock_peer_t *peer = route->ksnr_peer; + ksock_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; if (!share) route->ksnr_sharecount = 0; @@ -452,18 +488,22 @@ ksocknal_del_route_locked (ksock_route_t *route, int share, int keep_conn) return; } - if (conn != NULL) { - if (!keep_conn) - ksocknal_close_conn_locked (conn); - else { - /* keeping the conn; just dissociate it and route... */ - conn->ksnc_route = NULL; - route->ksnr_conn = NULL; - ksocknal_put_route (route); /* drop conn's ref on route */ - ksocknal_put_conn (conn); /* drop route's ref on conn */ + list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) { + conn = list_entry(ctmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_route != route) + continue; + + if (!keep_conn) { + ksocknal_close_conn_locked (conn, 0); + continue; } + + /* keeping the conn; just dissociate it and route... */ + conn->ksnc_route = NULL; + ksocknal_put_route (route); /* drop conn's ref on route */ } - + route->ksnr_deleted = 1; list_del (&route->ksnr_list); ksocknal_put_route (route); /* drop peer's ref */ @@ -568,14 +608,12 @@ ksocknal_get_peer_addr (ksock_conn_t *conn) struct sockaddr_in sin; int len = sizeof (sin); int rc; - - rc = ksocknal_getconnsock (conn); - LASSERT (rc == 0); rc = conn->ksnc_sock->ops->getname (conn->ksnc_sock, (struct sockaddr *)&sin, &len, 2); + /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */ + LASSERT (!conn->ksnc_closing); LASSERT (len <= sizeof (sin)); - ksocknal_putconnsock (conn); if (rc != 0) { CERROR ("Error %d getting sock peer IP\n", rc); @@ -590,12 +628,8 @@ unsigned int ksocknal_conn_irq (ksock_conn_t *conn) { int irq = 0; - int rc; struct dst_entry *dst; - rc = ksocknal_getconnsock (conn); - LASSERT (rc == 0); - dst = sk_dst_get (conn->ksnc_sock->sk); if (dst != NULL) { if (dst->dev != NULL) { @@ -608,7 +642,8 @@ ksocknal_conn_irq (ksock_conn_t *conn) dst_release (dst); } - ksocknal_putconnsock (conn); + /* Didn't need the {get,put}connsock dance to deref ksnc_sock... 
*/ + LASSERT (!conn->ksnc_closing); return (irq); } @@ -647,9 +682,11 @@ ksocknal_choose_scheduler_locked (unsigned int irq) } int -ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, - struct socket *sock, int bind_irq) +ksocknal_create_conn (ksock_route_t *route, struct socket *sock, + int bind_irq, int type) { + ptl_nid_t nid; + __u64 incarnation; unsigned long flags; ksock_conn_t *conn; ksock_peer_t *peer; @@ -660,14 +697,29 @@ ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, int rc; /* NB, sock has an associated file since (a) this connection might - * have been created in userland and (b) we need the refcounting so - * that we don't close the socket while I/O is being done on it. */ + * have been created in userland and (b) we need to refcount the + * socket so that we don't close it while I/O is being done on + * it, and sock->file has that pre-cooked... */ LASSERT (sock->file != NULL); + LASSERT (file_count(sock->file) > 0); - rc = ksocknal_set_linger (sock); + rc = ksocknal_setup_sock (sock); if (rc != 0) return (rc); + if (route == NULL) { + /* acceptor or explicit connect */ + nid = PTL_NID_ANY; + } else { + LASSERT (type != SOCKNAL_CONN_NONE); + /* autoconnect: expect this nid on exchange */ + nid = route->ksnr_peer->ksnp_nid; + } + + rc = ksocknal_hello (sock, &nid, &type, &incarnation); + if (rc != 0) + return (rc); + peer = NULL; if (route == NULL) { /* not autoconnect */ /* Assume this socket connects to a brand new peer */ @@ -687,6 +739,8 @@ ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, conn->ksnc_peer = NULL; conn->ksnc_route = NULL; conn->ksnc_sock = sock; + conn->ksnc_type = type; + conn->ksnc_incarnation = incarnation; conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; conn->ksnc_saved_write_space = sock->sk->sk_write_space; atomic_set (&conn->ksnc_refcount, 1); /* 1 ref for me */ @@ -696,22 +750,23 @@ ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, ksocknal_new_packet (conn, 0); INIT_LIST_HEAD (&conn->ksnc_tx_queue); -#if SOCKNAL_ZC - INIT_LIST_HEAD (&conn->ksnc_tx_pending); -#endif conn->ksnc_tx_ready = 0; conn->ksnc_tx_scheduled = 0; atomic_set (&conn->ksnc_tx_nob, 0); ksocknal_get_peer_addr (conn); + CWARN("New conn nid:"LPX64" ip:%08x/%d incarnation:"LPX64"\n", + nid, conn->ksnc_ipaddr, conn->ksnc_port, incarnation); + irq = ksocknal_conn_irq (conn); write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); if (route != NULL) { /* Autoconnected! 
*/ - LASSERT (route->ksnr_conn == NULL && route->ksnr_connecting); + LASSERT ((route->ksnr_connected & (1 << type)) == 0); + LASSERT ((route->ksnr_connecting & (1 << type)) != 0); if (route->ksnr_deleted) { /* This conn was autoconnected, but the autoconnect @@ -724,14 +779,13 @@ ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, } - /* associate conn/route for auto-reconnect */ - route->ksnr_conn = conn; - atomic_inc (&conn->ksnc_refcount); + /* associate conn/route */ conn->ksnc_route = route; atomic_inc (&route->ksnr_refcount); - route->ksnr_connecting = 0; - route->ksnr_generation++; + route->ksnr_connecting &= ~(1 << type); + route->ksnr_connected |= (1 << type); + route->ksnr_conn_count++; route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL; peer = route->ksnr_peer; @@ -753,6 +807,12 @@ ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, conn->ksnc_peer = peer; atomic_inc (&peer->ksnp_refcount); + peer->ksnp_last_alive = jiffies; + peer->ksnp_error = 0; + + /* Set the deadline for the outgoing HELLO to drain */ + conn->ksnc_tx_deadline = jiffies + + ksocknal_tunables.ksnd_io_timeout * HZ; list_add (&conn->ksnc_list, &peer->ksnp_conns); atomic_inc (&conn->ksnc_refcount); @@ -780,8 +840,15 @@ ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, ksocknal_queue_tx_locked (tx, conn); } + rc = ksocknal_close_stale_conns_locked (peer, incarnation); + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + if (rc != 0) + CERROR ("Closed %d stale conns to nid "LPX64" ip %d.%d.%d.%d\n", + rc, conn->ksnc_peer->ksnp_nid, + HIPQUAD(conn->ksnc_ipaddr)); + if (bind_irq) /* irq binding required */ ksocknal_bind_irq (irq); @@ -789,22 +856,23 @@ ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, ksocknal_data_ready (sock->sk, 0); ksocknal_write_space (sock->sk); - CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64"\n", - conn, conn->ksnc_peer->ksnp_nid); + CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64" ip %d.%d.%d.%d\n", + conn, conn->ksnc_peer->ksnp_nid, HIPQUAD(conn->ksnc_ipaddr)); ksocknal_put_conn (conn); return (0); } void -ksocknal_close_conn_locked (ksock_conn_t *conn) +ksocknal_close_conn_locked (ksock_conn_t *conn, int error) { /* This just does the immmediate housekeeping, and queues the - * connection for the reaper to terminate. + * connection for the reaper to terminate. * Caller holds ksnd_global_lock exclusively in irq context */ ksock_peer_t *peer = conn->ksnc_peer; ksock_route_t *route; + LASSERT (peer->ksnp_error == 0); LASSERT (!conn->ksnc_closing); conn->ksnc_closing = 1; atomic_inc (&ksocknal_data.ksnd_nclosing_conns); @@ -812,53 +880,42 @@ ksocknal_close_conn_locked (ksock_conn_t *conn) route = conn->ksnc_route; if (route != NULL) { /* dissociate conn from route... 
*/ - LASSERT (!route->ksnr_connecting && - !route->ksnr_deleted); + LASSERT (!route->ksnr_deleted); + LASSERT ((route->ksnr_connecting & (1 << conn->ksnc_type)) == 0); + LASSERT ((route->ksnr_connected & (1 << conn->ksnc_type)) != 0); - route->ksnr_conn = NULL; + route->ksnr_connected &= ~(1 << conn->ksnc_type); conn->ksnc_route = NULL; + list_del (&route->ksnr_list); /* make route least favourite */ + list_add_tail (&route->ksnr_list, &peer->ksnp_routes); + ksocknal_put_route (route); /* drop conn's ref on route */ - ksocknal_put_conn (conn); /* drop route's ref on conn */ } /* ksnd_deathrow_conns takes over peer's ref */ list_del (&conn->ksnc_list); - if (list_empty (&peer->ksnp_conns) && - list_empty (&peer->ksnp_routes)) { - /* I've just closed last conn belonging to a - * non-autoconnecting peer */ - ksocknal_unlink_peer_locked (peer); + if (list_empty (&peer->ksnp_conns)) { + /* No more connections to this peer */ + + peer->ksnp_error = error; /* stash last conn close reason */ + + if (list_empty (&peer->ksnp_routes)) { + /* I've just closed last conn belonging to a + * non-autoconnecting peer */ + ksocknal_unlink_peer_locked (peer); + } } spin_lock (&ksocknal_data.ksnd_reaper_lock); list_add_tail (&conn->ksnc_list, &ksocknal_data.ksnd_deathrow_conns); - if (waitqueue_active (&ksocknal_data.ksnd_reaper_waitq)) - wake_up (&ksocknal_data.ksnd_reaper_waitq); + wake_up (&ksocknal_data.ksnd_reaper_waitq); spin_unlock (&ksocknal_data.ksnd_reaper_lock); } -int -ksocknal_close_conn_unlocked (ksock_conn_t *conn) -{ - unsigned long flags; - int did_it = 0; - - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); - - if (!conn->ksnc_closing) { - did_it = 1; - ksocknal_close_conn_locked (conn); - } - - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); - - return (did_it); -} - void ksocknal_terminate_conn (ksock_conn_t *conn) { @@ -867,12 +924,35 @@ ksocknal_terminate_conn (ksock_conn_t *conn) * ksnc_refcount will eventually hit zero, and then the reaper will * destroy it. */ unsigned long flags; + ksock_peer_t *peer = conn->ksnc_peer; + ksock_sched_t *sched = conn->ksnc_scheduler; + struct timeval now; + time_t then = 0; + int notify = 0; + + LASSERT(conn->ksnc_closing); + + /* wake up the scheduler to "send" all remaining packets to /dev/null */ + spin_lock_irqsave(&sched->kss_lock, flags); + + if (!conn->ksnc_tx_scheduled && + !list_empty(&conn->ksnc_tx_queue)){ + list_add_tail (&conn->ksnc_tx_list, + &sched->kss_tx_conns); + /* a closing conn is always ready to tx */ + conn->ksnc_tx_ready = 1; + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + atomic_inc (&conn->ksnc_refcount); + + wake_up (&sched->kss_waitq); + } + + spin_unlock_irqrestore (&sched->kss_lock, flags); /* serialise with callbacks */ write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); - LASSERT (conn->ksnc_closing); - /* Remove conn's network callbacks. * NB I _have_ to restore the callback, rather than storing a noop, * since the socket could survive past this module being unloaded!! */ @@ -884,8 +964,20 @@ ksocknal_terminate_conn (ksock_conn_t *conn) * sk_user_data is NULL. */ conn->ksnc_sock->sk->sk_user_data = NULL; + /* OK, so this conn may not be completely disengaged from its + * scheduler yet, but it _has_ committed to terminate... 
*/ conn->ksnc_scheduler->kss_nconns--; + if (peer->ksnp_error != 0) { + /* peer's last conn closed in error */ + LASSERT (list_empty (&peer->ksnp_conns)); + + /* convert peer's last-known-alive timestamp from jiffies */ + do_gettimeofday (&now); + then = now.tv_sec - (jiffies - peer->ksnp_last_alive)/HZ; + notify = 1; + } + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); /* The socket is closed on the final put; either here, or in @@ -894,6 +986,10 @@ ksocknal_terminate_conn (ksock_conn_t *conn) * immediately, aborting anything buffered in it. Any hung * zero-copy transmits will therefore complete in finite time. */ ksocknal_putconnsock (conn); + + if (notify) + kpr_notify (&ksocknal_data.ksnd_router, peer->ksnp_nid, + 0, then); } void @@ -906,28 +1002,16 @@ ksocknal_destroy_conn (ksock_conn_t *conn) LASSERT (conn->ksnc_route == NULL); LASSERT (!conn->ksnc_tx_scheduled); LASSERT (!conn->ksnc_rx_scheduled); -#if SOCKNAL_ZC - LASSERT (list_empty (&conn->ksnc_tx_pending)); -#endif - /* complete queued packets */ - while (!list_empty (&conn->ksnc_tx_queue)) { - ksock_tx_t *tx = list_entry (conn->ksnc_tx_queue.next, - ksock_tx_t, tx_list); - - CERROR ("Deleting packet type %d len %d ("LPX64"->"LPX64")\n", - NTOH__u32 (tx->tx_hdr->type), - NTOH__u32 (PTL_HDR_LENGTH(tx->tx_hdr)), - NTOH__u64 (tx->tx_hdr->src_nid), - NTOH__u64 (tx->tx_hdr->dest_nid)); - - list_del (&tx->tx_list); - ksocknal_tx_done (tx, 0); - } + LASSERT (list_empty(&conn->ksnc_tx_queue)); /* complete current receive if any */ switch (conn->ksnc_rx_state) { case SOCKNAL_RX_BODY: - lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie); + CERROR("Completing partial receive from "LPX64 + ", ip %d.%d.%d.%d:%d, with error\n", + conn->ksnc_peer->ksnp_nid, + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); + lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_FAIL); break; case SOCKNAL_RX_BODY_FWD: ksocknal_fmb_callback (conn->ksnc_cookie, -ECONNABORTED); @@ -962,26 +1046,86 @@ ksocknal_put_conn (ksock_conn_t *conn) spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); list_add (&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns); - if (waitqueue_active (&ksocknal_data.ksnd_reaper_waitq)) - wake_up (&ksocknal_data.ksnd_reaper_waitq); + wake_up (&ksocknal_data.ksnd_reaper_waitq); spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); } int -ksocknal_close_conn (ptl_nid_t nid, __u32 ipaddr) +ksocknal_close_peer_conns_locked (ksock_peer_t *peer, __u32 ipaddr, int why) { - unsigned long flags; ksock_conn_t *conn; struct list_head *ctmp; struct list_head *cnxt; + int count = 0; + + list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) { + conn = list_entry (ctmp, ksock_conn_t, ksnc_list); + + if (ipaddr == 0 || + conn->ksnc_ipaddr == ipaddr) { + count++; + ksocknal_close_conn_locked (conn, why); + } + } + + return (count); +} + +int +ksocknal_close_stale_conns_locked (ksock_peer_t *peer, __u64 incarnation) +{ + ksock_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) { + conn = list_entry (ctmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_incarnation == incarnation) + continue; + + CWARN("Closing stale conn nid:"LPX64" ip:%08x/%d " + "incarnation:"LPX64"("LPX64")\n", + peer->ksnp_nid, conn->ksnc_ipaddr, conn->ksnc_port, + conn->ksnc_incarnation, incarnation); + + count++; + ksocknal_close_conn_locked (conn, -ESTALE); + } + + return (count); +} + +int +ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why) +{ + 
ksock_peer_t *peer = conn->ksnc_peer; + __u32 ipaddr = conn->ksnc_ipaddr; + unsigned long flags; + int count; + + write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + + count = ksocknal_close_peer_conns_locked (peer, ipaddr, why); + + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + + return (count); +} + +int +ksocknal_close_matching_conns (ptl_nid_t nid, __u32 ipaddr) +{ + unsigned long flags; ksock_peer_t *peer; struct list_head *ptmp; struct list_head *pnxt; int lo; int hi; int i; - int rc = -ENOENT; + int count = 0; write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); @@ -1000,24 +1144,35 @@ ksocknal_close_conn (ptl_nid_t nid, __u32 ipaddr) if (!(nid == PTL_NID_ANY || nid == peer->ksnp_nid)) continue; - list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) { - - conn = list_entry (ctmp, ksock_conn_t, - ksnc_list); - - if (!(ipaddr == 0 || - conn->ksnc_ipaddr == ipaddr)) - continue; - - rc = 0; - ksocknal_close_conn_locked (conn); - } + count += ksocknal_close_peer_conns_locked (peer, ipaddr, 0); } } write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); - return (rc); + /* wildcards always succeed */ + if (nid == PTL_NID_ANY || ipaddr == 0) + return (0); + + return (count == 0 ? -ENOENT : 0); +} + +void +ksocknal_notify (void *arg, ptl_nid_t gw_nid, int alive) +{ + /* The router is telling me she's been notified of a change in + * gateway state.... */ + + CDEBUG (D_NET, "gw "LPX64" %s\n", gw_nid, alive ? "up" : "down"); + + if (!alive) { + /* If the gateway crashed, close all open connections... */ + ksocknal_close_matching_conns (gw_nid, 0); + return; + } + + /* ...otherwise do nothing. We can only establish new connections + * if we have autroutes, and these connect on demand. */ } #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) @@ -1155,83 +1310,93 @@ ksocknal_push (ptl_nid_t nid) } int -ksocknal_cmd(struct portal_ioctl_data * data, void * private) +ksocknal_cmd(struct portals_cfg *pcfg, void * private) { int rc = -EINVAL; - LASSERT (data != NULL); + LASSERT (pcfg != NULL); - switch(data->ioc_nal_cmd) { + switch(pcfg->pcfg_command) { case NAL_CMD_GET_AUTOCONN: { - ksock_route_t *route = ksocknal_get_route_by_idx (data->ioc_count); + ksock_route_t *route = ksocknal_get_route_by_idx (pcfg->pcfg_count); if (route == NULL) rc = -ENOENT; else { rc = 0; - data->ioc_nid = route->ksnr_peer->ksnp_nid; - data->ioc_id = route->ksnr_ipaddr; - data->ioc_misc = route->ksnr_port; - data->ioc_count = route->ksnr_generation; - data->ioc_size = route->ksnr_buffer_size; - data->ioc_wait = route->ksnr_sharecount; - data->ioc_flags = (route->ksnr_nonagel ? 1 : 0) | - (route->ksnr_xchange_nids ? 2 : 0) | - (route->ksnr_irq_affinity ? 4 : 0); + pcfg->pcfg_nid = route->ksnr_peer->ksnp_nid; + pcfg->pcfg_id = route->ksnr_ipaddr; + pcfg->pcfg_misc = route->ksnr_port; + pcfg->pcfg_count = route->ksnr_conn_count; + pcfg->pcfg_size = route->ksnr_buffer_size; + pcfg->pcfg_wait = route->ksnr_sharecount; + pcfg->pcfg_flags = (route->ksnr_irq_affinity ? 2 : 0) | + (route->ksnr_eager ? 
4 : 0); ksocknal_put_route (route); } break; } case NAL_CMD_ADD_AUTOCONN: { - rc = ksocknal_add_route (data->ioc_nid, data->ioc_id, - data->ioc_misc, data->ioc_size, - (data->ioc_flags & 1) != 0, - (data->ioc_flags & 2) != 0, - (data->ioc_flags & 4) != 0, - (data->ioc_flags & 8) != 0); + rc = ksocknal_add_route (pcfg->pcfg_nid, pcfg->pcfg_id, + pcfg->pcfg_misc, pcfg->pcfg_size, + (pcfg->pcfg_flags & 0x02) != 0, + (pcfg->pcfg_flags & 0x04) != 0, + (pcfg->pcfg_flags & 0x08) != 0); break; } case NAL_CMD_DEL_AUTOCONN: { - rc = ksocknal_del_route (data->ioc_nid, data->ioc_id, - (data->ioc_flags & 1) != 0, - (data->ioc_flags & 2) != 0); + rc = ksocknal_del_route (pcfg->pcfg_nid, pcfg->pcfg_id, + (pcfg->pcfg_flags & 1) != 0, + (pcfg->pcfg_flags & 2) != 0); break; } case NAL_CMD_GET_CONN: { - ksock_conn_t *conn = ksocknal_get_conn_by_idx (data->ioc_count); + ksock_conn_t *conn = ksocknal_get_conn_by_idx (pcfg->pcfg_count); if (conn == NULL) rc = -ENOENT; else { rc = 0; - data->ioc_nid = conn->ksnc_peer->ksnp_nid; - data->ioc_id = conn->ksnc_ipaddr; - data->ioc_misc = conn->ksnc_port; + pcfg->pcfg_nid = conn->ksnc_peer->ksnp_nid; + pcfg->pcfg_id = conn->ksnc_ipaddr; + pcfg->pcfg_misc = conn->ksnc_port; + pcfg->pcfg_flags = conn->ksnc_type; ksocknal_put_conn (conn); } break; } case NAL_CMD_REGISTER_PEER_FD: { - struct socket *sock = sockfd_lookup (data->ioc_fd, &rc); + struct socket *sock = sockfd_lookup (pcfg->pcfg_fd, &rc); + int type = pcfg->pcfg_misc; - if (sock != NULL) { - rc = ksocknal_create_conn (data->ioc_nid, NULL, - sock, data->ioc_flags); - if (rc != 0) - fput (sock->file); + if (sock == NULL) + break; + + switch (type) { + case SOCKNAL_CONN_NONE: + case SOCKNAL_CONN_ANY: + case SOCKNAL_CONN_CONTROL: + case SOCKNAL_CONN_BULK_IN: + case SOCKNAL_CONN_BULK_OUT: + rc = ksocknal_create_conn(NULL, sock, pcfg->pcfg_flags, type); + default: + break; } + if (rc != 0) + fput (sock->file); break; } case NAL_CMD_CLOSE_CONNECTION: { - rc = ksocknal_close_conn (data->ioc_nid, data->ioc_id); + rc = ksocknal_close_matching_conns (pcfg->pcfg_nid, + pcfg->pcfg_id); break; } case NAL_CMD_REGISTER_MYNID: { - rc = ksocknal_set_mynid (data->ioc_nid); + rc = ksocknal_set_mynid (pcfg->pcfg_nid); break; } case NAL_CMD_PUSH_CONNECTION: { - rc = ksocknal_push (data->ioc_nid); + rc = ksocknal_push (pcfg->pcfg_nid); break; } } @@ -1240,30 +1405,36 @@ ksocknal_cmd(struct portal_ioctl_data * data, void * private) } void -ksocknal_free_buffers (void) +ksocknal_free_fmbs (ksock_fmb_pool_t *p) { - if (ksocknal_data.ksnd_fmbs != NULL) { - ksock_fmb_t *fmb = (ksock_fmb_t *)ksocknal_data.ksnd_fmbs; - int i; - int j; - - for (i = 0; - i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); - i++, fmb++) - for (j = 0; j < fmb->fmb_npages; j++) - if (fmb->fmb_pages[j] != NULL) - __free_page (fmb->fmb_pages[j]); - - PORTAL_FREE (ksocknal_data.ksnd_fmbs, - sizeof (ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + - SOCKNAL_LARGE_FWD_NMSGS)); + int npages = p->fmp_buff_pages; + ksock_fmb_t *fmb; + int i; + + LASSERT (list_empty(&p->fmp_blocked_conns)); + LASSERT (p->fmp_nactive_fmbs == 0); + + while (!list_empty(&p->fmp_idle_fmbs)) { + + fmb = list_entry(p->fmp_idle_fmbs.next, + ksock_fmb_t, fmb_list); + + for (i = 0; i < npages; i++) + if (fmb->fmb_kiov[i].kiov_page != NULL) + __free_page(fmb->fmb_kiov[i].kiov_page); + + list_del(&fmb->fmb_list); + PORTAL_FREE(fmb, offsetof(ksock_fmb_t, fmb_kiov[npages])); } +} + +void +ksocknal_free_buffers (void) +{ + ksocknal_free_fmbs(&ksocknal_data.ksnd_small_fmp); + 
ksocknal_free_fmbs(&ksocknal_data.ksnd_large_fmp); - LASSERT (ksocknal_data.ksnd_active_ltxs == 0); - if (ksocknal_data.ksnd_ltxs != NULL) - PORTAL_FREE (ksocknal_data.ksnd_ltxs, - sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + - SOCKNAL_NNBLK_LTXS)); + LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_ltxs) == 0); if (ksocknal_data.ksnd_schedulers != NULL) PORTAL_FREE (ksocknal_data.ksnd_schedulers, @@ -1274,26 +1445,57 @@ ksocknal_free_buffers (void) ksocknal_data.ksnd_peer_hash_size); } -void /*__exit*/ -ksocknal_module_fini (void) +void +ksocknal_api_shutdown (nal_t *nal) { int i; + if (nal->nal_refct != 0) { + /* This module got the first ref */ + PORTAL_MODULE_UNUSE; + return; + } + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", atomic_read (&portal_kmemory)); + LASSERT(nal == &ksocknal_api); + switch (ksocknal_data.ksnd_init) { default: LASSERT (0); case SOCKNAL_INIT_ALL: - kportal_nal_unregister(SOCKNAL); - PORTAL_SYMBOL_UNREGISTER (ksocknal_ni); + libcfs_nal_cmd_unregister(SOCKNAL); + + ksocknal_data.ksnd_init = SOCKNAL_INIT_LIB; /* fall through */ - case SOCKNAL_INIT_PTL: - PtlNIFini(ksocknal_ni); + case SOCKNAL_INIT_LIB: + /* No more calls to ksocknal_cmd() to create new + * autoroutes/connections since we're being unloaded. */ + + /* Delete all autoroute entries */ + ksocknal_del_route(PTL_NID_ANY, 0, 0, 0); + + /* Delete all connections */ + ksocknal_close_matching_conns (PTL_NID_ANY, 0); + + /* Wait for all peer state to clean up */ + i = 2; + while (atomic_read (&ksocknal_data.ksnd_npeers) != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "waiting for %d peers to disconnect\n", + atomic_read (&ksocknal_data.ksnd_npeers)); + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + + /* Tell lib we've stopped calling into her. */ lib_fini(&ksocknal_lib); + + ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; /* fall through */ case SOCKNAL_INIT_DATA: @@ -1304,6 +1506,7 @@ ksocknal_module_fini (void) for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { LASSERT (list_empty (&ksocknal_data.ksnd_peers[i])); } + LASSERT (list_empty (&ksocknal_data.ksnd_enomem_conns)); LASSERT (list_empty (&ksocknal_data.ksnd_zombie_conns)); LASSERT (list_empty (&ksocknal_data.ksnd_autoconnectd_routes)); LASSERT (list_empty (&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns)); @@ -1340,6 +1543,8 @@ ksocknal_module_fini (void) kpr_deregister (&ksocknal_data.ksnd_router); ksocknal_free_buffers(); + + ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING; /* fall through */ case SOCKNAL_INIT_NOTHING: @@ -1349,41 +1554,59 @@ ksocknal_module_fini (void) CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", atomic_read (&portal_kmemory)); - printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n", + printk(KERN_INFO "Lustre: Routing socket NAL unloaded (final mem %d)\n", atomic_read(&portal_kmemory)); } -int __init -ksocknal_module_init (void) +void +ksocknal_init_incarnation (void) { - int pkmem = atomic_read(&portal_kmemory); - int rc; - int i; - int j; + struct timeval tv; - /* packet descriptor must fit in a router descriptor's scratchpad */ - LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t)); + /* The incarnation number is the time this module loaded and it + * identifies this particular instance of the socknal. 
Hopefully + * we won't be able to reboot more frequently than 1MHz for the + * forseeable future :) */ + + do_gettimeofday(&tv); + + ksocknal_data.ksnd_incarnation = + (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; +} - LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); +int +ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, + ptl_ni_limits_t *requested_limits, + ptl_ni_limits_t *actual_limits) +{ + ptl_process_id_t process_id; + int pkmem = atomic_read(&portal_kmemory); + int rc; + int i; + int j; - ksocknal_api.forward = ksocknal_api_forward; - ksocknal_api.shutdown = ksocknal_api_shutdown; - ksocknal_api.yield = ksocknal_api_yield; - ksocknal_api.validate = NULL; /* our api validate is a NOOP */ - ksocknal_api.lock = ksocknal_api_lock; - ksocknal_api.unlock = ksocknal_api_unlock; - ksocknal_api.nal_data = &ksocknal_data; + LASSERT (nal == &ksocknal_api); - ksocknal_lib.nal_data = &ksocknal_data; + if (nal->nal_refct != 0) { + if (actual_limits != NULL) + *actual_limits = ksocknal_lib.ni.actual_limits; + /* This module got the first ref */ + PORTAL_MODULE_USE; + return (PTL_OK); + } + + LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */ + ksocknal_init_incarnation(); + ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE; PORTAL_ALLOC (ksocknal_data.ksnd_peers, sizeof (struct list_head) * ksocknal_data.ksnd_peer_hash_size); if (ksocknal_data.ksnd_peers == NULL) - RETURN (-ENOMEM); + return (-ENOMEM); for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]); @@ -1392,21 +1615,20 @@ ksocknal_module_init (void) ksocknal_data.ksnd_nal_cb = &ksocknal_lib; spin_lock_init (&ksocknal_data.ksnd_nal_cb_lock); - + init_waitqueue_head(&ksocknal_data.ksnd_yield_waitq); + spin_lock_init(&ksocknal_data.ksnd_small_fmp.fmp_lock); INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_idle_fmbs); INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns); + ksocknal_data.ksnd_small_fmp.fmp_buff_pages = SOCKNAL_SMALL_FWD_PAGES; spin_lock_init(&ksocknal_data.ksnd_large_fmp.fmp_lock); INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_idle_fmbs); INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns); - - spin_lock_init(&ksocknal_data.ksnd_idle_ltx_lock); - INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_nblk_ltx_list); - INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_ltx_list); - init_waitqueue_head(&ksocknal_data.ksnd_idle_ltx_waitq); + ksocknal_data.ksnd_large_fmp.fmp_buff_pages = SOCKNAL_LARGE_FWD_PAGES; spin_lock_init (&ksocknal_data.ksnd_reaper_lock); + INIT_LIST_HEAD (&ksocknal_data.ksnd_enomem_conns); INIT_LIST_HEAD (&ksocknal_data.ksnd_zombie_conns); INIT_LIST_HEAD (&ksocknal_data.ksnd_deathrow_conns); init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq); @@ -1424,8 +1646,8 @@ ksocknal_module_init (void) PORTAL_ALLOC(ksocknal_data.ksnd_schedulers, sizeof(ksock_sched_t) * SOCKNAL_N_SCHED); if (ksocknal_data.ksnd_schedulers == NULL) { - ksocknal_module_fini (); - RETURN(-ENOMEM); + ksocknal_api_shutdown (&ksocknal_api); + return (-ENOMEM); } for (i = 0; i < SOCKNAL_N_SCHED; i++) { @@ -1440,39 +1662,19 @@ ksocknal_module_init (void) init_waitqueue_head (&kss->kss_waitq); } - CDEBUG (D_MALLOC, "ltx "LPSZ", total "LPSZ"\n", sizeof (ksock_ltx_t), - sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); - - PORTAL_ALLOC(ksocknal_data.ksnd_ltxs, - sizeof(ksock_ltx_t) * (SOCKNAL_NLTXS +SOCKNAL_NNBLK_LTXS)); - if (ksocknal_data.ksnd_ltxs == NULL) { - 
ksocknal_module_fini (); - return (-ENOMEM); - } - - /* Deterministic bugs please */ - memset (ksocknal_data.ksnd_ltxs, 0xeb, - sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); - - for (i = 0; i < SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS; i++) { - ksock_ltx_t *ltx = &((ksock_ltx_t *)ksocknal_data.ksnd_ltxs)[i]; - - ltx->ltx_tx.tx_hdr = <x->ltx_hdr; - ltx->ltx_idle = i < SOCKNAL_NLTXS ? - &ksocknal_data.ksnd_idle_ltx_list : - &ksocknal_data.ksnd_idle_nblk_ltx_list; - list_add (<x->ltx_tx.tx_list, ltx->ltx_idle); - } - - rc = PtlNIInit(ksocknal_init, 32, 4, 0, &ksocknal_ni); - if (rc != 0) { - CERROR("ksocknal: PtlNIInit failed: error %d\n", rc); - ksocknal_module_fini (); - RETURN (rc); + /* NB we have to wait to be told our true NID... */ + process_id.pid = 0; + process_id.nid = 0; + + rc = lib_init(&ksocknal_lib, process_id, + requested_limits, actual_limits); + if (rc != PTL_OK) { + CERROR("lib_init failed: error %d\n", rc); + ksocknal_api_shutdown (&ksocknal_api); + return (rc); } - PtlNIDebug(ksocknal_ni, ~0); - ksocknal_data.ksnd_init = SOCKNAL_INIT_PTL; // flag PtlNIInit() called + ksocknal_data.ksnd_init = SOCKNAL_INIT_LIB; // flag lib_init() called for (i = 0; i < SOCKNAL_N_SCHED; i++) { rc = ksocknal_thread_start (ksocknal_scheduler, @@ -1480,8 +1682,8 @@ ksocknal_module_init (void) if (rc != 0) { CERROR("Can't spawn socknal scheduler[%d]: %d\n", i, rc); - ksocknal_module_fini (); - RETURN (rc); + ksocknal_api_shutdown (&ksocknal_api); + return (rc); } } @@ -1489,16 +1691,16 @@ ksocknal_module_init (void) rc = ksocknal_thread_start (ksocknal_autoconnectd, (void *)((long)i)); if (rc != 0) { CERROR("Can't spawn socknal autoconnectd: %d\n", rc); - ksocknal_module_fini (); - RETURN (rc); + ksocknal_api_shutdown (&ksocknal_api); + return (rc); } } rc = ksocknal_thread_start (ksocknal_reaper, NULL); if (rc != 0) { CERROR ("Can't spawn socknal reaper: %d\n", rc); - ksocknal_module_fini (); - RETURN (rc); + ksocknal_api_shutdown (&ksocknal_api); + return (rc); } rc = kpr_register(&ksocknal_data.ksnd_router, @@ -1507,68 +1709,127 @@ ksocknal_module_init (void) CDEBUG(D_NET, "Can't initialise routing interface " "(rc = %d): not routing\n", rc); } else { - /* Only allocate forwarding buffers if I'm on a gateway */ - - PORTAL_ALLOC(ksocknal_data.ksnd_fmbs, - sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + - SOCKNAL_LARGE_FWD_NMSGS)); - if (ksocknal_data.ksnd_fmbs == NULL) { - ksocknal_module_fini (); - RETURN(-ENOMEM); - } - - /* NULL out buffer pointers etc */ - memset(ksocknal_data.ksnd_fmbs, 0, - sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + - SOCKNAL_LARGE_FWD_NMSGS)); + /* Only allocate forwarding buffers if there's a router */ for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); i++) { - ksock_fmb_t *fmb = - &((ksock_fmb_t *)ksocknal_data.ksnd_fmbs)[i]; - - if (i < SOCKNAL_SMALL_FWD_NMSGS) { - fmb->fmb_npages = SOCKNAL_SMALL_FWD_PAGES; - fmb->fmb_pool = &ksocknal_data.ksnd_small_fmp; - } else { - fmb->fmb_npages = SOCKNAL_LARGE_FWD_PAGES; - fmb->fmb_pool = &ksocknal_data.ksnd_large_fmp; + ksock_fmb_t *fmb; + ksock_fmb_pool_t *pool; + + + if (i < SOCKNAL_SMALL_FWD_NMSGS) + pool = &ksocknal_data.ksnd_small_fmp; + else + pool = &ksocknal_data.ksnd_large_fmp; + + PORTAL_ALLOC(fmb, offsetof(ksock_fmb_t, + fmb_kiov[pool->fmp_buff_pages])); + if (fmb == NULL) { + ksocknal_api_shutdown(&ksocknal_api); + return (-ENOMEM); } - LASSERT (fmb->fmb_npages > 0); - for (j = 0; j < fmb->fmb_npages; j++) { - fmb->fmb_pages[j] = alloc_page(GFP_KERNEL); + fmb->fmb_pool = pool; + 
+ for (j = 0; j < pool->fmp_buff_pages; j++) { + fmb->fmb_kiov[j].kiov_page = alloc_page(GFP_KERNEL); - if (fmb->fmb_pages[j] == NULL) { - ksocknal_module_fini (); + if (fmb->fmb_kiov[j].kiov_page == NULL) { + ksocknal_api_shutdown (&ksocknal_api); return (-ENOMEM); } - LASSERT(page_address (fmb->fmb_pages[j]) != - NULL); + LASSERT(page_address(fmb->fmb_kiov[j].kiov_page) != NULL); } - list_add(&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs); + list_add(&fmb->fmb_list, &pool->fmp_idle_fmbs); } } - rc = kportal_nal_register(SOCKNAL, &ksocknal_cmd, NULL); + rc = libcfs_nal_cmd_register(SOCKNAL, &ksocknal_cmd, NULL); if (rc != 0) { CERROR ("Can't initialise command interface (rc = %d)\n", rc); - ksocknal_module_fini (); + ksocknal_api_shutdown (&ksocknal_api); return (rc); } - PORTAL_SYMBOL_REGISTER(ksocknal_ni); - /* flag everything initialised */ ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; - printk(KERN_INFO "Routing socket NAL loaded (Routing %s, initial " - "mem %d)\n", + printk(KERN_INFO "Lustre: Routing socket NAL loaded " + "(Routing %s, initial mem %d, incarnation "LPX64")\n", kpr_routing (&ksocknal_data.ksnd_router) ? - "enabled" : "disabled", pkmem); + "enabled" : "disabled", pkmem, ksocknal_data.ksnd_incarnation); + + return (0); +} +void __exit +ksocknal_module_fini (void) +{ +#ifdef CONFIG_SYSCTL + if (ksocknal_tunables.ksnd_sysctl != NULL) + unregister_sysctl_table (ksocknal_tunables.ksnd_sysctl); +#endif + PtlNIFini(ksocknal_ni); + + ptl_unregister_nal(SOCKNAL); +} + +int __init +ksocknal_module_init (void) +{ + int rc; + + /* packet descriptor must fit in a router descriptor's scratchpad */ + LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t)); + /* the following must be sizeof(int) for proc_dointvec() */ + LASSERT(sizeof (ksocknal_tunables.ksnd_io_timeout) == sizeof (int)); + LASSERT(sizeof (ksocknal_tunables.ksnd_eager_ack) == sizeof (int)); + LASSERT(sizeof (ksocknal_tunables.ksnd_typed_conns) == sizeof (int)); + LASSERT(sizeof (ksocknal_tunables.ksnd_min_bulk) == sizeof (int)); +#if SOCKNAL_ZC + LASSERT(sizeof (ksocknal_tunables.ksnd_zc_min_frag) == sizeof (int)); +#endif + /* check ksnr_connected/connecting field large enough */ + LASSERT(SOCKNAL_CONN_NTYPES <= 4); + + ksocknal_api.startup = ksocknal_api_startup; + ksocknal_api.forward = ksocknal_api_forward; + ksocknal_api.shutdown = ksocknal_api_shutdown; + ksocknal_api.lock = ksocknal_api_lock; + ksocknal_api.unlock = ksocknal_api_unlock; + ksocknal_api.nal_data = &ksocknal_data; + + ksocknal_lib.nal_data = &ksocknal_data; + + /* Initialise dynamic tunables to defaults once only */ + ksocknal_tunables.ksnd_io_timeout = SOCKNAL_IO_TIMEOUT; + ksocknal_tunables.ksnd_eager_ack = SOCKNAL_EAGER_ACK; + ksocknal_tunables.ksnd_typed_conns = SOCKNAL_TYPED_CONNS; + ksocknal_tunables.ksnd_min_bulk = SOCKNAL_MIN_BULK; +#if SOCKNAL_ZC + ksocknal_tunables.ksnd_zc_min_frag = SOCKNAL_ZC_MIN_FRAG; +#endif + + rc = ptl_register_nal(SOCKNAL, &ksocknal_api); + if (rc != PTL_OK) { + CERROR("Can't register SOCKNAL: %d\n", rc); + return (-ENOMEM); /* or something... */ + } + + /* Pure gateways want the NAL started up at module load time... 
*/ + rc = PtlNIInit(SOCKNAL, 0, NULL, NULL, &ksocknal_ni); + if (rc != PTL_OK && rc != PTL_IFACE_DUP) { + ptl_unregister_nal(SOCKNAL); + return (-ENODEV); + } + +#ifdef CONFIG_SYSCTL + /* Press on regardless even if registering sysctl doesn't work */ + ksocknal_tunables.ksnd_sysctl = + register_sysctl_table (ksocknal_top_ctl_table, 0); +#endif return (0); } @@ -1579,4 +1840,3 @@ MODULE_LICENSE("GPL"); module_init(ksocknal_module_init); module_exit(ksocknal_module_fini); -EXPORT_SYMBOL (ksocknal_ni);
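
Note on the incarnation mechanism this patch introduces: ksocknal_init_incarnation() stamps the module instance with a microsecond timestamp (seconds * 1000000 + microseconds), the value learned from a peer via ksocknal_hello() is recorded in each new conn as ksnc_incarnation, and ksocknal_close_stale_conns_locked() closes every connection whose recorded value no longer matches the one just received, on the assumption that the peer has restarted. The following is a minimal user-space sketch of that staleness test only, not the socknal implementation; the names peer_conn, make_incarnation and close_stale are illustrative and do not exist in the socknal code.

/* Illustrative sketch of incarnation-based staleness, modelled on
 * ksocknal_init_incarnation() / ksocknal_close_stale_conns_locked().
 * All identifiers here are hypothetical. */
#include <stdio.h>
#include <stdint.h>
#include <sys/time.h>

struct peer_conn {
        uint64_t incarnation;   /* value learned from the peer's HELLO */
        int      closed;
};

/* Same construction as the patch: seconds * 1e6 + microseconds. */
static uint64_t make_incarnation(void)
{
        struct timeval tv;

        gettimeofday(&tv, NULL);
        return (uint64_t)tv.tv_sec * 1000000 + tv.tv_usec;
}

/* Close every connection whose incarnation differs from the latest
 * one seen for this peer, and return how many were closed. */
static int close_stale(struct peer_conn *conns, int n, uint64_t latest)
{
        int i, count = 0;

        for (i = 0; i < n; i++) {
                if (conns[i].closed || conns[i].incarnation == latest)
                        continue;
                conns[i].closed = 1;
                count++;
        }
        return count;
}

int main(void)
{
        struct peer_conn conns[2];
        uint64_t old_instance = make_incarnation();
        uint64_t latest = old_instance + 1;   /* as if the peer reloaded later */

        conns[0].incarnation = old_instance;  conns[0].closed = 0;
        conns[1].incarnation = latest;        conns[1].closed = 0;

        printf("closed %d stale conn(s)\n", close_stale(conns, 2, latest));
        return 0;
}

Built with any C compiler, the sketch reports one stale connection closed, which corresponds to the per-connection CWARN("Closing stale conn ...") and -ESTALE close reason used in the patch above.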