From 27c8aafa3a4aab5674aceabf04414d4a44265cd7 Mon Sep 17 00:00:00 2001 From: eeb Date: Wed, 1 Jun 2005 21:23:21 +0000 Subject: [PATCH] * Added interface and socket queries to libcfs (libcfs/linux-tcpip.c). Some of this came from socknal_lib-linux.c but was generalised a little so all the NALs that use TCP/IP to do connection establishment can use it. CAVEAT EMPTOR! Just the linux versions are done; the darwin versions are not implemented yet. Changed socknal and ranal to use this. * Brought ranal up-to-date with new config + mod params for all tunables. Ranal gets its local address from the IP of a (single) specified interface, or the first "suitable" one found if no interface is specified. --- lnet/include/libcfs/libcfs.h | 19 + lnet/include/libcfs/linux/Makefile.am | 4 +- lnet/include/libcfs/linux/libcfs.h | 1 + lnet/include/libcfs/linux/linux-tcpip.h | 58 +++ lnet/include/lnet/lib-lnet.h | 1 + lnet/include/lnet/lib-p30.h | 1 + lnet/klnds/ralnd/Makefile.in | 2 +- lnet/klnds/ralnd/ralnd.c | 728 +++++++++----------------------- lnet/klnds/ralnd/ralnd.h | 76 ++-- lnet/klnds/ralnd/ralnd_cb.c | 140 ++++-- lnet/klnds/ralnd/ralnd_modparams.c | 149 +++++++ lnet/klnds/socklnd/socklnd.c | 258 ++++++++--- lnet/klnds/socklnd/socklnd.h | 28 +- lnet/klnds/socklnd/socklnd_cb.c | 93 ++-- lnet/klnds/socklnd/socklnd_lib-linux.c | 547 ++---------------------- lnet/klnds/socklnd/socklnd_lib-linux.h | 19 - lnet/libcfs/Makefile.in | 2 +- lnet/libcfs/linux/Makefile.am | 2 +- lnet/libcfs/linux/linux-tcpip.c | 656 ++++++++++++++++++++++++++++ lnet/libcfs/nidstrings.c | 9 +- lnet/lnet/config.c | 74 ++++ lnet/utils/portals.c | 41 +- 22 files changed, 1601 insertions(+), 1307 deletions(-) create mode 100644 lnet/include/libcfs/linux/linux-tcpip.h create mode 100644 lnet/klnds/ralnd/ralnd_modparams.c create mode 100644 lnet/libcfs/linux/linux-tcpip.c diff --git a/lnet/include/libcfs/libcfs.h b/lnet/include/libcfs/libcfs.h index ec8d5b1..c77a495 100644 --- a/lnet/include/libcfs/libcfs.h +++ b/lnet/include/libcfs/libcfs.h @@ -273,6 +273,25 @@ struct libcfs_ioctl_handler { int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand); int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand); +/* libcfs tcpip */ +int libcfs_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask); +int libcfs_ipif_enumerate(char ***names); +void libcfs_ipif_free_enumeration(char **names, int n); +int libcfs_sock_listen(struct socket **sockp, __u32 ip, int port, int backlog); +int libcfs_sock_accept(struct socket **newsockp, struct socket *sock, + int bufsize); +void libcfs_sock_abort_accept(struct socket *sock); +int libcfs_sock_connect(struct socket **sockp, int *fatal, int bufsize, + __u32 local_ip, int local_port, + __u32 peer_ip, int peer_port); +int libcfs_sock_setbuf(struct socket *socket, int txbufsize, int rxbufsize); +int libcfs_sock_getbuf(struct socket *socket, int *txbufsize, int *rxbufsize); +int libcfs_sock_write(struct socket *sock, void *buffer, int nob, int timeout); +int libcfs_sock_read(struct socket *sock, void *buffer, int nob, int timeout); +void libcfs_sock_release(struct socket *sock); + +void libcfs_pause(cfs_duration_t ticks); + /* libcfs watchdogs */ struct lc_watchdog; diff --git a/lnet/include/libcfs/linux/Makefile.am b/lnet/include/libcfs/linux/Makefile.am index 159cf57..6d1e241 100644 --- a/lnet/include/libcfs/linux/Makefile.am +++ b/lnet/include/libcfs/linux/Makefile.am @@ -1,3 +1,3 @@ EXTRA_DIST := kp30.h libcfs.h linux-fs.h linux-lock.h linux-mem.h \ - linux-prim.h linux-time.h lltrace.h 
portals_compat25.h \ - portals_lib.h portals_utils.h + linux-prim.h linux-time.h linux-tcpip.h lltrace.h \ + portals_compat25.h portals_lib.h portals_utils.h diff --git a/lnet/include/libcfs/linux/libcfs.h b/lnet/include/libcfs/linux/libcfs.h index cd48871c..e62ac48 100644 --- a/lnet/include/libcfs/linux/libcfs.h +++ b/lnet/include/libcfs/linux/libcfs.h @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef HAVE_ASM_TYPES_H #include diff --git a/lnet/include/libcfs/linux/linux-tcpip.h b/lnet/include/libcfs/linux/linux-tcpip.h new file mode 100644 index 0000000..fce2ede --- /dev/null +++ b/lnet/include/libcfs/linux/linux-tcpip.h @@ -0,0 +1,58 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Basic library routines. + * + */ + +#ifndef __LIBCFS_LINUX_CFS_TCP_H__ +#define __LIBCFS_LINUX_CFS_TCP_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +#ifdef __KERNEL__ +#include + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,72)) +# define sk_allocation allocation +# define sk_data_ready data_ready +# define sk_write_space write_space +# define sk_user_data user_data +# define sk_prot prot +# define sk_sndbuf sndbuf +# define sk_socket socket +# define sk_sleep sleep +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) +# define sk_wmem_queued wmem_queued +# define sk_err err +#endif + +#define SOCK_SNDBUF(so) ((so)->sk->sk_sndbuf) +#define SOCK_WMEM_QUEUED(so) ((so)->sk->sk_wmem_queued) +#define SOCK_ERROR(so) ((so)->sk->sk_err) +#define SOCK_TEST_NOSPACE(so) test_bit(SOCK_NOSPACE, &(so)->flags) + +#endif + +#endif diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 61e6dc0..ef8d896 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -460,6 +460,7 @@ extern void ptl_md_deconstruct(ptl_libmd_t *lmd, ptl_md_t *umd); #ifdef __KERNEL__ extern void ptl_register_nal(ptl_nal_t *nal); extern void ptl_unregister_nal(ptl_nal_t *nal); +extern ptl_err_t ptl_set_ip_niaddr (ptl_ni_t *ni); #endif extern ptl_err_t ptl_parse_routes (char *route_str); diff --git a/lnet/include/lnet/lib-p30.h b/lnet/include/lnet/lib-p30.h index 61e6dc0..ef8d896 100644 --- a/lnet/include/lnet/lib-p30.h +++ b/lnet/include/lnet/lib-p30.h @@ -460,6 +460,7 @@ extern void ptl_md_deconstruct(ptl_libmd_t *lmd, ptl_md_t *umd); #ifdef __KERNEL__ extern void ptl_register_nal(ptl_nal_t *nal); extern void ptl_unregister_nal(ptl_nal_t *nal); +extern ptl_err_t ptl_set_ip_niaddr (ptl_ni_t *ni); #endif extern ptl_err_t ptl_parse_routes (char *route_str); diff --git a/lnet/klnds/ralnd/Makefile.in b/lnet/klnds/ralnd/Makefile.in index 1772cc2..439f902 100644 --- a/lnet/klnds/ralnd/Makefile.in +++ b/lnet/klnds/ralnd/Makefile.in @@ 
-1,5 +1,5 @@ MODULES := kranal -kranal-objs := ranal.o ranal_cb.o +kranal-objs := ranal.o ranal_cb.o ranal_modparams.o EXTRA_POST_CFLAGS := @RACPPFLAGS@ diff --git a/lnet/klnds/ralnd/ralnd.c b/lnet/klnds/ralnd/ralnd.c index e48d5e2..a091b21 100644 --- a/lnet/klnds/ralnd/ralnd.c +++ b/lnet/klnds/ralnd/ralnd.c @@ -22,8 +22,8 @@ */ #include "ranal.h" -static int kranal_devids[] = {RAPK_MAIN_DEVICE_ID, - RAPK_EXPANSION_DEVICE_ID}; +static int kranal_devids[RANAL_MAXDEVS] = {RAPK_MAIN_DEVICE_ID, + RAPK_EXPANSION_DEVICE_ID}; ptl_nal_t kranal_nal = { .nal_type = RANAL, @@ -37,190 +37,6 @@ ptl_nal_t kranal_nal = { }; kra_data_t kranal_data; -kra_tunables_t kranal_tunables; - -#define RANAL_SYSCTL_TIMEOUT 1 -#define RANAL_SYSCTL_LISTENER_TIMEOUT 2 -#define RANAL_SYSCTL_BACKLOG 3 -#define RANAL_SYSCTL_PORT 4 -#define RANAL_SYSCTL_MAX_IMMEDIATE 5 - -#define RANAL_SYSCTL 202 - -static ctl_table kranal_ctl_table[] = { - {RANAL_SYSCTL_TIMEOUT, "timeout", - &kranal_tunables.kra_timeout, sizeof(int), - 0644, NULL, &proc_dointvec}, - {RANAL_SYSCTL_LISTENER_TIMEOUT, "listener_timeout", - &kranal_tunables.kra_listener_timeout, sizeof(int), - 0644, NULL, &proc_dointvec}, - {RANAL_SYSCTL_BACKLOG, "backlog", - &kranal_tunables.kra_backlog, sizeof(int), - 0644, NULL, kranal_listener_procint}, - {RANAL_SYSCTL_PORT, "port", - &kranal_tunables.kra_port, sizeof(int), - 0644, NULL, kranal_listener_procint}, - {RANAL_SYSCTL_MAX_IMMEDIATE, "max_immediate", - &kranal_tunables.kra_max_immediate, sizeof(int), - 0644, NULL, &proc_dointvec}, - { 0 } -}; - -static ctl_table kranal_top_ctl_table[] = { - {RANAL_SYSCTL, "ranal", NULL, 0, 0555, kranal_ctl_table}, - { 0 } -}; - -int -kranal_sock_write (struct socket *sock, void *buffer, int nob) -{ - int rc; - mm_segment_t oldmm = get_fs(); - struct iovec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = MSG_DONTWAIT - }; - - /* We've set up the socket's send buffer to be large enough for - * everything we send, so a single non-blocking send should - * complete without error. 
*/ - - set_fs(KERNEL_DS); - rc = sock_sendmsg(sock, &msg, iov.iov_len); - set_fs(oldmm); - - if (rc == nob) - return 0; - - if (rc >= 0) - return -EAGAIN; - - return rc; -} - -int -kranal_sock_read (struct socket *sock, void *buffer, int nob, int timeout) -{ - int rc; - mm_segment_t oldmm = get_fs(); - long ticks = timeout * HZ; - unsigned long then; - struct timeval tv; - - LASSERT (nob > 0); - LASSERT (ticks > 0); - - for (;;) { - struct iovec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = 0 - }; - - /* Set receive timeout to remaining time */ - tv = (struct timeval) { - .tv_sec = ticks / HZ, - .tv_usec = ((ticks % HZ) * 1000000) / HZ - }; - set_fs(KERNEL_DS); - rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, - (char *)&tv, sizeof(tv)); - set_fs(oldmm); - if (rc != 0) { - CERROR("Can't set socket recv timeout %d: %d\n", - timeout, rc); - return rc; - } - - set_fs(KERNEL_DS); - then = jiffies; - rc = sock_recvmsg(sock, &msg, iov.iov_len, 0); - ticks -= jiffies - then; - set_fs(oldmm); - - if (rc < 0) - return rc; - - if (rc == 0) - return -ECONNABORTED; - - buffer = ((char *)buffer) + rc; - nob -= rc; - - if (nob == 0) - return 0; - - if (ticks <= 0) - return -ETIMEDOUT; - } -} - -int -kranal_create_sock(struct socket **sockp) -{ - struct socket *sock; - int rc; - int option; - mm_segment_t oldmm = get_fs(); - - rc = sock_create(PF_INET, SOCK_STREAM, 0, &sock); - if (rc != 0) { - CERROR("Can't create socket: %d\n", rc); - return rc; - } - - /* Ensure sending connection info doesn't block */ - option = 2 * sizeof(kra_connreq_t); - set_fs(KERNEL_DS); - rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, - (char *)&option, sizeof(option)); - set_fs(oldmm); - if (rc != 0) { - CERROR("Can't set send buffer %d: %d\n", option, rc); - goto failed; - } - - option = 1; - set_fs(KERNEL_DS); - rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&option, sizeof(option)); - set_fs(oldmm); - if (rc != 0) { - CERROR("Can't set SO_REUSEADDR: %d\n", rc); - goto failed; - } - - *sockp = sock; - return 0; - - failed: - sock_release(sock); - return rc; -} - -void -kranal_pause(int ticks) -{ - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(ticks); -} void kranal_pack_connreq(kra_connreq_t *connreq, kra_conn_t *conn, ptl_nid_t dstnid) @@ -247,7 +63,7 @@ kranal_recv_connreq(struct socket *sock, kra_connreq_t *connreq, int timeout) { int rc; - rc = kranal_sock_read(sock, connreq, sizeof(*connreq), timeout); + rc = libcfs_sock_read(sock, connreq, sizeof(*connreq), timeout); if (rc != 0) { CERROR("Read failed: %d\n", rc); return rc; @@ -431,7 +247,7 @@ kranal_create_conn(kra_conn_t **connp, kra_device_t *dev) kranal_set_conn_uniqueness(conn); conn->rac_device = dev; - conn->rac_timeout = MAX(kranal_tunables.kra_timeout, RANAL_MIN_TIMEOUT); + conn->rac_timeout = MAX(*kranal_tunables.kra_timeout, RANAL_MIN_TIMEOUT); kranal_update_reaper_timeout(conn->rac_timeout); rrc = RapkCreateRi(dev->rad_handle, conn->rac_cqid, @@ -616,7 +432,7 @@ kranal_passive_conn_handshake (struct socket *sock, ptl_nid_t *src_nidp, } rc = kranal_recv_connreq(sock, &rx_connreq, - kranal_tunables.kra_listener_timeout); + *kranal_tunables.kra_listener_timeout); if (rc != 0) { CERROR("Can't rx connreq from %u.%u.%u.%u/%d: %d\n", HIPQUAD(peer_ip), peer_port, rc); @@ -640,7 +456,7 @@ kranal_passive_conn_handshake (struct socket *sock, ptl_nid_t *src_nidp, 
kranal_pack_connreq(&tx_connreq, conn, rx_connreq.racr_srcnid); - rc = kranal_sock_write(sock, &tx_connreq, sizeof(tx_connreq)); + rc = libcfs_sock_write(sock, &tx_connreq, sizeof(tx_connreq), 0); if (rc != 0) { CERROR("Can't tx connreq to %u.%u.%u.%u/%d: %d\n", HIPQUAD(peer_ip), peer_port, rc); @@ -663,63 +479,32 @@ kranal_passive_conn_handshake (struct socket *sock, ptl_nid_t *src_nidp, int ranal_connect_sock(kra_peer_t *peer, struct socket **sockp) { - struct sockaddr_in locaddr; - struct sockaddr_in srvaddr; - struct socket *sock; unsigned int port; + int fatal; int rc; for (port = 1023; port >= 512; port--) { - memset(&locaddr, 0, sizeof(locaddr)); - locaddr.sin_family = AF_INET; - locaddr.sin_port = htons(port); - locaddr.sin_addr.s_addr = htonl(INADDR_ANY); - - memset (&srvaddr, 0, sizeof (srvaddr)); - srvaddr.sin_family = AF_INET; - srvaddr.sin_port = htons (peer->rap_port); - srvaddr.sin_addr.s_addr = htonl (peer->rap_ip); - - rc = kranal_create_sock(&sock); - if (rc != 0) - return rc; - - rc = sock->ops->bind(sock, - (struct sockaddr *)&locaddr, sizeof(locaddr)); - if (rc != 0) { - sock_release(sock); - - if (rc == -EADDRINUSE) { - CDEBUG(D_NET, "Port %d already in use\n", port); - continue; - } - - CERROR("Can't bind to reserved port %d: %d\n", port, rc); - return rc; - } - - rc = sock->ops->connect(sock, - (struct sockaddr *)&srvaddr, sizeof(srvaddr), - 0); - if (rc == 0) { - *sockp = sock; + rc = libcfs_sock_connect(sockp, &fatal, + 2 * sizeof(kra_conn_t), + 0, port, + peer->rap_ip, peer->rap_port); + if (rc == 0) return 0; + + if (!fatal) { + CDEBUG(D_NET, "Port %d already in use\n", port); + continue; } - sock_release(sock); - - if (rc != -EADDRNOTAVAIL) { - CERROR("Can't connect port %d to %u.%u.%u.%u/%d: %d\n", - port, HIPQUAD(peer->rap_ip), peer->rap_port, rc); - return rc; - } - - CDEBUG(D_NET, "Port %d not available for %u.%u.%u.%u/%d\n", - port, HIPQUAD(peer->rap_ip), peer->rap_port); + CERROR("Can't connect port %d to %u.%u.%u.%u/%d: %d\n", + port, HIPQUAD(peer->rap_ip), peer->rap_port, rc); + return rc; } /* all ports busy */ + CERROR("Can't connect to %u.%u.%u.%u/%d: all ports busy\n", + HIPQUAD(peer->rap_ip), peer->rap_port); return -EHOSTUNREACH; } @@ -754,21 +539,21 @@ kranal_active_conn_handshake(kra_peer_t *peer, * immediately after accepting a connection, so we connect and then * send immediately. 
*/ - rc = kranal_sock_write(sock, &connreq, sizeof(connreq)); + rc = libcfs_sock_write(sock, &connreq, sizeof(connreq), 0); if (rc != 0) { CERROR("Can't tx connreq to %u.%u.%u.%u/%d: %d\n", HIPQUAD(peer->rap_ip), peer->rap_port, rc); goto failed_1; } - rc = kranal_recv_connreq(sock, &connreq, kranal_tunables.kra_timeout); + rc = kranal_recv_connreq(sock, &connreq, *kranal_tunables.kra_timeout); if (rc != 0) { CERROR("Can't rx connreq from %u.%u.%u.%u/%d: %d\n", HIPQUAD(peer->rap_ip), peer->rap_port, rc); goto failed_1; } - sock_release(sock); + libcfs_sock_release(sock); rc = -EPROTO; if (connreq.racr_srcnid != peer->rap_nid) { @@ -797,7 +582,7 @@ kranal_active_conn_handshake(kra_peer_t *peer, return 0; failed_1: - sock_release(sock); + libcfs_sock_release(sock); failed_0: kranal_conn_decref(conn); return rc; @@ -846,9 +631,10 @@ kranal_conn_handshake (struct socket *sock, kra_peer_t *peer) return rc; /* assume this is a new peer */ - peer = kranal_create_peer(peer_nid); - if (peer == NULL) { - CERROR("Can't allocate peer for "LPX64"\n", peer_nid); + rc = kranal_create_peer(&peer, peer_nid); + if (rc != 0) { + CERROR("Can't create conn for %s\n", + libcfs_nid2str(peer_nid)); kranal_conn_decref(conn); return -ENOMEM; } @@ -971,7 +757,8 @@ kranal_connect (kra_peer_t *peer) LASSERT (list_empty(&peer->rap_tx_queue)); /* reset reconnection timeouts */ - peer->rap_reconnect_interval = RANAL_MIN_RECONNECT_INTERVAL; + peer->rap_reconnect_interval = + *kranal_tunables.kra_min_reconnect_interval; peer->rap_reconnect_time = CURRENT_SECONDS; write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); @@ -980,8 +767,9 @@ kranal_connect (kra_peer_t *peer) LASSERT (peer->rap_reconnect_interval != 0); peer->rap_reconnect_time = CURRENT_SECONDS + peer->rap_reconnect_interval; - peer->rap_reconnect_interval = MAX(RANAL_MAX_RECONNECT_INTERVAL, - 1 * peer->rap_reconnect_interval); + peer->rap_reconnect_interval = + MAX(*kranal_tunables.kra_max_reconnect_interval, + 2 * peer->rap_reconnect_interval); /* Grab all blocked packets while we have the global lock */ list_add(&zombies, &peer->rap_tx_queue); @@ -1007,14 +795,13 @@ kranal_connect (kra_peer_t *peer) void kranal_free_acceptsock (kra_acceptsock_t *ras) { - sock_release(ras->ras_sock); + libcfs_sock_release(ras->ras_sock); PORTAL_FREE(ras, sizeof(*ras)); } int kranal_listener (void *arg) { - struct sockaddr_in addr; wait_queue_t wait; struct socket *sock; kra_acceptsock_t *ras; @@ -1023,37 +810,19 @@ kranal_listener (void *arg) int rc; unsigned long flags; - /* Parent thread holds kra_nid_mutex, and is, or is about to - * block on kra_listener_signal */ + /* Parent thread is blocked or about to block on kra_listener_signal */ - port = kranal_tunables.kra_port; + port = *kranal_tunables.kra_port; snprintf(name, sizeof(name), "kranal_lstn%03d", port); kportal_daemonize(name); kportal_blockallsigs(); init_waitqueue_entry(&wait, current); - rc = kranal_create_sock(&sock); + rc = libcfs_sock_listen(&sock, 0, port, + *kranal_tunables.kra_backlog); if (rc != 0) - goto out_0; - - memset(&addr, 0, sizeof(addr)); - addr.sin_family = AF_INET; - addr.sin_port = htons(port); - addr.sin_addr.s_addr = INADDR_ANY; - - rc = sock->ops->bind(sock, (struct sockaddr *)&addr, sizeof(addr)); - if (rc != 0) { - CERROR("Can't bind to port %d\n", port); - goto out_1; - } - - rc = sock->ops->listen(sock, kranal_tunables.kra_backlog); - if (rc != 0) { - CERROR("Can't set listen backlog %d: %d\n", - kranal_tunables.kra_backlog, rc); - goto out_1; - } + goto out; LASSERT 
(kranal_data.kra_listener_sock == NULL); kranal_data.kra_listener_sock = sock; @@ -1071,68 +840,39 @@ kranal_listener (void *arg) if (ras == NULL) { PORTAL_ALLOC(ras, sizeof(*ras)); if (ras == NULL) { - CERROR("Out of Memory: pausing...\n"); - kranal_pause(HZ); + CERROR("ENOMEM: listener pausing\n"); + libcfs_pause(cfs_time_seconds(1)); continue; } - ras->ras_sock = NULL; } - if (ras->ras_sock == NULL) { - ras->ras_sock = sock_alloc(); - if (ras->ras_sock == NULL) { - CERROR("Can't allocate socket: pausing...\n"); - kranal_pause(HZ); - continue; + rc = libcfs_sock_accept(&ras->ras_sock, sock, + 2 * sizeof(kra_conn_t)); + if (rc != 0) { + if (rc != -EAGAIN) { + CWARN("Accept error: %d, listener pausing\n", rc); + libcfs_pause(cfs_time_seconds(1)); } - /* XXX this should add a ref to sock->ops->owner, if - * TCP could be a module */ - ras->ras_sock->type = sock->type; - ras->ras_sock->ops = sock->ops; + continue; } - set_current_state(TASK_INTERRUPTIBLE); - - rc = sock->ops->accept(sock, ras->ras_sock, O_NONBLOCK); + spin_lock_irqsave(&kranal_data.kra_connd_lock, flags); - /* Sleep for socket activity? */ - if (rc == -EAGAIN && - kranal_data.kra_listener_shutdown == 0) - schedule(); + list_add_tail(&ras->ras_list, &kranal_data.kra_connd_acceptq); - set_current_state(TASK_RUNNING); - - if (rc == 0) { - spin_lock_irqsave(&kranal_data.kra_connd_lock, flags); - - list_add_tail(&ras->ras_list, - &kranal_data.kra_connd_acceptq); - - spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags); - wake_up(&kranal_data.kra_connd_waitq); - - ras = NULL; - continue; - } + spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags); + wake_up(&kranal_data.kra_connd_waitq); - if (rc != -EAGAIN) { - CERROR("Accept failed: %d, pausing...\n", rc); - kranal_pause(HZ); - } + ras = NULL; } - if (ras != NULL) { - if (ras->ras_sock != NULL) - sock_release(ras->ras_sock); + if (ras != NULL) PORTAL_FREE(ras, sizeof(*ras)); - } rc = 0; - remove_wait_queue(sock->sk->sk_sleep, &wait); - out_1: - sock_release(sock); + libcfs_sock_release(sock); kranal_data.kra_listener_sock = NULL; - out_0: + out: /* set completion status and unblock thread waiting for me * (parent on startup failure, executioner on normal shutdown) */ kranal_data.kra_listener_shutdown = rc; @@ -1149,7 +889,6 @@ kranal_start_listener (void) CDEBUG(D_NET, "Starting listener\n"); - /* Called holding kra_nid_mutex: listener stopped */ LASSERT (kranal_data.kra_listener_sock == NULL); kranal_data.kra_listener_shutdown = 0; @@ -1170,136 +909,33 @@ kranal_start_listener (void) } void -kranal_stop_listener(int clear_acceptq) +kranal_stop_listener(void) { - struct list_head zombie_accepts; - unsigned long flags; - kra_acceptsock_t *ras; - CDEBUG(D_NET, "Stopping listener\n"); - /* Called holding kra_nid_mutex: listener running */ LASSERT (kranal_data.kra_listener_sock != NULL); kranal_data.kra_listener_shutdown = 1; - wake_up_all(kranal_data.kra_listener_sock->sk->sk_sleep); + libcfs_sock_abort_accept(kranal_data.kra_listener_sock); /* Block until listener has torn down. 
*/ down(&kranal_data.kra_listener_signal); LASSERT (kranal_data.kra_listener_sock == NULL); CDEBUG(D_NET, "Listener stopped\n"); - - if (!clear_acceptq) - return; - - /* Close any unhandled accepts */ - spin_lock_irqsave(&kranal_data.kra_connd_lock, flags); - - list_add(&zombie_accepts, &kranal_data.kra_connd_acceptq); - list_del_init(&kranal_data.kra_connd_acceptq); - - spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags); - - while (!list_empty(&zombie_accepts)) { - ras = list_entry(zombie_accepts.next, - kra_acceptsock_t, ras_list); - list_del(&ras->ras_list); - kranal_free_acceptsock(ras); - } } int -kranal_listener_procint(ctl_table *table, int write, struct file *filp, - void *buffer, size_t *lenp) +kranal_create_peer (kra_peer_t **peerp, ptl_nid_t nid) { - int *tunable = (int *)table->data; - int old_val; - int rc; - - /* No race with nal initialisation since the nal is setup all the time - * it's loaded. When that changes, change this! */ - LASSERT (kranal_data.kra_init == RANAL_INIT_ALL); - - down(&kranal_data.kra_nid_mutex); - - LASSERT (tunable == &kranal_tunables.kra_port || - tunable == &kranal_tunables.kra_backlog); - old_val = *tunable; - - rc = proc_dointvec(table, write, filp, buffer, lenp); - - if (write && - (*tunable != old_val || - kranal_data.kra_listener_sock == NULL)) { - - if (kranal_data.kra_listener_sock != NULL) - kranal_stop_listener(0); - - rc = kranal_start_listener(); - - if (rc != 0) { - CWARN("Unable to start listener with new tunable:" - " reverting to old value\n"); - *tunable = old_val; - kranal_start_listener(); - } - } - - up(&kranal_data.kra_nid_mutex); - - LASSERT (kranal_data.kra_init == RANAL_INIT_ALL); - return rc; -} - -int -kranal_set_mynid(ptl_nid_t nid) -{ - unsigned long flags; - ptl_ni_t *ni = kranal_data.kra_ni; - int rc = 0; - - CDEBUG(D_NET, "setting mynid to "LPX64" (old nid="LPX64")\n", - nid, ni->ni_nid); - - down(&kranal_data.kra_nid_mutex); - - if (nid == ni->ni_nid) { - /* no change of NID */ - up(&kranal_data.kra_nid_mutex); - return 0; - } - - if (kranal_data.kra_listener_sock != NULL) - kranal_stop_listener(1); - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - kranal_data.kra_peerstamp++; - ni->ni_nid = nid; - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); - - /* Delete all existing peers and their connections after new - * NID/connstamp set to ensure no old connections in our brave - * new world. 
*/ - kranal_del_peer(PTL_NID_ANY); - - if (nid != PTL_NID_ANY) - rc = kranal_start_listener(); - - up(&kranal_data.kra_nid_mutex); - return rc; -} - -kra_peer_t * -kranal_create_peer (ptl_nid_t nid) -{ - kra_peer_t *peer; + kra_peer_t *peer; + unsigned long flags; LASSERT (nid != PTL_NID_ANY); PORTAL_ALLOC(peer, sizeof(*peer)); if (peer == NULL) - return NULL; + return -ENOMEM; memset(peer, 0, sizeof(*peer)); /* zero flags etc */ @@ -1312,15 +948,33 @@ kranal_create_peer (ptl_nid_t nid) INIT_LIST_HEAD(&peer->rap_tx_queue); peer->rap_reconnect_time = CURRENT_SECONDS; - peer->rap_reconnect_interval = RANAL_MIN_RECONNECT_INTERVAL; + peer->rap_reconnect_interval = + *kranal_tunables.kra_min_reconnect_interval; - atomic_inc(&kranal_data.kra_npeers); - return peer; + write_lock_irqsave(&kranal_data.kra_global_lock, flags); + + if (kranal_data.kra_listener_shutdown) { + /* shutdown has started already */ + write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); + + PORTAL_FREE(peer, sizeof(*peer)); + CERROR("Can't create peer: network shutdown\n"); + return -ESHUTDOWN; + } + + kranal_data.kra_npeers++; + + write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); + + *peerp = peer; + return 0; } void kranal_destroy_peer (kra_peer_t *peer) { + unsigned long flags; + CDEBUG(D_NET, "peer "LPX64" %p deleted\n", peer->rap_nid, peer); LASSERT (atomic_read(&peer->rap_refcount) == 0); @@ -1337,7 +991,9 @@ kranal_destroy_peer (kra_peer_t *peer) * they are destroyed, so we can be assured that _all_ state to do * with this peer has been cleaned up when its refcount drops to * zero. */ - atomic_dec(&kranal_data.kra_npeers); + write_lock_irqsave(&kranal_data.kra_global_lock, flags); + kranal_data.kra_npeers--; + write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); } kra_peer_t * @@ -1432,13 +1088,14 @@ kranal_add_persistent_peer (ptl_nid_t nid, __u32 ip, int port) unsigned long flags; kra_peer_t *peer; kra_peer_t *peer2; + int rc; if (nid == PTL_NID_ANY) return -EINVAL; - peer = kranal_create_peer(nid); - if (peer == NULL) - return -ENOMEM; + rc = kranal_create_peer(&peer, nid); + if (rc != 0) + return rc; write_lock_irqsave(&kranal_data.kra_global_lock, flags); @@ -1671,10 +1328,15 @@ kranal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg) break; } case IOC_PORTAL_REGISTER_MYNID: { - if (data->ioc_nid == PTL_NID_ANY) + /* Ignore if this is a noop */ + if (data->ioc_nid == ni->ni_nid) { + rc = 0; + } else { + CERROR("obsolete IOC_PORTAL_REGISTER_MYNID: %s(%s)\n", + libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(ni->ni_nid)); rc = -EINVAL; - else - rc = kranal_set_mynid(data->ioc_nid); + } break; } } @@ -1741,7 +1403,8 @@ kranal_alloc_txdescs(struct list_head *freelist, int n) int kranal_device_init(int id, kra_device_t *dev) { - const int total_ntx = RANAL_NTX + RANAL_NTX_NBLK; + int total_ntx = *kranal_tunables.kra_ntx + + *kranal_tunables.kra_ntx_nblk; RAP_RETURN rrc; dev->rad_id = id; @@ -1762,16 +1425,17 @@ kranal_device_init(int id, kra_device_t *dev) rrc = RapkCreateCQ(dev->rad_handle, total_ntx, RAP_CQTYPE_SEND, &dev->rad_rdma_cqh); if (rrc != RAP_SUCCESS) { - CERROR("Can't create rdma cq size %d" - " for device %d: %d\n", total_ntx, id, rrc); + CERROR("Can't create rdma cq size %d for device %d: %d\n", + total_ntx, id, rrc); goto failed_1; } - rrc = RapkCreateCQ(dev->rad_handle, RANAL_FMA_CQ_SIZE, RAP_CQTYPE_RECV, - &dev->rad_fma_cqh); + rrc = RapkCreateCQ(dev->rad_handle, + *kranal_tunables.kra_fma_cq_size, + RAP_CQTYPE_RECV, &dev->rad_fma_cqh); if (rrc != RAP_SUCCESS) { - 
CERROR("Can't create fma cq size %d" - " for device %d: %d\n", RANAL_FMA_CQ_SIZE, id, rrc); + CERROR("Can't create fma cq size %d for device %d: %d\n", + *kranal_tunables.kra_fma_cq_size, id, rrc); goto failed_2; } @@ -1788,6 +1452,13 @@ kranal_device_init(int id, kra_device_t *dev) void kranal_device_fini(kra_device_t *dev) { + LASSERT (list_empty(&dev->rad_ready_conns)); + LASSERT (list_empty(&dev->rad_new_conns)); + LASSERT (dev->rad_nphysmap == 0); + LASSERT (dev->rad_nppphysmap == 0); + LASSERT (dev->rad_nvirtmap == 0); + LASSERT (dev->rad_nobvirtmap == 0); + LASSERT(dev->rad_scheduler == NULL); RapkDestroyCQ(dev->rad_handle, dev->rad_fma_cqh); RapkDestroyCQ(dev->rad_handle, dev->rad_rdma_cqh); @@ -1812,42 +1483,62 @@ kranal_shutdown (ptl_ni_t *ni) LBUG(); case RANAL_INIT_ALL: - /* resetting my NID to unadvertises me, removes my - * listener and nukes all current peers */ - kranal_set_mynid(PTL_NID_ANY); - /* no new peers or conns */ + /* Stop listener and prevent new peers from being created */ + kranal_stop_listener(); - /* Wait for all peer/conn state to clean up */ + /* Remove all existing peers from the peer table */ + kranal_del_peer(PTL_NID_ANY); + + /* Wait for pending conn reqs to be handled */ i = 2; - while (atomic_read(&kranal_data.kra_nconns) != 0 || - atomic_read(&kranal_data.kra_npeers) != 0) { + spin_lock_irqsave(&kranal_data.kra_connd_lock, flags); + while (!list_empty(&kranal_data.kra_connd_acceptq)) { + spin_unlock_irqrestore(&kranal_data.kra_connd_lock, + flags); i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ - "waiting for %d peers and %d conns to close down\n", - atomic_read(&kranal_data.kra_npeers), - atomic_read(&kranal_data.kra_nconns)); - kranal_pause(HZ); + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n */ + "waiting for conn reqs to clean up\n"); + libcfs_pause(cfs_time_seconds(1)); + + spin_lock_irqsave(&kranal_data.kra_connd_lock, flags); } + spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags); + + /* Wait for all peers to be freed */ + i = 2; + write_lock_irqsave(&kranal_data.kra_global_lock, flags); + while (kranal_data.kra_npeers != 0) { + write_unlock_irqrestore(&kranal_data.kra_global_lock, + flags); + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n */ + "waiting for %d peers to close down\n", + kranal_data.kra_npeers); + libcfs_pause(cfs_time_seconds(1)); + + write_lock_irqsave(&kranal_data.kra_global_lock, + flags); + } + write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); /* fall through */ case RANAL_INIT_DATA: break; } - /* Conn/Peer state all cleaned up BEFORE setting shutdown, so threads - * don't have to worry about shutdown races */ - LASSERT (atomic_read(&kranal_data.kra_nconns) == 0); - LASSERT (atomic_read(&kranal_data.kra_npeers) == 0); + /* Peer state all cleaned up BEFORE setting shutdown, so threads don't + * have to worry about shutdown races. NB connections may be created + * while there are still active connds, but these will be temporary + * since peer creation always fails after the listener has started to + * shut down. 
*/ + LASSERT (kranal_data.kra_npeers == 0); - /* flag threads to terminate; wake and wait for them to die */ + /* Flag threads to terminate */ kranal_data.kra_shutdown = 1; for (i = 0; i < kranal_data.kra_ndevs; i++) { kra_device_t *dev = &kranal_data.kra_devices[i]; - LASSERT (list_empty(&dev->rad_ready_conns)); - LASSERT (list_empty(&dev->rad_new_conns)); - spin_lock_irqsave(&dev->rad_lock, flags); wake_up(&dev->rad_waitq); spin_unlock_irqrestore(&dev->rad_lock, flags); @@ -1862,16 +1553,17 @@ kranal_shutdown (ptl_ni_t *ni) wake_up_all(&kranal_data.kra_connd_waitq); spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags); + /* Wait for threads to exit */ i = 2; while (atomic_read(&kranal_data.kra_nthreads) != 0) { i++; CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ "Waiting for %d threads to terminate\n", atomic_read(&kranal_data.kra_nthreads)); - kranal_pause(HZ); + libcfs_pause(cfs_time_seconds(1)); } - LASSERT (atomic_read(&kranal_data.kra_npeers) == 0); + LASSERT (kranal_data.kra_npeers == 0); if (kranal_data.kra_peers != NULL) { for (i = 0; i < kranal_data.kra_peer_hash_size; i++) LASSERT (list_empty(&kranal_data.kra_peers[i])); @@ -1922,6 +1614,11 @@ kranal_startup (ptl_ni_t *ni) CERROR ("Only 1 instance supported\n"); return PTL_FAIL; } + + if (ptl_set_ip_niaddr(ni) != PTL_OK) { + CERROR ("Can't determine my NID\n"); + return PTL_FAIL; + } memset(&kranal_data, 0, sizeof(kranal_data)); /* zero pointers, flags etc */ @@ -1938,7 +1635,6 @@ kranal_startup (ptl_ni_t *ni) kranal_data.kra_connstamp = kranal_data.kra_peerstamp = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - init_MUTEX(&kranal_data.kra_nid_mutex); init_MUTEX_LOCKED(&kranal_data.kra_listener_signal); rwlock_init(&kranal_data.kra_global_lock); @@ -1989,11 +1685,13 @@ kranal_startup (ptl_ni_t *ni) for (i = 0; i < kranal_data.kra_conn_hash_size; i++) INIT_LIST_HEAD(&kranal_data.kra_conns[i]); - rc = kranal_alloc_txdescs(&kranal_data.kra_idle_txs, RANAL_NTX); + rc = kranal_alloc_txdescs(&kranal_data.kra_idle_txs, + *kranal_tunables.kra_ntx); if (rc != 0) goto failed; - rc = kranal_alloc_txdescs(&kranal_data.kra_idle_nblk_txs,RANAL_NTX_NBLK); + rc = kranal_alloc_txdescs(&kranal_data.kra_idle_nblk_txs, + *kranal_tunables.kra_ntx_nblk); if (rc != 0) goto failed; @@ -2003,7 +1701,7 @@ kranal_startup (ptl_ni_t *ni) goto failed; } - for (i = 0; i < RANAL_N_CONND; i++) { + for (i = 0; i < *kranal_tunables.kra_n_connd; i++) { rc = kranal_thread_start(kranal_connd, (void *)(unsigned long)i); if (rc != 0) { CERROR("Can't spawn ranal connd[%d]: %d\n", @@ -2014,51 +1712,18 @@ kranal_startup (ptl_ni_t *ni) LASSERT (kranal_data.kra_ndevs == 0); - if (ni->ni_interfaces[0] == NULL) { - /* Use all available RapidArray devices */ - for (i = 0; i < sizeof(kranal_devids)/sizeof(kranal_devids[0]); i++) { - LASSERT (i < RANAL_MAXDEVS); - - dev = &kranal_data.kra_devices[kranal_data.kra_ndevs]; + /* Use all available RapidArray devices */ + for (i = 0; i < RANAL_MAXDEVS; i++) { + dev = &kranal_data.kra_devices[kranal_data.kra_ndevs]; - rc = kranal_device_init(kranal_devids[i], dev); - if (rc == 0) - kranal_data.kra_ndevs++; - } - - if (kranal_data.kra_ndevs == 0) { - CERROR("Can't initialise any RapidArray devices\n"); - goto failed; - } - } else { - /* Use specified RapidArray devices */ - for (i = 0; i < PTL_MAX_INTERFACES; i++) { - int devid; - int len; - - if (kranal_data.kra_ndevs == RANAL_MAXDEVS) { - CERROR("Too many interfaces\n"); - goto failed; - } - - dev = &kranal_data.kra_devices[kranal_data.kra_ndevs]; - - if 
(sscanf(ni->ni_interfaces[i], "%d%n", &devid, &len) < 1 || - len != strlen(ni->ni_interfaces[i])) { - CERROR("Can't parse interface '%s'\n", - ni->ni_interfaces[i]); - goto failed; - } - - rc = kranal_device_init(devid, dev); - if (rc != 0) { - CERROR("Can't open interface '%s': %d\n", - ni->ni_interfaces[i], rc); - goto failed; - } - + rc = kranal_device_init(kranal_devids[i], dev); + if (rc == 0) kranal_data.kra_ndevs++; - } + } + + if (kranal_data.kra_ndevs == 0) { + CERROR("Can't initialise any RapidArray devices\n"); + goto failed; } for (i = 0; i < kranal_data.kra_ndevs; i++) { @@ -2071,11 +1736,15 @@ kranal_startup (ptl_ni_t *ni) } } + rc = kranal_start_listener(); + if (rc != 0) + goto failed; + /* flag everything initialised */ kranal_data.kra_init = RANAL_INIT_ALL; /*****************************************************/ - - CDEBUG(D_MALLOC, "initial kmem %d\n", atomic_read(&portal_kmemory)); + + CDEBUG(D_MALLOC, "initial kmem %d\n", pkmem); printk(KERN_INFO "Lustre: RapidArray NAL loaded " "(initial mem %d)\n", pkmem); @@ -2089,10 +1758,8 @@ kranal_startup (ptl_ni_t *ni) void __exit kranal_module_fini (void) { - if (kranal_tunables.kra_sysctl != NULL) - unregister_sysctl_table(kranal_tunables.kra_sysctl); - ptl_unregister_nal(&kranal_nal); + kranal_tunables_fini(); } int __init @@ -2100,31 +1767,12 @@ kranal_module_init (void) { int rc; - /* the following must be sizeof(int) for - * proc_dointvec/kranal_listener_procint() */ - CLASSERT (sizeof(kranal_tunables.kra_timeout) == sizeof(int)); - CLASSERT (sizeof(kranal_tunables.kra_listener_timeout) == sizeof(int)); - CLASSERT (sizeof(kranal_tunables.kra_backlog) == sizeof(int)); - CLASSERT (sizeof(kranal_tunables.kra_port) == sizeof(int)); - CLASSERT (sizeof(kranal_tunables.kra_max_immediate) == sizeof(int)); - - /* Initialise dynamic tunables to defaults once only */ - kranal_tunables.kra_timeout = RANAL_TIMEOUT; - kranal_tunables.kra_listener_timeout = RANAL_LISTENER_TIMEOUT; - kranal_tunables.kra_backlog = RANAL_BACKLOG; - kranal_tunables.kra_port = RANAL_PORT; - kranal_tunables.kra_max_immediate = RANAL_MAX_IMMEDIATE; + rc = kranal_tunables_init(); + if (rc != 0) + return rc; ptl_register_nal(&kranal_nal); - kranal_tunables.kra_sysctl = - register_sysctl_table(kranal_top_ctl_table, 0); - if (kranal_tunables.kra_sysctl == NULL) { - CERROR("Can't register sysctl table\n"); - ptl_unregister_nal(&kranal_nal); - return -ENOMEM; - } - return 0; } diff --git a/lnet/klnds/ralnd/ralnd.h b/lnet/klnds/ralnd/ralnd.h index 71cd6660..06ce4c8 100644 --- a/lnet/klnds/ralnd/ralnd.h +++ b/lnet/klnds/ralnd/ralnd.h @@ -59,46 +59,55 @@ #include -#define RANAL_MAXDEVS 2 /* max # devices RapidArray supports */ +/* default vals for modparams/tunables */ +#define RANAL_N_CONND 4 /* # connection daemons */ -#define RANAL_N_CONND 4 /* # connection daemons */ - -#define RANAL_MIN_RECONNECT_INTERVAL 1 /* first failed connection retry (seconds)... */ +#define RANAL_MIN_RECONNECT_INTERVAL 1 /* first failed connection retry... 
*/ #define RANAL_MAX_RECONNECT_INTERVAL 60 /* ...exponentially increasing to this */ -#define RANAL_FMA_MAX_PREFIX 232 /* max size of FMA "Prefix" */ -#define RANAL_FMA_MAX_DATA ((7<<10)-256) /* Max FMA MSG is 7K including prefix */ - -#define RANAL_PEER_HASH_SIZE 101 /* # peer lists */ -#define RANAL_CONN_HASH_SIZE 101 /* # conn lists */ +#define RANAL_NTX 64 /* # tx descs */ +#define RANAL_NTX_NBLK 256 /* # reserved tx descs */ -#define RANAL_NTX 64 /* # tx descs */ -#define RANAL_NTX_NBLK 256 /* # reserved tx descs */ - -#define RANAL_FMA_CQ_SIZE 8192 /* # entries in receive CQ +#define RANAL_FMA_CQ_SIZE 8192 /* # entries in receive CQ * (overflow is a performance hit) */ +#define RANAL_TIMEOUT 30 /* comms timeout (seconds) */ +#define RANAL_LISTENER_TIMEOUT 5 /* listener timeout (seconds) */ +#define RANAL_BACKLOG 127 /* listener's backlog */ +#define RANAL_PORT 987 /* listener's port */ +#define RANAL_MAX_IMMEDIATE (2<<10) /* immediate payload breakpoint */ + +/* tunables determined at compile time */ +#define RANAL_RESCHED 100 /* # scheduler loops before reschedule */ -#define RANAL_RESCHED 100 /* # scheduler loops before reschedule */ +#define RANAL_PEER_HASH_SIZE 101 /* # peer lists */ +#define RANAL_CONN_HASH_SIZE 101 /* # conn lists */ -#define RANAL_MIN_TIMEOUT 5 /* minimum timeout interval (seconds) */ +#define RANAL_MIN_TIMEOUT 5 /* minimum timeout interval (seconds) */ #define RANAL_TIMEOUT2KEEPALIVE(t) (((t)+1)/2) /* timeout -> keepalive interval */ -/* default vals for runtime tunables */ -#define RANAL_TIMEOUT 30 /* comms timeout (seconds) */ -#define RANAL_LISTENER_TIMEOUT 5 /* listener timeout (seconds) */ -#define RANAL_BACKLOG 127 /* listener's backlog */ -#define RANAL_PORT 988 /* listener's port */ -#define RANAL_MAX_IMMEDIATE (2<<10) /* immediate payload breakpoint */ +/* fixed constants */ +#define RANAL_MAXDEVS 2 /* max # devices RapidArray supports */ +#define RANAL_FMA_MAX_PREFIX 232 /* max bytes in FMA "Prefix" we can use */ +#define RANAL_FMA_MAX_DATA ((7<<10)-256) /* Max FMA MSG is 7K including prefix */ + typedef struct { - int kra_timeout; /* comms timeout (seconds) */ - int kra_listener_timeout; /* max time the listener can block */ - int kra_backlog; /* listener's backlog */ - int kra_port; /* listener's TCP/IP port */ - int kra_max_immediate; /* immediate payload breakpoint */ - + int *kra_n_connd; /* # connection daemons */ + int *kra_min_reconnect_interval; /* first failed connection retry... 
*/ + int *kra_max_reconnect_interval; /* ...exponentially increasing to this */ + int *kra_ntx; /* # tx descs */ + int *kra_ntx_nblk; /* # reserved tx descs */ + int *kra_fma_cq_size; /* # entries in receive CQ */ + int *kra_timeout; /* comms timeout (seconds) */ + int *kra_listener_timeout; /* max time the listener can block */ + int *kra_backlog; /* listener's backlog */ + int *kra_port; /* listener's TCP/IP port */ + int *kra_max_immediate; /* immediate payload breakpoint */ + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM struct ctl_table_header *kra_sysctl; /* sysctl interface */ +#endif } kra_tunables_t; typedef struct @@ -114,6 +123,10 @@ typedef struct wait_queue_head_t rad_waitq; /* scheduler waits here */ spinlock_t rad_lock; /* serialise */ void *rad_scheduler; /* scheduling thread */ + unsigned int rad_nphysmap; /* # phys mappings */ + unsigned int rad_nppphysmap; /* # phys pages mapped */ + unsigned int rad_nvirtmap; /* # virt mappings */ + unsigned long rad_nobvirtmap; /* # virt bytes mapped */ } kra_device_t; typedef struct @@ -123,7 +136,6 @@ typedef struct atomic_t kra_nthreads; /* # live threads */ ptl_ni_t *kra_ni; /* _the_ nal instance */ - struct semaphore kra_nid_mutex; /* serialise NID/listener ops */ struct semaphore kra_listener_signal; /* block for listener startup/shutdown */ struct socket *kra_listener_sock; /* listener's socket */ int kra_listener_shutdown; /* ask listener to close */ @@ -135,7 +147,7 @@ typedef struct struct list_head *kra_peers; /* hash table of all my known peers */ int kra_peer_hash_size; /* size of kra_peers */ - atomic_t kra_npeers; /* # peers extant */ + int kra_npeers; /* # peers extant */ struct list_head *kra_conns; /* conns hashed by cqid */ int kra_conn_hash_size; /* size of kra_conns */ @@ -475,7 +487,8 @@ extern void kranal_update_reaper_timeout (long timeout); extern void kranal_tx_done (kra_tx_t *tx, int completion); extern void kranal_unlink_peer_locked (kra_peer_t *peer); extern void kranal_schedule_conn (kra_conn_t *conn); -extern kra_peer_t *kranal_create_peer (ptl_nid_t nid); +extern int kranal_create_peer (kra_peer_t **peerp, ptl_nid_t nid); +extern int kranal_add_persistent_peer (ptl_nid_t nid, __u32 ip, int port); extern kra_peer_t *kranal_find_peer_locked (ptl_nid_t nid); extern void kranal_post_fma (kra_conn_t *conn, kra_tx_t *tx); extern int kranal_del_peer (ptl_nid_t nid); @@ -488,4 +501,5 @@ extern void kranal_close_conn_locked (kra_conn_t *conn, int error); extern void kranal_terminate_conn_locked (kra_conn_t *conn); extern void kranal_connect (kra_peer_t *peer); extern int kranal_conn_handshake (struct socket *sock, kra_peer_t *peer); -extern void kranal_pause(int ticks); +extern int kranal_tunables_init(void); +extern void kranal_tunables_fini(void); diff --git a/lnet/klnds/ralnd/ralnd_cb.c b/lnet/klnds/ralnd/ralnd_cb.c index 53dabb8..78993f7 100644 --- a/lnet/klnds/ralnd/ralnd_cb.c +++ b/lnet/klnds/ralnd/ralnd_cb.c @@ -282,7 +282,7 @@ kranal_setup_rdma_buffer (kra_tx_t *tx, int niov, return kranal_setup_virt_buffer(tx, niov, iov, offset, nob); } -void +int kranal_map_buffer (kra_tx_t *tx) { kra_conn_t *conn = tx->tx_conn; @@ -299,23 +299,45 @@ kranal_map_buffer (kra_tx_t *tx) case RANAL_BUF_IMMEDIATE: case RANAL_BUF_PHYS_MAPPED: case RANAL_BUF_VIRT_MAPPED: - break; + return 0; case RANAL_BUF_PHYS_UNMAPPED: rrc = RapkRegisterPhys(dev->rad_handle, tx->tx_phys, tx->tx_phys_npages, &tx->tx_map_key); - LASSERT (rrc == RAP_SUCCESS); + if (rrc != RAP_SUCCESS) { + CERROR ("Can't map %d pages: dev %d " + "phys %u pp %u, 
virt %u nob %lu\n", + tx->tx_phys_npages, dev->rad_id, + dev->rad_nphysmap, dev->rad_nppphysmap, + dev->rad_nvirtmap, dev->rad_nobvirtmap); + return -ENOMEM; /* assume insufficient resources */ + } + + dev->rad_nphysmap++; + dev->rad_nppphysmap += tx->tx_phys_npages; + tx->tx_buftype = RANAL_BUF_PHYS_MAPPED; - break; + return 0; case RANAL_BUF_VIRT_UNMAPPED: rrc = RapkRegisterMemory(dev->rad_handle, tx->tx_buffer, tx->tx_nob, &tx->tx_map_key); - LASSERT (rrc == RAP_SUCCESS); + if (rrc != RAP_SUCCESS) { + CERROR ("Can't map %d bytes: dev %d " + "phys %u pp %u, virt %u nob %lu\n", + tx->tx_nob, dev->rad_id, + dev->rad_nphysmap, dev->rad_nppphysmap, + dev->rad_nvirtmap, dev->rad_nobvirtmap); + return -ENOMEM; /* assume insufficient resources */ + } + + dev->rad_nvirtmap++; + dev->rad_nobvirtmap += tx->tx_nob; + tx->tx_buftype = RANAL_BUF_VIRT_MAPPED; - break; + return 0; } } @@ -342,6 +364,10 @@ kranal_unmap_buffer (kra_tx_t *tx) rrc = RapkDeregisterMemory(dev->rad_handle, NULL, &tx->tx_map_key); LASSERT (rrc == RAP_SUCCESS); + + dev->rad_nphysmap--; + dev->rad_nppphysmap -= tx->tx_phys_npages; + tx->tx_buftype = RANAL_BUF_PHYS_UNMAPPED; break; @@ -352,6 +378,10 @@ kranal_unmap_buffer (kra_tx_t *tx) rrc = RapkDeregisterMemory(dev->rad_handle, tx->tx_buffer, &tx->tx_map_key); LASSERT (rrc == RAP_SUCCESS); + + dev->rad_nvirtmap--; + dev->rad_nobvirtmap -= tx->tx_nob; + tx->tx_buftype = RANAL_BUF_VIRT_UNMAPPED; break; } @@ -428,6 +458,8 @@ kranal_launch_tx (kra_tx_t *tx, ptl_nid_t nid) kra_peer_t *peer; kra_conn_t *conn; unsigned long now; + int rc; + int retry; rwlock_t *g_lock = &kranal_data.kra_global_lock; /* If I get here, I've committed to send, so I complete the tx with @@ -435,33 +467,46 @@ kranal_launch_tx (kra_tx_t *tx, ptl_nid_t nid) LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ - read_lock(g_lock); + for (retry = 0; ; retry = 1) { - peer = kranal_find_peer_locked(nid); - if (peer == NULL) { - read_unlock(g_lock); - kranal_tx_done(tx, -EHOSTUNREACH); - return; - } + read_lock(g_lock); - conn = kranal_find_conn_locked(peer); - if (conn != NULL) { - kranal_post_fma(conn, tx); + peer = kranal_find_peer_locked(nid); + if (peer != NULL) { + conn = kranal_find_conn_locked(peer); + if (conn != NULL) { + kranal_post_fma(conn, tx); + read_unlock(g_lock); + return; + } + } + + /* Making connections; I'll need a write lock... */ read_unlock(g_lock); - return; - } + write_lock_irqsave(g_lock, flags); - /* Making one or more connections; I'll need a write lock... 
*/ - read_unlock(g_lock); - write_lock_irqsave(g_lock, flags); - - peer = kranal_find_peer_locked(nid); - if (peer == NULL) { + peer = kranal_find_peer_locked(nid); + if (peer != NULL) + break; + write_unlock_irqrestore(g_lock, flags); - kranal_tx_done(tx, -EHOSTUNREACH); - return; - } + + if (retry) { + CERROR("Can't find peer %s\n", libcfs_nid2str(nid)); + kranal_tx_done(tx, -EHOSTUNREACH); + return; + } + rc = kranal_add_persistent_peer(nid, PTL_NIDADDR(nid), + *kranal_tunables.kra_port); + if (rc != 0) { + CERROR("Can't add peer %s: %d\n", + libcfs_nid2str(nid), rc); + kranal_tx_done(tx, rc); + return; + } + } + conn = kranal_find_conn_locked(peer); if (conn != NULL) { /* Connection exists; queue message on it */ @@ -469,7 +514,7 @@ kranal_launch_tx (kra_tx_t *tx, ptl_nid_t nid) write_unlock_irqrestore(g_lock, flags); return; } - + LASSERT (peer->rap_persistence > 0); if (!peer->rap_connecting) { @@ -641,7 +686,12 @@ kranal_do_send (ptl_ni_t *ni, tx->tx_conn = conn; tx->tx_ptlmsg[0] = ptlmsg; - kranal_map_buffer(tx); + rc = kranal_map_buffer(tx); + if (rc != 0) { + kranal_tx_done(tx, rc); + return PTL_FAIL; + } + kranal_rdma(tx, RANAL_MSG_GET_DONE, &conn->rac_rxmsg->ram_u.get.ragm_desc, nob, conn->rac_rxmsg->ram_u.get.ragm_cookie); @@ -662,7 +712,7 @@ kranal_do_send (ptl_ni_t *ni, if ((ptlmsg->msg_md->md_options & PTL_MD_KIOV) == 0 && ptlmsg->msg_md->md_length <= RANAL_FMA_MAX_DATA && - ptlmsg->msg_md->md_length <= kranal_tunables.kra_max_immediate) + ptlmsg->msg_md->md_length <= *kranal_tunables.kra_max_immediate) break; tx = kranal_new_tx_msg(!in_interrupt(), RANAL_MSG_GET_REQ); @@ -702,7 +752,7 @@ kranal_do_send (ptl_ni_t *ni, case PTL_MSG_PUT: if (kiov == NULL && /* not paged */ nob <= RANAL_FMA_MAX_DATA && /* small enough */ - nob <= kranal_tunables.kra_max_immediate) + nob <= *kranal_tunables.kra_max_immediate) break; /* send IMMEDIATE */ tx = kranal_new_tx_msg(!in_interrupt(), RANAL_MSG_PUT_REQ); @@ -832,7 +882,11 @@ kranal_do_recv (ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, } tx->tx_conn = conn; - kranal_map_buffer(tx); + rc = kranal_map_buffer(tx); + if (rc != 0) { + kranal_tx_done(tx, rc); + return PTL_FAIL; + } tx->tx_msg.ram_u.putack.rapam_src_cookie = conn->rac_rxmsg->ram_u.putreq.raprm_cookie; @@ -1404,7 +1458,7 @@ kranal_process_fmaq (kra_conn_t *conn) int expect_reply; /* NB 1. kranal_sendmsg() may fail if I'm out of credits right now. - * However I will be rescheduled some by an FMA completion event + * However I will be rescheduled by an FMA completion event * when I eventually get some. * NB 2. Sampling rac_state here races with setting it elsewhere. 
* But it doesn't matter if I try to send a "real" message just @@ -1484,7 +1538,6 @@ kranal_process_fmaq (kra_conn_t *conn) case RANAL_MSG_IMMEDIATE: rc = kranal_sendmsg(conn, &tx->tx_msg, tx->tx_buffer, tx->tx_nob); - expect_reply = 0; break; case RANAL_MSG_PUT_NAK: @@ -1492,13 +1545,16 @@ kranal_process_fmaq (kra_conn_t *conn) case RANAL_MSG_GET_NAK: case RANAL_MSG_GET_DONE: rc = kranal_sendmsg(conn, &tx->tx_msg, NULL, 0); - expect_reply = 0; break; case RANAL_MSG_PUT_REQ: + rc = kranal_map_buffer(tx); + LASSERT (rc != -EAGAIN); + if (rc != 0) + break; + tx->tx_msg.ram_u.putreq.raprm_cookie = tx->tx_cookie; rc = kranal_sendmsg(conn, &tx->tx_msg, NULL, 0); - kranal_map_buffer(tx); expect_reply = 1; break; @@ -1508,7 +1564,11 @@ kranal_process_fmaq (kra_conn_t *conn) break; case RANAL_MSG_GET_REQ: - kranal_map_buffer(tx); + rc = kranal_map_buffer(tx); + LASSERT (rc != -EAGAIN); + if (rc != 0) + break; + tx->tx_msg.ram_u.get.ragm_cookie = tx->tx_cookie; tx->tx_msg.ram_u.get.ragm_desc.rard_key = tx->tx_map_key; tx->tx_msg.ram_u.get.ragm_desc.rard_addr.AddressBits = @@ -1529,10 +1589,8 @@ kranal_process_fmaq (kra_conn_t *conn) return; } - LASSERT (rc == 0); - - if (!expect_reply) { - kranal_tx_done(tx, 0); + if (!expect_reply || rc != 0) { + kranal_tx_done(tx, rc); } else { /* LASSERT(current) above ensures this doesn't race with reply * processing */ diff --git a/lnet/klnds/ralnd/ralnd_modparams.c b/lnet/klnds/ralnd/ralnd_modparams.c new file mode 100644 index 0000000..d2c6fe9 --- /dev/null +++ b/lnet/klnds/ralnd/ralnd_modparams.c @@ -0,0 +1,149 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include "ranal.h" + +static int n_connd = RANAL_N_CONND; +CFS_MODULE_PARM(n_connd, "i", int, 0444, + "# of connection daemons"); + +static int min_reconnect_interval = RANAL_MIN_RECONNECT_INTERVAL; +CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644, + "minimum connection retry interval (seconds)"); + +static int max_reconnect_interval = RANAL_MAX_RECONNECT_INTERVAL; +CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644, + "maximum connection retry interval (seconds)"); + +static int ntx = RANAL_NTX; +CFS_MODULE_PARM(ntx, "i", int, 0444, + "# of 'normal' transmit descriptors"); + +static int ntx_nblk = RANAL_NTX_NBLK; +CFS_MODULE_PARM(ntx_nblk, "i", int, 0444, + "# of 'reserved' transmit descriptors"); + +static int fma_cq_size = RANAL_FMA_CQ_SIZE; +CFS_MODULE_PARM(fma_cq_size, "i", int, 0444, + "size of the completion queue"); + +static int timeout = RANAL_TIMEOUT; +CFS_MODULE_PARM(timeout, "i", int, 0644, + "communications timeout (seconds)"); + +static int listener_timeout = RANAL_LISTENER_TIMEOUT; +CFS_MODULE_PARM(listener_timeout, "i", int, 0644, + "passive connection timeout"); + +static int backlog = RANAL_BACKLOG; +CFS_MODULE_PARM(backlog, "i", int, 0444, + "passive connection (listen) backlog"); + +static int port = RANAL_PORT; +CFS_MODULE_PARM(port, "i", int, 0444, + "connection request TCP/IP port"); + +static int max_immediate = RANAL_MAX_IMMEDIATE; +CFS_MODULE_PARM(max_immediate, "i", int, 0644, + "immediate/RDMA breakpoint"); + +kra_tunables_t kranal_tunables = { + .kra_n_connd = &n_connd, + .kra_min_reconnect_interval = &min_reconnect_interval, + .kra_max_reconnect_interval = &max_reconnect_interval, + .kra_ntx = &ntx, + .kra_ntx_nblk = &ntx_nblk, + .kra_fma_cq_size = &fma_cq_size, + .kra_timeout = &timeout, + .kra_listener_timeout = &listener_timeout, + .kra_backlog = &backlog, + .kra_port = &port, + .kra_max_immediate = &max_immediate, +}; + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM +static ctl_table kranal_ctl_table[] = { + {1, "n_connd", &n_connd, + sizeof(int), 0444, NULL, &proc_dointvec}, + {2, "min_reconnect_interval", &min_reconnect_interval, + sizeof(int), 0644, NULL, &proc_dointvec}, + {3, "max_reconnect_interval", &max_reconnect_interval, + sizeof(int), 0644, NULL, &proc_dointvec}, + {4, "ntx", &ntx, + sizeof(int), 0444, NULL, &proc_dointvec}, + {5, "ntx_nblk", &ntx_nblk, + sizeof(int), 0444, NULL, &proc_dointvec}, + {6, "fma_cq_size", &fma_cq_size, + sizeof(int), 0444, NULL, &proc_dointvec}, + {7, "timeout", &timeout, + sizeof(int), 0644, NULL, &proc_dointvec}, + {8, "listener_timeout", &listener_timeout, + sizeof(int), 0644, NULL, &proc_dointvec}, + {9, "backlog", &backlog, + sizeof(int), 0444, NULL, &proc_dointvec}, + {10, "port", &port, + sizeof(int), 0444, NULL, &proc_dointvec}, + {11, "max_immediate", &max_immediate, + sizeof(int), 0644, NULL, &proc_dointvec}, + {0} +}; + +static ctl_table kranal_top_ctl_table[] = { + {202, "ranal", NULL, 0, 0555, kranal_ctl_table}, + {0} +}; + +int +kranal_tunables_init () +{ + kranal_tunables.kra_sysctl = + register_sysctl_table(kranal_top_ctl_table, 0); + + if (kranal_tunables.kra_sysctl == NULL) + CWARN("Can't setup /proc tunables\n"); + + return 0; +} + +void +kranal_tunables_fini () +{ + if (kranal_tunables.kra_sysctl != NULL) + unregister_sysctl_table(kranal_tunables.kra_sysctl); +} + +#else + +int +kranal_tunables_init () +{ + return 0; +} + +void +kranal_tunables_fini () +{ +} + +#endif + diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index f60c567..9655ad9 100644 --- 
a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -94,17 +94,19 @@ ksocknal_destroy_route (ksock_route_t *route) PORTAL_FREE (route, sizeof (*route)); } -ksock_peer_t * -ksocknal_create_peer (ptl_ni_t *ni, ptl_nid_t nid) +int +ksocknal_create_peer (ksock_peer_t **peerp, ptl_ni_t *ni, ptl_nid_t nid) { - ksock_net_t *net = ni->ni_data; - ksock_peer_t *peer; + ksock_net_t *net = ni->ni_data; + ksock_peer_t *peer; + unsigned long flags; LASSERT (nid != PTL_NID_ANY); + LASSERT (!in_interrupt()); PORTAL_ALLOC (peer, sizeof (*peer)); if (peer == NULL) - return (NULL); + return -ENOMEM; memset (peer, 0, sizeof (*peer)); /* NULL pointers/clear flags etc */ @@ -116,14 +118,29 @@ ksocknal_create_peer (ptl_ni_t *ni, ptl_nid_t nid) CFS_INIT_LIST_HEAD (&peer->ksnp_routes); CFS_INIT_LIST_HEAD (&peer->ksnp_tx_queue); - atomic_inc (&net->ksnn_npeers); - return (peer); + spin_lock_irqsave(&net->ksnn_lock, flags); + + if (net->ksnn_shutdown) { + spin_unlock_irqrestore(&net->ksnn_lock, flags); + + PORTAL_FREE(peer, sizeof(*peer)); + CERROR("Can't create peer: network shutdown\n"); + return -ESHUTDOWN; + } + + net->ksnn_npeers++; + + spin_unlock_irqrestore(&net->ksnn_lock, flags); + + *peerp = peer; + return 0; } void ksocknal_destroy_peer (ksock_peer_t *peer) { - ksock_net_t *net = peer->ksnp_ni->ni_data; + ksock_net_t *net = peer->ksnp_ni->ni_data; + unsigned long flags; CDEBUG (D_NET, "peer %s %p deleted\n", libcfs_nid2str(peer->ksnp_nid), peer); @@ -139,7 +156,9 @@ ksocknal_destroy_peer (ksock_peer_t *peer) * until they are destroyed, so we can be assured that _all_ state to * do with this peer has been cleaned up when its refcount drops to * zero. */ - atomic_dec (&net->ksnn_npeers); + spin_lock_irqsave(&net->ksnn_lock, flags); + net->ksnn_npeers--; + spin_unlock_irqrestore(&net->ksnn_lock, flags); } ksock_peer_t * @@ -206,8 +225,8 @@ ksocknal_unlink_peer_locked (ksock_peer_t *peer) } int -ksocknal_get_peer_info (int index, ptl_nid_t *nid, - __u32 *myip, __u32 *peer_ip, int *port, +ksocknal_get_peer_info (ptl_ni_t *ni, int index, + ptl_nid_t *nid, __u32 *myip, __u32 *peer_ip, int *port, int *conn_count, int *share_count) { ksock_peer_t *peer; @@ -225,6 +244,9 @@ ksocknal_get_peer_info (int index, ptl_nid_t *nid, list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) { peer = list_entry (ptmp, ksock_peer_t, ksnp_list); + if (peer->ksnp_ni != ni) + continue; + if (peer->ksnp_n_passive_ips == 0 && list_empty(&peer->ksnp_routes)) { if (index-- > 0) @@ -414,14 +436,15 @@ ksocknal_add_peer (ptl_ni_t *ni, ptl_nid_t nid, __u32 ipaddr, int port) ksock_peer_t *peer2; ksock_route_t *route; ksock_route_t *route2; + int rc; if (nid == PTL_NID_ANY) return (-EINVAL); /* Have a brand new peer ready... 
*/ - peer = ksocknal_create_peer(ni, nid); - if (peer == NULL) - return (-ENOMEM); + rc = ksocknal_create_peer(&peer, ni, nid); + if (rc != 0) + return rc; route = ksocknal_create_route (ipaddr, port); if (route == NULL) { @@ -560,7 +583,7 @@ ksocknal_del_peer (ptl_ni_t *ni, ptl_nid_t nid, __u32 ip) } ksock_conn_t * -ksocknal_get_conn_by_idx (int index) +ksocknal_get_conn_by_idx (ptl_ni_t *ni, int index) { ksock_peer_t *peer; struct list_head *ptmp; @@ -576,6 +599,9 @@ ksocknal_get_conn_by_idx (int index) LASSERT (!peer->ksnp_closing); + if (peer->ksnp_ni != ni) + continue; + list_for_each (ctmp, &peer->ksnp_conns) { if (index-- > 0) continue; @@ -889,18 +915,11 @@ ksocknal_create_routes(ksock_peer_t *peer, int port, ksocknal_route_decref(newroute); } -void -ksocknal_pause(int ticks) -{ - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(ticks); -} - int ksocknal_listener (void *arg) { char name[16]; - ksock_connreq_t *cr; + ksock_connreq_t *cr = NULL; int rc; unsigned long flags; @@ -911,9 +930,9 @@ ksocknal_listener (void *arg) kportal_daemonize(name); kportal_blockallsigs(); - rc = ksocknal_lib_listen(&ksocknal_data.ksnd_listener_sock, - *ksocknal_tunables.ksnd_port, - *ksocknal_tunables.ksnd_backlog); + rc = libcfs_sock_listen(&ksocknal_data.ksnd_listener_sock, + 0, *ksocknal_tunables.ksnd_port, + *ksocknal_tunables.ksnd_backlog); /* set init status and unblock parent */ ksocknal_data.ksnd_listener_shutdown = rc; @@ -923,25 +942,40 @@ ksocknal_listener (void *arg) return rc; while (ksocknal_data.ksnd_listener_shutdown == 0) { + + if (cr == NULL) { + PORTAL_ALLOC(cr, sizeof(*cr)); + if (cr == NULL) { + CWARN("ENOMEM: listener pausing\n"); + libcfs_pause(cfs_time_seconds(1)); + continue; + } + } - rc = ksocknal_lib_accept(ksocknal_data.ksnd_listener_sock, &cr); + rc = libcfs_sock_accept(&cr->ksncr_sock, + ksocknal_data.ksnd_listener_sock, + 0); if (rc != 0) { if (rc != -EAGAIN) { - CWARN("Accept error: %d\n", rc); - ksocknal_pause(cfs_time_seconds(1)); + CWARN("Accept error %d: listener pausing\n", + rc); + libcfs_pause(cfs_time_seconds(1)); } continue; } - + spin_lock_irqsave(&ksocknal_data.ksnd_connd_lock, flags); list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs); wake_up(&ksocknal_data.ksnd_connd_waitq); spin_unlock_irqrestore(&ksocknal_data.ksnd_connd_lock, flags); + cr = NULL; } - ksocknal_lib_release_sock(ksocknal_data.ksnd_listener_sock); + libcfs_sock_release(ksocknal_data.ksnd_listener_sock); + if (cr != NULL) + PORTAL_FREE(cr, sizeof(*cr)); /* unblock executioner */ mutex_up(&ksocknal_data.ksnd_listener_signal); @@ -980,7 +1014,7 @@ ksocknal_stop_listener(void) /* make the listener exit its loop... 
*/ ksocknal_data.ksnd_listener_shutdown = 1; - ksocknal_lib_abort_accept(ksocknal_data.ksnd_listener_sock); + libcfs_sock_abort_accept(ksocknal_data.ksnd_listener_sock); /* block until listener exits */ mutex_down(&ksocknal_data.ksnd_listener_signal); @@ -993,6 +1027,8 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; __u32 ipaddrs[PTL_MAX_INTERFACES]; int nipaddrs; + ptl_ni_t *ni; + ksock_net_t *net; ptl_nid_t nid; struct list_head *tmp; __u64 incarnation; @@ -1043,8 +1079,8 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) goto failed_1; if (route != NULL) { - ptl_ni_t *ni = route->ksnr_peer->ksnp_ni; - ksock_net_t *net = ni->ni_data; + ni = route->ksnr_peer->ksnp_ni; + net = (ksock_net_t *)ni->ni_data; /* Active connection sends HELLO eagerly */ nipaddrs = ksocknal_local_ipvec(ni, ipaddrs); @@ -1057,9 +1093,14 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) } /* Find out/confirm peer's NID and connection type and get the - * vector of interfaces she's willing to let me connect to */ + * vector of interfaces she's willing to let me connect to. + * Passive connections use the listener timeout since the peer sends + * eagerly */ nid = (route == NULL) ? PTL_NID_ANY : route->ksnr_peer->ksnp_nid; - rc = ksocknal_recv_hello (conn, &nid, &incarnation, ipaddrs); + rc = ksocknal_recv_hello (conn, &nid, &incarnation, ipaddrs, + (route == NULL) ? + *ksocknal_tunables.ksnd_listen_timeout : + *ksocknal_tunables.ksnd_timeout); if (rc < 0) goto failed_1; nipaddrs = rc; @@ -1069,7 +1110,7 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) peer = route->ksnr_peer; ksocknal_peer_addref(peer); } else { - ptl_ni_t *ni = ptl_net2ni(PTL_NIDNET(nid)); + ni = ptl_net2ni(PTL_NIDNET(nid)); if (ni == NULL) { CERROR("Refusing connection attempt " @@ -1077,8 +1118,9 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) rc = -ECONNREFUSED; goto failed_1; } - - peer = ksocknal_create_peer(ni, nid); + + net = (ksock_net_t *)ni->ni_data; + rc = ksocknal_create_peer(&peer, ni, nid); /* lose extra ref from ptl_net2ni NB we wait for all the peers * to be deleted before ni teardown can complete; i.e. ni can't @@ -1086,10 +1128,8 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) * there's no to account the peer's refs on ni. 
*/ ptl_ni_decref(ni); - if (peer == NULL) { - rc = -ENOMEM; + if (rc != 0) goto failed_1; - } write_lock_irqsave(global_lock, flags); @@ -1251,7 +1291,7 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) PORTAL_FREE (conn, sizeof(*conn)); failed_0: - ksocknal_lib_release_sock(sock); + libcfs_sock_release(sock); LASSERT (rc != 0); return (rc); @@ -1850,9 +1890,12 @@ ksocknal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg) int conn_count = 0; int share_count = 0; - rc = ksocknal_get_peer_info(data->ioc_count, &nid, - &myip, &ip, &port, + rc = ksocknal_get_peer_info(ni, data->ioc_count, + &nid, &myip, &ip, &port, &conn_count, &share_count); + if (rc != 0) + return rc; + data->ioc_nid = nid; data->ioc_count = share_count; data->ioc_u32[0] = ip; @@ -1877,7 +1920,7 @@ ksocknal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg) int txmem; int rxmem; int nagle; - ksock_conn_t *conn = ksocknal_get_conn_by_idx (data->ioc_count); + ksock_conn_t *conn = ksocknal_get_conn_by_idx (ni, data->ioc_count); if (conn == NULL) return -ENOENT; @@ -1903,15 +1946,14 @@ ksocknal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg) data->ioc_u32[0]); case IOC_PORTAL_REGISTER_MYNID: + /* Ignore if this is a noop */ if (data->ioc_nid == ni->ni_nid) return 0; - LASSERT (PTL_NIDNET(data->ioc_nid) == PTL_NIDNET(ni->ni_nid)); - - CERROR("obsolete IOC_PORTAL_REGISTER_MYNID for %s(%s)\n", + CERROR("obsolete IOC_PORTAL_REGISTER_MYNID: %s(%s)\n", libcfs_nid2str(data->ioc_nid), libcfs_nid2str(ni->ni_nid)); - return 0; + return -EINVAL; case IOC_PORTAL_PUSH_CONNECTION: return ksocknal_push (ni, data->ioc_nid); @@ -1968,6 +2010,7 @@ ksocknal_base_shutdown (void) { ksock_sched_t *sched; int i; + unsigned long flags; CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", atomic_read (&portal_kmemory)); @@ -1981,12 +2024,19 @@ ksocknal_base_shutdown (void) ksocknal_stop_listener(); /* Wait for queued connreqs to clean up */ i = 2; + spin_lock_irqsave(&ksocknal_data.ksnd_connd_lock, flags); while (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) { + spin_unlock_irqrestore(&ksocknal_data.ksnd_connd_lock, + flags); i++; CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ "waiting for connreqs to clean up\n"); - ksocknal_pause(cfs_time_seconds(1)); + libcfs_pause(cfs_time_seconds(1)); + + spin_lock_irqsave(&ksocknal_data.ksnd_connd_lock, flags); } + spin_unlock_irqrestore(&ksocknal_data.ksnd_connd_lock, + flags); /* fall through */ @@ -2031,7 +2081,7 @@ ksocknal_base_shutdown (void) "waiting for %d threads to terminate\n", ksocknal_data.ksnd_nthreads); read_unlock(&ksocknal_data.ksnd_global_lock); - ksocknal_pause(cfs_time_seconds(1)); + libcfs_pause(cfs_time_seconds(1)); read_lock(&ksocknal_data.ksnd_global_lock); } read_unlock(&ksocknal_data.ksnd_global_lock); @@ -2220,22 +2270,33 @@ ksocknal_shutdown (ptl_ni_t *ni) { ksock_net_t *net = ni->ni_data; int i; + unsigned long flags; LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL); LASSERT(ksocknal_data.ksnd_nnets > 0); + spin_lock_irqsave(&net->ksnn_lock, flags); + net->ksnn_shutdown = 1; /* prevent new peers */ + spin_unlock_irqrestore(&net->ksnn_lock, flags); + /* Delete all peers */ ksocknal_del_peer(ni, PTL_NID_ANY, 0); /* Wait for all peer state to clean up */ i = 2; - while (atomic_read (&net->ksnn_npeers) != 0) { + spin_lock_irqsave(&net->ksnn_lock, flags); + while (net->ksnn_npeers != 0) { + spin_unlock_irqrestore(&net->ksnn_lock, flags); + i++; CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ "waiting for %d peers to disconnect\n", - atomic_read (&net->ksnn_npeers)); - ksocknal_pause(cfs_time_seconds(1)); + net->ksnn_npeers); + libcfs_pause(cfs_time_seconds(1)); + + spin_lock_irqsave(&net->ksnn_lock, flags); } + spin_unlock_irqrestore(&net->ksnn_lock, flags); for (i = 0; i < net->ksnn_ninterfaces; i++) { LASSERT (net->ksnn_interfaces[i].ksni_npeers == 0); @@ -2249,6 +2310,61 @@ ksocknal_shutdown (ptl_ni_t *ni) ksocknal_base_shutdown(); } +int +ksocknal_enumerate_interfaces(ksock_net_t *net) +{ + char **names; + int i; + int j; + int rc; + int n; + + n = libcfs_ipif_enumerate(&names); + if (n <= 0) { + CERROR("Can't enumerate interfaces: %d\n", n); + return n; + } + + for (i = j = 0; i < n; i++) { + int up; + __u32 ip; + __u32 mask; + + if (!strcmp(names[i], "lo")) /* skip the loopback IF */ + continue; + + rc = libcfs_ipif_query(names[i], &up, &ip, &mask); + if (rc != 0) { + CWARN("Can't get interface %s info: %d\n", + names[i], rc); + continue; + } + + if (!up) { + CWARN("Ignoring interface %s (down)\n", + names[i]); + continue; + } + + if (j == PTL_MAX_INTERFACES) { + CWARN("Ignoring interface %s (too many interfaces)\n", + names[i]); + continue; + } + + net->ksnn_interfaces[j].ksni_ipaddr = ip; + net->ksnn_interfaces[j].ksni_netmask = mask; + j++; + } + + libcfs_ipif_free_enumeration(names, n); + + if (j == 0) + CERROR("Can't find any usable interfaces\n"); + + return j; +} + ptl_err_t ksocknal_startup (ptl_ni_t *ni) { @@ -2269,31 +2385,39 @@ ksocknal_startup (ptl_ni_t *ni) goto fail_0; memset(net, 0, sizeof(*net)); - + spin_lock_init(&net->ksnn_lock); net->ksnn_incarnation = ksocknal_new_incarnation(); ni->ni_data = net; if (ni->ni_interfaces[0] == NULL) { - rc = ksocknal_lib_enumerate_ifs(net->ksnn_interfaces, - PTL_MAX_INTERFACES); - if (rc < 0) + rc = ksocknal_enumerate_interfaces(net); + if (rc <= 0) goto fail_1; - if (rc == 0) { - CERROR("Can't find any interfaces\n"); - goto fail_1; - } - net->ksnn_ninterfaces = rc; } else { for (i = 0; i < PTL_MAX_INTERFACES; i++) { + int up; + if (ni->ni_interfaces[i] == NULL) break; - rc = ksocknal_lib_init_if(&net->ksnn_interfaces[i], - ni->ni_interfaces[i]); - if (rc != 0) + rc = libcfs_ipif_query( + ni->ni_interfaces[i], &up, + &net->ksnn_interfaces[i].ksni_ipaddr, + &net->ksnn_interfaces[i].ksni_netmask); + + if (rc != 0) { + CERROR("Can't get interface %s info: %d\n", + ni->ni_interfaces[i], rc); goto fail_1; + } + + if (!up) { + CERROR("Interface %s is down\n", + ni->ni_interfaces[i]); + goto fail_1; + } } net->ksnn_ninterfaces = i; } diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index 0014402..7245899 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -89,6 +89,10 @@ # define SOCKNAL_RISK_KMAP_DEADLOCK 1 #endif +/* minimum socket buffer required for connection handshake */ +#define SOCKNAL_MIN_BUFFER (2*(sizeof(ptl_hdr_t) + \ + PTL_MAX_INTERFACES * sizeof(__u32))) + typedef struct /* pool of forwarding buffers */ { spinlock_t fmp_lock; /* serialise */ @@ -158,7 +162,9 @@ typedef struct typedef struct { __u64 ksnn_incarnation; /* my epoch */ - atomic_t ksnn_npeers; /* # peers */ + spinlock_t ksnn_lock; /* serialise */ + int ksnn_npeers; /* # peers */ + int ksnn_shutdown; /* shutting down? 
*/ int ksnn_ninterfaces; /* IP interfaces */ ksock_interface_t ksnn_interfaces[PTL_MAX_INTERFACES]; } ksock_net_t; @@ -434,15 +440,13 @@ ksocknal_connsock_addref (ksock_conn_t *conn) return (rc); } -extern void ksocknal_lib_release_sock(struct socket *sock); - static inline void ksocknal_connsock_decref (ksock_conn_t *conn) { LASSERT (atomic_read(&conn->ksnc_sock_refcount) > 0); if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) { LASSERT (conn->ksnc_closing); - ksocknal_lib_release_sock(conn->ksnc_sock); + libcfs_sock_release(conn->ksnc_sock); conn->ksnc_sock = NULL; } } @@ -531,7 +535,8 @@ extern int ksocknal_reaper (void *arg); extern int ksocknal_send_hello (ksock_conn_t *conn, ptl_nid_t nid, __u64 incarnation, __u32 *ipaddrs, int nipaddrs); extern int ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid, - __u64 *incarnation, __u32 *ipaddrs); + __u64 *incarnation, __u32 *ipaddrs, + int timeout); extern void ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn); extern void ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn); @@ -547,21 +552,8 @@ extern int ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx); extern void ksocknal_lib_eager_ack (ksock_conn_t *conn); extern int ksocknal_lib_recv_iov (ksock_conn_t *conn); extern int ksocknal_lib_recv_kiov (ksock_conn_t *conn); -extern int ksocknal_lib_sock_write (struct socket *sock, - void *buffer, int nob); -extern int ksocknal_lib_sock_read (struct socket *sock, - void *buffer, int nob); extern int ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle); -extern int ksocknal_lib_connect_sock(struct socket **sockp, int *fatal, - ksock_route_t *route, int local_port); -extern int ksocknal_lib_listen(struct socket **sockp, int port, int backlog); -extern int ksocknal_lib_accept(struct socket *sock, ksock_connreq_t **crp); -extern void ksocknal_lib_abort_accept(struct socket *sock); extern int ksocknal_lib_tunables_init(void); extern void ksocknal_lib_tunables_fini(void); - -extern int ksocknal_lib_init_if (ksock_interface_t *iface, char *name); -extern int ksocknal_lib_enumerate_ifs (ksock_interface_t *ifs, int nifs); - diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 25af875..f6bcac0 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -689,7 +689,7 @@ ksocknal_launch_packet (ptl_ni_t *ni, ksock_tx_t *tx, ptl_nid_t nid) ksock_conn_t *conn; ksock_route_t *route; rwlock_t *g_lock; - int create_peer = 0; + int retry; int rc; /* Ensure the frags we've been given EXACTLY match the number of @@ -713,50 +713,47 @@ ksocknal_launch_packet (ptl_ni_t *ni, ksock_tx_t *tx, ptl_nid_t nid) g_lock = &ksocknal_data.ksnd_global_lock; - again: - if (create_peer) { - rc = ksocknal_add_peer(ni, nid, PTL_NIDADDR(nid), - *ksocknal_tunables.ksnd_port); - if (rc != 0) { - CERROR("Can't add peer %s: %d\n", - libcfs_nid2str(nid), rc); - return rc; - } - } - + for (retry = 0;; retry) { #if !SOCKNAL_ROUND_ROBIN - read_lock (g_lock); - peer = ksocknal_find_peer_locked(ni, nid); - if (peer != NULL) { - if (ksocknal_find_connectable_route_locked(peer) == NULL) { - conn = ksocknal_find_conn_locked (tx, peer); - if (conn != NULL) { - /* I've got no routes that need to be - * connecting and I do have an actual - * connection... 
*/ - ksocknal_queue_tx_locked (tx, conn); - read_unlock (g_lock); - return (0); + read_lock (g_lock); + peer = ksocknal_find_peer_locked(ni, nid); + if (peer != NULL) { + if (ksocknal_find_connectable_route_locked(peer) == NULL) { + conn = ksocknal_find_conn_locked (tx, peer); + if (conn != NULL) { + /* I've got no routes that need to be + * connecting and I do have an actual + * connection... */ + ksocknal_queue_tx_locked (tx, conn); + read_unlock (g_lock); + return (0); + } } } - } - /* I'll need a write lock... */ - read_unlock (g_lock); + /* I'll need a write lock... */ + read_unlock (g_lock); #endif - write_lock_irqsave(g_lock, flags); + write_lock_irqsave(g_lock, flags); - peer = ksocknal_find_peer_locked(ni, nid); - if (peer == NULL) { + peer = ksocknal_find_peer_locked(ni, nid); + if (peer != NULL) + break; + write_unlock_irqrestore(g_lock, flags); - if (create_peer) { + if (retry) { CERROR("Can't find peer %s\n", libcfs_nid2str(nid)); - return -ENOENT; + return -EHOSTUNREACH; } - create_peer = 1; - goto again; + rc = ksocknal_add_peer(ni, nid, PTL_NIDADDR(nid), + *ksocknal_tunables.ksnd_port); + if (rc != 0) { + CERROR("Can't add peer %s: %d\n", + libcfs_nid2str(nid), rc); + return rc; + } } for (;;) { @@ -1710,8 +1707,9 @@ ksocknal_send_hello (ksock_conn_t *conn, ipaddrs[i] = __cpu_to_le32 (ipaddrs[i]); } - /* Receiver should be eager */ - rc = ksocknal_lib_sock_write (sock, &hdr, sizeof(hdr)); + /* socket buffer should have been set large enough not to block + * (timeout == 0) */ + rc = libcfs_sock_write(sock, &hdr, sizeof(hdr), 0); if (rc != 0) { CERROR ("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n", rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); @@ -1721,7 +1719,7 @@ ksocknal_send_hello (ksock_conn_t *conn, if (nipaddrs == 0) return (0); - rc = ksocknal_lib_sock_write (sock, ipaddrs, nipaddrs * sizeof(*ipaddrs)); + rc = libcfs_sock_write(sock, ipaddrs, nipaddrs * sizeof(*ipaddrs), 0); if (rc != 0) CERROR ("Error %d sending HELLO payload (%d)" " to %u.%u.%u.%u/%d\n", rc, nipaddrs, @@ -1748,7 +1746,8 @@ ksocknal_invert_type(int type) int ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid, - __u64 *incarnation, __u32 *ipaddrs) + __u64 *incarnation, __u32 *ipaddrs, + int timeout) { struct socket *sock = conn->ksnc_sock; int rc; @@ -1761,7 +1760,7 @@ ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid, hmv = (ptl_magicversion_t *)&hdr.dest_nid; LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); - rc = ksocknal_lib_sock_read (sock, hmv, sizeof (*hmv)); + rc = libcfs_sock_read(sock, hmv, sizeof (*hmv), timeout); if (rc != 0) { CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n", rc, HIPQUAD(conn->ksnc_ipaddr)); @@ -1794,7 +1793,8 @@ ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid, * header, followed by payload full of interface IP addresses. * Read the rest of it in now... 
*/ - rc = ksocknal_lib_sock_read (sock, hmv + 1, sizeof (hdr) - sizeof (*hmv)); + rc = libcfs_sock_read(sock, hmv + 1, sizeof (hdr) - sizeof (*hmv), + timeout); if (rc != 0) { CERROR ("Error %d reading rest of HELLO hdr from %u.%u.%u.%u\n", rc, HIPQUAD(conn->ksnc_ipaddr)); @@ -1867,7 +1867,7 @@ ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid, if (nips == 0) return (0); - rc = ksocknal_lib_sock_read (sock, ipaddrs, nips * sizeof(*ipaddrs)); + rc = libcfs_sock_read(sock, ipaddrs, nips * sizeof(*ipaddrs), timeout); if (rc != 0) { CERROR ("Error %d reading IPs from %s ip %u.%u.%u.%u\n", rc, libcfs_nid2str(*nid), HIPQUAD(conn->ksnc_ipaddr)); @@ -1895,15 +1895,12 @@ ksocknal_connect_peer (ksock_route_t *route, int type) int port; int fatal; - /* Iterate through reserved ports. When typed connections are - * used, we will need to bind to multiple ports, but we only know - * this at connect time. But, by that time we've already called - * bind() so we need a new socket. */ - for (port = 1023; port > 512; --port) { + /* Iterate through reserved ports. */ - rc = ksocknal_lib_connect_sock(&sock, &fatal, route, port); - + rc = libcfs_sock_connect(&sock, &fatal, 0, + route->ksnr_myipaddr, port, + route->ksnr_ipaddr, route->ksnr_port); if (rc == 0) { rc = ksocknal_create_conn(route, sock, type); return rc; diff --git a/lnet/klnds/socklnd/socklnd_lib-linux.c b/lnet/klnds/socklnd/socklnd_lib-linux.c index 7dc2287..d82e4b8 100644 --- a/lnet/klnds/socklnd/socklnd_lib-linux.c +++ b/lnet/klnds/socklnd/socklnd_lib-linux.c @@ -487,84 +487,6 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) } int -ksocknal_lib_sock_write (struct socket *sock, void *buffer, int nob) -{ - int rc; - mm_segment_t oldmm = get_fs(); - - while (nob > 0) { - struct iovec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = 0 - }; - - set_fs (KERNEL_DS); - rc = sock_sendmsg (sock, &msg, iov.iov_len); - set_fs (oldmm); - - if (rc < 0) - return (rc); - - if (rc == 0) { - CERROR ("Unexpected zero rc\n"); - return (-ECONNABORTED); - } - - buffer = ((char *)buffer) + rc; - nob -= rc; - } - - return (0); -} - -int -ksocknal_lib_sock_read (struct socket *sock, void *buffer, int nob) -{ - int rc; - mm_segment_t oldmm = get_fs(); - - while (nob > 0) { - struct iovec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = 0 - }; - - set_fs (KERNEL_DS); - rc = sock_recvmsg (sock, &msg, iov.iov_len, 0); - set_fs (oldmm); - - if (rc < 0) - return (rc); - - if (rc == 0) - return (-ECONNABORTED); - - buffer = ((char *)buffer) + rc; - nob -= rc; - } - - return (0); -} - -int ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle) { mm_segment_t oldmm = get_fs (); @@ -579,23 +501,15 @@ ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int return (-ESHUTDOWN); } - set_fs (KERNEL_DS); - - len = sizeof(*txmem); - rc = sock_getsockopt(sock, SOL_SOCKET, SO_SNDBUF, - (char *)txmem, &len); - if (rc == 0) { - len = sizeof(*rxmem); - rc = sock_getsockopt(sock, SOL_SOCKET, SO_RCVBUF, - (char *)rxmem, &len); - } + rc = libcfs_sock_getbuf(sock, txmem, rxmem); if (rc == 0) { len = sizeof(*nagle); + set_fs(KERNEL_DS); rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)nagle, 
&len); + set_fs(oldmm); } - set_fs (oldmm); ksocknal_connsock_decref(conn); if (rc == 0) @@ -607,6 +521,20 @@ ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int } int +ksocknal_lib_buffersize (int current_sz, int tunable_sz) +{ + /* ensure >= SOCKNAL_MIN_BUFFER */ + if (current_sz < SOCKNAL_MIN_BUFFER) + return MAX(SOCKNAL_MIN_BUFFER, tunable_sz); + + if (tunable_sz > SOCKNAL_MIN_BUFFER) + return tunable_sz; + + /* leave alone */ + return 0; +} + +int ksocknal_lib_setup_sock (struct socket *sock) { mm_segment_t oldmm = get_fs (); @@ -616,6 +544,8 @@ ksocknal_lib_setup_sock (struct socket *sock) int keep_intvl; int keep_count; int do_keepalive; + int sndbuf; + int rcvbuf; struct linger linger; sock->sk->sk_allocation = GFP_NOFS; @@ -658,29 +588,23 @@ ksocknal_lib_setup_sock (struct socket *sock) } } - if (ksocknal_tunables.ksnd_buffer_size > 0) { - option = *ksocknal_tunables.ksnd_buffer_size; + rc = libcfs_sock_getbuf(sock, &sndbuf, &rcvbuf); + if (rc != 0) { + CERROR("Can't get buffer sizes: %d\n", rc); + return rc; + } - set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDBUF, - (char *)&option, sizeof (option)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't set send buffer %d: %d\n", - option, rc); - return (rc); - } + sndbuf = ksocknal_lib_buffersize(sndbuf, + *ksocknal_tunables.ksnd_buffer_size); + rcvbuf = ksocknal_lib_buffersize(rcvbuf, + *ksocknal_tunables.ksnd_buffer_size); - set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF, - (char *)&option, sizeof (option)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't set receive buffer %d: %d\n", - option, rc); - return (rc); - } - } + rc = libcfs_sock_setbuf(sock, sndbuf, rcvbuf); + if (rc != 0) { + CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n", + sndbuf, rcvbuf, rc); + return (rc); + } /* snapshot tunables */ keep_idle = *ksocknal_tunables.ksnd_keepalive_idle; @@ -732,409 +656,6 @@ ksocknal_lib_setup_sock (struct socket *sock) return (0); } -int -ksocknal_lib_set_sock_timeout (struct socket *sock, int timeout) -{ - struct timeval tv; - int rc; - mm_segment_t oldmm = get_fs(); - - /* Set the socket timeouts, so our connection handshake completes in - * finite time */ - tv.tv_sec = timeout; - tv.tv_usec = 0; - - set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDTIMEO, - (char *)&tv, sizeof (tv)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't set send timeout %d: %d\n", timeout, rc); - return rc; - } - - set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVTIMEO, - (char *)&tv, sizeof (tv)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't set receive timeout %d: %d\n", timeout, rc); - return rc; - } - - return 0; -} - -void -ksocknal_lib_release_sock(struct socket *sock) -{ - sock_release(sock); -} - -int -ksocknal_lib_create_sock(struct socket **sockp, int *fatal, - int timeout, __u32 local_ip, int local_port) -{ - struct sockaddr_in locaddr; - struct socket *sock; - int rc; - int option; - mm_segment_t oldmm = get_fs(); - - *fatal = 1; /* assume errors are fatal */ - - memset(&locaddr, 0, sizeof(locaddr)); - locaddr.sin_family = AF_INET; - locaddr.sin_port = htons(local_port); - locaddr.sin_addr.s_addr = (local_ip == 0) ? 
- INADDR_ANY : htonl(local_ip); - - rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock); - *sockp = sock; - if (rc != 0) { - CERROR ("Can't create socket: %d\n", rc); - return (rc); - } - - rc = ksocknal_lib_set_sock_timeout(sock, timeout); - if (rc != 0) - goto failed; - - set_fs (KERNEL_DS); - option = 1; - rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&option, sizeof (option)); - set_fs (oldmm); - if (rc != 0) { - CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc); - goto failed; - } - - rc = sock->ops->bind(sock, - (struct sockaddr *)&locaddr, sizeof(locaddr)); - if (rc == -EADDRINUSE) { - CDEBUG(D_WARNING, "Port %d already in use\n", local_port); - *fatal = 0; - goto failed; - } - if (rc != 0) { - CERROR("Error trying to bind to reserved port %d: %d\n", - local_port, rc); - goto failed; - } - - return 0; - - failed: - sock_release(sock); - return rc; -} - -int -ksocknal_lib_listen(struct socket **sockp, int port, int backlog) -{ - int fatal; - int rc; - - rc = ksocknal_lib_create_sock(sockp, &fatal, 1, 0, port); - if (rc != 0) - return rc; - - rc = (*sockp)->ops->listen(*sockp, backlog); - if (rc == 0) - return 0; - - CERROR("Can't set listen backlog %d: %d\n", backlog, rc); - sock_release(*sockp); - return rc; -} - -int -ksocknal_lib_accept(struct socket *sock, ksock_connreq_t **crp) -{ - wait_queue_t wait; - struct socket *newsock; - int rc; - - init_waitqueue_entry(&wait, current); - - newsock = sock_alloc(); - if (newsock == NULL) { - CERROR("Can't allocate socket\n"); - return -ENOMEM; - } - - /* XXX this should add a ref to sock->ops->owner, if - * TCP could be a module */ - newsock->type = sock->type; - newsock->ops = sock->ops; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(sock->sk->sk_sleep, &wait); - - rc = sock->ops->accept(sock, newsock, O_NONBLOCK); - if (rc == -EAGAIN) { - /* Nothing ready, so wait for activity */ - schedule(); - rc = sock->ops->accept(sock, newsock, O_NONBLOCK); - } - - remove_wait_queue(sock->sk->sk_sleep, &wait); - set_current_state(TASK_RUNNING); - - if (rc != 0) - goto failed; - - rc = ksocknal_lib_set_sock_timeout(newsock, - *ksocknal_tunables.ksnd_listen_timeout); - if (rc != 0) - goto failed; - - rc = -ENOMEM; - PORTAL_ALLOC(*crp, sizeof(**crp)); - if (*crp == NULL) - goto failed; - - (*crp)->ksncr_sock = newsock; - return 0; - - failed: - sock_release(newsock); - return rc; -} - -void -ksocknal_lib_abort_accept(struct socket *sock) -{ - wake_up_all(sock->sk->sk_sleep); -} - -int -ksocknal_lib_connect_sock(struct socket **sockp, int *fatal, - ksock_route_t *route, int local_port) -{ - struct sockaddr_in srvaddr; - int rc; - - memset (&srvaddr, 0, sizeof (srvaddr)); - srvaddr.sin_family = AF_INET; - srvaddr.sin_port = htons (route->ksnr_port); - srvaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr); - - rc = ksocknal_lib_create_sock(sockp, fatal, - *ksocknal_tunables.ksnd_timeout, - route->ksnr_myipaddr, local_port); - if (rc != 0) - return rc; - - rc = (*sockp)->ops->connect(*sockp, - (struct sockaddr *)&srvaddr, sizeof(srvaddr), - 0); - if (rc == 0) - return 0; - - /* EADDRNOTAVAIL probably means we're already connected to the same - * peer/port on the same local port on a differently typed - * connection. Let our caller retry with a different local - * port... */ - *fatal = !(rc == -EADDRNOTAVAIL); - - CDEBUG(*fatal ? 
D_ERROR : D_NET, - "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc, - HIPQUAD(route->ksnr_myipaddr), local_port, - HIPQUAD(route->ksnr_ipaddr), route->ksnr_port); - - sock_release(*sockp); - return rc; -} - -int -ksocknal_lib_getifaddr(struct socket *sock, char *name, __u32 *ip, __u32 *mask) -{ - mm_segment_t oldmm = get_fs(); - struct ifreq ifr; - int rc; - __u32 val; - - LASSERT (strnlen(name, IFNAMSIZ) < IFNAMSIZ); - - strcpy(ifr.ifr_name, name); - ifr.ifr_addr.sa_family = AF_INET; - set_fs(KERNEL_DS); - rc = sock->ops->ioctl(sock, SIOCGIFADDR, (unsigned long)&ifr); - set_fs(oldmm); - - if (rc != 0) - return rc; - - val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr; - *ip = ntohl(val); - - strcpy(ifr.ifr_name, name); - ifr.ifr_addr.sa_family = AF_INET; - set_fs(KERNEL_DS); - rc = sock->ops->ioctl(sock, SIOCGIFNETMASK, (unsigned long)&ifr); - set_fs(oldmm); - - if (rc != 0) - return rc; - - val = ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr; - *mask = ntohl(val); - return 0; -} - -int -ksocknal_lib_init_if (ksock_interface_t *iface, char *name) -{ - struct socket *sock; - int rc; - int nob; - - rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock); - if (rc != 0) { - CERROR ("Can't create socket: %d\n", rc); - return rc; - } - - nob = strnlen(name, IFNAMSIZ); - if (nob == IFNAMSIZ) { - CERROR("Interface name %s too long\n", name); - rc -EINVAL; - } else { - CLASSERT (sizeof(iface->ksni_name) >= IFNAMSIZ); - strcpy(iface->ksni_name, name); - - rc = ksocknal_lib_getifaddr(sock, name, - &iface->ksni_ipaddr, - &iface->ksni_netmask); - if (rc != 0) - CERROR("Can't get IP address for interface %s\n", name); - } - - sock_release(sock); - return rc; -} - -int -ksocknal_lib_enumerate_ifs (ksock_interface_t *ifs, int nifs) -{ - int nalloc = PTL_MAX_INTERFACES; - char name[IFNAMSIZ]; - int nfound; - int nused; - struct socket *sock; - struct ifconf ifc; - struct ifreq *ifr; - mm_segment_t oldmm = get_fs(); - __u32 ipaddr; - __u32 netmask; - int rc; - int i; - - rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock); - if (rc != 0) { - CERROR ("Can't create socket: %d\n", rc); - return rc; - } - - for (;;) { - PORTAL_ALLOC(ifr, nalloc * sizeof(*ifr)); - if (ifr == NULL) { - CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc); - rc = -ENOMEM; - goto out0; - } - - ifc.ifc_buf = (char *)ifr; - ifc.ifc_len = nalloc * sizeof(*ifr); - - set_fs(KERNEL_DS); - rc = sock->ops->ioctl(sock, SIOCGIFCONF, (unsigned long)&ifc); - set_fs(oldmm); - - if (rc < 0) { - CERROR ("Error %d enumerating interfaces\n", rc); - goto out1; - } - - LASSERT (rc == 0); - - nfound = ifc.ifc_len/sizeof(*ifr); - LASSERT (nfound <= nalloc); - - if (nfound <= nalloc) - break; - - /* Assume there are more interfaces */ - if (nalloc >= 16 * PTL_MAX_INTERFACES) { - CWARN("Too many interfaces: " - "only trying the first %d\n", nfound); - break; - } - - nalloc *= 2; - } - - for (i = nused = 0; i < nfound; i++) { - strncpy(name, ifr[i].ifr_name, IFNAMSIZ); /* ensure terminated name */ - name[IFNAMSIZ-1] = 0; - - if (!strncmp(name, "lo", 2)) { - CDEBUG(D_WARNING, "ignoring %s\n", name); - continue; - } - - strcpy(ifr[i].ifr_name, name); - set_fs(KERNEL_DS); - rc = sock->ops->ioctl(sock, SIOCGIFFLAGS, - (unsigned long)&ifr[i]); - set_fs(oldmm); - - if (rc != 0) { - CDEBUG(D_WARNING, "Can't get flags for %s\n", name); - continue; - } - - if ((ifr[i].ifr_flags & IFF_UP) == 0) { - CDEBUG(D_WARNING, "Interface %s down\n", name); - continue; - } - - rc = ksocknal_lib_getifaddr(sock, name, &ipaddr, &netmask); - if 
(rc != 0) { - CDEBUG(D_WARNING, - "Can't get IP address or netmask for %s\n", - name); - continue; - } - - if (nused >= nifs) { - CWARN("Too many available interfaces: " - "only using the first %d\n", nused); - break; - } - - memset(&ifs[nused], 0, sizeof(ifs[nused])); - - CLASSERT(sizeof(ifs[nused].ksni_name) >= IFNAMSIZ); - strcpy(ifs[nused].ksni_name, name); - ifs[nused].ksni_ipaddr = ipaddr; - ifs[nused].ksni_netmask = netmask; - nused++; - - CDEBUG(D_WARNING, "Added interface %s: %u.%u.%u.%u\n", - name, HIPQUAD(ipaddr)); - } - - rc = nused; - out1: - PORTAL_FREE(ifr, nalloc * sizeof(*ifr)); - out0: - sock_release(sock); - return rc; -} - #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) struct tcp_opt *sock2tcp_opt(struct sock *sk) { diff --git a/lnet/klnds/socklnd/socklnd_lib-linux.h b/lnet/klnds/socklnd/socklnd_lib-linux.h index 982bcdb..827e625 100644 --- a/lnet/klnds/socklnd/socklnd_lib-linux.h +++ b/lnet/klnds/socklnd/socklnd_lib-linux.h @@ -44,26 +44,7 @@ #define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sk_sndbuf*8)/10) -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,72)) -# define sk_allocation allocation -# define sk_data_ready data_ready -# define sk_write_space write_space -# define sk_user_data user_data -# define sk_prot prot -# define sk_sndbuf sndbuf -# define sk_socket socket -# define sk_sleep sleep -#endif - -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) -# define sk_wmem_queued wmem_queued -# define sk_err err -#endif - #define SOCKNAL_ARCH_EAGER_ACK 0 -#define SOCK_WMEM_QUEUED(so) ((so)->sk->sk_wmem_queued) -#define SOCK_ERROR(so) ((so)->sk->sk_err) -#define SOCK_TEST_NOSPACE(so) test_bit(SOCK_NOSPACE, &(so)->flags) #ifndef CONFIG_SMP static inline diff --git a/lnet/libcfs/Makefile.in b/lnet/libcfs/Makefile.in index db09995..88f523d 100644 --- a/lnet/libcfs/Makefile.in +++ b/lnet/libcfs/Makefile.in @@ -2,7 +2,7 @@ MODULES = libcfs libcfs-linux-objs := linux-tracefile.o linux-debug.o libcfs-linux-objs += linux-prim.o linux-mem.o -libcfs-linux-objs += linux-fs.o linux-sync.o +libcfs-linux-objs += linux-fs.o linux-sync.o linux-tcpip.o libcfs-linux-objs += linux-lwt.o linux-proc.o linux-curproc.o libcfs-linux-objs += linux-utils.o linux-module.o diff --git a/lnet/libcfs/linux/Makefile.am b/lnet/libcfs/linux/Makefile.am index 49f8e87..8bf35cc 100644 --- a/lnet/libcfs/linux/Makefile.am +++ b/lnet/libcfs/linux/Makefile.am @@ -1,4 +1,4 @@ EXTRA_DIST := linux-debug.c linux-lwt.c linux-prim.c linux-tracefile.c \ linux-fs.c linux-mem.c linux-proc.c linux-utils.c linux-lock.c \ - linux-module.c linux-sync.c linux-curproc.c + linux-module.c linux-sync.c linux-curproc.c linux-tcpip.c diff --git a/lnet/libcfs/linux/linux-tcpip.c b/lnet/libcfs/linux/linux-tcpip.c new file mode 100644 index 0000000..1c996b2 --- /dev/null +++ b/lnet/libcfs/linux/linux-tcpip.c @@ -0,0 +1,656 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2005 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#define DEBUG_SUBSYSTEM S_PORTALS + +#include +#include + +#include +#include + +int +libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask) +{ + mm_segment_t oldmm = get_fs(); + struct ifreq ifr; + struct socket *sock; + int nob; + int rc; + __u32 val; + + rc = sock_create(PF_INET, SOCK_STREAM, 0, &sock); + if (rc != 0) { + CERROR ("Can't create socket: %d\n", rc); + return rc; + } + + nob = strnlen(name, IFNAMSIZ); + if (nob == IFNAMSIZ) { + CERROR("Interface name %s too long\n", name); + rc -EINVAL; + goto out; + } + + CLASSERT (sizeof(ifr.ifr_name) >= IFNAMSIZ); + + strcpy(ifr.ifr_name, name); + set_fs(KERNEL_DS); + rc = sock->ops->ioctl(sock, SIOCGIFFLAGS, (unsigned long)&ifr); + set_fs(oldmm); + + if (rc != 0) { + CERROR("Can't get flags for interface %s\n", name); + goto out; + } + + if ((ifr.ifr_flags & IFF_UP) == 0) { + CDEBUG(D_NET, "Interface %s down\n", name); + *up = 0; + *ip = *mask = 0; + goto out; + } + + strcpy(ifr.ifr_name, name); + ifr.ifr_addr.sa_family = AF_INET; + set_fs(KERNEL_DS); + rc = sock->ops->ioctl(sock, SIOCGIFADDR, (unsigned long)&ifr); + set_fs(oldmm); + + if (rc != 0) { + CERROR("Can't get IP address for interface %s\n", name); + goto out; + } + + val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr; + *ip = ntohl(val); + + strcpy(ifr.ifr_name, name); + ifr.ifr_addr.sa_family = AF_INET; + set_fs(KERNEL_DS); + rc = sock->ops->ioctl(sock, SIOCGIFNETMASK, (unsigned long)&ifr); + set_fs(oldmm); + + if (rc != 0) { + CERROR("Can't get netmask for interface %s\n", name); + goto out; + } + + val = ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr; + *mask = ntohl(val); + + out: + sock_release(sock); + return rc; +} + +EXPORT_SYMBOL(libcfs_ipif_query); + +int +libcfs_ipif_enumerate (char ***namesp) +{ + /* Allocate and fill in 'names', returning # interfaces/error */ + char **names; + int toobig; + int nalloc; + int nfound; + struct socket *sock; + struct ifreq *ifr; + struct ifconf ifc; + mm_segment_t oldmm = get_fs(); + int rc; + int nob; + int i; + + rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock); + if (rc != 0) { + CERROR ("Can't create socket: %d\n", rc); + return rc; + } + + nalloc = 16; /* first guess at max interfaces */ + toobig = 0; + for (;;) { + if (nalloc * sizeof(*ifr) > PAGE_SIZE) { + toobig = 1; + nalloc = PAGE_SIZE/sizeof(*ifr); + CWARN("Too many interfaces: only enumerating first %d\n", + nalloc); + } + + PORTAL_ALLOC(ifr, nalloc * sizeof(*ifr)); + if (ifr == NULL) { + CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc); + rc = -ENOMEM; + goto out0; + } + + ifc.ifc_buf = (char *)ifr; + ifc.ifc_len = nalloc * sizeof(*ifr); + + set_fs(KERNEL_DS); + rc = sock->ops->ioctl(sock, SIOCGIFCONF, (unsigned long)&ifc); + set_fs(oldmm); + + if (rc < 0) { + CERROR ("Error %d enumerating interfaces\n", rc); + goto out1; + } + + LASSERT (rc == 0); + + nfound = ifc.ifc_len/sizeof(*ifr); + LASSERT (nfound <= nalloc); + + if (nfound < nalloc || toobig) + break; + + PORTAL_FREE(ifr, nalloc * sizeof(*ifr)); + nalloc *= 2; + } + + if (nfound == 0) + goto out1; + + PORTAL_ALLOC(names, nfound * sizeof(*names)); + if (names == NULL) { + rc = -ENOMEM; + goto out1; + } + /* NULL out all names[i] */ + memset (names, 0, nfound * sizeof(*names)); + + for (i = 0; i < nfound; i++) { + + nob = strnlen (ifr[i].ifr_name, IFNAMSIZ); + if (nob == 
IFNAMSIZ) { + /* no space for terminating NULL */ + CERROR("interface name %.*s too long (%d max)\n", + nob, ifr[i].ifr_name, IFNAMSIZ); + rc = -ENAMETOOLONG; + goto out2; + } + + PORTAL_ALLOC(names[i], IFNAMSIZ); + if (names[i] == NULL) { + rc = -ENOMEM; + goto out2; + } + + memcpy(names[i], ifr[i].ifr_name, nob); + names[i][nob] = 0; + } + + *namesp = names; + rc = nfound; + + out2: + if (rc < 0) + libcfs_ipif_free_enumeration(names, nfound); + out1: + PORTAL_FREE(ifr, nalloc * sizeof(*ifr)); + out0: + sock_release(sock); + return rc; +} + +EXPORT_SYMBOL(libcfs_ipif_enumerate); + +void +libcfs_ipif_free_enumeration (char **names, int n) +{ + int i; + + LASSERT (n > 0); + + for (i = 0; i < n && names[i] != NULL; i++) + PORTAL_FREE(names[i], IFNAMSIZ); + + PORTAL_FREE(names, n * sizeof(*names)); +} + +EXPORT_SYMBOL(libcfs_ipif_free_enumeration); + +int +libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + mm_segment_t oldmm = get_fs(); + long ticks = timeout * HZ; + unsigned long then; + struct timeval tv; + + LASSERT (nob > 0); + /* Caller may pass a zero timeout if she thinks the socket buffer is + * empty enough to take the whole message immediately */ + + for (;;) { + struct iovec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = (timeout == 0) ? MSG_DONTWAIT : 0 + }; + + if (timeout != 0) { + /* Set send timeout to remaining time */ + tv = (struct timeval) { + .tv_sec = ticks / HZ, + .tv_usec = ((ticks % HZ) * 1000000) / HZ + }; + set_fs(KERNEL_DS); + rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, + (char *)&tv, sizeof(tv)); + set_fs(oldmm); + if (rc != 0) { + CERROR("Can't set socket send timeout " + "%ld.%06d: %d\n", + (long)tv.tv_sec, (int)tv.tv_usec, rc); + return rc; + } + } + + set_fs (KERNEL_DS); + rc = sock_sendmsg (sock, &msg, iov.iov_len); + set_fs (oldmm); + + if (rc == nob) + return 0; + + if (rc < 0) + return rc; + + if (rc == 0) { + CERROR ("Unexpected zero rc\n"); + return (-ECONNABORTED); + } + + if (timeout == 0) + return -EAGAIN; + + buffer = ((char *)buffer) + rc; + nob -= rc; + } + + return (0); +} + +EXPORT_SYMBOL(libcfs_sock_write); + +int +libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + mm_segment_t oldmm = get_fs(); + long ticks = timeout * HZ; + unsigned long then; + struct timeval tv; + + LASSERT (nob > 0); + LASSERT (ticks > 0); + + for (;;) { + struct iovec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + + /* Set receive timeout to remaining time */ + tv = (struct timeval) { + .tv_sec = ticks / HZ, + .tv_usec = ((ticks % HZ) * 1000000) / HZ + }; + set_fs(KERNEL_DS); + rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, + (char *)&tv, sizeof(tv)); + set_fs(oldmm); + if (rc != 0) { + CERROR("Can't set socket recv timeout %ld.%06d: %d\n", + (long)tv.tv_sec, (int)tv.tv_usec, rc); + return rc; + } + + set_fs(KERNEL_DS); + then = jiffies; + rc = sock_recvmsg(sock, &msg, iov.iov_len, 0); + ticks -= jiffies - then; + set_fs(oldmm); + + if (rc < 0) + return rc; + + if (rc == 0) + return -ECONNABORTED; + + buffer = ((char *)buffer) + rc; + nob -= rc; + + if (nob == 0) + return 0; + + if (ticks <= 0) + return -ETIMEDOUT; + } +} + 
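The timed I/O helpers above re-arm the socket timeout with the time remaining before every pass, so a peer that dribbles data cannot pin the caller past its deadline. Below is a minimal user-space sketch of the same idea, assuming POSIX sockets and a monotonic clock; read_with_deadline() and its error conventions are hypothetical and not part of the patch.

/* Re-arm SO_RCVTIMEO with the remaining budget before each recv(); fail
 * with -ETIMEDOUT once the budget is spent.  Hypothetical helper. */
#include <errno.h>
#include <stddef.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/types.h>
#include <time.h>

static int
read_with_deadline(int fd, void *buf, size_t nob, int timeout_sec)
{
        struct timespec start, now;
        long            left_us = (long)timeout_sec * 1000000;

        clock_gettime(CLOCK_MONOTONIC, &start);

        while (nob > 0) {
                struct timeval tv;
                ssize_t        rc;

                if (left_us <= 0)
                        return -ETIMEDOUT;

                /* receive timeout = time remaining, re-armed every pass;
                 * left_us > 0 guarantees tv is never {0,0} ("block forever") */
                tv.tv_sec  = left_us / 1000000;
                tv.tv_usec = left_us % 1000000;
                if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))
                        return -errno;

                rc = recv(fd, buf, nob, 0);
                if (rc < 0)
                        return (errno == EAGAIN || errno == EWOULDBLOCK) ?
                               -ETIMEDOUT : -errno;
                if (rc == 0)
                        return -ECONNABORTED;   /* peer closed mid-message */

                buf  = (char *)buf + rc;
                nob -= (size_t)rc;

                /* charge the elapsed wall time against the budget */
                clock_gettime(CLOCK_MONOTONIC, &now);
                left_us = (long)timeout_sec * 1000000
                          - (now.tv_sec  - start.tv_sec) * 1000000
                          - (now.tv_nsec - start.tv_nsec) / 1000;
        }
        return 0;
}

Returning -ETIMEDOUT when the budget runs out mirrors libcfs_sock_read(); libcfs_sock_write() additionally allows a zero timeout for callers, such as ksocknal_send_hello(), that have sized the socket buffer so the send cannot block.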
+EXPORT_SYMBOL(libcfs_sock_read); + +static int +libcfs_sock_create (struct socket **sockp, int *fatal, + __u32 local_ip, int local_port) +{ + struct sockaddr_in locaddr; + struct socket *sock; + int rc; + int option; + mm_segment_t oldmm = get_fs(); + + /* All errors are fatal except bind failure if the port is in use */ + *fatal = 1; + + rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock); + *sockp = sock; + if (rc != 0) { + CERROR ("Can't create socket: %d\n", rc); + return (rc); + } + + set_fs (KERNEL_DS); + option = 1; + rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc); + goto failed; + } + + /* can't specify a local port without a local IP */ + LASSERT (local_ip == 0 || local_port != 0); + + if (local_ip != 0 || local_port != 0) { + memset(&locaddr, 0, sizeof(locaddr)); + locaddr.sin_family = AF_INET; + locaddr.sin_port = htons(local_port); + locaddr.sin_addr.s_addr = (local_ip == 0) ? + INADDR_ANY : htonl(local_ip); + + rc = sock->ops->bind(sock, (struct sockaddr *)&locaddr, + sizeof(locaddr)); + if (rc == -EADDRINUSE) { + CDEBUG(D_NET, "Port %d already in use\n", local_port); + *fatal = 0; + goto failed; + } + if (rc != 0) { + CERROR("Error trying to bind to port %d: %d\n", + local_port, rc); + goto failed; + } + } + + return 0; + + failed: + sock_release(sock); + return rc; +} + +int +libcfs_sock_setbuf (struct socket *sock, int txbufsize, int rxbufsize) +{ + mm_segment_t oldmm = get_fs(); + int option; + int rc; + + if (txbufsize != 0) { + option = txbufsize; + set_fs (KERNEL_DS); + rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set send buffer %d: %d\n", + option, rc); + return (rc); + } + } + + if (rxbufsize != 0) { + option = rxbufsize; + set_fs (KERNEL_DS); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set receive buffer %d: %d\n", + option, rc); + return (rc); + } + } + + return 0; +} + +EXPORT_SYMBOL(libcfs_sock_setbuf); + +int +libcfs_sock_getbuf (struct socket *sock, int *txbufsize, int *rxbufsize) +{ + mm_segment_t oldmm = get_fs(); + int option; + int optlen; + int rc; + + if (txbufsize != NULL) { + optlen = sizeof(option); + set_fs (KERNEL_DS); + rc = sock_getsockopt(sock, SOL_SOCKET, SO_SNDBUF, + (char *)&option, &optlen); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't get send buffer size: %d\n", rc); + return (rc); + } + *txbufsize = option; + } + + if (rxbufsize != NULL) { + optlen = sizeof(option); + set_fs (KERNEL_DS); + rc = sock_getsockopt (sock, SOL_SOCKET, SO_RCVBUF, + (char *)&option, &optlen); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't get receive buffer size: %d\n", rc); + return (rc); + } + *rxbufsize = option; + } + + return 0; +} + +EXPORT_SYMBOL(libcfs_sock_getbuf); + +int +libcfs_sock_listen (struct socket **sockp, + __u32 local_ip, int local_port, int backlog) +{ + int fatal; + int rc; + + rc = libcfs_sock_create(sockp, &fatal, local_ip, local_port); + if (rc != 0) { + if (!fatal) + CERROR("Can't create socket: port %d already in use\n", + local_port); + return rc; + } + + rc = (*sockp)->ops->listen(*sockp, backlog); + if (rc == 0) + return 0; + + CERROR("Can't set listen backlog %d: %d\n", backlog, rc); + sock_release(*sockp); + return rc; +} + +EXPORT_SYMBOL(libcfs_sock_listen); + +int +libcfs_sock_accept (struct socket 
**newsockp, struct socket *sock, int bufsize) +{ + wait_queue_t wait; + struct socket *newsock; + int rc; + + init_waitqueue_entry(&wait, current); + + newsock = sock_alloc(); + if (newsock == NULL) { + CERROR("Can't allocate socket\n"); + return -ENOMEM; + } + + /* XXX this should add a ref to sock->ops->owner, if + * TCP could be a module */ + newsock->type = sock->type; + newsock->ops = sock->ops; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(sock->sk->sk_sleep, &wait); + + rc = sock->ops->accept(sock, newsock, O_NONBLOCK); + if (rc == -EAGAIN) { + /* Nothing ready, so wait for activity */ + schedule(); + rc = sock->ops->accept(sock, newsock, O_NONBLOCK); + } + + remove_wait_queue(sock->sk->sk_sleep, &wait); + set_current_state(TASK_RUNNING); + + if (rc != 0) + goto failed; + + if (bufsize != 0) { + rc = libcfs_sock_setbuf(newsock, bufsize, bufsize); + if (rc != 0) + goto failed; + } + + *newsockp = newsock; + return 0; + + failed: + sock_release(newsock); + return rc; +} + +EXPORT_SYMBOL(libcfs_sock_accept); + +void +libcfs_sock_abort_accept (struct socket *sock) +{ + wake_up_all(sock->sk->sk_sleep); +} + +EXPORT_SYMBOL(libcfs_sock_abort_accept); + +int +libcfs_sock_connect (struct socket **sockp, int *fatal, int bufsize, + __u32 local_ip, int local_port, + __u32 peer_ip, int peer_port) +{ + struct sockaddr_in srvaddr; + int rc; + + rc = libcfs_sock_create(sockp, fatal, local_ip, local_port); + if (rc != 0) + return rc; + + if (bufsize != 0) { + rc = libcfs_sock_setbuf(*sockp, bufsize, bufsize); + if (rc != 0) + goto failed; + } + + memset (&srvaddr, 0, sizeof (srvaddr)); + srvaddr.sin_family = AF_INET; + srvaddr.sin_port = htons(peer_port); + srvaddr.sin_addr.s_addr = htonl(peer_ip); + + rc = (*sockp)->ops->connect(*sockp, + (struct sockaddr *)&srvaddr, sizeof(srvaddr), + 0); + if (rc == 0) + return 0; + + /* EADDRNOTAVAIL probably means we're already connected to the same + * peer/port on the same local port on a differently typed + * connection. Let our caller retry with a different local + * port... */ + *fatal = !(rc == -EADDRNOTAVAIL); + + CDEBUG(*fatal ? 
D_ERROR : D_NET, + "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc, + HIPQUAD(local_ip), local_port, HIPQUAD(peer_ip), peer_port); + + failed: + sock_release(*sockp); + return rc; +} + +EXPORT_SYMBOL(libcfs_sock_connect); + +void +libcfs_sock_release (struct socket *sock) +{ + sock_release(sock); +} + +EXPORT_SYMBOL(libcfs_sock_release); + +void +libcfs_pause (cfs_duration_t ticks) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(ticks); +} + +EXPORT_SYMBOL(libcfs_pause); diff --git a/lnet/libcfs/nidstrings.c b/lnet/libcfs/nidstrings.c index 2156f87..9b84b66 100644 --- a/lnet/libcfs/nidstrings.c +++ b/lnet/libcfs/nidstrings.c @@ -310,7 +310,7 @@ libcfs_net2str(__u32 net) char *str = libcfs_next_nidstring(); if (nf == NULL) - snprintf(str, PTL_NALFMT_SIZE, "t<%u>%u", nal, num); + snprintf(str, PTL_NALFMT_SIZE, "<%u:%u>", nal, num); else if (num == 0) snprintf(str, PTL_NALFMT_SIZE, "%s", nf->nf_name); else @@ -337,7 +337,7 @@ libcfs_nid2str(ptl_nid_t nid) str = libcfs_next_nidstring(); if (nf == NULL) - snprintf(str, PTL_NALFMT_SIZE, "%x@t<%u>%u", addr, nal, nnum); + snprintf(str, PTL_NALFMT_SIZE, "%x@<%u:%u>", addr, nal, nnum); else { nf->nf_addr2str(addr, str); nob = strlen(str); @@ -366,10 +366,9 @@ libcfs_str2net_internal(char *str, __u32 *net) if (!strncmp(str, nf->nf_name, strlen(nf->nf_name))) break; } - if (i == libcfs_nnalstrfns) { - CWARN("No base: %s\n", str); + + if (i == libcfs_nnalstrfns) return NULL; - } nob = strlen(nf->nf_name); diff --git a/lnet/lnet/config.c b/lnet/lnet/config.c index d59e2fd..631a7c6 100644 --- a/lnet/lnet/config.c +++ b/lnet/lnet/config.c @@ -726,3 +726,77 @@ ptl_parse_routes (char *routes) LASSERT (ptl_tbnob == 0); return rc; } + +#ifdef __KERNEL__ +ptl_err_t +ptl_set_ip_niaddr (ptl_ni_t *ni) +{ + __u32 net = PTL_NIDNET(ni->ni_nid); + char **names; + int n; + __u32 ip; + __u32 netmask; + int up; + int i; + int rc; + + /* Convenience for NALs that use the IP address of a local interface as + * the local address part of their NID */ + + if (ni->ni_interfaces[0] != NULL) { + + CLASSERT (PTL_MAX_INTERFACES > 1); + + if (ni->ni_interfaces[1] != NULL) { + CERROR("Net %s doesn't support multiple interfaces\n", + libcfs_net2str(net)); + return PTL_FAIL; + } + + rc = libcfs_ipif_query(ni->ni_interfaces[0], + &up, &ip, &netmask); + if (rc != 0) { + CERROR("Net %s can't qeury interface %s: %d\n", + libcfs_net2str(net), ni->ni_interfaces[0], rc); + return PTL_FAIL; + } + + ni->ni_nid = PTL_MKNID(net, ip); + return PTL_OK; + } + + n = libcfs_ipif_enumerate(&names); + if (n <= 0) { + CERROR("Net %s can't enumerate interfaces: %d\n", + libcfs_net2str(net), n); + return 0; + } + + for (i = 0; i < n; i++) { + rc = libcfs_ipif_query(names[i], &up, &ip, &netmask); + + if (rc != 0) { + CWARN("Net %s can't query interface %s: %d\n", + libcfs_net2str(net), names[i], rc); + continue; + } + + if (!up) { + CWARN("Net %s ignoring interface %s (down)\n", + libcfs_net2str(net), names[i]); + continue; + } + + libcfs_ipif_free_enumeration(names, n); + ni->ni_nid = PTL_MKNID(net, ip); + return PTL_OK; + } + + CERROR("Net %s can't find any interfaces\n", libcfs_net2str(net)); + libcfs_ipif_free_enumeration(names, n); + return PTL_FAIL; +} + +EXPORT_SYMBOL(ptl_set_ip_niaddr); + +#endif diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c index 6c60eab..4543da9 100644 --- a/lnet/utils/portals.c +++ b/lnet/utils/portals.c @@ -349,6 +349,7 @@ int jt_ptl_network(int argc, char **argv) if (net == PTL_NIDNET(data.ioc_nid)) { g_net_set = 1; + g_net = net; 
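For reference, the interface-selection policy that libcfs_ipif_enumerate() and ptl_set_ip_niaddr() implement, enumerate the IPv4 interfaces, skip "lo" and anything that is down, take the first remaining address, looks roughly like this in user space with the same SIOCGIFCONF/SIOCGIFFLAGS ioctls. first_usable_ip() is a hypothetical stand-in, the fixed 16-entry table replaces the patch's grow-until-it-fits allocation, and error handling is pared down to the minimum.

/* Return the first usable IPv4 interface address in host byte order,
 * or a negative errno.  Sketch only; not part of the patch. */
#include <arpa/inet.h>
#include <errno.h>
#include <net/if.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

static int
first_usable_ip(uint32_t *ip)
{
        struct ifreq  ifr[16];          /* enough for a small node */
        struct ifconf ifc;
        int           fd, i, n, rc = -ENOENT;

        fd = socket(AF_INET, SOCK_STREAM, 0);   /* any socket will do for ioctl */
        if (fd < 0)
                return -errno;

        ifc.ifc_req = ifr;
        ifc.ifc_len = sizeof(ifr);
        if (ioctl(fd, SIOCGIFCONF, &ifc) != 0) {
                rc = -errno;
                goto out;
        }

        n = ifc.ifc_len / sizeof(*ifr);
        for (i = 0; i < n; i++) {
                struct ifreq flags = ifr[i];    /* keeps ifr_name for the ioctl */

                if (strcmp(ifr[i].ifr_name, "lo") == 0)
                        continue;               /* skip the loopback IF */

                if (ioctl(fd, SIOCGIFFLAGS, &flags) != 0 ||
                    (flags.ifr_flags & IFF_UP) == 0)
                        continue;               /* unqueryable or down */

                /* SIOCGIFCONF already filled in the AF_INET address */
                *ip = ntohl(((struct sockaddr_in *)&ifr[i].ifr_addr)
                            ->sin_addr.s_addr);
                rc = 0;
                break;
        }
 out:
        close(fd);
        return rc;
}

int
main(void)
{
        uint32_t ip;

        if (first_usable_ip(&ip) == 0)
                printf("using %u.%u.%u.%u\n",
                       (ip >> 24) & 0xff, (ip >> 16) & 0xff,
                       (ip >> 8) & 0xff, ip & 0xff);
        return 0;
}

The kernel version in linux-tcpip.c additionally returns the netmask via SIOCGIFNETMASK and caps the enumeration at a page of ifreqs rather than a fixed array.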
return 0; } continue; @@ -373,8 +374,8 @@ int jt_ptl_network(int argc, char **argv) return -1; } - fprintf(stderr,"%s not a local network (%s on its own to list)\n", - argv[1]); + fprintf(stderr,"%s not a local network (%s on its own to list them all)\n", + argv[1], argv[0]); return -1; #endif } @@ -527,21 +528,22 @@ jt_ptl_print_peers (int argc, char **argv) break; if (g_net_is_compatible(NULL, SOCKNAL, 0)) - printf (LPX64"[%d]%s@%s:%d #%d\n", - data.ioc_nid, + printf ("%-20s [%d]%s->%s:%d #%d\n", + libcfs_nid2str(data.ioc_nid), data.ioc_count, /* persistence */ ptl_ipaddr_2_str (data.ioc_u32[2], buffer[0], 1), /* my ip */ ptl_ipaddr_2_str (data.ioc_u32[0], buffer[1], 1), /* peer ip */ data.ioc_u32[1], /* peer port */ data.ioc_u32[3]); /* conn_count */ else if (g_net_is_compatible(NULL, RANAL, OPENIBNAL, VIBNAL, 0)) - printf (LPX64"[%d]@%s:%d\n", - data.ioc_nid, data.ioc_count, + printf ("%-20s [%d]@%s:%d\n", + libcfs_nid2str(data.ioc_nid), + data.ioc_count, ptl_ipaddr_2_str (data.ioc_u32[0], buffer[1], 1), /* peer ip */ data.ioc_u32[1]); /* peer port */ else - printf (LPX64"[%d]\n", - data.ioc_nid, data.ioc_count); + printf ("%-20s [%d]\n", + libcfs_nid2str(data.ioc_nid), data.ioc_count); } if (index == 0) { @@ -689,32 +691,31 @@ jt_ptl_print_connections (int argc, char **argv) PORTAL_IOC_INIT(data); data.ioc_net = g_net; data.ioc_count = index; - + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_CONN, &data); if (rc != 0) break; if (g_net_is_compatible (NULL, SOCKNAL, 0)) - printf ("[%d]%s:"LPX64"@%s:%d:%s %d/%d %s\n", - data.ioc_u32[4], /* scheduler */ - ptl_ipaddr_2_str (data.ioc_u32[2], buffer[0], 1), /* local IP addr */ - data.ioc_nid, - ptl_ipaddr_2_str (data.ioc_u32[0], buffer[1], 1), /* remote IP addr */ - data.ioc_u32[1], /* remote port */ + printf ("%-20s %s[%d]%s->%s:%d %d/%d %s\n", + libcfs_nid2str(data.ioc_nid), (data.ioc_u32[3] == SOCKNAL_CONN_ANY) ? "A" : (data.ioc_u32[3] == SOCKNAL_CONN_CONTROL) ? "C" : (data.ioc_u32[3] == SOCKNAL_CONN_BULK_IN) ? "I" : (data.ioc_u32[3] == SOCKNAL_CONN_BULK_OUT) ? "O" : "?", + data.ioc_u32[4], /* scheduler */ + ptl_ipaddr_2_str (data.ioc_u32[2], buffer[0], 1), /* local IP addr */ + ptl_ipaddr_2_str (data.ioc_u32[0], buffer[1], 1), /* remote IP addr */ + data.ioc_u32[1], /* remote port */ data.ioc_count, /* tx buffer size */ data.ioc_u32[5], /* rx buffer size */ data.ioc_flags ? "nagle" : "nonagle"); else if (g_net_is_compatible (NULL, RANAL, 0)) - printf ("[%d]"LPX64"@%s:%d\n", - data.ioc_u32[0], /* device id */ - data.ioc_nid); + printf ("%-20s [%d]\n", + libcfs_nid2str(data.ioc_nid), + data.ioc_u32[0] /* device id */); else - printf (LPX64"\n", - data.ioc_nid); + printf ("%s\n", libcfs_nid2str(data.ioc_nid)); } if (index == 0) { -- 1.8.3.1
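libcfs_sock_connect() treats only -EADDRNOTAVAIL as non-fatal, so that ksocknal_connect_peer() can keep walking the reserved port range (1023 down to 513) when a source port is already tied to a differently typed connection to the same peer; a bind failure with -EADDRINUSE is likewise signalled as non-fatal by libcfs_sock_create() and retried at the next port. A rough user-space analogue of that loop is sketched below, assuming POSIX sockets and the privilege needed to bind ports below 1024; connect_reserved() is a hypothetical helper and not part of the patch.

/* Walk the reserved source ports, binding and connecting until one works.
 * Returns a connected fd, or a negative errno.  Sketch only. */
#include <arpa/inet.h>
#include <errno.h>
#include <netinet/in.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int
connect_reserved(uint32_t peer_ip, int peer_port)       /* host byte order */
{
        int port;

        for (port = 1023; port > 512; --port) {
                struct sockaddr_in local, peer;
                int fd = socket(AF_INET, SOCK_STREAM, 0);
                int rc;

                if (fd < 0)
                        return -errno;

                memset(&local, 0, sizeof(local));
                local.sin_family      = AF_INET;
                local.sin_port        = htons(port);
                local.sin_addr.s_addr = INADDR_ANY;

                if (bind(fd, (struct sockaddr *)&local, sizeof(local)) != 0) {
                        rc = -errno;
                        close(fd);
                        if (rc == -EADDRINUSE)          /* non-fatal: next port */
                                continue;
                        return rc;
                }

                memset(&peer, 0, sizeof(peer));
                peer.sin_family      = AF_INET;
                peer.sin_port        = htons(peer_port);
                peer.sin_addr.s_addr = htonl(peer_ip);

                if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) == 0)
                        return fd;                      /* success */

                rc = -errno;
                close(fd);
                /* probably an existing connection of another type already
                 * owns this source port: retry with the next one down */
                if (rc == -EADDRNOTAVAIL)
                        continue;
                return rc;
        }
        return -EADDRINUSE;                             /* range exhausted */
}

In the patch the bind/connect split lives in libcfs_sock_create() and libcfs_sock_connect(), with the *fatal out-parameter telling the caller whether retrying another local port is worthwhile.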