Whamcloud - gitweb
LU-10391 socklnd: use interface index to track local addr
[fs/lustre-release.git] / lnet / lnet / acceptor.c
index 4de013a..b11e09d 100644 (file)
@@ -23,7 +23,7 @@
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright (c) 2011, 2016, Intel Corporation.
+ * Copyright (c) 2011, 2017, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
@@ -44,6 +44,14 @@ static struct {
        int                     pta_shutdown;
        struct socket           *pta_sock;
        struct completion       pta_signal;
+       struct net              *pta_ns;
+       wait_queue_head_t       pta_waitq;
+       atomic_t                pta_ready;
+#ifdef HAVE_SK_DATA_READY_ONE_ARG
+       void                    (*pta_odata)(struct sock *);
+#else
+       void                    (*pta_odata)(struct sock *, int);
+#endif
 } lnet_acceptor_state = {
        .pta_shutdown = 1
 };
@@ -63,9 +71,9 @@ lnet_accept_magic(__u32 magic, __u32 constant)
 
 EXPORT_SYMBOL(lnet_acceptor_port);
 
-static char *accept = "secure";
+static char *accept_type = "secure";
 
-module_param(accept, charp, 0444);
+module_param_named(accept, accept_type, charp, 0444);
 MODULE_PARM_DESC(accept, "Accept connections (secure|all|none)");
 module_param(accept_port, int, 0444);
 MODULE_PARM_DESC(accept_port, "Acceptor's port (same on all nodes)");
@@ -74,18 +82,6 @@ MODULE_PARM_DESC(accept_backlog, "Acceptor's listen backlog");
 module_param(accept_timeout, int, 0644);
 MODULE_PARM_DESC(accept_timeout, "Acceptor's timeout (seconds)");
 
-static char *accept_type = NULL;
-
-static int
-lnet_acceptor_get_tunables(void)
-{
-       /* Userland acceptor uses 'accept_type' instead of 'accept', due to
-        * conflict with 'accept(2)', but kernel acceptor still uses 'accept'
-        * for compatibility. Hence the trick. */
-       accept_type = accept;
-       return 0;
-}
-
 int
 lnet_acceptor_timeout(void)
 {
@@ -148,50 +144,43 @@ lnet_connect_console_error (int rc, lnet_nid_t peer_nid,
 }
 EXPORT_SYMBOL(lnet_connect_console_error);
 
-int
-lnet_connect(struct socket **sockp, lnet_nid_t peer_nid,
-           __u32 local_ip, __u32 peer_ip, int peer_port)
+struct socket *
+lnet_connect(lnet_nid_t peer_nid, int interface, __u32 peer_ip,
+            int peer_port, struct net *ns)
 {
-       lnet_acceptor_connreq_t cr;
+       struct lnet_acceptor_connreq cr;
        struct socket           *sock;
        int                     rc;
        int                     port;
-       int                     fatal;
 
-       CLASSERT(sizeof(cr) <= 16);             /* not too big to be on the stack */
+       BUILD_BUG_ON(sizeof(cr) > 16); /* not too big to be on the stack */
 
        for (port = LNET_ACCEPTOR_MAX_RESERVED_PORT;
             port >= LNET_ACCEPTOR_MIN_RESERVED_PORT;
             --port) {
                /* Iterate through reserved ports. */
 
-               rc = lnet_sock_connect(&sock, &fatal,
-                                        local_ip, port,
-                                        peer_ip, peer_port);
-               if (rc != 0) {
-                       if (fatal)
-                               goto failed;
-                       continue;
+               sock = lnet_sock_connect(interface, port,
+                                        peer_ip, peer_port, ns);
+               if (IS_ERR(sock)) {
+                       rc = PTR_ERR(sock);
+                       if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL)
+                               continue;
+                       goto failed;
                }
 
-               CLASSERT(LNET_PROTO_ACCEPTOR_VERSION == 1);
+               BUILD_BUG_ON(LNET_PROTO_ACCEPTOR_VERSION != 1);
 
                cr.acr_magic   = LNET_PROTO_ACCEPTOR_MAGIC;
                cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
                cr.acr_nid     = peer_nid;
 
-               if (the_lnet.ln_testprotocompat != 0) {
+               if (the_lnet.ln_testprotocompat) {
                        /* single-shot proto check */
-                       lnet_net_lock(LNET_LOCK_EX);
-                       if ((the_lnet.ln_testprotocompat & 4) != 0) {
+                       if (test_and_clear_bit(2, &the_lnet.ln_testprotocompat))
                                cr.acr_version++;
-                               the_lnet.ln_testprotocompat &= ~4;
-                       }
-                       if ((the_lnet.ln_testprotocompat & 8) != 0) {
+                       if (test_and_clear_bit(3, &the_lnet.ln_testprotocompat))
                                cr.acr_magic = LNET_PROTO_MAGIC;
-                               the_lnet.ln_testprotocompat &= ~8;
-                       }
-                       lnet_net_unlock(LNET_LOCK_EX);
                }
 
                rc = lnet_sock_write(sock, &cr, sizeof(cr),
@@ -199,8 +188,7 @@ lnet_connect(struct socket **sockp, lnet_nid_t peer_nid,
                if (rc != 0)
                        goto failed_sock;
 
-               *sockp = sock;
-               return 0;
+               return sock;
        }
 
        rc = -EADDRINUSE;
@@ -210,19 +198,19 @@ failed_sock:
        sock_release(sock);
 failed:
        lnet_connect_console_error(rc, peer_nid, peer_ip, peer_port);
-       return rc;
+       return ERR_PTR(rc);
 }
 EXPORT_SYMBOL(lnet_connect);
 
 static int
 lnet_accept(struct socket *sock, __u32 magic)
 {
-       lnet_acceptor_connreq_t cr;
+       struct lnet_acceptor_connreq cr;
        __u32                   peer_ip;
        int                     peer_port;
        int                     rc;
        int                     flip;
-       lnet_ni_t              *ni;
+       struct lnet_ni *ni;
        char                   *str;
 
        LASSERT(sizeof(cr) <= 16);              /* not too big for the stack */
@@ -245,13 +233,12 @@ lnet_accept(struct socket *sock, __u32 magic)
                                               accept_timeout);
 
                        if (rc != 0)
-                               CERROR("Error sending magic+version in response"
-                                      "to LNET magic from %pI4h: %d\n",
+                               CERROR("Error sending magic+version in response to LNET magic from %pI4h: %d\n",
                                       &peer_ip, rc);
                        return -EPROTO;
                }
 
-               if (magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC))
+               if (lnet_accept_magic(magic, LNET_PROTO_TCP_MAGIC))
                        str = "'old' socknal/tcpnal";
                else
                        str = "unrecognised";
@@ -291,15 +278,14 @@ lnet_accept(struct socket *sock, __u32 magic)
                                       accept_timeout);
 
                if (rc != 0)
-                       CERROR("Error sending magic+version in response"
-                              "to version %d from %pI4h: %d\n",
+                       CERROR("Error sending magic+version in response to version %d from %pI4h: %d\n",
                               peer_version, &peer_ip, rc);
                return -EPROTO;
        }
 
        rc = lnet_sock_read(sock, &cr.acr_nid,
                              sizeof(cr) -
-                             offsetof(lnet_acceptor_connreq_t, acr_nid),
+                             offsetof(struct lnet_acceptor_connreq, acr_nid),
                              accept_timeout);
        if (rc != 0) {
                CERROR("Error %d reading connection request from "
@@ -310,8 +296,8 @@ lnet_accept(struct socket *sock, __u32 magic)
        if (flip)
                __swab64s(&cr.acr_nid);
 
-       ni = lnet_net2ni(LNET_NIDNET(cr.acr_nid));
-       if (ni == NULL ||               /* no matching net */
+       ni = lnet_nid2ni_addref(cr.acr_nid);
+       if (ni == NULL ||               /* no matching net */
            ni->ni_nid != cr.acr_nid) { /* right NET, wrong NID! */
                if (ni != NULL)
                        lnet_ni_decref(ni);
@@ -321,7 +307,7 @@ lnet_accept(struct socket *sock, __u32 magic)
                return -EPERM;
        }
 
-       if (ni->ni_lnd->lnd_accept == NULL) {
+       if (ni->ni_net->net_lnd->lnd_accept == NULL) {
                /* This catches a request for the loopback LND */
                lnet_ni_decref(ni);
                LCONSOLE_ERROR_MSG(0x121, "Refusing connection from %pI4h "
@@ -333,12 +319,30 @@ lnet_accept(struct socket *sock, __u32 magic)
        CDEBUG(D_NET, "Accept %s from %pI4h\n",
               libcfs_nid2str(cr.acr_nid), &peer_ip);
 
-       rc = ni->ni_lnd->lnd_accept(ni, sock);
+       rc = ni->ni_net->net_lnd->lnd_accept(ni, sock);
 
        lnet_ni_decref(ni);
        return rc;
 }
 
+#ifdef HAVE_SK_DATA_READY_ONE_ARG
+static void lnet_acceptor_ready(struct sock *sk)
+#else
+static void lnet_acceptor_ready(struct sock *sk, int len)
+#endif
+{
+       /* Pairs with wmb() in lnet_acceptor(): pta_odata is set before use */
+       rmb();
+#ifdef HAVE_SK_DATA_READY_ONE_ARG
+       lnet_acceptor_state.pta_odata(sk);
+#else
+       lnet_acceptor_state.pta_odata(sk, 0);
+#endif
+
+       atomic_set(&lnet_acceptor_state.pta_ready, 1);
+       wake_up(&lnet_acceptor_state.pta_waitq);
+}
+
 static int
 lnet_acceptor(void *arg)
 {
@@ -351,11 +355,11 @@ lnet_acceptor(void *arg)
 
        LASSERT(lnet_acceptor_state.pta_sock == NULL);
 
-       cfs_block_allsigs();
-
-       rc = lnet_sock_listen(&lnet_acceptor_state.pta_sock,
-                               0, accept_port, accept_backlog);
-       if (rc != 0) {
+       lnet_acceptor_state.pta_sock =
+               lnet_sock_listen(accept_port, accept_backlog,
+                                lnet_acceptor_state.pta_ns);
+       if (IS_ERR(lnet_acceptor_state.pta_sock)) {
+               rc = PTR_ERR(lnet_acceptor_state.pta_sock);
                if (rc == -EADDRINUSE)
                        LCONSOLE_ERROR_MSG(0x122, "Can't start acceptor on port"
                                           " %d: port already in use\n",
@@ -367,7 +371,18 @@ lnet_acceptor(void *arg)
 
                lnet_acceptor_state.pta_sock = NULL;
        } else {
+               rc = 0;
                LCONSOLE(0, "Accept %s, port %d\n", accept_type, accept_port);
+               init_waitqueue_head(&lnet_acceptor_state.pta_waitq);
+               lnet_acceptor_state.pta_odata =
+                       lnet_acceptor_state.pta_sock->sk->sk_data_ready;
+               /* ensure pta_odata gets set before there is any chance of
+                * lnet_acceptor_ready() trying to read it.
+                */
+               wmb();
+               lnet_acceptor_state.pta_sock->sk->sk_data_ready =
+                       lnet_acceptor_ready;
+               atomic_set(&lnet_acceptor_state.pta_ready, 1);
        }
 
        /* set init status and unblock parent */
@@ -379,21 +394,25 @@ lnet_acceptor(void *arg)
 
        while (!lnet_acceptor_state.pta_shutdown) {
 
-               rc = lnet_sock_accept(&newsock, lnet_acceptor_state.pta_sock);
+               wait_event_idle(lnet_acceptor_state.pta_waitq,
+                               lnet_acceptor_state.pta_shutdown ||
+                               atomic_read(&lnet_acceptor_state.pta_ready));
+               if (!atomic_read(&lnet_acceptor_state.pta_ready))
+                       continue;
+               atomic_set(&lnet_acceptor_state.pta_ready, 0);
+               rc = kernel_accept(lnet_acceptor_state.pta_sock, &newsock,
+                                  SOCK_NONBLOCK);
                if (rc != 0) {
                        if (rc != -EAGAIN) {
                                CWARN("Accept error %d: pausing...\n", rc);
-                               set_current_state(TASK_UNINTERRUPTIBLE);
-                               schedule_timeout(cfs_time_seconds(1));
+                               schedule_timeout_uninterruptible(
+                                       cfs_time_seconds(1));
                        }
                        continue;
                }
 
-               /* maybe we're waken up with lnet_sock_abort_accept() */
-               if (lnet_acceptor_state.pta_shutdown) {
-                       sock_release(newsock);
-                       break;
-               }
+               /* make sure we call kernel_accept() again, until it fails */
+               atomic_set(&lnet_acceptor_state.pta_ready, 1);
 
                rc = lnet_sock_getaddr(newsock, true, &peer_ip, &peer_port);
                if (rc != 0) {
@@ -425,6 +444,8 @@ failed:
                sock_release(newsock);
        }
 
+       lnet_acceptor_state.pta_sock->sk->sk_data_ready =
+               lnet_acceptor_state.pta_odata;
        sock_release(lnet_acceptor_state.pta_sock);
        lnet_acceptor_state.pta_sock = NULL;
 
@@ -467,24 +488,22 @@ lnet_acceptor_start(void)
 
        LASSERT(lnet_acceptor_state.pta_sock == NULL);
 
-       rc = lnet_acceptor_get_tunables();
-       if (rc != 0)
-               return rc;
-
        init_completion(&lnet_acceptor_state.pta_signal);
        rc = accept2secure(accept_type, &secure);
        if (rc <= 0)
                return rc;
 
-       if (lnet_count_acceptor_nis() == 0)  /* not required */
+       if (lnet_count_acceptor_nets() == 0)  /* not required */
                return 0;
-
+       if (current->nsproxy && current->nsproxy->net_ns)
+               lnet_acceptor_state.pta_ns = current->nsproxy->net_ns;
+       else
+               lnet_acceptor_state.pta_ns = &init_net;
        task = kthread_run(lnet_acceptor, (void *)(uintptr_t)secure,
                           "acceptor_%03ld", secure);
        if (IS_ERR(task)) {
                rc2 = PTR_ERR(task);
                CERROR("Can't start acceptor thread: %ld\n", rc2);
-
                return -ESRCH;
        }
 
@@ -505,17 +524,15 @@ lnet_acceptor_start(void)
 void
 lnet_acceptor_stop(void)
 {
-       struct sock *sk;
-
        if (lnet_acceptor_state.pta_shutdown) /* not running */
                return;
 
-       lnet_acceptor_state.pta_shutdown = 1;
-
-       sk = lnet_acceptor_state.pta_sock->sk;
+       /* If still required, return immediately */
+       if (the_lnet.ln_refcount && lnet_count_acceptor_nets() > 0)
+               return;
 
-       /* awake any sleepers using safe method */
-       sk->sk_state_change(sk);
+       lnet_acceptor_state.pta_shutdown = 1;
+       wake_up(&lnet_acceptor_state.pta_waitq);
 
        /* block until acceptor signals exit */
        wait_for_completion(&lnet_acceptor_state.pta_signal);