Whamcloud - gitweb
Branch HEAD
[fs/lustre-release.git] / lnet / klnds / socklnd / socklnd_lib-linux.c
index e583c9c..6ce7d03 100644 (file)
@@ -55,6 +55,8 @@ enum {
         SOCKLND_TX_BUFFER_SIZE,
         SOCKLND_NAGLE,
         SOCKLND_IRQ_AFFINITY,
+        SOCKLND_ROUND_ROBIN,
+        SOCKLND_KEEPALIVE,
         SOCKLND_KEEPALIVE_IDLE,
         SOCKLND_KEEPALIVE_COUNT,
         SOCKLND_KEEPALIVE_INTVL,
@@ -80,6 +82,8 @@ enum {
 #define SOCKLND_TX_BUFFER_SIZE  CTL_UNNUMBERED
 #define SOCKLND_NAGLE           CTL_UNNUMBERED
 #define SOCKLND_IRQ_AFFINITY    CTL_UNNUMBERED
+#define SOCKLND_ROUND_ROBIN     CTL_UNNUMBERED
+#define SOCKLND_KEEPALIVE       CTL_UNNUMBERED
 #define SOCKLND_KEEPALIVE_IDLE  CTL_UNNUMBERED
 #define SOCKLND_KEEPALIVE_COUNT CTL_UNNUMBERED
 #define SOCKLND_KEEPALIVE_INTVL CTL_UNNUMBERED
@@ -94,7 +98,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         {
                 .ctl_name = SOCKLND_TIMEOUT,
                 .procname = "timeout",
-                .data     = ksocknal_tunables.ksnd_timeout,
+                .data     = &ksocknal_tunables.ksnd_timeout,
                 .maxlen   = sizeof (int),
                 .mode     = 0644,
                 .proc_handler = &proc_dointvec,
@@ -103,7 +107,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         {
                 .ctl_name = SOCKLND_CREDITS,
                 .procname = "credits",
-                .data     = ksocknal_tunables.ksnd_credits,
+                .data     = &ksocknal_tunables.ksnd_credits,
                 .maxlen   = sizeof (int),
                 .mode     = 0444,
                 .proc_handler = &proc_dointvec,
@@ -112,7 +116,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
          {
                 .ctl_name = SOCKLND_PEER_CREDITS,
                 .procname = "peer_credits",
-                .data     = ksocknal_tunables.ksnd_peercredits,
+                .data     = &ksocknal_tunables.ksnd_peercredits,
                 .maxlen   = sizeof (int),
                 .mode     = 0444,
                 .proc_handler = &proc_dointvec,
@@ -121,7 +125,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         {
                 .ctl_name = SOCKLND_NCONNDS,
                 .procname = "nconnds",
-                .data     = ksocknal_tunables.ksnd_nconnds,
+                .data     = &ksocknal_tunables.ksnd_nconnds,
                 .maxlen   = sizeof (int),
                 .mode     = 0444,
                 .proc_handler = &proc_dointvec,
@@ -130,7 +134,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         {
                 .ctl_name = SOCKLND_RECONNECTS_MIN,
                 .procname = "min_reconnectms",
-                .data     = ksocknal_tunables.ksnd_min_reconnectms,
+                .data     = &ksocknal_tunables.ksnd_min_reconnectms,
                 .maxlen   = sizeof (int),
                 .mode     = 0444,
                 .proc_handler = &proc_dointvec,
@@ -139,7 +143,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         {
                 .ctl_name = SOCKLND_RECONNECTS_MAX,
                 .procname = "max_reconnectms",
-                .data     = ksocknal_tunables.ksnd_max_reconnectms,
+                .data     = &ksocknal_tunables.ksnd_max_reconnectms,
                 .maxlen   = sizeof (int),
                 .mode     = 0444,
                 .proc_handler = &proc_dointvec,
@@ -148,7 +152,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         {
                 .ctl_name = SOCKLND_EAGER_ACK,
                 .procname = "eager_ack",
-                .data     = ksocknal_tunables.ksnd_eager_ack,
+                .data     = &ksocknal_tunables.ksnd_eager_ack,
                 .maxlen   = sizeof (int),
                 .mode     = 0644,
                 .proc_handler = &proc_dointvec,
@@ -157,7 +161,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         {
                 .ctl_name = SOCKLND_ZERO_COPY,
                 .procname = "zero_copy",
-                .data     = ksocknal_tunables.ksnd_zc_min_frag,
+                .data     = &ksocknal_tunables.ksnd_zc_min_payload,
                 .maxlen   = sizeof (int),
                 .mode     = 0644,
                 .proc_handler = &proc_dointvec,
@@ -166,7 +170,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         {
                 .ctl_name = SOCKLND_ZERO_COPY_RECV,
                 .procname = "zero_copy_recv",
-                .data     = ksocknal_tunables.ksnd_zc_recv,
+                .data     = &ksocknal_tunables.ksnd_zc_recv,
                 .maxlen   = sizeof (int),
                 .mode     = 0644,
                 .proc_handler = &proc_dointvec,
@@ -176,7 +180,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         {
                 .ctl_name = SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS,
                 .procname = "zero_copy_recv",
-                .data     = ksocknal_tunables.ksnd_zc_recv_min_nfrags
+                .data     = &ksocknal_tunables.ksnd_zc_recv_min_nfrags,
                 .maxlen   = sizeof (int),
                 .mode     = 0644,
                 .proc_handler = &proc_dointvec,
@@ -185,7 +189,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         {
                 .ctl_name = SOCKLND_TYPED,
                 .procname = "typed",
-                .data     = ksocknal_tunables.ksnd_typed_conns,
+                .data     = &ksocknal_tunables.ksnd_typed_conns,
                 .maxlen   = sizeof (int),
                 .mode     = 0444,
                 .proc_handler = &proc_dointvec,
@@ -194,7 +198,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         {
                 .ctl_name = SOCKLND_BULK_MIN,
                 .procname = "min_bulk",
-                .data     = ksocknal_tunables.ksnd_min_bulk,
+                .data     = &ksocknal_tunables.ksnd_min_bulk,
                 .maxlen   = sizeof (int),
                 .mode     = 0644,
                 .proc_handler = &proc_dointvec,
@@ -203,7 +207,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         {
                 .ctl_name = SOCKLND_RX_BUFFER_SIZE,
                 .procname = "rx_buffer_size",
-                .data     = ksocknal_tunables.ksnd_rx_buffer_size,
+                .data     = &ksocknal_tunables.ksnd_rx_buffer_size,
                 .maxlen   = sizeof(int),
                 .mode     = 0644,
                 .proc_handler = &proc_dointvec,
@@ -212,7 +216,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         {
                 .ctl_name = SOCKLND_TX_BUFFER_SIZE,
                 .procname = "tx_buffer_size",
-                .data     = ksocknal_tunables.ksnd_tx_buffer_size,
+                .data     = &ksocknal_tunables.ksnd_tx_buffer_size,
                 .maxlen   = sizeof(int),
                 .mode     = 0644,
                 .proc_handler = &proc_dointvec,
@@ -221,7 +225,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         {
                 .ctl_name = SOCKLND_NAGLE,
                 .procname = "nagle",
-                .data     = ksocknal_tunables.ksnd_nagle,
+                .data     = &ksocknal_tunables.ksnd_nagle,
                 .maxlen   = sizeof(int),
                 .mode     = 0644,
                 .proc_handler = &proc_dointvec,
@@ -231,7 +235,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         {
                 .ctl_name = SOCKLND_IRQ_AFFINITY,
                 .procname = "irq_affinity",
-                .data     = ksocknal_tunables.ksnd_irq_affinity,
+                .data     = &ksocknal_tunables.ksnd_irq_affinity,
                 .maxlen   = sizeof(int),
                 .mode     = 0644,
                 .proc_handler = &proc_dointvec,
@@ -239,9 +243,27 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         },
 #endif
         {
+                .ctl_name = SOCKLND_ROUND_ROBIN,
+                .procname = "round_robin",
+                .data     = &ksocknal_tunables.ksnd_round_robin,
+                .maxlen   = sizeof(int),
+                .mode     = 0644,
+                .proc_handler = &proc_dointvec,
+                .strategy = &sysctl_intvec,
+        },
+        {
+                .ctl_name = SOCKLND_KEEPALIVE,
+                .procname = "keepalive",
+                .data     = &ksocknal_tunables.ksnd_keepalive,
+                .maxlen   = sizeof(int),
+                .mode     = 0644,
+                .proc_handler = &proc_dointvec,
+                .strategy = &sysctl_intvec,
+        },
+        {
                 .ctl_name = SOCKLND_KEEPALIVE_IDLE,
                 .procname = "keepalive_idle",
-                .data     = ksocknal_tunables.ksnd_keepalive_idle,
+                .data     = &ksocknal_tunables.ksnd_keepalive_idle,
                 .maxlen   = sizeof(int),
                 .mode     = 0644,
                 .proc_handler = &proc_dointvec,
@@ -250,7 +272,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         {
                 .ctl_name = SOCKLND_KEEPALIVE_COUNT,
                 .procname = "keepalive_count",
-                .data     = ksocknal_tunables.ksnd_keepalive_count,
+                .data     = &ksocknal_tunables.ksnd_keepalive_count,
                 .maxlen   = sizeof(int),
                 .mode     = 0644,
                 .proc_handler = &proc_dointvec,
@@ -259,7 +281,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         {
                 .ctl_name = SOCKLND_KEEPALIVE_INTVL,
                 .procname = "keepalive_intvl",
-                .data     = ksocknal_tunables.ksnd_keepalive_intvl,
+                .data     = &ksocknal_tunables.ksnd_keepalive_intvl,
                 .maxlen   = sizeof(int),
                 .mode     = 0644,
                 .proc_handler = &proc_dointvec,
@@ -269,7 +291,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         {
                 .ctl_name = SOCKLND_BACKOFF_INIT,
                 .procname = "backoff_init",
-                .data     = ksocknal_tunables.ksnd_backoff_init,
+                .data     = &ksocknal_tunables.ksnd_backoff_init,
                 .maxlen   = sizeof(int),
                 .mode     = 0644,
                 .proc_handler = &proc_dointvec,
@@ -278,7 +300,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         {
                 .ctl_name = SOCKLND_BACKOFF_MAX,
                 .procname = "backoff_max",
-                .data     = ksocknal_tunables.ksnd_backoff_max,
+                .data     = &ksocknal_tunables.ksnd_backoff_max,
                 .maxlen   = sizeof(int),
                 .mode     = 0644,
                 .proc_handler = &proc_dointvec,
@@ -289,7 +311,7 @@ static cfs_sysctl_table_t ksocknal_ctl_table[] = {
         {
                 .ctl_name = SOCKLND_PROTOCOL,
                 .procname = "protocol",
-                .data     = ksocknal_tunables.ksnd_protocol,
+                .data     = &ksocknal_tunables.ksnd_protocol,
                 .maxlen   = sizeof(int),
                 .mode     = 0644,
                 .proc_handler = &proc_dointvec,
@@ -315,6 +337,18 @@ cfs_sysctl_table_t ksocknal_top_ctl_table[] = {
 int
 ksocknal_lib_tunables_init ()
 {
+        if (!*ksocknal_tunables.ksnd_typed_conns) {
+                int rc = -EINVAL;
+#if SOCKNAL_VERSION_DEBUG
+                if (*ksocknal_tunables.ksnd_protocol < 3)
+                        rc = 0;
+#endif
+                if (rc != 0) {
+                        CERROR("Protocol V3.x MUST have typed connections\n");
+                        return rc;
+                }
+        }
+
         if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags < 2)
                 *ksocknal_tunables.ksnd_zc_recv_min_nfrags = 2;
         if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags > LNET_MAX_IOV)
@@ -370,13 +404,13 @@ ksocknal_lib_bind_irq (unsigned int irq)
 
         info = &ksocknal_data.ksnd_irqinfo[irq];
 
-        write_lock_bh (&ksocknal_data.ksnd_global_lock);
+        cfs_write_lock_bh (&ksocknal_data.ksnd_global_lock);
 
         LASSERT (info->ksni_valid);
         bind = !info->ksni_bound;
         info->ksni_bound = 1;
 
-        write_unlock_bh (&ksocknal_data.ksnd_global_lock);
+        cfs_write_unlock_bh (&ksocknal_data.ksnd_global_lock);
 
         if (!bind)                              /* bound already */
                 return;
@@ -447,9 +481,12 @@ ksocknal_lib_sock_irq (struct socket *sock)
 }
 
 int
-ksocknal_lib_zc_capable(struct socket *sock)
+ksocknal_lib_zc_capable(ksock_conn_t *conn)
 {
-        int  caps = sock->sk->sk_route_caps;
+        int  caps = conn->ksnc_sock->sk->sk_route_caps;
+
+        if (conn->ksnc_proto == &ksocknal_protocol_v1x)
+                return 0;
 
         /* ZC if the socket supports scatter/gather and doesn't need software
          * checksums */
@@ -514,15 +551,16 @@ int
 ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
 {
         struct socket *sock = conn->ksnc_sock;
-        lnet_kiov_t    *kiov = tx->tx_kiov;
+        lnet_kiov_t   *kiov = tx->tx_kiov;
         int            rc;
         int            nob;
 
+        /* Not NOOP message */
+        LASSERT (tx->tx_lnetmsg != NULL);
+
         /* NB we can't trust socket ops to either consume our iovs
          * or leave them alone. */
-
-        if (kiov->kiov_len >= *ksocknal_tunables.ksnd_zc_min_frag &&
-            tx->tx_msg.ksm_zc_req_cookie != 0) {
+        if (tx->tx_msg.ksm_zc_cookies[0] != 0) {
                 /* Zero copy is enabled */
                 struct sock   *sk = sock->sk;
                 struct page   *page = kiov->kiov_page;
@@ -1096,7 +1134,7 @@ ksocknal_data_ready (struct sock *sk, int n)
 
         /* interleave correctly with closing sockets... */
         LASSERT(!in_irq());
-        read_lock (&ksocknal_data.ksnd_global_lock);
+        cfs_read_lock (&ksocknal_data.ksnd_global_lock);
 
         conn = sk->sk_user_data;
         if (conn == NULL) {             /* raced with ksocknal_terminate_conn */
@@ -1105,7 +1143,7 @@ ksocknal_data_ready (struct sock *sk, int n)
         } else
                 ksocknal_read_callback(conn);
 
-        read_unlock (&ksocknal_data.ksnd_global_lock);
+        cfs_read_unlock (&ksocknal_data.ksnd_global_lock);
 
         EXIT;
 }
@@ -1119,7 +1157,7 @@ ksocknal_write_space (struct sock *sk)
 
         /* interleave correctly with closing sockets... */
         LASSERT(!in_irq());
-        read_lock (&ksocknal_data.ksnd_global_lock);
+        cfs_read_lock (&ksocknal_data.ksnd_global_lock);
 
         conn = sk->sk_user_data;
         wspace = SOCKNAL_WSPACE(sk);
@@ -1138,7 +1176,7 @@ ksocknal_write_space (struct sock *sk)
                 LASSERT (sk->sk_write_space != &ksocknal_write_space);
                 sk->sk_write_space (sk);
 
-                read_unlock (&ksocknal_data.ksnd_global_lock);
+                cfs_read_unlock (&ksocknal_data.ksnd_global_lock);
                 return;
         }
 
@@ -1152,7 +1190,7 @@ ksocknal_write_space (struct sock *sk)
                 clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags);
         }
 
-        read_unlock (&ksocknal_data.ksnd_global_lock);
+        cfs_read_unlock (&ksocknal_data.ksnd_global_lock);
 }
 
 void
@@ -1187,3 +1225,64 @@ ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
 
         return ;
 }
+
+int
+ksocknal_lib_memory_pressure(ksock_conn_t *conn)
+{
+        int            rc = 0;
+        ksock_sched_t *sched;
+        
+        sched = conn->ksnc_scheduler;
+        cfs_spin_lock_bh (&sched->kss_lock);
+        
+        if (!SOCK_TEST_NOSPACE(conn->ksnc_sock) &&
+            !conn->ksnc_tx_ready) {
+                /* SOCK_NOSPACE is set when the socket fills
+                 * and cleared in the write_space callback
+                 * (which also sets ksnc_tx_ready).  If
+                 * SOCK_NOSPACE and ksnc_tx_ready are BOTH
+                 * zero, I didn't fill the socket and
+                 * write_space won't reschedule me, so I
+                 * return -ENOMEM to get my caller to retry
+                 * after a timeout */
+                rc = -ENOMEM;
+        }
+        
+        cfs_spin_unlock_bh (&sched->kss_lock);
+
+        return rc;
+}
+
+__u64
+ksocknal_lib_new_incarnation(void)
+{
+        struct timeval tv;
+
+        /* The incarnation number is the time this module loaded and it
+         * identifies this particular instance of the socknal.  Hopefully
+         * we won't be able to reboot more frequently than 1MHz for the
+         * forseeable future :) */
+
+        do_gettimeofday(&tv);
+
+        return (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+}
+
+int
+ksocknal_lib_bind_thread_to_cpu(int id)
+{
+#if defined(CONFIG_SMP) && defined(CPU_AFFINITY)
+        id = ksocknal_sched2cpu(id);
+        if (cpu_online(id)) {
+                cpumask_t m = CPU_MASK_NONE;
+                cpu_set(id, m);
+                set_cpus_allowed(current, m);
+                return 0;
+        }
+
+        return -1;
+
+#else
+        return 0;
+#endif
+}