X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Fklnds%2Fsocklnd%2Fsocklnd_lib-linux.c;h=8595e4ce462f555519bcf1f895da3567f5d46666;hp=c4dc1e1ef131572f7e24981662aa22abe0685504;hb=0a9c9e444635dcf35a74bfb2f46efb3040ca17a0;hpb=f9d9a20e1f0de7e7099537ec5a47cef5130e283a diff --git a/lnet/klnds/socklnd/socklnd_lib-linux.c b/lnet/klnds/socklnd/socklnd_lib-linux.c index c4dc1e1..8595e4c 100644 --- a/lnet/klnds/socklnd/socklnd_lib-linux.c +++ b/lnet/klnds/socklnd/socklnd_lib-linux.c @@ -37,218 +37,322 @@ #include "socklnd.h" # if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM -static cfs_sysctl_table_t ksocknal_ctl_table[23]; -cfs_sysctl_table_t ksocknal_top_ctl_table[] = { - { - .ctl_name = 200, - .procname = "socknal", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = ksocknal_ctl_table - }, - { 0 } +#ifndef HAVE_SYSCTL_UNNUMBERED + +enum { + SOCKLND_TIMEOUT = 1, + SOCKLND_CREDITS, + SOCKLND_PEER_CREDITS, + SOCKLND_NCONNDS, + SOCKLND_RECONNECTS_MIN, + SOCKLND_RECONNECTS_MAX, + SOCKLND_EAGER_ACK, + SOCKLND_ZERO_COPY, + SOCKLND_TYPED, + SOCKLND_BULK_MIN, + SOCKLND_RX_BUFFER_SIZE, + SOCKLND_TX_BUFFER_SIZE, + SOCKLND_NAGLE, + SOCKLND_IRQ_AFFINITY, + SOCKLND_ROUND_ROBIN, + SOCKLND_KEEPALIVE, + SOCKLND_KEEPALIVE_IDLE, + SOCKLND_KEEPALIVE_COUNT, + SOCKLND_KEEPALIVE_INTVL, + SOCKLND_BACKOFF_INIT, + SOCKLND_BACKOFF_MAX, + SOCKLND_PROTOCOL, + SOCKLND_ZERO_COPY_RECV, + SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS }; +#else -int -ksocknal_lib_tunables_init () -{ - int i = 0; - int j = 1; - - if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags < 2) - *ksocknal_tunables.ksnd_zc_recv_min_nfrags = 2; - - if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags > LNET_MAX_IOV) - *ksocknal_tunables.ksnd_zc_recv_min_nfrags = LNET_MAX_IOV; +#define SOCKLND_TIMEOUT CTL_UNNUMBERED +#define SOCKLND_CREDITS CTL_UNNUMBERED +#define SOCKLND_PEER_CREDITS CTL_UNNUMBERED +#define SOCKLND_NCONNDS CTL_UNNUMBERED +#define SOCKLND_RECONNECTS_MIN CTL_UNNUMBERED +#define 
SOCKLND_RECONNECTS_MAX CTL_UNNUMBERED +#define SOCKLND_EAGER_ACK CTL_UNNUMBERED +#define SOCKLND_ZERO_COPY CTL_UNNUMBERED +#define SOCKLND_TYPED CTL_UNNUMBERED +#define SOCKLND_BULK_MIN CTL_UNNUMBERED +#define SOCKLND_RX_BUFFER_SIZE CTL_UNNUMBERED +#define SOCKLND_TX_BUFFER_SIZE CTL_UNNUMBERED +#define SOCKLND_NAGLE CTL_UNNUMBERED +#define SOCKLND_IRQ_AFFINITY CTL_UNNUMBERED +#define SOCKLND_ROUND_ROBIN CTL_UNNUMBERED +#define SOCKLND_KEEPALIVE CTL_UNNUMBERED +#define SOCKLND_KEEPALIVE_IDLE CTL_UNNUMBERED +#define SOCKLND_KEEPALIVE_COUNT CTL_UNNUMBERED +#define SOCKLND_KEEPALIVE_INTVL CTL_UNNUMBERED +#define SOCKLND_BACKOFF_INIT CTL_UNNUMBERED +#define SOCKLND_BACKOFF_MAX CTL_UNNUMBERED +#define SOCKLND_PROTOCOL CTL_UNNUMBERED +#define SOCKLND_ZERO_COPY_RECV CTL_UNNUMBERED +#define SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS CTL_UNNUMBERED +#endif - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, +static cfs_sysctl_table_t ksocknal_ctl_table[] = { + { + .ctl_name = SOCKLND_TIMEOUT, .procname = "timeout", - .data = ksocknal_tunables.ksnd_timeout, + .data = &ksocknal_tunables.ksnd_timeout, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec - }; - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_CREDITS, .procname = "credits", - .data = ksocknal_tunables.ksnd_credits, + .data = &ksocknal_tunables.ksnd_credits, .maxlen = sizeof (int), .mode = 0444, - .proc_handler = &proc_dointvec - }; - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_PEER_CREDITS, .procname = "peer_credits", - .data = ksocknal_tunables.ksnd_peercredits, + .data = &ksocknal_tunables.ksnd_peercredits, .maxlen = sizeof (int), .mode = 0444, - .proc_handler = &proc_dointvec - }; - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, + 
.proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_NCONNDS, .procname = "nconnds", - .data = ksocknal_tunables.ksnd_nconnds, + .data = &ksocknal_tunables.ksnd_nconnds, .maxlen = sizeof (int), .mode = 0444, - .proc_handler = &proc_dointvec - }; - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_RECONNECTS_MIN, .procname = "min_reconnectms", - .data = ksocknal_tunables.ksnd_min_reconnectms, + .data = &ksocknal_tunables.ksnd_min_reconnectms, .maxlen = sizeof (int), .mode = 0444, - .proc_handler = &proc_dointvec - }; - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_RECONNECTS_MAX, .procname = "max_reconnectms", - .data = ksocknal_tunables.ksnd_max_reconnectms, + .data = &ksocknal_tunables.ksnd_max_reconnectms, .maxlen = sizeof (int), .mode = 0444, - .proc_handler = &proc_dointvec - }; - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_EAGER_ACK, .procname = "eager_ack", - .data = ksocknal_tunables.ksnd_eager_ack, + .data = &ksocknal_tunables.ksnd_eager_ack, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec - }; - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_ZERO_COPY, .procname = "zero_copy", - .data = ksocknal_tunables.ksnd_zc_min_frag, + .data = &ksocknal_tunables.ksnd_zc_min_payload, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec - }; - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_ZERO_COPY_RECV, .procname =
"zero_copy_recv", - .data = ksocknal_tunables.ksnd_zc_recv, + .data = &ksocknal_tunables.ksnd_zc_recv, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec - }; - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, - .procname = "zero_copy_recv_min_nfrags", - .data = ksocknal_tunables.ksnd_zc_recv_min_nfrags, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + + { + .ctl_name = SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS, + .procname = "zero_copy_recv_min_nfrags", + .data = &ksocknal_tunables.ksnd_zc_recv_min_nfrags, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec - }; - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_TYPED, .procname = "typed", - .data = ksocknal_tunables.ksnd_typed_conns, + .data = &ksocknal_tunables.ksnd_typed_conns, .maxlen = sizeof (int), .mode = 0444, - .proc_handler = &proc_dointvec - }; - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_BULK_MIN, .procname = "min_bulk", - .data = ksocknal_tunables.ksnd_min_bulk, + .data = &ksocknal_tunables.ksnd_min_bulk, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec - }; - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_RX_BUFFER_SIZE, .procname = "rx_buffer_size", - .data = ksocknal_tunables.ksnd_rx_buffer_size, + .data = &ksocknal_tunables.ksnd_rx_buffer_size, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec - }; - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_TX_BUFFER_SIZE, .procname = "tx_buffer_size", - .data = ksocknal_tunables.ksnd_tx_buffer_size,
+ .data = &ksocknal_tunables.ksnd_tx_buffer_size, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec - }; - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_NAGLE, .procname = "nagle", - .data = ksocknal_tunables.ksnd_nagle, + .data = &ksocknal_tunables.ksnd_nagle, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec - }; + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, #ifdef CPU_AFFINITY - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, + { + .ctl_name = SOCKLND_IRQ_AFFINITY, .procname = "irq_affinity", - .data = ksocknal_tunables.ksnd_irq_affinity, + .data = &ksocknal_tunables.ksnd_irq_affinity, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec - }; + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, #endif - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, - .procname = "keepalive_idle", - .data = ksocknal_tunables.ksnd_keepalive_idle, + { + .ctl_name = SOCKLND_ROUND_ROBIN, + .procname = "round_robin", + .data = &ksocknal_tunables.ksnd_round_robin, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec - }; - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, - .procname = "keepalive_count", - .data = ksocknal_tunables.ksnd_keepalive_count, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_KEEPALIVE, + .procname = "keepalive", + .data = &ksocknal_tunables.ksnd_keepalive, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec - }; - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_KEEPALIVE_IDLE, + .procname = "keepalive_idle", + .data = &ksocknal_tunables.ksnd_keepalive_idle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + {
+ .ctl_name = SOCKLND_KEEPALIVE_COUNT, + .procname = "keepalive_count", + .data = &ksocknal_tunables.ksnd_keepalive_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_KEEPALIVE_INTVL, .procname = "keepalive_intvl", - .data = ksocknal_tunables.ksnd_keepalive_intvl, + .data = &ksocknal_tunables.ksnd_keepalive_intvl, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec - }; + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, #ifdef SOCKNAL_BACKOFF - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, + { + .ctl_name = SOCKLND_BACKOFF_INIT, .procname = "backoff_init", - .data = ksocknal_tunables.ksnd_backoff_init, + .data = &ksocknal_tunables.ksnd_backoff_init, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec - }; - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_BACKOFF_MAX, .procname = "backoff_max", - .data = ksocknal_tunables.ksnd_backoff_max, + .data = &ksocknal_tunables.ksnd_backoff_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec - }; + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, #endif #if SOCKNAL_VERSION_DEBUG - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { - .ctl_name = j++, + { + .ctl_name = SOCKLND_PROTOCOL, .procname = "protocol", - .data = ksocknal_tunables.ksnd_protocol, + .data = &ksocknal_tunables.ksnd_protocol, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec - }; + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, #endif - ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { 0 }; + {0} +}; - LASSERT (j == i); - LASSERT (i <= sizeof(ksocknal_ctl_table)/sizeof(ksocknal_ctl_table[0])); + +cfs_sysctl_table_t ksocknal_top_ctl_table[] = { + { + .ctl_name = CTL_SOCKLND, + .procname = "socknal", + 
.data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ksocknal_ctl_table + }, + { 0 } +}; + +int +ksocknal_lib_tunables_init () +{ + if (!*ksocknal_tunables.ksnd_typed_conns) { + int rc = -EINVAL; +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol < 3) + rc = 0; +#endif + if (rc != 0) { + CERROR("Protocol V3.x MUST have typed connections\n"); + return rc; + } + } + + if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags < 2) + *ksocknal_tunables.ksnd_zc_recv_min_nfrags = 2; + if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags > LNET_MAX_IOV) + *ksocknal_tunables.ksnd_zc_recv_min_nfrags = LNET_MAX_IOV; ksocknal_tunables.ksnd_sysctl = cfs_register_sysctl_table(ksocknal_top_ctl_table, 0); @@ -300,13 +404,13 @@ ksocknal_lib_bind_irq (unsigned int irq) info = &ksocknal_data.ksnd_irqinfo[irq]; - write_lock_bh (&ksocknal_data.ksnd_global_lock); + cfs_write_lock_bh (&ksocknal_data.ksnd_global_lock); LASSERT (info->ksni_valid); bind = !info->ksni_bound; info->ksni_bound = 1; - write_unlock_bh (&ksocknal_data.ksnd_global_lock); + cfs_write_unlock_bh (&ksocknal_data.ksnd_global_lock); if (!bind) /* bound already */ return; @@ -377,9 +481,12 @@ ksocknal_lib_sock_irq (struct socket *sock) } int -ksocknal_lib_zc_capable(struct socket *sock) +ksocknal_lib_zc_capable(ksock_conn_t *conn) { - int caps = sock->sk->sk_route_caps; + int caps = conn->ksnc_sock->sk->sk_route_caps; + + if (conn->ksnc_proto == &ksocknal_protocol_v1x) + return 0; /* ZC if the socket supports scatter/gather and doesn't need software * checksums */ @@ -444,15 +551,16 @@ int ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) { struct socket *sock = conn->ksnc_sock; - lnet_kiov_t *kiov = tx->tx_kiov; + lnet_kiov_t *kiov = tx->tx_kiov; int rc; int nob; + /* Not NOOP message */ + LASSERT (tx->tx_lnetmsg != NULL); + /* NB we can't trust socket ops to either consume our iovs * or leave them alone. 
*/ - - if (kiov->kiov_len >= *ksocknal_tunables.ksnd_zc_min_frag && - tx->tx_msg.ksm_zc_req_cookie != 0) { + if (tx->tx_msg.ksm_zc_cookies[0] != 0) { /* Zero copy is enabled */ struct sock *sk = sock->sk; struct page *page = kiov->kiov_page; @@ -697,7 +805,6 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) msg.msg_iovlen = niov; } - LASSERT (nob <= conn->ksnc_rx_nob_wanted); set_fs (KERNEL_DS); @@ -1027,7 +1134,7 @@ ksocknal_data_ready (struct sock *sk, int n) /* interleave correctly with closing sockets... */ LASSERT(!in_irq()); - read_lock (&ksocknal_data.ksnd_global_lock); + cfs_read_lock (&ksocknal_data.ksnd_global_lock); conn = sk->sk_user_data; if (conn == NULL) { /* raced with ksocknal_terminate_conn */ @@ -1036,7 +1143,7 @@ ksocknal_data_ready (struct sock *sk, int n) } else ksocknal_read_callback(conn); - read_unlock (&ksocknal_data.ksnd_global_lock); + cfs_read_unlock (&ksocknal_data.ksnd_global_lock); EXIT; } @@ -1050,7 +1157,7 @@ ksocknal_write_space (struct sock *sk) /* interleave correctly with closing sockets... 
*/ LASSERT(!in_irq()); - read_lock (&ksocknal_data.ksnd_global_lock); + cfs_read_lock (&ksocknal_data.ksnd_global_lock); conn = sk->sk_user_data; wspace = SOCKNAL_WSPACE(sk); @@ -1069,7 +1176,7 @@ ksocknal_write_space (struct sock *sk) LASSERT (sk->sk_write_space != &ksocknal_write_space); sk->sk_write_space (sk); - read_unlock (&ksocknal_data.ksnd_global_lock); + cfs_read_unlock (&ksocknal_data.ksnd_global_lock); return; } @@ -1083,7 +1190,7 @@ ksocknal_write_space (struct sock *sk) clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags); } - read_unlock (&ksocknal_data.ksnd_global_lock); + cfs_read_unlock (&ksocknal_data.ksnd_global_lock); } void @@ -1118,3 +1225,64 @@ ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn) return ; } + +int +ksocknal_lib_memory_pressure(ksock_conn_t *conn) +{ + int rc = 0; + ksock_sched_t *sched; + + sched = conn->ksnc_scheduler; + cfs_spin_lock_bh (&sched->kss_lock); + + if (!SOCK_TEST_NOSPACE(conn->ksnc_sock) && + !conn->ksnc_tx_ready) { + /* SOCK_NOSPACE is set when the socket fills + * and cleared in the write_space callback + * (which also sets ksnc_tx_ready). If + * SOCK_NOSPACE and ksnc_tx_ready are BOTH + * zero, I didn't fill the socket and + * write_space won't reschedule me, so I + * return -ENOMEM to get my caller to retry + * after a timeout */ + rc = -ENOMEM; + } + + cfs_spin_unlock_bh (&sched->kss_lock); + + return rc; +} + +__u64 +ksocknal_lib_new_incarnation(void) +{ + struct timeval tv; + + /* The incarnation number is the time this module loaded and it + * identifies this particular instance of the socknal. 
Hopefully + * we won't be able to reboot more frequently than 1MHz for the + * foreseeable future :) */ + + do_gettimeofday(&tv); + + return (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; +} + +int +ksocknal_lib_bind_thread_to_cpu(int id) +{ +#if defined(CONFIG_SMP) && defined(CPU_AFFINITY) + id = ksocknal_sched2cpu(id); + if (cpu_online(id)) { + cpumask_t m = CPU_MASK_NONE; + cpu_set(id, m); + set_cpus_allowed(current, m); + return 0; + } + + return -1; + +#else + return 0; +#endif +}