1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lnet/klnds/socklnd/socklnd_lib-winnt.c
38 * windows socknal library
43 # if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
44 static ctl_table ksocknal_ctl_table[18]; /* per-tunable entries; filled in at run time by ksocknal_lib_tunables_init() */
46 ctl_table ksocknal_top_ctl_table[] = {
47 {200, "socknal", NULL, 0, 0555, ksocknal_ctl_table}, /* "socknal" entry whose last field points at the per-tunable table above */
52 ksocknal_lib_tunables_init () /* sysctl build: fill ksocknal_ctl_table with one entry per socknal tunable, then register ksocknal_top_ctl_table */
57 ksocknal_ctl_table[i++] = (ctl_table) /* every entry below has the same shape: {j++, name, tunable field, sizeof(int), mode, NULL, &proc_dointvec} */
58 {j++, "timeout", ksocknal_tunables.ksnd_timeout,
59 sizeof (int), 0644, NULL, &proc_dointvec};
60 ksocknal_ctl_table[i++] = (ctl_table)
61 {j++, "credits", ksocknal_tunables.ksnd_credits,
62 sizeof (int), 0444, NULL, &proc_dointvec};
63 ksocknal_ctl_table[i++] = (ctl_table)
64 {j++, "peer_credits", ksocknal_tunables.ksnd_peertxcredits,
65 sizeof (int), 0444, NULL, &proc_dointvec};
66 ksocknal_ctl_table[i++] = (ctl_table)
67 {j++, "peer_buffer_credits", ksocknal_tunables.ksnd_peerrtrcredits,
68 sizeof (int), 0444, NULL, &proc_dointvec};
69 ksocknal_ctl_table[i++] = (ctl_table)
70 {j++, "nconnds", ksocknal_tunables.ksnd_nconnds,
71 sizeof (int), 0444, NULL, &proc_dointvec};
72 ksocknal_ctl_table[i++] = (ctl_table)
73 {j++, "min_reconnectms", ksocknal_tunables.ksnd_min_reconnectms,
74 sizeof (int), 0444, NULL, &proc_dointvec};
75 ksocknal_ctl_table[i++] = (ctl_table)
76 {j++, "max_reconnectms", ksocknal_tunables.ksnd_max_reconnectms,
77 sizeof (int), 0444, NULL, &proc_dointvec};
78 ksocknal_ctl_table[i++] = (ctl_table)
79 {j++, "eager_ack", ksocknal_tunables.ksnd_eager_ack,
80 sizeof (int), 0644, NULL, &proc_dointvec};
82 ksocknal_ctl_table[i++] = (ctl_table)
83 {j++, "zero_copy", ksocknal_tunables.ksnd_zc_min_payload, /* NB: the "zero_copy" name is backed by the ksnd_zc_min_payload field */
84 sizeof (int), 0644, NULL, &proc_dointvec};
86 ksocknal_ctl_table[i++] = (ctl_table)
87 {j++, "typed", ksocknal_tunables.ksnd_typed_conns,
88 sizeof (int), 0444, NULL, &proc_dointvec};
89 ksocknal_ctl_table[i++] = (ctl_table)
90 {j++, "min_bulk", ksocknal_tunables.ksnd_min_bulk,
91 sizeof (int), 0644, NULL, &proc_dointvec};
92 ksocknal_ctl_table[i++] = (ctl_table)
93 {j++, "buffer_size", ksocknal_tunables.ksnd_buffer_size,
94 sizeof(int), 0644, NULL, &proc_dointvec};
95 ksocknal_ctl_table[i++] = (ctl_table)
96 {j++, "nagle", ksocknal_tunables.ksnd_nagle,
97 sizeof(int), 0644, NULL, &proc_dointvec};
99 ksocknal_ctl_table[i++] = (ctl_table)
100 {j++, "irq_affinity", ksocknal_tunables.ksnd_irq_affinity,
101 sizeof(int), 0644, NULL, &proc_dointvec};
103 ksocknal_ctl_table[i++] = (ctl_table)
104 {j++, "keepalive_idle", ksocknal_tunables.ksnd_keepalive_idle,
105 sizeof(int), 0644, NULL, &proc_dointvec};
106 ksocknal_ctl_table[i++] = (ctl_table)
107 {j++, "keepalive_count", ksocknal_tunables.ksnd_keepalive_count,
108 sizeof(int), 0644, NULL, &proc_dointvec};
109 ksocknal_ctl_table[i++] = (ctl_table)
110 {j++, "keepalive_intvl", ksocknal_tunables.ksnd_keepalive_intvl,
111 sizeof(int), 0644, NULL, &proc_dointvec};
114 LASSERT (i < sizeof(ksocknal_ctl_table)/sizeof(ksocknal_ctl_table[0])); /* the entries written must leave at least one array slot unused */
116 ksocknal_tunables.ksnd_sysctl = /* handle is kept so ksocknal_lib_tunables_fini() can unregister it */
117 register_sysctl_table(ksocknal_top_ctl_table, 0);
119 if (ksocknal_tunables.ksnd_sysctl == NULL)
120 CWARN("Can't setup /proc tunables\n"); /* a failed registration is reported with a warning here */
126 ksocknal_lib_tunables_fini () /* sysctl build: undo the registration made by ksocknal_lib_tunables_init() */
128 if (ksocknal_tunables.ksnd_sysctl != NULL) /* only unregister when a table handle was actually saved */
129 unregister_sysctl_table(ksocknal_tunables.ksnd_sysctl);
133 ksocknal_lib_tunables_init ()
139 ksocknal_lib_tunables_fini ()
145 ksocknal_lib_bind_irq (unsigned int irq)
150 ksocknal_lib_get_conn_addrs (ksock_conn_t *conn) /* record the connection's peer IP/port and local IP in conn */
152 int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1, /* second argument 1: peer side (see the error message below) */
153 &conn->ksnc_ipaddr, &conn->ksnc_port);
155 /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
156 LASSERT (!conn->ksnc_closing);
159 CERROR ("Error %d getting sock peer IP\n", rc);
163 rc = libcfs_sock_getaddr(conn->ksnc_sock, 0, /* second argument 0: local side; no port is requested (NULL) */
164 &conn->ksnc_myipaddr, NULL);
166 CERROR ("Error %d getting sock local IP\n", rc);
174 ksocknal_lib_sock_irq (struct socket *sock)
179 #if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC)
181 ksocknal_kvaddr_to_page (unsigned long vaddr) /* map a kernel virtual address to its page; only built when both ZC macros are set */
185 if (vaddr >= VMALLOC_START && /* addresses at or above VMALLOC_START are resolved with vmalloc_to_page() */
187 page = vmalloc_to_page ((void *)vaddr);
188 #ifdef CONFIG_HIGHMEM
189 else if (vaddr >= PKMAP_BASE && /* PKMAP window: [PKMAP_BASE, PKMAP_BASE + LAST_PKMAP * PAGE_SIZE) */
190 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
191 page = vmalloc_to_page ((void *)vaddr);
192 /* in 2.4 ^ just walks the page tables */
195 page = virt_to_page (vaddr); /* presumably the fallback for every other address - the guarding 'else' is not visible here */
207 * Lock the i/o vector buffers into MDL structure
210 * iov: the array of i/o vectors
211 * niov: number of i/o vectors to be locked
212 * len: the real length of the iov vectors
215 * ksock_mdl_t *: the Mdl of the locked buffers or
216 * NULL pointer in failure case
224 IN struct iovec *iov, /* array of i/o vectors to lock; must not be NULL (asserted below) */
233 ksock_mdl_t * mdl = NULL;
234 ksock_mdl_t * tail = NULL;
236 LASSERT(iov != NULL);
238 LASSERT(len != NULL);
240 for (i=0; i < niov; i++) { /* one pass per i/o vector */
242 ksock_mdl_t * Iovec = NULL;
248 recving ? IoWriteAccess : IoReadAccess, /* receive buffers get written, so they are locked for write access; send buffers for read access */
263 total +=iov[i].iov_len; /* running sum of the vector lengths */
270 ks_release_mdl(mdl, FALSE); /* presumably the failure path - the guarding condition is not visible here */
280 * Lock the kiov pages into MDL structure
283 * kiov: the array of kiov pages
284 * nkiov: number of kiov to be locked
285 * len: the real length of the kiov array
288 * PMDL: the Mdl of the locked buffers or NULL
289 * pointer in failure case
296 IN lnet_kiov_t * kiov, /* array of kiov pages to lock; must not be NULL (asserted below) */
304 ksock_mdl_t * mdl = NULL;
305 ksock_mdl_t * tail = NULL;
307 LASSERT(kiov != NULL);
309 LASSERT(len != NULL);
311 for (i=0; i < nkiov; i++) { /* one pass per kiov entry */
313 ksock_mdl_t * Iovec = NULL;
317 // Lock the kiov page into Iovec
321 (PUCHAR)kiov[i].kiov_page->addr + /* buffer address is computed from the page's addr field */
325 recving ? IoWriteAccess : IoReadAccess, /* receive buffers get written, so they are locked for write access; send buffers for read access */
334 // Attach the Iovec to the mdl chain
345 total += kiov[i].kiov_len; /* running sum of the kiov lengths */
353 ks_release_mdl(mdl, FALSE); /* presumably the failure path - the guarding condition is not visible here */
363 ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) /* send the iov part of tx on conn's socket */
365 struct socket *sock = conn->ksnc_sock;
366 #if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC)
367 unsigned long vaddr = (unsigned long)iov->iov_base; /* NOTE(review): no declaration of 'iov' is visible in this function - confirm before enabling this path */
368 int offset = vaddr & (PAGE_SIZE - 1); /* byte offset of vaddr within its page */
369 int zcsize = MIN (iov->iov_len, PAGE_SIZE - offset); /* never more than what is left of that page */
376 /* NB we can't trust socket ops to either consume our iovs
377 * or leave them alone. */
379 #if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC)
380 if (tx->tx_zc_capable &&
381 (sock->sk->sk_route_caps & NETIF_F_SG) &&
382 (sock->sk->sk_route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) &&
383 (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) { /* this branch also needs the buffer address to resolve to a page */
384 int msgflg = MSG_DONTWAIT;
386 CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n",
387 (void *)vaddr, page, page_address(page), offset, zcsize);
389 if (!list_empty (&conn->ksnc_tx_queue) ||
390 zcsize < tx->tx_resid)
393 rc = tcp_sendpage_zccd(sock, page, offset, zcsize, msgflg, &tx->tx_zccd);
397 /* lock the whole tx iovs into a single mdl chain */
398 mdl = ks_lock_iovs(tx->tx_iov, tx->tx_niov, FALSE, &nob); /* FALSE: not receiving; nob is filled in through the last argument */
401 /* send the total mdl chain */
402 rc = ks_send_mdl( conn->ksnc_sock, tx, mdl, nob,
403 (!list_empty (&conn->ksnc_tx_queue) || nob < tx->tx_resid) ? /* more is queued, or this send does not finish the tx: add MSG_MORE */
404 (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT);
414 ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) /* send the kiov (page) part of tx on conn's socket */
416 struct socket *sock = conn->ksnc_sock;
417 lnet_kiov_t *kiov = tx->tx_kiov;
422 /* NB we can't trust socket ops to either consume our iovs
423 * or leave them alone. */
426 if (tx->tx_zc_capable && /* branch that sends a single page with tcp_sendpage_zccd() */
427 (sock->sk->sk_route_caps & NETIF_F_SG) &&
428 (sock->sk->sk_route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM))) {
429 struct page *page = kiov->kiov_page;
430 int offset = kiov->kiov_offset;
431 int fragsize = kiov->kiov_len;
432 int msgflg = MSG_DONTWAIT;
434 CDEBUG(D_NET, "page %p + offset %x for %d\n",
435 page, offset, kiov->kiov_len);
437 if (!list_empty(&conn->ksnc_tx_queue) ||
438 fragsize < tx->tx_resid)
441 rc = tcp_sendpage_zccd(sock, page, offset, fragsize, msgflg,
446 /* lock the whole tx kiovs into a single mdl chain */
447 mdl = ks_lock_kiovs(tx->tx_kiov, tx->tx_nkiov, FALSE, &nob); /* FALSE: not receiving; nob is filled in through the last argument */
450 /* send the total mdl chain */
452 conn->ksnc_sock, tx, mdl, nob,
453 (!list_empty(&conn->ksnc_tx_queue) || nob < tx->tx_resid) ? /* more is queued, or this send does not finish the tx: add MSG_MORE */
454 (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT);
465 ksocknal_lib_recv_iov (ksock_conn_t *conn) /* receive into the connection's rx iov buffers */
467 struct iovec *iov = conn->ksnc_rx_iov;
472 /* lock the whole rx iovs into a single mdl chain */
473 mdl = ks_lock_iovs(iov, conn->ksnc_rx_niov, TRUE, &size); /* TRUE: receiving, so the buffers are locked for write access */
479 LASSERT (size <= conn->ksnc_rx_nob_wanted); /* never lock more than the receiver still wants */
481 /* try to request data for the whole mdl chain */
482 rc = ks_recv_mdl (conn->ksnc_sock, mdl, size, MSG_DONTWAIT);
488 ksocknal_lib_recv_kiov (ksock_conn_t *conn) /* receive into the connection's rx kiov pages */
490 lnet_kiov_t *kiov = conn->ksnc_rx_kiov;
495 /* NB unlike a one-frag-at-a-time receive, the code below locks all
496 * ksnc_rx_nkiov frags into one mdl chain and receives into the whole chain. */
497 LASSERT (conn->ksnc_rx_nkiov > 0);
499 /* lock the whole rx kiovs into a single mdl chain */
500 mdl = ks_lock_kiovs(kiov, conn->ksnc_rx_nkiov, TRUE, &size); /* TRUE: receiving, so the pages are locked for write access */
507 LASSERT (size <= conn->ksnc_rx_nob_wanted); /* never lock more than the receiver still wants */
509 /* try to request data for the whole mdl chain */
510 rc = ks_recv_mdl (conn->ksnc_sock, mdl, size, MSG_DONTWAIT);
516 ksocknal_lib_eager_ack (ksock_conn_t *conn) /* sets the TCP_SOCKET_NODELAY option on conn's socket */
521 rc = ks_set_tcp_option(
522 conn->ksnc_sock, TCP_SOCKET_NODELAY,
523 &option, sizeof(option) ); /* the value assigned to 'option' is on a line not visible here */
525 CERROR("Can't disable nagle: %d\n", rc);
530 ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle) /* report per-connection socket settings through the three out parameters */
532 ksock_tconn_t * tconn = conn->ksnc_sock;
536 ks_get_tconn (tconn); /* paired with ks_put_tconn() below, around the option query */
540 len = sizeof(*nagle);
542 rc = ks_get_tcp_option(
543 tconn, TCP_SOCKET_NODELAY,
544 (__u32 *)nagle, &len); /* the NODELAY setting is read straight into *nagle */
546 ks_put_tconn (tconn);
548 printk("ksocknal_get_conn_tunables: nodelay = %d rc = %d\n", *nagle, rc);
553 *txmem = *rxmem = *nagle = 0; /* NOTE(review): the condition guarding this reset is not visible here - presumably the failure path */
559 ksocknal_lib_buffersize (int current_sz, int tunable_sz) /* choose a socket buffer size from the current size and the tunable */
561 /* ensure >= SOCKNAL_MIN_BUFFER */
562 if (current_sz < SOCKNAL_MIN_BUFFER)
563 return MAX(SOCKNAL_MIN_BUFFER, tunable_sz); /* current size is too small: use the larger of the minimum and the tunable */
565 if (tunable_sz > SOCKNAL_MIN_BUFFER) /* the result returned for this case is on a line not visible here */
573 ksocknal_lib_setup_sock (struct socket *sock) /* apply the socknal nagle and keepalive tunables to a socket */
584 /* set the window size */
587 tconn->kstc_snd_wnd = ksocknal_tunables.ksnd_buffer_size; /* NOTE(review): 'tconn' has no visible declaration here - presumably these two lines sit in a disabled preprocessor region */
588 tconn->kstc_rcv_wnd = ksocknal_tunables.ksnd_buffer_size;
592 if (!ksocknal_tunables.ksnd_nagle) { /* NOTE(review): the keepalive tunables below are dereferenced but ksnd_nagle is not - confirm this should not be !*ksnd_nagle */
595 rc = ks_set_tcp_option(
596 sock, TCP_SOCKET_NODELAY,
597 &option, sizeof (option));
599 printk ("Can't disable nagle: %d\n", rc);
604 /* snapshot tunables */
605 keep_idle = *ksocknal_tunables.ksnd_keepalive_idle;
606 keep_count = *ksocknal_tunables.ksnd_keepalive_count;
607 keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
609 keep_alive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); /* keepalive is requested only when all three tunables are positive */
611 option = (__u32)(keep_alive ? 1 : 0);
613 rc = ks_set_tcp_option(
614 sock, TCP_SOCKET_KEEPALIVE,
615 &option, sizeof (option));
617 CERROR ("Can't set keepalive: %d\n", rc); /* this call sets TCP_SOCKET_KEEPALIVE, so the message must not mention nagle */
625 ksocknal_lib_push_conn (ksock_conn_t *conn) /* clears the connection's recorded nagle flag around a TCP option call, then puts the old value back */
627 ksock_tconn_t * tconn;
632 tconn = conn->ksnc_sock;
636 spin_lock(&tconn->kstc_lock);
637 if (tconn->kstc_type == kstt_sender) { /* the nagle flag lives in a per-type sub-structure: sender or child */
638 nagle = tconn->sender.kstc_info.nagle; /* remember the current value so it can be restored below */
639 tconn->sender.kstc_info.nagle = 0;
641 LASSERT(tconn->kstc_type == kstt_child);
642 nagle = tconn->child.kstc_info.nagle;
643 tconn->child.kstc_info.nagle = 0;
646 spin_unlock(&tconn->kstc_lock);
649 rc = ks_set_tcp_option( /* issued between the two locked sections, i.e. with kstc_lock released */
657 spin_lock(&tconn->kstc_lock);
659 if (tconn->kstc_type == kstt_sender) {
660 tconn->sender.kstc_info.nagle = nagle; /* restore the value saved above */
662 LASSERT(tconn->kstc_type == kstt_child);
663 tconn->child.kstc_info.nagle = nagle;
665 spin_unlock(&tconn->kstc_lock);
670 /* @mode: 0: receiving mode / 1: sending mode */
672 ksocknal_sched_conn (ksock_conn_t *conn, int mode, ksock_tx_t *tx) /* queue conn on its scheduler's tx or rx list (chosen by mode) and signal the scheduler's wait queue */
675 ksock_sched_t * sched;
678 /* interleave correctly with closing sockets... */
679 read_lock (&ksocknal_data.ksnd_global_lock);
681 sched = conn->ksnc_scheduler;
683 spin_lock_irqsave (&sched->kss_lock, flags); /* scheduler lists and the conn's scheduling flags are changed under kss_lock */
685 if (mode) { /* transmission can continue ... */
687 #error "This is out of date - we should be calling ksocknal_write_callback()"
688 conn->ksnc_tx_ready = 1; /* NOTE(review): the #error above implies this statement sits in a disabled preprocessor region whose guard is not visible here */
691 /* Incomplete send: place tx on HEAD of tx_queue */
692 list_add (&tx->tx_list, &conn->ksnc_tx_queue);
695 if ( !conn->ksnc_tx_scheduled &&
696 !list_empty(&conn->ksnc_tx_queue)) { //packets to send
697 list_add_tail (&conn->ksnc_tx_list,
698 &sched->kss_tx_conns);
699 conn->ksnc_tx_scheduled = 1; /* tested above, so the conn is not queued twice */
700 /* extra ref for scheduler */
701 atomic_inc (&conn->ksnc_conn_refcount);
703 cfs_waitq_signal (&sched->kss_waitq);
705 } else { /* receiving can continue ... */
707 conn->ksnc_rx_ready = 1;
709 if ( !conn->ksnc_rx_scheduled) { /* not being progressed */
710 list_add_tail(&conn->ksnc_rx_list,
711 &sched->kss_rx_conns);
712 conn->ksnc_rx_scheduled = 1; /* tested above, so the conn is not queued twice */
713 /* extra ref for scheduler */
714 atomic_inc (&conn->ksnc_conn_refcount);
716 cfs_waitq_signal (&sched->kss_waitq);
720 spin_unlock_irqrestore (&sched->kss_lock, flags);
721 read_unlock (&ksocknal_data.ksnd_global_lock);
726 void ksocknal_schedule_callback(struct socket*sock, int mode, void * tx, ulong_ptr bytes) /* socket-layer hook (installed as kstc_sched_cb) that forwards to ksocknal_sched_conn() */
728 ksock_conn_t * conn = (ksock_conn_t *) sock->kstc_conn; /* the conn that ksocknal_lib_set_callback() stored on the socket */
731 ksocknal_sched_conn(conn, mode, tx);
733 if ( CAN_BE_SCHED(bytes, (ulong_ptr)conn->ksnc_rx_nob_wanted )) { /* CAN_BE_SCHED (defined elsewhere) is given the byte count and what the receiver still wants */
734 ksocknal_sched_conn(conn, mode, tx);
740 ksocknal_tx_launched (ksock_tx_t *tx); /* forward declaration; the definition is not in the visible part of this file */
743 ksocknal_fini_sending(ksock_tcpx_fini_t *tcpx) /* routine handed to ExInitializeWorkItem() further down: completes the tx carried in tcpx */
745 ksocknal_tx_launched(tcpx->tx);
751 struct socket* tconn,
756 ksock_tx_t * tx = (ksock_tx_t *)txp; /* txp is the opaque tx pointer handed back by the socket layer */
759 * the transmission was done; we need to update the tx
762 LASSERT(tx->tx_resid >= (int)rc);
763 tx->tx_resid -= (int)rc; /* rc is subtracted from the bytes still to send */
766 * only part of the tx was sent out; we need to update
767 * the fields of tx and schedule later transmission.
772 if (tx->tx_niov > 0) {
774 /* if there's an iov, we need to process it first */
776 if (rc < tx->tx_iov->iov_len) {
777 /* didn't send whole iov entry... */
778 tx->tx_iov->iov_base = /* advance the entry past the bytes already sent */
779 (char *)(tx->tx_iov->iov_base) + rc;
780 tx->tx_iov->iov_len -= rc;
783 /* the whole of iov was sent out */
784 rc -= tx->tx_iov->iov_len; /* consume this entry's length from the remaining byte count */
792 /* now we need to process the kiov queues ... */
796 if (rc < tx->tx_kiov->kiov_len) {
797 /* didn't send whole kiov entry... */
798 tx->tx_kiov->kiov_offset += rc; /* advance within the page past the bytes already sent */
799 tx->tx_kiov->kiov_len -= rc;
802 /* whole kiov was sent out */
803 rc -= tx->tx_kiov->kiov_len; /* consume this entry's length from the remaining byte count */
812 ksock_tcpx_fini_t * tcpx =
813 cfs_alloc(sizeof(ksock_tcpx_fini_t), CFS_ALLOC_ZERO);
815 ASSERT(tx->tx_resid == 0); /* this part runs only once the whole tx has been sent */
819 ksocknal_tx_launched (tx); /* NOTE(review): presumably the direct path when the tcpx allocation fails - the guarding condition is not visible here */
824 ExInitializeWorkItem(
826 ksocknal_fini_sending, /* work-item routine that calls ksocknal_tx_launched() on tcpx->tx */
842 ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn)
847 ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn) /* attach conn and the socknal callbacks to the socket */
849 sock->kstc_conn = conn; /* read back by ksocknal_schedule_callback() */
850 sock->kstc_sched_cb = ksocknal_schedule_callback;
851 sock->kstc_update_tx = ksocknal_update_tx;
855 ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn) /* detach from the socket: clears the three fields set by ksocknal_lib_set_callback(); conn is not used on the visible lines */
857 sock->kstc_conn = NULL;
858 sock->kstc_sched_cb = NULL;
859 sock->kstc_update_tx = NULL;