1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2006 Cluster File Systems, Inc, All rights reserved.
7 * This file is part of Lustre, http://www.lustre.org.
9 * This Lustre Software is proprietary - please refer to the license
10 * agreement you received with your software.
12 * windows socknal library
18 # if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
/* Leaf sysctl table with 18 slots; the entries are assigned at run time
 * by ksocknal_lib_tunables_init(). */
19 static ctl_table ksocknal_ctl_table[18];
/* Top-level table: one "socknal" entry that references ksocknal_ctl_table.
 * NOTE(review): field order presumably follows the legacy Linux ctl_table
 * layout (id, name, data, maxlen, mode, child) - confirm. */
21 ctl_table ksocknal_top_ctl_table[] = {
22 {200, "socknal", NULL, 0, 0555, ksocknal_ctl_table},
/*
 * Fill ksocknal_ctl_table with one entry per socknal tunable, register
 * ksocknal_top_ctl_table and keep the returned handle in
 * ksocknal_tunables.ksnd_sysctl.  A failed registration is only reported
 * through CWARN.
 * NOTE(review): the declarations of i and j and the return statement are
 * not visible in this view.
 */
27 ksocknal_lib_tunables_init ()
/* Every entry takes its first field from j++, names the tunable, points at
 * the matching ksocknal_tunables member, is sizeof(int) wide and is handled
 * by proc_dointvec; the mode is either 0644 or 0444. */
32 ksocknal_ctl_table[i++] = (ctl_table)
33 {j++, "timeout", ksocknal_tunables.ksnd_timeout,
34 sizeof (int), 0644, NULL, &proc_dointvec};
35 ksocknal_ctl_table[i++] = (ctl_table)
36 {j++, "credits", ksocknal_tunables.ksnd_credits,
37 sizeof (int), 0444, NULL, &proc_dointvec};
38 ksocknal_ctl_table[i++] = (ctl_table)
39 {j++, "peer_credits", ksocknal_tunables.ksnd_peercredits,
40 sizeof (int), 0444, NULL, &proc_dointvec};
41 ksocknal_ctl_table[i++] = (ctl_table)
42 {j++, "nconnds", ksocknal_tunables.ksnd_nconnds,
43 sizeof (int), 0444, NULL, &proc_dointvec};
44 ksocknal_ctl_table[i++] = (ctl_table)
45 {j++, "min_reconnectms", ksocknal_tunables.ksnd_min_reconnectms,
46 sizeof (int), 0444, NULL, &proc_dointvec};
47 ksocknal_ctl_table[i++] = (ctl_table)
48 {j++, "max_reconnectms", ksocknal_tunables.ksnd_max_reconnectms,
49 sizeof (int), 0444, NULL, &proc_dointvec};
50 ksocknal_ctl_table[i++] = (ctl_table)
51 {j++, "eager_ack", ksocknal_tunables.ksnd_eager_ack,
52 sizeof (int), 0644, NULL, &proc_dointvec};
/* NOTE(review): the gaps in the original numbering around "zero_copy" and
 * "irq_affinity" suggest these entries sit inside preprocessor guards that
 * are not visible here - confirm. */
54 ksocknal_ctl_table[i++] = (ctl_table)
55 {j++, "zero_copy", ksocknal_tunables.ksnd_zc_min_frag,
56 sizeof (int), 0644, NULL, &proc_dointvec};
58 ksocknal_ctl_table[i++] = (ctl_table)
59 {j++, "typed", ksocknal_tunables.ksnd_typed_conns,
60 sizeof (int), 0444, NULL, &proc_dointvec};
61 ksocknal_ctl_table[i++] = (ctl_table)
62 {j++, "min_bulk", ksocknal_tunables.ksnd_min_bulk,
63 sizeof (int), 0644, NULL, &proc_dointvec};
64 ksocknal_ctl_table[i++] = (ctl_table)
65 {j++, "buffer_size", ksocknal_tunables.ksnd_buffer_size,
66 sizeof(int), 0644, NULL, &proc_dointvec};
67 ksocknal_ctl_table[i++] = (ctl_table)
68 {j++, "nagle", ksocknal_tunables.ksnd_nagle,
69 sizeof(int), 0644, NULL, &proc_dointvec};
71 ksocknal_ctl_table[i++] = (ctl_table)
72 {j++, "irq_affinity", ksocknal_tunables.ksnd_irq_affinity,
73 sizeof(int), 0644, NULL, &proc_dointvec};
75 ksocknal_ctl_table[i++] = (ctl_table)
76 {j++, "keepalive_idle", ksocknal_tunables.ksnd_keepalive_idle,
77 sizeof(int), 0644, NULL, &proc_dointvec};
78 ksocknal_ctl_table[i++] = (ctl_table)
79 {j++, "keepalive_count", ksocknal_tunables.ksnd_keepalive_count,
80 sizeof(int), 0644, NULL, &proc_dointvec};
81 ksocknal_ctl_table[i++] = (ctl_table)
82 {j++, "keepalive_intvl", ksocknal_tunables.ksnd_keepalive_intvl,
83 sizeof(int), 0644, NULL, &proc_dointvec};
/* The fill index must still be below the element count of the table, i.e.
 * at least one slot stays unassigned (presumably the terminating entry). */
86 LASSERT (i < sizeof(ksocknal_ctl_table)/sizeof(ksocknal_ctl_table[0]));
88 ksocknal_tunables.ksnd_sysctl =
89 register_sysctl_table(ksocknal_top_ctl_table, 0);
/* Registration failure is not fatal here: it is only logged. */
91 if (ksocknal_tunables.ksnd_sysctl == NULL)
92 CWARN("Can't setup /proc tunables\n");
/* Undo ksocknal_lib_tunables_init(): unregister the sysctl table, but only
 * when a handle was stored in ksocknal_tunables.ksnd_sysctl. */
98 ksocknal_lib_tunables_fini ()
100 if (ksocknal_tunables.ksnd_sysctl != NULL)
101 unregister_sysctl_table(ksocknal_tunables.ksnd_sysctl);
105 ksocknal_lib_tunables_init ()
111 ksocknal_lib_tunables_fini ()
117 ksocknal_lib_bind_irq (unsigned int irq)
/*
 * Read the addresses of conn's socket with two libcfs_sock_getaddr() calls:
 * the first (second argument 1) fills ksnc_ipaddr and ksnc_port, the second
 * (second argument 0) fills ksnc_myipaddr only.  Going by the error
 * messages, the first call is the peer address and the second the local one.
 * NOTE(review): the rc checks and return statements are not visible here.
 */
122 ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
124 int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1,
125 &conn->ksnc_ipaddr, &conn->ksnc_port);
127 /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
128 LASSERT (!conn->ksnc_closing);
131 CERROR ("Error %d getting sock peer IP\n", rc);
/* The port argument is NULL: only the local IP address is requested. */
135 rc = libcfs_sock_getaddr(conn->ksnc_sock, 0,
136 &conn->ksnc_myipaddr, NULL);
138 CERROR ("Error %d getting sock local IP\n", rc);
146 ksocknal_lib_sock_irq (struct socket *sock)
151 #if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC)
/*
 * Map a kernel virtual address to its page.  Addresses starting at
 * VMALLOC_START (the upper bound of that test is not visible here) and
 * addresses inside the PKMAP window are resolved with vmalloc_to_page();
 * virt_to_page() is used otherwise (presumably in the final else branch).
 */
153 ksocknal_kvaddr_to_page (unsigned long vaddr)
157 if (vaddr >= VMALLOC_START &&
159 page = vmalloc_to_page ((void *)vaddr);
/* PKMAP window: LAST_PKMAP pages starting at PKMAP_BASE. */
161 else if (vaddr >= PKMAP_BASE &&
162 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
163 page = vmalloc_to_page ((void *)vaddr);
164 /* in 2.4 ^ just walks the page tables */
167 page = virt_to_page (vaddr);
179 * Lock the i/o vector buffers into MDL structure
182 * iov: the array of i/o vectors
183 * niov: number of i/o vectors to be locked
184 * len: the real length of the iov vectors
187 * ksock_mdl_t *: the Mdl of the locked buffers or
188 * NULL pointer in failure case
196 IN struct iovec *iov,
/* mdl is the result returned to the caller and is what gets released on
 * failure; tail is presumably the last element appended so far - confirm. */
205 ksock_mdl_t * mdl = NULL;
206 ksock_mdl_t * tail = NULL;
/* Both the vector array and the output length pointer are mandatory. */
208 LASSERT(iov != NULL);
210 LASSERT(len != NULL);
/* One pass per i/o vector; Iovec holds the MDL built for the current one. */
212 for (i=0; i < niov; i++) {
214 ksock_mdl_t * Iovec = NULL;
/* Receive buffers get written by the transport, send buffers only read. */
220 recving ? IoWriteAccess : IoReadAccess,
/* Accumulate the number of bytes covered so far. */
235 total +=iov[i].iov_len;
/* NOTE(review): presumably the failure path - drops what was built so far. */
242 ks_release_mdl(mdl, FALSE);
252 * Lock the kiov pages into MDL structure
255 * kiov: the array of kiov pages
256 * niov: number of kiov to be locked
257 * len: the real length of the kiov array
260 * PMDL: the Mdl of the locked buffers or NULL
261 * pointer in failure case
268 IN lnet_kiov_t * kiov,
/* mdl is the result returned to the caller and is what gets released on
 * failure; tail is presumably the last element appended so far - confirm. */
276 ksock_mdl_t * mdl = NULL;
277 ksock_mdl_t * tail = NULL;
/* Both the kiov array and the output length pointer are mandatory. */
279 LASSERT(kiov != NULL);
281 LASSERT(len != NULL);
/* One pass per kiov entry; Iovec holds the MDL built for the current page. */
283 for (i=0; i < nkiov; i++) {
285 ksock_mdl_t * Iovec = NULL;
289 // Lock the kiov page into Iovec
/* The buffer address is derived from the page's addr field plus a further
 * term that is not visible in this view. */
293 (PUCHAR)kiov[i].kiov_page->addr +
/* Receive buffers get written by the transport, send buffers only read. */
297 recving ? IoWriteAccess : IoReadAccess,
306 // Attach the Iovec to the mdl chain
/* Accumulate the number of bytes covered so far. */
317 total += kiov[i].kiov_len;
/* NOTE(review): presumably the failure path - drops what was built so far. */
325 ks_release_mdl(mdl, FALSE);
/*
 * Send the iovec fragments of tx over conn's socket.
 *
 * Default path: all tx->tx_niov iovecs are locked into one MDL chain by
 * ks_lock_iovs() and handed to ks_send_mdl().  MSG_MORE is added to
 * MSG_DONTWAIT when further transmits are queued on the connection or the
 * chain covers fewer bytes than tx->tx_resid.
 *
 * Zero-copy path (only built when SOCKNAL_ZC && SOCKNAL_VADDR_ZC): a single
 * page-bounded piece of the first iovec is sent with tcp_sendpage_zccd().
 * NOTE(review): that path reads `iov`, whose declaration is not visible in
 * this view, and tests ksocknal_data.ksnd_zc_min_frag whereas
 * ksocknal_lib_send_kiov() tests *ksocknal_tunables.ksnd_zc_min_frag -
 * confirm which one is current before enabling it.
 */
335 ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
337 struct socket *sock = conn->ksnc_sock;
338 #if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC)
/* The terminating ';' was missing from this declaration. */
339 unsigned long vaddr = (unsigned long)iov->iov_base;
/* Offset inside the page, and the bytes that fit before the page ends. */
340 int offset = vaddr & (PAGE_SIZE - 1);
341 int zcsize = MIN (iov->iov_len, PAGE_SIZE - offset);
348 /* NB we can't trust socket ops to either consume our iovs
349 * or leave them alone. */
351 #if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC)
/* Zero-copy needs a large enough piece, NETIF_F_SG plus one of the
 * NETIF_F_*_CSUM capability bits on the route, and a resolvable page. */
352 if (zcsize >= ksocknal_data.ksnd_zc_min_frag &&
353 (sock->sk->sk_route_caps & NETIF_F_SG) &&
354 (sock->sk->sk_route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) &&
355 (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) {
356 int msgflg = MSG_DONTWAIT;
358 CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n",
359 (void *)vaddr, page, page_address(page), offset, zcsize);
/* More data follows when other transmits are queued or this piece does not
 * finish the tx (the statement under this test is not visible here). */
361 if (!list_empty (&conn->ksnc_tx_queue) ||
362 zcsize < tx->tx_resid)
365 rc = tcp_sendpage_zccd(sock, page, offset, zcsize, msgflg, &tx->tx_zccd);
369 /* lock the whole tx iovs into a single mdl chain */
370 mdl = ks_lock_iovs(tx->tx_iov, tx->tx_niov, FALSE, &nob);
373 /* send the total mdl chain */
374 rc = ks_send_mdl( conn->ksnc_sock, tx, mdl, nob,
375 (!list_empty (&conn->ksnc_tx_queue) || nob < tx->tx_resid) ?
376 (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT);
/*
 * Send the page (kiov) fragments of tx over conn's socket.
 *
 * Default path: all tx->tx_nkiov entries are locked into one MDL chain by
 * ks_lock_kiovs() and sent as a whole.  MSG_MORE is added to MSG_DONTWAIT
 * when further transmits are queued on the connection or the chain covers
 * fewer bytes than tx->tx_resid.
 *
 * The tcp_sendpage_zccd() branch is presumably guarded by a preprocessor
 * conditional that is not visible in this view - confirm.
 */
386 ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
388 struct socket *sock = conn->ksnc_sock;
389 lnet_kiov_t *kiov = tx->tx_kiov;
394 /* NB we can't trust socket ops to either consume our iovs
395 * or leave them alone. */
/* Zero-copy needs a fragment of at least *ksnd_zc_min_frag bytes and
 * NETIF_F_SG plus one of the NETIF_F_*_CSUM capability bits on the route. */
398 if (kiov->kiov_len >= *ksocknal_tunables.ksnd_zc_min_frag &&
399 (sock->sk->sk_route_caps & NETIF_F_SG) &&
400 (sock->sk->sk_route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM))) {
401 struct page *page = kiov->kiov_page;
402 int offset = kiov->kiov_offset;
403 int fragsize = kiov->kiov_len;
404 int msgflg = MSG_DONTWAIT;
406 CDEBUG(D_NET, "page %p + offset %x for %d\n",
407 page, offset, kiov->kiov_len);
/* More data follows when other transmits are queued or this fragment does
 * not finish the tx (the statement under this test is not visible here). */
409 if (!list_empty(&conn->ksnc_tx_queue) ||
410 fragsize < tx->tx_resid)
413 rc = tcp_sendpage_zccd(sock, page, offset, fragsize, msgflg,
418 /* lock the whole tx kiovs into a single mdl chain */
419 mdl = ks_lock_kiovs(tx->tx_kiov, tx->tx_nkiov, FALSE, &nob);
422 /* send the total mdl chain */
424 conn->ksnc_sock, tx, mdl, nob,
425 (!list_empty(&conn->ksnc_tx_queue) || nob < tx->tx_resid) ?
426 (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT);
/*
 * Receive into the connection's pending iovecs: lock all ksnc_rx_niov
 * entries of ksnc_rx_iov into one MDL chain (write access, recving = TRUE)
 * and ask ks_recv_mdl() for `size` bytes without blocking.
 * NOTE(review): the handling of a NULL mdl and of rc is not visible here.
 */
437 ksocknal_lib_recv_iov (ksock_conn_t *conn)
439 struct iovec *iov = conn->ksnc_rx_iov;
444 /* lock the whole rx iovs into a single mdl chain */
445 mdl = ks_lock_iovs(iov, conn->ksnc_rx_niov, TRUE, &size);
/* The locked chain must not cover more than the receiver still wants. */
451 LASSERT (size <= conn->ksnc_rx_nob_wanted);
453 /* try to request data for the whole mdl chain */
454 rc = ks_recv_mdl (conn->ksnc_sock, mdl, size, MSG_DONTWAIT);
/*
 * Receive into the connection's pending page fragments: lock all
 * ksnc_rx_nkiov entries of ksnc_rx_kiov into one MDL chain (write access,
 * recving = TRUE) and ask ks_recv_mdl() for `size` bytes without blocking.
 * NOTE(review): the handling of a NULL mdl and of rc is not visible here.
 */
460 ksocknal_lib_recv_kiov (ksock_conn_t *conn)
462 lnet_kiov_t *kiov = conn->ksnc_rx_kiov;
467 /* NB we can't trust socket ops to either consume our iovs
468 * or leave them alone, so we only receive 1 frag at a time.
 * NOTE(review): the code below locks all ksnc_rx_nkiov fragments into
 * one chain, so the "1 frag at a time" remark looks stale - confirm. */
469 LASSERT (conn->ksnc_rx_nkiov > 0);
471 /* lock the whole rx kiovs into a single mdl chain */
472 mdl = ks_lock_kiovs(kiov, conn->ksnc_rx_nkiov, TRUE, &size);
/* The locked chain must not cover more than the receiver still wants. */
479 LASSERT (size <= conn->ksnc_rx_nob_wanted);
481 /* try to request data for the whole mdl chain */
482 rc = ks_recv_mdl (conn->ksnc_sock, mdl, size, MSG_DONTWAIT);
/*
 * Set TCP_SOCKET_NODELAY on the connection's socket and log a CERROR when
 * ks_set_tcp_option() fails.
 * NOTE(review): the value assigned to `option` is not visible in this view.
 */
488 ksocknal_lib_eager_ack (ksock_conn_t *conn)
493 rc = ks_set_tcp_option(
494 conn->ksnc_sock, TCP_SOCKET_NODELAY,
495 &option, sizeof(option) );
497 CERROR("Can't disable nagle: %d\n", rc);
/*
 * Report per-connection socket settings through the three out-parameters.
 * The visible code only queries TCP_SOCKET_NODELAY into *nagle; how *txmem
 * and *rxmem are filled on success is not visible in this view.
 */
502 ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
504 ksock_tconn_t * tconn = conn->ksnc_sock;
/* ks_get_tconn()/ks_put_tconn() bracket the option query. */
508 ks_get_tconn (tconn);
512 len = sizeof(*nagle);
514 rc = ks_get_tcp_option(
515 tconn, TCP_SOCKET_NODELAY,
516 (__u32 *)nagle, &len);
518 ks_put_tconn (tconn);
520 printk("ksocknal_get_conn_tunables: nodelay = %d rc = %d\n", *nagle, rc);
/* NOTE(review): presumably the failure path - all outputs are zeroed. */
525 *txmem = *rxmem = *nagle = 0;
/*
 * Pick a socket buffer size from the current size and the tunable.
 * When current_sz is below SOCKNAL_MIN_BUFFER the result is the larger of
 * SOCKNAL_MIN_BUFFER and tunable_sz.  The outcome of the second test
 * (tunable_sz > SOCKNAL_MIN_BUFFER) and the final return are not visible
 * in this view.
 */
531 ksocknal_lib_buffersize (int current_sz, int tunable_sz)
533 /* ensure >= SOCKNAL_MIN_BUFFER */
534 if (current_sz < SOCKNAL_MIN_BUFFER)
535 return MAX(SOCKNAL_MIN_BUFFER, tunable_sz);
537 if (tunable_sz > SOCKNAL_MIN_BUFFER)
/*
 * Apply the socknal tunables to a freshly created socket:
 *   - send and receive windows are taken from ksnd_buffer_size,
 *   - TCP_SOCKET_NODELAY is set when the nagle tunable is zero,
 *   - TCP_SOCKET_KEEPALIVE is switched on only when the idle, count and
 *     interval keepalive tunables are all positive, and off otherwise.
 * NOTE(review): local declarations, the rc checks and the return
 * statements are not visible in this view.
 */
545 ksocknal_lib_setup_sock (struct socket *sock)
556 /* set the window size */
559 tconn->kstc_snd_wnd = ksocknal_tunables.ksnd_buffer_size;
560 tconn->kstc_rcv_wnd = ksocknal_tunables.ksnd_buffer_size;
564 if (!ksocknal_tunables.ksnd_nagle) {
567 rc = ks_set_tcp_option(
568 sock, TCP_SOCKET_NODELAY,
569 &option, sizeof (option));
571 printk ("Can't disable nagle: %d\n", rc);
576 /* snapshot tunables */
577 keep_idle = *ksocknal_tunables.ksnd_keepalive_idle;
578 keep_count = *ksocknal_tunables.ksnd_keepalive_count;
579 keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
/* Keepalive is only meaningful when all three parameters are positive. */
581 keep_alive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
583 option = (__u32)(keep_alive ? 1 : 0);
585 rc = ks_set_tcp_option(
586 sock, TCP_SOCKET_KEEPALIVE,
587 &option, sizeof (option));
/* This message used to say "Can't disable nagle", copied from the nagle
 * branch above, which misreported keepalive failures. */
589 CERROR ("Can't set keepalive: %d\n", rc);
/*
 * Clear the connection's stored nagle flag, call ks_set_tcp_option(), then
 * put the saved flag value back.  Both flag updates happen under
 * tconn->kstc_lock; the lock is dropped around the option call.
 * NOTE(review): the arguments of ks_set_tcp_option() are not visible in
 * this view.
 */
597 ksocknal_lib_push_conn (ksock_conn_t *conn)
599 ksock_tconn_t * tconn;
604 tconn = conn->ksnc_sock;
/* Save the current nagle flag and clear it; the flag lives in the sender
 * or the child member depending on the tconn type. */
608 spin_lock(&tconn->kstc_lock);
609 if (tconn->kstc_type == kstt_sender) {
610 nagle = tconn->sender.kstc_info.nagle;
611 tconn->sender.kstc_info.nagle = 0;
613 LASSERT(tconn->kstc_type == kstt_child);
614 nagle = tconn->child.kstc_info.nagle;
615 tconn->child.kstc_info.nagle = 0;
618 spin_unlock(&tconn->kstc_lock);
621 rc = ks_set_tcp_option(
/* Restore the flag value saved above. */
629 spin_lock(&tconn->kstc_lock);
631 if (tconn->kstc_type == kstt_sender) {
632 tconn->sender.kstc_info.nagle = nagle;
634 LASSERT(tconn->kstc_type == kstt_child);
635 tconn->child.kstc_info.nagle = nagle;
637 spin_unlock(&tconn->kstc_lock);
/*
 * Queue conn on its scheduler so that sending or receiving can continue.
 * Runs under the global read lock and the scheduler's kss_lock (irqsave).
 * A connection is put on the scheduler list at most once per direction
 * (ksnc_tx_scheduled / ksnc_rx_scheduled); each queuing takes an extra
 * connection reference and signals the scheduler's wait queue.
 */
642 /* @mode: 0: receiving mode / 1: sending mode */
644 ksocknal_sched_conn (ksock_conn_t *conn, int mode, ksock_tx_t *tx)
647 ksock_sched_t * sched;
650 /* interleave correctly with closing sockets... */
651 read_lock (&ksocknal_data.ksnd_global_lock);
653 sched = conn->ksnc_scheduler;
655 spin_lock_irqsave (&sched->kss_lock, flags);
657 if (mode) { /* transmission can continue ... */
/* NOTE(review): this #error is presumably inside a conditional that is not
 * visible in this view - confirm. */
659 #error "This is out of date - we should be calling ksocknal_write_callback()"
660 conn->ksnc_tx_ready = 1;
663 /* Incomplete send: place tx on HEAD of tx_queue */
664 list_add (&tx->tx_list, &conn->ksnc_tx_queue);
/* Only queue the connection when it is not already scheduled for sending
 * and it has something to send. */
667 if ( !conn->ksnc_tx_scheduled &&
668 !list_empty(&conn->ksnc_tx_queue)) { //packets to send
669 list_add_tail (&conn->ksnc_tx_list,
670 &sched->kss_tx_conns);
671 conn->ksnc_tx_scheduled = 1;
672 /* extra ref for scheduler */
673 atomic_inc (&conn->ksnc_conn_refcount);
675 cfs_waitq_signal (&sched->kss_waitq);
677 } else { /* receiving can continue ... */
679 conn->ksnc_rx_ready = 1;
681 if ( !conn->ksnc_rx_scheduled) { /* not being progressed */
682 list_add_tail(&conn->ksnc_rx_list,
683 &sched->kss_rx_conns);
684 conn->ksnc_rx_scheduled = 1;
685 /* extra ref for scheduler */
686 atomic_inc (&conn->ksnc_conn_refcount);
688 cfs_waitq_signal (&sched->kss_waitq);
/* Release the locks in the reverse order of acquisition. */
692 spin_unlock_irqrestore (&sched->kss_lock, flags);
693 read_unlock (&ksocknal_data.ksnd_global_lock);
/*
 * Scheduling callback stored in sock->kstc_sched_cb by
 * ksocknal_lib_set_callback(): looks up the connection attached to the
 * socket and forwards to ksocknal_sched_conn().
 * NOTE(review): the condition selecting the first, unconditional-looking
 * call is not visible here; presumably it is the sending (mode != 0) case.
 */
698 void ksocknal_schedule_callback(struct socket*sock, int mode, void * tx, ulong_ptr bytes)
700 ksock_conn_t * conn = (ksock_conn_t *) sock->kstc_conn;
703 ksocknal_sched_conn(conn, mode, tx);
/* Otherwise schedule only when CAN_BE_SCHED accepts the byte count against
 * what the receiver still wants. */
705 if ( CAN_BE_SCHED(bytes, (ulong_ptr)conn->ksnc_rx_nob_wanted )) {
706 ksocknal_sched_conn(conn, mode, tx);
712 ksocknal_tx_launched (ksock_tx_t *tx);
/*
 * Work-item routine passed to ExInitializeWorkItem() by the tx update
 * code: completes the transmit carried in tcpx by calling
 * ksocknal_tx_launched().
 * NOTE(review): whether tcpx is freed here is not visible in this view.
 */
715 ksocknal_fini_sending(ksock_tcpx_fini_t *tcpx)
717 ksocknal_tx_launched(tcpx->tx);
723 struct socket* tconn,
/* txp is the opaque transmit pointer handed back by the transport layer. */
728 ksock_tx_t * tx = (ksock_tx_t *)txp;
731 * the transmission was done, we need update the tx
/* rc bytes were sent; they can never exceed what was still outstanding. */
734 LASSERT(tx->tx_resid >= (int)rc);
735 tx->tx_resid -= (int)rc;
738 * just partial of tx is sent out, we need update
739 * the fields of tx and schedule later transmission.
744 if (tx->tx_niov > 0) {
746 /* if there's iov, we need process iov first */
748 if (rc < tx->tx_iov->iov_len) {
749 /* didn't send whole iov entry... */
750 tx->tx_iov->iov_base =
751 (char *)(tx->tx_iov->iov_base) + rc;
752 tx->tx_iov->iov_len -= rc;
755 /* the whole of iov was sent out */
756 rc -= tx->tx_iov->iov_len;
764 /* now we need process the kiov queues ... */
768 if (rc < tx->tx_kiov->kiov_len) {
769 /* didn't send whole kiov entry... */
770 tx->tx_kiov->kiov_offset += rc;
771 tx->tx_kiov->kiov_len -= rc;
774 /* whole kiov was sent out */
775 rc -= tx->tx_kiov->kiov_len;
/* Completion path: a zeroed ksock_tcpx_fini_t is allocated for the work
 * item that will run ksocknal_fini_sending(). */
784 ksock_tcpx_fini_t * tcpx =
785 cfs_alloc(sizeof(ksock_tcpx_fini_t), CFS_ALLOC_ZERO);
787 ASSERT(tx->tx_resid == 0);
/* NOTE(review): presumably the fallback when the allocation failed - the
 * tx is then completed directly instead of through a work item; confirm. */
791 ksocknal_tx_launched (tx);
796 ExInitializeWorkItem(
798 ksocknal_fini_sending,
814 ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn)
/* Attach conn to the socket and install the socknal callbacks: the
 * scheduling callback and the tx update callback. */
819 ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn)
821 sock->kstc_conn = conn;
822 sock->kstc_sched_cb = ksocknal_schedule_callback;
823 sock->kstc_update_tx = ksocknal_update_tx;
/* Detach the socket from its connection: clears the connection pointer and
 * both callbacks installed by ksocknal_lib_set_callback().  The conn
 * argument is not used by the visible lines. */
827 ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
829 sock->kstc_conn = NULL;
830 sock->kstc_sched_cb = NULL;
831 sock->kstc_update_tx = NULL;