1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see [sun.com URL with a
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lnet/ulnds/socklnd/usocklnd.c
38 * Author: Maxim Patlasov <maxim@clusterfs.com>
/* LND operation table entries: each callback slot is bound to the matching
 * usocklnd_* function (startup, shutdown, send, recv, accept).
 * NOTE(review): the declaration line of this initializer is not visible in
 * this listing; it is presumably `the_tcplnd`, the object that
 * usocklnd_startup() compares ni->ni_lnd against -- confirm. */
46 .lnd_startup = usocklnd_startup,
47 .lnd_shutdown = usocklnd_shutdown,
48 .lnd_send = usocklnd_send,
49 .lnd_recv = usocklnd_recv,
50 .lnd_accept = usocklnd_accept,
/* Shared state of this LND. Within this file it provides the poll-thread
 * array and count, the peer hash table with its rwlock, the shutdown flag,
 * the count of started nets and the lifecycle state (ud_state). */
53 usock_data_t usock_data;
/* Tunable settings. usocklnd_update_tunables() hands each field to
 * cfs_parse_int_tunable() and usocklnd_validate_tunables() range-checks
 * them.
 * NOTE(review): only one initializer line is visible in this listing. */
54 usock_tunables_t usock_tuns = {
/* Initial per-peer transmit credits; usocklnd_startup() copies the current
 * value into ni->ni_peertxcredits. */
61 .ut_peertxcredits = 8,
/* Upper bound applied to both ut_timeout and ut_poll_timeout. */
66 #define MAX_REASONABLE_TIMEOUT 36000 /* 10 hours */
/* Upper bound applied to ut_npollthreads. */
67 #define MAX_REASONABLE_NPT 1000
/* Sanity-check the values currently held in usock_tuns and report every
 * violated constraint through CERROR.
 *
 * NOTE(review): the return statements are not visible in this listing;
 * usocklnd_update_tunables() only tests the result for non-zero -- confirm
 * the exact return codes.
 * NOTE(review): ut_min_bulk is passed to cfs_parse_int_tunable() in
 * usocklnd_update_tunables() but has no range check here -- confirm that
 * this is intended. */
70 usocklnd_validate_tunables()
/* Both timeouts must be positive and at most MAX_REASONABLE_TIMEOUT. */
72 if (usock_tuns.ut_timeout <= 0 ||
73 usock_tuns.ut_timeout > MAX_REASONABLE_TIMEOUT) {
74 CERROR("USOCK_TIMEOUT: %d is out of reasonable limits\n",
75 usock_tuns.ut_timeout);
79 if (usock_tuns.ut_poll_timeout <= 0 ||
80 usock_tuns.ut_poll_timeout > MAX_REASONABLE_TIMEOUT) {
81 CERROR("USOCK_POLL_TIMEOUT: %d is out of reasonable limits\n",
82 usock_tuns.ut_poll_timeout);
86 if (usock_tuns.ut_fair_limit <= 0) {
87 CERROR("Invalid USOCK_FAIR_LIMIT: %d (should be >0)\n",
88 usock_tuns.ut_fair_limit);
/* 0 passes this check on purpose: usocklnd_update_tunables() later replaces
 * a zero thread count with the number of online CPUs. */
92 if (usock_tuns.ut_npollthreads < 0 ||
93 usock_tuns.ut_npollthreads > MAX_REASONABLE_NPT) {
94 CERROR("USOCK_NPOLLTHREADS: %d is out of reasonable limits\n",
95 usock_tuns.ut_npollthreads);
99 if (usock_tuns.ut_txcredits <= 0) {
100 CERROR("USOCK_TXCREDITS: %d should be positive\n",
101 usock_tuns.ut_txcredits);
105 if (usock_tuns.ut_peertxcredits <= 0) {
106 CERROR("USOCK_PEERTXCREDITS: %d should be positive\n",
107 usock_tuns.ut_peertxcredits);
/* The per-peer credit count may not exceed the total credit count. */
111 if (usock_tuns.ut_peertxcredits > usock_tuns.ut_txcredits) {
112 CERROR("USOCK_PEERTXCREDITS: %d should not be greater"
113 " than USOCK_TXCREDITS: %d\n",
114 usock_tuns.ut_peertxcredits, usock_tuns.ut_txcredits);
/* ut_socknagle is treated as a boolean: only 0 and 1 are accepted. */
118 if (usock_tuns.ut_socknagle != 0 &&
119 usock_tuns.ut_socknagle != 1) {
120 CERROR("USOCK_SOCKNAGLE: %d should be 0 or 1\n",
121 usock_tuns.ut_socknagle);
/* Only negative buffer sizes are rejected; 0 is accepted. */
125 if (usock_tuns.ut_sockbufsiz < 0) {
126 CERROR("USOCK_SOCKBUFSIZ: %d should be 0 or positive\n",
127 usock_tuns.ut_sockbufsiz);
/* Release the resources owned by the first n entries of
 * usock_data.ud_pollthreads: close both descriptors of the notifier pair,
 * destroy the poll-request mutex and the completion, and free the per-thread
 * arrays.
 *
 * n: number of leading poll-thread entries to release.
 *
 * NOTE(review): upt_skip, which usocklnd_base_startup() allocates for every
 * poll thread, does not appear to be freed here -- this looks like a memory
 * leak; confirm, and confirm which size field tracks that array before
 * adding the LIBCFS_FREE. */
135 usocklnd_release_poll_states(int n)
139 for (i = 0; i < n; i++) {
140 usock_pollthread_t *pt = &usock_data.ud_pollthreads[i];
/* The two ends of the pair created in usocklnd_base_startup():
 * upt_notifier_fd holds notifier[0], poll slot 0 holds notifier[1]. */
142 close(pt->upt_notifier_fd);
143 close(pt->upt_pollfd[0].fd);
145 pthread_mutex_destroy(&pt->upt_pollrequests_lock);
146 cfs_fini_completion(&pt->upt_completion);
/* Sizes come from the current upt_npollfd / upt_nfd2idx rather than from
 * UPT_START_SIZ -- presumably because the arrays can be resized after
 * startup; confirm against the code that grows them. */
148 LIBCFS_FREE (pt->upt_pollfd,
149 sizeof(struct pollfd) * pt->upt_npollfd);
150 LIBCFS_FREE (pt->upt_idx2conn,
151 sizeof(usock_conn_t *) * pt->upt_npollfd);
152 LIBCFS_FREE (pt->upt_fd2idx,
153 sizeof(int) * pt->upt_nfd2idx);
/* Refresh usock_tuns: pass each tunable field to cfs_parse_int_tunable()
 * (with a name such as "USOCK_POLL_TIMEOUT"), validate the resulting set,
 * and resolve a poll-thread count of 0 to the number of online CPUs.
 *
 * NOTE(review): the checks of rc after each parse call and all return
 * statements are not visible in this listing -- confirm the error
 * propagation before relying on it. */
158 usocklnd_update_tunables()
162 rc = cfs_parse_int_tunable(&usock_tuns.ut_timeout,
167 rc = cfs_parse_int_tunable(&usock_tuns.ut_poll_timeout,
168 "USOCK_POLL_TIMEOUT");
172 rc = cfs_parse_int_tunable(&usock_tuns.ut_npollthreads,
173 "USOCK_NPOLLTHREADS");
177 rc = cfs_parse_int_tunable(&usock_tuns.ut_fair_limit,
182 rc = cfs_parse_int_tunable(&usock_tuns.ut_min_bulk,
187 rc = cfs_parse_int_tunable(&usock_tuns.ut_txcredits,
192 rc = cfs_parse_int_tunable(&usock_tuns.ut_peertxcredits,
193 "USOCK_PEERTXCREDITS");
197 rc = cfs_parse_int_tunable(&usock_tuns.ut_socknagle,
202 rc = cfs_parse_int_tunable(&usock_tuns.ut_sockbufsiz,
/* Range checks run only after every tunable has been parsed. */
207 if (usocklnd_validate_tunables())
/* A thread count of 0 means "pick automatically": use the number of online
 * CPUs, and complain if that number is not positive. */
210 if (usock_tuns.ut_npollthreads == 0) {
211 usock_tuns.ut_npollthreads = cfs_online_cpus();
213 if (usock_tuns.ut_npollthreads <= 0) {
214 CERROR("Cannot find out the number of online CPUs\n");
/* Initialize the state shared by all nets of this LND; usocklnd_startup()
 * calls it while usock_data.ud_state is UD_STATE_INIT_NOTHING.
 *
 * Steps: refresh the tunables, allocate one usock_pollthread_t per poll
 * thread and set each one up (arrays, notifier pair, lists, lock,
 * completion), initialize the peer hash table and its rwlock, spawn the
 * poll threads, and finally set ud_state to UD_STATE_INITIALIZED.
 *
 * If setting up entry i fails, the base_startup_failed_* labels free what
 * was already allocated for that entry, then the i completed entries and
 * the ud_pollthreads array are released.
 *
 * NOTE(review): the return statements and several error checks are not
 * visible in this listing -- confirm the exact return values. */
224 usocklnd_base_startup()
226 usock_pollthread_t *pt;
230 rc = usocklnd_update_tunables();
234 usock_data.ud_npollthreads = usock_tuns.ut_npollthreads;
236 LIBCFS_ALLOC (usock_data.ud_pollthreads,
237 usock_data.ud_npollthreads *
238 sizeof(usock_pollthread_t));
239 if (usock_data.ud_pollthreads == NULL)
242 /* Initialize poll thread state structures */
243 for (i = 0; i < usock_data.ud_npollthreads; i++) {
246 pt = &usock_data.ud_pollthreads[i];
/* Each allocation failure jumps to the label that frees exactly the
 * arrays allocated before it for this entry. */
250 LIBCFS_ALLOC (pt->upt_pollfd,
251 sizeof(struct pollfd) * UPT_START_SIZ);
252 if (pt->upt_pollfd == NULL)
253 goto base_startup_failed_0;
255 LIBCFS_ALLOC (pt->upt_idx2conn,
256 sizeof(usock_conn_t *) * UPT_START_SIZ);
257 if (pt->upt_idx2conn == NULL)
258 goto base_startup_failed_1;
260 LIBCFS_ALLOC (pt->upt_fd2idx,
261 sizeof(int) * UPT_START_SIZ);
262 if (pt->upt_fd2idx == NULL)
263 goto base_startup_failed_2;
/* Start with an all-zero fd-to-index map -- presumably 0 means "fd not
 * being polled", since slot 0 is given to the notifier below; confirm. */
265 memset(pt->upt_fd2idx, 0,
266 sizeof(int) * UPT_START_SIZ);
268 LIBCFS_ALLOC (pt->upt_skip,
269 sizeof(int) * UPT_START_SIZ);
270 if (pt->upt_skip == NULL)
271 goto base_startup_failed_3;
/* Both size counters start at the initial array capacity. */
273 pt->upt_npollfd = pt->upt_nfd2idx = UPT_START_SIZ;
/* Notifier pair: notifier[0] is kept in upt_notifier_fd, notifier[1] is
 * installed in poll slot 0 and watched for POLLIN. */
275 rc = libcfs_socketpair(notifier);
277 goto base_startup_failed_4;
279 pt->upt_notifier_fd = notifier[0];
281 pt->upt_pollfd[0].fd = notifier[1];
282 pt->upt_pollfd[0].events = POLLIN;
283 pt->upt_pollfd[0].revents = 0;
/* Poll slot 0 belongs to the notifier, so it has no connection. */
286 pt->upt_idx2conn[0] = NULL;
289 CFS_INIT_LIST_HEAD (&pt->upt_pollrequests);
290 CFS_INIT_LIST_HEAD (&pt->upt_stale_list);
291 pthread_mutex_init(&pt->upt_pollrequests_lock, NULL);
292 cfs_init_completion(&pt->upt_completion);
295 /* Initialize peer hash list */
296 for (i = 0; i < UD_PEER_HASH_SIZE; i++)
297 CFS_INIT_LIST_HEAD(&usock_data.ud_peers[i]);
299 pthread_rwlock_init(&usock_data.ud_peers_lock, NULL);
301 /* Spawn poll threads */
302 for (i = 0; i < usock_data.ud_npollthreads; i++) {
303 rc = cfs_create_thread(usocklnd_poll_thread,
304 &usock_data.ud_pollthreads[i]);
/* Passing i makes usocklnd_base_shutdown() wake and wait for only the i
 * threads created so far; it still releases every poll-thread state. */
306 usocklnd_base_shutdown(i);
311 usock_data.ud_state = UD_STATE_INITIALIZED;
/* Failure unwinding for the entry that was being set up: each label frees
 * one array and falls through to the next. */
315 base_startup_failed_4:
316 LIBCFS_FREE (pt->upt_skip, sizeof(int) * UPT_START_SIZ);
317 base_startup_failed_3:
318 LIBCFS_FREE (pt->upt_fd2idx, sizeof(int) * UPT_START_SIZ);
319 base_startup_failed_2:
320 LIBCFS_FREE (pt->upt_idx2conn, sizeof(usock_conn_t *) * UPT_START_SIZ);
321 base_startup_failed_1:
322 LIBCFS_FREE (pt->upt_pollfd, sizeof(struct pollfd) * UPT_START_SIZ);
323 base_startup_failed_0:
/* i is the index of the entry that failed, so entries 0..i-1 are complete
 * and are released as whole poll states; then the array itself is freed. */
325 usocklnd_release_poll_states(i);
326 LIBCFS_FREE (usock_data.ud_pollthreads,
327 usock_data.ud_npollthreads *
328 sizeof(usock_pollthread_t));
/* Stop the poll threads and tear down the shared state built by
 * usocklnd_base_startup().
 *
 * n: number of poll threads to wake and wait for. It equals
 *    usock_data.ud_npollthreads on a normal shutdown and is smaller when
 *    called from usocklnd_base_startup() after a thread failed to start.
 *
 * Regardless of n, the poll states of all ud_npollthreads entries are
 * released, the ud_pollthreads array is freed, and ud_state is reset to
 * UD_STATE_INIT_NOTHING. */
333 usocklnd_base_shutdown(int n)
/* Raise the shutdown flag before waking the threads. */
337 usock_data.ud_shutdown = 1;
338 for (i = 0; i < n; i++) {
339 usock_pollthread_t *pt = &usock_data.ud_pollthreads[i];
/* Wake thread i, then block until its completion is signalled. */
340 usocklnd_wakeup_pollthread(i);
341 cfs_wait_for_completion(&pt->upt_completion);
344 pthread_rwlock_destroy(&usock_data.ud_peers_lock);
346 usocklnd_release_poll_states(usock_data.ud_npollthreads);
348 LIBCFS_FREE (usock_data.ud_pollthreads,
349 usock_data.ud_npollthreads *
350 sizeof(usock_pollthread_t));
352 usock_data.ud_state = UD_STATE_INIT_NOTHING;
/* Produce a 64-bit incarnation value from the current time of day,
 * expressed in microseconds: tv_sec * 1000000 + tv_usec.
 * NOTE(review): rc receives the gettimeofday() result, but how it is
 * checked is not visible in this listing. */
356 usocklnd_new_incarnation()
359 int rc = gettimeofday(&tv, NULL);
/* tv_sec is widened to __u64 before the multiplication. */
361 return (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
/* Choose an IP address for the network interface `ni` and store it as the
 * address part of ni->ni_nid; the network part of the existing NID is kept.
 *
 * ni: interface whose ni_interfaces[] selects the address source and whose
 *     ni_nid is updated.
 *
 * NOTE(review): the return statements and some conditions are not visible
 * in this listing -- confirm the error codes. */
365 usocklnd_assign_ni_nid(lnet_ni_t *ni)
371 /* Find correct IP-address and update ni_nid with it.
372 * Two cases are supported:
373 * 1) no explicit interfaces are defined. NID will be assigned to
374 * first non-lo interface that is up;
375 * 2) exactly one explicit interface is defined. For example,
376 * LNET_NETWORKS='tcp(eth0)' */
/* Case 1: nothing configured, so scan the enumerated interfaces. */
378 if (ni->ni_interfaces[0] == NULL) {
382 n = libcfs_ipif_enumerate(&names);
384 CERROR("Can't enumerate interfaces: %d\n", n);
388 for (i = 0; i < n; i++) {
390 if (!strcmp(names[i], "lo")) /* skip the loopback IF */
393 rc = libcfs_ipif_query(names[i], &up, &ipaddr);
395 CWARN("Can't get interface %s info: %d\n",
401 CWARN("Ignoring interface %s (down)\n",
406 break; /* one address is quite enough */
/* The name list is released on every path out of the scan, before the
 * result of the scan is reported. */
409 libcfs_ipif_free_enumeration(names, n);
412 CERROR("Can't find any usable interfaces\n");
416 CDEBUG(D_NET, "No explicit interfaces defined. "
417 "%u.%u.%u.%u used\n", HIPQUAD(ipaddr));
/* Case 2: an explicit interface was given; a second one is an error. */
419 if (ni->ni_interfaces[1] != NULL) {
420 CERROR("only one explicit interface is allowed\n");
424 rc = libcfs_ipif_query(ni->ni_interfaces[0], &up, &ipaddr);
426 CERROR("Can't get interface %s info: %d\n",
427 ni->ni_interfaces[0], rc);
432 CERROR("Explicit interface defined: %s but is down\n",
433 ni->ni_interfaces[0]);
437 CDEBUG(D_NET, "Explicit interface defined: %s. "
438 "%u.%u.%u.%u used\n",
439 ni->ni_interfaces[0], HIPQUAD(ipaddr));
/* Rebuild the NID from its current network number and the chosen address. */
443 ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ipaddr);
/* Start the LND on network interface `ni`.
 *
 * Runs usocklnd_base_startup() first if the shared state is still in
 * UD_STATE_INIT_NOTHING, then allocates and initializes the per-net
 * usock_net_t (zeroed, fresh incarnation, mutex and condition variable),
 * assigns the NID when the LNET pid does not carry LNET_PID_USERFLAG,
 * copies the credit tunables into `ni` and increments
 * usock_data.ud_nets_count.
 *
 * NOTE(review): the return statements, the goto labels and the store of
 * `net` into `ni` are not visible in this listing -- confirm. */
449 usocklnd_startup(lnet_ni_t *ni)
454 if (usock_data.ud_state == UD_STATE_INIT_NOTHING) {
455 rc = usocklnd_base_startup();
460 LIBCFS_ALLOC(net, sizeof(*net));
462 goto startup_failed_0;
464 memset(net, 0, sizeof(*net));
465 net->un_incarnation = usocklnd_new_incarnation();
466 pthread_mutex_init(&net->un_lock, NULL);
467 pthread_cond_init(&net->un_cond, NULL);
/* The NID is derived from a local interface only when the USERFLAG bit
 * is clear in the LNET pid. */
471 if (!(the_lnet.ln_pid & LNET_PID_USERFLAG)) {
472 rc = usocklnd_assign_ni_nid(ni);
474 goto startup_failed_1;
477 LASSERT (ni->ni_lnd == &the_tcplnd);
479 ni->ni_maxtxcredits = usock_tuns.ut_txcredits;
480 ni->ni_peertxcredits = usock_tuns.ut_peertxcredits;
482 usock_data.ud_nets_count++;
/* Failure unwinding (presumably the startup_failed_1 target): undo the
 * per-net initialization and free the structure. */
486 pthread_mutex_destroy(&net->un_lock);
487 pthread_cond_destroy(&net->un_cond);
488 LIBCFS_FREE(net, sizeof(*net));
/* Presumably the startup_failed_0 target: if no net is running, the shared
 * base state is shut down again. */
490 if (usock_data.ud_nets_count == 0)
491 usocklnd_base_shutdown(usock_data.ud_npollthreads);
/* Shut the LND down on network interface `ni`.
 *
 * Marks the net as shutting down, deletes all of its peers, blocks until
 * the net's peer count has dropped to zero, releases the usock_net_t and,
 * if this was the last running net, shuts down the shared base state.
 *
 * ni: interface whose ni_data points to the usock_net_t to release. */
497 usocklnd_shutdown(lnet_ni_t *ni)
499 usock_net_t *net = ni->ni_data;
501 net->un_shutdown = 1;
503 usocklnd_del_all_peers(ni);
505 /* Wait for all peer state to clean up */
/* The predicate is re-tested in a loop under un_lock after every wakeup on
 * un_cond. */
506 pthread_mutex_lock(&net->un_lock);
507 while (net->un_peercount != 0)
508 pthread_cond_wait(&net->un_cond, &net->un_lock);
509 pthread_mutex_unlock(&net->un_lock);
511 /* Release usock_net_t structure */
512 pthread_mutex_destroy(&net->un_lock);
513 pthread_cond_destroy(&net->un_cond);
514 LIBCFS_FREE(net, sizeof(*net));
/* The last net to go away also stops all poll threads. */
516 usock_data.ud_nets_count--;
517 if (usock_data.ud_nets_count == 0)
518 usocklnd_base_shutdown(usock_data.ud_npollthreads);
/* Remove every peer that belongs to `ni` from the peer hash table, killing
 * its connections, then wake all poll threads.
 *
 * ni: interface whose peers are deleted; peers whose up_ni differs are
 *     tested separately (the statement taken for them is not visible in
 *     this listing -- presumably they are skipped; confirm). */
522 usocklnd_del_all_peers(lnet_ni_t *ni)
524 struct list_head *ptmp;
525 struct list_head *pnxt;
/* The whole table is walked under the write lock. */
529 pthread_rwlock_wrlock(&usock_data.ud_peers_lock);
531 for (i = 0; i < UD_PEER_HASH_SIZE; i++) {
/* The _safe iterator is needed because usocklnd_del_peer_and_conns()
 * unlinks the current entry from the list. */
532 list_for_each_safe (ptmp, pnxt, &usock_data.ud_peers[i]) {
533 peer = list_entry (ptmp, usock_peer_t, up_list);
535 if (peer->up_ni != ni)
538 usocklnd_del_peer_and_conns(peer);
542 pthread_rwlock_unlock(&usock_data.ud_peers_lock);
544 /* wakeup all threads */
545 for (i = 0; i < usock_data.ud_npollthreads; i++)
546 usocklnd_wakeup_pollthread(i);
/* Kill the connections of `peer`, unlink it from the peer hash list and
 * drop one reference on it.
 *
 * The caller must keep the peer hash list protected across the call;
 * usocklnd_del_all_peers() does so by holding usock_data.ud_peers_lock for
 * writing.
 *
 * peer: peer still linked into the hash list via up_list. */
550 usocklnd_del_peer_and_conns(usock_peer_t *peer)
552 /* peer cannot disappear because it's still in hash list */
554 pthread_mutex_lock(&peer->up_lock);
555 /* content of conn[] array cannot change now */
556 usocklnd_del_conns_locked(peer);
557 pthread_mutex_unlock(&peer->up_lock);
559 /* peer hash list is still protected by the caller */
560 list_del(&peer->up_list);
562 usocklnd_peer_decref(peer); /* peer isn't in hash list anymore */
566 usocklnd_del_conns_locked(usock_peer_t *peer)
570 for (i=0; i < N_CONN_TYPES; i++) {
571 usock_conn_t *conn = peer->up_conns[i];
573 usocklnd_conn_kill(conn);