4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
31 * This file is part of Lustre, http://www.lustre.org/
32 * Lustre is a trademark of Sun Microsystems, Inc.
34 * lnet/ulnds/socklnd/usocklnd.c
36 * Author: Maxim Patlasov <maxim@clusterfs.com>
/* LND operation handlers.  usocklnd_startup() and usocklnd_shutdown() are
 * defined in this file; the send/recv/accept handlers are defined elsewhere.
 * NOTE(review): presumably these are the members of the_tcplnd, the table
 * that usocklnd_startup() compares ni->ni_lnd against - confirm. */
44 .lnd_startup = usocklnd_startup,
45 .lnd_shutdown = usocklnd_shutdown,
46 .lnd_send = usocklnd_send,
47 .lnd_recv = usocklnd_recv,
48 .lnd_accept = usocklnd_accept,
/* State shared by all usocklnd nets.  This file uses its poll-thread array
 * and count, the peer hash table with its rwlock, the count of started nets
 * and the ud_state / ud_shutdown flags. */
51 usock_data_t usock_data;
/* Tunables and their compiled-in defaults.  usocklnd_update_tunables()
 * passes each field to lnet_parse_int_tunable() under a "USOCK_..." name
 * and then validates the whole set. */
52 usock_tunables_t usock_tuns = {
/* Must be positive and must not exceed ut_txcredits
 * (enforced by usocklnd_validate_tunables()). */
59 .ut_peertxcredits = 8,
/* Upper bound accepted for both ut_timeout and ut_poll_timeout. */
64 #define MAX_REASONABLE_TIMEOUT 36000 /* 10 hours */
/* Upper bound accepted for ut_npollthreads (number of poll threads). */
65 #define MAX_REASONABLE_NPT 1000
/*
 * Sanity-check the values currently stored in usock_tuns.
 *
 * Every failed check reports the offending value through CERROR.
 * usocklnd_update_tunables() calls this function and treats a non-zero
 * result as "tunables are invalid".
 *
 * NOTE(review): ut_min_bulk is parsed by usocklnd_update_tunables() but is
 * not range-checked here - confirm that this is intentional.
 */
68 usocklnd_validate_tunables()
/* ut_timeout must lie in (0, MAX_REASONABLE_TIMEOUT] */
70 if (usock_tuns.ut_timeout <= 0 ||
71 usock_tuns.ut_timeout > MAX_REASONABLE_TIMEOUT) {
72 CERROR("USOCK_TIMEOUT: %d is out of reasonable limits\n",
73 usock_tuns.ut_timeout);
/* ut_poll_timeout shares the same limits as ut_timeout */
77 if (usock_tuns.ut_poll_timeout <= 0 ||
78 usock_tuns.ut_poll_timeout > MAX_REASONABLE_TIMEOUT) {
79 CERROR("USOCK_POLL_TIMEOUT: %d is out of reasonable limits\n",
80 usock_tuns.ut_poll_timeout);
/* ut_fair_limit must be strictly positive */
84 if (usock_tuns.ut_fair_limit <= 0) {
85 CERROR("Invalid USOCK_FAIR_LIMIT: %d (should be >0)\n",
86 usock_tuns.ut_fair_limit);
/* 0 is accepted for ut_npollthreads: usocklnd_update_tunables() later
 * replaces 0 with the number of online CPUs */
90 if (usock_tuns.ut_npollthreads < 0 ||
91 usock_tuns.ut_npollthreads > MAX_REASONABLE_NPT) {
92 CERROR("USOCK_NPOLLTHREADS: %d is out of reasonable limits\n",
93 usock_tuns.ut_npollthreads);
/* both credit counts must be strictly positive ... */
97 if (usock_tuns.ut_txcredits <= 0) {
98 CERROR("USOCK_TXCREDITS: %d should be positive\n",
99 usock_tuns.ut_txcredits);
103 if (usock_tuns.ut_peertxcredits <= 0) {
104 CERROR("USOCK_PEERTXCREDITS: %d should be positive\n",
105 usock_tuns.ut_peertxcredits);
/* ... and the per-peer credits may not exceed the total credits */
109 if (usock_tuns.ut_peertxcredits > usock_tuns.ut_txcredits) {
110 CERROR("USOCK_PEERTXCREDITS: %d should not be greater"
111 " than USOCK_TXCREDITS: %d\n",
112 usock_tuns.ut_peertxcredits, usock_tuns.ut_txcredits);
/* ut_socknagle is a flag: only 0 and 1 are accepted */
116 if (usock_tuns.ut_socknagle != 0 &&
117 usock_tuns.ut_socknagle != 1) {
118 CERROR("USOCK_SOCKNAGLE: %d should be 0 or 1\n",
119 usock_tuns.ut_socknagle);
/* ut_sockbufsiz may be 0 but not negative */
123 if (usock_tuns.ut_sockbufsiz < 0) {
124 CERROR("USOCK_SOCKBUFSIZ: %d should be 0 or positive\n",
125 usock_tuns.ut_sockbufsiz);
133 usocklnd_release_poll_states(int n)
137 for (i = 0; i < n; i++) {
138 usock_pollthread_t *pt = &usock_data.ud_pollthreads[i];
140 libcfs_sock_release(pt->upt_notifier[0]);
141 libcfs_sock_release(pt->upt_notifier[1]);
143 pthread_mutex_destroy(&pt->upt_pollrequests_lock);
144 cfs_mt_fini_completion(&pt->upt_completion);
146 LIBCFS_FREE (pt->upt_pollfd,
147 sizeof(struct pollfd) * pt->upt_npollfd);
148 LIBCFS_FREE (pt->upt_idx2conn,
149 sizeof(usock_conn_t *) * pt->upt_npollfd);
150 LIBCFS_FREE (pt->upt_fd2idx,
151 sizeof(int) * pt->upt_nfd2idx);
/*
 * Refresh usock_tuns and make it usable:
 *  1) hand every tunable to lnet_parse_int_tunable() under its "USOCK_..."
 *     name (NOTE(review): presumably this overrides the compiled-in default
 *     from the environment - confirm);
 *  2) validate the resulting set with usocklnd_validate_tunables();
 *  3) replace ut_npollthreads == 0 by the number of online CPUs.
 *
 * Returns an int status which usocklnd_base_startup() stores in its rc.
 */
156 usocklnd_update_tunables()
160 rc = lnet_parse_int_tunable(&usock_tuns.ut_timeout,
165 rc = lnet_parse_int_tunable(&usock_tuns.ut_poll_timeout,
166 "USOCK_POLL_TIMEOUT");
170 rc = lnet_parse_int_tunable(&usock_tuns.ut_npollthreads,
171 "USOCK_NPOLLTHREADS");
175 rc = lnet_parse_int_tunable(&usock_tuns.ut_fair_limit,
180 rc = lnet_parse_int_tunable(&usock_tuns.ut_min_bulk,
185 rc = lnet_parse_int_tunable(&usock_tuns.ut_txcredits,
190 rc = lnet_parse_int_tunable(&usock_tuns.ut_peertxcredits,
191 "USOCK_PEERTXCREDITS");
195 rc = lnet_parse_int_tunable(&usock_tuns.ut_socknagle,
200 rc = lnet_parse_int_tunable(&usock_tuns.ut_sockbufsiz,
/* a non-zero result from the validator means at least one check failed */
205 if (usocklnd_validate_tunables())
/* 0 poll threads means "pick automatically": one per online CPU */
208 if (usock_tuns.ut_npollthreads == 0) {
209 usock_tuns.ut_npollthreads = cfs_online_cpus();
/* cfs_online_cpus() gave no usable (positive) count */
211 if (usock_tuns.ut_npollthreads <= 0) {
212 CERROR("Cannot find out the number of online CPUs\n");
/*
 * Initialise the state shared by all usocklnd nets:
 *  - refresh the tunables;
 *  - allocate usock_data.ud_pollthreads[] and initialise one
 *    usock_pollthread_t per poll thread;
 *  - initialise the peer hash table and its rwlock;
 *  - spawn the poll threads;
 *  - mark usock_data as UD_STATE_INITIALIZED.
 *
 * usocklnd_startup() calls this while usock_data.ud_state is
 * UD_STATE_INIT_NOTHING and stores the int result in its rc.
 */
222 usocklnd_base_startup()
224 usock_pollthread_t *pt;
228 rc = usocklnd_update_tunables();
232 usock_data.ud_npollthreads = usock_tuns.ut_npollthreads;
/* one state structure per poll thread */
234 LIBCFS_ALLOC (usock_data.ud_pollthreads,
235 usock_data.ud_npollthreads *
236 sizeof(usock_pollthread_t));
237 if (usock_data.ud_pollthreads == NULL)
240 /* Initialize poll thread state structures */
241 for (i = 0; i < usock_data.ud_npollthreads; i++) {
243 pt = &usock_data.ud_pollthreads[i];
/* Four per-thread arrays, each with UPT_START_SIZ entries.  A failed
 * allocation jumps to the label that frees exactly the arrays
 * allocated so far for this pt. */
247 LIBCFS_ALLOC (pt->upt_pollfd,
248 sizeof(struct pollfd) * UPT_START_SIZ);
249 if (pt->upt_pollfd == NULL)
250 goto base_startup_failed_0;
252 LIBCFS_ALLOC (pt->upt_idx2conn,
253 sizeof(usock_conn_t *) * UPT_START_SIZ);
254 if (pt->upt_idx2conn == NULL)
255 goto base_startup_failed_1;
257 LIBCFS_ALLOC (pt->upt_fd2idx,
258 sizeof(int) * UPT_START_SIZ);
259 if (pt->upt_fd2idx == NULL)
260 goto base_startup_failed_2;
/* the fd -> index table starts out zero-filled */
262 memset(pt->upt_fd2idx, 0,
263 sizeof(int) * UPT_START_SIZ);
265 LIBCFS_ALLOC (pt->upt_skip,
266 sizeof(int) * UPT_START_SIZ);
267 if (pt->upt_skip == NULL)
268 goto base_startup_failed_3;
/* record the current capacity of the arrays allocated above */
270 pt->upt_npollfd = pt->upt_nfd2idx = UPT_START_SIZ;
/* notifier socket pair of this poll thread; both ends are released
 * again by usocklnd_release_poll_states() */
272 rc = libcfs_socketpair(pt->upt_notifier);
274 goto base_startup_failed_4;
/* slot 0 of the poll set watches notifier[1] for incoming data ... */
276 pt->upt_pollfd[0].fd = LIBCFS_SOCK2FD(pt->upt_notifier[1]);
277 pt->upt_pollfd[0].events = POLLIN;
278 pt->upt_pollfd[0].revents = 0;
/* ... and has no connection attached to it */
281 pt->upt_idx2conn[0] = NULL;
284 CFS_INIT_LIST_HEAD (&pt->upt_pollrequests);
285 CFS_INIT_LIST_HEAD (&pt->upt_stale_list);
286 pthread_mutex_init(&pt->upt_pollrequests_lock, NULL);
287 cfs_mt_init_completion(&pt->upt_completion);
290 /* Initialize peer hash list */
291 for (i = 0; i < UD_PEER_HASH_SIZE; i++)
292 CFS_INIT_LIST_HEAD(&usock_data.ud_peers[i]);
294 pthread_rwlock_init(&usock_data.ud_peers_lock, NULL);
296 /* Spawn poll threads */
297 for (i = 0; i < usock_data.ud_npollthreads; i++) {
298 rc = cfs_create_thread(usocklnd_poll_thread,
299 &usock_data.ud_pollthreads[i], 0);
/* here i is the number of threads created by earlier iterations;
 * usocklnd_base_shutdown(i) wakes and waits for exactly those and
 * then releases all shared state */
301 usocklnd_base_shutdown(i);
306 usock_data.ud_state = UD_STATE_INITIALIZED;
/* Error unwinding for a failure inside the initialisation loop: free the
 * arrays of the partially initialised pt in reverse allocation order ... */
310 base_startup_failed_4:
311 LIBCFS_FREE (pt->upt_skip, sizeof(int) * UPT_START_SIZ);
312 base_startup_failed_3:
313 LIBCFS_FREE (pt->upt_fd2idx, sizeof(int) * UPT_START_SIZ);
314 base_startup_failed_2:
315 LIBCFS_FREE (pt->upt_idx2conn, sizeof(usock_conn_t *) * UPT_START_SIZ);
316 base_startup_failed_1:
317 LIBCFS_FREE (pt->upt_pollfd, sizeof(struct pollfd) * UPT_START_SIZ);
318 base_startup_failed_0:
/* ... then release the i states completed by earlier iterations and
 * finally the state array itself */
320 usocklnd_release_poll_states(i);
321 LIBCFS_FREE (usock_data.ud_pollthreads,
322 usock_data.ud_npollthreads *
323 sizeof(usock_pollthread_t));
/*
 * Stop the poll threads and release the state shared by all nets.
 *
 * n is the number of poll threads that are actually running:
 * usocklnd_base_startup() passes the count it managed to spawn before a
 * failure, usocklnd_startup() and usocklnd_shutdown() pass
 * usock_data.ud_npollthreads.
 */
328 usocklnd_base_shutdown(int n)
/* raise the global shutdown flag before waking the threads */
332 usock_data.ud_shutdown = 1;
333 for (i = 0; i < n; i++) {
334 usock_pollthread_t *pt = &usock_data.ud_pollthreads[i];
/* wake thread i, then block until its completion is signalled */
335 usocklnd_wakeup_pollthread(i);
336 cfs_mt_wait_for_completion(&pt->upt_completion);
339 pthread_rwlock_destroy(&usock_data.ud_peers_lock);
/* All ud_npollthreads states are released, not only the first n:
 * usocklnd_base_startup() initialises every state before it spawns the
 * first thread. */
341 usocklnd_release_poll_states(usock_data.ud_npollthreads);
343 LIBCFS_FREE (usock_data.ud_pollthreads,
344 usock_data.ud_npollthreads *
345 sizeof(usock_pollthread_t));
/* back to the initial state, so the next usocklnd_startup() runs
 * usocklnd_base_startup() again */
347 usock_data.ud_state = UD_STATE_INIT_NOTHING;
/*
 * Build an incarnation value from the current time of day: the seconds
 * and microseconds reported by gettimeofday() combined into a single
 * microsecond count.  usocklnd_startup() stores it in net->un_incarnation.
 */
351 usocklnd_new_incarnation()
354 int rc = gettimeofday(&tv, NULL);
/* widen tv_sec to __u64 before scaling it to microseconds */
356 return (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
/*
 * Choose the local IP address for ni and rebuild ni->ni_nid from it with
 * LNET_MKNID(), keeping the LNET_NIDNET() part of the current NID.
 *
 * Returns an int status which usocklnd_startup() stores in its rc.
 */
360 usocklnd_assign_ni_nid(lnet_ni_t *ni)
366 /* Find correct IP-address and update ni_nid with it.
367 * Two cases are supported:
368 * 1) no explicit interfaces are defined. NID will be assigned to
369 * first non-lo interface that is up;
370 * 2) exactly one explicit interface is defined. For example,
371 * LNET_NETWORKS='tcp(eth0)' */
/* case 1: no explicit interface - scan the enumerated interfaces */
373 if (ni->ni_interfaces[0] == NULL) {
/* n is both the number of names returned and, in the error message
 * below, the value reported when enumeration fails */
377 n = libcfs_ipif_enumerate(&names);
379 CERROR("Can't enumerate interfaces: %d\n", n);
383 for (i = 0; i < n; i++) {
385 if (!strcmp(names[i], "lo")) /* skip the loopback IF */
/* query the up/down state and the IP address of this candidate */
388 rc = libcfs_ipif_query(names[i], &up, &ipaddr);
390 CWARN("Can't get interface %s info: %d\n",
396 CWARN("Ignoring interface %s (down)\n",
401 break; /* one address is quite enough */
/* the name list is no longer needed once the scan is over */
404 libcfs_ipif_free_enumeration(names, n);
407 CERROR("Can't find any usable interfaces\n");
411 CDEBUG(D_NET, "No explicit interfaces defined. "
412 "%u.%u.%u.%u used\n", HIPQUAD(ipaddr));
/* case 2: explicit interface - a second entry in ni_interfaces[] is
 * reported as an error */
414 if (ni->ni_interfaces[1] != NULL) {
415 CERROR("only one explicit interface is allowed\n");
419 rc = libcfs_ipif_query(ni->ni_interfaces[0], &up, &ipaddr);
421 CERROR("Can't get interface %s info: %d\n",
422 ni->ni_interfaces[0], rc);
/* unlike case 1, a down interface is reported with CERROR, not CWARN */
427 CERROR("Explicit interface defined: %s but is down\n",
428 ni->ni_interfaces[0]);
432 CDEBUG(D_NET, "Explicit interface defined: %s. "
433 "%u.%u.%u.%u used\n",
434 ni->ni_interfaces[0], HIPQUAD(ipaddr));
/* keep the network part of the NID, replace the address part */
438 ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ipaddr);
/*
 * LND startup handler (installed as .lnd_startup) for one network
 * interface ni:
 *  - run usocklnd_base_startup() if the shared state is not initialised;
 *  - allocate and initialise the per-net usock_net_t;
 *  - derive ni->ni_nid from a local IP address;
 *  - copy the credit tunables into ni and count the net as started.
 */
444 usocklnd_startup(lnet_ni_t *ni)
/* the first net to start also brings up the shared state */
449 if (usock_data.ud_state == UD_STATE_INIT_NOTHING) {
450 rc = usocklnd_base_startup();
455 LIBCFS_ALLOC(net, sizeof(*net));
457 goto startup_failed_0;
459 memset(net, 0, sizeof(*net));
460 net->un_incarnation = usocklnd_new_incarnation();
/* un_lock and un_cond are used by usocklnd_shutdown() to wait until
 * un_peercount reaches zero */
461 pthread_mutex_init(&net->un_lock, NULL);
462 pthread_cond_init(&net->un_cond, NULL);
466 rc = usocklnd_assign_ni_nid(ni);
468 goto startup_failed_1;
470 LASSERT (ni->ni_lnd == &the_tcplnd);
472 ni->ni_maxtxcredits = usock_tuns.ut_txcredits;
473 ni->ni_peertxcredits = usock_tuns.ut_peertxcredits;
475 usock_data.ud_nets_count++;
/* failure unwinding: undo the per-net initialisation ... */
479 pthread_mutex_destroy(&net->un_lock);
480 pthread_cond_destroy(&net->un_cond);
481 LIBCFS_FREE(net, sizeof(*net));
/* ... and tear the shared state down again unless another net that
 * started earlier is still using it */
483 if (usock_data.ud_nets_count == 0)
484 usocklnd_base_shutdown(usock_data.ud_npollthreads);
/*
 * LND shutdown handler (installed as .lnd_shutdown) for one network
 * interface ni: flags the net as shutting down, deletes all of its peers,
 * waits until the peer count has dropped to zero, frees the usock_net_t
 * and, if this was the last net, shuts the shared state down as well.
 */
490 usocklnd_shutdown(lnet_ni_t *ni)
492 usock_net_t *net = ni->ni_data;
/* set before the peers are deleted
 * NOTE(review): presumably checked elsewhere to refuse new peers while
 * the net is going away - confirm */
494 net->un_shutdown = 1;
496 usocklnd_del_all_peers(ni);
498 /* Wait for all peer state to clean up */
499 pthread_mutex_lock(&net->un_lock);
/* un_cond is re-checked in a loop, so a wakeup that arrives while
 * un_peercount is still non-zero just waits again */
500 while (net->un_peercount != 0)
501 pthread_cond_wait(&net->un_cond, &net->un_lock);
502 pthread_mutex_unlock(&net->un_lock);
504 /* Release usock_net_t structure */
505 pthread_mutex_destroy(&net->un_lock);
506 pthread_cond_destroy(&net->un_cond);
507 LIBCFS_FREE(net, sizeof(*net));
/* the last net to go away also stops the poll threads and frees the
 * shared state */
509 usock_data.ud_nets_count--;
510 if (usock_data.ud_nets_count == 0)
511 usocklnd_base_shutdown(usock_data.ud_npollthreads);
/*
 * Walk the whole peer hash table and delete the peers that belong to ni
 * (see usocklnd_del_peer_and_conns()), then wake every poll thread.
 */
515 usocklnd_del_all_peers(lnet_ni_t *ni)
/* peers are unlinked from the hash lists below, so take the write lock */
522 pthread_rwlock_wrlock(&usock_data.ud_peers_lock);
524 for (i = 0; i < UD_PEER_HASH_SIZE; i++) {
/* the "safe" iterator is required because the current entry may be
 * removed from the list inside the loop body */
525 cfs_list_for_each_safe (ptmp, pnxt, &usock_data.ud_peers[i]) {
526 peer = cfs_list_entry (ptmp, usock_peer_t, up_list);
/* filter on the owning NI: only peers of ni are of interest */
528 if (peer->up_ni != ni)
531 usocklnd_del_peer_and_conns(peer);
535 pthread_rwlock_unlock(&usock_data.ud_peers_lock);
537 /* wakeup all threads */
538 for (i = 0; i < usock_data.ud_npollthreads; i++)
539 usocklnd_wakeup_pollthread(i);
/*
 * Kill the connections of peer, unlink the peer from the peer hash list
 * and drop one reference on it.
 *
 * The caller must already protect the peer hash list;
 * usocklnd_del_all_peers() calls this with usock_data.ud_peers_lock held
 * for writing.
 */
543 usocklnd_del_peer_and_conns(usock_peer_t *peer)
545 /* peer cannot disappear because it's still in hash list */
547 pthread_mutex_lock(&peer->up_lock);
548 /* content of conn[] array cannot change now */
549 usocklnd_del_conns_locked(peer);
550 pthread_mutex_unlock(&peer->up_lock);
552 /* peer hash list is still protected by the caller */
553 cfs_list_del(&peer->up_list);
555 usocklnd_peer_decref(peer); /* peer isn't in hash list anymore */
/*
 * Kill the connections stored in peer->up_conns[], which has one slot per
 * connection type (N_CONN_TYPES slots).
 *
 * usocklnd_del_peer_and_conns() calls this with peer->up_lock held, so the
 * contents of up_conns[] cannot change during the walk.
 */
559 usocklnd_del_conns_locked(usock_peer_t *peer)
563 for (i=0; i < N_CONN_TYPES; i++) {
564 usock_conn_t *conn = peer->up_conns[i];
566 usocklnd_conn_kill(conn);