1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2007 Cluster File Systems, Inc.
5 * Author: Maxim Patlasov <maxim@clusterfs.com>
7 * This file is part of the Lustre file system, http://www.lustre.org
8 * Lustre is a trademark of Cluster File Systems, Inc.
/* Entries of an LND operations table that bind this file's usocklnd_*
 * handlers to the corresponding lnd_* slots.
 * NOTE(review): the line that opens this initializer is not visible in this
 * listing; presumably the object is the_tcplnd, which usocklnd_startup()
 * asserts ni->ni_lnd points to -- confirm. */
17 .lnd_startup = usocklnd_startup,
18 .lnd_shutdown = usocklnd_shutdown,
19 .lnd_send = usocklnd_send,
20 .lnd_recv = usocklnd_recv,
21 .lnd_accept = usocklnd_accept,
/* Module-wide state used throughout this file: the poll-thread array and
 * its count, the peer hash table with its rwlock, the init state, the
 * shutdown flag and the count of started nets. */
24 usock_data_t usock_data;
/* Tunables; checked by usocklnd_validate_tunables() and refreshed by
 * usocklnd_update_tunables().
 * NOTE(review): most of the default values are not visible in this listing. */
25 usock_tunables_t usock_tuns = {
/* default for the value copied into ni->ni_peertxcredits at startup */
32 .ut_peertxcredits = 8,
/* Upper bounds enforced by usocklnd_validate_tunables():
 * MAX_REASONABLE_TIMEOUT caps ut_timeout and ut_poll_timeout,
 * MAX_REASONABLE_NPT caps ut_npollthreads. */
37 #define MAX_REASONABLE_TIMEOUT 36000 /* 10 hours */
38 #define MAX_REASONABLE_NPT 1000
/*
 * Sanity-check the current values in usock_tuns.  Every failed check
 * reports the offending tunable and its value through CERROR.
 *
 * NOTE(review): the statement that follows each CERROR and the final
 * return are not visible in this listing.  The caller,
 * usocklnd_update_tunables(), tests the result as a boolean, so presumably
 * non-zero means "invalid" -- confirm.
 */
41 usocklnd_validate_tunables()
/* ut_timeout must be positive and at most MAX_REASONABLE_TIMEOUT */
43 if (usock_tuns.ut_timeout <= 0 ||
44 usock_tuns.ut_timeout > MAX_REASONABLE_TIMEOUT) {
45 CERROR("USOCK_TIMEOUT: %d is out of reasonable limits\n",
46 usock_tuns.ut_timeout);
/* ut_poll_timeout is held to the same range as ut_timeout */
50 if (usock_tuns.ut_poll_timeout <= 0 ||
51 usock_tuns.ut_poll_timeout > MAX_REASONABLE_TIMEOUT) {
52 CERROR("USOCK_POLL_TIMEOUT: %d is out of reasonable limits\n",
53 usock_tuns.ut_poll_timeout);
/* ut_fair_limit must be strictly positive */
57 if (usock_tuns.ut_fair_limit <= 0) {
58 CERROR("Invalid USOCK_FAIR_LIMIT: %d (should be >0)\n",
59 usock_tuns.ut_fair_limit);
/* 0 is accepted here: usocklnd_update_tunables() later replaces a zero
 * ut_npollthreads with cfs_online_cpus() */
63 if (usock_tuns.ut_npollthreads < 0 ||
64 usock_tuns.ut_npollthreads > MAX_REASONABLE_NPT) {
65 CERROR("USOCK_NPOLLTHREADS: %d is out of reasonable limits\n",
66 usock_tuns.ut_npollthreads);
/* both credit counts must be positive ... */
70 if (usock_tuns.ut_txcredits <= 0) {
71 CERROR("USOCK_TXCREDITS: %d should be positive\n",
72 usock_tuns.ut_txcredits);
76 if (usock_tuns.ut_peertxcredits <= 0) {
77 CERROR("USOCK_PEERTXCREDITS: %d should be positive\n",
78 usock_tuns.ut_peertxcredits);
/* ... and the per-peer credits may not exceed the overall credits */
82 if (usock_tuns.ut_peertxcredits > usock_tuns.ut_txcredits) {
83 CERROR("USOCK_PEERTXCREDITS: %d should not be greater"
84 " than USOCK_TXCREDITS: %d\n",
85 usock_tuns.ut_peertxcredits, usock_tuns.ut_txcredits);
/* ut_socknagle is a boolean switch: only 0 and 1 are accepted */
89 if (usock_tuns.ut_socknagle != 0 &&
90 usock_tuns.ut_socknagle != 1) {
91 CERROR("USOCK_SOCKNAGLE: %d should be 0 or 1\n",
92 usock_tuns.ut_socknagle);
/* ut_sockbufsiz may be 0 but not negative */
96 if (usock_tuns.ut_sockbufsiz < 0) {
97 CERROR("USOCK_SOCKBUFSIZ: %d should be 0 or positive\n",
98 usock_tuns.ut_sockbufsiz);
106 usocklnd_release_poll_states(int n)
110 for (i = 0; i < n; i++) {
111 usock_pollthread_t *pt = &usock_data.ud_pollthreads[i];
113 close(pt->upt_notifier_fd);
114 close(pt->upt_pollfd[0].fd);
116 pthread_mutex_destroy(&pt->upt_pollrequests_lock);
117 cfs_fini_completion(&pt->upt_completion);
119 LIBCFS_FREE (pt->upt_pollfd,
120 sizeof(struct pollfd) * pt->upt_npollfd);
121 LIBCFS_FREE (pt->upt_idx2conn,
122 sizeof(usock_conn_t *) * pt->upt_npollfd);
123 LIBCFS_FREE (pt->upt_fd2idx,
124 sizeof(int) * pt->upt_nfd2idx);
/*
 * Refresh usock_tuns: hand each tunable to cfs_parse_int_tunable() together
 * with its name, validate the result with usocklnd_validate_tunables(), and
 * if ut_npollthreads is 0 substitute the value of cfs_online_cpus().
 *
 * NOTE(review): several name arguments, the handling of rc after each
 * parse and all return statements are not visible in this listing.  Where
 * cfs_parse_int_tunable() obtains the named value (presumably the process
 * environment) is not shown here either -- confirm.
 */
129 usocklnd_update_tunables()
133 rc = cfs_parse_int_tunable(&usock_tuns.ut_timeout,
138 rc = cfs_parse_int_tunable(&usock_tuns.ut_poll_timeout,
139 "USOCK_POLL_TIMEOUT");
143 rc = cfs_parse_int_tunable(&usock_tuns.ut_npollthreads,
144 "USOCK_NPOLLTHREADS");
148 rc = cfs_parse_int_tunable(&usock_tuns.ut_fair_limit,
/* ut_min_bulk is parsed here but has no check in
 * usocklnd_validate_tunables() */
153 rc = cfs_parse_int_tunable(&usock_tuns.ut_min_bulk,
158 rc = cfs_parse_int_tunable(&usock_tuns.ut_txcredits,
163 rc = cfs_parse_int_tunable(&usock_tuns.ut_peertxcredits,
164 "USOCK_PEERTXCREDITS");
168 rc = cfs_parse_int_tunable(&usock_tuns.ut_socknagle,
173 rc = cfs_parse_int_tunable(&usock_tuns.ut_sockbufsiz,
/* reject the whole set if any single value is out of range */
178 if (usocklnd_validate_tunables())
/* 0 poll threads requested: use the number of online CPUs instead */
181 if (usock_tuns.ut_npollthreads == 0) {
182 usock_tuns.ut_npollthreads = cfs_online_cpus();
/* cfs_online_cpus() itself may yield a non-positive value */
184 if (usock_tuns.ut_npollthreads <= 0) {
185 CERROR("Cannot find out the number of online CPUs\n");
/*
 * Initialise the state shared by all nets.  usocklnd_startup() calls this
 * when usock_data.ud_state is UD_STATE_INIT_NOTHING.
 *
 * Steps, in order:
 *   - refresh the tunables via usocklnd_update_tunables();
 *   - allocate usock_data.ud_pollthreads with ut_npollthreads entries;
 *   - for each entry allocate its four arrays at UPT_START_SIZ elements,
 *     create a socket pair and initialise the lists, mutex and completion;
 *   - initialise the peer hash table and its rwlock;
 *   - spawn one usocklnd_poll_thread per entry;
 *   - set ud_state to UD_STATE_INITIALIZED.
 *
 * NOTE(review): return statements and some rc checks are not visible in
 * this listing.
 */
195 usocklnd_base_startup()
197 usock_pollthread_t *pt;
201 rc = usocklnd_update_tunables();
205 usock_data.ud_npollthreads = usock_tuns.ut_npollthreads;
207 LIBCFS_ALLOC (usock_data.ud_pollthreads,
208 usock_data.ud_npollthreads *
209 sizeof(usock_pollthread_t));
210 if (usock_data.ud_pollthreads == NULL)
213 /* Initialize poll thread state structures */
214 for (i = 0; i < usock_data.ud_npollthreads; i++) {
217 pt = &usock_data.ud_pollthreads[i];
/* Each failed allocation jumps to the label that frees only what this
 * entry has allocated so far (see the unwind chain at the bottom). */
221 LIBCFS_ALLOC (pt->upt_pollfd,
222 sizeof(struct pollfd) * UPT_START_SIZ);
223 if (pt->upt_pollfd == NULL)
224 goto base_startup_failed_0;
226 LIBCFS_ALLOC (pt->upt_idx2conn,
227 sizeof(usock_conn_t *) * UPT_START_SIZ);
228 if (pt->upt_idx2conn == NULL)
229 goto base_startup_failed_1;
231 LIBCFS_ALLOC (pt->upt_fd2idx,
232 sizeof(int) * UPT_START_SIZ);
233 if (pt->upt_fd2idx == NULL)
234 goto base_startup_failed_2;
/* the fd -> index map starts out all zero */
236 memset(pt->upt_fd2idx, 0,
237 sizeof(int) * UPT_START_SIZ);
239 LIBCFS_ALLOC (pt->upt_skip,
240 sizeof(int) * UPT_START_SIZ);
241 if (pt->upt_skip == NULL)
242 goto base_startup_failed_3;
/* record the capacities that usocklnd_release_poll_states() later uses
 * as the sizes to free */
244 pt->upt_npollfd = pt->upt_nfd2idx = UPT_START_SIZ;
246 rc = libcfs_socketpair(notifier);
248 goto base_startup_failed_4;
/* notifier[0] is kept as upt_notifier_fd; notifier[1] occupies slot 0 of
 * the poll set and is watched for POLLIN */
250 pt->upt_notifier_fd = notifier[0];
252 pt->upt_pollfd[0].fd = notifier[1];
253 pt->upt_pollfd[0].events = POLLIN;
254 pt->upt_pollfd[0].revents = 0;
/* slot 0 has no connection associated with it */
257 pt->upt_idx2conn[0] = NULL;
260 CFS_INIT_LIST_HEAD (&pt->upt_pollrequests);
261 CFS_INIT_LIST_HEAD (&pt->upt_stale_list);
262 pthread_mutex_init(&pt->upt_pollrequests_lock, NULL);
263 cfs_init_completion(&pt->upt_completion);
266 /* Initialize peer hash list */
267 for (i = 0; i < UD_PEER_HASH_SIZE; i++)
268 CFS_INIT_LIST_HEAD(&usock_data.ud_peers[i]);
270 pthread_rwlock_init(&usock_data.ud_peers_lock, NULL);
272 /* Spawn poll threads */
273 for (i = 0; i < usock_data.ud_npollthreads; i++) {
274 rc = cfs_create_thread(usocklnd_poll_thread,
275 &usock_data.ud_pollthreads[i]);
/* On failure only the i threads already started are waited for;
 * usocklnd_base_shutdown() still releases the poll state of every entry
 * and frees the array. */
277 usocklnd_base_shutdown(i);
282 usock_data.ud_state = UD_STATE_INITIALIZED;
/* Unwind chain: entered at the label matching how far entry i got, it
 * frees that entry's arrays in reverse order of allocation. */
286 base_startup_failed_4:
287 LIBCFS_FREE (pt->upt_skip, sizeof(int) * UPT_START_SIZ);
288 base_startup_failed_3:
289 LIBCFS_FREE (pt->upt_fd2idx, sizeof(int) * UPT_START_SIZ);
290 base_startup_failed_2:
291 LIBCFS_FREE (pt->upt_idx2conn, sizeof(usock_conn_t *) * UPT_START_SIZ);
292 base_startup_failed_1:
293 LIBCFS_FREE (pt->upt_pollfd, sizeof(struct pollfd) * UPT_START_SIZ);
294 base_startup_failed_0:
/* entries 0 .. i-1 were fully initialised; release them, then the array */
296 usocklnd_release_poll_states(i);
297 LIBCFS_FREE (usock_data.ud_pollthreads,
298 usock_data.ud_npollthreads *
299 sizeof(usock_pollthread_t));
/*
 * Undo usocklnd_base_startup().
 *
 * Sets usock_data.ud_shutdown, then wakes each of the first n poll threads
 * and waits on its upt_completion (presumably signalled by the thread as
 * it exits -- confirm against usocklnd_poll_thread).  Afterwards destroys
 * the peers rwlock, releases the poll state of ALL ud_npollthreads entries
 * (not just the first n), frees the ud_pollthreads array and resets
 * ud_state to UD_STATE_INIT_NOTHING.
 *
 * n is the number of poll threads actually running; it is smaller than
 * ud_npollthreads when called from a partially failed startup.
 */
304 usocklnd_base_shutdown(int n)
308 usock_data.ud_shutdown = 1;
309 for (i = 0; i < n; i++) {
310 usock_pollthread_t *pt = &usock_data.ud_pollthreads[i];
311 usocklnd_wakeup_pollthread(i);
312 cfs_wait_for_completion(&pt->upt_completion);
315 pthread_rwlock_destroy(&usock_data.ud_peers_lock);
317 usocklnd_release_poll_states(usock_data.ud_npollthreads);
319 LIBCFS_FREE (usock_data.ud_pollthreads,
320 usock_data.ud_npollthreads *
321 sizeof(usock_pollthread_t));
323 usock_data.ud_state = UD_STATE_INIT_NOTHING;
/*
 * Return the current gettimeofday() time as a 64-bit count of microseconds
 * (tv_sec * 1000000 + tv_usec).  usocklnd_startup() stores the result in
 * net->un_incarnation.
 *
 * NOTE(review): how rc is checked is not visible in this listing.
 */
327 usocklnd_new_incarnation()
330 int rc = gettimeofday(&tv, NULL);
/* widen tv_sec before multiplying so the product is computed in 64 bits */
332 return (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
/*
 * Choose an IP address for ni and rebuild ni->ni_nid from it, keeping the
 * network part of the existing NID (LNET_NIDNET) and replacing the address
 * part.
 *
 * NOTE(review): the conditions guarding most CERROR/CWARN calls and all
 * return statements are not visible in this listing.
 */
336 usocklnd_assign_ni_nid(lnet_ni_t *ni)
342 /* Find correct IP-address and update ni_nid with it.
343 * Two cases are supported:
344 * 1) no explicit interfaces are defined. NID will be assigned to
345 * first non-lo interface that is up;
346 * 2) exactly one explicit interface is defined. For example,
347 * LNET_NETWORKS='tcp(eth0)' */
/* case 1: no interface named, scan the enumerated interfaces */
349 if (ni->ni_interfaces[0] == NULL) {
353 n = libcfs_ipif_enumerate(&names);
355 CERROR("Can't enumerate interfaces: %d\n", n);
359 for (i = 0; i < n; i++) {
361 if (!strcmp(names[i], "lo")) /* skip the loopback IF */
364 rc = libcfs_ipif_query(names[i], &up, &ipaddr);
366 CWARN("Can't get interface %s info: %d\n",
372 CWARN("Ignoring interface %s (down)\n",
377 break; /* one address is quite enough */
/* the name list is released whether or not a usable interface was found */
380 libcfs_ipif_free_enumeration(names, n);
383 CERROR("Can't find any usable interfaces\n");
387 CDEBUG(D_NET, "No explicit interfaces defined. "
388 "%u.%u.%u.%u used\n", HIPQUAD(ipaddr));
/* case 2: an interface was named; a second one is rejected */
390 if (ni->ni_interfaces[1] != NULL) {
391 CERROR("only one explicit interface is allowed\n");
395 rc = libcfs_ipif_query(ni->ni_interfaces[0], &up, &ipaddr);
397 CERROR("Can't get interface %s info: %d\n",
398 ni->ni_interfaces[0], rc);
403 CERROR("Explicit interface defined: %s but is down\n",
404 ni->ni_interfaces[0]);
408 CDEBUG(D_NET, "Explicit interface defined: %s. "
409 "%u.%u.%u.%u used\n",
410 ni->ni_interfaces[0], HIPQUAD(ipaddr));
/* keep the net number, swap in the chosen address */
414 ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ipaddr);
/*
 * Start-up handler for one network interface.
 *
 * Runs usocklnd_base_startup() if the shared state is still
 * UD_STATE_INIT_NOTHING, allocates and zeroes a per-net structure, gives it
 * a fresh incarnation plus a mutex and condition variable, assigns the NID
 * from a local interface when the_lnet.ln_pid lacks LNET_PID_USERFLAG,
 * copies the credit tunables into ni and increments ud_nets_count.
 *
 * NOTE(review): the rc checks, the failure labels, the assignment of net to
 * ni and the return statements are not visible in this listing.
 */
420 usocklnd_startup(lnet_ni_t *ni)
/* the first net to start brings up the shared poll threads */
425 if (usock_data.ud_state == UD_STATE_INIT_NOTHING) {
426 rc = usocklnd_base_startup();
431 LIBCFS_ALLOC(net, sizeof(*net));
433 goto startup_failed_0;
435 memset(net, 0, sizeof(*net));
436 net->un_incarnation = usocklnd_new_incarnation();
/* un_lock/un_cond are what usocklnd_shutdown() waits on until
 * un_peercount reaches 0 */
437 pthread_mutex_init(&net->un_lock, NULL);
438 pthread_cond_init(&net->un_cond, NULL);
/* only derive the NID from a local interface when LNET_PID_USERFLAG is
 * not set in the_lnet.ln_pid */
442 if (!(the_lnet.ln_pid & LNET_PID_USERFLAG)) {
443 rc = usocklnd_assign_ni_nid(ni);
445 goto startup_failed_1;
448 LASSERT (ni->ni_lnd == &the_tcplnd);
450 ni->ni_maxtxcredits = usock_tuns.ut_txcredits;
451 ni->ni_peertxcredits = usock_tuns.ut_peertxcredits;
453 usock_data.ud_nets_count++;
/* failure unwinding: drop the per-net structure ... */
457 pthread_mutex_destroy(&net->un_lock);
458 pthread_cond_destroy(&net->un_cond);
459 LIBCFS_FREE(net, sizeof(*net));
/* ... and tear the shared state down again if no net is using it */
461 if (usock_data.ud_nets_count == 0)
462 usocklnd_base_shutdown(usock_data.ud_npollthreads);
/*
 * Shutdown handler for one network interface.
 *
 * Marks the net (ni->ni_data) as shutting down, deletes all of its peers,
 * blocks until net->un_peercount drops to 0, then destroys and frees the
 * per-net structure.  When the last net goes away (ud_nets_count reaches 0)
 * the shared state is torn down with usocklnd_base_shutdown().
 */
468 usocklnd_shutdown(lnet_ni_t *ni)
470 usock_net_t *net = ni->ni_data;
472 net->un_shutdown = 1;
474 usocklnd_del_all_peers(ni);
476 /* Wait for all peer state to clean up */
477 pthread_mutex_lock(&net->un_lock);
/* loop re-checks the count after every wakeup of un_cond */
478 while (net->un_peercount != 0)
479 pthread_cond_wait(&net->un_cond, &net->un_lock);
480 pthread_mutex_unlock(&net->un_lock);
482 /* Release usock_net_t structure */
483 pthread_mutex_destroy(&net->un_lock);
484 pthread_cond_destroy(&net->un_cond);
485 LIBCFS_FREE(net, sizeof(*net));
487 usock_data.ud_nets_count--;
488 if (usock_data.ud_nets_count == 0)
489 usocklnd_base_shutdown(usock_data.ud_npollthreads);
/*
 * Delete every peer that belongs to ni.
 *
 * Holds usock_data.ud_peers_lock for writing while walking all
 * UD_PEER_HASH_SIZE buckets and calling usocklnd_del_peer_and_conns() on
 * each matching peer, then wakes every poll thread.
 *
 * NOTE(review): the statement executed when peer->up_ni != ni is not
 * visible in this listing (presumably it skips to the next peer).
 */
493 usocklnd_del_all_peers(lnet_ni_t *ni)
495 struct list_head *ptmp;
496 struct list_head *pnxt;
500 pthread_rwlock_wrlock(&usock_data.ud_peers_lock);
502 for (i = 0; i < UD_PEER_HASH_SIZE; i++) {
/* the _safe iterator is required: the callee unlinks the current entry */
503 list_for_each_safe (ptmp, pnxt, &usock_data.ud_peers[i]) {
504 peer = list_entry (ptmp, usock_peer_t, up_list);
506 if (peer->up_ni != ni)
509 usocklnd_del_peer_and_conns(peer);
513 pthread_rwlock_unlock(&usock_data.ud_peers_lock);
515 /* wakeup all threads */
516 for (i = 0; i < usock_data.ud_npollthreads; i++)
517 usocklnd_wakeup_pollthread(i);
/*
 * Kill all connections of peer, unlink it from the peer hash list and drop
 * one reference on it.
 *
 * The caller must hold the lock protecting the hash list
 * (usocklnd_del_all_peers() holds ud_peers_lock for writing).
 */
521 usocklnd_del_peer_and_conns(usock_peer_t *peer)
523 /* peer cannot disappear because it's still in hash list */
525 pthread_mutex_lock(&peer->up_lock);
526 /* content of conn[] array cannot change now */
527 usocklnd_del_conns_locked(peer);
528 pthread_mutex_unlock(&peer->up_lock);
530 /* peer hash list is still protected by the caller */
531 list_del(&peer->up_list);
533 usocklnd_peer_decref(peer); /* peer isn't in hash list anymore */
/*
 * Walk the N_CONN_TYPES slots of peer->up_conns and kill the connections
 * with usocklnd_conn_kill().  usocklnd_del_peer_and_conns() calls this with
 * peer->up_lock held.
 *
 * NOTE(review): any guard on conn (e.g. a NULL check) and the end of the
 * function are not visible in this listing.
 */
537 usocklnd_del_conns_locked(usock_peer_t *peer)
541 for (i=0; i < N_CONN_TYPES; i++) {
542 usock_conn_t *conn = peer->up_conns[i];
544 usocklnd_conn_kill(conn);