Whamcloud - gitweb
new tag 2.2.93
[fs/lustre-release.git] / lnet / ulnds / socklnd / usocklnd.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  */
30 /*
31  * This file is part of Lustre, http://www.lustre.org/
32  * Lustre is a trademark of Sun Microsystems, Inc.
33  *
34  * lnet/ulnds/socklnd/usocklnd.c
35  *
36  * Author: Maxim Patlasov <maxim@clusterfs.com>
37  */
38
39 #include "usocklnd.h"
40 #include <sys/time.h>
41
42 lnd_t the_tcplnd = {
43         .lnd_type      = SOCKLND,
44         .lnd_startup   = usocklnd_startup,
45         .lnd_shutdown  = usocklnd_shutdown,
46         .lnd_send      = usocklnd_send,
47         .lnd_recv      = usocklnd_recv,
48         .lnd_accept    = usocklnd_accept,
49 };
50
51 usock_data_t usock_data;
52 usock_tunables_t usock_tuns = {
53         .ut_timeout         = 50,
54         .ut_poll_timeout    = 1,
55         .ut_fair_limit      = 1,
56         .ut_npollthreads    = 0,
57         .ut_min_bulk        = 1<<10,
58         .ut_txcredits       = 256,
59         .ut_peertxcredits   = 8,
60         .ut_socknagle       = 0,
61         .ut_sockbufsiz      = 0,
62 };
63
/* Upper bounds enforced by usocklnd_validate_tunables() */
#define MAX_REASONABLE_TIMEOUT (10 * 60 * 60) /* 10 hours */
#define MAX_REASONABLE_NPT     1000           /* poll threads */
66
67 int
68 usocklnd_validate_tunables()
69 {
70         if (usock_tuns.ut_timeout <= 0 ||
71             usock_tuns.ut_timeout > MAX_REASONABLE_TIMEOUT) {
72                 CERROR("USOCK_TIMEOUT: %d is out of reasonable limits\n",
73                        usock_tuns.ut_timeout);
74                 return -1;
75         }
76
77         if (usock_tuns.ut_poll_timeout <= 0 ||
78             usock_tuns.ut_poll_timeout > MAX_REASONABLE_TIMEOUT) {
79                 CERROR("USOCK_POLL_TIMEOUT: %d is out of reasonable limits\n",
80                        usock_tuns.ut_poll_timeout);
81                 return -1;
82         }
83
84         if (usock_tuns.ut_fair_limit <= 0) {
85                 CERROR("Invalid USOCK_FAIR_LIMIT: %d (should be >0)\n",
86                        usock_tuns.ut_fair_limit);
87                 return -1;
88         }
89
90         if (usock_tuns.ut_npollthreads < 0 ||
91             usock_tuns.ut_npollthreads > MAX_REASONABLE_NPT) {
92                 CERROR("USOCK_NPOLLTHREADS: %d is out of reasonable limits\n",
93                        usock_tuns.ut_npollthreads);
94                 return -1;
95         }
96
97         if (usock_tuns.ut_txcredits <= 0) {
98                 CERROR("USOCK_TXCREDITS: %d should be positive\n",
99                        usock_tuns.ut_txcredits);
100                 return -1;
101         }
102
103         if (usock_tuns.ut_peertxcredits <= 0) {
104                 CERROR("USOCK_PEERTXCREDITS: %d should be positive\n",
105                        usock_tuns.ut_peertxcredits);
106                 return -1;
107         }
108
109         if (usock_tuns.ut_peertxcredits > usock_tuns.ut_txcredits) {
110                 CERROR("USOCK_PEERTXCREDITS: %d should not be greater"
111                        " than USOCK_TXCREDITS: %d\n",
112                        usock_tuns.ut_peertxcredits, usock_tuns.ut_txcredits);
113                 return -1;
114         }
115
116         if (usock_tuns.ut_socknagle != 0 &&
117             usock_tuns.ut_socknagle != 1) {
118                 CERROR("USOCK_SOCKNAGLE: %d should be 0 or 1\n",
119                        usock_tuns.ut_socknagle);
120                 return -1;
121         }
122
123         if (usock_tuns.ut_sockbufsiz < 0) {
124                 CERROR("USOCK_SOCKBUFSIZ: %d should be 0 or positive\n",
125                        usock_tuns.ut_sockbufsiz);
126                 return -1;
127         }
128
129         return 0;
130 }
131
132 void
133 usocklnd_release_poll_states(int n)
134 {
135         int i;
136
137         for (i = 0; i < n; i++) {
138                 usock_pollthread_t *pt = &usock_data.ud_pollthreads[i];
139
140                 libcfs_sock_release(pt->upt_notifier[0]);
141                 libcfs_sock_release(pt->upt_notifier[1]);
142
143                 pthread_mutex_destroy(&pt->upt_pollrequests_lock);
144                 cfs_mt_fini_completion(&pt->upt_completion);
145
146                 LIBCFS_FREE (pt->upt_pollfd,
147                              sizeof(struct pollfd) * pt->upt_npollfd);
148                 LIBCFS_FREE (pt->upt_idx2conn,
149                               sizeof(usock_conn_t *) * pt->upt_npollfd);
150                 LIBCFS_FREE (pt->upt_fd2idx,
151                               sizeof(int) * pt->upt_nfd2idx);
152         }
153 }
154
155 int
156 usocklnd_update_tunables()
157 {
158         int rc;
159
160         rc = lnet_parse_int_tunable(&usock_tuns.ut_timeout,
161                                       "USOCK_TIMEOUT");
162         if (rc)
163                 return rc;
164
165         rc = lnet_parse_int_tunable(&usock_tuns.ut_poll_timeout,
166                                       "USOCK_POLL_TIMEOUT");
167         if (rc)
168                 return rc;
169
170         rc = lnet_parse_int_tunable(&usock_tuns.ut_npollthreads,
171                                       "USOCK_NPOLLTHREADS");
172         if (rc)
173                 return rc;
174
175         rc = lnet_parse_int_tunable(&usock_tuns.ut_fair_limit,
176                                       "USOCK_FAIR_LIMIT");
177         if (rc)
178                 return rc;
179
180         rc = lnet_parse_int_tunable(&usock_tuns.ut_min_bulk,
181                                       "USOCK_MIN_BULK");
182         if (rc)
183                 return rc;
184
185         rc = lnet_parse_int_tunable(&usock_tuns.ut_txcredits,
186                                       "USOCK_TXCREDITS");
187         if (rc)
188                 return rc;
189
190         rc = lnet_parse_int_tunable(&usock_tuns.ut_peertxcredits,
191                                       "USOCK_PEERTXCREDITS");
192         if (rc)
193                 return rc;
194
195         rc = lnet_parse_int_tunable(&usock_tuns.ut_socknagle,
196                                       "USOCK_SOCKNAGLE");
197         if (rc)
198                 return rc;
199
200         rc = lnet_parse_int_tunable(&usock_tuns.ut_sockbufsiz,
201                                       "USOCK_SOCKBUFSIZ");
202         if (rc)
203                 return rc;
204
205         if (usocklnd_validate_tunables())
206                 return -EINVAL;
207
208         if (usock_tuns.ut_npollthreads == 0) {
209                 usock_tuns.ut_npollthreads = cfs_online_cpus();
210
211                 if (usock_tuns.ut_npollthreads <= 0) {
212                         CERROR("Cannot find out the number of online CPUs\n");
213                         return -EINVAL;
214                 }
215         }
216
217         return 0;
218 }
219
220
221 int
222 usocklnd_base_startup()
223 {
224         usock_pollthread_t *pt;
225         int                 i;
226         int                 rc;
227
228         rc = usocklnd_update_tunables();
229         if (rc)
230                 return rc;
231
232         usock_data.ud_npollthreads = usock_tuns.ut_npollthreads;
233
234         LIBCFS_ALLOC (usock_data.ud_pollthreads,
235                       usock_data.ud_npollthreads *
236                       sizeof(usock_pollthread_t));
237         if (usock_data.ud_pollthreads == NULL)
238                 return -ENOMEM;
239
240         /* Initialize poll thread state structures */
241         for (i = 0; i < usock_data.ud_npollthreads; i++) {
242
243                 pt = &usock_data.ud_pollthreads[i];
244
245                 rc = -ENOMEM;
246
247                 LIBCFS_ALLOC (pt->upt_pollfd,
248                               sizeof(struct pollfd) * UPT_START_SIZ);
249                 if (pt->upt_pollfd == NULL)
250                         goto base_startup_failed_0;
251
252                 LIBCFS_ALLOC (pt->upt_idx2conn,
253                               sizeof(usock_conn_t *) * UPT_START_SIZ);
254                 if (pt->upt_idx2conn == NULL)
255                         goto base_startup_failed_1;
256
257                 LIBCFS_ALLOC (pt->upt_fd2idx,
258                               sizeof(int) * UPT_START_SIZ);
259                 if (pt->upt_fd2idx == NULL)
260                         goto base_startup_failed_2;
261
262                 memset(pt->upt_fd2idx, 0,
263                        sizeof(int) * UPT_START_SIZ);
264
265                 LIBCFS_ALLOC (pt->upt_skip,
266                               sizeof(int) * UPT_START_SIZ);
267                 if (pt->upt_skip == NULL)
268                         goto base_startup_failed_3;
269
270                 pt->upt_npollfd = pt->upt_nfd2idx = UPT_START_SIZ;
271
272                 rc = libcfs_socketpair(pt->upt_notifier);
273                 if (rc != 0)
274                         goto base_startup_failed_4;
275
276                 pt->upt_pollfd[0].fd = LIBCFS_SOCK2FD(pt->upt_notifier[1]);
277                 pt->upt_pollfd[0].events = POLLIN;
278                 pt->upt_pollfd[0].revents = 0;
279
280                 pt->upt_nfds = 1;
281                 pt->upt_idx2conn[0] = NULL;
282
283                 pt->upt_errno = 0;
284                 CFS_INIT_LIST_HEAD (&pt->upt_pollrequests);
285                 CFS_INIT_LIST_HEAD (&pt->upt_stale_list);
286                 pthread_mutex_init(&pt->upt_pollrequests_lock, NULL);
287                 cfs_mt_init_completion(&pt->upt_completion);
288         }
289
290         /* Initialize peer hash list */
291         for (i = 0; i < UD_PEER_HASH_SIZE; i++)
292                 CFS_INIT_LIST_HEAD(&usock_data.ud_peers[i]);
293
294         pthread_rwlock_init(&usock_data.ud_peers_lock, NULL);
295
296         /* Spawn poll threads */
297         for (i = 0; i < usock_data.ud_npollthreads; i++) {
298                 rc = cfs_create_thread(usocklnd_poll_thread,
299                                        &usock_data.ud_pollthreads[i], 0);
300                 if (rc) {
301                         usocklnd_base_shutdown(i);
302                         return rc;
303                 }
304         }
305
306         usock_data.ud_state = UD_STATE_INITIALIZED;
307
308         return 0;
309
310   base_startup_failed_4:
311         LIBCFS_FREE (pt->upt_skip, sizeof(int) * UPT_START_SIZ);
312   base_startup_failed_3:
313         LIBCFS_FREE (pt->upt_fd2idx, sizeof(int) * UPT_START_SIZ);
314   base_startup_failed_2:
315         LIBCFS_FREE (pt->upt_idx2conn, sizeof(usock_conn_t *) * UPT_START_SIZ);
316   base_startup_failed_1:
317         LIBCFS_FREE (pt->upt_pollfd, sizeof(struct pollfd) * UPT_START_SIZ);
318   base_startup_failed_0:
319         LASSERT(rc != 0);
320         usocklnd_release_poll_states(i);
321         LIBCFS_FREE (usock_data.ud_pollthreads,
322                      usock_data.ud_npollthreads *
323                      sizeof(usock_pollthread_t));
324         return rc;
325 }
326
327 void
328 usocklnd_base_shutdown(int n)
329 {
330         int i;
331
332         usock_data.ud_shutdown = 1;
333         for (i = 0; i < n; i++) {
334                 usock_pollthread_t *pt = &usock_data.ud_pollthreads[i];
335                 usocklnd_wakeup_pollthread(i);
336                 cfs_mt_wait_for_completion(&pt->upt_completion);
337         }
338
339         pthread_rwlock_destroy(&usock_data.ud_peers_lock);
340
341         usocklnd_release_poll_states(usock_data.ud_npollthreads);
342
343         LIBCFS_FREE (usock_data.ud_pollthreads,
344                      usock_data.ud_npollthreads *
345                      sizeof(usock_pollthread_t));
346
347         usock_data.ud_state = UD_STATE_INIT_NOTHING;
348 }
349
350 __u64
351 usocklnd_new_incarnation()
352 {
353         struct timeval tv;
354         int            rc = gettimeofday(&tv, NULL);
355         LASSERT (rc == 0);
356         return (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
357 }
358
359 static int
360 usocklnd_assign_ni_nid(lnet_ni_t *ni)
361 {
362         int   rc;
363         int   up;
364         __u32 ipaddr;
365
366         /* Find correct IP-address and update ni_nid with it.
367          * Two cases are supported:
368          * 1) no explicit interfaces are defined. NID will be assigned to
369          * first non-lo interface that is up;
370          * 2) exactly one explicit interface is defined. For example,
371          * LNET_NETWORKS='tcp(eth0)' */
372
373         if (ni->ni_interfaces[0] == NULL) {
374                 char **names;
375                 int    i, n;
376
377                 n = libcfs_ipif_enumerate(&names);
378                 if (n <= 0) {
379                         CERROR("Can't enumerate interfaces: %d\n", n);
380                         return -1;
381                 }
382
383                 for (i = 0; i < n; i++) {
384
385                         if (!strcmp(names[i], "lo")) /* skip the loopback IF */
386                                 continue;
387
388                         rc = libcfs_ipif_query(names[i], &up, &ipaddr);
389                         if (rc != 0) {
390                                 CWARN("Can't get interface %s info: %d\n",
391                                       names[i], rc);
392                                 continue;
393                         }
394
395                         if (!up) {
396                                 CWARN("Ignoring interface %s (down)\n",
397                                       names[i]);
398                             continue;
399                         }
400
401                         break;      /* one address is quite enough */
402                 }
403
404                 libcfs_ipif_free_enumeration(names, n);
405
406                 if (i >= n) {
407                         CERROR("Can't find any usable interfaces\n");
408                         return -1;
409                 }
410
411                 CDEBUG(D_NET, "No explicit interfaces defined. "
412                        "%u.%u.%u.%u used\n", HIPQUAD(ipaddr));
413         } else {
414                 if (ni->ni_interfaces[1] != NULL) {
415                         CERROR("only one explicit interface is allowed\n");
416                         return -1;
417                 }
418
419                 rc = libcfs_ipif_query(ni->ni_interfaces[0], &up, &ipaddr);
420                 if (rc != 0) {
421                         CERROR("Can't get interface %s info: %d\n",
422                                ni->ni_interfaces[0], rc);
423                         return -1;
424                 }
425
426                 if (!up) {
427                         CERROR("Explicit interface defined: %s but is down\n",
428                                ni->ni_interfaces[0]);
429                         return -1;
430                 }
431
432                 CDEBUG(D_NET, "Explicit interface defined: %s. "
433                        "%u.%u.%u.%u used\n",
434                        ni->ni_interfaces[0], HIPQUAD(ipaddr));
435
436         }
437
438         ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ipaddr);
439
440         return 0;
441 }
442
443 int
444 usocklnd_startup(lnet_ni_t *ni)
445 {
446         int          rc;
447         usock_net_t *net;
448
449         if (usock_data.ud_state == UD_STATE_INIT_NOTHING) {
450                 rc = usocklnd_base_startup();
451                 if (rc != 0)
452                         return rc;
453         }
454
455         LIBCFS_ALLOC(net, sizeof(*net));
456         if (net == NULL)
457                 goto startup_failed_0;
458
459         memset(net, 0, sizeof(*net));
460         net->un_incarnation = usocklnd_new_incarnation();
461         pthread_mutex_init(&net->un_lock, NULL);
462         pthread_cond_init(&net->un_cond, NULL);
463
464         ni->ni_data = net;
465
466         rc = usocklnd_assign_ni_nid(ni);
467         if (rc != 0)
468                 goto startup_failed_1;
469
470         LASSERT (ni->ni_lnd == &the_tcplnd);
471
472         ni->ni_maxtxcredits = usock_tuns.ut_txcredits;
473         ni->ni_peertxcredits = usock_tuns.ut_peertxcredits;
474
475         usock_data.ud_nets_count++;
476         return 0;
477
478  startup_failed_1:
479         pthread_mutex_destroy(&net->un_lock);
480         pthread_cond_destroy(&net->un_cond);
481         LIBCFS_FREE(net, sizeof(*net));
482  startup_failed_0:
483         if (usock_data.ud_nets_count == 0)
484                 usocklnd_base_shutdown(usock_data.ud_npollthreads);
485
486         return -ENETDOWN;
487 }
488
489 void
490 usocklnd_shutdown(lnet_ni_t *ni)
491 {
492         usock_net_t *net = ni->ni_data;
493
494         net->un_shutdown = 1;
495
496         usocklnd_del_all_peers(ni);
497
498         /* Wait for all peer state to clean up */
499         pthread_mutex_lock(&net->un_lock);
500         while (net->un_peercount != 0)
501                 pthread_cond_wait(&net->un_cond, &net->un_lock);
502         pthread_mutex_unlock(&net->un_lock);
503
504         /* Release usock_net_t structure */
505         pthread_mutex_destroy(&net->un_lock);
506         pthread_cond_destroy(&net->un_cond);
507         LIBCFS_FREE(net, sizeof(*net));
508
509         usock_data.ud_nets_count--;
510         if (usock_data.ud_nets_count == 0)
511                 usocklnd_base_shutdown(usock_data.ud_npollthreads);
512 }
513
514 void
515 usocklnd_del_all_peers(lnet_ni_t *ni)
516 {
517         cfs_list_t        *ptmp;
518         cfs_list_t        *pnxt;
519         usock_peer_t      *peer;
520         int                i;
521
522         pthread_rwlock_wrlock(&usock_data.ud_peers_lock);
523
524         for (i = 0; i < UD_PEER_HASH_SIZE; i++) {
525                 cfs_list_for_each_safe (ptmp, pnxt, &usock_data.ud_peers[i]) {
526                         peer = cfs_list_entry (ptmp, usock_peer_t, up_list);
527
528                         if (peer->up_ni != ni)
529                                 continue;
530
531                         usocklnd_del_peer_and_conns(peer);
532                 }
533         }
534
535         pthread_rwlock_unlock(&usock_data.ud_peers_lock);
536
537         /* wakeup all threads */
538         for (i = 0; i < usock_data.ud_npollthreads; i++)
539                 usocklnd_wakeup_pollthread(i);
540 }
541
542 void
543 usocklnd_del_peer_and_conns(usock_peer_t *peer)
544 {
545         /* peer cannot disappear because it's still in hash list */
546
547         pthread_mutex_lock(&peer->up_lock);
548         /* content of conn[] array cannot change now */
549         usocklnd_del_conns_locked(peer);
550         pthread_mutex_unlock(&peer->up_lock);
551
552         /* peer hash list is still protected by the caller */
553         cfs_list_del(&peer->up_list);
554
555         usocklnd_peer_decref(peer); /* peer isn't in hash list anymore */
556 }
557
558 void
559 usocklnd_del_conns_locked(usock_peer_t *peer)
560 {
561         int i;
562
563         for (i=0; i < N_CONN_TYPES; i++) {
564                 usock_conn_t *conn = peer->up_conns[i];
565                 if (conn != NULL)
566                         usocklnd_conn_kill(conn);
567         }
568 }