Whamcloud - gitweb
b=16098
[fs/lustre-release.git] / lnet / ulnds / socklnd / usocklnd.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see [sun.com URL with a
20  * copy of GPLv2].
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lnet/ulnds/socklnd/usocklnd.c
37  *
38  * Author: Maxim Patlasov <maxim@clusterfs.com>
39  */
40
41 #include "usocklnd.h"
42 #include <sys/time.h>
43
44 lnd_t the_tcplnd = {
45         .lnd_type      = SOCKLND,
46         .lnd_startup   = usocklnd_startup,
47         .lnd_shutdown  = usocklnd_shutdown,
48         .lnd_send      = usocklnd_send,
49         .lnd_recv      = usocklnd_recv,
50         .lnd_accept    = usocklnd_accept,
51 };
52
53 usock_data_t usock_data;
54 usock_tunables_t usock_tuns = {
55         .ut_timeout         = 50,
56         .ut_poll_timeout    = 1,
57         .ut_fair_limit      = 1,
58         .ut_npollthreads    = 0,
59         .ut_min_bulk        = 1<<10,
60         .ut_txcredits       = 256,
61         .ut_peertxcredits   = 8,
62         .ut_socknagle       = 0,
63         .ut_sockbufsiz      = 0,
64 };
65
66 #define MAX_REASONABLE_TIMEOUT 36000 /* 10 hours */
67 #define MAX_REASONABLE_NPT 1000
68
69 int
70 usocklnd_validate_tunables()
71 {
72         if (usock_tuns.ut_timeout <= 0 ||
73             usock_tuns.ut_timeout > MAX_REASONABLE_TIMEOUT) {
74                 CERROR("USOCK_TIMEOUT: %d is out of reasonable limits\n",
75                        usock_tuns.ut_timeout);
76                 return -1;
77         }
78                 
79         if (usock_tuns.ut_poll_timeout <= 0 ||
80             usock_tuns.ut_poll_timeout > MAX_REASONABLE_TIMEOUT) {
81                 CERROR("USOCK_POLL_TIMEOUT: %d is out of reasonable limits\n",
82                        usock_tuns.ut_poll_timeout);
83                 return -1;
84         }
85
86         if (usock_tuns.ut_fair_limit <= 0) {
87                 CERROR("Invalid USOCK_FAIR_LIMIT: %d (should be >0)\n",
88                        usock_tuns.ut_fair_limit);
89                 return -1;
90         }
91                 
92         if (usock_tuns.ut_npollthreads < 0 ||
93             usock_tuns.ut_npollthreads > MAX_REASONABLE_NPT) {
94                 CERROR("USOCK_NPOLLTHREADS: %d is out of reasonable limits\n",
95                        usock_tuns.ut_npollthreads);
96                 return -1;
97         }
98
99         if (usock_tuns.ut_txcredits <= 0) {
100                 CERROR("USOCK_TXCREDITS: %d should be positive\n",
101                        usock_tuns.ut_txcredits);
102                 return -1;
103         }
104                 
105         if (usock_tuns.ut_peertxcredits <= 0) {
106                 CERROR("USOCK_PEERTXCREDITS: %d should be positive\n",
107                        usock_tuns.ut_peertxcredits);
108                 return -1;
109         }
110
111         if (usock_tuns.ut_peertxcredits > usock_tuns.ut_txcredits) {
112                 CERROR("USOCK_PEERTXCREDITS: %d should not be greater"
113                        " than USOCK_TXCREDITS: %d\n",
114                        usock_tuns.ut_peertxcredits, usock_tuns.ut_txcredits);
115                 return -1;
116         }
117
118         if (usock_tuns.ut_socknagle != 0 &&
119             usock_tuns.ut_socknagle != 1) {
120                 CERROR("USOCK_SOCKNAGLE: %d should be 0 or 1\n",
121                        usock_tuns.ut_socknagle);
122                 return -1;
123         }
124         
125         if (usock_tuns.ut_sockbufsiz < 0) {
126                 CERROR("USOCK_SOCKBUFSIZ: %d should be 0 or positive\n",
127                        usock_tuns.ut_sockbufsiz);
128                 return -1;
129         }
130
131         return 0;
132 }
133
134 void
135 usocklnd_release_poll_states(int n)
136 {
137         int i;
138         
139         for (i = 0; i < n; i++) {
140                 usock_pollthread_t *pt = &usock_data.ud_pollthreads[i];
141                 
142                 close(pt->upt_notifier_fd);
143                 close(pt->upt_pollfd[0].fd);
144
145                 pthread_mutex_destroy(&pt->upt_pollrequests_lock);
146                 cfs_fini_completion(&pt->upt_completion);
147                 
148                 LIBCFS_FREE (pt->upt_pollfd,
149                              sizeof(struct pollfd) * pt->upt_npollfd);
150                 LIBCFS_FREE (pt->upt_idx2conn,
151                               sizeof(usock_conn_t *) * pt->upt_npollfd);
152                 LIBCFS_FREE (pt->upt_fd2idx,
153                               sizeof(int) * pt->upt_nfd2idx);                
154         }
155 }
156
157 int
158 usocklnd_update_tunables()
159 {
160         int rc;
161         
162         rc = cfs_parse_int_tunable(&usock_tuns.ut_timeout,
163                                       "USOCK_TIMEOUT");
164         if (rc)
165                 return rc;
166
167         rc = cfs_parse_int_tunable(&usock_tuns.ut_poll_timeout,
168                                       "USOCK_POLL_TIMEOUT");
169         if (rc)
170                 return rc;
171
172         rc = cfs_parse_int_tunable(&usock_tuns.ut_npollthreads,
173                                       "USOCK_NPOLLTHREADS");
174         if (rc)
175                 return rc;
176
177         rc = cfs_parse_int_tunable(&usock_tuns.ut_fair_limit,
178                                       "USOCK_FAIR_LIMIT");
179         if (rc)
180                 return rc;
181
182         rc = cfs_parse_int_tunable(&usock_tuns.ut_min_bulk,
183                                       "USOCK_MIN_BULK");
184         if (rc)
185                 return rc;
186
187         rc = cfs_parse_int_tunable(&usock_tuns.ut_txcredits,
188                                       "USOCK_TXCREDITS");
189         if (rc)
190                 return rc;
191
192         rc = cfs_parse_int_tunable(&usock_tuns.ut_peertxcredits,
193                                       "USOCK_PEERTXCREDITS");
194         if (rc)
195                 return rc;
196
197         rc = cfs_parse_int_tunable(&usock_tuns.ut_socknagle,
198                                       "USOCK_SOCKNAGLE");
199         if (rc)
200                 return rc;
201
202         rc = cfs_parse_int_tunable(&usock_tuns.ut_sockbufsiz,
203                                       "USOCK_SOCKBUFSIZ");
204         if (rc)
205                 return rc;
206
207         if (usocklnd_validate_tunables())
208                 return -EINVAL;
209         
210         if (usock_tuns.ut_npollthreads == 0) {
211                 usock_tuns.ut_npollthreads = cfs_online_cpus();
212
213                 if (usock_tuns.ut_npollthreads <= 0) {
214                         CERROR("Cannot find out the number of online CPUs\n");
215                         return -EINVAL;
216                 }
217         }
218         
219         return 0;
220 }
221
222
223 int
224 usocklnd_base_startup()
225 {
226         usock_pollthread_t *pt;
227         int                 i;
228         int                 rc;
229         
230         rc = usocklnd_update_tunables();
231         if (rc)
232                 return rc;
233         
234         usock_data.ud_npollthreads = usock_tuns.ut_npollthreads;
235
236         LIBCFS_ALLOC (usock_data.ud_pollthreads,
237                       usock_data.ud_npollthreads *
238                       sizeof(usock_pollthread_t));
239         if (usock_data.ud_pollthreads == NULL)
240                 return -ENOMEM;
241
242         /* Initialize poll thread state structures */
243         for (i = 0; i < usock_data.ud_npollthreads; i++) {
244                 int notifier[2];
245
246                 pt = &usock_data.ud_pollthreads[i];
247
248                 rc = -ENOMEM;
249                 
250                 LIBCFS_ALLOC (pt->upt_pollfd,
251                               sizeof(struct pollfd) * UPT_START_SIZ);
252                 if (pt->upt_pollfd == NULL)
253                         goto base_startup_failed_0;
254                 
255                 LIBCFS_ALLOC (pt->upt_idx2conn,
256                               sizeof(usock_conn_t *) * UPT_START_SIZ);
257                 if (pt->upt_idx2conn == NULL)
258                         goto base_startup_failed_1;
259
260                 LIBCFS_ALLOC (pt->upt_fd2idx,
261                               sizeof(int) * UPT_START_SIZ);
262                 if (pt->upt_fd2idx == NULL)
263                         goto base_startup_failed_2;                
264                 
265                 memset(pt->upt_fd2idx, 0,
266                        sizeof(int) * UPT_START_SIZ);                       
267                 
268                 LIBCFS_ALLOC (pt->upt_skip,
269                               sizeof(int) * UPT_START_SIZ);
270                 if (pt->upt_skip == NULL)
271                         goto base_startup_failed_3;
272
273                 pt->upt_npollfd = pt->upt_nfd2idx = UPT_START_SIZ;
274
275                 rc = libcfs_socketpair(notifier);
276                 if (rc != 0)
277                         goto base_startup_failed_4;
278
279                 pt->upt_notifier_fd = notifier[0];
280
281                 pt->upt_pollfd[0].fd = notifier[1];
282                 pt->upt_pollfd[0].events = POLLIN;
283                 pt->upt_pollfd[0].revents = 0;
284
285                 pt->upt_nfds = 1;
286                 pt->upt_idx2conn[0] = NULL;
287
288                 pt->upt_errno = 0;
289                 CFS_INIT_LIST_HEAD (&pt->upt_pollrequests);
290                 CFS_INIT_LIST_HEAD (&pt->upt_stale_list);
291                 pthread_mutex_init(&pt->upt_pollrequests_lock, NULL);
292                 cfs_init_completion(&pt->upt_completion);
293         }
294
295         /* Initialize peer hash list */        
296         for (i = 0; i < UD_PEER_HASH_SIZE; i++)
297                 CFS_INIT_LIST_HEAD(&usock_data.ud_peers[i]);
298         
299         pthread_rwlock_init(&usock_data.ud_peers_lock, NULL);
300
301         /* Spawn poll threads */
302         for (i = 0; i < usock_data.ud_npollthreads; i++) {
303                 rc = cfs_create_thread(usocklnd_poll_thread,
304                                        &usock_data.ud_pollthreads[i]);
305                 if (rc) {
306                         usocklnd_base_shutdown(i);
307                         return rc;
308                 }
309         }
310         
311         usock_data.ud_state = UD_STATE_INITIALIZED;
312         
313         return 0;
314
315   base_startup_failed_4:
316         LIBCFS_FREE (pt->upt_skip, sizeof(int) * UPT_START_SIZ);
317   base_startup_failed_3:
318         LIBCFS_FREE (pt->upt_fd2idx, sizeof(int) * UPT_START_SIZ);
319   base_startup_failed_2:
320         LIBCFS_FREE (pt->upt_idx2conn, sizeof(usock_conn_t *) * UPT_START_SIZ);
321   base_startup_failed_1:
322         LIBCFS_FREE (pt->upt_pollfd, sizeof(struct pollfd) * UPT_START_SIZ);
323   base_startup_failed_0:
324         LASSERT(rc != 0);
325         usocklnd_release_poll_states(i);
326         LIBCFS_FREE (usock_data.ud_pollthreads,
327                      usock_data.ud_npollthreads *
328                      sizeof(usock_pollthread_t));
329         return rc;
330 }
331
332 void
333 usocklnd_base_shutdown(int n)
334 {
335         int i;
336         
337         usock_data.ud_shutdown = 1;
338         for (i = 0; i < n; i++) {
339                 usock_pollthread_t *pt = &usock_data.ud_pollthreads[i];
340                 usocklnd_wakeup_pollthread(i);
341                 cfs_wait_for_completion(&pt->upt_completion);
342         }
343
344         pthread_rwlock_destroy(&usock_data.ud_peers_lock);
345
346         usocklnd_release_poll_states(usock_data.ud_npollthreads);
347
348         LIBCFS_FREE (usock_data.ud_pollthreads,
349                      usock_data.ud_npollthreads *
350                      sizeof(usock_pollthread_t));
351         
352         usock_data.ud_state = UD_STATE_INIT_NOTHING;
353 }
354
355 __u64
356 usocklnd_new_incarnation()
357 {
358         struct timeval tv;
359         int            rc = gettimeofday(&tv, NULL);
360         LASSERT (rc == 0);
361         return (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
362 }
363
364 static int
365 usocklnd_assign_ni_nid(lnet_ni_t *ni)
366 {
367         int   rc;
368         int   up;
369         __u32 ipaddr;
370         
371         /* Find correct IP-address and update ni_nid with it.
372          * Two cases are supported:
373          * 1) no explicit interfaces are defined. NID will be assigned to
374          * first non-lo interface that is up;
375          * 2) exactly one explicit interface is defined. For example,
376          * LNET_NETWORKS='tcp(eth0)' */     
377
378         if (ni->ni_interfaces[0] == NULL) {
379                 char **names;
380                 int    i, n;
381             
382                 n = libcfs_ipif_enumerate(&names);
383                 if (n <= 0) {
384                         CERROR("Can't enumerate interfaces: %d\n", n);
385                         return -1;
386                 }
387
388                 for (i = 0; i < n; i++) {
389    
390                         if (!strcmp(names[i], "lo")) /* skip the loopback IF */
391                                 continue;
392                     
393                         rc = libcfs_ipif_query(names[i], &up, &ipaddr);
394                         if (rc != 0) {
395                                 CWARN("Can't get interface %s info: %d\n",
396                                       names[i], rc);
397                                 continue;
398                         }
399                     
400                         if (!up) {
401                                 CWARN("Ignoring interface %s (down)\n",
402                                       names[i]);
403                             continue;
404                         }
405                         
406                         break;      /* one address is quite enough */
407                 }
408             
409                 libcfs_ipif_free_enumeration(names, n);
410
411                 if (i >= n) {
412                         CERROR("Can't find any usable interfaces\n");
413                         return -1;
414                 }
415
416                 CDEBUG(D_NET, "No explicit interfaces defined. "
417                        "%u.%u.%u.%u used\n", HIPQUAD(ipaddr));
418         } else {
419                 if (ni->ni_interfaces[1] != NULL) {
420                         CERROR("only one explicit interface is allowed\n");
421                         return -1;
422                 }
423
424                 rc = libcfs_ipif_query(ni->ni_interfaces[0], &up, &ipaddr);
425                 if (rc != 0) {
426                         CERROR("Can't get interface %s info: %d\n",
427                                ni->ni_interfaces[0], rc);
428                         return -1;
429                 }
430
431                 if (!up) {
432                         CERROR("Explicit interface defined: %s but is down\n",
433                                ni->ni_interfaces[0]);
434                         return -1;
435                 }
436                 
437                 CDEBUG(D_NET, "Explicit interface defined: %s. "
438                        "%u.%u.%u.%u used\n",
439                        ni->ni_interfaces[0], HIPQUAD(ipaddr));
440                 
441         }
442         
443         ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ipaddr);
444
445         return 0;
446 }
447
448 int
449 usocklnd_startup(lnet_ni_t *ni)
450 {
451         int          rc;
452         usock_net_t *net;
453
454         if (usock_data.ud_state == UD_STATE_INIT_NOTHING) {
455                 rc = usocklnd_base_startup();
456                 if (rc != 0)
457                         return rc;
458         }
459
460         LIBCFS_ALLOC(net, sizeof(*net));
461         if (net == NULL)
462                 goto startup_failed_0;
463
464         memset(net, 0, sizeof(*net));
465         net->un_incarnation = usocklnd_new_incarnation();
466         pthread_mutex_init(&net->un_lock, NULL);
467         pthread_cond_init(&net->un_cond, NULL);
468
469         ni->ni_data = net;
470
471         if (!(the_lnet.ln_pid & LNET_PID_USERFLAG)) {
472                 rc = usocklnd_assign_ni_nid(ni);
473                 if (rc != 0)
474                         goto startup_failed_1;
475         }
476
477         LASSERT (ni->ni_lnd == &the_tcplnd);
478
479         ni->ni_maxtxcredits = usock_tuns.ut_txcredits;
480         ni->ni_peertxcredits = usock_tuns.ut_peertxcredits;
481     
482         usock_data.ud_nets_count++;
483         return 0;
484
485  startup_failed_1:
486         pthread_mutex_destroy(&net->un_lock);
487         pthread_cond_destroy(&net->un_cond);
488         LIBCFS_FREE(net, sizeof(*net));
489  startup_failed_0:
490         if (usock_data.ud_nets_count == 0)
491                 usocklnd_base_shutdown(usock_data.ud_npollthreads);
492
493         return -ENETDOWN;
494 }
495
496 void
497 usocklnd_shutdown(lnet_ni_t *ni)
498 {
499         usock_net_t *net = ni->ni_data;
500
501         net->un_shutdown = 1;
502
503         usocklnd_del_all_peers(ni);        
504
505         /* Wait for all peer state to clean up */
506         pthread_mutex_lock(&net->un_lock);
507         while (net->un_peercount != 0) 
508                 pthread_cond_wait(&net->un_cond, &net->un_lock);
509         pthread_mutex_unlock(&net->un_lock);
510         
511         /* Release usock_net_t structure */
512         pthread_mutex_destroy(&net->un_lock);
513         pthread_cond_destroy(&net->un_cond);
514         LIBCFS_FREE(net, sizeof(*net));
515
516         usock_data.ud_nets_count--;
517         if (usock_data.ud_nets_count == 0)
518                 usocklnd_base_shutdown(usock_data.ud_npollthreads);
519 }
520
521 void
522 usocklnd_del_all_peers(lnet_ni_t *ni)
523 {
524         struct list_head  *ptmp;
525         struct list_head  *pnxt;
526         usock_peer_t      *peer;
527         int                i;
528
529         pthread_rwlock_wrlock(&usock_data.ud_peers_lock);
530
531         for (i = 0; i < UD_PEER_HASH_SIZE; i++) {
532                 list_for_each_safe (ptmp, pnxt, &usock_data.ud_peers[i]) {
533                         peer = list_entry (ptmp, usock_peer_t, up_list);
534                         
535                         if (peer->up_ni != ni)
536                                 continue;
537
538                         usocklnd_del_peer_and_conns(peer);
539                 }
540         }
541
542         pthread_rwlock_unlock(&usock_data.ud_peers_lock);
543         
544         /* wakeup all threads */
545         for (i = 0; i < usock_data.ud_npollthreads; i++)
546                 usocklnd_wakeup_pollthread(i);
547 }
548
549 void
550 usocklnd_del_peer_and_conns(usock_peer_t *peer)
551 {
552         /* peer cannot disappear because it's still in hash list */
553
554         pthread_mutex_lock(&peer->up_lock);
555         /* content of conn[] array cannot change now */
556         usocklnd_del_conns_locked(peer);
557         pthread_mutex_unlock(&peer->up_lock);
558
559         /* peer hash list is still protected by the caller */
560         list_del(&peer->up_list);
561
562         usocklnd_peer_decref(peer); /* peer isn't in hash list anymore */
563 }
564
565 void
566 usocklnd_del_conns_locked(usock_peer_t *peer)
567 {
568         int i;
569         
570         for (i=0; i < N_CONN_TYPES; i++) {
571                 usock_conn_t *conn = peer->up_conns[i];
572                 if (conn != NULL)
573                         usocklnd_conn_kill(conn);                 
574         }       
575 }