Whamcloud - gitweb
b=13884
[fs/lustre-release.git] / lnet / ulnds / socklnd / usocklnd.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2007 Cluster File Systems, Inc.
5  *   Author: Maxim Patlasov <maxim@clusterfs.com>
6  *
7  *   This file is part of the Lustre file system, http://www.lustre.org
8  *   Lustre is a trademark of Cluster File Systems, Inc.
9  *
10  */
11
12 #include "usocklnd.h"
13 #include <sys/time.h>
14
15 lnd_t the_tcplnd = {
16         .lnd_type      = SOCKLND,
17         .lnd_startup   = usocklnd_startup,
18         .lnd_shutdown  = usocklnd_shutdown,
19         .lnd_send      = usocklnd_send,
20         .lnd_recv      = usocklnd_recv,
21         .lnd_accept    = usocklnd_accept,
22 };
23
24 usock_data_t usock_data;
25 usock_tunables_t usock_tuns = {
26         .ut_timeout         = 50,
27         .ut_poll_timeout    = 1,
28         .ut_fair_limit      = 1,
29         .ut_npollthreads    = 0,
30         .ut_min_bulk        = 1<<10,
31         .ut_txcredits       = 256,
32         .ut_peertxcredits   = 8,
33         .ut_socknagle       = 0,
34         .ut_sockbufsiz      = 0,
35 };
36
37 #define MAX_REASONABLE_TIMEOUT 36000 /* 10 hours */
38 #define MAX_REASONABLE_NPT 1000
39
40 int
41 usocklnd_validate_tunables()
42 {
43         if (usock_tuns.ut_timeout <= 0 ||
44             usock_tuns.ut_timeout > MAX_REASONABLE_TIMEOUT) {
45                 CERROR("USOCK_TIMEOUT: %d is out of reasonable limits\n",
46                        usock_tuns.ut_timeout);
47                 return -1;
48         }
49                 
50         if (usock_tuns.ut_poll_timeout <= 0 ||
51             usock_tuns.ut_poll_timeout > MAX_REASONABLE_TIMEOUT) {
52                 CERROR("USOCK_POLL_TIMEOUT: %d is out of reasonable limits\n",
53                        usock_tuns.ut_poll_timeout);
54                 return -1;
55         }
56
57         if (usock_tuns.ut_fair_limit <= 0) {
58                 CERROR("Invalid USOCK_FAIR_LIMIT: %d (should be >0)\n",
59                        usock_tuns.ut_fair_limit);
60                 return -1;
61         }
62                 
63         if (usock_tuns.ut_npollthreads < 0 ||
64             usock_tuns.ut_npollthreads > MAX_REASONABLE_NPT) {
65                 CERROR("USOCK_NPOLLTHREADS: %d is out of reasonable limits\n",
66                        usock_tuns.ut_npollthreads);
67                 return -1;
68         }
69
70         if (usock_tuns.ut_txcredits <= 0) {
71                 CERROR("USOCK_TXCREDITS: %d should be positive\n",
72                        usock_tuns.ut_txcredits);
73                 return -1;
74         }
75                 
76         if (usock_tuns.ut_peertxcredits <= 0) {
77                 CERROR("USOCK_PEERTXCREDITS: %d should be positive\n",
78                        usock_tuns.ut_peertxcredits);
79                 return -1;
80         }
81
82         if (usock_tuns.ut_peertxcredits > usock_tuns.ut_txcredits) {
83                 CERROR("USOCK_PEERTXCREDITS: %d should not be greater"
84                        " than USOCK_TXCREDITS: %d\n",
85                        usock_tuns.ut_peertxcredits, usock_tuns.ut_txcredits);
86                 return -1;
87         }
88
89         if (usock_tuns.ut_socknagle != 0 &&
90             usock_tuns.ut_socknagle != 1) {
91                 CERROR("USOCK_SOCKNAGLE: %d should be 0 or 1\n",
92                        usock_tuns.ut_socknagle);
93                 return -1;
94         }
95         
96         if (usock_tuns.ut_sockbufsiz < 0) {
97                 CERROR("USOCK_SOCKBUFSIZ: %d should be 0 or positive\n",
98                        usock_tuns.ut_sockbufsiz);
99                 return -1;
100         }
101
102         return 0;
103 }
104
105 void
106 usocklnd_release_poll_states(int n)
107 {
108         int i;
109         
110         for (i = 0; i < n; i++) {
111                 usock_pollthread_t *pt = &usock_data.ud_pollthreads[i];
112                 
113                 close(pt->upt_notifier_fd);
114                 close(pt->upt_pollfd[0].fd);
115
116                 pthread_mutex_destroy(&pt->upt_pollrequests_lock);
117                 cfs_fini_completion(&pt->upt_completion);
118                 
119                 LIBCFS_FREE (pt->upt_pollfd,
120                              sizeof(struct pollfd) * pt->upt_npollfd);
121                 LIBCFS_FREE (pt->upt_idx2conn,
122                               sizeof(usock_conn_t *) * pt->upt_npollfd);
123                 LIBCFS_FREE (pt->upt_fd2idx,
124                               sizeof(int) * pt->upt_nfd2idx);                
125         }
126 }
127
128 int
129 usocklnd_update_tunables()
130 {
131         int rc;
132         
133         rc = cfs_parse_int_tunable(&usock_tuns.ut_timeout,
134                                       "USOCK_TIMEOUT");
135         if (rc)
136                 return rc;
137
138         rc = cfs_parse_int_tunable(&usock_tuns.ut_poll_timeout,
139                                       "USOCK_POLL_TIMEOUT");
140         if (rc)
141                 return rc;
142
143         rc = cfs_parse_int_tunable(&usock_tuns.ut_npollthreads,
144                                       "USOCK_NPOLLTHREADS");
145         if (rc)
146                 return rc;
147
148         rc = cfs_parse_int_tunable(&usock_tuns.ut_fair_limit,
149                                       "USOCK_FAIR_LIMIT");
150         if (rc)
151                 return rc;
152
153         rc = cfs_parse_int_tunable(&usock_tuns.ut_min_bulk,
154                                       "USOCK_MIN_BULK");
155         if (rc)
156                 return rc;
157
158         rc = cfs_parse_int_tunable(&usock_tuns.ut_txcredits,
159                                       "USOCK_TXCREDITS");
160         if (rc)
161                 return rc;
162
163         rc = cfs_parse_int_tunable(&usock_tuns.ut_peertxcredits,
164                                       "USOCK_PEERTXCREDITS");
165         if (rc)
166                 return rc;
167
168         rc = cfs_parse_int_tunable(&usock_tuns.ut_socknagle,
169                                       "USOCK_SOCKNAGLE");
170         if (rc)
171                 return rc;
172
173         rc = cfs_parse_int_tunable(&usock_tuns.ut_sockbufsiz,
174                                       "USOCK_SOCKBUFSIZ");
175         if (rc)
176                 return rc;
177
178         if (usocklnd_validate_tunables())
179                 return -EINVAL;
180         
181         if (usock_tuns.ut_npollthreads == 0) {
182                 usock_tuns.ut_npollthreads = cfs_online_cpus();
183
184                 if (usock_tuns.ut_npollthreads <= 0) {
185                         CERROR("Cannot find out the number of online CPUs\n");
186                         return -EINVAL;
187                 }
188         }
189         
190         return 0;
191 }
192
193
194 int
195 usocklnd_base_startup()
196 {
197         usock_pollthread_t *pt;
198         int                 i;
199         int                 rc;
200         
201         rc = usocklnd_update_tunables();
202         if (rc)
203                 return rc;
204         
205         usock_data.ud_npollthreads = usock_tuns.ut_npollthreads;
206
207         LIBCFS_ALLOC (usock_data.ud_pollthreads,
208                       usock_data.ud_npollthreads *
209                       sizeof(usock_pollthread_t));
210         if (usock_data.ud_pollthreads == NULL)
211                 return -ENOMEM;
212
213         /* Initialize poll thread state structures */
214         for (i = 0; i < usock_data.ud_npollthreads; i++) {
215                 int notifier[2];
216
217                 pt = &usock_data.ud_pollthreads[i];
218
219                 rc = -ENOMEM;
220                 
221                 LIBCFS_ALLOC (pt->upt_pollfd,
222                               sizeof(struct pollfd) * UPT_START_SIZ);
223                 if (pt->upt_pollfd == NULL)
224                         goto base_startup_failed_0;
225                 
226                 LIBCFS_ALLOC (pt->upt_idx2conn,
227                               sizeof(usock_conn_t *) * UPT_START_SIZ);
228                 if (pt->upt_idx2conn == NULL)
229                         goto base_startup_failed_1;
230
231                 LIBCFS_ALLOC (pt->upt_fd2idx,
232                               sizeof(int) * UPT_START_SIZ);
233                 if (pt->upt_fd2idx == NULL)
234                         goto base_startup_failed_2;                
235                 
236                 memset(pt->upt_fd2idx, 0,
237                        sizeof(int) * UPT_START_SIZ);                       
238                 
239                 LIBCFS_ALLOC (pt->upt_skip,
240                               sizeof(int) * UPT_START_SIZ);
241                 if (pt->upt_skip == NULL)
242                         goto base_startup_failed_3;
243
244                 pt->upt_npollfd = pt->upt_nfd2idx = UPT_START_SIZ;
245
246                 rc = libcfs_socketpair(notifier);
247                 if (rc != 0)
248                         goto base_startup_failed_4;
249
250                 pt->upt_notifier_fd = notifier[0];
251
252                 pt->upt_pollfd[0].fd = notifier[1];
253                 pt->upt_pollfd[0].events = POLLIN;
254                 pt->upt_pollfd[0].revents = 0;
255
256                 pt->upt_nfds = 1;
257                 pt->upt_idx2conn[0] = NULL;
258
259                 pt->upt_errno = 0;
260                 CFS_INIT_LIST_HEAD (&pt->upt_pollrequests);
261                 CFS_INIT_LIST_HEAD (&pt->upt_stale_list);
262                 pthread_mutex_init(&pt->upt_pollrequests_lock, NULL);
263                 cfs_init_completion(&pt->upt_completion);
264         }
265
266         /* Initialize peer hash list */        
267         for (i = 0; i < UD_PEER_HASH_SIZE; i++)
268                 CFS_INIT_LIST_HEAD(&usock_data.ud_peers[i]);
269         
270         pthread_rwlock_init(&usock_data.ud_peers_lock, NULL);
271
272         /* Spawn poll threads */
273         for (i = 0; i < usock_data.ud_npollthreads; i++) {
274                 rc = cfs_create_thread(usocklnd_poll_thread,
275                                        &usock_data.ud_pollthreads[i]);
276                 if (rc) {
277                         usocklnd_base_shutdown(i);
278                         return rc;
279                 }
280         }
281         
282         usock_data.ud_state = UD_STATE_INITIALIZED;
283         
284         return 0;
285
286   base_startup_failed_4:
287         LIBCFS_FREE (pt->upt_skip, sizeof(int) * UPT_START_SIZ);
288   base_startup_failed_3:
289         LIBCFS_FREE (pt->upt_fd2idx, sizeof(int) * UPT_START_SIZ);
290   base_startup_failed_2:
291         LIBCFS_FREE (pt->upt_idx2conn, sizeof(usock_conn_t *) * UPT_START_SIZ);
292   base_startup_failed_1:
293         LIBCFS_FREE (pt->upt_pollfd, sizeof(struct pollfd) * UPT_START_SIZ);
294   base_startup_failed_0:
295         LASSERT(rc != 0);
296         usocklnd_release_poll_states(i);
297         LIBCFS_FREE (usock_data.ud_pollthreads,
298                      usock_data.ud_npollthreads *
299                      sizeof(usock_pollthread_t));
300         return rc;
301 }
302
303 void
304 usocklnd_base_shutdown(int n)
305 {
306         int i;
307         
308         usock_data.ud_shutdown = 1;
309         for (i = 0; i < n; i++) {
310                 usock_pollthread_t *pt = &usock_data.ud_pollthreads[i];
311                 usocklnd_wakeup_pollthread(i);
312                 cfs_wait_for_completion(&pt->upt_completion);
313         }
314
315         pthread_rwlock_destroy(&usock_data.ud_peers_lock);
316
317         usocklnd_release_poll_states(usock_data.ud_npollthreads);
318
319         LIBCFS_FREE (usock_data.ud_pollthreads,
320                      usock_data.ud_npollthreads *
321                      sizeof(usock_pollthread_t));
322         
323         usock_data.ud_state = UD_STATE_INIT_NOTHING;
324 }
325
326 __u64
327 usocklnd_new_incarnation()
328 {
329         struct timeval tv;
330         int            rc = gettimeofday(&tv, NULL);
331         LASSERT (rc == 0);
332         return (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
333 }
334
335 static int
336 usocklnd_assign_ni_nid(lnet_ni_t *ni)
337 {
338         int   rc;
339         int   up;
340         __u32 ipaddr;
341         
342         /* Find correct IP-address and update ni_nid with it.
343          * Two cases are supported:
344          * 1) no explicit interfaces are defined. NID will be assigned to
345          * first non-lo interface that is up;
346          * 2) exactly one explicit interface is defined. For example,
347          * LNET_NETWORKS='tcp(eth0)' */     
348
349         if (ni->ni_interfaces[0] == NULL) {
350                 char **names;
351                 int    i, n;
352             
353                 n = libcfs_ipif_enumerate(&names);
354                 if (n <= 0) {
355                         CERROR("Can't enumerate interfaces: %d\n", n);
356                         return -1;
357                 }
358
359                 for (i = 0; i < n; i++) {
360    
361                         if (!strcmp(names[i], "lo")) /* skip the loopback IF */
362                                 continue;
363                     
364                         rc = libcfs_ipif_query(names[i], &up, &ipaddr);
365                         if (rc != 0) {
366                                 CWARN("Can't get interface %s info: %d\n",
367                                       names[i], rc);
368                                 continue;
369                         }
370                     
371                         if (!up) {
372                                 CWARN("Ignoring interface %s (down)\n",
373                                       names[i]);
374                             continue;
375                         }
376                         
377                         break;      /* one address is quite enough */
378                 }
379             
380                 libcfs_ipif_free_enumeration(names, n);
381
382                 if (i >= n) {
383                         CERROR("Can't find any usable interfaces\n");
384                         return -1;
385                 }
386
387                 CDEBUG(D_NET, "No explicit interfaces defined. "
388                        "%u.%u.%u.%u used\n", HIPQUAD(ipaddr));
389         } else {
390                 if (ni->ni_interfaces[1] != NULL) {
391                         CERROR("only one explicit interface is allowed\n");
392                         return -1;
393                 }
394
395                 rc = libcfs_ipif_query(ni->ni_interfaces[0], &up, &ipaddr);
396                 if (rc != 0) {
397                         CERROR("Can't get interface %s info: %d\n",
398                                ni->ni_interfaces[0], rc);
399                         return -1;
400                 }
401
402                 if (!up) {
403                         CERROR("Explicit interface defined: %s but is down\n",
404                                ni->ni_interfaces[0]);
405                         return -1;
406                 }
407                 
408                 CDEBUG(D_NET, "Explicit interface defined: %s. "
409                        "%u.%u.%u.%u used\n",
410                        ni->ni_interfaces[0], HIPQUAD(ipaddr));
411                 
412         }
413         
414         ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ipaddr);
415
416         return 0;
417 }
418
419 int
420 usocklnd_startup(lnet_ni_t *ni)
421 {
422         int          rc;
423         usock_net_t *net;
424
425         if (usock_data.ud_state == UD_STATE_INIT_NOTHING) {
426                 rc = usocklnd_base_startup();
427                 if (rc != 0)
428                         return rc;
429         }
430
431         LIBCFS_ALLOC(net, sizeof(*net));
432         if (net == NULL)
433                 goto startup_failed_0;
434
435         memset(net, 0, sizeof(*net));
436         net->un_incarnation = usocklnd_new_incarnation();
437         pthread_mutex_init(&net->un_lock, NULL);
438         pthread_cond_init(&net->un_cond, NULL);
439
440         ni->ni_data = net;
441
442         if (!(the_lnet.ln_pid & LNET_PID_USERFLAG)) {
443                 rc = usocklnd_assign_ni_nid(ni);
444                 if (rc != 0)
445                         goto startup_failed_1;
446         }
447
448         LASSERT (ni->ni_lnd == &the_tcplnd);
449
450         ni->ni_maxtxcredits = usock_tuns.ut_txcredits;
451         ni->ni_peertxcredits = usock_tuns.ut_peertxcredits;
452     
453         usock_data.ud_nets_count++;
454         return 0;
455
456  startup_failed_1:
457         pthread_mutex_destroy(&net->un_lock);
458         pthread_cond_destroy(&net->un_cond);
459         LIBCFS_FREE(net, sizeof(*net));
460  startup_failed_0:
461         if (usock_data.ud_nets_count == 0)
462                 usocklnd_base_shutdown(usock_data.ud_npollthreads);
463
464         return -ENETDOWN;
465 }
466
467 void
468 usocklnd_shutdown(lnet_ni_t *ni)
469 {
470         usock_net_t *net = ni->ni_data;
471
472         net->un_shutdown = 1;
473
474         usocklnd_del_all_peers(ni);        
475
476         /* Wait for all peer state to clean up */
477         pthread_mutex_lock(&net->un_lock);
478         while (net->un_peercount != 0) 
479                 pthread_cond_wait(&net->un_cond, &net->un_lock);
480         pthread_mutex_unlock(&net->un_lock);
481         
482         /* Release usock_net_t structure */
483         pthread_mutex_destroy(&net->un_lock);
484         pthread_cond_destroy(&net->un_cond);
485         LIBCFS_FREE(net, sizeof(*net));
486
487         usock_data.ud_nets_count--;
488         if (usock_data.ud_nets_count == 0)
489                 usocklnd_base_shutdown(usock_data.ud_npollthreads);
490 }
491
492 void
493 usocklnd_del_all_peers(lnet_ni_t *ni)
494 {
495         struct list_head  *ptmp;
496         struct list_head  *pnxt;
497         usock_peer_t      *peer;
498         int                i;
499
500         pthread_rwlock_wrlock(&usock_data.ud_peers_lock);
501
502         for (i = 0; i < UD_PEER_HASH_SIZE; i++) {
503                 list_for_each_safe (ptmp, pnxt, &usock_data.ud_peers[i]) {
504                         peer = list_entry (ptmp, usock_peer_t, up_list);
505                         
506                         if (peer->up_ni != ni)
507                                 continue;
508
509                         usocklnd_del_peer_and_conns(peer);
510                 }
511         }
512
513         pthread_rwlock_unlock(&usock_data.ud_peers_lock);
514         
515         /* wakeup all threads */
516         for (i = 0; i < usock_data.ud_npollthreads; i++)
517                 usocklnd_wakeup_pollthread(i);
518 }
519
520 void
521 usocklnd_del_peer_and_conns(usock_peer_t *peer)
522 {
523         /* peer cannot disappear because it's still in hash list */
524
525         pthread_mutex_lock(&peer->up_lock);
526         /* content of conn[] array cannot change now */
527         usocklnd_del_conns_locked(peer);
528         pthread_mutex_unlock(&peer->up_lock);
529
530         /* peer hash list is still protected by the caller */
531         list_del(&peer->up_list);
532
533         usocklnd_peer_decref(peer); /* peer isn't in hash list anymore */
534 }
535
536 void
537 usocklnd_del_conns_locked(usock_peer_t *peer)
538 {
539         int i;
540         
541         for (i=0; i < N_CONN_TYPES; i++) {
542                 usock_conn_t *conn = peer->up_conns[i];
543                 if (conn != NULL)
544                         usocklnd_conn_kill(conn);                 
545         }       
546 }