/* Whamcloud gitweb: land b1_4_bug5025 on b1_4
 * [fs/lustre-release.git] / lustre / ptlrpc / service.c */
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  Copyright (C) 2002 Cluster File Systems, Inc.
5  *
6  *   This file is part of Lustre, http://www.lustre.org.
7  *
8  *   Lustre is free software; you can redistribute it and/or
9  *   modify it under the terms of version 2 of the GNU General Public
10  *   License as published by the Free Software Foundation.
11  *
12  *   Lustre is distributed in the hope that it will be useful,
13  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *   GNU General Public License for more details.
16  *
17  *   You should have received a copy of the GNU General Public License
18  *   along with Lustre; if not, write to the Free Software
19  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20  *
21  */
22
23 #define DEBUG_SUBSYSTEM S_RPC
24 #ifndef __KERNEL__
25 #include <liblustre.h>
26 #include <linux/kp30.h>
27 #endif
28 #include <linux/obd_support.h>
29 #include <linux/obd_class.h>
30 #include <linux/lustre_net.h>
31 #include <portals/types.h>
32 #include "ptlrpc_internal.h"
33
/* forward ref */
static int ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc);

/* List of every service instance, guarded by ptlrpc_all_services_lock;
 * liblustre walks it unlocked (single-threaded -- see
 * liblustre_check_services below). */
static LIST_HEAD (ptlrpc_all_services);
static spinlock_t ptlrpc_all_services_lock = SPIN_LOCK_UNLOCKED;
39
40 static void
41 ptlrpc_free_server_req (struct ptlrpc_request *req)
42 {
43         /* The last request to be received into a request buffer uses space
44          * in the request buffer descriptor, otherwise requests are
45          * allocated dynamically in the incoming reply event handler */
46         if (req == &req->rq_rqbd->rqbd_req)
47                 return;
48
49         OBD_FREE(req, sizeof(*req));
50 }
51
52 static char *
53 ptlrpc_alloc_request_buffer (int size)
54 {
55         char *ptr;
56
57         if (size > SVC_BUF_VMALLOC_THRESHOLD)
58                 OBD_VMALLOC(ptr, size);
59         else
60                 OBD_ALLOC(ptr, size);
61
62         return (ptr);
63 }
64
65 static void
66 ptlrpc_free_request_buffer (char *ptr, int size)
67 {
68         if (size > SVC_BUF_VMALLOC_THRESHOLD)
69                 OBD_VFREE(ptr, size);
70         else
71                 OBD_FREE(ptr, size);
72 }
73
/* Allocate a request buffer descriptor plus its receive buffer for
 * @srv_ni and queue it on the service's idle list, ready for posting.
 * Returns the new rqbd, or NULL if either allocation fails. */
struct ptlrpc_request_buffer_desc *
ptlrpc_alloc_rqbd (struct ptlrpc_srv_ni *srv_ni)
{
        struct ptlrpc_service             *svc = srv_ni->sni_service;
        unsigned long                      flags;
        struct ptlrpc_request_buffer_desc *rqbd;

        OBD_ALLOC(rqbd, sizeof (*rqbd));
        if (rqbd == NULL)
                return (NULL);

        /* the request-in event callback receives the rqbd itself as arg */
        rqbd->rqbd_srv_ni = srv_ni;
        rqbd->rqbd_refcount = 0;
        rqbd->rqbd_cbid.cbid_fn = request_in_callback;
        rqbd->rqbd_cbid.cbid_arg = rqbd;
        rqbd->rqbd_buffer = ptlrpc_alloc_request_buffer(svc->srv_buf_size);

        if (rqbd->rqbd_buffer == NULL) {
                OBD_FREE(rqbd, sizeof (*rqbd));
                return (NULL);
        }

        /* srv_lock protects the idle list and the buffer count */
        spin_lock_irqsave (&svc->srv_lock, flags);
        list_add(&rqbd->rqbd_list, &svc->srv_idle_rqbds);
        svc->srv_nbufs++;
        spin_unlock_irqrestore (&svc->srv_lock, flags);

        return (rqbd);
}
103
/* Tear down a request buffer descriptor: unhook it from the service's
 * lists, free its receive buffer and then the descriptor itself.  The
 * rqbd must be idle (refcount == 0, i.e. not posted and no requests
 * outstanding in it). */
void
ptlrpc_free_rqbd (struct ptlrpc_request_buffer_desc *rqbd)
{
        struct ptlrpc_srv_ni  *sni = rqbd->rqbd_srv_ni;
        struct ptlrpc_service *svc = sni->sni_service;
        unsigned long          flags;

        LASSERT (rqbd->rqbd_refcount == 0);

        spin_lock_irqsave(&svc->srv_lock, flags);
        list_del(&rqbd->rqbd_list);
        svc->srv_nbufs--;
        spin_unlock_irqrestore(&svc->srv_lock, flags);

        ptlrpc_free_request_buffer (rqbd->rqbd_buffer, svc->srv_buf_size);
        OBD_FREE (rqbd, sizeof (*rqbd));
}
121
/* Allocate and post one group (srv_nbuf_per_group) of fresh request
 * buffers on @srv_ni.  Returns 0 on success, -ENOMEM if a buffer can't
 * be allocated, or -EAGAIN if posting failed (the caller retries
 * later; the allocated buffers stay on the idle list). */
int
ptlrpc_grow_req_bufs(struct ptlrpc_srv_ni *srv_ni)
{
        struct ptlrpc_service             *svc = srv_ni->sni_service;
        struct ptlrpc_request_buffer_desc *rqbd;
        int                                i;

        CDEBUG(D_RPCTRACE, "%s: allocate %d new %d-byte reqbufs (%d/%d left)\n",
               svc->srv_name, svc->srv_nbuf_per_group, svc->srv_buf_size,
               srv_ni->sni_nrqbd_receiving, svc->srv_nbufs);
        for (i = 0; i < svc->srv_nbuf_per_group; i++) {
                rqbd = ptlrpc_alloc_rqbd(srv_ni);

                if (rqbd == NULL) {
                        CERROR ("%s/%s: Can't allocate request buffer\n",
                                svc->srv_name, srv_ni->sni_ni->pni_name);
                        return (-ENOMEM);
                }

                /* post each buffer as soon as it is allocated */
                if (ptlrpc_server_post_idle_rqbds(svc) < 0)
                        return (-EAGAIN);
        }

        return (0);
}
147
148 void
149 ptlrpc_save_lock (struct ptlrpc_request *req, 
150                   struct lustre_handle *lock, int mode)
151 {
152         struct ptlrpc_reply_state *rs = req->rq_reply_state;
153         int                        idx;
154
155         LASSERT (rs != NULL);
156         LASSERT (rs->rs_nlocks < RS_MAX_LOCKS);
157
158         idx = rs->rs_nlocks++;
159         rs->rs_locks[idx] = *lock;
160         rs->rs_modes[idx] = mode;
161         rs->rs_difficult = 1;
162 }
163
/* Queue a 'difficult' reply for a service thread to attend to and wake
 * one up.  Caller must hold svc->srv_lock.  A no-op if the reply is
 * already scheduled (or being set up). */
void
ptlrpc_schedule_difficult_reply (struct ptlrpc_reply_state *rs)
{
        struct ptlrpc_service *svc = rs->rs_srv_ni->sni_service;

#ifdef CONFIG_SMP
        LASSERT (spin_is_locked (&svc->srv_lock));
#endif
        LASSERT (rs->rs_difficult);
        rs->rs_scheduled_ever = 1;              /* flag any notification attempt */

        if (rs->rs_scheduled)                   /* being set up or already notified */
                return;

        rs->rs_scheduled = 1;
        /* move off whatever list it is on, onto the service reply queue */
        list_del (&rs->rs_list);
        list_add (&rs->rs_list, &svc->srv_reply_queue);
        wake_up (&svc->srv_waitq);
}
183
/* Scan @obd's uncommitted replies for any whose transno is now
 * committed and schedule them on their services for completion. */
void
ptlrpc_commit_replies (struct obd_device *obd)
{
        struct list_head   *tmp;
        struct list_head   *nxt;
        unsigned long       flags;

        /* Find any replies that have been committed and get their service
         * to attend to complete them. */

        /* CAVEAT EMPTOR: spinlock ordering!!! */
        spin_lock_irqsave (&obd->obd_uncommitted_replies_lock, flags);

        list_for_each_safe (tmp, nxt, &obd->obd_uncommitted_replies) {
                struct ptlrpc_reply_state *rs =
                        list_entry (tmp, struct ptlrpc_reply_state, rs_obd_list);

                LASSERT (rs->rs_difficult);

                if (rs->rs_transno <= obd->obd_last_committed) {
                        struct ptlrpc_service *svc = rs->rs_srv_ni->sni_service;

                        /* srv_lock nests inside obd_uncommitted_replies_lock */
                        spin_lock (&svc->srv_lock);
                        list_del_init (&rs->rs_obd_list);
                        ptlrpc_schedule_difficult_reply (rs);
                        spin_unlock (&svc->srv_lock);
                }
        }

        spin_unlock_irqrestore (&obd->obd_uncommitted_replies_lock, flags);
}
215
/* Return (large - small) in microseconds. */
static long
timeval_sub(struct timeval *large, struct timeval *small)
{
        long dsec  = large->tv_sec  - small->tv_sec;
        long dusec = large->tv_usec - small->tv_usec;

        return dsec * 1000000 + dusec;
}
222
/* Post every rqbd on the service's idle list for receive.  Returns 1
 * if at least one buffer was posted, 0 if the idle list was empty, or
 * -1 if a post failed -- in which case the failed rqbd is returned to
 * the idle list and its optimistic accounting is rolled back. */
static int
ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc)
{
        struct ptlrpc_srv_ni              *srv_ni;
        struct ptlrpc_request_buffer_desc *rqbd;
        unsigned long                      flags;
        int                                rc;
        int                                posted = 0;

        for (;;) {
                spin_lock_irqsave(&svc->srv_lock, flags);

                if (list_empty (&svc->srv_idle_rqbds)) {
                        spin_unlock_irqrestore(&svc->srv_lock, flags);
                        return (posted);
                }

                rqbd = list_entry(svc->srv_idle_rqbds.next,
                                  struct ptlrpc_request_buffer_desc,
                                  rqbd_list);
                list_del (&rqbd->rqbd_list);

                /* assume we will post successfully */
                srv_ni = rqbd->rqbd_srv_ni;
                srv_ni->sni_nrqbd_receiving++;
                list_add (&rqbd->rqbd_list, &srv_ni->sni_active_rqbds);

                spin_unlock_irqrestore(&svc->srv_lock, flags);

                /* register with the network outside the lock */
                rc = ptlrpc_register_rqbd(rqbd);
                if (rc != 0)
                        break;

                posted = 1;
        }

        /* failure path: undo the optimistic accounting for this rqbd */
        spin_lock_irqsave(&svc->srv_lock, flags);

        srv_ni->sni_nrqbd_receiving--;
        list_del(&rqbd->rqbd_list);
        list_add_tail(&rqbd->rqbd_list, &svc->srv_idle_rqbds);

        if (srv_ni->sni_nrqbd_receiving == 0) {
                /* This service is off-air on this interface because all
                 * its request buffers are busy.  Portals will have started
                 * dropping incoming requests until more buffers get
                 * posted */
                CERROR("All %s %s request buffers busy\n",
                       svc->srv_name, srv_ni->sni_ni->pni_name);
        }

        spin_unlock_irqrestore (&svc->srv_lock, flags);

        return (-1);
}
278
/* Create and start a service: allocate the service plus per-interface
 * state for every ptlrpc interface, register it on the global service
 * list, post the initial request buffers on each interface, and hook
 * up /proc stats if @proc_entry is given.
 *
 * @nbufs     request buffers allocated per group (per interface)
 * @bufsize   size of each request buffer (must hold @max_req_size)
 * @handler   callback invoked by service threads for each request
 * Returns the new service, or NULL on failure (partially-constructed
 * state is torn down via ptlrpc_unregister_service). */
struct ptlrpc_service *
ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size,
                int req_portal, int rep_portal, int watchdog_timeout,
                svc_handler_t handler, char *name,
                struct proc_dir_entry *proc_entry)
{
        int                                i;
        int                                rc;
        int                                ssize;
        struct ptlrpc_service             *service;
        struct ptlrpc_srv_ni              *srv_ni;
        ENTRY;

        LASSERT (ptlrpc_ninterfaces > 0);
        LASSERT (nbufs > 0);
        LASSERT (bufsize >= max_req_size);

        /* service struct has a variable-length interface array at the end */
        ssize = offsetof (struct ptlrpc_service,
                          srv_interfaces[ptlrpc_ninterfaces]);
        OBD_ALLOC(service, ssize);
        if (service == NULL)
                RETURN(NULL);

        service->srv_name = name;
        spin_lock_init(&service->srv_lock);
        INIT_LIST_HEAD(&service->srv_threads);
        init_waitqueue_head(&service->srv_waitq);

        service->srv_nbuf_per_group = nbufs;
        service->srv_max_req_size = max_req_size;
        service->srv_buf_size = bufsize;
        service->srv_rep_portal = rep_portal;
        service->srv_req_portal = req_portal;
        service->srv_watchdog_timeout = watchdog_timeout;
        service->srv_handler = handler;

        INIT_LIST_HEAD(&service->srv_request_queue);
        INIT_LIST_HEAD(&service->srv_idle_rqbds);
        INIT_LIST_HEAD(&service->srv_reply_queue);

        /* First initialise enough for early teardown */
        for (i = 0; i < ptlrpc_ninterfaces; i++) {
                srv_ni = &service->srv_interfaces[i];

                srv_ni->sni_service = service;
                srv_ni->sni_ni = &ptlrpc_interfaces[i];
                INIT_LIST_HEAD(&srv_ni->sni_active_rqbds);
                INIT_LIST_HEAD(&srv_ni->sni_active_replies);
        }

        spin_lock (&ptlrpc_all_services_lock);
        list_add (&service->srv_list, &ptlrpc_all_services);
        spin_unlock (&ptlrpc_all_services_lock);

        /* Now allocate the request buffers, assuming all interfaces require
         * the same number. */
        for (i = 0; i < ptlrpc_ninterfaces; i++) {
                srv_ni = &service->srv_interfaces[i];
                CDEBUG (D_NET, "%s: initialising interface %s\n", name,
                        srv_ni->sni_ni->pni_name);

                rc = ptlrpc_grow_req_bufs(srv_ni);
                /* We shouldn't be under memory pressure at startup, so
                 * fail if we can't post all our buffers at this time. */
                if (rc != 0)
                        GOTO(failed, NULL);
        }

        if (proc_entry != NULL)
                ptlrpc_lprocfs_register_service(proc_entry, service);

        CDEBUG(D_NET, "%s: Started on %d interfaces, listening on portal %d\n",
               service->srv_name, ptlrpc_ninterfaces, service->srv_req_portal);

        RETURN(service);
failed:
        ptlrpc_unregister_service(service);
        return NULL;
}
358
/* Drop the server's bookkeeping for a completed request and free it.
 * When the last request referencing a request buffer finishes, the
 * buffer returns to the idle list so it can be reposted. */
static void
ptlrpc_server_free_request(struct ptlrpc_service *svc, struct ptlrpc_request *req)
{
        unsigned long  flags;
        int            refcount;

        spin_lock_irqsave(&svc->srv_lock, flags);
        svc->srv_n_active_reqs--;
        refcount = --(req->rq_rqbd->rqbd_refcount);
        if (refcount == 0) {
                /* request buffer is now idle */
                list_del(&req->rq_rqbd->rqbd_list);
                list_add_tail(&req->rq_rqbd->rqbd_list,
                              &svc->srv_idle_rqbds);
        }
        spin_unlock_irqrestore(&svc->srv_lock, flags);

        ptlrpc_free_server_req(req);
}
378
/* Dequeue and process one incoming request: unpack and sanity-check
 * it, account queue latency, dispatch it to the service handler and
 * finally release it.  Returns 1 if a request was handled, 0 if the
 * queue was empty or this thread must stay free for difficult
 * replies. */
static int
ptlrpc_server_handle_request (struct ptlrpc_service *svc)
{
        struct ptlrpc_request *request;
        unsigned long          flags;
        struct timeval         work_start;
        struct timeval         work_end;
        long                   timediff;
        int                    rc;
        ENTRY;

        spin_lock_irqsave (&svc->srv_lock, flags);
        if (list_empty (&svc->srv_request_queue) ||
            (svc->srv_n_difficult_replies != 0 &&
             svc->srv_n_active_reqs >= (svc->srv_nthreads - 1))) {
                /* If all the other threads are handling requests, I must
                 * remain free to handle any 'difficult' reply that might
                 * block them */
                spin_unlock_irqrestore (&svc->srv_lock, flags);
                RETURN(0);
        }

        request = list_entry (svc->srv_request_queue.next,
                              struct ptlrpc_request, rq_list);
        list_del_init (&request->rq_list);
        svc->srv_n_queued_reqs--;
        svc->srv_n_active_reqs++;

        spin_unlock_irqrestore (&svc->srv_lock, flags);

        /* how long did this request sit in the queue? */
        do_gettimeofday(&work_start);
        timediff = timeval_sub(&work_start, &request->rq_arrival_time);
        if (svc->srv_stats != NULL) {
                lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR,
                                    timediff);
                lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR,
                                    svc->srv_n_queued_reqs);
                lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR,
                                    svc->srv_n_active_reqs);
        }

#if SWAB_PARANOIA
        /* Clear request swab mask; this is a new request */
        request->rq_req_swab_mask = 0;
#endif
        rc = lustre_unpack_msg (request->rq_reqmsg, request->rq_reqlen);
        if (rc != 0) {
                CERROR ("error unpacking request: ptl %d from %s"
                        " xid "LPU64"\n", svc->srv_req_portal,
                        request->rq_peerstr, request->rq_xid);
                /* NOTE(review): the 'out' path below still reads
                 * rq_reqmsg->opc even though unpacking failed -- confirm
                 * this is safe */
                goto out;
        }

        rc = -EINVAL;
        if (request->rq_reqmsg->type != PTL_RPC_MSG_REQUEST) {
                CERROR("wrong packet type received (type=%u) from %s\n",
                       request->rq_reqmsg->type, request->rq_peerstr);
                goto out;
        }

        CDEBUG(D_NET, "got req "LPD64"\n", request->rq_xid);

        /* Discard requests queued for longer than my timeout.  If the
         * client's timeout is similar to mine, she'll be timing out this
         * REQ anyway (bug 1502) */
        if (timediff / 1000000 > (long)obd_timeout) {
                CERROR("Dropping timed-out opc %d request from %s"
                       ": %ld seconds old\n", request->rq_reqmsg->opc,
                       request->rq_peerstr,
                       timediff / 1000000);
                goto out;
        }

        request->rq_export = class_conn2export(&request->rq_reqmsg->handle);

        if (request->rq_export) {
                /* drop requests from a connection generation older than
                 * the export's current one (client has reconnected) */
                if (request->rq_reqmsg->conn_cnt <
                    request->rq_export->exp_conn_cnt) {
                        DEBUG_REQ(D_ERROR, request,
                                  "DROPPING req from old connection %d < %d",
                                  request->rq_reqmsg->conn_cnt,
                                  request->rq_export->exp_conn_cnt);
                        goto put_conn;
                }

                request->rq_export->exp_last_request_time = CURRENT_SECONDS;
        }

        CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:ni:nid:opc "
               "%s:%s+%d:%d:"LPU64":%s:%s:%d\n", current->comm,
               (request->rq_export ?
                (char *)request->rq_export->exp_client_uuid.uuid : "0"),
               (request->rq_export ?
                atomic_read(&request->rq_export->exp_refcount) : -99),
               request->rq_reqmsg->status, request->rq_xid,
               request->rq_peer.peer_ni->pni_name,
               request->rq_peerstr,
               request->rq_reqmsg->opc);

        /* dispatch to the service-specific handler */
        rc = svc->srv_handler(request);

        CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:ni:nid:opc "
               "%s:%s+%d:%d:"LPU64":%s:%s:%d\n", current->comm,
               (request->rq_export ?
                (char *)request->rq_export->exp_client_uuid.uuid : "0"),
               (request->rq_export ?
                atomic_read(&request->rq_export->exp_refcount) : -99),
               request->rq_reqmsg->status, request->rq_xid,
               request->rq_peer.peer_ni->pni_name,
               request->rq_peerstr,
               request->rq_reqmsg->opc);

put_conn:
        if (request->rq_export != NULL)
                class_export_put(request->rq_export);

 out:
        do_gettimeofday(&work_end);

        timediff = timeval_sub(&work_end, &work_start);

        /* log at error level if processing took longer than obd_timeout */
        CDEBUG((timediff / 1000000 > (long)obd_timeout) ? D_ERROR : D_HA,
               "request "LPU64" opc %u from %s processed in %ldus "
               "(%ldus total)\n", request->rq_xid, request->rq_reqmsg->opc,
               request->rq_peerstr,
               timediff, timeval_sub(&work_end, &request->rq_arrival_time));

        if (svc->srv_stats != NULL) {
                int opc = opcode_offset(request->rq_reqmsg->opc);
                if (opc > 0) {
                        LASSERT(opc < LUSTRE_MAX_OPCODES);
                        lprocfs_counter_add(svc->srv_stats,
                                            opc + PTLRPC_LAST_CNTR,
                                            timediff);
                }
        }

        ptlrpc_server_free_request(svc, request);

        RETURN(1);
}
520
/* Take one 'difficult' reply off the service's reply queue and attend
 * to it: detach it from commit/export notifier lists, release any
 * saved ack locks, and free it once it is off the net.  Returns 1 if a
 * reply was handled, 0 if the queue was empty. */
static int
ptlrpc_server_handle_reply (struct ptlrpc_service *svc)
{
        struct ptlrpc_reply_state *rs;
        unsigned long              flags;
        struct obd_export         *exp;
        struct obd_device         *obd;
        int                        nlocks;
        int                        been_handled;
        char                       str[PTL_NALFMT_SIZE];
        ENTRY;

        spin_lock_irqsave (&svc->srv_lock, flags);
        if (list_empty (&svc->srv_reply_queue)) {
                spin_unlock_irqrestore (&svc->srv_lock, flags);
                RETURN(0);
        }

        rs = list_entry (svc->srv_reply_queue.next,
                         struct ptlrpc_reply_state, rs_list);

        exp = rs->rs_export;
        obd = exp->exp_obd;

        LASSERT (rs->rs_difficult);
        LASSERT (rs->rs_scheduled);

        list_del_init (&rs->rs_list);

        /* Disengage from notifiers carefully (lock ordering!) */
        /* NB interrupts stay disabled across these plain lock/unlock
         * pairs; flags is only restored on the exit paths below */
        spin_unlock(&svc->srv_lock);

        spin_lock (&obd->obd_uncommitted_replies_lock);
        /* Noop if removed already */
        list_del_init (&rs->rs_obd_list);
        spin_unlock (&obd->obd_uncommitted_replies_lock);

        spin_lock (&exp->exp_lock);
        /* Noop if removed already */
        list_del_init (&rs->rs_exp_list);
        spin_unlock (&exp->exp_lock);

        spin_lock(&svc->srv_lock);

        been_handled = rs->rs_handled;
        rs->rs_handled = 1;

        nlocks = rs->rs_nlocks;                 /* atomic "steal", but */
        rs->rs_nlocks = 0;                      /* locks still on rs_locks! */

        if (nlocks == 0 && !been_handled) {
                /* If we see this, we should already have seen the warning
                 * in mds_steal_ack_locks()  */
                CWARN("All locks stolen from rs %p x"LPD64".t"LPD64
                      " o%d NID %s\n",
                      rs,
                      rs->rs_xid, rs->rs_transno,
                      rs->rs_msg.opc,
                      ptlrpc_peernid2str(&exp->exp_connection->c_peer, str));
        }

        if ((!been_handled && rs->rs_on_net) ||
            nlocks > 0) {
                /* drop the lock to unlink the MD / decref locks */
                spin_unlock_irqrestore(&svc->srv_lock, flags);

                if (!been_handled && rs->rs_on_net) {
                        PtlMDUnlink(rs->rs_md_h);
                        /* Ignore return code; we're racing with
                         * completion... */
                }

                while (nlocks-- > 0)
                        ldlm_lock_decref(&rs->rs_locks[nlocks],
                                         rs->rs_modes[nlocks]);

                spin_lock_irqsave(&svc->srv_lock, flags);
        }

        rs->rs_scheduled = 0;

        if (!rs->rs_on_net) {
                /* Off the net */
                svc->srv_n_difficult_replies--;
                spin_unlock_irqrestore(&svc->srv_lock, flags);

                class_export_put (exp);
                rs->rs_export = NULL;
                lustre_free_reply_state (rs);
                atomic_dec (&svc->srv_outstanding_replies);
                RETURN(1);
        }

        /* still on the net; callback will schedule */
        spin_unlock_irqrestore (&svc->srv_lock, flags);
        RETURN(1);
}
617
618 #ifndef __KERNEL__
/* FIXME make use of timeout later */
/* Userspace (liblustre) service poll: walk every registered service
 * and drain its reply queue, request queue and idle rqbd list until
 * nothing makes progress.  Returns non-zero if any work was done. */
int
liblustre_check_services (void *arg)
{
        int  did_something = 0;
        int  rc;
        struct list_head *tmp, *nxt;
        ENTRY;

        /* I'm relying on being single threaded, not to have to lock
         * ptlrpc_all_services etc */
        list_for_each_safe (tmp, nxt, &ptlrpc_all_services) {
                struct ptlrpc_service *svc =
                        list_entry (tmp, struct ptlrpc_service, srv_list);

                if (svc->srv_nthreads != 0)     /* I've recursed */
                        continue;

                /* service threads can block for bulk, so this limits us
                 * (arbitrarily) to recursing 1 stack frame per service.
                 * Note that the problem with recursion is that we have to
                 * unwind completely before our caller can resume. */

                svc->srv_nthreads++;

                /* loop until a full pass makes no progress */
                do {
                        rc = ptlrpc_server_handle_reply(svc);
                        rc |= ptlrpc_server_handle_request(svc);
                        rc |= (ptlrpc_server_post_idle_rqbds(svc) > 0);
                        did_something |= rc;
                } while (rc);

                svc->srv_nthreads--;
        }

        RETURN(did_something);
}
656
657 #else /* __KERNEL__ */
658
/* Don't use daemonize, it removes fs struct from new thread (bug 418) */
/* Manually detach the calling task as a kernel daemon: release its
 * user address space and open files and reparent it to init, while
 * keeping its fs struct (which daemonize() would drop).
 * NOTE(review): call order appears deliberate (helper runs between
 * exit_mm and exit_files) -- preserve it. */
void ptlrpc_daemonize(void)
{
        exit_mm(current);
        lustre_daemonize_helper();
        exit_files(current);
        reparent_to_init();
}
667
668 static void
669 ptlrpc_check_rqbd_pools(struct ptlrpc_service *svc)
670 {
671         struct ptlrpc_srv_ni  *sni;
672         int                    i;
673         int                    avail = 0;
674         int                    low_water = svc->srv_nbuf_per_group/2;
675
676         for (i = 0; i < ptlrpc_ninterfaces; i++) {
677                 sni = &svc->srv_interfaces[i];
678
679                 avail += sni->sni_nrqbd_receiving;
680                 /* NB I'm not locking; just looking. */
681                 if (sni->sni_nrqbd_receiving <= low_water)
682                         ptlrpc_grow_req_bufs(sni);
683         }
684
685         lprocfs_counter_add(svc->srv_stats, PTLRPC_REQBUF_AVAIL_CNTR, avail);
686 }
687
688 static int
689 ptlrpc_retry_rqbds(void *arg)
690 {
691         struct ptlrpc_service *svc = (struct ptlrpc_service *)arg;
692         
693         svc->srv_rqbd_timeout = 0;
694         return (-ETIMEDOUT);
695 }
696
/* Body of a ptlrpc service thread: daemonize, block signals, then loop
 * handling replies, requests and buffer reposting until asked to stop
 * (and no difficult replies remain).  Returns 0 on normal exit,
 * -ENOMEM if group setup fails. */
static int ptlrpc_main(void *arg)
{
        struct ptlrpc_svc_data *data = (struct ptlrpc_svc_data *)arg;
        struct ptlrpc_service  *svc = data->svc;
        struct ptlrpc_thread   *thread = data->thread;
        struct lc_watchdog     *watchdog;
        unsigned long           flags;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
        struct group_info *ginfo = NULL;
#endif
        ENTRY;

        lock_kernel();
        ptlrpc_daemonize();

        /* block all signals; the thread is controlled via t_flags */
        SIGNAL_MASK_LOCK(current, flags);
        sigfillset(&current->blocked);
        RECALC_SIGPENDING;
        SIGNAL_MASK_UNLOCK(current, flags);

        LASSERTF(strlen(data->name) < sizeof(current->comm),
                 "name %d > len %d\n",
                 (int)strlen(data->name), (int)sizeof(current->comm));
        THREAD_NAME(current->comm, sizeof(current->comm) - 1, "%s", data->name);
        unlock_kernel();

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
        ginfo = groups_alloc(0);
        if (!ginfo) {
                /* wake the spawner even on failure so it doesn't hang */
                thread->t_flags = SVC_RUNNING;
                wake_up(&thread->t_ctl_waitq);
                return (-ENOMEM);
        }
        set_current_groups(ginfo);
        put_group_info(ginfo);
#endif

        /* Record that the thread is running */
        thread->t_flags = SVC_RUNNING;
        wake_up(&thread->t_ctl_waitq);

        watchdog = lc_watchdog_add(svc->srv_watchdog_timeout,
                                   LC_WATCHDOG_DEFAULT_CB, NULL);

        spin_lock_irqsave(&svc->srv_lock, flags);
        svc->srv_nthreads++;
        spin_unlock_irqrestore(&svc->srv_lock, flags);

        /* XXX maintain a list of all managed devices: insert here */

        while ((thread->t_flags & SVC_STOPPING) == 0 ||
               svc->srv_n_difficult_replies != 0) {
                /* Don't exit while there are replies to be handled */
                struct l_wait_info lwi = LWI_TIMEOUT(svc->srv_rqbd_timeout,
                                                     ptlrpc_retry_rqbds, svc);

                lc_watchdog_disable(watchdog);

                /* sleep until stopping, an idle rqbd needs reposting, a
                 * reply is queued, or a request arrives that this thread
                 * is allowed to handle (see comment below) */
                l_wait_event_exclusive (svc->srv_waitq,
                              ((thread->t_flags & SVC_STOPPING) != 0 &&
                               svc->srv_n_difficult_replies == 0) ||
                              (!list_empty(&svc->srv_idle_rqbds) &&
                               svc->srv_rqbd_timeout == 0) ||
                              !list_empty (&svc->srv_reply_queue) ||
                              (!list_empty (&svc->srv_request_queue) &&
                               (svc->srv_n_difficult_replies == 0 ||
                                svc->srv_n_active_reqs <
                                (svc->srv_nthreads - 1))),
                              &lwi);

                lc_watchdog_touch(watchdog);

                ptlrpc_check_rqbd_pools(svc);

                if (!list_empty (&svc->srv_reply_queue))
                        ptlrpc_server_handle_reply (svc);

                /* only handle requests if there are no difficult replies
                 * outstanding, or I'm not the last thread handling
                 * requests */
                if (!list_empty (&svc->srv_request_queue) &&
                    (svc->srv_n_difficult_replies == 0 ||
                     svc->srv_n_active_reqs < (svc->srv_nthreads - 1)))
                        ptlrpc_server_handle_request (svc);

                if (!list_empty(&svc->srv_idle_rqbds) &&
                    ptlrpc_server_post_idle_rqbds(svc) < 0) {
                        /* I just failed to repost request buffers.  Wait
                         * for a timeout (unless something else happens)
                         * before I try again */
                        svc->srv_rqbd_timeout = HZ/10;
                }
        }

        spin_lock_irqsave(&svc->srv_lock, flags);

        svc->srv_nthreads--;                    /* must know immediately */
        thread->t_flags = SVC_STOPPED;
        wake_up(&thread->t_ctl_waitq);

        spin_unlock_irqrestore(&svc->srv_lock, flags);

        lc_watchdog_delete(watchdog);

        CDEBUG(D_NET, "service thread exiting, process %d\n", current->pid);
        return 0;
}
804
/* Ask one service thread to stop, wait until it has, then unlink it
 * from the service's thread list and free it.  Sleeps, so must be
 * called without srv_lock held. */
static void ptlrpc_stop_thread(struct ptlrpc_service *svc,
                               struct ptlrpc_thread *thread)
{
        struct l_wait_info lwi = { 0 };
        unsigned long      flags;

        spin_lock_irqsave(&svc->srv_lock, flags);
        thread->t_flags = SVC_STOPPING;
        spin_unlock_irqrestore(&svc->srv_lock, flags);

        /* wake everyone: any thread may pick up the stop flag */
        wake_up_all(&svc->srv_waitq);
        l_wait_event(thread->t_ctl_waitq, (thread->t_flags & SVC_STOPPED),
                     &lwi);

        spin_lock_irqsave(&svc->srv_lock, flags);
        list_del(&thread->t_link);
        spin_unlock_irqrestore(&svc->srv_lock, flags);

        OBD_FREE(thread, sizeof(*thread));
}
825
/* Stop and free every thread on @svc->srv_threads.
 *
 * srv_lock is dropped around each ptlrpc_stop_thread() call because
 * stopping a thread sleeps (l_wait_event) and retakes the lock itself;
 * the list head is re-examined after reacquiring the lock, so threads
 * removed concurrently are handled correctly. */
void ptlrpc_stop_all_threads(struct ptlrpc_service *svc)
{
        unsigned long flags;
        struct ptlrpc_thread *thread;

        spin_lock_irqsave(&svc->srv_lock, flags);
        while (!list_empty(&svc->srv_threads)) {
                thread = list_entry(svc->srv_threads.next,
                                    struct ptlrpc_thread, t_link);

                /* must not hold a spinlock across the blocking stop */
                spin_unlock_irqrestore(&svc->srv_lock, flags);
                ptlrpc_stop_thread(svc, thread);
                spin_lock_irqsave(&svc->srv_lock, flags);
        }

        spin_unlock_irqrestore(&svc->srv_lock, flags);
}
843
844 /* @base_name should be 12 characters or less - 3 will be added on */
845 int ptlrpc_start_n_threads(struct obd_device *dev, struct ptlrpc_service *svc,
846                            int num_threads, char *base_name)
847 {
848         int i, rc = 0;
849         ENTRY;
850
851         for (i = 0; i < num_threads; i++) {
852                 char name[32];
853                 sprintf(name, "%s_%02d", base_name, i);
854                 rc = ptlrpc_start_thread(dev, svc, name);
855                 if (rc) {
856                         CERROR("cannot start %s thread #%d: rc %d\n", base_name,
857                                i, rc);
858                         ptlrpc_stop_all_threads(svc);
859                 }
860         }
861         RETURN(rc);
862 }
863
864 int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
865                         char *name)
866 {
867         struct l_wait_info lwi = { 0 };
868         struct ptlrpc_svc_data d;
869         struct ptlrpc_thread *thread;
870         unsigned long flags;
871         int rc;
872         ENTRY;
873
874         OBD_ALLOC(thread, sizeof(*thread));
875         if (thread == NULL)
876                 RETURN(-ENOMEM);
877         init_waitqueue_head(&thread->t_ctl_waitq);
878         
879         d.dev = dev;
880         d.svc = svc;
881         d.name = name;
882         d.thread = thread;
883
884         spin_lock_irqsave(&svc->srv_lock, flags);
885         list_add(&thread->t_link, &svc->srv_threads);
886         spin_unlock_irqrestore(&svc->srv_lock, flags);
887
888         /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we
889          * just drop the VM and FILES in ptlrpc_daemonize() right away.
890          */
891         rc = kernel_thread(ptlrpc_main, &d, CLONE_VM | CLONE_FILES);
892         if (rc < 0) {
893                 CERROR("cannot start thread: %d\n", rc);
894                 OBD_FREE(thread, sizeof(*thread));
895                 RETURN(rc);
896         }
897         l_wait_event(thread->t_ctl_waitq, thread->t_flags & SVC_RUNNING, &lwi);
898
899         RETURN(0);
900 }
901 #endif
902
903 int ptlrpc_unregister_service(struct ptlrpc_service *service)
904 {
905         int                   i;
906         int                   rc;
907         unsigned long         flags;
908         struct ptlrpc_srv_ni *srv_ni;
909         struct l_wait_info    lwi;
910         struct list_head     *tmp;
911
912         LASSERT(list_empty(&service->srv_threads));
913
914         spin_lock (&ptlrpc_all_services_lock);
915         list_del_init (&service->srv_list);
916         spin_unlock (&ptlrpc_all_services_lock);
917
918         ptlrpc_lprocfs_unregister_service(service);
919
920         for (i = 0; i < ptlrpc_ninterfaces; i++) {
921                 srv_ni = &service->srv_interfaces[i];
922                 CDEBUG(D_NET, "%s: tearing down interface %s\n",
923                        service->srv_name, srv_ni->sni_ni->pni_name);
924
925                 /* Unlink all the request buffers.  This forces a 'final'
926                  * event with its 'unlink' flag set for each posted rqbd */
927                 list_for_each(tmp, &srv_ni->sni_active_rqbds) {
928                         struct ptlrpc_request_buffer_desc *rqbd =
929                                 list_entry(tmp, struct ptlrpc_request_buffer_desc, 
930                                            rqbd_list);
931
932                         rc = PtlMDUnlink(rqbd->rqbd_md_h);
933                         LASSERT (rc == PTL_OK || rc == PTL_MD_INVALID);
934                 }
935
936                 /* Wait for the network to release any buffers it's
937                  * currently filling */
938                 for (;;) {
939                         spin_lock_irqsave(&service->srv_lock, flags);
940                         rc = srv_ni->sni_nrqbd_receiving;
941                         spin_unlock_irqrestore(&service->srv_lock, flags);
942
943                         if (rc == 0)
944                                 break;
945                         
946                         /* Network access will complete in finite time but
947                          * the HUGE timeout lets us CWARN for visibility of
948                          * sluggish NALs */
949                         lwi = LWI_TIMEOUT(300 * HZ, NULL, NULL);
950                         rc = l_wait_event(service->srv_waitq,
951                                           srv_ni->sni_nrqbd_receiving == 0,
952                                           &lwi);
953                         if (rc == -ETIMEDOUT)
954                                 CWARN("Waiting for request buffers on "
955                                       "service %s on interface %s ",
956                                       service->srv_name, srv_ni->sni_ni->pni_name);
957                 }
958
959                 /* schedule all outstanding replies to terminate them */
960                 spin_lock_irqsave(&service->srv_lock, flags);
961                 while (!list_empty(&srv_ni->sni_active_replies)) {
962                         struct ptlrpc_reply_state *rs =
963                                 list_entry(srv_ni->sni_active_replies.next,
964                                            struct ptlrpc_reply_state,
965                                            rs_list);
966                         ptlrpc_schedule_difficult_reply(rs);
967                 }
968                 spin_unlock_irqrestore(&service->srv_lock, flags);
969         }
970
971         /* purge the request queue.  NB No new replies (rqbds all unlinked)
972          * and no service threads, so I'm the only thread noodling the
973          * request queue now */
974         while (!list_empty(&service->srv_request_queue)) {
975                 struct ptlrpc_request *req =
976                         list_entry(service->srv_request_queue.next,
977                                    struct ptlrpc_request,
978                                    rq_list);
979                 
980                 list_del(&req->rq_list);
981                 service->srv_n_queued_reqs--;
982                 service->srv_n_active_reqs++;
983
984                 ptlrpc_server_free_request(service, req);
985         }
986         LASSERT(service->srv_n_queued_reqs == 0);
987         LASSERT(service->srv_n_active_reqs == 0);
988
989         for (i = 0; i < ptlrpc_ninterfaces; i++) {
990                 srv_ni = &service->srv_interfaces[i];
991                 LASSERT(list_empty(&srv_ni->sni_active_rqbds));
992         }
993
994         /* Now free all the request buffers since nothing references them
995          * any more... */
996         while (!list_empty(&service->srv_idle_rqbds)) {
997                 struct ptlrpc_request_buffer_desc *rqbd =
998                         list_entry(service->srv_idle_rqbds.next,
999                                    struct ptlrpc_request_buffer_desc, 
1000                                    rqbd_list);
1001
1002                 ptlrpc_free_rqbd(rqbd);
1003         }
1004
1005         /* wait for all outstanding replies to complete (they were
1006          * scheduled having been flagged to abort above) */
1007         while (atomic_read(&service->srv_outstanding_replies) != 0) {
1008                 struct l_wait_info lwi = LWI_TIMEOUT(10 * HZ, NULL, NULL);
1009
1010                 rc = l_wait_event(service->srv_waitq,
1011                                   !list_empty(&service->srv_reply_queue), &lwi);
1012                 LASSERT(rc == 0 || rc == -ETIMEDOUT);
1013
1014                 if (rc == 0) {
1015                         ptlrpc_server_handle_reply(service);
1016                         continue;
1017                 }
1018                 CWARN("Unexpectedly long timeout %p\n", service);
1019         }
1020
1021         OBD_FREE(service,
1022                  offsetof(struct ptlrpc_service,
1023                           srv_interfaces[ptlrpc_ninterfaces]));
1024         return 0;
1025 }