/* Whamcloud gitweb: land b1_4_bug5025 on b1_4
 * [fs/lustre-release.git] / lustre / ptlrpc / service.c */
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  Copyright (C) 2002 Cluster File Systems, Inc.
5  *
6  *   This file is part of Lustre, http://www.lustre.org.
7  *
8  *   Lustre is free software; you can redistribute it and/or
9  *   modify it under the terms of version 2 of the GNU General Public
10  *   License as published by the Free Software Foundation.
11  *
12  *   Lustre is distributed in the hope that it will be useful,
13  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *   GNU General Public License for more details.
16  *
17  *   You should have received a copy of the GNU General Public License
18  *   along with Lustre; if not, write to the Free Software
19  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20  *
21  */
22
23 #define DEBUG_SUBSYSTEM S_RPC
24 #ifndef __KERNEL__
25 #include <liblustre.h>
26 #include <linux/kp30.h>
27 #endif
28 #include <linux/obd_support.h>
29 #include <linux/obd_class.h>
30 #include <linux/lustre_net.h>
31 #include <portals/types.h>
32 #include "ptlrpc_internal.h"
33
/* forward ref */
static int ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc);

/* List of every service instance, guarded by ptlrpc_all_services_lock;
 * liblustre walks it unlocked (single-threaded -- see
 * liblustre_check_services below). */
static LIST_HEAD (ptlrpc_all_services);
static spinlock_t ptlrpc_all_services_lock = SPIN_LOCK_UNLOCKED;
39
40 static void
41 ptlrpc_free_server_req (struct ptlrpc_request *req)
42 {
43         /* The last request to be received into a request buffer uses space
44          * in the request buffer descriptor, otherwise requests are
45          * allocated dynamically in the incoming reply event handler */
46         if (req == &req->rq_rqbd->rqbd_req)
47                 return;
48
49         OBD_FREE(req, sizeof(*req));
50 }
51
52 static char *
53 ptlrpc_alloc_request_buffer (int size)
54 {
55         char *ptr;
56
57         if (size > SVC_BUF_VMALLOC_THRESHOLD)
58                 OBD_VMALLOC(ptr, size);
59         else
60                 OBD_ALLOC(ptr, size);
61
62         return (ptr);
63 }
64
65 static void
66 ptlrpc_free_request_buffer (char *ptr, int size)
67 {
68         if (size > SVC_BUF_VMALLOC_THRESHOLD)
69                 OBD_VFREE(ptr, size);
70         else
71                 OBD_FREE(ptr, size);
72 }
73
/* Allocate a request buffer descriptor plus its receive buffer for
 * @srv_ni and queue it on the service's idle list, ready for posting.
 * Returns the new rqbd, or NULL if either allocation fails. */
struct ptlrpc_request_buffer_desc *
ptlrpc_alloc_rqbd (struct ptlrpc_srv_ni *srv_ni)
{
        struct ptlrpc_service             *svc = srv_ni->sni_service;
        unsigned long                      flags;
        struct ptlrpc_request_buffer_desc *rqbd;

        OBD_ALLOC(rqbd, sizeof (*rqbd));
        if (rqbd == NULL)
                return (NULL);

        /* the request-in event callback receives the rqbd itself as arg */
        rqbd->rqbd_srv_ni = srv_ni;
        rqbd->rqbd_refcount = 0;
        rqbd->rqbd_cbid.cbid_fn = request_in_callback;
        rqbd->rqbd_cbid.cbid_arg = rqbd;
        rqbd->rqbd_buffer = ptlrpc_alloc_request_buffer(svc->srv_buf_size);

        if (rqbd->rqbd_buffer == NULL) {
                OBD_FREE(rqbd, sizeof (*rqbd));
                return (NULL);
        }

        /* srv_lock protects the idle list and the buffer count */
        spin_lock_irqsave (&svc->srv_lock, flags);
        list_add(&rqbd->rqbd_list, &svc->srv_idle_rqbds);
        svc->srv_nbufs++;
        spin_unlock_irqrestore (&svc->srv_lock, flags);

        return (rqbd);
}
103
/* Tear down a request buffer descriptor: unhook it from the service's
 * lists, free its receive buffer and then the descriptor itself.  The
 * rqbd must be idle (refcount == 0, i.e. not posted and no requests
 * outstanding in it). */
void
ptlrpc_free_rqbd (struct ptlrpc_request_buffer_desc *rqbd)
{
        struct ptlrpc_srv_ni  *sni = rqbd->rqbd_srv_ni;
        struct ptlrpc_service *svc = sni->sni_service;
        unsigned long          flags;

        LASSERT (rqbd->rqbd_refcount == 0);

        spin_lock_irqsave(&svc->srv_lock, flags);
        list_del(&rqbd->rqbd_list);
        svc->srv_nbufs--;
        spin_unlock_irqrestore(&svc->srv_lock, flags);

        ptlrpc_free_request_buffer (rqbd->rqbd_buffer, svc->srv_buf_size);
        OBD_FREE (rqbd, sizeof (*rqbd));
}
121
/* Allocate and post one group (srv_nbuf_per_group) of fresh request
 * buffers on @srv_ni.  Returns 0 on success, -ENOMEM if a buffer can't
 * be allocated, or -EAGAIN if posting failed (the caller retries
 * later; the allocated buffers stay on the idle list). */
int
ptlrpc_grow_req_bufs(struct ptlrpc_srv_ni *srv_ni)
{
        struct ptlrpc_service             *svc = srv_ni->sni_service;
        struct ptlrpc_request_buffer_desc *rqbd;
        int                                i;

        CDEBUG(D_RPCTRACE, "%s: allocate %d new %d-byte reqbufs (%d/%d left)\n",
               svc->srv_name, svc->srv_nbuf_per_group, svc->srv_buf_size,
               srv_ni->sni_nrqbd_receiving, svc->srv_nbufs);
        for (i = 0; i < svc->srv_nbuf_per_group; i++) {
                rqbd = ptlrpc_alloc_rqbd(srv_ni);

                if (rqbd == NULL) {
                        CERROR ("%s/%s: Can't allocate request buffer\n",
                                svc->srv_name, srv_ni->sni_ni->pni_name);
                        return (-ENOMEM);
                }

                /* post each buffer as soon as it is allocated */
                if (ptlrpc_server_post_idle_rqbds(svc) < 0)
                        return (-EAGAIN);
        }

        return (0);
}
147
148 void
149 ptlrpc_save_lock (struct ptlrpc_request *req, 
150                   struct lustre_handle *lock, int mode)
151 {
152         struct ptlrpc_reply_state *rs = req->rq_reply_state;
153         int                        idx;
154
155         LASSERT (rs != NULL);
156         LASSERT (rs->rs_nlocks < RS_MAX_LOCKS);
157
158         idx = rs->rs_nlocks++;
159         rs->rs_locks[idx] = *lock;
160         rs->rs_modes[idx] = mode;
161         rs->rs_difficult = 1;
162 }
163
/* Queue a 'difficult' reply for a service thread to attend to and wake
 * one up.  Caller must hold svc->srv_lock.  A no-op if the reply is
 * already scheduled (or being set up). */
void
ptlrpc_schedule_difficult_reply (struct ptlrpc_reply_state *rs)
{
        struct ptlrpc_service *svc = rs->rs_srv_ni->sni_service;

#ifdef CONFIG_SMP
        LASSERT (spin_is_locked (&svc->srv_lock));
#endif
        LASSERT (rs->rs_difficult);
        rs->rs_scheduled_ever = 1;              /* flag any notification attempt */

        if (rs->rs_scheduled)                   /* being set up or already notified */
                return;

        rs->rs_scheduled = 1;
        /* move off whatever list it is on, onto the service reply queue */
        list_del (&rs->rs_list);
        list_add (&rs->rs_list, &svc->srv_reply_queue);
        wake_up (&svc->srv_waitq);
}
183
/* Scan @obd's uncommitted replies for any whose transno is now
 * committed and schedule them on their services for completion. */
void
ptlrpc_commit_replies (struct obd_device *obd)
{
        struct list_head   *tmp;
        struct list_head   *nxt;
        unsigned long       flags;

        /* Find any replies that have been committed and get their service
         * to attend to complete them. */

        /* CAVEAT EMPTOR: spinlock ordering!!! */
        spin_lock_irqsave (&obd->obd_uncommitted_replies_lock, flags);

        list_for_each_safe (tmp, nxt, &obd->obd_uncommitted_replies) {
                struct ptlrpc_reply_state *rs =
                        list_entry (tmp, struct ptlrpc_reply_state, rs_obd_list);

                LASSERT (rs->rs_difficult);

                if (rs->rs_transno <= obd->obd_last_committed) {
                        struct ptlrpc_service *svc = rs->rs_srv_ni->sni_service;

                        /* srv_lock nests inside obd_uncommitted_replies_lock */
                        spin_lock (&svc->srv_lock);
                        list_del_init (&rs->rs_obd_list);
                        ptlrpc_schedule_difficult_reply (rs);
                        spin_unlock (&svc->srv_lock);
                }
        }

        spin_unlock_irqrestore (&obd->obd_uncommitted_replies_lock, flags);
}
215
/* Return (large - small) in microseconds. */
static long
timeval_sub(struct timeval *large, struct timeval *small)
{
        long dsec  = large->tv_sec  - small->tv_sec;
        long dusec = large->tv_usec - small->tv_usec;

        return dsec * 1000000 + dusec;
}
222
/* Post every rqbd on the service's idle list for receive.  Returns 1
 * if at least one buffer was posted, 0 if the idle list was empty, or
 * -1 if a post failed -- in which case the failed rqbd is returned to
 * the idle list and its optimistic accounting is rolled back. */
static int
ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc)
{
        struct ptlrpc_srv_ni              *srv_ni;
        struct ptlrpc_request_buffer_desc *rqbd;
        unsigned long                      flags;
        int                                rc;
        int                                posted = 0;

        for (;;) {
                spin_lock_irqsave(&svc->srv_lock, flags);

                if (list_empty (&svc->srv_idle_rqbds)) {
                        spin_unlock_irqrestore(&svc->srv_lock, flags);
                        return (posted);
                }

                rqbd = list_entry(svc->srv_idle_rqbds.next,
                                  struct ptlrpc_request_buffer_desc,
                                  rqbd_list);
                list_del (&rqbd->rqbd_list);

                /* assume we will post successfully */
                srv_ni = rqbd->rqbd_srv_ni;
                srv_ni->sni_nrqbd_receiving++;
                list_add (&rqbd->rqbd_list, &srv_ni->sni_active_rqbds);

                spin_unlock_irqrestore(&svc->srv_lock, flags);

                /* register with the network outside the lock */
                rc = ptlrpc_register_rqbd(rqbd);
                if (rc != 0)
                        break;

                posted = 1;
        }

        /* failure path: undo the optimistic accounting for this rqbd */
        spin_lock_irqsave(&svc->srv_lock, flags);

        srv_ni->sni_nrqbd_receiving--;
        list_del(&rqbd->rqbd_list);
        list_add_tail(&rqbd->rqbd_list, &svc->srv_idle_rqbds);

        if (srv_ni->sni_nrqbd_receiving == 0) {
                /* This service is off-air on this interface because all
                 * its request buffers are busy.  Portals will have started
                 * dropping incoming requests until more buffers get
                 * posted */
                CERROR("All %s %s request buffers busy\n",
                       svc->srv_name, srv_ni->sni_ni->pni_name);
        }

        spin_unlock_irqrestore (&svc->srv_lock, flags);

        return (-1);
}
278
/* Create and start a service: allocate the service plus per-interface
 * state for every ptlrpc interface, register it on the global service
 * list, post the initial request buffers on each interface, and hook
 * up /proc stats if @proc_entry is given.
 *
 * @nbufs     request buffers allocated per group (per interface)
 * @bufsize   size of each request buffer (must hold @max_req_size)
 * @handler   callback invoked by service threads for each request
 * Returns the new service, or NULL on failure (partially-constructed
 * state is torn down via ptlrpc_unregister_service). */
struct ptlrpc_service *
ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size,
                int req_portal, int rep_portal, int watchdog_timeout,
                svc_handler_t handler, char *name,
                struct proc_dir_entry *proc_entry)
{
        int                                i;
        int                                rc;
        int                                ssize;
        struct ptlrpc_service             *service;
        struct ptlrpc_srv_ni              *srv_ni;
        ENTRY;

        LASSERT (ptlrpc_ninterfaces > 0);
        LASSERT (nbufs > 0);
        LASSERT (bufsize >= max_req_size);

        /* service struct has a variable-length interface array at the end */
        ssize = offsetof (struct ptlrpc_service,
                          srv_interfaces[ptlrpc_ninterfaces]);
        OBD_ALLOC(service, ssize);
        if (service == NULL)
                RETURN(NULL);

        service->srv_name = name;
        spin_lock_init(&service->srv_lock);
        INIT_LIST_HEAD(&service->srv_threads);
        init_waitqueue_head(&service->srv_waitq);

        service->srv_nbuf_per_group = nbufs;
        service->srv_max_req_size = max_req_size;
        service->srv_buf_size = bufsize;
        service->srv_rep_portal = rep_portal;
        service->srv_req_portal = req_portal;
        service->srv_watchdog_timeout = watchdog_timeout;
        service->srv_handler = handler;

        INIT_LIST_HEAD(&service->srv_request_queue);
        INIT_LIST_HEAD(&service->srv_idle_rqbds);
        INIT_LIST_HEAD(&service->srv_reply_queue);

        /* First initialise enough for early teardown */
        for (i = 0; i < ptlrpc_ninterfaces; i++) {
                srv_ni = &service->srv_interfaces[i];

                srv_ni->sni_service = service;
                srv_ni->sni_ni = &ptlrpc_interfaces[i];
                INIT_LIST_HEAD(&srv_ni->sni_active_rqbds);
                INIT_LIST_HEAD(&srv_ni->sni_active_replies);
        }

        spin_lock (&ptlrpc_all_services_lock);
        list_add (&service->srv_list, &ptlrpc_all_services);
        spin_unlock (&ptlrpc_all_services_lock);

        /* Now allocate the request buffers, assuming all interfaces require
         * the same number. */
        for (i = 0; i < ptlrpc_ninterfaces; i++) {
                srv_ni = &service->srv_interfaces[i];
                CDEBUG (D_NET, "%s: initialising interface %s\n", name,
                        srv_ni->sni_ni->pni_name);

                rc = ptlrpc_grow_req_bufs(srv_ni);
                /* We shouldn't be under memory pressure at startup, so
                 * fail if we can't post all our buffers at this time. */
                if (rc != 0)
                        GOTO(failed, NULL);
        }

        if (proc_entry != NULL)
                ptlrpc_lprocfs_register_service(proc_entry, service);

        CDEBUG(D_NET, "%s: Started on %d interfaces, listening on portal %d\n",
               service->srv_name, ptlrpc_ninterfaces, service->srv_req_portal);

        RETURN(service);
failed:
        ptlrpc_unregister_service(service);
        return NULL;
}
358
/* Drop the server's bookkeeping for a completed request and free it.
 * When the last request referencing a request buffer finishes, the
 * buffer returns to the idle list so it can be reposted. */
static void
ptlrpc_server_free_request(struct ptlrpc_service *svc, struct ptlrpc_request *req)
{
        unsigned long  flags;
        int            refcount;

        spin_lock_irqsave(&svc->srv_lock, flags);
        svc->srv_n_active_reqs--;
        refcount = --(req->rq_rqbd->rqbd_refcount);
        if (refcount == 0) {
                /* request buffer is now idle */
                list_del(&req->rq_rqbd->rqbd_list);
                list_add_tail(&req->rq_rqbd->rqbd_list,
                              &svc->srv_idle_rqbds);
        }
        spin_unlock_irqrestore(&svc->srv_lock, flags);

        ptlrpc_free_server_req(req);
}
378
/* Dequeue and process one incoming request: unpack and sanity-check
 * it, account queue latency, dispatch it to the service handler and
 * finally release it.  Returns 1 if a request was handled, 0 if the
 * queue was empty or this thread must stay free for difficult
 * replies. */
static int
ptlrpc_server_handle_request (struct ptlrpc_service *svc)
{
        struct ptlrpc_request *request;
        unsigned long          flags;
        struct timeval         work_start;
        struct timeval         work_end;
        long                   timediff;
        int                    rc;
        ENTRY;

        spin_lock_irqsave (&svc->srv_lock, flags);
        if (list_empty (&svc->srv_request_queue) ||
            (svc->srv_n_difficult_replies != 0 &&
             svc->srv_n_active_reqs >= (svc->srv_nthreads - 1))) {
                /* If all the other threads are handling requests, I must
                 * remain free to handle any 'difficult' reply that might
                 * block them */
                spin_unlock_irqrestore (&svc->srv_lock, flags);
                RETURN(0);
        }

        request = list_entry (svc->srv_request_queue.next,
                              struct ptlrpc_request, rq_list);
        list_del_init (&request->rq_list);
        svc->srv_n_queued_reqs--;
        svc->srv_n_active_reqs++;

        spin_unlock_irqrestore (&svc->srv_lock, flags);

        /* how long did this request sit in the queue? */
        do_gettimeofday(&work_start);
        timediff = timeval_sub(&work_start, &request->rq_arrival_time);
        if (svc->srv_stats != NULL) {
                lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR,
                                    timediff);
                lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR,
                                    svc->srv_n_queued_reqs);
                lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR,
                                    svc->srv_n_active_reqs);
        }

#if SWAB_PARANOIA
        /* Clear request swab mask; this is a new request */
        request->rq_req_swab_mask = 0;
#endif
        rc = lustre_unpack_msg (request->rq_reqmsg, request->rq_reqlen);
        if (rc != 0) {
                CERROR ("error unpacking request: ptl %d from %s"
                        " xid "LPU64"\n", svc->srv_req_portal,
                        request->rq_peerstr, request->rq_xid);
                /* NOTE(review): the 'out' path below still reads
                 * rq_reqmsg->opc even though unpacking failed -- confirm
                 * this is safe */
                goto out;
        }

        rc = -EINVAL;
        if (request->rq_reqmsg->type != PTL_RPC_MSG_REQUEST) {
                CERROR("wrong packet type received (type=%u) from %s\n",
                       request->rq_reqmsg->type, request->rq_peerstr);
                goto out;
        }

        CDEBUG(D_NET, "got req "LPD64"\n", request->rq_xid);

        /* Discard requests queued for longer than my timeout.  If the
         * client's timeout is similar to mine, she'll be timing out this
         * REQ anyway (bug 1502) */
        if (timediff / 1000000 > (long)obd_timeout) {
                CERROR("Dropping timed-out opc %d request from %s"
                       ": %ld seconds old\n", request->rq_reqmsg->opc,
                       request->rq_peerstr,
                       timediff / 1000000);
                goto out;
        }

        request->rq_export = class_conn2export(&request->rq_reqmsg->handle);

        if (request->rq_export) {
                /* drop requests from a connection generation older than
                 * the export's current one (client has reconnected) */
                if (request->rq_reqmsg->conn_cnt <
                    request->rq_export->exp_conn_cnt) {
                        DEBUG_REQ(D_ERROR, request,
                                  "DROPPING req from old connection %d < %d",
                                  request->rq_reqmsg->conn_cnt,
                                  request->rq_export->exp_conn_cnt);
                        goto put_conn;
                }

                request->rq_export->exp_last_request_time = CURRENT_SECONDS;
        }

        CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:ni:nid:opc "
               "%s:%s+%d:%d:"LPU64":%s:%s:%d\n", current->comm,
               (request->rq_export ?
                (char *)request->rq_export->exp_client_uuid.uuid : "0"),
               (request->rq_export ?
                atomic_read(&request->rq_export->exp_refcount) : -99),
               request->rq_reqmsg->status, request->rq_xid,
               request->rq_peer.peer_ni->pni_name,
               request->rq_peerstr,
               request->rq_reqmsg->opc);

        /* dispatch to the service-specific handler */
        rc = svc->srv_handler(request);

        CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:ni:nid:opc "
               "%s:%s+%d:%d:"LPU64":%s:%s:%d\n", current->comm,
               (request->rq_export ?
                (char *)request->rq_export->exp_client_uuid.uuid : "0"),
               (request->rq_export ?
                atomic_read(&request->rq_export->exp_refcount) : -99),
               request->rq_reqmsg->status, request->rq_xid,
               request->rq_peer.peer_ni->pni_name,
               request->rq_peerstr,
               request->rq_reqmsg->opc);

put_conn:
        if (request->rq_export != NULL)
                class_export_put(request->rq_export);

 out:
        do_gettimeofday(&work_end);

        timediff = timeval_sub(&work_end, &work_start);

        /* log at error level if processing took longer than obd_timeout */
        CDEBUG((timediff / 1000000 > (long)obd_timeout) ? D_ERROR : D_HA,
               "request "LPU64" opc %u from %s processed in %ldus "
               "(%ldus total)\n", request->rq_xid, request->rq_reqmsg->opc,
               request->rq_peerstr,
               timediff, timeval_sub(&work_end, &request->rq_arrival_time));

        if (svc->srv_stats != NULL) {
                int opc = opcode_offset(request->rq_reqmsg->opc);
                if (opc > 0) {
                        LASSERT(opc < LUSTRE_MAX_OPCODES);
                        lprocfs_counter_add(svc->srv_stats,
                                            opc + PTLRPC_LAST_CNTR,
                                            timediff);
                }
        }

        ptlrpc_server_free_request(svc, request);

        RETURN(1);
}
520
/* Take one 'difficult' reply off the service's reply queue and attend
 * to it: detach it from commit/export notifier lists, release any
 * saved ack locks, and free it once it is off the net.  Returns 1 if a
 * reply was handled, 0 if the queue was empty. */
static int
ptlrpc_server_handle_reply (struct ptlrpc_service *svc)
{
        struct ptlrpc_reply_state *rs;
        unsigned long              flags;
        struct obd_export         *exp;
        struct obd_device         *obd;
        int                        nlocks;
        int                        been_handled;
        char                       str[PTL_NALFMT_SIZE];
        ENTRY;

        spin_lock_irqsave (&svc->srv_lock, flags);
        if (list_empty (&svc->srv_reply_queue)) {
                spin_unlock_irqrestore (&svc->srv_lock, flags);
                RETURN(0);
        }

        rs = list_entry (svc->srv_reply_queue.next,
                         struct ptlrpc_reply_state, rs_list);

        exp = rs->rs_export;
        obd = exp->exp_obd;

        LASSERT (rs->rs_difficult);
        LASSERT (rs->rs_scheduled);

        list_del_init (&rs->rs_list);

        /* Disengage from notifiers carefully (lock ordering!) */
        /* NB interrupts stay disabled across these plain lock/unlock
         * pairs; flags is only restored on the exit paths below */
        spin_unlock(&svc->srv_lock);

        spin_lock (&obd->obd_uncommitted_replies_lock);
        /* Noop if removed already */
        list_del_init (&rs->rs_obd_list);
        spin_unlock (&obd->obd_uncommitted_replies_lock);

        spin_lock (&exp->exp_lock);
        /* Noop if removed already */
        list_del_init (&rs->rs_exp_list);
        spin_unlock (&exp->exp_lock);

        spin_lock(&svc->srv_lock);

        been_handled = rs->rs_handled;
        rs->rs_handled = 1;

        nlocks = rs->rs_nlocks;                 /* atomic "steal", but */
        rs->rs_nlocks = 0;                      /* locks still on rs_locks! */

        if (nlocks == 0 && !been_handled) {
                /* If we see this, we should already have seen the warning
                 * in mds_steal_ack_locks()  */
                CWARN("All locks stolen from rs %p x"LPD64".t"LPD64
                      " o%d NID %s\n",
                      rs,
                      rs->rs_xid, rs->rs_transno,
                      rs->rs_msg.opc,
                      ptlrpc_peernid2str(&exp->exp_connection->c_peer, str));
        }

        if ((!been_handled && rs->rs_on_net) ||
            nlocks > 0) {
                /* drop the lock to unlink the MD / decref locks */
                spin_unlock_irqrestore(&svc->srv_lock, flags);

                if (!been_handled && rs->rs_on_net) {
                        PtlMDUnlink(rs->rs_md_h);
                        /* Ignore return code; we're racing with
                         * completion... */
                }

                while (nlocks-- > 0)
                        ldlm_lock_decref(&rs->rs_locks[nlocks],
                                         rs->rs_modes[nlocks]);

                spin_lock_irqsave(&svc->srv_lock, flags);
        }

        rs->rs_scheduled = 0;

        if (!rs->rs_on_net) {
                /* Off the net */
                svc->srv_n_difficult_replies--;
                spin_unlock_irqrestore(&svc->srv_lock, flags);

                class_export_put (exp);
                rs->rs_export = NULL;
                lustre_free_reply_state (rs);
                atomic_dec (&svc->srv_outstanding_replies);
                RETURN(1);
        }

        /* still on the net; callback will schedule */
        spin_unlock_irqrestore (&svc->srv_lock, flags);
        RETURN(1);
}
617
618 #ifndef __KERNEL__
/* FIXME make use of timeout later */
/* Userspace (liblustre) service poll: walk every registered service
 * and drain its reply queue, request queue and idle rqbd list until
 * nothing makes progress.  Returns non-zero if any work was done. */
int
liblustre_check_services (void *arg)
{
        int  did_something = 0;
        int  rc;
        struct list_head *tmp, *nxt;
        ENTRY;

        /* I'm relying on being single threaded, not to have to lock
         * ptlrpc_all_services etc */
        list_for_each_safe (tmp, nxt, &ptlrpc_all_services) {
                struct ptlrpc_service *svc =
                        list_entry (tmp, struct ptlrpc_service, srv_list);

                if (svc->srv_nthreads != 0)     /* I've recursed */
                        continue;

                /* service threads can block for bulk, so this limits us
                 * (arbitrarily) to recursing 1 stack frame per service.
                 * Note that the problem with recursion is that we have to
                 * unwind completely before our caller can resume. */

                svc->srv_nthreads++;

                /* loop until a full pass makes no progress */
                do {
                        rc = ptlrpc_server_handle_reply(svc);
                        rc |= ptlrpc_server_handle_request(svc);
                        rc |= (ptlrpc_server_post_idle_rqbds(svc) > 0);
                        did_something |= rc;
                } while (rc);

                svc->srv_nthreads--;
        }

        RETURN(did_something);
}
656
657 #else /* __KERNEL__ */
658
/* Don't use daemonize, it removes fs struct from new thread (bug 418) */
/* Manually detach the calling task as a kernel daemon: release its
 * user address space and open files and reparent it to init, while
 * keeping its fs struct (which daemonize() would drop).
 * NOTE(review): call order appears deliberate (helper runs between
 * exit_mm and exit_files) -- preserve it. */
void ptlrpc_daemonize(void)
{
        exit_mm(current);
        lustre_daemonize_helper();
        exit_files(current);
        reparent_to_init();
}
667
668 static void
669 ptlrpc_check_rqbd_pools(struct ptlrpc_service *svc)
670 {
671         struct ptlrpc_srv_ni  *sni;
672         int                    i;
673         int                    avail = 0;
674         int                    low_water = svc->srv_nbuf_per_group/2;
675
676         for (i = 0; i < ptlrpc_ninterfaces; i++) {
677                 sni = &svc->srv_interfaces[i];
678
679                 avail += sni->sni_nrqbd_receiving;
680                 /* NB I'm not locking; just looking. */
681                 if (sni->sni_nrqbd_receiving <= low_water)
682                         ptlrpc_grow_req_bufs(sni);
683         }
684
685         lprocfs_counter_add(svc->srv_stats, PTLRPC_REQBUF_AVAIL_CNTR, avail);
686 }
687
688 static int
689 ptlrpc_retry_rqbds(void *arg)
690 {
691         struct ptlrpc_service *svc = (struct ptlrpc_service *)arg;
692         
693         svc->srv_rqbd_timeout = 0;
694         return (-ETIMEDOUT);
695 }
696
/* Body of a ptlrpc service thread: daemonize, block signals, then loop
 * handling replies, requests and buffer reposting until asked to stop
 * (and no difficult replies remain).  Returns 0 on normal exit,
 * -ENOMEM if group setup fails. */
static int ptlrpc_main(void *arg)
{
        struct ptlrpc_svc_data *data = (struct ptlrpc_svc_data *)arg;
        struct ptlrpc_service  *svc = data->svc;
        struct ptlrpc_thread   *thread = data->thread;
        struct lc_watchdog     *watchdog;
        unsigned long           flags;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
        struct group_info *ginfo = NULL;
#endif
        ENTRY;

        lock_kernel();
        ptlrpc_daemonize();

        /* block all signals; the thread is controlled via t_flags */
        SIGNAL_MASK_LOCK(current, flags);
        sigfillset(&current->blocked);
        RECALC_SIGPENDING;
        SIGNAL_MASK_UNLOCK(current, flags);

        LASSERTF(strlen(data->name) < sizeof(current->comm),
                 "name %d > len %d\n",
                 (int)strlen(data->name), (int)sizeof(current->comm));
        THREAD_NAME(current->comm, sizeof(current->comm) - 1, "%s", data->name);
        unlock_kernel();

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
        ginfo = groups_alloc(0);
        if (!ginfo) {
                /* wake the spawner even on failure so it doesn't hang */
                thread->t_flags = SVC_RUNNING;
                wake_up(&thread->t_ctl_waitq);
                return (-ENOMEM);
        }
        set_current_groups(ginfo);
        put_group_info(ginfo);
#endif

        /* Record that the thread is running */
        thread->t_flags = SVC_RUNNING;
        wake_up(&thread->t_ctl_waitq);

        watchdog = lc_watchdog_add(svc->srv_watchdog_timeout,
                                   LC_WATCHDOG_DEFAULT_CB, NULL);

        spin_lock_irqsave(&svc->srv_lock, flags);
        svc->srv_nthreads++;
        spin_unlock_irqrestore(&svc->srv_lock, flags);

        /* XXX maintain a list of all managed devices: insert here */

        while ((thread->t_flags & SVC_STOPPING) == 0 ||
               svc->srv_n_difficult_replies != 0) {
                /* Don't exit while there are replies to be handled */
                struct l_wait_info lwi = LWI_TIMEOUT(svc->srv_rqbd_timeout,
                                                     ptlrpc_retry_rqbds, svc);

                lc_watchdog_disable(watchdog);

                /* sleep until stopping, an idle rqbd needs reposting, a
                 * reply is queued, or a request arrives that this thread
                 * is allowed to handle (see comment below) */
                l_wait_event_exclusive (svc->srv_waitq,
                              ((thread->t_flags & SVC_STOPPING) != 0 &&
                               svc->srv_n_difficult_replies == 0) ||
                              (!list_empty(&svc->srv_idle_rqbds) &&
                               svc->srv_rqbd_timeout == 0) ||
                              !list_empty (&svc->srv_reply_queue) ||
                              (!list_empty (&svc->srv_request_queue) &&
                               (svc->srv_n_difficult_replies == 0 ||
                                svc->srv_n_active_reqs <
                                (svc->srv_nthreads - 1))),
                              &lwi);

                lc_watchdog_touch(watchdog);

                ptlrpc_check_rqbd_pools(svc);

                if (!list_empty (&svc->srv_reply_queue))
                        ptlrpc_server_handle_reply (svc);

                /* only handle requests if there are no difficult replies
                 * outstanding, or I'm not the last thread handling
                 * requests */
                if (!list_empty (&svc->srv_request_queue) &&
                    (svc->srv_n_difficult_replies == 0 ||
                     svc->srv_n_active_reqs < (svc->srv_nthreads - 1)))
                        ptlrpc_server_handle_request (svc);

                if (!list_empty(&svc->srv_idle_rqbds) &&
                    ptlrpc_server_post_idle_rqbds(svc) < 0) {
                        /* I just failed to repost request buffers.  Wait
                         * for a timeout (unless something else happens)
                         * before I try again */
                        svc->srv_rqbd_timeout = HZ/10;
                }
        }

        spin_lock_irqsave(&svc->srv_lock, flags);

        svc->srv_nthreads--;                    /* must know immediately */
        thread->t_flags = SVC_STOPPED;
        wake_up(&thread->t_ctl_waitq);

        spin_unlock_irqrestore(&svc->srv_lock, flags);

        lc_watchdog_delete(watchdog);

        CDEBUG(D_NET, "service thread exiting, process %d\n", current->pid);
        return 0;
}
804
/* Ask one service thread to stop, wait until it has, then unlink it
 * from the service's thread list and free it.  Sleeps, so must be
 * called without srv_lock held. */
static void ptlrpc_stop_thread(struct ptlrpc_service *svc,
                               struct ptlrpc_thread *thread)
{
        struct l_wait_info lwi = { 0 };
        unsigned long      flags;

        spin_lock_irqsave(&svc->srv_lock, flags);
        thread->t_flags = SVC_STOPPING;
        spin_unlock_irqrestore(&svc->srv_lock, flags);

        /* wake everyone: any thread may pick up the stop flag */
        wake_up_all(&svc->srv_waitq);
        l_wait_event(thread->t_ctl_waitq, (thread->t_flags & SVC_STOPPED),
                     &lwi);

        spin_lock_irqsave(&svc->srv_lock, flags);
        list_del(&thread->t_link);
        spin_unlock_irqrestore(&svc->srv_lock, flags);

        OBD_FREE(thread, sizeof(*thread));
}
825
/* Stop and free every thread on @svc->srv_threads.
 *
 * srv_lock is dropped around each ptlrpc_stop_thread() call because
 * stopping a thread sleeps (l_wait_event) and retakes the lock itself;
 * the list head is re-examined after reacquiring the lock, so threads
 * removed concurrently are handled correctly. */
void ptlrpc_stop_all_threads(struct ptlrpc_service *svc)
{
        unsigned long flags;
        struct ptlrpc_thread *thread;

        spin_lock_irqsave(&svc->srv_lock, flags);
        while (!list_empty(&svc->srv_threads)) {
                thread = list_entry(svc->srv_threads.next,
                                    struct ptlrpc_thread, t_link);

                /* must not hold a spinlock across the blocking stop */
                spin_unlock_irqrestore(&svc->srv_lock, flags);
                ptlrpc_stop_thread(svc, thread);
                spin_lock_irqsave(&svc->srv_lock, flags);
        }

        spin_unlock_irqrestore(&svc->srv_lock, flags);
}
843
844 /* @base_name should be 12 characters or less - 3 will be added on */
845 int ptlrpc_start_n_threads(struct obd_device *dev, struct ptlrpc_service *svc,
846                            int num_threads, char *base_name)
847 {
848         int i, rc = 0;
849         ENTRY;
850
851         for (i = 0; i < num_threads; i++) {
852                 char name[32];
853                 sprintf(name, "%s_%02d", base_name, i);
854                 rc = ptlrpc_start_thread(dev, svc, name);
855                 if (rc) {
856                         CERROR("cannot start %s thread #%d: rc %d\n", base_name,
857                                i, rc);
858                         ptlrpc_stop_all_threads(svc);
859                 }
860         }
861         RETURN(rc);
862 }
863
864 int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
865                         char *name)
866 {
867         struct l_wait_info lwi = { 0 };
868         struct ptlrpc_svc_data d;
869         struct ptlrpc_thread *thread;
870         unsigned long flags;
871         int rc;
872         ENTRY;
873
874         OBD_ALLOC(thread, sizeof(*thread));
875         if (thread == NULL)
876                 RETURN(-ENOMEM);
877         init_waitqueue_head(&thread->t_ctl_waitq);
878         
879         d.dev = dev;
880         d.svc = svc;
881         d.name = name;
882         d.thread = thread;
883
884         spin_lock_irqsave(&svc->srv_lock, flags);
885         list_add(&thread->t_link, &svc->srv_threads);
886         spin_unlock_irqrestore(&svc->srv_lock, flags);
887
888         /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we
889          * just drop the VM and FILES in ptlrpc_daemonize() right away.
890          */
891         rc = kernel_thread(ptlrpc_main, &d, CLONE_VM | CLONE_FILES);
892         if (rc < 0) {
893                 CERROR("cannot start thread: %d\n", rc);
894                 OBD_FREE(thread, sizeof(*thread));
895                 RETURN(rc);
896         }
897         l_wait_event(thread->t_ctl_waitq, thread->t_flags & SVC_RUNNING, &lwi);
898
899         RETURN(0);
900 }
901 #endif
902
903 int ptlrpc_unregister_service(struct ptlrpc_service *service)
904 {
905         int                   i;
906         int                   rc;
907         unsigned long         flags;
908         struct ptlrpc_srv_ni *srv_ni;
909         struct l_wait_info    lwi;
910         struct list_head     *tmp;
911
912         LASSERT(list_empty(&service->srv_threads));
913
914         spin_lock (&ptlrpc_all_services_lock);
915         list_del_init (&service->srv_list);
916         spin_unlock (&ptlrpc_all_services_lock);
917
918         ptlrpc_lprocfs_unregister_service(service);
919
920         for (i = 0; i < ptlrpc_ninterfaces; i++) {
921                 srv_ni = &service->srv_interfaces[i];
922                 CDEBUG(D_NET, "%s: tearing down interface %s\n",
923                        service->srv_name, srv_ni->sni_ni->pni_name);
924
925                 /* Unlink all the request buffers.  This forces a 'final'
926                  * event with its 'unlink' flag set for each posted rqbd */
927                 list_for_each(tmp, &srv_ni->sni_active_rqbds) {
928                         struct ptlrpc_request_buffer_desc *rqbd =
929                                 list_entry(tmp, struct ptlrpc_request_buffer_desc, 
930                                            rqbd_list);
931
932                         rc = PtlMDUnlink(rqbd->rqbd_md_h);
933                         LASSERT (rc == PTL_OK || rc == PTL_MD_INVALID);
934                 }
935
936                 /* Wait for the network to release any buffers it's
937                  * currently filling */
938                 for (;;) {
939                         spin_lock_irqsave(&service->srv_lock, flags);
940                         rc = srv_ni->sni_nrqbd_receiving;
941                         spin_unlock_irqrestore(&service->srv_lock, flags);
942
943                         if (rc == 0)
944                                 break;
945                         
946                         /* Network access will complete in finite time but
947                          * the HUGE timeout lets us CWARN for visibility of
948                          * sluggish NALs */
949                         lwi = LWI_TIMEOUT(300 * HZ, NULL, NULL);
950                         rc = l_wait_event(service->srv_waitq,
951                                           srv_ni->sni_nrqbd_receiving == 0,
952                                           &lwi);
953                         if (rc == -ETIMEDOUT)
954                                 CWARN("Waiting for request buffers on "
955                                       "service %s on interface %s ",
956                                       service->srv_name, srv_ni->sni_ni->pni_name);
957                 }
958
959                 /* schedule all outstanding replies to terminate them */
960                 spin_lock_irqsave(&service->srv_lock, flags);
961                 while (!list_empty(&srv_ni->sni_active_replies)) {
962                         struct ptlrpc_reply_state *rs =
963                                 list_entry(srv_ni->sni_active_replies.next,
964                                            struct ptlrpc_reply_state,
965                                            rs_list);
966                         ptlrpc_schedule_difficult_reply(rs);
967                 }
968                 spin_unlock_irqrestore(&service->srv_lock, flags);
969         }
970
971         /* purge the request queue.  NB No new replies (rqbds all unlinked)
972          * and no service threads, so I'm the only thread noodling the
973          * request queue now */
974         while (!list_empty(&service->srv_request_queue)) {
975                 struct ptlrpc_request *req =
976                         list_entry(service->srv_request_queue.next,
977                                    struct ptlrpc_request,
978                                    rq_list);
979                 
980                 list_del(&req->rq_list);
981                 service->srv_n_queued_reqs--;
982                 service->srv_n_active_reqs++;
983
984                 ptlrpc_server_free_request(service, req);
985         }
986         LASSERT(service->srv_n_queued_reqs == 0);
987         LASSERT(service->srv_n_active_reqs == 0);
988
989         for (i = 0; i < ptlrpc_ninterfaces; i++) {
990                 srv_ni = &service->srv_interfaces[i];
991                 LASSERT(list_empty(&srv_ni->sni_active_rqbds));
992         }
993
994         /* Now free all the request buffers since nothing references them
995          * any more... */
996         while (!list_empty(&service->srv_idle_rqbds)) {
997                 struct ptlrpc_request_buffer_desc *rqbd =
998                         list_entry(service->srv_idle_rqbds.next,
999                                    struct ptlrpc_request_buffer_desc, 
1000                                    rqbd_list);
1001
1002                 ptlrpc_free_rqbd(rqbd);
1003         }
1004
1005         /* wait for all outstanding replies to complete (they were
1006          * scheduled having been flagged to abort above) */
1007         while (atomic_read(&service->srv_outstanding_replies) != 0) {
1008                 struct l_wait_info lwi = LWI_TIMEOUT(10 * HZ, NULL, NULL);
1009
1010                 rc = l_wait_event(service->srv_waitq,
1011                                   !list_empty(&service->srv_reply_queue), &lwi);
1012                 LASSERT(rc == 0 || rc == -ETIMEDOUT);
1013
1014                 if (rc == 0) {
1015                         ptlrpc_server_handle_reply(service);
1016                         continue;
1017                 }
1018                 CWARN("Unexpectedly long timeout %p\n", service);
1019         }
1020
1021         OBD_FREE(service,
1022                  offsetof(struct ptlrpc_service,
1023                           srv_interfaces[ptlrpc_ninterfaces]));
1024         return 0;
1025 }