lustre/ptlrpc/service.c

   1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   2  * vim:expandtab:shiftwidth=8:tabstop=8:
   3  *
   4  *  Copyright (C) 2002 Cluster File Systems, Inc.
   5  *
   6  *   This file is part of Lustre, http://www.lustre.org.
   7  *
   8  *   Lustre is free software; you can redistribute it and/or
   9  *   modify it under the terms of version 2 of the GNU General Public
  10  *   License as published by the Free Software Foundation.
  11  *
  12  *   Lustre is distributed in the hope that it will be useful,
  13  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  *   GNU General Public License for more details.
  16  *
  17  *   You should have received a copy of the GNU General Public License
  18  *   along with Lustre; if not, write to the Free Software
  19  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20  *
  21  */
  22
  23 #define DEBUG_SUBSYSTEM S_RPC
  24 #ifndef __KERNEL__
  25 #include <liblustre.h>
  26 #include <linux/kp30.h>
  27 #endif
  28 #include <linux/obd_support.h>
  29 #include <linux/obd_class.h>
  30 #include <linux/lustre_net.h>
  31 #include <linux/lustre_log.h>
  32 #include <portals/types.h>
  33 #include "ptlrpc_internal.h"
  34
   35 /* forward ref: defined further down, but called earlier by ptlrpc_grow_req_bufs() */
   36 static int ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc);
   37
   38 /* Services are linked here through srv_list; additions are made under the lock below */
   39 static LIST_HEAD (ptlrpc_all_services);
   40 static spinlock_t ptlrpc_all_services_lock = SPIN_LOCK_UNLOCKED;
  41
  42 static void
  43 ptlrpc_free_server_req (struct ptlrpc_request *req)
  44 {
  45         /* The last request to be received into a request buffer uses space
  46          * in the request buffer descriptor, otherwise requests are
  47          * allocated dynamically in the incoming reply event handler */
  48         if (req == &req->rq_rqbd->rqbd_req)
  49                 return;
  50
  51         OBD_FREE(req, sizeof(*req));
  52 }
  53
  54 static char *
  55 ptlrpc_alloc_request_buffer (int size)
  56 {
  57         char *ptr;
  58
  59         if (size > SVC_BUF_VMALLOC_THRESHOLD)
  60                 OBD_VMALLOC(ptr, size);
  61         else
  62                 OBD_ALLOC(ptr, size);
  63
  64         return (ptr);
  65 }
  66
  67 static void
  68 ptlrpc_free_request_buffer (char *ptr, int size)
  69 {
  70         if (size > SVC_BUF_VMALLOC_THRESHOLD)
  71                 OBD_VFREE(ptr, size);
  72         else
  73                 OBD_FREE(ptr, size);
  74 }
  75
  76 struct ptlrpc_request_buffer_desc *
  77 ptlrpc_alloc_rqbd (struct ptlrpc_srv_ni *srv_ni)
  78 {
  79         struct ptlrpc_service             *svc = srv_ni->sni_service;
  80         unsigned long                      flags;
  81         struct ptlrpc_request_buffer_desc *rqbd;
  82
  83         OBD_ALLOC(rqbd, sizeof (*rqbd));
  84         if (rqbd == NULL)
  85                 return (NULL);
  86
  87         rqbd->rqbd_srv_ni = srv_ni;
  88         rqbd->rqbd_refcount = 0;
  89         rqbd->rqbd_cbid.cbid_fn = request_in_callback;
  90         rqbd->rqbd_cbid.cbid_arg = rqbd;
  91         rqbd->rqbd_buffer = ptlrpc_alloc_request_buffer(svc->srv_buf_size);
  92
  93         if (rqbd->rqbd_buffer == NULL) {
  94                 OBD_FREE(rqbd, sizeof (*rqbd));
  95                 return (NULL);
  96         }
  97
  98         spin_lock_irqsave (&svc->srv_lock, flags);
  99         list_add(&rqbd->rqbd_list, &svc->srv_idle_rqbds);
 100         svc->srv_nbufs++;
 101         spin_unlock_irqrestore (&svc->srv_lock, flags);
 102
 103         return (rqbd);
 104 }
 105
 106 void
 107 ptlrpc_free_rqbd(struct ptlrpc_request_buffer_desc *rqbd)
 108 {
 109         struct ptlrpc_srv_ni  *sni = rqbd->rqbd_srv_ni;
 110         struct ptlrpc_service *svc = sni->sni_service;
 111         unsigned long          flags;
 112
 113         LASSERT (rqbd->rqbd_refcount == 0);
 114
 115         spin_lock_irqsave(&svc->srv_lock, flags);
 116         list_del(&rqbd->rqbd_list);
 117         svc->srv_nbufs--;
 118         spin_unlock_irqrestore(&svc->srv_lock, flags);
 119
 120         ptlrpc_free_request_buffer (rqbd->rqbd_buffer, svc->srv_buf_size);
 121         OBD_FREE (rqbd, sizeof (*rqbd));
 122 }
 123
 124 int
 125 ptlrpc_grow_req_bufs(struct ptlrpc_srv_ni *srv_ni)
 126 {
 127         struct ptlrpc_service             *svc = srv_ni->sni_service;
 128         struct ptlrpc_request_buffer_desc *rqbd;
 129         int                                i;
 130
 131         for (i = 0; i < svc->srv_nbuf_per_group; i++) {
 132                 rqbd = ptlrpc_alloc_rqbd(srv_ni);
 133
 134                 if (rqbd == NULL) {
 135                         CERROR ("%s/%s: Can't allocate request buffer\n",
 136                                 svc->srv_name, srv_ni->sni_ni->pni_name);
 137                         return (-ENOMEM);
 138                 }
 139
 140                 if (ptlrpc_server_post_idle_rqbds(svc) < 0)
 141                         return (-EAGAIN);
 142         }
 143
 144         return (0);
 145 }
 146
 147 void
 148 ptlrpc_save_llog_lock(struct ptlrpc_request *req, struct llog_create_locks *lcl)
 149 {
 150         struct ptlrpc_reply_state *rs = req->rq_reply_state;
 151         LASSERT (rs != NULL);
 152         LASSERT (rs->rs_llog_locks == NULL);
 153
 154         rs->rs_llog_locks = lcl;
 155 }
 156
 157 void
 158 ptlrpc_require_repack(struct ptlrpc_request *req)
 159 {
 160         struct ptlrpc_reply_state *rs = req->rq_reply_state;
 161         LASSERT (rs != NULL);
 162         rs->rs_difficult = 1;
 163 }
 164
 165 void
 166 ptlrpc_save_lock (struct ptlrpc_request *req,
 167                   struct lustre_handle *lock, int mode)
 168 {
 169         struct ptlrpc_reply_state *rs = req->rq_reply_state;
 170         int                        idx;
 171
 172         if (!lock->cookie)
 173                 return;
 174
 175         LASSERT (rs != NULL);
 176         LASSERT (rs->rs_nlocks < RS_MAX_LOCKS);
 177
 178         idx = rs->rs_nlocks++;
 179         rs->rs_locks[idx] = *lock;
 180         rs->rs_modes[idx] = mode;
 181         rs->rs_difficult = 1;
 182 }
 183
 184 void
 185 ptlrpc_schedule_difficult_reply (struct ptlrpc_reply_state *rs)
 186 {
 187         struct ptlrpc_service *svc = rs->rs_srv_ni->sni_service;
 188
 189 #ifdef CONFIG_SMP
 190         LASSERT (spin_is_locked (&svc->srv_lock));
 191 #endif
 192         LASSERT (rs->rs_difficult);
 193         rs->rs_scheduled_ever = 1;              /* flag any notification attempt */
 194
 195         if (rs->rs_scheduled)                   /* being set up or already notified */
 196                 return;
 197
 198         rs->rs_scheduled = 1;
 199         list_del (&rs->rs_list);
 200         list_add (&rs->rs_list, &svc->srv_reply_queue);
 201         wake_up (&svc->srv_waitq);
 202 }
 203
 204 void
 205 ptlrpc_commit_replies (struct obd_device *obd)
 206 {
 207         struct list_head   *tmp;
 208         struct list_head   *nxt;
 209         unsigned long       flags;
 210
 211         /* Find any replies that have been committed and get their service
 212          * to attend to complete them. */
 213
 214         /* CAVEAT EMPTOR: spinlock ordering!!! */
 215         spin_lock_irqsave (&obd->obd_uncommitted_replies_lock, flags);
 216
 217         list_for_each_safe (tmp, nxt, &obd->obd_uncommitted_replies) {
 218                 struct ptlrpc_reply_state *rs =
 219                         list_entry (tmp, struct ptlrpc_reply_state, rs_obd_list);
 220                 struct llog_create_locks *lcl = rs->rs_llog_locks;
 221
 222                 rs->rs_llog_locks = NULL;
 223                 LASSERT (rs->rs_difficult);
 224
 225                 if (rs->rs_transno <= obd->obd_last_committed) {
 226                         struct ptlrpc_service *svc = rs->rs_srv_ni->sni_service;
 227
 228                         spin_lock (&svc->srv_lock);
 229                         list_del_init (&rs->rs_obd_list);
 230                         ptlrpc_schedule_difficult_reply (rs);
 231                         spin_unlock (&svc->srv_lock);
 232
 233                         if (lcl != NULL)
 234                                 llog_create_lock_free(lcl);
 235                 }
 236         }
 237
 238         spin_unlock_irqrestore (&obd->obd_uncommitted_replies_lock, flags);
 239 }
 240
 241 static long
 242 timeval_sub(struct timeval *large, struct timeval *small)
 243 {
 244         return (large->tv_sec - small->tv_sec) * 1000000 +
 245                 (large->tv_usec - small->tv_usec);
 246 }
 247
 248 static int
 249 ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc)
 250 {
 251         struct ptlrpc_srv_ni              *srv_ni;
 252         struct ptlrpc_request_buffer_desc *rqbd;
 253         unsigned long                      flags;
 254         int                                rc;
 255         int                                posted = 0;
 256
 257         for (;;) {
 258                 spin_lock_irqsave(&svc->srv_lock, flags);
 259                 if (list_empty (&svc->srv_idle_rqbds)) {
 260                         spin_unlock_irqrestore(&svc->srv_lock, flags);
 261                         return (posted);
 262                 }
 263                 rqbd = list_entry(svc->srv_idle_rqbds.next,
 264                                   struct ptlrpc_request_buffer_desc,
 265                                   rqbd_list);
 266                 list_del (&rqbd->rqbd_list);
 267
 268                 /* assume we will post successfully */
 269                 srv_ni = rqbd->rqbd_srv_ni;
 270                 srv_ni->sni_nrqbd_receiving++;
 271                 list_add (&rqbd->rqbd_list, &srv_ni->sni_active_rqbds);
 272                 spin_unlock_irqrestore(&svc->srv_lock, flags);
 273
 274                 rc = ptlrpc_register_rqbd(rqbd);
 275                 if (rc != 0)
 276                         break;
 277
 278                 posted = 1;
 279         }
 280
 281
 282         spin_lock_irqsave(&svc->srv_lock, flags);
 283
 284         srv_ni->sni_nrqbd_receiving--;
 285         list_del(&rqbd->rqbd_list);
 286         list_add_tail(&rqbd->rqbd_list, &svc->srv_idle_rqbds);
 287
 288         if (srv_ni->sni_nrqbd_receiving == 0) {
 289                 /* This service is off-air on this interface because all
 290                  * its request buffers are busy.  Portals will have started
 291                  * dropping incoming requests until more buffers get
 292                  * posted */
 293                 CERROR("All %s %s request buffers busy\n",
 294                        svc->srv_name, srv_ni->sni_ni->pni_name);
 295         }
 296
 297         spin_unlock_irqrestore (&svc->srv_lock, flags);
 298
 299         return (-1);
 300 }
 301
 302 struct ptlrpc_service *
 303 ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size,
 304                 int req_portal, int rep_portal,
 305                 svc_handler_t handler, char *name,
 306                 struct proc_dir_entry *proc_entry)
 307 {
 308         int                                i;
 309         int                                rc;
 310         int                                ssize;
 311         struct ptlrpc_service             *service;
 312         struct ptlrpc_srv_ni              *srv_ni;
 313         ENTRY;
 314
 315         LASSERT (ptlrpc_ninterfaces > 0);
 316         LASSERT (nbufs > 0);
 317         LASSERT (bufsize >= max_req_size);
 318
 319         ssize = offsetof (struct ptlrpc_service,
 320                           srv_interfaces[ptlrpc_ninterfaces]);
 321         OBD_ALLOC(service, ssize);
 322         if (service == NULL)
 323                 RETURN(NULL);
 324
 325         service->srv_name = name;
 326         spin_lock_init(&service->srv_lock);
 327         INIT_LIST_HEAD(&service->srv_threads);
 328         init_waitqueue_head(&service->srv_waitq);
 329
 330         service->srv_nbuf_per_group = nbufs;
 331         service->srv_max_req_size = max_req_size;
 332         service->srv_buf_size = bufsize;
 333         service->srv_rep_portal = rep_portal;
 334         service->srv_req_portal = req_portal;
 335         service->srv_handler = handler;
 336
 337         INIT_LIST_HEAD(&service->srv_request_queue);
 338         INIT_LIST_HEAD(&service->srv_idle_rqbds);
 339         INIT_LIST_HEAD(&service->srv_reply_queue);
 340
 341         /* First initialise enough for early teardown */
 342         for (i = 0; i < ptlrpc_ninterfaces; i++) {
 343                 srv_ni = &service->srv_interfaces[i];
 344
 345                 srv_ni->sni_service = service;
 346                 srv_ni->sni_ni = &ptlrpc_interfaces[i];
 347                 INIT_LIST_HEAD(&srv_ni->sni_active_rqbds);
 348                 INIT_LIST_HEAD(&srv_ni->sni_active_replies);
 349         }
 350
 351         spin_lock (&ptlrpc_all_services_lock);
 352         list_add (&service->srv_list, &ptlrpc_all_services);
 353         spin_unlock (&ptlrpc_all_services_lock);
 354
 355         /* Now allocate the request buffers, assuming all interfaces require
 356          * the same number. */
 357         for (i = 0; i < ptlrpc_ninterfaces; i++) {
 358                 srv_ni = &service->srv_interfaces[i];
 359                 CDEBUG (D_NET, "%s: initialising interface %s\n", name,
 360                         srv_ni->sni_ni->pni_name);
 361
 362                 rc = ptlrpc_grow_req_bufs(srv_ni);
 363                 /* We shouldn't be under memory pressure at startup, so
 364                  * fail if we can't post all our buffers at this time. */
 365                 if (rc != 0)
 366                         GOTO(failed, NULL);
 367         }
 368
 369         if (proc_entry != NULL)
 370                 ptlrpc_lprocfs_register_service(proc_entry, service);
 371
 372         CDEBUG(D_NET, "%s: Started on %d interfaces, listening on portal %d\n",
 373                service->srv_name, ptlrpc_ninterfaces, service->srv_req_portal);
 374
 375         RETURN(service);
 376 failed:
 377         ptlrpc_unregister_service(service);
 378         return NULL;
 379 }
 380
 381 static void
 382 ptlrpc_server_free_request(struct ptlrpc_service *svc, struct ptlrpc_request *req)
 383 {
 384         unsigned long  flags;
 385         int            refcount;
 386
 387         spin_lock_irqsave(&svc->srv_lock, flags);
 388         svc->srv_n_active_reqs--;
 389         refcount = --(req->rq_rqbd->rqbd_refcount);
 390         if (refcount == 0) {
 391                 /* request buffer is now idle */
 392                 list_del(&req->rq_rqbd->rqbd_list);
 393                 list_add_tail(&req->rq_rqbd->rqbd_list,
 394                               &svc->srv_idle_rqbds);
 395         }
 396         spin_unlock_irqrestore(&svc->srv_lock, flags);
 397
 398         ptlrpc_free_server_req(req);
 399 }
 400
 401 static char str[PTL_NALFMT_SIZE];
 402 static int
 403 ptlrpc_server_handle_request (struct ptlrpc_service *svc)
 404 {
 405         struct obd_export     *export = NULL;
 406         struct ptlrpc_request *request;
 407         unsigned long          flags;
 408         struct timeval         work_start;
 409         struct timeval         work_end;
 410         long                   timediff;
 411         int                    rc;
 412         ENTRY;
 413
 414         spin_lock_irqsave (&svc->srv_lock, flags);
 415         if (list_empty (&svc->srv_request_queue) ||
 416             (svc->srv_n_difficult_replies != 0 &&
 417              svc->srv_n_active_reqs >= (svc->srv_nthreads - 1))) {
 418                 /* If all the other threads are handling requests, I must
 419                  * remain free to handle any 'difficult' reply that might
 420                  * block them */
 421                 spin_unlock_irqrestore (&svc->srv_lock, flags);
 422                 RETURN(0);
 423         }
 424
 425         request = list_entry (svc->srv_request_queue.next,
 426                               struct ptlrpc_request, rq_list);
 427         list_del_init (&request->rq_list);
 428         svc->srv_n_queued_reqs--;
 429         svc->srv_n_active_reqs++;
 430
 431         spin_unlock_irqrestore (&svc->srv_lock, flags);
 432
 433         do_gettimeofday(&work_start);
 434         timediff = timeval_sub(&work_start, &request->rq_arrival_time);
 435         if (svc->srv_stats != NULL) {
 436                 lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR,
 437                                     timediff);
 438                 lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR,
 439                                     svc->srv_n_queued_reqs);
 440                 lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR,
 441                                     svc->srv_n_active_reqs);
 442         }
 443
 444 #if SWAB_PARANOIA
 445         /* Clear request swab mask; this is a new request */
 446         request->rq_req_swab_mask = 0;
 447 #endif
 448         rc = lustre_unpack_msg (request->rq_reqmsg, request->rq_reqlen);
 449         if (rc != 0) {
 450                 CERROR ("error unpacking request: ptl %d from %s"
 451                         " xid "LPU64"\n", svc->srv_req_portal,
 452                         ptlrpc_peernid2str(&request->rq_peer, str),
 453                        request->rq_xid);
 454                 goto out;
 455         }
 456
 457         rc = -EINVAL;
 458         if (request->rq_reqmsg->type != PTL_RPC_MSG_REQUEST) {
 459                 CERROR("wrong packet type received (type=%u) from %s\n",
 460                        request->rq_reqmsg->type,
 461                        ptlrpc_peernid2str(&request->rq_peer, str));
 462                 goto out;
 463         }
 464
 465         CDEBUG(D_NET, "got req "LPD64"\n", request->rq_xid);
 466
 467         /* Discard requests queued for longer than my timeout.  If the
 468          * client's timeout is similar to mine, she'll be timing out this
 469          * REQ anyway (bug 1502) */
 470         if (timediff / 1000000 > (long)obd_timeout) {
 471                 CERROR("Dropping timed-out opc %d request from %s"
 472                        ": %ld seconds old\n", request->rq_reqmsg->opc,
 473                        ptlrpc_peernid2str(&request->rq_peer, str),
 474                        timediff / 1000000);
 475                 goto out;
 476         }
 477
 478         request->rq_export = class_conn2export(&request->rq_reqmsg->handle);
 479
 480         if (request->rq_export) {
 481                 if (request->rq_reqmsg->conn_cnt <
 482                     request->rq_export->exp_conn_cnt) {
 483                         DEBUG_REQ(D_ERROR, request,
 484                                   "DROPPING req from old connection %d < %d",
 485                                   request->rq_reqmsg->conn_cnt,
 486                                   request->rq_export->exp_conn_cnt);
 487                         goto put_conn;
 488                 }
 489
 490                 export = class_export_rpc_get(request->rq_export);
 491                 request->rq_export->exp_last_request_time =
 492                         LTIME_S(CURRENT_TIME);
 493         }
 494
 495         CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:ni:nid:opc "
 496                "%s:%s+%d:%d:"LPU64":%s:%s:%d\n", current->comm,
 497                (request->rq_export ?
 498                 (char *)request->rq_export->exp_client_uuid.uuid : "0"),
 499                (request->rq_export ?
 500                 atomic_read(&request->rq_export->exp_refcount) : -99),
 501                request->rq_reqmsg->status, request->rq_xid,
 502                request->rq_peer.peer_ni->pni_name,
 503                ptlrpc_peernid2str(&request->rq_peer, str),
 504                request->rq_reqmsg->opc);
 505         request->rq_svc = svc;
 506         rc = svc->srv_handler(request);
 507         request->rq_svc = NULL;
 508         CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:ni:nid:opc "
 509                "%s:%s+%d:%d:"LPU64":%s:%s:%d\n", current->comm,
 510                (request->rq_export ?
 511                 (char *)request->rq_export->exp_client_uuid.uuid : "0"),
 512                (request->rq_export ?
 513                 atomic_read(&request->rq_export->exp_refcount) : -99),
 514                request->rq_reqmsg->status, request->rq_xid,
 515                request->rq_peer.peer_ni->pni_name,
 516                ptlrpc_peernid2str(&request->rq_peer, str),
 517                request->rq_reqmsg->opc);
 518
 519         if (export != NULL)
 520                 class_export_rpc_put(export);
 521
 522 put_conn:
 523         if (request->rq_export != NULL)
 524                 class_export_put(request->rq_export);
 525
 526  out:
 527         do_gettimeofday(&work_end);
 528
 529         timediff = timeval_sub(&work_end, &work_start);
 530
 531         CDEBUG((timediff / 1000000 > (long)obd_timeout) ? D_ERROR : D_HA,
 532                "request "LPU64" opc %u from NID %s processed in %ldus "
 533                "(%ldus total)\n", request->rq_xid, request->rq_reqmsg->opc,
 534                ptlrpc_peernid2str(&request->rq_peer, str),
 535                timediff, timeval_sub(&work_end, &request->rq_arrival_time));
 536
 537         if (svc->srv_stats != NULL) {
 538                 int opc = opcode_offset(request->rq_reqmsg->opc);
 539                 if (opc > 0) {
 540                         LASSERT(opc < LUSTRE_MAX_OPCODES);
 541                         lprocfs_counter_add(svc->srv_stats,
 542                                             opc + PTLRPC_LAST_CNTR,
 543                                             timediff);
 544                 }
 545         }
 546
 547         ptlrpc_server_free_request(svc, request);
 548
 549         RETURN(1);
 550 }
 551
 552 static int
 553 ptlrpc_server_handle_reply (struct ptlrpc_service *svc)
 554 {
 555         struct ptlrpc_reply_state *rs;
 556         unsigned long              flags;
 557         struct obd_export         *exp;
 558         struct obd_device         *obd;
 559         struct llog_create_locks  *lcl;
 560         int                        nlocks;
 561         int                        been_handled;
 562         ENTRY;
 563
 564         spin_lock_irqsave (&svc->srv_lock, flags);
 565         if (list_empty (&svc->srv_reply_queue)) {
 566                 spin_unlock_irqrestore (&svc->srv_lock, flags);
 567                 RETURN(0);
 568         }
 569
 570         rs = list_entry (svc->srv_reply_queue.next,
 571                          struct ptlrpc_reply_state, rs_list);
 572
 573         exp = rs->rs_export;
 574         obd = exp->exp_obd;
 575
 576         LASSERT (rs->rs_difficult);
 577         LASSERT (rs->rs_scheduled);
 578
 579         list_del_init (&rs->rs_list);
 580
 581         /* Disengage from notifiers carefully (lock ordering!) */
 582         spin_unlock(&svc->srv_lock);
 583
 584         spin_lock (&obd->obd_uncommitted_replies_lock);
 585         /* Noop if removed already */
 586         list_del_init (&rs->rs_obd_list);
 587         spin_unlock (&obd->obd_uncommitted_replies_lock);
 588
 589         spin_lock (&exp->exp_lock);
 590         /* Noop if removed already */
 591         list_del_init (&rs->rs_exp_list);
 592         spin_unlock (&exp->exp_lock);
 593
 594         spin_lock(&svc->srv_lock);
 595
 596         been_handled = rs->rs_handled;
 597         rs->rs_handled = 1;
 598
 599         nlocks = rs->rs_nlocks;                 /* atomic "steal", but */
 600         rs->rs_nlocks = 0;                      /* locks still on rs_locks! */
 601
 602         lcl = rs->rs_llog_locks;
 603         rs->rs_llog_locks = NULL;
 604
 605         if (nlocks == 0 && !been_handled) {
 606                 /* If we see this, we should already have seen the warning
 607                  * in mds_steal_ack_locks()  */
 608 #if 0
 609                 char str[PTL_NALFMT_SIZE];
 610                 /* CMD may ask to save request with no DLM locks -bzzz */
 611                 CWARN("All locks stolen from rs %p x"LPD64".t"LPD64
 612                       " o%d NID %s\n",
 613                       rs,
 614                       rs->rs_xid, rs->rs_transno,
 615                       rs->rs_msg.opc,
 616                       ptlrpc_peernid2str(&exp->exp_connection->c_peer, str));
 617 #endif
 618         }
 619
 620         if ((!been_handled && rs->rs_on_net) ||
 621             nlocks > 0 || lcl != NULL) {
 622                 spin_unlock_irqrestore(&svc->srv_lock, flags);
 623
 624                 if (!been_handled && rs->rs_on_net) {
 625                         PtlMDUnlink(rs->rs_md_h);
 626                         /* Ignore return code; we're racing with
 627                          * completion... */
 628                 }
 629
 630                 while (nlocks-- > 0)
 631                         ldlm_lock_decref(&rs->rs_locks[nlocks],
 632                                          rs->rs_modes[nlocks]);
 633
 634                 if (lcl != NULL)
 635                         llog_create_lock_free(lcl);
 636
 637                 spin_lock_irqsave(&svc->srv_lock, flags);
 638         }
 639
 640         rs->rs_scheduled = 0;
 641
 642         if (!rs->rs_on_net) {
 643                 /* Off the net */
 644                 svc->srv_n_difficult_replies--;
 645                 spin_unlock_irqrestore(&svc->srv_lock, flags);
 646
 647                 class_export_put (exp);
 648                 rs->rs_export = NULL;
 649                 lustre_free_reply_state (rs);
 650                 atomic_dec (&svc->srv_outstanding_replies);
 651                 RETURN(1);
 652         }
 653
 654         /* still on the net; callback will schedule */
 655         spin_unlock_irqrestore (&svc->srv_lock, flags);
 656         RETURN(1);
 657 }
 658
 659 #ifndef __KERNEL__
 660 /* FIXME make use of timeout later */
 661 int
 662 liblustre_check_services (void *arg)
 663 {
 664         int  did_something = 0;
 665         int  rc;
 666         struct list_head *tmp, *nxt;
 667         ENTRY;
 668
 669         /* I'm relying on being single threaded, not to have to lock
 670          * ptlrpc_all_services etc */
 671         list_for_each_safe (tmp, nxt, &ptlrpc_all_services) {
 672                 struct ptlrpc_service *svc =
 673                         list_entry (tmp, struct ptlrpc_service, srv_list);
 674
 675                 if (svc->srv_nthreads != 0)     /* I've recursed */
 676                         continue;
 677
 678                 /* service threads can block for bulk, so this limits us
 679                  * (arbitrarily) to recursing 1 stack frame per service.
 680                  * Note that the problem with recursion is that we have to
 681                  * unwind completely before our caller can resume. */
 682
 683                 svc->srv_nthreads++;
 684
 685                 do {
 686                         rc = ptlrpc_server_handle_reply(svc);
 687                         rc |= ptlrpc_server_handle_request(svc);
 688                         rc |= (ptlrpc_server_post_idle_rqbds(svc) > 0);
 689                         did_something |= rc;
 690                 } while (rc);
 691
 692                 svc->srv_nthreads--;
 693         }
 694
 695         RETURN(did_something);
 696 }
 697
 698 #else /* __KERNEL__ */
 699
 700 /* Don't use daemonize, it removes fs struct from new thread (bug 418) */
 701 void ptlrpc_daemonize(void)
 702 {
 703         exit_mm(current);
 704         lustre_daemonize_helper();
 705         exit_files(current);
 706         reparent_to_init();
 707 }
 708
 709 static void
 710 ptlrpc_check_rqbd_pools(struct ptlrpc_service *svc)
 711 {
 712         struct ptlrpc_srv_ni  *sni;
 713         int                    i;
 714         int                    avail = 0;
 715         int                    low_water = svc->srv_nbuf_per_group/2;
 716
 717         for (i = 0; i < ptlrpc_ninterfaces; i++) {
 718                 sni = &svc->srv_interfaces[i];
 719
 720                 avail += sni->sni_nrqbd_receiving;
 721                 /* NB I'm not locking; just looking. */
 722                 if (sni->sni_nrqbd_receiving <= low_water)
 723                         ptlrpc_grow_req_bufs(sni);
 724         }
 725
 726         lprocfs_counter_add(svc->srv_stats, PTLRPC_REQBUF_AVAIL_CNTR, avail);
 727 }
 728
 729 static int
 730 ptlrpc_retry_rqbds(void *arg)
 731 {
 732         struct ptlrpc_service *svc = (struct ptlrpc_service *)arg;
 733
 734         svc->srv_rqbd_timeout = 0;
 735         return (-ETIMEDOUT);
 736 }
 737
 738 static int ptlrpc_main(void *arg)
 739 {
 740         struct ptlrpc_svc_data *data = (struct ptlrpc_svc_data *)arg;
 741         struct ptlrpc_service  *svc = data->svc;
 742         struct ptlrpc_thread   *thread = data->thread;
 743         unsigned long           flags;
 744         ENTRY;
 745
 746         lock_kernel();
 747         ptlrpc_daemonize();
 748
 749         SIGNAL_MASK_LOCK(current, flags);
 750         sigfillset(&current->blocked);
 751         RECALC_SIGPENDING;
 752         SIGNAL_MASK_UNLOCK(current, flags);
 753
 754         LASSERTF(strlen(data->name) < sizeof(current->comm),
 755                  "name %d > len %d\n",
 756                  (int)strlen(data->name), (int)sizeof(current->comm));
 757         THREAD_NAME(current->comm, sizeof(current->comm) - 1, "%s", data->name);
 758
 759         unlock_kernel();
 760
 761         /* Record that the thread is running */
 762         thread->t_flags = SVC_RUNNING;
 763         wake_up(&thread->t_ctl_waitq);
 764
 765         spin_lock_irqsave(&svc->srv_lock, flags);
 766         svc->srv_nthreads++;
 767         spin_unlock_irqrestore(&svc->srv_lock, flags);
 768
 769         /* XXX maintain a list of all managed devices: insert here */
 770
 771         while ((thread->t_flags & SVC_STOPPING) == 0 ||
 772                svc->srv_n_difficult_replies != 0) {
 773                 /* Don't exit while there are replies to be handled */
 774                 struct l_wait_info lwi = LWI_TIMEOUT(svc->srv_rqbd_timeout,
 775                                                      ptlrpc_retry_rqbds, svc);
 776
 777                 l_wait_event_exclusive (svc->srv_waitq,
 778                               ((thread->t_flags & SVC_STOPPING) != 0 &&
 779                                svc->srv_n_difficult_replies == 0) ||
 780                               (!list_empty(&svc->srv_idle_rqbds) &&
 781                                svc->srv_rqbd_timeout == 0) ||
 782                               !list_empty (&svc->srv_reply_queue) ||
 783                               (!list_empty (&svc->srv_request_queue) &&
 784                                (svc->srv_n_difficult_replies == 0 ||
 785                                 svc->srv_n_active_reqs <
 786                                 (svc->srv_nthreads - 1))),
 787                               &lwi);
 788
 789                 ptlrpc_check_rqbd_pools(svc);
 790
 791                 if (!list_empty (&svc->srv_reply_queue))
 792                         ptlrpc_server_handle_reply (svc);
 793
 794                 /* only handle requests if there are no difficult replies
 795                  * outstanding, or I'm not the last thread handling
 796                  * requests */
 797                 if (!list_empty (&svc->srv_request_queue) &&
 798                     (svc->srv_n_difficult_replies == 0 ||
 799                      svc->srv_n_active_reqs < (svc->srv_nthreads - 1)))
 800                         ptlrpc_server_handle_request (svc);
 801
 802                 if (!list_empty(&svc->srv_idle_rqbds) &&
 803                     ptlrpc_server_post_idle_rqbds(svc) < 0) {
 804                         /* I just failed to repost request buffers.  Wait
 805                          * for a timeout (unless something else happens)
 806                          * before I try again */
 807                         svc->srv_rqbd_timeout = HZ/10;
 808                 }
 809         }
 810
 811         spin_lock_irqsave(&svc->srv_lock, flags);
 812
 813         svc->srv_nthreads--;                    /* must know immediately */
 814         thread->t_flags = SVC_STOPPED;
 815         wake_up(&thread->t_ctl_waitq);
 816
 817         spin_unlock_irqrestore(&svc->srv_lock, flags);
 818
 819         CDEBUG(D_NET, "service thread exiting, process %d\n", current->pid);
 820         return 0;
 821 }
 822
 823 static void ptlrpc_stop_thread(struct ptlrpc_service *svc,
 824                                struct ptlrpc_thread *thread)
 825 {
 826         struct l_wait_info lwi = { 0 };
 827         unsigned long      flags;
 828
 829         spin_lock_irqsave(&svc->srv_lock, flags);
 830         thread->t_flags = SVC_STOPPING;
 831         spin_unlock_irqrestore(&svc->srv_lock, flags);
 832
 833         wake_up_all(&svc->srv_waitq);
 834         l_wait_event(thread->t_ctl_waitq, (thread->t_flags & SVC_STOPPED),
 835                      &lwi);
 836
 837         spin_lock_irqsave(&svc->srv_lock, flags);
 838         list_del(&thread->t_link);
 839         spin_unlock_irqrestore(&svc->srv_lock, flags);
 840
 841         OBD_FREE(thread, sizeof(*thread));
 842 }
 843
 844 void ptlrpc_stop_all_threads(struct ptlrpc_service *svc)
 845 {
 846         unsigned long flags;
 847         struct ptlrpc_thread *thread;
 848
 849         spin_lock_irqsave(&svc->srv_lock, flags);
 850         while (!list_empty(&svc->srv_threads)) {
 851                 thread = list_entry(svc->srv_threads.next,
 852                                     struct ptlrpc_thread, t_link);
 853
 854                 spin_unlock_irqrestore(&svc->srv_lock, flags);
 855                 ptlrpc_stop_thread(svc, thread);
 856                 spin_lock_irqsave(&svc->srv_lock, flags);
 857         }
 858
 859         spin_unlock_irqrestore(&svc->srv_lock, flags);
 860 }
 861
 862 /* @base_name should be 12 characters or less - 3 will be added on */
 863 int ptlrpc_start_n_threads(struct obd_device *dev, struct ptlrpc_service *svc,
 864                            int num_threads, char *base_name)
 865 {
 866         int i, rc = 0;
 867         ENTRY;
 868
 869         for (i = 0; i < num_threads; i++) {
 870                 char name[32];
 871                 sprintf(name, "%s_%02d", base_name, i);
 872                 rc = ptlrpc_start_thread(dev, svc, name);
 873                 if (rc) {
 874                         CERROR("cannot start %s thread #%d: rc %d\n", base_name,
 875                                i, rc);
 876                         ptlrpc_stop_all_threads(svc);
 877                 }
 878         }
 879         RETURN(rc);
 880 }
 881
 882 int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
 883                         char *name)
 884 {
 885         struct l_wait_info lwi = { 0 };
 886         struct ptlrpc_svc_data d;
 887         struct ptlrpc_thread *thread;
 888         unsigned long flags;
 889         int rc;
 890         ENTRY;
 891
 892         OBD_ALLOC(thread, sizeof(*thread));
 893         if (thread == NULL)
 894                 RETURN(-ENOMEM);
 895         init_waitqueue_head(&thread->t_ctl_waitq);
 896
 897         d.dev = dev;
 898         d.svc = svc;
 899         d.name = name;
 900         d.thread = thread;
 901
 902         spin_lock_irqsave(&svc->srv_lock, flags);
 903         list_add(&thread->t_link, &svc->srv_threads);
 904         spin_unlock_irqrestore(&svc->srv_lock, flags);
 905
 906         /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we
 907          * just drop the VM and FILES in ptlrpc_daemonize() right away.
 908          */
 909         rc = kernel_thread(ptlrpc_main, &d, CLONE_VM | CLONE_FILES);
 910         if (rc < 0) {
 911                 CERROR("cannot start thread: %d\n", rc);
 912                 OBD_FREE(thread, sizeof(*thread));
 913                 RETURN(rc);
 914         }
 915         l_wait_event(thread->t_ctl_waitq, thread->t_flags & SVC_RUNNING, &lwi);
 916
 917         RETURN(0);
 918 }
 919 #endif
 920
 921 int ptlrpc_unregister_service(struct ptlrpc_service *service)
 922 {
 923         int                   i;
 924         int                   rc;
 925         unsigned long         flags;
 926         struct ptlrpc_srv_ni *srv_ni;
 927         struct l_wait_info    lwi;
 928         struct list_head     *tmp;
 929
 930         LASSERT(list_empty(&service->srv_threads));
 931
 932         spin_lock (&ptlrpc_all_services_lock);
 933         list_del_init (&service->srv_list);
 934         spin_unlock (&ptlrpc_all_services_lock);
 935
 936         ptlrpc_lprocfs_unregister_service(service);
 937
 938         for (i = 0; i < ptlrpc_ninterfaces; i++) {
 939                 srv_ni = &service->srv_interfaces[i];
 940                 CDEBUG(D_NET, "%s: tearing down interface %s\n",
 941                        service->srv_name, srv_ni->sni_ni->pni_name);
 942
 943                 /* Unlink all the request buffers.  This forces a 'final'
 944                  * event with its 'unlink' flag set for each posted rqbd */
 945                 list_for_each(tmp, &srv_ni->sni_active_rqbds) {
 946                         struct ptlrpc_request_buffer_desc *rqbd =
 947                                 list_entry(tmp, struct ptlrpc_request_buffer_desc,
 948                                            rqbd_list);
 949
 950                         rc = PtlMDUnlink(rqbd->rqbd_md_h);
 951                         LASSERT (rc == PTL_OK || rc == PTL_MD_INVALID);
 952                 }
 953
 954                 /* Wait for the network to release any buffers it's
 955                  * currently filling */
 956                 for (;;) {
 957                         spin_lock_irqsave(&service->srv_lock, flags);
 958                         rc = srv_ni->sni_nrqbd_receiving;
 959                         spin_unlock_irqrestore(&service->srv_lock, flags);
 960
 961                         if (rc == 0)
 962                                 break;
 963
 964                         /* Network access will complete in finite time but
 965                          * the HUGE timeout lets us CWARN for visibility of
 966                          * sluggish NALs */
 967                         lwi = LWI_TIMEOUT(300 * HZ, NULL, NULL);
 968                         rc = l_wait_event(service->srv_waitq,
 969                                           srv_ni->sni_nrqbd_receiving == 0,
 970                                           &lwi);
 971                         if (rc == -ETIMEDOUT)
 972                                 CWARN("Waiting for request buffers on "
 973                                       "service %s on interface %s ",
 974                                       service->srv_name, srv_ni->sni_ni->pni_name);
 975                 }
 976
 977                 /* schedule all outstanding replies to terminate them */
 978                 spin_lock_irqsave(&service->srv_lock, flags);
 979                 while (!list_empty(&srv_ni->sni_active_replies)) {
 980                         struct ptlrpc_reply_state *rs =
 981                                 list_entry(srv_ni->sni_active_replies.next,
 982                                            struct ptlrpc_reply_state,
 983                                            rs_list);
 984                         ptlrpc_schedule_difficult_reply(rs);
 985                 }
 986                 spin_unlock_irqrestore(&service->srv_lock, flags);
 987         }
 988
 989         /* purge the request queue.  NB No new replies (rqbds all unlinked)
 990          * and no service threads, so I'm the only thread noodling the
 991          * request queue now */
 992         while (!list_empty(&service->srv_request_queue)) {
 993                 struct ptlrpc_request *req =
 994                         list_entry(service->srv_request_queue.next,
 995                                    struct ptlrpc_request,
 996                                    rq_list);
 997
 998                 list_del(&req->rq_list);
 999                 service->srv_n_queued_reqs--;
1000                 service->srv_n_active_reqs++;
1001
1002                 ptlrpc_server_free_request(service, req);
1003         }
1004         LASSERT(service->srv_n_queued_reqs == 0);
1005         LASSERT(service->srv_n_active_reqs == 0);
1006
1007         for (i = 0; i < ptlrpc_ninterfaces; i++) {
1008                 srv_ni = &service->srv_interfaces[i];
1009                 LASSERT(list_empty(&srv_ni->sni_active_rqbds));
1010         }
1011
1012         /* Now free all the request buffers since nothing references them
1013          * any more... */
1014         while (!list_empty(&service->srv_idle_rqbds)) {
1015                 struct ptlrpc_request_buffer_desc *rqbd =
1016                         list_entry(service->srv_idle_rqbds.next,
1017                                    struct ptlrpc_request_buffer_desc,
1018                                    rqbd_list);
1019
1020                 ptlrpc_free_rqbd(rqbd);
1021         }
1022
1023         /* wait for all outstanding replies to complete (they were
1024          * scheduled having been flagged to abort above) */
1025         while (atomic_read(&service->srv_outstanding_replies) != 0) {
1026                 struct l_wait_info lwi = LWI_TIMEOUT(10 * HZ, NULL, NULL);
1027
1028                 rc = l_wait_event(service->srv_waitq,
1029                                   !list_empty(&service->srv_reply_queue), &lwi);
1030                 LASSERT(rc == 0 || rc == -ETIMEDOUT);
1031
1032                 if (rc == 0) {
1033                         ptlrpc_server_handle_reply(service);
1034                         continue;
1035                 }
1036                 CWARN("Unexpectedly long timeout %p\n", service);
1037         }
1038
1039         OBD_FREE(service,
1040                  offsetof(struct ptlrpc_service,
1041                           srv_interfaces[ptlrpc_ninterfaces]));
1042         return 0;
1043 }