lustre/ptlrpc/service.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2010, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  */
  31
  32 #define DEBUG_SUBSYSTEM S_RPC
  33
  34 #include <linux/fs_struct.h>
  35 #include <linux/kthread.h>
  36 #include <linux/ratelimit.h>
  37
  38 #include <obd_support.h>
  39 #include <obd_class.h>
  40 #include <lustre_net.h>
  41 #include <lu_object.h>
  42 #include <uapi/linux/lnet/lnet-types.h>
  43 #include "ptlrpc_internal.h"
  44 #include <linux/delay.h>
  45
  46 /* The following are visible and mutable through /sys/module/ptlrpc */
  47 int test_req_buffer_pressure = 0;
  48 module_param(test_req_buffer_pressure, int, 0444);
  49 MODULE_PARM_DESC(test_req_buffer_pressure, "set non-zero to put pressure on request buffer pools");
  50 module_param(at_min, int, 0644);
  51 MODULE_PARM_DESC(at_min, "Adaptive timeout minimum (sec)");
  52 module_param(at_max, int, 0644);
  53 MODULE_PARM_DESC(at_max, "Adaptive timeout maximum (sec)");
  54 module_param(at_history, int, 0644);
  55 MODULE_PARM_DESC(at_history,
  56                  "Adaptive timeouts remember the slowest event that took place within this period (sec)");
  57 module_param(at_early_margin, int, 0644);
  58 MODULE_PARM_DESC(at_early_margin, "How soon before an RPC deadline to send an early reply");
  59 module_param(at_extra, int, 0644);
  60 MODULE_PARM_DESC(at_extra, "How much extra time to give with each early reply");
  61
  62 /* forward ref */
  63 static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt);
  64 static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req);
  65 static void ptlrpc_at_remove_timed(struct ptlrpc_request *req);
  66 static int ptlrpc_start_threads(struct ptlrpc_service *svc);
  67 static int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait);
  68
  69 /** Holds a list of all PTLRPC services */
  70 LIST_HEAD(ptlrpc_all_services);
  71 /** Used to protect the \e ptlrpc_all_services list */
  72 struct mutex ptlrpc_all_services_mutex;
  73
  74 static struct ptlrpc_request_buffer_desc *
  75 ptlrpc_alloc_rqbd(struct ptlrpc_service_part *svcpt)
  76 {
  77         struct ptlrpc_service             *svc = svcpt->scp_service;
  78         struct ptlrpc_request_buffer_desc *rqbd;
  79
  80         OBD_CPT_ALLOC_PTR(rqbd, svc->srv_cptable, svcpt->scp_cpt);
  81         if (rqbd == NULL)
  82                 return NULL;
  83
  84         rqbd->rqbd_svcpt = svcpt;
  85         rqbd->rqbd_refcount = 0;
  86         rqbd->rqbd_cbid.cbid_fn = request_in_callback;
  87         rqbd->rqbd_cbid.cbid_arg = rqbd;
  88         INIT_LIST_HEAD(&rqbd->rqbd_reqs);
  89         OBD_CPT_ALLOC_LARGE(rqbd->rqbd_buffer, svc->srv_cptable,
  90                             svcpt->scp_cpt, svc->srv_buf_size);
  91         if (rqbd->rqbd_buffer == NULL) {
  92                 OBD_FREE_PTR(rqbd);
  93                 return NULL;
  94         }
  95
  96         spin_lock(&svcpt->scp_lock);
  97         list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);
  98         svcpt->scp_nrqbds_total++;
  99         spin_unlock(&svcpt->scp_lock);
 100
 101         return rqbd;
 102 }
 103
 104 static void ptlrpc_free_rqbd(struct ptlrpc_request_buffer_desc *rqbd)
 105 {
 106         struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt;
 107
 108         LASSERT(rqbd->rqbd_refcount == 0);
 109         LASSERT(list_empty(&rqbd->rqbd_reqs));
 110
 111         OBD_FREE_LARGE(rqbd->rqbd_buffer, svcpt->scp_service->srv_buf_size);
 112         OBD_FREE_PTR(rqbd);
 113 }
 114
 115 static int ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post)
 116 {
 117         struct ptlrpc_service *svc = svcpt->scp_service;
 118         struct ptlrpc_request_buffer_desc *rqbd;
 119         int rc = 0;
 120         int i;
 121
 122         if (svcpt->scp_rqbd_allocating)
 123                 goto try_post;
 124
 125         spin_lock(&svcpt->scp_lock);
 126         /* check again with lock */
 127         if (svcpt->scp_rqbd_allocating) {
 128                 /* NB: we might allow more than one thread in the future */
 129                 LASSERT(svcpt->scp_rqbd_allocating == 1);
 130                 spin_unlock(&svcpt->scp_lock);
 131                 goto try_post;
 132         }
 133
 134         svcpt->scp_rqbd_allocating++;
 135         spin_unlock(&svcpt->scp_lock);
 136
 137
 138         for (i = 0; i < svc->srv_nbuf_per_group; i++) {
 139                 /*
 140                  * NB: another thread might have recycled enough rqbds, we
 141                  * need to make sure it wouldn't over-allocate, see LU-1212.
 142                  */
 143                 if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group ||
 144                     (svc->srv_nrqbds_max != 0 &&
 145                      svcpt->scp_nrqbds_total > svc->srv_nrqbds_max))
 146                         break;
 147
 148                 rqbd = ptlrpc_alloc_rqbd(svcpt);
 149
 150                 if (rqbd == NULL) {
 151                         CERROR("%s: Can't allocate request buffer\n",
 152                                svc->srv_name);
 153                         rc = -ENOMEM;
 154                         break;
 155                 }
 156         }
 157
 158         spin_lock(&svcpt->scp_lock);
 159
 160         LASSERT(svcpt->scp_rqbd_allocating == 1);
 161         svcpt->scp_rqbd_allocating--;
 162
 163         spin_unlock(&svcpt->scp_lock);
 164
 165         CDEBUG(D_RPCTRACE,
 166                "%s: allocate %d new %d-byte reqbufs (%d/%d left), rc = %d\n",
 167                svc->srv_name, i, svc->srv_buf_size, svcpt->scp_nrqbds_posted,
 168                svcpt->scp_nrqbds_total, rc);
 169
 170  try_post:
 171         if (post && rc == 0)
 172                 rc = ptlrpc_server_post_idle_rqbds(svcpt);
 173
 174         return rc;
 175 }
 176
 177 /**
 178  * Part of Rep-Ack logic.
 179  * Puts a lock and its mode into reply state assotiated to request reply.
 180  */
 181 void ptlrpc_save_lock(struct ptlrpc_request *req, struct lustre_handle *lock,
 182                       bool no_ack)
 183 {
 184         struct ptlrpc_reply_state *rs = req->rq_reply_state;
 185         int idx;
 186
 187         LASSERT(rs != NULL);
 188         CDEBUG(D_RPCTRACE, "nlocks %d\n", rs->rs_nlocks);
 189         LASSERT(rs->rs_nlocks < RS_MAX_LOCKS);
 190
 191         idx = rs->rs_nlocks++;
 192         rs->rs_locks[idx] = *lock;
 193         rs->rs_difficult = 1;
 194         rs->rs_no_ack = no_ack;
 195 }
 196 EXPORT_SYMBOL(ptlrpc_save_lock);
 197
 198
 199 struct ptlrpc_hr_partition;
 200
 201 struct ptlrpc_hr_thread {
 202         int                             hrt_id;         /* thread ID */
 203         spinlock_t                      hrt_lock;
 204         wait_queue_head_t               hrt_waitq;
 205         struct list_head                hrt_queue;
 206         struct ptlrpc_hr_partition      *hrt_partition;
 207 };
 208
 209 struct ptlrpc_hr_partition {
 210         /* # of started threads */
 211         atomic_t                        hrp_nstarted;
 212         /* # of stopped threads */
 213         atomic_t                        hrp_nstopped;
 214         /* cpu partition id */
 215         int                             hrp_cpt;
 216         /* round-robin rotor for choosing thread */
 217         int                             hrp_rotor;
 218         /* total number of threads on this partition */
 219         int                             hrp_nthrs;
 220         /* threads table */
 221         struct ptlrpc_hr_thread         *hrp_thrs;
 222 };
 223
 224 #define HRT_RUNNING 0
 225 #define HRT_STOPPING 1
 226
 227 struct ptlrpc_hr_service {
 228         /* CPU partition table, it's just cfs_cpt_tab for now */
 229         struct cfs_cpt_table            *hr_cpt_table;
 230         /** controller sleep waitq */
 231         wait_queue_head_t               hr_waitq;
 232         unsigned int                    hr_stopping;
 233         /** roundrobin rotor for non-affinity service */
 234         unsigned int                    hr_rotor;
 235         /* partition data */
 236         struct ptlrpc_hr_partition      **hr_partitions;
 237 };
 238
 239 struct rs_batch {
 240         struct list_head                        rsb_replies;
 241         unsigned int                    rsb_n_replies;
 242         struct ptlrpc_service_part      *rsb_svcpt;
 243 };
 244
 245 /** reply handling service. */
 246 static struct ptlrpc_hr_service         ptlrpc_hr;
 247
 248 /**
 249  * maximum mumber of replies scheduled in one batch
 250  */
 251 #define MAX_SCHEDULED 256
 252
 253 /**
 254  * Initialize a reply batch.
 255  *
 256  * \param b batch
 257  */
 258 static void rs_batch_init(struct rs_batch *b)
 259 {
 260         memset(b, 0, sizeof(*b));
 261         INIT_LIST_HEAD(&b->rsb_replies);
 262 }
 263
 264 /**
 265  * Choose an hr thread to dispatch requests to.
 266  */
 267 static
 268 struct ptlrpc_hr_thread *ptlrpc_hr_select(struct ptlrpc_service_part *svcpt)
 269 {
 270         struct ptlrpc_hr_partition      *hrp;
 271         unsigned int                    rotor;
 272
 273         if (svcpt->scp_cpt >= 0 &&
 274             svcpt->scp_service->srv_cptable == ptlrpc_hr.hr_cpt_table) {
 275                 /* directly match partition */
 276                 hrp = ptlrpc_hr.hr_partitions[svcpt->scp_cpt];
 277
 278         } else {
 279                 rotor = ptlrpc_hr.hr_rotor++;
 280                 rotor %= cfs_cpt_number(ptlrpc_hr.hr_cpt_table);
 281
 282                 hrp = ptlrpc_hr.hr_partitions[rotor];
 283         }
 284
 285         rotor = hrp->hrp_rotor++;
 286         return &hrp->hrp_thrs[rotor % hrp->hrp_nthrs];
 287 }
 288
 289 /**
 290  * Dispatch all replies accumulated in the batch to one from
 291  * dedicated reply handling threads.
 292  *
 293  * \param b batch
 294  */
 295 static void rs_batch_dispatch(struct rs_batch *b)
 296 {
 297         if (b->rsb_n_replies != 0) {
 298                 struct ptlrpc_hr_thread *hrt;
 299
 300                 hrt = ptlrpc_hr_select(b->rsb_svcpt);
 301
 302                 spin_lock(&hrt->hrt_lock);
 303                 list_splice_init(&b->rsb_replies, &hrt->hrt_queue);
 304                 spin_unlock(&hrt->hrt_lock);
 305
 306                 wake_up(&hrt->hrt_waitq);
 307                 b->rsb_n_replies = 0;
 308         }
 309 }
 310
 311 /**
 312  * Add a reply to a batch.
 313  * Add one reply object to a batch, schedule batched replies if overload.
 314  *
 315  * \param b batch
 316  * \param rs reply
 317  */
 318 static void rs_batch_add(struct rs_batch *b, struct ptlrpc_reply_state *rs)
 319 {
 320         struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
 321
 322         if (svcpt != b->rsb_svcpt || b->rsb_n_replies >= MAX_SCHEDULED) {
 323                 if (b->rsb_svcpt != NULL) {
 324                         rs_batch_dispatch(b);
 325                         spin_unlock(&b->rsb_svcpt->scp_rep_lock);
 326                 }
 327                 spin_lock(&svcpt->scp_rep_lock);
 328                 b->rsb_svcpt = svcpt;
 329         }
 330         spin_lock(&rs->rs_lock);
 331         rs->rs_scheduled_ever = 1;
 332         if (rs->rs_scheduled == 0) {
 333                 list_move(&rs->rs_list, &b->rsb_replies);
 334                 rs->rs_scheduled = 1;
 335                 b->rsb_n_replies++;
 336         }
 337         rs->rs_committed = 1;
 338         spin_unlock(&rs->rs_lock);
 339 }
 340
 341 /**
 342  * Reply batch finalization.
 343  * Dispatch remaining replies from the batch
 344  * and release remaining spinlock.
 345  *
 346  * \param b batch
 347  */
 348 static void rs_batch_fini(struct rs_batch *b)
 349 {
 350         if (b->rsb_svcpt != NULL) {
 351                 rs_batch_dispatch(b);
 352                 spin_unlock(&b->rsb_svcpt->scp_rep_lock);
 353         }
 354 }
 355
 356 #define DECLARE_RS_BATCH(b)     struct rs_batch b
 357
 358
 359 /**
 360  * Put reply state into a queue for processing because we received
 361  * ACK from the client
 362  */
 363 void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs)
 364 {
 365         struct ptlrpc_hr_thread *hrt;
 366
 367         ENTRY;
 368
 369         LASSERT(list_empty(&rs->rs_list));
 370
 371         hrt = ptlrpc_hr_select(rs->rs_svcpt);
 372
 373         spin_lock(&hrt->hrt_lock);
 374         list_add_tail(&rs->rs_list, &hrt->hrt_queue);
 375         spin_unlock(&hrt->hrt_lock);
 376
 377         wake_up(&hrt->hrt_waitq);
 378         EXIT;
 379 }
 380
 381 void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs)
 382 {
 383         ENTRY;
 384
 385         assert_spin_locked(&rs->rs_svcpt->scp_rep_lock);
 386         assert_spin_locked(&rs->rs_lock);
 387         LASSERT(rs->rs_difficult);
 388         rs->rs_scheduled_ever = 1;  /* flag any notification attempt */
 389
 390         if (rs->rs_scheduled) {     /* being set up or already notified */
 391                 EXIT;
 392                 return;
 393         }
 394
 395         rs->rs_scheduled = 1;
 396         list_del_init(&rs->rs_list);
 397         ptlrpc_dispatch_difficult_reply(rs);
 398         EXIT;
 399 }
 400 EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply);
 401
 402 void ptlrpc_commit_replies(struct obd_export *exp)
 403 {
 404         struct ptlrpc_reply_state *rs, *nxt;
 405         DECLARE_RS_BATCH(batch);
 406
 407         ENTRY;
 408
 409         rs_batch_init(&batch);
 410         /*
 411          * Find any replies that have been committed and get their service
 412          * to attend to complete them.
 413          */
 414
 415         /* CAVEAT EMPTOR: spinlock ordering!!! */
 416         spin_lock(&exp->exp_uncommitted_replies_lock);
 417         list_for_each_entry_safe(rs, nxt, &exp->exp_uncommitted_replies,
 418                                  rs_obd_list) {
 419                 LASSERT(rs->rs_difficult);
 420                 /* VBR: per-export last_committed */
 421                 LASSERT(rs->rs_export);
 422                 if (rs->rs_transno <= exp->exp_last_committed) {
 423                         list_del_init(&rs->rs_obd_list);
 424                         rs_batch_add(&batch, rs);
 425                 }
 426         }
 427         spin_unlock(&exp->exp_uncommitted_replies_lock);
 428         rs_batch_fini(&batch);
 429         EXIT;
 430 }
 431
 432 static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt)
 433 {
 434         struct ptlrpc_request_buffer_desc *rqbd;
 435         int rc;
 436         int posted = 0;
 437
 438         for (;;) {
 439                 spin_lock(&svcpt->scp_lock);
 440
 441                 if (list_empty(&svcpt->scp_rqbd_idle)) {
 442                         spin_unlock(&svcpt->scp_lock);
 443                         return posted;
 444                 }
 445
 446                 rqbd = list_first_entry(&svcpt->scp_rqbd_idle,
 447                                         struct ptlrpc_request_buffer_desc,
 448                                         rqbd_list);
 449
 450                 /* assume we will post successfully */
 451                 svcpt->scp_nrqbds_posted++;
 452                 list_move(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted);
 453
 454                 spin_unlock(&svcpt->scp_lock);
 455
 456                 rc = ptlrpc_register_rqbd(rqbd);
 457                 if (rc != 0)
 458                         break;
 459
 460                 posted = 1;
 461         }
 462
 463         spin_lock(&svcpt->scp_lock);
 464
 465         svcpt->scp_nrqbds_posted--;
 466         list_move_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);
 467
 468         /*
 469          * Don't complain if no request buffers are posted right now; LNET
 470          * won't drop requests because we set the portal lazy!
 471          */
 472
 473         spin_unlock(&svcpt->scp_lock);
 474
 475         return -1;
 476 }
 477
 478 static void ptlrpc_at_timer(cfs_timer_cb_arg_t data)
 479 {
 480         struct ptlrpc_service_part *svcpt;
 481
 482         svcpt = cfs_from_timer(svcpt, data, scp_at_timer);
 483
 484         svcpt->scp_at_check = 1;
 485         svcpt->scp_at_checktime = ktime_get();
 486         wake_up(&svcpt->scp_waitq);
 487 }
 488
 489 static void ptlrpc_server_nthreads_check(struct ptlrpc_service *svc,
 490                                          struct ptlrpc_service_conf *conf)
 491 {
 492         struct ptlrpc_service_thr_conf *tc = &conf->psc_thr;
 493         unsigned int init;
 494         unsigned int total;
 495         unsigned int nthrs;
 496         int weight;
 497
 498         /*
 499          * Common code for estimating & validating threads number.
 500          * CPT affinity service could have percpt thread-pool instead
 501          * of a global thread-pool, which means user might not always
 502          * get the threads number they give it in conf::tc_nthrs_user
 503          * even they did set. It's because we need to validate threads
 504          * number for each CPT to guarantee each pool will have enough
 505          * threads to keep the service healthy.
 506          */
 507         init = PTLRPC_NTHRS_INIT + (svc->srv_ops.so_hpreq_handler != NULL);
 508         init = max_t(int, init, tc->tc_nthrs_init);
 509
 510         /*
 511          * NB: please see comments in lustre_lnet.h for definition
 512          * details of these members
 513          */
 514         LASSERT(tc->tc_nthrs_max != 0);
 515
 516         if (tc->tc_nthrs_user != 0) {
 517                 /*
 518                  * In case there is a reason to test a service with many
 519                  * threads, we give a less strict check here, it can
 520                  * be up to 8 * nthrs_max
 521                  */
 522                 total = min(tc->tc_nthrs_max * 8, tc->tc_nthrs_user);
 523                 nthrs = total / svc->srv_ncpts;
 524                 init  = max(init, nthrs);
 525                 goto out;
 526         }
 527
 528         total = tc->tc_nthrs_max;
 529         if (tc->tc_nthrs_base == 0) {
 530                 /*
 531                  * don't care about base threads number per partition,
 532                  * this is most for non-affinity service
 533                  */
 534                 nthrs = total / svc->srv_ncpts;
 535                 goto out;
 536         }
 537
 538         nthrs = tc->tc_nthrs_base;
 539         if (svc->srv_ncpts == 1) {
 540                 int     i;
 541
 542                 /*
 543                  * NB: Increase the base number if it's single partition
 544                  * and total number of cores/HTs is larger or equal to 4.
 545                  * result will always < 2 * nthrs_base
 546                  */
 547                 weight = cfs_cpt_weight(svc->srv_cptable, CFS_CPT_ANY);
 548                 for (i = 1; (weight >> (i + 1)) != 0 && /* >= 4 cores/HTs */
 549                             (tc->tc_nthrs_base >> i) != 0; i++)
 550                         nthrs += tc->tc_nthrs_base >> i;
 551         }
 552
 553         if (tc->tc_thr_factor != 0) {
 554                 int       factor = tc->tc_thr_factor;
 555                 const int fade = 4;
 556
 557                 /*
 558                  * User wants to increase number of threads with for
 559                  * each CPU core/HT, most likely the factor is larger than
 560                  * one thread/core because service threads are supposed to
 561                  * be blocked by lock or wait for IO.
 562                  */
 563                 /*
 564                  * Amdahl's law says that adding processors wouldn't give
 565                  * a linear increasing of parallelism, so it's nonsense to
 566                  * have too many threads no matter how many cores/HTs
 567                  * there are.
 568                  */
 569                 preempt_disable();
 570                 if (cpumask_weight
 571                     (topology_sibling_cpumask(smp_processor_id())) > 1) {
 572                         /* weight is # of HTs */
 573                         /* depress thread factor for hyper-thread */
 574                         factor = factor - (factor >> 1) + (factor >> 3);
 575                 }
 576                 preempt_enable();
 577
 578                 weight = cfs_cpt_weight(svc->srv_cptable, 0);
 579
 580                 for (; factor > 0 && weight > 0; factor--, weight -= fade)
 581                         nthrs += min(weight, fade) * factor;
 582         }
 583
 584         if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
 585                 nthrs = max(tc->tc_nthrs_base,
 586                             tc->tc_nthrs_max / svc->srv_ncpts);
 587         }
 588  out:
 589         nthrs = max(nthrs, tc->tc_nthrs_init);
 590         svc->srv_nthrs_cpt_limit = nthrs;
 591         svc->srv_nthrs_cpt_init = init;
 592
 593         if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
 594                 CDEBUG(D_OTHER,
 595                        "%s: This service may have more threads (%d) than the given soft limit (%d)\n",
 596                        svc->srv_name, nthrs * svc->srv_ncpts,
 597                        tc->tc_nthrs_max);
 598         }
 599 }
 600
 601 /**
 602  * Initialize percpt data for a service
 603  */
 604 static int ptlrpc_service_part_init(struct ptlrpc_service *svc,
 605                                     struct ptlrpc_service_part *svcpt, int cpt)
 606 {
 607         struct ptlrpc_at_array *array;
 608         int size;
 609         int index;
 610         int rc;
 611
 612         svcpt->scp_cpt = cpt;
 613         INIT_LIST_HEAD(&svcpt->scp_threads);
 614
 615         /* rqbd and incoming request queue */
 616         spin_lock_init(&svcpt->scp_lock);
 617         mutex_init(&svcpt->scp_mutex);
 618         INIT_LIST_HEAD(&svcpt->scp_rqbd_idle);
 619         INIT_LIST_HEAD(&svcpt->scp_rqbd_posted);
 620         INIT_LIST_HEAD(&svcpt->scp_req_incoming);
 621         init_waitqueue_head(&svcpt->scp_waitq);
 622         /* history request & rqbd list */
 623         INIT_LIST_HEAD(&svcpt->scp_hist_reqs);
 624         INIT_LIST_HEAD(&svcpt->scp_hist_rqbds);
 625
 626         /* acitve requests and hp requests */
 627         spin_lock_init(&svcpt->scp_req_lock);
 628
 629         /* reply states */
 630         spin_lock_init(&svcpt->scp_rep_lock);
 631         INIT_LIST_HEAD(&svcpt->scp_rep_active);
 632         INIT_LIST_HEAD(&svcpt->scp_rep_idle);
 633         init_waitqueue_head(&svcpt->scp_rep_waitq);
 634         atomic_set(&svcpt->scp_nreps_difficult, 0);
 635
 636         /* adaptive timeout */
 637         spin_lock_init(&svcpt->scp_at_lock);
 638         array = &svcpt->scp_at_array;
 639
 640         size = at_est2timeout(obd_get_at_max(NULL));
 641         array->paa_size     = size;
 642         array->paa_count    = 0;
 643         array->paa_deadline = -1;
 644
 645         /* allocate memory for scp_at_array (ptlrpc_at_array) */
 646         OBD_CPT_ALLOC(array->paa_reqs_array,
 647                       svc->srv_cptable, cpt, sizeof(struct list_head) * size);
 648         if (array->paa_reqs_array == NULL)
 649                 return -ENOMEM;
 650
 651         for (index = 0; index < size; index++)
 652                 INIT_LIST_HEAD(&array->paa_reqs_array[index]);
 653
 654         OBD_CPT_ALLOC(array->paa_reqs_count,
 655                       svc->srv_cptable, cpt, sizeof(__u32) * size);
 656         if (array->paa_reqs_count == NULL)
 657                 goto failed;
 658
 659         cfs_timer_setup(&svcpt->scp_at_timer, ptlrpc_at_timer,
 660                         (unsigned long)svcpt, 0);
 661
 662         /*
 663          * At SOW, service time should be quick; 10s seems generous. If client
 664          * timeout is less than this, we'll be sending an early reply.
 665          */
 666         at_init(&svcpt->scp_at_estimate, 10, 0);
 667
 668         /* assign this before call ptlrpc_grow_req_bufs */
 669         svcpt->scp_service = svc;
 670         /* Now allocate the request buffers, but don't post them now */
 671         rc = ptlrpc_grow_req_bufs(svcpt, 0);
 672         /*
 673          * We shouldn't be under memory pressure at startup, so
 674          * fail if we can't allocate all our buffers at this time.
 675          */
 676         if (rc != 0)
 677                 goto failed;
 678
 679         return 0;
 680
 681  failed:
 682         if (array->paa_reqs_count != NULL) {
 683                 OBD_FREE_PTR_ARRAY(array->paa_reqs_count, size);
 684                 array->paa_reqs_count = NULL;
 685         }
 686
 687         if (array->paa_reqs_array != NULL) {
 688                 OBD_FREE_PTR_ARRAY(array->paa_reqs_array, array->paa_size);
 689                 array->paa_reqs_array = NULL;
 690         }
 691
 692         return -ENOMEM;
 693 }
 694
 695 /**
 696  * Initialize service on a given portal.
 697  * This includes starting serving threads , allocating and posting rqbds and
 698  * so on.
 699  */
 700 struct ptlrpc_service *ptlrpc_register_service(struct ptlrpc_service_conf *conf,
 701                                                struct kset *parent,
 702                                                struct dentry *debugfs_entry)
 703 {
 704         struct ptlrpc_service_cpt_conf *cconf = &conf->psc_cpt;
 705         struct ptlrpc_service *service;
 706         struct ptlrpc_service_part *svcpt;
 707         struct cfs_cpt_table *cptable;
 708         __u32 *cpts = NULL;
 709         int ncpts;
 710         int cpt;
 711         int rc;
 712         int i;
 713
 714         ENTRY;
 715
 716         LASSERT(conf->psc_buf.bc_nbufs > 0);
 717         LASSERT(conf->psc_buf.bc_buf_size >=
 718                 conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD);
 719         LASSERT(conf->psc_thr.tc_ctx_tags != 0);
 720
 721         cptable = cconf->cc_cptable;
 722         if (cptable == NULL)
 723                 cptable = cfs_cpt_tab;
 724
 725         if (conf->psc_thr.tc_cpu_bind > 1) {
 726                 CERROR("%s: Invalid cpu bind value %d, only 1 or 0 allowed\n",
 727                        conf->psc_name, conf->psc_thr.tc_cpu_bind);
 728                 RETURN(ERR_PTR(-EINVAL));
 729         }
 730
 731         if (!cconf->cc_affinity) {
 732                 ncpts = 1;
 733         } else {
 734                 ncpts = cfs_cpt_number(cptable);
 735                 if (cconf->cc_pattern != NULL) {
 736                         struct cfs_expr_list    *el;
 737
 738                         rc = cfs_expr_list_parse(cconf->cc_pattern,
 739                                                  strlen(cconf->cc_pattern),
 740                                                  0, ncpts - 1, &el);
 741                         if (rc != 0) {
 742                                 CERROR("%s: invalid CPT pattern string: %s\n",
 743                                        conf->psc_name, cconf->cc_pattern);
 744                                 RETURN(ERR_PTR(-EINVAL));
 745                         }
 746
 747                         rc = cfs_expr_list_values(el, ncpts, &cpts);
 748                         cfs_expr_list_free(el);
 749                         if (rc <= 0) {
 750                                 CERROR("%s: failed to parse CPT array %s: %d\n",
 751                                        conf->psc_name, cconf->cc_pattern, rc);
 752                                 if (cpts != NULL)
 753                                         OBD_FREE_PTR_ARRAY(cpts, ncpts);
 754                                 RETURN(ERR_PTR(rc < 0 ? rc : -EINVAL));
 755                         }
 756                         ncpts = rc;
 757                 }
 758         }
 759
 760         OBD_ALLOC(service, offsetof(struct ptlrpc_service, srv_parts[ncpts]));
 761         if (service == NULL) {
 762                 if (cpts != NULL)
 763                         OBD_FREE_PTR_ARRAY(cpts, ncpts);
 764                 RETURN(ERR_PTR(-ENOMEM));
 765         }
 766
 767         service->srv_cptable            = cptable;
 768         service->srv_cpts               = cpts;
 769         service->srv_ncpts              = ncpts;
 770         service->srv_cpt_bind           = conf->psc_thr.tc_cpu_bind;
 771
 772         service->srv_cpt_bits = 0; /* it's zero already, easy to read... */
 773         while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable))
 774                 service->srv_cpt_bits++;
 775
 776         /* public members */
 777         spin_lock_init(&service->srv_lock);
 778         service->srv_name               = conf->psc_name;
 779         service->srv_watchdog_factor    = conf->psc_watchdog_factor;
 780         INIT_LIST_HEAD(&service->srv_list); /* for safty of cleanup */
 781
 782         /* buffer configuration */
 783         service->srv_nbuf_per_group     = test_req_buffer_pressure ?
 784                                           1 : conf->psc_buf.bc_nbufs;
 785         /* do not limit max number of rqbds by default */
 786         service->srv_nrqbds_max         = 0;
 787
 788         service->srv_max_req_size       = conf->psc_buf.bc_req_max_size +
 789                                           SPTLRPC_MAX_PAYLOAD;
 790         service->srv_buf_size           = conf->psc_buf.bc_buf_size;
 791         service->srv_rep_portal         = conf->psc_buf.bc_rep_portal;
 792         service->srv_req_portal         = conf->psc_buf.bc_req_portal;
 793
 794         /* With slab/alloc_pages buffer size will be rounded up to 2^n */
 795         if (service->srv_buf_size & (service->srv_buf_size - 1)) {
 796                 int round = size_roundup_power2(service->srv_buf_size);
 797
 798                 service->srv_buf_size = round;
 799         }
 800
 801         /* Increase max reply size to next power of two */
 802         service->srv_max_reply_size = 1;
 803         while (service->srv_max_reply_size <
 804                conf->psc_buf.bc_rep_max_size + SPTLRPC_MAX_PAYLOAD)
 805                 service->srv_max_reply_size <<= 1;
 806
 807         service->srv_thread_name        = conf->psc_thr.tc_thr_name;
 808         service->srv_ctx_tags           = conf->psc_thr.tc_ctx_tags;
 809         service->srv_hpreq_ratio        = PTLRPC_SVC_HP_RATIO;
 810         service->srv_ops                = conf->psc_ops;
 811
 812         for (i = 0; i < ncpts; i++) {
 813                 if (!cconf->cc_affinity)
 814                         cpt = CFS_CPT_ANY;
 815                 else
 816                         cpt = cpts != NULL ? cpts[i] : i;
 817
 818                 OBD_CPT_ALLOC(svcpt, cptable, cpt, sizeof(*svcpt));
 819                 if (svcpt == NULL)
 820                         GOTO(failed, rc = -ENOMEM);
 821
 822                 service->srv_parts[i] = svcpt;
 823                 rc = ptlrpc_service_part_init(service, svcpt, cpt);
 824                 if (rc != 0)
 825                         GOTO(failed, rc);
 826         }
 827
 828         ptlrpc_server_nthreads_check(service, conf);
 829
 830         rc = LNetSetLazyPortal(service->srv_req_portal);
 831         LASSERT(rc == 0);
 832
 833         mutex_lock(&ptlrpc_all_services_mutex);
 834         list_add(&service->srv_list, &ptlrpc_all_services);
 835         mutex_unlock(&ptlrpc_all_services_mutex);
 836
 837         if (parent) {
 838                 rc = ptlrpc_sysfs_register_service(parent, service);
 839                 if (rc)
 840                         GOTO(failed, rc);
 841         }
 842
 843         if (debugfs_entry != NULL)
 844                 ptlrpc_ldebugfs_register_service(debugfs_entry, service);
 845
 846         rc = ptlrpc_service_nrs_setup(service);
 847         if (rc != 0)
 848                 GOTO(failed, rc);
 849
 850         CDEBUG(D_NET, "%s: Started, listening on portal %d\n",
 851                service->srv_name, service->srv_req_portal);
 852
 853         rc = ptlrpc_start_threads(service);
 854         if (rc != 0) {
 855                 CERROR("Failed to start threads for service %s: %d\n",
 856                        service->srv_name, rc);
 857                 GOTO(failed, rc);
 858         }
 859
 860         RETURN(service);
 861 failed:
 862         ptlrpc_unregister_service(service);
 863         RETURN(ERR_PTR(rc));
 864 }
 865 EXPORT_SYMBOL(ptlrpc_register_service);
 866
 867 /**
 868  * to actually free the request, must be called without holding svc_lock.
 869  * note it's caller's responsibility to unlink req->rq_list.
 870  */
 871 static void ptlrpc_server_free_request(struct ptlrpc_request *req)
 872 {
 873         LASSERT(atomic_read(&req->rq_refcount) == 0);
 874         LASSERT(list_empty(&req->rq_timed_list));
 875
 876         /*
 877          * DEBUG_REQ() assumes the reply state of a request with a valid
 878          * ref will not be destroyed until that reference is dropped.
 879          */
 880         ptlrpc_req_drop_rs(req);
 881
 882         sptlrpc_svc_ctx_decref(req);
 883
 884         if (req != &req->rq_rqbd->rqbd_req) {
 885                 /*
 886                  * NB request buffers use an embedded
 887                  * req if the incoming req unlinked the
 888                  * MD; this isn't one of them!
 889                  */
 890                 ptlrpc_request_cache_free(req);
 891         }
 892 }
 893
 894 /**
 895  * drop a reference count of the request. if it reaches 0, we either
 896  * put it into history list, or free it immediately.
 897  */
 898 void ptlrpc_server_drop_request(struct ptlrpc_request *req)
 899 {
 900         struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd;
 901         struct ptlrpc_service_part        *svcpt = rqbd->rqbd_svcpt;
 902         struct ptlrpc_service             *svc = svcpt->scp_service;
 903         int                                refcount;
 904
 905         if (!atomic_dec_and_test(&req->rq_refcount))
 906                 return;
 907
 908         if (req->rq_session.lc_state == LCS_ENTERED) {
 909                 lu_context_exit(&req->rq_session);
 910                 lu_context_fini(&req->rq_session);
 911         }
 912
 913         if (req->rq_at_linked) {
 914                 spin_lock(&svcpt->scp_at_lock);
 915                 /*
 916                  * recheck with lock, in case it's unlinked by
 917                  * ptlrpc_at_check_timed()
 918                  */
 919                 if (likely(req->rq_at_linked))
 920                         ptlrpc_at_remove_timed(req);
 921                 spin_unlock(&svcpt->scp_at_lock);
 922         }
 923
 924         LASSERT(list_empty(&req->rq_timed_list));
 925
 926         /* finalize request */
 927         if (req->rq_export) {
 928                 class_export_put(req->rq_export);
 929                 req->rq_export = NULL;
 930         }
 931
 932         spin_lock(&svcpt->scp_lock);
 933
 934         list_add(&req->rq_list, &rqbd->rqbd_reqs);
 935
 936         refcount = --(rqbd->rqbd_refcount);
 937         if (refcount == 0) {
 938                 /* request buffer is now idle: add to history */
 939                 list_move_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds);
 940                 svcpt->scp_hist_nrqbds++;
 941
 942                 /*
 943                  * cull some history?
 944                  * I expect only about 1 or 2 rqbds need to be recycled here
 945                  */
 946                 while (svcpt->scp_hist_nrqbds > svc->srv_hist_nrqbds_cpt_max) {
 947                         rqbd = list_first_entry(&svcpt->scp_hist_rqbds,
 948                                                 struct ptlrpc_request_buffer_desc,
 949                                                 rqbd_list);
 950
 951                         list_del(&rqbd->rqbd_list);
 952                         svcpt->scp_hist_nrqbds--;
 953
 954                         /*
 955                          * remove rqbd's reqs from svc's req history while
 956                          * I've got the service lock
 957                          */
 958                         list_for_each_entry(req, &rqbd->rqbd_reqs, rq_list) {
 959                                 /* Track the highest culled req seq */
 960                                 if (req->rq_history_seq >
 961                                     svcpt->scp_hist_seq_culled) {
 962                                         svcpt->scp_hist_seq_culled =
 963                                                 req->rq_history_seq;
 964                                 }
 965                                 list_del(&req->rq_history_list);
 966                         }
 967
 968                         spin_unlock(&svcpt->scp_lock);
 969
 970                         while ((req = list_first_entry_or_null(
 971                                         &rqbd->rqbd_reqs,
 972                                         struct ptlrpc_request, rq_list))) {
 973                                 list_del(&req->rq_list);
 974                                 ptlrpc_server_free_request(req);
 975                         }
 976
 977                         spin_lock(&svcpt->scp_lock);
 978                         /*
 979                          * now all reqs including the embedded req has been
 980                          * disposed, schedule request buffer for re-use
 981                          * or free it to drain some in excess.
 982                          */
 983                         LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) == 0);
 984                         if (svcpt->scp_nrqbds_posted >=
 985                             svc->srv_nbuf_per_group ||
 986                             (svc->srv_nrqbds_max != 0 &&
 987                              svcpt->scp_nrqbds_total > svc->srv_nrqbds_max) ||
 988                             test_req_buffer_pressure) {
 989                                 /* like in ptlrpc_free_rqbd() */
 990                                 svcpt->scp_nrqbds_total--;
 991                                 OBD_FREE_LARGE(rqbd->rqbd_buffer,
 992                                                svc->srv_buf_size);
 993                                 OBD_FREE_PTR(rqbd);
 994                         } else {
 995                                 list_add_tail(&rqbd->rqbd_list,
 996                                               &svcpt->scp_rqbd_idle);
 997                         }
 998                 }
 999
1000                 spin_unlock(&svcpt->scp_lock);
1001         } else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) {
1002                 /* If we are low on memory, we are not interested in history */
1003                 list_del(&req->rq_list);
1004                 list_del_init(&req->rq_history_list);
1005
1006                 /* Track the highest culled req seq */
1007                 if (req->rq_history_seq > svcpt->scp_hist_seq_culled)
1008                         svcpt->scp_hist_seq_culled = req->rq_history_seq;
1009
1010                 spin_unlock(&svcpt->scp_lock);
1011
1012                 ptlrpc_server_free_request(req);
1013         } else {
1014                 spin_unlock(&svcpt->scp_lock);
1015         }
1016 }
1017
1018 static void ptlrpc_add_exp_list_nolock(struct ptlrpc_request *req,
1019                                        struct obd_export *export, bool hp)
1020 {
1021         __u16 tag = lustre_msg_get_tag(req->rq_reqmsg);
1022
1023         if (hp)
1024                 list_add(&req->rq_exp_list, &export->exp_hp_rpcs);
1025         else
1026                 list_add(&req->rq_exp_list, &export->exp_reg_rpcs);
1027         if (tag && export->exp_used_slots)
1028                 set_bit(tag - 1, export->exp_used_slots);
1029 }
1030
1031 static void ptlrpc_del_exp_list(struct ptlrpc_request *req)
1032 {
1033         __u16 tag = lustre_msg_get_tag(req->rq_reqmsg);
1034
1035         spin_lock(&req->rq_export->exp_rpc_lock);
1036         list_del_init(&req->rq_exp_list);
1037         if (tag && !req->rq_obsolete && req->rq_export->exp_used_slots)
1038                 clear_bit(tag - 1, req->rq_export->exp_used_slots);
1039         spin_unlock(&req->rq_export->exp_rpc_lock);
1040 }
1041
1042 /** Change request export and move hp request from old export to new */
1043 void ptlrpc_request_change_export(struct ptlrpc_request *req,
1044                                   struct obd_export *export)
1045 {
1046         if (req->rq_export != NULL) {
1047                 LASSERT(!list_empty(&req->rq_exp_list));
1048                 /* remove rq_exp_list from last export */
1049                 ptlrpc_del_exp_list(req);
1050                 /* export has one reference already, so it's safe to
1051                  * add req to export queue here and get another
1052                  * reference for request later
1053                  */
1054                 spin_lock(&export->exp_rpc_lock);
1055                 ptlrpc_add_exp_list_nolock(req, export, req->rq_ops != NULL);
1056                 spin_unlock(&export->exp_rpc_lock);
1057
1058                 class_export_rpc_dec(req->rq_export);
1059                 class_export_put(req->rq_export);
1060         }
1061
1062         /* request takes one export refcount */
1063         req->rq_export = class_export_get(export);
1064         class_export_rpc_inc(export);
1065 }
1066
1067 /**
1068  * to finish a request: stop sending more early replies, and release
1069  * the request.
1070  */
1071 static void ptlrpc_server_finish_request(struct ptlrpc_service_part *svcpt,
1072                                          struct ptlrpc_request *req)
1073 {
1074         ptlrpc_server_hpreq_fini(req);
1075
1076         ptlrpc_server_drop_request(req);
1077 }
1078
1079 /**
1080  * to finish an active request: stop sending more early replies, and release
1081  * the request. should be called after we finished handling the request.
1082  */
1083 static void ptlrpc_server_finish_active_request(
1084                                         struct ptlrpc_service_part *svcpt,
1085                                         struct ptlrpc_request *req)
1086 {
1087         spin_lock(&svcpt->scp_req_lock);
1088         ptlrpc_nrs_req_stop_nolock(req);
1089         svcpt->scp_nreqs_active--;
1090         if (req->rq_hp)
1091                 svcpt->scp_nhreqs_active--;
1092         spin_unlock(&svcpt->scp_req_lock);
1093
1094         ptlrpc_nrs_req_finalize(req);
1095
1096         if (req->rq_export != NULL)
1097                 class_export_rpc_dec(req->rq_export);
1098
1099         ptlrpc_server_finish_request(svcpt, req);
1100 }
1101
1102 /**
1103  * This function makes sure dead exports are evicted in a timely manner.
1104  * This function is only called when some export receives a message (i.e.,
1105  * the network is up.)
1106  */
1107 void ptlrpc_update_export_timer(struct obd_export *exp, time64_t extra_delay)
1108 {
1109         struct obd_export *oldest_exp, *newest_exp;
1110         time64_t oldest_time, current_time;
1111         bool    evict = false;
1112         ENTRY;
1113
1114         LASSERT(exp);
1115
1116         /*
1117          * Compensate for slow machines, etc, by faking our request time
1118          * into the future.  Although this can break the strict time-ordering
1119          * of the list, we can be really lazy here - we don't have to evict
1120          * at the exact right moment.  Eventually, all silent exports
1121          * will make it to the top of the list.
1122          */
1123
1124         /* Do not pay attention on 1sec or smaller renewals. */
1125         current_time = ktime_get_real_seconds();
1126         /* 1 seconds */
1127         if (exp->exp_last_request_time + 1 >= current_time + extra_delay)
1128                 RETURN_EXIT;
1129
1130         exp->exp_last_request_time = current_time + extra_delay;
1131
1132         /*
1133          * exports may get disconnected from the chain even though the
1134          * export has references, so we must keep the spin lock while
1135          * manipulating the lists
1136          */
1137         spin_lock(&exp->exp_obd->obd_dev_lock);
1138
1139         if (list_empty(&exp->exp_obd_chain_timed)) {
1140                 /* this one is not timed */
1141                 spin_unlock(&exp->exp_obd->obd_dev_lock);
1142                 RETURN_EXIT;
1143         }
1144
1145         newest_exp = list_last_entry(&exp->exp_obd->obd_exports_timed,
1146                                      struct obd_export, exp_obd_chain_timed);
1147
1148         list_move_tail(&exp->exp_obd_chain_timed,
1149                        &exp->exp_obd->obd_exports_timed);
1150
1151         if (exp->exp_obd->obd_recovering) {
1152                 /* be nice to everyone during recovery */
1153                 spin_unlock(&exp->exp_obd->obd_dev_lock);
1154                 RETURN_EXIT;
1155         }
1156
1157         oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next,
1158                                 struct obd_export, exp_obd_chain_timed);
1159
1160         oldest_time = oldest_exp->exp_last_request_time;
1161
1162         /* Check if the oldest entry is expired. */
1163         if (exp->exp_obd->obd_eviction_timer == 0 &&
1164             current_time > oldest_time + PING_EVICT_TIMEOUT + extra_delay) {
1165
1166                 if (current_time < newest_exp->exp_last_request_time +
1167                              PING_EVICT_TIMEOUT / 2) {
1168                         /* If import is active - evict stale clients */
1169                         evict = true;
1170                 } else {
1171                         /*
1172                          * We need a second timer, in case the net was down and
1173                          * it just came back. Since the pinger may skip every
1174                          * other PING_INTERVAL (see note in ptlrpc_pinger_main),
1175                          * we better wait for 3.
1176                          */
1177                         exp->exp_obd->obd_eviction_timer =
1178                                 ktime_get_real_seconds() + 3 * PING_INTERVAL;
1179                         CDEBUG(D_HA, "%s: Think about evicting %s from %lld\n",
1180                                exp->exp_obd->obd_name,
1181                                obd_export_nid2str(oldest_exp), oldest_time);
1182
1183                 }
1184         }
1185
1186         spin_unlock(&exp->exp_obd->obd_dev_lock);
1187
1188         if (evict) {
1189                 /* Evict stale clients */
1190                 ping_evictor_wake(exp);
1191         } else {
1192                 if (ktime_get_real_seconds() >
1193                     (exp->exp_obd->obd_eviction_timer + extra_delay)) {
1194                         /*
1195                          * The evictor won't evict anyone who we've heard from
1196                          * recently, so we don't have to check before we start
1197                          * it.
1198                          */
1199                         if (!ping_evictor_wake(exp))
1200                                 exp->exp_obd->obd_eviction_timer = 0;
1201                 }
1202         }
1203
1204         EXIT;
1205 }
1206
1207 /**
1208  * Sanity check request \a req.
1209  * Return 0 if all is ok, error code otherwise.
1210  */
1211 static int ptlrpc_check_req(struct ptlrpc_request *req)
1212 {
1213         struct obd_device *obd = req->rq_export->exp_obd;
1214         int rc = 0;
1215
1216         if (unlikely(lustre_msg_get_conn_cnt(req->rq_reqmsg) <
1217                      req->rq_export->exp_conn_cnt)) {
1218                 DEBUG_REQ(D_RPCTRACE, req,
1219                           "DROPPING req from old connection %d < %d",
1220                           lustre_msg_get_conn_cnt(req->rq_reqmsg),
1221                           req->rq_export->exp_conn_cnt);
1222                 return -EEXIST;
1223         }
1224         if (unlikely(obd == NULL || obd->obd_fail)) {
1225                 /*
1226                  * Failing over, don't handle any more reqs,
1227                  * send error response instead.
1228                  */
1229                 CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n",
1230                         req, (obd != NULL) ? obd->obd_name : "unknown");
1231                 rc = -ENODEV;
1232         } else if (lustre_msg_get_flags(req->rq_reqmsg) &
1233                    (MSG_REPLAY | MSG_REQ_REPLAY_DONE) &&
1234                    !obd->obd_recovering) {
1235                 DEBUG_REQ(D_ERROR, req,
1236                           "Invalid replay without recovery");
1237                 class_fail_export(req->rq_export);
1238                 rc = -ENODEV;
1239         } else if (lustre_msg_get_transno(req->rq_reqmsg) != 0 &&
1240                    !obd->obd_recovering) {
1241                 DEBUG_REQ(D_ERROR, req,
1242                           "Invalid req with transno %llu without recovery",
1243                           lustre_msg_get_transno(req->rq_reqmsg));
1244                 class_fail_export(req->rq_export);
1245                 rc = -ENODEV;
1246         }
1247
1248         if (unlikely(rc < 0)) {
1249                 req->rq_status = rc;
1250                 ptlrpc_error(req);
1251         }
1252         return rc;
1253 }
1254
1255 static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt)
1256 {
1257         struct ptlrpc_at_array *array = &svcpt->scp_at_array;
1258         time64_t next;
1259
1260         if (array->paa_count == 0) {
1261                 timer_delete(&svcpt->scp_at_timer);
1262                 return;
1263         }
1264
1265         /* Set timer for closest deadline */
1266         next = array->paa_deadline - ktime_get_real_seconds() -
1267                at_early_margin;
1268         if (next <= 0) {
1269                 ptlrpc_at_timer(cfs_timer_cb_arg(svcpt, scp_at_timer));
1270         } else {
1271                 mod_timer(&svcpt->scp_at_timer,
1272                           jiffies + nsecs_to_jiffies(next * NSEC_PER_SEC));
1273                 CDEBUG(D_INFO, "armed %s at %+llds\n",
1274                        svcpt->scp_service->srv_name, next);
1275         }
1276 }
1277
1278 /* Add rpc to early reply check list */
1279 static int ptlrpc_at_add_timed(struct ptlrpc_request *req)
1280 {
1281         struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
1282         struct ptlrpc_at_array *array = &svcpt->scp_at_array;
1283         struct ptlrpc_request *rq = NULL;
1284         __u32 index;
1285         struct obd_device *obd = NULL;
1286
1287         if (req->rq_export)
1288                 obd = req->rq_export->exp_obd;
1289
1290         if (obd_at_off(obd))
1291                 return(0);
1292
1293         if (req->rq_no_reply)
1294                 return 0;
1295
1296         if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0)
1297                 return(-ENOSYS);
1298
1299         spin_lock(&svcpt->scp_at_lock);
1300         LASSERT(list_empty(&req->rq_timed_list));
1301
1302         div_u64_rem(req->rq_deadline, array->paa_size, &index);
1303         if (array->paa_reqs_count[index] > 0) {
1304                 /*
1305                  * latest rpcs will have the latest deadlines in the list,
1306                  * so search backward.
1307                  */
1308                 list_for_each_entry_reverse(rq, &array->paa_reqs_array[index],
1309                                             rq_timed_list) {
1310                         if (req->rq_deadline >= rq->rq_deadline) {
1311                                 list_add(&req->rq_timed_list,
1312                                          &rq->rq_timed_list);
1313                                 break;
1314                         }
1315                 }
1316         }
1317
1318         /* Add the request at the head of the list */
1319         if (list_empty(&req->rq_timed_list))
1320                 list_add(&req->rq_timed_list, &array->paa_reqs_array[index]);
1321
1322         spin_lock(&req->rq_lock);
1323         req->rq_at_linked = 1;
1324         spin_unlock(&req->rq_lock);
1325         req->rq_at_index = index;
1326         array->paa_reqs_count[index]++;
1327         array->paa_count++;
1328         if (array->paa_count == 1 || array->paa_deadline > req->rq_deadline) {
1329                 array->paa_deadline = req->rq_deadline;
1330                 ptlrpc_at_set_timer(svcpt);
1331         }
1332         spin_unlock(&svcpt->scp_at_lock);
1333
1334         return 0;
1335 }
1336
1337 static void ptlrpc_at_remove_timed(struct ptlrpc_request *req)
1338 {
1339         struct ptlrpc_at_array *array;
1340
1341         array = &req->rq_rqbd->rqbd_svcpt->scp_at_array;
1342
1343         /* NB: must call with hold svcpt::scp_at_lock */
1344         LASSERT(!list_empty(&req->rq_timed_list));
1345         list_del_init(&req->rq_timed_list);
1346
1347         spin_lock(&req->rq_lock);
1348         req->rq_at_linked = 0;
1349         spin_unlock(&req->rq_lock);
1350
1351         array->paa_reqs_count[req->rq_at_index]--;
1352         array->paa_count--;
1353 }
1354
1355 /*
1356  * Attempt to extend the request deadline by sending an early reply to the
1357  * client.
1358  */
1359 static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
1360 {
1361         struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
1362         struct ptlrpc_request *reqcopy;
1363         struct lustre_msg *reqmsg;
1364         timeout_t olddl = req->rq_deadline - ktime_get_real_seconds();
1365         time64_t newdl;
1366         int rc;
1367         struct obd_device *obd = NULL;
1368
1369         ENTRY;
1370
1371         if (req->rq_export)
1372                 obd = req->rq_export->exp_obd;
1373
1374         if (CFS_FAIL_CHECK(OBD_FAIL_TGT_REPLAY_RECONNECT) ||
1375             CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_ENQ_RESEND)) {
1376                 /* don't send early reply */
1377                 RETURN(1);
1378         }
1379
1380         /*
1381          * deadline is when the client expects us to reply, margin is the
1382          * difference between clients' and servers' expectations
1383          */
1384         DEBUG_REQ(D_ADAPTTO, req,
1385                   "%ssending early reply (deadline %+ds, margin %+ds) for %d+%d",
1386                   obd_at_off(obd) ? "AT off - not " : "",
1387                   olddl, olddl - obd_at_get(obd, &svcpt->scp_at_estimate),
1388                   obd_at_get(obd, &svcpt->scp_at_estimate), at_extra);
1389
1390         if (obd_at_off(obd))
1391                 RETURN(0);
1392
1393         if (olddl < 0) {
1394                 /* below message is checked in replay-ost-single.sh test_9 */
1395                 DEBUG_REQ(D_WARNING, req,
1396                           "Already past deadline (%+ds), not sending early reply. Consider increasing at_early_margin (%d)?",
1397                           olddl, at_early_margin);
1398
1399                 /* Return an error so we're not re-added to the timed list. */
1400                 RETURN(-ETIMEDOUT);
1401         }
1402
1403         if ((lustre_msghdr_get_flags(req->rq_reqmsg) &
1404              MSGHDR_AT_SUPPORT) == 0) {
1405                 DEBUG_REQ(D_INFO, req,
1406                           "Wanted to ask client for more time, but no AT support");
1407                 RETURN(-ENOSYS);
1408         }
1409
1410         if (req->rq_export &&
1411             lustre_msg_get_flags(req->rq_reqmsg) &
1412             (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) {
1413                 struct obd_device *obd_exp = req->rq_export->exp_obd;
1414
1415                 /*
1416                  * During recovery, we don't want to send too many early
1417                  * replies, but on the other hand we want to make sure the
1418                  * client has enough time to resend if the rpc is lost. So
1419                  * during the recovery period send at least 4 early replies,
1420                  * spacing them every at_extra if we can. at_estimate should
1421                  * always equal this fixed value during recovery.
1422                  */
1423
1424                 /*
1425                  * Don't account request processing time into AT history
1426                  * during recovery, it is not service time we need but
1427                  * includes also waiting time for recovering clients
1428                  */
1429                 newdl = min_t(time64_t, at_extra,
1430                               obd_exp->obd_recovery_timeout / 4) +
1431                         ktime_get_real_seconds();
1432         } else {
1433                 /*
1434                  * We want to extend the request deadline by at_extra seconds,
1435                  * so we set our service estimate to reflect how much time has
1436                  * passed since this request arrived plus an additional
1437                  * at_extra seconds. The client will calculate the new deadline
1438                  * based on this service estimate (plus some additional time to
1439                  * account for network latency). See ptlrpc_at_recv_early_reply
1440                  */
1441                 obd_at_measure(obd, &svcpt->scp_at_estimate, at_extra +
1442                             ktime_get_real_seconds() -
1443                             req->rq_arrival_time.tv_sec);
1444                 newdl = req->rq_arrival_time.tv_sec +
1445                         obd_at_get(obd, &svcpt->scp_at_estimate);
1446         }
1447
1448         /*
1449          * Check to see if we've actually increased the deadline -
1450          * we may be past adaptive_max
1451          */
1452         if (req->rq_deadline >= newdl) {
1453                 DEBUG_REQ(D_WARNING, req,
1454                           "Could not add any time (%d/%lld), not sending early reply",
1455                           olddl, newdl - ktime_get_real_seconds());
1456                 RETURN(-ETIMEDOUT);
1457         }
1458
1459         reqcopy = ptlrpc_request_cache_alloc(GFP_NOFS);
1460         if (reqcopy == NULL)
1461                 RETURN(-ENOMEM);
1462         OBD_ALLOC_LARGE(reqmsg, req->rq_reqlen);
1463         if (!reqmsg)
1464                 GOTO(out_free, rc = -ENOMEM);
1465
1466         *reqcopy = *req;
1467         spin_lock_init(&reqcopy->rq_early_free_lock);
1468         reqcopy->rq_reply_state = NULL;
1469         reqcopy->rq_rep_swab_mask = 0;
1470         reqcopy->rq_pack_bulk = 0;
1471         reqcopy->rq_pack_udesc = 0;
1472         reqcopy->rq_packed_final = 0;
1473         sptlrpc_svc_ctx_addref(reqcopy);
1474         /* We only need the reqmsg for the magic */
1475         reqcopy->rq_reqmsg = reqmsg;
1476         memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen);
1477
1478         /*
1479          * tgt_brw_read() and tgt_brw_write() may have decided not to reply.
1480          * Without this check, we would fail the rq_no_reply assertion in
1481          * ptlrpc_send_reply().
1482          */
1483         if (reqcopy->rq_no_reply)
1484                 GOTO(out, rc = -ETIMEDOUT);
1485
1486         LASSERT(atomic_read(&req->rq_refcount));
1487         /* if it is last refcount then early reply isn't needed */
1488         if (atomic_read(&req->rq_refcount) == 1) {
1489                 DEBUG_REQ(D_ADAPTTO, reqcopy,
1490                           "Normal reply already sent, abort early reply");
1491                 GOTO(out, rc = -EINVAL);
1492         }
1493
1494         /* Connection ref */
1495         reqcopy->rq_export = class_conn2export(
1496                         lustre_msg_get_handle(reqcopy->rq_reqmsg));
1497         if (reqcopy->rq_export == NULL)
1498                 GOTO(out, rc = -ENODEV);
1499
1500         /* RPC ref */
1501         class_export_rpc_inc(reqcopy->rq_export);
1502         if (reqcopy->rq_export->exp_obd &&
1503             reqcopy->rq_export->exp_obd->obd_fail)
1504                 GOTO(out_put, rc = -ENODEV);
1505
1506         rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY);
1507         if (rc)
1508                 GOTO(out_put, rc);
1509
1510         rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY);
1511
1512         if (!rc) {
1513                 /* Adjust our own deadline to what we told the client */
1514                 req->rq_deadline = newdl;
1515                 req->rq_early_count++; /* number sent, server side */
1516         } else {
1517                 DEBUG_REQ(D_ERROR, req, "Early reply send failed: rc = %d", rc);
1518         }
1519
1520         /*
1521          * Free the (early) reply state from lustre_pack_reply.
1522          * (ptlrpc_send_reply takes it's own rs ref, so this is safe here)
1523          */
1524         ptlrpc_req_drop_rs(reqcopy);
1525
1526 out_put:
1527         class_export_rpc_dec(reqcopy->rq_export);
1528         class_export_put(reqcopy->rq_export);
1529 out:
1530         sptlrpc_svc_ctx_decref(reqcopy);
1531         OBD_FREE_LARGE(reqmsg, req->rq_reqlen);
1532 out_free:
1533         ptlrpc_request_cache_free(reqcopy);
1534         RETURN(rc);
1535 }
1536
1537 /*
1538  * Send early replies to everybody expiring within at_early_margin
1539  * asking for at_extra time
1540  */
1541 static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt)
1542 {
1543         struct ptlrpc_at_array *array = &svcpt->scp_at_array;
1544         struct ptlrpc_request *rq, *n;
1545         LIST_HEAD(work_list);
1546         __u32 index, count;
1547         time64_t deadline;
1548         time64_t now = ktime_get_real_seconds();
1549         s64 delay_ms;
1550         int first, counter = 0;
1551
1552         ENTRY;
1553         spin_lock(&svcpt->scp_at_lock);
1554         if (svcpt->scp_at_check == 0) {
1555                 spin_unlock(&svcpt->scp_at_lock);
1556                 RETURN(0);
1557         }
1558         delay_ms = ktime_ms_delta(ktime_get(), svcpt->scp_at_checktime);
1559         svcpt->scp_at_check = 0;
1560
1561         if (array->paa_count == 0) {
1562                 spin_unlock(&svcpt->scp_at_lock);
1563                 RETURN(0);
1564         }
1565
1566         /* The timer went off, but maybe the nearest rpc already completed. */
1567         first = array->paa_deadline - now;
1568         if (first > at_early_margin) {
1569                 /* We've still got plenty of time.  Reset the timer. */
1570                 ptlrpc_at_set_timer(svcpt);
1571                 spin_unlock(&svcpt->scp_at_lock);
1572                 RETURN(0);
1573         }
1574
1575         /*
1576          * We're close to a timeout, and we don't know how much longer the
1577          * server will take. Send early replies to everyone expiring soon.
1578          */
1579         deadline = -1;
1580         div_u64_rem(array->paa_deadline, array->paa_size, &index);
1581         count = array->paa_count;
1582         while (count > 0) {
1583                 count -= array->paa_reqs_count[index];
1584                 list_for_each_entry_safe(rq, n,
1585                                          &array->paa_reqs_array[index],
1586                                          rq_timed_list) {
1587                         if (rq->rq_deadline > now + at_early_margin) {
1588                                 /* update the earliest deadline */
1589                                 if (deadline == -1 ||
1590                                     rq->rq_deadline < deadline)
1591                                         deadline = rq->rq_deadline;
1592                                 break;
1593                         }
1594
1595                         /**
1596                          * ptlrpc_server_drop_request() may drop
1597                          * refcount to 0 already. Let's check this and
1598                          * don't add entry to work_list
1599                          */
1600                         if (likely(atomic_inc_not_zero(&rq->rq_refcount))) {
1601                                 ptlrpc_at_remove_timed(rq);
1602                                 list_add(&rq->rq_timed_list, &work_list);
1603                         } else {
1604                                 ptlrpc_at_remove_timed(rq);
1605                         }
1606
1607                         counter++;
1608                 }
1609
1610                 if (++index >= array->paa_size)
1611                         index = 0;
1612         }
1613         array->paa_deadline = deadline;
1614         /* we have a new earliest deadline, restart the timer */
1615         ptlrpc_at_set_timer(svcpt);
1616
1617         spin_unlock(&svcpt->scp_at_lock);
1618
1619         CDEBUG(D_ADAPTTO,
1620                "timeout in %+ds, asking for %d secs on %d early replies\n",
1621                first, at_extra, counter);
1622         if (first < 0) {
1623                 /*
1624                  * We're already past request deadlines before we even get a
1625                  * chance to send early replies
1626                  */
1627                 timeout_t atg = obd_at_get((struct obd_device *)NULL,
1628                                            &svcpt->scp_at_estimate);
1629                 LCONSOLE_WARN("'%s' is processing requests too slowly, client may timeout. Late by %ds, missed %d early replies (reqs waiting=%d active=%d, at_estimate=%d, delay=%lldms)\n",
1630                               svcpt->scp_service->srv_name, -first, counter,
1631                               svcpt->scp_nreqs_incoming,
1632                               svcpt->scp_nreqs_active,
1633                               atg,
1634                               delay_ms);
1635         }
1636
1637         /*
1638          * we took additional refcount so entries can't be deleted from list, no
1639          * locking is needed
1640          */
1641         while ((rq = list_first_entry_or_null(&work_list,
1642                                               struct ptlrpc_request,
1643                                               rq_timed_list)) != NULL) {
1644                 list_del_init(&rq->rq_timed_list);
1645
1646                 if (ptlrpc_at_send_early_reply(rq) == 0)
1647                         ptlrpc_at_add_timed(rq);
1648
1649                 ptlrpc_server_drop_request(rq);
1650         }
1651
1652         RETURN(1); /* return "did_something" for liblustre */
1653 }
1654
1655 /*
1656  * Check if we are already handling earlier incarnation of this request.
1657  * Called under &req->rq_export->exp_rpc_lock locked
1658  */
1659 static struct ptlrpc_request*
1660 ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req)
1661 {
1662         struct ptlrpc_request *tmp = NULL;
1663
1664         if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))
1665                 return NULL;
1666
1667         /*
1668          * This list should not be longer than max_requests in
1669          * flights on the client, so it is not all that long.
1670          * Also we only hit this codepath in case of a resent
1671          * request which makes it even more rarely hit
1672          */
1673         list_for_each_entry(tmp, &req->rq_export->exp_reg_rpcs,
1674                                 rq_exp_list) {
1675                 /* Found duplicate one */
1676                 if (tmp->rq_xid == req->rq_xid)
1677                         goto found;
1678         }
1679         list_for_each_entry(tmp, &req->rq_export->exp_hp_rpcs,
1680                                 rq_exp_list) {
1681                 /* Found duplicate one */
1682                 if (tmp->rq_xid == req->rq_xid)
1683                         goto found;
1684         }
1685         return NULL;
1686
1687 found:
1688         DEBUG_REQ(D_HA, req, "Found duplicate req in processing");
1689         DEBUG_REQ(D_HA, tmp, "Request being processed");
1690         return tmp;
1691 }
1692
1693 #ifdef HAVE_SERVER_SUPPORT
1694 static void ptlrpc_server_mark_obsolete(struct ptlrpc_request *req)
1695 {
1696         spin_lock(&req->rq_lock);
1697         req->rq_obsolete = 1;
1698         spin_unlock(&req->rq_lock);
1699 }
1700
1701 static void
1702 ptlrpc_server_mark_in_progress_obsolete(struct ptlrpc_request *req)
1703 {
1704         struct ptlrpc_request   *tmp = NULL;
1705         __u16                   tag;
1706
1707         if (!tgt_is_increasing_xid_client(req->rq_export) ||
1708             req->rq_export->exp_used_slots == NULL)
1709                 return;
1710
1711         tag = lustre_msg_get_tag(req->rq_reqmsg);
1712         if (tag == 0)
1713                 return;
1714
1715         if (!test_bit(tag - 1, req->rq_export->exp_used_slots))
1716                 return;
1717
1718         /* This list should not be longer than max_requests in
1719          * flights on the client, so it is not all that long.
1720          * Also we only hit this codepath in case of a resent
1721          * request which makes it even more rarely hit */
1722         list_for_each_entry(tmp, &req->rq_export->exp_reg_rpcs, rq_exp_list) {
1723                 if (tag == lustre_msg_get_tag(tmp->rq_reqmsg) &&
1724                     req->rq_xid > tmp->rq_xid)
1725                         ptlrpc_server_mark_obsolete(tmp);
1726
1727         }
1728         list_for_each_entry(tmp, &req->rq_export->exp_hp_rpcs, rq_exp_list) {
1729                 if (tag == lustre_msg_get_tag(tmp->rq_reqmsg) &&
1730                     req->rq_xid > tmp->rq_xid)
1731                         ptlrpc_server_mark_obsolete(tmp);
1732         }
1733 }
1734 #endif
1735
1736 /**
1737  * Check if a request should be assigned with a high priority.
1738  *
1739  * \retval      < 0: error occurred
1740  *                0: normal RPC request
1741  *               +1: high priority request
1742  */
1743 static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt,
1744                                     struct ptlrpc_request *req)
1745 {
1746         int rc = 0;
1747
1748         ENTRY;
1749         if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) {
1750                 rc = svcpt->scp_service->srv_ops.so_hpreq_handler(req);
1751                 if (rc < 0)
1752                         RETURN(rc);
1753
1754                 LASSERT(rc == 0);
1755         }
1756
1757         if (req->rq_export != NULL && req->rq_ops != NULL) {
1758                 /*
1759                  * Perform request specific check. We should do this
1760                  * check before the request is added into exp_hp_rpcs
1761                  * list otherwise it may hit swab race at LU-1044.
1762                  */
1763                 if (req->rq_ops->hpreq_check != NULL) {
1764                         rc = req->rq_ops->hpreq_check(req);
1765                         if (rc == -ESTALE) {
1766                                 req->rq_status = rc;
1767                                 ptlrpc_error(req);
1768                         }
1769                         /*
1770                          * can only return error,
1771                          * 0 for normal request,
1772                          * or 1 for high priority request
1773                          */
1774                         LASSERT(rc <= 1);
1775                 }
1776         }
1777
1778         RETURN(rc);
1779 }
1780
1781 /** Remove the request from the export list. */
1782 static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req)
1783 {
1784         ENTRY;
1785         if (req->rq_export) {
1786                 /*
1787                  * refresh lock timeout again so that client has more
1788                  * room to send lock cancel RPC.
1789                  */
1790                 if (req->rq_ops && req->rq_ops->hpreq_fini)
1791                         req->rq_ops->hpreq_fini(req);
1792
1793                 ptlrpc_del_exp_list(req);
1794         }
1795         EXIT;
1796 }
1797
1798 static int ptlrpc_hpreq_check(struct ptlrpc_request *req)
1799 {
1800         return 1;
1801 }
1802
1803 static struct ptlrpc_hpreq_ops ptlrpc_hpreq_common = {
1804         .hpreq_check       = ptlrpc_hpreq_check,
1805 };
1806
1807 /* Hi-Priority RPC check by RPC operation code. */
1808 int ptlrpc_hpreq_handler(struct ptlrpc_request *req)
1809 {
1810         int opc = lustre_msg_get_opc(req->rq_reqmsg);
1811
1812         /*
1813          * Check for export to let only reconnects for not yet evicted
1814          * export to become a HP rpc.
1815          */
1816         if ((req->rq_export != NULL) &&
1817             (opc == OBD_PING || opc == MDS_CONNECT || opc == OST_CONNECT))
1818                 req->rq_ops = &ptlrpc_hpreq_common;
1819
1820         return 0;
1821 }
1822 EXPORT_SYMBOL(ptlrpc_hpreq_handler);
1823
1824 static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt,
1825                                      struct ptlrpc_request *req)
1826 {
1827         int rc;
1828         bool hp;
1829         struct ptlrpc_request *orig;
1830
1831         ENTRY;
1832
1833         rc = ptlrpc_server_hpreq_init(svcpt, req);
1834         if (rc < 0)
1835                 RETURN(rc);
1836
1837         hp = rc > 0;
1838         ptlrpc_nrs_req_initialize(svcpt, req, hp);
1839
1840         while (req->rq_export != NULL) {
1841                 struct obd_export *exp = req->rq_export;
1842
1843                 /*
1844                  * do search for duplicated xid and the adding to the list
1845                  * atomically
1846                  */
1847                 spin_lock_bh(&exp->exp_rpc_lock);
1848 #ifdef HAVE_SERVER_SUPPORT
1849                 ptlrpc_server_mark_in_progress_obsolete(req);
1850 #endif
1851                 orig = ptlrpc_server_check_resend_in_progress(req);
1852                 if (orig && CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_RESEND_RACE)) {
1853                         spin_unlock_bh(&exp->exp_rpc_lock);
1854
1855                         CFS_RACE(OBD_FAIL_PTLRPC_RESEND_RACE);
1856                         msleep(4 * MSEC_PER_SEC);
1857                         continue;
1858                 }
1859
1860                 if (orig && likely(atomic_inc_not_zero(&orig->rq_refcount))) {
1861                         bool linked;
1862
1863                         spin_unlock_bh(&exp->exp_rpc_lock);
1864
1865                         /*
1866                          * When the client resend request and the server has
1867                          * the previous copy of it, we need to update deadlines,
1868                          * to be sure that the client and the server have equal
1869                          *  request deadlines.
1870                          */
1871
1872                         spin_lock(&orig->rq_rqbd->rqbd_svcpt->scp_at_lock);
1873                         linked = orig->rq_at_linked;
1874                         if (likely(linked))
1875                                 ptlrpc_at_remove_timed(orig);
1876                         spin_unlock(&orig->rq_rqbd->rqbd_svcpt->scp_at_lock);
1877                         orig->rq_deadline = req->rq_deadline;
1878                         orig->rq_rep_mbits = req->rq_rep_mbits;
1879                         if (likely(linked))
1880                                 ptlrpc_at_add_timed(orig);
1881                         ptlrpc_server_drop_request(orig);
1882                         ptlrpc_nrs_req_finalize(req);
1883
1884                         /* don't mark slot unused for resend in progress */
1885                         spin_lock(&req->rq_lock);
1886                         req->rq_obsolete = 1;
1887                         spin_unlock(&req->rq_lock);
1888
1889                         RETURN(-EBUSY);
1890                 }
1891
1892                 ptlrpc_add_exp_list_nolock(req, exp, hp || req->rq_ops != NULL);
1893
1894                 spin_unlock_bh(&exp->exp_rpc_lock);
1895                 break;
1896         }
1897
1898         /*
1899          * the current thread is not the processing thread for this request
1900          * since that, but request is in exp_hp_list and can be find there.
1901          * Remove all relations between request and old thread.
1902          */
1903         req->rq_svc_thread->t_env->le_ses = NULL;
1904         req->rq_svc_thread = NULL;
1905         req->rq_session.lc_thread = NULL;
1906
1907         ptlrpc_nrs_req_add(svcpt, req, hp);
1908
1909         RETURN(0);
1910 }
1911
1912 /**
1913  * Allow to handle high priority request
1914  * User can call it w/o any lock but need to hold
1915  * ptlrpc_service_part::scp_req_lock to get reliable result
1916  */
1917 static bool ptlrpc_server_allow_high(struct ptlrpc_service_part *svcpt,
1918                                      bool force)
1919 {
1920         int running = svcpt->scp_nthrs_running;
1921
1922         if (!nrs_svcpt_has_hp(svcpt))
1923                 return false;
1924
1925         if (force)
1926                 return true;
1927
1928         if (ptlrpc_nrs_req_throttling_nolock(svcpt, true))
1929                 return false;
1930
1931         if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL &&
1932                      CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) {
1933                 /* leave just 1 thread for normal RPCs */
1934                 running = PTLRPC_NTHRS_INIT;
1935                 if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL)
1936                         running += 1;
1937         }
1938
1939         if (svcpt->scp_nreqs_active >= running - 1)
1940                 return false;
1941
1942         if (svcpt->scp_nhreqs_active == 0)
1943                 return true;
1944
1945         return !ptlrpc_nrs_req_pending_nolock(svcpt, false) ||
1946                svcpt->scp_hreq_count < svcpt->scp_service->srv_hpreq_ratio;
1947 }
1948
1949 static bool ptlrpc_server_high_pending(struct ptlrpc_service_part *svcpt,
1950                                        bool force)
1951 {
1952         return ptlrpc_server_allow_high(svcpt, force) &&
1953                ptlrpc_nrs_req_pending_nolock(svcpt, true);
1954 }
1955
1956 /**
1957  * Only allow normal priority requests on a service that has a high-priority
1958  * queue if forced (i.e. cleanup), if there are other high priority requests
1959  * already being processed (i.e. those threads can service more high-priority
1960  * requests), or if there are enough idle threads that a later thread can do
1961  * a high priority request.
1962  * User can call it w/o any lock but need to hold
1963  * ptlrpc_service_part::scp_req_lock to get reliable result
1964  */
1965 static bool ptlrpc_server_allow_normal(struct ptlrpc_service_part *svcpt,
1966                                        bool force)
1967 {
1968         int running = svcpt->scp_nthrs_running;
1969
1970         if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL &&
1971                      CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) {
1972                 /* leave just 1 thread for normal RPCs */
1973                 running = PTLRPC_NTHRS_INIT;
1974                 if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL)
1975                         running += 1;
1976         }
1977
1978         if (force)
1979                 return true;
1980
1981         if (ptlrpc_nrs_req_throttling_nolock(svcpt, false))
1982                 return false;
1983
1984         if (svcpt->scp_nreqs_active < running - 2)
1985                 return true;
1986
1987         if (svcpt->scp_nreqs_active >= running - 1)
1988                 return false;
1989
1990         return svcpt->scp_nhreqs_active > 0 || !nrs_svcpt_has_hp(svcpt);
1991 }
1992
1993 static bool ptlrpc_server_normal_pending(struct ptlrpc_service_part *svcpt,
1994                                          bool force)
1995 {
1996         return ptlrpc_server_allow_normal(svcpt, force) &&
1997                ptlrpc_nrs_req_pending_nolock(svcpt, false);
1998 }
1999
2000 /**
2001  * Returns true if there are requests available in incoming
2002  * request queue for processing and it is allowed to fetch them.
2003  * User can call it w/o any lock but need to hold ptlrpc_service::scp_req_lock
2004  * to get reliable result
2005  * \see ptlrpc_server_allow_normal
2006  * \see ptlrpc_server_allow high
2007  */
2008 static inline
2009 bool ptlrpc_server_request_pending(struct ptlrpc_service_part *svcpt,
2010                                    bool force)
2011 {
2012         return ptlrpc_server_high_pending(svcpt, force) ||
2013                ptlrpc_server_normal_pending(svcpt, force);
2014 }
2015
2016 /**
2017  * Fetch a request for processing from queue of unprocessed requests.
2018  * Favors high-priority requests.
2019  * Returns a pointer to fetched request.
2020  */
2021 static struct ptlrpc_request *
2022 ptlrpc_server_request_get(struct ptlrpc_service_part *svcpt, bool force)
2023 {
2024         struct ptlrpc_request *req = NULL;
2025
2026         ENTRY;
2027
2028         spin_lock(&svcpt->scp_req_lock);
2029
2030         if (ptlrpc_server_high_pending(svcpt, force)) {
2031                 req = ptlrpc_nrs_req_get_nolock(svcpt, true, force);
2032                 if (req != NULL) {
2033                         svcpt->scp_hreq_count++;
2034                         goto got_request;
2035                 }
2036         }
2037
2038         if (ptlrpc_server_normal_pending(svcpt, force)) {
2039                 req = ptlrpc_nrs_req_get_nolock(svcpt, false, force);
2040                 if (req != NULL) {
2041                         svcpt->scp_hreq_count = 0;
2042                         goto got_request;
2043                 }
2044         }
2045
2046         spin_unlock(&svcpt->scp_req_lock);
2047         RETURN(NULL);
2048
2049 got_request:
2050         svcpt->scp_nreqs_active++;
2051         if (req->rq_hp)
2052                 svcpt->scp_nhreqs_active++;
2053
2054         spin_unlock(&svcpt->scp_req_lock);
2055
2056         if (likely(req->rq_export))
2057                 class_export_rpc_inc(req->rq_export);
2058
2059         RETURN(req);
2060 }
2061
2062 /**
2063  * Handle freshly incoming reqs, add to timed early reply list,
2064  * pass on to regular request queue.
2065  * All incoming requests pass through here before getting into
2066  * ptlrpc_server_handle_req later on.
2067  */
2068 static int ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt,
2069                                        struct ptlrpc_thread *thread)
2070 {
2071         struct ptlrpc_service *svc = svcpt->scp_service;
2072         struct ptlrpc_request *req;
2073         __u32 deadline;
2074         __u32 opc;
2075         int rc;
2076
2077         ENTRY;
2078
2079         spin_lock(&svcpt->scp_lock);
2080         if (list_empty(&svcpt->scp_req_incoming)) {
2081                 spin_unlock(&svcpt->scp_lock);
2082                 RETURN(0);
2083         }
2084
2085         req = list_first_entry(&svcpt->scp_req_incoming,
2086                                struct ptlrpc_request, rq_list);
2087         list_del_init(&req->rq_list);
2088         svcpt->scp_nreqs_incoming--;
2089         /*
2090          * Consider this still a "queued" request as far as stats are
2091          * concerned
2092          */
2093         spin_unlock(&svcpt->scp_lock);
2094
2095         /* go through security check/transform */
2096         CDEBUG(D_RPCTRACE, "unwrap req x%llu\n", req->rq_xid);
2097         rc = sptlrpc_svc_unwrap_request(req);
2098         switch (rc) {
2099         case SECSVC_OK:
2100                 break;
2101         case SECSVC_COMPLETE:
2102                 target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET);
2103                 goto err_req;
2104         case SECSVC_DROP:
2105                 goto err_req;
2106         default:
2107                 LBUG();
2108         }
2109
2110         /*
2111          * for null-flavored rpc, msg has been unpacked by sptlrpc, although
2112          * redo it wouldn't be harmful.
2113          */
2114         if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) {
2115                 rc = ptlrpc_unpack_req_msg(req, req->rq_reqlen);
2116                 if (rc != 0) {
2117                         CERROR("error unpacking request: ptl %d from %s x%llu\n",
2118                                svc->srv_req_portal, libcfs_idstr(&req->rq_peer),
2119                                req->rq_xid);
2120                         goto err_req;
2121                 }
2122         }
2123
2124         rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
2125         if (rc) {
2126                 CERROR("error unpacking ptlrpc body: ptl %d from %s x %llu\n",
2127                        svc->srv_req_portal, libcfs_idstr(&req->rq_peer),
2128                        req->rq_xid);
2129                 goto err_req;
2130         }
2131
2132         opc = lustre_msg_get_opc(req->rq_reqmsg);
2133         if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_REQ_OPC) &&
2134             opc == cfs_fail_val) {
2135                 CERROR("drop incoming rpc opc %u, x%llu\n",
2136                        cfs_fail_val, req->rq_xid);
2137                 goto err_req;
2138         }
2139
2140         rc = -EINVAL;
2141         if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) {
2142                 CERROR("wrong packet type received (type=%u) from %s\n",
2143                        lustre_msg_get_type(req->rq_reqmsg),
2144                        libcfs_idstr(&req->rq_peer));
2145                 goto err_req;
2146         }
2147
2148         switch (opc) {
2149         case MDS_WRITEPAGE:
2150         case OST_WRITE:
2151         case OUT_UPDATE:
2152                 req->rq_bulk_write = 1;
2153                 break;
2154         case MDS_READPAGE:
2155         case OST_READ:
2156         case MGS_CONFIG_READ:
2157                 req->rq_bulk_read = 1;
2158                 break;
2159         }
2160
2161         CDEBUG(D_RPCTRACE, "got req x%llu\n", req->rq_xid);
2162
2163         req->rq_export = class_conn2export(
2164                 lustre_msg_get_handle(req->rq_reqmsg));
2165         if (req->rq_export) {
2166                 rc = ptlrpc_check_req(req);
2167                 if (rc == 0) {
2168                         rc = sptlrpc_target_export_check(req->rq_export, req);
2169                         if (rc)
2170                                 DEBUG_REQ(D_ERROR, req,
2171                                           "DROPPING req with illegal security flavor");
2172                 }
2173
2174                 if (rc)
2175                         goto err_req;
2176                 ptlrpc_update_export_timer(req->rq_export, 0);
2177         }
2178
2179         /* req_in handling should/must be fast */
2180         if (ktime_get_real_seconds() - req->rq_arrival_time.tv_sec > 5)
2181                 DEBUG_REQ(D_WARNING, req, "Slow req_in handling %llds",
2182                           ktime_get_real_seconds() -
2183                           req->rq_arrival_time.tv_sec);
2184
2185         /* Set rpc server deadline and add it to the timed list */
2186         deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) &
2187                     MSGHDR_AT_SUPPORT) ?
2188                     /* The max time the client expects us to take */
2189                     lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout;
2190
2191         req->rq_deadline = req->rq_arrival_time.tv_sec + deadline;
2192         if (unlikely(deadline == 0)) {
2193                 DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout");
2194                 goto err_req;
2195         }
2196
2197         /* Skip early reply */
2198         if (CFS_FAIL_PRECHECK(OBD_FAIL_MDS_RESEND))
2199                 req->rq_deadline += obd_timeout;
2200
2201         req->rq_svc_thread = thread;
2202         if (thread != NULL) {
2203                 /*
2204                  * initialize request session, it is needed for request
2205                  * processing by target
2206                  */
2207                 rc = lu_context_init(&req->rq_session, LCT_SERVER_SESSION |
2208                                                        LCT_NOREF);
2209                 if (rc) {
2210                         CERROR("%s: failure to initialize session: rc = %d\n",
2211                                thread->t_name, rc);
2212                         goto err_req;
2213                 }
2214                 req->rq_session.lc_thread = thread;
2215                 lu_context_enter(&req->rq_session);
2216                 thread->t_env->le_ses = &req->rq_session;
2217         }
2218
2219
2220         if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_ENQ_RESEND) &&
2221                      (opc == LDLM_ENQUEUE) &&
2222                      (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT)))
2223                 CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_ENQ_RESEND, 6);
2224
2225         ptlrpc_at_add_timed(req);
2226
2227         if (opc != OST_CONNECT && opc != MDS_CONNECT &&
2228             opc != MGS_CONNECT && req->rq_export != NULL) {
2229                 if (exp_connect_flags2(req->rq_export) & OBD_CONNECT2_REP_MBITS)
2230                         req->rq_rep_mbits = lustre_msg_get_mbits(req->rq_reqmsg);
2231         }
2232
2233         /* Move it over to the request processing queue */
2234         rc = ptlrpc_server_request_add(svcpt, req);
2235         if (rc)
2236                 GOTO(err_req, rc);
2237
2238         wake_up(&svcpt->scp_waitq);
2239         RETURN(1);
2240
2241 err_req:
2242         CDEBUG(D_RPCTRACE, "finish req x%llu\n", req->rq_xid);
2243         ptlrpc_server_finish_request(svcpt, req);
2244
2245         RETURN(1);
2246 }
2247
2248 /**
2249  * Main incoming request handling logic.
2250  * Calls handler function from service to do actual processing.
2251  */
2252 static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
2253                                         struct ptlrpc_thread *thread)
2254 {
2255         struct ptlrpc_service *svc = svcpt->scp_service;
2256         struct ptlrpc_request *request;
2257         ktime_t work_start;
2258         ktime_t work_end;
2259         ktime_t arrived;
2260         s64 timediff_usecs;
2261         s64 arrived_usecs;
2262         int fail_opc = 0;
2263         struct obd_device *obd = NULL;
2264
2265         ENTRY;
2266
2267         request = ptlrpc_server_request_get(svcpt, false);
2268         if (request == NULL)
2269                 RETURN(0);
2270
2271         if (request->rq_export)
2272                 obd = request->rq_export->exp_obd;
2273
2274         if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT))
2275                 fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT;
2276         else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT))
2277                 fail_opc = OBD_FAIL_PTLRPC_HPREQ_TIMEOUT;
2278
2279         if (unlikely(fail_opc)) {
2280                 if (request->rq_export && request->rq_ops)
2281                         CFS_FAIL_TIMEOUT(fail_opc, 4);
2282         }
2283
2284         ptlrpc_rqphase_move(request, RQ_PHASE_INTERPRET);
2285
2286         if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG))
2287                 libcfs_debug_dumplog();
2288
2289         work_start = ktime_get_real();
2290         arrived = timespec64_to_ktime(request->rq_arrival_time);
2291         timediff_usecs = ktime_us_delta(work_start, arrived);
2292         if (likely(svc->srv_stats != NULL)) {
2293                 lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR,
2294                                     timediff_usecs);
2295                 lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR,
2296                                     svcpt->scp_nreqs_incoming);
2297                 lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR,
2298                                     svcpt->scp_nreqs_active);
2299                 lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT,
2300                                     obd_at_get(obd, &svcpt->scp_at_estimate));
2301         }
2302
2303         if (likely(request->rq_export)) {
2304                 if (unlikely(ptlrpc_check_req(request)))
2305                         goto put_conn;
2306                 ptlrpc_update_export_timer(request->rq_export,
2307                                            div_u64(timediff_usecs,
2308                                                    USEC_PER_SEC / 2));
2309         }
2310
2311         /*
2312          * Discard requests queued for longer than the deadline.
2313          * The deadline is increased if we send an early reply.
2314          */
2315         if (ktime_get_real_seconds() > request->rq_deadline) {
2316                 DEBUG_REQ(D_ERROR, request,
2317                           "Dropping timed-out request from %s: deadline %lld/%llds ago",
2318                           libcfs_idstr(&request->rq_peer),
2319                           request->rq_deadline -
2320                           request->rq_arrival_time.tv_sec,
2321                           ktime_get_real_seconds() - request->rq_deadline);
2322                 goto put_conn;
2323         }
2324
2325         CDEBUG(D_RPCTRACE,
2326                "Handling RPC req@%p pname:cluuid+ref:pid:xid:nid:opc:job %s:%s+%d:%d:x%llu:%s:%d:%s\n",
2327                request, current->comm,
2328                (request->rq_export ?
2329                 (char *)request->rq_export->exp_client_uuid.uuid : "0"),
2330                (request->rq_export ?
2331                 refcount_read(&request->rq_export->exp_handle.h_ref) : -99),
2332                lustre_msg_get_status(request->rq_reqmsg), request->rq_xid,
2333                libcfs_idstr(&request->rq_peer),
2334                lustre_msg_get_opc(request->rq_reqmsg),
2335                lustre_msg_get_jobid(request->rq_reqmsg) ?: "");
2336
2337         if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING)
2338                 CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val);
2339
2340         CDEBUG(D_NET, "got req %llu\n", request->rq_xid);
2341
2342         /* re-assign request and sesson thread to the current one */
2343         request->rq_svc_thread = thread;
2344         if (thread != NULL) {
2345                 LASSERT(request->rq_session.lc_thread == NULL);
2346                 request->rq_session.lc_thread = thread;
2347                 thread->t_env->le_ses = &request->rq_session;
2348         }
2349         svc->srv_ops.so_req_handler(request);
2350
2351         ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE);
2352
2353 put_conn:
2354         if (unlikely(ktime_get_real_seconds() > request->rq_deadline)) {
2355                 DEBUG_REQ(D_WARNING, request,
2356                           "Request took longer than estimated (%lld/%llds); client may timeout",
2357                           request->rq_deadline -
2358                           request->rq_arrival_time.tv_sec,
2359                           ktime_get_real_seconds() - request->rq_deadline);
2360         }
2361
2362         work_end = ktime_get_real();
2363         timediff_usecs = ktime_us_delta(work_end, work_start);
2364         arrived_usecs = ktime_us_delta(work_end, arrived);
2365         CDEBUG(D_RPCTRACE,
2366                "Handled RPC req@%p pname:cluuid+ref:pid:xid:nid:opc:job %s:%s+%d:%d:x%llu:%s:%d:%s Request processed in %lldus (%lldus total) trans %llu rc %d/%d\n",
2367                request, current->comm,
2368                (request->rq_export ?
2369                (char *)request->rq_export->exp_client_uuid.uuid : "0"),
2370                (request->rq_export ?
2371                 refcount_read(&request->rq_export->exp_handle.h_ref) : -99),
2372                lustre_msg_get_status(request->rq_reqmsg),
2373                request->rq_xid,
2374                libcfs_idstr(&request->rq_peer),
2375                lustre_msg_get_opc(request->rq_reqmsg),
2376                lustre_msg_get_jobid(request->rq_reqmsg) ?: "",
2377                timediff_usecs,
2378                arrived_usecs,
2379                (request->rq_repmsg ?
2380                lustre_msg_get_transno(request->rq_repmsg) :
2381                request->rq_transno),
2382                request->rq_status,
2383                (request->rq_repmsg ?
2384                lustre_msg_get_status(request->rq_repmsg) : -999));
2385         if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) {
2386                 __u32 op = lustre_msg_get_opc(request->rq_reqmsg);
2387                 int opc = opcode_offset(op);
2388
2389                 if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) {
2390                         LASSERT(opc < LUSTRE_MAX_OPCODES);
2391                         lprocfs_counter_add(svc->srv_stats,
2392                                             opc + EXTRA_MAX_OPCODES,
2393                                             timediff_usecs);
2394                 }
2395         }
2396         if (unlikely(request->rq_early_count)) {
2397                 DEBUG_REQ(D_ADAPTTO, request,
2398                           "sent %d early replies before finishing in %llds",
2399                           request->rq_early_count,
2400                           div_u64(arrived_usecs, USEC_PER_SEC));
2401         }
2402
2403         ptlrpc_server_finish_active_request(svcpt, request);
2404
2405         RETURN(1);
2406 }
2407
2408 /**
2409  * An internal function to process a single reply state object.
2410  */
2411 static int ptlrpc_handle_rs(struct ptlrpc_reply_state *rs)
2412 {
2413         struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
2414         struct ptlrpc_service *svc = svcpt->scp_service;
2415         struct obd_export *exp;
2416         int nlocks;
2417         int been_handled;
2418
2419         ENTRY;
2420
2421         exp = rs->rs_export;
2422
2423         LASSERT(rs->rs_difficult);
2424         LASSERT(rs->rs_scheduled);
2425         LASSERT(list_empty(&rs->rs_list));
2426
2427         /*
2428          * The disk commit callback holds exp_uncommitted_replies_lock while it
2429          * iterates over newly committed replies, removing them from
2430          * exp_uncommitted_replies.  It then drops this lock and schedules the
2431          * replies it found for handling here.
2432          *
2433          * We can avoid contention for exp_uncommitted_replies_lock between the
2434          * HRT threads and further commit callbacks by checking rs_committed
2435          * which is set in the commit callback while it holds both
2436          * rs_lock and exp_uncommitted_reples.
2437          *
2438          * If we see rs_committed clear, the commit callback _may_ not have
2439          * handled this reply yet and we race with it to grab
2440          * exp_uncommitted_replies_lock before removing the reply from
2441          * exp_uncommitted_replies.  Note that if we lose the race and the
2442          * reply has already been removed, list_del_init() is a noop.
2443          *
2444          * If we see rs_committed set, we know the commit callback is handling,
2445          * or has handled this reply since store reordering might allow us to
2446          * see rs_committed set out of sequence.  But since this is done
2447          * holding rs_lock, we can be sure it has all completed once we hold
2448          * rs_lock, which we do right next.
2449          */
2450         if (!rs->rs_committed) {
2451                 spin_lock(&exp->exp_uncommitted_replies_lock);
2452                 list_del_init(&rs->rs_obd_list);
2453                 spin_unlock(&exp->exp_uncommitted_replies_lock);
2454         }
2455
2456         spin_lock(&exp->exp_lock);
2457         /* Noop if removed already */
2458         list_del_init(&rs->rs_exp_list);
2459         spin_unlock(&exp->exp_lock);
2460
2461         spin_lock(&rs->rs_lock);
2462
2463         been_handled = rs->rs_handled;
2464         rs->rs_handled = 1;
2465
2466         nlocks = rs->rs_nlocks; /* atomic "steal", but */
2467         rs->rs_nlocks = 0; /* locks still on rs_locks! */
2468
2469         if (nlocks == 0 && !been_handled) {
2470                 /*
2471                  * If we see this, we should already have seen the warning
2472                  * in mds_steal_ack_locks()
2473                  */
2474                 CDEBUG(D_HA,
2475                        "All locks stolen from rs %p x%lld.t%lld o%d NID %s\n",
2476                        rs, rs->rs_xid, rs->rs_transno, rs->rs_opc,
2477                        libcfs_nidstr(&exp->exp_connection->c_peer.nid));
2478         }
2479
2480         if ((rs->rs_sent && !rs->rs_unlinked) || nlocks > 0) {
2481                 spin_unlock(&rs->rs_lock);
2482
2483                 /* We can unlink if the LNET_EVENT_SEND has occurred.
2484                  * If rs_unlinked is set then MD is already unlinked and no
2485                  * need to do so here.
2486                  */
2487                 if ((rs->rs_sent && !rs->rs_unlinked)) {
2488                         LNetMDUnlink(rs->rs_md_h);
2489                         /* Ignore return code; we're racing with completion */
2490                 }
2491
2492                 while (nlocks-- > 0)
2493                         ldlm_lock_decref(&rs->rs_locks[nlocks], LCK_TXN);
2494
2495                 spin_lock(&rs->rs_lock);
2496         }
2497
2498         rs->rs_scheduled = 0;
2499
2500         if (rs->rs_unlinked) {
2501                 /* Off the net */
2502                 spin_unlock(&rs->rs_lock);
2503
2504                 class_export_put(exp);
2505                 rs->rs_export = NULL;
2506                 ptlrpc_rs_decref(rs);
2507                 if (atomic_dec_and_test(&svcpt->scp_nreps_difficult) &&
2508                     svc->srv_is_stopping)
2509                         wake_up_all(&svcpt->scp_waitq);
2510                 RETURN(1);
2511         }
2512
2513         /* still on the net; callback will schedule */
2514         spin_unlock(&rs->rs_lock);
2515         RETURN(1);
2516 }
2517
2518
2519 static void ptlrpc_check_rqbd_pool(struct ptlrpc_service_part *svcpt)
2520 {
2521         int avail = svcpt->scp_nrqbds_posted;
2522         int low_water = test_req_buffer_pressure ? 0 :
2523                         svcpt->scp_service->srv_nbuf_per_group / 2;
2524
2525         /* NB I'm not locking; just looking. */
2526
2527         /*
2528          * CAVEAT EMPTOR: We might be allocating buffers here because we've
2529          * allowed the request history to grow out of control.  We could put a
2530          * sanity check on that here and cull some history if we need the
2531          * space.
2532          */
2533
2534         if (avail <= low_water)
2535                 ptlrpc_grow_req_bufs(svcpt, 1);
2536
2537         if (svcpt->scp_service->srv_stats) {
2538                 lprocfs_counter_add(svcpt->scp_service->srv_stats,
2539                                     PTLRPC_REQBUF_AVAIL_CNTR, avail);
2540         }
2541 }
2542
2543 static inline int ptlrpc_threads_enough(struct ptlrpc_service_part *svcpt)
2544 {
2545         return svcpt->scp_nreqs_active <
2546                svcpt->scp_nthrs_running - 1 -
2547                (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL);
2548 }
2549
2550 /**
2551  * allowed to create more threads
2552  * user can call it w/o any lock but need to hold
2553  * ptlrpc_service_part::scp_lock to get reliable result
2554  */
2555 static inline int ptlrpc_threads_increasable(struct ptlrpc_service_part *svcpt)
2556 {
2557         return svcpt->scp_nthrs_running +
2558                svcpt->scp_nthrs_starting <
2559                svcpt->scp_service->srv_nthrs_cpt_limit;
2560 }
2561
2562 /**
2563  * too many requests and allowed to create more threads
2564  */
2565 static inline int ptlrpc_threads_need_create(struct ptlrpc_service_part *svcpt)
2566 {
2567         return !ptlrpc_threads_enough(svcpt) &&
2568                 ptlrpc_threads_increasable(svcpt);
2569 }
2570
2571 static inline int ptlrpc_thread_stopping(struct ptlrpc_thread *thread)
2572 {
2573         return thread_is_stopping(thread) ||
2574                thread->t_svcpt->scp_service->srv_is_stopping;
2575 }
2576
2577 /* stop the highest numbered thread if there are too many threads running */
2578 static inline bool ptlrpc_thread_should_stop(struct ptlrpc_thread *thread)
2579 {
2580         struct ptlrpc_service_part *svcpt = thread->t_svcpt;
2581
2582         return thread->t_id >= svcpt->scp_service->srv_nthrs_cpt_limit &&
2583                 thread->t_id == svcpt->scp_thr_nextid - 1;
2584 }
2585
2586 static void ptlrpc_stop_thread(struct ptlrpc_thread *thread)
2587 {
2588         CDEBUG(D_INFO, "Stopping thread %s #%u\n",
2589                thread->t_svcpt->scp_service->srv_thread_name, thread->t_id);
2590         thread_add_flags(thread, SVC_STOPPING);
2591 }
2592
2593 static inline void ptlrpc_thread_stop(struct ptlrpc_thread *thread)
2594 {
2595         struct ptlrpc_service_part *svcpt = thread->t_svcpt;
2596
2597         spin_lock(&svcpt->scp_lock);
2598         if (ptlrpc_thread_should_stop(thread)) {
2599                 ptlrpc_stop_thread(thread);
2600                 svcpt->scp_thr_nextid--;
2601         }
2602         spin_unlock(&svcpt->scp_lock);
2603 }
2604
2605 static inline int ptlrpc_rqbd_pending(struct ptlrpc_service_part *svcpt)
2606 {
2607         return !list_empty(&svcpt->scp_rqbd_idle) &&
2608                svcpt->scp_rqbd_timeout == 0;
2609 }
2610
2611 static inline int
2612 ptlrpc_at_check(struct ptlrpc_service_part *svcpt)
2613 {
2614         return svcpt->scp_at_check;
2615 }
2616
2617 /*
2618  * If a thread runs too long or spends to much time on a single request,
2619  * we want to know about it, so we set up a delayed work item as a watchdog.
2620  * If it fires, we display a stack trace of the delayed thread,
2621  * providing we aren't rate-limited
2622  *
2623  * Watchdog stack traces are limited to 3 per 'libcfs_watchdog_ratelimit'
2624  * seconds
2625  */
2626 static struct ratelimit_state watchdog_limit;
2627
2628 static void ptlrpc_watchdog_fire(struct work_struct *w)
2629 {
2630         struct ptlrpc_thread *thread = container_of(w, struct ptlrpc_thread,
2631                                                     t_watchdog.work);
2632         u64 ms_lapse = ktime_ms_delta(ktime_get(), thread->t_touched);
2633         u32 ms_frac = do_div(ms_lapse, MSEC_PER_SEC);
2634
2635         /* ___ratelimit() returns true if the action is NOT ratelimited */
2636         if (__ratelimit(&watchdog_limit)) {
2637                 /* below message is checked in sanity-quota.sh test_6,18 */
2638                 LCONSOLE_WARN("%s: service thread pid %u was inactive for %llu.%03u seconds. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:\n",
2639                               thread->t_task->comm, thread->t_task->pid,
2640                               ms_lapse, ms_frac);
2641
2642                 libcfs_debug_dumpstack(thread->t_task);
2643         } else {
2644                 /* below message is checked in sanity-quota.sh test_6,18 */
2645                 LCONSOLE_WARN("%s: service thread pid %u was inactive for %llu.%03u seconds. Watchdog stack traces are limited to 3 per %u seconds, skipping this one.\n",
2646                               thread->t_task->comm, thread->t_task->pid,
2647                               ms_lapse, ms_frac, libcfs_watchdog_ratelimit);
2648         }
2649 }
2650
2651 void ptlrpc_watchdog_init(struct delayed_work *work, timeout_t timeout)
2652 {
2653         INIT_DELAYED_WORK(work, ptlrpc_watchdog_fire);
2654         schedule_delayed_work(work, cfs_time_seconds(timeout));
2655 }
2656
2657 void ptlrpc_watchdog_disable(struct delayed_work *work)
2658 {
2659         cancel_delayed_work_sync(work);
2660 }
2661
2662 void ptlrpc_watchdog_touch(struct delayed_work *work, timeout_t timeout)
2663 {
2664         struct ptlrpc_thread *thread = container_of(&work->work,
2665                                                     struct ptlrpc_thread,
2666                                                     t_watchdog.work);
2667         thread->t_touched = ktime_get();
2668         mod_delayed_work(system_wq, work, cfs_time_seconds(timeout));
2669 }
2670
2671 /**
2672  * requests wait on preprocessing
2673  * user can call it w/o any lock but need to hold
2674  * ptlrpc_service_part::scp_lock to get reliable result
2675  */
2676 static inline int
2677 ptlrpc_server_request_incoming(struct ptlrpc_service_part *svcpt)
2678 {
2679         return !list_empty(&svcpt->scp_req_incoming);
2680 }
2681
2682 static __attribute__((__noinline__)) int
2683 ptlrpc_wait_event(struct ptlrpc_service_part *svcpt,
2684                   struct ptlrpc_thread *thread)
2685 {
2686         ptlrpc_watchdog_disable(&thread->t_watchdog);
2687
2688         cond_resched();
2689
2690         if (svcpt->scp_rqbd_timeout == 0)
2691                 /* Don't exit while there are replies to be handled */
2692                 wait_event_idle_exclusive_lifo(
2693                         svcpt->scp_waitq,
2694                         ptlrpc_thread_stopping(thread) ||
2695                         ptlrpc_server_request_incoming(svcpt) ||
2696                         ptlrpc_server_request_pending(svcpt, false) ||
2697                         ptlrpc_rqbd_pending(svcpt) ||
2698                         ptlrpc_at_check(svcpt));
2699         else if (wait_event_idle_exclusive_lifo_timeout(
2700                          svcpt->scp_waitq,
2701                          ptlrpc_thread_stopping(thread) ||
2702                          ptlrpc_server_request_incoming(svcpt) ||
2703                          ptlrpc_server_request_pending(svcpt, false) ||
2704                          ptlrpc_rqbd_pending(svcpt) ||
2705                          ptlrpc_at_check(svcpt),
2706                          svcpt->scp_rqbd_timeout) == 0)
2707                 svcpt->scp_rqbd_timeout = 0;
2708
2709         if (ptlrpc_thread_stopping(thread))
2710                 return -EINTR;
2711
2712         ptlrpc_watchdog_touch(&thread->t_watchdog,
2713                               ptlrpc_server_get_timeout(svcpt));
2714         return 0;
2715 }
2716
2717 #ifdef HAVE_SERVER_SUPPORT
2718 # ifdef HAVE_FLUSH_DELAYED_FPUT
2719 #  define cfs_flush_delayed_fput() flush_delayed_fput()
2720 # else
2721 void (*cfs_flush_delayed_fput)(void);
2722 # endif /* HAVE_FLUSH_DELAYED_FPUT */
2723 #else /* !HAVE_SERVER_SUPPORT */
2724 #define cfs_flush_delayed_fput() do {} while (0)
2725 #endif /* HAVE_SERVER_SUPPORT */
2726
2727 /**
2728  * Main thread body for service threads.
2729  * Waits in a loop waiting for new requests to process to appear.
2730  * Every time an incoming requests is added to its queue, a waitq
2731  * is woken up and one of the threads will handle it.
2732  */
2733 static int ptlrpc_main(void *arg)
2734 {
2735         struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg;
2736         struct ptlrpc_service_part *svcpt = thread->t_svcpt;
2737         struct ptlrpc_service *svc = svcpt->scp_service;
2738         struct ptlrpc_reply_state *rs;
2739         struct group_info *ginfo = NULL;
2740         struct lu_env *env;
2741         int counter = 0, rc = 0;
2742
2743         ENTRY;
2744         unshare_fs_struct();
2745
2746         thread->t_task = current;
2747         thread->t_pid = current->pid;
2748
2749         if (svc->srv_cpt_bind) {
2750                 rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt);
2751                 if (rc != 0) {
2752                         CWARN("%s: failed to bind %s on CPT %d\n",
2753                               svc->srv_name, thread->t_name, svcpt->scp_cpt);
2754                 }
2755         }
2756
2757         ginfo = groups_alloc(0);
2758         if (!ginfo)
2759                 GOTO(out, rc = -ENOMEM);
2760
2761         set_current_groups(ginfo);
2762         put_group_info(ginfo);
2763
2764         if (svc->srv_ops.so_thr_init != NULL) {
2765                 rc = svc->srv_ops.so_thr_init(thread);
2766                 if (rc)
2767                         GOTO(out, rc);
2768         }
2769
2770         OBD_ALLOC_PTR(env);
2771         if (env == NULL)
2772                 GOTO(out_srv_fini, rc = -ENOMEM);
2773         rc = lu_env_add(env);
2774         if (rc)
2775                 GOTO(out_env, rc);
2776
2777         rc = lu_context_init(&env->le_ctx,
2778                              svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF);
2779         if (rc)
2780                 GOTO(out_env_remove, rc);
2781
2782         thread->t_env = env;
2783         env->le_ctx.lc_thread = thread;
2784         env->le_ctx.lc_cookie = 0x6;
2785
2786         while (!list_empty(&svcpt->scp_rqbd_idle)) {
2787                 rc = ptlrpc_server_post_idle_rqbds(svcpt);
2788                 if (rc >= 0)
2789                         continue;
2790
2791                 CERROR("Failed to post rqbd for %s on CPT %d: %d\n",
2792                         svc->srv_name, svcpt->scp_cpt, rc);
2793                 GOTO(out_ctx_fini, rc);
2794         }
2795
2796         /* Alloc reply state structure for this one */
2797         OBD_ALLOC_LARGE(rs, svc->srv_max_reply_size);
2798         if (!rs)
2799                 GOTO(out_ctx_fini, rc = -ENOMEM);
2800
2801         spin_lock(&svcpt->scp_lock);
2802
2803         LASSERT(thread_is_starting(thread));
2804         thread_clear_flags(thread, SVC_STARTING);
2805
2806         LASSERT(svcpt->scp_nthrs_starting == 1);
2807         svcpt->scp_nthrs_starting--;
2808
2809         /*
2810          * SVC_STOPPING may already be set here if someone else is trying
2811          * to stop the service while this new thread has been dynamically
2812          * forked. We still set SVC_RUNNING to let our creator know that
2813          * we are now running, however we will exit as soon as possible
2814          */
2815         thread_add_flags(thread, SVC_RUNNING);
2816         svcpt->scp_nthrs_running++;
2817         spin_unlock(&svcpt->scp_lock);
2818
2819         /* wake up our creator in case he's still waiting. */
2820         wake_up(&thread->t_ctl_waitq);
2821
2822         thread->t_touched = ktime_get();
2823         ptlrpc_watchdog_init(&thread->t_watchdog,
2824                          ptlrpc_server_get_timeout(svcpt));
2825
2826         spin_lock(&svcpt->scp_rep_lock);
2827         list_add(&rs->rs_list, &svcpt->scp_rep_idle);
2828         wake_up(&svcpt->scp_rep_waitq);
2829         spin_unlock(&svcpt->scp_rep_lock);
2830
2831         CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id,
2832                svcpt->scp_nthrs_running);
2833
2834 #ifdef HAVE_SERVER_SUPPORT
2835 #ifndef HAVE_FLUSH_DELAYED_FPUT
2836         if (unlikely(cfs_flush_delayed_fput == NULL))
2837                 cfs_flush_delayed_fput =
2838                         cfs_kallsyms_lookup_name("flush_delayed_fput");
2839 #endif
2840 #endif
2841         /* XXX maintain a list of all managed devices: insert here */
2842         while (!ptlrpc_thread_stopping(thread)) {
2843                 bool idle = true;
2844
2845                 if (ptlrpc_wait_event(svcpt, thread))
2846                         break;
2847
2848                 ptlrpc_check_rqbd_pool(svcpt);
2849
2850                 if (ptlrpc_threads_need_create(svcpt)) {
2851                         /* Ignore return code - we tried... */
2852                         ptlrpc_start_thread(svcpt, 0);
2853                         idle = false;
2854                 }
2855
2856                 /* reset le_ses to initial state */
2857                 env->le_ses = NULL;
2858                 /* Refill the context before execution to make sure
2859                  * all thread keys are allocated */
2860                 lu_env_refill(env);
2861                 /* Process all incoming reqs before handling any */
2862                 if (ptlrpc_server_request_incoming(svcpt)) {
2863                         lu_context_enter(&env->le_ctx);
2864                         ptlrpc_server_handle_req_in(svcpt, thread);
2865                         lu_context_exit(&env->le_ctx);
2866
2867                         /* but limit ourselves in case of flood */
2868                         if (counter++ < 100)
2869                                 continue;
2870                         counter = 0;
2871                         idle = false;
2872                 }
2873
2874                 if (ptlrpc_at_check(svcpt))
2875                         ptlrpc_at_check_timed(svcpt);
2876
2877                 if (ptlrpc_server_request_pending(svcpt, false)) {
2878                         lu_context_enter(&env->le_ctx);
2879                         ptlrpc_server_handle_request(svcpt, thread);
2880                         lu_context_exit(&env->le_ctx);
2881                         idle = false;
2882                 }
2883
2884                 if (ptlrpc_rqbd_pending(svcpt) &&
2885                     ptlrpc_server_post_idle_rqbds(svcpt) < 0) {
2886                         /*
2887                          * I just failed to repost request buffers.
2888                          * Wait for a timeout (unless something else
2889                          * happens) before I try again
2890                          */
2891                         svcpt->scp_rqbd_timeout = cfs_time_seconds(1) / 10;
2892                         CDEBUG(D_RPCTRACE, "Posted buffers: %d\n",
2893                                svcpt->scp_nrqbds_posted);
2894                         idle = false;
2895                 }
2896
2897                 /* If nothing to do, flush old alloc_file_pseudo() descriptors.
2898                  * This has internal atomicity so it is OK to call often.
2899                  * We could also do other idle tasks at this time.
2900                  */
2901                 if (idle)
2902                         cfs_flush_delayed_fput();
2903
2904                 /*
2905                  * If the number of threads has been tuned downward and this
2906                  * thread should be stopped, then stop in reverse order so the
2907                  * the threads always have contiguous thread index values.
2908                  */
2909                 if (unlikely(ptlrpc_thread_should_stop(thread)))
2910                         ptlrpc_thread_stop(thread);
2911         }
2912
2913         ptlrpc_watchdog_disable(&thread->t_watchdog);
2914
2915 out_ctx_fini:
2916         lu_context_fini(&env->le_ctx);
2917 out_env_remove:
2918         lu_env_remove(env);
2919 out_env:
2920         OBD_FREE_PTR(env);
2921 out_srv_fini:
2922         /* deconstruct service thread state created by ptlrpc_start_thread() */
2923         if (svc->srv_ops.so_thr_done != NULL)
2924                 svc->srv_ops.so_thr_done(thread);
2925 out:
2926         CDEBUG(D_RPCTRACE, "%s: service thread [%p:%u] %d exiting: rc = %d\n",
2927                thread->t_name, thread, thread->t_pid, thread->t_id, rc);
2928         spin_lock(&svcpt->scp_lock);
2929         if (thread_test_and_clear_flags(thread, SVC_STARTING))
2930                 svcpt->scp_nthrs_starting--;
2931
2932         if (thread_test_and_clear_flags(thread, SVC_RUNNING)) {
2933                 /* must know immediately */
2934                 svcpt->scp_nthrs_running--;
2935         }
2936
2937         thread->t_id = rc;
2938         thread_add_flags(thread, SVC_STOPPED);
2939
2940         wake_up(&thread->t_ctl_waitq);
2941         spin_unlock(&svcpt->scp_lock);
2942
2943         return rc;
2944 }
2945
2946 static int hrt_dont_sleep(struct ptlrpc_hr_thread *hrt,
2947                           struct list_head *replies)
2948 {
2949         int result;
2950
2951         spin_lock(&hrt->hrt_lock);
2952
2953         list_splice_init(&hrt->hrt_queue, replies);
2954         result = ptlrpc_hr.hr_stopping || !list_empty(replies);
2955
2956         spin_unlock(&hrt->hrt_lock);
2957         return result;
2958 }
2959
2960 /**
2961  * Main body of "handle reply" function.
2962  * It processes acked reply states
2963  */
2964 static int ptlrpc_hr_main(void *arg)
2965 {
2966         struct ptlrpc_hr_thread *hrt = (struct ptlrpc_hr_thread *)arg;
2967         struct ptlrpc_hr_partition *hrp = hrt->hrt_partition;
2968         LIST_HEAD(replies);
2969         struct lu_env *env;
2970         int rc;
2971
2972         unshare_fs_struct();
2973         OBD_ALLOC_PTR(env);
2974         if (env == NULL)
2975                 RETURN(-ENOMEM);
2976
2977         rc = cfs_cpt_bind(ptlrpc_hr.hr_cpt_table, hrp->hrp_cpt);
2978         if (rc != 0) {
2979                 char threadname[20];
2980
2981                 snprintf(threadname, sizeof(threadname), "ptlrpc_hr%02d_%03d",
2982                          hrp->hrp_cpt, hrt->hrt_id);
2983                 CWARN("Failed to bind %s on CPT %d of CPT table %p: rc = %d\n",
2984                       threadname, hrp->hrp_cpt, ptlrpc_hr.hr_cpt_table, rc);
2985         }
2986
2987         rc = lu_context_init(&env->le_ctx, LCT_MD_THREAD | LCT_DT_THREAD |
2988                              LCT_REMEMBER | LCT_NOREF);
2989         if (rc)
2990                 GOTO(out_env, rc);
2991
2992         rc = lu_env_add(env);
2993         if (rc)
2994                 GOTO(out_ctx_fini, rc);
2995
2996         atomic_inc(&hrp->hrp_nstarted);
2997         wake_up(&ptlrpc_hr.hr_waitq);
2998
2999         while (!ptlrpc_hr.hr_stopping) {
3000                 wait_event_idle(hrt->hrt_waitq, hrt_dont_sleep(hrt, &replies));
3001
3002                 while (!list_empty(&replies)) {
3003                         struct ptlrpc_reply_state *rs;
3004
3005                         rs = list_entry(replies.prev,
3006                                         struct ptlrpc_reply_state,
3007                                         rs_list);
3008                         list_del_init(&rs->rs_list);
3009                         /* refill keys if needed */
3010                         lu_env_refill(env);
3011                         lu_context_enter(&env->le_ctx);
3012                         ptlrpc_handle_rs(rs);
3013                         lu_context_exit(&env->le_ctx);
3014                 }
3015         }
3016
3017         atomic_inc(&hrp->hrp_nstopped);
3018         wake_up(&ptlrpc_hr.hr_waitq);
3019
3020         lu_env_remove(env);
3021 out_ctx_fini:
3022         lu_context_fini(&env->le_ctx);
3023 out_env:
3024         OBD_FREE_PTR(env);
3025         return 0;
3026 }
3027
3028 static void ptlrpc_stop_hr_threads(void)
3029 {
3030         struct ptlrpc_hr_partition *hrp;
3031         int i;
3032         int j;
3033
3034         ptlrpc_hr.hr_stopping = 1;
3035
3036         cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
3037                 if (hrp->hrp_thrs == NULL)
3038                         continue; /* uninitialized */
3039                 for (j = 0; j < hrp->hrp_nthrs; j++)
3040                         wake_up(&hrp->hrp_thrs[j].hrt_waitq);
3041         }
3042
3043         cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
3044                 if (hrp->hrp_thrs == NULL)
3045                         continue; /* uninitialized */
3046                 wait_event(ptlrpc_hr.hr_waitq,
3047                                atomic_read(&hrp->hrp_nstopped) ==
3048                                atomic_read(&hrp->hrp_nstarted));
3049         }
3050 }
3051
3052 static int ptlrpc_start_hr_threads(void)
3053 {
3054         struct ptlrpc_hr_partition *hrp;
3055         int i;
3056         int j;
3057
3058         ENTRY;
3059
3060         cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
3061                 int     rc = 0;
3062
3063                 for (j = 0; j < hrp->hrp_nthrs; j++) {
3064                         struct ptlrpc_hr_thread *hrt = &hrp->hrp_thrs[j];
3065                         struct task_struct *task;
3066
3067                         task = kthread_run(ptlrpc_hr_main,
3068                                            &hrp->hrp_thrs[j],
3069                                            "ptlrpc_hr%02d_%03d",
3070                                            hrp->hrp_cpt,
3071                                            hrt->hrt_id);
3072                         if (IS_ERR(task)) {
3073                                 rc = PTR_ERR(task);
3074                                 break;
3075                         }
3076                 }
3077
3078                 wait_event(ptlrpc_hr.hr_waitq,
3079                            atomic_read(&hrp->hrp_nstarted) == j);
3080
3081                 if (rc < 0) {
3082                         CERROR("cannot start reply handler thread %d:%d: rc = %d\n",
3083                                i, j, rc);
3084                         ptlrpc_stop_hr_threads();
3085                         RETURN(rc);
3086                 }
3087         }
3088
3089         RETURN(0);
3090 }
3091
3092 static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt)
3093 {
3094         struct ptlrpc_thread *thread;
3095         LIST_HEAD(zombie);
3096
3097         ENTRY;
3098
3099         CDEBUG(D_INFO, "Stopping threads for service %s\n",
3100                svcpt->scp_service->srv_name);
3101
3102         spin_lock(&svcpt->scp_lock);
3103         /* let the thread know that we would like it to stop asap */
3104         list_for_each_entry(thread, &svcpt->scp_threads, t_link)
3105                 ptlrpc_stop_thread(thread);
3106
3107         wake_up_all(&svcpt->scp_waitq);
3108
3109         while ((thread = list_first_entry_or_null(&svcpt->scp_threads,
3110                                                   struct ptlrpc_thread,
3111                                                   t_link)) != NULL) {
3112                 if (thread_is_stopped(thread)) {
3113                         list_move(&thread->t_link, &zombie);
3114                         continue;
3115                 }
3116                 spin_unlock(&svcpt->scp_lock);
3117
3118                 CDEBUG(D_INFO, "waiting for stopping-thread %s #%u\n",
3119                        svcpt->scp_service->srv_thread_name, thread->t_id);
3120                 wait_event_idle(thread->t_ctl_waitq,
3121                                 thread_is_stopped(thread));
3122
3123                 spin_lock(&svcpt->scp_lock);
3124         }
3125
3126         spin_unlock(&svcpt->scp_lock);
3127
3128         while ((thread = list_first_entry_or_null(&zombie,
3129                                                   struct ptlrpc_thread,
3130                                                   t_link)) != NULL) {
3131                 list_del(&thread->t_link);
3132                 OBD_FREE_PTR(thread);
3133         }
3134         EXIT;
3135 }
3136
3137 /**
3138  * Stops all threads of a particular service \a svc
3139  */
3140 static void ptlrpc_stop_all_threads(struct ptlrpc_service *svc)
3141 {
3142         struct ptlrpc_service_part *svcpt;
3143         int i;
3144
3145         ENTRY;
3146
3147         ptlrpc_service_for_each_part(svcpt, i, svc) {
3148                 if (svcpt->scp_service != NULL)
3149                         ptlrpc_svcpt_stop_threads(svcpt);
3150         }
3151
3152         EXIT;
3153 }
3154
3155 static int ptlrpc_start_threads(struct ptlrpc_service *svc)
3156 {
3157         int rc = 0;
3158         int i;
3159         int j;
3160
3161         ENTRY;
3162
3163         /* We require 2 threads min, see note in ptlrpc_server_handle_request */
3164         LASSERT(svc->srv_nthrs_cpt_init >= PTLRPC_NTHRS_INIT);
3165
3166         for (i = 0; i < svc->srv_ncpts; i++) {
3167                 for (j = 0; j < svc->srv_nthrs_cpt_init; j++) {
3168                         rc = ptlrpc_start_thread(svc->srv_parts[i], 1);
3169                         if (rc == 0)
3170                                 continue;
3171
3172                         if (rc != -EMFILE)
3173                                 goto failed;
3174                         /* We have enough threads, don't start more. b=15759 */
3175                         break;
3176                 }
3177         }
3178
3179         RETURN(0);
3180  failed:
3181         CERROR("cannot start %s thread #%d_%d: rc %d\n",
3182                svc->srv_thread_name, i, j, rc);
3183         ptlrpc_stop_all_threads(svc);
3184         RETURN(rc);
3185 }
3186
3187 static int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait)
3188 {
3189         struct ptlrpc_thread *thread;
3190         struct ptlrpc_service *svc;
3191         struct task_struct *task;
3192         int rc;
3193
3194         ENTRY;
3195
3196         LASSERT(svcpt != NULL);
3197
3198         svc = svcpt->scp_service;
3199
3200         CDEBUG(D_RPCTRACE, "%s[%d] started %d min %d max %d\n",
3201                svc->srv_name, svcpt->scp_cpt, svcpt->scp_nthrs_running,
3202                svc->srv_nthrs_cpt_init, svc->srv_nthrs_cpt_limit);
3203
3204  again:
3205         if (unlikely(svc->srv_is_stopping))
3206                 RETURN(-ESRCH);
3207
3208         if (!ptlrpc_threads_increasable(svcpt) ||
3209             (CFS_FAIL_CHECK(OBD_FAIL_TGT_TOOMANY_THREADS) &&
3210              svcpt->scp_nthrs_running == svc->srv_nthrs_cpt_init - 1))
3211                 RETURN(-EMFILE);
3212
3213         OBD_CPT_ALLOC_PTR(thread, svc->srv_cptable, svcpt->scp_cpt);
3214         if (thread == NULL)
3215                 RETURN(-ENOMEM);
3216         init_waitqueue_head(&thread->t_ctl_waitq);
3217
3218         spin_lock(&svcpt->scp_lock);
3219         if (!ptlrpc_threads_increasable(svcpt)) {
3220                 spin_unlock(&svcpt->scp_lock);
3221                 OBD_FREE_PTR(thread);
3222                 RETURN(-EMFILE);
3223         }
3224
3225         if (svcpt->scp_nthrs_starting != 0) {
3226                 /*
3227                  * serialize starting because some modules (obdfilter)
3228                  * might require unique and contiguous t_id
3229                  */
3230                 LASSERT(svcpt->scp_nthrs_starting == 1);
3231                 spin_unlock(&svcpt->scp_lock);
3232                 OBD_FREE_PTR(thread);
3233                 if (wait) {
3234                         CDEBUG(D_INFO, "Waiting for creating thread %s #%d\n",
3235                                svc->srv_thread_name, svcpt->scp_thr_nextid);
3236                         schedule();
3237                         goto again;
3238                 }
3239
3240                 CDEBUG(D_INFO, "Creating thread %s #%d race, retry later\n",
3241                        svc->srv_thread_name, svcpt->scp_thr_nextid);
3242                 RETURN(-EAGAIN);
3243         }
3244
3245         svcpt->scp_nthrs_starting++;
3246         thread->t_id = svcpt->scp_thr_nextid++;
3247         thread_add_flags(thread, SVC_STARTING);
3248         thread->t_svcpt = svcpt;
3249
3250         list_add(&thread->t_link, &svcpt->scp_threads);
3251         spin_unlock(&svcpt->scp_lock);
3252
3253         if (svcpt->scp_cpt >= 0) {
3254                 snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s%02d_%03d",
3255                          svc->srv_thread_name, svcpt->scp_cpt, thread->t_id);
3256         } else {
3257                 snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s_%04d",
3258                          svc->srv_thread_name, thread->t_id);
3259         }
3260
3261         CDEBUG(D_RPCTRACE, "starting thread '%s'\n", thread->t_name);
3262         task = kthread_run(ptlrpc_main, thread, "%s", thread->t_name);
3263         if (IS_ERR(task)) {
3264                 rc = PTR_ERR(task);
3265                 CERROR("cannot start thread '%s': rc = %d\n",
3266                        thread->t_name, rc);
3267                 spin_lock(&svcpt->scp_lock);
3268                 --svcpt->scp_nthrs_starting;
3269                 if (thread_is_stopping(thread)) {
3270                         /*
3271                          * this ptlrpc_thread is being hanled
3272                          * by ptlrpc_svcpt_stop_threads now
3273                          */
3274                         thread_add_flags(thread, SVC_STOPPED);
3275                         wake_up(&thread->t_ctl_waitq);
3276                         spin_unlock(&svcpt->scp_lock);
3277                 } else {
3278                         list_del(&thread->t_link);
3279                         spin_unlock(&svcpt->scp_lock);
3280                         OBD_FREE_PTR(thread);
3281                 }
3282                 RETURN(rc);
3283         }
3284
3285         if (!wait)
3286                 RETURN(0);
3287
3288         wait_event_idle(thread->t_ctl_waitq,
3289                         thread_is_running(thread) || thread_is_stopped(thread));
3290
3291         rc = thread_is_stopped(thread) ? thread->t_id : 0;
3292         RETURN(rc);
3293 }
3294
3295 int ptlrpc_hr_init(void)
3296 {
3297         struct ptlrpc_hr_partition *hrp;
3298         struct ptlrpc_hr_thread *hrt;
3299         int rc;
3300         int cpt;
3301         int i;
3302         int weight;
3303
3304         ENTRY;
3305
3306         memset(&ptlrpc_hr, 0, sizeof(ptlrpc_hr));
3307         ptlrpc_hr.hr_cpt_table = cfs_cpt_tab;
3308
3309         ptlrpc_hr.hr_partitions = cfs_percpt_alloc(ptlrpc_hr.hr_cpt_table,
3310                                                    sizeof(*hrp));
3311         if (ptlrpc_hr.hr_partitions == NULL)
3312                 RETURN(-ENOMEM);
3313
3314         ratelimit_state_init(&watchdog_limit,
3315                              cfs_time_seconds(libcfs_watchdog_ratelimit), 3);
3316
3317         init_waitqueue_head(&ptlrpc_hr.hr_waitq);
3318
3319         preempt_disable();
3320         weight = cpumask_weight(topology_sibling_cpumask(smp_processor_id()));
3321         preempt_enable();
3322
3323         cfs_percpt_for_each(hrp, cpt, ptlrpc_hr.hr_partitions) {
3324                 hrp->hrp_cpt = cpt;
3325
3326                 atomic_set(&hrp->hrp_nstarted, 0);
3327                 atomic_set(&hrp->hrp_nstopped, 0);
3328
3329                 hrp->hrp_nthrs = cfs_cpt_weight(ptlrpc_hr.hr_cpt_table, cpt);
3330                 hrp->hrp_nthrs /= weight;
3331                 if (hrp->hrp_nthrs == 0)
3332                         hrp->hrp_nthrs = 1;
3333
3334                 OBD_CPT_ALLOC(hrp->hrp_thrs, ptlrpc_hr.hr_cpt_table, cpt,
3335                               hrp->hrp_nthrs * sizeof(*hrt));
3336                 if (hrp->hrp_thrs == NULL)
3337                         GOTO(out, rc = -ENOMEM);
3338
3339                 for (i = 0; i < hrp->hrp_nthrs; i++) {
3340                         hrt = &hrp->hrp_thrs[i];
3341
3342                         hrt->hrt_id = i;
3343                         hrt->hrt_partition = hrp;
3344                         init_waitqueue_head(&hrt->hrt_waitq);
3345                         spin_lock_init(&hrt->hrt_lock);
3346                         INIT_LIST_HEAD(&hrt->hrt_queue);
3347                 }
3348         }
3349
3350         rc = ptlrpc_start_hr_threads();
3351 out:
3352         if (rc != 0)
3353                 ptlrpc_hr_fini();
3354         RETURN(rc);
3355 }
3356
3357 void ptlrpc_hr_fini(void)
3358 {
3359         struct ptlrpc_hr_partition *hrp;
3360         int cpt;
3361
3362         if (ptlrpc_hr.hr_partitions == NULL)
3363                 return;
3364
3365         ptlrpc_stop_hr_threads();
3366
3367         cfs_percpt_for_each(hrp, cpt, ptlrpc_hr.hr_partitions) {
3368                 if (hrp->hrp_thrs)
3369                         OBD_FREE_PTR_ARRAY(hrp->hrp_thrs, hrp->hrp_nthrs);
3370         }
3371
3372         cfs_percpt_free(ptlrpc_hr.hr_partitions);
3373         ptlrpc_hr.hr_partitions = NULL;
3374 }
3375
3376
3377 /**
3378  * Wait until all already scheduled replies are processed.
3379  */
3380 static void ptlrpc_wait_replies(struct ptlrpc_service_part *svcpt)
3381 {
3382         while (1) {
3383                 if (wait_event_idle_timeout(
3384                         svcpt->scp_waitq,
3385                         atomic_read(&svcpt->scp_nreps_difficult) == 0,
3386                         cfs_time_seconds(10)) > 0)
3387                         break;
3388                 CWARN("Unexpectedly long timeout %s %p\n",
3389                       svcpt->scp_service->srv_name, svcpt->scp_service);
3390         }
3391 }
3392
3393 static void
3394 ptlrpc_service_del_atimer(struct ptlrpc_service *svc)
3395 {
3396         struct ptlrpc_service_part *svcpt;
3397         int i;
3398
3399         /* early disarm AT timer... */
3400         ptlrpc_service_for_each_part(svcpt, i, svc) {
3401                 if (svcpt->scp_service != NULL)
3402                         timer_delete(&svcpt->scp_at_timer);
3403         }
3404 }
3405
3406 static void
3407 ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc)
3408 {
3409         struct ptlrpc_service_part *svcpt;
3410         struct ptlrpc_request_buffer_desc *rqbd;
3411         int rc;
3412         int i;
3413
3414         /*
3415          * All history will be culled when the next request buffer is
3416          * freed in ptlrpc_service_purge_all()
3417          */
3418         svc->srv_hist_nrqbds_cpt_max = 0;
3419
3420         rc = LNetClearLazyPortal(svc->srv_req_portal);
3421         LASSERT(rc == 0);
3422
3423         ptlrpc_service_for_each_part(svcpt, i, svc) {
3424                 if (svcpt->scp_service == NULL)
3425                         break;
3426
3427                 /*
3428                  * Unlink all the request buffers.  This forces a 'final'
3429                  * event with its 'unlink' flag set for each posted rqbd
3430                  */
3431                 list_for_each_entry(rqbd, &svcpt->scp_rqbd_posted,
3432                                         rqbd_list) {
3433                         rc = LNetMDUnlink(rqbd->rqbd_md_h);
3434                         LASSERT(rc == 0 || rc == -ENOENT);
3435                 }
3436         }
3437
3438         ptlrpc_service_for_each_part(svcpt, i, svc) {
3439                 if (svcpt->scp_service == NULL)
3440                         break;
3441
3442                 /*
3443                  * Wait for the network to release any buffers
3444                  * it's currently filling
3445                  */
3446                 spin_lock(&svcpt->scp_lock);
3447                 while (svcpt->scp_nrqbds_posted != 0) {
3448                         int seconds = PTLRPC_REQ_LONG_UNLINK;
3449
3450                         spin_unlock(&svcpt->scp_lock);
3451                         /*
3452                          * Network access will complete in finite time but
3453                          * the HUGE timeout lets us CWARN for visibility
3454                          * of sluggish NALs
3455                          */
3456                         while (seconds > 0 &&
3457                                wait_event_idle_timeout(
3458                                        svcpt->scp_waitq,
3459                                        svcpt->scp_nrqbds_posted == 0,
3460                                        cfs_time_seconds(1)) == 0)
3461                                 seconds -= 1;
3462                         if (seconds == 0) {
3463                                 CWARN("Service %s waiting for request buffers\n",
3464                                       svcpt->scp_service->srv_name);
3465                         }
3466                         spin_lock(&svcpt->scp_lock);
3467                 }
3468                 spin_unlock(&svcpt->scp_lock);
3469         }
3470 }
3471
3472 static void
3473 ptlrpc_service_purge_all(struct ptlrpc_service *svc)
3474 {
3475         struct ptlrpc_service_part *svcpt;
3476         struct ptlrpc_request_buffer_desc *rqbd;
3477         struct ptlrpc_request *req;
3478         struct ptlrpc_reply_state *rs;
3479         int i;
3480
3481         ptlrpc_service_for_each_part(svcpt, i, svc) {
3482                 if (svcpt->scp_service == NULL)
3483                         break;
3484
3485                 spin_lock(&svcpt->scp_rep_lock);
3486                 while ((rs = list_first_entry_or_null(&svcpt->scp_rep_active,
3487                                                       struct ptlrpc_reply_state,
3488                                                       rs_list)) != NULL) {
3489                         spin_lock(&rs->rs_lock);
3490                         ptlrpc_schedule_difficult_reply(rs);
3491                         spin_unlock(&rs->rs_lock);
3492                 }
3493                 spin_unlock(&svcpt->scp_rep_lock);
3494
3495                 /*
3496                  * purge the request queue.  NB No new replies (rqbds
3497                  * all unlinked) and no service threads, so I'm the only
3498                  * thread noodling the request queue now
3499                  */
3500                 while ((req = list_first_entry_or_null(&svcpt->scp_req_incoming,
3501                                                        struct ptlrpc_request,
3502                                                        rq_list)) != NULL) {
3503                         list_del(&req->rq_list);
3504                         svcpt->scp_nreqs_incoming--;
3505                         ptlrpc_server_finish_request(svcpt, req);
3506                 }
3507
3508                 while (ptlrpc_server_request_pending(svcpt, true)) {
3509                         req = ptlrpc_server_request_get(svcpt, true);
3510                         LASSERT(req);
3511                         ptlrpc_server_finish_active_request(svcpt, req);
3512                 }
3513
3514                 /*
3515                  * The portal may be shared by several services (eg:OUT_PORTAL).
3516                  * So the request could be referenced by other target. So we
3517                  * have to wait the ptlrpc_server_drop_request invoked.
3518                  *
3519                  * TODO: move the req_buffer as global rather than per service.
3520                  */
3521                 spin_lock(&svcpt->scp_lock);
3522                 while (!list_empty(&svcpt->scp_rqbd_posted)) {
3523                         spin_unlock(&svcpt->scp_lock);
3524                         wait_event_idle_timeout(svcpt->scp_waitq,
3525                                 list_empty(&svcpt->scp_rqbd_posted),
3526                                 cfs_time_seconds(1));
3527                         spin_lock(&svcpt->scp_lock);
3528                 }
3529                 spin_unlock(&svcpt->scp_lock);
3530
3531                 LASSERT(svcpt->scp_nreqs_incoming == 0);
3532                 LASSERT(svcpt->scp_nreqs_active == 0);
3533                 /*
3534                  * history should have been culled by
3535                  * ptlrpc_server_finish_request
3536                  */
3537                 LASSERT(svcpt->scp_hist_nrqbds == 0);
3538
3539                 /*
3540                  * Now free all the request buffers since nothing
3541                  * references them any more...
3542                  */
3543                 spin_lock(&svcpt->scp_lock);
3544                 while ((rqbd = list_first_entry_or_null(&svcpt->scp_rqbd_idle,
3545                                                         struct ptlrpc_request_buffer_desc,
3546                                                         rqbd_list)) != NULL) {
3547                         list_del(&rqbd->rqbd_list);
3548                         svcpt->scp_nrqbds_total--;
3549                         spin_unlock(&svcpt->scp_lock);
3550
3551                         ptlrpc_free_rqbd(rqbd);
3552                         spin_lock(&svcpt->scp_lock);
3553                 }
3554                 spin_unlock(&svcpt->scp_lock);
3555
3556                 ptlrpc_wait_replies(svcpt);
3557
3558                 while ((rs = list_first_entry_or_null(&svcpt->scp_rep_idle,
3559                                                       struct ptlrpc_reply_state,
3560                                                       rs_list)) != NULL) {
3561                         list_del(&rs->rs_list);
3562                         OBD_FREE_LARGE(rs, svc->srv_max_reply_size);
3563                 }
3564         }
3565 }
3566
3567 static void
3568 ptlrpc_service_free(struct ptlrpc_service *svc)
3569 {
3570         struct ptlrpc_service_part      *svcpt;
3571         struct ptlrpc_at_array          *array;
3572         int                             i;
3573
3574         ptlrpc_service_for_each_part(svcpt, i, svc) {
3575                 if (svcpt->scp_service == NULL)
3576                         break;
3577
3578                 /* In case somebody rearmed this in the meantime */
3579                 timer_delete(&svcpt->scp_at_timer);
3580                 array = &svcpt->scp_at_array;
3581
3582                 if (array->paa_reqs_array != NULL) {
3583                         OBD_FREE_PTR_ARRAY(array->paa_reqs_array,
3584                                            array->paa_size);
3585                         array->paa_reqs_array = NULL;
3586                 }
3587
3588                 if (array->paa_reqs_count != NULL) {
3589                         OBD_FREE_PTR_ARRAY(array->paa_reqs_count,
3590                                            array->paa_size);
3591                         array->paa_reqs_count = NULL;
3592                 }
3593         }
3594
3595         ptlrpc_service_for_each_part(svcpt, i, svc)
3596                 OBD_FREE_PTR(svcpt);
3597
3598         if (svc->srv_cpts != NULL)
3599                 cfs_expr_list_values_free(svc->srv_cpts, svc->srv_ncpts);
3600
3601         OBD_FREE(svc, offsetof(struct ptlrpc_service,
3602                                srv_parts[svc->srv_ncpts]));
3603 }
3604
3605 int ptlrpc_unregister_service(struct ptlrpc_service *service)
3606 {
3607         ENTRY;
3608
3609         CDEBUG(D_NET, "%s: tearing down\n", service->srv_name);
3610
3611         service->srv_is_stopping = 1;
3612
3613         mutex_lock(&ptlrpc_all_services_mutex);
3614         list_del_init(&service->srv_list);
3615         mutex_unlock(&ptlrpc_all_services_mutex);
3616
3617         ptlrpc_service_del_atimer(service);
3618         ptlrpc_stop_all_threads(service);
3619
3620         ptlrpc_service_unlink_rqbd(service);
3621         ptlrpc_service_purge_all(service);
3622         ptlrpc_service_nrs_cleanup(service);
3623
3624         ptlrpc_lprocfs_unregister_service(service);
3625         ptlrpc_sysfs_unregister_service(service);
3626
3627         ptlrpc_service_free(service);
3628
3629         RETURN(0);
3630 }
3631 EXPORT_SYMBOL(ptlrpc_unregister_service);
3632
3633 /**
3634  * Returns 0 if the service is healthy.
3635  *
3636  * Right now, it just checks to make sure that requests aren't languishing
3637  * in the queue.  We'll use this health check to govern whether a node needs
3638  * to be shot, so it's intentionally non-aggressive.
3639  */
3640 static int ptlrpc_svcpt_health_check(struct ptlrpc_service_part *svcpt)
3641 {
3642         struct ptlrpc_request *request = NULL;
3643         struct timespec64 right_now;
3644         struct timespec64 timediff;
3645         struct obd_device *obd = NULL;
3646
3647         ktime_get_real_ts64(&right_now);
3648
3649         spin_lock(&svcpt->scp_req_lock);
3650         /* How long has the next entry been waiting? */
3651         if (ptlrpc_server_high_pending(svcpt, true))
3652                 request = ptlrpc_nrs_req_peek_nolock(svcpt, true);
3653         else if (ptlrpc_server_normal_pending(svcpt, true))
3654                 request = ptlrpc_nrs_req_peek_nolock(svcpt, false);
3655
3656         if (request == NULL) {
3657                 spin_unlock(&svcpt->scp_req_lock);
3658                 return 0;
3659         }
3660
3661         timediff = timespec64_sub(right_now, request->rq_arrival_time);
3662         spin_unlock(&svcpt->scp_req_lock);
3663
3664         if (request->rq_export)
3665                 obd = request->rq_export->exp_obd;
3666
3667         if ((timediff.tv_sec) >
3668             (obd_at_off(obd) ? obd_timeout * 3 / 2 : obd_get_at_max(obd))) {
3669                 CERROR("%s: unhealthy - request has been waiting %llds\n",
3670                        svcpt->scp_service->srv_name, (s64)timediff.tv_sec);
3671                 return -1;
3672         }
3673
3674         return 0;
3675 }
3676
3677 int
3678 ptlrpc_service_health_check(struct ptlrpc_service *svc)
3679 {
3680         struct ptlrpc_service_part      *svcpt;
3681         int                             i;
3682
3683         if (svc == NULL)
3684                 return 0;
3685
3686         ptlrpc_service_for_each_part(svcpt, i, svc) {
3687                 int rc = ptlrpc_svcpt_health_check(svcpt);
3688
3689                 if (rc != 0)
3690                         return rc;
3691         }
3692         return 0;
3693 }
3694 EXPORT_SYMBOL(ptlrpc_service_health_check);
3695
3696 int
3697 ptlrpc_server_get_timeout(struct ptlrpc_service_part *svcpt)
3698 {
3699         int at = 0;
3700
3701         if (!obd_at_off(NULL))
3702                 at = obd_at_get(NULL, &svcpt->scp_at_estimate);
3703
3704         return svcpt->scp_service->srv_watchdog_factor *
3705                max_t(int, at, obd_timeout);
3706 }