lustre/ptlrpc/client.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  */
  32
  33 /** Implementation of client-side PortalRPC interfaces */
  34
  35 #define DEBUG_SUBSYSTEM S_RPC
  36
  37 #include <linux/delay.h>
  38 #include <linux/random.h>
  39
  40 #include <obd_support.h>
  41 #include <obd_class.h>
  42 #include <lustre_lib.h>
  43 #include <lustre_ha.h>
  44 #include <lustre_import.h>
  45 #include <lustre_req_layout.h>
  46
  47 #include "ptlrpc_internal.h"
  48
  49 const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_pin_ops = {
  50         .add_kiov_frag  = ptlrpc_prep_bulk_page_pin,
  51         .release_frags  = ptlrpc_release_bulk_page_pin,
  52 };
  53 EXPORT_SYMBOL(ptlrpc_bulk_kiov_pin_ops);
  54
  55 const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_nopin_ops = {
  56         .add_kiov_frag  = ptlrpc_prep_bulk_page_nopin,
  57         .release_frags  = ptlrpc_release_bulk_noop,
  58 };
  59 EXPORT_SYMBOL(ptlrpc_bulk_kiov_nopin_ops);
  60
  61 const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kvec_ops = {
  62         .add_iov_frag = ptlrpc_prep_bulk_frag,
  63 };
  64 EXPORT_SYMBOL(ptlrpc_bulk_kvec_ops);
  65
  66 static int ptlrpc_send_new_req(struct ptlrpc_request *req);
  67 static int ptlrpcd_check_work(struct ptlrpc_request *req);
  68 static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async);
  69
  70 /**
  71  * Initialize passed in client structure \a cl.
  72  */
  73 void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
  74                         struct ptlrpc_client *cl)
  75 {
  76         cl->cli_request_portal = req_portal;
  77         cl->cli_reply_portal   = rep_portal;
  78         cl->cli_name           = name;
  79 }
  80 EXPORT_SYMBOL(ptlrpc_init_client);
  81
  82 /**
  83  * Return PortalRPC connection for remore uud \a uuid
  84  */
  85 struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid,
  86                                                     lnet_nid_t nid4refnet)
  87 {
  88         struct ptlrpc_connection *c;
  89         lnet_nid_t self;
  90         struct lnet_process_id peer;
  91         int err;
  92
  93         /*
  94          * ptlrpc_uuid_to_peer() initializes its 2nd parameter
  95          * before accessing its values.
  96          */
  97         /* coverity[uninit_use_in_call] */
  98         peer.nid = nid4refnet;
  99         err = ptlrpc_uuid_to_peer(uuid, &peer, &self);
 100         if (err != 0) {
 101                 CNETERR("cannot find peer %s!\n", uuid->uuid);
 102                 return NULL;
 103         }
 104
 105         c = ptlrpc_connection_get(peer, self, uuid);
 106         if (c) {
 107                 memcpy(c->c_remote_uuid.uuid,
 108                        uuid->uuid, sizeof(c->c_remote_uuid.uuid));
 109         }
 110
 111         CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c);
 112
 113         return c;
 114 }
 115
 116 /**
 117  * Allocate and initialize new bulk descriptor on the sender.
 118  * Returns pointer to the descriptor or NULL on error.
 119  */
 120 struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned int nfrags,
 121                                          unsigned int max_brw,
 122                                          enum ptlrpc_bulk_op_type type,
 123                                          unsigned int portal,
 124                                          const struct ptlrpc_bulk_frag_ops *ops)
 125 {
 126         struct ptlrpc_bulk_desc *desc;
 127         int i;
 128
 129         /* ensure that only one of KIOV or IOVEC is set but not both */
 130         LASSERT((ptlrpc_is_bulk_desc_kiov(type) &&
 131                  ops->add_kiov_frag != NULL) ||
 132                 (ptlrpc_is_bulk_desc_kvec(type) &&
 133                  ops->add_iov_frag != NULL));
 134
 135         OBD_ALLOC_PTR(desc);
 136         if (!desc)
 137                 return NULL;
 138         if (type & PTLRPC_BULK_BUF_KIOV) {
 139                 OBD_ALLOC_LARGE(GET_KIOV(desc),
 140                                 nfrags * sizeof(*GET_KIOV(desc)));
 141                 if (!GET_KIOV(desc))
 142                         goto out;
 143         } else {
 144                 OBD_ALLOC_LARGE(GET_KVEC(desc),
 145                                 nfrags * sizeof(*GET_KVEC(desc)));
 146                 if (!GET_KVEC(desc))
 147                         goto out;
 148         }
 149
 150         spin_lock_init(&desc->bd_lock);
 151         init_waitqueue_head(&desc->bd_waitq);
 152         desc->bd_max_iov = nfrags;
 153         desc->bd_iov_count = 0;
 154         desc->bd_portal = portal;
 155         desc->bd_type = type;
 156         desc->bd_md_count = 0;
 157         desc->bd_frag_ops = (struct ptlrpc_bulk_frag_ops *)ops;
 158         LASSERT(max_brw > 0);
 159         desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT);
 160         /*
 161          * PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this
 162          * node. Negotiated ocd_brw_size will always be <= this number.
 163          */
 164         for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++)
 165                 LNetInvalidateMDHandle(&desc->bd_mds[i]);
 166
 167         return desc;
 168 out:
 169         OBD_FREE_PTR(desc);
 170         return NULL;
 171 }
 172
 173 /**
 174  * Prepare bulk descriptor for specified outgoing request \a req that
 175  * can fit \a nfrags * pages. \a type is bulk type. \a portal is where
 176  * the bulk to be sent. Used on client-side.
 177  * Returns pointer to newly allocatrd initialized bulk descriptor or NULL on
 178  * error.
 179  */
 180 struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
 181                                               unsigned int nfrags,
 182                                               unsigned int max_brw,
 183                                               unsigned int type,
 184                                               unsigned int portal,
 185                                               const struct ptlrpc_bulk_frag_ops
 186                                                 *ops)
 187 {
 188         struct obd_import *imp = req->rq_import;
 189         struct ptlrpc_bulk_desc *desc;
 190
 191         ENTRY;
 192         LASSERT(ptlrpc_is_bulk_op_passive(type));
 193
 194         desc = ptlrpc_new_bulk(nfrags, max_brw, type, portal, ops);
 195         if (!desc)
 196                 RETURN(NULL);
 197
 198         desc->bd_import_generation = req->rq_import_generation;
 199         desc->bd_import = class_import_get(imp);
 200         desc->bd_req = req;
 201
 202         desc->bd_cbid.cbid_fn  = client_bulk_callback;
 203         desc->bd_cbid.cbid_arg = desc;
 204
 205         /* This makes req own desc, and free it when she frees herself */
 206         req->rq_bulk = desc;
 207
 208         return desc;
 209 }
 210 EXPORT_SYMBOL(ptlrpc_prep_bulk_imp);
 211
 212 void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
 213                              struct page *page, int pageoffset, int len,
 214                              int pin)
 215 {
 216         lnet_kiov_t *kiov;
 217
 218         LASSERT(desc->bd_iov_count < desc->bd_max_iov);
 219         LASSERT(page != NULL);
 220         LASSERT(pageoffset >= 0);
 221         LASSERT(len > 0);
 222         LASSERT(pageoffset + len <= PAGE_SIZE);
 223         LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type));
 224
 225         kiov = &BD_GET_KIOV(desc, desc->bd_iov_count);
 226
 227         desc->bd_nob += len;
 228
 229         if (pin)
 230                 get_page(page);
 231
 232         kiov->kiov_page = page;
 233         kiov->kiov_offset = pageoffset;
 234         kiov->kiov_len = len;
 235
 236         desc->bd_iov_count++;
 237 }
 238 EXPORT_SYMBOL(__ptlrpc_prep_bulk_page);
 239
 240 int ptlrpc_prep_bulk_frag(struct ptlrpc_bulk_desc *desc,
 241                           void *frag, int len)
 242 {
 243         struct kvec *iovec;
 244
 245         ENTRY;
 246
 247         LASSERT(desc->bd_iov_count < desc->bd_max_iov);
 248         LASSERT(frag != NULL);
 249         LASSERT(len > 0);
 250         LASSERT(ptlrpc_is_bulk_desc_kvec(desc->bd_type));
 251
 252         iovec = &BD_GET_KVEC(desc, desc->bd_iov_count);
 253
 254         desc->bd_nob += len;
 255
 256         iovec->iov_base = frag;
 257         iovec->iov_len = len;
 258
 259         desc->bd_iov_count++;
 260
 261         RETURN(desc->bd_nob);
 262 }
 263 EXPORT_SYMBOL(ptlrpc_prep_bulk_frag);
 264
 265 void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc)
 266 {
 267         ENTRY;
 268
 269         LASSERT(desc != NULL);
 270         LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */
 271         LASSERT(desc->bd_md_count == 0);         /* network hands off */
 272         LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL));
 273         LASSERT(desc->bd_frag_ops != NULL);
 274
 275         if (ptlrpc_is_bulk_desc_kiov(desc->bd_type))
 276                 sptlrpc_enc_pool_put_pages(desc);
 277
 278         if (desc->bd_export)
 279                 class_export_put(desc->bd_export);
 280         else
 281                 class_import_put(desc->bd_import);
 282
 283         if (desc->bd_frag_ops->release_frags != NULL)
 284                 desc->bd_frag_ops->release_frags(desc);
 285
 286         if (ptlrpc_is_bulk_desc_kiov(desc->bd_type))
 287                 OBD_FREE_LARGE(GET_KIOV(desc),
 288                                desc->bd_max_iov * sizeof(*GET_KIOV(desc)));
 289         else
 290                 OBD_FREE_LARGE(GET_KVEC(desc),
 291                                desc->bd_max_iov * sizeof(*GET_KVEC(desc)));
 292         OBD_FREE_PTR(desc);
 293         EXIT;
 294 }
 295 EXPORT_SYMBOL(ptlrpc_free_bulk);
 296
 297 /**
 298  * Set server timelimit for this req, i.e. how long are we willing to wait
 299  * for reply before timing out this request.
 300  */
 301 void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
 302 {
 303         __u32 serv_est;
 304         int idx;
 305         struct imp_at *at;
 306
 307         LASSERT(req->rq_import);
 308
 309         if (AT_OFF) {
 310                 /* non-AT settings */
 311                 /**
 312                  * \a imp_server_timeout means this is reverse import and
 313                  * we send (currently only) ASTs to the client and cannot afford
 314                  * to wait too long for the reply, otherwise the other client
 315                  * (because of which we are sending this request) would
 316                  * timeout waiting for us
 317                  */
 318                 req->rq_timeout = req->rq_import->imp_server_timeout ?
 319                                   obd_timeout / 2 : obd_timeout;
 320         } else {
 321                 at = &req->rq_import->imp_at;
 322                 idx = import_at_get_index(req->rq_import,
 323                                           req->rq_request_portal);
 324                 serv_est = at_get(&at->iat_service_estimate[idx]);
 325                 req->rq_timeout = at_est2timeout(serv_est);
 326         }
 327         /*
 328          * We could get even fancier here, using history to predict increased
 329          * loading...
 330          */
 331
 332         /*
 333          * Let the server know what this RPC timeout is by putting it in the
 334          * reqmsg
 335          */
 336         lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
 337 }
 338 EXPORT_SYMBOL(ptlrpc_at_set_req_timeout);
 339
 340 /* Adjust max service estimate based on server value */
 341 static void ptlrpc_at_adj_service(struct ptlrpc_request *req,
 342                                   unsigned int serv_est)
 343 {
 344         int idx;
 345         unsigned int oldse;
 346         struct imp_at *at;
 347
 348         LASSERT(req->rq_import);
 349         at = &req->rq_import->imp_at;
 350
 351         idx = import_at_get_index(req->rq_import, req->rq_request_portal);
 352         /*
 353          * max service estimates are tracked on the server side,
 354          * so just keep minimal history here
 355          */
 356         oldse = at_measured(&at->iat_service_estimate[idx], serv_est);
 357         if (oldse != 0)
 358                 CDEBUG(D_ADAPTTO,
 359                        "The RPC service estimate for %s ptl %d has changed from %d to %d\n",
 360                        req->rq_import->imp_obd->obd_name,
 361                        req->rq_request_portal,
 362                        oldse, at_get(&at->iat_service_estimate[idx]));
 363 }
 364
 365 /* Expected network latency per remote node (secs) */
 366 int ptlrpc_at_get_net_latency(struct ptlrpc_request *req)
 367 {
 368         return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency);
 369 }
 370
 371 /* Adjust expected network latency */
 372 void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req,
 373                                unsigned int service_time)
 374 {
 375         unsigned int nl, oldnl;
 376         struct imp_at *at;
 377         time64_t now = ktime_get_real_seconds();
 378
 379         LASSERT(req->rq_import);
 380
 381         if (service_time > now - req->rq_sent + 3) {
 382                 /*
 383                  * b=16408, however, this can also happen if early reply
 384                  * is lost and client RPC is expired and resent, early reply
 385                  * or reply of original RPC can still be fit in reply buffer
 386                  * of resent RPC, now client is measuring time from the
 387                  * resent time, but server sent back service time of original
 388                  * RPC.
 389                  */
 390                 CDEBUG((lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) ?
 391                        D_ADAPTTO : D_WARNING,
 392                        "Reported service time %u > total measured time %lld\n",
 393                        service_time, now - req->rq_sent);
 394                 return;
 395         }
 396
 397         /* Network latency is total time less server processing time */
 398         nl = max_t(int, now - req->rq_sent -
 399                         service_time, 0) + 1; /* st rounding */
 400         at = &req->rq_import->imp_at;
 401
 402         oldnl = at_measured(&at->iat_net_latency, nl);
 403         if (oldnl != 0)
 404                 CDEBUG(D_ADAPTTO,
 405                        "The network latency for %s (nid %s) has changed from %d to %d\n",
 406                        req->rq_import->imp_obd->obd_name,
 407                        obd_uuid2str(&req->rq_import->imp_connection->c_remote_uuid),
 408                        oldnl, at_get(&at->iat_net_latency));
 409 }
 410
 411 static int unpack_reply(struct ptlrpc_request *req)
 412 {
 413         int rc;
 414
 415         if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) {
 416                 rc = ptlrpc_unpack_rep_msg(req, req->rq_replen);
 417                 if (rc) {
 418                         DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc);
 419                         return -EPROTO;
 420                 }
 421         }
 422
 423         rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
 424         if (rc) {
 425                 DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc);
 426                 return -EPROTO;
 427         }
 428         return 0;
 429 }
 430
 431 /**
 432  * Handle an early reply message, called with the rq_lock held.
 433  * If anything goes wrong just ignore it - same as if it never happened
 434  */
 435 static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req)
 436 __must_hold(&req->rq_lock)
 437 {
 438         struct ptlrpc_request *early_req;
 439         time64_t olddl;
 440         int rc;
 441
 442         ENTRY;
 443         req->rq_early = 0;
 444         spin_unlock(&req->rq_lock);
 445
 446         rc = sptlrpc_cli_unwrap_early_reply(req, &early_req);
 447         if (rc) {
 448                 spin_lock(&req->rq_lock);
 449                 RETURN(rc);
 450         }
 451
 452         rc = unpack_reply(early_req);
 453         if (rc != 0) {
 454                 sptlrpc_cli_finish_early_reply(early_req);
 455                 spin_lock(&req->rq_lock);
 456                 RETURN(rc);
 457         }
 458
 459         /*
 460          * Use new timeout value just to adjust the local value for this
 461          * request, don't include it into at_history. It is unclear yet why
 462          * service time increased and should it be counted or skipped, e.g.
 463          * that can be recovery case or some error or server, the real reply
 464          * will add all new data if it is worth to add.
 465          */
 466         req->rq_timeout = lustre_msg_get_timeout(early_req->rq_repmsg);
 467         lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
 468
 469         /* Network latency can be adjusted, it is pure network delays */
 470         ptlrpc_at_adj_net_latency(req,
 471                                   lustre_msg_get_service_time(early_req->rq_repmsg));
 472
 473         sptlrpc_cli_finish_early_reply(early_req);
 474
 475         spin_lock(&req->rq_lock);
 476         olddl = req->rq_deadline;
 477         /*
 478          * server assumes it now has rq_timeout from when the request
 479          * arrived, so the client should give it at least that long.
 480          * since we don't know the arrival time we'll use the original
 481          * sent time
 482          */
 483         req->rq_deadline = req->rq_sent + req->rq_timeout +
 484                            ptlrpc_at_get_net_latency(req);
 485
 486         DEBUG_REQ(D_ADAPTTO, req,
 487                   "Early reply #%d, new deadline in %llds (%llds)",
 488                   req->rq_early_count,
 489                   req->rq_deadline - ktime_get_real_seconds(),
 490                   req->rq_deadline - olddl);
 491
 492         RETURN(rc);
 493 }
 494
 495 static struct kmem_cache *request_cache;
 496
 497 int ptlrpc_request_cache_init(void)
 498 {
 499         request_cache = kmem_cache_create("ptlrpc_cache",
 500                                           sizeof(struct ptlrpc_request),
 501                                           0, SLAB_HWCACHE_ALIGN, NULL);
 502         return request_cache ? 0 : -ENOMEM;
 503 }
 504
 505 void ptlrpc_request_cache_fini(void)
 506 {
 507         kmem_cache_destroy(request_cache);
 508 }
 509
 510 struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags)
 511 {
 512         struct ptlrpc_request *req;
 513
 514         OBD_SLAB_ALLOC_PTR_GFP(req, request_cache, flags);
 515         return req;
 516 }
 517
 518 void ptlrpc_request_cache_free(struct ptlrpc_request *req)
 519 {
 520         OBD_SLAB_FREE_PTR(req, request_cache);
 521 }
 522
 523 /**
 524  * Wind down request pool \a pool.
 525  * Frees all requests from the pool too
 526  */
 527 void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool)
 528 {
 529         struct list_head *l, *tmp;
 530         struct ptlrpc_request *req;
 531
 532         LASSERT(pool != NULL);
 533
 534         spin_lock(&pool->prp_lock);
 535         list_for_each_safe(l, tmp, &pool->prp_req_list) {
 536                 req = list_entry(l, struct ptlrpc_request, rq_list);
 537                 list_del(&req->rq_list);
 538                 LASSERT(req->rq_reqbuf);
 539                 LASSERT(req->rq_reqbuf_len == pool->prp_rq_size);
 540                 OBD_FREE_LARGE(req->rq_reqbuf, pool->prp_rq_size);
 541                 ptlrpc_request_cache_free(req);
 542         }
 543         spin_unlock(&pool->prp_lock);
 544         OBD_FREE(pool, sizeof(*pool));
 545 }
 546 EXPORT_SYMBOL(ptlrpc_free_rq_pool);
 547
 548 /**
 549  * Allocates, initializes and adds \a num_rq requests to the pool \a pool
 550  */
 551 int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
 552 {
 553         int i;
 554         int size = 1;
 555
 556         while (size < pool->prp_rq_size)
 557                 size <<= 1;
 558
 559         LASSERTF(list_empty(&pool->prp_req_list) ||
 560                  size == pool->prp_rq_size,
 561                  "Trying to change pool size with nonempty pool from %d to %d bytes\n",
 562                  pool->prp_rq_size, size);
 563
 564         spin_lock(&pool->prp_lock);
 565         pool->prp_rq_size = size;
 566         for (i = 0; i < num_rq; i++) {
 567                 struct ptlrpc_request *req;
 568                 struct lustre_msg *msg;
 569
 570                 spin_unlock(&pool->prp_lock);
 571                 req = ptlrpc_request_cache_alloc(GFP_NOFS);
 572                 if (!req)
 573                         return i;
 574                 OBD_ALLOC_LARGE(msg, size);
 575                 if (!msg) {
 576                         ptlrpc_request_cache_free(req);
 577                         return i;
 578                 }
 579                 req->rq_reqbuf = msg;
 580                 req->rq_reqbuf_len = size;
 581                 req->rq_pool = pool;
 582                 spin_lock(&pool->prp_lock);
 583                 list_add_tail(&req->rq_list, &pool->prp_req_list);
 584         }
 585         spin_unlock(&pool->prp_lock);
 586         return num_rq;
 587 }
 588 EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool);
 589
 590 /**
 591  * Create and initialize new request pool with given attributes:
 592  * \a num_rq - initial number of requests to create for the pool
 593  * \a msgsize - maximum message size possible for requests in thid pool
 594  * \a populate_pool - function to be called when more requests need to be added
 595  *                    to the pool
 596  * Returns pointer to newly created pool or NULL on error.
 597  */
 598 struct ptlrpc_request_pool *
 599 ptlrpc_init_rq_pool(int num_rq, int msgsize,
 600                     int (*populate_pool)(struct ptlrpc_request_pool *, int))
 601 {
 602         struct ptlrpc_request_pool *pool;
 603
 604         OBD_ALLOC(pool, sizeof(struct ptlrpc_request_pool));
 605         if (!pool)
 606                 return NULL;
 607
 608         /*
 609          * Request next power of two for the allocation, because internally
 610          * kernel would do exactly this
 611          */
 612         spin_lock_init(&pool->prp_lock);
 613         INIT_LIST_HEAD(&pool->prp_req_list);
 614         pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD;
 615         pool->prp_populate = populate_pool;
 616
 617         populate_pool(pool, num_rq);
 618
 619         return pool;
 620 }
 621 EXPORT_SYMBOL(ptlrpc_init_rq_pool);
 622
 623 /**
 624  * Fetches one request from pool \a pool
 625  */
 626 static struct ptlrpc_request *
 627 ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool)
 628 {
 629         struct ptlrpc_request *request;
 630         struct lustre_msg *reqbuf;
 631
 632         if (!pool)
 633                 return NULL;
 634
 635         spin_lock(&pool->prp_lock);
 636
 637         /*
 638          * See if we have anything in a pool, and bail out if nothing,
 639          * in writeout path, where this matters, this is safe to do, because
 640          * nothing is lost in this case, and when some in-flight requests
 641          * complete, this code will be called again.
 642          */
 643         if (unlikely(list_empty(&pool->prp_req_list))) {
 644                 spin_unlock(&pool->prp_lock);
 645                 return NULL;
 646         }
 647
 648         request = list_entry(pool->prp_req_list.next, struct ptlrpc_request,
 649                              rq_list);
 650         list_del_init(&request->rq_list);
 651         spin_unlock(&pool->prp_lock);
 652
 653         LASSERT(request->rq_reqbuf);
 654         LASSERT(request->rq_pool);
 655
 656         reqbuf = request->rq_reqbuf;
 657         memset(request, 0, sizeof(*request));
 658         request->rq_reqbuf = reqbuf;
 659         request->rq_reqbuf_len = pool->prp_rq_size;
 660         request->rq_pool = pool;
 661
 662         return request;
 663 }
 664
 665 /**
 666  * Returns freed \a request to pool.
 667  */
 668 static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request)
 669 {
 670         struct ptlrpc_request_pool *pool = request->rq_pool;
 671
 672         spin_lock(&pool->prp_lock);
 673         LASSERT(list_empty(&request->rq_list));
 674         LASSERT(!request->rq_receiving_reply);
 675         list_add_tail(&request->rq_list, &pool->prp_req_list);
 676         spin_unlock(&pool->prp_lock);
 677 }
 678
 679 void ptlrpc_add_unreplied(struct ptlrpc_request *req)
 680 {
 681         struct obd_import *imp = req->rq_import;
 682         struct list_head *tmp;
 683         struct ptlrpc_request *iter;
 684
 685         assert_spin_locked(&imp->imp_lock);
 686         LASSERT(list_empty(&req->rq_unreplied_list));
 687
 688         /* unreplied list is sorted by xid in ascending order */
 689         list_for_each_prev(tmp, &imp->imp_unreplied_list) {
 690                 iter = list_entry(tmp, struct ptlrpc_request,
 691                                   rq_unreplied_list);
 692
 693                 LASSERT(req->rq_xid != iter->rq_xid);
 694                 if (req->rq_xid < iter->rq_xid)
 695                         continue;
 696                 list_add(&req->rq_unreplied_list, &iter->rq_unreplied_list);
 697                 return;
 698         }
 699         list_add(&req->rq_unreplied_list, &imp->imp_unreplied_list);
 700 }
 701
 702 void ptlrpc_assign_next_xid_nolock(struct ptlrpc_request *req)
 703 {
 704         req->rq_xid = ptlrpc_next_xid();
 705         ptlrpc_add_unreplied(req);
 706 }
 707
 708 static inline void ptlrpc_assign_next_xid(struct ptlrpc_request *req)
 709 {
 710         spin_lock(&req->rq_import->imp_lock);
 711         ptlrpc_assign_next_xid_nolock(req);
 712         spin_unlock(&req->rq_import->imp_lock);
 713 }
 714
 715 static __u64 ptlrpc_last_xid;
 716 static spinlock_t ptlrpc_last_xid_lock;
 717
 718 int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
 719                              __u32 version, int opcode, char **bufs,
 720                              struct ptlrpc_cli_ctx *ctx)
 721 {
 722         int count;
 723         struct obd_import *imp;
 724         __u32 *lengths;
 725         int rc;
 726
 727         ENTRY;
 728
 729         count = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT);
 730         imp = request->rq_import;
 731         lengths = request->rq_pill.rc_area[RCL_CLIENT];
 732
 733         if (ctx) {
 734                 request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx);
 735         } else {
 736                 rc = sptlrpc_req_get_ctx(request);
 737                 if (rc)
 738                         GOTO(out_free, rc);
 739         }
 740         sptlrpc_req_set_flavor(request, opcode);
 741
 742         rc = lustre_pack_request(request, imp->imp_msg_magic, count,
 743                                  lengths, bufs);
 744         if (rc)
 745                 GOTO(out_ctx, rc);
 746
 747         lustre_msg_add_version(request->rq_reqmsg, version);
 748         request->rq_send_state = LUSTRE_IMP_FULL;
 749         request->rq_type = PTL_RPC_MSG_REQUEST;
 750
 751         request->rq_req_cbid.cbid_fn  = request_out_callback;
 752         request->rq_req_cbid.cbid_arg = request;
 753
 754         request->rq_reply_cbid.cbid_fn  = reply_in_callback;
 755         request->rq_reply_cbid.cbid_arg = request;
 756
 757         request->rq_reply_deadline = 0;
 758         request->rq_bulk_deadline = 0;
 759         request->rq_req_deadline = 0;
 760         request->rq_phase = RQ_PHASE_NEW;
 761         request->rq_next_phase = RQ_PHASE_UNDEFINED;
 762
 763         request->rq_request_portal = imp->imp_client->cli_request_portal;
 764         request->rq_reply_portal = imp->imp_client->cli_reply_portal;
 765
 766         ptlrpc_at_set_req_timeout(request);
 767
 768         lustre_msg_set_opc(request->rq_reqmsg, opcode);
 769
 770         /* Let's setup deadline for req/reply/bulk unlink for opcode. */
 771         if (cfs_fail_val == opcode) {
 772                 time64_t *fail_t = NULL, *fail2_t = NULL;
 773
 774                 if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) {
 775                         fail_t = &request->rq_bulk_deadline;
 776                 } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
 777                         fail_t = &request->rq_reply_deadline;
 778                 } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REQ_UNLINK)) {
 779                         fail_t = &request->rq_req_deadline;
 780                 } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK)) {
 781                         fail_t = &request->rq_reply_deadline;
 782                         fail2_t = &request->rq_bulk_deadline;
 783                 } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_ROUND_XID)) {
 784                         time64_t now = ktime_get_real_seconds();
 785                         spin_lock(&ptlrpc_last_xid_lock);
 786                         ptlrpc_last_xid = ((__u64)now >> 4) << 24;
 787                         spin_unlock(&ptlrpc_last_xid_lock);
 788                 }
 789
 790                 if (fail_t) {
 791                         *fail_t = ktime_get_real_seconds() + LONG_UNLINK;
 792
 793                         if (fail2_t)
 794                                 *fail2_t = ktime_get_real_seconds() +
 795                                            LONG_UNLINK;
 796
 797                         /*
 798                          * The RPC is infected, let the test to change the
 799                          * fail_loc
 800                          */
 801                         msleep(4 * MSEC_PER_SEC);
 802                 }
 803         }
 804         ptlrpc_assign_next_xid(request);
 805
 806         RETURN(0);
 807
 808 out_ctx:
 809         LASSERT(!request->rq_pool);
 810         sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1);
 811 out_free:
 812         class_import_put(imp);
 813
 814         return rc;
 815 }
 816 EXPORT_SYMBOL(ptlrpc_request_bufs_pack);
 817
 818 /**
 819  * Pack request buffers for network transfer, performing necessary encryption
 820  * steps if necessary.
 821  */
 822 int ptlrpc_request_pack(struct ptlrpc_request *request,
 823                         __u32 version, int opcode)
 824 {
 825         int rc;
 826
 827         rc = ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL);
 828         if (rc)
 829                 return rc;
 830
 831         /*
 832          * For some old 1.8 clients (< 1.8.7), they will LASSERT the size of
 833          * ptlrpc_body sent from server equal to local ptlrpc_body size, so we
 834          * have to send old ptlrpc_body to keep interoprability with these
 835          * clients.
 836          *
 837          * Only three kinds of server->client RPCs so far:
 838          *  - LDLM_BL_CALLBACK
 839          *  - LDLM_CP_CALLBACK
 840          *  - LDLM_GL_CALLBACK
 841          *
 842          * XXX This should be removed whenever we drop the interoprability with
 843          *     the these old clients.
 844          */
 845         if (opcode == LDLM_BL_CALLBACK || opcode == LDLM_CP_CALLBACK ||
 846             opcode == LDLM_GL_CALLBACK)
 847                 req_capsule_shrink(&request->rq_pill, &RMF_PTLRPC_BODY,
 848                                    sizeof(struct ptlrpc_body_v2), RCL_CLIENT);
 849
 850         return rc;
 851 }
 852 EXPORT_SYMBOL(ptlrpc_request_pack);
 853
 854 /**
 855  * Helper function to allocate new request on import \a imp
 856  * and possibly using existing request from pool \a pool if provided.
 857  * Returns allocated request structure with import field filled or
 858  * NULL on error.
 859  */
 860 static inline
 861 struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp,
 862                                               struct ptlrpc_request_pool *pool)
 863 {
 864         struct ptlrpc_request *request = NULL;
 865
 866         request = ptlrpc_request_cache_alloc(GFP_NOFS);
 867
 868         if (!request && pool)
 869                 request = ptlrpc_prep_req_from_pool(pool);
 870
 871         if (request) {
 872                 ptlrpc_cli_req_init(request);
 873
 874                 LASSERTF((unsigned long)imp > 0x1000, "%p", imp);
 875                 LASSERT(imp != LP_POISON);
 876                 LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p\n",
 877                          imp->imp_client);
 878                 LASSERT(imp->imp_client != LP_POISON);
 879
 880                 request->rq_import = class_import_get(imp);
 881         } else {
 882                 CERROR("request allocation out of memory\n");
 883         }
 884
 885         return request;
 886 }
 887
 888 /**
 889  * Helper function for creating a request.
 890  * Calls __ptlrpc_request_alloc to allocate new request sturcture and inits
 891  * buffer structures according to capsule template \a format.
 892  * Returns allocated request structure pointer or NULL on error.
 893  */
 894 static struct ptlrpc_request *
 895 ptlrpc_request_alloc_internal(struct obd_import *imp,
 896                               struct ptlrpc_request_pool *pool,
 897                               const struct req_format *format)
 898 {
 899         struct ptlrpc_request *request;
 900         int connect = 0;
 901
 902         request = __ptlrpc_request_alloc(imp, pool);
 903         if (!request)
 904                 return NULL;
 905
 906         /*
 907          * initiate connection if needed when the import has been
 908          * referenced by the new request to avoid races with disconnect
 909          */
 910         if (unlikely(imp->imp_state == LUSTRE_IMP_IDLE)) {
 911                 int rc;
 912
 913                 CDEBUG_LIMIT(imp->imp_idle_debug,
 914                              "%s: reconnect after %llds idle\n",
 915                              imp->imp_obd->obd_name, ktime_get_real_seconds() -
 916                                                      imp->imp_last_reply_time);
 917                 spin_lock(&imp->imp_lock);
 918                 if (imp->imp_state == LUSTRE_IMP_IDLE) {
 919                         imp->imp_generation++;
 920                         imp->imp_initiated_at = imp->imp_generation;
 921                         imp->imp_state =  LUSTRE_IMP_NEW;
 922                         connect = 1;
 923                 }
 924                 spin_unlock(&imp->imp_lock);
 925                 if (connect) {
 926                         rc = ptlrpc_connect_import(imp);
 927                         if (rc < 0) {
 928                                 ptlrpc_request_free(request);
 929                                 return NULL;
 930                         }
 931                         ptlrpc_pinger_add_import(imp);
 932                 }
 933         }
 934
 935         req_capsule_init(&request->rq_pill, request, RCL_CLIENT);
 936         req_capsule_set(&request->rq_pill, format);
 937         return request;
 938 }
 939
 940 /**
 941  * Allocate new request structure for import \a imp and initialize its
 942  * buffer structure according to capsule template \a format.
 943  */
 944 struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
 945                                             const struct req_format *format)
 946 {
 947         return ptlrpc_request_alloc_internal(imp, NULL, format);
 948 }
 949 EXPORT_SYMBOL(ptlrpc_request_alloc);
 950
 951 /**
 952  * Allocate new request structure for import \a imp from pool \a pool and
 953  * initialize its buffer structure according to capsule template \a format.
 954  */
 955 struct ptlrpc_request *
 956 ptlrpc_request_alloc_pool(struct obd_import *imp,
 957                           struct ptlrpc_request_pool *pool,
 958                           const struct req_format *format)
 959 {
 960         return ptlrpc_request_alloc_internal(imp, pool, format);
 961 }
 962 EXPORT_SYMBOL(ptlrpc_request_alloc_pool);
 963
 964 /**
 965  * For requests not from pool, free memory of the request structure.
 966  * For requests obtained from a pool earlier, return request back to pool.
 967  */
 968 void ptlrpc_request_free(struct ptlrpc_request *request)
 969 {
 970         if (request->rq_pool)
 971                 __ptlrpc_free_req_to_pool(request);
 972         else
 973                 ptlrpc_request_cache_free(request);
 974 }
 975 EXPORT_SYMBOL(ptlrpc_request_free);
 976
 977 /**
 978  * Allocate new request for operatione \a opcode and immediatelly pack it for
 979  * network transfer.
 980  * Only used for simple requests like OBD_PING where the only important
 981  * part of the request is operation itself.
 982  * Returns allocated request or NULL on error.
 983  */
 984 struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
 985                                                  const struct req_format *format,
 986                                                  __u32 version, int opcode)
 987 {
 988         struct ptlrpc_request *req = ptlrpc_request_alloc(imp, format);
 989         int rc;
 990
 991         if (req) {
 992                 rc = ptlrpc_request_pack(req, version, opcode);
 993                 if (rc) {
 994                         ptlrpc_request_free(req);
 995                         req = NULL;
 996                 }
 997         }
 998         return req;
 999 }
1000 EXPORT_SYMBOL(ptlrpc_request_alloc_pack);
1001
1002 /**
1003  * Allocate and initialize new request set structure on the current CPT.
1004  * Returns a pointer to the newly allocated set structure or NULL on error.
1005  */
1006 struct ptlrpc_request_set *ptlrpc_prep_set(void)
1007 {
1008         struct ptlrpc_request_set *set;
1009         int cpt;
1010
1011         ENTRY;
1012         cpt = cfs_cpt_current(cfs_cpt_table, 0);
1013         OBD_CPT_ALLOC(set, cfs_cpt_table, cpt, sizeof(*set));
1014         if (!set)
1015                 RETURN(NULL);
1016         atomic_set(&set->set_refcount, 1);
1017         INIT_LIST_HEAD(&set->set_requests);
1018         init_waitqueue_head(&set->set_waitq);
1019         atomic_set(&set->set_new_count, 0);
1020         atomic_set(&set->set_remaining, 0);
1021         spin_lock_init(&set->set_new_req_lock);
1022         INIT_LIST_HEAD(&set->set_new_requests);
1023         set->set_max_inflight = UINT_MAX;
1024         set->set_producer     = NULL;
1025         set->set_producer_arg = NULL;
1026         set->set_rc           = 0;
1027
1028         RETURN(set);
1029 }
1030 EXPORT_SYMBOL(ptlrpc_prep_set);
1031
1032 /**
1033  * Allocate and initialize new request set structure with flow control
1034  * extension. This extension allows to control the number of requests in-flight
1035  * for the whole set. A callback function to generate requests must be provided
1036  * and the request set will keep the number of requests sent over the wire to
1037  * @max_inflight.
1038  * Returns a pointer to the newly allocated set structure or NULL on error.
1039  */
1040 struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
1041                                              void *arg)
1042
1043 {
1044         struct ptlrpc_request_set *set;
1045
1046         set = ptlrpc_prep_set();
1047         if (!set)
1048                 RETURN(NULL);
1049
1050         set->set_max_inflight  = max;
1051         set->set_producer      = func;
1052         set->set_producer_arg  = arg;
1053
1054         RETURN(set);
1055 }
1056
1057 /**
1058  * Wind down and free request set structure previously allocated with
1059  * ptlrpc_prep_set.
1060  * Ensures that all requests on the set have completed and removes
1061  * all requests from the request list in a set.
1062  * If any unsent request happen to be on the list, pretends that they got
1063  * an error in flight and calls their completion handler.
1064  */
1065 void ptlrpc_set_destroy(struct ptlrpc_request_set *set)
1066 {
1067         struct list_head *tmp;
1068         struct list_head *next;
1069         int expected_phase;
1070         int n = 0;
1071
1072         ENTRY;
1073
1074         /* Requests on the set should either all be completed, or all be new */
1075         expected_phase = (atomic_read(&set->set_remaining) == 0) ?
1076                          RQ_PHASE_COMPLETE : RQ_PHASE_NEW;
1077         list_for_each(tmp, &set->set_requests) {
1078                 struct ptlrpc_request *req =
1079                         list_entry(tmp, struct ptlrpc_request,
1080                                    rq_set_chain);
1081
1082                 LASSERT(req->rq_phase == expected_phase);
1083                 n++;
1084         }
1085
1086         LASSERTF(atomic_read(&set->set_remaining) == 0 ||
1087                  atomic_read(&set->set_remaining) == n, "%d / %d\n",
1088                  atomic_read(&set->set_remaining), n);
1089
1090         list_for_each_safe(tmp, next, &set->set_requests) {
1091                 struct ptlrpc_request *req =
1092                         list_entry(tmp, struct ptlrpc_request,
1093                                    rq_set_chain);
1094                 list_del_init(&req->rq_set_chain);
1095
1096                 LASSERT(req->rq_phase == expected_phase);
1097
1098                 if (req->rq_phase == RQ_PHASE_NEW) {
1099                         ptlrpc_req_interpret(NULL, req, -EBADR);
1100                         atomic_dec(&set->set_remaining);
1101                 }
1102
1103                 spin_lock(&req->rq_lock);
1104                 req->rq_set = NULL;
1105                 req->rq_invalid_rqset = 0;
1106                 spin_unlock(&req->rq_lock);
1107
1108                 ptlrpc_req_finished(req);
1109         }
1110
1111         LASSERT(atomic_read(&set->set_remaining) == 0);
1112
1113         ptlrpc_reqset_put(set);
1114         EXIT;
1115 }
1116 EXPORT_SYMBOL(ptlrpc_set_destroy);
1117
1118 /**
1119  * Add a new request to the general purpose request set.
1120  * Assumes request reference from the caller.
1121  */
1122 void ptlrpc_set_add_req(struct ptlrpc_request_set *set,
1123                         struct ptlrpc_request *req)
1124 {
1125         LASSERT(req->rq_import->imp_state != LUSTRE_IMP_IDLE);
1126         LASSERT(list_empty(&req->rq_set_chain));
1127
1128         if (req->rq_allow_intr)
1129                 set->set_allow_intr = 1;
1130
1131         /* The set takes over the caller's request reference */
1132         list_add_tail(&req->rq_set_chain, &set->set_requests);
1133         req->rq_set = set;
1134         atomic_inc(&set->set_remaining);
1135         req->rq_queued_time = ktime_get_seconds();
1136
1137         if (req->rq_reqmsg)
1138                 lustre_msg_set_jobid(req->rq_reqmsg, NULL);
1139
1140         if (set->set_producer)
1141                 /*
1142                  * If the request set has a producer callback, the RPC must be
1143                  * sent straight away
1144                  */
1145                 ptlrpc_send_new_req(req);
1146 }
1147 EXPORT_SYMBOL(ptlrpc_set_add_req);
1148
1149 /**
1150  * Add a request to a request with dedicated server thread
1151  * and wake the thread to make any necessary processing.
1152  * Currently only used for ptlrpcd.
1153  */
1154 void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
1155                             struct ptlrpc_request *req)
1156 {
1157         struct ptlrpc_request_set *set = pc->pc_set;
1158         int count, i;
1159
1160         LASSERT(req->rq_set == NULL);
1161         LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0);
1162
1163         spin_lock(&set->set_new_req_lock);
1164         /*
1165          * The set takes over the caller's request reference.
1166          */
1167         req->rq_set = set;
1168         req->rq_queued_time = ktime_get_seconds();
1169         list_add_tail(&req->rq_set_chain, &set->set_new_requests);
1170         count = atomic_inc_return(&set->set_new_count);
1171         spin_unlock(&set->set_new_req_lock);
1172
1173         /* Only need to call wakeup once for the first entry. */
1174         if (count == 1) {
1175                 wake_up(&set->set_waitq);
1176
1177                 /*
1178                  * XXX: It maybe unnecessary to wakeup all the partners. But to
1179                  *      guarantee the async RPC can be processed ASAP, we have
1180                  *      no other better choice. It maybe fixed in future.
1181                  */
1182                 for (i = 0; i < pc->pc_npartners; i++)
1183                         wake_up(&pc->pc_partners[i]->pc_set->set_waitq);
1184         }
1185 }
1186
1187 /**
1188  * Based on the current state of the import, determine if the request
1189  * can be sent, is an error, or should be delayed.
1190  *
1191  * Returns true if this request should be delayed. If false, and
1192  * *status is set, then the request can not be sent and *status is the
1193  * error code.  If false and status is 0, then request can be sent.
1194  *
1195  * The imp->imp_lock must be held.
1196  */
1197 static int ptlrpc_import_delay_req(struct obd_import *imp,
1198                                    struct ptlrpc_request *req, int *status)
1199 {
1200         int delay = 0;
1201
1202         ENTRY;
1203         LASSERT(status);
1204         *status = 0;
1205
1206         if (req->rq_ctx_init || req->rq_ctx_fini) {
1207                 /* always allow ctx init/fini rpc go through */
1208         } else if (imp->imp_state == LUSTRE_IMP_NEW) {
1209                 DEBUG_REQ(D_ERROR, req, "Uninitialized import.");
1210                 *status = -EIO;
1211         } else if (imp->imp_state == LUSTRE_IMP_CLOSED) {
1212                 unsigned int opc = lustre_msg_get_opc(req->rq_reqmsg);
1213
1214                 /*
1215                  * pings or MDS-equivalent STATFS may safely
1216                  * race with umount
1217                  */
1218                 DEBUG_REQ((opc == OBD_PING || opc == OST_STATFS) ?
1219                           D_HA : D_ERROR, req, "IMP_CLOSED ");
1220                 *status = -EIO;
1221         } else if (ptlrpc_send_limit_expired(req)) {
1222                 /* probably doesn't need to be a D_ERROR afterinitial testing */
1223                 DEBUG_REQ(D_HA, req, "send limit expired ");
1224                 *status = -ETIMEDOUT;
1225         } else if (req->rq_send_state == LUSTRE_IMP_CONNECTING &&
1226                    imp->imp_state == LUSTRE_IMP_CONNECTING) {
1227                 ;/* allow CONNECT even if import is invalid */
1228                 if (atomic_read(&imp->imp_inval_count) != 0) {
1229                         DEBUG_REQ(D_ERROR, req, "invalidate in flight");
1230                         *status = -EIO;
1231                 }
1232         } else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) {
1233                 if (!imp->imp_deactive)
1234                         DEBUG_REQ(D_NET, req, "IMP_INVALID");
1235                 *status = -ESHUTDOWN; /* b=12940 */
1236         } else if (req->rq_import_generation != imp->imp_generation) {
1237                 DEBUG_REQ(D_ERROR, req, "req wrong generation:");
1238                 *status = -EIO;
1239         } else if (req->rq_send_state != imp->imp_state) {
1240                 /* invalidate in progress - any requests should be drop */
1241                 if (atomic_read(&imp->imp_inval_count) != 0) {
1242                         DEBUG_REQ(D_ERROR, req, "invalidate in flight");
1243                         *status = -EIO;
1244                 } else if (req->rq_no_delay &&
1245                            imp->imp_generation != imp->imp_initiated_at) {
1246                         /* ignore nodelay for requests initiating connections */
1247                         *status = -EWOULDBLOCK;
1248                 } else if (req->rq_allow_replay &&
1249                            (imp->imp_state == LUSTRE_IMP_REPLAY ||
1250                             imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS ||
1251                             imp->imp_state == LUSTRE_IMP_REPLAY_WAIT ||
1252                             imp->imp_state == LUSTRE_IMP_RECOVER)) {
1253                         DEBUG_REQ(D_HA, req, "allow during recovery.\n");
1254                 } else {
1255                         delay = 1;
1256                 }
1257         }
1258
1259         RETURN(delay);
1260 }
1261
1262 /**
1263  * Decide if the error message should be printed to the console or not.
1264  * Makes its decision based on request type, status, and failure frequency.
1265  *
1266  * \param[in] req  request that failed and may need a console message
1267  *
1268  * \retval false if no message should be printed
1269  * \retval true  if console message should be printed
1270  */
1271 static bool ptlrpc_console_allow(struct ptlrpc_request *req, __u32 opc, int err)
1272 {
1273         LASSERT(req->rq_reqmsg != NULL);
1274
1275         /* Suppress particular reconnect errors which are to be expected. */
1276         if (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT) {
1277                 /* Suppress timed out reconnect requests */
1278                 if (lustre_handle_is_used(&req->rq_import->imp_remote_handle) ||
1279                     req->rq_timedout)
1280                         return false;
1281
1282                 /*
1283                  * Suppress most unavailable/again reconnect requests, but
1284                  * print occasionally so it is clear client is trying to
1285                  * connect to a server where no target is running.
1286                  */
1287                 if ((err == -ENODEV || err == -EAGAIN) &&
1288                     req->rq_import->imp_conn_cnt % 30 != 20)
1289                         return false;
1290         }
1291
1292         if (opc == LDLM_ENQUEUE && err == -EAGAIN)
1293                 /* -EAGAIN is normal when using POSIX flocks */
1294                 return false;
1295
1296         if (opc == OBD_PING && (err == -ENODEV || err == -ENOTCONN) &&
1297             (req->rq_xid & 0xf) != 10)
1298                 /* Suppress most ping requests, they may fail occasionally */
1299                 return false;
1300
1301         return true;
1302 }
1303
1304 /**
1305  * Check request processing status.
1306  * Returns the status.
1307  */
1308 static int ptlrpc_check_status(struct ptlrpc_request *req)
1309 {
1310         int err;
1311
1312         ENTRY;
1313         err = lustre_msg_get_status(req->rq_repmsg);
1314         if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
1315                 struct obd_import *imp = req->rq_import;
1316                 lnet_nid_t nid = imp->imp_connection->c_peer.nid;
1317                 __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
1318
1319                 if (ptlrpc_console_allow(req, opc, err))
1320                         LCONSOLE_ERROR_MSG(0x11,
1321                                            "%s: operation %s to node %s failed: rc = %d\n",
1322                                            imp->imp_obd->obd_name,
1323                                            ll_opcode2str(opc),
1324                                            libcfs_nid2str(nid), err);
1325                 RETURN(err < 0 ? err : -EINVAL);
1326         }
1327
1328         if (err < 0) {
1329                 DEBUG_REQ(D_INFO, req, "status is %d", err);
1330         } else if (err > 0) {
1331                 /* XXX: translate this error from net to host */
1332                 DEBUG_REQ(D_INFO, req, "status is %d", err);
1333         }
1334
1335         RETURN(err);
1336 }
1337
1338 /**
1339  * save pre-versions of objects into request for replay.
1340  * Versions are obtained from server reply.
1341  * used for VBR.
1342  */
1343 static void ptlrpc_save_versions(struct ptlrpc_request *req)
1344 {
1345         struct lustre_msg *repmsg = req->rq_repmsg;
1346         struct lustre_msg *reqmsg = req->rq_reqmsg;
1347         __u64 *versions = lustre_msg_get_versions(repmsg);
1348
1349         ENTRY;
1350         if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)
1351                 return;
1352
1353         LASSERT(versions);
1354         lustre_msg_set_versions(reqmsg, versions);
1355         CDEBUG(D_INFO, "Client save versions [%#llx/%#llx]\n",
1356                versions[0], versions[1]);
1357
1358         EXIT;
1359 }
1360
1361 __u64 ptlrpc_known_replied_xid(struct obd_import *imp)
1362 {
1363         struct ptlrpc_request *req;
1364
1365         assert_spin_locked(&imp->imp_lock);
1366         if (list_empty(&imp->imp_unreplied_list))
1367                 return 0;
1368
1369         req = list_entry(imp->imp_unreplied_list.next, struct ptlrpc_request,
1370                          rq_unreplied_list);
1371         LASSERTF(req->rq_xid >= 1, "XID:%llu\n", req->rq_xid);
1372
1373         if (imp->imp_known_replied_xid < req->rq_xid - 1)
1374                 imp->imp_known_replied_xid = req->rq_xid - 1;
1375
1376         return req->rq_xid - 1;
1377 }
1378
1379 /**
1380  * Callback function called when client receives RPC reply for \a req.
1381  * Returns 0 on success or error code.
1382  * The return alue would be assigned to req->rq_status by the caller
1383  * as request processing status.
1384  * This function also decides if the request needs to be saved for later replay.
1385  */
1386 static int after_reply(struct ptlrpc_request *req)
1387 {
1388         struct obd_import *imp = req->rq_import;
1389         struct obd_device *obd = req->rq_import->imp_obd;
1390         ktime_t work_start;
1391         u64 committed;
1392         s64 timediff;
1393         int rc;
1394
1395         ENTRY;
1396         LASSERT(obd != NULL);
1397         /* repbuf must be unlinked */
1398         LASSERT(!req->rq_receiving_reply && req->rq_reply_unlinked);
1399
1400         if (req->rq_reply_truncated) {
1401                 if (ptlrpc_no_resend(req)) {
1402                         DEBUG_REQ(D_ERROR, req,
1403                                   "reply buffer overflow, expected: %d, actual size: %d",
1404                                   req->rq_nob_received, req->rq_repbuf_len);
1405                         RETURN(-EOVERFLOW);
1406                 }
1407
1408                 sptlrpc_cli_free_repbuf(req);
1409                 /*
1410                  * Pass the required reply buffer size (include
1411                  * space for early reply).
1412                  * NB: no need to roundup because alloc_repbuf
1413                  * will roundup it
1414                  */
1415                 req->rq_replen = req->rq_nob_received;
1416                 req->rq_nob_received = 0;
1417                 spin_lock(&req->rq_lock);
1418                 req->rq_resend       = 1;
1419                 spin_unlock(&req->rq_lock);
1420                 RETURN(0);
1421         }
1422
1423         work_start = ktime_get_real();
1424         timediff = ktime_us_delta(work_start, req->rq_sent_ns);
1425
1426         /*
1427          * NB Until this point, the whole of the incoming message,
1428          * including buflens, status etc is in the sender's byte order.
1429          */
1430         rc = sptlrpc_cli_unwrap_reply(req);
1431         if (rc) {
1432                 DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc);
1433                 RETURN(rc);
1434         }
1435
1436         /*
1437          * Security layer unwrap might ask resend this request.
1438          */
1439         if (req->rq_resend)
1440                 RETURN(0);
1441
1442         rc = unpack_reply(req);
1443         if (rc)
1444                 RETURN(rc);
1445
1446         /* retry indefinitely on EINPROGRESS */
1447         if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS &&
1448             ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) {
1449                 time64_t now = ktime_get_real_seconds();
1450
1451                 DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS");
1452                 spin_lock(&req->rq_lock);
1453                 req->rq_resend = 1;
1454                 spin_unlock(&req->rq_lock);
1455                 req->rq_nr_resend++;
1456
1457                 /* Readjust the timeout for current conditions */
1458                 ptlrpc_at_set_req_timeout(req);
1459                 /*
1460                  * delay resend to give a chance to the server to get ready.
1461                  * The delay is increased by 1s on every resend and is capped to
1462                  * the current request timeout (i.e. obd_timeout if AT is off,
1463                  * or AT service time x 125% + 5s, see at_est2timeout)
1464                  */
1465                 if (req->rq_nr_resend > req->rq_timeout)
1466                         req->rq_sent = now + req->rq_timeout;
1467                 else
1468                         req->rq_sent = now + req->rq_nr_resend;
1469
1470                 /* Resend for EINPROGRESS will use a new XID */
1471                 spin_lock(&imp->imp_lock);
1472                 list_del_init(&req->rq_unreplied_list);
1473                 spin_unlock(&imp->imp_lock);
1474
1475                 RETURN(0);
1476         }
1477
1478         if (obd->obd_svc_stats) {
1479                 lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR,
1480                                     timediff);
1481                 ptlrpc_lprocfs_rpc_sent(req, timediff);
1482         }
1483
1484         if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY &&
1485             lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) {
1486                 DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)",
1487                           lustre_msg_get_type(req->rq_repmsg));
1488                 RETURN(-EPROTO);
1489         }
1490
1491         if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING)
1492                 CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val);
1493         ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg));
1494         ptlrpc_at_adj_net_latency(req,
1495                                   lustre_msg_get_service_time(req->rq_repmsg));
1496
1497         rc = ptlrpc_check_status(req);
1498
1499         if (rc) {
1500                 /*
1501                  * Either we've been evicted, or the server has failed for
1502                  * some reason. Try to reconnect, and if that fails, punt to
1503                  * the upcall.
1504                  */
1505                 if (ptlrpc_recoverable_error(rc)) {
1506                         if (req->rq_send_state != LUSTRE_IMP_FULL ||
1507                             imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) {
1508                                 RETURN(rc);
1509                         }
1510                         ptlrpc_request_handle_notconn(req);
1511                         RETURN(rc);
1512                 }
1513         } else {
1514                 /*
1515                  * Let's look if server sent slv. Do it only for RPC with
1516                  * rc == 0.
1517                  */
1518                 ldlm_cli_update_pool(req);
1519         }
1520
1521         /*
1522          * Store transno in reqmsg for replay.
1523          */
1524         if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) {
1525                 req->rq_transno = lustre_msg_get_transno(req->rq_repmsg);
1526                 lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno);
1527         }
1528
1529         if (imp->imp_replayable) {
1530                 spin_lock(&imp->imp_lock);
1531                 /*
1532                  * No point in adding already-committed requests to the replay
1533                  * list, we will just remove them immediately. b=9829
1534                  */
1535                 if (req->rq_transno != 0 &&
1536                     (req->rq_transno >
1537                      lustre_msg_get_last_committed(req->rq_repmsg) ||
1538                      req->rq_replay)) {
1539                         /** version recovery */
1540                         ptlrpc_save_versions(req);
1541                         ptlrpc_retain_replayable_request(req, imp);
1542                 } else if (req->rq_commit_cb &&
1543                            list_empty(&req->rq_replay_list)) {
1544                         /*
1545                          * NB: don't call rq_commit_cb if it's already on
1546                          * rq_replay_list, ptlrpc_free_committed() will call
1547                          * it later, see LU-3618 for details
1548                          */
1549                         spin_unlock(&imp->imp_lock);
1550                         req->rq_commit_cb(req);
1551                         spin_lock(&imp->imp_lock);
1552                 }
1553
1554                 /*
1555                  * Replay-enabled imports return commit-status information.
1556                  */
1557                 committed = lustre_msg_get_last_committed(req->rq_repmsg);
1558                 if (likely(committed > imp->imp_peer_committed_transno))
1559                         imp->imp_peer_committed_transno = committed;
1560
1561                 ptlrpc_free_committed(imp);
1562
1563                 if (!list_empty(&imp->imp_replay_list)) {
1564                         struct ptlrpc_request *last;
1565
1566                         last = list_entry(imp->imp_replay_list.prev,
1567                                           struct ptlrpc_request,
1568                                           rq_replay_list);
1569                         /*
1570                          * Requests with rq_replay stay on the list even if no
1571                          * commit is expected.
1572                          */
1573                         if (last->rq_transno > imp->imp_peer_committed_transno)
1574                                 ptlrpc_pinger_commit_expected(imp);
1575                 }
1576
1577                 spin_unlock(&imp->imp_lock);
1578         }
1579
1580         RETURN(rc);
1581 }
1582
1583 /**
1584  * Helper function to send request \a req over the network for the first time
1585  * Also adjusts request phase.
1586  * Returns 0 on success or error code.
1587  */
1588 static int ptlrpc_send_new_req(struct ptlrpc_request *req)
1589 {
1590         struct obd_import *imp = req->rq_import;
1591         __u64 min_xid = 0;
1592         int rc;
1593
1594         ENTRY;
1595         LASSERT(req->rq_phase == RQ_PHASE_NEW);
1596
1597         /* do not try to go further if there is not enough memory in enc_pool */
1598         if (req->rq_sent && req->rq_bulk)
1599                 if (req->rq_bulk->bd_iov_count > get_free_pages_in_pool() &&
1600                     pool_is_at_full_capacity())
1601                         RETURN(-ENOMEM);
1602
1603         if (req->rq_sent && (req->rq_sent > ktime_get_real_seconds()) &&
1604             (!req->rq_generation_set ||
1605              req->rq_import_generation == imp->imp_generation))
1606                 RETURN(0);
1607
1608         ptlrpc_rqphase_move(req, RQ_PHASE_RPC);
1609
1610         spin_lock(&imp->imp_lock);
1611
1612         LASSERT(req->rq_xid != 0);
1613         LASSERT(!list_empty(&req->rq_unreplied_list));
1614
1615         if (!req->rq_generation_set)
1616                 req->rq_import_generation = imp->imp_generation;
1617
1618         if (ptlrpc_import_delay_req(imp, req, &rc)) {
1619                 spin_lock(&req->rq_lock);
1620                 req->rq_waiting = 1;
1621                 spin_unlock(&req->rq_lock);
1622
1623                 DEBUG_REQ(D_HA, req, "req waiting for recovery: (%s != %s)",
1624                           ptlrpc_import_state_name(req->rq_send_state),
1625                           ptlrpc_import_state_name(imp->imp_state));
1626                 LASSERT(list_empty(&req->rq_list));
1627                 list_add_tail(&req->rq_list, &imp->imp_delayed_list);
1628                 atomic_inc(&req->rq_import->imp_inflight);
1629                 spin_unlock(&imp->imp_lock);
1630                 RETURN(0);
1631         }
1632
1633         if (rc != 0) {
1634                 spin_unlock(&imp->imp_lock);
1635                 req->rq_status = rc;
1636                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
1637                 RETURN(rc);
1638         }
1639
1640         LASSERT(list_empty(&req->rq_list));
1641         list_add_tail(&req->rq_list, &imp->imp_sending_list);
1642         atomic_inc(&req->rq_import->imp_inflight);
1643
1644         /*
1645          * find the known replied XID from the unreplied list, CONNECT
1646          * and DISCONNECT requests are skipped to make the sanity check
1647          * on server side happy. see process_req_last_xid().
1648          *
1649          * For CONNECT: Because replay requests have lower XID, it'll
1650          * break the sanity check if CONNECT bump the exp_last_xid on
1651          * server.
1652          *
1653          * For DISCONNECT: Since client will abort inflight RPC before
1654          * sending DISCONNECT, DISCONNECT may carry an XID which higher
1655          * than the inflight RPC.
1656          */
1657         if (!ptlrpc_req_is_connect(req) && !ptlrpc_req_is_disconnect(req))
1658                 min_xid = ptlrpc_known_replied_xid(imp);
1659         spin_unlock(&imp->imp_lock);
1660
1661         lustre_msg_set_last_xid(req->rq_reqmsg, min_xid);
1662
1663         lustre_msg_set_status(req->rq_reqmsg, current_pid());
1664
1665         rc = sptlrpc_req_refresh_ctx(req, -1);
1666         if (rc) {
1667                 if (req->rq_err) {
1668                         req->rq_status = rc;
1669                         RETURN(1);
1670                 } else {
1671                         spin_lock(&req->rq_lock);
1672                         req->rq_wait_ctx = 1;
1673                         spin_unlock(&req->rq_lock);
1674                         RETURN(0);
1675                 }
1676         }
1677
1678         CDEBUG(D_RPCTRACE,
1679                "Sending RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n",
1680                current_comm(),
1681                imp->imp_obd->obd_uuid.uuid,
1682                lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
1683                obd_import_nid2str(imp), lustre_msg_get_opc(req->rq_reqmsg));
1684
1685         rc = ptl_send_rpc(req, 0);
1686         if (rc == -ENOMEM) {
1687                 spin_lock(&imp->imp_lock);
1688                 if (!list_empty(&req->rq_list)) {
1689                         list_del_init(&req->rq_list);
1690                         atomic_dec(&req->rq_import->imp_inflight);
1691                 }
1692                 spin_unlock(&imp->imp_lock);
1693                 ptlrpc_rqphase_move(req, RQ_PHASE_NEW);
1694                 RETURN(rc);
1695         }
1696         if (rc) {
1697                 DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc);
1698                 spin_lock(&req->rq_lock);
1699                 req->rq_net_err = 1;
1700                 spin_unlock(&req->rq_lock);
1701                 RETURN(rc);
1702         }
1703         RETURN(0);
1704 }
1705
1706 static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set)
1707 {
1708         int remaining, rc;
1709
1710         ENTRY;
1711         LASSERT(set->set_producer != NULL);
1712
1713         remaining = atomic_read(&set->set_remaining);
1714
1715         /*
1716          * populate the ->set_requests list with requests until we
1717          * reach the maximum number of RPCs in flight for this set
1718          */
1719         while (atomic_read(&set->set_remaining) < set->set_max_inflight) {
1720                 rc = set->set_producer(set, set->set_producer_arg);
1721                 if (rc == -ENOENT) {
1722                         /* no more RPC to produce */
1723                         set->set_producer     = NULL;
1724                         set->set_producer_arg = NULL;
1725                         RETURN(0);
1726                 }
1727         }
1728
1729         RETURN((atomic_read(&set->set_remaining) - remaining));
1730 }
1731
1732 /**
1733  * this sends any unsent RPCs in \a set and returns 1 if all are sent
1734  * and no more replies are expected.
1735  * (it is possible to get less replies than requests sent e.g. due to timed out
1736  * requests or requests that we had trouble to send out)
1737  *
1738  * NOTE: This function contains a potential schedule point (cond_resched()).
1739  */
1740 int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
1741 {
1742         struct list_head *tmp, *next;
1743         struct list_head  comp_reqs;
1744         int force_timer_recalc = 0;
1745
1746         ENTRY;
1747         if (atomic_read(&set->set_remaining) == 0)
1748                 RETURN(1);
1749
1750         INIT_LIST_HEAD(&comp_reqs);
1751         list_for_each_safe(tmp, next, &set->set_requests) {
1752                 struct ptlrpc_request *req =
1753                         list_entry(tmp, struct ptlrpc_request,
1754                                    rq_set_chain);
1755                 struct obd_import *imp = req->rq_import;
1756                 int unregistered = 0;
1757                 int async = 1;
1758                 int rc = 0;
1759
1760                 if (req->rq_phase == RQ_PHASE_COMPLETE) {
1761                         list_move_tail(&req->rq_set_chain, &comp_reqs);
1762                         continue;
1763                 }
1764
1765                 /*
1766                  * This schedule point is mainly for the ptlrpcd caller of this
1767                  * function.  Most ptlrpc sets are not long-lived and unbounded
1768                  * in length, but at the least the set used by the ptlrpcd is.
1769                  * Since the processing time is unbounded, we need to insert an
1770                  * explicit schedule point to make the thread well-behaved.
1771                  */
1772                 cond_resched();
1773
1774                 /*
1775                  * If the caller requires to allow to be interpreted by force
1776                  * and it has really been interpreted, then move the request
1777                  * to RQ_PHASE_INTERPRET phase in spite of what the current
1778                  * phase is.
1779                  */
1780                 if (unlikely(req->rq_allow_intr && req->rq_intr)) {
1781                         req->rq_status = -EINTR;
1782                         ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
1783
1784                         /*
1785                          * Since it is interpreted and we have to wait for
1786                          * the reply to be unlinked, then use sync mode.
1787                          */
1788                         async = 0;
1789
1790                         GOTO(interpret, req->rq_status);
1791                 }
1792
1793                 if (req->rq_phase == RQ_PHASE_NEW && ptlrpc_send_new_req(req))
1794                         force_timer_recalc = 1;
1795
1796                 /* delayed send - skip */
1797                 if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent)
1798                         continue;
1799
1800                 /* delayed resend - skip */
1801                 if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend &&
1802                     req->rq_sent > ktime_get_real_seconds())
1803                         continue;
1804
1805                 if (!(req->rq_phase == RQ_PHASE_RPC ||
1806                       req->rq_phase == RQ_PHASE_BULK ||
1807                       req->rq_phase == RQ_PHASE_INTERPRET ||
1808                       req->rq_phase == RQ_PHASE_UNREG_RPC ||
1809                       req->rq_phase == RQ_PHASE_UNREG_BULK)) {
1810                         DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase);
1811                         LBUG();
1812                 }
1813
1814                 if (req->rq_phase == RQ_PHASE_UNREG_RPC ||
1815                     req->rq_phase == RQ_PHASE_UNREG_BULK) {
1816                         LASSERT(req->rq_next_phase != req->rq_phase);
1817                         LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED);
1818
1819                         if (req->rq_req_deadline &&
1820                             !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REQ_UNLINK))
1821                                 req->rq_req_deadline = 0;
1822                         if (req->rq_reply_deadline &&
1823                             !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK))
1824                                 req->rq_reply_deadline = 0;
1825                         if (req->rq_bulk_deadline &&
1826                             !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK))
1827                                 req->rq_bulk_deadline = 0;
1828
1829                         /*
1830                          * Skip processing until reply is unlinked. We
1831                          * can't return to pool before that and we can't
1832                          * call interpret before that. We need to make
1833                          * sure that all rdma transfers finished and will
1834                          * not corrupt any data.
1835                          */
1836                         if (req->rq_phase == RQ_PHASE_UNREG_RPC &&
1837                             ptlrpc_client_recv_or_unlink(req))
1838                                 continue;
1839                         if (req->rq_phase == RQ_PHASE_UNREG_BULK &&
1840                             ptlrpc_client_bulk_active(req))
1841                                 continue;
1842
1843                         /*
1844                          * Turn fail_loc off to prevent it from looping
1845                          * forever.
1846                          */
1847                         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
1848                                 OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK,
1849                                                      OBD_FAIL_ONCE);
1850                         }
1851                         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) {
1852                                 OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK,
1853                                                      OBD_FAIL_ONCE);
1854                         }
1855
1856                         /*
1857                          * Move to next phase if reply was successfully
1858                          * unlinked.
1859                          */
1860                         ptlrpc_rqphase_move(req, req->rq_next_phase);
1861                 }
1862
1863                 if (req->rq_phase == RQ_PHASE_INTERPRET)
1864                         GOTO(interpret, req->rq_status);
1865
1866                 /*
1867                  * Note that this also will start async reply unlink.
1868                  */
1869                 if (req->rq_net_err && !req->rq_timedout) {
1870                         ptlrpc_expire_one_request(req, 1);
1871
1872                         /*
1873                          * Check if we still need to wait for unlink.
1874                          */
1875                         if (ptlrpc_client_recv_or_unlink(req) ||
1876                             ptlrpc_client_bulk_active(req))
1877                                 continue;
1878                         /* If there is no need to resend, fail it now. */
1879                         if (req->rq_no_resend) {
1880                                 if (req->rq_status == 0)
1881                                         req->rq_status = -EIO;
1882                                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
1883                                 GOTO(interpret, req->rq_status);
1884                         } else {
1885                                 continue;
1886                         }
1887                 }
1888
1889                 if (req->rq_err) {
1890                         spin_lock(&req->rq_lock);
1891                         req->rq_replied = 0;
1892                         spin_unlock(&req->rq_lock);
1893                         if (req->rq_status == 0)
1894                                 req->rq_status = -EIO;
1895                         ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
1896                         GOTO(interpret, req->rq_status);
1897                 }
1898
1899                 /*
1900                  * ptlrpc_set_wait->l_wait_event sets lwi_allow_intr
1901                  * so it sets rq_intr regardless of individual rpc
1902                  * timeouts. The synchronous IO waiting path sets
1903                  * rq_intr irrespective of whether ptlrpcd
1904                  * has seen a timeout.  Our policy is to only interpret
1905                  * interrupted rpcs after they have timed out, so we
1906                  * need to enforce that here.
1907                  */
1908
1909                 if (req->rq_intr && (req->rq_timedout || req->rq_waiting ||
1910                                      req->rq_wait_ctx)) {
1911                         req->rq_status = -EINTR;
1912                         ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
1913                         GOTO(interpret, req->rq_status);
1914                 }
1915
1916                 if (req->rq_phase == RQ_PHASE_RPC) {
1917                         if (req->rq_timedout || req->rq_resend ||
1918                             req->rq_waiting || req->rq_wait_ctx) {
1919                                 int status;
1920
1921                                 if (!ptlrpc_unregister_reply(req, 1)) {
1922                                         ptlrpc_unregister_bulk(req, 1);
1923                                         continue;
1924                                 }
1925
1926                                 spin_lock(&imp->imp_lock);
1927                                 if (ptlrpc_import_delay_req(imp, req,
1928                                                             &status)) {
1929                                         /*
1930                                          * put on delay list - only if we wait
1931                                          * recovery finished - before send
1932                                          */
1933                                         list_del_init(&req->rq_list);
1934                                         list_add_tail(&req->rq_list,
1935                                                       &imp->imp_delayed_list);
1936                                         spin_unlock(&imp->imp_lock);
1937                                         continue;
1938                                 }
1939
1940                                 if (status != 0)  {
1941                                         req->rq_status = status;
1942                                         ptlrpc_rqphase_move(req,
1943                                                             RQ_PHASE_INTERPRET);
1944                                         spin_unlock(&imp->imp_lock);
1945                                         GOTO(interpret, req->rq_status);
1946                                 }
1947                                 /* ignore on just initiated connections */
1948                                 if (ptlrpc_no_resend(req) &&
1949                                     !req->rq_wait_ctx &&
1950                                     imp->imp_generation !=
1951                                     imp->imp_initiated_at) {
1952                                         req->rq_status = -ENOTCONN;
1953                                         ptlrpc_rqphase_move(req,
1954                                                             RQ_PHASE_INTERPRET);
1955                                         spin_unlock(&imp->imp_lock);
1956                                         GOTO(interpret, req->rq_status);
1957                                 }
1958
1959                                 list_del_init(&req->rq_list);
1960                                 list_add_tail(&req->rq_list,
1961                                               &imp->imp_sending_list);
1962
1963                                 spin_unlock(&imp->imp_lock);
1964
1965                                 spin_lock(&req->rq_lock);
1966                                 req->rq_waiting = 0;
1967                                 spin_unlock(&req->rq_lock);
1968
1969                                 if (req->rq_timedout || req->rq_resend) {
1970                                         /*
1971                                          * This is re-sending anyways,
1972                                          * let's mark req as resend.
1973                                          */
1974                                         spin_lock(&req->rq_lock);
1975                                         req->rq_resend = 1;
1976                                         spin_unlock(&req->rq_lock);
1977                                 }
1978                                 /*
1979                                  * rq_wait_ctx is only touched by ptlrpcd,
1980                                  * so no lock is needed here.
1981                                  */
1982                                 status = sptlrpc_req_refresh_ctx(req, -1);
1983                                 if (status) {
1984                                         if (req->rq_err) {
1985                                                 req->rq_status = status;
1986                                                 spin_lock(&req->rq_lock);
1987                                                 req->rq_wait_ctx = 0;
1988                                                 spin_unlock(&req->rq_lock);
1989                                                 force_timer_recalc = 1;
1990                                         } else {
1991                                                 spin_lock(&req->rq_lock);
1992                                                 req->rq_wait_ctx = 1;
1993                                                 spin_unlock(&req->rq_lock);
1994                                         }
1995
1996                                         continue;
1997                                 } else {
1998                                         spin_lock(&req->rq_lock);
1999                                         req->rq_wait_ctx = 0;
2000                                         spin_unlock(&req->rq_lock);
2001                                 }
2002
2003                                 /*
2004                                  * In any case, the previous bulk should be
2005                                  * cleaned up to prepare for the new sending
2006                                  */
2007                                 if (req->rq_bulk &&
2008                                     !ptlrpc_unregister_bulk(req, 1))
2009                                         continue;
2010
2011                                 rc = ptl_send_rpc(req, 0);
2012                                 if (rc == -ENOMEM) {
2013                                         spin_lock(&imp->imp_lock);
2014                                         if (!list_empty(&req->rq_list))
2015                                                 list_del_init(&req->rq_list);
2016                                         spin_unlock(&imp->imp_lock);
2017                                         ptlrpc_rqphase_move(req, RQ_PHASE_NEW);
2018                                         continue;
2019                                 }
2020                                 if (rc) {
2021                                         DEBUG_REQ(D_HA, req,
2022                                                   "send failed: rc = %d", rc);
2023                                         force_timer_recalc = 1;
2024                                         spin_lock(&req->rq_lock);
2025                                         req->rq_net_err = 1;
2026                                         spin_unlock(&req->rq_lock);
2027                                         continue;
2028                                 }
2029                                 /* need to reset the timeout */
2030                                 force_timer_recalc = 1;
2031                         }
2032
2033                         spin_lock(&req->rq_lock);
2034
2035                         if (ptlrpc_client_early(req)) {
2036                                 ptlrpc_at_recv_early_reply(req);
2037                                 spin_unlock(&req->rq_lock);
2038                                 continue;
2039                         }
2040
2041                         /* Still waiting for a reply? */
2042                         if (ptlrpc_client_recv(req)) {
2043                                 spin_unlock(&req->rq_lock);
2044                                 continue;
2045                         }
2046
2047                         /* Did we actually receive a reply? */
2048                         if (!ptlrpc_client_replied(req)) {
2049                                 spin_unlock(&req->rq_lock);
2050                                 continue;
2051                         }
2052
2053                         spin_unlock(&req->rq_lock);
2054
2055                         /*
2056                          * unlink from net because we are going to
2057                          * swab in-place of reply buffer
2058                          */
2059                         unregistered = ptlrpc_unregister_reply(req, 1);
2060                         if (!unregistered)
2061                                 continue;
2062
2063                         req->rq_status = after_reply(req);
2064                         if (req->rq_resend)
2065                                 continue;
2066
2067                         /*
2068                          * If there is no bulk associated with this request,
2069                          * then we're done and should let the interpreter
2070                          * process the reply. Similarly if the RPC returned
2071                          * an error, and therefore the bulk will never arrive.
2072                          */
2073                         if (!req->rq_bulk || req->rq_status < 0) {
2074                                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
2075                                 GOTO(interpret, req->rq_status);
2076                         }
2077
2078                         ptlrpc_rqphase_move(req, RQ_PHASE_BULK);
2079                 }
2080
2081                 LASSERT(req->rq_phase == RQ_PHASE_BULK);
2082                 if (ptlrpc_client_bulk_active(req))
2083                         continue;
2084
2085                 if (req->rq_bulk->bd_failure) {
2086                         /*
2087                          * The RPC reply arrived OK, but the bulk screwed
2088                          * up!  Dead weird since the server told us the RPC
2089                          * was good after getting the REPLY for her GET or
2090                          * the ACK for her PUT.
2091                          */
2092                         DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
2093                         req->rq_status = -EIO;
2094                 }
2095
2096                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
2097
2098 interpret:
2099                 LASSERT(req->rq_phase == RQ_PHASE_INTERPRET);
2100
2101                 /*
2102                  * This moves to "unregistering" phase we need to wait for
2103                  * reply unlink.
2104                  */
2105                 if (!unregistered && !ptlrpc_unregister_reply(req, async)) {
2106                         /* start async bulk unlink too */
2107                         ptlrpc_unregister_bulk(req, 1);
2108                         continue;
2109                 }
2110
2111                 if (!ptlrpc_unregister_bulk(req, async))
2112                         continue;
2113
2114                 /*
2115                  * When calling interpret receiving already should be
2116                  * finished.
2117                  */
2118                 LASSERT(!req->rq_receiving_reply);
2119
2120                 ptlrpc_req_interpret(env, req, req->rq_status);
2121
2122                 if (ptlrpcd_check_work(req)) {
2123                         atomic_dec(&set->set_remaining);
2124                         continue;
2125                 }
2126                 ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE);
2127
2128                 if (req->rq_reqmsg)
2129                         CDEBUG(D_RPCTRACE,
2130                                "Completed RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n",
2131                                current_comm(),
2132                                imp->imp_obd->obd_uuid.uuid,
2133                                lustre_msg_get_status(req->rq_reqmsg),
2134                                req->rq_xid,
2135                                obd_import_nid2str(imp),
2136                                lustre_msg_get_opc(req->rq_reqmsg));
2137
2138                 spin_lock(&imp->imp_lock);
2139                 /*
2140                  * Request already may be not on sending or delaying list. This
2141                  * may happen in the case of marking it erroneous for the case
2142                  * ptlrpc_import_delay_req(req, status) find it impossible to
2143                  * allow sending this rpc and returns *status != 0.
2144                  */
2145                 if (!list_empty(&req->rq_list)) {
2146                         list_del_init(&req->rq_list);
2147                         atomic_dec(&imp->imp_inflight);
2148                 }
2149                 list_del_init(&req->rq_unreplied_list);
2150                 spin_unlock(&imp->imp_lock);
2151
2152                 atomic_dec(&set->set_remaining);
2153                 wake_up_all(&imp->imp_recovery_waitq);
2154
2155                 if (set->set_producer) {
2156                         /* produce a new request if possible */
2157                         if (ptlrpc_set_producer(set) > 0)
2158                                 force_timer_recalc = 1;
2159
2160                         /*
2161                          * free the request that has just been completed
2162                          * in order not to pollute set->set_requests
2163                          */
2164                         list_del_init(&req->rq_set_chain);
2165                         spin_lock(&req->rq_lock);
2166                         req->rq_set = NULL;
2167                         req->rq_invalid_rqset = 0;
2168                         spin_unlock(&req->rq_lock);
2169
2170                         /* record rq_status to compute the final status later */
2171                         if (req->rq_status != 0)
2172                                 set->set_rc = req->rq_status;
2173                         ptlrpc_req_finished(req);
2174                 } else {
2175                         list_move_tail(&req->rq_set_chain, &comp_reqs);
2176                 }
2177         }
2178
2179         /*
2180          * move completed request at the head of list so it's easier for
2181          * caller to find them
2182          */
2183         list_splice(&comp_reqs, &set->set_requests);
2184
2185         /* If we hit an error, we want to recover promptly. */
2186         RETURN(atomic_read(&set->set_remaining) == 0 || force_timer_recalc);
2187 }
2188 EXPORT_SYMBOL(ptlrpc_check_set);
2189
2190 /**
2191  * Time out request \a req. is \a async_unlink is set, that means do not wait
2192  * until LNet actually confirms network buffer unlinking.
2193  * Return 1 if we should give up further retrying attempts or 0 otherwise.
2194  */
2195 int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink)
2196 {
2197         struct obd_import *imp = req->rq_import;
2198         unsigned int debug_mask = D_RPCTRACE;
2199         int rc = 0;
2200
2201         ENTRY;
2202         spin_lock(&req->rq_lock);
2203         req->rq_timedout = 1;
2204         spin_unlock(&req->rq_lock);
2205
2206         if (ptlrpc_console_allow(req, lustre_msg_get_opc(req->rq_reqmsg),
2207                                  lustre_msg_get_status(req->rq_reqmsg)))
2208                 debug_mask = D_WARNING;
2209         DEBUG_REQ(debug_mask, req, "Request sent has %s: [sent %lld/real %lld]",
2210                   req->rq_net_err ? "failed due to network error" :
2211                      ((req->rq_real_sent == 0 ||
2212                        req->rq_real_sent < req->rq_sent ||
2213                        req->rq_real_sent >= req->rq_deadline) ?
2214                       "timed out for sent delay" : "timed out for slow reply"),
2215                   (s64)req->rq_sent, (s64)req->rq_real_sent);
2216
2217         if (imp && obd_debug_peer_on_timeout)
2218                 LNetDebugPeer(imp->imp_connection->c_peer);
2219
2220         ptlrpc_unregister_reply(req, async_unlink);
2221         ptlrpc_unregister_bulk(req, async_unlink);
2222
2223         if (obd_dump_on_timeout)
2224                 libcfs_debug_dumplog();
2225
2226         if (!imp) {
2227                 DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?");
2228                 RETURN(1);
2229         }
2230
2231         atomic_inc(&imp->imp_timeouts);
2232
2233         /* The DLM server doesn't want recovery run on its imports. */
2234         if (imp->imp_dlm_fake)
2235                 RETURN(1);
2236
2237         /*
2238          * If this request is for recovery or other primordial tasks,
2239          * then error it out here.
2240          */
2241         if (req->rq_ctx_init || req->rq_ctx_fini ||
2242             req->rq_send_state != LUSTRE_IMP_FULL ||
2243             imp->imp_obd->obd_no_recov) {
2244                 DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)",
2245                           ptlrpc_import_state_name(req->rq_send_state),
2246                           ptlrpc_import_state_name(imp->imp_state));
2247                 spin_lock(&req->rq_lock);
2248                 req->rq_status = -ETIMEDOUT;
2249                 req->rq_err = 1;
2250                 spin_unlock(&req->rq_lock);
2251                 RETURN(1);
2252         }
2253
2254         /*
2255          * if a request can't be resent we can't wait for an answer after
2256          * the timeout
2257          */
2258         if (ptlrpc_no_resend(req)) {
2259                 DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:");
2260                 rc = 1;
2261         }
2262
2263         ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg));
2264
2265         RETURN(rc);
2266 }
2267
2268 /**
2269  * Time out all uncompleted requests in request set pointed by \a data
2270  * Callback used when waiting on sets with l_wait_event.
2271  * Always returns 1.
2272  */
2273 int ptlrpc_expired_set(void *data)
2274 {
2275         struct ptlrpc_request_set *set = data;
2276         struct list_head *tmp;
2277         time64_t now = ktime_get_real_seconds();
2278
2279         ENTRY;
2280         LASSERT(set != NULL);
2281
2282         /*
2283          * A timeout expired. See which reqs it applies to...
2284          */
2285         list_for_each(tmp, &set->set_requests) {
2286                 struct ptlrpc_request *req =
2287                         list_entry(tmp, struct ptlrpc_request,
2288                                    rq_set_chain);
2289
2290                 /* don't expire request waiting for context */
2291                 if (req->rq_wait_ctx)
2292                         continue;
2293
2294                 /* Request in-flight? */
2295                 if (!((req->rq_phase == RQ_PHASE_RPC &&
2296                        !req->rq_waiting && !req->rq_resend) ||
2297                       (req->rq_phase == RQ_PHASE_BULK)))
2298                         continue;
2299
2300                 if (req->rq_timedout ||     /* already dealt with */
2301                     req->rq_deadline > now) /* not expired */
2302                         continue;
2303
2304                 /*
2305                  * Deal with this guy. Do it asynchronously to not block
2306                  * ptlrpcd thread.
2307                  */
2308                 ptlrpc_expire_one_request(req, 1);
2309         }
2310
2311         /*
2312          * When waiting for a whole set, we always break out of the
2313          * sleep so we can recalculate the timeout, or enable interrupts
2314          * if everyone's timed out.
2315          */
2316         RETURN(1);
2317 }
2318
2319 /**
2320  * Sets rq_intr flag in \a req under spinlock.
2321  */
2322 void ptlrpc_mark_interrupted(struct ptlrpc_request *req)
2323 {
2324         spin_lock(&req->rq_lock);
2325         req->rq_intr = 1;
2326         spin_unlock(&req->rq_lock);
2327 }
2328 EXPORT_SYMBOL(ptlrpc_mark_interrupted);
2329
2330 /**
2331  * Interrupts (sets interrupted flag) all uncompleted requests in
2332  * a set \a data. Callback for l_wait_event for interruptible waits.
2333  */
2334 static void ptlrpc_interrupted_set(void *data)
2335 {
2336         struct ptlrpc_request_set *set = data;
2337         struct list_head *tmp;
2338
2339         LASSERT(set != NULL);
2340         CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set);
2341
2342         list_for_each(tmp, &set->set_requests) {
2343                 struct ptlrpc_request *req =
2344                         list_entry(tmp, struct ptlrpc_request, rq_set_chain);
2345
2346                 if (req->rq_intr)
2347                         continue;
2348
2349                 if (req->rq_phase != RQ_PHASE_RPC &&
2350                     req->rq_phase != RQ_PHASE_UNREG_RPC &&
2351                     !req->rq_allow_intr)
2352                         continue;
2353
2354                 ptlrpc_mark_interrupted(req);
2355         }
2356 }
2357
2358 /**
2359  * Get the smallest timeout in the set; this does NOT set a timeout.
2360  */
2361 time64_t ptlrpc_set_next_timeout(struct ptlrpc_request_set *set)
2362 {
2363         struct list_head *tmp;
2364         time64_t now = ktime_get_real_seconds();
2365         int timeout = 0;
2366         struct ptlrpc_request *req;
2367         time64_t deadline;
2368
2369         ENTRY;
2370         list_for_each(tmp, &set->set_requests) {
2371                 req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
2372
2373                 /* Request in-flight? */
2374                 if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) ||
2375                       (req->rq_phase == RQ_PHASE_BULK) ||
2376                       (req->rq_phase == RQ_PHASE_NEW)))
2377                         continue;
2378
2379                 /* Already timed out. */
2380                 if (req->rq_timedout)
2381                         continue;
2382
2383                 /* Waiting for ctx. */
2384                 if (req->rq_wait_ctx)
2385                         continue;
2386
2387                 if (req->rq_phase == RQ_PHASE_NEW)
2388                         deadline = req->rq_sent;
2389                 else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend)
2390                         deadline = req->rq_sent;
2391                 else
2392                         deadline = req->rq_sent + req->rq_timeout;
2393
2394                 if (deadline <= now)    /* actually expired already */
2395                         timeout = 1;    /* ASAP */
2396                 else if (timeout == 0 || timeout > deadline - now)
2397                         timeout = deadline - now;
2398         }
2399         RETURN(timeout);
2400 }
2401
/**
 * Send all unsent requests from the set and then wait until all
 * requests in the set complete (either get a reply, timeout, get an
 * error or otherwise be interrupted).
 *
 * If the set has a producer, ptlrpc_set_producer() is called instead of
 * sending the RQ_PHASE_NEW requests directly.
 *
 * \param env   not referenced by this function (ptlrpc_check_set() is
 *              called with a NULL environment)
 * \param set   request set to send and wait for
 *
 * \retval 0        the set is empty, or set_rc and every remaining
 *                  request's rq_status are zero
 * \retval non-zero set_rc, overridden by the non-zero rq_status of the
 *                  last request in the list that has one
 */
int ptlrpc_set_wait(const struct lu_env *env, struct ptlrpc_request_set *set)
{
        struct list_head *tmp;
        struct ptlrpc_request *req;
        struct l_wait_info lwi;
        time64_t timeout;
        int rc;

        ENTRY;
        if (set->set_producer)
                (void)ptlrpc_set_producer(set);
        else
                list_for_each(tmp, &set->set_requests) {
                        req = list_entry(tmp, struct ptlrpc_request,
                                         rq_set_chain);
                        if (req->rq_phase == RQ_PHASE_NEW)
                                (void)ptlrpc_send_new_req(req);
                }

        if (list_empty(&set->set_requests))
                RETURN(0);

        do {
                timeout = ptlrpc_set_next_timeout(set);

                /*
                 * wait until all complete, interrupted, or an in-flight
                 * req times out
                 */
                CDEBUG(D_RPCTRACE, "set %p going to sleep for %lld seconds\n",
                       set, timeout);

                if ((timeout == 0 && !signal_pending(current)) ||
                    set->set_allow_intr)
                        /*
                         * No requests are in-flight (either timed out
                         * or delayed), so we can allow interrupts.
                         * We still want to block for a limited time,
                         * so we allow interrupts during the timeout.
                         */
                        lwi = LWI_TIMEOUT_INTR_ALL(
                                        cfs_time_seconds(timeout ? timeout : 1),
                                        ptlrpc_expired_set,
                                        ptlrpc_interrupted_set, set);
                else
                        /*
                         * At least one request is in flight, so no
                         * interrupts are allowed. Wait until all
                         * complete, or an in-flight req times out.
                         */
                        lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1),
                                          ptlrpc_expired_set, set);

                rc = l_wait_event(set->set_waitq,
                                  ptlrpc_check_set(NULL, set), &lwi);

                /*
                 * LU-769 - if we ignored the signal because it was already
                 * pending when we started, we need to handle it now or we risk
                 * it being ignored forever
                 */
                if (rc == -ETIMEDOUT &&
                    (!lwi.lwi_allow_intr || set->set_allow_intr) &&
                    signal_pending(current)) {
                        sigset_t blocked_sigs =
                                           cfs_block_sigsinv(LUSTRE_FATAL_SIGS);

                        /*
                         * In fact we only interrupt for the "fatal" signals
                         * like SIGINT or SIGKILL. We still ignore less
                         * important signals since ptlrpc set is not easily
                         * reentrant from userspace again
                         */
                        if (signal_pending(current))
                                ptlrpc_interrupted_set(set);
                        cfs_restore_sigs(blocked_sigs);
                }

                LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT);

                /*
                 * -EINTR => all requests have been flagged rq_intr so next
                 * check completes.
                 * -ETIMEDOUT => someone timed out.  When all reqs have
                 * timed out, signals are enabled allowing completion with
                 * EINTR.
                 * I don't really care if we go once more round the loop in
                 * the error cases -eeb.
                 */
                if (rc == 0 && atomic_read(&set->set_remaining) == 0) {
                        /*
                         * The wait is over: flag every request still linked
                         * to the set with rq_invalid_rqset, under rq_lock.
                         */
                        list_for_each(tmp, &set->set_requests) {
                                req = list_entry(tmp, struct ptlrpc_request,
                                                 rq_set_chain);
                                spin_lock(&req->rq_lock);
                                req->rq_invalid_rqset = 1;
                                spin_unlock(&req->rq_lock);
                        }
                }
        } while (rc != 0 || atomic_read(&set->set_remaining) != 0);

        LASSERT(atomic_read(&set->set_remaining) == 0);

        rc = set->set_rc; /* rq_status of already freed requests if any */
        /* the last non-zero rq_status found in the list wins */
        list_for_each(tmp, &set->set_requests) {
                req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);

                LASSERT(req->rq_phase == RQ_PHASE_COMPLETE);
                if (req->rq_status != 0)
                        rc = req->rq_status;
        }

        RETURN(rc);
}
EXPORT_SYMBOL(ptlrpc_set_wait);
2522
/**
 * Helper function for request freeing.
 * Called when request count reached zero and request needs to be freed.
 * Removes request from all sorts of sending/replay lists it might be on,
 * frees network buffers if any are present.
 *
 * \param request   request to free; NULL is accepted and ignored
 * \param locked    if set, the caller is already holding the import's
 *                  imp_lock, so it is not taken again for the list
 *                  manipulations
 */
static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
{
        ENTRY;

        if (!request)
                RETURN_EXIT;

        /* sanity checks: client request, idle and off all set/send lists */
        LASSERT(!request->rq_srv_req);
        LASSERT(request->rq_export == NULL);
        LASSERTF(!request->rq_receiving_reply, "req %p\n", request);
        LASSERTF(list_empty(&request->rq_list), "req %p\n", request);
        LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request);
        LASSERTF(!request->rq_replay, "req %p\n", request);

        req_capsule_fini(&request->rq_pill);

        /*
         * We must take it off the imp_replay_list first.  Otherwise, we'll set
         * request->rq_reqmsg to NULL while osc_close is dereferencing it.
         */
        if (request->rq_import) {
                if (!locked)
                        spin_lock(&request->rq_import->imp_lock);
                list_del_init(&request->rq_replay_list);
                list_del_init(&request->rq_unreplied_list);
                if (!locked)
                        spin_unlock(&request->rq_import->imp_lock);
        }
        LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request);

        /* freeing a request that is still referenced is a fatal bug */
        if (atomic_read(&request->rq_refcount) != 0) {
                DEBUG_REQ(D_ERROR, request,
                          "freeing request with nonzero refcount");
                LBUG();
        }

        if (request->rq_repbuf)
                sptlrpc_cli_free_repbuf(request);

        /* drop the import reference held through rq_import */
        if (request->rq_import) {
                class_import_put(request->rq_import);
                request->rq_import = NULL;
        }
        if (request->rq_bulk)
                ptlrpc_free_bulk(request->rq_bulk);

        if (request->rq_reqbuf || request->rq_clrbuf)
                sptlrpc_cli_free_reqbuf(request);

        /*
         * NOTE(review): the second argument is !locked; presumably it tells
         * sptlrpc_req_put_ctx() whether it may act synchronously - confirm.
         */
        if (request->rq_cli_ctx)
                sptlrpc_req_put_ctx(request, !locked);

        /* hand the memory back to where it came from: pool or cache */
        if (request->rq_pool)
                __ptlrpc_free_req_to_pool(request);
        else
                ptlrpc_request_cache_free(request);
        EXIT;
}
2589
2590 static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked);
2591 /**
2592  * Drop one request reference. Must be called with import imp_lock held.
2593  * When reference count drops to zero, request is freed.
2594  */
2595 void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request)
2596 {
2597         assert_spin_locked(&request->rq_import->imp_lock);
2598         (void)__ptlrpc_req_finished(request, 1);
2599 }
2600
2601 /**
2602  * Helper function
2603  * Drops one reference count for request \a request.
2604  * \a locked set indicates that caller holds import imp_lock.
2605  * Frees the request whe reference count reaches zero.
2606  *
2607  * \retval 1    the request is freed
2608  * \retval 0    some others still hold references on the request
2609  */
2610 static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked)
2611 {
2612         int count;
2613
2614         ENTRY;
2615         if (!request)
2616                 RETURN(1);
2617
2618         LASSERT(request != LP_POISON);
2619         LASSERT(request->rq_reqmsg != LP_POISON);
2620
2621         DEBUG_REQ(D_INFO, request, "refcount now %u",
2622                   atomic_read(&request->rq_refcount) - 1);
2623
2624         spin_lock(&request->rq_lock);
2625         count = atomic_dec_return(&request->rq_refcount);
2626         LASSERTF(count >= 0, "Invalid ref count %d\n", count);
2627
2628         /*
2629          * For open RPC, the client does not know the EA size (LOV, ACL, and
2630          * so on) before replied, then the client has to reserve very large
2631          * reply buffer. Such buffer will not be released until the RPC freed.
2632          * Since The open RPC is replayable, we need to keep it in the replay
2633          * list until close. If there are a lot of files opened concurrently,
2634          * then the client may be OOM.
2635          *
2636          * If fact, it is unnecessary to keep reply buffer for open replay,
2637          * related EAs have already been saved via mdc_save_lovea() before
2638          * coming here. So it is safe to free the reply buffer some earlier
2639          * before releasing the RPC to avoid client OOM. LU-9514
2640          */
2641         if (count == 1 && request->rq_early_free_repbuf && request->rq_repbuf) {
2642                 spin_lock(&request->rq_early_free_lock);
2643                 sptlrpc_cli_free_repbuf(request);
2644                 request->rq_repbuf = NULL;
2645                 request->rq_repbuf_len = 0;
2646                 request->rq_repdata = NULL;
2647                 request->rq_reqdata_len = 0;
2648                 spin_unlock(&request->rq_early_free_lock);
2649         }
2650         spin_unlock(&request->rq_lock);
2651
2652         if (!count)
2653                 __ptlrpc_free_req(request, locked);
2654
2655         RETURN(!count);
2656 }
2657
/**
 * Release one reference on \a request; the caller does not hold the
 * import's imp_lock. The request is freed when the last reference goes.
 */
void ptlrpc_req_finished(struct ptlrpc_request *request)
{
        /* whether the request was actually freed is of no interest here */
        (void)__ptlrpc_req_finished(request, 0);
}
EXPORT_SYMBOL(ptlrpc_req_finished);
2666
2667 /**
2668  * Returns xid of a \a request
2669  */
2670 __u64 ptlrpc_req_xid(struct ptlrpc_request *request)
2671 {
2672         return request->rq_xid;
2673 }
2674 EXPORT_SYMBOL(ptlrpc_req_xid);
2675
/**
 * Disengage the client's reply buffer from the network
 * NB does _NOT_ unregister any client-side bulk.
 * IDEMPOTENT, but _not_ safe against concurrent callers.
 * The request owner (i.e. the thread doing the I/O) must call...
 *
 * \param request   request whose reply MD is unlinked
 * \param async     if non-zero, do not wait for the unlink to finish
 *
 * \retval 1    nothing is left to do: ptlrpc_client_recv_or_unlink()
 *              reports false, either immediately, right after
 *              LNetMDUnlink(), or after waiting for the unlink
 * \retval 0    \a async is set and the unlink has been started but is not
 *              finished yet; the request was moved to RQ_PHASE_UNREG_RPC
 *
 * NOTE(review): RETURN() is used here without a matching ENTRY, unlike the
 * other functions in this file - confirm this is intended.
 */
static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async)
{
        int rc;
        struct l_wait_info lwi;

        /*
         * Might sleep.
         */
        LASSERT(!in_interrupt());

        /* Let's setup deadline for reply unlink. */
        if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
            async && request->rq_reply_deadline == 0 && cfs_fail_val == 0)
                request->rq_reply_deadline = ktime_get_real_seconds() +
                                             LONG_UNLINK;

        /*
         * Nothing left to do.
         */
        if (!ptlrpc_client_recv_or_unlink(request))
                RETURN(1);

        LNetMDUnlink(request->rq_reply_md_h);

        /*
         * Let's check it once again.
         */
        if (!ptlrpc_client_recv_or_unlink(request))
                RETURN(1);

        /* Move to "Unregistering" phase as reply was not unlinked yet. */
        ptlrpc_rqphase_move(request, RQ_PHASE_UNREG_RPC);

        /*
         * Do not wait for unlink to finish.
         */
        if (async)
                RETURN(0);

        /*
         * We have to l_wait_event() whatever the result, to give liblustre
         * a chance to run reply_in_callback(), and to make sure we've
         * unlinked before returning a req to the pool.
         */
        for (;;) {
                /* The wq argument is ignored by user-space wait_event macros */
                wait_queue_head_t *wq = (request->rq_set) ?
                                        &request->rq_set->set_waitq :
                                        &request->rq_reply_waitq;
                /*
                 * Network access will complete in finite time but the HUGE
                 * timeout lets us CWARN for visibility of sluggish NALs
                 */
                lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
                                           cfs_time_seconds(1), NULL, NULL);
                rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request),
                                  &lwi);
                if (rc == 0) {
                        /* unlink completed: resume with the saved phase */
                        ptlrpc_rqphase_move(request, request->rq_next_phase);
                        RETURN(1);
                }

                /* timed out: warn and keep waiting */
                LASSERT(rc == -ETIMEDOUT);
                DEBUG_REQ(D_WARNING, request,
                          "Unexpectedly long timeout receiving_reply=%d req_ulinked=%d reply_unlinked=%d",
                          request->rq_receiving_reply,
                          request->rq_req_unlinked,
                          request->rq_reply_unlinked);
        }
        /* not reached: the loop above only exits through RETURN(1) */
        RETURN(0);
}
2754
/**
 * Take \a req off its replay list and drop one reference on it.
 *
 * Clears rq_replay under rq_lock, runs the request's commit callback if
 * one is set, unlinks rq_replay_list and then releases one reference with
 * locked == 1, i.e. the import's imp_lock is expected to be held (both
 * callers in this file hold it).
 */
static void ptlrpc_free_request(struct ptlrpc_request *req)
{
        spin_lock(&req->rq_lock);
        req->rq_replay = 0;
        spin_unlock(&req->rq_lock);

        if (req->rq_commit_cb)
                req->rq_commit_cb(req);
        list_del_init(&req->rq_replay_list);

        /* may free the request: do not touch req after this call */
        __ptlrpc_req_finished(req, 1);
}
2767
2768 /**
2769  * the request is committed and dropped from the replay list of its import
2770  */
2771 void ptlrpc_request_committed(struct ptlrpc_request *req, int force)
2772 {
2773         struct obd_import *imp = req->rq_import;
2774
2775         spin_lock(&imp->imp_lock);
2776         if (list_empty(&req->rq_replay_list)) {
2777                 spin_unlock(&imp->imp_lock);
2778                 return;
2779         }
2780
2781         if (force || req->rq_transno <= imp->imp_peer_committed_transno) {
2782                 if (imp->imp_replay_cursor == &req->rq_replay_list)
2783                         imp->imp_replay_cursor = req->rq_replay_list.next;
2784                 ptlrpc_free_request(req);
2785         }
2786
2787         spin_unlock(&imp->imp_lock);
2788 }
2789 EXPORT_SYMBOL(ptlrpc_request_committed);
2790
/**
 * Iterates through the replay list of the import and prunes all requests
 * whose transno is not bigger than last_committed for the import and
 * which don't have rq_replay set. Requests from an older import
 * generation are pruned as well.
 * Since requests are sorted in transno order, stops when meeting the first
 * transno bigger than last_committed.
 * Committed requests which still have rq_replay set are moved to
 * imp_committed_list; that list is only rescanned when the import
 * generation changed or no transno has been checked yet.
 * The whole scan is skipped when neither the committed transno nor the
 * generation changed since the previous call.
 * caller must hold imp->imp_lock
 */
void ptlrpc_free_committed(struct obd_import *imp)
{
        struct ptlrpc_request *req, *saved;
        struct ptlrpc_request *last_req = NULL; /* temporary fire escape */
        bool skip_committed_list = true;

        ENTRY;
        LASSERT(imp != NULL);
        assert_spin_locked(&imp->imp_lock);

        /* nothing changed since the last scan: no need to look again */
        if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked &&
            imp->imp_generation == imp->imp_last_generation_checked) {
                CDEBUG(D_INFO, "%s: skip recheck: last_committed %llu\n",
                       imp->imp_obd->obd_name, imp->imp_peer_committed_transno);
                RETURN_EXIT;
        }
        CDEBUG(D_RPCTRACE, "%s: committing for last_committed %llu gen %d\n",
               imp->imp_obd->obd_name, imp->imp_peer_committed_transno,
               imp->imp_generation);

        /* the committed list is rescanned on a new generation or first run */
        if (imp->imp_generation != imp->imp_last_generation_checked ||
            imp->imp_last_transno_checked == 0)
                skip_committed_list = false;

        /* remember what has been checked for the shortcut above */
        imp->imp_last_transno_checked = imp->imp_peer_committed_transno;
        imp->imp_last_generation_checked = imp->imp_generation;

        list_for_each_entry_safe(req, saved, &imp->imp_replay_list,
                                 rq_replay_list) {
                /* XXX ok to remove when 1357 resolved - rread 05/29/03  */
                LASSERT(req != last_req);
                last_req = req;

                if (req->rq_transno == 0) {
                        DEBUG_REQ(D_EMERG, req, "zero transno during replay");
                        LBUG();
                }
                /* requests of an older generation are freed unconditionally */
                if (req->rq_import_generation < imp->imp_generation) {
                        DEBUG_REQ(D_RPCTRACE, req, "free request with old gen");
                        GOTO(free_req, 0);
                }

                /* not yet committed */
                if (req->rq_transno > imp->imp_peer_committed_transno) {
                        DEBUG_REQ(D_RPCTRACE, req, "stopping search");
                        break;
                }

                /* committed but still replayable: park it on the committed list */
                if (req->rq_replay) {
                        DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)");
                        list_move_tail(&req->rq_replay_list,
                                       &imp->imp_committed_list);
                        continue;
                }

                DEBUG_REQ(D_INFO, req, "commit (last_committed %llu)",
                          imp->imp_peer_committed_transno);
free_req:
                ptlrpc_free_request(req);
        }

        if (skip_committed_list)
                GOTO(out, 0);

        /* second pass: drop stale or no longer replayable parked requests */
        list_for_each_entry_safe(req, saved, &imp->imp_committed_list,
                                 rq_replay_list) {
                LASSERT(req->rq_transno != 0);
                if (req->rq_import_generation < imp->imp_generation ||
                    !req->rq_replay) {
                        DEBUG_REQ(D_RPCTRACE, req, "free %s open request",
                                  req->rq_import_generation <
                                  imp->imp_generation ? "stale" : "closed");

                        /* keep the replay cursor off the entry being freed */
                        if (imp->imp_replay_cursor == &req->rq_replay_list)
                                imp->imp_replay_cursor =
                                        req->rq_replay_list.next;

                        ptlrpc_free_request(req);
                }
        }
out:
        EXIT;
}
2882
/**
 * Currently a no-op: the body consists of the ENTRY/EXIT macros only and
 * \a imp is not used.
 */
void ptlrpc_cleanup_client(struct obd_import *imp)
{
        ENTRY;
        EXIT;
}
2888
2889 /**
2890  * Schedule previously sent request for resend.
2891  * For bulk requests we assign new xid (to avoid problems with
2892  * lost replies and therefore several transfers landing into same buffer
2893  * from different sending attempts).
2894  */
2895 void ptlrpc_resend_req(struct ptlrpc_request *req)
2896 {
2897         DEBUG_REQ(D_HA, req, "going to resend");
2898         spin_lock(&req->rq_lock);
2899
2900         /*
2901          * Request got reply but linked to the import list still.
2902          * Let ptlrpc_check_set() process it.
2903          */
2904         if (ptlrpc_client_replied(req)) {
2905                 spin_unlock(&req->rq_lock);
2906                 DEBUG_REQ(D_HA, req, "it has reply, so skip it");
2907                 return;
2908         }
2909
2910         req->rq_status = -EAGAIN;
2911
2912         req->rq_resend = 1;
2913         req->rq_net_err = 0;
2914         req->rq_timedout = 0;
2915
2916         ptlrpc_client_wake_req(req);
2917         spin_unlock(&req->rq_lock);
2918 }
2919
2920 /* XXX: this function and rq_status are currently unused */
2921 void ptlrpc_restart_req(struct ptlrpc_request *req)
2922 {
2923         DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request");
2924         req->rq_status = -ERESTARTSYS;
2925
2926         spin_lock(&req->rq_lock);
2927         req->rq_restart = 1;
2928         req->rq_timedout = 0;
2929         ptlrpc_client_wake_req(req);
2930         spin_unlock(&req->rq_lock);
2931 }
2932
2933 /**
2934  * Grab additional reference on a request \a req
2935  */
2936 struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req)
2937 {
2938         ENTRY;
2939         atomic_inc(&req->rq_refcount);
2940         RETURN(req);
2941 }
2942 EXPORT_SYMBOL(ptlrpc_request_addref);
2943
2944 /**
2945  * Add a request to import replay_list.
2946  * Must be called under imp_lock
2947  */
2948 void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
2949                                       struct obd_import *imp)
2950 {
2951         struct list_head *tmp;
2952
2953         assert_spin_locked(&imp->imp_lock);
2954
2955         if (req->rq_transno == 0) {
2956                 DEBUG_REQ(D_EMERG, req, "saving request with zero transno");
2957                 LBUG();
2958         }
2959
2960         /*
2961          * clear this for new requests that were resent as well
2962          * as resent replayed requests.
2963          */
2964         lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT);
2965
2966         /* don't re-add requests that have been replayed */
2967         if (!list_empty(&req->rq_replay_list))
2968                 return;
2969
2970         lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY);
2971
2972         spin_lock(&req->rq_lock);
2973         req->rq_resend = 0;
2974         spin_unlock(&req->rq_lock);
2975
2976         LASSERT(imp->imp_replayable);
2977         /* Balanced in ptlrpc_free_committed, usually. */
2978         ptlrpc_request_addref(req);
2979         list_for_each_prev(tmp, &imp->imp_replay_list) {
2980                 struct ptlrpc_request *iter = list_entry(tmp,
2981                                                          struct ptlrpc_request,
2982                                                          rq_replay_list);
2983
2984                 /*
2985                  * We may have duplicate transnos if we create and then
2986                  * open a file, or for closes retained if to match creating
2987                  * opens, so use req->rq_xid as a secondary key.
2988                  * (See bugs 684, 685, and 428.)
2989                  * XXX no longer needed, but all opens need transnos!
2990                  */
2991                 if (iter->rq_transno > req->rq_transno)
2992                         continue;
2993
2994                 if (iter->rq_transno == req->rq_transno) {
2995                         LASSERT(iter->rq_xid != req->rq_xid);
2996                         if (iter->rq_xid > req->rq_xid)
2997                                 continue;
2998                 }
2999
3000                 list_add(&req->rq_replay_list, &iter->rq_replay_list);
3001                 return;
3002         }
3003
3004         list_add(&req->rq_replay_list, &imp->imp_replay_list);
3005 }
3006
3007 /**
3008  * Send request and wait until it completes.
3009  * Returns request processing status.
3010  */
3011 int ptlrpc_queue_wait(struct ptlrpc_request *req)
3012 {
3013         struct ptlrpc_request_set *set;
3014         int rc;
3015
3016         ENTRY;
3017         LASSERT(req->rq_set == NULL);
3018         LASSERT(!req->rq_receiving_reply);
3019
3020         set = ptlrpc_prep_set();
3021         if (!set) {
3022                 CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
3023                 RETURN(-ENOMEM);
3024         }
3025
3026         /* for distributed debugging */
3027         lustre_msg_set_status(req->rq_reqmsg, current_pid());
3028
3029         /* add a ref for the set (see comment in ptlrpc_set_add_req) */
3030         ptlrpc_request_addref(req);
3031         ptlrpc_set_add_req(set, req);
3032         rc = ptlrpc_set_wait(NULL, set);
3033         ptlrpc_set_destroy(set);
3034
3035         RETURN(rc);
3036 }
3037 EXPORT_SYMBOL(ptlrpc_queue_wait);
3038
3039 /**
3040  * Callback used for replayed requests reply processing.
3041  * In case of successful reply calls registered request replay callback.
3042  * In case of error restart replay process.
3043  */
3044 static int ptlrpc_replay_interpret(const struct lu_env *env,
3045                                    struct ptlrpc_request *req,
3046                                    void *args, int rc)
3047 {
3048         struct ptlrpc_replay_async_args *aa = args;
3049         struct obd_import *imp = req->rq_import;
3050
3051         ENTRY;
3052         atomic_dec(&imp->imp_replay_inflight);
3053
3054         /*
3055          * Note: if it is bulk replay (MDS-MDS replay), then even if
3056          * server got the request, but bulk transfer timeout, let's
3057          * replay the bulk req again
3058          */
3059         if (!ptlrpc_client_replied(req) ||
3060             (req->rq_bulk &&
3061              lustre_msg_get_status(req->rq_repmsg) == -ETIMEDOUT)) {
3062                 DEBUG_REQ(D_ERROR, req, "request replay timed out.\n");
3063                 GOTO(out, rc = -ETIMEDOUT);
3064         }
3065
3066         if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR &&
3067             (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN ||
3068             lustre_msg_get_status(req->rq_repmsg) == -ENODEV))
3069                 GOTO(out, rc = lustre_msg_get_status(req->rq_repmsg));
3070
3071         /** VBR: check version failure */
3072         if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) {
3073                 /** replay was failed due to version mismatch */
3074                 DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n");
3075                 spin_lock(&imp->imp_lock);
3076                 imp->imp_vbr_failed = 1;
3077                 spin_unlock(&imp->imp_lock);
3078                 lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
3079         } else {
3080                 /** The transno had better not change over replay. */
3081                 LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) ==
3082                          lustre_msg_get_transno(req->rq_repmsg) ||
3083                          lustre_msg_get_transno(req->rq_repmsg) == 0,
3084                          "%#llx/%#llx\n",
3085                          lustre_msg_get_transno(req->rq_reqmsg),
3086                          lustre_msg_get_transno(req->rq_repmsg));
3087         }
3088
3089         spin_lock(&imp->imp_lock);
3090         imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg);
3091         spin_unlock(&imp->imp_lock);
3092         LASSERT(imp->imp_last_replay_transno);
3093
3094         /* transaction number shouldn't be bigger than the latest replayed */
3095         if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) {
3096                 DEBUG_REQ(D_ERROR, req,
3097                           "Reported transno %llu is bigger than the replayed one: %llu",
3098                           req->rq_transno,
3099                           lustre_msg_get_transno(req->rq_reqmsg));
3100                 GOTO(out, rc = -EINVAL);
3101         }
3102
3103         DEBUG_REQ(D_HA, req, "got rep");
3104
3105         /* let the callback do fixups, possibly including in the request */
3106         if (req->rq_replay_cb)
3107                 req->rq_replay_cb(req);
3108
3109         if (ptlrpc_client_replied(req) &&
3110             lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) {
3111                 DEBUG_REQ(D_ERROR, req, "status %d, old was %d",
3112                           lustre_msg_get_status(req->rq_repmsg),
3113                           aa->praa_old_status);
3114
3115                 /*
3116                  * Note: If the replay fails for MDT-MDT recovery, let's
3117                  * abort all of the following requests in the replay
3118                  * and sending list, because MDT-MDT update requests
3119                  * are dependent on each other, see LU-7039
3120                  */
3121                 if (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) {
3122                         struct ptlrpc_request *free_req;
3123                         struct ptlrpc_request *tmp;
3124
3125                         spin_lock(&imp->imp_lock);
3126                         list_for_each_entry_safe(free_req, tmp,
3127                                                  &imp->imp_replay_list,
3128                                                  rq_replay_list) {
3129                                 ptlrpc_free_request(free_req);
3130                         }
3131
3132                         list_for_each_entry_safe(free_req, tmp,
3133                                                  &imp->imp_committed_list,
3134                                                  rq_replay_list) {
3135                                 ptlrpc_free_request(free_req);
3136                         }
3137
3138                         list_for_each_entry_safe(free_req, tmp,
3139                                                  &imp->imp_delayed_list,
3140                                                  rq_list) {
3141                                 spin_lock(&free_req->rq_lock);
3142                                 free_req->rq_err = 1;
3143                                 free_req->rq_status = -EIO;
3144                                 ptlrpc_client_wake_req(free_req);
3145                                 spin_unlock(&free_req->rq_lock);
3146                         }
3147
3148                         list_for_each_entry_safe(free_req, tmp,
3149                                                  &imp->imp_sending_list,
3150                                                  rq_list) {
3151                                 spin_lock(&free_req->rq_lock);
3152                                 free_req->rq_err = 1;
3153                                 free_req->rq_status = -EIO;
3154                                 ptlrpc_client_wake_req(free_req);
3155                                 spin_unlock(&free_req->rq_lock);
3156                         }
3157                         spin_unlock(&imp->imp_lock);
3158                 }
3159         } else {
3160                 /* Put it back for re-replay. */
3161                 lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
3162         }
3163
3164         /*
3165          * Errors while replay can set transno to 0, but
3166          * imp_last_replay_transno shouldn't be set to 0 anyway
3167          */
3168         if (req->rq_transno == 0)
3169                 CERROR("Transno is 0 during replay!\n");
3170
3171         /* continue with recovery */
3172         rc = ptlrpc_import_recovery_state_machine(imp);
3173  out:
3174         req->rq_send_state = aa->praa_old_state;
3175
3176         if (rc != 0)
3177                 /* this replay failed, so restart recovery */
3178                 ptlrpc_connect_import(imp);
3179
3180         RETURN(rc);
3181 }
3182
3183 /**
3184  * Prepares and queues request for replay.
3185  * Adds it to ptlrpcd queue for actual sending.
3186  * Returns 0 on success.
3187  */
3188 int ptlrpc_replay_req(struct ptlrpc_request *req)
3189 {
3190         struct ptlrpc_replay_async_args *aa;
3191
3192         ENTRY;
3193
3194         LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY);
3195
3196         CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
3197         aa = ptlrpc_req_async_args(req);
3198         memset(aa, 0, sizeof(*aa));
3199
3200         /* Prepare request to be resent with ptlrpcd */
3201         aa->praa_old_state = req->rq_send_state;
3202         req->rq_send_state = LUSTRE_IMP_REPLAY;
3203         req->rq_phase = RQ_PHASE_NEW;
3204         req->rq_next_phase = RQ_PHASE_UNDEFINED;
3205         if (req->rq_repmsg)
3206                 aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg);
3207         req->rq_status = 0;
3208         req->rq_interpret_reply = ptlrpc_replay_interpret;
3209         /* Readjust the timeout for current conditions */
3210         ptlrpc_at_set_req_timeout(req);
3211
3212         /* Tell server net_latency to calculate how long to wait for reply. */
3213         lustre_msg_set_service_time(req->rq_reqmsg,
3214                                     ptlrpc_at_get_net_latency(req));
3215         DEBUG_REQ(D_HA, req, "REPLAY");
3216
3217         atomic_inc(&req->rq_import->imp_replay_inflight);
3218         spin_lock(&req->rq_lock);
3219         req->rq_early_free_repbuf = 0;
3220         spin_unlock(&req->rq_lock);
3221         ptlrpc_request_addref(req); /* ptlrpcd needs a ref */
3222
3223         ptlrpcd_add_req(req);
3224         RETURN(0);
3225 }
3226
3227 /**
3228  * Aborts all in-flight request on import \a imp sending and delayed lists
3229  */
3230 void ptlrpc_abort_inflight(struct obd_import *imp)
3231 {
3232         struct list_head *tmp, *n;
3233
3234         ENTRY;
3235         /*
3236          * Make sure that no new requests get processed for this import.
3237          * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing
3238          * this flag and then putting requests on sending_list or delayed_list.
3239          */
3240         spin_lock(&imp->imp_lock);
3241
3242         /*
3243          * XXX locking?  Maybe we should remove each request with the list
3244          * locked?  Also, how do we know if the requests on the list are
3245          * being freed at this time?
3246          */
3247         list_for_each_safe(tmp, n, &imp->imp_sending_list) {
3248                 struct ptlrpc_request *req = list_entry(tmp,
3249                                                         struct ptlrpc_request,
3250                                                         rq_list);
3251
3252                 DEBUG_REQ(D_RPCTRACE, req, "inflight");
3253
3254                 spin_lock(&req->rq_lock);
3255                 if (req->rq_import_generation < imp->imp_generation) {
3256                         req->rq_err = 1;
3257                         req->rq_status = -EIO;
3258                         ptlrpc_client_wake_req(req);
3259                 }
3260                 spin_unlock(&req->rq_lock);
3261         }
3262
3263         list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
3264                 struct ptlrpc_request *req =
3265                         list_entry(tmp, struct ptlrpc_request, rq_list);
3266
3267                 DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req");
3268
3269                 spin_lock(&req->rq_lock);
3270                 if (req->rq_import_generation < imp->imp_generation) {
3271                         req->rq_err = 1;
3272                         req->rq_status = -EIO;
3273                         ptlrpc_client_wake_req(req);
3274                 }
3275                 spin_unlock(&req->rq_lock);
3276         }
3277
3278         /*
3279          * Last chance to free reqs left on the replay list, but we
3280          * will still leak reqs that haven't committed.
3281          */
3282         if (imp->imp_replayable)
3283                 ptlrpc_free_committed(imp);
3284
3285         spin_unlock(&imp->imp_lock);
3286
3287         EXIT;
3288 }
3289
3290 /**
3291  * Abort all uncompleted requests in request set \a set
3292  */
3293 void ptlrpc_abort_set(struct ptlrpc_request_set *set)
3294 {
3295         struct list_head *tmp, *pos;
3296
3297         LASSERT(set != NULL);
3298
3299         list_for_each_safe(pos, tmp, &set->set_requests) {
3300                 struct ptlrpc_request *req =
3301                         list_entry(pos, struct ptlrpc_request,
3302                                    rq_set_chain);
3303
3304                 spin_lock(&req->rq_lock);
3305                 if (req->rq_phase != RQ_PHASE_RPC) {
3306                         spin_unlock(&req->rq_lock);
3307                         continue;
3308                 }
3309
3310                 req->rq_err = 1;
3311                 req->rq_status = -EINTR;
3312                 ptlrpc_client_wake_req(req);
3313                 spin_unlock(&req->rq_lock);
3314         }
3315 }
3316
3317 /**
3318  * Initialize the XID for the node.  This is common among all requests on
3319  * this node, and only requires the property that it is monotonically
3320  * increasing.  It does not need to be sequential.  Since this is also used
3321  * as the RDMA match bits, it is important that a single client NOT have
3322  * the same match bits for two different in-flight requests, hence we do
3323  * NOT want to have an XID per target or similar.
3324  *
3325  * To avoid an unlikely collision between match bits after a client reboot
3326  * (which would deliver old data into the wrong RDMA buffer) initialize
3327  * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s.
3328  * If the time is clearly incorrect, we instead use a 62-bit random number.
3329  * In the worst case the random number will overflow 1M RPCs per second in
3330  * 9133 years, or permutations thereof.
3331  */
3332 #define YEAR_2004 (1ULL << 30)
3333 void ptlrpc_init_xid(void)
3334 {
3335         time64_t now = ktime_get_real_seconds();
3336
3337         spin_lock_init(&ptlrpc_last_xid_lock);
3338         if (now < YEAR_2004) {
3339                 get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid));
3340                 ptlrpc_last_xid >>= 2;
3341                 ptlrpc_last_xid |= (1ULL << 61);
3342         } else {
3343                 ptlrpc_last_xid = (__u64)now << 20;
3344         }
3345
3346         /* Need to always be aligned to a power-of-two for mutli-bulk BRW */
3347         CLASSERT((PTLRPC_BULK_OPS_COUNT & (PTLRPC_BULK_OPS_COUNT - 1)) == 0);
3348         ptlrpc_last_xid &= PTLRPC_BULK_OPS_MASK;
3349 }
3350
3351 /**
3352  * Increase xid and returns resulting new value to the caller.
3353  *
3354  * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting
3355  * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC
3356  * itself uses the last bulk xid needed, so the server can determine the
3357  * the number of bulk transfers from the RPC XID and a bitmask.  The starting
3358  * xid must align to a power-of-two value.
3359  *
3360  * This is assumed to be true due to the initial ptlrpc_last_xid
3361  * value also being initialized to a power-of-two value. LU-1431
3362  */
3363 __u64 ptlrpc_next_xid(void)
3364 {
3365         __u64 next;
3366
3367         spin_lock(&ptlrpc_last_xid_lock);
3368         next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
3369         ptlrpc_last_xid = next;
3370         spin_unlock(&ptlrpc_last_xid_lock);
3371
3372         return next;
3373 }
3374
3375 /**
3376  * If request has a new allocated XID (new request or EINPROGRESS resend),
3377  * use this XID as matchbits of bulk, otherwise allocate a new matchbits for
3378  * request to ensure previous bulk fails and avoid problems with lost replies
3379  * and therefore several transfers landing into the same buffer from different
3380  * sending attempts.
3381  */
3382 void ptlrpc_set_bulk_mbits(struct ptlrpc_request *req)
3383 {
3384         struct ptlrpc_bulk_desc *bd = req->rq_bulk;
3385
3386         LASSERT(bd != NULL);
3387
3388         /*
3389          * Generate new matchbits for all resend requests, including
3390          * resend replay.
3391          */
3392         if (req->rq_resend) {
3393                 __u64 old_mbits = req->rq_mbits;
3394
3395                 /*
3396                  * First time resend on -EINPROGRESS will generate new xid,
3397                  * so we can actually use the rq_xid as rq_mbits in such case,
3398                  * however, it's bit hard to distinguish such resend with a
3399                  * 'resend for the -EINPROGRESS resend'. To make it simple,
3400                  * we opt to generate mbits for all resend cases.
3401                  */
3402                 if (OCD_HAS_FLAG(&bd->bd_import->imp_connect_data,
3403                                  BULK_MBITS)) {
3404                         req->rq_mbits = ptlrpc_next_xid();
3405                 } else {
3406                         /*
3407                          * Old version transfers rq_xid to peer as
3408                          * matchbits.
3409                          */
3410                         spin_lock(&req->rq_import->imp_lock);
3411                         list_del_init(&req->rq_unreplied_list);
3412                         ptlrpc_assign_next_xid_nolock(req);
3413                         spin_unlock(&req->rq_import->imp_lock);
3414                         req->rq_mbits = req->rq_xid;
3415                 }
3416                 CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n",
3417                        old_mbits, req->rq_mbits);
3418         } else if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) {
3419                 /* Request being sent first time, use xid as matchbits. */
3420                 if (OCD_HAS_FLAG(&bd->bd_import->imp_connect_data, BULK_MBITS)
3421                     || req->rq_mbits == 0) {
3422                         req->rq_mbits = req->rq_xid;
3423                 } else {
3424                         int total_md = (bd->bd_iov_count + LNET_MAX_IOV - 1) /
3425                                         LNET_MAX_IOV;
3426                         req->rq_mbits -= total_md - 1;
3427                 }
3428         } else {
3429                 /*
3430                  * Replay request, xid and matchbits have already been
3431                  * correctly assigned.
3432                  */
3433                 return;
3434         }
3435
3436         /*
3437          * For multi-bulk RPCs, rq_mbits is the last mbits needed for bulks so
3438          * that server can infer the number of bulks that were prepared,
3439          * see LU-1431
3440          */
3441         req->rq_mbits += ((bd->bd_iov_count + LNET_MAX_IOV - 1) /
3442                           LNET_MAX_IOV) - 1;
3443
3444         /*
3445          * Set rq_xid as rq_mbits to indicate the final bulk for the old
3446          * server which does not support OBD_CONNECT_BULK_MBITS. LU-6808.
3447          *
3448          * It's ok to directly set the rq_xid here, since this xid bump
3449          * won't affect the request position in unreplied list.
3450          */
3451         if (!OCD_HAS_FLAG(&bd->bd_import->imp_connect_data, BULK_MBITS))
3452                 req->rq_xid = req->rq_mbits;
3453 }
3454
3455 /**
3456  * Get a glimpse at what next xid value might have been.
3457  * Returns possible next xid.
3458  */
3459 __u64 ptlrpc_sample_next_xid(void)
3460 {
3461 #if BITS_PER_LONG == 32
3462         /* need to avoid possible word tearing on 32-bit systems */
3463         __u64 next;
3464
3465         spin_lock(&ptlrpc_last_xid_lock);
3466         next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
3467         spin_unlock(&ptlrpc_last_xid_lock);
3468
3469         return next;
3470 #else
3471         /* No need to lock, since returned value is racy anyways */
3472         return ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
3473 #endif
3474 }
3475 EXPORT_SYMBOL(ptlrpc_sample_next_xid);
3476
/**
 * Functions for operating ptlrpc workers.
 *
 * A ptlrpc work is a function that runs inside ptlrpc context.  The
 * callback shouldn't sleep, otherwise it will block that ptlrpcd thread.
 *
 * 1. Once a work is created it can be queued many times, that is:
 *         handler = ptlrpcd_alloc_work();
 *         ptlrpcd_queue_work();
 *
 *    queue it again when necessary, and destroy it when done:
 *         ptlrpcd_queue_work();
 *         ptlrpcd_destroy_work();
 * 2. ptlrpcd_queue_work() can be called by multiple processes at the same
 *    time, but the work will only be queued once at any time. Also, as its
 *    name implies, there may be a delay before the ptlrpcd thread really
 *    runs it.
 */
struct ptlrpc_work_async_args {
	/* work function; work_interpreter() asserts it is not NULL */
	int (*cb)(const struct lu_env *env, void *cbdata);
	/* opaque argument passed back to cb */
	void *cbdata;
};
3498
3499 static void ptlrpcd_add_work_req(struct ptlrpc_request *req)
3500 {
3501         /* re-initialize the req */
3502         req->rq_timeout         = obd_timeout;
3503         req->rq_sent            = ktime_get_real_seconds();
3504         req->rq_deadline        = req->rq_sent + req->rq_timeout;
3505         req->rq_phase           = RQ_PHASE_INTERPRET;
3506         req->rq_next_phase      = RQ_PHASE_COMPLETE;
3507         req->rq_xid             = ptlrpc_next_xid();
3508         req->rq_import_generation = req->rq_import->imp_generation;
3509
3510         ptlrpcd_add_req(req);
3511 }
3512
3513 static int work_interpreter(const struct lu_env *env,
3514                             struct ptlrpc_request *req, void *args, int rc)
3515 {
3516         struct ptlrpc_work_async_args *arg = args;
3517
3518         LASSERT(ptlrpcd_check_work(req));
3519         LASSERT(arg->cb != NULL);
3520
3521         rc = arg->cb(env, arg->cbdata);
3522
3523         list_del_init(&req->rq_set_chain);
3524         req->rq_set = NULL;
3525
3526         if (atomic_dec_return(&req->rq_refcount) > 1) {
3527                 atomic_set(&req->rq_refcount, 2);
3528                 ptlrpcd_add_work_req(req);
3529         }
3530         return rc;
3531 }
3532
3533 static int worker_format;
3534
3535 static int ptlrpcd_check_work(struct ptlrpc_request *req)
3536 {
3537         return req->rq_pill.rc_fmt == (void *)&worker_format;
3538 }
3539
3540 /**
3541  * Create a work for ptlrpc.
3542  */
3543 void *ptlrpcd_alloc_work(struct obd_import *imp,
3544                          int (*cb)(const struct lu_env *, void *), void *cbdata)
3545 {
3546         struct ptlrpc_request *req = NULL;
3547         struct ptlrpc_work_async_args *args;
3548
3549         ENTRY;
3550         might_sleep();
3551
3552         if (!cb)
3553                 RETURN(ERR_PTR(-EINVAL));
3554
3555         /* copy some code from deprecated fakereq. */
3556         req = ptlrpc_request_cache_alloc(GFP_NOFS);
3557         if (!req) {
3558                 CERROR("ptlrpc: run out of memory!\n");
3559                 RETURN(ERR_PTR(-ENOMEM));
3560         }
3561
3562         ptlrpc_cli_req_init(req);
3563
3564         req->rq_send_state = LUSTRE_IMP_FULL;
3565         req->rq_type = PTL_RPC_MSG_REQUEST;
3566         req->rq_import = class_import_get(imp);
3567         req->rq_interpret_reply = work_interpreter;
3568         /* don't want reply */
3569         req->rq_no_delay = req->rq_no_resend = 1;
3570         req->rq_pill.rc_fmt = (void *)&worker_format;
3571
3572         CLASSERT(sizeof(*args) <= sizeof(req->rq_async_args));
3573         args = ptlrpc_req_async_args(req);
3574         args->cb     = cb;
3575         args->cbdata = cbdata;
3576
3577         RETURN(req);
3578 }
3579 EXPORT_SYMBOL(ptlrpcd_alloc_work);
3580
3581 void ptlrpcd_destroy_work(void *handler)
3582 {
3583         struct ptlrpc_request *req = handler;
3584
3585         if (req)
3586                 ptlrpc_req_finished(req);
3587 }
3588 EXPORT_SYMBOL(ptlrpcd_destroy_work);
3589
3590 int ptlrpcd_queue_work(void *handler)
3591 {
3592         struct ptlrpc_request *req = handler;
3593
3594         /*
3595          * Check if the req is already being queued.
3596          *
3597          * Here comes a trick: it lacks a way of checking if a req is being
3598          * processed reliably in ptlrpc. Here I have to use refcount of req
3599          * for this purpose. This is okay because the caller should use this
3600          * req as opaque data. - Jinshan
3601          */
3602         LASSERT(atomic_read(&req->rq_refcount) > 0);
3603         if (atomic_inc_return(&req->rq_refcount) == 2)
3604                 ptlrpcd_add_work_req(req);
3605         return 0;
3606 }
3607 EXPORT_SYMBOL(ptlrpcd_queue_work);