lustre/ptlrpc/client.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  */
  32
  33 /** Implementation of client-side PortalRPC interfaces */
  34
  35 #define DEBUG_SUBSYSTEM S_RPC
  36
  37 #include <linux/delay.h>
  38 #include <obd_support.h>
  39 #include <obd_class.h>
  40 #include <lustre_lib.h>
  41 #include <lustre_ha.h>
  42 #include <lustre_import.h>
  43 #include <lustre_req_layout.h>
  44
  45 #include "ptlrpc_internal.h"
  46
/*
 * Fragment-operation table for kiov bulk descriptors: fragments are added
 * with ptlrpc_prep_bulk_page_pin() and released with
 * ptlrpc_release_bulk_page_pin().
 */
const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_pin_ops = {
        .add_kiov_frag  = ptlrpc_prep_bulk_page_pin,
        .release_frags  = ptlrpc_release_bulk_page_pin,
};
EXPORT_SYMBOL(ptlrpc_bulk_kiov_pin_ops);

/*
 * Fragment-operation table for kiov bulk descriptors: fragments are added
 * with ptlrpc_prep_bulk_page_nopin() and the release hook is
 * ptlrpc_release_bulk_noop().
 */
const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_nopin_ops = {
        .add_kiov_frag  = ptlrpc_prep_bulk_page_nopin,
        .release_frags  = ptlrpc_release_bulk_noop,
};
EXPORT_SYMBOL(ptlrpc_bulk_kiov_nopin_ops);

/*
 * Fragment-operation table for kvec bulk descriptors: fragments are added
 * with ptlrpc_prep_bulk_frag(). No release hook is set; ptlrpc_free_bulk()
 * checks release_frags for NULL before calling it.
 */
const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kvec_ops = {
        .add_iov_frag = ptlrpc_prep_bulk_frag,
};
EXPORT_SYMBOL(ptlrpc_bulk_kvec_ops);

/*
 * Forward declarations of file-local helpers.
 * NOTE(review): their definitions are not in the part of the file reviewed
 * here - presumably further down; confirm.
 */
static int ptlrpc_send_new_req(struct ptlrpc_request *req);
static int ptlrpcd_check_work(struct ptlrpc_request *req);
static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async);
  67
  68 /**
  69  * Initialize passed in client structure \a cl.
  70  */
  71 void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
  72                         struct ptlrpc_client *cl)
  73 {
  74         cl->cli_request_portal = req_portal;
  75         cl->cli_reply_portal   = rep_portal;
  76         cl->cli_name           = name;
  77 }
  78 EXPORT_SYMBOL(ptlrpc_init_client);
  79
  80 /**
  81  * Return PortalRPC connection for remore uud \a uuid
  82  */
  83 struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid,
  84                                                     lnet_nid_t nid4refnet)
  85 {
  86         struct ptlrpc_connection *c;
  87         lnet_nid_t self;
  88         struct lnet_process_id peer;
  89         int err;
  90
  91         /*
  92          * ptlrpc_uuid_to_peer() initializes its 2nd parameter
  93          * before accessing its values.
  94          */
  95         /* coverity[uninit_use_in_call] */
  96         peer.nid = nid4refnet;
  97         err = ptlrpc_uuid_to_peer(uuid, &peer, &self);
  98         if (err != 0) {
  99                 CNETERR("cannot find peer %s!\n", uuid->uuid);
 100                 return NULL;
 101         }
 102
 103         c = ptlrpc_connection_get(peer, self, uuid);
 104         if (c) {
 105                 memcpy(c->c_remote_uuid.uuid,
 106                        uuid->uuid, sizeof(c->c_remote_uuid.uuid));
 107         }
 108
 109         CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c);
 110
 111         return c;
 112 }
 113
 114 /**
 115  * Allocate and initialize new bulk descriptor on the sender.
 116  * Returns pointer to the descriptor or NULL on error.
 117  */
 118 struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned int nfrags,
 119                                          unsigned int max_brw,
 120                                          enum ptlrpc_bulk_op_type type,
 121                                          unsigned int portal,
 122                                          const struct ptlrpc_bulk_frag_ops *ops)
 123 {
 124         struct ptlrpc_bulk_desc *desc;
 125         int i;
 126
 127         /* ensure that only one of KIOV or IOVEC is set but not both */
 128         LASSERT((ptlrpc_is_bulk_desc_kiov(type) &&
 129                  ops->add_kiov_frag != NULL) ||
 130                 (ptlrpc_is_bulk_desc_kvec(type) &&
 131                  ops->add_iov_frag != NULL));
 132
 133         OBD_ALLOC_PTR(desc);
 134         if (!desc)
 135                 return NULL;
 136         if (type & PTLRPC_BULK_BUF_KIOV) {
 137                 OBD_ALLOC_LARGE(GET_KIOV(desc),
 138                                 nfrags * sizeof(*GET_KIOV(desc)));
 139                 if (!GET_KIOV(desc))
 140                         goto out;
 141         } else {
 142                 OBD_ALLOC_LARGE(GET_KVEC(desc),
 143                                 nfrags * sizeof(*GET_KVEC(desc)));
 144                 if (!GET_KVEC(desc))
 145                         goto out;
 146         }
 147
 148         spin_lock_init(&desc->bd_lock);
 149         init_waitqueue_head(&desc->bd_waitq);
 150         desc->bd_max_iov = nfrags;
 151         desc->bd_iov_count = 0;
 152         desc->bd_portal = portal;
 153         desc->bd_type = type;
 154         desc->bd_md_count = 0;
 155         desc->bd_frag_ops = (struct ptlrpc_bulk_frag_ops *)ops;
 156         LASSERT(max_brw > 0);
 157         desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT);
 158         /*
 159          * PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this
 160          * node. Negotiated ocd_brw_size will always be <= this number.
 161          */
 162         for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++)
 163                 LNetInvalidateMDHandle(&desc->bd_mds[i]);
 164
 165         return desc;
 166 out:
 167         OBD_FREE_PTR(desc);
 168         return NULL;
 169 }
 170
 171 /**
 172  * Prepare bulk descriptor for specified outgoing request \a req that
 173  * can fit \a nfrags * pages. \a type is bulk type. \a portal is where
 174  * the bulk to be sent. Used on client-side.
 175  * Returns pointer to newly allocatrd initialized bulk descriptor or NULL on
 176  * error.
 177  */
 178 struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
 179                                               unsigned int nfrags,
 180                                               unsigned int max_brw,
 181                                               unsigned int type,
 182                                               unsigned int portal,
 183                                               const struct ptlrpc_bulk_frag_ops
 184                                                 *ops)
 185 {
 186         struct obd_import *imp = req->rq_import;
 187         struct ptlrpc_bulk_desc *desc;
 188
 189         ENTRY;
 190         LASSERT(ptlrpc_is_bulk_op_passive(type));
 191
 192         desc = ptlrpc_new_bulk(nfrags, max_brw, type, portal, ops);
 193         if (!desc)
 194                 RETURN(NULL);
 195
 196         desc->bd_import_generation = req->rq_import_generation;
 197         desc->bd_import = class_import_get(imp);
 198         desc->bd_req = req;
 199
 200         desc->bd_cbid.cbid_fn  = client_bulk_callback;
 201         desc->bd_cbid.cbid_arg = desc;
 202
 203         /* This makes req own desc, and free it when she frees herself */
 204         req->rq_bulk = desc;
 205
 206         return desc;
 207 }
 208 EXPORT_SYMBOL(ptlrpc_prep_bulk_imp);
 209
 210 void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
 211                              struct page *page, int pageoffset, int len,
 212                              int pin)
 213 {
 214         lnet_kiov_t *kiov;
 215
 216         LASSERT(desc->bd_iov_count < desc->bd_max_iov);
 217         LASSERT(page != NULL);
 218         LASSERT(pageoffset >= 0);
 219         LASSERT(len > 0);
 220         LASSERT(pageoffset + len <= PAGE_SIZE);
 221         LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type));
 222
 223         kiov = &BD_GET_KIOV(desc, desc->bd_iov_count);
 224
 225         desc->bd_nob += len;
 226
 227         if (pin)
 228                 get_page(page);
 229
 230         kiov->kiov_page = page;
 231         kiov->kiov_offset = pageoffset;
 232         kiov->kiov_len = len;
 233
 234         desc->bd_iov_count++;
 235 }
 236 EXPORT_SYMBOL(__ptlrpc_prep_bulk_page);
 237
 238 int ptlrpc_prep_bulk_frag(struct ptlrpc_bulk_desc *desc,
 239                           void *frag, int len)
 240 {
 241         struct kvec *iovec;
 242
 243         ENTRY;
 244
 245         LASSERT(desc->bd_iov_count < desc->bd_max_iov);
 246         LASSERT(frag != NULL);
 247         LASSERT(len > 0);
 248         LASSERT(ptlrpc_is_bulk_desc_kvec(desc->bd_type));
 249
 250         iovec = &BD_GET_KVEC(desc, desc->bd_iov_count);
 251
 252         desc->bd_nob += len;
 253
 254         iovec->iov_base = frag;
 255         iovec->iov_len = len;
 256
 257         desc->bd_iov_count++;
 258
 259         RETURN(desc->bd_nob);
 260 }
 261 EXPORT_SYMBOL(ptlrpc_prep_bulk_frag);
 262
 263 void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc)
 264 {
 265         ENTRY;
 266
 267         LASSERT(desc != NULL);
 268         LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */
 269         LASSERT(desc->bd_md_count == 0);         /* network hands off */
 270         LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL));
 271         LASSERT(desc->bd_frag_ops != NULL);
 272
 273         if (ptlrpc_is_bulk_desc_kiov(desc->bd_type))
 274                 sptlrpc_enc_pool_put_pages(desc);
 275
 276         if (desc->bd_export)
 277                 class_export_put(desc->bd_export);
 278         else
 279                 class_import_put(desc->bd_import);
 280
 281         if (desc->bd_frag_ops->release_frags != NULL)
 282                 desc->bd_frag_ops->release_frags(desc);
 283
 284         if (ptlrpc_is_bulk_desc_kiov(desc->bd_type))
 285                 OBD_FREE_LARGE(GET_KIOV(desc),
 286                                desc->bd_max_iov * sizeof(*GET_KIOV(desc)));
 287         else
 288                 OBD_FREE_LARGE(GET_KVEC(desc),
 289                                desc->bd_max_iov * sizeof(*GET_KVEC(desc)));
 290         OBD_FREE_PTR(desc);
 291         EXIT;
 292 }
 293 EXPORT_SYMBOL(ptlrpc_free_bulk);
 294
 295 /**
 296  * Set server timelimit for this req, i.e. how long are we willing to wait
 297  * for reply before timing out this request.
 298  */
 299 void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
 300 {
 301         __u32 serv_est;
 302         int idx;
 303         struct imp_at *at;
 304
 305         LASSERT(req->rq_import);
 306
 307         if (AT_OFF) {
 308                 /* non-AT settings */
 309                 /**
 310                  * \a imp_server_timeout means this is reverse import and
 311                  * we send (currently only) ASTs to the client and cannot afford
 312                  * to wait too long for the reply, otherwise the other client
 313                  * (because of which we are sending this request) would
 314                  * timeout waiting for us
 315                  */
 316                 req->rq_timeout = req->rq_import->imp_server_timeout ?
 317                                   obd_timeout / 2 : obd_timeout;
 318         } else {
 319                 at = &req->rq_import->imp_at;
 320                 idx = import_at_get_index(req->rq_import,
 321                                           req->rq_request_portal);
 322                 serv_est = at_get(&at->iat_service_estimate[idx]);
 323                 req->rq_timeout = at_est2timeout(serv_est);
 324         }
 325         /*
 326          * We could get even fancier here, using history to predict increased
 327          * loading...
 328          */
 329
 330         /*
 331          * Let the server know what this RPC timeout is by putting it in the
 332          * reqmsg
 333          */
 334         lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
 335 }
 336 EXPORT_SYMBOL(ptlrpc_at_set_req_timeout);
 337
 338 /* Adjust max service estimate based on server value */
 339 static void ptlrpc_at_adj_service(struct ptlrpc_request *req,
 340                                   unsigned int serv_est)
 341 {
 342         int idx;
 343         unsigned int oldse;
 344         struct imp_at *at;
 345
 346         LASSERT(req->rq_import);
 347         at = &req->rq_import->imp_at;
 348
 349         idx = import_at_get_index(req->rq_import, req->rq_request_portal);
 350         /*
 351          * max service estimates are tracked on the server side,
 352          * so just keep minimal history here
 353          */
 354         oldse = at_measured(&at->iat_service_estimate[idx], serv_est);
 355         if (oldse != 0)
 356                 CDEBUG(D_ADAPTTO,
 357                        "The RPC service estimate for %s ptl %d has changed from %d to %d\n",
 358                        req->rq_import->imp_obd->obd_name,
 359                        req->rq_request_portal,
 360                        oldse, at_get(&at->iat_service_estimate[idx]));
 361 }
 362
 363 /* Expected network latency per remote node (secs) */
 364 int ptlrpc_at_get_net_latency(struct ptlrpc_request *req)
 365 {
 366         return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency);
 367 }
 368
 369 /* Adjust expected network latency */
 370 void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req,
 371                                unsigned int service_time)
 372 {
 373         unsigned int nl, oldnl;
 374         struct imp_at *at;
 375         time64_t now = ktime_get_real_seconds();
 376
 377         LASSERT(req->rq_import);
 378
 379         if (service_time > now - req->rq_sent + 3) {
 380                 /*
 381                  * b=16408, however, this can also happen if early reply
 382                  * is lost and client RPC is expired and resent, early reply
 383                  * or reply of original RPC can still be fit in reply buffer
 384                  * of resent RPC, now client is measuring time from the
 385                  * resent time, but server sent back service time of original
 386                  * RPC.
 387                  */
 388                 CDEBUG((lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) ?
 389                        D_ADAPTTO : D_WARNING,
 390                        "Reported service time %u > total measured time %lld\n",
 391                        service_time, now - req->rq_sent);
 392                 return;
 393         }
 394
 395         /* Network latency is total time less server processing time */
 396         nl = max_t(int, now - req->rq_sent -
 397                         service_time, 0) + 1; /* st rounding */
 398         at = &req->rq_import->imp_at;
 399
 400         oldnl = at_measured(&at->iat_net_latency, nl);
 401         if (oldnl != 0)
 402                 CDEBUG(D_ADAPTTO,
 403                        "The network latency for %s (nid %s) has changed from %d to %d\n",
 404                        req->rq_import->imp_obd->obd_name,
 405                        obd_uuid2str(&req->rq_import->imp_connection->c_remote_uuid),
 406                        oldnl, at_get(&at->iat_net_latency));
 407 }
 408
 409 static int unpack_reply(struct ptlrpc_request *req)
 410 {
 411         int rc;
 412
 413         if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) {
 414                 rc = ptlrpc_unpack_rep_msg(req, req->rq_replen);
 415                 if (rc) {
 416                         DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc);
 417                         return -EPROTO;
 418                 }
 419         }
 420
 421         rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
 422         if (rc) {
 423                 DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc);
 424                 return -EPROTO;
 425         }
 426         return 0;
 427 }
 428
 429 /**
 430  * Handle an early reply message, called with the rq_lock held.
 431  * If anything goes wrong just ignore it - same as if it never happened
 432  */
 433 static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req)
 434 __must_hold(&req->rq_lock)
 435 {
 436         struct ptlrpc_request *early_req;
 437         time64_t olddl;
 438         int rc;
 439
 440         ENTRY;
 441         req->rq_early = 0;
 442         spin_unlock(&req->rq_lock);
 443
 444         rc = sptlrpc_cli_unwrap_early_reply(req, &early_req);
 445         if (rc) {
 446                 spin_lock(&req->rq_lock);
 447                 RETURN(rc);
 448         }
 449
 450         rc = unpack_reply(early_req);
 451         if (rc != 0) {
 452                 sptlrpc_cli_finish_early_reply(early_req);
 453                 spin_lock(&req->rq_lock);
 454                 RETURN(rc);
 455         }
 456
 457         /*
 458          * Use new timeout value just to adjust the local value for this
 459          * request, don't include it into at_history. It is unclear yet why
 460          * service time increased and should it be counted or skipped, e.g.
 461          * that can be recovery case or some error or server, the real reply
 462          * will add all new data if it is worth to add.
 463          */
 464         req->rq_timeout = lustre_msg_get_timeout(early_req->rq_repmsg);
 465         lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
 466
 467         /* Network latency can be adjusted, it is pure network delays */
 468         ptlrpc_at_adj_net_latency(req,
 469                                   lustre_msg_get_service_time(early_req->rq_repmsg));
 470
 471         sptlrpc_cli_finish_early_reply(early_req);
 472
 473         spin_lock(&req->rq_lock);
 474         olddl = req->rq_deadline;
 475         /*
 476          * server assumes it now has rq_timeout from when the request
 477          * arrived, so the client should give it at least that long.
 478          * since we don't know the arrival time we'll use the original
 479          * sent time
 480          */
 481         req->rq_deadline = req->rq_sent + req->rq_timeout +
 482                            ptlrpc_at_get_net_latency(req);
 483
 484         DEBUG_REQ(D_ADAPTTO, req,
 485                   "Early reply #%d, new deadline in %llds (%llds)",
 486                   req->rq_early_count,
 487                   req->rq_deadline - ktime_get_real_seconds(),
 488                   req->rq_deadline - olddl);
 489
 490         RETURN(rc);
 491 }
 492
 493 static struct kmem_cache *request_cache;
 494
 495 int ptlrpc_request_cache_init(void)
 496 {
 497         request_cache = kmem_cache_create("ptlrpc_cache",
 498                                           sizeof(struct ptlrpc_request),
 499                                           0, SLAB_HWCACHE_ALIGN, NULL);
 500         return request_cache ? 0 : -ENOMEM;
 501 }
 502
 503 void ptlrpc_request_cache_fini(void)
 504 {
 505         kmem_cache_destroy(request_cache);
 506 }
 507
 508 struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags)
 509 {
 510         struct ptlrpc_request *req;
 511
 512         OBD_SLAB_ALLOC_PTR_GFP(req, request_cache, flags);
 513         return req;
 514 }
 515
 516 void ptlrpc_request_cache_free(struct ptlrpc_request *req)
 517 {
 518         OBD_SLAB_FREE_PTR(req, request_cache);
 519 }
 520
 521 /**
 522  * Wind down request pool \a pool.
 523  * Frees all requests from the pool too
 524  */
 525 void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool)
 526 {
 527         struct list_head *l, *tmp;
 528         struct ptlrpc_request *req;
 529
 530         LASSERT(pool != NULL);
 531
 532         spin_lock(&pool->prp_lock);
 533         list_for_each_safe(l, tmp, &pool->prp_req_list) {
 534                 req = list_entry(l, struct ptlrpc_request, rq_list);
 535                 list_del(&req->rq_list);
 536                 LASSERT(req->rq_reqbuf);
 537                 LASSERT(req->rq_reqbuf_len == pool->prp_rq_size);
 538                 OBD_FREE_LARGE(req->rq_reqbuf, pool->prp_rq_size);
 539                 ptlrpc_request_cache_free(req);
 540         }
 541         spin_unlock(&pool->prp_lock);
 542         OBD_FREE(pool, sizeof(*pool));
 543 }
 544 EXPORT_SYMBOL(ptlrpc_free_rq_pool);
 545
 546 /**
 547  * Allocates, initializes and adds \a num_rq requests to the pool \a pool
 548  */
 549 int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
 550 {
 551         int i;
 552         int size = 1;
 553
 554         while (size < pool->prp_rq_size)
 555                 size <<= 1;
 556
 557         LASSERTF(list_empty(&pool->prp_req_list) ||
 558                  size == pool->prp_rq_size,
 559                  "Trying to change pool size with nonempty pool from %d to %d bytes\n",
 560                  pool->prp_rq_size, size);
 561
 562         spin_lock(&pool->prp_lock);
 563         pool->prp_rq_size = size;
 564         for (i = 0; i < num_rq; i++) {
 565                 struct ptlrpc_request *req;
 566                 struct lustre_msg *msg;
 567
 568                 spin_unlock(&pool->prp_lock);
 569                 req = ptlrpc_request_cache_alloc(GFP_NOFS);
 570                 if (!req)
 571                         return i;
 572                 OBD_ALLOC_LARGE(msg, size);
 573                 if (!msg) {
 574                         ptlrpc_request_cache_free(req);
 575                         return i;
 576                 }
 577                 req->rq_reqbuf = msg;
 578                 req->rq_reqbuf_len = size;
 579                 req->rq_pool = pool;
 580                 spin_lock(&pool->prp_lock);
 581                 list_add_tail(&req->rq_list, &pool->prp_req_list);
 582         }
 583         spin_unlock(&pool->prp_lock);
 584         return num_rq;
 585 }
 586 EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool);
 587
 588 /**
 589  * Create and initialize new request pool with given attributes:
 590  * \a num_rq - initial number of requests to create for the pool
 591  * \a msgsize - maximum message size possible for requests in thid pool
 592  * \a populate_pool - function to be called when more requests need to be added
 593  *                    to the pool
 594  * Returns pointer to newly created pool or NULL on error.
 595  */
 596 struct ptlrpc_request_pool *
 597 ptlrpc_init_rq_pool(int num_rq, int msgsize,
 598                     int (*populate_pool)(struct ptlrpc_request_pool *, int))
 599 {
 600         struct ptlrpc_request_pool *pool;
 601
 602         OBD_ALLOC(pool, sizeof(struct ptlrpc_request_pool));
 603         if (!pool)
 604                 return NULL;
 605
 606         /*
 607          * Request next power of two for the allocation, because internally
 608          * kernel would do exactly this
 609          */
 610         spin_lock_init(&pool->prp_lock);
 611         INIT_LIST_HEAD(&pool->prp_req_list);
 612         pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD;
 613         pool->prp_populate = populate_pool;
 614
 615         populate_pool(pool, num_rq);
 616
 617         return pool;
 618 }
 619 EXPORT_SYMBOL(ptlrpc_init_rq_pool);
 620
 621 /**
 622  * Fetches one request from pool \a pool
 623  */
 624 static struct ptlrpc_request *
 625 ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool)
 626 {
 627         struct ptlrpc_request *request;
 628         struct lustre_msg *reqbuf;
 629
 630         if (!pool)
 631                 return NULL;
 632
 633         spin_lock(&pool->prp_lock);
 634
 635         /*
 636          * See if we have anything in a pool, and bail out if nothing,
 637          * in writeout path, where this matters, this is safe to do, because
 638          * nothing is lost in this case, and when some in-flight requests
 639          * complete, this code will be called again.
 640          */
 641         if (unlikely(list_empty(&pool->prp_req_list))) {
 642                 spin_unlock(&pool->prp_lock);
 643                 return NULL;
 644         }
 645
 646         request = list_entry(pool->prp_req_list.next, struct ptlrpc_request,
 647                              rq_list);
 648         list_del_init(&request->rq_list);
 649         spin_unlock(&pool->prp_lock);
 650
 651         LASSERT(request->rq_reqbuf);
 652         LASSERT(request->rq_pool);
 653
 654         reqbuf = request->rq_reqbuf;
 655         memset(request, 0, sizeof(*request));
 656         request->rq_reqbuf = reqbuf;
 657         request->rq_reqbuf_len = pool->prp_rq_size;
 658         request->rq_pool = pool;
 659
 660         return request;
 661 }
 662
 663 /**
 664  * Returns freed \a request to pool.
 665  */
 666 static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request)
 667 {
 668         struct ptlrpc_request_pool *pool = request->rq_pool;
 669
 670         spin_lock(&pool->prp_lock);
 671         LASSERT(list_empty(&request->rq_list));
 672         LASSERT(!request->rq_receiving_reply);
 673         list_add_tail(&request->rq_list, &pool->prp_req_list);
 674         spin_unlock(&pool->prp_lock);
 675 }
 676
 677 void ptlrpc_add_unreplied(struct ptlrpc_request *req)
 678 {
 679         struct obd_import *imp = req->rq_import;
 680         struct list_head *tmp;
 681         struct ptlrpc_request *iter;
 682
 683         assert_spin_locked(&imp->imp_lock);
 684         LASSERT(list_empty(&req->rq_unreplied_list));
 685
 686         /* unreplied list is sorted by xid in ascending order */
 687         list_for_each_prev(tmp, &imp->imp_unreplied_list) {
 688                 iter = list_entry(tmp, struct ptlrpc_request,
 689                                   rq_unreplied_list);
 690
 691                 LASSERT(req->rq_xid != iter->rq_xid);
 692                 if (req->rq_xid < iter->rq_xid)
 693                         continue;
 694                 list_add(&req->rq_unreplied_list, &iter->rq_unreplied_list);
 695                 return;
 696         }
 697         list_add(&req->rq_unreplied_list, &imp->imp_unreplied_list);
 698 }
 699
 700 void ptlrpc_assign_next_xid_nolock(struct ptlrpc_request *req)
 701 {
 702         req->rq_xid = ptlrpc_next_xid();
 703         ptlrpc_add_unreplied(req);
 704 }
 705
 706 static inline void ptlrpc_assign_next_xid(struct ptlrpc_request *req)
 707 {
 708         spin_lock(&req->rq_import->imp_lock);
 709         ptlrpc_assign_next_xid_nolock(req);
 710         spin_unlock(&req->rq_import->imp_lock);
 711 }
 712
/*
 * Last xid value; ptlrpc_request_bufs_pack() overwrites it under
 * ptlrpc_last_xid_lock when the OBD_FAIL_PTLRPC_ROUND_XID fail check fires.
 * NOTE(review): presumably this is what ptlrpc_next_xid() advances - confirm.
 */
static __u64 ptlrpc_last_xid;
/*
 * Protects ptlrpc_last_xid.
 * NOTE(review): its initialization is not in the part of the file reviewed
 * here - confirm.
 */
static spinlock_t ptlrpc_last_xid_lock;
 715
 716 int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
 717                              __u32 version, int opcode, char **bufs,
 718                              struct ptlrpc_cli_ctx *ctx)
 719 {
 720         int count;
 721         struct obd_import *imp;
 722         __u32 *lengths;
 723         int rc;
 724
 725         ENTRY;
 726
 727         count = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT);
 728         imp = request->rq_import;
 729         lengths = request->rq_pill.rc_area[RCL_CLIENT];
 730
 731         if (ctx) {
 732                 request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx);
 733         } else {
 734                 rc = sptlrpc_req_get_ctx(request);
 735                 if (rc)
 736                         GOTO(out_free, rc);
 737         }
 738         sptlrpc_req_set_flavor(request, opcode);
 739
 740         rc = lustre_pack_request(request, imp->imp_msg_magic, count,
 741                                  lengths, bufs);
 742         if (rc)
 743                 GOTO(out_ctx, rc);
 744
 745         lustre_msg_add_version(request->rq_reqmsg, version);
 746         request->rq_send_state = LUSTRE_IMP_FULL;
 747         request->rq_type = PTL_RPC_MSG_REQUEST;
 748
 749         request->rq_req_cbid.cbid_fn  = request_out_callback;
 750         request->rq_req_cbid.cbid_arg = request;
 751
 752         request->rq_reply_cbid.cbid_fn  = reply_in_callback;
 753         request->rq_reply_cbid.cbid_arg = request;
 754
 755         request->rq_reply_deadline = 0;
 756         request->rq_bulk_deadline = 0;
 757         request->rq_req_deadline = 0;
 758         request->rq_phase = RQ_PHASE_NEW;
 759         request->rq_next_phase = RQ_PHASE_UNDEFINED;
 760
 761         request->rq_request_portal = imp->imp_client->cli_request_portal;
 762         request->rq_reply_portal = imp->imp_client->cli_reply_portal;
 763
 764         ptlrpc_at_set_req_timeout(request);
 765
 766         lustre_msg_set_opc(request->rq_reqmsg, opcode);
 767
 768         /* Let's setup deadline for req/reply/bulk unlink for opcode. */
 769         if (cfs_fail_val == opcode) {
 770                 time64_t *fail_t = NULL, *fail2_t = NULL;
 771
 772                 if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) {
 773                         fail_t = &request->rq_bulk_deadline;
 774                 } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
 775                         fail_t = &request->rq_reply_deadline;
 776                 } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REQ_UNLINK)) {
 777                         fail_t = &request->rq_req_deadline;
 778                 } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK)) {
 779                         fail_t = &request->rq_reply_deadline;
 780                         fail2_t = &request->rq_bulk_deadline;
 781                 } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_ROUND_XID)) {
 782                         time64_t now = ktime_get_real_seconds();
 783                         spin_lock(&ptlrpc_last_xid_lock);
 784                         ptlrpc_last_xid = ((__u64)now >> 4) << 24;
 785                         spin_unlock(&ptlrpc_last_xid_lock);
 786                 }
 787
 788                 if (fail_t) {
 789                         *fail_t = ktime_get_real_seconds() + LONG_UNLINK;
 790
 791                         if (fail2_t)
 792                                 *fail2_t = ktime_get_real_seconds() +
 793                                            LONG_UNLINK;
 794
 795                         /*
 796                          * The RPC is infected, let the test to change the
 797                          * fail_loc
 798                          */
 799                         msleep(4 * MSEC_PER_SEC);
 800                 }
 801         }
 802         ptlrpc_assign_next_xid(request);
 803
 804         RETURN(0);
 805
 806 out_ctx:
 807         LASSERT(!request->rq_pool);
 808         sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1);
 809 out_free:
 810         class_import_put(imp);
 811
 812         return rc;
 813 }
 814 EXPORT_SYMBOL(ptlrpc_request_bufs_pack);
 815
 816 /**
 817  * Pack request buffers for network transfer, performing necessary encryption
 818  * steps if necessary.
 819  */
 820 int ptlrpc_request_pack(struct ptlrpc_request *request,
 821                         __u32 version, int opcode)
 822 {
 823         int rc;
 824
 825         rc = ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL);
 826         if (rc)
 827                 return rc;
 828
 829         /*
 830          * For some old 1.8 clients (< 1.8.7), they will LASSERT the size of
 831          * ptlrpc_body sent from server equal to local ptlrpc_body size, so we
 832          * have to send old ptlrpc_body to keep interoprability with these
 833          * clients.
 834          *
 835          * Only three kinds of server->client RPCs so far:
 836          *  - LDLM_BL_CALLBACK
 837          *  - LDLM_CP_CALLBACK
 838          *  - LDLM_GL_CALLBACK
 839          *
 840          * XXX This should be removed whenever we drop the interoprability with
 841          *     the these old clients.
 842          */
 843         if (opcode == LDLM_BL_CALLBACK || opcode == LDLM_CP_CALLBACK ||
 844             opcode == LDLM_GL_CALLBACK)
 845                 req_capsule_shrink(&request->rq_pill, &RMF_PTLRPC_BODY,
 846                                    sizeof(struct ptlrpc_body_v2), RCL_CLIENT);
 847
 848         return rc;
 849 }
 850 EXPORT_SYMBOL(ptlrpc_request_pack);
 851
 852 /**
 853  * Helper function to allocate new request on import \a imp
 854  * and possibly using existing request from pool \a pool if provided.
 855  * Returns allocated request structure with import field filled or
 856  * NULL on error.
 857  */
 858 static inline
 859 struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp,
 860                                               struct ptlrpc_request_pool *pool)
 861 {
 862         struct ptlrpc_request *request = NULL;
 863
 864         request = ptlrpc_request_cache_alloc(GFP_NOFS);
 865
 866         if (!request && pool)
 867                 request = ptlrpc_prep_req_from_pool(pool);
 868
 869         if (request) {
 870                 ptlrpc_cli_req_init(request);
 871
 872                 LASSERTF((unsigned long)imp > 0x1000, "%p", imp);
 873                 LASSERT(imp != LP_POISON);
 874                 LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p\n",
 875                          imp->imp_client);
 876                 LASSERT(imp->imp_client != LP_POISON);
 877
 878                 request->rq_import = class_import_get(imp);
 879         } else {
 880                 CERROR("request allocation out of memory\n");
 881         }
 882
 883         return request;
 884 }
 885
 886 /**
 887  * Helper function for creating a request.
 888  * Calls __ptlrpc_request_alloc to allocate new request sturcture and inits
 889  * buffer structures according to capsule template \a format.
 890  * Returns allocated request structure pointer or NULL on error.
 891  */
 892 static struct ptlrpc_request *
 893 ptlrpc_request_alloc_internal(struct obd_import *imp,
 894                               struct ptlrpc_request_pool *pool,
 895                               const struct req_format *format)
 896 {
 897         struct ptlrpc_request *request;
 898         int connect = 0;
 899
 900         request = __ptlrpc_request_alloc(imp, pool);
 901         if (!request)
 902                 return NULL;
 903
 904         /*
 905          * initiate connection if needed when the import has been
 906          * referenced by the new request to avoid races with disconnect
 907          */
 908         if (unlikely(imp->imp_state == LUSTRE_IMP_IDLE)) {
 909                 int rc;
 910
 911                 CDEBUG_LIMIT(imp->imp_idle_debug,
 912                              "%s: reconnect after %llds idle\n",
 913                              imp->imp_obd->obd_name, ktime_get_real_seconds() -
 914                                                      imp->imp_last_reply_time);
 915                 spin_lock(&imp->imp_lock);
 916                 if (imp->imp_state == LUSTRE_IMP_IDLE) {
 917                         imp->imp_generation++;
 918                         imp->imp_initiated_at = imp->imp_generation;
 919                         imp->imp_state =  LUSTRE_IMP_NEW;
 920                         connect = 1;
 921                 }
 922                 spin_unlock(&imp->imp_lock);
 923                 if (connect) {
 924                         rc = ptlrpc_connect_import(imp);
 925                         if (rc < 0) {
 926                                 ptlrpc_request_free(request);
 927                                 return NULL;
 928                         }
 929                         ptlrpc_pinger_add_import(imp);
 930                 }
 931         }
 932
 933         req_capsule_init(&request->rq_pill, request, RCL_CLIENT);
 934         req_capsule_set(&request->rq_pill, format);
 935         return request;
 936 }
 937
 938 /**
 939  * Allocate new request structure for import \a imp and initialize its
 940  * buffer structure according to capsule template \a format.
 941  */
 942 struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
 943                                             const struct req_format *format)
 944 {
 945         return ptlrpc_request_alloc_internal(imp, NULL, format);
 946 }
 947 EXPORT_SYMBOL(ptlrpc_request_alloc);
 948
 949 /**
 950  * Allocate new request structure for import \a imp from pool \a pool and
 951  * initialize its buffer structure according to capsule template \a format.
 952  */
 953 struct ptlrpc_request *
 954 ptlrpc_request_alloc_pool(struct obd_import *imp,
 955                           struct ptlrpc_request_pool *pool,
 956                           const struct req_format *format)
 957 {
 958         return ptlrpc_request_alloc_internal(imp, pool, format);
 959 }
 960 EXPORT_SYMBOL(ptlrpc_request_alloc_pool);
 961
 962 /**
 963  * For requests not from pool, free memory of the request structure.
 964  * For requests obtained from a pool earlier, return request back to pool.
 965  */
 966 void ptlrpc_request_free(struct ptlrpc_request *request)
 967 {
 968         if (request->rq_pool)
 969                 __ptlrpc_free_req_to_pool(request);
 970         else
 971                 ptlrpc_request_cache_free(request);
 972 }
 973 EXPORT_SYMBOL(ptlrpc_request_free);
 974
 975 /**
 976  * Allocate new request for operatione \a opcode and immediatelly pack it for
 977  * network transfer.
 978  * Only used for simple requests like OBD_PING where the only important
 979  * part of the request is operation itself.
 980  * Returns allocated request or NULL on error.
 981  */
 982 struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
 983                                                  const struct req_format *format,
 984                                                  __u32 version, int opcode)
 985 {
 986         struct ptlrpc_request *req = ptlrpc_request_alloc(imp, format);
 987         int rc;
 988
 989         if (req) {
 990                 rc = ptlrpc_request_pack(req, version, opcode);
 991                 if (rc) {
 992                         ptlrpc_request_free(req);
 993                         req = NULL;
 994                 }
 995         }
 996         return req;
 997 }
 998 EXPORT_SYMBOL(ptlrpc_request_alloc_pack);
 999
1000 /**
1001  * Allocate and initialize new request set structure on the current CPT.
1002  * Returns a pointer to the newly allocated set structure or NULL on error.
1003  */
1004 struct ptlrpc_request_set *ptlrpc_prep_set(void)
1005 {
1006         struct ptlrpc_request_set *set;
1007         int cpt;
1008
1009         ENTRY;
1010         cpt = cfs_cpt_current(cfs_cpt_table, 0);
1011         OBD_CPT_ALLOC(set, cfs_cpt_table, cpt, sizeof(*set));
1012         if (!set)
1013                 RETURN(NULL);
1014         atomic_set(&set->set_refcount, 1);
1015         INIT_LIST_HEAD(&set->set_requests);
1016         init_waitqueue_head(&set->set_waitq);
1017         atomic_set(&set->set_new_count, 0);
1018         atomic_set(&set->set_remaining, 0);
1019         spin_lock_init(&set->set_new_req_lock);
1020         INIT_LIST_HEAD(&set->set_new_requests);
1021         set->set_max_inflight = UINT_MAX;
1022         set->set_producer     = NULL;
1023         set->set_producer_arg = NULL;
1024         set->set_rc           = 0;
1025
1026         RETURN(set);
1027 }
1028 EXPORT_SYMBOL(ptlrpc_prep_set);
1029
1030 /**
1031  * Allocate and initialize new request set structure with flow control
1032  * extension. This extension allows to control the number of requests in-flight
1033  * for the whole set. A callback function to generate requests must be provided
1034  * and the request set will keep the number of requests sent over the wire to
1035  * @max_inflight.
1036  * Returns a pointer to the newly allocated set structure or NULL on error.
1037  */
1038 struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
1039                                              void *arg)
1040
1041 {
1042         struct ptlrpc_request_set *set;
1043
1044         set = ptlrpc_prep_set();
1045         if (!set)
1046                 RETURN(NULL);
1047
1048         set->set_max_inflight  = max;
1049         set->set_producer      = func;
1050         set->set_producer_arg  = arg;
1051
1052         RETURN(set);
1053 }
1054
1055 /**
1056  * Wind down and free request set structure previously allocated with
1057  * ptlrpc_prep_set.
1058  * Ensures that all requests on the set have completed and removes
1059  * all requests from the request list in a set.
1060  * If any unsent request happen to be on the list, pretends that they got
1061  * an error in flight and calls their completion handler.
1062  */
1063 void ptlrpc_set_destroy(struct ptlrpc_request_set *set)
1064 {
1065         struct list_head *tmp;
1066         struct list_head *next;
1067         int expected_phase;
1068         int n = 0;
1069
1070         ENTRY;
1071
1072         /* Requests on the set should either all be completed, or all be new */
1073         expected_phase = (atomic_read(&set->set_remaining) == 0) ?
1074                          RQ_PHASE_COMPLETE : RQ_PHASE_NEW;
1075         list_for_each(tmp, &set->set_requests) {
1076                 struct ptlrpc_request *req =
1077                         list_entry(tmp, struct ptlrpc_request,
1078                                    rq_set_chain);
1079
1080                 LASSERT(req->rq_phase == expected_phase);
1081                 n++;
1082         }
1083
1084         LASSERTF(atomic_read(&set->set_remaining) == 0 ||
1085                  atomic_read(&set->set_remaining) == n, "%d / %d\n",
1086                  atomic_read(&set->set_remaining), n);
1087
1088         list_for_each_safe(tmp, next, &set->set_requests) {
1089                 struct ptlrpc_request *req =
1090                         list_entry(tmp, struct ptlrpc_request,
1091                                    rq_set_chain);
1092                 list_del_init(&req->rq_set_chain);
1093
1094                 LASSERT(req->rq_phase == expected_phase);
1095
1096                 if (req->rq_phase == RQ_PHASE_NEW) {
1097                         ptlrpc_req_interpret(NULL, req, -EBADR);
1098                         atomic_dec(&set->set_remaining);
1099                 }
1100
1101                 spin_lock(&req->rq_lock);
1102                 req->rq_set = NULL;
1103                 req->rq_invalid_rqset = 0;
1104                 spin_unlock(&req->rq_lock);
1105
1106                 ptlrpc_req_finished(req);
1107         }
1108
1109         LASSERT(atomic_read(&set->set_remaining) == 0);
1110
1111         ptlrpc_reqset_put(set);
1112         EXIT;
1113 }
1114 EXPORT_SYMBOL(ptlrpc_set_destroy);
1115
1116 /**
1117  * Add a new request to the general purpose request set.
1118  * Assumes request reference from the caller.
1119  */
1120 void ptlrpc_set_add_req(struct ptlrpc_request_set *set,
1121                         struct ptlrpc_request *req)
1122 {
1123         LASSERT(req->rq_import->imp_state != LUSTRE_IMP_IDLE);
1124         LASSERT(list_empty(&req->rq_set_chain));
1125
1126         if (req->rq_allow_intr)
1127                 set->set_allow_intr = 1;
1128
1129         /* The set takes over the caller's request reference */
1130         list_add_tail(&req->rq_set_chain, &set->set_requests);
1131         req->rq_set = set;
1132         atomic_inc(&set->set_remaining);
1133         req->rq_queued_time = ktime_get_seconds();
1134
1135         if (req->rq_reqmsg)
1136                 lustre_msg_set_jobid(req->rq_reqmsg, NULL);
1137
1138         if (set->set_producer)
1139                 /*
1140                  * If the request set has a producer callback, the RPC must be
1141                  * sent straight away
1142                  */
1143                 ptlrpc_send_new_req(req);
1144 }
1145 EXPORT_SYMBOL(ptlrpc_set_add_req);
1146
1147 /**
1148  * Add a request to a request with dedicated server thread
1149  * and wake the thread to make any necessary processing.
1150  * Currently only used for ptlrpcd.
1151  */
1152 void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
1153                             struct ptlrpc_request *req)
1154 {
1155         struct ptlrpc_request_set *set = pc->pc_set;
1156         int count, i;
1157
1158         LASSERT(req->rq_set == NULL);
1159         LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0);
1160
1161         spin_lock(&set->set_new_req_lock);
1162         /*
1163          * The set takes over the caller's request reference.
1164          */
1165         req->rq_set = set;
1166         req->rq_queued_time = ktime_get_seconds();
1167         list_add_tail(&req->rq_set_chain, &set->set_new_requests);
1168         count = atomic_inc_return(&set->set_new_count);
1169         spin_unlock(&set->set_new_req_lock);
1170
1171         /* Only need to call wakeup once for the first entry. */
1172         if (count == 1) {
1173                 wake_up(&set->set_waitq);
1174
1175                 /*
1176                  * XXX: It maybe unnecessary to wakeup all the partners. But to
1177                  *      guarantee the async RPC can be processed ASAP, we have
1178                  *      no other better choice. It maybe fixed in future.
1179                  */
1180                 for (i = 0; i < pc->pc_npartners; i++)
1181                         wake_up(&pc->pc_partners[i]->pc_set->set_waitq);
1182         }
1183 }
1184
1185 /**
1186  * Based on the current state of the import, determine if the request
1187  * can be sent, is an error, or should be delayed.
1188  *
1189  * Returns true if this request should be delayed. If false, and
1190  * *status is set, then the request can not be sent and *status is the
1191  * error code.  If false and status is 0, then request can be sent.
1192  *
1193  * The imp->imp_lock must be held.
1194  */
1195 static int ptlrpc_import_delay_req(struct obd_import *imp,
1196                                    struct ptlrpc_request *req, int *status)
1197 {
1198         int delay = 0;
1199
1200         ENTRY;
1201         LASSERT(status);
1202         *status = 0;
1203
1204         if (req->rq_ctx_init || req->rq_ctx_fini) {
1205                 /* always allow ctx init/fini rpc go through */
1206         } else if (imp->imp_state == LUSTRE_IMP_NEW) {
1207                 DEBUG_REQ(D_ERROR, req, "Uninitialized import.");
1208                 *status = -EIO;
1209         } else if (imp->imp_state == LUSTRE_IMP_CLOSED) {
1210                 unsigned int opc = lustre_msg_get_opc(req->rq_reqmsg);
1211
1212                 /*
1213                  * pings or MDS-equivalent STATFS may safely
1214                  * race with umount
1215                  */
1216                 DEBUG_REQ((opc == OBD_PING || opc == OST_STATFS) ?
1217                           D_HA : D_ERROR, req, "IMP_CLOSED ");
1218                 *status = -EIO;
1219         } else if (ptlrpc_send_limit_expired(req)) {
1220                 /* probably doesn't need to be a D_ERROR afterinitial testing */
1221                 DEBUG_REQ(D_HA, req, "send limit expired ");
1222                 *status = -ETIMEDOUT;
1223         } else if (req->rq_send_state == LUSTRE_IMP_CONNECTING &&
1224                    imp->imp_state == LUSTRE_IMP_CONNECTING) {
1225                 ;/* allow CONNECT even if import is invalid */
1226                 if (atomic_read(&imp->imp_inval_count) != 0) {
1227                         DEBUG_REQ(D_ERROR, req, "invalidate in flight");
1228                         *status = -EIO;
1229                 }
1230         } else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) {
1231                 if (!imp->imp_deactive)
1232                         DEBUG_REQ(D_NET, req, "IMP_INVALID");
1233                 *status = -ESHUTDOWN; /* b=12940 */
1234         } else if (req->rq_import_generation != imp->imp_generation) {
1235                 DEBUG_REQ(D_ERROR, req, "req wrong generation:");
1236                 *status = -EIO;
1237         } else if (req->rq_send_state != imp->imp_state) {
1238                 /* invalidate in progress - any requests should be drop */
1239                 if (atomic_read(&imp->imp_inval_count) != 0) {
1240                         DEBUG_REQ(D_ERROR, req, "invalidate in flight");
1241                         *status = -EIO;
1242                 } else if (req->rq_no_delay &&
1243                            imp->imp_generation != imp->imp_initiated_at) {
1244                         /* ignore nodelay for requests initiating connections */
1245                         *status = -EWOULDBLOCK;
1246                 } else if (req->rq_allow_replay &&
1247                            (imp->imp_state == LUSTRE_IMP_REPLAY ||
1248                             imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS ||
1249                             imp->imp_state == LUSTRE_IMP_REPLAY_WAIT ||
1250                             imp->imp_state == LUSTRE_IMP_RECOVER)) {
1251                         DEBUG_REQ(D_HA, req, "allow during recovery.\n");
1252                 } else {
1253                         delay = 1;
1254                 }
1255         }
1256
1257         RETURN(delay);
1258 }
1259
1260 /**
1261  * Decide if the error message should be printed to the console or not.
1262  * Makes its decision based on request type, status, and failure frequency.
1263  *
1264  * \param[in] req  request that failed and may need a console message
1265  *
1266  * \retval false if no message should be printed
1267  * \retval true  if console message should be printed
1268  */
1269 static bool ptlrpc_console_allow(struct ptlrpc_request *req, __u32 opc, int err)
1270 {
1271         LASSERT(req->rq_reqmsg != NULL);
1272
1273         /* Suppress particular reconnect errors which are to be expected. */
1274         if (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT) {
1275                 /* Suppress timed out reconnect requests */
1276                 if (lustre_handle_is_used(&req->rq_import->imp_remote_handle) ||
1277                     req->rq_timedout)
1278                         return false;
1279
1280                 /*
1281                  * Suppress most unavailable/again reconnect requests, but
1282                  * print occasionally so it is clear client is trying to
1283                  * connect to a server where no target is running.
1284                  */
1285                 if ((err == -ENODEV || err == -EAGAIN) &&
1286                     req->rq_import->imp_conn_cnt % 30 != 20)
1287                         return false;
1288         }
1289
1290         if (opc == LDLM_ENQUEUE && err == -EAGAIN)
1291                 /* -EAGAIN is normal when using POSIX flocks */
1292                 return false;
1293
1294         if (opc == OBD_PING && (err == -ENODEV || err == -ENOTCONN) &&
1295             (req->rq_xid & 0xf) != 10)
1296                 /* Suppress most ping requests, they may fail occasionally */
1297                 return false;
1298
1299         return true;
1300 }
1301
1302 /**
1303  * Check request processing status.
1304  * Returns the status.
1305  */
1306 static int ptlrpc_check_status(struct ptlrpc_request *req)
1307 {
1308         int err;
1309
1310         ENTRY;
1311         err = lustre_msg_get_status(req->rq_repmsg);
1312         if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
1313                 struct obd_import *imp = req->rq_import;
1314                 lnet_nid_t nid = imp->imp_connection->c_peer.nid;
1315                 __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
1316
1317                 if (ptlrpc_console_allow(req, opc, err))
1318                         LCONSOLE_ERROR_MSG(0x11,
1319                                            "%s: operation %s to node %s failed: rc = %d\n",
1320                                            imp->imp_obd->obd_name,
1321                                            ll_opcode2str(opc),
1322                                            libcfs_nid2str(nid), err);
1323                 RETURN(err < 0 ? err : -EINVAL);
1324         }
1325
1326         if (err < 0) {
1327                 DEBUG_REQ(D_INFO, req, "status is %d", err);
1328         } else if (err > 0) {
1329                 /* XXX: translate this error from net to host */
1330                 DEBUG_REQ(D_INFO, req, "status is %d", err);
1331         }
1332
1333         RETURN(err);
1334 }
1335
1336 /**
1337  * save pre-versions of objects into request for replay.
1338  * Versions are obtained from server reply.
1339  * used for VBR.
1340  */
1341 static void ptlrpc_save_versions(struct ptlrpc_request *req)
1342 {
1343         struct lustre_msg *repmsg = req->rq_repmsg;
1344         struct lustre_msg *reqmsg = req->rq_reqmsg;
1345         __u64 *versions = lustre_msg_get_versions(repmsg);
1346
1347         ENTRY;
1348         if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)
1349                 return;
1350
1351         LASSERT(versions);
1352         lustre_msg_set_versions(reqmsg, versions);
1353         CDEBUG(D_INFO, "Client save versions [%#llx/%#llx]\n",
1354                versions[0], versions[1]);
1355
1356         EXIT;
1357 }
1358
1359 __u64 ptlrpc_known_replied_xid(struct obd_import *imp)
1360 {
1361         struct ptlrpc_request *req;
1362
1363         assert_spin_locked(&imp->imp_lock);
1364         if (list_empty(&imp->imp_unreplied_list))
1365                 return 0;
1366
1367         req = list_entry(imp->imp_unreplied_list.next, struct ptlrpc_request,
1368                          rq_unreplied_list);
1369         LASSERTF(req->rq_xid >= 1, "XID:%llu\n", req->rq_xid);
1370
1371         if (imp->imp_known_replied_xid < req->rq_xid - 1)
1372                 imp->imp_known_replied_xid = req->rq_xid - 1;
1373
1374         return req->rq_xid - 1;
1375 }
1376
1377 /**
1378  * Callback function called when client receives RPC reply for \a req.
1379  * Returns 0 on success or error code.
1380  * The return alue would be assigned to req->rq_status by the caller
1381  * as request processing status.
1382  * This function also decides if the request needs to be saved for later replay.
1383  */
1384 static int after_reply(struct ptlrpc_request *req)
1385 {
1386         struct obd_import *imp = req->rq_import;
1387         struct obd_device *obd = req->rq_import->imp_obd;
1388         ktime_t work_start;
1389         u64 committed;
1390         s64 timediff;
1391         int rc;
1392
1393         ENTRY;
1394         LASSERT(obd != NULL);
1395         /* repbuf must be unlinked */
1396         LASSERT(!req->rq_receiving_reply && req->rq_reply_unlinked);
1397
1398         if (req->rq_reply_truncated) {
1399                 if (ptlrpc_no_resend(req)) {
1400                         DEBUG_REQ(D_ERROR, req,
1401                                   "reply buffer overflow, expected: %d, actual size: %d",
1402                                   req->rq_nob_received, req->rq_repbuf_len);
1403                         RETURN(-EOVERFLOW);
1404                 }
1405
1406                 sptlrpc_cli_free_repbuf(req);
1407                 /*
1408                  * Pass the required reply buffer size (include
1409                  * space for early reply).
1410                  * NB: no need to roundup because alloc_repbuf
1411                  * will roundup it
1412                  */
1413                 req->rq_replen = req->rq_nob_received;
1414                 req->rq_nob_received = 0;
1415                 spin_lock(&req->rq_lock);
1416                 req->rq_resend       = 1;
1417                 spin_unlock(&req->rq_lock);
1418                 RETURN(0);
1419         }
1420
1421         work_start = ktime_get_real();
1422         timediff = ktime_us_delta(work_start, req->rq_sent_ns);
1423
1424         /*
1425          * NB Until this point, the whole of the incoming message,
1426          * including buflens, status etc is in the sender's byte order.
1427          */
1428         rc = sptlrpc_cli_unwrap_reply(req);
1429         if (rc) {
1430                 DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc);
1431                 RETURN(rc);
1432         }
1433
1434         /*
1435          * Security layer unwrap might ask resend this request.
1436          */
1437         if (req->rq_resend)
1438                 RETURN(0);
1439
1440         rc = unpack_reply(req);
1441         if (rc)
1442                 RETURN(rc);
1443
1444         /* retry indefinitely on EINPROGRESS */
1445         if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS &&
1446             ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) {
1447                 time64_t now = ktime_get_real_seconds();
1448
1449                 DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS");
1450                 spin_lock(&req->rq_lock);
1451                 req->rq_resend = 1;
1452                 spin_unlock(&req->rq_lock);
1453                 req->rq_nr_resend++;
1454
1455                 /* Readjust the timeout for current conditions */
1456                 ptlrpc_at_set_req_timeout(req);
1457                 /*
1458                  * delay resend to give a chance to the server to get ready.
1459                  * The delay is increased by 1s on every resend and is capped to
1460                  * the current request timeout (i.e. obd_timeout if AT is off,
1461                  * or AT service time x 125% + 5s, see at_est2timeout)
1462                  */
1463                 if (req->rq_nr_resend > req->rq_timeout)
1464                         req->rq_sent = now + req->rq_timeout;
1465                 else
1466                         req->rq_sent = now + req->rq_nr_resend;
1467
1468                 /* Resend for EINPROGRESS will use a new XID */
1469                 spin_lock(&imp->imp_lock);
1470                 list_del_init(&req->rq_unreplied_list);
1471                 spin_unlock(&imp->imp_lock);
1472
1473                 RETURN(0);
1474         }
1475
1476         if (obd->obd_svc_stats) {
1477                 lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR,
1478                                     timediff);
1479                 ptlrpc_lprocfs_rpc_sent(req, timediff);
1480         }
1481
1482         if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY &&
1483             lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) {
1484                 DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)",
1485                           lustre_msg_get_type(req->rq_repmsg));
1486                 RETURN(-EPROTO);
1487         }
1488
1489         if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING)
1490                 CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val);
1491         ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg));
1492         ptlrpc_at_adj_net_latency(req,
1493                                   lustre_msg_get_service_time(req->rq_repmsg));
1494
1495         rc = ptlrpc_check_status(req);
1496
1497         if (rc) {
1498                 /*
1499                  * Either we've been evicted, or the server has failed for
1500                  * some reason. Try to reconnect, and if that fails, punt to
1501                  * the upcall.
1502                  */
1503                 if (ptlrpc_recoverable_error(rc)) {
1504                         if (req->rq_send_state != LUSTRE_IMP_FULL ||
1505                             imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) {
1506                                 RETURN(rc);
1507                         }
1508                         ptlrpc_request_handle_notconn(req);
1509                         RETURN(rc);
1510                 }
1511         } else {
1512                 /*
1513                  * Let's look if server sent slv. Do it only for RPC with
1514                  * rc == 0.
1515                  */
1516                 ldlm_cli_update_pool(req);
1517         }
1518
1519         /*
1520          * Store transno in reqmsg for replay.
1521          */
1522         if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) {
1523                 req->rq_transno = lustre_msg_get_transno(req->rq_repmsg);
1524                 lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno);
1525         }
1526
1527         if (imp->imp_replayable) {
1528                 spin_lock(&imp->imp_lock);
1529                 /*
1530                  * No point in adding already-committed requests to the replay
1531                  * list, we will just remove them immediately. b=9829
1532                  */
1533                 if (req->rq_transno != 0 &&
1534                     (req->rq_transno >
1535                      lustre_msg_get_last_committed(req->rq_repmsg) ||
1536                      req->rq_replay)) {
1537                         /** version recovery */
1538                         ptlrpc_save_versions(req);
1539                         ptlrpc_retain_replayable_request(req, imp);
1540                 } else if (req->rq_commit_cb &&
1541                            list_empty(&req->rq_replay_list)) {
1542                         /*
1543                          * NB: don't call rq_commit_cb if it's already on
1544                          * rq_replay_list, ptlrpc_free_committed() will call
1545                          * it later, see LU-3618 for details
1546                          */
1547                         spin_unlock(&imp->imp_lock);
1548                         req->rq_commit_cb(req);
1549                         spin_lock(&imp->imp_lock);
1550                 }
1551
1552                 /*
1553                  * Replay-enabled imports return commit-status information.
1554                  */
1555                 committed = lustre_msg_get_last_committed(req->rq_repmsg);
1556                 if (likely(committed > imp->imp_peer_committed_transno))
1557                         imp->imp_peer_committed_transno = committed;
1558
1559                 ptlrpc_free_committed(imp);
1560
1561                 if (!list_empty(&imp->imp_replay_list)) {
1562                         struct ptlrpc_request *last;
1563
1564                         last = list_entry(imp->imp_replay_list.prev,
1565                                           struct ptlrpc_request,
1566                                           rq_replay_list);
1567                         /*
1568                          * Requests with rq_replay stay on the list even if no
1569                          * commit is expected.
1570                          */
1571                         if (last->rq_transno > imp->imp_peer_committed_transno)
1572                                 ptlrpc_pinger_commit_expected(imp);
1573                 }
1574
1575                 spin_unlock(&imp->imp_lock);
1576         }
1577
1578         RETURN(rc);
1579 }
1580
1581 /**
1582  * Helper function to send request \a req over the network for the first time
1583  * Also adjusts request phase.
1584  * Returns 0 on success or error code.
1585  */
1586 static int ptlrpc_send_new_req(struct ptlrpc_request *req)
1587 {
1588         struct obd_import *imp = req->rq_import;
1589         __u64 min_xid = 0;
1590         int rc;
1591
1592         ENTRY;
1593         LASSERT(req->rq_phase == RQ_PHASE_NEW);
1594
1595         /* do not try to go further if there is not enough memory in enc_pool */
1596         if (req->rq_sent && req->rq_bulk)
1597                 if (req->rq_bulk->bd_iov_count > get_free_pages_in_pool() &&
1598                     pool_is_at_full_capacity())
1599                         RETURN(-ENOMEM);
1600
1601         if (req->rq_sent && (req->rq_sent > ktime_get_real_seconds()) &&
1602             (!req->rq_generation_set ||
1603              req->rq_import_generation == imp->imp_generation))
1604                 RETURN(0);
1605
1606         ptlrpc_rqphase_move(req, RQ_PHASE_RPC);
1607
1608         spin_lock(&imp->imp_lock);
1609
1610         LASSERT(req->rq_xid != 0);
1611         LASSERT(!list_empty(&req->rq_unreplied_list));
1612
1613         if (!req->rq_generation_set)
1614                 req->rq_import_generation = imp->imp_generation;
1615
1616         if (ptlrpc_import_delay_req(imp, req, &rc)) {
1617                 spin_lock(&req->rq_lock);
1618                 req->rq_waiting = 1;
1619                 spin_unlock(&req->rq_lock);
1620
1621                 DEBUG_REQ(D_HA, req, "req waiting for recovery: (%s != %s)",
1622                           ptlrpc_import_state_name(req->rq_send_state),
1623                           ptlrpc_import_state_name(imp->imp_state));
1624                 LASSERT(list_empty(&req->rq_list));
1625                 list_add_tail(&req->rq_list, &imp->imp_delayed_list);
1626                 atomic_inc(&req->rq_import->imp_inflight);
1627                 spin_unlock(&imp->imp_lock);
1628                 RETURN(0);
1629         }
1630
1631         if (rc != 0) {
1632                 spin_unlock(&imp->imp_lock);
1633                 req->rq_status = rc;
1634                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
1635                 RETURN(rc);
1636         }
1637
1638         LASSERT(list_empty(&req->rq_list));
1639         list_add_tail(&req->rq_list, &imp->imp_sending_list);
1640         atomic_inc(&req->rq_import->imp_inflight);
1641
1642         /*
1643          * find the known replied XID from the unreplied list, CONNECT
1644          * and DISCONNECT requests are skipped to make the sanity check
1645          * on server side happy. see process_req_last_xid().
1646          *
1647          * For CONNECT: Because replay requests have lower XID, it'll
1648          * break the sanity check if CONNECT bump the exp_last_xid on
1649          * server.
1650          *
1651          * For DISCONNECT: Since client will abort inflight RPC before
1652          * sending DISCONNECT, DISCONNECT may carry an XID which higher
1653          * than the inflight RPC.
1654          */
1655         if (!ptlrpc_req_is_connect(req) && !ptlrpc_req_is_disconnect(req))
1656                 min_xid = ptlrpc_known_replied_xid(imp);
1657         spin_unlock(&imp->imp_lock);
1658
1659         lustre_msg_set_last_xid(req->rq_reqmsg, min_xid);
1660
1661         lustre_msg_set_status(req->rq_reqmsg, current_pid());
1662
1663         rc = sptlrpc_req_refresh_ctx(req, -1);
1664         if (rc) {
1665                 if (req->rq_err) {
1666                         req->rq_status = rc;
1667                         RETURN(1);
1668                 } else {
1669                         spin_lock(&req->rq_lock);
1670                         req->rq_wait_ctx = 1;
1671                         spin_unlock(&req->rq_lock);
1672                         RETURN(0);
1673                 }
1674         }
1675
1676         CDEBUG(D_RPCTRACE,
1677                "Sending RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n",
1678                current_comm(),
1679                imp->imp_obd->obd_uuid.uuid,
1680                lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
1681                obd_import_nid2str(imp), lustre_msg_get_opc(req->rq_reqmsg));
1682
1683         rc = ptl_send_rpc(req, 0);
1684         if (rc == -ENOMEM) {
1685                 spin_lock(&imp->imp_lock);
1686                 if (!list_empty(&req->rq_list)) {
1687                         list_del_init(&req->rq_list);
1688                         atomic_dec(&req->rq_import->imp_inflight);
1689                 }
1690                 spin_unlock(&imp->imp_lock);
1691                 ptlrpc_rqphase_move(req, RQ_PHASE_NEW);
1692                 RETURN(rc);
1693         }
1694         if (rc) {
1695                 DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc);
1696                 spin_lock(&req->rq_lock);
1697                 req->rq_net_err = 1;
1698                 spin_unlock(&req->rq_lock);
1699                 RETURN(rc);
1700         }
1701         RETURN(0);
1702 }
1703
1704 static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set)
1705 {
1706         int remaining, rc;
1707
1708         ENTRY;
1709         LASSERT(set->set_producer != NULL);
1710
1711         remaining = atomic_read(&set->set_remaining);
1712
1713         /*
1714          * populate the ->set_requests list with requests until we
1715          * reach the maximum number of RPCs in flight for this set
1716          */
1717         while (atomic_read(&set->set_remaining) < set->set_max_inflight) {
1718                 rc = set->set_producer(set, set->set_producer_arg);
1719                 if (rc == -ENOENT) {
1720                         /* no more RPC to produce */
1721                         set->set_producer     = NULL;
1722                         set->set_producer_arg = NULL;
1723                         RETURN(0);
1724                 }
1725         }
1726
1727         RETURN((atomic_read(&set->set_remaining) - remaining));
1728 }
1729
1730 /**
1731  * this sends any unsent RPCs in \a set and returns 1 if all are sent
1732  * and no more replies are expected.
1733  * (it is possible to get less replies than requests sent e.g. due to timed out
1734  * requests or requests that we had trouble to send out)
1735  *
1736  * NOTE: This function contains a potential schedule point (cond_resched()).
1737  */
1738 int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
1739 {
1740         struct list_head *tmp, *next;
1741         struct list_head  comp_reqs;
1742         int force_timer_recalc = 0;
1743
1744         ENTRY;
1745         if (atomic_read(&set->set_remaining) == 0)
1746                 RETURN(1);
1747
1748         INIT_LIST_HEAD(&comp_reqs);
1749         list_for_each_safe(tmp, next, &set->set_requests) {
1750                 struct ptlrpc_request *req =
1751                         list_entry(tmp, struct ptlrpc_request,
1752                                    rq_set_chain);
1753                 struct obd_import *imp = req->rq_import;
1754                 int unregistered = 0;
1755                 int async = 1;
1756                 int rc = 0;
1757
1758                 if (req->rq_phase == RQ_PHASE_COMPLETE) {
1759                         list_move_tail(&req->rq_set_chain, &comp_reqs);
1760                         continue;
1761                 }
1762
1763                 /*
1764                  * This schedule point is mainly for the ptlrpcd caller of this
1765                  * function.  Most ptlrpc sets are not long-lived and unbounded
1766                  * in length, but at the least the set used by the ptlrpcd is.
1767                  * Since the processing time is unbounded, we need to insert an
1768                  * explicit schedule point to make the thread well-behaved.
1769                  */
1770                 cond_resched();
1771
1772                 /*
1773                  * If the caller requires to allow to be interpreted by force
1774                  * and it has really been interpreted, then move the request
1775                  * to RQ_PHASE_INTERPRET phase in spite of what the current
1776                  * phase is.
1777                  */
1778                 if (unlikely(req->rq_allow_intr && req->rq_intr)) {
1779                         req->rq_status = -EINTR;
1780                         ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
1781
1782                         /*
1783                          * Since it is interpreted and we have to wait for
1784                          * the reply to be unlinked, then use sync mode.
1785                          */
1786                         async = 0;
1787
1788                         GOTO(interpret, req->rq_status);
1789                 }
1790
1791                 if (req->rq_phase == RQ_PHASE_NEW && ptlrpc_send_new_req(req))
1792                         force_timer_recalc = 1;
1793
1794                 /* delayed send - skip */
1795                 if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent)
1796                         continue;
1797
1798                 /* delayed resend - skip */
1799                 if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend &&
1800                     req->rq_sent > ktime_get_real_seconds())
1801                         continue;
1802
1803                 if (!(req->rq_phase == RQ_PHASE_RPC ||
1804                       req->rq_phase == RQ_PHASE_BULK ||
1805                       req->rq_phase == RQ_PHASE_INTERPRET ||
1806                       req->rq_phase == RQ_PHASE_UNREG_RPC ||
1807                       req->rq_phase == RQ_PHASE_UNREG_BULK)) {
1808                         DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase);
1809                         LBUG();
1810                 }
1811
1812                 if (req->rq_phase == RQ_PHASE_UNREG_RPC ||
1813                     req->rq_phase == RQ_PHASE_UNREG_BULK) {
1814                         LASSERT(req->rq_next_phase != req->rq_phase);
1815                         LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED);
1816
1817                         if (req->rq_req_deadline &&
1818                             !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REQ_UNLINK))
1819                                 req->rq_req_deadline = 0;
1820                         if (req->rq_reply_deadline &&
1821                             !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK))
1822                                 req->rq_reply_deadline = 0;
1823                         if (req->rq_bulk_deadline &&
1824                             !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK))
1825                                 req->rq_bulk_deadline = 0;
1826
1827                         /*
1828                          * Skip processing until reply is unlinked. We
1829                          * can't return to pool before that and we can't
1830                          * call interpret before that. We need to make
1831                          * sure that all rdma transfers finished and will
1832                          * not corrupt any data.
1833                          */
1834                         if (req->rq_phase == RQ_PHASE_UNREG_RPC &&
1835                             ptlrpc_client_recv_or_unlink(req))
1836                                 continue;
1837                         if (req->rq_phase == RQ_PHASE_UNREG_BULK &&
1838                             ptlrpc_client_bulk_active(req))
1839                                 continue;
1840
1841                         /*
1842                          * Turn fail_loc off to prevent it from looping
1843                          * forever.
1844                          */
1845                         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
1846                                 OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK,
1847                                                      OBD_FAIL_ONCE);
1848                         }
1849                         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) {
1850                                 OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK,
1851                                                      OBD_FAIL_ONCE);
1852                         }
1853
1854                         /*
1855                          * Move to next phase if reply was successfully
1856                          * unlinked.
1857                          */
1858                         ptlrpc_rqphase_move(req, req->rq_next_phase);
1859                 }
1860
1861                 if (req->rq_phase == RQ_PHASE_INTERPRET)
1862                         GOTO(interpret, req->rq_status);
1863
1864                 /*
1865                  * Note that this also will start async reply unlink.
1866                  */
1867                 if (req->rq_net_err && !req->rq_timedout) {
1868                         ptlrpc_expire_one_request(req, 1);
1869
1870                         /*
1871                          * Check if we still need to wait for unlink.
1872                          */
1873                         if (ptlrpc_client_recv_or_unlink(req) ||
1874                             ptlrpc_client_bulk_active(req))
1875                                 continue;
1876                         /* If there is no need to resend, fail it now. */
1877                         if (req->rq_no_resend) {
1878                                 if (req->rq_status == 0)
1879                                         req->rq_status = -EIO;
1880                                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
1881                                 GOTO(interpret, req->rq_status);
1882                         } else {
1883                                 continue;
1884                         }
1885                 }
1886
1887                 if (req->rq_err) {
1888                         spin_lock(&req->rq_lock);
1889                         req->rq_replied = 0;
1890                         spin_unlock(&req->rq_lock);
1891                         if (req->rq_status == 0)
1892                                 req->rq_status = -EIO;
1893                         ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
1894                         GOTO(interpret, req->rq_status);
1895                 }
1896
1897                 /*
1898                  * ptlrpc_set_wait->l_wait_event sets lwi_allow_intr
1899                  * so it sets rq_intr regardless of individual rpc
1900                  * timeouts. The synchronous IO waiting path sets
1901                  * rq_intr irrespective of whether ptlrpcd
1902                  * has seen a timeout.  Our policy is to only interpret
1903                  * interrupted rpcs after they have timed out, so we
1904                  * need to enforce that here.
1905                  */
1906
1907                 if (req->rq_intr && (req->rq_timedout || req->rq_waiting ||
1908                                      req->rq_wait_ctx)) {
1909                         req->rq_status = -EINTR;
1910                         ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
1911                         GOTO(interpret, req->rq_status);
1912                 }
1913
1914                 if (req->rq_phase == RQ_PHASE_RPC) {
1915                         if (req->rq_timedout || req->rq_resend ||
1916                             req->rq_waiting || req->rq_wait_ctx) {
1917                                 int status;
1918
1919                                 if (!ptlrpc_unregister_reply(req, 1)) {
1920                                         ptlrpc_unregister_bulk(req, 1);
1921                                         continue;
1922                                 }
1923
1924                                 spin_lock(&imp->imp_lock);
1925                                 if (ptlrpc_import_delay_req(imp, req,
1926                                                             &status)) {
1927                                         /*
1928                                          * put on delay list - only if we wait
1929                                          * recovery finished - before send
1930                                          */
1931                                         list_del_init(&req->rq_list);
1932                                         list_add_tail(&req->rq_list,
1933                                                       &imp->imp_delayed_list);
1934                                         spin_unlock(&imp->imp_lock);
1935                                         continue;
1936                                 }
1937
1938                                 if (status != 0)  {
1939                                         req->rq_status = status;
1940                                         ptlrpc_rqphase_move(req,
1941                                                             RQ_PHASE_INTERPRET);
1942                                         spin_unlock(&imp->imp_lock);
1943                                         GOTO(interpret, req->rq_status);
1944                                 }
1945                                 /* ignore on just initiated connections */
1946                                 if (ptlrpc_no_resend(req) &&
1947                                     !req->rq_wait_ctx &&
1948                                     imp->imp_generation !=
1949                                     imp->imp_initiated_at) {
1950                                         req->rq_status = -ENOTCONN;
1951                                         ptlrpc_rqphase_move(req,
1952                                                             RQ_PHASE_INTERPRET);
1953                                         spin_unlock(&imp->imp_lock);
1954                                         GOTO(interpret, req->rq_status);
1955                                 }
1956
1957                                 list_del_init(&req->rq_list);
1958                                 list_add_tail(&req->rq_list,
1959                                               &imp->imp_sending_list);
1960
1961                                 spin_unlock(&imp->imp_lock);
1962
1963                                 spin_lock(&req->rq_lock);
1964                                 req->rq_waiting = 0;
1965                                 spin_unlock(&req->rq_lock);
1966
1967                                 if (req->rq_timedout || req->rq_resend) {
1968                                         /*
1969                                          * This is re-sending anyways,
1970                                          * let's mark req as resend.
1971                                          */
1972                                         spin_lock(&req->rq_lock);
1973                                         req->rq_resend = 1;
1974                                         spin_unlock(&req->rq_lock);
1975                                 }
1976                                 /*
1977                                  * rq_wait_ctx is only touched by ptlrpcd,
1978                                  * so no lock is needed here.
1979                                  */
1980                                 status = sptlrpc_req_refresh_ctx(req, -1);
1981                                 if (status) {
1982                                         if (req->rq_err) {
1983                                                 req->rq_status = status;
1984                                                 spin_lock(&req->rq_lock);
1985                                                 req->rq_wait_ctx = 0;
1986                                                 spin_unlock(&req->rq_lock);
1987                                                 force_timer_recalc = 1;
1988                                         } else {
1989                                                 spin_lock(&req->rq_lock);
1990                                                 req->rq_wait_ctx = 1;
1991                                                 spin_unlock(&req->rq_lock);
1992                                         }
1993
1994                                         continue;
1995                                 } else {
1996                                         spin_lock(&req->rq_lock);
1997                                         req->rq_wait_ctx = 0;
1998                                         spin_unlock(&req->rq_lock);
1999                                 }
2000
2001                                 /*
2002                                  * In any case, the previous bulk should be
2003                                  * cleaned up to prepare for the new sending
2004                                  */
2005                                 if (req->rq_bulk &&
2006                                     !ptlrpc_unregister_bulk(req, 1))
2007                                         continue;
2008
2009                                 rc = ptl_send_rpc(req, 0);
2010                                 if (rc == -ENOMEM) {
2011                                         spin_lock(&imp->imp_lock);
2012                                         if (!list_empty(&req->rq_list))
2013                                                 list_del_init(&req->rq_list);
2014                                         spin_unlock(&imp->imp_lock);
2015                                         ptlrpc_rqphase_move(req, RQ_PHASE_NEW);
2016                                         continue;
2017                                 }
2018                                 if (rc) {
2019                                         DEBUG_REQ(D_HA, req,
2020                                                   "send failed: rc = %d", rc);
2021                                         force_timer_recalc = 1;
2022                                         spin_lock(&req->rq_lock);
2023                                         req->rq_net_err = 1;
2024                                         spin_unlock(&req->rq_lock);
2025                                         continue;
2026                                 }
2027                                 /* need to reset the timeout */
2028                                 force_timer_recalc = 1;
2029                         }
2030
2031                         spin_lock(&req->rq_lock);
2032
2033                         if (ptlrpc_client_early(req)) {
2034                                 ptlrpc_at_recv_early_reply(req);
2035                                 spin_unlock(&req->rq_lock);
2036                                 continue;
2037                         }
2038
2039                         /* Still waiting for a reply? */
2040                         if (ptlrpc_client_recv(req)) {
2041                                 spin_unlock(&req->rq_lock);
2042                                 continue;
2043                         }
2044
2045                         /* Did we actually receive a reply? */
2046                         if (!ptlrpc_client_replied(req)) {
2047                                 spin_unlock(&req->rq_lock);
2048                                 continue;
2049                         }
2050
2051                         spin_unlock(&req->rq_lock);
2052
2053                         /*
2054                          * unlink from net because we are going to
2055                          * swab in-place of reply buffer
2056                          */
2057                         unregistered = ptlrpc_unregister_reply(req, 1);
2058                         if (!unregistered)
2059                                 continue;
2060
2061                         req->rq_status = after_reply(req);
2062                         if (req->rq_resend)
2063                                 continue;
2064
2065                         /*
2066                          * If there is no bulk associated with this request,
2067                          * then we're done and should let the interpreter
2068                          * process the reply. Similarly if the RPC returned
2069                          * an error, and therefore the bulk will never arrive.
2070                          */
2071                         if (!req->rq_bulk || req->rq_status < 0) {
2072                                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
2073                                 GOTO(interpret, req->rq_status);
2074                         }
2075
2076                         ptlrpc_rqphase_move(req, RQ_PHASE_BULK);
2077                 }
2078
2079                 LASSERT(req->rq_phase == RQ_PHASE_BULK);
2080                 if (ptlrpc_client_bulk_active(req))
2081                         continue;
2082
2083                 if (req->rq_bulk->bd_failure) {
2084                         /*
2085                          * The RPC reply arrived OK, but the bulk screwed
2086                          * up!  Dead weird since the server told us the RPC
2087                          * was good after getting the REPLY for her GET or
2088                          * the ACK for her PUT.
2089                          */
2090                         DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
2091                         req->rq_status = -EIO;
2092                 }
2093
2094                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
2095
2096 interpret:
2097                 LASSERT(req->rq_phase == RQ_PHASE_INTERPRET);
2098
2099                 /*
2100                  * This moves to "unregistering" phase we need to wait for
2101                  * reply unlink.
2102                  */
2103                 if (!unregistered && !ptlrpc_unregister_reply(req, async)) {
2104                         /* start async bulk unlink too */
2105                         ptlrpc_unregister_bulk(req, 1);
2106                         continue;
2107                 }
2108
2109                 if (!ptlrpc_unregister_bulk(req, async))
2110                         continue;
2111
2112                 /*
2113                  * When calling interpret receiving already should be
2114                  * finished.
2115                  */
2116                 LASSERT(!req->rq_receiving_reply);
2117
2118                 ptlrpc_req_interpret(env, req, req->rq_status);
2119
2120                 if (ptlrpcd_check_work(req)) {
2121                         atomic_dec(&set->set_remaining);
2122                         continue;
2123                 }
2124                 ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE);
2125
2126                 if (req->rq_reqmsg)
2127                         CDEBUG(D_RPCTRACE,
2128                                "Completed RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n",
2129                                current_comm(),
2130                                imp->imp_obd->obd_uuid.uuid,
2131                                lustre_msg_get_status(req->rq_reqmsg),
2132                                req->rq_xid,
2133                                obd_import_nid2str(imp),
2134                                lustre_msg_get_opc(req->rq_reqmsg));
2135
2136                 spin_lock(&imp->imp_lock);
2137                 /*
2138                  * Request already may be not on sending or delaying list. This
2139                  * may happen in the case of marking it erroneous for the case
2140                  * ptlrpc_import_delay_req(req, status) find it impossible to
2141                  * allow sending this rpc and returns *status != 0.
2142                  */
2143                 if (!list_empty(&req->rq_list)) {
2144                         list_del_init(&req->rq_list);
2145                         atomic_dec(&imp->imp_inflight);
2146                 }
2147                 list_del_init(&req->rq_unreplied_list);
2148                 spin_unlock(&imp->imp_lock);
2149
2150                 atomic_dec(&set->set_remaining);
2151                 wake_up_all(&imp->imp_recovery_waitq);
2152
2153                 if (set->set_producer) {
2154                         /* produce a new request if possible */
2155                         if (ptlrpc_set_producer(set) > 0)
2156                                 force_timer_recalc = 1;
2157
2158                         /*
2159                          * free the request that has just been completed
2160                          * in order not to pollute set->set_requests
2161                          */
2162                         list_del_init(&req->rq_set_chain);
2163                         spin_lock(&req->rq_lock);
2164                         req->rq_set = NULL;
2165                         req->rq_invalid_rqset = 0;
2166                         spin_unlock(&req->rq_lock);
2167
2168                         /* record rq_status to compute the final status later */
2169                         if (req->rq_status != 0)
2170                                 set->set_rc = req->rq_status;
2171                         ptlrpc_req_finished(req);
2172                 } else {
2173                         list_move_tail(&req->rq_set_chain, &comp_reqs);
2174                 }
2175         }
2176
2177         /*
2178          * move completed request at the head of list so it's easier for
2179          * caller to find them
2180          */
2181         list_splice(&comp_reqs, &set->set_requests);
2182
2183         /* If we hit an error, we want to recover promptly. */
2184         RETURN(atomic_read(&set->set_remaining) == 0 || force_timer_recalc);
2185 }
2186 EXPORT_SYMBOL(ptlrpc_check_set);
2187
2188 /**
2189  * Time out request \a req. is \a async_unlink is set, that means do not wait
2190  * until LNet actually confirms network buffer unlinking.
2191  * Return 1 if we should give up further retrying attempts or 0 otherwise.
2192  */
2193 int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink)
2194 {
2195         struct obd_import *imp = req->rq_import;
2196         unsigned int debug_mask = D_RPCTRACE;
2197         int rc = 0;
2198
2199         ENTRY;
2200         spin_lock(&req->rq_lock);
2201         req->rq_timedout = 1;
2202         spin_unlock(&req->rq_lock);
2203
2204         if (ptlrpc_console_allow(req, lustre_msg_get_opc(req->rq_reqmsg),
2205                                  lustre_msg_get_status(req->rq_reqmsg)))
2206                 debug_mask = D_WARNING;
2207         DEBUG_REQ(debug_mask, req, "Request sent has %s: [sent %lld/real %lld]",
2208                   req->rq_net_err ? "failed due to network error" :
2209                      ((req->rq_real_sent == 0 ||
2210                        req->rq_real_sent < req->rq_sent ||
2211                        req->rq_real_sent >= req->rq_deadline) ?
2212                       "timed out for sent delay" : "timed out for slow reply"),
2213                   (s64)req->rq_sent, (s64)req->rq_real_sent);
2214
2215         if (imp && obd_debug_peer_on_timeout)
2216                 LNetDebugPeer(imp->imp_connection->c_peer);
2217
2218         ptlrpc_unregister_reply(req, async_unlink);
2219         ptlrpc_unregister_bulk(req, async_unlink);
2220
2221         if (obd_dump_on_timeout)
2222                 libcfs_debug_dumplog();
2223
2224         if (!imp) {
2225                 DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?");
2226                 RETURN(1);
2227         }
2228
2229         atomic_inc(&imp->imp_timeouts);
2230
2231         /* The DLM server doesn't want recovery run on its imports. */
2232         if (imp->imp_dlm_fake)
2233                 RETURN(1);
2234
2235         /*
2236          * If this request is for recovery or other primordial tasks,
2237          * then error it out here.
2238          */
2239         if (req->rq_ctx_init || req->rq_ctx_fini ||
2240             req->rq_send_state != LUSTRE_IMP_FULL ||
2241             imp->imp_obd->obd_no_recov) {
2242                 DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)",
2243                           ptlrpc_import_state_name(req->rq_send_state),
2244                           ptlrpc_import_state_name(imp->imp_state));
2245                 spin_lock(&req->rq_lock);
2246                 req->rq_status = -ETIMEDOUT;
2247                 req->rq_err = 1;
2248                 spin_unlock(&req->rq_lock);
2249                 RETURN(1);
2250         }
2251
2252         /*
2253          * if a request can't be resent we can't wait for an answer after
2254          * the timeout
2255          */
2256         if (ptlrpc_no_resend(req)) {
2257                 DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:");
2258                 rc = 1;
2259         }
2260
2261         ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg));
2262
2263         RETURN(rc);
2264 }
2265
2266 /**
2267  * Time out all uncompleted requests in request set pointed by \a data
2268  * Callback used when waiting on sets with l_wait_event.
2269  * Always returns 1.
2270  */
2271 int ptlrpc_expired_set(void *data)
2272 {
2273         struct ptlrpc_request_set *set = data;
2274         struct list_head *tmp;
2275         time64_t now = ktime_get_real_seconds();
2276
2277         ENTRY;
2278         LASSERT(set != NULL);
2279
2280         /*
2281          * A timeout expired. See which reqs it applies to...
2282          */
2283         list_for_each(tmp, &set->set_requests) {
2284                 struct ptlrpc_request *req =
2285                         list_entry(tmp, struct ptlrpc_request,
2286                                    rq_set_chain);
2287
2288                 /* don't expire request waiting for context */
2289                 if (req->rq_wait_ctx)
2290                         continue;
2291
2292                 /* Request in-flight? */
2293                 if (!((req->rq_phase == RQ_PHASE_RPC &&
2294                        !req->rq_waiting && !req->rq_resend) ||
2295                       (req->rq_phase == RQ_PHASE_BULK)))
2296                         continue;
2297
2298                 if (req->rq_timedout ||     /* already dealt with */
2299                     req->rq_deadline > now) /* not expired */
2300                         continue;
2301
2302                 /*
2303                  * Deal with this guy. Do it asynchronously to not block
2304                  * ptlrpcd thread.
2305                  */
2306                 ptlrpc_expire_one_request(req, 1);
2307         }
2308
2309         /*
2310          * When waiting for a whole set, we always break out of the
2311          * sleep so we can recalculate the timeout, or enable interrupts
2312          * if everyone's timed out.
2313          */
2314         RETURN(1);
2315 }
2316
2317 /**
2318  * Sets rq_intr flag in \a req under spinlock.
2319  */
2320 void ptlrpc_mark_interrupted(struct ptlrpc_request *req)
2321 {
2322         spin_lock(&req->rq_lock);
2323         req->rq_intr = 1;
2324         spin_unlock(&req->rq_lock);
2325 }
2326 EXPORT_SYMBOL(ptlrpc_mark_interrupted);
2327
2328 /**
2329  * Interrupts (sets interrupted flag) all uncompleted requests in
2330  * a set \a data. Callback for l_wait_event for interruptible waits.
2331  */
2332 static void ptlrpc_interrupted_set(void *data)
2333 {
2334         struct ptlrpc_request_set *set = data;
2335         struct list_head *tmp;
2336
2337         LASSERT(set != NULL);
2338         CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set);
2339
2340         list_for_each(tmp, &set->set_requests) {
2341                 struct ptlrpc_request *req =
2342                         list_entry(tmp, struct ptlrpc_request, rq_set_chain);
2343
2344                 if (req->rq_intr)
2345                         continue;
2346
2347                 if (req->rq_phase != RQ_PHASE_RPC &&
2348                     req->rq_phase != RQ_PHASE_UNREG_RPC &&
2349                     !req->rq_allow_intr)
2350                         continue;
2351
2352                 ptlrpc_mark_interrupted(req);
2353         }
2354 }
2355
2356 /**
2357  * Get the smallest timeout in the set; this does NOT set a timeout.
2358  */
2359 time64_t ptlrpc_set_next_timeout(struct ptlrpc_request_set *set)
2360 {
2361         struct list_head *tmp;
2362         time64_t now = ktime_get_real_seconds();
2363         int timeout = 0;
2364         struct ptlrpc_request *req;
2365         time64_t deadline;
2366
2367         ENTRY;
2368         list_for_each(tmp, &set->set_requests) {
2369                 req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
2370
2371                 /* Request in-flight? */
2372                 if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) ||
2373                       (req->rq_phase == RQ_PHASE_BULK) ||
2374                       (req->rq_phase == RQ_PHASE_NEW)))
2375                         continue;
2376
2377                 /* Already timed out. */
2378                 if (req->rq_timedout)
2379                         continue;
2380
2381                 /* Waiting for ctx. */
2382                 if (req->rq_wait_ctx)
2383                         continue;
2384
2385                 if (req->rq_phase == RQ_PHASE_NEW)
2386                         deadline = req->rq_sent;
2387                 else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend)
2388                         deadline = req->rq_sent;
2389                 else
2390                         deadline = req->rq_sent + req->rq_timeout;
2391
2392                 if (deadline <= now)    /* actually expired already */
2393                         timeout = 1;    /* ASAP */
2394                 else if (timeout == 0 || timeout > deadline - now)
2395                         timeout = deadline - now;
2396         }
2397         RETURN(timeout);
2398 }
2399
2400 /**
2401  * Send all unset request from the set and then wait untill all
2402  * requests in the set complete (either get a reply, timeout, get an
2403  * error or otherwise be interrupted).
2404  * Returns 0 on success or error code otherwise.
2405  */
2406 int ptlrpc_set_wait(const struct lu_env *env, struct ptlrpc_request_set *set)
2407 {
2408         struct list_head *tmp;
2409         struct ptlrpc_request *req;
2410         struct l_wait_info lwi;
2411         time64_t timeout;
2412         int rc;
2413
2414         ENTRY;
2415         if (set->set_producer)
2416                 (void)ptlrpc_set_producer(set);
2417         else
2418                 list_for_each(tmp, &set->set_requests) {
2419                         req = list_entry(tmp, struct ptlrpc_request,
2420                                          rq_set_chain);
2421                         if (req->rq_phase == RQ_PHASE_NEW)
2422                                 (void)ptlrpc_send_new_req(req);
2423                 }
2424
2425         if (list_empty(&set->set_requests))
2426                 RETURN(0);
2427
2428         do {
2429                 timeout = ptlrpc_set_next_timeout(set);
2430
2431                 /*
2432                  * wait until all complete, interrupted, or an in-flight
2433                  * req times out
2434                  */
2435                 CDEBUG(D_RPCTRACE, "set %p going to sleep for %lld seconds\n",
2436                        set, timeout);
2437
2438                 if ((timeout == 0 && !signal_pending(current)) ||
2439                     set->set_allow_intr)
2440                         /*
2441                          * No requests are in-flight (ether timed out
2442                          * or delayed), so we can allow interrupts.
2443                          * We still want to block for a limited time,
2444                          * so we allow interrupts during the timeout.
2445                          */
2446                         lwi = LWI_TIMEOUT_INTR_ALL(
2447                                         cfs_time_seconds(timeout ? timeout : 1),
2448                                         ptlrpc_expired_set,
2449                                         ptlrpc_interrupted_set, set);
2450                 else
2451                         /*
2452                          * At least one request is in flight, so no
2453                          * interrupts are allowed. Wait until all
2454                          * complete, or an in-flight req times out.
2455                          */
2456                         lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1),
2457                                           ptlrpc_expired_set, set);
2458
2459                 rc = l_wait_event(set->set_waitq,
2460                                   ptlrpc_check_set(NULL, set), &lwi);
2461
2462                 /*
2463                  * LU-769 - if we ignored the signal because it was already
2464                  * pending when we started, we need to handle it now or we risk
2465                  * it being ignored forever
2466                  */
2467                 if (rc == -ETIMEDOUT &&
2468                     (!lwi.lwi_allow_intr || set->set_allow_intr) &&
2469                     signal_pending(current)) {
2470                         sigset_t blocked_sigs =
2471                                            cfs_block_sigsinv(LUSTRE_FATAL_SIGS);
2472
2473                         /*
2474                          * In fact we only interrupt for the "fatal" signals
2475                          * like SIGINT or SIGKILL. We still ignore less
2476                          * important signals since ptlrpc set is not easily
2477                          * reentrant from userspace again
2478                          */
2479                         if (signal_pending(current))
2480                                 ptlrpc_interrupted_set(set);
2481                         cfs_restore_sigs(blocked_sigs);
2482                 }
2483
2484                 LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT);
2485
2486                 /*
2487                  * -EINTR => all requests have been flagged rq_intr so next
2488                  * check completes.
2489                  * -ETIMEDOUT => someone timed out.  When all reqs have
2490                  * timed out, signals are enabled allowing completion with
2491                  * EINTR.
2492                  * I don't really care if we go once more round the loop in
2493                  * the error cases -eeb.
2494                  */
2495                 if (rc == 0 && atomic_read(&set->set_remaining) == 0) {
2496                         list_for_each(tmp, &set->set_requests) {
2497                                 req = list_entry(tmp, struct ptlrpc_request,
2498                                                  rq_set_chain);
2499                                 spin_lock(&req->rq_lock);
2500                                 req->rq_invalid_rqset = 1;
2501                                 spin_unlock(&req->rq_lock);
2502                         }
2503                 }
2504         } while (rc != 0 || atomic_read(&set->set_remaining) != 0);
2505
2506         LASSERT(atomic_read(&set->set_remaining) == 0);
2507
2508         rc = set->set_rc; /* rq_status of already freed requests if any */
2509         list_for_each(tmp, &set->set_requests) {
2510                 req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
2511
2512                 LASSERT(req->rq_phase == RQ_PHASE_COMPLETE);
2513                 if (req->rq_status != 0)
2514                         rc = req->rq_status;
2515         }
2516
2517         RETURN(rc);
2518 }
2519 EXPORT_SYMBOL(ptlrpc_set_wait);
2520
2521 /**
2522  * Helper fuction for request freeing.
2523  * Called when request count reached zero and request needs to be freed.
2524  * Removes request from all sorts of sending/replay lists it might be on,
2525  * frees network buffers if any are present.
2526  * If \a locked is set, that means caller is already holding import imp_lock
2527  * and so we no longer need to reobtain it (for certain lists manipulations)
2528  */
2529 static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
2530 {
2531         ENTRY;
2532
2533         if (!request)
2534                 RETURN_EXIT;
2535
2536         LASSERT(!request->rq_srv_req);
2537         LASSERT(request->rq_export == NULL);
2538         LASSERTF(!request->rq_receiving_reply, "req %p\n", request);
2539         LASSERTF(list_empty(&request->rq_list), "req %p\n", request);
2540         LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request);
2541         LASSERTF(!request->rq_replay, "req %p\n", request);
2542
2543         req_capsule_fini(&request->rq_pill);
2544
2545         /*
2546          * We must take it off the imp_replay_list first.  Otherwise, we'll set
2547          * request->rq_reqmsg to NULL while osc_close is dereferencing it.
2548          */
2549         if (request->rq_import) {
2550                 if (!locked)
2551                         spin_lock(&request->rq_import->imp_lock);
2552                 list_del_init(&request->rq_replay_list);
2553                 list_del_init(&request->rq_unreplied_list);
2554                 if (!locked)
2555                         spin_unlock(&request->rq_import->imp_lock);
2556         }
2557         LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request);
2558
2559         if (atomic_read(&request->rq_refcount) != 0) {
2560                 DEBUG_REQ(D_ERROR, request,
2561                           "freeing request with nonzero refcount");
2562                 LBUG();
2563         }
2564
2565         if (request->rq_repbuf)
2566                 sptlrpc_cli_free_repbuf(request);
2567
2568         if (request->rq_import) {
2569                 class_import_put(request->rq_import);
2570                 request->rq_import = NULL;
2571         }
2572         if (request->rq_bulk)
2573                 ptlrpc_free_bulk(request->rq_bulk);
2574
2575         if (request->rq_reqbuf || request->rq_clrbuf)
2576                 sptlrpc_cli_free_reqbuf(request);
2577
2578         if (request->rq_cli_ctx)
2579                 sptlrpc_req_put_ctx(request, !locked);
2580
2581         if (request->rq_pool)
2582                 __ptlrpc_free_req_to_pool(request);
2583         else
2584                 ptlrpc_request_cache_free(request);
2585         EXIT;
2586 }
2587
2588 static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked);
2589 /**
2590  * Drop one request reference. Must be called with import imp_lock held.
2591  * When reference count drops to zero, request is freed.
2592  */
2593 void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request)
2594 {
2595         assert_spin_locked(&request->rq_import->imp_lock);
2596         (void)__ptlrpc_req_finished(request, 1);
2597 }
2598
2599 /**
2600  * Helper function
2601  * Drops one reference count for request \a request.
2602  * \a locked set indicates that caller holds import imp_lock.
2603  * Frees the request whe reference count reaches zero.
2604  *
2605  * \retval 1    the request is freed
2606  * \retval 0    some others still hold references on the request
2607  */
2608 static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked)
2609 {
2610         int count;
2611
2612         ENTRY;
2613         if (!request)
2614                 RETURN(1);
2615
2616         LASSERT(request != LP_POISON);
2617         LASSERT(request->rq_reqmsg != LP_POISON);
2618
2619         DEBUG_REQ(D_INFO, request, "refcount now %u",
2620                   atomic_read(&request->rq_refcount) - 1);
2621
2622         spin_lock(&request->rq_lock);
2623         count = atomic_dec_return(&request->rq_refcount);
2624         LASSERTF(count >= 0, "Invalid ref count %d\n", count);
2625
2626         /*
2627          * For open RPC, the client does not know the EA size (LOV, ACL, and
2628          * so on) before replied, then the client has to reserve very large
2629          * reply buffer. Such buffer will not be released until the RPC freed.
2630          * Since The open RPC is replayable, we need to keep it in the replay
2631          * list until close. If there are a lot of files opened concurrently,
2632          * then the client may be OOM.
2633          *
2634          * If fact, it is unnecessary to keep reply buffer for open replay,
2635          * related EAs have already been saved via mdc_save_lovea() before
2636          * coming here. So it is safe to free the reply buffer some earlier
2637          * before releasing the RPC to avoid client OOM. LU-9514
2638          */
2639         if (count == 1 && request->rq_early_free_repbuf && request->rq_repbuf) {
2640                 spin_lock(&request->rq_early_free_lock);
2641                 sptlrpc_cli_free_repbuf(request);
2642                 request->rq_repbuf = NULL;
2643                 request->rq_repbuf_len = 0;
2644                 request->rq_repdata = NULL;
2645                 request->rq_reqdata_len = 0;
2646                 spin_unlock(&request->rq_early_free_lock);
2647         }
2648         spin_unlock(&request->rq_lock);
2649
2650         if (!count)
2651                 __ptlrpc_free_req(request, locked);
2652
2653         RETURN(!count);
2654 }
2655
/**
 * Drop one reference on \a request. The reference is released with
 * locked == 0, i.e. on behalf of a caller not holding the import's imp_lock.
 */
void ptlrpc_req_finished(struct ptlrpc_request *request)
{
        (void)__ptlrpc_req_finished(request, 0);
}
EXPORT_SYMBOL(ptlrpc_req_finished);
2664
2665 /**
2666  * Returns xid of a \a request
2667  */
2668 __u64 ptlrpc_req_xid(struct ptlrpc_request *request)
2669 {
2670         return request->rq_xid;
2671 }
2672 EXPORT_SYMBOL(ptlrpc_req_xid);
2673
2674 /**
2675  * Disengage the client's reply buffer from the network
2676  * NB does _NOT_ unregister any client-side bulk.
2677  * IDEMPOTENT, but _not_ safe against concurrent callers.
2678  * The request owner (i.e. the thread doing the I/O) must call...
2679  * Returns 0 on success or 1 if unregistering cannot be made.
2680  */
2681 static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async)
2682 {
2683         int rc;
2684         struct l_wait_info lwi;
2685
2686         /*
2687          * Might sleep.
2688          */
2689         LASSERT(!in_interrupt());
2690
2691         /* Let's setup deadline for reply unlink. */
2692         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
2693             async && request->rq_reply_deadline == 0 && cfs_fail_val == 0)
2694                 request->rq_reply_deadline = ktime_get_real_seconds() +
2695                                              LONG_UNLINK;
2696
2697         /*
2698          * Nothing left to do.
2699          */
2700         if (!ptlrpc_client_recv_or_unlink(request))
2701                 RETURN(1);
2702
2703         LNetMDUnlink(request->rq_reply_md_h);
2704
2705         /*
2706          * Let's check it once again.
2707          */
2708         if (!ptlrpc_client_recv_or_unlink(request))
2709                 RETURN(1);
2710
2711         /* Move to "Unregistering" phase as reply was not unlinked yet. */
2712         ptlrpc_rqphase_move(request, RQ_PHASE_UNREG_RPC);
2713
2714         /*
2715          * Do not wait for unlink to finish.
2716          */
2717         if (async)
2718                 RETURN(0);
2719
2720         /*
2721          * We have to l_wait_event() whatever the result, to give liblustre
2722          * a chance to run reply_in_callback(), and to make sure we've
2723          * unlinked before returning a req to the pool.
2724          */
2725         for (;;) {
2726                 /* The wq argument is ignored by user-space wait_event macros */
2727                 wait_queue_head_t *wq = (request->rq_set) ?
2728                                         &request->rq_set->set_waitq :
2729                                         &request->rq_reply_waitq;
2730                 /*
2731                  * Network access will complete in finite time but the HUGE
2732                  * timeout lets us CWARN for visibility of sluggish NALs
2733                  */
2734                 lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
2735                                            cfs_time_seconds(1), NULL, NULL);
2736                 rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request),
2737                                   &lwi);
2738                 if (rc == 0) {
2739                         ptlrpc_rqphase_move(request, request->rq_next_phase);
2740                         RETURN(1);
2741                 }
2742
2743                 LASSERT(rc == -ETIMEDOUT);
2744                 DEBUG_REQ(D_WARNING, request,
2745                           "Unexpectedly long timeout receiving_reply=%d req_ulinked=%d reply_unlinked=%d",
2746                           request->rq_receiving_reply,
2747                           request->rq_req_unlinked,
2748                           request->rq_reply_unlinked);
2749         }
2750         RETURN(0);
2751 }
2752
2753 static void ptlrpc_free_request(struct ptlrpc_request *req)
2754 {
2755         spin_lock(&req->rq_lock);
2756         req->rq_replay = 0;
2757         spin_unlock(&req->rq_lock);
2758
2759         if (req->rq_commit_cb)
2760                 req->rq_commit_cb(req);
2761         list_del_init(&req->rq_replay_list);
2762
2763         __ptlrpc_req_finished(req, 1);
2764 }
2765
2766 /**
2767  * the request is committed and dropped from the replay list of its import
2768  */
2769 void ptlrpc_request_committed(struct ptlrpc_request *req, int force)
2770 {
2771         struct obd_import *imp = req->rq_import;
2772
2773         spin_lock(&imp->imp_lock);
2774         if (list_empty(&req->rq_replay_list)) {
2775                 spin_unlock(&imp->imp_lock);
2776                 return;
2777         }
2778
2779         if (force || req->rq_transno <= imp->imp_peer_committed_transno) {
2780                 if (imp->imp_replay_cursor == &req->rq_replay_list)
2781                         imp->imp_replay_cursor = req->rq_replay_list.next;
2782                 ptlrpc_free_request(req);
2783         }
2784
2785         spin_unlock(&imp->imp_lock);
2786 }
2787 EXPORT_SYMBOL(ptlrpc_request_committed);
2788
2789 /**
2790  * Iterates through replay_list on import and prunes
2791  * all requests have transno smaller than last_committed for the
2792  * import and don't have rq_replay set.
2793  * Since requests are sorted in transno order, stops when meetign first
2794  * transno bigger than last_committed.
2795  * caller must hold imp->imp_lock
2796  */
2797 void ptlrpc_free_committed(struct obd_import *imp)
2798 {
2799         struct ptlrpc_request *req, *saved;
2800         struct ptlrpc_request *last_req = NULL; /* temporary fire escape */
2801         bool skip_committed_list = true;
2802
2803         ENTRY;
2804         LASSERT(imp != NULL);
2805         assert_spin_locked(&imp->imp_lock);
2806
2807         if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked &&
2808             imp->imp_generation == imp->imp_last_generation_checked) {
2809                 CDEBUG(D_INFO, "%s: skip recheck: last_committed %llu\n",
2810                        imp->imp_obd->obd_name, imp->imp_peer_committed_transno);
2811                 RETURN_EXIT;
2812         }
2813         CDEBUG(D_RPCTRACE, "%s: committing for last_committed %llu gen %d\n",
2814                imp->imp_obd->obd_name, imp->imp_peer_committed_transno,
2815                imp->imp_generation);
2816
2817         if (imp->imp_generation != imp->imp_last_generation_checked ||
2818             imp->imp_last_transno_checked == 0)
2819                 skip_committed_list = false;
2820
2821         imp->imp_last_transno_checked = imp->imp_peer_committed_transno;
2822         imp->imp_last_generation_checked = imp->imp_generation;
2823
2824         list_for_each_entry_safe(req, saved, &imp->imp_replay_list,
2825                                  rq_replay_list) {
2826                 /* XXX ok to remove when 1357 resolved - rread 05/29/03  */
2827                 LASSERT(req != last_req);
2828                 last_req = req;
2829
2830                 if (req->rq_transno == 0) {
2831                         DEBUG_REQ(D_EMERG, req, "zero transno during replay");
2832                         LBUG();
2833                 }
2834                 if (req->rq_import_generation < imp->imp_generation) {
2835                         DEBUG_REQ(D_RPCTRACE, req, "free request with old gen");
2836                         GOTO(free_req, 0);
2837                 }
2838
2839                 /* not yet committed */
2840                 if (req->rq_transno > imp->imp_peer_committed_transno) {
2841                         DEBUG_REQ(D_RPCTRACE, req, "stopping search");
2842                         break;
2843                 }
2844
2845                 if (req->rq_replay) {
2846                         DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)");
2847                         list_move_tail(&req->rq_replay_list,
2848                                        &imp->imp_committed_list);
2849                         continue;
2850                 }
2851
2852                 DEBUG_REQ(D_INFO, req, "commit (last_committed %llu)",
2853                           imp->imp_peer_committed_transno);
2854 free_req:
2855                 ptlrpc_free_request(req);
2856         }
2857
2858         if (skip_committed_list)
2859                 GOTO(out, 0);
2860
2861         list_for_each_entry_safe(req, saved, &imp->imp_committed_list,
2862                                  rq_replay_list) {
2863                 LASSERT(req->rq_transno != 0);
2864                 if (req->rq_import_generation < imp->imp_generation ||
2865                     !req->rq_replay) {
2866                         DEBUG_REQ(D_RPCTRACE, req, "free %s open request",
2867                                   req->rq_import_generation <
2868                                   imp->imp_generation ? "stale" : "closed");
2869
2870                         if (imp->imp_replay_cursor == &req->rq_replay_list)
2871                                 imp->imp_replay_cursor =
2872                                         req->rq_replay_list.next;
2873
2874                         ptlrpc_free_request(req);
2875                 }
2876         }
2877 out:
2878         EXIT;
2879 }
2880
2881 void ptlrpc_cleanup_client(struct obd_import *imp)
2882 {
2883         ENTRY;
2884         EXIT;
2885 }
2886
2887 /**
2888  * Schedule previously sent request for resend.
2889  * For bulk requests we assign new xid (to avoid problems with
2890  * lost replies and therefore several transfers landing into same buffer
2891  * from different sending attempts).
2892  */
2893 void ptlrpc_resend_req(struct ptlrpc_request *req)
2894 {
2895         DEBUG_REQ(D_HA, req, "going to resend");
2896         spin_lock(&req->rq_lock);
2897
2898         /*
2899          * Request got reply but linked to the import list still.
2900          * Let ptlrpc_check_set() process it.
2901          */
2902         if (ptlrpc_client_replied(req)) {
2903                 spin_unlock(&req->rq_lock);
2904                 DEBUG_REQ(D_HA, req, "it has reply, so skip it");
2905                 return;
2906         }
2907
2908         req->rq_status = -EAGAIN;
2909
2910         req->rq_resend = 1;
2911         req->rq_net_err = 0;
2912         req->rq_timedout = 0;
2913
2914         ptlrpc_client_wake_req(req);
2915         spin_unlock(&req->rq_lock);
2916 }
2917
2918 /* XXX: this function and rq_status are currently unused */
2919 void ptlrpc_restart_req(struct ptlrpc_request *req)
2920 {
2921         DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request");
2922         req->rq_status = -ERESTARTSYS;
2923
2924         spin_lock(&req->rq_lock);
2925         req->rq_restart = 1;
2926         req->rq_timedout = 0;
2927         ptlrpc_client_wake_req(req);
2928         spin_unlock(&req->rq_lock);
2929 }
2930
2931 /**
2932  * Grab additional reference on a request \a req
2933  */
2934 struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req)
2935 {
2936         ENTRY;
2937         atomic_inc(&req->rq_refcount);
2938         RETURN(req);
2939 }
2940 EXPORT_SYMBOL(ptlrpc_request_addref);
2941
2942 /**
2943  * Add a request to import replay_list.
2944  * Must be called under imp_lock
2945  */
2946 void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
2947                                       struct obd_import *imp)
2948 {
2949         struct list_head *tmp;
2950
2951         assert_spin_locked(&imp->imp_lock);
2952
2953         if (req->rq_transno == 0) {
2954                 DEBUG_REQ(D_EMERG, req, "saving request with zero transno");
2955                 LBUG();
2956         }
2957
2958         /*
2959          * clear this for new requests that were resent as well
2960          * as resent replayed requests.
2961          */
2962         lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT);
2963
2964         /* don't re-add requests that have been replayed */
2965         if (!list_empty(&req->rq_replay_list))
2966                 return;
2967
2968         lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY);
2969
2970         spin_lock(&req->rq_lock);
2971         req->rq_resend = 0;
2972         spin_unlock(&req->rq_lock);
2973
2974         LASSERT(imp->imp_replayable);
2975         /* Balanced in ptlrpc_free_committed, usually. */
2976         ptlrpc_request_addref(req);
2977         list_for_each_prev(tmp, &imp->imp_replay_list) {
2978                 struct ptlrpc_request *iter = list_entry(tmp,
2979                                                          struct ptlrpc_request,
2980                                                          rq_replay_list);
2981
2982                 /*
2983                  * We may have duplicate transnos if we create and then
2984                  * open a file, or for closes retained if to match creating
2985                  * opens, so use req->rq_xid as a secondary key.
2986                  * (See bugs 684, 685, and 428.)
2987                  * XXX no longer needed, but all opens need transnos!
2988                  */
2989                 if (iter->rq_transno > req->rq_transno)
2990                         continue;
2991
2992                 if (iter->rq_transno == req->rq_transno) {
2993                         LASSERT(iter->rq_xid != req->rq_xid);
2994                         if (iter->rq_xid > req->rq_xid)
2995                                 continue;
2996                 }
2997
2998                 list_add(&req->rq_replay_list, &iter->rq_replay_list);
2999                 return;
3000         }
3001
3002         list_add(&req->rq_replay_list, &imp->imp_replay_list);
3003 }
3004
3005 /**
3006  * Send request and wait until it completes.
3007  * Returns request processing status.
3008  */
3009 int ptlrpc_queue_wait(struct ptlrpc_request *req)
3010 {
3011         struct ptlrpc_request_set *set;
3012         int rc;
3013
3014         ENTRY;
3015         LASSERT(req->rq_set == NULL);
3016         LASSERT(!req->rq_receiving_reply);
3017
3018         set = ptlrpc_prep_set();
3019         if (!set) {
3020                 CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
3021                 RETURN(-ENOMEM);
3022         }
3023
3024         /* for distributed debugging */
3025         lustre_msg_set_status(req->rq_reqmsg, current_pid());
3026
3027         /* add a ref for the set (see comment in ptlrpc_set_add_req) */
3028         ptlrpc_request_addref(req);
3029         ptlrpc_set_add_req(set, req);
3030         rc = ptlrpc_set_wait(NULL, set);
3031         ptlrpc_set_destroy(set);
3032
3033         RETURN(rc);
3034 }
3035 EXPORT_SYMBOL(ptlrpc_queue_wait);
3036
3037 /**
3038  * Callback used for replayed requests reply processing.
3039  * In case of successful reply calls registered request replay callback.
3040  * In case of error restart replay process.
3041  */
3042 static int ptlrpc_replay_interpret(const struct lu_env *env,
3043                                    struct ptlrpc_request *req,
3044                                    void *args, int rc)
3045 {
3046         struct ptlrpc_replay_async_args *aa = args;
3047         struct obd_import *imp = req->rq_import;
3048
3049         ENTRY;
3050         atomic_dec(&imp->imp_replay_inflight);
3051
3052         /*
3053          * Note: if it is bulk replay (MDS-MDS replay), then even if
3054          * server got the request, but bulk transfer timeout, let's
3055          * replay the bulk req again
3056          */
3057         if (!ptlrpc_client_replied(req) ||
3058             (req->rq_bulk &&
3059              lustre_msg_get_status(req->rq_repmsg) == -ETIMEDOUT)) {
3060                 DEBUG_REQ(D_ERROR, req, "request replay timed out.\n");
3061                 GOTO(out, rc = -ETIMEDOUT);
3062         }
3063
3064         if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR &&
3065             (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN ||
3066             lustre_msg_get_status(req->rq_repmsg) == -ENODEV))
3067                 GOTO(out, rc = lustre_msg_get_status(req->rq_repmsg));
3068
3069         /** VBR: check version failure */
3070         if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) {
3071                 /** replay was failed due to version mismatch */
3072                 DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n");
3073                 spin_lock(&imp->imp_lock);
3074                 imp->imp_vbr_failed = 1;
3075                 spin_unlock(&imp->imp_lock);
3076                 lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
3077         } else {
3078                 /** The transno had better not change over replay. */
3079                 LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) ==
3080                          lustre_msg_get_transno(req->rq_repmsg) ||
3081                          lustre_msg_get_transno(req->rq_repmsg) == 0,
3082                          "%#llx/%#llx\n",
3083                          lustre_msg_get_transno(req->rq_reqmsg),
3084                          lustre_msg_get_transno(req->rq_repmsg));
3085         }
3086
3087         spin_lock(&imp->imp_lock);
3088         imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg);
3089         spin_unlock(&imp->imp_lock);
3090         LASSERT(imp->imp_last_replay_transno);
3091
3092         /* transaction number shouldn't be bigger than the latest replayed */
3093         if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) {
3094                 DEBUG_REQ(D_ERROR, req,
3095                           "Reported transno %llu is bigger than the replayed one: %llu",
3096                           req->rq_transno,
3097                           lustre_msg_get_transno(req->rq_reqmsg));
3098                 GOTO(out, rc = -EINVAL);
3099         }
3100
3101         DEBUG_REQ(D_HA, req, "got rep");
3102
3103         /* let the callback do fixups, possibly including in the request */
3104         if (req->rq_replay_cb)
3105                 req->rq_replay_cb(req);
3106
3107         if (ptlrpc_client_replied(req) &&
3108             lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) {
3109                 DEBUG_REQ(D_ERROR, req, "status %d, old was %d",
3110                           lustre_msg_get_status(req->rq_repmsg),
3111                           aa->praa_old_status);
3112
3113                 /*
3114                  * Note: If the replay fails for MDT-MDT recovery, let's
3115                  * abort all of the following requests in the replay
3116                  * and sending list, because MDT-MDT update requests
3117                  * are dependent on each other, see LU-7039
3118                  */
3119                 if (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) {
3120                         struct ptlrpc_request *free_req;
3121                         struct ptlrpc_request *tmp;
3122
3123                         spin_lock(&imp->imp_lock);
3124                         list_for_each_entry_safe(free_req, tmp,
3125                                                  &imp->imp_replay_list,
3126                                                  rq_replay_list) {
3127                                 ptlrpc_free_request(free_req);
3128                         }
3129
3130                         list_for_each_entry_safe(free_req, tmp,
3131                                                  &imp->imp_committed_list,
3132                                                  rq_replay_list) {
3133                                 ptlrpc_free_request(free_req);
3134                         }
3135
3136                         list_for_each_entry_safe(free_req, tmp,
3137                                                  &imp->imp_delayed_list,
3138                                                  rq_list) {
3139                                 spin_lock(&free_req->rq_lock);
3140                                 free_req->rq_err = 1;
3141                                 free_req->rq_status = -EIO;
3142                                 ptlrpc_client_wake_req(free_req);
3143                                 spin_unlock(&free_req->rq_lock);
3144                         }
3145
3146                         list_for_each_entry_safe(free_req, tmp,
3147                                                  &imp->imp_sending_list,
3148                                                  rq_list) {
3149                                 spin_lock(&free_req->rq_lock);
3150                                 free_req->rq_err = 1;
3151                                 free_req->rq_status = -EIO;
3152                                 ptlrpc_client_wake_req(free_req);
3153                                 spin_unlock(&free_req->rq_lock);
3154                         }
3155                         spin_unlock(&imp->imp_lock);
3156                 }
3157         } else {
3158                 /* Put it back for re-replay. */
3159                 lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
3160         }
3161
3162         /*
3163          * Errors while replay can set transno to 0, but
3164          * imp_last_replay_transno shouldn't be set to 0 anyway
3165          */
3166         if (req->rq_transno == 0)
3167                 CERROR("Transno is 0 during replay!\n");
3168
3169         /* continue with recovery */
3170         rc = ptlrpc_import_recovery_state_machine(imp);
3171  out:
3172         req->rq_send_state = aa->praa_old_state;
3173
3174         if (rc != 0)
3175                 /* this replay failed, so restart recovery */
3176                 ptlrpc_connect_import(imp);
3177
3178         RETURN(rc);
3179 }
3180
3181 /**
3182  * Prepares and queues request for replay.
3183  * Adds it to ptlrpcd queue for actual sending.
3184  * Returns 0 on success.
3185  */
3186 int ptlrpc_replay_req(struct ptlrpc_request *req)
3187 {
3188         struct ptlrpc_replay_async_args *aa;
3189
3190         ENTRY;
3191
3192         LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY);
3193
3194         CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
3195         aa = ptlrpc_req_async_args(req);
3196         memset(aa, 0, sizeof(*aa));
3197
3198         /* Prepare request to be resent with ptlrpcd */
3199         aa->praa_old_state = req->rq_send_state;
3200         req->rq_send_state = LUSTRE_IMP_REPLAY;
3201         req->rq_phase = RQ_PHASE_NEW;
3202         req->rq_next_phase = RQ_PHASE_UNDEFINED;
3203         if (req->rq_repmsg)
3204                 aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg);
3205         req->rq_status = 0;
3206         req->rq_interpret_reply = ptlrpc_replay_interpret;
3207         /* Readjust the timeout for current conditions */
3208         ptlrpc_at_set_req_timeout(req);
3209
3210         /* Tell server net_latency to calculate how long to wait for reply. */
3211         lustre_msg_set_service_time(req->rq_reqmsg,
3212                                     ptlrpc_at_get_net_latency(req));
3213         DEBUG_REQ(D_HA, req, "REPLAY");
3214
3215         atomic_inc(&req->rq_import->imp_replay_inflight);
3216         spin_lock(&req->rq_lock);
3217         req->rq_early_free_repbuf = 0;
3218         spin_unlock(&req->rq_lock);
3219         ptlrpc_request_addref(req); /* ptlrpcd needs a ref */
3220
3221         ptlrpcd_add_req(req);
3222         RETURN(0);
3223 }
3224
3225 /**
3226  * Aborts all in-flight request on import \a imp sending and delayed lists
3227  */
3228 void ptlrpc_abort_inflight(struct obd_import *imp)
3229 {
3230         struct list_head *tmp, *n;
3231
3232         ENTRY;
3233         /*
3234          * Make sure that no new requests get processed for this import.
3235          * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing
3236          * this flag and then putting requests on sending_list or delayed_list.
3237          */
3238         spin_lock(&imp->imp_lock);
3239
3240         /*
3241          * XXX locking?  Maybe we should remove each request with the list
3242          * locked?  Also, how do we know if the requests on the list are
3243          * being freed at this time?
3244          */
3245         list_for_each_safe(tmp, n, &imp->imp_sending_list) {
3246                 struct ptlrpc_request *req = list_entry(tmp,
3247                                                         struct ptlrpc_request,
3248                                                         rq_list);
3249
3250                 DEBUG_REQ(D_RPCTRACE, req, "inflight");
3251
3252                 spin_lock(&req->rq_lock);
3253                 if (req->rq_import_generation < imp->imp_generation) {
3254                         req->rq_err = 1;
3255                         req->rq_status = -EIO;
3256                         ptlrpc_client_wake_req(req);
3257                 }
3258                 spin_unlock(&req->rq_lock);
3259         }
3260
3261         list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
3262                 struct ptlrpc_request *req =
3263                         list_entry(tmp, struct ptlrpc_request, rq_list);
3264
3265                 DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req");
3266
3267                 spin_lock(&req->rq_lock);
3268                 if (req->rq_import_generation < imp->imp_generation) {
3269                         req->rq_err = 1;
3270                         req->rq_status = -EIO;
3271                         ptlrpc_client_wake_req(req);
3272                 }
3273                 spin_unlock(&req->rq_lock);
3274         }
3275
3276         /*
3277          * Last chance to free reqs left on the replay list, but we
3278          * will still leak reqs that haven't committed.
3279          */
3280         if (imp->imp_replayable)
3281                 ptlrpc_free_committed(imp);
3282
3283         spin_unlock(&imp->imp_lock);
3284
3285         EXIT;
3286 }
3287
3288 /**
3289  * Abort all uncompleted requests in request set \a set
3290  */
3291 void ptlrpc_abort_set(struct ptlrpc_request_set *set)
3292 {
3293         struct list_head *tmp, *pos;
3294
3295         LASSERT(set != NULL);
3296
3297         list_for_each_safe(pos, tmp, &set->set_requests) {
3298                 struct ptlrpc_request *req =
3299                         list_entry(pos, struct ptlrpc_request,
3300                                    rq_set_chain);
3301
3302                 spin_lock(&req->rq_lock);
3303                 if (req->rq_phase != RQ_PHASE_RPC) {
3304                         spin_unlock(&req->rq_lock);
3305                         continue;
3306                 }
3307
3308                 req->rq_err = 1;
3309                 req->rq_status = -EINTR;
3310                 ptlrpc_client_wake_req(req);
3311                 spin_unlock(&req->rq_lock);
3312         }
3313 }
3314
3315 /**
3316  * Initialize the XID for the node.  This is common among all requests on
3317  * this node, and only requires the property that it is monotonically
3318  * increasing.  It does not need to be sequential.  Since this is also used
3319  * as the RDMA match bits, it is important that a single client NOT have
3320  * the same match bits for two different in-flight requests, hence we do
3321  * NOT want to have an XID per target or similar.
3322  *
3323  * To avoid an unlikely collision between match bits after a client reboot
3324  * (which would deliver old data into the wrong RDMA buffer) initialize
3325  * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s.
3326  * If the time is clearly incorrect, we instead use a 62-bit random number.
3327  * In the worst case the random number will overflow 1M RPCs per second in
3328  * 9133 years, or permutations thereof.
3329  */
3330 #define YEAR_2004 (1ULL << 30)
3331 void ptlrpc_init_xid(void)
3332 {
3333         time64_t now = ktime_get_real_seconds();
3334
3335         spin_lock_init(&ptlrpc_last_xid_lock);
3336         if (now < YEAR_2004) {
3337                 cfs_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid));
3338                 ptlrpc_last_xid >>= 2;
3339                 ptlrpc_last_xid |= (1ULL << 61);
3340         } else {
3341                 ptlrpc_last_xid = (__u64)now << 20;
3342         }
3343
3344         /* Need to always be aligned to a power-of-two for mutli-bulk BRW */
3345         CLASSERT((PTLRPC_BULK_OPS_COUNT & (PTLRPC_BULK_OPS_COUNT - 1)) == 0);
3346         ptlrpc_last_xid &= PTLRPC_BULK_OPS_MASK;
3347 }
3348
3349 /**
3350  * Increase xid and returns resulting new value to the caller.
3351  *
3352  * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting
3353  * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC
3354  * itself uses the last bulk xid needed, so the server can determine the
3355  * the number of bulk transfers from the RPC XID and a bitmask.  The starting
3356  * xid must align to a power-of-two value.
3357  *
3358  * This is assumed to be true due to the initial ptlrpc_last_xid
3359  * value also being initialized to a power-of-two value. LU-1431
3360  */
3361 __u64 ptlrpc_next_xid(void)
3362 {
3363         __u64 next;
3364
3365         spin_lock(&ptlrpc_last_xid_lock);
3366         next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
3367         ptlrpc_last_xid = next;
3368         spin_unlock(&ptlrpc_last_xid_lock);
3369
3370         return next;
3371 }
3372
3373 /**
3374  * If request has a new allocated XID (new request or EINPROGRESS resend),
3375  * use this XID as matchbits of bulk, otherwise allocate a new matchbits for
3376  * request to ensure previous bulk fails and avoid problems with lost replies
3377  * and therefore several transfers landing into the same buffer from different
3378  * sending attempts.
3379  */
3380 void ptlrpc_set_bulk_mbits(struct ptlrpc_request *req)
3381 {
3382         struct ptlrpc_bulk_desc *bd = req->rq_bulk;
3383
3384         LASSERT(bd != NULL);
3385
3386         /*
3387          * Generate new matchbits for all resend requests, including
3388          * resend replay.
3389          */
3390         if (req->rq_resend) {
3391                 __u64 old_mbits = req->rq_mbits;
3392
3393                 /*
3394                  * First time resend on -EINPROGRESS will generate new xid,
3395                  * so we can actually use the rq_xid as rq_mbits in such case,
3396                  * however, it's bit hard to distinguish such resend with a
3397                  * 'resend for the -EINPROGRESS resend'. To make it simple,
3398                  * we opt to generate mbits for all resend cases.
3399                  */
3400                 if (OCD_HAS_FLAG(&bd->bd_import->imp_connect_data,
3401                                  BULK_MBITS)) {
3402                         req->rq_mbits = ptlrpc_next_xid();
3403                 } else {
3404                         /*
3405                          * Old version transfers rq_xid to peer as
3406                          * matchbits.
3407                          */
3408                         spin_lock(&req->rq_import->imp_lock);
3409                         list_del_init(&req->rq_unreplied_list);
3410                         ptlrpc_assign_next_xid_nolock(req);
3411                         spin_unlock(&req->rq_import->imp_lock);
3412                         req->rq_mbits = req->rq_xid;
3413                 }
3414                 CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n",
3415                        old_mbits, req->rq_mbits);
3416         } else if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) {
3417                 /* Request being sent first time, use xid as matchbits. */
3418                 if (OCD_HAS_FLAG(&bd->bd_import->imp_connect_data, BULK_MBITS)
3419                     || req->rq_mbits == 0) {
3420                         req->rq_mbits = req->rq_xid;
3421                 } else {
3422                         int total_md = (bd->bd_iov_count + LNET_MAX_IOV - 1) /
3423                                         LNET_MAX_IOV;
3424                         req->rq_mbits -= total_md - 1;
3425                 }
3426         } else {
3427                 /*
3428                  * Replay request, xid and matchbits have already been
3429                  * correctly assigned.
3430                  */
3431                 return;
3432         }
3433
3434         /*
3435          * For multi-bulk RPCs, rq_mbits is the last mbits needed for bulks so
3436          * that server can infer the number of bulks that were prepared,
3437          * see LU-1431
3438          */
3439         req->rq_mbits += ((bd->bd_iov_count + LNET_MAX_IOV - 1) /
3440                           LNET_MAX_IOV) - 1;
3441
3442         /*
3443          * Set rq_xid as rq_mbits to indicate the final bulk for the old
3444          * server which does not support OBD_CONNECT_BULK_MBITS. LU-6808.
3445          *
3446          * It's ok to directly set the rq_xid here, since this xid bump
3447          * won't affect the request position in unreplied list.
3448          */
3449         if (!OCD_HAS_FLAG(&bd->bd_import->imp_connect_data, BULK_MBITS))
3450                 req->rq_xid = req->rq_mbits;
3451 }
3452
3453 /**
3454  * Get a glimpse at what next xid value might have been.
3455  * Returns possible next xid.
3456  */
3457 __u64 ptlrpc_sample_next_xid(void)
3458 {
3459 #if BITS_PER_LONG == 32
3460         /* need to avoid possible word tearing on 32-bit systems */
3461         __u64 next;
3462
3463         spin_lock(&ptlrpc_last_xid_lock);
3464         next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
3465         spin_unlock(&ptlrpc_last_xid_lock);
3466
3467         return next;
3468 #else
3469         /* No need to lock, since returned value is racy anyways */
3470         return ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
3471 #endif
3472 }
3473 EXPORT_SYMBOL(ptlrpc_sample_next_xid);
3474
/**
 * Functions for operating ptlrpc workers.
 *
 * A ptlrpc work is a callback that runs inside ptlrpc context.  It should
 * not sleep, otherwise it blocks the ptlrpcd thread that runs it.
 *
 * 1. A work is created once and can then be queued many times:
 *         handler = ptlrpcd_alloc_work();
 *         ptlrpcd_queue_work();
 *         ...
 *         ptlrpcd_queue_work();      (again, whenever necessary)
 *
 *    and is released when no longer needed:
 *         ptlrpcd_destroy_work();
 * 2. ptlrpcd_queue_work() may be called by several processes at the same
 *    time, but the work is queued only once at any time.  As the name
 *    implies, there may be a delay before a ptlrpcd thread really runs it.
 */
struct ptlrpc_work_async_args {
        /* work callback, invoked by work_interpreter() */
        int (*cb)(const struct lu_env *env, void *cbdata);
        /* opaque argument handed to cb */
        void *cbdata;
};
3496
3497 static void ptlrpcd_add_work_req(struct ptlrpc_request *req)
3498 {
3499         /* re-initialize the req */
3500         req->rq_timeout         = obd_timeout;
3501         req->rq_sent            = ktime_get_real_seconds();
3502         req->rq_deadline        = req->rq_sent + req->rq_timeout;
3503         req->rq_phase           = RQ_PHASE_INTERPRET;
3504         req->rq_next_phase      = RQ_PHASE_COMPLETE;
3505         req->rq_xid             = ptlrpc_next_xid();
3506         req->rq_import_generation = req->rq_import->imp_generation;
3507
3508         ptlrpcd_add_req(req);
3509 }
3510
3511 static int work_interpreter(const struct lu_env *env,
3512                             struct ptlrpc_request *req, void *args, int rc)
3513 {
3514         struct ptlrpc_work_async_args *arg = args;
3515
3516         LASSERT(ptlrpcd_check_work(req));
3517         LASSERT(arg->cb != NULL);
3518
3519         rc = arg->cb(env, arg->cbdata);
3520
3521         list_del_init(&req->rq_set_chain);
3522         req->rq_set = NULL;
3523
3524         if (atomic_dec_return(&req->rq_refcount) > 1) {
3525                 atomic_set(&req->rq_refcount, 2);
3526                 ptlrpcd_add_work_req(req);
3527         }
3528         return rc;
3529 }
3530
3531 static int worker_format;
3532
3533 static int ptlrpcd_check_work(struct ptlrpc_request *req)
3534 {
3535         return req->rq_pill.rc_fmt == (void *)&worker_format;
3536 }
3537
3538 /**
3539  * Create a work for ptlrpc.
3540  */
3541 void *ptlrpcd_alloc_work(struct obd_import *imp,
3542                          int (*cb)(const struct lu_env *, void *), void *cbdata)
3543 {
3544         struct ptlrpc_request *req = NULL;
3545         struct ptlrpc_work_async_args *args;
3546
3547         ENTRY;
3548         might_sleep();
3549
3550         if (!cb)
3551                 RETURN(ERR_PTR(-EINVAL));
3552
3553         /* copy some code from deprecated fakereq. */
3554         req = ptlrpc_request_cache_alloc(GFP_NOFS);
3555         if (!req) {
3556                 CERROR("ptlrpc: run out of memory!\n");
3557                 RETURN(ERR_PTR(-ENOMEM));
3558         }
3559
3560         ptlrpc_cli_req_init(req);
3561
3562         req->rq_send_state = LUSTRE_IMP_FULL;
3563         req->rq_type = PTL_RPC_MSG_REQUEST;
3564         req->rq_import = class_import_get(imp);
3565         req->rq_interpret_reply = work_interpreter;
3566         /* don't want reply */
3567         req->rq_no_delay = req->rq_no_resend = 1;
3568         req->rq_pill.rc_fmt = (void *)&worker_format;
3569
3570         CLASSERT(sizeof(*args) <= sizeof(req->rq_async_args));
3571         args = ptlrpc_req_async_args(req);
3572         args->cb     = cb;
3573         args->cbdata = cbdata;
3574
3575         RETURN(req);
3576 }
3577 EXPORT_SYMBOL(ptlrpcd_alloc_work);
3578
3579 void ptlrpcd_destroy_work(void *handler)
3580 {
3581         struct ptlrpc_request *req = handler;
3582
3583         if (req)
3584                 ptlrpc_req_finished(req);
3585 }
3586 EXPORT_SYMBOL(ptlrpcd_destroy_work);
3587
3588 int ptlrpcd_queue_work(void *handler)
3589 {
3590         struct ptlrpc_request *req = handler;
3591
3592         /*
3593          * Check if the req is already being queued.
3594          *
3595          * Here comes a trick: it lacks a way of checking if a req is being
3596          * processed reliably in ptlrpc. Here I have to use refcount of req
3597          * for this purpose. This is okay because the caller should use this
3598          * req as opaque data. - Jinshan
3599          */
3600         LASSERT(atomic_read(&req->rq_refcount) > 0);
3601         if (atomic_inc_return(&req->rq_refcount) == 2)
3602                 ptlrpcd_add_work_req(req);
3603         return 0;
3604 }
3605 EXPORT_SYMBOL(ptlrpcd_queue_work);