lustre/ptlrpc/niobuf.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2012, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  */
  32
  33 #define DEBUG_SUBSYSTEM S_RPC
  34 #include <libcfs/linux/linux-mem.h>
  35 #include <obd_support.h>
  36 #include <lustre_net.h>
  37 #include <lustre_lib.h>
  38 #include <obd.h>
  39 #include <obd_class.h>
  40 #include "ptlrpc_internal.h"
  41 #include <lnet/lib-lnet.h> /* for CFS_FAIL_PTLRPC_OST_BULK_CB2 */
  42
  43 /**
  44  * Helper function. Sends \a len bytes from \a base at offset \a offset
  45  * over \a conn connection to portal \a portal.
  46  * Returns 0 on success or error code.
  47  */
  48 static int ptl_send_buf(struct lnet_handle_md *mdh, void *base, int len,
  49                         enum lnet_ack_req ack, struct ptlrpc_cb_id *cbid,
  50                         lnet_nid_t self, struct lnet_process_id peer_id,
  51                         int portal, __u64 xid, unsigned int offset,
  52                         struct lnet_handle_md *bulk_cookie)
  53 {
  54         int              rc;
  55         struct lnet_md         md;
  56         ENTRY;
  57
  58         LASSERT (portal != 0);
  59         CDEBUG (D_INFO, "peer_id %s\n", libcfs_id2str(peer_id));
  60         md.start     = base;
  61         md.length    = len;
  62         md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1;
  63         md.options   = PTLRPC_MD_OPTIONS;
  64         md.user_ptr  = cbid;
  65         md.handler   = ptlrpc_handler;
  66         LNetInvalidateMDHandle(&md.bulk_handle);
  67
  68         if (bulk_cookie) {
  69                 md.bulk_handle = *bulk_cookie;
  70                 md.options |= LNET_MD_BULK_HANDLE;
  71         }
  72
  73         if (unlikely(ack == LNET_ACK_REQ &&
  74                      OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, OBD_FAIL_ONCE))){
  75                 /* don't ask for the ack to simulate failing client */
  76                 ack = LNET_NOACK_REQ;
  77         }
  78
  79         rc = LNetMDBind(&md, LNET_UNLINK, mdh);
  80         if (unlikely(rc != 0)) {
  81                 CERROR ("LNetMDBind failed: %d\n", rc);
  82                 LASSERT (rc == -ENOMEM);
  83                 RETURN (-ENOMEM);
  84         }
  85
  86         CDEBUG(D_NET, "Sending %d bytes to portal %d, xid %lld, offset %u\n",
  87                len, portal, xid, offset);
  88
  89         percpu_ref_get(&ptlrpc_pending);
  90
  91         rc = LNetPut(self, *mdh, ack,
  92                      peer_id, portal, xid, offset, 0);
  93         if (unlikely(rc != 0)) {
  94                 int rc2;
  95                 /* We're going to get an UNLINK event when I unlink below,
  96                  * which will complete just like any other failed send, so
  97                  * I fall through and return success here! */
  98                 CERROR("LNetPut(%s, %d, %lld) failed: %d\n",
  99                        libcfs_id2str(peer_id), portal, xid, rc);
 100                 rc2 = LNetMDUnlink(*mdh);
 101                 LASSERTF(rc2 == 0, "rc2 = %d\n", rc2);
 102         }
 103
 104         RETURN (0);
 105 }
 106
 107 static void mdunlink_iterate_helper(struct lnet_handle_md *bd_mds, int count)
 108 {
 109         int i;
 110
 111         for (i = 0; i < count; i++)
 112                 LNetMDUnlink(bd_mds[i]);
 113 }
 114
 115 #ifdef HAVE_SERVER_SUPPORT
 116 /**
 117  * Prepare bulk descriptor for specified incoming request \a req that
 118  * can fit \a nfrags * pages. \a type is bulk type. \a portal is where
 119  * the bulk to be sent. Used on server-side after request was already
 120  * received.
 121  * Returns pointer to newly allocatrd initialized bulk descriptor or NULL on
 122  * error.
 123  */
 124 struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req,
 125                                               unsigned nfrags, unsigned max_brw,
 126                                               unsigned int type,
 127                                               unsigned portal,
 128                                               const struct ptlrpc_bulk_frag_ops
 129                                                 *ops)
 130 {
 131         struct obd_export *exp = req->rq_export;
 132         struct ptlrpc_bulk_desc *desc;
 133
 134         ENTRY;
 135         LASSERT(ptlrpc_is_bulk_op_active(type));
 136
 137         desc = ptlrpc_new_bulk(nfrags, max_brw, type, portal, ops);
 138         if (desc == NULL)
 139                 RETURN(NULL);
 140
 141         desc->bd_export = class_export_get(exp);
 142         desc->bd_req = req;
 143
 144         desc->bd_cbid.cbid_fn  = server_bulk_callback;
 145         desc->bd_cbid.cbid_arg = desc;
 146
 147         /* NB we don't assign rq_bulk here; server-side requests are
 148          * re-used, and the handler frees the bulk desc explicitly. */
 149
 150         return desc;
 151 }
 152 EXPORT_SYMBOL(ptlrpc_prep_bulk_exp);
 153
 154 /**
 155  * Starts bulk transfer for descriptor \a desc on the server.
 156  * Returns 0 on success or error code.
 157  */
 158 int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc)
 159 {
 160         struct obd_export        *exp = desc->bd_export;
 161         lnet_nid_t                self_nid;
 162         struct lnet_process_id    peer_id;
 163         int                       rc = 0;
 164         __u64                     mbits;
 165         int                       posted_md;
 166         int                       total_md;
 167         struct lnet_md                 md;
 168         ENTRY;
 169
 170         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_PUT_NET))
 171                 RETURN(0);
 172
 173         /* NB no locking required until desc is on the network */
 174         LASSERT(ptlrpc_is_bulk_op_active(desc->bd_type));
 175
 176         LASSERT(desc->bd_cbid.cbid_fn == server_bulk_callback);
 177         LASSERT(desc->bd_cbid.cbid_arg == desc);
 178
 179         /*
 180          * Multi-Rail: get the preferred self and peer NIDs from the
 181          * request, so they are based on the route taken by the
 182          * message.
 183          */
 184         self_nid = desc->bd_req->rq_self;
 185         peer_id = desc->bd_req->rq_source;
 186
 187         /* NB total length may be 0 for a read past EOF, so we send 0
 188          * length bulks, since the client expects bulk events.
 189          *
 190          * The client may not need all of the bulk mbits for the RPC. The RPC
 191          * used the mbits of the highest bulk mbits needed, and the server masks
 192          * off high bits to get bulk count for this RPC. LU-1431 */
 193         mbits = desc->bd_req->rq_mbits & ~((__u64)desc->bd_md_max_brw - 1);
 194         total_md = desc->bd_req->rq_mbits - mbits + 1;
 195         desc->bd_refs = total_md;
 196         desc->bd_failure = 0;
 197
 198         md.user_ptr = &desc->bd_cbid;
 199         md.handler = ptlrpc_handler;
 200         md.threshold = 2; /* SENT and ACK/REPLY */
 201
 202         for (posted_md = 0; posted_md < total_md; mbits++) {
 203                 md.options = PTLRPC_MD_OPTIONS;
 204
 205                 /* NB it's assumed that source and sink buffer frags are
 206                  * page-aligned. Otherwise we'd have to send client bulk
 207                  * sizes over and split server buffer accordingly */
 208                 ptlrpc_fill_bulk_md(&md, desc, posted_md);
 209                 rc = LNetMDBind(&md, LNET_UNLINK, &desc->bd_mds[posted_md]);
 210                 if (rc != 0) {
 211                         CERROR("%s: LNetMDBind failed for MD %u: rc = %d\n",
 212                                exp->exp_obd->obd_name, posted_md, rc);
 213                         LASSERT(rc == -ENOMEM);
 214                         if (posted_md == 0) {
 215                                 desc->bd_md_count = 0;
 216                                 RETURN(-ENOMEM);
 217                         }
 218                         break;
 219                 }
 220                 percpu_ref_get(&ptlrpc_pending);
 221
 222                 /* sanity.sh 224c: lets skip last md */
 223                 if (posted_md == desc->bd_md_max_brw - 1)
 224                         OBD_FAIL_CHECK_RESET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB3,
 225                                              CFS_FAIL_PTLRPC_OST_BULK_CB2);
 226
 227                 /* Network is about to get at the memory */
 228                 if (ptlrpc_is_bulk_put_source(desc->bd_type))
 229                         rc = LNetPut(self_nid, desc->bd_mds[posted_md],
 230                                      LNET_ACK_REQ, peer_id,
 231                                      desc->bd_portal, mbits, 0, 0);
 232                 else
 233                         rc = LNetGet(self_nid, desc->bd_mds[posted_md],
 234                                      peer_id, desc->bd_portal, mbits, 0, false);
 235
 236                 posted_md++;
 237                 if (rc != 0) {
 238                         CERROR("%s: failed bulk transfer with %s:%u x%llu: "
 239                                "rc = %d\n", exp->exp_obd->obd_name,
 240                                libcfs_id2str(peer_id), desc->bd_portal,
 241                                mbits, rc);
 242                         break;
 243                 }
 244         }
 245
 246         if (rc != 0) {
 247                 /* Can't send, so we unlink the MD bound above.  The UNLINK
 248                  * event this creates will signal completion with failure,
 249                  * so we return SUCCESS here! */
 250                 spin_lock(&desc->bd_lock);
 251                 desc->bd_refs -= total_md - posted_md;
 252                 spin_unlock(&desc->bd_lock);
 253                 LASSERT(desc->bd_refs >= 0);
 254
 255                 mdunlink_iterate_helper(desc->bd_mds, posted_md);
 256                 RETURN(0);
 257         }
 258
 259         CDEBUG(D_NET, "Transferring %u pages %u bytes via portal %d "
 260                "id %s mbits %#llx-%#llx\n", desc->bd_iov_count,
 261                desc->bd_nob, desc->bd_portal, libcfs_id2str(peer_id),
 262                mbits - posted_md, mbits - 1);
 263
 264         RETURN(0);
 265 }
 266
 267 /**
 268  * Server side bulk abort. Idempotent. Not thread-safe (i.e. only
 269  * serialises with completion callback)
 270  */
 271 void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc)
 272 {
 273         LASSERT(!in_interrupt());           /* might sleep */
 274
 275         if (!ptlrpc_server_bulk_active(desc))   /* completed or */
 276                 return;                         /* never started */
 277
 278         /* We used to poison the pages with 0xab here because we did not want to
 279          * send any meaningful data over the wire for evicted clients (bug 9297)
 280          * However, this is no longer safe now that we use the page cache on the
 281          * OSS (bug 20560) */
 282
 283         /* The unlink ensures the callback happens ASAP and is the last
 284          * one.  If it fails, it must be because completion just happened,
 285          * but we must still wait_event_idle_timeout() in this case, to give
 286          * us a chance to run server_bulk_callback()
 287          */
 288         mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw);
 289
 290         for (;;) {
 291                 /* Network access will complete in finite time but the HUGE
 292                  * timeout lets us CWARN for visibility of sluggish NALs */
 293                 int seconds = PTLRPC_REQ_LONG_UNLINK;
 294
 295                 while (seconds > 0 &&
 296                        wait_event_idle_timeout(desc->bd_waitq,
 297                                                !ptlrpc_server_bulk_active(desc),
 298                                                cfs_time_seconds(1)) == 0)
 299                         seconds -= 1;
 300                 if (seconds > 0)
 301                         return;
 302
 303                 CWARN("Unexpectedly long timeout: desc %p\n", desc);
 304         }
 305 }
 306 #endif /* HAVE_SERVER_SUPPORT */
 307
 308 /**
 309  * Register bulk at the sender for later transfer.
 310  * Returns 0 on success or error code.
 311  */
 312 int ptlrpc_register_bulk(struct ptlrpc_request *req)
 313 {
 314         struct ptlrpc_bulk_desc *desc = req->rq_bulk;
 315         struct lnet_process_id peer;
 316         int rc = 0;
 317         int posted_md;
 318         int total_md;
 319         __u64 mbits;
 320         struct lnet_me *me;
 321         struct lnet_md md;
 322         ENTRY;
 323
 324         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_GET_NET))
 325                 RETURN(0);
 326
 327         /* NB no locking required until desc is on the network */
 328         LASSERT(desc->bd_nob > 0);
 329         LASSERT(desc->bd_md_max_brw <= PTLRPC_BULK_OPS_COUNT);
 330         LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES);
 331         LASSERT(desc->bd_req != NULL);
 332         LASSERT(ptlrpc_is_bulk_op_passive(desc->bd_type));
 333
 334         /* cleanup the state of the bulk for it will be reused */
 335         if (req->rq_resend || req->rq_send_state == LUSTRE_IMP_REPLAY)
 336                 desc->bd_nob_transferred = 0;
 337         else if (desc->bd_nob_transferred != 0)
 338                 /* If the network failed after an RPC was sent, this condition
 339                  * could happen.  Rather than assert (was here before), return
 340                  * an EIO error. */
 341                 RETURN(-EIO);
 342
 343         desc->bd_failure = 0;
 344
 345         peer = desc->bd_import->imp_connection->c_peer;
 346
 347         LASSERT(desc->bd_cbid.cbid_fn == client_bulk_callback);
 348         LASSERT(desc->bd_cbid.cbid_arg == desc);
 349
 350         total_md = desc->bd_md_count;
 351         /* rq_mbits is matchbits of the final bulk */
 352         mbits = req->rq_mbits - desc->bd_md_count + 1;
 353
 354         LASSERTF(mbits == (req->rq_mbits & PTLRPC_BULK_OPS_MASK),
 355                  "first mbits = x%llu, last mbits = x%llu\n",
 356                  mbits, req->rq_mbits);
 357         LASSERTF(!(desc->bd_registered &&
 358                    req->rq_send_state != LUSTRE_IMP_REPLAY) ||
 359                  mbits != desc->bd_last_mbits,
 360                  "registered: %d  rq_mbits: %llu bd_last_mbits: %llu\n",
 361                  desc->bd_registered, mbits, desc->bd_last_mbits);
 362
 363         desc->bd_registered = 1;
 364         desc->bd_last_mbits = mbits;
 365         desc->bd_refs = total_md;
 366         md.user_ptr = &desc->bd_cbid;
 367         md.handler = ptlrpc_handler;
 368         md.threshold = 1;                       /* PUT or GET */
 369
 370         for (posted_md = 0; posted_md < desc->bd_md_count;
 371              posted_md++, mbits++) {
 372                 md.options = PTLRPC_MD_OPTIONS |
 373                              (ptlrpc_is_bulk_op_get(desc->bd_type) ?
 374                               LNET_MD_OP_GET : LNET_MD_OP_PUT);
 375                 ptlrpc_fill_bulk_md(&md, desc, posted_md);
 376
 377                 if (posted_md > 0 && posted_md + 1 == desc->bd_md_count &&
 378                     OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_ATTACH)) {
 379                         rc = -ENOMEM;
 380                 } else {
 381                         me = LNetMEAttach(desc->bd_portal, peer, mbits, 0,
 382                                   LNET_UNLINK, LNET_INS_AFTER);
 383                         rc = PTR_ERR_OR_ZERO(me);
 384                 }
 385                 if (rc != 0) {
 386                         CERROR("%s: LNetMEAttach failed x%llu/%d: rc = %d\n",
 387                                desc->bd_import->imp_obd->obd_name, mbits,
 388                                posted_md, rc);
 389                         break;
 390                 }
 391                 percpu_ref_get(&ptlrpc_pending);
 392
 393                 /* About to let the network at it... */
 394                 rc = LNetMDAttach(me, &md, LNET_UNLINK,
 395                                   &desc->bd_mds[posted_md]);
 396                 if (rc != 0) {
 397                         CERROR("%s: LNetMDAttach failed x%llu/%d: rc = %d\n",
 398                                desc->bd_import->imp_obd->obd_name, mbits,
 399                                posted_md, rc);
 400                         break;
 401                 }
 402         }
 403
 404         if (rc != 0) {
 405                 LASSERT(rc == -ENOMEM);
 406                 spin_lock(&desc->bd_lock);
 407                 desc->bd_refs -= total_md - posted_md;
 408                 spin_unlock(&desc->bd_lock);
 409                 LASSERT(desc->bd_refs >= 0);
 410                 mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw);
 411                 req->rq_status = -ENOMEM;
 412                 desc->bd_registered = 0;
 413                 RETURN(-ENOMEM);
 414         }
 415
 416         spin_lock(&desc->bd_lock);
 417         /* Holler if peer manages to touch buffers before he knows the mbits */
 418         if (desc->bd_refs != total_md)
 419                 CWARN("%s: Peer %s touched %d buffers while I registered\n",
 420                       desc->bd_import->imp_obd->obd_name, libcfs_id2str(peer),
 421                       total_md - desc->bd_refs);
 422         spin_unlock(&desc->bd_lock);
 423
 424         CDEBUG(D_NET,
 425                "Setup %u bulk %s buffers: %u pages %u bytes, mbits x%#llx-%#llx, portal %u\n",
 426                desc->bd_refs,
 427                ptlrpc_is_bulk_op_get(desc->bd_type) ? "get-source" : "put-sink",
 428                desc->bd_iov_count, desc->bd_nob,
 429                desc->bd_last_mbits, req->rq_mbits, desc->bd_portal);
 430
 431         RETURN(0);
 432 }
 433
 434 /**
 435  * Disconnect a bulk desc from the network. Idempotent. Not
 436  * thread-safe (i.e. only interlocks with completion callback).
 437  * Returns 1 on success or 0 if network unregistration failed for whatever
 438  * reason.
 439  */
 440 int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async)
 441 {
 442         struct ptlrpc_bulk_desc *desc = req->rq_bulk;
 443         ENTRY;
 444
 445         LASSERT(!in_interrupt());     /* might sleep */
 446
 447         if (desc)
 448                 desc->bd_registered = 0;
 449
 450         /* Let's setup deadline for reply unlink. */
 451         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) &&
 452             async && req->rq_bulk_deadline == 0 && cfs_fail_val == 0)
 453                 req->rq_bulk_deadline = ktime_get_real_seconds() +
 454                                         PTLRPC_REQ_LONG_UNLINK;
 455
 456         if (ptlrpc_client_bulk_active(req) == 0)        /* completed or */
 457                 RETURN(1);                              /* never registered */
 458
 459         LASSERT(desc->bd_req == req);  /* bd_req NULL until registered */
 460
 461         /* the unlink ensures the callback happens ASAP and is the last
 462          * one.  If it fails, it must be because completion just happened,
 463          * but we must still wait_event_idle_timeout() in this case to give
 464          * us a chance to run client_bulk_callback()
 465          */
 466         mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw);
 467
 468         if (ptlrpc_client_bulk_active(req) == 0)        /* completed or */
 469                 RETURN(1);                              /* never registered */
 470
 471         /* Move to "Unregistering" phase as bulk was not unlinked yet. */
 472         ptlrpc_rqphase_move(req, RQ_PHASE_UNREG_BULK);
 473
 474         /* Do not wait for unlink to finish. */
 475         if (async)
 476                 RETURN(0);
 477
 478         for (;;) {
 479                 /* The wq argument is ignored by user-space wait_event macros */
 480                 wait_queue_head_t *wq = (req->rq_set != NULL) ?
 481                                         &req->rq_set->set_waitq :
 482                                         &req->rq_reply_waitq;
 483                 /*
 484                  * Network access will complete in finite time but the HUGE
 485                  * timeout lets us CWARN for visibility of sluggish NALs.
 486                  */
 487                 int seconds = PTLRPC_REQ_LONG_UNLINK;
 488
 489                 while (seconds > 0 &&
 490                        wait_event_idle_timeout(*wq,
 491                                                !ptlrpc_client_bulk_active(req),
 492                                                cfs_time_seconds(1)) == 0)
 493                         seconds -= 1;
 494                 if (seconds > 0) {
 495                         ptlrpc_rqphase_move(req, req->rq_next_phase);
 496                         RETURN(1);
 497                 }
 498
 499                 DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p",
 500                           desc);
 501         }
 502         RETURN(0);
 503 }
 504
 505 static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags)
 506 {
 507         struct ptlrpc_service_part      *svcpt = req->rq_rqbd->rqbd_svcpt;
 508         struct ptlrpc_service           *svc = svcpt->scp_service;
 509         timeout_t service_timeout;
 510
 511         service_timeout = clamp_t(timeout_t, ktime_get_real_seconds() -
 512                                              req->rq_arrival_time.tv_sec, 1,
 513                                   (AT_OFF ? obd_timeout * 3 / 2 : at_max));
 514         if (!(flags & PTLRPC_REPLY_EARLY) &&
 515             (req->rq_type != PTL_RPC_MSG_ERR) &&
 516             (req->rq_reqmsg != NULL) &&
 517             !(lustre_msg_get_flags(req->rq_reqmsg) &
 518               (MSG_RESENT | MSG_REPLAY |
 519                MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) {
 520                 /* early replies, errors and recovery requests don't count
 521                  * toward our service time estimate
 522                  */
 523                 timeout_t oldse = at_measured(&svcpt->scp_at_estimate,
 524                                               service_timeout);
 525
 526                 if (oldse != 0) {
 527                         DEBUG_REQ(D_ADAPTTO, req,
 528                                   "svc %s changed estimate from %d to %d",
 529                                   svc->srv_name, oldse,
 530                                   at_get(&svcpt->scp_at_estimate));
 531                 }
 532         }
 533         /* Report actual service time for client latency calc */
 534         lustre_msg_set_service_timeout(req->rq_repmsg, service_timeout);
 535         /* Report service time estimate for future client reqs, but report 0
 536          * (to be ignored by client) if it's an error reply during recovery.
 537          * b=15815
 538          */
 539         if (req->rq_type == PTL_RPC_MSG_ERR &&
 540             (req->rq_export == NULL ||
 541              req->rq_export->exp_obd->obd_recovering)) {
 542                 lustre_msg_set_timeout(req->rq_repmsg, 0);
 543         } else {
 544                 timeout_t timeout;
 545
 546                 if (req->rq_export && req->rq_reqmsg != NULL &&
 547                     (flags & PTLRPC_REPLY_EARLY) &&
 548                     lustre_msg_get_flags(req->rq_reqmsg) &
 549                     (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) {
 550                         struct obd_device *exp_obd = req->rq_export->exp_obd;
 551
 552                         timeout = ktime_get_real_seconds() -
 553                                   req->rq_arrival_time.tv_sec +
 554                                   min_t(timeout_t, at_extra,
 555                                         exp_obd->obd_recovery_timeout / 4);
 556                 } else {
 557                         timeout = at_get(&svcpt->scp_at_estimate);
 558                 }
 559                 lustre_msg_set_timeout(req->rq_repmsg, timeout);
 560         }
 561
 562         if (req->rq_reqmsg &&
 563             !(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) {
 564                 CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x "
 565                        "req_flags=%#x magic=%x/%x len=%d\n",
 566                        flags, lustre_msg_get_flags(req->rq_reqmsg),
 567                        lustre_msg_get_magic(req->rq_reqmsg),
 568                        lustre_msg_get_magic(req->rq_repmsg), req->rq_replen);
 569         }
 570 }
 571
 572 /**
 573  * Send request reply from request \a req reply buffer.
 574  * \a flags defines reply types
 575  * Returns 0 on success or error code
 576  */
 577 int ptlrpc_send_reply(struct ptlrpc_request *req, int flags)
 578 {
 579         struct ptlrpc_reply_state *rs = req->rq_reply_state;
 580         struct ptlrpc_connection  *conn;
 581         int                        rc;
 582
 583         /* We must already have a reply buffer (only ptlrpc_error() may be
 584          * called without one). The reply generated by sptlrpc layer (e.g.
 585          * error notify, etc.) might have NULL rq->reqmsg; Otherwise we must
 586          * have a request buffer which is either the actual (swabbed) incoming
 587          * request, or a saved copy if this is a req saved in
 588          * target_queue_final_reply().
 589          */
 590         LASSERT (req->rq_no_reply == 0);
 591         LASSERT (req->rq_reqbuf != NULL);
 592         LASSERT (rs != NULL);
 593         LASSERT ((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult);
 594         LASSERT (req->rq_repmsg != NULL);
 595         LASSERT (req->rq_repmsg == rs->rs_msg);
 596         LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback);
 597         LASSERT (rs->rs_cb_id.cbid_arg == rs);
 598
 599         /* There may be no rq_export during failover */
 600
 601         if (unlikely(req->rq_export && req->rq_export->exp_obd &&
 602                      req->rq_export->exp_obd->obd_fail)) {
 603                 /* Failed obd's only send ENODEV */
 604                 req->rq_type = PTL_RPC_MSG_ERR;
 605                 req->rq_status = -ENODEV;
 606                 CDEBUG(D_HA, "sending ENODEV from failed obd %d\n",
 607                        req->rq_export->exp_obd->obd_minor);
 608         }
 609
 610         if (req->rq_type != PTL_RPC_MSG_ERR)
 611                 req->rq_type = PTL_RPC_MSG_REPLY;
 612
 613         lustre_msg_set_type(req->rq_repmsg, req->rq_type);
 614         lustre_msg_set_status(req->rq_repmsg,
 615                               ptlrpc_status_hton(req->rq_status));
 616         lustre_msg_set_opc(req->rq_repmsg,
 617                 req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : 0);
 618
 619         target_pack_pool_reply(req);
 620
 621         ptlrpc_at_set_reply(req, flags);
 622
 623         if (req->rq_export == NULL || req->rq_export->exp_connection == NULL)
 624                 conn = ptlrpc_connection_get(req->rq_peer, req->rq_self, NULL);
 625         else
 626                 conn = ptlrpc_connection_addref(req->rq_export->exp_connection);
 627
 628         if (unlikely(conn == NULL)) {
 629                 CERROR("not replying on NULL connection\n"); /* bug 9635 */
 630                 return -ENOTCONN;
 631         }
 632         ptlrpc_rs_addref(rs);                   /* +1 ref for the network */
 633
 634         rc = sptlrpc_svc_wrap_reply(req);
 635         if (unlikely(rc))
 636                 goto out;
 637
 638         req->rq_sent = ktime_get_real_seconds();
 639
 640         rc = ptl_send_buf(&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len,
 641                           (rs->rs_difficult && !rs->rs_no_ack) ?
 642                           LNET_ACK_REQ : LNET_NOACK_REQ,
 643                           &rs->rs_cb_id, req->rq_self, req->rq_source,
 644                           ptlrpc_req2svc(req)->srv_rep_portal,
 645                           req->rq_xid, req->rq_reply_off, NULL);
 646 out:
 647         if (unlikely(rc != 0))
 648                 ptlrpc_req_drop_rs(req);
 649         ptlrpc_connection_put(conn);
 650         return rc;
 651 }
 652
 653 int ptlrpc_reply (struct ptlrpc_request *req)
 654 {
 655         if (req->rq_no_reply)
 656                 return 0;
 657         else
 658                 return (ptlrpc_send_reply(req, 0));
 659 }
 660
 661 /**
 662  * For request \a req send an error reply back. Create empty
 663  * reply buffers if necessary.
 664  */
 665 int ptlrpc_send_error(struct ptlrpc_request *req, int may_be_difficult)
 666 {
 667         int rc;
 668         ENTRY;
 669
 670         if (req->rq_no_reply)
 671                 RETURN(0);
 672
 673         if (!req->rq_repmsg) {
 674                 rc = lustre_pack_reply(req, 1, NULL, NULL);
 675                 if (rc)
 676                         RETURN(rc);
 677         }
 678
 679         if (req->rq_status != -ENOSPC && req->rq_status != -EACCES &&
 680             req->rq_status != -EPERM && req->rq_status != -ENOENT &&
 681             req->rq_status != -EINPROGRESS && req->rq_status != -EDQUOT)
 682                 req->rq_type = PTL_RPC_MSG_ERR;
 683
 684         rc = ptlrpc_send_reply(req, may_be_difficult);
 685         RETURN(rc);
 686 }
 687
 688 int ptlrpc_error(struct ptlrpc_request *req)
 689 {
 690         return ptlrpc_send_error(req, 0);
 691 }
 692
 693 /**
 694  * Send request \a request.
 695  * if \a noreply is set, don't expect any reply back and don't set up
 696  * reply buffers.
 697  * Returns 0 on success or error code.
 698  */
 699 int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
 700 {
 701         int rc;
 702         int mpflag = 0;
 703         struct lnet_handle_md bulk_cookie;
 704         struct ptlrpc_connection *connection;
 705         struct lnet_me *reply_me = NULL;
 706         struct lnet_md reply_md;
 707         struct obd_import *imp = request->rq_import;
 708         struct obd_device *obd = imp->imp_obd;
 709         ENTRY;
 710
 711         LNetInvalidateMDHandle(&bulk_cookie);
 712
 713         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC))
 714                 RETURN(0);
 715
 716         LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST);
 717         LASSERT(request->rq_wait_ctx == 0);
 718
 719         /* If this is a re-transmit, we're required to have disengaged
 720          * cleanly from the previous attempt */
 721         LASSERT(!request->rq_receiving_reply);
 722         LASSERT(!((lustre_msg_get_flags(request->rq_reqmsg) & MSG_REPLAY) &&
 723                   (imp->imp_state == LUSTRE_IMP_FULL)));
 724
 725         if (unlikely(obd != NULL && obd->obd_fail)) {
 726                 CDEBUG(D_HA, "muting rpc for failed imp obd %s\n",
 727                        obd->obd_name);
 728                 /* this prevents us from waiting in ptlrpc_queue_wait */
 729                 spin_lock(&request->rq_lock);
 730                 request->rq_err = 1;
 731                 spin_unlock(&request->rq_lock);
 732                 request->rq_status = -ENODEV;
 733                 RETURN(-ENODEV);
 734         }
 735
 736         connection = imp->imp_connection;
 737
 738         lustre_msg_set_handle(request->rq_reqmsg,
 739                               &imp->imp_remote_handle);
 740         lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST);
 741         lustre_msg_set_conn_cnt(request->rq_reqmsg,
 742                                 imp->imp_conn_cnt);
 743         lustre_msghdr_set_flags(request->rq_reqmsg,
 744                                 imp->imp_msghdr_flags);
 745
 746         /* If it's the first time to resend the request for EINPROGRESS,
 747          * we need to allocate a new XID (see after_reply()), it's different
 748          * from the resend for reply timeout. */
 749         if (request->rq_nr_resend != 0 &&
 750             list_empty(&request->rq_unreplied_list)) {
 751                 __u64 min_xid = 0;
 752                 /* resend for EINPROGRESS, allocate new xid to avoid reply
 753                  * reconstruction */
 754                 spin_lock(&imp->imp_lock);
 755                 ptlrpc_assign_next_xid_nolock(request);
 756                 min_xid = ptlrpc_known_replied_xid(imp);
 757                 spin_unlock(&imp->imp_lock);
 758
 759                 lustre_msg_set_last_xid(request->rq_reqmsg, min_xid);
 760                 DEBUG_REQ(D_RPCTRACE, request,
 761                           "Allocating new XID for resend on EINPROGRESS");
 762         }
 763
 764         if (request->rq_bulk != NULL) {
 765                 ptlrpc_set_bulk_mbits(request);
 766                 lustre_msg_set_mbits(request->rq_reqmsg, request->rq_mbits);
 767         }
 768
 769         if (list_empty(&request->rq_unreplied_list) ||
 770             request->rq_xid <= imp->imp_known_replied_xid) {
 771                 DEBUG_REQ(D_ERROR, request,
 772                           "xid=%llu, replied=%llu, list_empty=%d",
 773                           request->rq_xid, imp->imp_known_replied_xid,
 774                           list_empty(&request->rq_unreplied_list));
 775                 LBUG();
 776         }
 777
 778         /** For enabled AT all request should have AT_SUPPORT in the
 779          * FULL import state when OBD_CONNECT_AT is set */
 780         LASSERT(AT_OFF || imp->imp_state != LUSTRE_IMP_FULL ||
 781                 (imp->imp_msghdr_flags & MSGHDR_AT_SUPPORT) ||
 782                 !(imp->imp_connect_data.ocd_connect_flags &
 783                   OBD_CONNECT_AT));
 784
 785         if (request->rq_resend) {
 786                 lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT);
 787                 if (request->rq_resend_cb != NULL)
 788                         request->rq_resend_cb(request, &request->rq_async_args);
 789         }
 790         if (request->rq_memalloc)
 791                 mpflag = memalloc_noreclaim_save();
 792
 793         rc = sptlrpc_cli_wrap_request(request);
 794         if (rc)
 795                 GOTO(out, rc);
 796
 797         /* bulk register should be done after wrap_request() */
 798         if (request->rq_bulk != NULL) {
 799                 rc = ptlrpc_register_bulk (request);
 800                 if (rc != 0)
 801                         GOTO(cleanup_bulk, rc);
 802                 /*
 803                  * All the mds in the request will have the same cpt
 804                  * encoded in the cookie. So we can just get the first
 805                  * one.
 806                  */
 807                 bulk_cookie = request->rq_bulk->bd_mds[0];
 808         }
 809
 810         if (!noreply) {
 811                 LASSERT (request->rq_replen != 0);
 812                 if (request->rq_repbuf == NULL) {
 813                         LASSERT(request->rq_repdata == NULL);
 814                         LASSERT(request->rq_repmsg == NULL);
 815                         rc = sptlrpc_cli_alloc_repbuf(request,
 816                                                       request->rq_replen);
 817                         if (rc) {
 818                                 /* this prevents us from looping in
 819                                  * ptlrpc_queue_wait */
 820                                 spin_lock(&request->rq_lock);
 821                                 request->rq_err = 1;
 822                                 spin_unlock(&request->rq_lock);
 823                                 request->rq_status = rc;
 824                                 GOTO(cleanup_bulk, rc);
 825                         }
 826                 } else {
 827                         request->rq_repdata = NULL;
 828                         request->rq_repmsg = NULL;
 829                 }
 830
 831                 if (request->rq_bulk &&
 832                     OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_REPLY_ATTACH)) {
 833                         reply_me = ERR_PTR(-ENOMEM);
 834                 } else {
 835                         reply_me = LNetMEAttach(request->rq_reply_portal,
 836                                                 connection->c_peer,
 837                                                 request->rq_xid, 0,
 838                                                 LNET_UNLINK, LNET_INS_AFTER);
 839                 }
 840
 841                 if (IS_ERR(reply_me)) {
 842                         rc = PTR_ERR(reply_me);
 843                         CERROR("LNetMEAttach failed: %d\n", rc);
 844                         LASSERT(rc == -ENOMEM);
 845                         GOTO(cleanup_bulk, rc = -ENOMEM);
 846                 }
 847         }
 848
 849         spin_lock(&request->rq_lock);
 850         /* We are responsible for unlinking the reply buffer */
 851         request->rq_reply_unlinked = noreply;
 852         request->rq_receiving_reply = !noreply;
 853         /* Clear any flags that may be present from previous sends. */
 854         request->rq_req_unlinked = 0;
 855         request->rq_replied = 0;
 856         request->rq_err = 0;
 857         request->rq_timedout = 0;
 858         request->rq_net_err = 0;
 859         request->rq_resend = 0;
 860         request->rq_restart = 0;
 861         request->rq_reply_truncated = 0;
 862         spin_unlock(&request->rq_lock);
 863
 864         if (!noreply) {
 865                 reply_md.start     = request->rq_repbuf;
 866                 reply_md.length    = request->rq_repbuf_len;
 867                 /* Allow multiple early replies */
 868                 reply_md.threshold = LNET_MD_THRESH_INF;
 869                 /* Manage remote for early replies */
 870                 reply_md.options   = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT |
 871                         LNET_MD_MANAGE_REMOTE |
 872                         LNET_MD_TRUNCATE; /* allow to make EOVERFLOW error */;
 873                 reply_md.user_ptr  = &request->rq_reply_cbid;
 874                 reply_md.handler = ptlrpc_handler;
 875
 876                 /* We must see the unlink callback to set rq_reply_unlinked,
 877                  * so we can't auto-unlink */
 878                 rc = LNetMDAttach(reply_me, &reply_md, LNET_RETAIN,
 879                                   &request->rq_reply_md_h);
 880                 if (rc != 0) {
 881                         CERROR("LNetMDAttach failed: %d\n", rc);
 882                         LASSERT(rc == -ENOMEM);
 883                         spin_lock(&request->rq_lock);
 884                         /* ...but the MD attach didn't succeed... */
 885                         request->rq_receiving_reply = 0;
 886                         spin_unlock(&request->rq_lock);
 887                         GOTO(cleanup_bulk, rc = -ENOMEM);
 888                 }
 889                 percpu_ref_get(&ptlrpc_pending);
 890
 891                 CDEBUG(D_NET,
 892                        "Setup reply buffer: %u bytes, xid %llu, portal %u\n",
 893                        request->rq_repbuf_len, request->rq_xid,
 894                        request->rq_reply_portal);
 895         }
 896
 897         /* add references on request for request_out_callback */
 898         ptlrpc_request_addref(request);
 899         if (obd != NULL && obd->obd_svc_stats != NULL)
 900                 lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR,
 901                                     atomic_read(&imp->imp_inflight));
 902
 903         OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5);
 904
 905         request->rq_sent_ns = ktime_get_real();
 906         request->rq_sent = ktime_get_real_seconds();
 907         /* We give the server rq_timeout secs to process the req, and
 908          * add the network latency for our local timeout.
 909          */
 910         request->rq_deadline = request->rq_sent + request->rq_timeout +
 911                 ptlrpc_at_get_net_latency(request);
 912
 913         ptlrpc_pinger_sending_on_import(imp);
 914
 915         DEBUG_REQ(D_INFO, request, "send flags=%x",
 916                   lustre_msg_get_flags(request->rq_reqmsg));
 917         rc = ptl_send_buf(&request->rq_req_md_h,
 918                           request->rq_reqbuf, request->rq_reqdata_len,
 919                           LNET_NOACK_REQ, &request->rq_req_cbid,
 920                           LNET_NID_ANY, connection->c_peer,
 921                           request->rq_request_portal,
 922                           request->rq_xid, 0, &bulk_cookie);
 923         if (likely(rc == 0))
 924                 GOTO(out, rc);
 925
 926         request->rq_req_unlinked = 1;
 927         ptlrpc_req_finished(request);
 928         if (noreply)
 929                 GOTO(out, rc);
 930
 931         LNetMDUnlink(request->rq_reply_md_h);
 932
 933         /* UNLINKED callback called synchronously */
 934         LASSERT(!request->rq_receiving_reply);
 935
 936  cleanup_bulk:
 937         /* We do sync unlink here as there was no real transfer here so
 938          * the chance to have long unlink to sluggish net is smaller here. */
 939         ptlrpc_unregister_bulk(request, 0);
 940  out:
 941         if (rc == -ENOMEM) {
 942                 /* set rq_sent so that this request is treated
 943                  * as a delayed send in the upper layers */
 944                 request->rq_sent = ktime_get_real_seconds();
 945         }
 946
 947         if (request->rq_memalloc)
 948                 memalloc_noreclaim_restore(mpflag);
 949
 950         return rc;
 951 }
 952 EXPORT_SYMBOL(ptl_send_rpc);
 953
 954 /**
 955  * Register request buffer descriptor for request receiving.
 956  */
 957 int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd)
 958 {
 959         struct ptlrpc_service *service = rqbd->rqbd_svcpt->scp_service;
 960         static struct lnet_process_id match_id = {
 961                 .nid = LNET_NID_ANY,
 962                 .pid = LNET_PID_ANY
 963         };
 964         int rc;
 965         struct lnet_md md;
 966         struct lnet_me *me;
 967
 968         CDEBUG(D_NET, "LNetMEAttach: portal %d\n",
 969                service->srv_req_portal);
 970
 971         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_RQBD))
 972                 return (-ENOMEM);
 973
 974         /* NB: CPT affinity service should use new LNet flag LNET_INS_LOCAL,
 975          * which means buffer can only be attached on local CPT, and LND
 976          * threads can find it by grabbing a local lock */
 977         me = LNetMEAttach(service->srv_req_portal,
 978                           match_id, 0, ~0, LNET_UNLINK,
 979                           rqbd->rqbd_svcpt->scp_cpt >= 0 ?
 980                           LNET_INS_LOCAL : LNET_INS_AFTER);
 981         if (IS_ERR(me)) {
 982                 CERROR("LNetMEAttach failed: %ld\n", PTR_ERR(me));
 983                 return -ENOMEM;
 984         }
 985
 986         LASSERT(rqbd->rqbd_refcount == 0);
 987         rqbd->rqbd_refcount = 1;
 988
 989         md.start     = rqbd->rqbd_buffer;
 990         md.length    = service->srv_buf_size;
 991         md.max_size  = service->srv_max_req_size;
 992         md.threshold = LNET_MD_THRESH_INF;
 993         md.options   = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | LNET_MD_MAX_SIZE;
 994         md.user_ptr  = &rqbd->rqbd_cbid;
 995         md.handler   = ptlrpc_handler;
 996
 997         rc = LNetMDAttach(me, &md, LNET_UNLINK, &rqbd->rqbd_md_h);
 998         if (rc == 0) {
 999                 percpu_ref_get(&ptlrpc_pending);
1000                 return 0;
1001         }
1002
1003         CERROR("ptlrpc: LNetMDAttach failed: rc = %d\n", rc);
1004         LASSERT(rc == -ENOMEM);
1005         LASSERT(rc == 0);
1006         rqbd->rqbd_refcount = 0;
1007
1008         return -ENOMEM;
1009 }