lustre/ptlrpc/niobuf.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  19  *
  20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  21  * CA 95054 USA or visit www.sun.com if you need additional information or
  22  * have any questions.
  23  *
  24  * GPL HEADER END
  25  */
  26 /*
  27  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  28  * Use is subject to license terms.
  29  */
  30 /*
  31  * This file is part of Lustre, http://www.lustre.org/
  32  * Lustre is a trademark of Sun Microsystems, Inc.
  33  */
  34
  35 #define DEBUG_SUBSYSTEM S_RPC
  36 #ifndef __KERNEL__
  37 #include <liblustre.h>
  38 #endif
  39 #include <obd_support.h>
  40 #include <lustre_net.h>
  41 #include <lustre_lib.h>
  42 #include <obd.h>
  43 #include <obd_class.h>
  44 #include "ptlrpc_internal.h"
  45
  46 /**
  47  * Helper function. Sends \a len bytes from \a base at offset \a offset
  48  * over \a conn connection to portal \a portal.
  49  * Returns 0 on success or error code.
  50  */
  51 static int ptl_send_buf (lnet_handle_md_t *mdh, void *base, int len,
  52                          lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid,
  53                          struct ptlrpc_connection *conn, int portal, __u64 xid,
  54                          unsigned int offset)
  55 {
  56         int              rc;
  57         lnet_md_t         md;
  58         ENTRY;
  59
  60         LASSERT (portal != 0);
  61         LASSERT (conn != NULL);
  62         CDEBUG (D_INFO, "conn=%p id %s\n", conn, libcfs_id2str(conn->c_peer));
  63         md.start     = base;
  64         md.length    = len;
  65         md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1;
  66         md.options   = PTLRPC_MD_OPTIONS;
  67         md.user_ptr  = cbid;
  68         md.eq_handle = ptlrpc_eq_h;
  69
  70         if (unlikely(ack == LNET_ACK_REQ &&
  71                      OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, OBD_FAIL_ONCE))){
  72                 /* don't ask for the ack to simulate failing client */
  73                 ack = LNET_NOACK_REQ;
  74         }
  75
  76         rc = LNetMDBind (md, LNET_UNLINK, mdh);
  77         if (unlikely(rc != 0)) {
  78                 CERROR ("LNetMDBind failed: %d\n", rc);
  79                 LASSERT (rc == -ENOMEM);
  80                 RETURN (-ENOMEM);
  81         }
  82
  83         CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64", offset %u\n",
  84                len, portal, xid, offset);
  85
  86         rc = LNetPut (conn->c_self, *mdh, ack,
  87                       conn->c_peer, portal, xid, offset, 0);
  88         if (unlikely(rc != 0)) {
  89                 int rc2;
  90                 /* We're going to get an UNLINK event when I unlink below,
  91                  * which will complete just like any other failed send, so
  92                  * I fall through and return success here! */
  93                 CERROR("LNetPut(%s, %d, "LPD64") failed: %d\n",
  94                        libcfs_id2str(conn->c_peer), portal, xid, rc);
  95                 rc2 = LNetMDUnlink(*mdh);
  96                 LASSERTF(rc2 == 0, "rc2 = %d\n", rc2);
  97         }
  98
  99         RETURN (0);
 100 }
 101
 102 #ifdef HAVE_SERVER_SUPPORT
 103 /**
 104  * Prepare bulk descriptor for specified incoming request \a req that
 105  * can fit \a npages * pages. \a type is bulk type. \a portal is where
 106  * the bulk to be sent. Used on server-side after request was already
 107  * received.
 108  * Returns pointer to newly allocatrd initialized bulk descriptor or NULL on
 109  * error.
 110  */
 111 struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req,
 112                                               int npages, int type, int portal)
 113 {
 114         struct obd_export *exp = req->rq_export;
 115         struct ptlrpc_bulk_desc *desc;
 116
 117         ENTRY;
 118         LASSERT(type == BULK_PUT_SOURCE || type == BULK_GET_SINK);
 119
 120         desc = new_bulk(npages, type, portal);
 121         if (desc == NULL)
 122                 RETURN(NULL);
 123
 124         desc->bd_export = class_export_get(exp);
 125         desc->bd_req = req;
 126
 127         desc->bd_cbid.cbid_fn  = server_bulk_callback;
 128         desc->bd_cbid.cbid_arg = desc;
 129
 130         /* NB we don't assign rq_bulk here; server-side requests are
 131          * re-used, and the handler frees the bulk desc explicitly. */
 132
 133         return desc;
 134 }
 135 EXPORT_SYMBOL(ptlrpc_prep_bulk_exp);
 136
 137 /**
 138  * Starts bulk transfer for descriptor \a desc
 139  * Returns 0 on success or error code.
 140  */
 141 int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc)
 142 {
 143         struct ptlrpc_connection *conn = desc->bd_export->exp_connection;
 144         int                       rc;
 145         int                       rc2;
 146         lnet_md_t                 md;
 147         __u64                     xid;
 148         ENTRY;
 149
 150         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_PUT_NET))
 151                 RETURN(0);
 152
 153         /* NB no locking required until desc is on the network */
 154         LASSERT (!desc->bd_network_rw);
 155         LASSERT (desc->bd_type == BULK_PUT_SOURCE ||
 156                  desc->bd_type == BULK_GET_SINK);
 157         desc->bd_success = 0;
 158
 159         md.user_ptr = &desc->bd_cbid;
 160         md.eq_handle = ptlrpc_eq_h;
 161         md.threshold = 2; /* SENT and ACK/REPLY */
 162         md.options = PTLRPC_MD_OPTIONS;
 163         ptlrpc_fill_bulk_md(&md, desc);
 164
 165         LASSERT (desc->bd_cbid.cbid_fn == server_bulk_callback);
 166         LASSERT (desc->bd_cbid.cbid_arg == desc);
 167
 168         /* NB total length may be 0 for a read past EOF, so we send a 0
 169          * length bulk, since the client expects a bulk event. */
 170
 171         rc = LNetMDBind(md, LNET_UNLINK, &desc->bd_md_h);
 172         if (rc != 0) {
 173                 CERROR("LNetMDBind failed: %d\n", rc);
 174                 LASSERT (rc == -ENOMEM);
 175                 RETURN(-ENOMEM);
 176         }
 177
 178         /* Client's bulk and reply matchbits are the same */
 179         xid = desc->bd_req->rq_xid;
 180         CDEBUG(D_NET, "Transferring %u pages %u bytes via portal %d "
 181                "id %s xid "LPX64"\n", desc->bd_iov_count,
 182                desc->bd_nob, desc->bd_portal,
 183                libcfs_id2str(conn->c_peer), xid);
 184
 185         /* Network is about to get at the memory */
 186         desc->bd_network_rw = 1;
 187
 188         if (desc->bd_type == BULK_PUT_SOURCE)
 189                 rc = LNetPut (conn->c_self, desc->bd_md_h, LNET_ACK_REQ,
 190                               conn->c_peer, desc->bd_portal, xid, 0, 0);
 191         else
 192                 rc = LNetGet (conn->c_self, desc->bd_md_h,
 193                               conn->c_peer, desc->bd_portal, xid, 0);
 194
 195         if (rc != 0) {
 196                 /* Can't send, so we unlink the MD bound above.  The UNLINK
 197                  * event this creates will signal completion with failure,
 198                  * so we return SUCCESS here! */
 199                 CERROR("Transfer(%s, %d, "LPX64") failed: %d\n",
 200                        libcfs_id2str(conn->c_peer), desc->bd_portal, xid, rc);
 201                 rc2 = LNetMDUnlink(desc->bd_md_h);
 202                 LASSERT (rc2 == 0);
 203         }
 204
 205         RETURN(0);
 206 }
 207 EXPORT_SYMBOL(ptlrpc_start_bulk_transfer);
 208
 209 /**
 210  * Server side bulk abort. Idempotent. Not thread-safe (i.e. only
 211  * serialises with completion callback)
 212  */
 213 void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc)
 214 {
 215         struct l_wait_info       lwi;
 216         int                      rc;
 217
 218         LASSERT(!cfs_in_interrupt());           /* might sleep */
 219
 220         if (!ptlrpc_server_bulk_active(desc))   /* completed or */
 221                 return;                         /* never started */
 222
 223         /* We used to poison the pages with 0xab here because we did not want to
 224          * send any meaningful data over the wire for evicted clients (bug 9297)
 225          * However, this is no longer safe now that we use the page cache on the
 226          * OSS (bug 20560) */
 227
 228         /* The unlink ensures the callback happens ASAP and is the last
 229          * one.  If it fails, it must be because completion just happened,
 230          * but we must still l_wait_event() in this case, to give liblustre
 231          * a chance to run server_bulk_callback()*/
 232
 233         LNetMDUnlink(desc->bd_md_h);
 234
 235         for (;;) {
 236                 /* Network access will complete in finite time but the HUGE
 237                  * timeout lets us CWARN for visibility of sluggish NALs */
 238                 lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
 239                                            cfs_time_seconds(1), NULL, NULL);
 240                 rc = l_wait_event(desc->bd_waitq,
 241                                   !ptlrpc_server_bulk_active(desc), &lwi);
 242                 if (rc == 0)
 243                         return;
 244
 245                 LASSERT(rc == -ETIMEDOUT);
 246                 CWARN("Unexpectedly long timeout: desc %p\n", desc);
 247         }
 248 }
 249 EXPORT_SYMBOL(ptlrpc_abort_bulk);
 250 #endif /* HAVE_SERVER_SUPPORT */
 251
 252 /**
 253  * Register bulk for later transfer
 254  * Returns 0 on success or error code.
 255  */
 256 int ptlrpc_register_bulk(struct ptlrpc_request *req)
 257 {
 258         struct ptlrpc_bulk_desc *desc = req->rq_bulk;
 259         lnet_process_id_t peer;
 260         int rc;
 261         int rc2;
 262         lnet_handle_me_t  me_h;
 263         lnet_md_t         md;
 264         ENTRY;
 265
 266         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_GET_NET))
 267                 RETURN(0);
 268
 269         /* NB no locking required until desc is on the network */
 270         LASSERT (desc->bd_nob > 0);
 271         LASSERT (!desc->bd_network_rw);
 272         LASSERT (desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES);
 273         LASSERT (desc->bd_req != NULL);
 274         LASSERT (desc->bd_type == BULK_PUT_SINK ||
 275                  desc->bd_type == BULK_GET_SOURCE);
 276
 277         desc->bd_success = 0;
 278
 279         peer = desc->bd_import->imp_connection->c_peer;
 280
 281         md.user_ptr = &desc->bd_cbid;
 282         md.eq_handle = ptlrpc_eq_h;
 283         md.threshold = 1;                       /* PUT or GET */
 284         md.options = PTLRPC_MD_OPTIONS |
 285                      ((desc->bd_type == BULK_GET_SOURCE) ?
 286                       LNET_MD_OP_GET : LNET_MD_OP_PUT);
 287         ptlrpc_fill_bulk_md(&md, desc);
 288
 289         LASSERT (desc->bd_cbid.cbid_fn == client_bulk_callback);
 290         LASSERT (desc->bd_cbid.cbid_arg == desc);
 291
 292         /* XXX Registering the same xid on retried bulk makes my head
 293          * explode trying to understand how the original request's bulk
 294          * might interfere with the retried request -eeb
 295          * On the other hand replaying with the same xid is fine, since
 296          * we are guaranteed old request have completed. -green */
 297         LASSERTF(!(desc->bd_registered &&
 298                  req->rq_send_state != LUSTRE_IMP_REPLAY) ||
 299                  req->rq_xid != desc->bd_last_xid,
 300                  "registered: %d  rq_xid: "LPU64" bd_last_xid: "LPU64"\n",
 301                  desc->bd_registered, req->rq_xid, desc->bd_last_xid);
 302         desc->bd_registered = 1;
 303         desc->bd_last_xid = req->rq_xid;
 304
 305         rc = LNetMEAttach(desc->bd_portal, peer,
 306                          req->rq_xid, 0, LNET_UNLINK, LNET_INS_AFTER, &me_h);
 307         if (rc != 0) {
 308                 CERROR("LNetMEAttach failed: %d\n", rc);
 309                 LASSERT (rc == -ENOMEM);
 310                 RETURN (-ENOMEM);
 311         }
 312
 313         /* About to let the network at it... */
 314         desc->bd_network_rw = 1;
 315         rc = LNetMDAttach(me_h, md, LNET_UNLINK, &desc->bd_md_h);
 316         if (rc != 0) {
 317                 CERROR("LNetMDAttach failed: %d\n", rc);
 318                 LASSERT (rc == -ENOMEM);
 319                 desc->bd_network_rw = 0;
 320                 rc2 = LNetMEUnlink (me_h);
 321                 LASSERT (rc2 == 0);
 322                 RETURN (-ENOMEM);
 323         }
 324
 325         CDEBUG(D_NET, "Setup bulk %s buffers: %u pages %u bytes, xid "LPU64", "
 326                "portal %u\n",
 327                desc->bd_type == BULK_GET_SOURCE ? "get-source" : "put-sink",
 328                desc->bd_iov_count, desc->bd_nob,
 329                req->rq_xid, desc->bd_portal);
 330         RETURN(0);
 331 }
 332 EXPORT_SYMBOL(ptlrpc_register_bulk);
 333
 334 /**
 335  * Disconnect a bulk desc from the network. Idempotent. Not
 336  * thread-safe (i.e. only interlocks with completion callback).
 337  * Returns 1 on success or 0 if network unregistration failed for whatever
 338  * reason.
 339  */
 340 int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async)
 341 {
 342         struct ptlrpc_bulk_desc *desc = req->rq_bulk;
 343         cfs_waitq_t             *wq;
 344         struct l_wait_info       lwi;
 345         int                      rc;
 346         ENTRY;
 347
 348         LASSERT(!cfs_in_interrupt());     /* might sleep */
 349
 350         /* Let's setup deadline for reply unlink. */
 351         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) &&
 352             async && req->rq_bulk_deadline == 0)
 353                 req->rq_bulk_deadline = cfs_time_current_sec() + LONG_UNLINK;
 354
 355         if (!ptlrpc_client_bulk_active(req))  /* completed or */
 356                 RETURN(1);                    /* never registered */
 357
 358         LASSERT(desc->bd_req == req);  /* bd_req NULL until registered */
 359
 360         /* the unlink ensures the callback happens ASAP and is the last
 361          * one.  If it fails, it must be because completion just happened,
 362          * but we must still l_wait_event() in this case to give liblustre
 363          * a chance to run client_bulk_callback() */
 364
 365         LNetMDUnlink(desc->bd_md_h);
 366
 367         if (!ptlrpc_client_bulk_active(req))  /* completed or */
 368                 RETURN(1);                    /* never registered */
 369
 370         /* Move to "Unregistering" phase as bulk was not unlinked yet. */
 371         ptlrpc_rqphase_move(req, RQ_PHASE_UNREGISTERING);
 372
 373         /* Do not wait for unlink to finish. */
 374         if (async)
 375                 RETURN(0);
 376
 377         if (req->rq_set != NULL)
 378                 wq = &req->rq_set->set_waitq;
 379         else
 380                 wq = &req->rq_reply_waitq;
 381
 382         for (;;) {
 383                 /* Network access will complete in finite time but the HUGE
 384                  * timeout lets us CWARN for visibility of sluggish NALs */
 385                 lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
 386                                            cfs_time_seconds(1), NULL, NULL);
 387                 rc = l_wait_event(*wq, !ptlrpc_client_bulk_active(req), &lwi);
 388                 if (rc == 0) {
 389                         ptlrpc_rqphase_move(req, req->rq_next_phase);
 390                         RETURN(1);
 391                 }
 392
 393                 LASSERT(rc == -ETIMEDOUT);
 394                 DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p",
 395                           desc);
 396         }
 397         RETURN(0);
 398 }
 399 EXPORT_SYMBOL(ptlrpc_unregister_bulk);
 400
 401 static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags)
 402 {
 403         struct ptlrpc_service_part      *svcpt = req->rq_rqbd->rqbd_svcpt;
 404         struct ptlrpc_service           *svc = svcpt->scp_service;
 405         int service_time = max_t(int, cfs_time_current_sec() -
 406                                  req->rq_arrival_time.tv_sec, 1);
 407
 408         if (!(flags & PTLRPC_REPLY_EARLY) &&
 409             (req->rq_type != PTL_RPC_MSG_ERR) &&
 410             (req->rq_reqmsg != NULL) &&
 411             !(lustre_msg_get_flags(req->rq_reqmsg) &
 412               (MSG_RESENT | MSG_REPLAY |
 413                MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) {
 414                 /* early replies, errors and recovery requests don't count
 415                  * toward our service time estimate */
 416                 int oldse = at_measured(&svcpt->scp_at_estimate, service_time);
 417
 418                 if (oldse != 0) {
 419                         DEBUG_REQ(D_ADAPTTO, req,
 420                                   "svc %s changed estimate from %d to %d",
 421                                   svc->srv_name, oldse,
 422                                   at_get(&svcpt->scp_at_estimate));
 423                 }
 424         }
 425         /* Report actual service time for client latency calc */
 426         lustre_msg_set_service_time(req->rq_repmsg, service_time);
 427         /* Report service time estimate for future client reqs, but report 0
 428          * (to be ignored by client) if it's a error reply during recovery.
 429          * (bz15815) */
 430         if (req->rq_type == PTL_RPC_MSG_ERR &&
 431             (req->rq_export == NULL || req->rq_export->exp_obd->obd_recovering))
 432                 lustre_msg_set_timeout(req->rq_repmsg, 0);
 433         else
 434                 lustre_msg_set_timeout(req->rq_repmsg,
 435                                        at_get(&svcpt->scp_at_estimate));
 436
 437         if (req->rq_reqmsg &&
 438             !(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) {
 439                 CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x "
 440                        "req_flags=%#x magic=%d:%x/%x len=%d\n",
 441                        flags, lustre_msg_get_flags(req->rq_reqmsg),
 442                        lustre_msg_is_v1(req->rq_reqmsg),
 443                        lustre_msg_get_magic(req->rq_reqmsg),
 444                        lustre_msg_get_magic(req->rq_repmsg), req->rq_replen);
 445         }
 446 }
 447
 448 /**
 449  * Send request reply from request \a req reply buffer.
 450  * \a flags defines reply types
 451  * Returns 0 on sucess or error code
 452  */
 453 int ptlrpc_send_reply(struct ptlrpc_request *req, int flags)
 454 {
 455         struct ptlrpc_reply_state *rs = req->rq_reply_state;
 456         struct ptlrpc_connection  *conn;
 457         int                        rc;
 458
 459         /* We must already have a reply buffer (only ptlrpc_error() may be
 460          * called without one). The reply generated by sptlrpc layer (e.g.
 461          * error notify, etc.) might have NULL rq->reqmsg; Otherwise we must
 462          * have a request buffer which is either the actual (swabbed) incoming
 463          * request, or a saved copy if this is a req saved in
 464          * target_queue_final_reply().
 465          */
 466         LASSERT (req->rq_no_reply == 0);
 467         LASSERT (req->rq_reqbuf != NULL);
 468         LASSERT (rs != NULL);
 469         LASSERT ((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult);
 470         LASSERT (req->rq_repmsg != NULL);
 471         LASSERT (req->rq_repmsg == rs->rs_msg);
 472         LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback);
 473         LASSERT (rs->rs_cb_id.cbid_arg == rs);
 474
 475         /* There may be no rq_export during failover */
 476
 477         if (unlikely(req->rq_export && req->rq_export->exp_obd &&
 478                      req->rq_export->exp_obd->obd_fail)) {
 479                 /* Failed obd's only send ENODEV */
 480                 req->rq_type = PTL_RPC_MSG_ERR;
 481                 req->rq_status = -ENODEV;
 482                 CDEBUG(D_HA, "sending ENODEV from failed obd %d\n",
 483                        req->rq_export->exp_obd->obd_minor);
 484         }
 485
 486         /* In order to keep interoprability with the client (< 2.3) which
 487          * doesn't have pb_jobid in ptlrpc_body, We have to shrink the
 488          * ptlrpc_body in reply buffer to ptlrpc_body_v2, otherwise, the
 489          * reply buffer on client will be overflow.
 490          *
 491          * XXX Remove this whenver we drop the interoprability with such client.
 492          */
 493         req->rq_replen = lustre_shrink_msg(req->rq_repmsg, 0,
 494                                            sizeof(struct ptlrpc_body_v2), 1);
 495
 496         if (req->rq_type != PTL_RPC_MSG_ERR)
 497                 req->rq_type = PTL_RPC_MSG_REPLY;
 498
 499         lustre_msg_set_type(req->rq_repmsg, req->rq_type);
 500         lustre_msg_set_status(req->rq_repmsg, req->rq_status);
 501         lustre_msg_set_opc(req->rq_repmsg,
 502                 req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : 0);
 503
 504         target_pack_pool_reply(req);
 505
 506         ptlrpc_at_set_reply(req, flags);
 507
 508         if (req->rq_export == NULL || req->rq_export->exp_connection == NULL)
 509                 conn = ptlrpc_connection_get(req->rq_peer, req->rq_self, NULL);
 510         else
 511                 conn = ptlrpc_connection_addref(req->rq_export->exp_connection);
 512
 513         if (unlikely(conn == NULL)) {
 514                 CERROR("not replying on NULL connection\n"); /* bug 9635 */
 515                 return -ENOTCONN;
 516         }
 517         ptlrpc_rs_addref(rs);                   /* +1 ref for the network */
 518
 519         rc = sptlrpc_svc_wrap_reply(req);
 520         if (unlikely(rc))
 521                 goto out;
 522
 523         req->rq_sent = cfs_time_current_sec();
 524
 525         rc = ptl_send_buf (&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len,
 526                            (rs->rs_difficult && !rs->rs_no_ack) ?
 527                            LNET_ACK_REQ : LNET_NOACK_REQ,
 528                            &rs->rs_cb_id, conn,
 529                            ptlrpc_req2svc(req)->srv_rep_portal,
 530                            req->rq_xid, req->rq_reply_off);
 531 out:
 532         if (unlikely(rc != 0))
 533                 ptlrpc_req_drop_rs(req);
 534         ptlrpc_connection_put(conn);
 535         return rc;
 536 }
 537 EXPORT_SYMBOL(ptlrpc_send_reply);
 538
 539 int ptlrpc_reply (struct ptlrpc_request *req)
 540 {
 541         if (req->rq_no_reply)
 542                 return 0;
 543         else
 544                 return (ptlrpc_send_reply(req, 0));
 545 }
 546 EXPORT_SYMBOL(ptlrpc_reply);
 547
 548 /**
 549  * For request \a req send an error reply back. Create empty
 550  * reply buffers if necessary.
 551  */
 552 int ptlrpc_send_error(struct ptlrpc_request *req, int may_be_difficult)
 553 {
 554         int rc;
 555         ENTRY;
 556
 557         if (req->rq_no_reply)
 558                 RETURN(0);
 559
 560         if (!req->rq_repmsg) {
 561                 rc = lustre_pack_reply(req, 1, NULL, NULL);
 562                 if (rc)
 563                         RETURN(rc);
 564         }
 565
 566         if (req->rq_status != -ENOSPC && req->rq_status != -EACCES &&
 567             req->rq_status != -EPERM && req->rq_status != -ENOENT &&
 568             req->rq_status != -EINPROGRESS && req->rq_status != -EDQUOT)
 569                 req->rq_type = PTL_RPC_MSG_ERR;
 570
 571         rc = ptlrpc_send_reply(req, may_be_difficult);
 572         RETURN(rc);
 573 }
 574 EXPORT_SYMBOL(ptlrpc_send_error);
 575
 576 int ptlrpc_error(struct ptlrpc_request *req)
 577 {
 578         return ptlrpc_send_error(req, 0);
 579 }
 580 EXPORT_SYMBOL(ptlrpc_error);
 581
 582 /**
 583  * Send request \a request.
 584  * if \a noreply is set, don't expect any reply back and don't set up
 585  * reply buffers.
 586  * Returns 0 on success or error code.
 587  */
 588 int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
 589 {
 590         int rc;
 591         int rc2;
 592         int mpflag = 0;
 593         struct ptlrpc_connection *connection;
 594         lnet_handle_me_t  reply_me_h;
 595         lnet_md_t         reply_md;
 596         struct obd_device *obd = request->rq_import->imp_obd;
 597         ENTRY;
 598
 599         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC))
 600                 RETURN(0);
 601
 602         LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST);
 603         LASSERT(request->rq_wait_ctx == 0);
 604
 605         /* If this is a re-transmit, we're required to have disengaged
 606          * cleanly from the previous attempt */
 607         LASSERT(!request->rq_receiving_reply);
 608
 609         if (request->rq_import->imp_obd &&
 610             request->rq_import->imp_obd->obd_fail) {
 611                 CDEBUG(D_HA, "muting rpc for failed imp obd %s\n",
 612                        request->rq_import->imp_obd->obd_name);
 613                 /* this prevents us from waiting in ptlrpc_queue_wait */
 614                 request->rq_err = 1;
 615                 request->rq_status = -ENODEV;
 616                 RETURN(-ENODEV);
 617         }
 618
 619         connection = request->rq_import->imp_connection;
 620
 621         lustre_msg_set_handle(request->rq_reqmsg,
 622                               &request->rq_import->imp_remote_handle);
 623         lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST);
 624         lustre_msg_set_conn_cnt(request->rq_reqmsg,
 625                                 request->rq_import->imp_conn_cnt);
 626         lustre_msghdr_set_flags(request->rq_reqmsg,
 627                                 request->rq_import->imp_msghdr_flags);
 628
 629         if (request->rq_resend)
 630                 lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT);
 631
 632         if (request->rq_memalloc)
 633                 mpflag = cfs_memory_pressure_get_and_set();
 634
 635         rc = sptlrpc_cli_wrap_request(request);
 636         if (rc)
 637                 GOTO(out, rc);
 638
 639         /* bulk register should be done after wrap_request() */
 640         if (request->rq_bulk != NULL) {
 641                 rc = ptlrpc_register_bulk (request);
 642                 if (rc != 0)
 643                         GOTO(out, rc);
 644         }
 645
 646         if (!noreply) {
 647                 LASSERT (request->rq_replen != 0);
 648                 if (request->rq_repbuf == NULL) {
 649                         LASSERT(request->rq_repdata == NULL);
 650                         LASSERT(request->rq_repmsg == NULL);
 651                         rc = sptlrpc_cli_alloc_repbuf(request,
 652                                                       request->rq_replen);
 653                         if (rc) {
 654                                 /* this prevents us from looping in
 655                                  * ptlrpc_queue_wait */
 656                                 request->rq_err = 1;
 657                                 request->rq_status = rc;
 658                                 GOTO(cleanup_bulk, rc);
 659                         }
 660                 } else {
 661                         request->rq_repdata = NULL;
 662                         request->rq_repmsg = NULL;
 663                 }
 664
 665                 rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/
 666                                   connection->c_peer, request->rq_xid, 0,
 667                                   LNET_UNLINK, LNET_INS_AFTER, &reply_me_h);
 668                 if (rc != 0) {
 669                         CERROR("LNetMEAttach failed: %d\n", rc);
 670                         LASSERT (rc == -ENOMEM);
 671                         GOTO(cleanup_bulk, rc = -ENOMEM);
 672                 }
 673         }
 674
 675         spin_lock(&request->rq_lock);
 676         /* If the MD attach succeeds, there _will_ be a reply_in callback */
 677         request->rq_receiving_reply = !noreply;
 678         /* We are responsible for unlinking the reply buffer */
 679         request->rq_must_unlink = !noreply;
 680         /* Clear any flags that may be present from previous sends. */
 681         request->rq_replied = 0;
 682         request->rq_err = 0;
 683         request->rq_timedout = 0;
 684         request->rq_net_err = 0;
 685         request->rq_resend = 0;
 686         request->rq_restart = 0;
 687         request->rq_reply_truncate = 0;
 688         spin_unlock(&request->rq_lock);
 689
 690         if (!noreply) {
 691                 reply_md.start     = request->rq_repbuf;
 692                 reply_md.length    = request->rq_repbuf_len;
 693                 /* Allow multiple early replies */
 694                 reply_md.threshold = LNET_MD_THRESH_INF;
 695                 /* Manage remote for early replies */
 696                 reply_md.options   = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT |
 697                         LNET_MD_MANAGE_REMOTE |
 698                         LNET_MD_TRUNCATE; /* allow to make EOVERFLOW error */;
 699                 reply_md.user_ptr  = &request->rq_reply_cbid;
 700                 reply_md.eq_handle = ptlrpc_eq_h;
 701
 702                 /* We must see the unlink callback to unset rq_must_unlink,
 703                    so we can't auto-unlink */
 704                 rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN,
 705                                   &request->rq_reply_md_h);
 706                 if (rc != 0) {
 707                         CERROR("LNetMDAttach failed: %d\n", rc);
 708                         LASSERT (rc == -ENOMEM);
 709                         spin_lock(&request->rq_lock);
 710                         /* ...but the MD attach didn't succeed... */
 711                         request->rq_receiving_reply = 0;
 712                         spin_unlock(&request->rq_lock);
 713                         GOTO(cleanup_me, rc = -ENOMEM);
 714                 }
 715
 716                 CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64
 717                        ", portal %u\n",
 718                        request->rq_repbuf_len, request->rq_xid,
 719                        request->rq_reply_portal);
 720         }
 721
 722         /* add references on request for request_out_callback */
 723         ptlrpc_request_addref(request);
 724         if (obd->obd_svc_stats != NULL)
 725                 lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR,
 726                         cfs_atomic_read(&request->rq_import->imp_inflight));
 727
 728         OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5);
 729
 730         cfs_gettimeofday(&request->rq_arrival_time);
 731         request->rq_sent = cfs_time_current_sec();
 732         /* We give the server rq_timeout secs to process the req, and
 733            add the network latency for our local timeout. */
 734         request->rq_deadline = request->rq_sent + request->rq_timeout +
 735                 ptlrpc_at_get_net_latency(request);
 736
 737         ptlrpc_pinger_sending_on_import(request->rq_import);
 738
 739         DEBUG_REQ(D_INFO, request, "send flg=%x",
 740                   lustre_msg_get_flags(request->rq_reqmsg));
 741         rc = ptl_send_buf(&request->rq_req_md_h,
 742                           request->rq_reqbuf, request->rq_reqdata_len,
 743                           LNET_NOACK_REQ, &request->rq_req_cbid,
 744                           connection,
 745                           request->rq_request_portal,
 746                           request->rq_xid, 0);
 747         if (rc == 0)
 748                 GOTO(out, rc);
 749
 750         ptlrpc_req_finished(request);
 751         if (noreply)
 752                 GOTO(out, rc);
 753
 754  cleanup_me:
 755         /* MEUnlink is safe; the PUT didn't even get off the ground, and
 756          * nobody apart from the PUT's target has the right nid+XID to
 757          * access the reply buffer. */
 758         rc2 = LNetMEUnlink(reply_me_h);
 759         LASSERT (rc2 == 0);
 760         /* UNLINKED callback called synchronously */
 761         LASSERT(!request->rq_receiving_reply);
 762
 763  cleanup_bulk:
 764         /* We do sync unlink here as there was no real transfer here so
 765          * the chance to have long unlink to sluggish net is smaller here. */
 766         ptlrpc_unregister_bulk(request, 0);
 767  out:
 768         if (request->rq_memalloc)
 769                 cfs_memory_pressure_restore(mpflag);
 770         return rc;
 771 }
 772 EXPORT_SYMBOL(ptl_send_rpc);
 773
 774 /**
 775  * Register request buffer descriptor for request receiving.
 776  */
 777 int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd)
 778 {
 779         struct ptlrpc_service     *service = rqbd->rqbd_svcpt->scp_service;
 780         static lnet_process_id_t  match_id = {LNET_NID_ANY, LNET_PID_ANY};
 781         int                       rc;
 782         lnet_md_t                 md;
 783         lnet_handle_me_t          me_h;
 784
 785         CDEBUG(D_NET, "LNetMEAttach: portal %d\n",
 786                service->srv_req_portal);
 787
 788         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_RQBD))
 789                 return (-ENOMEM);
 790
 791         /* NB: CPT affinity service should use new LNet flag LNET_INS_LOCAL,
 792          * which means buffer can only be attached on local CPT, and LND
 793          * threads can find it by grabbing a local lock */
 794         rc = LNetMEAttach(service->srv_req_portal,
 795                           match_id, 0, ~0, LNET_UNLINK,
 796                           rqbd->rqbd_svcpt->scp_cpt >= 0 ?
 797                           LNET_INS_LOCAL : LNET_INS_AFTER, &me_h);
 798         if (rc != 0) {
 799                 CERROR("LNetMEAttach failed: %d\n", rc);
 800                 return (-ENOMEM);
 801         }
 802
 803         LASSERT(rqbd->rqbd_refcount == 0);
 804         rqbd->rqbd_refcount = 1;
 805
 806         md.start     = rqbd->rqbd_buffer;
 807         md.length    = service->srv_buf_size;
 808         md.max_size  = service->srv_max_req_size;
 809         md.threshold = LNET_MD_THRESH_INF;
 810         md.options   = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | LNET_MD_MAX_SIZE;
 811         md.user_ptr  = &rqbd->rqbd_cbid;
 812         md.eq_handle = ptlrpc_eq_h;
 813
 814         rc = LNetMDAttach(me_h, md, LNET_UNLINK, &rqbd->rqbd_md_h);
 815         if (rc == 0)
 816                 return (0);
 817
 818         CERROR("LNetMDAttach failed: %d; \n", rc);
 819         LASSERT (rc == -ENOMEM);
 820         rc = LNetMEUnlink (me_h);
 821         LASSERT (rc == 0);
 822         rqbd->rqbd_refcount = 0;
 823
 824         return (-ENOMEM);
 825 }