lustre/osc/osc_request.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  */
  32
  33 #define DEBUG_SUBSYSTEM S_OSC
  34
  35 #include <lprocfs_status.h>
  36 #include <lustre_debug.h>
  37 #include <lustre_dlm.h>
  38 #include <lustre_fid.h>
  39 #include <lustre_ha.h>
  40 #include <uapi/linux/lustre/lustre_ioctl.h>
  41 #include <lustre_net.h>
  42 #include <lustre_obdo.h>
  43 #include <uapi/linux/lustre/lustre_param.h>
  44 #include <obd.h>
  45 #include <obd_cksum.h>
  46 #include <obd_class.h>
  47 #include <lustre_osc.h>
  48
  49 #include "osc_internal.h"
  50
  51 atomic_t osc_pool_req_count;
  52 unsigned int osc_reqpool_maxreqcount;
  53 struct ptlrpc_request_pool *osc_rq_pool;
  54
  55 /* max memory used for request pool, unit is MB */
  56 static unsigned int osc_reqpool_mem_max = 5;
  57 module_param(osc_reqpool_mem_max, uint, 0444);
  58
  59 #define osc_grant_args osc_brw_async_args
  60
  61 struct osc_setattr_args {
  62         struct obdo             *sa_oa;
  63         obd_enqueue_update_f     sa_upcall;
  64         void                    *sa_cookie;
  65 };
  66
  67 struct osc_fsync_args {
  68         struct osc_object       *fa_obj;
  69         struct obdo             *fa_oa;
  70         obd_enqueue_update_f    fa_upcall;
  71         void                    *fa_cookie;
  72 };
  73
  74 struct osc_ladvise_args {
  75         struct obdo             *la_oa;
  76         obd_enqueue_update_f     la_upcall;
  77         void                    *la_cookie;
  78 };
  79
  80 static void osc_release_ppga(struct brw_page **ppga, size_t count);
  81 static int brw_interpret(const struct lu_env *env, struct ptlrpc_request *req,
  82                          void *data, int rc);
  83
  84 void osc_pack_req_body(struct ptlrpc_request *req, struct obdo *oa)
  85 {
  86         struct ost_body *body;
  87
  88         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
  89         LASSERT(body);
  90
  91         lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
  92 }
  93
  94 static int osc_getattr(const struct lu_env *env, struct obd_export *exp,
  95                        struct obdo *oa)
  96 {
  97         struct ptlrpc_request   *req;
  98         struct ost_body         *body;
  99         int                      rc;
 100
 101         ENTRY;
 102         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR);
 103         if (req == NULL)
 104                 RETURN(-ENOMEM);
 105
 106         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR);
 107         if (rc) {
 108                 ptlrpc_request_free(req);
 109                 RETURN(rc);
 110         }
 111
 112         osc_pack_req_body(req, oa);
 113
 114         ptlrpc_request_set_replen(req);
 115
 116         rc = ptlrpc_queue_wait(req);
 117         if (rc)
 118                 GOTO(out, rc);
 119
 120         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
 121         if (body == NULL)
 122                 GOTO(out, rc = -EPROTO);
 123
 124         CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
 125         lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa);
 126
 127         oa->o_blksize = cli_brw_size(exp->exp_obd);
 128         oa->o_valid |= OBD_MD_FLBLKSZ;
 129
 130         EXIT;
 131 out:
 132         ptlrpc_req_finished(req);
 133
 134         return rc;
 135 }
 136
 137 static int osc_setattr(const struct lu_env *env, struct obd_export *exp,
 138                        struct obdo *oa)
 139 {
 140         struct ptlrpc_request   *req;
 141         struct ost_body         *body;
 142         int                      rc;
 143
 144         ENTRY;
 145         LASSERT(oa->o_valid & OBD_MD_FLGROUP);
 146
 147         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR);
 148         if (req == NULL)
 149                 RETURN(-ENOMEM);
 150
 151         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR);
 152         if (rc) {
 153                 ptlrpc_request_free(req);
 154                 RETURN(rc);
 155         }
 156
 157         osc_pack_req_body(req, oa);
 158
 159         ptlrpc_request_set_replen(req);
 160
 161         rc = ptlrpc_queue_wait(req);
 162         if (rc)
 163                 GOTO(out, rc);
 164
 165         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
 166         if (body == NULL)
 167                 GOTO(out, rc = -EPROTO);
 168
 169         lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa);
 170
 171         EXIT;
 172 out:
 173         ptlrpc_req_finished(req);
 174
 175         RETURN(rc);
 176 }
 177
 178 static int osc_setattr_interpret(const struct lu_env *env,
 179                                  struct ptlrpc_request *req,
 180                                  struct osc_setattr_args *sa, int rc)
 181 {
 182         struct ost_body *body;
 183         ENTRY;
 184
 185         if (rc != 0)
 186                 GOTO(out, rc);
 187
 188         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
 189         if (body == NULL)
 190                 GOTO(out, rc = -EPROTO);
 191
 192         lustre_get_wire_obdo(&req->rq_import->imp_connect_data, sa->sa_oa,
 193                              &body->oa);
 194 out:
 195         rc = sa->sa_upcall(sa->sa_cookie, rc);
 196         RETURN(rc);
 197 }
 198
 199 int osc_setattr_async(struct obd_export *exp, struct obdo *oa,
 200                       obd_enqueue_update_f upcall, void *cookie,
 201                       struct ptlrpc_request_set *rqset)
 202 {
 203         struct ptlrpc_request   *req;
 204         struct osc_setattr_args *sa;
 205         int                      rc;
 206
 207         ENTRY;
 208
 209         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR);
 210         if (req == NULL)
 211                 RETURN(-ENOMEM);
 212
 213         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR);
 214         if (rc) {
 215                 ptlrpc_request_free(req);
 216                 RETURN(rc);
 217         }
 218
 219         osc_pack_req_body(req, oa);
 220
 221         ptlrpc_request_set_replen(req);
 222
 223         /* do mds to ost setattr asynchronously */
 224         if (!rqset) {
 225                 /* Do not wait for response. */
 226                 ptlrpcd_add_req(req);
 227         } else {
 228                 req->rq_interpret_reply =
 229                         (ptlrpc_interpterer_t)osc_setattr_interpret;
 230
 231                 CLASSERT(sizeof(*sa) <= sizeof(req->rq_async_args));
 232                 sa = ptlrpc_req_async_args(req);
 233                 sa->sa_oa = oa;
 234                 sa->sa_upcall = upcall;
 235                 sa->sa_cookie = cookie;
 236
 237                 if (rqset == PTLRPCD_SET)
 238                         ptlrpcd_add_req(req);
 239                 else
 240                         ptlrpc_set_add_req(rqset, req);
 241         }
 242
 243         RETURN(0);
 244 }
 245
 246 static int osc_ladvise_interpret(const struct lu_env *env,
 247                                  struct ptlrpc_request *req,
 248                                  void *arg, int rc)
 249 {
 250         struct osc_ladvise_args *la = arg;
 251         struct ost_body *body;
 252         ENTRY;
 253
 254         if (rc != 0)
 255                 GOTO(out, rc);
 256
 257         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
 258         if (body == NULL)
 259                 GOTO(out, rc = -EPROTO);
 260
 261         *la->la_oa = body->oa;
 262 out:
 263         rc = la->la_upcall(la->la_cookie, rc);
 264         RETURN(rc);
 265 }
 266
 267 /**
 268  * If rqset is NULL, do not wait for response. Upcall and cookie could also
 269  * be NULL in this case
 270  */
 271 int osc_ladvise_base(struct obd_export *exp, struct obdo *oa,
 272                      struct ladvise_hdr *ladvise_hdr,
 273                      obd_enqueue_update_f upcall, void *cookie,
 274                      struct ptlrpc_request_set *rqset)
 275 {
 276         struct ptlrpc_request   *req;
 277         struct ost_body         *body;
 278         struct osc_ladvise_args *la;
 279         int                      rc;
 280         struct lu_ladvise       *req_ladvise;
 281         struct lu_ladvise       *ladvise = ladvise_hdr->lah_advise;
 282         int                      num_advise = ladvise_hdr->lah_count;
 283         struct ladvise_hdr      *req_ladvise_hdr;
 284         ENTRY;
 285
 286         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_LADVISE);
 287         if (req == NULL)
 288                 RETURN(-ENOMEM);
 289
 290         req_capsule_set_size(&req->rq_pill, &RMF_OST_LADVISE, RCL_CLIENT,
 291                              num_advise * sizeof(*ladvise));
 292         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_LADVISE);
 293         if (rc != 0) {
 294                 ptlrpc_request_free(req);
 295                 RETURN(rc);
 296         }
 297         req->rq_request_portal = OST_IO_PORTAL;
 298         ptlrpc_at_set_req_timeout(req);
 299
 300         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
 301         LASSERT(body);
 302         lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa,
 303                              oa);
 304
 305         req_ladvise_hdr = req_capsule_client_get(&req->rq_pill,
 306                                                  &RMF_OST_LADVISE_HDR);
 307         memcpy(req_ladvise_hdr, ladvise_hdr, sizeof(*ladvise_hdr));
 308
 309         req_ladvise = req_capsule_client_get(&req->rq_pill, &RMF_OST_LADVISE);
 310         memcpy(req_ladvise, ladvise, sizeof(*ladvise) * num_advise);
 311         ptlrpc_request_set_replen(req);
 312
 313         if (rqset == NULL) {
 314                 /* Do not wait for response. */
 315                 ptlrpcd_add_req(req);
 316                 RETURN(0);
 317         }
 318
 319         req->rq_interpret_reply = osc_ladvise_interpret;
 320         CLASSERT(sizeof(*la) <= sizeof(req->rq_async_args));
 321         la = ptlrpc_req_async_args(req);
 322         la->la_oa = oa;
 323         la->la_upcall = upcall;
 324         la->la_cookie = cookie;
 325
 326         if (rqset == PTLRPCD_SET)
 327                 ptlrpcd_add_req(req);
 328         else
 329                 ptlrpc_set_add_req(rqset, req);
 330
 331         RETURN(0);
 332 }
 333
 334 static int osc_create(const struct lu_env *env, struct obd_export *exp,
 335                       struct obdo *oa)
 336 {
 337         struct ptlrpc_request *req;
 338         struct ost_body       *body;
 339         int                    rc;
 340         ENTRY;
 341
 342         LASSERT(oa != NULL);
 343         LASSERT(oa->o_valid & OBD_MD_FLGROUP);
 344         LASSERT(fid_seq_is_echo(ostid_seq(&oa->o_oi)));
 345
 346         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_CREATE);
 347         if (req == NULL)
 348                 GOTO(out, rc = -ENOMEM);
 349
 350         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE);
 351         if (rc) {
 352                 ptlrpc_request_free(req);
 353                 GOTO(out, rc);
 354         }
 355
 356         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
 357         LASSERT(body);
 358
 359         lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
 360
 361         ptlrpc_request_set_replen(req);
 362
 363         rc = ptlrpc_queue_wait(req);
 364         if (rc)
 365                 GOTO(out_req, rc);
 366
 367         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
 368         if (body == NULL)
 369                 GOTO(out_req, rc = -EPROTO);
 370
 371         CDEBUG(D_INFO, "oa flags %x\n", oa->o_flags);
 372         lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa);
 373
 374         oa->o_blksize = cli_brw_size(exp->exp_obd);
 375         oa->o_valid |= OBD_MD_FLBLKSZ;
 376
 377         CDEBUG(D_HA, "transno: %lld\n",
 378                lustre_msg_get_transno(req->rq_repmsg));
 379 out_req:
 380         ptlrpc_req_finished(req);
 381 out:
 382         RETURN(rc);
 383 }
 384
 385 int osc_punch_send(struct obd_export *exp, struct obdo *oa,
 386                    obd_enqueue_update_f upcall, void *cookie)
 387 {
 388         struct ptlrpc_request *req;
 389         struct osc_setattr_args *sa;
 390         struct obd_import *imp = class_exp2cliimp(exp);
 391         struct ost_body *body;
 392         int rc;
 393
 394         ENTRY;
 395
 396         req = ptlrpc_request_alloc(imp, &RQF_OST_PUNCH);
 397         if (req == NULL)
 398                 RETURN(-ENOMEM);
 399
 400         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH);
 401         if (rc < 0) {
 402                 ptlrpc_request_free(req);
 403                 RETURN(rc);
 404         }
 405
 406         osc_set_io_portal(req);
 407
 408         ptlrpc_at_set_req_timeout(req);
 409
 410         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
 411
 412         lustre_set_wire_obdo(&imp->imp_connect_data, &body->oa, oa);
 413
 414         ptlrpc_request_set_replen(req);
 415
 416         req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_setattr_interpret;
 417         CLASSERT(sizeof(*sa) <= sizeof(req->rq_async_args));
 418         sa = ptlrpc_req_async_args(req);
 419         sa->sa_oa = oa;
 420         sa->sa_upcall = upcall;
 421         sa->sa_cookie = cookie;
 422
 423         ptlrpcd_add_req(req);
 424
 425         RETURN(0);
 426 }
 427 EXPORT_SYMBOL(osc_punch_send);
 428
 429 static int osc_sync_interpret(const struct lu_env *env,
 430                               struct ptlrpc_request *req,
 431                               void *arg, int rc)
 432 {
 433         struct osc_fsync_args   *fa = arg;
 434         struct ost_body         *body;
 435         struct cl_attr          *attr = &osc_env_info(env)->oti_attr;
 436         unsigned long           valid = 0;
 437         struct cl_object        *obj;
 438         ENTRY;
 439
 440         if (rc != 0)
 441                 GOTO(out, rc);
 442
 443         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
 444         if (body == NULL) {
 445                 CERROR("can't unpack ost_body\n");
 446                 GOTO(out, rc = -EPROTO);
 447         }
 448
 449         *fa->fa_oa = body->oa;
 450         obj = osc2cl(fa->fa_obj);
 451
 452         /* Update osc object's blocks attribute */
 453         cl_object_attr_lock(obj);
 454         if (body->oa.o_valid & OBD_MD_FLBLOCKS) {
 455                 attr->cat_blocks = body->oa.o_blocks;
 456                 valid |= CAT_BLOCKS;
 457         }
 458
 459         if (valid != 0)
 460                 cl_object_attr_update(env, obj, attr, valid);
 461         cl_object_attr_unlock(obj);
 462
 463 out:
 464         rc = fa->fa_upcall(fa->fa_cookie, rc);
 465         RETURN(rc);
 466 }
 467
 468 int osc_sync_base(struct osc_object *obj, struct obdo *oa,
 469                   obd_enqueue_update_f upcall, void *cookie,
 470                   struct ptlrpc_request_set *rqset)
 471 {
 472         struct obd_export     *exp = osc_export(obj);
 473         struct ptlrpc_request *req;
 474         struct ost_body       *body;
 475         struct osc_fsync_args *fa;
 476         int                    rc;
 477         ENTRY;
 478
 479         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC);
 480         if (req == NULL)
 481                 RETURN(-ENOMEM);
 482
 483         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SYNC);
 484         if (rc) {
 485                 ptlrpc_request_free(req);
 486                 RETURN(rc);
 487         }
 488
 489         /* overload the size and blocks fields in the oa with start/end */
 490         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
 491         LASSERT(body);
 492         lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
 493
 494         ptlrpc_request_set_replen(req);
 495         req->rq_interpret_reply = osc_sync_interpret;
 496
 497         CLASSERT(sizeof(*fa) <= sizeof(req->rq_async_args));
 498         fa = ptlrpc_req_async_args(req);
 499         fa->fa_obj = obj;
 500         fa->fa_oa = oa;
 501         fa->fa_upcall = upcall;
 502         fa->fa_cookie = cookie;
 503
 504         if (rqset == PTLRPCD_SET)
 505                 ptlrpcd_add_req(req);
 506         else
 507                 ptlrpc_set_add_req(rqset, req);
 508
 509         RETURN (0);
 510 }
 511
 512 /* Find and cancel locally locks matched by @mode in the resource found by
 513  * @objid. Found locks are added into @cancel list. Returns the amount of
 514  * locks added to @cancels list. */
 515 static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa,
 516                                    struct list_head *cancels,
 517                                    enum ldlm_mode mode, __u64 lock_flags)
 518 {
 519         struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
 520         struct ldlm_res_id res_id;
 521         struct ldlm_resource *res;
 522         int count;
 523         ENTRY;
 524
 525         /* Return, i.e. cancel nothing, only if ELC is supported (flag in
 526          * export) but disabled through procfs (flag in NS).
 527          *
 528          * This distinguishes from a case when ELC is not supported originally,
 529          * when we still want to cancel locks in advance and just cancel them
 530          * locally, without sending any RPC. */
 531         if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns))
 532                 RETURN(0);
 533
 534         ostid_build_res_name(&oa->o_oi, &res_id);
 535         res = ldlm_resource_get(ns, NULL, &res_id, 0, 0);
 536         if (IS_ERR(res))
 537                 RETURN(0);
 538
 539         LDLM_RESOURCE_ADDREF(res);
 540         count = ldlm_cancel_resource_local(res, cancels, NULL, mode,
 541                                            lock_flags, 0, NULL);
 542         LDLM_RESOURCE_DELREF(res);
 543         ldlm_resource_putref(res);
 544         RETURN(count);
 545 }
 546
 547 static int osc_destroy_interpret(const struct lu_env *env,
 548                                  struct ptlrpc_request *req, void *data,
 549                                  int rc)
 550 {
 551         struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
 552
 553         atomic_dec(&cli->cl_destroy_in_flight);
 554         wake_up(&cli->cl_destroy_waitq);
 555         return 0;
 556 }
 557
 558 static int osc_can_send_destroy(struct client_obd *cli)
 559 {
 560         if (atomic_inc_return(&cli->cl_destroy_in_flight) <=
 561             cli->cl_max_rpcs_in_flight) {
 562                 /* The destroy request can be sent */
 563                 return 1;
 564         }
 565         if (atomic_dec_return(&cli->cl_destroy_in_flight) <
 566             cli->cl_max_rpcs_in_flight) {
 567                 /*
 568                  * The counter has been modified between the two atomic
 569                  * operations.
 570                  */
 571                 wake_up(&cli->cl_destroy_waitq);
 572         }
 573         return 0;
 574 }
 575
 576 static int osc_destroy(const struct lu_env *env, struct obd_export *exp,
 577                        struct obdo *oa)
 578 {
 579         struct client_obd     *cli = &exp->exp_obd->u.cli;
 580         struct ptlrpc_request *req;
 581         struct ost_body       *body;
 582         struct list_head       cancels = LIST_HEAD_INIT(cancels);
 583         int rc, count;
 584         ENTRY;
 585
 586         if (!oa) {
 587                 CDEBUG(D_INFO, "oa NULL\n");
 588                 RETURN(-EINVAL);
 589         }
 590
 591         count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW,
 592                                         LDLM_FL_DISCARD_DATA);
 593
 594         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_DESTROY);
 595         if (req == NULL) {
 596                 ldlm_lock_list_put(&cancels, l_bl_ast, count);
 597                 RETURN(-ENOMEM);
 598         }
 599
 600         rc = ldlm_prep_elc_req(exp, req, LUSTRE_OST_VERSION, OST_DESTROY,
 601                                0, &cancels, count);
 602         if (rc) {
 603                 ptlrpc_request_free(req);
 604                 RETURN(rc);
 605         }
 606
 607         req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
 608         ptlrpc_at_set_req_timeout(req);
 609
 610         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
 611         LASSERT(body);
 612         lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
 613
 614         ptlrpc_request_set_replen(req);
 615
 616         req->rq_interpret_reply = osc_destroy_interpret;
 617         if (!osc_can_send_destroy(cli)) {
 618                 struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
 619
 620                 /*
 621                  * Wait until the number of on-going destroy RPCs drops
 622                  * under max_rpc_in_flight
 623                  */
 624                 rc = l_wait_event_exclusive(cli->cl_destroy_waitq,
 625                                             osc_can_send_destroy(cli), &lwi);
 626                 if (rc) {
 627                         ptlrpc_req_finished(req);
 628                         RETURN(rc);
 629                 }
 630         }
 631
 632         /* Do not wait for response */
 633         ptlrpcd_add_req(req);
 634         RETURN(0);
 635 }
 636
 637 static void osc_announce_cached(struct client_obd *cli, struct obdo *oa,
 638                                 long writing_bytes)
 639 {
 640         u64 bits = OBD_MD_FLBLOCKS | OBD_MD_FLGRANT;
 641
 642         LASSERT(!(oa->o_valid & bits));
 643
 644         oa->o_valid |= bits;
 645         spin_lock(&cli->cl_loi_list_lock);
 646         if (OCD_HAS_FLAG(&cli->cl_import->imp_connect_data, GRANT_PARAM))
 647                 oa->o_dirty = cli->cl_dirty_grant;
 648         else
 649                 oa->o_dirty = cli->cl_dirty_pages << PAGE_SHIFT;
 650         if (unlikely(cli->cl_dirty_pages - cli->cl_dirty_transit >
 651                      cli->cl_dirty_max_pages)) {
 652                 CERROR("dirty %lu - %lu > dirty_max %lu\n",
 653                        cli->cl_dirty_pages, cli->cl_dirty_transit,
 654                        cli->cl_dirty_max_pages);
 655                 oa->o_undirty = 0;
 656         } else if (unlikely(atomic_long_read(&obd_dirty_pages) -
 657                             atomic_long_read(&obd_dirty_transit_pages) >
 658                             (long)(obd_max_dirty_pages + 1))) {
 659                 /* The atomic_read() allowing the atomic_inc() are
 660                  * not covered by a lock thus they may safely race and trip
 661                  * this CERROR() unless we add in a small fudge factor (+1). */
 662                 CERROR("%s: dirty %ld - %ld > system dirty_max %ld\n",
 663                        cli_name(cli), atomic_long_read(&obd_dirty_pages),
 664                        atomic_long_read(&obd_dirty_transit_pages),
 665                        obd_max_dirty_pages);
 666                 oa->o_undirty = 0;
 667         } else if (unlikely(cli->cl_dirty_max_pages - cli->cl_dirty_pages >
 668                             0x7fffffff)) {
 669                 CERROR("dirty %lu - dirty_max %lu too big???\n",
 670                        cli->cl_dirty_pages, cli->cl_dirty_max_pages);
 671                 oa->o_undirty = 0;
 672         } else {
 673                 unsigned long nrpages;
 674
 675                 nrpages = cli->cl_max_pages_per_rpc;
 676                 nrpages *= cli->cl_max_rpcs_in_flight + 1;
 677                 nrpages = max(nrpages, cli->cl_dirty_max_pages);
 678                 oa->o_undirty = nrpages << PAGE_SHIFT;
 679                 if (OCD_HAS_FLAG(&cli->cl_import->imp_connect_data,
 680                                  GRANT_PARAM)) {
 681                         int nrextents;
 682
 683                         /* take extent tax into account when asking for more
 684                          * grant space */
 685                         nrextents = (nrpages + cli->cl_max_extent_pages - 1)  /
 686                                      cli->cl_max_extent_pages;
 687                         oa->o_undirty += nrextents * cli->cl_grant_extent_tax;
 688                 }
 689         }
 690         oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant;
 691         oa->o_dropped = cli->cl_lost_grant;
 692         cli->cl_lost_grant = 0;
 693         spin_unlock(&cli->cl_loi_list_lock);
 694         CDEBUG(D_CACHE, "dirty: %llu undirty: %u dropped %u grant: %llu\n",
 695                oa->o_dirty, oa->o_undirty, oa->o_dropped, oa->o_grant);
 696 }
 697
 698 void osc_update_next_shrink(struct client_obd *cli)
 699 {
 700         cli->cl_next_shrink_grant = ktime_get_seconds() +
 701                                     cli->cl_grant_shrink_interval;
 702
 703         CDEBUG(D_CACHE, "next time %lld to shrink grant\n",
 704                cli->cl_next_shrink_grant);
 705 }
 706
 707 static void __osc_update_grant(struct client_obd *cli, u64 grant)
 708 {
 709         spin_lock(&cli->cl_loi_list_lock);
 710         cli->cl_avail_grant += grant;
 711         spin_unlock(&cli->cl_loi_list_lock);
 712 }
 713
 714 static void osc_update_grant(struct client_obd *cli, struct ost_body *body)
 715 {
 716         if (body->oa.o_valid & OBD_MD_FLGRANT) {
 717                 CDEBUG(D_CACHE, "got %llu extra grant\n", body->oa.o_grant);
 718                 __osc_update_grant(cli, body->oa.o_grant);
 719         }
 720 }
 721
 722 static int osc_shrink_grant_interpret(const struct lu_env *env,
 723                                       struct ptlrpc_request *req,
 724                                       void *aa, int rc)
 725 {
 726         struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
 727         struct obdo *oa = ((struct osc_grant_args *)aa)->aa_oa;
 728         struct ost_body *body;
 729
 730         if (rc != 0) {
 731                 __osc_update_grant(cli, oa->o_grant);
 732                 GOTO(out, rc);
 733         }
 734
 735         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
 736         LASSERT(body);
 737         osc_update_grant(cli, body);
 738 out:
 739         OBDO_FREE(oa);
 740         return rc;
 741 }
 742
 743 static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa)
 744 {
 745         spin_lock(&cli->cl_loi_list_lock);
 746         oa->o_grant = cli->cl_avail_grant / 4;
 747         cli->cl_avail_grant -= oa->o_grant;
 748         spin_unlock(&cli->cl_loi_list_lock);
 749         if (!(oa->o_valid & OBD_MD_FLFLAGS)) {
 750                 oa->o_valid |= OBD_MD_FLFLAGS;
 751                 oa->o_flags = 0;
 752         }
 753         oa->o_flags |= OBD_FL_SHRINK_GRANT;
 754         osc_update_next_shrink(cli);
 755 }
 756
 757 /* Shrink the current grant, either from some large amount to enough for a
 758  * full set of in-flight RPCs, or if we have already shrunk to that limit
 759  * then to enough for a single RPC.  This avoids keeping more grant than
 760  * needed, and avoids shrinking the grant piecemeal. */
 761 static int osc_shrink_grant(struct client_obd *cli)
 762 {
 763         __u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) *
 764                              (cli->cl_max_pages_per_rpc << PAGE_SHIFT);
 765
 766         spin_lock(&cli->cl_loi_list_lock);
 767         if (cli->cl_avail_grant <= target_bytes)
 768                 target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT;
 769         spin_unlock(&cli->cl_loi_list_lock);
 770
 771         return osc_shrink_grant_to_target(cli, target_bytes);
 772 }
 773
 774 int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes)
 775 {
 776         int                     rc = 0;
 777         struct ost_body        *body;
 778         ENTRY;
 779
 780         spin_lock(&cli->cl_loi_list_lock);
 781         /* Don't shrink if we are already above or below the desired limit
 782          * We don't want to shrink below a single RPC, as that will negatively
 783          * impact block allocation and long-term performance. */
 784         if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_SHIFT)
 785                 target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT;
 786
 787         if (target_bytes >= cli->cl_avail_grant) {
 788                 spin_unlock(&cli->cl_loi_list_lock);
 789                 RETURN(0);
 790         }
 791         spin_unlock(&cli->cl_loi_list_lock);
 792
 793         OBD_ALLOC_PTR(body);
 794         if (!body)
 795                 RETURN(-ENOMEM);
 796
 797         osc_announce_cached(cli, &body->oa, 0);
 798
 799         spin_lock(&cli->cl_loi_list_lock);
 800         body->oa.o_grant = cli->cl_avail_grant - target_bytes;
 801         cli->cl_avail_grant = target_bytes;
 802         spin_unlock(&cli->cl_loi_list_lock);
 803         if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) {
 804                 body->oa.o_valid |= OBD_MD_FLFLAGS;
 805                 body->oa.o_flags = 0;
 806         }
 807         body->oa.o_flags |= OBD_FL_SHRINK_GRANT;
 808         osc_update_next_shrink(cli);
 809
 810         rc = osc_set_info_async(NULL, cli->cl_import->imp_obd->obd_self_export,
 811                                 sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK,
 812                                 sizeof(*body), body, NULL);
 813         if (rc != 0)
 814                 __osc_update_grant(cli, body->oa.o_grant);
 815         OBD_FREE_PTR(body);
 816         RETURN(rc);
 817 }
 818
 819 static int osc_should_shrink_grant(struct client_obd *client)
 820 {
 821         time64_t next_shrink = client->cl_next_shrink_grant;
 822
 823         if ((client->cl_import->imp_connect_data.ocd_connect_flags &
 824              OBD_CONNECT_GRANT_SHRINK) == 0)
 825                 return 0;
 826
 827         if (ktime_get_seconds() >= next_shrink - 5) {
 828                 /* Get the current RPC size directly, instead of going via:
 829                  * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export)
 830                  * Keep comment here so that it can be found by searching. */
 831                 int brw_size = client->cl_max_pages_per_rpc << PAGE_SHIFT;
 832
 833                 if (client->cl_import->imp_state == LUSTRE_IMP_FULL &&
 834                     client->cl_avail_grant > brw_size)
 835                         return 1;
 836                 else
 837                         osc_update_next_shrink(client);
 838         }
 839         return 0;
 840 }
 841
 842 static int osc_grant_shrink_grant_cb(struct timeout_item *item, void *data)
 843 {
 844         struct client_obd *client;
 845
 846         list_for_each_entry(client, &item->ti_obd_list, cl_grant_shrink_list) {
 847                 if (osc_should_shrink_grant(client))
 848                         osc_shrink_grant(client);
 849         }
 850         return 0;
 851 }
 852
 853 static int osc_add_shrink_grant(struct client_obd *client)
 854 {
 855         int rc;
 856
 857         rc = ptlrpc_add_timeout_client(client->cl_grant_shrink_interval,
 858                                        TIMEOUT_GRANT,
 859                                        osc_grant_shrink_grant_cb, NULL,
 860                                        &client->cl_grant_shrink_list);
 861         if (rc) {
 862                 CERROR("add grant client %s error %d\n", cli_name(client), rc);
 863                 return rc;
 864         }
 865         CDEBUG(D_CACHE, "add grant client %s\n", cli_name(client));
 866         osc_update_next_shrink(client);
 867         return 0;
 868 }
 869
 870 static int osc_del_shrink_grant(struct client_obd *client)
 871 {
 872         return ptlrpc_del_timeout_client(&client->cl_grant_shrink_list,
 873                                          TIMEOUT_GRANT);
 874 }
 875
 876 void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd)
 877 {
 878         /*
 879          * ocd_grant is the total grant amount we're expect to hold: if we've
 880          * been evicted, it's the new avail_grant amount, cl_dirty_pages will
 881          * drop to 0 as inflight RPCs fail out; otherwise, it's avail_grant +
 882          * dirty.
 883          *
 884          * race is tolerable here: if we're evicted, but imp_state already
 885          * left EVICTED state, then cl_dirty_pages must be 0 already.
 886          */
 887         spin_lock(&cli->cl_loi_list_lock);
 888         cli->cl_avail_grant = ocd->ocd_grant;
 889         if (cli->cl_import->imp_state != LUSTRE_IMP_EVICTED) {
 890                 cli->cl_avail_grant -= cli->cl_reserved_grant;
 891                 if (OCD_HAS_FLAG(ocd, GRANT_PARAM))
 892                         cli->cl_avail_grant -= cli->cl_dirty_grant;
 893                 else
 894                         cli->cl_avail_grant -=
 895                                         cli->cl_dirty_pages << PAGE_SHIFT;
 896         }
 897
 898         if (OCD_HAS_FLAG(ocd, GRANT_PARAM)) {
 899                 u64 size;
 900                 int chunk_mask;
 901
 902                 /* overhead for each extent insertion */
 903                 cli->cl_grant_extent_tax = ocd->ocd_grant_tax_kb << 10;
 904                 /* determine the appropriate chunk size used by osc_extent. */
 905                 cli->cl_chunkbits = max_t(int, PAGE_SHIFT,
 906                                           ocd->ocd_grant_blkbits);
 907                 /* max_pages_per_rpc must be chunk aligned */
 908                 chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_SHIFT)) - 1);
 909                 cli->cl_max_pages_per_rpc = (cli->cl_max_pages_per_rpc +
 910                                              ~chunk_mask) & chunk_mask;
 911                 /* determine maximum extent size, in #pages */
 912                 size = (u64)ocd->ocd_grant_max_blks << ocd->ocd_grant_blkbits;
 913                 cli->cl_max_extent_pages = size >> PAGE_SHIFT;
 914                 if (cli->cl_max_extent_pages == 0)
 915                         cli->cl_max_extent_pages = 1;
 916         } else {
 917                 cli->cl_grant_extent_tax = 0;
 918                 cli->cl_chunkbits = PAGE_SHIFT;
 919                 cli->cl_max_extent_pages = DT_MAX_BRW_PAGES;
 920         }
 921         spin_unlock(&cli->cl_loi_list_lock);
 922
 923         CDEBUG(D_CACHE, "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld."
 924                 "chunk bits: %d cl_max_extent_pages: %d\n",
 925                 cli_name(cli),
 926                 cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits,
 927                 cli->cl_max_extent_pages);
 928
 929         if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK &&
 930             list_empty(&cli->cl_grant_shrink_list))
 931                 osc_add_shrink_grant(cli);
 932 }
 933 EXPORT_SYMBOL(osc_init_grant);
 934
 935 /* We assume that the reason this OSC got a short read is because it read
 936  * beyond the end of a stripe file; i.e. lustre is reading a sparse file
 937  * via the LOV, and it _knows_ it's reading inside the file, it's just that
 938  * this stripe never got written at or beyond this stripe offset yet. */
 939 static void handle_short_read(int nob_read, size_t page_count,
 940                               struct brw_page **pga)
 941 {
 942         char *ptr;
 943         int i = 0;
 944
 945         /* skip bytes read OK */
 946         while (nob_read > 0) {
 947                 LASSERT (page_count > 0);
 948
 949                 if (pga[i]->count > nob_read) {
 950                         /* EOF inside this page */
 951                         ptr = kmap(pga[i]->pg) +
 952                                 (pga[i]->off & ~PAGE_MASK);
 953                         memset(ptr + nob_read, 0, pga[i]->count - nob_read);
 954                         kunmap(pga[i]->pg);
 955                         page_count--;
 956                         i++;
 957                         break;
 958                 }
 959
 960                 nob_read -= pga[i]->count;
 961                 page_count--;
 962                 i++;
 963         }
 964
 965         /* zero remaining pages */
 966         while (page_count-- > 0) {
 967                 ptr = kmap(pga[i]->pg) + (pga[i]->off & ~PAGE_MASK);
 968                 memset(ptr, 0, pga[i]->count);
 969                 kunmap(pga[i]->pg);
 970                 i++;
 971         }
 972 }
 973
 974 static int check_write_rcs(struct ptlrpc_request *req,
 975                            int requested_nob, int niocount,
 976                            size_t page_count, struct brw_page **pga)
 977 {
 978         int     i;
 979         __u32   *remote_rcs;
 980
 981         remote_rcs = req_capsule_server_sized_get(&req->rq_pill, &RMF_RCS,
 982                                                   sizeof(*remote_rcs) *
 983                                                   niocount);
 984         if (remote_rcs == NULL) {
 985                 CDEBUG(D_INFO, "Missing/short RC vector on BRW_WRITE reply\n");
 986                 return(-EPROTO);
 987         }
 988
 989         /* return error if any niobuf was in error */
 990         for (i = 0; i < niocount; i++) {
 991                 if ((int)remote_rcs[i] < 0)
 992                         return(remote_rcs[i]);
 993
 994                 if (remote_rcs[i] != 0) {
 995                         CDEBUG(D_INFO, "rc[%d] invalid (%d) req %p\n",
 996                                 i, remote_rcs[i], req);
 997                         return(-EPROTO);
 998                 }
 999         }
1000         if (req->rq_bulk != NULL &&
1001             req->rq_bulk->bd_nob_transferred != requested_nob) {
1002                 CERROR("Unexpected # bytes transferred: %d (requested %d)\n",
1003                        req->rq_bulk->bd_nob_transferred, requested_nob);
1004                 return(-EPROTO);
1005         }
1006
1007         return (0);
1008 }
1009
1010 static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2)
1011 {
1012         if (p1->flag != p2->flag) {
1013                 unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE |
1014                                   OBD_BRW_SYNC       | OBD_BRW_ASYNC   |
1015                                   OBD_BRW_NOQUOTA    | OBD_BRW_SOFT_SYNC);
1016
1017                 /* warn if we try to combine flags that we don't know to be
1018                  * safe to combine */
1019                 if (unlikely((p1->flag & mask) != (p2->flag & mask))) {
1020                         CWARN("Saw flags 0x%x and 0x%x in the same brw, please "
1021                               "report this at https://jira.hpdd.intel.com/\n",
1022                               p1->flag, p2->flag);
1023                 }
1024                 return 0;
1025         }
1026
1027         return (p1->off + p1->count == p2->off);
1028 }
1029
1030 static int osc_checksum_bulk(int nob, size_t pg_count,
1031                              struct brw_page **pga, int opc,
1032                              enum cksum_types cksum_type,
1033                              u32 *cksum)
1034 {
1035         int                             i = 0;
1036         struct cfs_crypto_hash_desc     *hdesc;
1037         unsigned int                    bufsize;
1038         unsigned char                   cfs_alg = cksum_obd2cfs(cksum_type);
1039
1040         LASSERT(pg_count > 0);
1041
1042         hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0);
1043         if (IS_ERR(hdesc)) {
1044                 CERROR("Unable to initialize checksum hash %s\n",
1045                        cfs_crypto_hash_name(cfs_alg));
1046                 return PTR_ERR(hdesc);
1047         }
1048
1049         while (nob > 0 && pg_count > 0) {
1050                 unsigned int count = pga[i]->count > nob ? nob : pga[i]->count;
1051
1052                 /* corrupt the data before we compute the checksum, to
1053                  * simulate an OST->client data error */
1054                 if (i == 0 && opc == OST_READ &&
1055                     OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) {
1056                         unsigned char *ptr = kmap(pga[i]->pg);
1057                         int off = pga[i]->off & ~PAGE_MASK;
1058
1059                         memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob));
1060                         kunmap(pga[i]->pg);
1061                 }
1062                 cfs_crypto_hash_update_page(hdesc, pga[i]->pg,
1063                                             pga[i]->off & ~PAGE_MASK,
1064                                             count);
1065                 LL_CDEBUG_PAGE(D_PAGE, pga[i]->pg, "off %d\n",
1066                                (int)(pga[i]->off & ~PAGE_MASK));
1067
1068                 nob -= pga[i]->count;
1069                 pg_count--;
1070                 i++;
1071         }
1072
1073         bufsize = sizeof(*cksum);
1074         cfs_crypto_hash_final(hdesc, (unsigned char *)cksum, &bufsize);
1075
1076         /* For sending we only compute the wrong checksum instead
1077          * of corrupting the data so it is still correct on a redo */
1078         if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND))
1079                 (*cksum)++;
1080
1081         return 0;
1082 }
1083
1084 static int
1085 osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa,
1086                      u32 page_count, struct brw_page **pga,
1087                      struct ptlrpc_request **reqp, int resend)
1088 {
1089         struct ptlrpc_request   *req;
1090         struct ptlrpc_bulk_desc *desc;
1091         struct ost_body         *body;
1092         struct obd_ioobj        *ioobj;
1093         struct niobuf_remote    *niobuf;
1094         int niocount, i, requested_nob, opc, rc, short_io_size = 0;
1095         struct osc_brw_async_args *aa;
1096         struct req_capsule      *pill;
1097         struct brw_page *pg_prev;
1098         void *short_io_buf;
1099
1100         ENTRY;
1101         if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ))
1102                 RETURN(-ENOMEM); /* Recoverable */
1103         if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2))
1104                 RETURN(-EINVAL); /* Fatal */
1105
1106         if ((cmd & OBD_BRW_WRITE) != 0) {
1107                 opc = OST_WRITE;
1108                 req = ptlrpc_request_alloc_pool(cli->cl_import,
1109                                                 osc_rq_pool,
1110                                                 &RQF_OST_BRW_WRITE);
1111         } else {
1112                 opc = OST_READ;
1113                 req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ);
1114         }
1115         if (req == NULL)
1116                 RETURN(-ENOMEM);
1117
1118         for (niocount = i = 1; i < page_count; i++) {
1119                 if (!can_merge_pages(pga[i - 1], pga[i]))
1120                         niocount++;
1121         }
1122
1123         pill = &req->rq_pill;
1124         req_capsule_set_size(pill, &RMF_OBD_IOOBJ, RCL_CLIENT,
1125                              sizeof(*ioobj));
1126         req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT,
1127                              niocount * sizeof(*niobuf));
1128
1129         for (i = 0; i < page_count; i++)
1130                 short_io_size += pga[i]->count;
1131
1132         /* Check if we can do a short io. */
1133         if (!(short_io_size <= cli->cl_short_io_bytes && niocount == 1 &&
1134             imp_connect_shortio(cli->cl_import)))
1135                 short_io_size = 0;
1136
1137         req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_CLIENT,
1138                              opc == OST_READ ? 0 : short_io_size);
1139         if (opc == OST_READ)
1140                 req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_SERVER,
1141                                      short_io_size);
1142
1143         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc);
1144         if (rc) {
1145                 ptlrpc_request_free(req);
1146                 RETURN(rc);
1147         }
1148         osc_set_io_portal(req);
1149
1150         ptlrpc_at_set_req_timeout(req);
1151         /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own
1152          * retry logic */
1153         req->rq_no_retry_einprogress = 1;
1154
1155         if (short_io_size != 0) {
1156                 desc = NULL;
1157                 short_io_buf = NULL;
1158                 goto no_bulk;
1159         }
1160
1161         desc = ptlrpc_prep_bulk_imp(req, page_count,
1162                 cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS,
1163                 (opc == OST_WRITE ? PTLRPC_BULK_GET_SOURCE :
1164                         PTLRPC_BULK_PUT_SINK) |
1165                         PTLRPC_BULK_BUF_KIOV,
1166                 OST_BULK_PORTAL,
1167                 &ptlrpc_bulk_kiov_pin_ops);
1168
1169         if (desc == NULL)
1170                 GOTO(out, rc = -ENOMEM);
1171         /* NB request now owns desc and will free it when it gets freed */
1172 no_bulk:
1173         body = req_capsule_client_get(pill, &RMF_OST_BODY);
1174         ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
1175         niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
1176         LASSERT(body != NULL && ioobj != NULL && niobuf != NULL);
1177
1178         lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
1179
1180         obdo_to_ioobj(oa, ioobj);
1181         ioobj->ioo_bufcnt = niocount;
1182         /* The high bits of ioo_max_brw tells server _maximum_ number of bulks
1183          * that might be send for this request.  The actual number is decided
1184          * when the RPC is finally sent in ptlrpc_register_bulk(). It sends
1185          * "max - 1" for old client compatibility sending "0", and also so the
1186          * the actual maximum is a power-of-two number, not one less. LU-1431 */
1187         if (desc != NULL)
1188                 ioobj_max_brw_set(ioobj, desc->bd_md_max_brw);
1189         else /* short io */
1190                 ioobj_max_brw_set(ioobj, 0);
1191
1192         if (short_io_size != 0) {
1193                 if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
1194                         body->oa.o_valid |= OBD_MD_FLFLAGS;
1195                         body->oa.o_flags = 0;
1196                 }
1197                 body->oa.o_flags |= OBD_FL_SHORT_IO;
1198                 CDEBUG(D_CACHE, "Using short io for data transfer, size = %d\n",
1199                        short_io_size);
1200                 if (opc == OST_WRITE) {
1201                         short_io_buf = req_capsule_client_get(pill,
1202                                                               &RMF_SHORT_IO);
1203                         LASSERT(short_io_buf != NULL);
1204                 }
1205         }
1206
1207         LASSERT(page_count > 0);
1208         pg_prev = pga[0];
1209         for (requested_nob = i = 0; i < page_count; i++, niobuf++) {
1210                 struct brw_page *pg = pga[i];
1211                 int poff = pg->off & ~PAGE_MASK;
1212
1213                 LASSERT(pg->count > 0);
1214                 /* make sure there is no gap in the middle of page array */
1215                 LASSERTF(page_count == 1 ||
1216                          (ergo(i == 0, poff + pg->count == PAGE_SIZE) &&
1217                           ergo(i > 0 && i < page_count - 1,
1218                                poff == 0 && pg->count == PAGE_SIZE)   &&
1219                           ergo(i == page_count - 1, poff == 0)),
1220                          "i: %d/%d pg: %p off: %llu, count: %u\n",
1221                          i, page_count, pg, pg->off, pg->count);
1222                 LASSERTF(i == 0 || pg->off > pg_prev->off,
1223                          "i %d p_c %u pg %p [pri %lu ind %lu] off %llu"
1224                          " prev_pg %p [pri %lu ind %lu] off %llu\n",
1225                          i, page_count,
1226                          pg->pg, page_private(pg->pg), pg->pg->index, pg->off,
1227                          pg_prev->pg, page_private(pg_prev->pg),
1228                          pg_prev->pg->index, pg_prev->off);
1229                 LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) ==
1230                         (pg->flag & OBD_BRW_SRVLOCK));
1231                 if (short_io_size != 0 && opc == OST_WRITE) {
1232                         unsigned char *ptr = ll_kmap_atomic(pg->pg, KM_USER0);
1233
1234                         LASSERT(short_io_size >= requested_nob + pg->count);
1235                         memcpy(short_io_buf + requested_nob,
1236                                ptr + poff,
1237                                pg->count);
1238                         ll_kunmap_atomic(ptr, KM_USER0);
1239                 } else if (short_io_size == 0) {
1240                         desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff,
1241                                                          pg->count);
1242                 }
1243                 requested_nob += pg->count;
1244
1245                 if (i > 0 && can_merge_pages(pg_prev, pg)) {
1246                         niobuf--;
1247                         niobuf->rnb_len += pg->count;
1248                 } else {
1249                         niobuf->rnb_offset = pg->off;
1250                         niobuf->rnb_len    = pg->count;
1251                         niobuf->rnb_flags  = pg->flag;
1252                 }
1253                 pg_prev = pg;
1254         }
1255
1256         LASSERTF((void *)(niobuf - niocount) ==
1257                 req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE),
1258                 "want %p - real %p\n", req_capsule_client_get(&req->rq_pill,
1259                 &RMF_NIOBUF_REMOTE), (void *)(niobuf - niocount));
1260
1261         osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? requested_nob:0);
1262         if (resend) {
1263                 if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
1264                         body->oa.o_valid |= OBD_MD_FLFLAGS;
1265                         body->oa.o_flags = 0;
1266                 }
1267                 body->oa.o_flags |= OBD_FL_RECOV_RESEND;
1268         }
1269
1270         if (osc_should_shrink_grant(cli))
1271                 osc_shrink_grant_local(cli, &body->oa);
1272
1273         /* size[REQ_REC_OFF] still sizeof (*body) */
1274         if (opc == OST_WRITE) {
1275                 if (cli->cl_checksum &&
1276                     !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
1277                         /* store cl_cksum_type in a local variable since
1278                          * it can be changed via lprocfs */
1279                         enum cksum_types cksum_type = cli->cl_cksum_type;
1280
1281                         if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0)
1282                                 body->oa.o_flags = 0;
1283
1284                         body->oa.o_flags |= cksum_type_pack(cksum_type);
1285                         body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
1286
1287                         rc = osc_checksum_bulk(requested_nob, page_count,
1288                                                pga, OST_WRITE, cksum_type,
1289                                                &body->oa.o_cksum);
1290                         if (rc < 0) {
1291                                 CDEBUG(D_PAGE, "failed to checksum, rc = %d\n",
1292                                        rc);
1293                                 GOTO(out, rc);
1294                         }
1295                         CDEBUG(D_PAGE, "checksum at write origin: %x\n",
1296                                body->oa.o_cksum);
1297
1298                         /* save this in 'oa', too, for later checking */
1299                         oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
1300                         oa->o_flags |= cksum_type_pack(cksum_type);
1301                 } else {
1302                         /* clear out the checksum flag, in case this is a
1303                          * resend but cl_checksum is no longer set. b=11238 */
1304                         oa->o_valid &= ~OBD_MD_FLCKSUM;
1305                 }
1306                 oa->o_cksum = body->oa.o_cksum;
1307                 /* 1 RC per niobuf */
1308                 req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER,
1309                                      sizeof(__u32) * niocount);
1310         } else {
1311                 if (cli->cl_checksum &&
1312                     !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
1313                         if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0)
1314                                 body->oa.o_flags = 0;
1315                         body->oa.o_flags |= cksum_type_pack(cli->cl_cksum_type);
1316                         body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
1317                 }
1318
1319                 /* Client cksum has been already copied to wire obdo in previous
1320                  * lustre_set_wire_obdo(), and in the case a bulk-read is being
1321                  * resent due to cksum error, this will allow Server to
1322                  * check+dump pages on its side */
1323         }
1324         ptlrpc_request_set_replen(req);
1325
1326         CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
1327         aa = ptlrpc_req_async_args(req);
1328         aa->aa_oa = oa;
1329         aa->aa_requested_nob = requested_nob;
1330         aa->aa_nio_count = niocount;
1331         aa->aa_page_count = page_count;
1332         aa->aa_resends = 0;
1333         aa->aa_ppga = pga;
1334         aa->aa_cli = cli;
1335         INIT_LIST_HEAD(&aa->aa_oaps);
1336
1337         *reqp = req;
1338         niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
1339         CDEBUG(D_RPCTRACE, "brw rpc %p - object "DOSTID" offset %lld<>%lld\n",
1340                 req, POSTID(&oa->o_oi), niobuf[0].rnb_offset,
1341                 niobuf[niocount - 1].rnb_offset + niobuf[niocount - 1].rnb_len);
1342         RETURN(0);
1343
1344  out:
1345         ptlrpc_req_finished(req);
1346         RETURN(rc);
1347 }
1348
1349 char dbgcksum_file_name[PATH_MAX];
1350
1351 static void dump_all_bulk_pages(struct obdo *oa, __u32 page_count,
1352                                 struct brw_page **pga, __u32 server_cksum,
1353                                 __u32 client_cksum)
1354 {
1355         struct file *filp;
1356         int rc, i;
1357         unsigned int len;
1358         char *buf;
1359         mm_segment_t oldfs;
1360
1361         /* will only keep dump of pages on first error for the same range in
1362          * file/fid, not during the resends/retries. */
1363         snprintf(dbgcksum_file_name, sizeof(dbgcksum_file_name),
1364                  "%s-checksum_dump-osc-"DFID":[%llu-%llu]-%x-%x",
1365                  (strncmp(libcfs_debug_file_path_arr, "NONE", 4) != 0 ?
1366                   libcfs_debug_file_path_arr :
1367                   LIBCFS_DEBUG_FILE_PATH_DEFAULT),
1368                  oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : 0ULL,
1369                  oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0,
1370                  oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0,
1371                  pga[0]->off,
1372                  pga[page_count-1]->off + pga[page_count-1]->count - 1,
1373                  client_cksum, server_cksum);
1374         filp = filp_open(dbgcksum_file_name,
1375                          O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, 0600);
1376         if (IS_ERR(filp)) {
1377                 rc = PTR_ERR(filp);
1378                 if (rc == -EEXIST)
1379                         CDEBUG(D_INFO, "%s: can't open to dump pages with "
1380                                "checksum error: rc = %d\n", dbgcksum_file_name,
1381                                rc);
1382                 else
1383                         CERROR("%s: can't open to dump pages with checksum "
1384                                "error: rc = %d\n", dbgcksum_file_name, rc);
1385                 return;
1386         }
1387
1388         oldfs = get_fs();
1389         set_fs(KERNEL_DS);
1390         for (i = 0; i < page_count; i++) {
1391                 len = pga[i]->count;
1392                 buf = kmap(pga[i]->pg);
1393                 while (len != 0) {
1394                         rc = vfs_write(filp, (__force const char __user *)buf,
1395                                        len, &filp->f_pos);
1396                         if (rc < 0) {
1397                                 CERROR("%s: wanted to write %u but got %d "
1398                                        "error\n", dbgcksum_file_name, len, rc);
1399                                 break;
1400                         }
1401                         len -= rc;
1402                         buf += rc;
1403                         CDEBUG(D_INFO, "%s: wrote %d bytes\n",
1404                                dbgcksum_file_name, rc);
1405                 }
1406                 kunmap(pga[i]->pg);
1407         }
1408         set_fs(oldfs);
1409
1410         rc = ll_vfs_fsync_range(filp, 0, LLONG_MAX, 1);
1411         if (rc)
1412                 CERROR("%s: sync returns %d\n", dbgcksum_file_name, rc);
1413         filp_close(filp, NULL);
1414         return;
1415 }
1416
1417 static int
1418 check_write_checksum(struct obdo *oa, const struct lnet_process_id *peer,
1419                                 __u32 client_cksum, __u32 server_cksum,
1420                                 struct osc_brw_async_args *aa)
1421 {
1422         __u32 new_cksum;
1423         char *msg;
1424         enum cksum_types cksum_type;
1425         int rc;
1426
1427         if (server_cksum == client_cksum) {
1428                 CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum);
1429                 return 0;
1430         }
1431
1432         if (aa->aa_cli->cl_checksum_dump)
1433                 dump_all_bulk_pages(oa, aa->aa_page_count, aa->aa_ppga,
1434                                     server_cksum, client_cksum);
1435
1436         cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ?
1437                                        oa->o_flags : 0);
1438         rc = osc_checksum_bulk(aa->aa_requested_nob, aa->aa_page_count,
1439                                aa->aa_ppga, OST_WRITE, cksum_type,
1440                                &new_cksum);
1441
1442         if (rc < 0)
1443                 msg = "failed to calculate the client write checksum";
1444         else if (cksum_type != cksum_type_unpack(aa->aa_oa->o_flags))
1445                 msg = "the server did not use the checksum type specified in "
1446                       "the original request - likely a protocol problem";
1447         else if (new_cksum == server_cksum)
1448                 msg = "changed on the client after we checksummed it - "
1449                       "likely false positive due to mmap IO (bug 11742)";
1450         else if (new_cksum == client_cksum)
1451                 msg = "changed in transit before arrival at OST";
1452         else
1453                 msg = "changed in transit AND doesn't match the original - "
1454                       "likely false positive due to mmap IO (bug 11742)";
1455
1456         LCONSOLE_ERROR_MSG(0x132, "%s: BAD WRITE CHECKSUM: %s: from %s inode "
1457                            DFID " object "DOSTID" extent [%llu-%llu], original "
1458                            "client csum %x (type %x), server csum %x (type %x),"
1459                            " client csum now %x\n",
1460                            aa->aa_cli->cl_import->imp_obd->obd_name,
1461                            msg, libcfs_nid2str(peer->nid),
1462                            oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0,
1463                            oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0,
1464                            oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0,
1465                            POSTID(&oa->o_oi), aa->aa_ppga[0]->off,
1466                            aa->aa_ppga[aa->aa_page_count - 1]->off +
1467                                 aa->aa_ppga[aa->aa_page_count-1]->count - 1,
1468                            client_cksum, cksum_type_unpack(aa->aa_oa->o_flags),
1469                            server_cksum, cksum_type, new_cksum);
1470         return 1;
1471 }
1472
1473 /* Note rc enters this function as number of bytes transferred */
1474 static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
1475 {
1476         struct osc_brw_async_args *aa = (void *)&req->rq_async_args;
1477         const struct lnet_process_id *peer =
1478                         &req->rq_import->imp_connection->c_peer;
1479         struct client_obd *cli = aa->aa_cli;
1480         struct ost_body *body;
1481         u32 client_cksum = 0;
1482         ENTRY;
1483
1484         if (rc < 0 && rc != -EDQUOT) {
1485                 DEBUG_REQ(D_INFO, req, "Failed request with rc = %d\n", rc);
1486                 RETURN(rc);
1487         }
1488
1489         LASSERTF(req->rq_repmsg != NULL, "rc = %d\n", rc);
1490         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
1491         if (body == NULL) {
1492                 DEBUG_REQ(D_INFO, req, "Can't unpack body\n");
1493                 RETURN(-EPROTO);
1494         }
1495
1496         /* set/clear over quota flag for a uid/gid/projid */
1497         if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE &&
1498             body->oa.o_valid & (OBD_MD_FLALLQUOTA)) {
1499                 unsigned qid[LL_MAXQUOTAS] = {
1500                                          body->oa.o_uid, body->oa.o_gid,
1501                                          body->oa.o_projid };
1502                 CDEBUG(D_QUOTA, "setdq for [%u %u %u] with valid %#llx, flags %x\n",
1503                        body->oa.o_uid, body->oa.o_gid, body->oa.o_projid,
1504                        body->oa.o_valid, body->oa.o_flags);
1505                        osc_quota_setdq(cli, qid, body->oa.o_valid,
1506                                        body->oa.o_flags);
1507         }
1508
1509         osc_update_grant(cli, body);
1510
1511         if (rc < 0)
1512                 RETURN(rc);
1513
1514         if (aa->aa_oa->o_valid & OBD_MD_FLCKSUM)
1515                 client_cksum = aa->aa_oa->o_cksum; /* save for later */
1516
1517         if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) {
1518                 if (rc > 0) {
1519                         CERROR("Unexpected +ve rc %d\n", rc);
1520                         RETURN(-EPROTO);
1521                 }
1522
1523                 if (req->rq_bulk != NULL &&
1524                     sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk))
1525                         RETURN(-EAGAIN);
1526
1527                 if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum &&
1528                     check_write_checksum(&body->oa, peer, client_cksum,
1529                                          body->oa.o_cksum, aa))
1530                         RETURN(-EAGAIN);
1531
1532                 rc = check_write_rcs(req, aa->aa_requested_nob,aa->aa_nio_count,
1533                                      aa->aa_page_count, aa->aa_ppga);
1534                 GOTO(out, rc);
1535         }
1536
1537         /* The rest of this function executes only for OST_READs */
1538
1539         if (req->rq_bulk == NULL) {
1540                 rc = req_capsule_get_size(&req->rq_pill, &RMF_SHORT_IO,
1541                                           RCL_SERVER);
1542                 LASSERT(rc == req->rq_status);
1543         } else {
1544                 /* if unwrap_bulk failed, return -EAGAIN to retry */
1545                 rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc);
1546         }
1547         if (rc < 0)
1548                 GOTO(out, rc = -EAGAIN);
1549
1550         if (rc > aa->aa_requested_nob) {
1551                 CERROR("Unexpected rc %d (%d requested)\n", rc,
1552                        aa->aa_requested_nob);
1553                 RETURN(-EPROTO);
1554         }
1555
1556         if (req->rq_bulk != NULL && rc != req->rq_bulk->bd_nob_transferred) {
1557                 CERROR ("Unexpected rc %d (%d transferred)\n",
1558                         rc, req->rq_bulk->bd_nob_transferred);
1559                 return (-EPROTO);
1560         }
1561
1562         if (req->rq_bulk == NULL) {
1563                 /* short io */
1564                 int nob, pg_count, i = 0;
1565                 unsigned char *buf;
1566
1567                 CDEBUG(D_CACHE, "Using short io read, size %d\n", rc);
1568                 pg_count = aa->aa_page_count;
1569                 buf = req_capsule_server_sized_get(&req->rq_pill, &RMF_SHORT_IO,
1570                                                    rc);
1571                 nob = rc;
1572                 while (nob > 0 && pg_count > 0) {
1573                         unsigned char *ptr;
1574                         int count = aa->aa_ppga[i]->count > nob ?
1575                                     nob : aa->aa_ppga[i]->count;
1576
1577                         CDEBUG(D_CACHE, "page %p count %d\n",
1578                                aa->aa_ppga[i]->pg, count);
1579                         ptr = ll_kmap_atomic(aa->aa_ppga[i]->pg, KM_USER0);
1580                         memcpy(ptr + (aa->aa_ppga[i]->off & ~PAGE_MASK), buf,
1581                                count);
1582                         ll_kunmap_atomic((void *) ptr, KM_USER0);
1583
1584                         buf += count;
1585                         nob -= count;
1586                         i++;
1587                         pg_count--;
1588                 }
1589         }
1590
1591         if (rc < aa->aa_requested_nob)
1592                 handle_short_read(rc, aa->aa_page_count, aa->aa_ppga);
1593
1594         if (body->oa.o_valid & OBD_MD_FLCKSUM) {
1595                 static int cksum_counter;
1596                 u32        server_cksum = body->oa.o_cksum;
1597                 char      *via = "";
1598                 char      *router = "";
1599                 enum cksum_types cksum_type;
1600
1601                 cksum_type = cksum_type_unpack(body->oa.o_valid &OBD_MD_FLFLAGS?
1602                                                body->oa.o_flags : 0);
1603                 rc = osc_checksum_bulk(rc, aa->aa_page_count, aa->aa_ppga,
1604                                        OST_READ, cksum_type, &client_cksum);
1605                 if (rc < 0) {
1606                         CDEBUG(D_PAGE,
1607                                "failed to calculate checksum, rc = %d\n", rc);
1608                         GOTO(out, rc);
1609                 }
1610                 if (req->rq_bulk != NULL &&
1611                     peer->nid != req->rq_bulk->bd_sender) {
1612                         via = " via ";
1613                         router = libcfs_nid2str(req->rq_bulk->bd_sender);
1614                 }
1615
1616                 if (server_cksum != client_cksum) {
1617                         struct ost_body *clbody;
1618                         u32 page_count = aa->aa_page_count;
1619
1620                         clbody = req_capsule_client_get(&req->rq_pill,
1621                                                         &RMF_OST_BODY);
1622                         if (cli->cl_checksum_dump)
1623                                 dump_all_bulk_pages(&clbody->oa, page_count,
1624                                                     aa->aa_ppga, server_cksum,
1625                                                     client_cksum);
1626
1627                         LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from "
1628                                            "%s%s%s inode "DFID" object "DOSTID
1629                                            " extent [%llu-%llu], client %x, "
1630                                            "server %x, cksum_type %x\n",
1631                                            req->rq_import->imp_obd->obd_name,
1632                                            libcfs_nid2str(peer->nid),
1633                                            via, router,
1634                                            clbody->oa.o_valid & OBD_MD_FLFID ?
1635                                                 clbody->oa.o_parent_seq : 0ULL,
1636                                            clbody->oa.o_valid & OBD_MD_FLFID ?
1637                                                 clbody->oa.o_parent_oid : 0,
1638                                            clbody->oa.o_valid & OBD_MD_FLFID ?
1639                                                 clbody->oa.o_parent_ver : 0,
1640                                            POSTID(&body->oa.o_oi),
1641                                            aa->aa_ppga[0]->off,
1642                                            aa->aa_ppga[page_count-1]->off +
1643                                            aa->aa_ppga[page_count-1]->count - 1,
1644                                            client_cksum, server_cksum,
1645                                            cksum_type);
1646                         cksum_counter = 0;
1647                         aa->aa_oa->o_cksum = client_cksum;
1648                         rc = -EAGAIN;
1649                 } else {
1650                         cksum_counter++;
1651                         CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum);
1652                         rc = 0;
1653                 }
1654         } else if (unlikely(client_cksum)) {
1655                 static int cksum_missed;
1656
1657                 cksum_missed++;
1658                 if ((cksum_missed & (-cksum_missed)) == cksum_missed)
1659                         CERROR("Checksum %u requested from %s but not sent\n",
1660                                cksum_missed, libcfs_nid2str(peer->nid));
1661         } else {
1662                 rc = 0;
1663         }
1664 out:
1665         if (rc >= 0)
1666                 lustre_get_wire_obdo(&req->rq_import->imp_connect_data,
1667                                      aa->aa_oa, &body->oa);
1668
1669         RETURN(rc);
1670 }
1671
1672 static int osc_brw_redo_request(struct ptlrpc_request *request,
1673                                 struct osc_brw_async_args *aa, int rc)
1674 {
1675         struct ptlrpc_request *new_req;
1676         struct osc_brw_async_args *new_aa;
1677         struct osc_async_page *oap;
1678         ENTRY;
1679
1680         DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request,
1681                   "redo for recoverable error %d", rc);
1682
1683         rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) ==
1684                                 OST_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ,
1685                                   aa->aa_cli, aa->aa_oa, aa->aa_page_count,
1686                                   aa->aa_ppga, &new_req, 1);
1687         if (rc)
1688                 RETURN(rc);
1689
1690         list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) {
1691                 if (oap->oap_request != NULL) {
1692                         LASSERTF(request == oap->oap_request,
1693                                  "request %p != oap_request %p\n",
1694                                  request, oap->oap_request);
1695                         if (oap->oap_interrupted) {
1696                                 ptlrpc_req_finished(new_req);
1697                                 RETURN(-EINTR);
1698                         }
1699                 }
1700         }
1701         /* New request takes over pga and oaps from old request.
1702          * Note that copying a list_head doesn't work, need to move it... */
1703         aa->aa_resends++;
1704         new_req->rq_interpret_reply = request->rq_interpret_reply;
1705         new_req->rq_async_args = request->rq_async_args;
1706         new_req->rq_commit_cb = request->rq_commit_cb;
1707         /* cap resend delay to the current request timeout, this is similar to
1708          * what ptlrpc does (see after_reply()) */
1709         if (aa->aa_resends > new_req->rq_timeout)
1710                 new_req->rq_sent = ktime_get_real_seconds() + new_req->rq_timeout;
1711         else
1712                 new_req->rq_sent = ktime_get_real_seconds() + aa->aa_resends;
1713         new_req->rq_generation_set = 1;
1714         new_req->rq_import_generation = request->rq_import_generation;
1715
1716         new_aa = ptlrpc_req_async_args(new_req);
1717
1718         INIT_LIST_HEAD(&new_aa->aa_oaps);
1719         list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps);
1720         INIT_LIST_HEAD(&new_aa->aa_exts);
1721         list_splice_init(&aa->aa_exts, &new_aa->aa_exts);
1722         new_aa->aa_resends = aa->aa_resends;
1723
1724         list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) {
1725                 if (oap->oap_request) {
1726                         ptlrpc_req_finished(oap->oap_request);
1727                         oap->oap_request = ptlrpc_request_addref(new_req);
1728                 }
1729         }
1730
1731         /* XXX: This code will run into problem if we're going to support
1732          * to add a series of BRW RPCs into a self-defined ptlrpc_request_set
1733          * and wait for all of them to be finished. We should inherit request
1734          * set from old request. */
1735         ptlrpcd_add_req(new_req);
1736
1737         DEBUG_REQ(D_INFO, new_req, "new request");
1738         RETURN(0);
1739 }
1740
1741 /*
1742  * ugh, we want disk allocation on the target to happen in offset order.  we'll
1743  * follow sedgewicks advice and stick to the dead simple shellsort -- it'll do
1744  * fine for our small page arrays and doesn't require allocation.  its an
1745  * insertion sort that swaps elements that are strides apart, shrinking the
1746  * stride down until its '1' and the array is sorted.
1747  */
1748 static void sort_brw_pages(struct brw_page **array, int num)
1749 {
1750         int stride, i, j;
1751         struct brw_page *tmp;
1752
1753         if (num == 1)
1754                 return;
1755         for (stride = 1; stride < num ; stride = (stride * 3) + 1)
1756                 ;
1757
1758         do {
1759                 stride /= 3;
1760                 for (i = stride ; i < num ; i++) {
1761                         tmp = array[i];
1762                         j = i;
1763                         while (j >= stride && array[j - stride]->off > tmp->off) {
1764                                 array[j] = array[j - stride];
1765                                 j -= stride;
1766                         }
1767                         array[j] = tmp;
1768                 }
1769         } while (stride > 1);
1770 }
1771
1772 static void osc_release_ppga(struct brw_page **ppga, size_t count)
1773 {
1774         LASSERT(ppga != NULL);
1775         OBD_FREE(ppga, sizeof(*ppga) * count);
1776 }
1777
1778 static int brw_interpret(const struct lu_env *env,
1779                          struct ptlrpc_request *req, void *data, int rc)
1780 {
1781         struct osc_brw_async_args *aa = data;
1782         struct osc_extent *ext;
1783         struct osc_extent *tmp;
1784         struct client_obd *cli = aa->aa_cli;
1785         unsigned long           transferred = 0;
1786         ENTRY;
1787
1788         rc = osc_brw_fini_request(req, rc);
1789         CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc);
1790         /* When server return -EINPROGRESS, client should always retry
1791          * regardless of the number of times the bulk was resent already. */
1792         if (osc_recoverable_error(rc) && !req->rq_no_delay) {
1793                 if (req->rq_import_generation !=
1794                     req->rq_import->imp_generation) {
1795                         CDEBUG(D_HA, "%s: resend cross eviction for object: "
1796                                ""DOSTID", rc = %d.\n",
1797                                req->rq_import->imp_obd->obd_name,
1798                                POSTID(&aa->aa_oa->o_oi), rc);
1799                 } else if (rc == -EINPROGRESS ||
1800                     client_should_resend(aa->aa_resends, aa->aa_cli)) {
1801                         rc = osc_brw_redo_request(req, aa, rc);
1802                 } else {
1803                         CERROR("%s: too many resent retries for object: "
1804                                "%llu:%llu, rc = %d.\n",
1805                                req->rq_import->imp_obd->obd_name,
1806                                POSTID(&aa->aa_oa->o_oi), rc);
1807                 }
1808
1809                 if (rc == 0)
1810                         RETURN(0);
1811                 else if (rc == -EAGAIN || rc == -EINPROGRESS)
1812                         rc = -EIO;
1813         }
1814
1815         if (rc == 0) {
1816                 struct obdo *oa = aa->aa_oa;
1817                 struct cl_attr *attr = &osc_env_info(env)->oti_attr;
1818                 unsigned long valid = 0;
1819                 struct cl_object *obj;
1820                 struct osc_async_page *last;
1821
1822                 last = brw_page2oap(aa->aa_ppga[aa->aa_page_count - 1]);
1823                 obj = osc2cl(last->oap_obj);
1824
1825                 cl_object_attr_lock(obj);
1826                 if (oa->o_valid & OBD_MD_FLBLOCKS) {
1827                         attr->cat_blocks = oa->o_blocks;
1828                         valid |= CAT_BLOCKS;
1829                 }
1830                 if (oa->o_valid & OBD_MD_FLMTIME) {
1831                         attr->cat_mtime = oa->o_mtime;
1832                         valid |= CAT_MTIME;
1833                 }
1834                 if (oa->o_valid & OBD_MD_FLATIME) {
1835                         attr->cat_atime = oa->o_atime;
1836                         valid |= CAT_ATIME;
1837                 }
1838                 if (oa->o_valid & OBD_MD_FLCTIME) {
1839                         attr->cat_ctime = oa->o_ctime;
1840                         valid |= CAT_CTIME;
1841                 }
1842
1843                 if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) {
1844                         struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo;
1845                         loff_t last_off = last->oap_count + last->oap_obj_off +
1846                                 last->oap_page_off;
1847
1848                         /* Change file size if this is an out of quota or
1849                          * direct IO write and it extends the file size */
1850                         if (loi->loi_lvb.lvb_size < last_off) {
1851                                 attr->cat_size = last_off;
1852                                 valid |= CAT_SIZE;
1853                         }
1854                         /* Extend KMS if it's not a lockless write */
1855                         if (loi->loi_kms < last_off &&
1856                             oap2osc_page(last)->ops_srvlock == 0) {
1857                                 attr->cat_kms = last_off;
1858                                 valid |= CAT_KMS;
1859                         }
1860                 }
1861
1862                 if (valid != 0)
1863                         cl_object_attr_update(env, obj, attr, valid);
1864                 cl_object_attr_unlock(obj);
1865         }
1866         OBDO_FREE(aa->aa_oa);
1867
1868         if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && rc == 0)
1869                 osc_inc_unstable_pages(req);
1870
1871         list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) {
1872                 list_del_init(&ext->oe_link);
1873                 osc_extent_finish(env, ext, 1,
1874                                   rc && req->rq_no_delay ? -EWOULDBLOCK : rc);
1875         }
1876         LASSERT(list_empty(&aa->aa_exts));
1877         LASSERT(list_empty(&aa->aa_oaps));
1878
1879         transferred = (req->rq_bulk == NULL ? /* short io */
1880                        aa->aa_requested_nob :
1881                        req->rq_bulk->bd_nob_transferred);
1882
1883         osc_release_ppga(aa->aa_ppga, aa->aa_page_count);
1884         ptlrpc_lprocfs_brw(req, transferred);
1885
1886         spin_lock(&cli->cl_loi_list_lock);
1887         /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters
1888          * is called so we know whether to go to sync BRWs or wait for more
1889          * RPCs to complete */
1890         if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE)
1891                 cli->cl_w_in_flight--;
1892         else
1893                 cli->cl_r_in_flight--;
1894         osc_wake_cache_waiters(cli);
1895         spin_unlock(&cli->cl_loi_list_lock);
1896
1897         osc_io_unplug(env, cli, NULL);
1898         RETURN(rc);
1899 }
1900
1901 static void brw_commit(struct ptlrpc_request *req)
1902 {
1903         /* If osc_inc_unstable_pages (via osc_extent_finish) races with
1904          * this called via the rq_commit_cb, I need to ensure
1905          * osc_dec_unstable_pages is still called. Otherwise unstable
1906          * pages may be leaked. */
1907         spin_lock(&req->rq_lock);
1908         if (likely(req->rq_unstable)) {
1909                 req->rq_unstable = 0;
1910                 spin_unlock(&req->rq_lock);
1911
1912                 osc_dec_unstable_pages(req);
1913         } else {
1914                 req->rq_committed = 1;
1915                 spin_unlock(&req->rq_lock);
1916         }
1917 }
1918
1919 /**
1920  * Build an RPC by the list of extent @ext_list. The caller must ensure
1921  * that the total pages in this list are NOT over max pages per RPC.
1922  * Extents in the list must be in OES_RPC state.
1923  */
1924 int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
1925                   struct list_head *ext_list, int cmd)
1926 {
1927         struct ptlrpc_request           *req = NULL;
1928         struct osc_extent               *ext;
1929         struct brw_page                 **pga = NULL;
1930         struct osc_brw_async_args       *aa = NULL;
1931         struct obdo                     *oa = NULL;
1932         struct osc_async_page           *oap;
1933         struct osc_object               *obj = NULL;
1934         struct cl_req_attr              *crattr = NULL;
1935         loff_t                          starting_offset = OBD_OBJECT_EOF;
1936         loff_t                          ending_offset = 0;
1937         int                             mpflag = 0;
1938         int                             mem_tight = 0;
1939         int                             page_count = 0;
1940         bool                            soft_sync = false;
1941         bool                            interrupted = false;
1942         bool                            ndelay = false;
1943         int                             i;
1944         int                             grant = 0;
1945         int                             rc;
1946         __u32                           layout_version = 0;
1947         struct list_head                rpc_list = LIST_HEAD_INIT(rpc_list);
1948         struct ost_body                 *body;
1949         ENTRY;
1950         LASSERT(!list_empty(ext_list));
1951
1952         /* add pages into rpc_list to build BRW rpc */
1953         list_for_each_entry(ext, ext_list, oe_link) {
1954                 LASSERT(ext->oe_state == OES_RPC);
1955                 mem_tight |= ext->oe_memalloc;
1956                 grant += ext->oe_grants;
1957                 page_count += ext->oe_nr_pages;
1958                 layout_version = MAX(layout_version, ext->oe_layout_version);
1959                 if (obj == NULL)
1960                         obj = ext->oe_obj;
1961         }
1962
1963         soft_sync = osc_over_unstable_soft_limit(cli);
1964         if (mem_tight)
1965                 mpflag = cfs_memory_pressure_get_and_set();
1966
1967         OBD_ALLOC(pga, sizeof(*pga) * page_count);
1968         if (pga == NULL)
1969                 GOTO(out, rc = -ENOMEM);
1970
1971         OBDO_ALLOC(oa);
1972         if (oa == NULL)
1973                 GOTO(out, rc = -ENOMEM);
1974
1975         i = 0;
1976         list_for_each_entry(ext, ext_list, oe_link) {
1977                 list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
1978                         if (mem_tight)
1979                                 oap->oap_brw_flags |= OBD_BRW_MEMALLOC;
1980                         if (soft_sync)
1981                                 oap->oap_brw_flags |= OBD_BRW_SOFT_SYNC;
1982                         pga[i] = &oap->oap_brw_page;
1983                         pga[i]->off = oap->oap_obj_off + oap->oap_page_off;
1984                         i++;
1985
1986                         list_add_tail(&oap->oap_rpc_item, &rpc_list);
1987                         if (starting_offset == OBD_OBJECT_EOF ||
1988                             starting_offset > oap->oap_obj_off)
1989                                 starting_offset = oap->oap_obj_off;
1990                         else
1991                                 LASSERT(oap->oap_page_off == 0);
1992                         if (ending_offset < oap->oap_obj_off + oap->oap_count)
1993                                 ending_offset = oap->oap_obj_off +
1994                                                 oap->oap_count;
1995                         else
1996                                 LASSERT(oap->oap_page_off + oap->oap_count ==
1997                                         PAGE_SIZE);
1998                         if (oap->oap_interrupted)
1999                                 interrupted = true;
2000                 }
2001                 if (ext->oe_ndelay)
2002                         ndelay = true;
2003         }
2004
2005         /* first page in the list */
2006         oap = list_entry(rpc_list.next, typeof(*oap), oap_rpc_item);
2007
2008         crattr = &osc_env_info(env)->oti_req_attr;
2009         memset(crattr, 0, sizeof(*crattr));
2010         crattr->cra_type = (cmd & OBD_BRW_WRITE) ? CRT_WRITE : CRT_READ;
2011         crattr->cra_flags = ~0ULL;
2012         crattr->cra_page = oap2cl_page(oap);
2013         crattr->cra_oa = oa;
2014         cl_req_attr_set(env, osc2cl(obj), crattr);
2015
2016         if (cmd == OBD_BRW_WRITE) {
2017                 oa->o_grant_used = grant;
2018                 if (layout_version > 0) {
2019                         CDEBUG(D_LAYOUT, DFID": write with layout version %u\n",
2020                                PFID(&oa->o_oi.oi_fid), layout_version);
2021
2022                         oa->o_layout_version = layout_version;
2023                         oa->o_valid |= OBD_MD_LAYOUT_VERSION;
2024                 }
2025         }
2026
2027         sort_brw_pages(pga, page_count);
2028         rc = osc_brw_prep_request(cmd, cli, oa, page_count, pga, &req, 0);
2029         if (rc != 0) {
2030                 CERROR("prep_req failed: %d\n", rc);
2031                 GOTO(out, rc);
2032         }
2033
2034         req->rq_commit_cb = brw_commit;
2035         req->rq_interpret_reply = brw_interpret;
2036         req->rq_memalloc = mem_tight != 0;
2037         oap->oap_request = ptlrpc_request_addref(req);
2038         if (interrupted && !req->rq_intr)
2039                 ptlrpc_mark_interrupted(req);
2040         if (ndelay) {
2041                 req->rq_no_resend = req->rq_no_delay = 1;
2042                 /* probably set a shorter timeout value.
2043                  * to handle ETIMEDOUT in brw_interpret() correctly. */
2044                 /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */
2045         }
2046
2047         /* Need to update the timestamps after the request is built in case
2048          * we race with setattr (locally or in queue at OST).  If OST gets
2049          * later setattr before earlier BRW (as determined by the request xid),
2050          * the OST will not use BRW timestamps.  Sadly, there is no obvious
2051          * way to do this in a single call.  bug 10150 */
2052         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
2053         crattr->cra_oa = &body->oa;
2054         crattr->cra_flags = OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME;
2055         cl_req_attr_set(env, osc2cl(obj), crattr);
2056         lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid);
2057
2058         CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
2059         aa = ptlrpc_req_async_args(req);
2060         INIT_LIST_HEAD(&aa->aa_oaps);
2061         list_splice_init(&rpc_list, &aa->aa_oaps);
2062         INIT_LIST_HEAD(&aa->aa_exts);
2063         list_splice_init(ext_list, &aa->aa_exts);
2064
2065         spin_lock(&cli->cl_loi_list_lock);
2066         starting_offset >>= PAGE_SHIFT;
2067         if (cmd == OBD_BRW_READ) {
2068                 cli->cl_r_in_flight++;
2069                 lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count);
2070                 lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight);
2071                 lprocfs_oh_tally_log2(&cli->cl_read_offset_hist,
2072                                       starting_offset + 1);
2073         } else {
2074                 cli->cl_w_in_flight++;
2075                 lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count);
2076                 lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight);
2077                 lprocfs_oh_tally_log2(&cli->cl_write_offset_hist,
2078                                       starting_offset + 1);
2079         }
2080         spin_unlock(&cli->cl_loi_list_lock);
2081
2082         DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %ur/%uw in flight",
2083                   page_count, aa, cli->cl_r_in_flight,
2084                   cli->cl_w_in_flight);
2085         OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_IO, cfs_fail_val);
2086
2087         ptlrpcd_add_req(req);
2088         rc = 0;
2089         EXIT;
2090
2091 out:
2092         if (mem_tight != 0)
2093                 cfs_memory_pressure_restore(mpflag);
2094
2095         if (rc != 0) {
2096                 LASSERT(req == NULL);
2097
2098                 if (oa)
2099                         OBDO_FREE(oa);
2100                 if (pga)
2101                         OBD_FREE(pga, sizeof(*pga) * page_count);
2102                 /* this should happen rarely and is pretty bad, it makes the
2103                  * pending list not follow the dirty order */
2104                 while (!list_empty(ext_list)) {
2105                         ext = list_entry(ext_list->next, struct osc_extent,
2106                                          oe_link);
2107                         list_del_init(&ext->oe_link);
2108                         osc_extent_finish(env, ext, 0, rc);
2109                 }
2110         }
2111         RETURN(rc);
2112 }
2113
2114 static int osc_set_lock_data(struct ldlm_lock *lock, void *data)
2115 {
2116         int set = 0;
2117
2118         LASSERT(lock != NULL);
2119
2120         lock_res_and_lock(lock);
2121
2122         if (lock->l_ast_data == NULL)
2123                 lock->l_ast_data = data;
2124         if (lock->l_ast_data == data)
2125                 set = 1;
2126
2127         unlock_res_and_lock(lock);
2128
2129         return set;
2130 }
2131
2132 int osc_enqueue_fini(struct ptlrpc_request *req, osc_enqueue_upcall_f upcall,
2133                      void *cookie, struct lustre_handle *lockh,
2134                      enum ldlm_mode mode, __u64 *flags, bool speculative,
2135                      int errcode)
2136 {
2137         bool intent = *flags & LDLM_FL_HAS_INTENT;
2138         int rc;
2139         ENTRY;
2140
2141         /* The request was created before ldlm_cli_enqueue call. */
2142         if (intent && errcode == ELDLM_LOCK_ABORTED) {
2143                 struct ldlm_reply *rep;
2144
2145                 rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
2146                 LASSERT(rep != NULL);
2147
2148                 rep->lock_policy_res1 =
2149                         ptlrpc_status_ntoh(rep->lock_policy_res1);
2150                 if (rep->lock_policy_res1)
2151                         errcode = rep->lock_policy_res1;
2152                 if (!speculative)
2153                         *flags |= LDLM_FL_LVB_READY;
2154         } else if (errcode == ELDLM_OK) {
2155                 *flags |= LDLM_FL_LVB_READY;
2156         }
2157
2158         /* Call the update callback. */
2159         rc = (*upcall)(cookie, lockh, errcode);
2160
2161         /* release the reference taken in ldlm_cli_enqueue() */
2162         if (errcode == ELDLM_LOCK_MATCHED)
2163                 errcode = ELDLM_OK;
2164         if (errcode == ELDLM_OK && lustre_handle_is_used(lockh))
2165                 ldlm_lock_decref(lockh, mode);
2166
2167         RETURN(rc);
2168 }
2169
2170 int osc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req,
2171                           struct osc_enqueue_args *aa, int rc)
2172 {
2173         struct ldlm_lock *lock;
2174         struct lustre_handle *lockh = &aa->oa_lockh;
2175         enum ldlm_mode mode = aa->oa_mode;
2176         struct ost_lvb *lvb = aa->oa_lvb;
2177         __u32 lvb_len = sizeof(*lvb);
2178         __u64 flags = 0;
2179
2180         ENTRY;
2181
2182         /* ldlm_cli_enqueue is holding a reference on the lock, so it must
2183          * be valid. */
2184         lock = ldlm_handle2lock(lockh);
2185         LASSERTF(lock != NULL,
2186                  "lockh %#llx, req %p, aa %p - client evicted?\n",
2187                  lockh->cookie, req, aa);
2188
2189         /* Take an additional reference so that a blocking AST that
2190          * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed
2191          * to arrive after an upcall has been executed by
2192          * osc_enqueue_fini(). */
2193         ldlm_lock_addref(lockh, mode);
2194
2195         /* Let cl_lock_state_wait fail with -ERESTARTSYS to unuse sublocks. */
2196         OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_HANG, 2);
2197
2198         /* Let CP AST to grant the lock first. */
2199         OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1);
2200
2201         if (aa->oa_speculative) {
2202                 LASSERT(aa->oa_lvb == NULL);
2203                 LASSERT(aa->oa_flags == NULL);
2204                 aa->oa_flags = &flags;
2205         }
2206
2207         /* Complete obtaining the lock procedure. */
2208         rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_type, 1,
2209                                    aa->oa_mode, aa->oa_flags, lvb, lvb_len,
2210                                    lockh, rc);
2211         /* Complete osc stuff. */
2212         rc = osc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode,
2213                               aa->oa_flags, aa->oa_speculative, rc);
2214
2215         OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10);
2216
2217         ldlm_lock_decref(lockh, mode);
2218         LDLM_LOCK_PUT(lock);
2219         RETURN(rc);
2220 }
2221
2222 struct ptlrpc_request_set *PTLRPCD_SET = (void *)1;
2223
2224 /* When enqueuing asynchronously, locks are not ordered, we can obtain a lock
2225  * from the 2nd OSC before a lock from the 1st one. This does not deadlock with
2226  * other synchronous requests, however keeping some locks and trying to obtain
2227  * others may take a considerable amount of time in a case of ost failure; and
2228  * when other sync requests do not get released lock from a client, the client
2229  * is evicted from the cluster -- such scenarious make the life difficult, so
2230  * release locks just after they are obtained. */
2231 int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id,
2232                      __u64 *flags, union ldlm_policy_data *policy,
2233                      struct ost_lvb *lvb, int kms_valid,
2234                      osc_enqueue_upcall_f upcall, void *cookie,
2235                      struct ldlm_enqueue_info *einfo,
2236                      struct ptlrpc_request_set *rqset, int async,
2237                      bool speculative)
2238 {
2239         struct obd_device *obd = exp->exp_obd;
2240         struct lustre_handle lockh = { 0 };
2241         struct ptlrpc_request *req = NULL;
2242         int intent = *flags & LDLM_FL_HAS_INTENT;
2243         __u64 match_flags = *flags;
2244         enum ldlm_mode mode;
2245         int rc;
2246         ENTRY;
2247
2248         /* Filesystem lock extents are extended to page boundaries so that
2249          * dealing with the page cache is a little smoother.  */
2250         policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK;
2251         policy->l_extent.end |= ~PAGE_MASK;
2252
2253         /*
2254          * kms is not valid when either object is completely fresh (so that no
2255          * locks are cached), or object was evicted. In the latter case cached
2256          * lock cannot be used, because it would prime inode state with
2257          * potentially stale LVB.
2258          */
2259         if (!kms_valid)
2260                 goto no_match;
2261
2262         /* Next, search for already existing extent locks that will cover us */
2263         /* If we're trying to read, we also search for an existing PW lock.  The
2264          * VFS and page cache already protect us locally, so lots of readers/
2265          * writers can share a single PW lock.
2266          *
2267          * There are problems with conversion deadlocks, so instead of
2268          * converting a read lock to a write lock, we'll just enqueue a new
2269          * one.
2270          *
2271          * At some point we should cancel the read lock instead of making them
2272          * send us a blocking callback, but there are problems with canceling
2273          * locks out from other users right now, too. */
2274         mode = einfo->ei_mode;
2275         if (einfo->ei_mode == LCK_PR)
2276                 mode |= LCK_PW;
2277         /* Normal lock requests must wait for the LVB to be ready before
2278          * matching a lock; speculative lock requests do not need to,
2279          * because they will not actually use the lock. */
2280         if (!speculative)
2281                 match_flags |= LDLM_FL_LVB_READY;
2282         if (intent != 0)
2283                 match_flags |= LDLM_FL_BLOCK_GRANTED;
2284         mode = ldlm_lock_match(obd->obd_namespace, match_flags, res_id,
2285                                einfo->ei_type, policy, mode, &lockh, 0);
2286         if (mode) {
2287                 struct ldlm_lock *matched;
2288
2289                 if (*flags & LDLM_FL_TEST_LOCK)
2290                         RETURN(ELDLM_OK);
2291
2292                 matched = ldlm_handle2lock(&lockh);
2293                 if (speculative) {
2294                         /* This DLM lock request is speculative, and does not
2295                          * have an associated IO request. Therefore if there
2296                          * is already a DLM lock, it wll just inform the
2297                          * caller to cancel the request for this stripe.*/
2298                         lock_res_and_lock(matched);
2299                         if (ldlm_extent_equal(&policy->l_extent,
2300                             &matched->l_policy_data.l_extent))
2301                                 rc = -EEXIST;
2302                         else
2303                                 rc = -ECANCELED;
2304                         unlock_res_and_lock(matched);
2305
2306                         ldlm_lock_decref(&lockh, mode);
2307                         LDLM_LOCK_PUT(matched);
2308                         RETURN(rc);
2309                 } else if (osc_set_lock_data(matched, einfo->ei_cbdata)) {
2310                         *flags |= LDLM_FL_LVB_READY;
2311
2312                         /* We already have a lock, and it's referenced. */
2313                         (*upcall)(cookie, &lockh, ELDLM_LOCK_MATCHED);
2314
2315                         ldlm_lock_decref(&lockh, mode);
2316                         LDLM_LOCK_PUT(matched);
2317                         RETURN(ELDLM_OK);
2318                 } else {
2319                         ldlm_lock_decref(&lockh, mode);
2320                         LDLM_LOCK_PUT(matched);
2321                 }
2322         }
2323
2324 no_match:
2325         if (*flags & (LDLM_FL_TEST_LOCK | LDLM_FL_MATCH_LOCK))
2326                 RETURN(-ENOLCK);
2327
2328         if (intent) {
2329                 req = ptlrpc_request_alloc(class_exp2cliimp(exp),
2330                                            &RQF_LDLM_ENQUEUE_LVB);
2331                 if (req == NULL)
2332                         RETURN(-ENOMEM);
2333
2334                 rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
2335                 if (rc) {
2336                         ptlrpc_request_free(req);
2337                         RETURN(rc);
2338                 }
2339
2340                 req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
2341                                      sizeof *lvb);
2342                 ptlrpc_request_set_replen(req);
2343         }
2344
2345         /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */
2346         *flags &= ~LDLM_FL_BLOCK_GRANTED;
2347
2348         rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb,
2349                               sizeof(*lvb), LVB_T_OST, &lockh, async);
2350         if (async) {
2351                 if (!rc) {
2352                         struct osc_enqueue_args *aa;
2353                         CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
2354                         aa = ptlrpc_req_async_args(req);
2355                         aa->oa_exp         = exp;
2356                         aa->oa_mode        = einfo->ei_mode;
2357                         aa->oa_type        = einfo->ei_type;
2358                         lustre_handle_copy(&aa->oa_lockh, &lockh);
2359                         aa->oa_upcall      = upcall;
2360                         aa->oa_cookie      = cookie;
2361                         aa->oa_speculative = speculative;
2362                         if (!speculative) {
2363                                 aa->oa_flags  = flags;
2364                                 aa->oa_lvb    = lvb;
2365                         } else {
2366                                 /* speculative locks are essentially to enqueue
2367                                  * a DLM lock  in advance, so we don't care
2368                                  * about the result of the enqueue. */
2369                                 aa->oa_lvb    = NULL;
2370                                 aa->oa_flags  = NULL;
2371                         }
2372
2373                         req->rq_interpret_reply =
2374                                 (ptlrpc_interpterer_t)osc_enqueue_interpret;
2375                         if (rqset == PTLRPCD_SET)
2376                                 ptlrpcd_add_req(req);
2377                         else
2378                                 ptlrpc_set_add_req(rqset, req);
2379                 } else if (intent) {
2380                         ptlrpc_req_finished(req);
2381                 }
2382                 RETURN(rc);
2383         }
2384
2385         rc = osc_enqueue_fini(req, upcall, cookie, &lockh, einfo->ei_mode,
2386                               flags, speculative, rc);
2387         if (intent)
2388                 ptlrpc_req_finished(req);
2389
2390         RETURN(rc);
2391 }
2392
2393 int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id,
2394                    enum ldlm_type type, union ldlm_policy_data *policy,
2395                    enum ldlm_mode mode, __u64 *flags, void *data,
2396                    struct lustre_handle *lockh, int unref)
2397 {
2398         struct obd_device *obd = exp->exp_obd;
2399         __u64 lflags = *flags;
2400         enum ldlm_mode rc;
2401         ENTRY;
2402
2403         if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH))
2404                 RETURN(-EIO);
2405
2406         /* Filesystem lock extents are extended to page boundaries so that
2407          * dealing with the page cache is a little smoother */
2408         policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK;
2409         policy->l_extent.end |= ~PAGE_MASK;
2410
2411         /* Next, search for already existing extent locks that will cover us */
2412         /* If we're trying to read, we also search for an existing PW lock.  The
2413          * VFS and page cache already protect us locally, so lots of readers/
2414          * writers can share a single PW lock. */
2415         rc = mode;
2416         if (mode == LCK_PR)
2417                 rc |= LCK_PW;
2418         rc = ldlm_lock_match(obd->obd_namespace, lflags,
2419                              res_id, type, policy, rc, lockh, unref);
2420         if (rc == 0 || lflags & LDLM_FL_TEST_LOCK)
2421                 RETURN(rc);
2422
2423         if (data != NULL) {
2424                 struct ldlm_lock *lock = ldlm_handle2lock(lockh);
2425
2426                 LASSERT(lock != NULL);
2427                 if (!osc_set_lock_data(lock, data)) {
2428                         ldlm_lock_decref(lockh, rc);
2429                         rc = 0;
2430                 }
2431                 LDLM_LOCK_PUT(lock);
2432         }
2433         RETURN(rc);
2434 }
2435
2436 static int osc_statfs_interpret(const struct lu_env *env,
2437                                 struct ptlrpc_request *req,
2438                                 struct osc_async_args *aa, int rc)
2439 {
2440         struct obd_statfs *msfs;
2441         ENTRY;
2442
2443         if (rc == -EBADR)
2444                 /* The request has in fact never been sent
2445                  * due to issues at a higher level (LOV).
2446                  * Exit immediately since the caller is
2447                  * aware of the problem and takes care
2448                  * of the clean up */
2449                  RETURN(rc);
2450
2451         if ((rc == -ENOTCONN || rc == -EAGAIN) &&
2452             (aa->aa_oi->oi_flags & OBD_STATFS_NODELAY))
2453                 GOTO(out, rc = 0);
2454
2455         if (rc != 0)
2456                 GOTO(out, rc);
2457
2458         msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
2459         if (msfs == NULL) {
2460                 GOTO(out, rc = -EPROTO);
2461         }
2462
2463         *aa->aa_oi->oi_osfs = *msfs;
2464 out:
2465         rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
2466         RETURN(rc);
2467 }
2468
2469 static int osc_statfs_async(struct obd_export *exp,
2470                             struct obd_info *oinfo, time64_t max_age,
2471                             struct ptlrpc_request_set *rqset)
2472 {
2473         struct obd_device     *obd = class_exp2obd(exp);
2474         struct ptlrpc_request *req;
2475         struct osc_async_args *aa;
2476         int                    rc;
2477         ENTRY;
2478
2479         /* We could possibly pass max_age in the request (as an absolute
2480          * timestamp or a "seconds.usec ago") so the target can avoid doing
2481          * extra calls into the filesystem if that isn't necessary (e.g.
2482          * during mount that would help a bit).  Having relative timestamps
2483          * is not so great if request processing is slow, while absolute
2484          * timestamps are not ideal because they need time synchronization. */
2485         req = ptlrpc_request_alloc(obd->u.cli.cl_import, &RQF_OST_STATFS);
2486         if (req == NULL)
2487                 RETURN(-ENOMEM);
2488
2489         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
2490         if (rc) {
2491                 ptlrpc_request_free(req);
2492                 RETURN(rc);
2493         }
2494         ptlrpc_request_set_replen(req);
2495         req->rq_request_portal = OST_CREATE_PORTAL;
2496         ptlrpc_at_set_req_timeout(req);
2497
2498         if (oinfo->oi_flags & OBD_STATFS_NODELAY) {
2499                 /* procfs requests not want stat in wait for avoid deadlock */
2500                 req->rq_no_resend = 1;
2501                 req->rq_no_delay = 1;
2502         }
2503
2504         req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret;
2505         CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
2506         aa = ptlrpc_req_async_args(req);
2507         aa->aa_oi = oinfo;
2508
2509         ptlrpc_set_add_req(rqset, req);
2510         RETURN(0);
2511 }
2512
2513 static int osc_statfs(const struct lu_env *env, struct obd_export *exp,
2514                       struct obd_statfs *osfs, time64_t max_age, __u32 flags)
2515 {
2516         struct obd_device     *obd = class_exp2obd(exp);
2517         struct obd_statfs     *msfs;
2518         struct ptlrpc_request *req;
2519         struct obd_import     *imp = NULL;
2520         int rc;
2521         ENTRY;
2522
2523         /*Since the request might also come from lprocfs, so we need
2524          *sync this with client_disconnect_export Bug15684*/
2525         down_read(&obd->u.cli.cl_sem);
2526         if (obd->u.cli.cl_import)
2527                 imp = class_import_get(obd->u.cli.cl_import);
2528         up_read(&obd->u.cli.cl_sem);
2529         if (!imp)
2530                 RETURN(-ENODEV);
2531
2532         /* We could possibly pass max_age in the request (as an absolute
2533          * timestamp or a "seconds.usec ago") so the target can avoid doing
2534          * extra calls into the filesystem if that isn't necessary (e.g.
2535          * during mount that would help a bit).  Having relative timestamps
2536          * is not so great if request processing is slow, while absolute
2537          * timestamps are not ideal because they need time synchronization. */
2538         req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS);
2539
2540         class_import_put(imp);
2541
2542         if (req == NULL)
2543                 RETURN(-ENOMEM);
2544
2545         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
2546         if (rc) {
2547                 ptlrpc_request_free(req);
2548                 RETURN(rc);
2549         }
2550         ptlrpc_request_set_replen(req);
2551         req->rq_request_portal = OST_CREATE_PORTAL;
2552         ptlrpc_at_set_req_timeout(req);
2553
2554         if (flags & OBD_STATFS_NODELAY) {
2555                 /* procfs requests not want stat in wait for avoid deadlock */
2556                 req->rq_no_resend = 1;
2557                 req->rq_no_delay = 1;
2558         }
2559
2560         rc = ptlrpc_queue_wait(req);
2561         if (rc)
2562                 GOTO(out, rc);
2563
2564         msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
2565         if (msfs == NULL) {
2566                 GOTO(out, rc = -EPROTO);
2567         }
2568
2569         *osfs = *msfs;
2570
2571         EXIT;
2572  out:
2573         ptlrpc_req_finished(req);
2574         return rc;
2575 }
2576
2577 static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
2578                          void *karg, void __user *uarg)
2579 {
2580         struct obd_device *obd = exp->exp_obd;
2581         struct obd_ioctl_data *data = karg;
2582         int err = 0;
2583         ENTRY;
2584
2585         if (!try_module_get(THIS_MODULE)) {
2586                 CERROR("%s: cannot get module '%s'\n", obd->obd_name,
2587                        module_name(THIS_MODULE));
2588                 return -EINVAL;
2589         }
2590         switch (cmd) {
2591         case OBD_IOC_CLIENT_RECOVER:
2592                 err = ptlrpc_recover_import(obd->u.cli.cl_import,
2593                                             data->ioc_inlbuf1, 0);
2594                 if (err > 0)
2595                         err = 0;
2596                 GOTO(out, err);
2597         case IOC_OSC_SET_ACTIVE:
2598                 err = ptlrpc_set_import_active(obd->u.cli.cl_import,
2599                                                data->ioc_offset);
2600                 GOTO(out, err);
2601         case OBD_IOC_PING_TARGET:
2602                 err = ptlrpc_obd_ping(obd);
2603                 GOTO(out, err);
2604         default:
2605                 CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n",
2606                        cmd, current_comm());
2607                 GOTO(out, err = -ENOTTY);
2608         }
2609 out:
2610         module_put(THIS_MODULE);
2611         return err;
2612 }
2613
2614 int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
2615                        u32 keylen, void *key, u32 vallen, void *val,
2616                        struct ptlrpc_request_set *set)
2617 {
2618         struct ptlrpc_request *req;
2619         struct obd_device     *obd = exp->exp_obd;
2620         struct obd_import     *imp = class_exp2cliimp(exp);
2621         char                  *tmp;
2622         int                    rc;
2623         ENTRY;
2624
2625         OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10);
2626
2627         if (KEY_IS(KEY_CHECKSUM)) {
2628                 if (vallen != sizeof(int))
2629                         RETURN(-EINVAL);
2630                 exp->exp_obd->u.cli.cl_checksum = (*(int *)val) ? 1 : 0;
2631                 RETURN(0);
2632         }
2633
2634         if (KEY_IS(KEY_SPTLRPC_CONF)) {
2635                 sptlrpc_conf_client_adapt(obd);
2636                 RETURN(0);
2637         }
2638
2639         if (KEY_IS(KEY_FLUSH_CTX)) {
2640                 sptlrpc_import_flush_my_ctx(imp);
2641                 RETURN(0);
2642         }
2643
2644         if (KEY_IS(KEY_CACHE_SET)) {
2645                 struct client_obd *cli = &obd->u.cli;
2646
2647                 LASSERT(cli->cl_cache == NULL); /* only once */
2648                 cli->cl_cache = (struct cl_client_cache *)val;
2649                 cl_cache_incref(cli->cl_cache);
2650                 cli->cl_lru_left = &cli->cl_cache->ccc_lru_left;
2651
2652                 /* add this osc into entity list */
2653                 LASSERT(list_empty(&cli->cl_lru_osc));
2654                 spin_lock(&cli->cl_cache->ccc_lru_lock);
2655                 list_add(&cli->cl_lru_osc, &cli->cl_cache->ccc_lru);
2656                 spin_unlock(&cli->cl_cache->ccc_lru_lock);
2657
2658                 RETURN(0);
2659         }
2660
2661         if (KEY_IS(KEY_CACHE_LRU_SHRINK)) {
2662                 struct client_obd *cli = &obd->u.cli;
2663                 long nr = atomic_long_read(&cli->cl_lru_in_list) >> 1;
2664                 long target = *(long *)val;
2665
2666                 nr = osc_lru_shrink(env, cli, min(nr, target), true);
2667                 *(long *)val -= nr;
2668                 RETURN(0);
2669         }
2670
2671         if (!set && !KEY_IS(KEY_GRANT_SHRINK))
2672                 RETURN(-EINVAL);
2673
2674         /* We pass all other commands directly to OST. Since nobody calls osc
2675            methods directly and everybody is supposed to go through LOV, we
2676            assume lov checked invalid values for us.
2677            The only recognised values so far are evict_by_nid and mds_conn.
2678            Even if something bad goes through, we'd get a -EINVAL from OST
2679            anyway. */
2680
2681         req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ?
2682                                                 &RQF_OST_SET_GRANT_INFO :
2683                                                 &RQF_OBD_SET_INFO);
2684         if (req == NULL)
2685                 RETURN(-ENOMEM);
2686
2687         req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
2688                              RCL_CLIENT, keylen);
2689         if (!KEY_IS(KEY_GRANT_SHRINK))
2690                 req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL,
2691                                      RCL_CLIENT, vallen);
2692         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO);
2693         if (rc) {
2694                 ptlrpc_request_free(req);
2695                 RETURN(rc);
2696         }
2697
2698         tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
2699         memcpy(tmp, key, keylen);
2700         tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ?
2701                                                         &RMF_OST_BODY :
2702                                                         &RMF_SETINFO_VAL);
2703         memcpy(tmp, val, vallen);
2704
2705         if (KEY_IS(KEY_GRANT_SHRINK)) {
2706                 struct osc_grant_args *aa;
2707                 struct obdo *oa;
2708
2709                 CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
2710                 aa = ptlrpc_req_async_args(req);
2711                 OBDO_ALLOC(oa);
2712                 if (!oa) {
2713                         ptlrpc_req_finished(req);
2714                         RETURN(-ENOMEM);
2715                 }
2716                 *oa = ((struct ost_body *)val)->oa;
2717                 aa->aa_oa = oa;
2718                 req->rq_interpret_reply = osc_shrink_grant_interpret;
2719         }
2720
2721         ptlrpc_request_set_replen(req);
2722         if (!KEY_IS(KEY_GRANT_SHRINK)) {
2723                 LASSERT(set != NULL);
2724                 ptlrpc_set_add_req(set, req);
2725                 ptlrpc_check_set(NULL, set);
2726         } else {
2727                 ptlrpcd_add_req(req);
2728         }
2729
2730         RETURN(0);
2731 }
2732 EXPORT_SYMBOL(osc_set_info_async);
2733
2734 int osc_reconnect(const struct lu_env *env, struct obd_export *exp,
2735                   struct obd_device *obd, struct obd_uuid *cluuid,
2736                   struct obd_connect_data *data, void *localdata)
2737 {
2738         struct client_obd *cli = &obd->u.cli;
2739
2740         if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
2741                 long lost_grant;
2742                 long grant;
2743
2744                 spin_lock(&cli->cl_loi_list_lock);
2745                 grant = cli->cl_avail_grant + cli->cl_reserved_grant;
2746                 if (data->ocd_connect_flags & OBD_CONNECT_GRANT_PARAM)
2747                         grant += cli->cl_dirty_grant;
2748                 else
2749                         grant += cli->cl_dirty_pages << PAGE_SHIFT;
2750                 data->ocd_grant = grant ? : 2 * cli_brw_size(obd);
2751                 lost_grant = cli->cl_lost_grant;
2752                 cli->cl_lost_grant = 0;
2753                 spin_unlock(&cli->cl_loi_list_lock);
2754
2755                 CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d"
2756                        " ocd_grant: %d, lost: %ld.\n", data->ocd_connect_flags,
2757                        data->ocd_version, data->ocd_grant, lost_grant);
2758         }
2759
2760         RETURN(0);
2761 }
2762 EXPORT_SYMBOL(osc_reconnect);
2763
2764 int osc_disconnect(struct obd_export *exp)
2765 {
2766         struct obd_device *obd = class_exp2obd(exp);
2767         int rc;
2768
2769         rc = client_disconnect_export(exp);
2770         /**
2771          * Initially we put del_shrink_grant before disconnect_export, but it
2772          * causes the following problem if setup (connect) and cleanup
2773          * (disconnect) are tangled together.
2774          *      connect p1                     disconnect p2
2775          *   ptlrpc_connect_import
2776          *     ...............               class_manual_cleanup
2777          *                                     osc_disconnect
2778          *                                     del_shrink_grant
2779          *   ptlrpc_connect_interrupt
2780          *     init_grant_shrink
2781          *   add this client to shrink list
2782          *                                      cleanup_osc
2783          * Bang! pinger trigger the shrink.
2784          * So the osc should be disconnected from the shrink list, after we
2785          * are sure the import has been destroyed. BUG18662
2786          */
2787         if (obd->u.cli.cl_import == NULL)
2788                 osc_del_shrink_grant(&obd->u.cli);
2789         return rc;
2790 }
2791 EXPORT_SYMBOL(osc_disconnect);
2792
2793 int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd,
2794                                  struct hlist_node *hnode, void *arg)
2795 {
2796         struct lu_env *env = arg;
2797         struct ldlm_resource *res = cfs_hash_object(hs, hnode);
2798         struct ldlm_lock *lock;
2799         struct osc_object *osc = NULL;
2800         ENTRY;
2801
2802         lock_res(res);
2803         list_for_each_entry(lock, &res->lr_granted, l_res_link) {
2804                 if (lock->l_ast_data != NULL && osc == NULL) {
2805                         osc = lock->l_ast_data;
2806                         cl_object_get(osc2cl(osc));
2807                 }
2808
2809                 /* clear LDLM_FL_CLEANED flag to make sure it will be canceled
2810                  * by the 2nd round of ldlm_namespace_clean() call in
2811                  * osc_import_event(). */
2812                 ldlm_clear_cleaned(lock);
2813         }
2814         unlock_res(res);
2815
2816         if (osc != NULL) {
2817                 osc_object_invalidate(env, osc);
2818                 cl_object_put(env, osc2cl(osc));
2819         }
2820
2821         RETURN(0);
2822 }
2823 EXPORT_SYMBOL(osc_ldlm_resource_invalidate);
2824
2825 static int osc_import_event(struct obd_device *obd,
2826                             struct obd_import *imp,
2827                             enum obd_import_event event)
2828 {
2829         struct client_obd *cli;
2830         int rc = 0;
2831
2832         ENTRY;
2833         LASSERT(imp->imp_obd == obd);
2834
2835         switch (event) {
2836         case IMP_EVENT_DISCON: {
2837                 cli = &obd->u.cli;
2838                 spin_lock(&cli->cl_loi_list_lock);
2839                 cli->cl_avail_grant = 0;
2840                 cli->cl_lost_grant = 0;
2841                 spin_unlock(&cli->cl_loi_list_lock);
2842                 break;
2843         }
2844         case IMP_EVENT_INACTIVE: {
2845                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE);
2846                 break;
2847         }
2848         case IMP_EVENT_INVALIDATE: {
2849                 struct ldlm_namespace *ns = obd->obd_namespace;
2850                 struct lu_env         *env;
2851                 __u16                  refcheck;
2852
2853                 ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
2854
2855                 env = cl_env_get(&refcheck);
2856                 if (!IS_ERR(env)) {
2857                         osc_io_unplug(env, &obd->u.cli, NULL);
2858
2859                         cfs_hash_for_each_nolock(ns->ns_rs_hash,
2860                                                  osc_ldlm_resource_invalidate,
2861                                                  env, 0);
2862                         cl_env_put(env, &refcheck);
2863
2864                         ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
2865                 } else
2866                         rc = PTR_ERR(env);
2867                 break;
2868         }
2869         case IMP_EVENT_ACTIVE: {
2870                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE);
2871                 break;
2872         }
2873         case IMP_EVENT_OCD: {
2874                 struct obd_connect_data *ocd = &imp->imp_connect_data;
2875
2876                 if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT)
2877                         osc_init_grant(&obd->u.cli, ocd);
2878
2879                 /* See bug 7198 */
2880                 if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL)
2881                         imp->imp_client->cli_request_portal =OST_REQUEST_PORTAL;
2882
2883                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD);
2884                 break;
2885         }
2886         case IMP_EVENT_DEACTIVATE: {
2887                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DEACTIVATE);
2888                 break;
2889         }
2890         case IMP_EVENT_ACTIVATE: {
2891                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVATE);
2892                 break;
2893         }
2894         default:
2895                 CERROR("Unknown import event %d\n", event);
2896                 LBUG();
2897         }
2898         RETURN(rc);
2899 }
2900
2901 /**
2902  * Determine whether the lock can be canceled before replaying the lock
2903  * during recovery, see bug16774 for detailed information.
2904  *
2905  * \retval zero the lock can't be canceled
2906  * \retval other ok to cancel
2907  */
2908 static int osc_cancel_weight(struct ldlm_lock *lock)
2909 {
2910         /*
2911          * Cancel all unused and granted extent lock.
2912          */
2913         if (lock->l_resource->lr_type == LDLM_EXTENT &&
2914             lock->l_granted_mode == lock->l_req_mode &&
2915             osc_ldlm_weigh_ast(lock) == 0)
2916                 RETURN(1);
2917
2918         RETURN(0);
2919 }
2920
2921 static int brw_queue_work(const struct lu_env *env, void *data)
2922 {
2923         struct client_obd *cli = data;
2924
2925         CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli);
2926
2927         osc_io_unplug(env, cli, NULL);
2928         RETURN(0);
2929 }
2930
2931 int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg)
2932 {
2933         struct client_obd *cli = &obd->u.cli;
2934         void *handler;
2935         int rc;
2936
2937         ENTRY;
2938
2939         rc = ptlrpcd_addref();
2940         if (rc)
2941                 RETURN(rc);
2942
2943         rc = client_obd_setup(obd, lcfg);
2944         if (rc)
2945                 GOTO(out_ptlrpcd, rc);
2946
2947
2948         handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli);
2949         if (IS_ERR(handler))
2950                 GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler));
2951         cli->cl_writeback_work = handler;
2952
2953         handler = ptlrpcd_alloc_work(cli->cl_import, lru_queue_work, cli);
2954         if (IS_ERR(handler))
2955                 GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler));
2956         cli->cl_lru_work = handler;
2957
2958         rc = osc_quota_setup(obd);
2959         if (rc)
2960                 GOTO(out_ptlrpcd_work, rc);
2961
2962         cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL;
2963
2964         INIT_LIST_HEAD(&cli->cl_grant_shrink_list);
2965         RETURN(rc);
2966
2967 out_ptlrpcd_work:
2968         if (cli->cl_writeback_work != NULL) {
2969                 ptlrpcd_destroy_work(cli->cl_writeback_work);
2970                 cli->cl_writeback_work = NULL;
2971         }
2972         if (cli->cl_lru_work != NULL) {
2973                 ptlrpcd_destroy_work(cli->cl_lru_work);
2974                 cli->cl_lru_work = NULL;
2975         }
2976         client_obd_cleanup(obd);
2977 out_ptlrpcd:
2978         ptlrpcd_decref();
2979         RETURN(rc);
2980 }
2981 EXPORT_SYMBOL(osc_setup_common);
2982
2983 int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
2984 {
2985         struct client_obd *cli = &obd->u.cli;
2986         struct obd_type   *type;
2987         int                adding;
2988         int                added;
2989         int                req_count;
2990         int                rc;
2991
2992         ENTRY;
2993
2994         rc = osc_setup_common(obd, lcfg);
2995         if (rc < 0)
2996                 RETURN(rc);
2997
2998 #ifdef CONFIG_PROC_FS
2999         obd->obd_vars = lprocfs_osc_obd_vars;
3000 #endif
3001         /* If this is true then both client (osc) and server (osp) are on the
3002          * same node. The osp layer if loaded first will register the osc proc
3003          * directory. In that case this obd_device will be attached its proc
3004          * tree to type->typ_procsym instead of obd->obd_type->typ_procroot.
3005          */
3006         type = class_search_type(LUSTRE_OSP_NAME);
3007         if (type && type->typ_procsym) {
3008                 obd->obd_proc_entry = lprocfs_register(obd->obd_name,
3009                                                        type->typ_procsym,
3010                                                        obd->obd_vars, obd);
3011                 if (IS_ERR(obd->obd_proc_entry)) {
3012                         rc = PTR_ERR(obd->obd_proc_entry);
3013                         CERROR("error %d setting up lprocfs for %s\n", rc,
3014                                obd->obd_name);
3015                         obd->obd_proc_entry = NULL;
3016                 }
3017         }
3018
3019         rc = lprocfs_obd_setup(obd, false);
3020         if (!rc) {
3021                 /* If the basic OSC proc tree construction succeeded then
3022                  * lets do the rest.
3023                  */
3024                 lproc_osc_attach_seqstat(obd);
3025                 sptlrpc_lprocfs_cliobd_attach(obd);
3026                 ptlrpc_lprocfs_register_obd(obd);
3027         }
3028
3029         /*
3030          * We try to control the total number of requests with a upper limit
3031          * osc_reqpool_maxreqcount. There might be some race which will cause
3032          * over-limit allocation, but it is fine.
3033          */
3034         req_count = atomic_read(&osc_pool_req_count);
3035         if (req_count < osc_reqpool_maxreqcount) {
3036                 adding = cli->cl_max_rpcs_in_flight + 2;
3037                 if (req_count + adding > osc_reqpool_maxreqcount)
3038                         adding = osc_reqpool_maxreqcount - req_count;
3039
3040                 added = ptlrpc_add_rqs_to_pool(osc_rq_pool, adding);
3041                 atomic_add(added, &osc_pool_req_count);
3042         }
3043
3044         INIT_LIST_HEAD(&cli->cl_grant_shrink_list);
3045         ns_register_cancel(obd->obd_namespace, osc_cancel_weight);
3046
3047         spin_lock(&osc_shrink_lock);
3048         list_add_tail(&cli->cl_shrink_list, &osc_shrink_list);
3049         spin_unlock(&osc_shrink_lock);
3050
3051         RETURN(0);
3052 }
3053
3054 int osc_precleanup_common(struct obd_device *obd)
3055 {
3056         struct client_obd *cli = &obd->u.cli;
3057         ENTRY;
3058
3059         /* LU-464
3060          * for echo client, export may be on zombie list, wait for
3061          * zombie thread to cull it, because cli.cl_import will be
3062          * cleared in client_disconnect_export():
3063          *   class_export_destroy() -> obd_cleanup() ->
3064          *   echo_device_free() -> echo_client_cleanup() ->
3065          *   obd_disconnect() -> osc_disconnect() ->
3066          *   client_disconnect_export()
3067          */
3068         obd_zombie_barrier();
3069         if (cli->cl_writeback_work) {
3070                 ptlrpcd_destroy_work(cli->cl_writeback_work);
3071                 cli->cl_writeback_work = NULL;
3072         }
3073
3074         if (cli->cl_lru_work) {
3075                 ptlrpcd_destroy_work(cli->cl_lru_work);
3076                 cli->cl_lru_work = NULL;
3077         }
3078
3079         obd_cleanup_client_import(obd);
3080         RETURN(0);
3081 }
3082 EXPORT_SYMBOL(osc_precleanup_common);
3083
3084 static int osc_precleanup(struct obd_device *obd)
3085 {
3086         ENTRY;
3087
3088         osc_precleanup_common(obd);
3089
3090         ptlrpc_lprocfs_unregister_obd(obd);
3091         RETURN(0);
3092 }
3093
3094 int osc_cleanup_common(struct obd_device *obd)
3095 {
3096         struct client_obd *cli = &obd->u.cli;
3097         int rc;
3098
3099         ENTRY;
3100
3101         spin_lock(&osc_shrink_lock);
3102         list_del(&cli->cl_shrink_list);
3103         spin_unlock(&osc_shrink_lock);
3104
3105         /* lru cleanup */
3106         if (cli->cl_cache != NULL) {
3107                 LASSERT(atomic_read(&cli->cl_cache->ccc_users) > 0);
3108                 spin_lock(&cli->cl_cache->ccc_lru_lock);
3109                 list_del_init(&cli->cl_lru_osc);
3110                 spin_unlock(&cli->cl_cache->ccc_lru_lock);
3111                 cli->cl_lru_left = NULL;
3112                 cl_cache_decref(cli->cl_cache);
3113                 cli->cl_cache = NULL;
3114         }
3115
3116         /* free memory of osc quota cache */
3117         osc_quota_cleanup(obd);
3118
3119         rc = client_obd_cleanup(obd);
3120
3121         ptlrpcd_decref();
3122         RETURN(rc);
3123 }
3124 EXPORT_SYMBOL(osc_cleanup_common);
3125
3126 int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg)
3127 {
3128         int rc = class_process_proc_param(PARAM_OSC, obd->obd_vars, lcfg, obd);
3129         return rc > 0 ? 0: rc;
3130 }
3131
3132 static int osc_process_config(struct obd_device *obd, size_t len, void *buf)
3133 {
3134         return osc_process_config_base(obd, buf);
3135 }
3136
3137 static struct obd_ops osc_obd_ops = {
3138         .o_owner                = THIS_MODULE,
3139         .o_setup                = osc_setup,
3140         .o_precleanup           = osc_precleanup,
3141         .o_cleanup              = osc_cleanup_common,
3142         .o_add_conn             = client_import_add_conn,
3143         .o_del_conn             = client_import_del_conn,
3144         .o_connect              = client_connect_import,
3145         .o_reconnect            = osc_reconnect,
3146         .o_disconnect           = osc_disconnect,
3147         .o_statfs               = osc_statfs,
3148         .o_statfs_async         = osc_statfs_async,
3149         .o_create               = osc_create,
3150         .o_destroy              = osc_destroy,
3151         .o_getattr              = osc_getattr,
3152         .o_setattr              = osc_setattr,
3153         .o_iocontrol            = osc_iocontrol,
3154         .o_set_info_async       = osc_set_info_async,
3155         .o_import_event         = osc_import_event,
3156         .o_process_config       = osc_process_config,
3157         .o_quotactl             = osc_quotactl,
3158 };
3159
3160 static struct shrinker *osc_cache_shrinker;
3161 struct list_head osc_shrink_list = LIST_HEAD_INIT(osc_shrink_list);
3162 DEFINE_SPINLOCK(osc_shrink_lock);
3163
3164 #ifndef HAVE_SHRINKER_COUNT
3165 static int osc_cache_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
3166 {
3167         struct shrink_control scv = {
3168                 .nr_to_scan = shrink_param(sc, nr_to_scan),
3169                 .gfp_mask   = shrink_param(sc, gfp_mask)
3170         };
3171 #if !defined(HAVE_SHRINKER_WANT_SHRINK_PTR) && !defined(HAVE_SHRINK_CONTROL)
3172         struct shrinker *shrinker = NULL;
3173 #endif
3174
3175         (void)osc_cache_shrink_scan(shrinker, &scv);
3176
3177         return osc_cache_shrink_count(shrinker, &scv);
3178 }
3179 #endif
3180
3181 static int __init osc_init(void)
3182 {
3183         bool enable_proc = true;
3184         struct obd_type *type;
3185         unsigned int reqpool_size;
3186         unsigned int reqsize;
3187         int rc;
3188         DEF_SHRINKER_VAR(osc_shvar, osc_cache_shrink,
3189                          osc_cache_shrink_count, osc_cache_shrink_scan);
3190         ENTRY;
3191
3192         /* print an address of _any_ initialized kernel symbol from this
3193          * module, to allow debugging with gdb that doesn't support data
3194          * symbols from modules.*/
3195         CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches);
3196
3197         rc = lu_kmem_init(osc_caches);
3198         if (rc)
3199                 RETURN(rc);
3200
3201         type = class_search_type(LUSTRE_OSP_NAME);
3202         if (type != NULL && type->typ_procsym != NULL)
3203                 enable_proc = false;
3204
3205         rc = class_register_type(&osc_obd_ops, NULL, enable_proc, NULL,
3206                                  LUSTRE_OSC_NAME, &osc_device_type);
3207         if (rc)
3208                 GOTO(out_kmem, rc);
3209
3210         osc_cache_shrinker = set_shrinker(DEFAULT_SEEKS, &osc_shvar);
3211
3212         /* This is obviously too much memory, only prevent overflow here */
3213         if (osc_reqpool_mem_max >= 1 << 12 || osc_reqpool_mem_max == 0)
3214                 GOTO(out_type, rc = -EINVAL);
3215
3216         reqpool_size = osc_reqpool_mem_max << 20;
3217
3218         reqsize = 1;
3219         while (reqsize < OST_IO_MAXREQSIZE)
3220                 reqsize = reqsize << 1;
3221
3222         /*
3223          * We don't enlarge the request count in OSC pool according to
3224          * cl_max_rpcs_in_flight. The allocation from the pool will only be
3225          * tried after normal allocation failed. So a small OSC pool won't
3226          * cause much performance degression in most of cases.
3227          */
3228         osc_reqpool_maxreqcount = reqpool_size / reqsize;
3229
3230         atomic_set(&osc_pool_req_count, 0);
3231         osc_rq_pool = ptlrpc_init_rq_pool(0, OST_IO_MAXREQSIZE,
3232                                           ptlrpc_add_rqs_to_pool);
3233
3234         if (osc_rq_pool != NULL)
3235                 GOTO(out, rc);
3236         rc = -ENOMEM;
3237 out_type:
3238         class_unregister_type(LUSTRE_OSC_NAME);
3239 out_kmem:
3240         lu_kmem_fini(osc_caches);
3241 out:
3242         RETURN(rc);
3243 }
3244
3245 static void __exit osc_exit(void)
3246 {
3247         remove_shrinker(osc_cache_shrinker);
3248         class_unregister_type(LUSTRE_OSC_NAME);
3249         lu_kmem_fini(osc_caches);
3250         ptlrpc_free_rq_pool(osc_rq_pool);
3251 }
3252
3253 MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>");
3254 MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)");
3255 MODULE_VERSION(LUSTRE_VERSION_STRING);
3256 MODULE_LICENSE("GPL");
3257
3258 module_init(osc_init);
3259 module_exit(osc_exit);