/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2017, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 */

#define DEBUG_SUBSYSTEM S_OSC

#include <linux/workqueue.h>
#include <libcfs/libcfs.h>
#include <linux/falloc.h>
#include <lprocfs_status.h>
#include <lustre_dlm.h>
#include <lustre_fid.h>
#include <lustre_ha.h>
#include <uapi/linux/lustre/lustre_ioctl.h>
#include <lustre_net.h>
#include <lustre_obdo.h>
#include <obd.h>
#include <obd_cksum.h>
#include <obd_class.h>
#include <lustre_osc.h>

#include "osc_internal.h"
#include <lnet/lnet_rdma.h>

atomic_t osc_pool_req_count;
unsigned int osc_reqpool_maxreqcount;
struct ptlrpc_request_pool *osc_rq_pool;

/* max memory used for request pool, unit is MB */
static unsigned int osc_reqpool_mem_max = 5;
module_param(osc_reqpool_mem_max, uint, 0444);

static unsigned int osc_idle_timeout = 20;
module_param(osc_idle_timeout, uint, 0644);

#define osc_grant_args osc_brw_async_args

struct osc_setattr_args {
        struct obdo             *sa_oa;
        obd_enqueue_update_f     sa_upcall;
        void                    *sa_cookie;
};

struct osc_fsync_args {
        struct osc_object       *fa_obj;
        struct obdo             *fa_oa;
        obd_enqueue_update_f     fa_upcall;
        void                    *fa_cookie;
};

struct osc_ladvise_args {
        struct obdo             *la_oa;
        obd_enqueue_update_f     la_upcall;
        void                    *la_cookie;
};

static void osc_release_ppga(struct brw_page **ppga, size_t count);
static int brw_interpret(const struct lu_env *env, struct ptlrpc_request *req,
                         void *data, int rc);
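
/*
 * Pack the obdo @oa into the request's OST body, converting it to the
 * wire format according to the peer capabilities recorded in the
 * import's connect data.
 */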
void osc_pack_req_body(struct ptlrpc_request *req, struct obdo *oa)
{
        struct ost_body *body;

        body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
        LASSERT(body);

        lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
}

static int osc_getattr(const struct lu_env *env, struct obd_export *exp,
                       struct obdo *oa)
{
        struct ptlrpc_request   *req;
        struct ost_body         *body;
        int                      rc;

        ENTRY;
        req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR);
        if (req == NULL)
                RETURN(-ENOMEM);

        rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR);
        if (rc) {
                ptlrpc_request_free(req);
                RETURN(rc);
        }

        osc_pack_req_body(req, oa);

        ptlrpc_request_set_replen(req);

        rc = ptlrpc_queue_wait(req);
        if (rc)
                GOTO(out, rc);

        body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
        if (body == NULL)
                GOTO(out, rc = -EPROTO);

        CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
        lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa);

        oa->o_blksize = cli_brw_size(exp->exp_obd);
        oa->o_valid |= OBD_MD_FLBLKSZ;

        EXIT;
out:
        ptlrpc_req_finished(req);

        return rc;
}

static int osc_setattr(const struct lu_env *env, struct obd_export *exp,
                       struct obdo *oa)
{
        struct ptlrpc_request   *req;
        struct ost_body         *body;
        int                      rc;

        ENTRY;
        LASSERT(oa->o_valid & OBD_MD_FLGROUP);

        req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR);
        if (req == NULL)
                RETURN(-ENOMEM);

        rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR);
        if (rc) {
                ptlrpc_request_free(req);
                RETURN(rc);
        }

        osc_pack_req_body(req, oa);

        ptlrpc_request_set_replen(req);

        rc = ptlrpc_queue_wait(req);
        if (rc)
                GOTO(out, rc);

        body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
        if (body == NULL)
                GOTO(out, rc = -EPROTO);

        lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa);

        EXIT;
out:
        ptlrpc_req_finished(req);

        RETURN(rc);
}

static int osc_setattr_interpret(const struct lu_env *env,
                                 struct ptlrpc_request *req, void *args, int rc)
{
        struct osc_setattr_args *sa = args;
        struct ost_body *body;

        ENTRY;

        if (rc != 0)
                GOTO(out, rc);

        body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
        if (body == NULL)
                GOTO(out, rc = -EPROTO);

        lustre_get_wire_obdo(&req->rq_import->imp_connect_data, sa->sa_oa,
                             &body->oa);
out:
        rc = sa->sa_upcall(sa->sa_cookie, rc);
        RETURN(rc);
}

int osc_setattr_async(struct obd_export *exp, struct obdo *oa,
                      obd_enqueue_update_f upcall, void *cookie,
                      struct ptlrpc_request_set *rqset)
{
        struct ptlrpc_request   *req;
        struct osc_setattr_args *sa;
        int                      rc;

        ENTRY;

        req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR);
        if (req == NULL)
                RETURN(-ENOMEM);

        rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR);
        if (rc) {
                ptlrpc_request_free(req);
                RETURN(rc);
        }

        osc_pack_req_body(req, oa);

        ptlrpc_request_set_replen(req);

        /* do mds to ost setattr asynchronously */
        if (!rqset) {
                /* Do not wait for response. */
                ptlrpcd_add_req(req);
        } else {
                req->rq_interpret_reply = osc_setattr_interpret;

                sa = ptlrpc_req_async_args(sa, req);
                sa->sa_oa = oa;
                sa->sa_upcall = upcall;
                sa->sa_cookie = cookie;

                ptlrpc_set_add_req(rqset, req);
        }

        RETURN(0);
}
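
/*
 * Typical caller pattern for the rqset case (an illustrative sketch,
 * not code from this file; error handling elided):
 *
 *	set = ptlrpc_prep_set();
 *	rc = osc_setattr_async(exp, oa, my_upcall, my_cookie, set);
 *	if (rc == 0)
 *		rc = ptlrpc_set_wait(env, set);
 *	ptlrpc_set_destroy(set);
 *
 * With rqset == NULL the request is simply handed to ptlrpcd and the
 * caller never sees the reply.
 */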

static int osc_ladvise_interpret(const struct lu_env *env,
                                 struct ptlrpc_request *req,
                                 void *arg, int rc)
{
        struct osc_ladvise_args *la = arg;
        struct ost_body *body;
        ENTRY;

        if (rc != 0)
                GOTO(out, rc);

        body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
        if (body == NULL)
                GOTO(out, rc = -EPROTO);

        *la->la_oa = body->oa;
out:
        rc = la->la_upcall(la->la_cookie, rc);
        RETURN(rc);
}

/**
 * If rqset is NULL, do not wait for a response. The upcall and cookie
 * may also be NULL in this case.
 */
int osc_ladvise_base(struct obd_export *exp, struct obdo *oa,
                     struct ladvise_hdr *ladvise_hdr,
                     obd_enqueue_update_f upcall, void *cookie,
                     struct ptlrpc_request_set *rqset)
{
        struct ptlrpc_request   *req;
        struct ost_body         *body;
        struct osc_ladvise_args *la;
        int                      rc;
        struct lu_ladvise       *req_ladvise;
        struct lu_ladvise       *ladvise = ladvise_hdr->lah_advise;
        int                      num_advise = ladvise_hdr->lah_count;
        struct ladvise_hdr      *req_ladvise_hdr;
        ENTRY;

        req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_LADVISE);
        if (req == NULL)
                RETURN(-ENOMEM);

        req_capsule_set_size(&req->rq_pill, &RMF_OST_LADVISE, RCL_CLIENT,
                             num_advise * sizeof(*ladvise));
        rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_LADVISE);
        if (rc != 0) {
                ptlrpc_request_free(req);
                RETURN(rc);
        }
        req->rq_request_portal = OST_IO_PORTAL;
        ptlrpc_at_set_req_timeout(req);

        body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
        LASSERT(body);
        lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa,
                             oa);

        req_ladvise_hdr = req_capsule_client_get(&req->rq_pill,
                                                 &RMF_OST_LADVISE_HDR);
        memcpy(req_ladvise_hdr, ladvise_hdr, sizeof(*ladvise_hdr));

        req_ladvise = req_capsule_client_get(&req->rq_pill, &RMF_OST_LADVISE);
        memcpy(req_ladvise, ladvise, sizeof(*ladvise) * num_advise);
        ptlrpc_request_set_replen(req);

        if (rqset == NULL) {
                /* Do not wait for response. */
                ptlrpcd_add_req(req);
                RETURN(0);
        }

        req->rq_interpret_reply = osc_ladvise_interpret;
        la = ptlrpc_req_async_args(la, req);
        la->la_oa = oa;
        la->la_upcall = upcall;
        la->la_cookie = cookie;

        ptlrpc_set_add_req(rqset, req);

        RETURN(0);
}

static int osc_create(const struct lu_env *env, struct obd_export *exp,
                      struct obdo *oa)
{
        struct ptlrpc_request *req;
        struct ost_body       *body;
        int                    rc;
        ENTRY;

        LASSERT(oa != NULL);
        LASSERT(oa->o_valid & OBD_MD_FLGROUP);
        LASSERT(fid_seq_is_echo(ostid_seq(&oa->o_oi)));

        req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_CREATE);
        if (req == NULL)
                GOTO(out, rc = -ENOMEM);

        rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE);
        if (rc) {
                ptlrpc_request_free(req);
                GOTO(out, rc);
        }

        body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
        LASSERT(body);

        lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);

        ptlrpc_request_set_replen(req);

        rc = ptlrpc_queue_wait(req);
        if (rc)
                GOTO(out_req, rc);

        body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
        if (body == NULL)
                GOTO(out_req, rc = -EPROTO);

        CDEBUG(D_INFO, "oa flags %x\n", oa->o_flags);
        lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa);

        oa->o_blksize = cli_brw_size(exp->exp_obd);
        oa->o_valid |= OBD_MD_FLBLKSZ;

        CDEBUG(D_HA, "transno: %lld\n",
               lustre_msg_get_transno(req->rq_repmsg));
out_req:
        ptlrpc_req_finished(req);
out:
        RETURN(rc);
}

int osc_punch_send(struct obd_export *exp, struct obdo *oa,
                   obd_enqueue_update_f upcall, void *cookie)
{
        struct ptlrpc_request *req;
        struct osc_setattr_args *sa;
        struct obd_import *imp = class_exp2cliimp(exp);
        struct ost_body *body;
        int rc;

        ENTRY;

        req = ptlrpc_request_alloc(imp, &RQF_OST_PUNCH);
        if (req == NULL)
                RETURN(-ENOMEM);

        rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH);
        if (rc < 0) {
                ptlrpc_request_free(req);
                RETURN(rc);
        }

        osc_set_io_portal(req);

        ptlrpc_at_set_req_timeout(req);

        body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);

        lustre_set_wire_obdo(&imp->imp_connect_data, &body->oa, oa);

        ptlrpc_request_set_replen(req);

        req->rq_interpret_reply = osc_setattr_interpret;
        sa = ptlrpc_req_async_args(sa, req);
        sa->sa_oa = oa;
        sa->sa_upcall = upcall;
        sa->sa_cookie = cookie;

        ptlrpcd_add_req(req);

        RETURN(0);
}
EXPORT_SYMBOL(osc_punch_send);
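
/*
 * Note (a reading of the code, not a new interface): the punch range
 * travels inside the obdo itself, with oa->o_size holding the start
 * offset and oa->o_blocks the end offset (OBD_OBJECT_EOF to truncate
 * to the end of the object), so a caller is expected to do roughly:
 *
 *	oa->o_size = start;
 *	oa->o_blocks = OBD_OBJECT_EOF;
 *	oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
 *	rc = osc_punch_send(exp, oa, upcall, cookie);
 */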

/**
 * osc_fallocate_base() - Handle fallocate requests.
 *
 * @exp:    Export structure
 * @oa:     Attributes passed to OSS from client (obdo structure)
 * @upcall: Completion callback invoked when the RPC finishes
 * @cookie: Opaque data passed back to @upcall
 * @mode:   Operation done on given range.
 *
 * Only block allocation, i.e. the standard preallocate operation, is
 * supported currently; other mode flags are not supported yet.
 * ftruncate(2) or truncate(2) is supported via a SETATTR request.
 *
 * Return: Non-zero on failure and 0 on success.
 */
int osc_fallocate_base(struct obd_export *exp, struct obdo *oa,
                       obd_enqueue_update_f upcall, void *cookie, int mode)
{
        struct ptlrpc_request *req;
        struct osc_setattr_args *sa;
        struct ost_body *body;
        struct obd_import *imp = class_exp2cliimp(exp);
        int rc;
        ENTRY;

        oa->o_falloc_mode = mode;
        req = ptlrpc_request_alloc(imp, &RQF_OST_FALLOCATE);
        if (req == NULL)
                RETURN(-ENOMEM);

        rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_FALLOCATE);
        if (rc != 0) {
                ptlrpc_request_free(req);
                RETURN(rc);
        }

        body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
        LASSERT(body);

        lustre_set_wire_obdo(&imp->imp_connect_data, &body->oa, oa);

        ptlrpc_request_set_replen(req);

        req->rq_interpret_reply = osc_setattr_interpret;
        BUILD_BUG_ON(sizeof(*sa) > sizeof(req->rq_async_args));
        sa = ptlrpc_req_async_args(sa, req);
        sa->sa_oa = oa;
        sa->sa_upcall = upcall;
        sa->sa_cookie = cookie;

        ptlrpcd_add_req(req);

        RETURN(0);
}
EXPORT_SYMBOL(osc_fallocate_base);
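
/*
 * Hypothetical caller sketch (illustrative only, not code from this
 * file): the mode is screened against what is documented as supported
 * above, and the byte range is carried in the obdo before the RPC is
 * fired:
 *
 *	if (mode & ~FALLOC_FL_KEEP_SIZE)
 *		return -EOPNOTSUPP;
 *	oa->o_size = start;
 *	oa->o_blocks = end;
 *	oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
 *	rc = osc_fallocate_base(exp, oa, upcall, cookie, mode);
 */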

static int osc_sync_interpret(const struct lu_env *env,
                              struct ptlrpc_request *req, void *args, int rc)
{
        struct osc_fsync_args *fa = args;
        struct ost_body *body;
        struct cl_attr *attr = &osc_env_info(env)->oti_attr;
        unsigned long valid = 0;
        struct cl_object *obj;
        ENTRY;

        if (rc != 0)
                GOTO(out, rc);

        body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
        if (body == NULL) {
                CERROR("can't unpack ost_body\n");
                GOTO(out, rc = -EPROTO);
        }

        *fa->fa_oa = body->oa;
        obj = osc2cl(fa->fa_obj);

        /* Update osc object's blocks attribute */
        cl_object_attr_lock(obj);
        if (body->oa.o_valid & OBD_MD_FLBLOCKS) {
                attr->cat_blocks = body->oa.o_blocks;
                valid |= CAT_BLOCKS;
        }

        if (valid != 0)
                cl_object_attr_update(env, obj, attr, valid);
        cl_object_attr_unlock(obj);

out:
        rc = fa->fa_upcall(fa->fa_cookie, rc);
        RETURN(rc);
}

int osc_sync_base(struct osc_object *obj, struct obdo *oa,
                  obd_enqueue_update_f upcall, void *cookie,
                  struct ptlrpc_request_set *rqset)
{
        struct obd_export     *exp = osc_export(obj);
        struct ptlrpc_request *req;
        struct ost_body       *body;
        struct osc_fsync_args *fa;
        int                    rc;
        ENTRY;

        req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC);
        if (req == NULL)
                RETURN(-ENOMEM);

        rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SYNC);
        if (rc) {
                ptlrpc_request_free(req);
                RETURN(rc);
        }

        /* overload the size and blocks fields in the oa with start/end */
        body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
        LASSERT(body);
        lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);

        ptlrpc_request_set_replen(req);
        req->rq_interpret_reply = osc_sync_interpret;

        fa = ptlrpc_req_async_args(fa, req);
        fa->fa_obj = obj;
        fa->fa_oa = oa;
        fa->fa_upcall = upcall;
        fa->fa_cookie = cookie;

        ptlrpc_set_add_req(rqset, req);

        RETURN(0);
}

/* Find and cancel locally the locks matched by @mode in the resource
 * identified by @oa->o_oi. Found locks are added to the @cancels list.
 * Returns the number of locks added to the list.
 */
static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa,
                                   struct list_head *cancels,
                                   enum ldlm_mode mode, __u64 lock_flags)
{
        struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
        struct ldlm_res_id res_id;
        struct ldlm_resource *res;
        int count;
        ENTRY;

        /* Return, i.e. cancel nothing, only if ELC is supported (flag in
         * export) but disabled through procfs (flag in NS).
         *
         * This distinguishes from a case when ELC is not supported originally,
         * when we still want to cancel locks in advance and just cancel them
         * locally, without sending any RPC.
         */
        if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns))
                RETURN(0);

        ostid_build_res_name(&oa->o_oi, &res_id);
        res = ldlm_resource_get(ns, &res_id, 0, 0);
        if (IS_ERR(res))
                RETURN(0);

        LDLM_RESOURCE_ADDREF(res);
        count = ldlm_cancel_resource_local(res, cancels, NULL, mode,
                                           lock_flags, 0, NULL);
        LDLM_RESOURCE_DELREF(res);
        ldlm_resource_putref(res);
        RETURN(count);
}

static int osc_destroy_interpret(const struct lu_env *env,
                                 struct ptlrpc_request *req, void *args, int rc)
{
        struct client_obd *cli = &req->rq_import->imp_obd->u.cli;

        atomic_dec(&cli->cl_destroy_in_flight);
        wake_up(&cli->cl_destroy_waitq);

        return 0;
}

static int osc_can_send_destroy(struct client_obd *cli)
{
        if (atomic_inc_return(&cli->cl_destroy_in_flight) <=
            cli->cl_max_rpcs_in_flight) {
                /* The destroy request can be sent */
                return 1;
        }
        if (atomic_dec_return(&cli->cl_destroy_in_flight) <
            cli->cl_max_rpcs_in_flight) {
                /*
                 * The counter has been modified between the two atomic
                 * operations.
                 */
                wake_up(&cli->cl_destroy_waitq);
        }
        return 0;
}
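
/*
 * Note on the lock-free accounting above: when the increment overshoots
 * the limit, a destroy RPC may have completed (osc_destroy_interpret()
 * decrementing the counter) between our atomic_inc() and atomic_dec().
 * The extra wake_up() covers exactly that window; woken waiters simply
 * re-evaluate osc_can_send_destroy(), so a freed slot cannot be lost.
 */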

static int osc_destroy(const struct lu_env *env, struct obd_export *exp,
                       struct obdo *oa)
{
        struct client_obd     *cli = &exp->exp_obd->u.cli;
        struct ptlrpc_request *req;
        struct ost_body       *body;
        LIST_HEAD(cancels);
        int rc, count;
        ENTRY;

        if (!oa) {
                CDEBUG(D_INFO, "oa NULL\n");
                RETURN(-EINVAL);
        }

        count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW,
                                        LDLM_FL_DISCARD_DATA);

        req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_DESTROY);
        if (req == NULL) {
                ldlm_lock_list_put(&cancels, l_bl_ast, count);
                RETURN(-ENOMEM);
        }

        rc = ldlm_prep_elc_req(exp, req, LUSTRE_OST_VERSION, OST_DESTROY,
                               0, &cancels, count);
        if (rc) {
                ptlrpc_request_free(req);
                RETURN(rc);
        }

        req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
        ptlrpc_at_set_req_timeout(req);

        body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
        LASSERT(body);
        lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);

        ptlrpc_request_set_replen(req);

        req->rq_interpret_reply = osc_destroy_interpret;
        if (!osc_can_send_destroy(cli)) {
                /*
                 * Wait until the number of on-going destroy RPCs drops
                 * below cl_max_rpcs_in_flight.
                 */
                rc = l_wait_event_abortable_exclusive(
                        cli->cl_destroy_waitq,
                        osc_can_send_destroy(cli));
                if (rc) {
                        ptlrpc_req_finished(req);
                        RETURN(-EINTR);
                }
        }

        /* Do not wait for response */
        ptlrpcd_add_req(req);
        RETURN(0);
}

static void osc_announce_cached(struct client_obd *cli, struct obdo *oa,
                                long writing_bytes)
{
        u64 bits = OBD_MD_FLBLOCKS | OBD_MD_FLGRANT;

        LASSERT(!(oa->o_valid & bits));

        oa->o_valid |= bits;
        spin_lock(&cli->cl_loi_list_lock);
        if (cli->cl_ocd_grant_param)
                oa->o_dirty = cli->cl_dirty_grant;
        else
                oa->o_dirty = cli->cl_dirty_pages << PAGE_SHIFT;
        if (unlikely(cli->cl_dirty_pages > cli->cl_dirty_max_pages)) {
                CERROR("dirty %lu > dirty_max %lu\n",
                       cli->cl_dirty_pages,
                       cli->cl_dirty_max_pages);
                oa->o_undirty = 0;
        } else if (unlikely(atomic_long_read(&obd_dirty_pages) >
                            (long)(obd_max_dirty_pages + 1))) {
                /* The atomic_read() and the atomic_inc() are not covered
                 * by a lock, so they may safely race and trip this CERROR()
                 * unless we add in a small fudge factor (+1).
                 */
                CERROR("%s: dirty %ld > system dirty_max %ld\n",
                       cli_name(cli), atomic_long_read(&obd_dirty_pages),
                       obd_max_dirty_pages);
                oa->o_undirty = 0;
        } else if (unlikely(cli->cl_dirty_max_pages - cli->cl_dirty_pages >
                            0x7fffffff)) {
                CERROR("dirty %lu - dirty_max %lu too big???\n",
                       cli->cl_dirty_pages, cli->cl_dirty_max_pages);
                oa->o_undirty = 0;
        } else {
                unsigned long nrpages;
                unsigned long undirty;

                nrpages = cli->cl_max_pages_per_rpc;
                nrpages *= cli->cl_max_rpcs_in_flight + 1;
                nrpages = max(nrpages, cli->cl_dirty_max_pages);
                undirty = nrpages << PAGE_SHIFT;
                if (cli->cl_ocd_grant_param) {
                        int nrextents;

                        /* take extent tax into account when asking for more
                         * grant space
                         */
                        nrextents = (nrpages + cli->cl_max_extent_pages - 1) /
                                     cli->cl_max_extent_pages;
                        undirty += nrextents * cli->cl_grant_extent_tax;
                }
                /* Do not ask for more than OBD_MAX_GRANT - a margin for server
                 * to add extent tax, etc.
                 */
                oa->o_undirty = min(undirty, OBD_MAX_GRANT &
                                    ~(PTLRPC_MAX_BRW_SIZE * 4UL));
        }
        oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant;
        /* o_dropped AKA o_misc is 32 bits, but cl_lost_grant is 64 bits */
        if (cli->cl_lost_grant > INT_MAX) {
                CDEBUG(D_CACHE,
                       "%s: avoided o_dropped overflow: cl_lost_grant %lu\n",
                       cli_name(cli), cli->cl_lost_grant);
                oa->o_dropped = INT_MAX;
        } else {
                oa->o_dropped = cli->cl_lost_grant;
        }
        cli->cl_lost_grant -= oa->o_dropped;
        spin_unlock(&cli->cl_loi_list_lock);
        CDEBUG(D_CACHE,
               "%s: dirty: %llu undirty: %u dropped %u grant: %llu cl_lost_grant %lu\n",
               cli_name(cli), oa->o_dirty,
               oa->o_undirty, oa->o_dropped, oa->o_grant, cli->cl_lost_grant);
}
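
/*
 * Worked example for the o_undirty computation above (illustrative
 * numbers): with 4 KiB pages, cl_max_pages_per_rpc = 256 (1 MiB RPCs)
 * and cl_max_rpcs_in_flight = 8, the client asks to be able to dirty
 * (8 + 1) * 256 pages = 9 MiB (or cl_dirty_max_pages if that is
 * larger), plus one cl_grant_extent_tax unit per prospective extent
 * when the server advertises GRANT_PARAM.
 */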

void osc_update_next_shrink(struct client_obd *cli)
{
        cli->cl_next_shrink_grant = ktime_get_seconds() +
                                    cli->cl_grant_shrink_interval;

        CDEBUG(D_CACHE, "next time %lld to shrink grant\n",
               cli->cl_next_shrink_grant);
}
EXPORT_SYMBOL(osc_update_next_shrink);

static void __osc_update_grant(struct client_obd *cli, u64 grant)
{
        spin_lock(&cli->cl_loi_list_lock);
        cli->cl_avail_grant += grant;
        spin_unlock(&cli->cl_loi_list_lock);
}

static void osc_update_grant(struct client_obd *cli, struct ost_body *body)
{
        if (body->oa.o_valid & OBD_MD_FLGRANT) {
                CDEBUG(D_CACHE, "got %llu extra grant\n", body->oa.o_grant);
                __osc_update_grant(cli, body->oa.o_grant);
        }
}

/**
 * grant thread data for shrinking space.
 */
struct grant_thread_data {
        struct list_head        gtd_clients;
        struct mutex            gtd_mutex;
        unsigned long           gtd_stopped:1;
};
static struct grant_thread_data client_gtd;

static int osc_shrink_grant_interpret(const struct lu_env *env,
                                      struct ptlrpc_request *req,
                                      void *args, int rc)
{
        struct osc_grant_args *aa = args;
        struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
        struct ost_body *body;

        if (rc != 0) {
                __osc_update_grant(cli, aa->aa_oa->o_grant);
                GOTO(out, rc);
        }

        body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
        LASSERT(body);
        osc_update_grant(cli, body);
out:
        OBD_SLAB_FREE_PTR(aa->aa_oa, osc_obdo_kmem);
        aa->aa_oa = NULL;

        return rc;
}

static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa)
{
        spin_lock(&cli->cl_loi_list_lock);
        oa->o_grant = cli->cl_avail_grant / 4;
        cli->cl_avail_grant -= oa->o_grant;
        spin_unlock(&cli->cl_loi_list_lock);
        if (!(oa->o_valid & OBD_MD_FLFLAGS)) {
                oa->o_valid |= OBD_MD_FLFLAGS;
                oa->o_flags = 0;
        }
        oa->o_flags |= OBD_FL_SHRINK_GRANT;
        osc_update_next_shrink(cli);
}

/* Shrink the current grant, either from some large amount to enough for a
 * full set of in-flight RPCs, or if we have already shrunk to that limit
 * then to enough for a single RPC.  This avoids keeping more grant than
 * needed, and avoids shrinking the grant piecemeal.
 */
static int osc_shrink_grant(struct client_obd *cli)
{
        __u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) *
                             (cli->cl_max_pages_per_rpc << PAGE_SHIFT);

        spin_lock(&cli->cl_loi_list_lock);
        if (cli->cl_avail_grant <= target_bytes)
                target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT;
        spin_unlock(&cli->cl_loi_list_lock);

        return osc_shrink_grant_to_target(cli, target_bytes);
}
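
/*
 * Example (illustrative numbers): with 1 MiB RPCs and 8 RPCs in
 * flight, the first shrink of a 64 MiB grant targets
 * (8 + 1) * 1 MiB = 9 MiB; a later shrink while already at or below
 * that level drops the target to a single RPC's worth, 1 MiB.
 */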

int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes)
{
        int                     rc = 0;
        struct ost_body        *body;
        ENTRY;

        spin_lock(&cli->cl_loi_list_lock);
        /* Don't shrink if we are already above or below the desired limit.
         * We don't want to shrink below a single RPC, as that will negatively
         * impact block allocation and long-term performance.
         */
        if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_SHIFT)
                target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT;

        if (target_bytes >= cli->cl_avail_grant) {
                spin_unlock(&cli->cl_loi_list_lock);
                RETURN(0);
        }
        spin_unlock(&cli->cl_loi_list_lock);

        OBD_ALLOC_PTR(body);
        if (!body)
                RETURN(-ENOMEM);

        osc_announce_cached(cli, &body->oa, 0);

        spin_lock(&cli->cl_loi_list_lock);
        if (target_bytes >= cli->cl_avail_grant) {
                /* available grant has changed since target calculation */
                spin_unlock(&cli->cl_loi_list_lock);
                GOTO(out_free, rc = 0);
        }
        body->oa.o_grant = cli->cl_avail_grant - target_bytes;
        cli->cl_avail_grant = target_bytes;
        spin_unlock(&cli->cl_loi_list_lock);
        if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) {
                body->oa.o_valid |= OBD_MD_FLFLAGS;
                body->oa.o_flags = 0;
        }
        body->oa.o_flags |= OBD_FL_SHRINK_GRANT;
        osc_update_next_shrink(cli);

        rc = osc_set_info_async(NULL, cli->cl_import->imp_obd->obd_self_export,
                                sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK,
                                sizeof(*body), body, NULL);
        if (rc != 0)
                __osc_update_grant(cli, body->oa.o_grant);
out_free:
        OBD_FREE_PTR(body);
        RETURN(rc);
}

static int osc_should_shrink_grant(struct client_obd *client)
{
        time64_t next_shrink = client->cl_next_shrink_grant;

        if (client->cl_import == NULL)
                return 0;

        if (!OCD_HAS_FLAG(&client->cl_import->imp_connect_data, GRANT_SHRINK) ||
            client->cl_import->imp_grant_shrink_disabled) {
                osc_update_next_shrink(client);
                return 0;
        }

        if (ktime_get_seconds() >= next_shrink - 5) {
                /* Get the current RPC size directly, instead of going via:
                 * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export)
                 * Keep comment here so that it can be found by searching.
                 */
                int brw_size = client->cl_max_pages_per_rpc << PAGE_SHIFT;

                if (client->cl_import->imp_state == LUSTRE_IMP_FULL &&
                    client->cl_avail_grant > brw_size)
                        return 1;
                else
                        osc_update_next_shrink(client);
        }
        return 0;
}

#define GRANT_SHRINK_RPC_BATCH  100

static struct delayed_work work;

static void osc_grant_work_handler(struct work_struct *data)
{
        struct client_obd *cli;
        int rpc_sent;
        bool init_next_shrink = true;
        time64_t next_shrink = ktime_get_seconds() + GRANT_SHRINK_INTERVAL;

        rpc_sent = 0;
        mutex_lock(&client_gtd.gtd_mutex);
        list_for_each_entry(cli, &client_gtd.gtd_clients, cl_grant_chain) {
                if (rpc_sent < GRANT_SHRINK_RPC_BATCH &&
                    osc_should_shrink_grant(cli)) {
                        osc_shrink_grant(cli);
                        rpc_sent++;
                }

                if (!init_next_shrink) {
                        if (cli->cl_next_shrink_grant < next_shrink &&
                            cli->cl_next_shrink_grant > ktime_get_seconds())
                                next_shrink = cli->cl_next_shrink_grant;
                } else {
                        init_next_shrink = false;
                        next_shrink = cli->cl_next_shrink_grant;
                }
        }
        mutex_unlock(&client_gtd.gtd_mutex);

        if (client_gtd.gtd_stopped == 1)
                return;

        if (next_shrink > ktime_get_seconds()) {
                time64_t delay = next_shrink - ktime_get_seconds();

                schedule_delayed_work(&work, cfs_time_seconds(delay));
        } else {
                schedule_work(&work.work);
        }
}

void osc_schedule_grant_work(void)
{
        cancel_delayed_work_sync(&work);
        schedule_work(&work.work);
}
EXPORT_SYMBOL(osc_schedule_grant_work);

/**
 * Start grant work for returning grant to the server for idle clients.
 */
static int osc_start_grant_work(void)
{
        client_gtd.gtd_stopped = 0;
        mutex_init(&client_gtd.gtd_mutex);
        INIT_LIST_HEAD(&client_gtd.gtd_clients);

        INIT_DELAYED_WORK(&work, osc_grant_work_handler);
        schedule_work(&work.work);

        return 0;
}

static void osc_stop_grant_work(void)
{
        client_gtd.gtd_stopped = 1;
        cancel_delayed_work_sync(&work);
}

static void osc_add_grant_list(struct client_obd *client)
{
        mutex_lock(&client_gtd.gtd_mutex);
        list_add(&client->cl_grant_chain, &client_gtd.gtd_clients);
        mutex_unlock(&client_gtd.gtd_mutex);
}

static void osc_del_grant_list(struct client_obd *client)
{
        if (list_empty(&client->cl_grant_chain))
                return;

        mutex_lock(&client_gtd.gtd_mutex);
        list_del_init(&client->cl_grant_chain);
        mutex_unlock(&client_gtd.gtd_mutex);
}

void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd)
{
        /*
         * ocd_grant is the total grant amount we expect to hold: if we have
         * been evicted, it's the new avail_grant amount, and cl_dirty_pages
         * will drop to 0 as in-flight RPCs fail out; otherwise, it's
         * avail_grant + dirty.
         *
         * The race is tolerable here: if we're evicted, but imp_state already
         * left EVICTED state, then cl_dirty_pages must be 0 already.
         */
        spin_lock(&cli->cl_loi_list_lock);
        cli->cl_avail_grant = ocd->ocd_grant;
        if (cli->cl_import->imp_state != LUSTRE_IMP_EVICTED) {
                unsigned long consumed = cli->cl_reserved_grant;

                if (OCD_HAS_FLAG(ocd, GRANT_PARAM))
                        consumed += cli->cl_dirty_grant;
                else
                        consumed += cli->cl_dirty_pages << PAGE_SHIFT;
                if (cli->cl_avail_grant < consumed) {
                        CERROR("%s: granted %ld but already consumed %ld\n",
                               cli_name(cli), cli->cl_avail_grant, consumed);
                        cli->cl_avail_grant = 0;
                } else {
                        cli->cl_avail_grant -= consumed;
                }
        }

        if (OCD_HAS_FLAG(ocd, GRANT_PARAM)) {
                u64 size;
                int chunk_mask;

                /* overhead for each extent insertion */
                cli->cl_grant_extent_tax = ocd->ocd_grant_tax_kb << 10;
                /* determine the appropriate chunk size used by osc_extent. */
                cli->cl_chunkbits = max_t(int, PAGE_SHIFT,
                                          ocd->ocd_grant_blkbits);
                /* max_pages_per_rpc must be chunk aligned */
                chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_SHIFT)) - 1);
                cli->cl_max_pages_per_rpc = (cli->cl_max_pages_per_rpc +
                                             ~chunk_mask) & chunk_mask;
                /* determine maximum extent size, in #pages */
                size = (u64)ocd->ocd_grant_max_blks << ocd->ocd_grant_blkbits;
                cli->cl_max_extent_pages = (size >> PAGE_SHIFT) ?: 1;
                cli->cl_ocd_grant_param = 1;
        } else {
                cli->cl_ocd_grant_param = 0;
                cli->cl_grant_extent_tax = 0;
                cli->cl_chunkbits = PAGE_SHIFT;
                cli->cl_max_extent_pages = DT_MAX_BRW_PAGES;
        }
        spin_unlock(&cli->cl_loi_list_lock);

        CDEBUG(D_CACHE,
               "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld. chunk bits: %d cl_max_extent_pages: %d\n",
               cli_name(cli),
               cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits,
               cli->cl_max_extent_pages);

        if (OCD_HAS_FLAG(ocd, GRANT_SHRINK) && list_empty(&cli->cl_grant_chain))
                osc_add_grant_list(cli);
}
EXPORT_SYMBOL(osc_init_grant);
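
/*
 * Example of the GRANT_PARAM chunk rounding above (illustrative
 * numbers): with 4 KiB pages and ocd_grant_blkbits = 16 (64 KiB server
 * blocks), cl_chunkbits becomes 16, a chunk is 16 pages, and
 * cl_max_pages_per_rpc is rounded up to the next multiple of 16 pages.
 */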

/* We assume that the reason this OSC got a short read is that it read
 * beyond the end of a stripe file; i.e. Lustre is reading a sparse file
 * via the LOV, and it _knows_ it's reading inside the file, it's just that
 * this stripe never got written at or beyond this stripe offset yet.
 */
static void handle_short_read(int nob_read, size_t page_count,
                              struct brw_page **pga)
{
        char *ptr;
        int i = 0;

        /* skip bytes read OK */
        while (nob_read > 0) {
                LASSERT(page_count > 0);

                if (pga[i]->count > nob_read) {
                        /* EOF inside this page */
                        ptr = kmap(pga[i]->pg) +
                                (pga[i]->off & ~PAGE_MASK);
                        memset(ptr + nob_read, 0, pga[i]->count - nob_read);
                        kunmap(pga[i]->pg);
                        page_count--;
                        i++;
                        break;
                }

                nob_read -= pga[i]->count;
                page_count--;
                i++;
        }

        /* zero remaining pages */
        while (page_count-- > 0) {
                ptr = kmap(pga[i]->pg) + (pga[i]->off & ~PAGE_MASK);
                memset(ptr, 0, pga[i]->count);
                kunmap(pga[i]->pg);
                i++;
        }
}

static int check_write_rcs(struct ptlrpc_request *req,
                           int requested_nob, int niocount,
                           size_t page_count, struct brw_page **pga)
{
        int     i;
        __u32   *remote_rcs;

        remote_rcs = req_capsule_server_sized_get(&req->rq_pill, &RMF_RCS,
                                                  sizeof(*remote_rcs) *
                                                  niocount);
        if (remote_rcs == NULL) {
                CDEBUG(D_INFO, "Missing/short RC vector on BRW_WRITE reply\n");
                return -EPROTO;
        }

        /* return error if any niobuf was in error */
        for (i = 0; i < niocount; i++) {
                if ((int)remote_rcs[i] < 0) {
                        CDEBUG(D_INFO, "rc[%d]: %d req %p\n",
                               i, remote_rcs[i], req);
                        return remote_rcs[i];
                }

                if (remote_rcs[i] != 0) {
                        CDEBUG(D_INFO, "rc[%d] invalid (%d) req %p\n",
                               i, remote_rcs[i], req);
                        return -EPROTO;
                }
        }
        if (req->rq_bulk != NULL &&
            req->rq_bulk->bd_nob_transferred != requested_nob) {
                CERROR("Unexpected # bytes transferred: %d (requested %d)\n",
                       req->rq_bulk->bd_nob_transferred, requested_nob);
                return -EPROTO;
        }

        return 0;
}

static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2)
{
        if (p1->flag != p2->flag) {
                unsigned int mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE |
                                      OBD_BRW_SYNC | OBD_BRW_ASYNC |
                                      OBD_BRW_NOQUOTA | OBD_BRW_SOFT_SYNC |
                                      OBD_BRW_SYS_RESOURCE);

                /* warn if we try to combine flags that we don't know to be
                 * safe to combine
                 */
                if (unlikely((p1->flag & mask) != (p2->flag & mask))) {
                        CWARN("Saw flags 0x%x and 0x%x in the same brw, please report this at https://jira.whamcloud.com/\n",
                              p1->flag, p2->flag);
                }
                return 0;
        }

        return (p1->off + p1->count == p2->off);
}
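
/*
 * Example: two 4 KiB pages at object offsets 0 and 4096 with identical
 * flags merge into a single 8 KiB niobuf. Pages that differ only in
 * the masked "ephemeral" flags (grant, cache, sync, quota...) are kept
 * in separate niobufs but do not trigger the warning above.
 */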

#if IS_ENABLED(CONFIG_CRC_T10DIF)
static int osc_checksum_bulk_t10pi(const char *obd_name, int nob,
                                   size_t pg_count, struct brw_page **pga,
                                   int opc, obd_dif_csum_fn *fn,
                                   int sector_size,
                                   u32 *check_sum, bool resend)
{
        struct ahash_request *req;
        /* Use Adler as the default checksum type on top of DIF tags */
        unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP);
        struct page *__page;
        unsigned char *buffer;
        __be16 *guard_start;
        unsigned int bufsize;
        int guard_number;
        int used_number = 0;
        int used;
        u32 cksum;
        int rc = 0;
        int i = 0;

        LASSERT(pg_count > 0);

        __page = alloc_page(GFP_KERNEL);
        if (__page == NULL)
                return -ENOMEM;

        req = cfs_crypto_hash_init(cfs_alg, NULL, 0);
        if (IS_ERR(req)) {
                rc = PTR_ERR(req);
                CERROR("%s: unable to initialize checksum hash %s: rc = %d\n",
                       obd_name, cfs_crypto_hash_name(cfs_alg), rc);
                GOTO(out, rc);
        }

        buffer = kmap(__page);
        guard_start = (__be16 *)buffer;
        guard_number = PAGE_SIZE / sizeof(*guard_start);
        CDEBUG(D_PAGE | (resend ? D_HA : 0),
               "GRD tags per page=%u, resend=%u, bytes=%u, pages=%zu\n",
               guard_number, resend, nob, pg_count);

        while (nob > 0 && pg_count > 0) {
                unsigned int count = pga[i]->count > nob ? nob : pga[i]->count;

                /* corrupt the data before we compute the checksum, to
                 * simulate an OST->client data error
                 */
                if (unlikely(i == 0 && opc == OST_READ &&
                             OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE))) {
                        unsigned char *ptr = kmap(pga[i]->pg);
                        int off = pga[i]->off & ~PAGE_MASK;

                        memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob));
                        kunmap(pga[i]->pg);
                }

                /*
                 * The left guard number should be able to hold checksums of a
                 * whole page
                 */
                rc = obd_page_dif_generate_buffer(obd_name, pga[i]->pg,
                                                  pga[i]->off & ~PAGE_MASK,
                                                  count,
                                                  guard_start + used_number,
                                                  guard_number - used_number,
                                                  &used, sector_size,
                                                  fn);
                if (unlikely(resend))
                        CDEBUG(D_PAGE | D_HA,
                               "pga[%u]: used %u off %llu+%u gen checksum: %*phN\n",
                               i, used, pga[i]->off & ~PAGE_MASK, count,
                               (int)(used * sizeof(*guard_start)),
                               guard_start + used_number);
                if (rc)
                        break;

                used_number += used;
                if (used_number == guard_number) {
                        cfs_crypto_hash_update_page(req, __page, 0,
                                used_number * sizeof(*guard_start));
                        used_number = 0;
                }

                nob -= pga[i]->count;
                pg_count--;
                i++;
        }
        kunmap(__page);
        if (rc)
                GOTO(out, rc);

        if (used_number != 0)
                cfs_crypto_hash_update_page(req, __page, 0,
                        used_number * sizeof(*guard_start));

        bufsize = sizeof(cksum);
        cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize);

        /* For sending we only compute the wrong checksum instead
         * of corrupting the data so it is still correct on a redo
         */
        if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND))
                cksum++;

        *check_sum = cksum;
out:
        __free_page(__page);
        return rc;
}
#else /* !CONFIG_CRC_T10DIF */
#define obd_dif_ip_fn NULL
#define obd_dif_crc_fn NULL
#define osc_checksum_bulk_t10pi(name, nob, pgc, pga, opc, fn, ssize, csum, re) \
        -EOPNOTSUPP
#endif /* CONFIG_CRC_T10DIF */

static int osc_checksum_bulk(int nob, size_t pg_count,
                             struct brw_page **pga, int opc,
                             enum cksum_types cksum_type,
                             u32 *cksum)
{
        int                             i = 0;
        struct ahash_request           *req;
        unsigned int                    bufsize;
        unsigned char                   cfs_alg = cksum_obd2cfs(cksum_type);

        LASSERT(pg_count > 0);

        req = cfs_crypto_hash_init(cfs_alg, NULL, 0);
        if (IS_ERR(req)) {
                CERROR("Unable to initialize checksum hash %s\n",
                       cfs_crypto_hash_name(cfs_alg));
                return PTR_ERR(req);
        }

        while (nob > 0 && pg_count > 0) {
                unsigned int count = pga[i]->count > nob ? nob : pga[i]->count;

                /* corrupt the data before we compute the checksum, to
                 * simulate an OST->client data error
                 */
                if (i == 0 && opc == OST_READ &&
                    OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) {
                        unsigned char *ptr = kmap(pga[i]->pg);
                        int off = pga[i]->off & ~PAGE_MASK;

                        memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob));
                        kunmap(pga[i]->pg);
                }
                cfs_crypto_hash_update_page(req, pga[i]->pg,
                                            pga[i]->off & ~PAGE_MASK,
                                            count);
                LL_CDEBUG_PAGE(D_PAGE, pga[i]->pg, "off %d\n",
                               (int)(pga[i]->off & ~PAGE_MASK));

                nob -= pga[i]->count;
                pg_count--;
                i++;
        }

        bufsize = sizeof(*cksum);
        cfs_crypto_hash_final(req, (unsigned char *)cksum, &bufsize);

        /* For sending we only compute the wrong checksum instead
         * of corrupting the data so it is still correct on a redo
         */
        if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND))
                (*cksum)++;

        return 0;
}

static int osc_checksum_bulk_rw(const char *obd_name,
                                enum cksum_types cksum_type,
                                int nob, size_t pg_count,
                                struct brw_page **pga, int opc,
                                u32 *check_sum, bool resend)
{
        obd_dif_csum_fn *fn = NULL;
        int sector_size = 0;
        int rc;

        ENTRY;
        obd_t10_cksum2dif(cksum_type, &fn, &sector_size);

        if (fn)
                rc = osc_checksum_bulk_t10pi(obd_name, nob, pg_count, pga,
                                             opc, fn, sector_size, check_sum,
                                             resend);
        else
                rc = osc_checksum_bulk(nob, pg_count, pga, opc, cksum_type,
                                       check_sum);

        RETURN(rc);
}
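
/*
 * Dispatch summary: checksum types that map to a T10-PI DIF function
 * (obd_t10_cksum2dif() sets @fn and the sector size for them) take the
 * t10pi path above; all other types (adler, crc32, crc32c) hash the
 * page data directly in osc_checksum_bulk().
 */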

#ifdef CONFIG_LL_ENCRYPTION
/**
 * osc_encrypt_pagecache_blocks() - overlay to llcrypt_encrypt_pagecache_blocks
 * @srcpage:   The locked pagecache page containing the block(s) to encrypt
 * @dstpage:   The page to put the encryption result in
 * @len:       Total size of the block(s) to encrypt.  Must be a nonzero
 *             multiple of the filesystem's block size.
 * @offs:      Byte offset within @srcpage of the first block to encrypt.
 *             Must be a multiple of the filesystem's block size.
 * @gfp_flags: Memory allocation flags
 *
 * This overlay function is necessary to be able to provide our own bounce page.
 */
static struct page *osc_encrypt_pagecache_blocks(struct page *srcpage,
                                                 struct page *dstpage,
                                                 unsigned int len,
                                                 unsigned int offs,
                                                 gfp_t gfp_flags)
{
        const struct inode *inode = srcpage->mapping->host;
        const unsigned int blockbits = inode->i_blkbits;
        const unsigned int blocksize = 1 << blockbits;
        u64 lblk_num = ((u64)srcpage->index << (PAGE_SHIFT - blockbits)) +
                (offs >> blockbits);
        unsigned int i;
        int err;

        if (unlikely(!dstpage))
                return llcrypt_encrypt_pagecache_blocks(srcpage, len, offs,
                                                        gfp_flags);

        if (WARN_ON_ONCE(!PageLocked(srcpage)))
                return ERR_PTR(-EINVAL);

        if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize)))
                return ERR_PTR(-EINVAL);

        /* Set PagePrivate2 for disambiguation in
         * osc_finalize_bounce_page().
         * It means the cipher page was not allocated by llcrypt.
         */
        SetPagePrivate2(dstpage);

        for (i = offs; i < offs + len; i += blocksize, lblk_num++) {
                err = llcrypt_encrypt_block(inode, srcpage, dstpage, blocksize,
                                            i, lblk_num, gfp_flags);
                if (err)
                        return ERR_PTR(err);
        }
        SetPagePrivate(dstpage);
        set_page_private(dstpage, (unsigned long)srcpage);
        return dstpage;
}

/**
 * osc_finalize_bounce_page() - overlay to llcrypt_finalize_bounce_page
 *
 * This overlay function is necessary to handle bounce pages
 * allocated by ourselves.
 */
static inline void osc_finalize_bounce_page(struct page **pagep)
{
        struct page *page = *pagep;

        /* PagePrivate2 was set in osc_encrypt_pagecache_blocks
         * to indicate the cipher page was allocated by ourselves.
         * So we must not free it via llcrypt.
         */
        if (unlikely(!page || !PagePrivate2(page)))
                return llcrypt_finalize_bounce_page(pagep);

        if (llcrypt_is_bounce_page(page)) {
                *pagep = llcrypt_pagecache_page(page);
                ClearPagePrivate2(page);
                set_page_private(page, (unsigned long)NULL);
                ClearPagePrivate(page);
        }
}
#else /* !CONFIG_LL_ENCRYPTION */
#define osc_encrypt_pagecache_blocks(srcpage, dstpage, len, offs, gfp_flags) \
        llcrypt_encrypt_pagecache_blocks(srcpage, len, offs, gfp_flags)
#define osc_finalize_bounce_page(page) llcrypt_finalize_bounce_page(page)
#endif
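
/*
 * Undo the bounce-page substitution performed when the BRW request was
 * prepared: collect the bounce pages flagged with PageChecked, restore
 * the original pagecache pages in @pga, return pool pages in one batch,
 * and roll back the count/offset adjustments recorded in bp_count_diff
 * and bp_off_diff.
 */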
1456
1457 static inline void osc_release_bounce_pages(struct brw_page **pga,
1458                                             u32 page_count)
1459 {
1460 #ifdef HAVE_LUSTRE_CRYPTO
1461         struct page **pa = NULL;
1462         int i, j = 0;
1463
1464         if (!pga[0])
1465                 return;
1466
1467 #ifdef CONFIG_LL_ENCRYPTION
1468         if (PageChecked(pga[0]->pg)) {
1469                 OBD_ALLOC_PTR_ARRAY_LARGE(pa, page_count);
1470                 if (!pa)
1471                         return;
1472         }
1473 #endif
1474
1475         for (i = 0; i < page_count; i++) {
1476                 /* Bounce pages used by osc_encrypt_pagecache_blocks()
1477                  * called from osc_brw_prep_request()
1478                  * are identified by the PageChecked flag.
1479                  */
1480                 if (PageChecked(pga[i]->pg)) {
1481                         if (pa)
1482                                 pa[j++] = pga[i]->pg;
1483                         osc_finalize_bounce_page(&pga[i]->pg);
1484                 }
1485                 pga[i]->count -= pga[i]->bp_count_diff;
1486                 pga[i]->off += pga[i]->bp_off_diff;
1487         }
1488
1489         if (pa) {
1490                 sptlrpc_enc_pool_put_pages_array(pa, j);
1491                 OBD_FREE_PTR_ARRAY_LARGE(pa, page_count);
1492         }
1493 #endif
1494 }
1495
1496 static int
1497 osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa,
1498                      u32 page_count, struct brw_page **pga,
1499                      struct ptlrpc_request **reqp, int resend)
1500 {
1501         struct ptlrpc_request *req;
1502         struct ptlrpc_bulk_desc *desc;
1503         struct ost_body *body;
1504         struct obd_ioobj *ioobj;
1505         struct niobuf_remote *niobuf;
1506         int niocount, i, requested_nob, opc, rc, short_io_size = 0;
1507         struct osc_brw_async_args *aa;
1508         struct req_capsule *pill;
1509         struct brw_page *pg_prev;
1510         void *short_io_buf;
1511         const char *obd_name = cli->cl_import->imp_obd->obd_name;
1512         struct inode *inode = NULL;
1513         bool directio = false;
1514         bool gpu = false;
1515         bool enable_checksum = true;
1516         struct cl_page *clpage;
1517
1518         ENTRY;
1519         if (pga[0]->pg) {
1520                 clpage = oap2cl_page(brw_page2oap(pga[0]));
1521                 inode = clpage->cp_inode;
1522                 if (clpage->cp_type == CPT_TRANSIENT)
1523                         directio = true;
1524         }
1525         if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ))
1526                 RETURN(-ENOMEM); /* Recoverable */
1527         if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2))
1528                 RETURN(-EINVAL); /* Fatal */
1529
1530         if ((cmd & OBD_BRW_WRITE) != 0) {
1531                 opc = OST_WRITE;
1532                 req = ptlrpc_request_alloc_pool(cli->cl_import,
1533                                                 osc_rq_pool,
1534                                                 &RQF_OST_BRW_WRITE);
1535         } else {
1536                 opc = OST_READ;
1537                 req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ);
1538         }
1539         if (req == NULL)
1540                 RETURN(-ENOMEM);
1541
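        /* For a write to an encrypted file with a loaded key, substitute an
         * encrypted bounce page for each clear-text page before it goes on
         * the wire.
         */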
1542         if (opc == OST_WRITE && inode && IS_ENCRYPTED(inode) &&
1543             llcrypt_has_encryption_key(inode)) {
1544                 struct page **pa = NULL;
1545
1546 #ifdef CONFIG_LL_ENCRYPTION
1547                 OBD_ALLOC_PTR_ARRAY_LARGE(pa, page_count);
1548                 if (pa == NULL) {
1549                         ptlrpc_request_free(req);
1550                         RETURN(-ENOMEM);
1551                 }
1552
1553                 rc = sptlrpc_enc_pool_get_pages_array(pa, page_count);
1554                 if (rc) {
1555                         CDEBUG(D_SEC, "failed to allocate from enc pool: %d\n",
1556                                rc);
1557                         ptlrpc_request_free(req);
1558                         RETURN(rc);
1559                 }
1560 #endif
1561
1562                 for (i = 0; i < page_count; i++) {
1563                         struct brw_page *brwpg = pga[i];
1564                         struct page *data_page = NULL;
1565                         bool retried = false;
1566                         bool lockedbymyself;
1567                         u32 nunits = (brwpg->off & ~PAGE_MASK) + brwpg->count;
1568                         struct address_space *map_orig = NULL;
1569                         pgoff_t index_orig;
1570
1571 retry_encrypt:
1572                         nunits = round_up(nunits, LUSTRE_ENCRYPTION_UNIT_SIZE);
1573                         /* The page can already be locked when we arrive here.
1574                          * This is possible when cl_page_assume/vvp_page_assume
1575                          * is stuck on wait_on_page_writeback with the page
1576                          * lock held. In this case there is no risk of the lock
1577                          * being released while we are doing our encryption
1578                          * processing, because writeback against that page only
1579                          * ends in vvp_page_completion_write/cl_page_completion
1580                          * once the page is fully processed.
1581                          */
1582                         lockedbymyself = trylock_page(brwpg->pg);
1583                         if (directio) {
1584                                 map_orig = brwpg->pg->mapping;
1585                                 brwpg->pg->mapping = inode->i_mapping;
1586                                 index_orig = brwpg->pg->index;
1587                                 clpage = oap2cl_page(brw_page2oap(brwpg));
1588                                 brwpg->pg->index = clpage->cp_page_index;
1589                         }
1590                         data_page =
1591                                 osc_encrypt_pagecache_blocks(brwpg->pg,
1592                                                             pa ? pa[i] : NULL,
1593                                                             nunits, 0,
1594                                                             GFP_NOFS);
1595                         if (directio) {
1596                                 brwpg->pg->mapping = map_orig;
1597                                 brwpg->pg->index = index_orig;
1598                         }
1599                         if (lockedbymyself)
1600                                 unlock_page(brwpg->pg);
1601                         if (IS_ERR(data_page)) {
1602                                 rc = PTR_ERR(data_page);
1603                                 if (rc == -ENOMEM && !retried) {
1604                                         retried = true;
1605                                         rc = 0;
1606                                         goto retry_encrypt;
1607                                 }
1608                                 if (pa) {
1609                                         sptlrpc_enc_pool_put_pages_array(pa + i,
1610                                                                 page_count - i);
1611                                         OBD_FREE_PTR_ARRAY_LARGE(pa,
1612                                                                  page_count);
1613                                 }
1614                                 ptlrpc_request_free(req);
1615                                 RETURN(rc);
1616                         }
1617                         /* Set PageChecked flag on bounce page for
1618                          * disambiguation in osc_release_bounce_pages().
1619                          */
1620                         SetPageChecked(data_page);
1621                         brwpg->pg = data_page;
1622                         /* there should be no gap in the middle of the page array */
1623                         if (i == page_count - 1) {
1624                                 struct osc_async_page *oap =
1625                                         brw_page2oap(brwpg);
1626
1627                                 oa->o_size = oap->oap_count +
1628                                         oap->oap_obj_off + oap->oap_page_off;
1629                         }
1630                         /* len is forced to nunits and the relative offset
1631                          * to 0, so store the old, clear-text values
1632                          */
1633                         brwpg->bp_count_diff = nunits - brwpg->count;
1634                         brwpg->count = nunits;
1635                         brwpg->bp_off_diff = brwpg->off & ~PAGE_MASK;
1636                         brwpg->off = brwpg->off & PAGE_MASK;
1637                 }
1638
1639                 if (pa)
1640                         OBD_FREE_PTR_ARRAY_LARGE(pa, page_count);
1641         } else if (opc == OST_WRITE && inode && IS_ENCRYPTED(inode)) {
1642                 struct osc_async_page *oap = brw_page2oap(pga[0]);
1643                 struct cl_page *clpage = oap2cl_page(oap);
1644                 struct cl_object *clobj = clpage->cp_obj;
1645                 struct cl_attr attr = { 0 };
1646                 struct lu_env *env;
1647                 __u16 refcheck;
1648
1649                 env = cl_env_get(&refcheck);
1650                 if (IS_ERR(env)) {
1651                         rc = PTR_ERR(env);
1652                         ptlrpc_request_free(req);
1653                         RETURN(rc);
1654                 }
1655
1656                 cl_object_attr_lock(clobj);
1657                 rc = cl_object_attr_get(env, clobj, &attr);
1658                 cl_object_attr_unlock(clobj);
1659                 cl_env_put(env, &refcheck);
1660                 if (rc != 0) {
1661                         ptlrpc_request_free(req);
1662                         RETURN(rc);
1663                 }
1664                 if (attr.cat_size)
1665                         oa->o_size = attr.cat_size;
1666         } else if (opc == OST_READ && inode && IS_ENCRYPTED(inode) &&
1667                    llcrypt_has_encryption_key(inode)) {
1668                 for (i = 0; i < page_count; i++) {
1669                         struct brw_page *pg = pga[i];
1670                         u32 nunits = (pg->off & ~PAGE_MASK) + pg->count;
1671
1672                         nunits = round_up(nunits, LUSTRE_ENCRYPTION_UNIT_SIZE);
1673                         /* count/off are forced to cover the whole encryption
1674                          * unit size so that all encrypted data is stored on
1675                          * the OST; adjust bp_{count,off}_diff to remember the
1676                          * size of the clear text.
1677                          */
1678                         pg->bp_count_diff = nunits - pg->count;
1679                         pg->count = nunits;
1680                         pg->bp_off_diff = pg->off & ~PAGE_MASK;
1681                         pg->off = pg->off & PAGE_MASK;
1682                 }
1683         }
1684
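        /* one remote niobuf is needed per run of pages that
         * can_merge_pages() allows to coalesce; only a discontiguity starts
         * a new niobuf */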
1685         for (niocount = i = 1; i < page_count; i++) {
1686                 if (!can_merge_pages(pga[i - 1], pga[i]))
1687                         niocount++;
1688         }
1689
1690         pill = &req->rq_pill;
1691         req_capsule_set_size(pill, &RMF_OBD_IOOBJ, RCL_CLIENT,
1692                              sizeof(*ioobj));
1693         req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT,
1694                              niocount * sizeof(*niobuf));
1695
1696         for (i = 0; i < page_count; i++) {
1697                 short_io_size += pga[i]->count;
1698                 if (!inode || !IS_ENCRYPTED(inode) ||
1699                     !llcrypt_has_encryption_key(inode)) {
1700                         pga[i]->bp_count_diff = 0;
1701                         pga[i]->bp_off_diff = 0;
1702                 }
1703         }
1704
1705         if (brw_page2oap(pga[0])->oap_brw_flags & OBD_BRW_RDMA_ONLY) {
1706                 enable_checksum = false;
1707                 short_io_size = 0;
1708                 gpu = true;
1709         }
1710
1711         /* Check if read/write is small enough to be a short io. */
1712         if (short_io_size > cli->cl_max_short_io_bytes || niocount > 1 ||
1713             !imp_connect_shortio(cli->cl_import))
1714                 short_io_size = 0;
1715
1716         /* If this is an empty RPC to an old server, just ignore it */
1717         if (!short_io_size && !pga[0]->pg) {
1718                 ptlrpc_request_free(req);
1719                 RETURN(-ENODATA);
1720         }
1721
1722         req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_CLIENT,
1723                              opc == OST_READ ? 0 : short_io_size);
1724         if (opc == OST_READ)
1725                 req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_SERVER,
1726                                      short_io_size);
1727
1728         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc);
1729         if (rc) {
1730                 ptlrpc_request_free(req);
1731                 RETURN(rc);
1732         }
1733         osc_set_io_portal(req);
1734
1735         ptlrpc_at_set_req_timeout(req);
1736         /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own
1737          * retry logic */
1738         req->rq_no_retry_einprogress = 1;
1739
1740         if (short_io_size != 0) {
1741                 desc = NULL;
1742                 short_io_buf = NULL;
1743                 goto no_bulk;
1744         }
1745
1746         desc = ptlrpc_prep_bulk_imp(req, page_count,
1747                 cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS,
1748                 (opc == OST_WRITE ? PTLRPC_BULK_GET_SOURCE :
1749                         PTLRPC_BULK_PUT_SINK),
1750                 OST_BULK_PORTAL,
1751                 &ptlrpc_bulk_kiov_pin_ops);
1752
1753         if (desc == NULL)
1754                 GOTO(out, rc = -ENOMEM);
1755         /* NB: desc is now owned by the request and freed along with it */
1756         desc->bd_is_rdma = gpu;
1757 no_bulk:
1758         body = req_capsule_client_get(pill, &RMF_OST_BODY);
1759         ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
1760         niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
1761         LASSERT(body != NULL && ioobj != NULL && niobuf != NULL);
1762
1763         lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
1764
1765         /* For READ and WRITE, we can't fill o_uid and o_gid using from_kuid()
1766          * and from_kgid(), because they are asynchronous. Fortunately, the
1767          * variable oa contains valid o_uid and o_gid in these two operations.
1768          * Besides, filling o_uid and o_gid is enough for nrs-tbf, see LU-9658.
1769          * OBD_MD_FLUID and OBD_MD_FLGID are not set in order to avoid breaking
1770          * other process logic */
1771         body->oa.o_uid = oa->o_uid;
1772         body->oa.o_gid = oa->o_gid;
1773
1774         obdo_to_ioobj(oa, ioobj);
1775         ioobj->ioo_bufcnt = niocount;
1776         /* The high bits of ioo_max_brw tell the server the _maximum_ number
1777          * of bulks that might be sent for this request.  The actual number is
1778          * decided when the RPC is finally sent in ptlrpc_register_bulk(). It
1779          * sends "max - 1" for compatibility with old clients sending "0", and
1780          * also so that the actual maximum is a power-of-two, not one less. LU-1431 */
1781         if (desc != NULL)
1782                 ioobj_max_brw_set(ioobj, desc->bd_md_max_brw);
1783         else /* short io */
1784                 ioobj_max_brw_set(ioobj, 0);
1785
1786         if (inode && IS_ENCRYPTED(inode) &&
1787             llcrypt_has_encryption_key(inode) &&
1788             !OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NO_ENCFLAG)) {
1789                 if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
1790                         body->oa.o_valid |= OBD_MD_FLFLAGS;
1791                         body->oa.o_flags = 0;
1792                 }
1793                 body->oa.o_flags |= LUSTRE_ENCRYPT_FL;
1794         }
1795
1796         if (short_io_size != 0) {
1797                 if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
1798                         body->oa.o_valid |= OBD_MD_FLFLAGS;
1799                         body->oa.o_flags = 0;
1800                 }
1801                 body->oa.o_flags |= OBD_FL_SHORT_IO;
1802                 CDEBUG(D_CACHE, "Using short io for data transfer, size = %d\n",
1803                        short_io_size);
1804                 if (opc == OST_WRITE) {
1805                         short_io_buf = req_capsule_client_get(pill,
1806                                                               &RMF_SHORT_IO);
1807                         LASSERT(short_io_buf != NULL);
1808                 }
1809         }
1810
1811         LASSERT(page_count > 0);
1812         pg_prev = pga[0];
1813         for (requested_nob = i = 0; i < page_count; i++, niobuf++) {
1814                 struct brw_page *pg = pga[i];
1815                 int poff = pg->off & ~PAGE_MASK;
1816
1817                 LASSERT(pg->count > 0);
1818                 /* make sure there is no gap in the middle of the page array */
1819                 LASSERTF(page_count == 1 ||
1820                          (ergo(i == 0, poff + pg->count == PAGE_SIZE) &&
1821                           ergo(i > 0 && i < page_count - 1,
1822                                poff == 0 && pg->count == PAGE_SIZE)   &&
1823                           ergo(i == page_count - 1, poff == 0)),
1824                          "i: %d/%d pg: %p off: %llu, count: %u\n",
1825                          i, page_count, pg, pg->off, pg->count);
1826                 LASSERTF(i == 0 || pg->off > pg_prev->off,
1827                          "i %d p_c %u pg %p [pri %lu ind %lu] off %llu"
1828                          " prev_pg %p [pri %lu ind %lu] off %llu\n",
1829                          i, page_count,
1830                          pg->pg, page_private(pg->pg), pg->pg->index, pg->off,
1831                          pg_prev->pg, page_private(pg_prev->pg),
1832                          pg_prev->pg->index, pg_prev->off);
1833                 LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) ==
1834                         (pg->flag & OBD_BRW_SRVLOCK));
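                /* for short io the page contents are copied inline into the
                 * request buffer instead of being described in the bulk
                 * descriptor */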
1835                 if (short_io_size != 0 && opc == OST_WRITE) {
1836                         unsigned char *ptr = kmap_atomic(pg->pg);
1837
1838                         LASSERT(short_io_size >= requested_nob + pg->count);
1839                         memcpy(short_io_buf + requested_nob,
1840                                ptr + poff,
1841                                pg->count);
1842                         kunmap_atomic(ptr);
1843                 } else if (short_io_size == 0) {
1844                         desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff,
1845                                                          pg->count);
1846                 }
1847                 requested_nob += pg->count;
1848
1849                 if (i > 0 && can_merge_pages(pg_prev, pg)) {
1850                         niobuf--;
1851                         niobuf->rnb_len += pg->count;
1852                 } else {
1853                         niobuf->rnb_offset = pg->off;
1854                         niobuf->rnb_len    = pg->count;
1855                         niobuf->rnb_flags  = pg->flag;
1856                 }
1857                 pg_prev = pg;
1858         }
1859
1860         LASSERTF((void *)(niobuf - niocount) ==
1861                 req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE),
1862                 "want %p - real %p\n", req_capsule_client_get(&req->rq_pill,
1863                 &RMF_NIOBUF_REMOTE), (void *)(niobuf - niocount));
1864
1865         osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? requested_nob:0);
1866         if (resend) {
1867                 if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
1868                         body->oa.o_valid |= OBD_MD_FLFLAGS;
1869                         body->oa.o_flags = 0;
1870                 }
1871                 body->oa.o_flags |= OBD_FL_RECOV_RESEND;
1872         }
1873
1874         if (osc_should_shrink_grant(cli))
1875                 osc_shrink_grant_local(cli, &body->oa);
1876
1877         if (!cli->cl_checksum || sptlrpc_flavor_has_bulk(&req->rq_flvr))
1878                 enable_checksum = false;
1879
1880         /* size[REQ_REC_OFF] still sizeof (*body) */
1881         if (opc == OST_WRITE) {
1882                 if (enable_checksum) {
1883                         /* store cl_cksum_type in a local variable since
1884                          * it can be changed via lprocfs */
1885                         enum cksum_types cksum_type = cli->cl_cksum_type;
1886
1887                         if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0)
1888                                 body->oa.o_flags = 0;
1889
1890                         body->oa.o_flags |= obd_cksum_type_pack(obd_name,
1891                                                                 cksum_type);
1892                         body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
1893
1894                         rc = osc_checksum_bulk_rw(obd_name, cksum_type,
1895                                                   requested_nob, page_count,
1896                                                   pga, OST_WRITE,
1897                                                   &body->oa.o_cksum, resend);
1898                         if (rc < 0) {
1899                                 CDEBUG(D_PAGE, "failed to checksum: rc = %d\n",
1900                                        rc);
1901                                 GOTO(out, rc);
1902                         }
1903                         CDEBUG(D_PAGE | (resend ? D_HA : 0),
1904                                "checksum at write origin: %x (%x)\n",
1905                                body->oa.o_cksum, cksum_type);
1906
1907                         /* save this in 'oa', too, for later checking */
1908                         oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
1909                         oa->o_flags |= obd_cksum_type_pack(obd_name,
1910                                                            cksum_type);
1911                 } else {
1912                         /* clear out the checksum flag, in case this is a
1913                          * resend but cl_checksum is no longer set. b=11238 */
1914                         oa->o_valid &= ~OBD_MD_FLCKSUM;
1915                 }
1916                 oa->o_cksum = body->oa.o_cksum;
1917                 /* 1 RC per niobuf */
1918                 req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER,
1919                                      sizeof(__u32) * niocount);
1920         } else {
1921                 if (enable_checksum) {
1922                         if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0)
1923                                 body->oa.o_flags = 0;
1924                         body->oa.o_flags |= obd_cksum_type_pack(obd_name,
1925                                 cli->cl_cksum_type);
1926                         body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
1927                 }
1928
1929                 /* The client cksum has already been copied to the wire obdo by
1930                  * the earlier lustre_set_wire_obdo(), so if a bulk read is
1931                  * resent due to a cksum error, this allows the server to
1932                  * check+dump the pages on its side */
1933         }
1934         ptlrpc_request_set_replen(req);
1935
1936         aa = ptlrpc_req_async_args(aa, req);
1937         aa->aa_oa = oa;
1938         aa->aa_requested_nob = requested_nob;
1939         aa->aa_nio_count = niocount;
1940         aa->aa_page_count = page_count;
1941         aa->aa_resends = 0;
1942         aa->aa_ppga = pga;
1943         aa->aa_cli = cli;
1944         INIT_LIST_HEAD(&aa->aa_oaps);
1945
1946         *reqp = req;
1947         niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
1948         CDEBUG(D_RPCTRACE, "brw rpc %p - object "DOSTID" offset %lld<>%lld\n",
1949                 req, POSTID(&oa->o_oi), niobuf[0].rnb_offset,
1950                 niobuf[niocount - 1].rnb_offset + niobuf[niocount - 1].rnb_len);
1951         RETURN(0);
1952
1953  out:
1954         ptlrpc_req_finished(req);
1955         RETURN(rc);
1956 }
1957
1958 char dbgcksum_file_name[PATH_MAX];
1959
1960 static void dump_all_bulk_pages(struct obdo *oa, __u32 page_count,
1961                                 struct brw_page **pga, __u32 server_cksum,
1962                                 __u32 client_cksum)
1963 {
1964         struct file *filp;
1965         int rc, i;
1966         unsigned int len;
1967         char *buf;
1968
1969         /* only keep a dump of the pages on the first error for the same
1970          * range in the file/fid, not during resends/retries. */
1971         snprintf(dbgcksum_file_name, sizeof(dbgcksum_file_name),
1972                  "%s-checksum_dump-osc-"DFID":[%llu-%llu]-%x-%x",
1973                  (strncmp(libcfs_debug_file_path, "NONE", 4) != 0 ?
1974                   libcfs_debug_file_path : LIBCFS_DEBUG_FILE_PATH_DEFAULT),
1975                  oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : 0ULL,
1976                  oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0,
1977                  oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0,
1978                  pga[0]->off,
1979                  pga[page_count-1]->off + pga[page_count-1]->count - 1,
1980                  client_cksum, server_cksum);
1981         CWARN("dumping checksum data to %s\n", dbgcksum_file_name);
1982         filp = filp_open(dbgcksum_file_name,
1983                          O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, 0600);
1984         if (IS_ERR(filp)) {
1985                 rc = PTR_ERR(filp);
1986                 if (rc == -EEXIST)
1987                         CDEBUG(D_INFO, "%s: can't open to dump pages with "
1988                                "checksum error: rc = %d\n", dbgcksum_file_name,
1989                                rc);
1990                 else
1991                         CERROR("%s: can't open to dump pages with checksum "
1992                                "error: rc = %d\n", dbgcksum_file_name, rc);
1993                 return;
1994         }
1995
1996         for (i = 0; i < page_count; i++) {
1997                 len = pga[i]->count;
1998                 buf = kmap(pga[i]->pg);
1999                 while (len != 0) {
2000                         rc = cfs_kernel_write(filp, buf, len, &filp->f_pos);
2001                         if (rc < 0) {
2002                                 CERROR("%s: wanted to write %u bytes but got error %d\n",
2003                                        dbgcksum_file_name, len, rc);
2004                                 break;
2005                         }
2006                         len -= rc;
2007                         buf += rc;
2008                 }
2009                 kunmap(pga[i]->pg);
2010         }
2011
2012         rc = vfs_fsync_range(filp, 0, LLONG_MAX, 1);
2013         if (rc)
2014                 CERROR("%s: sync returns %d\n", dbgcksum_file_name, rc);
2015         filp_close(filp, NULL);
2016
2017         libcfs_debug_dumplog();
2018 }
2019
2020 static int
2021 check_write_checksum(struct obdo *oa, const struct lnet_processid *peer,
2022                      __u32 client_cksum, __u32 server_cksum,
2023                      struct osc_brw_async_args *aa)
2024 {
2025         const char *obd_name = aa->aa_cli->cl_import->imp_obd->obd_name;
2026         enum cksum_types cksum_type;
2027         obd_dif_csum_fn *fn = NULL;
2028         int sector_size = 0;
2029         __u32 new_cksum;
2030         char *msg;
2031         int rc;
2032
2033         if (server_cksum == client_cksum) {
2034                 CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum);
2035                 return 0;
2036         }
2037
2038         if (aa->aa_cli->cl_checksum_dump)
2039                 dump_all_bulk_pages(oa, aa->aa_page_count, aa->aa_ppga,
2040                                     server_cksum, client_cksum);
2041
2042         cksum_type = obd_cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ?
2043                                            oa->o_flags : 0);
2044
2045         switch (cksum_type) {
2046         case OBD_CKSUM_T10IP512:
2047                 fn = obd_dif_ip_fn;
2048                 sector_size = 512;
2049                 break;
2050         case OBD_CKSUM_T10IP4K:
2051                 fn = obd_dif_ip_fn;
2052                 sector_size = 4096;
2053                 break;
2054         case OBD_CKSUM_T10CRC512:
2055                 fn = obd_dif_crc_fn;
2056                 sector_size = 512;
2057                 break;
2058         case OBD_CKSUM_T10CRC4K:
2059                 fn = obd_dif_crc_fn;
2060                 sector_size = 4096;
2061                 break;
2062         default:
2063                 break;
2064         }
2065
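        /* Re-compute the checksum over the local pages to classify the
         * failure: data changed on the client after checksumming, corruption
         * in transit, or a checksum type mismatch with the server.
         */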
2066         if (fn)
2067                 rc = osc_checksum_bulk_t10pi(obd_name, aa->aa_requested_nob,
2068                                              aa->aa_page_count, aa->aa_ppga,
2069                                              OST_WRITE, fn, sector_size,
2070                                              &new_cksum, true);
2071         else
2072                 rc = osc_checksum_bulk(aa->aa_requested_nob, aa->aa_page_count,
2073                                        aa->aa_ppga, OST_WRITE, cksum_type,
2074                                        &new_cksum);
2075
2076         if (rc < 0)
2077                 msg = "failed to calculate the client write checksum";
2078         else if (cksum_type != obd_cksum_type_unpack(aa->aa_oa->o_flags))
2079                 msg = "the server did not use the checksum type specified in "
2080                       "the original request - likely a protocol problem";
2081         else if (new_cksum == server_cksum)
2082                 msg = "changed on the client after we checksummed it - "
2083                       "likely false positive due to mmap IO (bug 11742)";
2084         else if (new_cksum == client_cksum)
2085                 msg = "changed in transit before arrival at OST";
2086         else
2087                 msg = "changed in transit AND doesn't match the original - "
2088                       "likely false positive due to mmap IO (bug 11742)";
2089
2090         LCONSOLE_ERROR_MSG(0x132, "%s: BAD WRITE CHECKSUM: %s: from %s inode "
2091                            DFID " object "DOSTID" extent [%llu-%llu], original "
2092                            "client csum %x (type %x), server csum %x (type %x),"
2093                            " client csum now %x\n",
2094                            obd_name, msg, libcfs_nidstr(&peer->nid),
2095                            oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0,
2096                            oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0,
2097                            oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0,
2098                            POSTID(&oa->o_oi), aa->aa_ppga[0]->off,
2099                            aa->aa_ppga[aa->aa_page_count - 1]->off +
2100                                 aa->aa_ppga[aa->aa_page_count-1]->count - 1,
2101                            client_cksum,
2102                            obd_cksum_type_unpack(aa->aa_oa->o_flags),
2103                            server_cksum, cksum_type, new_cksum);
2104         return 1;
2105 }
2106
2107 /* Note: rc enters this function as the number of bytes transferred */
2108 static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
2109 {
2110         struct osc_brw_async_args *aa = (void *)&req->rq_async_args;
2111         struct client_obd *cli = aa->aa_cli;
2112         const char *obd_name = cli->cl_import->imp_obd->obd_name;
2113         const struct lnet_processid *peer =
2114                 &req->rq_import->imp_connection->c_peer;
2115         struct ost_body *body;
2116         u32 client_cksum = 0;
2117         struct inode *inode = NULL;
2118         unsigned int blockbits = 0, blocksize = 0;
2119         struct cl_page *clpage;
2120
2121         ENTRY;
2122
2123         if (rc < 0 && rc != -EDQUOT) {
2124                 DEBUG_REQ(D_INFO, req, "Failed request: rc = %d", rc);
2125                 RETURN(rc);
2126         }
2127
2128         LASSERTF(req->rq_repmsg != NULL, "rc = %d\n", rc);
2129         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
2130         if (body == NULL) {
2131                 DEBUG_REQ(D_INFO, req, "cannot unpack body");
2132                 RETURN(-EPROTO);
2133         }
2134
2135         /* set/clear over quota flag for a uid/gid/projid */
2136         if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE &&
2137             body->oa.o_valid & (OBD_MD_FLALLQUOTA)) {
2138                 unsigned qid[LL_MAXQUOTAS] = {
2139                                          body->oa.o_uid, body->oa.o_gid,
2140                                          body->oa.o_projid };
2141                 CDEBUG(D_QUOTA,
2142                        "setdq for [%u %u %u] with valid %#llx, flags %x\n",
2143                        body->oa.o_uid, body->oa.o_gid, body->oa.o_projid,
2144                        body->oa.o_valid, body->oa.o_flags);
2145                 osc_quota_setdq(cli, req->rq_xid, qid, body->oa.o_valid,
2146                                 body->oa.o_flags);
2147         }
2148
2149         osc_update_grant(cli, body);
2150
2151         if (rc < 0)
2152                 RETURN(rc);
2153
2154         if (aa->aa_oa->o_valid & OBD_MD_FLCKSUM)
2155                 client_cksum = aa->aa_oa->o_cksum; /* save for later */
2156
2157         if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) {
2158                 if (rc > 0) {
2159                         CERROR("%s: unexpected positive size %d\n",
2160                                obd_name, rc);
2161                         RETURN(-EPROTO);
2162                 }
2163
2164                 if (req->rq_bulk != NULL &&
2165                     sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk))
2166                         RETURN(-EAGAIN);
2167
2168                 if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum &&
2169                     check_write_checksum(&body->oa, peer, client_cksum,
2170                                          body->oa.o_cksum, aa))
2171                         RETURN(-EAGAIN);
2172
2173                 rc = check_write_rcs(req, aa->aa_requested_nob,
2174                                      aa->aa_nio_count, aa->aa_page_count,
2175                                      aa->aa_ppga);
2176                 GOTO(out, rc);
2177         }
2178
2179         /* The rest of this function executes only for OST_READs */
2180
2181         if (req->rq_bulk == NULL) {
2182                 rc = req_capsule_get_size(&req->rq_pill, &RMF_SHORT_IO,
2183                                           RCL_SERVER);
2184                 LASSERT(rc == req->rq_status);
2185         } else {
2186                 /* if unwrap_bulk failed, return -EAGAIN to retry */
2187                 rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc);
2188         }
2189         if (rc < 0)
2190                 GOTO(out, rc = -EAGAIN);
2191
2192         if (rc > aa->aa_requested_nob) {
2193                 CERROR("%s: unexpected size %d, requested %d\n", obd_name,
2194                        rc, aa->aa_requested_nob);
2195                 RETURN(-EPROTO);
2196         }
2197
2198         if (req->rq_bulk != NULL && rc != req->rq_bulk->bd_nob_transferred) {
2199                 CERROR("%s: unexpected size %d, transferred %d\n", obd_name,
2200                        rc, req->rq_bulk->bd_nob_transferred);
2201                 RETURN(-EPROTO);
2202         }
2203
2204         if (req->rq_bulk == NULL) {
2205                 /* short io */
2206                 int nob, pg_count, i = 0;
2207                 unsigned char *buf;
2208
2209                 CDEBUG(D_CACHE, "Using short io read, size %d\n", rc);
2210                 pg_count = aa->aa_page_count;
2211                 buf = req_capsule_server_sized_get(&req->rq_pill, &RMF_SHORT_IO,
2212                                                    rc);
2213                 nob = rc;
2214                 while (nob > 0 && pg_count > 0) {
2215                         unsigned char *ptr;
2216                         int count = aa->aa_ppga[i]->count > nob ?
2217                                     nob : aa->aa_ppga[i]->count;
2218
2219                         CDEBUG(D_CACHE, "page %p count %d\n",
2220                                aa->aa_ppga[i]->pg, count);
2221                         ptr = kmap_atomic(aa->aa_ppga[i]->pg);
2222                         memcpy(ptr + (aa->aa_ppga[i]->off & ~PAGE_MASK), buf,
2223                                count);
2224                         kunmap_atomic(ptr);
2225
2226                         buf += count;
2227                         nob -= count;
2228                         i++;
2229                         pg_count--;
2230                 }
2231         }
2232
2233         if (rc < aa->aa_requested_nob)
2234                 handle_short_read(rc, aa->aa_page_count, aa->aa_ppga);
2235
2236         if (body->oa.o_valid & OBD_MD_FLCKSUM) {
2237                 static int cksum_counter;
2238                 u32 server_cksum = body->oa.o_cksum;
2239                 int nob = rc;
2240                 char *via = "";
2241                 char *router = "";
2242                 enum cksum_types cksum_type;
2243                 u32 o_flags = body->oa.o_valid & OBD_MD_FLFLAGS ?
2244                         body->oa.o_flags : 0;
2245
2246                 cksum_type = obd_cksum_type_unpack(o_flags);
2247                 rc = osc_checksum_bulk_rw(obd_name, cksum_type, nob,
2248                                           aa->aa_page_count, aa->aa_ppga,
2249                                           OST_READ, &client_cksum, false);
2250                 if (rc < 0)
2251                         GOTO(out, rc);
2252
2253                 if (req->rq_bulk != NULL &&
2254                     !nid_same(&peer->nid, &req->rq_bulk->bd_sender)) {
2255                         via = " via ";
2256                         router = libcfs_nidstr(&req->rq_bulk->bd_sender);
2257                 }
2258
2259                 if (server_cksum != client_cksum) {
2260                         struct ost_body *clbody;
2261                         __u32 client_cksum2;
2262                         u32 page_count = aa->aa_page_count;
2263
2264                         osc_checksum_bulk_rw(obd_name, cksum_type, nob,
2265                                              page_count, aa->aa_ppga,
2266                                              OST_READ, &client_cksum2, true);
2267                         clbody = req_capsule_client_get(&req->rq_pill,
2268                                                         &RMF_OST_BODY);
2269                         if (cli->cl_checksum_dump)
2270                                 dump_all_bulk_pages(&clbody->oa, page_count,
2271                                                     aa->aa_ppga, server_cksum,
2272                                                     client_cksum);
2273
2274                         LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from "
2275                                            "%s%s%s inode "DFID" object "DOSTID
2276                                            " extent [%llu-%llu], client %x/%x, "
2277                                            "server %x, cksum_type %x\n",
2278                                            obd_name,
2279                                            libcfs_nidstr(&peer->nid),
2280                                            via, router,
2281                                            clbody->oa.o_valid & OBD_MD_FLFID ?
2282                                                 clbody->oa.o_parent_seq : 0ULL,
2283                                            clbody->oa.o_valid & OBD_MD_FLFID ?
2284                                                 clbody->oa.o_parent_oid : 0,
2285                                            clbody->oa.o_valid & OBD_MD_FLFID ?
2286                                                 clbody->oa.o_parent_ver : 0,
2287                                            POSTID(&body->oa.o_oi),
2288                                            aa->aa_ppga[0]->off,
2289                                            aa->aa_ppga[page_count-1]->off +
2290                                            aa->aa_ppga[page_count-1]->count - 1,
2291                                            client_cksum, client_cksum2,
2292                                            server_cksum, cksum_type);
2293                         cksum_counter = 0;
2294                         aa->aa_oa->o_cksum = client_cksum;
2295                         rc = -EAGAIN;
2296                 } else {
2297                         cksum_counter++;
2298                         CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum);
2299                         rc = 0;
2300                 }
2301         } else if (unlikely(client_cksum)) {
2302                 static int cksum_missed;
2303
2304                 cksum_missed++;
2305                 if ((cksum_missed & (-cksum_missed)) == cksum_missed)
2306                         CERROR("%s: checksum %u requested from %s but not sent\n",
2307                                obd_name, cksum_missed,
2308                                libcfs_nidstr(&peer->nid));
2309         } else {
2310                 rc = 0;
2311         }
2312
2313         /* get the inode from the first cl_page */
2314         clpage = oap2cl_page(brw_page2oap(aa->aa_ppga[0]));
2315         inode = clpage->cp_inode;
2316         if (clpage->cp_type == CPT_TRANSIENT && inode) {
2317                 blockbits = inode->i_blkbits;
2318                 blocksize = 1 << blockbits;
2319         }
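        /* for an encrypted read, decrypt the pages in place now that the
         * ciphertext has landed in the destination pages */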
2320         if (inode && IS_ENCRYPTED(inode)) {
2321                 int idx;
2322
2323                 if (!llcrypt_has_encryption_key(inode)) {
2324                         CDEBUG(D_SEC, "no enc key for ino %lu\n", inode->i_ino);
2325                         GOTO(out, rc);
2326                 }
2327                 for (idx = 0; idx < aa->aa_page_count; idx++) {
2328                         struct brw_page *brwpg = aa->aa_ppga[idx];
2329                         unsigned int offs = 0;
2330
2331                         while (offs < PAGE_SIZE) {
2332                                 /* do not decrypt if page is all 0s */
2333                                 if (memchr_inv(page_address(brwpg->pg) + offs,
2334                                       0, LUSTRE_ENCRYPTION_UNIT_SIZE) == NULL) {
2335                                         /* if the page is empty, forward this
2336                                          * info to upper layers
2337                                          * (ll_io_zero_page) by clearing
2338                                          * PagePrivate2 */
2339                                         if (!offs)
2340                                                 ClearPagePrivate2(brwpg->pg);
2341                                         break;
2342                                 }
2343
2344                                 if (blockbits) {
2345                                         /* This is the direct IO case: call
2346                                          * the decrypt function that takes the
2347                                          * inode as input directly. The page
2348                                          * does not need to be locked.
2349                                          */
2350                                         u64 lblk_num;
2351                                         unsigned int i;
2352
2353                                         clpage =
2354                                                oap2cl_page(brw_page2oap(brwpg));
2355                                         lblk_num =
2356                                                 ((u64)(clpage->cp_page_index) <<
2357                                                 (PAGE_SHIFT - blockbits)) +
2358                                                 (offs >> blockbits);
2359                                         for (i = offs;
2360                                              i < offs +
2361                                                     LUSTRE_ENCRYPTION_UNIT_SIZE;
2362                                              i += blocksize, lblk_num++) {
2363                                                 rc =
2364                                                   llcrypt_decrypt_block_inplace(
2365                                                           inode, brwpg->pg,
2366                                                           blocksize, i,
2367                                                           lblk_num);
2368                                                 if (rc)
2369                                                         break;
2370                                         }
2371                                 } else {
2372                                         rc = llcrypt_decrypt_pagecache_blocks(
2373                                                 brwpg->pg,
2374                                                 LUSTRE_ENCRYPTION_UNIT_SIZE,
2375                                                 offs);
2376                                 }
2377                                 if (rc)
2378                                         GOTO(out, rc);
2379
2380                                 offs += LUSTRE_ENCRYPTION_UNIT_SIZE;
2381                         }
2382                 }
2383         }
2384
2385 out:
2386         if (rc >= 0)
2387                 lustre_get_wire_obdo(&req->rq_import->imp_connect_data,
2388                                      aa->aa_oa, &body->oa);
2389
2390         RETURN(rc);
2391 }
2392
2393 static int osc_brw_redo_request(struct ptlrpc_request *request,
2394                                 struct osc_brw_async_args *aa, int rc)
2395 {
2396         struct ptlrpc_request *new_req;
2397         struct osc_brw_async_args *new_aa;
2398         struct osc_async_page *oap;
2399         ENTRY;
2400
2401         /* The message below is checked in replay-ost-single.sh test_8ae */
2402         DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request,
2403                   "redo for recoverable error %d", rc);
2404
2405         rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) ==
2406                                 OST_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ,
2407                                   aa->aa_cli, aa->aa_oa, aa->aa_page_count,
2408                                   aa->aa_ppga, &new_req, 1);
2409         if (rc)
2410                 RETURN(rc);
2411
2412         list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) {
2413                 if (oap->oap_request != NULL) {
2414                         LASSERTF(request == oap->oap_request,
2415                                  "request %p != oap_request %p\n",
2416                                  request, oap->oap_request);
2417                 }
2418         }
2419         /*
2420          * The new request takes over pga and oaps from the old request.
2421          * Note that copying a list_head doesn't work; it needs to be moved.
2422          */
2423         aa->aa_resends++;
2424         new_req->rq_interpret_reply = request->rq_interpret_reply;
2425         new_req->rq_async_args = request->rq_async_args;
2426         new_req->rq_commit_cb = request->rq_commit_cb;
2427         /* cap the resend delay to the current request timeout; this is
2428          * similar to what ptlrpc does (see after_reply()) */
2429         if (aa->aa_resends > new_req->rq_timeout)
2430                 new_req->rq_sent = ktime_get_real_seconds() + new_req->rq_timeout;
2431         else
2432                 new_req->rq_sent = ktime_get_real_seconds() + aa->aa_resends;
2433         new_req->rq_generation_set = 1;
2434         new_req->rq_import_generation = request->rq_import_generation;
2435
2436         new_aa = ptlrpc_req_async_args(new_aa, new_req);
2437
2438         INIT_LIST_HEAD(&new_aa->aa_oaps);
2439         list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps);
2440         INIT_LIST_HEAD(&new_aa->aa_exts);
2441         list_splice_init(&aa->aa_exts, &new_aa->aa_exts);
2442         new_aa->aa_resends = aa->aa_resends;
2443
2444         list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) {
2445                 if (oap->oap_request) {
2446                         ptlrpc_req_finished(oap->oap_request);
2447                         oap->oap_request = ptlrpc_request_addref(new_req);
2448                 }
2449         }
2450
2451         /* XXX: This code will run into problems if we ever support adding
2452          * a series of BRW RPCs into a self-defined ptlrpc_request_set and
2453          * waiting for all of them to finish. We should inherit the request
2454          * set from the old request. */
2455         ptlrpcd_add_req(new_req);
2456
2457         DEBUG_REQ(D_INFO, new_req, "new request");
2458         RETURN(0);
2459 }
2460
2461 /*
2462  * ugh, we want disk allocation on the target to happen in offset order.  we'll
2463  * follow Sedgewick's advice and stick to the dead simple shellsort -- it'll do
2464  * fine for our small page arrays and doesn't require allocation.  it's an
2465  * insertion sort that swaps elements that are strides apart, shrinking the
2466  * stride down until it's 1 and the array is sorted.
2467  */
2468 static void sort_brw_pages(struct brw_page **array, int num)
2469 {
2470         int stride, i, j;
2471         struct brw_page *tmp;
2472
2473         if (num == 1)
2474                 return;
2475         for (stride = 1; stride < num ; stride = (stride * 3) + 1)
2476                 ;
2477
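        /* e.g. for num = 100 the loop above stops at stride = 121, so the
         * passes below use strides 40, 13, 4, 1 (Knuth's 3h+1 sequence) */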
2478         do {
2479                 stride /= 3;
2480                 for (i = stride ; i < num ; i++) {
2481                         tmp = array[i];
2482                         j = i;
2483                         while (j >= stride && array[j - stride]->off > tmp->off) {
2484                                 array[j] = array[j - stride];
2485                                 j -= stride;
2486                         }
2487                         array[j] = tmp;
2488                 }
2489         } while (stride > 1);
2490 }
2491
2492 static void osc_release_ppga(struct brw_page **ppga, size_t count)
2493 {
2494         LASSERT(ppga != NULL);
2495         OBD_FREE_PTR_ARRAY_LARGE(ppga, count);
2496 }
2497
2498 static int brw_interpret(const struct lu_env *env,
2499                          struct ptlrpc_request *req, void *args, int rc)
2500 {
2501         struct osc_brw_async_args *aa = args;
2502         struct osc_extent *ext;
2503         struct osc_extent *tmp;
2504         struct client_obd *cli = aa->aa_cli;
2505         unsigned long transferred = 0;
2506
2507         ENTRY;
2508
2509         rc = osc_brw_fini_request(req, rc);
2510         CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc);
2511
2512         /* restore clear text pages */
2513         osc_release_bounce_pages(aa->aa_ppga, aa->aa_page_count);
2514
2515         /*
2516          * When server returns -EINPROGRESS, client should always retry
2517          * regardless of the number of times the bulk was resent already.
2518          */
2519         if (osc_recoverable_error(rc) && !req->rq_no_delay) {
2520                 if (req->rq_import_generation !=
2521                     req->rq_import->imp_generation) {
2522                         CDEBUG(D_HA, "%s: resend cross eviction for object: "
2523                                ""DOSTID", rc = %d.\n",
2524                                req->rq_import->imp_obd->obd_name,
2525                                POSTID(&aa->aa_oa->o_oi), rc);
2526                 } else if (rc == -EINPROGRESS ||
2527                            client_should_resend(aa->aa_resends, aa->aa_cli)) {
2528                         rc = osc_brw_redo_request(req, aa, rc);
2529                 } else {
2530                         CERROR("%s: too many resent retries for object: "
2531                                "%llu:%llu, rc = %d.\n",
2532                                req->rq_import->imp_obd->obd_name,
2533                                POSTID(&aa->aa_oa->o_oi), rc);
2534                 }
2535
2536                 if (rc == 0)
2537                         RETURN(0);
2538                 else if (rc == -EAGAIN || rc == -EINPROGRESS)
2539                         rc = -EIO;
2540         }
2541
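        /* on success, reflect the size/blocks/time attributes returned by
         * the OST into the cl_object so that later IO sees fresh values */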
2542         if (rc == 0) {
2543                 struct obdo *oa = aa->aa_oa;
2544                 struct cl_attr *attr = &osc_env_info(env)->oti_attr;
2545                 unsigned long valid = 0;
2546                 struct cl_object *obj;
2547                 struct osc_async_page *last;
2548
2549                 last = brw_page2oap(aa->aa_ppga[aa->aa_page_count - 1]);
2550                 obj = osc2cl(last->oap_obj);
2551
2552                 cl_object_attr_lock(obj);
2553                 if (oa->o_valid & OBD_MD_FLBLOCKS) {
2554                         attr->cat_blocks = oa->o_blocks;
2555                         valid |= CAT_BLOCKS;
2556                 }
2557                 if (oa->o_valid & OBD_MD_FLMTIME) {
2558                         attr->cat_mtime = oa->o_mtime;
2559                         valid |= CAT_MTIME;
2560                 }
2561                 if (oa->o_valid & OBD_MD_FLATIME) {
2562                         attr->cat_atime = oa->o_atime;
2563                         valid |= CAT_ATIME;
2564                 }
2565                 if (oa->o_valid & OBD_MD_FLCTIME) {
2566                         attr->cat_ctime = oa->o_ctime;
2567                         valid |= CAT_CTIME;
2568                 }
2569
2570                 if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) {
2571                         struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo;
2572                         loff_t last_off = last->oap_count + last->oap_obj_off +
2573                                 last->oap_page_off;
2574
2575                         /* Change the file size if this is an out-of-quota or
2576                          * direct IO write and it extends the file size */
2577                         if (loi->loi_lvb.lvb_size < last_off) {
2578                                 attr->cat_size = last_off;
2579                                 valid |= CAT_SIZE;
2580                         }
2581                         /* Extend KMS if it's not a lockless write */
2582                         if (loi->loi_kms < last_off &&
2583                             oap2osc_page(last)->ops_srvlock == 0) {
2584                                 attr->cat_kms = last_off;
2585                                 valid |= CAT_KMS;
2586                         }
2587                 }
2588
2589                 if (valid != 0)
2590                         cl_object_attr_update(env, obj, attr, valid);
2591                 cl_object_attr_unlock(obj);
2592         }
2593         OBD_SLAB_FREE_PTR(aa->aa_oa, osc_obdo_kmem);
2594         aa->aa_oa = NULL;
2595
2596         if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && rc == 0)
2597                 osc_inc_unstable_pages(req);
2598
2599         list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) {
2600                 list_del_init(&ext->oe_link);
2601                 osc_extent_finish(env, ext, 1,
2602                                   rc && req->rq_no_delay ? -EAGAIN : rc);
2603         }
2604         LASSERT(list_empty(&aa->aa_exts));
2605         LASSERT(list_empty(&aa->aa_oaps));
2606
2607         transferred = (req->rq_bulk == NULL ? /* short io */
2608                        aa->aa_requested_nob :
2609                        req->rq_bulk->bd_nob_transferred);
2610
2611         osc_release_ppga(aa->aa_ppga, aa->aa_page_count);
2612         ptlrpc_lprocfs_brw(req, transferred);
2613
2614         spin_lock(&cli->cl_loi_list_lock);
2615         /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters
2616          * is called so we know whether to go to sync BRWs or wait for more
2617          * RPCs to complete */
2618         if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE)
2619                 cli->cl_w_in_flight--;
2620         else
2621                 cli->cl_r_in_flight--;
2622         osc_wake_cache_waiters(cli);
2623         spin_unlock(&cli->cl_loi_list_lock);
2624
2625         osc_io_unplug(env, cli, NULL);
2626         RETURN(rc);
2627 }
2628
2629 static void brw_commit(struct ptlrpc_request *req)
2630 {
2631         /* If osc_inc_unstable_pages (via osc_extent_finish) races with
2632          * this function being called via rq_commit_cb, we need to ensure
2633          * osc_dec_unstable_pages is still called. Otherwise unstable
2634          * pages may be leaked. */
2635         spin_lock(&req->rq_lock);
2636         if (likely(req->rq_unstable)) {
2637                 req->rq_unstable = 0;
2638                 spin_unlock(&req->rq_lock);
2639
2640                 osc_dec_unstable_pages(req);
2641         } else {
2642                 req->rq_committed = 1;
2643                 spin_unlock(&req->rq_lock);
2644         }
2645 }
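
/*
 * A minimal sketch of the two orderings brw_commit() resolves (hedged;
 * inferred from the comment above and the unstable-page accounting):
 *
 *   inc path first:  osc_inc_unstable_pages() marks rq_unstable; the
 *                    later brw_commit() clears the flag and performs
 *                    the matching osc_dec_unstable_pages().
 *   commit first:    brw_commit() only records rq_committed; the inc
 *                    path is then expected to observe rq_committed
 *                    under rq_lock and undo its own increment.
 */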
2646
2647 /**
2648  * Build an RPC from the list of extents @ext_list. The caller must ensure
2649  * that the total page count in this list does not exceed max pages per RPC.
2650  * Extents in the list must be in OES_RPC state.
2651  */
2652 int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
2653                   struct list_head *ext_list, int cmd)
2654 {
2655         struct ptlrpc_request           *req = NULL;
2656         struct osc_extent               *ext;
2657         struct brw_page                 **pga = NULL;
2658         struct osc_brw_async_args       *aa = NULL;
2659         struct obdo                     *oa = NULL;
2660         struct osc_async_page           *oap;
2661         struct osc_object               *obj = NULL;
2662         struct cl_req_attr              *crattr = NULL;
2663         loff_t                          starting_offset = OBD_OBJECT_EOF;
2664         loff_t                          ending_offset = 0;
2665         /* '1' for consistency with code that checks !mpflag to restore */
2666         int mpflag = 1;
2667         int                             mem_tight = 0;
2668         int                             page_count = 0;
2669         bool                            soft_sync = false;
2670         bool                            ndelay = false;
2671         int                             i;
2672         int                             grant = 0;
2673         int                             rc;
2674         __u32                           layout_version = 0;
2675         LIST_HEAD(rpc_list);
2676         struct ost_body                 *body;
2677         ENTRY;
2678         LASSERT(!list_empty(ext_list));
2679
2680         /* add pages into rpc_list to build BRW rpc */
2681         list_for_each_entry(ext, ext_list, oe_link) {
2682                 LASSERT(ext->oe_state == OES_RPC);
2683                 mem_tight |= ext->oe_memalloc;
2684                 grant += ext->oe_grants;
2685                 page_count += ext->oe_nr_pages;
2686                 layout_version = max(layout_version, ext->oe_layout_version);
2687                 if (obj == NULL)
2688                         obj = ext->oe_obj;
2689         }
2690
2691         soft_sync = osc_over_unstable_soft_limit(cli);
2692         if (mem_tight)
2693                 mpflag = memalloc_noreclaim_save();
2694
2695         OBD_ALLOC_PTR_ARRAY_LARGE(pga, page_count);
2696         if (pga == NULL)
2697                 GOTO(out, rc = -ENOMEM);
2698
2699         OBD_SLAB_ALLOC_PTR_GFP(oa, osc_obdo_kmem, GFP_NOFS);
2700         if (oa == NULL)
2701                 GOTO(out, rc = -ENOMEM);
2702
2703         i = 0;
2704         list_for_each_entry(ext, ext_list, oe_link) {
2705                 list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
2706                         if (mem_tight)
2707                                 oap->oap_brw_flags |= OBD_BRW_MEMALLOC;
2708                         if (soft_sync)
2709                                 oap->oap_brw_flags |= OBD_BRW_SOFT_SYNC;
2710                         pga[i] = &oap->oap_brw_page;
2711                         pga[i]->off = oap->oap_obj_off + oap->oap_page_off;
2712                         i++;
2713
2714                         list_add_tail(&oap->oap_rpc_item, &rpc_list);
2715                         if (starting_offset == OBD_OBJECT_EOF ||
2716                             starting_offset > oap->oap_obj_off)
2717                                 starting_offset = oap->oap_obj_off;
2718                         else
2719                                 LASSERT(oap->oap_page_off == 0);
2720                         if (ending_offset < oap->oap_obj_off + oap->oap_count)
2721                                 ending_offset = oap->oap_obj_off +
2722                                                 oap->oap_count;
2723                         else
2724                                 LASSERT(oap->oap_page_off + oap->oap_count ==
2725                                         PAGE_SIZE);
2726                 }
2727                 if (ext->oe_ndelay)
2728                         ndelay = true;
2729         }
2730
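        /* A sketch of the resulting invariant (assuming the LASSERTs in
         * the loop above hold): within each extent only the first page
         * may start at a non-zero in-page offset and only the last may
         * end before PAGE_SIZE, so [starting_offset, ending_offset)
         * covers the whole contiguous byte range sent by this RPC. */
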
2731         /* first page in the list */
2732         oap = list_first_entry(&rpc_list, typeof(*oap), oap_rpc_item);
2733
2734         crattr = &osc_env_info(env)->oti_req_attr;
2735         memset(crattr, 0, sizeof(*crattr));
2736         crattr->cra_type = (cmd & OBD_BRW_WRITE) ? CRT_WRITE : CRT_READ;
2737         crattr->cra_flags = ~0ULL;
2738         crattr->cra_page = oap2cl_page(oap);
2739         crattr->cra_oa = oa;
2740         cl_req_attr_set(env, osc2cl(obj), crattr);
2741
2742         if (cmd == OBD_BRW_WRITE) {
2743                 oa->o_grant_used = grant;
2744                 if (layout_version > 0) {
2745                         CDEBUG(D_LAYOUT, DFID": write with layout version %u\n",
2746                                PFID(&oa->o_oi.oi_fid), layout_version);
2747
2748                         oa->o_layout_version = layout_version;
2749                         oa->o_valid |= OBD_MD_LAYOUT_VERSION;
2750                 }
2751         }
2752
2753         sort_brw_pages(pga, page_count);
2754         rc = osc_brw_prep_request(cmd, cli, oa, page_count, pga, &req, 0);
2755         if (rc != 0) {
2756                 CERROR("prep_req failed: %d\n", rc);
2757                 GOTO(out, rc);
2758         }
2759
2760         req->rq_commit_cb = brw_commit;
2761         req->rq_interpret_reply = brw_interpret;
2762         req->rq_memalloc = mem_tight != 0;
2763         oap->oap_request = ptlrpc_request_addref(req);
2764         if (ndelay) {
2765                 req->rq_no_resend = req->rq_no_delay = 1;
2766                 /* Probably we should set a shorter timeout value here
2767                  * to handle ETIMEDOUT in brw_interpret() correctly. */
2768                 /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */
2769         }
2770
2771         /* Need to update the timestamps after the request is built in case
2772          * we race with setattr (locally or in queue at the OST).  If the OST
2773          * gets the later setattr before the earlier BRW (as determined by the
2774          * request xid), it will not use the BRW timestamps.  Sadly, there is
2775          * no obvious way to do this in a single call.  bug 10150 */
2776         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
2777         crattr->cra_oa = &body->oa;
2778         crattr->cra_flags = OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLATIME;
2779         cl_req_attr_set(env, osc2cl(obj), crattr);
2780         lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid);
2781
2782         aa = ptlrpc_req_async_args(aa, req);
2783         INIT_LIST_HEAD(&aa->aa_oaps);
2784         list_splice_init(&rpc_list, &aa->aa_oaps);
2785         INIT_LIST_HEAD(&aa->aa_exts);
2786         list_splice_init(ext_list, &aa->aa_exts);
2787
2788         spin_lock(&cli->cl_loi_list_lock);
2789         starting_offset >>= PAGE_SHIFT;
2790         ending_offset >>= PAGE_SHIFT;
2791         if (cmd == OBD_BRW_READ) {
2792                 cli->cl_r_in_flight++;
2793                 lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count);
2794                 lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight);
2795                 lprocfs_oh_tally_log2(&cli->cl_read_offset_hist,
2796                                       starting_offset + 1);
2797         } else {
2798                 cli->cl_w_in_flight++;
2799                 lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count);
2800                 lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight);
2801                 lprocfs_oh_tally_log2(&cli->cl_write_offset_hist,
2802                                       starting_offset + 1);
2803         }
2804         spin_unlock(&cli->cl_loi_list_lock);
2805
2806         DEBUG_REQ(D_INODE, req, "%d pages, aa %p, now %ur/%uw in flight",
2807                   page_count, aa, cli->cl_r_in_flight, cli->cl_w_in_flight);
2808         if (libcfs_debug & D_IOTRACE) {
2809                 struct lu_fid fid;
2810
2811                 fid.f_seq = crattr->cra_oa->o_parent_seq;
2812                 fid.f_oid = crattr->cra_oa->o_parent_oid;
2813                 fid.f_ver = crattr->cra_oa->o_parent_ver;
2814                 CDEBUG(D_IOTRACE,
2815                        DFID": %d %s pages, start %lld, end %lld, now %ur/%uw in flight\n",
2816                        PFID(&fid), page_count,
2817                        cmd == OBD_BRW_READ ? "read" : "write", starting_offset,
2818                        ending_offset, cli->cl_r_in_flight, cli->cl_w_in_flight);
2819         }
2820         OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_IO, cfs_fail_val);
2821
2822         ptlrpcd_add_req(req);
2823         rc = 0;
2824         EXIT;
2825
2826 out:
2827         if (mem_tight)
2828                 memalloc_noreclaim_restore(mpflag);
2829
2830         if (rc != 0) {
2831                 LASSERT(req == NULL);
2832
2833                 if (oa)
2834                         OBD_SLAB_FREE_PTR(oa, osc_obdo_kmem);
2835                 if (pga) {
2836                         osc_release_bounce_pages(pga, page_count);
2837                         osc_release_ppga(pga, page_count);
2838                 }
2839                 /* this should happen rarely and is pretty bad; it makes the
2840                  * pending list not follow the dirty order
2841                  */
2842                 while ((ext = list_first_entry_or_null(ext_list,
2843                                                        struct osc_extent,
2844                                                        oe_link)) != NULL) {
2845                         list_del_init(&ext->oe_link);
2846                         osc_extent_finish(env, ext, 0, rc);
2847                 }
2848         }
2849         RETURN(rc);
2850 }
2851
2852 /* This is to refresh our lock in face of no RPCs. */
2853 void osc_send_empty_rpc(struct osc_object *osc, pgoff_t start)
2854 {
2855         struct ptlrpc_request *req;
2856         struct obdo oa;
2857         struct brw_page bpg = { .off = start, .count = 1};
2858         struct brw_page *pga = &bpg;
2859         int rc;
2860
2861         memset(&oa, 0, sizeof(oa));
2862         oa.o_oi = osc->oo_oinfo->loi_oi;
2863         oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLFLAGS;
2864         /* For updated servers - don't do a read */
2865         oa.o_flags = OBD_FL_NORPC;
2866
2867         rc = osc_brw_prep_request(OBD_BRW_READ, osc_cli(osc), &oa, 1, &pga,
2868                                   &req, 0);
2869
2870         /* If we succeeded, ship it off; if not, there is no point in doing
2871          * anything. Also no resends.
2872          * No interpret callback, no commit callback.
2873          */
2874         if (!rc) {
2875                 req->rq_no_resend = 1;
2876                 ptlrpcd_add_req(req);
2877         }
2878 }
2879
2880 static int osc_set_lock_data(struct ldlm_lock *lock, void *data)
2881 {
2882         int set = 0;
2883
2884         LASSERT(lock != NULL);
2885
2886         lock_res_and_lock(lock);
2887
2888         if (lock->l_ast_data == NULL)
2889                 lock->l_ast_data = data;
2890         if (lock->l_ast_data == data)
2891                 set = 1;
2892
2893         unlock_res_and_lock(lock);
2894
2895         return set;
2896 }
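
/*
 * Illustrative caller pattern for osc_set_lock_data(), a sketch mirroring
 * its uses in osc_enqueue_base() and osc_match_base() below:
 *
 *	lock = ldlm_handle2lock(lockh);
 *	if (osc_set_lock_data(lock, obj)) {
 *		... l_ast_data now points at obj, either set here or
 *		    already set to obj by an earlier caller ...
 *	} else {
 *		... the lock belongs to another object: drop the match
 *		    reference and enqueue a fresh lock instead ...
 *		ldlm_lock_decref(lockh, mode);
 *	}
 *	LDLM_LOCK_PUT(lock);
 */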
2897
2898 int osc_enqueue_fini(struct ptlrpc_request *req, osc_enqueue_upcall_f upcall,
2899                      void *cookie, struct lustre_handle *lockh,
2900                      enum ldlm_mode mode, __u64 *flags, bool speculative,
2901                      int errcode)
2902 {
2903         bool intent = *flags & LDLM_FL_HAS_INTENT;
2904         int rc;
2905         ENTRY;
2906
2907         /* The request was created before ldlm_cli_enqueue call. */
2908         if (intent && errcode == ELDLM_LOCK_ABORTED) {
2909                 struct ldlm_reply *rep;
2910
2911                 rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
2912                 LASSERT(rep != NULL);
2913
2914                 rep->lock_policy_res1 =
2915                         ptlrpc_status_ntoh(rep->lock_policy_res1);
2916                 if (rep->lock_policy_res1)
2917                         errcode = rep->lock_policy_res1;
2918                 if (!speculative)
2919                         *flags |= LDLM_FL_LVB_READY;
2920         } else if (errcode == ELDLM_OK) {
2921                 *flags |= LDLM_FL_LVB_READY;
2922         }
2923
2924         /* Call the update callback. */
2925         rc = (*upcall)(cookie, lockh, errcode);
2926
2927         /* release the reference taken in ldlm_cli_enqueue() */
2928         if (errcode == ELDLM_LOCK_MATCHED)
2929                 errcode = ELDLM_OK;
2930         if (errcode == ELDLM_OK && lustre_handle_is_used(lockh))
2931                 ldlm_lock_decref(lockh, mode);
2932
2933         RETURN(rc);
2934 }
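
/*
 * A condensed sketch of the errcode flow above (hedged summary): for an
 * intent enqueue aborted by the server, the real status is recovered
 * from lock_policy_res1; ELDLM_LOCK_MATCHED is normalized to ELDLM_OK
 * after the upcall so the reference taken in ldlm_cli_enqueue() is
 * dropped exactly once.
 */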
2935
2936 int osc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req,
2937                           void *args, int rc)
2938 {
2939         struct osc_enqueue_args *aa = args;
2940         struct ldlm_lock *lock;
2941         struct lustre_handle *lockh = &aa->oa_lockh;
2942         enum ldlm_mode mode = aa->oa_mode;
2943         struct ost_lvb *lvb = aa->oa_lvb;
2944         __u32 lvb_len = sizeof(*lvb);
2945         __u64 flags = 0;
2946         struct ldlm_enqueue_info einfo = {
2947                 .ei_type = aa->oa_type,
2948                 .ei_mode = mode,
2949         };
2950
2951         ENTRY;
2952
2953         /* ldlm_cli_enqueue is holding a reference on the lock, so it must
2954          * be valid. */
2955         lock = ldlm_handle2lock(lockh);
2956         LASSERTF(lock != NULL,
2957                  "lockh %#llx, req %p, aa %p - client evicted?\n",
2958                  lockh->cookie, req, aa);
2959
2960         /* Take an additional reference so that a blocking AST that
2961          * ldlm_cli_enqueue_fini() might post for a failed lock is guaranteed
2962          * to arrive after an upcall has been executed by
2963          * osc_enqueue_fini(). */
2964         ldlm_lock_addref(lockh, mode);
2965
2966         /* Let cl_lock_state_wait fail with -ERESTARTSYS to unuse sublocks. */
2967         OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_HANG, 2);
2968
2969         /* Let the CP AST grant the lock first. */
2970         OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1);
2971
2972         if (aa->oa_speculative) {
2973                 LASSERT(aa->oa_lvb == NULL);
2974                 LASSERT(aa->oa_flags == NULL);
2975                 aa->oa_flags = &flags;
2976         }
2977
2978         /* Complete obtaining the lock procedure. */
2979         rc = ldlm_cli_enqueue_fini(aa->oa_exp, &req->rq_pill, &einfo, 1,
2980                                    aa->oa_flags, lvb, lvb_len, lockh, rc,
2981                                    false);
2982         /* Complete osc stuff. */
2983         rc = osc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode,
2984                               aa->oa_flags, aa->oa_speculative, rc);
2985
2986         OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10);
2987
2988         ldlm_lock_decref(lockh, mode);
2989         LDLM_LOCK_PUT(lock);
2990         RETURN(rc);
2991 }
2992
2993 /* When enqueuing asynchronously, locks are not ordered; we can obtain a lock
2994  * from the 2nd OSC before a lock from the 1st one. This does not deadlock with
2995  * other synchronous requests; however, holding some locks while trying to
2996  * obtain others may take a considerable amount of time in case of OST failure,
2997  * and when a client does not release locks that other sync requests wait on,
2998  * that client is evicted from the cluster -- such scenarios make life
2999  * difficult, so release locks just after they are obtained. */
3000 int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id,
3001                      __u64 *flags, union ldlm_policy_data *policy,
3002                      struct ost_lvb *lvb, osc_enqueue_upcall_f upcall,
3003                      void *cookie, struct ldlm_enqueue_info *einfo,
3004                      struct ptlrpc_request_set *rqset, int async,
3005                      bool speculative)
3006 {
3007         struct obd_device *obd = exp->exp_obd;
3008         struct lustre_handle lockh = { 0 };
3009         struct ptlrpc_request *req = NULL;
3010         int intent = *flags & LDLM_FL_HAS_INTENT;
3011         __u64 search_flags = *flags;
3012         __u64 match_flags = 0;
3013         enum ldlm_mode mode;
3014         int rc;
3015         ENTRY;
3016
3017         /* Filesystem lock extents are extended to page boundaries so that
3018          * dealing with the page cache is a little smoother.  */
3019         policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK;
3020         policy->l_extent.end |= ~PAGE_MASK;
3021
3022         /* Next, search for already existing extent locks that will cover us */
3023         /* If we're trying to read, we also search for an existing PW lock.  The
3024          * VFS and page cache already protect us locally, so lots of readers/
3025          * writers can share a single PW lock.
3026          *
3027          * There are problems with conversion deadlocks, so instead of
3028          * converting a read lock to a write lock, we'll just enqueue a new
3029          * one.
3030          *
3031          * At some point we should cancel the read lock instead of making them
3032          * send us a blocking callback, but there are problems with canceling
3033          * locks out from other users right now, too. */
3034         mode = einfo->ei_mode;
3035         if (einfo->ei_mode == LCK_PR)
3036                 mode |= LCK_PW;
3037         /* Normal lock requests must wait for the LVB to be ready before
3038          * matching a lock; speculative lock requests do not need to,
3039          * because they will not actually use the lock. */
3040         if (!speculative)
3041                 search_flags |= LDLM_FL_LVB_READY;
3042         if (intent != 0)
3043                 search_flags |= LDLM_FL_BLOCK_GRANTED;
3044         if (mode == LCK_GROUP)
3045                 match_flags = LDLM_MATCH_GROUP;
3046         mode = ldlm_lock_match_with_skip(obd->obd_namespace, search_flags, 0,
3047                                          res_id, einfo->ei_type, policy, mode,
3048                                          &lockh, match_flags);
3049         if (mode) {
3050                 struct ldlm_lock *matched;
3051
3052                 if (*flags & LDLM_FL_TEST_LOCK)
3053                         RETURN(ELDLM_OK);
3054
3055                 matched = ldlm_handle2lock(&lockh);
3056                 if (speculative) {
3057                         /* This DLM lock request is speculative, and does not
3058                          * have an associated IO request. Therefore if there
3059                          * is already a DLM lock, it will just inform the
3060                          * caller to cancel the request for this stripe. */
3061                         lock_res_and_lock(matched);
3062                         if (ldlm_extent_equal(&policy->l_extent,
3063                             &matched->l_policy_data.l_extent))
3064                                 rc = -EEXIST;
3065                         else
3066                                 rc = -ECANCELED;
3067                         unlock_res_and_lock(matched);
3068
3069                         ldlm_lock_decref(&lockh, mode);
3070                         LDLM_LOCK_PUT(matched);
3071                         RETURN(rc);
3072                 } else if (osc_set_lock_data(matched, einfo->ei_cbdata)) {
3073                         *flags |= LDLM_FL_LVB_READY;
3074
3075                         /* We already have a lock, and it's referenced. */
3076                         (*upcall)(cookie, &lockh, ELDLM_LOCK_MATCHED);
3077
3078                         ldlm_lock_decref(&lockh, mode);
3079                         LDLM_LOCK_PUT(matched);
3080                         RETURN(ELDLM_OK);
3081                 } else {
3082                         ldlm_lock_decref(&lockh, mode);
3083                         LDLM_LOCK_PUT(matched);
3084                 }
3085         }
3086
3087         if (*flags & (LDLM_FL_TEST_LOCK | LDLM_FL_MATCH_LOCK))
3088                 RETURN(-ENOLCK);
3089
3090         /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */
3091         *flags &= ~LDLM_FL_BLOCK_GRANTED;
3092
3093         rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb,
3094                               sizeof(*lvb), LVB_T_OST, &lockh, async);
3095         if (async) {
3096                 if (!rc) {
3097                         struct osc_enqueue_args *aa;
3098                         aa = ptlrpc_req_async_args(aa, req);
3099                         aa->oa_exp         = exp;
3100                         aa->oa_mode        = einfo->ei_mode;
3101                         aa->oa_type        = einfo->ei_type;
3102                         lustre_handle_copy(&aa->oa_lockh, &lockh);
3103                         aa->oa_upcall      = upcall;
3104                         aa->oa_cookie      = cookie;
3105                         aa->oa_speculative = speculative;
3106                         if (!speculative) {
3107                                 aa->oa_flags  = flags;
3108                                 aa->oa_lvb    = lvb;
3109                         } else {
3110                                 /* speculative locks essentially enqueue
3111                                  * a DLM lock in advance, so we don't care
3112                                  * about the result of the enqueue. */
3113                                 aa->oa_lvb    = NULL;
3114                                 aa->oa_flags  = NULL;
3115                         }
3116
3117                         req->rq_interpret_reply = osc_enqueue_interpret;
3118                         ptlrpc_set_add_req(rqset, req);
3119                 }
3120                 RETURN(rc);
3121         }
3122
3123         rc = osc_enqueue_fini(req, upcall, cookie, &lockh, einfo->ei_mode,
3124                               flags, speculative, rc);
3125
3126         RETURN(rc);
3127 }
3128
3129 int osc_match_base(const struct lu_env *env, struct obd_export *exp,
3130                    struct ldlm_res_id *res_id, enum ldlm_type type,
3131                    union ldlm_policy_data *policy, enum ldlm_mode mode,
3132                    __u64 *flags, struct osc_object *obj,
3133                    struct lustre_handle *lockh, enum ldlm_match_flags match_flags)
3134 {
3135         struct obd_device *obd = exp->exp_obd;
3136         __u64 lflags = *flags;
3137         enum ldlm_mode rc;
3138         ENTRY;
3139
3140         if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH))
3141                 RETURN(-EIO);
3142
3143         /* Filesystem lock extents are extended to page boundaries so that
3144          * dealing with the page cache is a little smoother */
3145         policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK;
3146         policy->l_extent.end |= ~PAGE_MASK;
3147
3148         /* Next, search for already existing extent locks that will cover us */
3149         rc = ldlm_lock_match_with_skip(obd->obd_namespace, lflags, 0,
3150                                         res_id, type, policy, mode, lockh,
3151                                         match_flags);
3152         if (rc == 0 || lflags & LDLM_FL_TEST_LOCK)
3153                 RETURN(rc);
3154
3155         if (obj != NULL) {
3156                 struct ldlm_lock *lock = ldlm_handle2lock(lockh);
3157
3158                 LASSERT(lock != NULL);
3159                 if (osc_set_lock_data(lock, obj)) {
3160                         lock_res_and_lock(lock);
3161                         if (!ldlm_is_lvb_cached(lock)) {
3162                                 LASSERT(lock->l_ast_data == obj);
3163                                 osc_lock_lvb_update(env, obj, lock, NULL);
3164                                 ldlm_set_lvb_cached(lock);
3165                         }
3166                         unlock_res_and_lock(lock);
3167                 } else {
3168                         ldlm_lock_decref(lockh, rc);
3169                         rc = 0;
3170                 }
3171                 LDLM_LOCK_PUT(lock);
3172         }
3173         RETURN(rc);
3174 }
3175
3176 static int osc_statfs_interpret(const struct lu_env *env,
3177                                 struct ptlrpc_request *req, void *args, int rc)
3178 {
3179         struct osc_async_args *aa = args;
3180         struct obd_statfs *msfs;
3181
3182         ENTRY;
3183         if (rc == -EBADR)
3184                 /*
3185                  * The request has in fact never been sent due to issues at
3186                  * a higher level (LOV).  Exit immediately since the caller
3187                  * is aware of the problem and takes care of the clean up.
3188                  */
3189                 RETURN(rc);
3190
3191         if ((rc == -ENOTCONN || rc == -EAGAIN) &&
3192             (aa->aa_oi->oi_flags & OBD_STATFS_NODELAY))
3193                 GOTO(out, rc = 0);
3194
3195         if (rc != 0)
3196                 GOTO(out, rc);
3197
3198         msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
3199         if (msfs == NULL)
3200                 GOTO(out, rc = -EPROTO);
3201
3202         *aa->aa_oi->oi_osfs = *msfs;
3203 out:
3204         rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
3205
3206         RETURN(rc);
3207 }
3208
3209 static int osc_statfs_async(struct obd_export *exp,
3210                             struct obd_info *oinfo, time64_t max_age,
3211                             struct ptlrpc_request_set *rqset)
3212 {
3213         struct obd_device     *obd = class_exp2obd(exp);
3214         struct ptlrpc_request *req;
3215         struct osc_async_args *aa;
3216         int rc;
3217         ENTRY;
3218
3219         if (obd->obd_osfs_age >= max_age) {
3220                 CDEBUG(D_SUPER,
3221                        "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n",
3222                        obd->obd_name, &obd->obd_osfs,
3223                        obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks,
3224                        obd->obd_osfs.os_ffree, obd->obd_osfs.os_files);
3225                 spin_lock(&obd->obd_osfs_lock);
3226                 memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs));
3227                 spin_unlock(&obd->obd_osfs_lock);
3228                 oinfo->oi_flags |= OBD_STATFS_FROM_CACHE;
3229                 if (oinfo->oi_cb_up)
3230                         oinfo->oi_cb_up(oinfo, 0);
3231
3232                 RETURN(0);
3233         }
3234
3235         /* We could possibly pass max_age in the request (as an absolute
3236          * timestamp or a "seconds.usec ago") so the target can avoid doing
3237          * extra calls into the filesystem if that isn't necessary (e.g.
3238          * during mount that would help a bit).  Having relative timestamps
3239          * is not so great if request processing is slow, while absolute
3240          * timestamps are not ideal because they need time synchronization. */
3241         req = ptlrpc_request_alloc(obd->u.cli.cl_import, &RQF_OST_STATFS);
3242         if (req == NULL)
3243                 RETURN(-ENOMEM);
3244
3245         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
3246         if (rc) {
3247                 ptlrpc_request_free(req);
3248                 RETURN(rc);
3249         }
3250         ptlrpc_request_set_replen(req);
3251         req->rq_request_portal = OST_CREATE_PORTAL;
3252         ptlrpc_at_set_req_timeout(req);
3253
3254         if (oinfo->oi_flags & OBD_STATFS_NODELAY) {
3255                 /* procfs requests must not wait or resend, to avoid a deadlock */
3256                 req->rq_no_resend = 1;
3257                 req->rq_no_delay = 1;
3258         }
3259
3260         req->rq_interpret_reply = osc_statfs_interpret;
3261         aa = ptlrpc_req_async_args(aa, req);
3262         aa->aa_oi = oinfo;
3263
3264         ptlrpc_set_add_req(rqset, req);
3265         RETURN(0);
3266 }
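
/*
 * Illustrative (hypothetical) caller sketch for osc_statfs_async(); real
 * callers normally arrive via obd_statfs_async() with a shared request set:
 *
 *	struct ptlrpc_request_set *set = ptlrpc_prep_set();
 *	struct obd_info oinfo = { .oi_osfs = &osfs, .oi_cb_up = my_statfs_cb };
 *
 *	rc = osc_statfs_async(exp, &oinfo, max_age, set);
 *	if (rc == 0)
 *		rc = ptlrpc_set_wait(env, set);
 *	ptlrpc_set_destroy(set);
 *
 * my_statfs_cb is a hypothetical completion callback stored in oi_cb_up.
 */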
3267
3268 static int osc_statfs(const struct lu_env *env, struct obd_export *exp,
3269                       struct obd_statfs *osfs, time64_t max_age, __u32 flags)
3270 {
3271         struct obd_device     *obd = class_exp2obd(exp);
3272         struct obd_statfs     *msfs;
3273         struct ptlrpc_request *req;
3274         struct obd_import     *imp, *imp0;
3275         int rc;
3276         ENTRY;
3277
3278         /* Since the request might also come from lprocfs, we need to
3279          * sync this with client_disconnect_export() (Bug 15684).
3280          */
3281         with_imp_locked(obd, imp0, rc)
3282                 imp = class_import_get(imp0);
3283         if (rc)
3284                 RETURN(rc);
3285
3286         /* We could possibly pass max_age in the request (as an absolute
3287          * timestamp or a "seconds.usec ago") so the target can avoid doing
3288          * extra calls into the filesystem if that isn't necessary (e.g.
3289          * during mount that would help a bit).  Having relative timestamps
3290          * is not so great if request processing is slow, while absolute
3291          * timestamps are not ideal because they need time synchronization. */
3292         req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS);
3293
3294         class_import_put(imp);
3295
3296         if (req == NULL)
3297                 RETURN(-ENOMEM);
3298
3299         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
3300         if (rc) {
3301                 ptlrpc_request_free(req);
3302                 RETURN(rc);
3303         }
3304         ptlrpc_request_set_replen(req);
3305         req->rq_request_portal = OST_CREATE_PORTAL;
3306         ptlrpc_at_set_req_timeout(req);
3307
3308         if (flags & OBD_STATFS_NODELAY) {
3309                 /* procfs requests must not wait or resend, to avoid a deadlock */
3310                 req->rq_no_resend = 1;
3311                 req->rq_no_delay = 1;
3312         }
3313
3314         rc = ptlrpc_queue_wait(req);
3315         if (rc)
3316                 GOTO(out, rc);
3317
3318         msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
3319         if (msfs == NULL)
3320                 GOTO(out, rc = -EPROTO);
3321
3322         *osfs = *msfs;
3323
3324         EXIT;
3325 out:
3326         ptlrpc_req_finished(req);
3327         return rc;
3328 }
3329
3330 static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
3331                          void *karg, void __user *uarg)
3332 {
3333         struct obd_device *obd = exp->exp_obd;
3334         struct obd_ioctl_data *data = karg;
3335         int rc = 0;
3336
3337         ENTRY;
3338         CDEBUG(D_IOCTL, "%s: cmd=%x len=%u karg=%pK uarg=%pK\n",
3339                obd->obd_name, cmd, len, karg, uarg);
3340
3341         if (!try_module_get(THIS_MODULE)) {
3342                 CERROR("%s: cannot get module '%s'\n", obd->obd_name,
3343                        module_name(THIS_MODULE));
3344                 return -EINVAL;
3345         }
3346         switch (cmd) {
3347         case OBD_IOC_CLIENT_RECOVER:
3348                 rc = ptlrpc_recover_import(obd->u.cli.cl_import,
3349                                            data->ioc_inlbuf1, 0);
3350                 if (rc > 0)
3351                         rc = 0;
3352                 break;
3353         case OBD_IOC_GETATTR:
3354                 rc = obd_getattr(NULL, exp, &data->ioc_obdo1);
3355                 break;
3356         case IOC_OSC_SET_ACTIVE:
3357                 rc = ptlrpc_set_import_active(obd->u.cli.cl_import,
3358                                               data->ioc_offset);
3359                 break;
3360         default:
3361                 rc = OBD_IOC_DEBUG(D_IOCTL, obd->obd_name, cmd, "unrecognized",
3362                                    -ENOTTY);
3363                 break;
3364         }
3365
3366         module_put(THIS_MODULE);
3367         return rc;
3368 }
3369
3370 int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
3371                        u32 keylen, void *key, u32 vallen, void *val,
3372                        struct ptlrpc_request_set *set)
3373 {
3374         struct ptlrpc_request *req;
3375         struct obd_device     *obd = exp->exp_obd;
3376         struct obd_import     *imp = class_exp2cliimp(exp);
3377         char                  *tmp;
3378         int                    rc;
3379         ENTRY;
3380
3381         OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10);
3382
3383         if (KEY_IS(KEY_CHECKSUM)) {
3384                 if (vallen != sizeof(int))
3385                         RETURN(-EINVAL);
3386                 exp->exp_obd->u.cli.cl_checksum = (*(int *)val) ? 1 : 0;
3387                 RETURN(0);
3388         }
3389
3390         if (KEY_IS(KEY_SPTLRPC_CONF)) {
3391                 sptlrpc_conf_client_adapt(obd);
3392                 RETURN(0);
3393         }
3394
3395         if (KEY_IS(KEY_FLUSH_CTX)) {
3396                 sptlrpc_import_flush_my_ctx(imp);
3397                 RETURN(0);
3398         }
3399
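        /* A worked example with hypothetical numbers for the LRU-shrink
         * branch below: with 1000 pages on the LRU list and a caller
         * target of 300, nr starts at 500 (half the list), min(500, 300)
         * = 300 pages are submitted for shrinking, and the caller's
         * *val is reduced by the number actually freed. */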
3400         if (KEY_IS(KEY_CACHE_LRU_SHRINK)) {
3401                 struct client_obd *cli = &obd->u.cli;
3402                 long nr = atomic_long_read(&cli->cl_lru_in_list) >> 1;
3403                 long target = *(long *)val;
3404
3405                 nr = osc_lru_shrink(env, cli, min(nr, target), true);
3406                 *(long *)val -= nr;
3407                 RETURN(0);
3408         }
3409
3410         if (!set && !KEY_IS(KEY_GRANT_SHRINK))
3411                 RETURN(-EINVAL);
3412
3413         /* We pass all other commands directly to the OST. Since nobody calls
3414          * OSC methods directly and everybody is supposed to go through LOV, we
3415          * assume LOV checked invalid values for us.
3416          * The only recognized values so far are evict_by_nid and mds_conn.
3417          * Even if something bad goes through, we'd get a -EINVAL from the OST
3418          * anyway. */
3419
3420         req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ?
3421                                                 &RQF_OST_SET_GRANT_INFO :
3422                                                 &RQF_OBD_SET_INFO);
3423         if (req == NULL)
3424                 RETURN(-ENOMEM);
3425
3426         req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
3427                              RCL_CLIENT, keylen);
3428         if (!KEY_IS(KEY_GRANT_SHRINK))
3429                 req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL,
3430                                      RCL_CLIENT, vallen);
3431         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO);
3432         if (rc) {
3433                 ptlrpc_request_free(req);
3434                 RETURN(rc);
3435         }
3436
3437         tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
3438         memcpy(tmp, key, keylen);
3439         tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ?
3440                                                         &RMF_OST_BODY :
3441                                                         &RMF_SETINFO_VAL);
3442         memcpy(tmp, val, vallen);
3443
3444         if (KEY_IS(KEY_GRANT_SHRINK)) {
3445                 struct osc_grant_args *aa;
3446                 struct obdo *oa;
3447
3448                 aa = ptlrpc_req_async_args(aa, req);
3449                 OBD_SLAB_ALLOC_PTR_GFP(oa, osc_obdo_kmem, GFP_NOFS);
3450                 if (!oa) {
3451                         ptlrpc_req_finished(req);
3452                         RETURN(-ENOMEM);
3453                 }
3454                 *oa = ((struct ost_body *)val)->oa;
3455                 aa->aa_oa = oa;
3456                 req->rq_interpret_reply = osc_shrink_grant_interpret;
3457         }
3458
3459         ptlrpc_request_set_replen(req);
3460         if (!KEY_IS(KEY_GRANT_SHRINK)) {
3461                 LASSERT(set != NULL);
3462                 ptlrpc_set_add_req(set, req);
3463                 ptlrpc_check_set(NULL, set);
3464         } else {
3465                 ptlrpcd_add_req(req);
3466         }
3467
3468         RETURN(0);
3469 }
3470 EXPORT_SYMBOL(osc_set_info_async);
3471
3472 int osc_reconnect(const struct lu_env *env, struct obd_export *exp,
3473                   struct obd_device *obd, struct obd_uuid *cluuid,
3474                   struct obd_connect_data *data, void *localdata)
3475 {
3476         struct client_obd *cli = &obd->u.cli;
3477
3478         if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
3479                 long lost_grant;
3480                 long grant;
3481
3482                 spin_lock(&cli->cl_loi_list_lock);
3483                 grant = cli->cl_avail_grant + cli->cl_reserved_grant;
3484                 if (data->ocd_connect_flags & OBD_CONNECT_GRANT_PARAM) {
3485                         /* restore ocd_grant_blkbits as client page bits */
3486                         data->ocd_grant_blkbits = PAGE_SHIFT;
3487                         grant += cli->cl_dirty_grant;
3488                 } else {
3489                         grant += cli->cl_dirty_pages << PAGE_SHIFT;
3490                 }
3491                 data->ocd_grant = grant ? : 2 * cli_brw_size(obd);
3492                 lost_grant = cli->cl_lost_grant;
3493                 cli->cl_lost_grant = 0;
3494                 spin_unlock(&cli->cl_loi_list_lock);
3495
3496                 CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d"
3497                        " ocd_grant: %d, lost: %ld.\n", data->ocd_connect_flags,
3498                        data->ocd_version, data->ocd_grant, lost_grant);
3499         }
3500
3501         RETURN(0);
3502 }
3503 EXPORT_SYMBOL(osc_reconnect);
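
/*
 * Worked example for the grant restoration in osc_reconnect() above,
 * using hypothetical values: cl_avail_grant = 4 MiB, cl_reserved_grant
 * = 1 MiB and 2 MiB of dirty pages yield a requested ocd_grant of
 * 7 MiB on reconnect; a client holding no grant at all falls back to
 * 2 * cli_brw_size(obd).
 */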
3504
3505 int osc_disconnect(struct obd_export *exp)
3506 {
3507         struct obd_device *obd = class_exp2obd(exp);
3508         int rc;
3509
3510         rc = client_disconnect_export(exp);
3511         /**
3512          * Initially we put del_shrink_grant before disconnect_export, but it
3513          * causes the following problem if setup (connect) and cleanup
3514          * (disconnect) are tangled together.
3515          *      connect p1                     disconnect p2
3516          *   ptlrpc_connect_import
3517          *     ...............               class_manual_cleanup
3518          *                                     osc_disconnect
3519          *                                     del_shrink_grant
3520          *   ptlrpc_connect_interrupt
3521          *     osc_init_grant
3522          *   add this client to shrink list
3523          *                                      cleanup_osc
3524          * Bang! grant shrink thread trigger the shrink. BUG18662
3525          */
3526         osc_del_grant_list(&obd->u.cli);
3527         return rc;
3528 }
3529 EXPORT_SYMBOL(osc_disconnect);
3530
3531 int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd,
3532                                  struct hlist_node *hnode, void *arg)
3533 {
3534         struct lu_env *env = arg;
3535         struct ldlm_resource *res = cfs_hash_object(hs, hnode);
3536         struct ldlm_lock *lock;
3537         struct osc_object *osc = NULL;
3538         ENTRY;
3539
3540         lock_res(res);
3541         list_for_each_entry(lock, &res->lr_granted, l_res_link) {
3542                 if (lock->l_ast_data != NULL && osc == NULL) {
3543                         osc = lock->l_ast_data;
3544                         cl_object_get(osc2cl(osc));
3545                 }
3546
3547                 /* clear LDLM_FL_CLEANED flag to make sure it will be canceled
3548                  * by the 2nd round of ldlm_namespace_cleanup() in
3549                  * osc_import_event(). */
3550                 ldlm_clear_cleaned(lock);
3551         }
3552         unlock_res(res);
3553
3554         if (osc != NULL) {
3555                 osc_object_invalidate(env, osc);
3556                 cl_object_put(env, osc2cl(osc));
3557         }
3558
3559         RETURN(0);
3560 }
3561 EXPORT_SYMBOL(osc_ldlm_resource_invalidate);
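
/*
 * Usage sketch (as done in osc_import_event() below): iterate every
 * resource in the namespace and invalidate the backing osc_object:
 *
 *	cfs_hash_for_each_nolock(ns->ns_rs_hash,
 *				 osc_ldlm_resource_invalidate, env, 0);
 */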
3562
3563 static int osc_import_event(struct obd_device *obd,
3564                             struct obd_import *imp,
3565                             enum obd_import_event event)
3566 {
3567         struct client_obd *cli;
3568         int rc = 0;
3569
3570         ENTRY;
3571         LASSERT(imp->imp_obd == obd);
3572
3573         switch (event) {
3574         case IMP_EVENT_DISCON: {
3575                 cli = &obd->u.cli;
3576                 spin_lock(&cli->cl_loi_list_lock);
3577                 cli->cl_avail_grant = 0;
3578                 cli->cl_lost_grant = 0;
3579                 spin_unlock(&cli->cl_loi_list_lock);
3580                 break;
3581         }
3582         case IMP_EVENT_INACTIVE: {
3583                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE);
3584                 break;
3585         }
3586         case IMP_EVENT_INVALIDATE: {
3587                 struct ldlm_namespace *ns = obd->obd_namespace;
3588                 struct lu_env         *env;
3589                 __u16                  refcheck;
3590
3591                 ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
3592
3593                 env = cl_env_get(&refcheck);
3594                 if (!IS_ERR(env)) {
3595                         osc_io_unplug(env, &obd->u.cli, NULL);
3596
3597                         cfs_hash_for_each_nolock(ns->ns_rs_hash,
3598                                                  osc_ldlm_resource_invalidate,
3599                                                  env, 0);
3600                         cl_env_put(env, &refcheck);
3601
3602                         ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
3603                 } else
3604                         rc = PTR_ERR(env);
3605                 break;
3606         }
3607         case IMP_EVENT_ACTIVE: {
3608                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE);
3609                 break;
3610         }
3611         case IMP_EVENT_OCD: {
3612                 struct obd_connect_data *ocd = &imp->imp_connect_data;
3613
3614                 if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT)
3615                         osc_init_grant(&obd->u.cli, ocd);
3616
3617                 /* See bug 7198 */
3618                 if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL)
3619                         imp->imp_client->cli_request_portal = OST_REQUEST_PORTAL;
3620
3621                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD);
3622                 break;
3623         }
3624         case IMP_EVENT_DEACTIVATE: {
3625                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DEACTIVATE);
3626                 break;
3627         }
3628         case IMP_EVENT_ACTIVATE: {
3629                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVATE);
3630                 break;
3631         }
3632         default:
3633                 CERROR("Unknown import event %d\n", event);
3634                 LBUG();
3635         }
3636         RETURN(rc);
3637 }
3638
3639 /**
3640  * Determine whether the lock can be canceled before replaying the lock
3641  * during recovery, see bug16774 for detailed information.
3642  *
3643  * \retval zero the lock can't be canceled
3644  * \retval other ok to cancel
3645  */
3646 static int osc_cancel_weight(struct ldlm_lock *lock)
3647 {
3648         /*
3649          * Cancel all unused and granted extent lock.
3650          */
3651         if (lock->l_resource->lr_type == LDLM_EXTENT &&
3652             ldlm_is_granted(lock) &&
3653             osc_ldlm_weigh_ast(lock) == 0)
3654                 RETURN(1);
3655
3656         RETURN(0);
3657 }
3658
3659 static int brw_queue_work(const struct lu_env *env, void *data)
3660 {
3661         struct client_obd *cli = data;
3662
3663         CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli);
3664
3665         osc_io_unplug(env, cli, NULL);
3666         RETURN(0);
3667 }
3668
3669 int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg)
3670 {
3671         struct client_obd *cli = &obd->u.cli;
3672         void *handler;
3673         int rc;
3674
3675         ENTRY;
3676
3677         rc = ptlrpcd_addref();
3678         if (rc)
3679                 RETURN(rc);
3680
3681         rc = client_obd_setup(obd, lcfg);
3682         if (rc)
3683                 GOTO(out_ptlrpcd, rc);
3684
3686         handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli);
3687         if (IS_ERR(handler))
3688                 GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler));
3689         cli->cl_writeback_work = handler;
3690
3691         handler = ptlrpcd_alloc_work(cli->cl_import, lru_queue_work, cli);
3692         if (IS_ERR(handler))
3693                 GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler));
3694         cli->cl_lru_work = handler;
3695
3696         rc = osc_quota_setup(obd);
3697         if (rc)
3698                 GOTO(out_ptlrpcd_work, rc);
3699
3700         cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL;
3701         cli->cl_root_squash = 0;
3702         osc_update_next_shrink(cli);
3703
3704         RETURN(rc);
3705
3706 out_ptlrpcd_work:
3707         if (cli->cl_writeback_work != NULL) {
3708                 ptlrpcd_destroy_work(cli->cl_writeback_work);
3709                 cli->cl_writeback_work = NULL;
3710         }
3711         if (cli->cl_lru_work != NULL) {
3712                 ptlrpcd_destroy_work(cli->cl_lru_work);
3713                 cli->cl_lru_work = NULL;
3714         }
3715         client_obd_cleanup(obd);
3716 out_ptlrpcd:
3717         ptlrpcd_decref();
3718         RETURN(rc);
3719 }
3720 EXPORT_SYMBOL(osc_setup_common);
3721
3722 int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
3723 {
3724         struct client_obd *cli = &obd->u.cli;
3725         int                adding;
3726         int                added;
3727         int                req_count;
3728         int                rc;
3729
3730         ENTRY;
3731
3732         rc = osc_setup_common(obd, lcfg);
3733         if (rc < 0)
3734                 RETURN(rc);
3735
3736         rc = osc_tunables_init(obd);
3737         if (rc)
3738                 RETURN(rc);
3739
3740         /*
3741          * We try to control the total number of requests with an upper limit
3742          * osc_reqpool_maxreqcount. There might be some race which will cause
3743          * over-limit allocation, but it is fine.
3744          */
3745         req_count = atomic_read(&osc_pool_req_count);
3746         if (req_count < osc_reqpool_maxreqcount) {
3747                 adding = cli->cl_max_rpcs_in_flight + 2;
3748                 if (req_count + adding > osc_reqpool_maxreqcount)
3749                         adding = osc_reqpool_maxreqcount - req_count;
3750
3751                 added = ptlrpc_add_rqs_to_pool(osc_rq_pool, adding);
3752                 atomic_add(added, &osc_pool_req_count);
3753         }
3754
3755         ns_register_cancel(obd->obd_namespace, osc_cancel_weight);
3756
3757         spin_lock(&osc_shrink_lock);
3758         list_add_tail(&cli->cl_shrink_list, &osc_shrink_list);
3759         spin_unlock(&osc_shrink_lock);
3760         cli->cl_import->imp_idle_timeout = osc_idle_timeout;
3761         cli->cl_import->imp_idle_debug = D_HA;
3762
3763         RETURN(0);
3764 }
3765
3766 int osc_precleanup_common(struct obd_device *obd)
3767 {
3768         struct client_obd *cli = &obd->u.cli;
3769         ENTRY;
3770
3771         /* LU-464
3772          * for echo client, export may be on zombie list, wait for
3773          * zombie thread to cull it, because cli.cl_import will be
3774          * cleared in client_disconnect_export():
3775          *   class_export_destroy() -> obd_cleanup() ->
3776          *   echo_device_free() -> echo_client_cleanup() ->
3777          *   obd_disconnect() -> osc_disconnect() ->
3778          *   client_disconnect_export()
3779          */
3780         obd_zombie_barrier();
3781         if (cli->cl_writeback_work) {
3782                 ptlrpcd_destroy_work(cli->cl_writeback_work);
3783                 cli->cl_writeback_work = NULL;
3784         }
3785
3786         if (cli->cl_lru_work) {
3787                 ptlrpcd_destroy_work(cli->cl_lru_work);
3788                 cli->cl_lru_work = NULL;
3789         }
3790
3791         obd_cleanup_client_import(obd);
3792         RETURN(0);
3793 }
3794 EXPORT_SYMBOL(osc_precleanup_common);
3795
3796 static int osc_precleanup(struct obd_device *obd)
3797 {
3798         ENTRY;
3799
3800         osc_precleanup_common(obd);
3801
3802         ptlrpc_lprocfs_unregister_obd(obd);
3803         RETURN(0);
3804 }
3805
3806 int osc_cleanup_common(struct obd_device *obd)
3807 {
3808         struct client_obd *cli = &obd->u.cli;
3809         int rc;
3810
3811         ENTRY;
3812
3813         spin_lock(&osc_shrink_lock);
3814         list_del(&cli->cl_shrink_list);
3815         spin_unlock(&osc_shrink_lock);
3816
3817         /* lru cleanup */
3818         if (cli->cl_cache != NULL) {
3819                 LASSERT(refcount_read(&cli->cl_cache->ccc_users) > 0);
3820                 spin_lock(&cli->cl_cache->ccc_lru_lock);
3821                 list_del_init(&cli->cl_lru_osc);
3822                 spin_unlock(&cli->cl_cache->ccc_lru_lock);
3823                 cli->cl_lru_left = NULL;
3824                 cl_cache_decref(cli->cl_cache);
3825                 cli->cl_cache = NULL;
3826         }
3827
3828         /* free memory of osc quota cache */
3829         osc_quota_cleanup(obd);
3830
3831         rc = client_obd_cleanup(obd);
3832
3833         ptlrpcd_decref();
3834         RETURN(rc);
3835 }
3836 EXPORT_SYMBOL(osc_cleanup_common);
3837
3838 static const struct obd_ops osc_obd_ops = {
3839         .o_owner                = THIS_MODULE,
3840         .o_setup                = osc_setup,
3841         .o_precleanup           = osc_precleanup,
3842         .o_cleanup              = osc_cleanup_common,
3843         .o_add_conn             = client_import_add_conn,
3844         .o_del_conn             = client_import_del_conn,
3845         .o_connect              = client_connect_import,
3846         .o_reconnect            = osc_reconnect,
3847         .o_disconnect           = osc_disconnect,
3848         .o_statfs               = osc_statfs,
3849         .o_statfs_async         = osc_statfs_async,
3850         .o_create               = osc_create,
3851         .o_destroy              = osc_destroy,
3852         .o_getattr              = osc_getattr,
3853         .o_setattr              = osc_setattr,
3854         .o_iocontrol            = osc_iocontrol,
3855         .o_set_info_async       = osc_set_info_async,
3856         .o_import_event         = osc_import_event,
3857         .o_quotactl             = osc_quotactl,
3858 };
3859
3860 LIST_HEAD(osc_shrink_list);
3861 DEFINE_SPINLOCK(osc_shrink_lock);
3862
3863 #ifdef HAVE_SHRINKER_COUNT
3864 static struct shrinker osc_cache_shrinker = {
3865         .count_objects  = osc_cache_shrink_count,
3866         .scan_objects   = osc_cache_shrink_scan,
3867         .seeks          = DEFAULT_SEEKS,
3868 };
3869 #else
3870 static int osc_cache_shrink(struct shrinker *shrinker,
3871                             struct shrink_control *sc)
3872 {
3873         (void)osc_cache_shrink_scan(shrinker, sc);
3874
3875         return osc_cache_shrink_count(shrinker, sc);
3876 }
3877
3878 static struct shrinker osc_cache_shrinker = {
3879         .shrink   = osc_cache_shrink,
3880         .seeks    = DEFAULT_SEEKS,
3881 };
3882 #endif
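
/*
 * Note: on kernels without the split count/scan shrinker API (no
 * HAVE_SHRINKER_COUNT), the single .shrink callback is expected to both
 * scan and report the remaining object count, which the combined
 * osc_cache_shrink() wrapper above emulates.
 */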
3883
3884 static int __init osc_init(void)
3885 {
3886         unsigned int reqpool_size;
3887         unsigned int reqsize;
3888         int rc;
3889         ENTRY;
3890
3891         /* print an address of _any_ initialized kernel symbol from this
3892          * module, to allow debugging with gdb that doesn't support data
3893          * symbols from modules. */
3894         CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches);
3895
3896         rc = lu_kmem_init(osc_caches);
3897         if (rc)
3898                 RETURN(rc);
3899
3900         rc = register_shrinker(&osc_cache_shrinker);
3901         if (rc)
3902                 GOTO(out_kmem, rc);
3903
3904         /* This is obviously too much memory; we only prevent overflow here */
3905         if (osc_reqpool_mem_max >= 1 << 12 || osc_reqpool_mem_max == 0)
3906                 GOTO(out_shrinker, rc = -EINVAL);
3907
3908         reqpool_size = osc_reqpool_mem_max << 20;
3909
3910         reqsize = 1;
3911         while (reqsize < OST_IO_MAXREQSIZE)
3912                 reqsize = reqsize << 1;
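
        /* reqsize is now the smallest power of two >= OST_IO_MAXREQSIZE;
         * e.g. a hypothetical 5 MB OST_IO_MAXREQSIZE would round up to
         * 8 MB, making osc_reqpool_maxreqcount below conservative. */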
3913
3914         /*
3915          * We don't enlarge the request count in the OSC pool according to
3916          * cl_max_rpcs_in_flight. Allocation from the pool is only tried after
3917          * normal allocation fails, so a small OSC pool won't cause much
3918          * performance degradation in most cases.
3919          */
3920         osc_reqpool_maxreqcount = reqpool_size / reqsize;
3921
3922         atomic_set(&osc_pool_req_count, 0);
3923         osc_rq_pool = ptlrpc_init_rq_pool(0, OST_IO_MAXREQSIZE,
3924                                           ptlrpc_add_rqs_to_pool);
3925
3926         if (osc_rq_pool == NULL)
3927                 GOTO(out_shrinker, rc = -ENOMEM);
3928
3929         rc = osc_start_grant_work();
3930         if (rc != 0)
3931                 GOTO(out_req_pool, rc);
3932
3933         rc = class_register_type(&osc_obd_ops, NULL, true,
3934                                  LUSTRE_OSC_NAME, &osc_device_type);
3935         if (rc < 0)
3936                 GOTO(out_stop_grant, rc);
3937
3938         RETURN(rc);
3939
3940 out_stop_grant:
3941         osc_stop_grant_work();
3942 out_req_pool:
3943         ptlrpc_free_rq_pool(osc_rq_pool);
3944 out_shrinker:
3945         unregister_shrinker(&osc_cache_shrinker);
3946 out_kmem:
3947         lu_kmem_fini(osc_caches);
3948
3949         RETURN(rc);
3950 }
3951
3952 static void __exit osc_exit(void)
3953 {
3954         class_unregister_type(LUSTRE_OSC_NAME);
3955         ptlrpc_free_rq_pool(osc_rq_pool);
3956         osc_stop_grant_work();
3957         unregister_shrinker(&osc_cache_shrinker);
3958         lu_kmem_fini(osc_caches);
3959 }
3960
3961 MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>");
3962 MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)");
3963 MODULE_VERSION(LUSTRE_VERSION_STRING);
3964 MODULE_LICENSE("GPL");
3965
3966 module_init(osc_init);
3967 module_exit(osc_exit);