lustre/osc/osc_request.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  */
  31
  32 #define DEBUG_SUBSYSTEM S_OSC
  33
  34 #include <linux/workqueue.h>
  35 #include <libcfs/libcfs.h>
  36 #include <linux/falloc.h>
  37 #include <lprocfs_status.h>
  38 #include <lustre_dlm.h>
  39 #include <lustre_fid.h>
  40 #include <lustre_ha.h>
  41 #include <uapi/linux/lustre/lustre_ioctl.h>
  42 #include <lustre_net.h>
  43 #include <lustre_obdo.h>
  44 #include <obd.h>
  45 #include <obd_cksum.h>
  46 #include <obd_class.h>
  47 #include <lustre_osc.h>
  48 #include <linux/falloc.h>
  49
  50 #include "osc_internal.h"
  51 #include <lnet/lnet_rdma.h>
  52
  53 atomic_t osc_pool_req_count;
  54 unsigned int osc_reqpool_maxreqcount;
  55 struct ptlrpc_request_pool *osc_rq_pool;
  56
  57 /* max memory used for request pool, unit is MB */
  58 static unsigned int osc_reqpool_mem_max = 5;
  59 module_param(osc_reqpool_mem_max, uint, 0444);
  60
  61 static int osc_idle_timeout = 20;
  62 module_param(osc_idle_timeout, uint, 0644);
  63
  64 #define osc_grant_args osc_brw_async_args
  65
  66 struct osc_setattr_args {
  67         struct obdo             *sa_oa;
  68         obd_enqueue_update_f     sa_upcall;
  69         void                    *sa_cookie;
  70 };
  71
  72 struct osc_fsync_args {
  73         struct osc_object       *fa_obj;
  74         struct obdo             *fa_oa;
  75         obd_enqueue_update_f    fa_upcall;
  76         void                    *fa_cookie;
  77 };
  78
  79 struct osc_ladvise_args {
  80         struct obdo             *la_oa;
  81         obd_enqueue_update_f     la_upcall;
  82         void                    *la_cookie;
  83 };
  84
  85 static void osc_release_ppga(struct brw_page **ppga, size_t count);
  86 static int brw_interpret(const struct lu_env *env, struct ptlrpc_request *req,
  87                          void *data, int rc);
  88
  89 void osc_pack_req_body(struct ptlrpc_request *req, struct obdo *oa)
  90 {
  91         struct ost_body *body;
  92
  93         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
  94         LASSERT(body);
  95
  96         lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
  97 }
  98
  99 static int osc_getattr(const struct lu_env *env, struct obd_export *exp,
 100                        struct obdo *oa)
 101 {
 102         struct ptlrpc_request   *req;
 103         struct ost_body         *body;
 104         int                      rc;
 105
 106         ENTRY;
 107         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR);
 108         if (req == NULL)
 109                 RETURN(-ENOMEM);
 110
 111         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR);
 112         if (rc) {
 113                 ptlrpc_request_free(req);
 114                 RETURN(rc);
 115         }
 116
 117         osc_pack_req_body(req, oa);
 118
 119         ptlrpc_request_set_replen(req);
 120
 121         rc = ptlrpc_queue_wait(req);
 122         if (rc)
 123                 GOTO(out, rc);
 124
 125         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
 126         if (body == NULL)
 127                 GOTO(out, rc = -EPROTO);
 128
 129         CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
 130         lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa);
 131
 132         oa->o_blksize = cli_brw_size(exp->exp_obd);
 133         oa->o_valid |= OBD_MD_FLBLKSZ;
 134
 135         EXIT;
 136 out:
 137         ptlrpc_req_finished(req);
 138
 139         return rc;
 140 }
 141
 142 static int osc_setattr(const struct lu_env *env, struct obd_export *exp,
 143                        struct obdo *oa)
 144 {
 145         struct ptlrpc_request   *req;
 146         struct ost_body         *body;
 147         int                      rc;
 148
 149         ENTRY;
 150         LASSERT(oa->o_valid & OBD_MD_FLGROUP);
 151
 152         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR);
 153         if (req == NULL)
 154                 RETURN(-ENOMEM);
 155
 156         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR);
 157         if (rc) {
 158                 ptlrpc_request_free(req);
 159                 RETURN(rc);
 160         }
 161
 162         osc_pack_req_body(req, oa);
 163
 164         ptlrpc_request_set_replen(req);
 165
 166         rc = ptlrpc_queue_wait(req);
 167         if (rc)
 168                 GOTO(out, rc);
 169
 170         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
 171         if (body == NULL)
 172                 GOTO(out, rc = -EPROTO);
 173
 174         lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa);
 175
 176         EXIT;
 177 out:
 178         ptlrpc_req_finished(req);
 179
 180         RETURN(rc);
 181 }
 182
 183 static int osc_setattr_interpret(const struct lu_env *env,
 184                                  struct ptlrpc_request *req, void *args, int rc)
 185 {
 186         struct osc_setattr_args *sa = args;
 187         struct ost_body *body;
 188
 189         ENTRY;
 190
 191         if (rc != 0)
 192                 GOTO(out, rc);
 193
 194         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
 195         if (body == NULL)
 196                 GOTO(out, rc = -EPROTO);
 197
 198         lustre_get_wire_obdo(&req->rq_import->imp_connect_data, sa->sa_oa,
 199                              &body->oa);
 200 out:
 201         rc = sa->sa_upcall(sa->sa_cookie, rc);
 202         RETURN(rc);
 203 }
 204
 205 int osc_setattr_async(struct obd_export *exp, struct obdo *oa,
 206                       obd_enqueue_update_f upcall, void *cookie,
 207                       struct ptlrpc_request_set *rqset)
 208 {
 209         struct ptlrpc_request   *req;
 210         struct osc_setattr_args *sa;
 211         int                      rc;
 212
 213         ENTRY;
 214
 215         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR);
 216         if (req == NULL)
 217                 RETURN(-ENOMEM);
 218
 219         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR);
 220         if (rc) {
 221                 ptlrpc_request_free(req);
 222                 RETURN(rc);
 223         }
 224
 225         osc_pack_req_body(req, oa);
 226
 227         ptlrpc_request_set_replen(req);
 228
 229         /* do mds to ost setattr asynchronously */
 230         if (!rqset) {
 231                 /* Do not wait for response. */
 232                 ptlrpcd_add_req(req);
 233         } else {
 234                 req->rq_interpret_reply = osc_setattr_interpret;
 235
 236                 sa = ptlrpc_req_async_args(sa, req);
 237                 sa->sa_oa = oa;
 238                 sa->sa_upcall = upcall;
 239                 sa->sa_cookie = cookie;
 240
 241                 ptlrpc_set_add_req(rqset, req);
 242         }
 243
 244         RETURN(0);
 245 }
 246
 247 static int osc_ladvise_interpret(const struct lu_env *env,
 248                                  struct ptlrpc_request *req,
 249                                  void *arg, int rc)
 250 {
 251         struct osc_ladvise_args *la = arg;
 252         struct ost_body *body;
 253         ENTRY;
 254
 255         if (rc != 0)
 256                 GOTO(out, rc);
 257
 258         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
 259         if (body == NULL)
 260                 GOTO(out, rc = -EPROTO);
 261
 262         *la->la_oa = body->oa;
 263 out:
 264         rc = la->la_upcall(la->la_cookie, rc);
 265         RETURN(rc);
 266 }
 267
 268 /**
 269  * If rqset is NULL, do not wait for response. Upcall and cookie could also
 270  * be NULL in this case
 271  */
 272 int osc_ladvise_base(struct obd_export *exp, struct obdo *oa,
 273                      struct ladvise_hdr *ladvise_hdr,
 274                      obd_enqueue_update_f upcall, void *cookie,
 275                      struct ptlrpc_request_set *rqset)
 276 {
 277         struct ptlrpc_request   *req;
 278         struct ost_body         *body;
 279         struct osc_ladvise_args *la;
 280         int                      rc;
 281         struct lu_ladvise       *req_ladvise;
 282         struct lu_ladvise       *ladvise = ladvise_hdr->lah_advise;
 283         int                      num_advise = ladvise_hdr->lah_count;
 284         struct ladvise_hdr      *req_ladvise_hdr;
 285         ENTRY;
 286
 287         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_LADVISE);
 288         if (req == NULL)
 289                 RETURN(-ENOMEM);
 290
 291         req_capsule_set_size(&req->rq_pill, &RMF_OST_LADVISE, RCL_CLIENT,
 292                              num_advise * sizeof(*ladvise));
 293         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_LADVISE);
 294         if (rc != 0) {
 295                 ptlrpc_request_free(req);
 296                 RETURN(rc);
 297         }
 298         req->rq_request_portal = OST_IO_PORTAL;
 299         ptlrpc_at_set_req_timeout(req);
 300
 301         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
 302         LASSERT(body);
 303         lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa,
 304                              oa);
 305
 306         req_ladvise_hdr = req_capsule_client_get(&req->rq_pill,
 307                                                  &RMF_OST_LADVISE_HDR);
 308         memcpy(req_ladvise_hdr, ladvise_hdr, sizeof(*ladvise_hdr));
 309
 310         req_ladvise = req_capsule_client_get(&req->rq_pill, &RMF_OST_LADVISE);
 311         memcpy(req_ladvise, ladvise, sizeof(*ladvise) * num_advise);
 312         ptlrpc_request_set_replen(req);
 313
 314         if (rqset == NULL) {
 315                 /* Do not wait for response. */
 316                 ptlrpcd_add_req(req);
 317                 RETURN(0);
 318         }
 319
 320         req->rq_interpret_reply = osc_ladvise_interpret;
 321         la = ptlrpc_req_async_args(la, req);
 322         la->la_oa = oa;
 323         la->la_upcall = upcall;
 324         la->la_cookie = cookie;
 325
 326         ptlrpc_set_add_req(rqset, req);
 327
 328         RETURN(0);
 329 }
 330
 331 static int osc_create(const struct lu_env *env, struct obd_export *exp,
 332                       struct obdo *oa)
 333 {
 334         struct ptlrpc_request *req;
 335         struct ost_body       *body;
 336         int                    rc;
 337         ENTRY;
 338
 339         LASSERT(oa != NULL);
 340         LASSERT(oa->o_valid & OBD_MD_FLGROUP);
 341         LASSERT(fid_seq_is_echo(ostid_seq(&oa->o_oi)));
 342
 343         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_CREATE);
 344         if (req == NULL)
 345                 GOTO(out, rc = -ENOMEM);
 346
 347         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE);
 348         if (rc) {
 349                 ptlrpc_request_free(req);
 350                 GOTO(out, rc);
 351         }
 352
 353         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
 354         LASSERT(body);
 355
 356         lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
 357
 358         ptlrpc_request_set_replen(req);
 359
 360         rc = ptlrpc_queue_wait(req);
 361         if (rc)
 362                 GOTO(out_req, rc);
 363
 364         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
 365         if (body == NULL)
 366                 GOTO(out_req, rc = -EPROTO);
 367
 368         CDEBUG(D_INFO, "oa flags %x\n", oa->o_flags);
 369         lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa);
 370
 371         oa->o_blksize = cli_brw_size(exp->exp_obd);
 372         oa->o_valid |= OBD_MD_FLBLKSZ;
 373
 374         CDEBUG(D_HA, "transno: %lld\n",
 375                lustre_msg_get_transno(req->rq_repmsg));
 376 out_req:
 377         ptlrpc_req_finished(req);
 378 out:
 379         RETURN(rc);
 380 }
 381
 382 int osc_punch_send(struct obd_export *exp, struct obdo *oa,
 383                    obd_enqueue_update_f upcall, void *cookie)
 384 {
 385         struct ptlrpc_request *req;
 386         struct osc_setattr_args *sa;
 387         struct obd_import *imp = class_exp2cliimp(exp);
 388         struct ost_body *body;
 389         int rc;
 390
 391         ENTRY;
 392
 393         req = ptlrpc_request_alloc(imp, &RQF_OST_PUNCH);
 394         if (req == NULL)
 395                 RETURN(-ENOMEM);
 396
 397         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH);
 398         if (rc < 0) {
 399                 ptlrpc_request_free(req);
 400                 RETURN(rc);
 401         }
 402
 403         osc_set_io_portal(req);
 404
 405         ptlrpc_at_set_req_timeout(req);
 406
 407         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
 408
 409         lustre_set_wire_obdo(&imp->imp_connect_data, &body->oa, oa);
 410
 411         ptlrpc_request_set_replen(req);
 412
 413         req->rq_interpret_reply = osc_setattr_interpret;
 414         sa = ptlrpc_req_async_args(sa, req);
 415         sa->sa_oa = oa;
 416         sa->sa_upcall = upcall;
 417         sa->sa_cookie = cookie;
 418
 419         ptlrpcd_add_req(req);
 420
 421         RETURN(0);
 422 }
 423 EXPORT_SYMBOL(osc_punch_send);
 424
 425 /**
 426  * osc_fallocate_base() - Handles fallocate request.
 427  *
 428  * @exp:        Export structure
 429  * @oa:         Attributes passed to OSS from client (obdo structure)
 430  * @upcall:     Primary & supplementary group information
 431  * @cookie:     Exclusive identifier
 432  * @rqset:      Request list.
 433  * @mode:       Operation done on given range.
 434  *
 435  * osc_fallocate_base() - Handles fallocate requests only. Only block
 436  * allocation or standard preallocate operation is supported currently.
 437  * Other mode flags is not supported yet. ftruncate(2) or truncate(2)
 438  * is supported via SETATTR request.
 439  *
 440  * Return: Non-zero on failure and O on success.
 441  */
 442 int osc_fallocate_base(struct obd_export *exp, struct obdo *oa,
 443                        obd_enqueue_update_f upcall, void *cookie, int mode)
 444 {
 445         struct ptlrpc_request *req;
 446         struct osc_setattr_args *sa;
 447         struct ost_body *body;
 448         struct obd_import *imp = class_exp2cliimp(exp);
 449         int rc;
 450         ENTRY;
 451
 452         oa->o_falloc_mode = mode;
 453         req = ptlrpc_request_alloc(class_exp2cliimp(exp),
 454                                    &RQF_OST_FALLOCATE);
 455         if (req == NULL)
 456                 RETURN(-ENOMEM);
 457
 458         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_FALLOCATE);
 459         if (rc != 0) {
 460                 ptlrpc_request_free(req);
 461                 RETURN(rc);
 462         }
 463
 464         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
 465         LASSERT(body);
 466
 467         lustre_set_wire_obdo(&imp->imp_connect_data, &body->oa, oa);
 468
 469         ptlrpc_request_set_replen(req);
 470
 471         req->rq_interpret_reply = osc_setattr_interpret;
 472         BUILD_BUG_ON(sizeof(*sa) > sizeof(req->rq_async_args));
 473         sa = ptlrpc_req_async_args(sa, req);
 474         sa->sa_oa = oa;
 475         sa->sa_upcall = upcall;
 476         sa->sa_cookie = cookie;
 477
 478         ptlrpcd_add_req(req);
 479
 480         RETURN(0);
 481 }
 482 EXPORT_SYMBOL(osc_fallocate_base);
 483
 484 static int osc_sync_interpret(const struct lu_env *env,
 485                               struct ptlrpc_request *req, void *args, int rc)
 486 {
 487         struct osc_fsync_args *fa = args;
 488         struct ost_body *body;
 489         struct cl_attr *attr = &osc_env_info(env)->oti_attr;
 490         unsigned long valid = 0;
 491         struct cl_object *obj;
 492         ENTRY;
 493
 494         if (rc != 0)
 495                 GOTO(out, rc);
 496
 497         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
 498         if (body == NULL) {
 499                 CERROR("can't unpack ost_body\n");
 500                 GOTO(out, rc = -EPROTO);
 501         }
 502
 503         *fa->fa_oa = body->oa;
 504         obj = osc2cl(fa->fa_obj);
 505
 506         /* Update osc object's blocks attribute */
 507         cl_object_attr_lock(obj);
 508         if (body->oa.o_valid & OBD_MD_FLBLOCKS) {
 509                 attr->cat_blocks = body->oa.o_blocks;
 510                 valid |= CAT_BLOCKS;
 511         }
 512
 513         if (valid != 0)
 514                 cl_object_attr_update(env, obj, attr, valid);
 515         cl_object_attr_unlock(obj);
 516
 517 out:
 518         rc = fa->fa_upcall(fa->fa_cookie, rc);
 519         RETURN(rc);
 520 }
 521
 522 int osc_sync_base(struct osc_object *obj, struct obdo *oa,
 523                   obd_enqueue_update_f upcall, void *cookie,
 524                   struct ptlrpc_request_set *rqset)
 525 {
 526         struct obd_export     *exp = osc_export(obj);
 527         struct ptlrpc_request *req;
 528         struct ost_body       *body;
 529         struct osc_fsync_args *fa;
 530         int                    rc;
 531         ENTRY;
 532
 533         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC);
 534         if (req == NULL)
 535                 RETURN(-ENOMEM);
 536
 537         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SYNC);
 538         if (rc) {
 539                 ptlrpc_request_free(req);
 540                 RETURN(rc);
 541         }
 542
 543         /* overload the size and blocks fields in the oa with start/end */
 544         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
 545         LASSERT(body);
 546         lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
 547
 548         ptlrpc_request_set_replen(req);
 549         req->rq_interpret_reply = osc_sync_interpret;
 550
 551         fa = ptlrpc_req_async_args(fa, req);
 552         fa->fa_obj = obj;
 553         fa->fa_oa = oa;
 554         fa->fa_upcall = upcall;
 555         fa->fa_cookie = cookie;
 556
 557         ptlrpc_set_add_req(rqset, req);
 558
 559         RETURN (0);
 560 }
 561
 562 /* Find and cancel locally locks matched by @mode in the resource found by
 563  * @objid. Found locks are added into @cancel list. Returns the amount of
 564  * locks added to @cancels list. */
 565 static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa,
 566                                    struct list_head *cancels,
 567                                    enum ldlm_mode mode, __u64 lock_flags)
 568 {
 569         struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
 570         struct ldlm_res_id res_id;
 571         struct ldlm_resource *res;
 572         int count;
 573         ENTRY;
 574
 575         /* Return, i.e. cancel nothing, only if ELC is supported (flag in
 576          * export) but disabled through procfs (flag in NS).
 577          *
 578          * This distinguishes from a case when ELC is not supported originally,
 579          * when we still want to cancel locks in advance and just cancel them
 580          * locally, without sending any RPC. */
 581         if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns))
 582                 RETURN(0);
 583
 584         ostid_build_res_name(&oa->o_oi, &res_id);
 585         res = ldlm_resource_get(ns, &res_id, 0, 0);
 586         if (IS_ERR(res))
 587                 RETURN(0);
 588
 589         LDLM_RESOURCE_ADDREF(res);
 590         count = ldlm_cancel_resource_local(res, cancels, NULL, mode,
 591                                            lock_flags, 0, NULL);
 592         LDLM_RESOURCE_DELREF(res);
 593         ldlm_resource_putref(res);
 594         RETURN(count);
 595 }
 596
 597 static int osc_destroy_interpret(const struct lu_env *env,
 598                                  struct ptlrpc_request *req, void *args, int rc)
 599 {
 600         struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
 601
 602         atomic_dec(&cli->cl_destroy_in_flight);
 603         wake_up(&cli->cl_destroy_waitq);
 604
 605         return 0;
 606 }
 607
 608 static int osc_can_send_destroy(struct client_obd *cli)
 609 {
 610         if (atomic_inc_return(&cli->cl_destroy_in_flight) <=
 611             cli->cl_max_rpcs_in_flight) {
 612                 /* The destroy request can be sent */
 613                 return 1;
 614         }
 615         if (atomic_dec_return(&cli->cl_destroy_in_flight) <
 616             cli->cl_max_rpcs_in_flight) {
 617                 /*
 618                  * The counter has been modified between the two atomic
 619                  * operations.
 620                  */
 621                 wake_up(&cli->cl_destroy_waitq);
 622         }
 623         return 0;
 624 }
 625
 626 static int osc_destroy(const struct lu_env *env, struct obd_export *exp,
 627                        struct obdo *oa)
 628 {
 629         struct client_obd     *cli = &exp->exp_obd->u.cli;
 630         struct ptlrpc_request *req;
 631         struct ost_body       *body;
 632         LIST_HEAD(cancels);
 633         int rc, count;
 634         ENTRY;
 635
 636         if (!oa) {
 637                 CDEBUG(D_INFO, "oa NULL\n");
 638                 RETURN(-EINVAL);
 639         }
 640
 641         count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW,
 642                                         LDLM_FL_DISCARD_DATA);
 643
 644         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_DESTROY);
 645         if (req == NULL) {
 646                 ldlm_lock_list_put(&cancels, l_bl_ast, count);
 647                 RETURN(-ENOMEM);
 648         }
 649
 650         rc = ldlm_prep_elc_req(exp, req, LUSTRE_OST_VERSION, OST_DESTROY,
 651                                0, &cancels, count);
 652         if (rc) {
 653                 ptlrpc_request_free(req);
 654                 RETURN(rc);
 655         }
 656
 657         req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
 658         ptlrpc_at_set_req_timeout(req);
 659
 660         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
 661         LASSERT(body);
 662         lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
 663
 664         ptlrpc_request_set_replen(req);
 665
 666         req->rq_interpret_reply = osc_destroy_interpret;
 667         if (!osc_can_send_destroy(cli)) {
 668                 /*
 669                  * Wait until the number of on-going destroy RPCs drops
 670                  * under max_rpc_in_flight
 671                  */
 672                 rc = l_wait_event_abortable_exclusive(
 673                         cli->cl_destroy_waitq,
 674                         osc_can_send_destroy(cli));
 675                 if (rc) {
 676                         ptlrpc_req_finished(req);
 677                         RETURN(-EINTR);
 678                 }
 679         }
 680
 681         /* Do not wait for response */
 682         ptlrpcd_add_req(req);
 683         RETURN(0);
 684 }
 685
 686 static void osc_announce_cached(struct client_obd *cli, struct obdo *oa,
 687                                 long writing_bytes)
 688 {
 689         u64 bits = OBD_MD_FLBLOCKS | OBD_MD_FLGRANT;
 690
 691         LASSERT(!(oa->o_valid & bits));
 692
 693         oa->o_valid |= bits;
 694         spin_lock(&cli->cl_loi_list_lock);
 695         if (cli->cl_ocd_grant_param)
 696                 oa->o_dirty = cli->cl_dirty_grant;
 697         else
 698                 oa->o_dirty = cli->cl_dirty_pages << PAGE_SHIFT;
 699         if (unlikely(cli->cl_dirty_pages > cli->cl_dirty_max_pages)) {
 700                 CERROR("dirty %lu > dirty_max %lu\n",
 701                        cli->cl_dirty_pages,
 702                        cli->cl_dirty_max_pages);
 703                 oa->o_undirty = 0;
 704         } else if (unlikely(atomic_long_read(&obd_dirty_pages) >
 705                             (long)(obd_max_dirty_pages + 1))) {
 706                 /* The atomic_read() allowing the atomic_inc() are
 707                  * not covered by a lock thus they may safely race and trip
 708                  * this CERROR() unless we add in a small fudge factor (+1). */
 709                 CERROR("%s: dirty %ld > system dirty_max %ld\n",
 710                        cli_name(cli), atomic_long_read(&obd_dirty_pages),
 711                        obd_max_dirty_pages);
 712                 oa->o_undirty = 0;
 713         } else if (unlikely(cli->cl_dirty_max_pages - cli->cl_dirty_pages >
 714                             0x7fffffff)) {
 715                 CERROR("dirty %lu - dirty_max %lu too big???\n",
 716                        cli->cl_dirty_pages, cli->cl_dirty_max_pages);
 717                 oa->o_undirty = 0;
 718         } else {
 719                 unsigned long nrpages;
 720                 unsigned long undirty;
 721
 722                 nrpages = cli->cl_max_pages_per_rpc;
 723                 nrpages *= cli->cl_max_rpcs_in_flight + 1;
 724                 nrpages = max(nrpages, cli->cl_dirty_max_pages);
 725                 undirty = nrpages << PAGE_SHIFT;
 726                 if (cli->cl_ocd_grant_param) {
 727                         int nrextents;
 728
 729                         /* take extent tax into account when asking for more
 730                          * grant space */
 731                         nrextents = (nrpages + cli->cl_max_extent_pages - 1) /
 732                                      cli->cl_max_extent_pages;
 733                         undirty += nrextents * cli->cl_grant_extent_tax;
 734                 }
 735                 /* Do not ask for more than OBD_MAX_GRANT - a margin for server
 736                  * to add extent tax, etc.
 737                  */
 738                 oa->o_undirty = min(undirty, OBD_MAX_GRANT &
 739                                     ~(PTLRPC_MAX_BRW_SIZE * 4UL));
 740         }
 741         oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant;
 742         /* o_dropped AKA o_misc is 32 bits, but cl_lost_grant is 64 bits */
 743         if (cli->cl_lost_grant > INT_MAX) {
 744                 CDEBUG(D_CACHE,
 745                       "%s: avoided o_dropped overflow: cl_lost_grant %lu\n",
 746                       cli_name(cli), cli->cl_lost_grant);
 747                 oa->o_dropped = INT_MAX;
 748         } else {
 749                 oa->o_dropped = cli->cl_lost_grant;
 750         }
 751         cli->cl_lost_grant -= oa->o_dropped;
 752         spin_unlock(&cli->cl_loi_list_lock);
 753         CDEBUG(D_CACHE, "%s: dirty: %llu undirty: %u dropped %u grant: %llu"
 754                " cl_lost_grant %lu\n", cli_name(cli), oa->o_dirty,
 755                oa->o_undirty, oa->o_dropped, oa->o_grant, cli->cl_lost_grant);
 756 }
 757
 758 void osc_update_next_shrink(struct client_obd *cli)
 759 {
 760         cli->cl_next_shrink_grant = ktime_get_seconds() +
 761                                     cli->cl_grant_shrink_interval;
 762
 763         CDEBUG(D_CACHE, "next time %lld to shrink grant\n",
 764                cli->cl_next_shrink_grant);
 765 }
 766 EXPORT_SYMBOL(osc_update_next_shrink);
 767
 768 static void __osc_update_grant(struct client_obd *cli, u64 grant)
 769 {
 770         spin_lock(&cli->cl_loi_list_lock);
 771         cli->cl_avail_grant += grant;
 772         spin_unlock(&cli->cl_loi_list_lock);
 773 }
 774
 775 static void osc_update_grant(struct client_obd *cli, struct ost_body *body)
 776 {
 777         if (body->oa.o_valid & OBD_MD_FLGRANT) {
 778                 CDEBUG(D_CACHE, "got %llu extra grant\n", body->oa.o_grant);
 779                 __osc_update_grant(cli, body->oa.o_grant);
 780         }
 781 }
 782
 783 /**
 784  * grant thread data for shrinking space.
 785  */
 786 struct grant_thread_data {
 787         struct list_head        gtd_clients;
 788         struct mutex            gtd_mutex;
 789         unsigned long           gtd_stopped:1;
 790 };
 791 static struct grant_thread_data client_gtd;
 792
 793 static int osc_shrink_grant_interpret(const struct lu_env *env,
 794                                       struct ptlrpc_request *req,
 795                                       void *args, int rc)
 796 {
 797         struct osc_grant_args *aa = args;
 798         struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
 799         struct ost_body *body;
 800
 801         if (rc != 0) {
 802                 __osc_update_grant(cli, aa->aa_oa->o_grant);
 803                 GOTO(out, rc);
 804         }
 805
 806         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
 807         LASSERT(body);
 808         osc_update_grant(cli, body);
 809 out:
 810         OBD_SLAB_FREE_PTR(aa->aa_oa, osc_obdo_kmem);
 811         aa->aa_oa = NULL;
 812
 813         return rc;
 814 }
 815
 816 static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa)
 817 {
 818         spin_lock(&cli->cl_loi_list_lock);
 819         oa->o_grant = cli->cl_avail_grant / 4;
 820         cli->cl_avail_grant -= oa->o_grant;
 821         spin_unlock(&cli->cl_loi_list_lock);
 822         if (!(oa->o_valid & OBD_MD_FLFLAGS)) {
 823                 oa->o_valid |= OBD_MD_FLFLAGS;
 824                 oa->o_flags = 0;
 825         }
 826         oa->o_flags |= OBD_FL_SHRINK_GRANT;
 827         osc_update_next_shrink(cli);
 828 }
 829
 830 /* Shrink the current grant, either from some large amount to enough for a
 831  * full set of in-flight RPCs, or if we have already shrunk to that limit
 832  * then to enough for a single RPC.  This avoids keeping more grant than
 833  * needed, and avoids shrinking the grant piecemeal. */
 834 static int osc_shrink_grant(struct client_obd *cli)
 835 {
 836         __u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) *
 837                              (cli->cl_max_pages_per_rpc << PAGE_SHIFT);
 838
 839         spin_lock(&cli->cl_loi_list_lock);
 840         if (cli->cl_avail_grant <= target_bytes)
 841                 target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT;
 842         spin_unlock(&cli->cl_loi_list_lock);
 843
 844         return osc_shrink_grant_to_target(cli, target_bytes);
 845 }
 846
 847 int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes)
 848 {
 849         int                     rc = 0;
 850         struct ost_body        *body;
 851         ENTRY;
 852
 853         spin_lock(&cli->cl_loi_list_lock);
 854         /* Don't shrink if we are already above or below the desired limit
 855          * We don't want to shrink below a single RPC, as that will negatively
 856          * impact block allocation and long-term performance. */
 857         if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_SHIFT)
 858                 target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT;
 859
 860         if (target_bytes >= cli->cl_avail_grant) {
 861                 spin_unlock(&cli->cl_loi_list_lock);
 862                 RETURN(0);
 863         }
 864         spin_unlock(&cli->cl_loi_list_lock);
 865
 866         OBD_ALLOC_PTR(body);
 867         if (!body)
 868                 RETURN(-ENOMEM);
 869
 870         osc_announce_cached(cli, &body->oa, 0);
 871
 872         spin_lock(&cli->cl_loi_list_lock);
 873         if (target_bytes >= cli->cl_avail_grant) {
 874                 /* available grant has changed since target calculation */
 875                 spin_unlock(&cli->cl_loi_list_lock);
 876                 GOTO(out_free, rc = 0);
 877         }
 878         body->oa.o_grant = cli->cl_avail_grant - target_bytes;
 879         cli->cl_avail_grant = target_bytes;
 880         spin_unlock(&cli->cl_loi_list_lock);
 881         if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) {
 882                 body->oa.o_valid |= OBD_MD_FLFLAGS;
 883                 body->oa.o_flags = 0;
 884         }
 885         body->oa.o_flags |= OBD_FL_SHRINK_GRANT;
 886         osc_update_next_shrink(cli);
 887
 888         rc = osc_set_info_async(NULL, cli->cl_import->imp_obd->obd_self_export,
 889                                 sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK,
 890                                 sizeof(*body), body, NULL);
 891         if (rc != 0)
 892                 __osc_update_grant(cli, body->oa.o_grant);
 893 out_free:
 894         OBD_FREE_PTR(body);
 895         RETURN(rc);
 896 }
 897
 898 static int osc_should_shrink_grant(struct client_obd *client)
 899 {
 900         time64_t next_shrink = client->cl_next_shrink_grant;
 901
 902         if (client->cl_import == NULL)
 903                 return 0;
 904
 905         if (!OCD_HAS_FLAG(&client->cl_import->imp_connect_data, GRANT_SHRINK) ||
 906             client->cl_import->imp_grant_shrink_disabled) {
 907                 osc_update_next_shrink(client);
 908                 return 0;
 909         }
 910
 911         if (ktime_get_seconds() >= next_shrink - 5) {
 912                 /* Get the current RPC size directly, instead of going via:
 913                  * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export)
 914                  * Keep comment here so that it can be found by searching. */
 915                 int brw_size = client->cl_max_pages_per_rpc << PAGE_SHIFT;
 916
 917                 if (client->cl_import->imp_state == LUSTRE_IMP_FULL &&
 918                     client->cl_avail_grant > brw_size)
 919                         return 1;
 920                 else
 921                         osc_update_next_shrink(client);
 922         }
 923         return 0;
 924 }
 925
 926 #define GRANT_SHRINK_RPC_BATCH  100
 927
 928 static struct delayed_work work;
 929
 930 static void osc_grant_work_handler(struct work_struct *data)
 931 {
 932         struct client_obd *cli;
 933         int rpc_sent;
 934         bool init_next_shrink = true;
 935         time64_t next_shrink = ktime_get_seconds() + GRANT_SHRINK_INTERVAL;
 936
 937         rpc_sent = 0;
 938         mutex_lock(&client_gtd.gtd_mutex);
 939         list_for_each_entry(cli, &client_gtd.gtd_clients,
 940                             cl_grant_chain) {
 941                 if (rpc_sent < GRANT_SHRINK_RPC_BATCH &&
 942                     osc_should_shrink_grant(cli)) {
 943                         osc_shrink_grant(cli);
 944                         rpc_sent++;
 945                 }
 946
 947                 if (!init_next_shrink) {
 948                         if (cli->cl_next_shrink_grant < next_shrink &&
 949                             cli->cl_next_shrink_grant > ktime_get_seconds())
 950                                 next_shrink = cli->cl_next_shrink_grant;
 951                 } else {
 952                         init_next_shrink = false;
 953                         next_shrink = cli->cl_next_shrink_grant;
 954                 }
 955         }
 956         mutex_unlock(&client_gtd.gtd_mutex);
 957
 958         if (client_gtd.gtd_stopped == 1)
 959                 return;
 960
 961         if (next_shrink > ktime_get_seconds()) {
 962                 time64_t delay = next_shrink - ktime_get_seconds();
 963
 964                 schedule_delayed_work(&work, cfs_time_seconds(delay));
 965         } else {
 966                 schedule_work(&work.work);
 967         }
 968 }
 969
 970 void osc_schedule_grant_work(void)
 971 {
 972         cancel_delayed_work_sync(&work);
 973         schedule_work(&work.work);
 974 }
 975 EXPORT_SYMBOL(osc_schedule_grant_work);
 976
 977 /**
 978  * Start grant thread for returing grant to server for idle clients.
 979  */
 980 static int osc_start_grant_work(void)
 981 {
 982         client_gtd.gtd_stopped = 0;
 983         mutex_init(&client_gtd.gtd_mutex);
 984         INIT_LIST_HEAD(&client_gtd.gtd_clients);
 985
 986         INIT_DELAYED_WORK(&work, osc_grant_work_handler);
 987         schedule_work(&work.work);
 988
 989         return 0;
 990 }
 991
 992 static void osc_stop_grant_work(void)
 993 {
 994         client_gtd.gtd_stopped = 1;
 995         cancel_delayed_work_sync(&work);
 996 }
 997
 998 static void osc_add_grant_list(struct client_obd *client)
 999 {
1000         mutex_lock(&client_gtd.gtd_mutex);
1001         list_add(&client->cl_grant_chain, &client_gtd.gtd_clients);
1002         mutex_unlock(&client_gtd.gtd_mutex);
1003 }
1004
1005 static void osc_del_grant_list(struct client_obd *client)
1006 {
1007         if (list_empty(&client->cl_grant_chain))
1008                 return;
1009
1010         mutex_lock(&client_gtd.gtd_mutex);
1011         list_del_init(&client->cl_grant_chain);
1012         mutex_unlock(&client_gtd.gtd_mutex);
1013 }
1014
1015 void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd)
1016 {
1017         /*
1018          * ocd_grant is the total grant amount we're expect to hold: if we've
1019          * been evicted, it's the new avail_grant amount, cl_dirty_pages will
1020          * drop to 0 as inflight RPCs fail out; otherwise, it's avail_grant +
1021          * dirty.
1022          *
1023          * race is tolerable here: if we're evicted, but imp_state already
1024          * left EVICTED state, then cl_dirty_pages must be 0 already.
1025          */
1026         spin_lock(&cli->cl_loi_list_lock);
1027         cli->cl_avail_grant = ocd->ocd_grant;
1028         if (cli->cl_import->imp_state != LUSTRE_IMP_EVICTED) {
1029                 unsigned long consumed = cli->cl_reserved_grant;
1030
1031                 if (OCD_HAS_FLAG(ocd, GRANT_PARAM))
1032                         consumed += cli->cl_dirty_grant;
1033                 else
1034                         consumed += cli->cl_dirty_pages << PAGE_SHIFT;
1035                 if (cli->cl_avail_grant < consumed) {
1036                         CERROR("%s: granted %ld but already consumed %ld\n",
1037                                cli_name(cli), cli->cl_avail_grant, consumed);
1038                         cli->cl_avail_grant = 0;
1039                 } else {
1040                         cli->cl_avail_grant -= consumed;
1041                 }
1042         }
1043
1044         if (OCD_HAS_FLAG(ocd, GRANT_PARAM)) {
1045                 u64 size;
1046                 int chunk_mask;
1047
1048                 /* overhead for each extent insertion */
1049                 cli->cl_grant_extent_tax = ocd->ocd_grant_tax_kb << 10;
1050                 /* determine the appropriate chunk size used by osc_extent. */
1051                 cli->cl_chunkbits = max_t(int, PAGE_SHIFT,
1052                                           ocd->ocd_grant_blkbits);
1053                 /* max_pages_per_rpc must be chunk aligned */
1054                 chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_SHIFT)) - 1);
1055                 cli->cl_max_pages_per_rpc = (cli->cl_max_pages_per_rpc +
1056                                              ~chunk_mask) & chunk_mask;
1057                 /* determine maximum extent size, in #pages */
1058                 size = (u64)ocd->ocd_grant_max_blks << ocd->ocd_grant_blkbits;
1059                 cli->cl_max_extent_pages = (size >> PAGE_SHIFT) ?: 1;
1060                 cli->cl_ocd_grant_param = 1;
1061         } else {
1062                 cli->cl_ocd_grant_param = 0;
1063                 cli->cl_grant_extent_tax = 0;
1064                 cli->cl_chunkbits = PAGE_SHIFT;
1065                 cli->cl_max_extent_pages = DT_MAX_BRW_PAGES;
1066         }
1067         spin_unlock(&cli->cl_loi_list_lock);
1068
1069         CDEBUG(D_CACHE,
1070                "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld. chunk bits: %d cl_max_extent_pages: %d\n",
1071                cli_name(cli),
1072                cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits,
1073                cli->cl_max_extent_pages);
1074
1075         if (OCD_HAS_FLAG(ocd, GRANT_SHRINK) && list_empty(&cli->cl_grant_chain))
1076                 osc_add_grant_list(cli);
1077 }
1078 EXPORT_SYMBOL(osc_init_grant);
1079
1080 /* We assume that the reason this OSC got a short read is because it read
1081  * beyond the end of a stripe file; i.e. lustre is reading a sparse file
1082  * via the LOV, and it _knows_ it's reading inside the file, it's just that
1083  * this stripe never got written at or beyond this stripe offset yet. */
1084 static void handle_short_read(int nob_read, size_t page_count,
1085                               struct brw_page **pga)
1086 {
1087         char *ptr;
1088         int i = 0;
1089
1090         /* skip bytes read OK */
1091         while (nob_read > 0) {
1092                 LASSERT (page_count > 0);
1093
1094                 if (pga[i]->count > nob_read) {
1095                         /* EOF inside this page */
1096                         ptr = kmap(pga[i]->pg) +
1097                                 (pga[i]->off & ~PAGE_MASK);
1098                         memset(ptr + nob_read, 0, pga[i]->count - nob_read);
1099                         kunmap(pga[i]->pg);
1100                         page_count--;
1101                         i++;
1102                         break;
1103                 }
1104
1105                 nob_read -= pga[i]->count;
1106                 page_count--;
1107                 i++;
1108         }
1109
1110         /* zero remaining pages */
1111         while (page_count-- > 0) {
1112                 ptr = kmap(pga[i]->pg) + (pga[i]->off & ~PAGE_MASK);
1113                 memset(ptr, 0, pga[i]->count);
1114                 kunmap(pga[i]->pg);
1115                 i++;
1116         }
1117 }
1118
1119 static int check_write_rcs(struct ptlrpc_request *req,
1120                            int requested_nob, int niocount,
1121                            size_t page_count, struct brw_page **pga)
1122 {
1123         int     i;
1124         __u32   *remote_rcs;
1125
1126         remote_rcs = req_capsule_server_sized_get(&req->rq_pill, &RMF_RCS,
1127                                                   sizeof(*remote_rcs) *
1128                                                   niocount);
1129         if (remote_rcs == NULL) {
1130                 CDEBUG(D_INFO, "Missing/short RC vector on BRW_WRITE reply\n");
1131                 return(-EPROTO);
1132         }
1133
1134         /* return error if any niobuf was in error */
1135         for (i = 0; i < niocount; i++) {
1136                 if ((int)remote_rcs[i] < 0) {
1137                         CDEBUG(D_INFO, "rc[%d]: %d req %p\n",
1138                                i, remote_rcs[i], req);
1139                         return remote_rcs[i];
1140                 }
1141
1142                 if (remote_rcs[i] != 0) {
1143                         CDEBUG(D_INFO, "rc[%d] invalid (%d) req %p\n",
1144                                 i, remote_rcs[i], req);
1145                         return(-EPROTO);
1146                 }
1147         }
1148         if (req->rq_bulk != NULL &&
1149             req->rq_bulk->bd_nob_transferred != requested_nob) {
1150                 CERROR("Unexpected # bytes transferred: %d (requested %d)\n",
1151                        req->rq_bulk->bd_nob_transferred, requested_nob);
1152                 return(-EPROTO);
1153         }
1154
1155         return (0);
1156 }
1157
1158 static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2)
1159 {
1160         if (p1->flag != p2->flag) {
1161                 unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE |
1162                                   OBD_BRW_SYNC       | OBD_BRW_ASYNC   |
1163                                   OBD_BRW_NOQUOTA    | OBD_BRW_SOFT_SYNC |
1164                                   OBD_BRW_SYS_RESOURCE);
1165
1166                 /* warn if we try to combine flags that we don't know to be
1167                  * safe to combine */
1168                 if (unlikely((p1->flag & mask) != (p2->flag & mask))) {
1169                         CWARN("Saw flags 0x%x and 0x%x in the same brw, please "
1170                               "report this at https://jira.whamcloud.com/\n",
1171                               p1->flag, p2->flag);
1172                 }
1173                 return 0;
1174         }
1175
1176         return (p1->off + p1->count == p2->off);
1177 }
1178
1179 #if IS_ENABLED(CONFIG_CRC_T10DIF)
1180 static int osc_checksum_bulk_t10pi(const char *obd_name, int nob,
1181                                    size_t pg_count, struct brw_page **pga,
1182                                    int opc, obd_dif_csum_fn *fn,
1183                                    int sector_size,
1184                                    u32 *check_sum, bool resend)
1185 {
1186         struct ahash_request *req;
1187         /* Used Adler as the default checksum type on top of DIF tags */
1188         unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP);
1189         struct page *__page;
1190         unsigned char *buffer;
1191         __be16 *guard_start;
1192         unsigned int bufsize;
1193         int guard_number;
1194         int used_number = 0;
1195         int used;
1196         u32 cksum;
1197         int rc = 0;
1198         int i = 0;
1199
1200         LASSERT(pg_count > 0);
1201
1202         __page = alloc_page(GFP_KERNEL);
1203         if (__page == NULL)
1204                 return -ENOMEM;
1205
1206         req = cfs_crypto_hash_init(cfs_alg, NULL, 0);
1207         if (IS_ERR(req)) {
1208                 rc = PTR_ERR(req);
1209                 CERROR("%s: unable to initialize checksum hash %s: rc = %d\n",
1210                        obd_name, cfs_crypto_hash_name(cfs_alg), rc);
1211                 GOTO(out, rc);
1212         }
1213
1214         buffer = kmap(__page);
1215         guard_start = (__be16 *)buffer;
1216         guard_number = PAGE_SIZE / sizeof(*guard_start);
1217         CDEBUG(D_PAGE | (resend ? D_HA : 0),
1218                "GRD tags per page=%u, resend=%u, bytes=%u, pages=%zu\n",
1219                guard_number, resend, nob, pg_count);
1220
1221         while (nob > 0 && pg_count > 0) {
1222                 unsigned int count = pga[i]->count > nob ? nob : pga[i]->count;
1223
1224                 /* corrupt the data before we compute the checksum, to
1225                  * simulate an OST->client data error */
1226                 if (unlikely(i == 0 && opc == OST_READ &&
1227                              OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE))) {
1228                         unsigned char *ptr = kmap(pga[i]->pg);
1229                         int off = pga[i]->off & ~PAGE_MASK;
1230
1231                         memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob));
1232                         kunmap(pga[i]->pg);
1233                 }
1234
1235                 /*
1236                  * The left guard number should be able to hold checksums of a
1237                  * whole page
1238                  */
1239                 rc = obd_page_dif_generate_buffer(obd_name, pga[i]->pg,
1240                                                   pga[i]->off & ~PAGE_MASK,
1241                                                   count,
1242                                                   guard_start + used_number,
1243                                                   guard_number - used_number,
1244                                                   &used, sector_size,
1245                                                   fn);
1246                 if (unlikely(resend))
1247                         CDEBUG(D_PAGE | D_HA,
1248                                "pga[%u]: used %u off %llu+%u gen checksum: %*phN\n",
1249                                i, used, pga[i]->off & ~PAGE_MASK, count,
1250                                (int)(used * sizeof(*guard_start)),
1251                                guard_start + used_number);
1252                 if (rc)
1253                         break;
1254
1255                 used_number += used;
1256                 if (used_number == guard_number) {
1257                         cfs_crypto_hash_update_page(req, __page, 0,
1258                                 used_number * sizeof(*guard_start));
1259                         used_number = 0;
1260                 }
1261
1262                 nob -= pga[i]->count;
1263                 pg_count--;
1264                 i++;
1265         }
1266         kunmap(__page);
1267         if (rc)
1268                 GOTO(out, rc);
1269
1270         if (used_number != 0)
1271                 cfs_crypto_hash_update_page(req, __page, 0,
1272                         used_number * sizeof(*guard_start));
1273
1274         bufsize = sizeof(cksum);
1275         cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize);
1276
1277         /* For sending we only compute the wrong checksum instead
1278          * of corrupting the data so it is still correct on a redo */
1279         if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND))
1280                 cksum++;
1281
1282         *check_sum = cksum;
1283 out:
1284         __free_page(__page);
1285         return rc;
1286 }
1287 #else /* !CONFIG_CRC_T10DIF */
1288 #define obd_dif_ip_fn NULL
1289 #define obd_dif_crc_fn NULL
1290 #define osc_checksum_bulk_t10pi(name, nob, pgc, pga, opc, fn, ssize, csum, re) \
1291         -EOPNOTSUPP
1292 #endif /* CONFIG_CRC_T10DIF */
1293
1294 static int osc_checksum_bulk(int nob, size_t pg_count,
1295                              struct brw_page **pga, int opc,
1296                              enum cksum_types cksum_type,
1297                              u32 *cksum)
1298 {
1299         int                             i = 0;
1300         struct ahash_request           *req;
1301         unsigned int                    bufsize;
1302         unsigned char                   cfs_alg = cksum_obd2cfs(cksum_type);
1303
1304         LASSERT(pg_count > 0);
1305
1306         req = cfs_crypto_hash_init(cfs_alg, NULL, 0);
1307         if (IS_ERR(req)) {
1308                 CERROR("Unable to initialize checksum hash %s\n",
1309                        cfs_crypto_hash_name(cfs_alg));
1310                 return PTR_ERR(req);
1311         }
1312
1313         while (nob > 0 && pg_count > 0) {
1314                 unsigned int count = pga[i]->count > nob ? nob : pga[i]->count;
1315
1316                 /* corrupt the data before we compute the checksum, to
1317                  * simulate an OST->client data error */
1318                 if (i == 0 && opc == OST_READ &&
1319                     OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) {
1320                         unsigned char *ptr = kmap(pga[i]->pg);
1321                         int off = pga[i]->off & ~PAGE_MASK;
1322
1323                         memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob));
1324                         kunmap(pga[i]->pg);
1325                 }
1326                 cfs_crypto_hash_update_page(req, pga[i]->pg,
1327                                             pga[i]->off & ~PAGE_MASK,
1328                                             count);
1329                 LL_CDEBUG_PAGE(D_PAGE, pga[i]->pg, "off %d\n",
1330                                (int)(pga[i]->off & ~PAGE_MASK));
1331
1332                 nob -= pga[i]->count;
1333                 pg_count--;
1334                 i++;
1335         }
1336
1337         bufsize = sizeof(*cksum);
1338         cfs_crypto_hash_final(req, (unsigned char *)cksum, &bufsize);
1339
1340         /* For sending we only compute the wrong checksum instead
1341          * of corrupting the data so it is still correct on a redo */
1342         if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND))
1343                 (*cksum)++;
1344
1345         return 0;
1346 }
1347
1348 static int osc_checksum_bulk_rw(const char *obd_name,
1349                                 enum cksum_types cksum_type,
1350                                 int nob, size_t pg_count,
1351                                 struct brw_page **pga, int opc,
1352                                 u32 *check_sum, bool resend)
1353 {
1354         obd_dif_csum_fn *fn = NULL;
1355         int sector_size = 0;
1356         int rc;
1357
1358         ENTRY;
1359         obd_t10_cksum2dif(cksum_type, &fn, &sector_size);
1360
1361         if (fn)
1362                 rc = osc_checksum_bulk_t10pi(obd_name, nob, pg_count, pga,
1363                                              opc, fn, sector_size, check_sum,
1364                                              resend);
1365         else
1366                 rc = osc_checksum_bulk(nob, pg_count, pga, opc, cksum_type,
1367                                        check_sum);
1368
1369         RETURN(rc);
1370 }
1371
1372 #ifdef CONFIG_LL_ENCRYPTION
1373 /**
1374  * osc_encrypt_pagecache_blocks() - overlay to llcrypt_encrypt_pagecache_blocks
1375  * @srcpage:      The locked pagecache page containing the block(s) to encrypt
1376  * @dstpage:      The page to put encryption result
1377  * @len:       Total size of the block(s) to encrypt.  Must be a nonzero
1378  *              multiple of the filesystem's block size.
1379  * @offs:      Byte offset within @page of the first block to encrypt.  Must be
1380  *              a multiple of the filesystem's block size.
1381  * @gfp_flags: Memory allocation flags
1382  *
1383  * This overlay function is necessary to be able to provide our own bounce page.
1384  */
1385 static struct page *osc_encrypt_pagecache_blocks(struct page *srcpage,
1386                                                  struct page *dstpage,
1387                                                  unsigned int len,
1388                                                  unsigned int offs,
1389                                                  gfp_t gfp_flags)
1390
1391 {
1392         const struct inode *inode = srcpage->mapping->host;
1393         const unsigned int blockbits = inode->i_blkbits;
1394         const unsigned int blocksize = 1 << blockbits;
1395         u64 lblk_num = ((u64)srcpage->index << (PAGE_SHIFT - blockbits)) +
1396                 (offs >> blockbits);
1397         unsigned int i;
1398         int err;
1399
1400         if (unlikely(!dstpage))
1401                 return llcrypt_encrypt_pagecache_blocks(srcpage, len, offs,
1402                                                         gfp_flags);
1403
1404         if (WARN_ON_ONCE(!PageLocked(srcpage)))
1405                 return ERR_PTR(-EINVAL);
1406
1407         if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize)))
1408                 return ERR_PTR(-EINVAL);
1409
1410         /* Set PagePrivate2 for disambiguation in
1411          * osc_finalize_bounce_page().
1412          * It means cipher page was not allocated by llcrypt.
1413          */
1414         SetPagePrivate2(dstpage);
1415
1416         for (i = offs; i < offs + len; i += blocksize, lblk_num++) {
1417                 err = llcrypt_encrypt_block(inode, srcpage, dstpage, blocksize,
1418                                             i, lblk_num, gfp_flags);
1419                 if (err)
1420                         return ERR_PTR(err);
1421         }
1422         SetPagePrivate(dstpage);
1423         set_page_private(dstpage, (unsigned long)srcpage);
1424         return dstpage;
1425 }
1426
1427 /**
1428  * osc_finalize_bounce_page() - overlay to llcrypt_finalize_bounce_page
1429  *
1430  * This overlay function is necessary to handle bounce pages
1431  * allocated by ourselves.
1432  */
1433 static inline void osc_finalize_bounce_page(struct page **pagep)
1434 {
1435         struct page *page = *pagep;
1436
1437         /* PagePrivate2 was set in osc_encrypt_pagecache_blocks
1438          * to indicate the cipher page was allocated by ourselves.
1439          * So we must not free it via llcrypt.
1440          */
1441         if (unlikely(!page || !PagePrivate2(page)))
1442                 return llcrypt_finalize_bounce_page(pagep);
1443
1444         if (llcrypt_is_bounce_page(page)) {
1445                 *pagep = llcrypt_pagecache_page(page);
1446                 ClearPagePrivate2(page);
1447                 set_page_private(page, (unsigned long)NULL);
1448                 ClearPagePrivate(page);
1449         }
1450 }
1451 #else /* !CONFIG_LL_ENCRYPTION */
1452 #define osc_encrypt_pagecache_blocks(srcpage, dstpage, len, offs, gfp_flags) \
1453         llcrypt_encrypt_pagecache_blocks(srcpage, len, offs, gfp_flags)
1454 #define osc_finalize_bounce_page(page) llcrypt_finalize_bounce_page(page)
1455 #endif
1456
1457 static inline void osc_release_bounce_pages(struct brw_page **pga,
1458                                             u32 page_count)
1459 {
1460 #ifdef HAVE_LUSTRE_CRYPTO
1461         struct page **pa = NULL;
1462         int i, j = 0;
1463
1464         if (!pga[0])
1465                 return;
1466
1467 #ifdef CONFIG_LL_ENCRYPTION
1468         if (PageChecked(pga[0]->pg)) {
1469                 OBD_ALLOC_PTR_ARRAY_LARGE(pa, page_count);
1470                 if (!pa)
1471                         return;
1472         }
1473 #endif
1474
1475         for (i = 0; i < page_count; i++) {
1476                 /* Bounce pages used by osc_encrypt_pagecache_blocks()
1477                  * called from osc_brw_prep_request()
1478                  * are identified thanks to the PageChecked flag.
1479                  */
1480                 if (PageChecked(pga[i]->pg)) {
1481                         if (pa)
1482                                 pa[j++] = pga[i]->pg;
1483                         osc_finalize_bounce_page(&pga[i]->pg);
1484                 }
1485                 pga[i]->count -= pga[i]->bp_count_diff;
1486                 pga[i]->off += pga[i]->bp_off_diff;
1487         }
1488
1489         if (pa) {
1490                 sptlrpc_enc_pool_put_pages_array(pa, j);
1491                 OBD_FREE_PTR_ARRAY_LARGE(pa, page_count);
1492         }
1493 #endif
1494 }
1495
1496 static int
1497 osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa,
1498                      u32 page_count, struct brw_page **pga,
1499                      struct ptlrpc_request **reqp, int resend)
1500 {
1501         struct ptlrpc_request *req;
1502         struct ptlrpc_bulk_desc *desc;
1503         struct ost_body *body;
1504         struct obd_ioobj *ioobj;
1505         struct niobuf_remote *niobuf;
1506         int niocount, i, requested_nob, opc, rc, short_io_size = 0;
1507         struct osc_brw_async_args *aa;
1508         struct req_capsule *pill;
1509         struct brw_page *pg_prev;
1510         void *short_io_buf;
1511         const char *obd_name = cli->cl_import->imp_obd->obd_name;
1512         struct inode *inode = NULL;
1513         bool directio = false;
1514         bool gpu = 0;
1515         bool enable_checksum = true;
1516         struct cl_page *clpage;
1517
1518         ENTRY;
1519         if (pga[0]->pg) {
1520                 clpage = oap2cl_page(brw_page2oap(pga[0]));
1521                 inode = clpage->cp_inode;
1522                 if (clpage->cp_type == CPT_TRANSIENT)
1523                         directio = true;
1524         }
1525         if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ))
1526                 RETURN(-ENOMEM); /* Recoverable */
1527         if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2))
1528                 RETURN(-EINVAL); /* Fatal */
1529
1530         if ((cmd & OBD_BRW_WRITE) != 0) {
1531                 opc = OST_WRITE;
1532                 req = ptlrpc_request_alloc_pool(cli->cl_import,
1533                                                 osc_rq_pool,
1534                                                 &RQF_OST_BRW_WRITE);
1535         } else {
1536                 opc = OST_READ;
1537                 req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ);
1538         }
1539         if (req == NULL)
1540                 RETURN(-ENOMEM);
1541
1542         if (opc == OST_WRITE && inode && IS_ENCRYPTED(inode) &&
1543             llcrypt_has_encryption_key(inode)) {
1544                 struct page **pa = NULL;
1545
1546 #ifdef CONFIG_LL_ENCRYPTION
1547                 OBD_ALLOC_PTR_ARRAY_LARGE(pa, page_count);
1548                 if (pa == NULL) {
1549                         ptlrpc_request_free(req);
1550                         RETURN(-ENOMEM);
1551                 }
1552
1553                 rc = sptlrpc_enc_pool_get_pages_array(pa, page_count);
1554                 if (rc) {
1555                         CDEBUG(D_SEC, "failed to allocate from enc pool: %d\n",
1556                                rc);
1557                         ptlrpc_request_free(req);
1558                         RETURN(rc);
1559                 }
1560 #endif
1561
1562                 for (i = 0; i < page_count; i++) {
1563                         struct brw_page *brwpg = pga[i];
1564                         struct page *data_page = NULL;
1565                         bool retried = false;
1566                         bool lockedbymyself;
1567                         u32 nunits = (brwpg->off & ~PAGE_MASK) + brwpg->count;
1568                         struct address_space *map_orig = NULL;
1569                         pgoff_t index_orig;
1570
1571 retry_encrypt:
1572                         nunits = round_up(nunits, LUSTRE_ENCRYPTION_UNIT_SIZE);
1573                         /* The page can already be locked when we arrive here.
1574                          * This is possible when cl_page_assume/vvp_page_assume
1575                          * is stuck on wait_on_page_writeback with page lock
1576                          * held. In this case there is no risk for the lock to
1577                          * be released while we are doing our encryption
1578                          * processing, because writeback against that page will
1579                          * end in vvp_page_completion_write/cl_page_completion,
1580                          * which means only once the page is fully processed.
1581                          */
1582                         lockedbymyself = trylock_page(brwpg->pg);
1583                         if (directio) {
1584                                 map_orig = brwpg->pg->mapping;
1585                                 brwpg->pg->mapping = inode->i_mapping;
1586                                 index_orig = brwpg->pg->index;
1587                                 clpage = oap2cl_page(brw_page2oap(brwpg));
1588                                 brwpg->pg->index = clpage->cp_page_index;
1589                         }
1590                         data_page =
1591                                 osc_encrypt_pagecache_blocks(brwpg->pg,
1592                                                             pa ? pa[i] : NULL,
1593                                                             nunits, 0,
1594                                                             GFP_NOFS);
1595                         if (directio) {
1596                                 brwpg->pg->mapping = map_orig;
1597                                 brwpg->pg->index = index_orig;
1598                         }
1599                         if (lockedbymyself)
1600                                 unlock_page(brwpg->pg);
1601                         if (IS_ERR(data_page)) {
1602                                 rc = PTR_ERR(data_page);
1603                                 if (rc == -ENOMEM && !retried) {
1604                                         retried = true;
1605                                         rc = 0;
1606                                         goto retry_encrypt;
1607                                 }
1608                                 if (pa) {
1609                                         sptlrpc_enc_pool_put_pages_array(pa + i,
1610                                                                 page_count - i);
1611                                         OBD_FREE_PTR_ARRAY_LARGE(pa,
1612                                                                  page_count);
1613                                 }
1614                                 ptlrpc_request_free(req);
1615                                 RETURN(rc);
1616                         }
1617                         /* Set PageChecked flag on bounce page for
1618                          * disambiguation in osc_release_bounce_pages().
1619                          */
1620                         SetPageChecked(data_page);
1621                         brwpg->pg = data_page;
1622                         /* there should be no gap in the middle of page array */
1623                         if (i == page_count - 1) {
1624                                 struct osc_async_page *oap =
1625                                         brw_page2oap(brwpg);
1626
1627                                 oa->o_size = oap->oap_count +
1628                                         oap->oap_obj_off + oap->oap_page_off;
1629                         }
1630                         /* len is forced to nunits, and relative offset to 0
1631                          * so store the old, clear text info
1632                          */
1633                         brwpg->bp_count_diff = nunits - brwpg->count;
1634                         brwpg->count = nunits;
1635                         brwpg->bp_off_diff = brwpg->off & ~PAGE_MASK;
1636                         brwpg->off = brwpg->off & PAGE_MASK;
1637                 }
1638
1639                 if (pa)
1640                         OBD_FREE_PTR_ARRAY_LARGE(pa, page_count);
1641         } else if (opc == OST_WRITE && inode && IS_ENCRYPTED(inode)) {
1642                 struct osc_async_page *oap = brw_page2oap(pga[0]);
1643                 struct cl_page *clpage = oap2cl_page(oap);
1644                 struct cl_object *clobj = clpage->cp_obj;
1645                 struct cl_attr attr = { 0 };
1646                 struct lu_env *env;
1647                 __u16 refcheck;
1648
1649                 env = cl_env_get(&refcheck);
1650                 if (IS_ERR(env)) {
1651                         rc = PTR_ERR(env);
1652                         ptlrpc_request_free(req);
1653                         RETURN(rc);
1654                 }
1655
1656                 cl_object_attr_lock(clobj);
1657                 rc = cl_object_attr_get(env, clobj, &attr);
1658                 cl_object_attr_unlock(clobj);
1659                 cl_env_put(env, &refcheck);
1660                 if (rc != 0) {
1661                         ptlrpc_request_free(req);
1662                         RETURN(rc);
1663                 }
1664                 if (attr.cat_size)
1665                         oa->o_size = attr.cat_size;
1666         } else if (opc == OST_READ && inode && IS_ENCRYPTED(inode) &&
1667                    llcrypt_has_encryption_key(inode)) {
1668                 for (i = 0; i < page_count; i++) {
1669                         struct brw_page *pg = pga[i];
1670                         u32 nunits = (pg->off & ~PAGE_MASK) + pg->count;
1671
1672                         nunits = round_up(nunits, LUSTRE_ENCRYPTION_UNIT_SIZE);
1673                         /* count/off are forced to cover the whole encryption
1674                          * unit size so that all encrypted data is stored on the
1675                          * OST, so adjust bp_{count,off}_diff for the size of
1676                          * the clear text.
1677                          */
1678                         pg->bp_count_diff = nunits - pg->count;
1679                         pg->count = nunits;
1680                         pg->bp_off_diff = pg->off & ~PAGE_MASK;
1681                         pg->off = pg->off & PAGE_MASK;
1682                 }
1683         }
1684
1685         for (niocount = i = 1; i < page_count; i++) {
1686                 if (!can_merge_pages(pga[i - 1], pga[i]))
1687                         niocount++;
1688         }
1689
1690         pill = &req->rq_pill;
1691         req_capsule_set_size(pill, &RMF_OBD_IOOBJ, RCL_CLIENT,
1692                              sizeof(*ioobj));
1693         req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT,
1694                              niocount * sizeof(*niobuf));
1695
1696         for (i = 0; i < page_count; i++) {
1697                 short_io_size += pga[i]->count;
1698                 if (!inode || !IS_ENCRYPTED(inode) ||
1699                     !llcrypt_has_encryption_key(inode)) {
1700                         pga[i]->bp_count_diff = 0;
1701                         pga[i]->bp_off_diff = 0;
1702                 }
1703         }
1704
1705         if (brw_page2oap(pga[0])->oap_brw_flags & OBD_BRW_RDMA_ONLY) {
1706                 enable_checksum = false;
1707                 short_io_size = 0;
1708                 gpu = 1;
1709         }
1710
1711         /* Check if read/write is small enough to be a short io. */
1712         if (short_io_size > cli->cl_max_short_io_bytes || niocount > 1 ||
1713             !imp_connect_shortio(cli->cl_import))
1714                 short_io_size = 0;
1715
1716         /* If this is an empty RPC to old server, just ignore it */
1717         if (!short_io_size && !pga[0]->pg) {
1718                 ptlrpc_request_free(req);
1719                 RETURN(-ENODATA);
1720         }
1721
1722         req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_CLIENT,
1723                              opc == OST_READ ? 0 : short_io_size);
1724         if (opc == OST_READ)
1725                 req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_SERVER,
1726                                      short_io_size);
1727
1728         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc);
1729         if (rc) {
1730                 ptlrpc_request_free(req);
1731                 RETURN(rc);
1732         }
1733         osc_set_io_portal(req);
1734
1735         ptlrpc_at_set_req_timeout(req);
1736         /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own
1737          * retry logic */
1738         req->rq_no_retry_einprogress = 1;
1739
1740         if (short_io_size != 0) {
1741                 desc = NULL;
1742                 short_io_buf = NULL;
1743                 goto no_bulk;
1744         }
1745
1746         desc = ptlrpc_prep_bulk_imp(req, page_count,
1747                 cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS,
1748                 (opc == OST_WRITE ? PTLRPC_BULK_GET_SOURCE :
1749                         PTLRPC_BULK_PUT_SINK),
1750                 OST_BULK_PORTAL,
1751                 &ptlrpc_bulk_kiov_pin_ops);
1752
1753         if (desc == NULL)
1754                 GOTO(out, rc = -ENOMEM);
1755         /* NB request now owns desc and will free it when it gets freed */
1756         desc->bd_is_rdma = gpu;
1757 no_bulk:
1758         body = req_capsule_client_get(pill, &RMF_OST_BODY);
1759         ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
1760         niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
1761         LASSERT(body != NULL && ioobj != NULL && niobuf != NULL);
1762
1763         lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
1764
1765         /* For READ and WRITE, we can't fill o_uid and o_gid using from_kuid()
1766          * and from_kgid(), because they are asynchronous. Fortunately, variable
1767          * oa contains valid o_uid and o_gid in these two operations.
1768          * Besides, filling o_uid and o_gid is enough for nrs-tbf, see LU-9658.
1769          * OBD_MD_FLUID and OBD_MD_FLUID is not set in order to avoid breaking
1770          * other process logic */
1771         body->oa.o_uid = oa->o_uid;
1772         body->oa.o_gid = oa->o_gid;
1773
1774         obdo_to_ioobj(oa, ioobj);
1775         ioobj->ioo_bufcnt = niocount;
1776         /* The high bits of ioo_max_brw tells server _maximum_ number of bulks
1777          * that might be send for this request.  The actual number is decided
1778          * when the RPC is finally sent in ptlrpc_register_bulk(). It sends
1779          * "max - 1" for old client compatibility sending "0", and also so the
1780          * the actual maximum is a power-of-two number, not one less. LU-1431 */
1781         if (desc != NULL)
1782                 ioobj_max_brw_set(ioobj, desc->bd_md_max_brw);
1783         else /* short io */
1784                 ioobj_max_brw_set(ioobj, 0);
1785
1786         if (inode && IS_ENCRYPTED(inode) &&
1787             llcrypt_has_encryption_key(inode) &&
1788             !OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NO_ENCFLAG)) {
1789                 if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
1790                         body->oa.o_valid |= OBD_MD_FLFLAGS;
1791                         body->oa.o_flags = 0;
1792                 }
1793                 body->oa.o_flags |= LUSTRE_ENCRYPT_FL;
1794         }
1795
1796         if (short_io_size != 0) {
1797                 if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
1798                         body->oa.o_valid |= OBD_MD_FLFLAGS;
1799                         body->oa.o_flags = 0;
1800                 }
1801                 body->oa.o_flags |= OBD_FL_SHORT_IO;
1802                 CDEBUG(D_CACHE, "Using short io for data transfer, size = %d\n",
1803                        short_io_size);
1804                 if (opc == OST_WRITE) {
1805                         short_io_buf = req_capsule_client_get(pill,
1806                                                               &RMF_SHORT_IO);
1807                         LASSERT(short_io_buf != NULL);
1808                 }
1809         }
1810
1811         LASSERT(page_count > 0);
1812         pg_prev = pga[0];
1813         for (requested_nob = i = 0; i < page_count; i++, niobuf++) {
1814                 struct brw_page *pg = pga[i];
1815                 int poff = pg->off & ~PAGE_MASK;
1816
1817                 LASSERT(pg->count > 0);
1818                 /* make sure there is no gap in the middle of page array */
1819                 LASSERTF(page_count == 1 ||
1820                          (ergo(i == 0, poff + pg->count == PAGE_SIZE) &&
1821                           ergo(i > 0 && i < page_count - 1,
1822                                poff == 0 && pg->count == PAGE_SIZE)   &&
1823                           ergo(i == page_count - 1, poff == 0)),
1824                          "i: %d/%d pg: %p off: %llu, count: %u\n",
1825                          i, page_count, pg, pg->off, pg->count);
1826                 LASSERTF(i == 0 || pg->off > pg_prev->off,
1827                          "i %d p_c %u pg %p [pri %lu ind %lu] off %llu"
1828                          " prev_pg %p [pri %lu ind %lu] off %llu\n",
1829                          i, page_count,
1830                          pg->pg, page_private(pg->pg), pg->pg->index, pg->off,
1831                          pg_prev->pg, page_private(pg_prev->pg),
1832                          pg_prev->pg->index, pg_prev->off);
1833                 LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) ==
1834                         (pg->flag & OBD_BRW_SRVLOCK));
1835                 if (short_io_size != 0 && opc == OST_WRITE) {
1836                         unsigned char *ptr = kmap_atomic(pg->pg);
1837
1838                         LASSERT(short_io_size >= requested_nob + pg->count);
1839                         memcpy(short_io_buf + requested_nob,
1840                                ptr + poff,
1841                                pg->count);
1842                         kunmap_atomic(ptr);
1843                 } else if (short_io_size == 0) {
1844                         desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff,
1845                                                          pg->count);
1846                 }
1847                 requested_nob += pg->count;
1848
1849                 if (i > 0 && can_merge_pages(pg_prev, pg)) {
1850                         niobuf--;
1851                         niobuf->rnb_len += pg->count;
1852                 } else {
1853                         niobuf->rnb_offset = pg->off;
1854                         niobuf->rnb_len    = pg->count;
1855                         niobuf->rnb_flags  = pg->flag;
1856                 }
1857                 pg_prev = pg;
1858         }
1859
1860         LASSERTF((void *)(niobuf - niocount) ==
1861                 req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE),
1862                 "want %p - real %p\n", req_capsule_client_get(&req->rq_pill,
1863                 &RMF_NIOBUF_REMOTE), (void *)(niobuf - niocount));
1864
1865         osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? requested_nob:0);
1866         if (resend) {
1867                 if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
1868                         body->oa.o_valid |= OBD_MD_FLFLAGS;
1869                         body->oa.o_flags = 0;
1870                 }
1871                 body->oa.o_flags |= OBD_FL_RECOV_RESEND;
1872         }
1873
1874         if (osc_should_shrink_grant(cli))
1875                 osc_shrink_grant_local(cli, &body->oa);
1876
1877         if (!cli->cl_checksum || sptlrpc_flavor_has_bulk(&req->rq_flvr))
1878                 enable_checksum = false;
1879
1880         /* size[REQ_REC_OFF] still sizeof (*body) */
1881         if (opc == OST_WRITE) {
1882                 if (enable_checksum) {
1883                         /* store cl_cksum_type in a local variable since
1884                          * it can be changed via lprocfs */
1885                         enum cksum_types cksum_type = cli->cl_cksum_type;
1886
1887                         if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0)
1888                                 body->oa.o_flags = 0;
1889
1890                         body->oa.o_flags |= obd_cksum_type_pack(obd_name,
1891                                                                 cksum_type);
1892                         body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
1893
1894                         rc = osc_checksum_bulk_rw(obd_name, cksum_type,
1895                                                   requested_nob, page_count,
1896                                                   pga, OST_WRITE,
1897                                                   &body->oa.o_cksum, resend);
1898                         if (rc < 0) {
1899                                 CDEBUG(D_PAGE, "failed to checksum: rc = %d\n",
1900                                        rc);
1901                                 GOTO(out, rc);
1902                         }
1903                         CDEBUG(D_PAGE | (resend ? D_HA : 0),
1904                                "checksum at write origin: %x (%x)\n",
1905                                body->oa.o_cksum, cksum_type);
1906
1907                         /* save this in 'oa', too, for later checking */
1908                         oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
1909                         oa->o_flags |= obd_cksum_type_pack(obd_name,
1910                                                            cksum_type);
1911                 } else {
1912                         /* clear out the checksum flag, in case this is a
1913                          * resend but cl_checksum is no longer set. b=11238 */
1914                         oa->o_valid &= ~OBD_MD_FLCKSUM;
1915                 }
1916                 oa->o_cksum = body->oa.o_cksum;
1917                 /* 1 RC per niobuf */
1918                 req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER,
1919                                      sizeof(__u32) * niocount);
1920         } else {
1921                 if (enable_checksum) {
1922                         if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0)
1923                                 body->oa.o_flags = 0;
1924                         body->oa.o_flags |= obd_cksum_type_pack(obd_name,
1925                                 cli->cl_cksum_type);
1926                         body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
1927                 }
1928
1929                 /* Client cksum has been already copied to wire obdo in previous
1930                  * lustre_set_wire_obdo(), and in the case a bulk-read is being
1931                  * resent due to cksum error, this will allow Server to
1932                  * check+dump pages on its side */
1933         }
1934         ptlrpc_request_set_replen(req);
1935
1936         aa = ptlrpc_req_async_args(aa, req);
1937         aa->aa_oa = oa;
1938         aa->aa_requested_nob = requested_nob;
1939         aa->aa_nio_count = niocount;
1940         aa->aa_page_count = page_count;
1941         aa->aa_resends = 0;
1942         aa->aa_ppga = pga;
1943         aa->aa_cli = cli;
1944         INIT_LIST_HEAD(&aa->aa_oaps);
1945
1946         *reqp = req;
1947         niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
1948         CDEBUG(D_RPCTRACE, "brw rpc %p - object "DOSTID" offset %lld<>%lld\n",
1949                 req, POSTID(&oa->o_oi), niobuf[0].rnb_offset,
1950                 niobuf[niocount - 1].rnb_offset + niobuf[niocount - 1].rnb_len);
1951         RETURN(0);
1952
1953  out:
1954         ptlrpc_req_finished(req);
1955         RETURN(rc);
1956 }
1957
1958 char dbgcksum_file_name[PATH_MAX];
1959
1960 static void dump_all_bulk_pages(struct obdo *oa, __u32 page_count,
1961                                 struct brw_page **pga, __u32 server_cksum,
1962                                 __u32 client_cksum)
1963 {
1964         struct file *filp;
1965         int rc, i;
1966         unsigned int len;
1967         char *buf;
1968
1969         /* will only keep dump of pages on first error for the same range in
1970          * file/fid, not during the resends/retries. */
1971         snprintf(dbgcksum_file_name, sizeof(dbgcksum_file_name),
1972                  "%s-checksum_dump-osc-"DFID":[%llu-%llu]-%x-%x",
1973                  (strncmp(libcfs_debug_file_path, "NONE", 4) != 0 ?
1974                   libcfs_debug_file_path : LIBCFS_DEBUG_FILE_PATH_DEFAULT),
1975                  oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : 0ULL,
1976                  oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0,
1977                  oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0,
1978                  pga[0]->off,
1979                  pga[page_count-1]->off + pga[page_count-1]->count - 1,
1980                  client_cksum, server_cksum);
1981         CWARN("dumping checksum data to %s\n", dbgcksum_file_name);
1982         filp = filp_open(dbgcksum_file_name,
1983                          O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, 0600);
1984         if (IS_ERR(filp)) {
1985                 rc = PTR_ERR(filp);
1986                 if (rc == -EEXIST)
1987                         CDEBUG(D_INFO, "%s: can't open to dump pages with "
1988                                "checksum error: rc = %d\n", dbgcksum_file_name,
1989                                rc);
1990                 else
1991                         CERROR("%s: can't open to dump pages with checksum "
1992                                "error: rc = %d\n", dbgcksum_file_name, rc);
1993                 return;
1994         }
1995
1996         for (i = 0; i < page_count; i++) {
1997                 len = pga[i]->count;
1998                 buf = kmap(pga[i]->pg);
1999                 while (len != 0) {
2000                         rc = cfs_kernel_write(filp, buf, len, &filp->f_pos);
2001                         if (rc < 0) {
2002                                 CERROR("%s: wanted to write %u but got %d "
2003                                        "error\n", dbgcksum_file_name, len, rc);
2004                                 break;
2005                         }
2006                         len -= rc;
2007                         buf += rc;
2008                 }
2009                 kunmap(pga[i]->pg);
2010         }
2011
2012         rc = vfs_fsync_range(filp, 0, LLONG_MAX, 1);
2013         if (rc)
2014                 CERROR("%s: sync returns %d\n", dbgcksum_file_name, rc);
2015         filp_close(filp, NULL);
2016
2017         libcfs_debug_dumplog();
2018 }
2019
2020 static int
2021 check_write_checksum(struct obdo *oa, const struct lnet_processid *peer,
2022                      __u32 client_cksum, __u32 server_cksum,
2023                      struct osc_brw_async_args *aa)
2024 {
2025         const char *obd_name = aa->aa_cli->cl_import->imp_obd->obd_name;
2026         enum cksum_types cksum_type;
2027         obd_dif_csum_fn *fn = NULL;
2028         int sector_size = 0;
2029         __u32 new_cksum;
2030         char *msg;
2031         int rc;
2032
2033         if (server_cksum == client_cksum) {
2034                 CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum);
2035                 return 0;
2036         }
2037
2038         if (aa->aa_cli->cl_checksum_dump)
2039                 dump_all_bulk_pages(oa, aa->aa_page_count, aa->aa_ppga,
2040                                     server_cksum, client_cksum);
2041
2042         cksum_type = obd_cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ?
2043                                            oa->o_flags : 0);
2044
2045         switch (cksum_type) {
2046         case OBD_CKSUM_T10IP512:
2047                 fn = obd_dif_ip_fn;
2048                 sector_size = 512;
2049                 break;
2050         case OBD_CKSUM_T10IP4K:
2051                 fn = obd_dif_ip_fn;
2052                 sector_size = 4096;
2053                 break;
2054         case OBD_CKSUM_T10CRC512:
2055                 fn = obd_dif_crc_fn;
2056                 sector_size = 512;
2057                 break;
2058         case OBD_CKSUM_T10CRC4K:
2059                 fn = obd_dif_crc_fn;
2060                 sector_size = 4096;
2061                 break;
2062         default:
2063                 break;
2064         }
2065
2066         if (fn)
2067                 rc = osc_checksum_bulk_t10pi(obd_name, aa->aa_requested_nob,
2068                                              aa->aa_page_count, aa->aa_ppga,
2069                                              OST_WRITE, fn, sector_size,
2070                                              &new_cksum, true);
2071         else
2072                 rc = osc_checksum_bulk(aa->aa_requested_nob, aa->aa_page_count,
2073                                        aa->aa_ppga, OST_WRITE, cksum_type,
2074                                        &new_cksum);
2075
2076         if (rc < 0)
2077                 msg = "failed to calculate the client write checksum";
2078         else if (cksum_type != obd_cksum_type_unpack(aa->aa_oa->o_flags))
2079                 msg = "the server did not use the checksum type specified in "
2080                       "the original request - likely a protocol problem";
2081         else if (new_cksum == server_cksum)
2082                 msg = "changed on the client after we checksummed it - "
2083                       "likely false positive due to mmap IO (bug 11742)";
2084         else if (new_cksum == client_cksum)
2085                 msg = "changed in transit before arrival at OST";
2086         else
2087                 msg = "changed in transit AND doesn't match the original - "
2088                       "likely false positive due to mmap IO (bug 11742)";
2089
2090         LCONSOLE_ERROR_MSG(0x132, "%s: BAD WRITE CHECKSUM: %s: from %s inode "
2091                            DFID " object "DOSTID" extent [%llu-%llu], original "
2092                            "client csum %x (type %x), server csum %x (type %x),"
2093                            " client csum now %x\n",
2094                            obd_name, msg, libcfs_nidstr(&peer->nid),
2095                            oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0,
2096                            oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0,
2097                            oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0,
2098                            POSTID(&oa->o_oi), aa->aa_ppga[0]->off,
2099                            aa->aa_ppga[aa->aa_page_count - 1]->off +
2100                                 aa->aa_ppga[aa->aa_page_count-1]->count - 1,
2101                            client_cksum,
2102                            obd_cksum_type_unpack(aa->aa_oa->o_flags),
2103                            server_cksum, cksum_type, new_cksum);
2104         return 1;
2105 }
2106
2107 /* Note rc enters this function as number of bytes transferred */
2108 static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
2109 {
2110         struct osc_brw_async_args *aa = (void *)&req->rq_async_args;
2111         struct client_obd *cli = aa->aa_cli;
2112         const char *obd_name = cli->cl_import->imp_obd->obd_name;
2113         const struct lnet_processid *peer =
2114                 &req->rq_import->imp_connection->c_peer;
2115         struct ost_body *body;
2116         u32 client_cksum = 0;
2117         struct inode *inode = NULL;
2118         unsigned int blockbits = 0, blocksize = 0;
2119         struct cl_page *clpage;
2120
2121         ENTRY;
2122
2123         if (rc < 0 && rc != -EDQUOT) {
2124                 DEBUG_REQ(D_INFO, req, "Failed request: rc = %d", rc);
2125                 RETURN(rc);
2126         }
2127
2128         LASSERTF(req->rq_repmsg != NULL, "rc = %d\n", rc);
2129         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
2130         if (body == NULL) {
2131                 DEBUG_REQ(D_INFO, req, "cannot unpack body");
2132                 RETURN(-EPROTO);
2133         }
2134
2135         /* set/clear over quota flag for a uid/gid/projid */
2136         if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE &&
2137             body->oa.o_valid & (OBD_MD_FLALLQUOTA)) {
2138                 unsigned qid[LL_MAXQUOTAS] = {
2139                                          body->oa.o_uid, body->oa.o_gid,
2140                                          body->oa.o_projid };
2141                 CDEBUG(D_QUOTA,
2142                        "setdq for [%u %u %u] with valid %#llx, flags %x\n",
2143                        body->oa.o_uid, body->oa.o_gid, body->oa.o_projid,
2144                        body->oa.o_valid, body->oa.o_flags);
2145                 osc_quota_setdq(cli, req->rq_xid, qid, body->oa.o_valid,
2146                                 body->oa.o_flags);
2147         }
2148
2149         osc_update_grant(cli, body);
2150
2151         if (rc < 0)
2152                 RETURN(rc);
2153
2154         if (aa->aa_oa->o_valid & OBD_MD_FLCKSUM)
2155                 client_cksum = aa->aa_oa->o_cksum; /* save for later */
2156
2157         if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) {
2158                 if (rc > 0) {
2159                         CERROR("%s: unexpected positive size %d\n",
2160                                obd_name, rc);
2161                         RETURN(-EPROTO);
2162                 }
2163
2164                 if (req->rq_bulk != NULL &&
2165                     sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk))
2166                         RETURN(-EAGAIN);
2167
2168                 if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum &&
2169                     check_write_checksum(&body->oa, peer, client_cksum,
2170                                          body->oa.o_cksum, aa))
2171                         RETURN(-EAGAIN);
2172
2173                 rc = check_write_rcs(req, aa->aa_requested_nob,
2174                                      aa->aa_nio_count, aa->aa_page_count,
2175                                      aa->aa_ppga);
2176                 GOTO(out, rc);
2177         }
2178
2179         /* The rest of this function executes only for OST_READs */
2180
2181         if (req->rq_bulk == NULL) {
2182                 rc = req_capsule_get_size(&req->rq_pill, &RMF_SHORT_IO,
2183                                           RCL_SERVER);
2184                 LASSERT(rc == req->rq_status);
2185         } else {
2186                 /* if unwrap_bulk failed, return -EAGAIN to retry */
2187                 rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc);
2188         }
2189         if (rc < 0)
2190                 GOTO(out, rc = -EAGAIN);
2191
2192         if (rc > aa->aa_requested_nob) {
2193                 CERROR("%s: unexpected size %d, requested %d\n", obd_name,
2194                        rc, aa->aa_requested_nob);
2195                 RETURN(-EPROTO);
2196         }
2197
2198         if (req->rq_bulk != NULL && rc != req->rq_bulk->bd_nob_transferred) {
2199                 CERROR("%s: unexpected size %d, transferred %d\n", obd_name,
2200                        rc, req->rq_bulk->bd_nob_transferred);
2201                 RETURN(-EPROTO);
2202         }
2203
2204         if (req->rq_bulk == NULL) {
2205                 /* short io */
2206                 int nob, pg_count, i = 0;
2207                 unsigned char *buf;
2208
2209                 CDEBUG(D_CACHE, "Using short io read, size %d\n", rc);
2210                 pg_count = aa->aa_page_count;
2211                 buf = req_capsule_server_sized_get(&req->rq_pill, &RMF_SHORT_IO,
2212                                                    rc);
2213                 nob = rc;
2214                 while (nob > 0 && pg_count > 0) {
2215                         unsigned char *ptr;
2216                         int count = aa->aa_ppga[i]->count > nob ?
2217                                     nob : aa->aa_ppga[i]->count;
2218
2219                         CDEBUG(D_CACHE, "page %p count %d\n",
2220                                aa->aa_ppga[i]->pg, count);
2221                         ptr = kmap_atomic(aa->aa_ppga[i]->pg);
2222                         memcpy(ptr + (aa->aa_ppga[i]->off & ~PAGE_MASK), buf,
2223                                count);
2224                         kunmap_atomic((void *) ptr);
2225
2226                         buf += count;
2227                         nob -= count;
2228                         i++;
2229                         pg_count--;
2230                 }
2231         }
2232
2233         if (rc < aa->aa_requested_nob)
2234                 handle_short_read(rc, aa->aa_page_count, aa->aa_ppga);
2235
2236         if (body->oa.o_valid & OBD_MD_FLCKSUM) {
2237                 static int cksum_counter;
2238                 u32 server_cksum = body->oa.o_cksum;
2239                 int nob = rc;
2240                 char *via = "";
2241                 char *router = "";
2242                 enum cksum_types cksum_type;
2243                 u32 o_flags = body->oa.o_valid & OBD_MD_FLFLAGS ?
2244                         body->oa.o_flags : 0;
2245
2246                 cksum_type = obd_cksum_type_unpack(o_flags);
2247                 rc = osc_checksum_bulk_rw(obd_name, cksum_type, nob,
2248                                           aa->aa_page_count, aa->aa_ppga,
2249                                           OST_READ, &client_cksum, false);
2250                 if (rc < 0)
2251                         GOTO(out, rc);
2252
2253                 if (req->rq_bulk != NULL &&
2254                     !nid_same(&peer->nid, &req->rq_bulk->bd_sender)) {
2255                         via = " via ";
2256                         router = libcfs_nidstr(&req->rq_bulk->bd_sender);
2257                 }
2258
2259                 if (server_cksum != client_cksum) {
2260                         struct ost_body *clbody;
2261                         __u32 client_cksum2;
2262                         u32 page_count = aa->aa_page_count;
2263
2264                         osc_checksum_bulk_rw(obd_name, cksum_type, nob,
2265                                              page_count, aa->aa_ppga,
2266                                              OST_READ, &client_cksum2, true);
2267                         clbody = req_capsule_client_get(&req->rq_pill,
2268                                                         &RMF_OST_BODY);
2269                         if (cli->cl_checksum_dump)
2270                                 dump_all_bulk_pages(&clbody->oa, page_count,
2271                                                     aa->aa_ppga, server_cksum,
2272                                                     client_cksum);
2273
2274                         LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from "
2275                                            "%s%s%s inode "DFID" object "DOSTID
2276                                            " extent [%llu-%llu], client %x/%x, "
2277                                            "server %x, cksum_type %x\n",
2278                                            obd_name,
2279                                            libcfs_nidstr(&peer->nid),
2280                                            via, router,
2281                                            clbody->oa.o_valid & OBD_MD_FLFID ?
2282                                                 clbody->oa.o_parent_seq : 0ULL,
2283                                            clbody->oa.o_valid & OBD_MD_FLFID ?
2284                                                 clbody->oa.o_parent_oid : 0,
2285                                            clbody->oa.o_valid & OBD_MD_FLFID ?
2286                                                 clbody->oa.o_parent_ver : 0,
2287                                            POSTID(&body->oa.o_oi),
2288                                            aa->aa_ppga[0]->off,
2289                                            aa->aa_ppga[page_count-1]->off +
2290                                            aa->aa_ppga[page_count-1]->count - 1,
2291                                            client_cksum, client_cksum2,
2292                                            server_cksum, cksum_type);
2293                         cksum_counter = 0;
2294                         aa->aa_oa->o_cksum = client_cksum;
2295                         rc = -EAGAIN;
2296                 } else {
2297                         cksum_counter++;
2298                         CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum);
2299                         rc = 0;
2300                 }
2301         } else if (unlikely(client_cksum)) {
2302                 static int cksum_missed;
2303
2304                 cksum_missed++;
2305                 if ((cksum_missed & (-cksum_missed)) == cksum_missed)
2306                         CERROR("%s: checksum %u requested from %s but not sent\n",
2307                                obd_name, cksum_missed,
2308                                libcfs_nidstr(&peer->nid));
2309         } else {
2310                 rc = 0;
2311         }
2312
2313         /* get the inode from the first cl_page */
2314         clpage = oap2cl_page(brw_page2oap(aa->aa_ppga[0]));
2315         inode = clpage->cp_inode;
2316         if (clpage->cp_type == CPT_TRANSIENT && inode) {
2317                 blockbits = inode->i_blkbits;
2318                 blocksize = 1 << blockbits;
2319         }
2320         if (inode && IS_ENCRYPTED(inode)) {
2321                 int idx;
2322
2323                 if (!llcrypt_has_encryption_key(inode)) {
2324                         CDEBUG(D_SEC, "no enc key for ino %lu\n", inode->i_ino);
2325                         GOTO(out, rc);
2326                 }
2327                 for (idx = 0; idx < aa->aa_page_count; idx++) {
2328                         struct brw_page *brwpg = aa->aa_ppga[idx];
2329                         unsigned int offs = 0;
2330
2331                         while (offs < PAGE_SIZE) {
2332                                 /* do not decrypt if page is all 0s */
2333                                 if (memchr_inv(page_address(brwpg->pg) + offs,
2334                                       0, LUSTRE_ENCRYPTION_UNIT_SIZE) == NULL) {
2335                                         /* if page is empty forward info to
2336                                          * upper layers (ll_io_zero_page) by
2337                                          * clearing PagePrivate2
2338                                          */
2339                                         if (!offs)
2340                                                 ClearPagePrivate2(brwpg->pg);
2341                                         break;
2342                                 }
2343
2344                                 if (blockbits) {
2345                                         /* This is direct IO case. Directly call
2346                                          * decrypt function that takes inode as
2347                                          * input parameter. Page does not need
2348                                          * to be locked.
2349                                          */
2350                                         u64 lblk_num;
2351                                         unsigned int i;
2352
2353                                         clpage =
2354                                                oap2cl_page(brw_page2oap(brwpg));
2355                                         lblk_num =
2356                                                 ((u64)(clpage->cp_page_index) <<
2357                                                 (PAGE_SHIFT - blockbits)) +
2358                                                 (offs >> blockbits);
2359                                         for (i = offs;
2360                                              i < offs +
2361                                                     LUSTRE_ENCRYPTION_UNIT_SIZE;
2362                                              i += blocksize, lblk_num++) {
2363                                                 rc =
2364                                                   llcrypt_decrypt_block_inplace(
2365                                                           inode, brwpg->pg,
2366                                                           blocksize, i,
2367                                                           lblk_num);
2368                                                 if (rc)
2369                                                         break;
2370                                         }
2371                                 } else {
2372                                         rc = llcrypt_decrypt_pagecache_blocks(
2373                                                 brwpg->pg,
2374                                                 LUSTRE_ENCRYPTION_UNIT_SIZE,
2375                                                 offs);
2376                                 }
2377                                 if (rc)
2378                                         GOTO(out, rc);
2379
2380                                 offs += LUSTRE_ENCRYPTION_UNIT_SIZE;
2381                         }
2382                 }
2383         }
2384
2385 out:
2386         if (rc >= 0)
2387                 lustre_get_wire_obdo(&req->rq_import->imp_connect_data,
2388                                      aa->aa_oa, &body->oa);
2389
2390         RETURN(rc);
2391 }
2392
2393 static int osc_brw_redo_request(struct ptlrpc_request *request,
2394                                 struct osc_brw_async_args *aa, int rc)
2395 {
2396         struct ptlrpc_request *new_req;
2397         struct osc_brw_async_args *new_aa;
2398         struct osc_async_page *oap;
2399         ENTRY;
2400
2401         /* The below message is checked in replay-ost-single.sh test_8ae*/
2402         DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request,
2403                   "redo for recoverable error %d", rc);
2404
2405         rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) ==
2406                                 OST_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ,
2407                                   aa->aa_cli, aa->aa_oa, aa->aa_page_count,
2408                                   aa->aa_ppga, &new_req, 1);
2409         if (rc)
2410                 RETURN(rc);
2411
2412         list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) {
2413                 if (oap->oap_request != NULL) {
2414                         LASSERTF(request == oap->oap_request,
2415                                  "request %p != oap_request %p\n",
2416                                  request, oap->oap_request);
2417                 }
2418         }
2419         /*
2420          * New request takes over pga and oaps from old request.
2421          * Note that copying a list_head doesn't work, need to move it...
2422          */
2423         aa->aa_resends++;
2424         new_req->rq_interpret_reply = request->rq_interpret_reply;
2425         new_req->rq_async_args = request->rq_async_args;
2426         new_req->rq_commit_cb = request->rq_commit_cb;
2427         /* cap resend delay to the current request timeout, this is similar to
2428          * what ptlrpc does (see after_reply()) */
2429         if (aa->aa_resends > new_req->rq_timeout)
2430                 new_req->rq_sent = ktime_get_real_seconds() + new_req->rq_timeout;
2431         else
2432                 new_req->rq_sent = ktime_get_real_seconds() + aa->aa_resends;
2433         new_req->rq_generation_set = 1;
2434         new_req->rq_import_generation = request->rq_import_generation;
2435
2436         new_aa = ptlrpc_req_async_args(new_aa, new_req);
2437
2438         INIT_LIST_HEAD(&new_aa->aa_oaps);
2439         list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps);
2440         INIT_LIST_HEAD(&new_aa->aa_exts);
2441         list_splice_init(&aa->aa_exts, &new_aa->aa_exts);
2442         new_aa->aa_resends = aa->aa_resends;
2443
2444         list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) {
2445                 if (oap->oap_request) {
2446                         ptlrpc_req_finished(oap->oap_request);
2447                         oap->oap_request = ptlrpc_request_addref(new_req);
2448                 }
2449         }
2450
2451         /* XXX: This code will run into problem if we're going to support
2452          * to add a series of BRW RPCs into a self-defined ptlrpc_request_set
2453          * and wait for all of them to be finished. We should inherit request
2454          * set from old request. */
2455         ptlrpcd_add_req(new_req);
2456
2457         DEBUG_REQ(D_INFO, new_req, "new request");
2458         RETURN(0);
2459 }
2460
2461 /*
2462  * ugh, we want disk allocation on the target to happen in offset order.  we'll
2463  * follow sedgewicks advice and stick to the dead simple shellsort -- it'll do
2464  * fine for our small page arrays and doesn't require allocation.  its an
2465  * insertion sort that swaps elements that are strides apart, shrinking the
2466  * stride down until its '1' and the array is sorted.
2467  */
2468 static void sort_brw_pages(struct brw_page **array, int num)
2469 {
2470         int stride, i, j;
2471         struct brw_page *tmp;
2472
2473         if (num == 1)
2474                 return;
2475         for (stride = 1; stride < num ; stride = (stride * 3) + 1)
2476                 ;
2477
2478         do {
2479                 stride /= 3;
2480                 for (i = stride ; i < num ; i++) {
2481                         tmp = array[i];
2482                         j = i;
2483                         while (j >= stride && array[j - stride]->off > tmp->off) {
2484                                 array[j] = array[j - stride];
2485                                 j -= stride;
2486                         }
2487                         array[j] = tmp;
2488                 }
2489         } while (stride > 1);
2490 }
2491
2492 static void osc_release_ppga(struct brw_page **ppga, size_t count)
2493 {
2494         LASSERT(ppga != NULL);
2495         OBD_FREE_PTR_ARRAY_LARGE(ppga, count);
2496 }
2497
2498 static int brw_interpret(const struct lu_env *env,
2499                          struct ptlrpc_request *req, void *args, int rc)
2500 {
2501         struct osc_brw_async_args *aa = args;
2502         struct osc_extent *ext;
2503         struct osc_extent *tmp;
2504         struct client_obd *cli = aa->aa_cli;
2505         unsigned long transferred = 0;
2506
2507         ENTRY;
2508
2509         rc = osc_brw_fini_request(req, rc);
2510         CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc);
2511
2512         /* restore clear text pages */
2513         osc_release_bounce_pages(aa->aa_ppga, aa->aa_page_count);
2514
2515         /*
2516          * When server returns -EINPROGRESS, client should always retry
2517          * regardless of the number of times the bulk was resent already.
2518          */
2519         if (osc_recoverable_error(rc) && !req->rq_no_delay) {
2520                 if (req->rq_import_generation !=
2521                     req->rq_import->imp_generation) {
2522                         CDEBUG(D_HA, "%s: resend cross eviction for object: "
2523                                ""DOSTID", rc = %d.\n",
2524                                req->rq_import->imp_obd->obd_name,
2525                                POSTID(&aa->aa_oa->o_oi), rc);
2526                 } else if (rc == -EINPROGRESS ||
2527                            client_should_resend(aa->aa_resends, aa->aa_cli)) {
2528                         rc = osc_brw_redo_request(req, aa, rc);
2529                 } else {
2530                         CERROR("%s: too many resent retries for object: "
2531                                "%llu:%llu, rc = %d.\n",
2532                                req->rq_import->imp_obd->obd_name,
2533                                POSTID(&aa->aa_oa->o_oi), rc);
2534                 }
2535
2536                 if (rc == 0)
2537                         RETURN(0);
2538                 else if (rc == -EAGAIN || rc == -EINPROGRESS)
2539                         rc = -EIO;
2540         }
2541
2542         if (rc == 0) {
2543                 struct obdo *oa = aa->aa_oa;
2544                 struct cl_attr *attr = &osc_env_info(env)->oti_attr;
2545                 unsigned long valid = 0;
2546                 struct cl_object *obj;
2547                 struct osc_async_page *last;
2548
2549                 last = brw_page2oap(aa->aa_ppga[aa->aa_page_count - 1]);
2550                 obj = osc2cl(last->oap_obj);
2551
2552                 cl_object_attr_lock(obj);
2553                 if (oa->o_valid & OBD_MD_FLBLOCKS) {
2554                         attr->cat_blocks = oa->o_blocks;
2555                         valid |= CAT_BLOCKS;
2556                 }
2557                 if (oa->o_valid & OBD_MD_FLMTIME) {
2558                         attr->cat_mtime = oa->o_mtime;
2559                         valid |= CAT_MTIME;
2560                 }
2561                 if (oa->o_valid & OBD_MD_FLATIME) {
2562                         attr->cat_atime = oa->o_atime;
2563                         valid |= CAT_ATIME;
2564                 }
2565                 if (oa->o_valid & OBD_MD_FLCTIME) {
2566                         attr->cat_ctime = oa->o_ctime;
2567                         valid |= CAT_CTIME;
2568                 }
2569
2570                 if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) {
2571                         struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo;
2572                         loff_t last_off = last->oap_count + last->oap_obj_off +
2573                                 last->oap_page_off;
2574
2575                         /* Change file size if this is an out of quota or
2576                          * direct IO write and it extends the file size */
2577                         if (loi->loi_lvb.lvb_size < last_off) {
2578                                 attr->cat_size = last_off;
2579                                 valid |= CAT_SIZE;
2580                         }
2581                         /* Extend KMS if it's not a lockless write */
2582                         if (loi->loi_kms < last_off &&
2583                             oap2osc_page(last)->ops_srvlock == 0) {
2584                                 attr->cat_kms = last_off;
2585                                 valid |= CAT_KMS;
2586                         }
2587                 }
2588
2589                 if (valid != 0)
2590                         cl_object_attr_update(env, obj, attr, valid);
2591                 cl_object_attr_unlock(obj);
2592         }
2593         OBD_SLAB_FREE_PTR(aa->aa_oa, osc_obdo_kmem);
2594         aa->aa_oa = NULL;
2595
2596         if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && rc == 0)
2597                 osc_inc_unstable_pages(req);
2598
2599         list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) {
2600                 list_del_init(&ext->oe_link);
2601                 osc_extent_finish(env, ext, 1,
2602                                   rc && req->rq_no_delay ? -EAGAIN : rc);
2603         }
2604         LASSERT(list_empty(&aa->aa_exts));
2605         LASSERT(list_empty(&aa->aa_oaps));
2606
2607         transferred = (req->rq_bulk == NULL ? /* short io */
2608                        aa->aa_requested_nob :
2609                        req->rq_bulk->bd_nob_transferred);
2610
2611         osc_release_ppga(aa->aa_ppga, aa->aa_page_count);
2612         ptlrpc_lprocfs_brw(req, transferred);
2613
2614         spin_lock(&cli->cl_loi_list_lock);
2615         /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters
2616          * is called so we know whether to go to sync BRWs or wait for more
2617          * RPCs to complete */
2618         if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE)
2619                 cli->cl_w_in_flight--;
2620         else
2621                 cli->cl_r_in_flight--;
2622         osc_wake_cache_waiters(cli);
2623         spin_unlock(&cli->cl_loi_list_lock);
2624
2625         osc_io_unplug(env, cli, NULL);
2626         RETURN(rc);
2627 }
2628
2629 static void brw_commit(struct ptlrpc_request *req)
2630 {
2631         /* If osc_inc_unstable_pages (via osc_extent_finish) races with
2632          * this called via the rq_commit_cb, I need to ensure
2633          * osc_dec_unstable_pages is still called. Otherwise unstable
2634          * pages may be leaked. */
2635         spin_lock(&req->rq_lock);
2636         if (likely(req->rq_unstable)) {
2637                 req->rq_unstable = 0;
2638                 spin_unlock(&req->rq_lock);
2639
2640                 osc_dec_unstable_pages(req);
2641         } else {
2642                 req->rq_committed = 1;
2643                 spin_unlock(&req->rq_lock);
2644         }
2645 }
2646
2647 /**
2648  * Build an RPC by the list of extent @ext_list. The caller must ensure
2649  * that the total pages in this list are NOT over max pages per RPC.
2650  * Extents in the list must be in OES_RPC state.
2651  */
2652 int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
2653                   struct list_head *ext_list, int cmd)
2654 {
2655         struct ptlrpc_request           *req = NULL;
2656         struct osc_extent               *ext;
2657         struct brw_page                 **pga = NULL;
2658         struct osc_brw_async_args       *aa = NULL;
2659         struct obdo                     *oa = NULL;
2660         struct osc_async_page           *oap;
2661         struct osc_object               *obj = NULL;
2662         struct cl_req_attr              *crattr = NULL;
2663         loff_t                          starting_offset = OBD_OBJECT_EOF;
2664         loff_t                          ending_offset = 0;
2665         /* '1' for consistency with code that checks !mpflag to restore */
2666         int mpflag = 1;
2667         int                             mem_tight = 0;
2668         int                             page_count = 0;
2669         bool                            soft_sync = false;
2670         bool                            ndelay = false;
2671         int                             i;
2672         int                             grant = 0;
2673         int                             rc;
2674         __u32                           layout_version = 0;
2675         LIST_HEAD(rpc_list);
2676         struct ost_body                 *body;
2677         ENTRY;
2678         LASSERT(!list_empty(ext_list));
2679
2680         /* add pages into rpc_list to build BRW rpc */
2681         list_for_each_entry(ext, ext_list, oe_link) {
2682                 LASSERT(ext->oe_state == OES_RPC);
2683                 mem_tight |= ext->oe_memalloc;
2684                 grant += ext->oe_grants;
2685                 page_count += ext->oe_nr_pages;
2686                 layout_version = max(layout_version, ext->oe_layout_version);
2687                 if (obj == NULL)
2688                         obj = ext->oe_obj;
2689         }
2690
2691         soft_sync = osc_over_unstable_soft_limit(cli);
2692         if (mem_tight)
2693                 mpflag = memalloc_noreclaim_save();
2694
2695         OBD_ALLOC_PTR_ARRAY_LARGE(pga, page_count);
2696         if (pga == NULL)
2697                 GOTO(out, rc = -ENOMEM);
2698
2699         OBD_SLAB_ALLOC_PTR_GFP(oa, osc_obdo_kmem, GFP_NOFS);
2700         if (oa == NULL)
2701                 GOTO(out, rc = -ENOMEM);
2702
2703         i = 0;
2704         list_for_each_entry(ext, ext_list, oe_link) {
2705                 list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
2706                         if (mem_tight)
2707                                 oap->oap_brw_flags |= OBD_BRW_MEMALLOC;
2708                         if (soft_sync)
2709                                 oap->oap_brw_flags |= OBD_BRW_SOFT_SYNC;
2710                         pga[i] = &oap->oap_brw_page;
2711                         pga[i]->off = oap->oap_obj_off + oap->oap_page_off;
2712                         i++;
2713
2714                         list_add_tail(&oap->oap_rpc_item, &rpc_list);
2715                         if (starting_offset == OBD_OBJECT_EOF ||
2716                             starting_offset > oap->oap_obj_off)
2717                                 starting_offset = oap->oap_obj_off;
2718                         else
2719                                 LASSERT(oap->oap_page_off == 0);
2720                         if (ending_offset < oap->oap_obj_off + oap->oap_count)
2721                                 ending_offset = oap->oap_obj_off +
2722                                                 oap->oap_count;
2723                         else
2724                                 LASSERT(oap->oap_page_off + oap->oap_count ==
2725                                         PAGE_SIZE);
2726                 }
2727                 if (ext->oe_ndelay)
2728                         ndelay = true;
2729         }
2730
2731         /* first page in the list */
2732         oap = list_first_entry(&rpc_list, typeof(*oap), oap_rpc_item);
2733
2734         crattr = &osc_env_info(env)->oti_req_attr;
2735         memset(crattr, 0, sizeof(*crattr));
2736         crattr->cra_type = (cmd & OBD_BRW_WRITE) ? CRT_WRITE : CRT_READ;
2737         crattr->cra_flags = ~0ULL;
2738         crattr->cra_page = oap2cl_page(oap);
2739         crattr->cra_oa = oa;
2740         cl_req_attr_set(env, osc2cl(obj), crattr);
2741
2742         if (cmd == OBD_BRW_WRITE) {
2743                 oa->o_grant_used = grant;
2744                 if (layout_version > 0) {
2745                         CDEBUG(D_LAYOUT, DFID": write with layout version %u\n",
2746                                PFID(&oa->o_oi.oi_fid), layout_version);
2747
2748                         oa->o_layout_version = layout_version;
2749                         oa->o_valid |= OBD_MD_LAYOUT_VERSION;
2750                 }
2751         }
2752
2753         sort_brw_pages(pga, page_count);
2754         rc = osc_brw_prep_request(cmd, cli, oa, page_count, pga, &req, 0);
2755         if (rc != 0) {
2756                 CERROR("prep_req failed: %d\n", rc);
2757                 GOTO(out, rc);
2758         }
2759
2760         req->rq_commit_cb = brw_commit;
2761         req->rq_interpret_reply = brw_interpret;
2762         req->rq_memalloc = mem_tight != 0;
2763         oap->oap_request = ptlrpc_request_addref(req);
2764         if (ndelay) {
2765                 req->rq_no_resend = req->rq_no_delay = 1;
2766                 /* probably set a shorter timeout value.
2767                  * to handle ETIMEDOUT in brw_interpret() correctly. */
2768                 /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */
2769         }
2770
2771         /* Need to update the timestamps after the request is built in case
2772          * we race with setattr (locally or in queue at OST).  If OST gets
2773          * later setattr before earlier BRW (as determined by the request xid),
2774          * the OST will not use BRW timestamps.  Sadly, there is no obvious
2775          * way to do this in a single call.  bug 10150 */
2776         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
2777         crattr->cra_oa = &body->oa;
2778         crattr->cra_flags = OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLATIME;
2779         cl_req_attr_set(env, osc2cl(obj), crattr);
2780         lustre_msg_set_uid_gid(req->rq_reqmsg, &crattr->cra_uid,
2781                                &crattr->cra_gid);
2782         lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid);
2783
2784         aa = ptlrpc_req_async_args(aa, req);
2785         INIT_LIST_HEAD(&aa->aa_oaps);
2786         list_splice_init(&rpc_list, &aa->aa_oaps);
2787         INIT_LIST_HEAD(&aa->aa_exts);
2788         list_splice_init(ext_list, &aa->aa_exts);
2789
2790         spin_lock(&cli->cl_loi_list_lock);
2791         starting_offset >>= PAGE_SHIFT;
2792         ending_offset >>= PAGE_SHIFT;
2793         if (cmd == OBD_BRW_READ) {
2794                 cli->cl_r_in_flight++;
2795                 lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count);
2796                 lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight);
2797                 lprocfs_oh_tally_log2(&cli->cl_read_offset_hist,
2798                                       starting_offset + 1);
2799         } else {
2800                 cli->cl_w_in_flight++;
2801                 lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count);
2802                 lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight);
2803                 lprocfs_oh_tally_log2(&cli->cl_write_offset_hist,
2804                                       starting_offset + 1);
2805         }
2806         spin_unlock(&cli->cl_loi_list_lock);
2807
2808         DEBUG_REQ(D_INODE, req, "%d pages, aa %p, now %ur/%uw in flight",
2809                   page_count, aa, cli->cl_r_in_flight, cli->cl_w_in_flight);
2810         if (libcfs_debug & D_IOTRACE) {
2811                 struct lu_fid fid;
2812
2813                 fid.f_seq = crattr->cra_oa->o_parent_seq;
2814                 fid.f_oid = crattr->cra_oa->o_parent_oid;
2815                 fid.f_ver = crattr->cra_oa->o_parent_ver;
2816                 CDEBUG(D_IOTRACE,
2817                        DFID": %d %s pages, start %lld, end %lld, now %ur/%uw in flight\n",
2818                        PFID(&fid), page_count,
2819                        cmd == OBD_BRW_READ ? "read" : "write", starting_offset,
2820                        ending_offset, cli->cl_r_in_flight, cli->cl_w_in_flight);
2821         }
2822         OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_IO, cfs_fail_val);
2823
2824         ptlrpcd_add_req(req);
2825         rc = 0;
2826         EXIT;
2827
2828 out:
2829         if (mem_tight)
2830                 memalloc_noreclaim_restore(mpflag);
2831
2832         if (rc != 0) {
2833                 LASSERT(req == NULL);
2834
2835                 if (oa)
2836                         OBD_SLAB_FREE_PTR(oa, osc_obdo_kmem);
2837                 if (pga) {
2838                         osc_release_bounce_pages(pga, page_count);
2839                         osc_release_ppga(pga, page_count);
2840                 }
2841                 /* this should happen rarely and is pretty bad, it makes the
2842                  * pending list not follow the dirty order
2843                  */
2844                 while ((ext = list_first_entry_or_null(ext_list,
2845                                                        struct osc_extent,
2846                                                        oe_link)) != NULL) {
2847                         list_del_init(&ext->oe_link);
2848                         osc_extent_finish(env, ext, 0, rc);
2849                 }
2850         }
2851         RETURN(rc);
2852 }
2853
2854 /* This is to refresh our lock in face of no RPCs. */
2855 void osc_send_empty_rpc(struct osc_object *osc, pgoff_t start)
2856 {
2857         struct ptlrpc_request *req;
2858         struct obdo oa;
2859         struct brw_page bpg = { .off = start, .count = 1};
2860         struct brw_page *pga = &bpg;
2861         int rc;
2862
2863         memset(&oa, 0, sizeof(oa));
2864         oa.o_oi = osc->oo_oinfo->loi_oi;
2865         oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLFLAGS;
2866         /* For updated servers - don't do a read */
2867         oa.o_flags = OBD_FL_NORPC;
2868
2869         rc = osc_brw_prep_request(OBD_BRW_READ, osc_cli(osc), &oa, 1, &pga,
2870                                   &req, 0);
2871
2872         /* If we succeeded we ship it off, if not there's no point in doing
2873          * anything. Also no resends.
2874          * No interpret callback, no commit callback.
2875          */
2876         if (!rc) {
2877                 req->rq_no_resend = 1;
2878                 ptlrpcd_add_req(req);
2879         }
2880 }
2881
2882 static int osc_set_lock_data(struct ldlm_lock *lock, void *data)
2883 {
2884         int set = 0;
2885
2886         LASSERT(lock != NULL);
2887
2888         lock_res_and_lock(lock);
2889
2890         if (lock->l_ast_data == NULL)
2891                 lock->l_ast_data = data;
2892         if (lock->l_ast_data == data)
2893                 set = 1;
2894
2895         unlock_res_and_lock(lock);
2896
2897         return set;
2898 }
2899
2900 int osc_enqueue_fini(struct ptlrpc_request *req, osc_enqueue_upcall_f upcall,
2901                      void *cookie, struct lustre_handle *lockh,
2902                      enum ldlm_mode mode, __u64 *flags, bool speculative,
2903                      int errcode)
2904 {
2905         bool intent = *flags & LDLM_FL_HAS_INTENT;
2906         int rc;
2907         ENTRY;
2908
2909         /* The request was created before ldlm_cli_enqueue call. */
2910         if (intent && errcode == ELDLM_LOCK_ABORTED) {
2911                 struct ldlm_reply *rep;
2912
2913                 rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
2914                 LASSERT(rep != NULL);
2915
2916                 rep->lock_policy_res1 =
2917                         ptlrpc_status_ntoh(rep->lock_policy_res1);
2918                 if (rep->lock_policy_res1)
2919                         errcode = rep->lock_policy_res1;
2920                 if (!speculative)
2921                         *flags |= LDLM_FL_LVB_READY;
2922         } else if (errcode == ELDLM_OK) {
2923                 *flags |= LDLM_FL_LVB_READY;
2924         }
2925
2926         /* Call the update callback. */
2927         rc = (*upcall)(cookie, lockh, errcode);
2928
2929         /* release the reference taken in ldlm_cli_enqueue() */
2930         if (errcode == ELDLM_LOCK_MATCHED)
2931                 errcode = ELDLM_OK;
2932         if (errcode == ELDLM_OK && lustre_handle_is_used(lockh))
2933                 ldlm_lock_decref(lockh, mode);
2934
2935         RETURN(rc);
2936 }
2937
2938 int osc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req,
2939                           void *args, int rc)
2940 {
2941         struct osc_enqueue_args *aa = args;
2942         struct ldlm_lock *lock;
2943         struct lustre_handle *lockh = &aa->oa_lockh;
2944         enum ldlm_mode mode = aa->oa_mode;
2945         struct ost_lvb *lvb = aa->oa_lvb;
2946         __u32 lvb_len = sizeof(*lvb);
2947         __u64 flags = 0;
2948         struct ldlm_enqueue_info einfo = {
2949                 .ei_type = aa->oa_type,
2950                 .ei_mode = mode,
2951         };
2952
2953         ENTRY;
2954
2955         /* ldlm_cli_enqueue is holding a reference on the lock, so it must
2956          * be valid. */
2957         lock = ldlm_handle2lock(lockh);
2958         LASSERTF(lock != NULL,
2959                  "lockh %#llx, req %p, aa %p - client evicted?\n",
2960                  lockh->cookie, req, aa);
2961
2962         /* Take an additional reference so that a blocking AST that
2963          * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed
2964          * to arrive after an upcall has been executed by
2965          * osc_enqueue_fini(). */
2966         ldlm_lock_addref(lockh, mode);
2967
2968         /* Let cl_lock_state_wait fail with -ERESTARTSYS to unuse sublocks. */
2969         OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_HANG, 2);
2970
2971         /* Let CP AST to grant the lock first. */
2972         OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1);
2973
2974         if (aa->oa_speculative) {
2975                 LASSERT(aa->oa_lvb == NULL);
2976                 LASSERT(aa->oa_flags == NULL);
2977                 aa->oa_flags = &flags;
2978         }
2979
2980         /* Complete obtaining the lock procedure. */
2981         rc = ldlm_cli_enqueue_fini(aa->oa_exp, &req->rq_pill, &einfo, 1,
2982                                    aa->oa_flags, lvb, lvb_len, lockh, rc,
2983                                    false);
2984         /* Complete osc stuff. */
2985         rc = osc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode,
2986                               aa->oa_flags, aa->oa_speculative, rc);
2987
2988         OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10);
2989
2990         ldlm_lock_decref(lockh, mode);
2991         LDLM_LOCK_PUT(lock);
2992         RETURN(rc);
2993 }
2994
2995 /* When enqueuing asynchronously, locks are not ordered, we can obtain a lock
2996  * from the 2nd OSC before a lock from the 1st one. This does not deadlock with
2997  * other synchronous requests, however keeping some locks and trying to obtain
2998  * others may take a considerable amount of time in a case of ost failure; and
2999  * when other sync requests do not get released lock from a client, the client
3000  * is evicted from the cluster -- such scenarious make the life difficult, so
3001  * release locks just after they are obtained. */
3002 int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id,
3003                      __u64 *flags, union ldlm_policy_data *policy,
3004                      struct ost_lvb *lvb, osc_enqueue_upcall_f upcall,
3005                      void *cookie, struct ldlm_enqueue_info *einfo,
3006                      struct ptlrpc_request_set *rqset, int async,
3007                      bool speculative)
3008 {
3009         struct obd_device *obd = exp->exp_obd;
3010         struct lustre_handle lockh = { 0 };
3011         struct ptlrpc_request *req = NULL;
3012         int intent = *flags & LDLM_FL_HAS_INTENT;
3013         __u64 search_flags = *flags;
3014         __u64 match_flags = 0;
3015         enum ldlm_mode mode;
3016         int rc;
3017         ENTRY;
3018
3019         /* Filesystem lock extents are extended to page boundaries so that
3020          * dealing with the page cache is a little smoother.  */
3021         policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK;
3022         policy->l_extent.end |= ~PAGE_MASK;
3023
3024         /* Next, search for already existing extent locks that will cover us */
3025         /* If we're trying to read, we also search for an existing PW lock.  The
3026          * VFS and page cache already protect us locally, so lots of readers/
3027          * writers can share a single PW lock.
3028          *
3029          * There are problems with conversion deadlocks, so instead of
3030          * converting a read lock to a write lock, we'll just enqueue a new
3031          * one.
3032          *
3033          * At some point we should cancel the read lock instead of making them
3034          * send us a blocking callback, but there are problems with canceling
3035          * locks out from other users right now, too. */
3036         mode = einfo->ei_mode;
3037         if (einfo->ei_mode == LCK_PR)
3038                 mode |= LCK_PW;
3039         /* Normal lock requests must wait for the LVB to be ready before
3040          * matching a lock; speculative lock requests do not need to,
3041          * because they will not actually use the lock. */
3042         if (!speculative)
3043                 search_flags |= LDLM_FL_LVB_READY;
3044         if (intent != 0)
3045                 search_flags |= LDLM_FL_BLOCK_GRANTED;
3046         if (mode == LCK_GROUP)
3047                 match_flags = LDLM_MATCH_GROUP;
3048         mode = ldlm_lock_match_with_skip(obd->obd_namespace, search_flags, 0,
3049                                          res_id, einfo->ei_type, policy, mode,
3050                                          &lockh, match_flags);
3051         if (mode) {
3052                 struct ldlm_lock *matched;
3053
3054                 if (*flags & LDLM_FL_TEST_LOCK)
3055                         RETURN(ELDLM_OK);
3056
3057                 matched = ldlm_handle2lock(&lockh);
3058                 if (speculative) {
3059                         /* This DLM lock request is speculative, and does not
3060                          * have an associated IO request. Therefore if there
3061                          * is already a DLM lock, it wll just inform the
3062                          * caller to cancel the request for this stripe.*/
3063                         lock_res_and_lock(matched);
3064                         if (ldlm_extent_equal(&policy->l_extent,
3065                             &matched->l_policy_data.l_extent))
3066                                 rc = -EEXIST;
3067                         else
3068                                 rc = -ECANCELED;
3069                         unlock_res_and_lock(matched);
3070
3071                         ldlm_lock_decref(&lockh, mode);
3072                         LDLM_LOCK_PUT(matched);
3073                         RETURN(rc);
3074                 } else if (osc_set_lock_data(matched, einfo->ei_cbdata)) {
3075                         *flags |= LDLM_FL_LVB_READY;
3076
3077                         /* We already have a lock, and it's referenced. */
3078                         (*upcall)(cookie, &lockh, ELDLM_LOCK_MATCHED);
3079
3080                         ldlm_lock_decref(&lockh, mode);
3081                         LDLM_LOCK_PUT(matched);
3082                         RETURN(ELDLM_OK);
3083                 } else {
3084                         ldlm_lock_decref(&lockh, mode);
3085                         LDLM_LOCK_PUT(matched);
3086                 }
3087         }
3088
3089         if (*flags & (LDLM_FL_TEST_LOCK | LDLM_FL_MATCH_LOCK))
3090                 RETURN(-ENOLCK);
3091
3092         /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */
3093         *flags &= ~LDLM_FL_BLOCK_GRANTED;
3094
3095         rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb,
3096                               sizeof(*lvb), LVB_T_OST, &lockh, async);
3097         if (async) {
3098                 if (!rc) {
3099                         struct osc_enqueue_args *aa;
3100                         aa = ptlrpc_req_async_args(aa, req);
3101                         aa->oa_exp         = exp;
3102                         aa->oa_mode        = einfo->ei_mode;
3103                         aa->oa_type        = einfo->ei_type;
3104                         lustre_handle_copy(&aa->oa_lockh, &lockh);
3105                         aa->oa_upcall      = upcall;
3106                         aa->oa_cookie      = cookie;
3107                         aa->oa_speculative = speculative;
3108                         if (!speculative) {
3109                                 aa->oa_flags  = flags;
3110                                 aa->oa_lvb    = lvb;
3111                         } else {
3112                                 /* speculative locks are essentially to enqueue
3113                                  * a DLM lock  in advance, so we don't care
3114                                  * about the result of the enqueue. */
3115                                 aa->oa_lvb    = NULL;
3116                                 aa->oa_flags  = NULL;
3117                         }
3118
3119                         req->rq_interpret_reply = osc_enqueue_interpret;
3120                         ptlrpc_set_add_req(rqset, req);
3121                 }
3122                 RETURN(rc);
3123         }
3124
3125         rc = osc_enqueue_fini(req, upcall, cookie, &lockh, einfo->ei_mode,
3126                               flags, speculative, rc);
3127
3128         RETURN(rc);
3129 }
3130
3131 int osc_match_base(const struct lu_env *env, struct obd_export *exp,
3132                    struct ldlm_res_id *res_id, enum ldlm_type type,
3133                    union ldlm_policy_data *policy, enum ldlm_mode mode,
3134                    __u64 *flags, struct osc_object *obj,
3135                    struct lustre_handle *lockh, enum ldlm_match_flags match_flags)
3136 {
3137         struct obd_device *obd = exp->exp_obd;
3138         __u64 lflags = *flags;
3139         enum ldlm_mode rc;
3140         ENTRY;
3141
3142         if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH))
3143                 RETURN(-EIO);
3144
3145         /* Filesystem lock extents are extended to page boundaries so that
3146          * dealing with the page cache is a little smoother */
3147         policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK;
3148         policy->l_extent.end |= ~PAGE_MASK;
3149
3150         /* Next, search for already existing extent locks that will cover us */
3151         rc = ldlm_lock_match_with_skip(obd->obd_namespace, lflags, 0,
3152                                         res_id, type, policy, mode, lockh,
3153                                         match_flags);
3154         if (rc == 0 || lflags & LDLM_FL_TEST_LOCK)
3155                 RETURN(rc);
3156
3157         if (obj != NULL) {
3158                 struct ldlm_lock *lock = ldlm_handle2lock(lockh);
3159
3160                 LASSERT(lock != NULL);
3161                 if (osc_set_lock_data(lock, obj)) {
3162                         lock_res_and_lock(lock);
3163                         if (!ldlm_is_lvb_cached(lock)) {
3164                                 LASSERT(lock->l_ast_data == obj);
3165                                 osc_lock_lvb_update(env, obj, lock, NULL);
3166                                 ldlm_set_lvb_cached(lock);
3167                         }
3168                         unlock_res_and_lock(lock);
3169                 } else {
3170                         ldlm_lock_decref(lockh, rc);
3171                         rc = 0;
3172                 }
3173                 LDLM_LOCK_PUT(lock);
3174         }
3175         RETURN(rc);
3176 }
3177
3178 static int osc_statfs_interpret(const struct lu_env *env,
3179                                 struct ptlrpc_request *req, void *args, int rc)
3180 {
3181         struct osc_async_args *aa = args;
3182         struct obd_statfs *msfs;
3183
3184         ENTRY;
3185         if (rc == -EBADR)
3186                 /*
3187                  * The request has in fact never been sent due to issues at
3188                  * a higher level (LOV).  Exit immediately since the caller
3189                  * is aware of the problem and takes care of the clean up.
3190                  */
3191                 RETURN(rc);
3192
3193         if ((rc == -ENOTCONN || rc == -EAGAIN) &&
3194             (aa->aa_oi->oi_flags & OBD_STATFS_NODELAY))
3195                 GOTO(out, rc = 0);
3196
3197         if (rc != 0)
3198                 GOTO(out, rc);
3199
3200         msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
3201         if (msfs == NULL)
3202                 GOTO(out, rc = -EPROTO);
3203
3204         *aa->aa_oi->oi_osfs = *msfs;
3205 out:
3206         rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
3207
3208         RETURN(rc);
3209 }
3210
3211 static int osc_statfs_async(struct obd_export *exp,
3212                             struct obd_info *oinfo, time64_t max_age,
3213                             struct ptlrpc_request_set *rqset)
3214 {
3215         struct obd_device     *obd = class_exp2obd(exp);
3216         struct ptlrpc_request *req;
3217         struct osc_async_args *aa;
3218         int rc;
3219         ENTRY;
3220
3221         if (obd->obd_osfs_age >= max_age) {
3222                 CDEBUG(D_SUPER,
3223                        "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n",
3224                        obd->obd_name, &obd->obd_osfs,
3225                        obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks,
3226                        obd->obd_osfs.os_ffree, obd->obd_osfs.os_files);
3227                 spin_lock(&obd->obd_osfs_lock);
3228                 memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs));
3229                 spin_unlock(&obd->obd_osfs_lock);
3230                 oinfo->oi_flags |= OBD_STATFS_FROM_CACHE;
3231                 if (oinfo->oi_cb_up)
3232                         oinfo->oi_cb_up(oinfo, 0);
3233
3234                 RETURN(0);
3235         }
3236
3237         /* We could possibly pass max_age in the request (as an absolute
3238          * timestamp or a "seconds.usec ago") so the target can avoid doing
3239          * extra calls into the filesystem if that isn't necessary (e.g.
3240          * during mount that would help a bit).  Having relative timestamps
3241          * is not so great if request processing is slow, while absolute
3242          * timestamps are not ideal because they need time synchronization. */
3243         req = ptlrpc_request_alloc(obd->u.cli.cl_import, &RQF_OST_STATFS);
3244         if (req == NULL)
3245                 RETURN(-ENOMEM);
3246
3247         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
3248         if (rc) {
3249                 ptlrpc_request_free(req);
3250                 RETURN(rc);
3251         }
3252         ptlrpc_request_set_replen(req);
3253         req->rq_request_portal = OST_CREATE_PORTAL;
3254         ptlrpc_at_set_req_timeout(req);
3255
3256         if (oinfo->oi_flags & OBD_STATFS_NODELAY) {
3257                 /* procfs requests not want stat in wait for avoid deadlock */
3258                 req->rq_no_resend = 1;
3259                 req->rq_no_delay = 1;
3260         }
3261
3262         req->rq_interpret_reply = osc_statfs_interpret;
3263         aa = ptlrpc_req_async_args(aa, req);
3264         aa->aa_oi = oinfo;
3265
3266         ptlrpc_set_add_req(rqset, req);
3267         RETURN(0);
3268 }
3269
3270 static int osc_statfs(const struct lu_env *env, struct obd_export *exp,
3271                       struct obd_statfs *osfs, time64_t max_age, __u32 flags)
3272 {
3273         struct obd_device     *obd = class_exp2obd(exp);
3274         struct obd_statfs     *msfs;
3275         struct ptlrpc_request *req;
3276         struct obd_import     *imp, *imp0;
3277         int rc;
3278         ENTRY;
3279
3280         /*Since the request might also come from lprocfs, so we need
3281          *sync this with client_disconnect_export Bug15684
3282          */
3283         with_imp_locked(obd, imp0, rc)
3284                 imp = class_import_get(imp0);
3285         if (rc)
3286                 RETURN(rc);
3287
3288         /* We could possibly pass max_age in the request (as an absolute
3289          * timestamp or a "seconds.usec ago") so the target can avoid doing
3290          * extra calls into the filesystem if that isn't necessary (e.g.
3291          * during mount that would help a bit).  Having relative timestamps
3292          * is not so great if request processing is slow, while absolute
3293          * timestamps are not ideal because they need time synchronization. */
3294         req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS);
3295
3296         class_import_put(imp);
3297
3298         if (req == NULL)
3299                 RETURN(-ENOMEM);
3300
3301         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
3302         if (rc) {
3303                 ptlrpc_request_free(req);
3304                 RETURN(rc);
3305         }
3306         ptlrpc_request_set_replen(req);
3307         req->rq_request_portal = OST_CREATE_PORTAL;
3308         ptlrpc_at_set_req_timeout(req);
3309
3310         if (flags & OBD_STATFS_NODELAY) {
3311                 /* procfs requests not want stat in wait for avoid deadlock */
3312                 req->rq_no_resend = 1;
3313                 req->rq_no_delay = 1;
3314         }
3315
3316         rc = ptlrpc_queue_wait(req);
3317         if (rc)
3318                 GOTO(out, rc);
3319
3320         msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
3321         if (msfs == NULL)
3322                 GOTO(out, rc = -EPROTO);
3323
3324         *osfs = *msfs;
3325
3326         EXIT;
3327 out:
3328         ptlrpc_req_finished(req);
3329         return rc;
3330 }
3331
3332 static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
3333                          void *karg, void __user *uarg)
3334 {
3335         struct obd_device *obd = exp->exp_obd;
3336         struct obd_ioctl_data *data = karg;
3337         int rc = 0;
3338
3339         ENTRY;
3340         CDEBUG(D_IOCTL, "%s: cmd=%x len=%u karg=%pK uarg=%pK\n",
3341                obd->obd_name, cmd, len, karg, uarg);
3342
3343         if (!try_module_get(THIS_MODULE)) {
3344                 CERROR("%s: cannot get module '%s'\n", obd->obd_name,
3345                        module_name(THIS_MODULE));
3346                 return -EINVAL;
3347         }
3348         switch (cmd) {
3349         case OBD_IOC_CLIENT_RECOVER:
3350                 rc = ptlrpc_recover_import(obd->u.cli.cl_import,
3351                                            data->ioc_inlbuf1, 0);
3352                 if (rc > 0)
3353                         rc = 0;
3354                 break;
3355         case OBD_IOC_GETATTR:
3356                 rc = obd_getattr(NULL, exp, &data->ioc_obdo1);
3357                 break;
3358         case IOC_OSC_SET_ACTIVE:
3359                 rc = ptlrpc_set_import_active(obd->u.cli.cl_import,
3360                                               data->ioc_offset);
3361                 break;
3362         default:
3363                 rc = OBD_IOC_DEBUG(D_IOCTL, obd->obd_name, cmd, "unrecognized",
3364                                    -ENOTTY);
3365                 break;
3366         }
3367
3368         module_put(THIS_MODULE);
3369         return rc;
3370 }
3371
3372 int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
3373                        u32 keylen, void *key, u32 vallen, void *val,
3374                        struct ptlrpc_request_set *set)
3375 {
3376         struct ptlrpc_request *req;
3377         struct obd_device     *obd = exp->exp_obd;
3378         struct obd_import     *imp = class_exp2cliimp(exp);
3379         char                  *tmp;
3380         int                    rc;
3381         ENTRY;
3382
3383         OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10);
3384
3385         if (KEY_IS(KEY_CHECKSUM)) {
3386                 if (vallen != sizeof(int))
3387                         RETURN(-EINVAL);
3388                 exp->exp_obd->u.cli.cl_checksum = (*(int *)val) ? 1 : 0;
3389                 RETURN(0);
3390         }
3391
3392         if (KEY_IS(KEY_SPTLRPC_CONF)) {
3393                 sptlrpc_conf_client_adapt(obd);
3394                 RETURN(0);
3395         }
3396
3397         if (KEY_IS(KEY_FLUSH_CTX)) {
3398                 sptlrpc_import_flush_my_ctx(imp);
3399                 RETURN(0);
3400         }
3401
3402         if (KEY_IS(KEY_CACHE_LRU_SHRINK)) {
3403                 struct client_obd *cli = &obd->u.cli;
3404                 long nr = atomic_long_read(&cli->cl_lru_in_list) >> 1;
3405                 long target = *(long *)val;
3406
3407                 nr = osc_lru_shrink(env, cli, min(nr, target), true);
3408                 *(long *)val -= nr;
3409                 RETURN(0);
3410         }
3411
3412         if (!set && !KEY_IS(KEY_GRANT_SHRINK))
3413                 RETURN(-EINVAL);
3414
3415         /* We pass all other commands directly to OST. Since nobody calls osc
3416            methods directly and everybody is supposed to go through LOV, we
3417            assume lov checked invalid values for us.
3418            The only recognised values so far are evict_by_nid and mds_conn.
3419            Even if something bad goes through, we'd get a -EINVAL from OST
3420            anyway. */
3421
3422         req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ?
3423                                                 &RQF_OST_SET_GRANT_INFO :
3424                                                 &RQF_OBD_SET_INFO);
3425         if (req == NULL)
3426                 RETURN(-ENOMEM);
3427
3428         req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
3429                              RCL_CLIENT, keylen);
3430         if (!KEY_IS(KEY_GRANT_SHRINK))
3431                 req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL,
3432                                      RCL_CLIENT, vallen);
3433         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO);
3434         if (rc) {
3435                 ptlrpc_request_free(req);
3436                 RETURN(rc);
3437         }
3438
3439         tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
3440         memcpy(tmp, key, keylen);
3441         tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ?
3442                                                         &RMF_OST_BODY :
3443                                                         &RMF_SETINFO_VAL);
3444         memcpy(tmp, val, vallen);
3445
3446         if (KEY_IS(KEY_GRANT_SHRINK)) {
3447                 struct osc_grant_args *aa;
3448                 struct obdo *oa;
3449
3450                 aa = ptlrpc_req_async_args(aa, req);
3451                 OBD_SLAB_ALLOC_PTR_GFP(oa, osc_obdo_kmem, GFP_NOFS);
3452                 if (!oa) {
3453                         ptlrpc_req_finished(req);
3454                         RETURN(-ENOMEM);
3455                 }
3456                 *oa = ((struct ost_body *)val)->oa;
3457                 aa->aa_oa = oa;
3458                 req->rq_interpret_reply = osc_shrink_grant_interpret;
3459         }
3460
3461         ptlrpc_request_set_replen(req);
3462         if (!KEY_IS(KEY_GRANT_SHRINK)) {
3463                 LASSERT(set != NULL);
3464                 ptlrpc_set_add_req(set, req);
3465                 ptlrpc_check_set(NULL, set);
3466         } else {
3467                 ptlrpcd_add_req(req);
3468         }
3469
3470         RETURN(0);
3471 }
3472 EXPORT_SYMBOL(osc_set_info_async);
3473
3474 int osc_reconnect(const struct lu_env *env, struct obd_export *exp,
3475                   struct obd_device *obd, struct obd_uuid *cluuid,
3476                   struct obd_connect_data *data, void *localdata)
3477 {
3478         struct client_obd *cli = &obd->u.cli;
3479
3480         if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
3481                 long lost_grant;
3482                 long grant;
3483
3484                 spin_lock(&cli->cl_loi_list_lock);
3485                 grant = cli->cl_avail_grant + cli->cl_reserved_grant;
3486                 if (data->ocd_connect_flags & OBD_CONNECT_GRANT_PARAM) {
3487                         /* restore ocd_grant_blkbits as client page bits */
3488                         data->ocd_grant_blkbits = PAGE_SHIFT;
3489                         grant += cli->cl_dirty_grant;
3490                 } else {
3491                         grant += cli->cl_dirty_pages << PAGE_SHIFT;
3492                 }
3493                 data->ocd_grant = grant ? : 2 * cli_brw_size(obd);
3494                 lost_grant = cli->cl_lost_grant;
3495                 cli->cl_lost_grant = 0;
3496                 spin_unlock(&cli->cl_loi_list_lock);
3497
3498                 CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d"
3499                        " ocd_grant: %d, lost: %ld.\n", data->ocd_connect_flags,
3500                        data->ocd_version, data->ocd_grant, lost_grant);
3501         }
3502
3503         RETURN(0);
3504 }
3505 EXPORT_SYMBOL(osc_reconnect);
3506
3507 int osc_disconnect(struct obd_export *exp)
3508 {
3509         struct obd_device *obd = class_exp2obd(exp);
3510         int rc;
3511
3512         rc = client_disconnect_export(exp);
3513         /**
3514          * Initially we put del_shrink_grant before disconnect_export, but it
3515          * causes the following problem if setup (connect) and cleanup
3516          * (disconnect) are tangled together.
3517          *      connect p1                     disconnect p2
3518          *   ptlrpc_connect_import
3519          *     ...............               class_manual_cleanup
3520          *                                     osc_disconnect
3521          *                                     del_shrink_grant
3522          *   ptlrpc_connect_interrupt
3523          *     osc_init_grant
3524          *   add this client to shrink list
3525          *                                      cleanup_osc
3526          * Bang! grant shrink thread trigger the shrink. BUG18662
3527          */
3528         osc_del_grant_list(&obd->u.cli);
3529         return rc;
3530 }
3531 EXPORT_SYMBOL(osc_disconnect);
3532
3533 int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd,
3534                                  struct hlist_node *hnode, void *arg)
3535 {
3536         struct lu_env *env = arg;
3537         struct ldlm_resource *res = cfs_hash_object(hs, hnode);
3538         struct ldlm_lock *lock;
3539         struct osc_object *osc = NULL;
3540         ENTRY;
3541
3542         lock_res(res);
3543         list_for_each_entry(lock, &res->lr_granted, l_res_link) {
3544                 if (lock->l_ast_data != NULL && osc == NULL) {
3545                         osc = lock->l_ast_data;
3546                         cl_object_get(osc2cl(osc));
3547                 }
3548
3549                 /* clear LDLM_FL_CLEANED flag to make sure it will be canceled
3550                  * by the 2nd round of ldlm_namespace_clean() call in
3551                  * osc_import_event(). */
3552                 ldlm_clear_cleaned(lock);
3553         }
3554         unlock_res(res);
3555
3556         if (osc != NULL) {
3557                 osc_object_invalidate(env, osc);
3558                 cl_object_put(env, osc2cl(osc));
3559         }
3560
3561         RETURN(0);
3562 }
3563 EXPORT_SYMBOL(osc_ldlm_resource_invalidate);
3564
3565 static int osc_import_event(struct obd_device *obd,
3566                             struct obd_import *imp,
3567                             enum obd_import_event event)
3568 {
3569         struct client_obd *cli;
3570         int rc = 0;
3571
3572         ENTRY;
3573         LASSERT(imp->imp_obd == obd);
3574
3575         switch (event) {
3576         case IMP_EVENT_DISCON: {
3577                 cli = &obd->u.cli;
3578                 spin_lock(&cli->cl_loi_list_lock);
3579                 cli->cl_avail_grant = 0;
3580                 cli->cl_lost_grant = 0;
3581                 spin_unlock(&cli->cl_loi_list_lock);
3582                 break;
3583         }
3584         case IMP_EVENT_INACTIVE: {
3585                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE);
3586                 break;
3587         }
3588         case IMP_EVENT_INVALIDATE: {
3589                 struct ldlm_namespace *ns = obd->obd_namespace;
3590                 struct lu_env         *env;
3591                 __u16                  refcheck;
3592
3593                 ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
3594
3595                 env = cl_env_get(&refcheck);
3596                 if (!IS_ERR(env)) {
3597                         osc_io_unplug(env, &obd->u.cli, NULL);
3598
3599                         cfs_hash_for_each_nolock(ns->ns_rs_hash,
3600                                                  osc_ldlm_resource_invalidate,
3601                                                  env, 0);
3602                         cl_env_put(env, &refcheck);
3603
3604                         ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
3605                 } else
3606                         rc = PTR_ERR(env);
3607                 break;
3608         }
3609         case IMP_EVENT_ACTIVE: {
3610                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE);
3611                 break;
3612         }
3613         case IMP_EVENT_OCD: {
3614                 struct obd_connect_data *ocd = &imp->imp_connect_data;
3615
3616                 if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT)
3617                         osc_init_grant(&obd->u.cli, ocd);
3618
3619                 /* See bug 7198 */
3620                 if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL)
3621                         imp->imp_client->cli_request_portal =OST_REQUEST_PORTAL;
3622
3623                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD);
3624                 break;
3625         }
3626         case IMP_EVENT_DEACTIVATE: {
3627                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DEACTIVATE);
3628                 break;
3629         }
3630         case IMP_EVENT_ACTIVATE: {
3631                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVATE);
3632                 break;
3633         }
3634         default:
3635                 CERROR("Unknown import event %d\n", event);
3636                 LBUG();
3637         }
3638         RETURN(rc);
3639 }
3640
3641 /**
3642  * Determine whether the lock can be canceled before replaying the lock
3643  * during recovery, see bug16774 for detailed information.
3644  *
3645  * \retval zero the lock can't be canceled
3646  * \retval other ok to cancel
3647  */
3648 static int osc_cancel_weight(struct ldlm_lock *lock)
3649 {
3650         /*
3651          * Cancel all unused and granted extent lock.
3652          */
3653         if (lock->l_resource->lr_type == LDLM_EXTENT &&
3654             ldlm_is_granted(lock) &&
3655             osc_ldlm_weigh_ast(lock) == 0)
3656                 RETURN(1);
3657
3658         RETURN(0);
3659 }
3660
3661 static int brw_queue_work(const struct lu_env *env, void *data)
3662 {
3663         struct client_obd *cli = data;
3664
3665         CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli);
3666
3667         osc_io_unplug(env, cli, NULL);
3668         RETURN(0);
3669 }
3670
3671 int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg)
3672 {
3673         struct client_obd *cli = &obd->u.cli;
3674         void *handler;
3675         int rc;
3676
3677         ENTRY;
3678
3679         rc = ptlrpcd_addref();
3680         if (rc)
3681                 RETURN(rc);
3682
3683         rc = client_obd_setup(obd, lcfg);
3684         if (rc)
3685                 GOTO(out_ptlrpcd, rc);
3686
3687
3688         handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli);
3689         if (IS_ERR(handler))
3690                 GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler));
3691         cli->cl_writeback_work = handler;
3692
3693         handler = ptlrpcd_alloc_work(cli->cl_import, lru_queue_work, cli);
3694         if (IS_ERR(handler))
3695                 GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler));
3696         cli->cl_lru_work = handler;
3697
3698         rc = osc_quota_setup(obd);
3699         if (rc)
3700                 GOTO(out_ptlrpcd_work, rc);
3701
3702         cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL;
3703         cli->cl_root_squash = 0;
3704         osc_update_next_shrink(cli);
3705
3706         RETURN(rc);
3707
3708 out_ptlrpcd_work:
3709         if (cli->cl_writeback_work != NULL) {
3710                 ptlrpcd_destroy_work(cli->cl_writeback_work);
3711                 cli->cl_writeback_work = NULL;
3712         }
3713         if (cli->cl_lru_work != NULL) {
3714                 ptlrpcd_destroy_work(cli->cl_lru_work);
3715                 cli->cl_lru_work = NULL;
3716         }
3717         client_obd_cleanup(obd);
3718 out_ptlrpcd:
3719         ptlrpcd_decref();
3720         RETURN(rc);
3721 }
3722 EXPORT_SYMBOL(osc_setup_common);
3723
3724 int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
3725 {
3726         struct client_obd *cli = &obd->u.cli;
3727         int                adding;
3728         int                added;
3729         int                req_count;
3730         int                rc;
3731
3732         ENTRY;
3733
3734         rc = osc_setup_common(obd, lcfg);
3735         if (rc < 0)
3736                 RETURN(rc);
3737
3738         rc = osc_tunables_init(obd);
3739         if (rc)
3740                 RETURN(rc);
3741
3742         /*
3743          * We try to control the total number of requests with a upper limit
3744          * osc_reqpool_maxreqcount. There might be some race which will cause
3745          * over-limit allocation, but it is fine.
3746          */
3747         req_count = atomic_read(&osc_pool_req_count);
3748         if (req_count < osc_reqpool_maxreqcount) {
3749                 adding = cli->cl_max_rpcs_in_flight + 2;
3750                 if (req_count + adding > osc_reqpool_maxreqcount)
3751                         adding = osc_reqpool_maxreqcount - req_count;
3752
3753                 added = ptlrpc_add_rqs_to_pool(osc_rq_pool, adding);
3754                 atomic_add(added, &osc_pool_req_count);
3755         }
3756
3757         ns_register_cancel(obd->obd_namespace, osc_cancel_weight);
3758
3759         spin_lock(&osc_shrink_lock);
3760         list_add_tail(&cli->cl_shrink_list, &osc_shrink_list);
3761         spin_unlock(&osc_shrink_lock);
3762         cli->cl_import->imp_idle_timeout = osc_idle_timeout;
3763         cli->cl_import->imp_idle_debug = D_HA;
3764
3765         RETURN(0);
3766 }
3767
3768 int osc_precleanup_common(struct obd_device *obd)
3769 {
3770         struct client_obd *cli = &obd->u.cli;
3771         ENTRY;
3772
3773         /* LU-464
3774          * for echo client, export may be on zombie list, wait for
3775          * zombie thread to cull it, because cli.cl_import will be
3776          * cleared in client_disconnect_export():
3777          *   class_export_destroy() -> obd_cleanup() ->
3778          *   echo_device_free() -> echo_client_cleanup() ->
3779          *   obd_disconnect() -> osc_disconnect() ->
3780          *   client_disconnect_export()
3781          */
3782         obd_zombie_barrier();
3783         if (cli->cl_writeback_work) {
3784                 ptlrpcd_destroy_work(cli->cl_writeback_work);
3785                 cli->cl_writeback_work = NULL;
3786         }
3787
3788         if (cli->cl_lru_work) {
3789                 ptlrpcd_destroy_work(cli->cl_lru_work);
3790                 cli->cl_lru_work = NULL;
3791         }
3792
3793         obd_cleanup_client_import(obd);
3794         RETURN(0);
3795 }
3796 EXPORT_SYMBOL(osc_precleanup_common);
3797
3798 static int osc_precleanup(struct obd_device *obd)
3799 {
3800         ENTRY;
3801
3802         osc_precleanup_common(obd);
3803
3804         ptlrpc_lprocfs_unregister_obd(obd);
3805         RETURN(0);
3806 }
3807
3808 int osc_cleanup_common(struct obd_device *obd)
3809 {
3810         struct client_obd *cli = &obd->u.cli;
3811         int rc;
3812
3813         ENTRY;
3814
3815         spin_lock(&osc_shrink_lock);
3816         list_del(&cli->cl_shrink_list);
3817         spin_unlock(&osc_shrink_lock);
3818
3819         /* lru cleanup */
3820         if (cli->cl_cache != NULL) {
3821                 LASSERT(refcount_read(&cli->cl_cache->ccc_users) > 0);
3822                 spin_lock(&cli->cl_cache->ccc_lru_lock);
3823                 list_del_init(&cli->cl_lru_osc);
3824                 spin_unlock(&cli->cl_cache->ccc_lru_lock);
3825                 cli->cl_lru_left = NULL;
3826                 cl_cache_decref(cli->cl_cache);
3827                 cli->cl_cache = NULL;
3828         }
3829
3830         /* free memory of osc quota cache */
3831         osc_quota_cleanup(obd);
3832
3833         rc = client_obd_cleanup(obd);
3834
3835         ptlrpcd_decref();
3836         RETURN(rc);
3837 }
3838 EXPORT_SYMBOL(osc_cleanup_common);
3839
3840 static const struct obd_ops osc_obd_ops = {
3841         .o_owner                = THIS_MODULE,
3842         .o_setup                = osc_setup,
3843         .o_precleanup           = osc_precleanup,
3844         .o_cleanup              = osc_cleanup_common,
3845         .o_add_conn             = client_import_add_conn,
3846         .o_del_conn             = client_import_del_conn,
3847         .o_connect              = client_connect_import,
3848         .o_reconnect            = osc_reconnect,
3849         .o_disconnect           = osc_disconnect,
3850         .o_statfs               = osc_statfs,
3851         .o_statfs_async         = osc_statfs_async,
3852         .o_create               = osc_create,
3853         .o_destroy              = osc_destroy,
3854         .o_getattr              = osc_getattr,
3855         .o_setattr              = osc_setattr,
3856         .o_iocontrol            = osc_iocontrol,
3857         .o_set_info_async       = osc_set_info_async,
3858         .o_import_event         = osc_import_event,
3859         .o_quotactl             = osc_quotactl,
3860 };
3861
3862 LIST_HEAD(osc_shrink_list);
3863 DEFINE_SPINLOCK(osc_shrink_lock);
3864
3865 #ifdef HAVE_SHRINKER_COUNT
3866 static struct shrinker osc_cache_shrinker = {
3867         .count_objects  = osc_cache_shrink_count,
3868         .scan_objects   = osc_cache_shrink_scan,
3869         .seeks          = DEFAULT_SEEKS,
3870 };
3871 #else
3872 static int osc_cache_shrink(struct shrinker *shrinker,
3873                             struct shrink_control *sc)
3874 {
3875         (void)osc_cache_shrink_scan(shrinker, sc);
3876
3877         return osc_cache_shrink_count(shrinker, sc);
3878 }
3879
3880 static struct shrinker osc_cache_shrinker = {
3881         .shrink   = osc_cache_shrink,
3882         .seeks    = DEFAULT_SEEKS,
3883 };
3884 #endif
3885
3886 static int __init osc_init(void)
3887 {
3888         unsigned int reqpool_size;
3889         unsigned int reqsize;
3890         int rc;
3891         ENTRY;
3892
3893         /* print an address of _any_ initialized kernel symbol from this
3894          * module, to allow debugging with gdb that doesn't support data
3895          * symbols from modules.*/
3896         CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches);
3897
3898         rc = lu_kmem_init(osc_caches);
3899         if (rc)
3900                 RETURN(rc);
3901
3902         rc = register_shrinker(&osc_cache_shrinker);
3903         if (rc)
3904                 GOTO(out_kmem, rc);
3905
3906         /* This is obviously too much memory, only prevent overflow here */
3907         if (osc_reqpool_mem_max >= 1 << 12 || osc_reqpool_mem_max == 0)
3908                 GOTO(out_shrinker, rc = -EINVAL);
3909
3910         reqpool_size = osc_reqpool_mem_max << 20;
3911
3912         reqsize = 1;
3913         while (reqsize < OST_IO_MAXREQSIZE)
3914                 reqsize = reqsize << 1;
3915
3916         /*
3917          * We don't enlarge the request count in OSC pool according to
3918          * cl_max_rpcs_in_flight. The allocation from the pool will only be
3919          * tried after normal allocation failed. So a small OSC pool won't
3920          * cause much performance degression in most of cases.
3921          */
3922         osc_reqpool_maxreqcount = reqpool_size / reqsize;
3923
3924         atomic_set(&osc_pool_req_count, 0);
3925         osc_rq_pool = ptlrpc_init_rq_pool(0, OST_IO_MAXREQSIZE,
3926                                           ptlrpc_add_rqs_to_pool);
3927
3928         if (osc_rq_pool == NULL)
3929                 GOTO(out_shrinker, rc = -ENOMEM);
3930
3931         rc = osc_start_grant_work();
3932         if (rc != 0)
3933                 GOTO(out_req_pool, rc);
3934
3935         rc = class_register_type(&osc_obd_ops, NULL, true,
3936                                  LUSTRE_OSC_NAME, &osc_device_type);
3937         if (rc < 0)
3938                 GOTO(out_stop_grant, rc);
3939
3940         RETURN(rc);
3941
3942 out_stop_grant:
3943         osc_stop_grant_work();
3944 out_req_pool:
3945         ptlrpc_free_rq_pool(osc_rq_pool);
3946 out_shrinker:
3947         unregister_shrinker(&osc_cache_shrinker);
3948 out_kmem:
3949         lu_kmem_fini(osc_caches);
3950
3951         RETURN(rc);
3952 }
3953
3954 static void __exit osc_exit(void)
3955 {
3956         class_unregister_type(LUSTRE_OSC_NAME);
3957         ptlrpc_free_rq_pool(osc_rq_pool);
3958         osc_stop_grant_work();
3959         unregister_shrinker(&osc_cache_shrinker);
3960         lu_kmem_fini(osc_caches);
3961 }
3962
3963 MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>");
3964 MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)");
3965 MODULE_VERSION(LUSTRE_VERSION_STRING);
3966 MODULE_LICENSE("GPL");
3967
3968 module_init(osc_init);
3969 module_exit(osc_exit);