Whamcloud - gitweb
LU-13212 osc: fall back to vmalloc for large RPCs
[fs/lustre-release.git] / lustre / osc / osc_request.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2011, 2017, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  * Lustre is a trademark of Sun Microsystems, Inc.
31  */
32
33 #define DEBUG_SUBSYSTEM S_OSC
34
35 #include <linux/workqueue.h>
36 #include <libcfs/libcfs.h>
37 #include <linux/falloc.h>
38 #include <lprocfs_status.h>
39 #include <lustre_dlm.h>
40 #include <lustre_fid.h>
41 #include <lustre_ha.h>
42 #include <uapi/linux/lustre/lustre_ioctl.h>
43 #include <lustre_net.h>
44 #include <lustre_obdo.h>
45 #include <obd.h>
46 #include <obd_cksum.h>
47 #include <obd_class.h>
48 #include <lustre_osc.h>
49 #include <linux/falloc.h>
50
51 #include "osc_internal.h"
52
53 atomic_t osc_pool_req_count;
54 unsigned int osc_reqpool_maxreqcount;
55 struct ptlrpc_request_pool *osc_rq_pool;
56
57 /* max memory used for request pool, unit is MB */
58 static unsigned int osc_reqpool_mem_max = 5;
59 module_param(osc_reqpool_mem_max, uint, 0444);
60
61 static int osc_idle_timeout = 20;
62 module_param(osc_idle_timeout, uint, 0644);
63
64 #define osc_grant_args osc_brw_async_args
65
66 struct osc_setattr_args {
67         struct obdo             *sa_oa;
68         obd_enqueue_update_f     sa_upcall;
69         void                    *sa_cookie;
70 };
71
72 struct osc_fsync_args {
73         struct osc_object       *fa_obj;
74         struct obdo             *fa_oa;
75         obd_enqueue_update_f    fa_upcall;
76         void                    *fa_cookie;
77 };
78
79 struct osc_ladvise_args {
80         struct obdo             *la_oa;
81         obd_enqueue_update_f     la_upcall;
82         void                    *la_cookie;
83 };
84
85 static void osc_release_ppga(struct brw_page **ppga, size_t count);
86 static int brw_interpret(const struct lu_env *env, struct ptlrpc_request *req,
87                          void *data, int rc);
88
89 void osc_pack_req_body(struct ptlrpc_request *req, struct obdo *oa)
90 {
91         struct ost_body *body;
92
93         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
94         LASSERT(body);
95
96         lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
97 }
98
99 static int osc_getattr(const struct lu_env *env, struct obd_export *exp,
100                        struct obdo *oa)
101 {
102         struct ptlrpc_request   *req;
103         struct ost_body         *body;
104         int                      rc;
105
106         ENTRY;
107         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR);
108         if (req == NULL)
109                 RETURN(-ENOMEM);
110
111         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR);
112         if (rc) {
113                 ptlrpc_request_free(req);
114                 RETURN(rc);
115         }
116
117         osc_pack_req_body(req, oa);
118
119         ptlrpc_request_set_replen(req);
120
121         rc = ptlrpc_queue_wait(req);
122         if (rc)
123                 GOTO(out, rc);
124
125         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
126         if (body == NULL)
127                 GOTO(out, rc = -EPROTO);
128
129         CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
130         lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa);
131
132         oa->o_blksize = cli_brw_size(exp->exp_obd);
133         oa->o_valid |= OBD_MD_FLBLKSZ;
134
135         EXIT;
136 out:
137         ptlrpc_req_finished(req);
138
139         return rc;
140 }
141
142 static int osc_setattr(const struct lu_env *env, struct obd_export *exp,
143                        struct obdo *oa)
144 {
145         struct ptlrpc_request   *req;
146         struct ost_body         *body;
147         int                      rc;
148
149         ENTRY;
150         LASSERT(oa->o_valid & OBD_MD_FLGROUP);
151
152         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR);
153         if (req == NULL)
154                 RETURN(-ENOMEM);
155
156         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR);
157         if (rc) {
158                 ptlrpc_request_free(req);
159                 RETURN(rc);
160         }
161
162         osc_pack_req_body(req, oa);
163
164         ptlrpc_request_set_replen(req);
165
166         rc = ptlrpc_queue_wait(req);
167         if (rc)
168                 GOTO(out, rc);
169
170         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
171         if (body == NULL)
172                 GOTO(out, rc = -EPROTO);
173
174         lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa);
175
176         EXIT;
177 out:
178         ptlrpc_req_finished(req);
179
180         RETURN(rc);
181 }
182
183 static int osc_setattr_interpret(const struct lu_env *env,
184                                  struct ptlrpc_request *req, void *args, int rc)
185 {
186         struct osc_setattr_args *sa = args;
187         struct ost_body *body;
188
189         ENTRY;
190
191         if (rc != 0)
192                 GOTO(out, rc);
193
194         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
195         if (body == NULL)
196                 GOTO(out, rc = -EPROTO);
197
198         lustre_get_wire_obdo(&req->rq_import->imp_connect_data, sa->sa_oa,
199                              &body->oa);
200 out:
201         rc = sa->sa_upcall(sa->sa_cookie, rc);
202         RETURN(rc);
203 }
204
205 int osc_setattr_async(struct obd_export *exp, struct obdo *oa,
206                       obd_enqueue_update_f upcall, void *cookie,
207                       struct ptlrpc_request_set *rqset)
208 {
209         struct ptlrpc_request   *req;
210         struct osc_setattr_args *sa;
211         int                      rc;
212
213         ENTRY;
214
215         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR);
216         if (req == NULL)
217                 RETURN(-ENOMEM);
218
219         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR);
220         if (rc) {
221                 ptlrpc_request_free(req);
222                 RETURN(rc);
223         }
224
225         osc_pack_req_body(req, oa);
226
227         ptlrpc_request_set_replen(req);
228
229         /* do mds to ost setattr asynchronously */
230         if (!rqset) {
231                 /* Do not wait for response. */
232                 ptlrpcd_add_req(req);
233         } else {
234                 req->rq_interpret_reply = osc_setattr_interpret;
235
236                 sa = ptlrpc_req_async_args(sa, req);
237                 sa->sa_oa = oa;
238                 sa->sa_upcall = upcall;
239                 sa->sa_cookie = cookie;
240
241                 ptlrpc_set_add_req(rqset, req);
242         }
243
244         RETURN(0);
245 }
246
247 static int osc_ladvise_interpret(const struct lu_env *env,
248                                  struct ptlrpc_request *req,
249                                  void *arg, int rc)
250 {
251         struct osc_ladvise_args *la = arg;
252         struct ost_body *body;
253         ENTRY;
254
255         if (rc != 0)
256                 GOTO(out, rc);
257
258         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
259         if (body == NULL)
260                 GOTO(out, rc = -EPROTO);
261
262         *la->la_oa = body->oa;
263 out:
264         rc = la->la_upcall(la->la_cookie, rc);
265         RETURN(rc);
266 }
267
268 /**
269  * If rqset is NULL, do not wait for response. Upcall and cookie could also
270  * be NULL in this case
271  */
272 int osc_ladvise_base(struct obd_export *exp, struct obdo *oa,
273                      struct ladvise_hdr *ladvise_hdr,
274                      obd_enqueue_update_f upcall, void *cookie,
275                      struct ptlrpc_request_set *rqset)
276 {
277         struct ptlrpc_request   *req;
278         struct ost_body         *body;
279         struct osc_ladvise_args *la;
280         int                      rc;
281         struct lu_ladvise       *req_ladvise;
282         struct lu_ladvise       *ladvise = ladvise_hdr->lah_advise;
283         int                      num_advise = ladvise_hdr->lah_count;
284         struct ladvise_hdr      *req_ladvise_hdr;
285         ENTRY;
286
287         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_LADVISE);
288         if (req == NULL)
289                 RETURN(-ENOMEM);
290
291         req_capsule_set_size(&req->rq_pill, &RMF_OST_LADVISE, RCL_CLIENT,
292                              num_advise * sizeof(*ladvise));
293         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_LADVISE);
294         if (rc != 0) {
295                 ptlrpc_request_free(req);
296                 RETURN(rc);
297         }
298         req->rq_request_portal = OST_IO_PORTAL;
299         ptlrpc_at_set_req_timeout(req);
300
301         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
302         LASSERT(body);
303         lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa,
304                              oa);
305
306         req_ladvise_hdr = req_capsule_client_get(&req->rq_pill,
307                                                  &RMF_OST_LADVISE_HDR);
308         memcpy(req_ladvise_hdr, ladvise_hdr, sizeof(*ladvise_hdr));
309
310         req_ladvise = req_capsule_client_get(&req->rq_pill, &RMF_OST_LADVISE);
311         memcpy(req_ladvise, ladvise, sizeof(*ladvise) * num_advise);
312         ptlrpc_request_set_replen(req);
313
314         if (rqset == NULL) {
315                 /* Do not wait for response. */
316                 ptlrpcd_add_req(req);
317                 RETURN(0);
318         }
319
320         req->rq_interpret_reply = osc_ladvise_interpret;
321         la = ptlrpc_req_async_args(la, req);
322         la->la_oa = oa;
323         la->la_upcall = upcall;
324         la->la_cookie = cookie;
325
326         ptlrpc_set_add_req(rqset, req);
327
328         RETURN(0);
329 }
330
331 static int osc_create(const struct lu_env *env, struct obd_export *exp,
332                       struct obdo *oa)
333 {
334         struct ptlrpc_request *req;
335         struct ost_body       *body;
336         int                    rc;
337         ENTRY;
338
339         LASSERT(oa != NULL);
340         LASSERT(oa->o_valid & OBD_MD_FLGROUP);
341         LASSERT(fid_seq_is_echo(ostid_seq(&oa->o_oi)));
342
343         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_CREATE);
344         if (req == NULL)
345                 GOTO(out, rc = -ENOMEM);
346
347         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE);
348         if (rc) {
349                 ptlrpc_request_free(req);
350                 GOTO(out, rc);
351         }
352
353         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
354         LASSERT(body);
355
356         lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
357
358         ptlrpc_request_set_replen(req);
359
360         rc = ptlrpc_queue_wait(req);
361         if (rc)
362                 GOTO(out_req, rc);
363
364         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
365         if (body == NULL)
366                 GOTO(out_req, rc = -EPROTO);
367
368         CDEBUG(D_INFO, "oa flags %x\n", oa->o_flags);
369         lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa);
370
371         oa->o_blksize = cli_brw_size(exp->exp_obd);
372         oa->o_valid |= OBD_MD_FLBLKSZ;
373
374         CDEBUG(D_HA, "transno: %lld\n",
375                lustre_msg_get_transno(req->rq_repmsg));
376 out_req:
377         ptlrpc_req_finished(req);
378 out:
379         RETURN(rc);
380 }
381
382 int osc_punch_send(struct obd_export *exp, struct obdo *oa,
383                    obd_enqueue_update_f upcall, void *cookie)
384 {
385         struct ptlrpc_request *req;
386         struct osc_setattr_args *sa;
387         struct obd_import *imp = class_exp2cliimp(exp);
388         struct ost_body *body;
389         int rc;
390
391         ENTRY;
392
393         req = ptlrpc_request_alloc(imp, &RQF_OST_PUNCH);
394         if (req == NULL)
395                 RETURN(-ENOMEM);
396
397         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH);
398         if (rc < 0) {
399                 ptlrpc_request_free(req);
400                 RETURN(rc);
401         }
402
403         osc_set_io_portal(req);
404
405         ptlrpc_at_set_req_timeout(req);
406
407         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
408
409         lustre_set_wire_obdo(&imp->imp_connect_data, &body->oa, oa);
410
411         ptlrpc_request_set_replen(req);
412
413         req->rq_interpret_reply = osc_setattr_interpret;
414         sa = ptlrpc_req_async_args(sa, req);
415         sa->sa_oa = oa;
416         sa->sa_upcall = upcall;
417         sa->sa_cookie = cookie;
418
419         ptlrpcd_add_req(req);
420
421         RETURN(0);
422 }
423 EXPORT_SYMBOL(osc_punch_send);
424
425 /**
426  * osc_fallocate_base() - Handles fallocate request.
427  *
428  * @exp:        Export structure
429  * @oa:         Attributes passed to OSS from client (obdo structure)
430  * @upcall:     Primary & supplementary group information
431  * @cookie:     Exclusive identifier
432  * @rqset:      Request list.
433  * @mode:       Operation done on given range.
434  *
435  * osc_fallocate_base() - Handles fallocate requests only. Only block
436  * allocation or standard preallocate operation is supported currently.
437  * Other mode flags is not supported yet. ftruncate(2) or truncate(2)
438  * is supported via SETATTR request.
439  *
440  * Return: Non-zero on failure and O on success.
441  */
442 int osc_fallocate_base(struct obd_export *exp, struct obdo *oa,
443                        obd_enqueue_update_f upcall, void *cookie, int mode)
444 {
445         struct ptlrpc_request *req;
446         struct osc_setattr_args *sa;
447         struct ost_body *body;
448         struct obd_import *imp = class_exp2cliimp(exp);
449         int rc;
450         ENTRY;
451
452         /*
453          * Only mode == 0 (which is standard prealloc) is supported now.
454          * Punch is not supported yet.
455          */
456         if (mode & ~FALLOC_FL_KEEP_SIZE)
457                 RETURN(-EOPNOTSUPP);
458         oa->o_falloc_mode = mode;
459
460         req = ptlrpc_request_alloc(class_exp2cliimp(exp),
461                                    &RQF_OST_FALLOCATE);
462         if (req == NULL)
463                 RETURN(-ENOMEM);
464
465         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_FALLOCATE);
466         if (rc != 0) {
467                 ptlrpc_request_free(req);
468                 RETURN(rc);
469         }
470
471         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
472         LASSERT(body);
473
474         lustre_set_wire_obdo(&imp->imp_connect_data, &body->oa, oa);
475
476         ptlrpc_request_set_replen(req);
477
478         req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_setattr_interpret;
479         BUILD_BUG_ON(sizeof(*sa) > sizeof(req->rq_async_args));
480         sa = ptlrpc_req_async_args(sa, req);
481         sa->sa_oa = oa;
482         sa->sa_upcall = upcall;
483         sa->sa_cookie = cookie;
484
485         ptlrpcd_add_req(req);
486
487         RETURN(0);
488 }
489
490 static int osc_sync_interpret(const struct lu_env *env,
491                               struct ptlrpc_request *req, void *args, int rc)
492 {
493         struct osc_fsync_args *fa = args;
494         struct ost_body *body;
495         struct cl_attr *attr = &osc_env_info(env)->oti_attr;
496         unsigned long valid = 0;
497         struct cl_object *obj;
498         ENTRY;
499
500         if (rc != 0)
501                 GOTO(out, rc);
502
503         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
504         if (body == NULL) {
505                 CERROR("can't unpack ost_body\n");
506                 GOTO(out, rc = -EPROTO);
507         }
508
509         *fa->fa_oa = body->oa;
510         obj = osc2cl(fa->fa_obj);
511
512         /* Update osc object's blocks attribute */
513         cl_object_attr_lock(obj);
514         if (body->oa.o_valid & OBD_MD_FLBLOCKS) {
515                 attr->cat_blocks = body->oa.o_blocks;
516                 valid |= CAT_BLOCKS;
517         }
518
519         if (valid != 0)
520                 cl_object_attr_update(env, obj, attr, valid);
521         cl_object_attr_unlock(obj);
522
523 out:
524         rc = fa->fa_upcall(fa->fa_cookie, rc);
525         RETURN(rc);
526 }
527
528 int osc_sync_base(struct osc_object *obj, struct obdo *oa,
529                   obd_enqueue_update_f upcall, void *cookie,
530                   struct ptlrpc_request_set *rqset)
531 {
532         struct obd_export     *exp = osc_export(obj);
533         struct ptlrpc_request *req;
534         struct ost_body       *body;
535         struct osc_fsync_args *fa;
536         int                    rc;
537         ENTRY;
538
539         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC);
540         if (req == NULL)
541                 RETURN(-ENOMEM);
542
543         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SYNC);
544         if (rc) {
545                 ptlrpc_request_free(req);
546                 RETURN(rc);
547         }
548
549         /* overload the size and blocks fields in the oa with start/end */
550         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
551         LASSERT(body);
552         lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
553
554         ptlrpc_request_set_replen(req);
555         req->rq_interpret_reply = osc_sync_interpret;
556
557         fa = ptlrpc_req_async_args(fa, req);
558         fa->fa_obj = obj;
559         fa->fa_oa = oa;
560         fa->fa_upcall = upcall;
561         fa->fa_cookie = cookie;
562
563         ptlrpc_set_add_req(rqset, req);
564
565         RETURN (0);
566 }
567
568 /* Find and cancel locally locks matched by @mode in the resource found by
569  * @objid. Found locks are added into @cancel list. Returns the amount of
570  * locks added to @cancels list. */
571 static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa,
572                                    struct list_head *cancels,
573                                    enum ldlm_mode mode, __u64 lock_flags)
574 {
575         struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
576         struct ldlm_res_id res_id;
577         struct ldlm_resource *res;
578         int count;
579         ENTRY;
580
581         /* Return, i.e. cancel nothing, only if ELC is supported (flag in
582          * export) but disabled through procfs (flag in NS).
583          *
584          * This distinguishes from a case when ELC is not supported originally,
585          * when we still want to cancel locks in advance and just cancel them
586          * locally, without sending any RPC. */
587         if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns))
588                 RETURN(0);
589
590         ostid_build_res_name(&oa->o_oi, &res_id);
591         res = ldlm_resource_get(ns, NULL, &res_id, 0, 0);
592         if (IS_ERR(res))
593                 RETURN(0);
594
595         LDLM_RESOURCE_ADDREF(res);
596         count = ldlm_cancel_resource_local(res, cancels, NULL, mode,
597                                            lock_flags, 0, NULL);
598         LDLM_RESOURCE_DELREF(res);
599         ldlm_resource_putref(res);
600         RETURN(count);
601 }
602
603 static int osc_destroy_interpret(const struct lu_env *env,
604                                  struct ptlrpc_request *req, void *args, int rc)
605 {
606         struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
607
608         atomic_dec(&cli->cl_destroy_in_flight);
609         wake_up(&cli->cl_destroy_waitq);
610
611         return 0;
612 }
613
614 static int osc_can_send_destroy(struct client_obd *cli)
615 {
616         if (atomic_inc_return(&cli->cl_destroy_in_flight) <=
617             cli->cl_max_rpcs_in_flight) {
618                 /* The destroy request can be sent */
619                 return 1;
620         }
621         if (atomic_dec_return(&cli->cl_destroy_in_flight) <
622             cli->cl_max_rpcs_in_flight) {
623                 /*
624                  * The counter has been modified between the two atomic
625                  * operations.
626                  */
627                 wake_up(&cli->cl_destroy_waitq);
628         }
629         return 0;
630 }
631
632 static int osc_destroy(const struct lu_env *env, struct obd_export *exp,
633                        struct obdo *oa)
634 {
635         struct client_obd     *cli = &exp->exp_obd->u.cli;
636         struct ptlrpc_request *req;
637         struct ost_body       *body;
638         LIST_HEAD(cancels);
639         int rc, count;
640         ENTRY;
641
642         if (!oa) {
643                 CDEBUG(D_INFO, "oa NULL\n");
644                 RETURN(-EINVAL);
645         }
646
647         count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW,
648                                         LDLM_FL_DISCARD_DATA);
649
650         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_DESTROY);
651         if (req == NULL) {
652                 ldlm_lock_list_put(&cancels, l_bl_ast, count);
653                 RETURN(-ENOMEM);
654         }
655
656         rc = ldlm_prep_elc_req(exp, req, LUSTRE_OST_VERSION, OST_DESTROY,
657                                0, &cancels, count);
658         if (rc) {
659                 ptlrpc_request_free(req);
660                 RETURN(rc);
661         }
662
663         req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
664         ptlrpc_at_set_req_timeout(req);
665
666         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
667         LASSERT(body);
668         lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
669
670         ptlrpc_request_set_replen(req);
671
672         req->rq_interpret_reply = osc_destroy_interpret;
673         if (!osc_can_send_destroy(cli)) {
674                 /*
675                  * Wait until the number of on-going destroy RPCs drops
676                  * under max_rpc_in_flight
677                  */
678                 rc = l_wait_event_abortable_exclusive(
679                         cli->cl_destroy_waitq,
680                         osc_can_send_destroy(cli));
681                 if (rc) {
682                         ptlrpc_req_finished(req);
683                         RETURN(-EINTR);
684                 }
685         }
686
687         /* Do not wait for response */
688         ptlrpcd_add_req(req);
689         RETURN(0);
690 }
691
692 static void osc_announce_cached(struct client_obd *cli, struct obdo *oa,
693                                 long writing_bytes)
694 {
695         u64 bits = OBD_MD_FLBLOCKS | OBD_MD_FLGRANT;
696
697         LASSERT(!(oa->o_valid & bits));
698
699         oa->o_valid |= bits;
700         spin_lock(&cli->cl_loi_list_lock);
701         if (cli->cl_ocd_grant_param)
702                 oa->o_dirty = cli->cl_dirty_grant;
703         else
704                 oa->o_dirty = cli->cl_dirty_pages << PAGE_SHIFT;
705         if (unlikely(cli->cl_dirty_pages > cli->cl_dirty_max_pages)) {
706                 CERROR("dirty %lu > dirty_max %lu\n",
707                        cli->cl_dirty_pages,
708                        cli->cl_dirty_max_pages);
709                 oa->o_undirty = 0;
710         } else if (unlikely(atomic_long_read(&obd_dirty_pages) >
711                             (long)(obd_max_dirty_pages + 1))) {
712                 /* The atomic_read() allowing the atomic_inc() are
713                  * not covered by a lock thus they may safely race and trip
714                  * this CERROR() unless we add in a small fudge factor (+1). */
715                 CERROR("%s: dirty %ld > system dirty_max %ld\n",
716                        cli_name(cli), atomic_long_read(&obd_dirty_pages),
717                        obd_max_dirty_pages);
718                 oa->o_undirty = 0;
719         } else if (unlikely(cli->cl_dirty_max_pages - cli->cl_dirty_pages >
720                             0x7fffffff)) {
721                 CERROR("dirty %lu - dirty_max %lu too big???\n",
722                        cli->cl_dirty_pages, cli->cl_dirty_max_pages);
723                 oa->o_undirty = 0;
724         } else {
725                 unsigned long nrpages;
726                 unsigned long undirty;
727
728                 nrpages = cli->cl_max_pages_per_rpc;
729                 nrpages *= cli->cl_max_rpcs_in_flight + 1;
730                 nrpages = max(nrpages, cli->cl_dirty_max_pages);
731                 undirty = nrpages << PAGE_SHIFT;
732                 if (cli->cl_ocd_grant_param) {
733                         int nrextents;
734
735                         /* take extent tax into account when asking for more
736                          * grant space */
737                         nrextents = (nrpages + cli->cl_max_extent_pages - 1) /
738                                      cli->cl_max_extent_pages;
739                         undirty += nrextents * cli->cl_grant_extent_tax;
740                 }
741                 /* Do not ask for more than OBD_MAX_GRANT - a margin for server
742                  * to add extent tax, etc.
743                  */
744                 oa->o_undirty = min(undirty, OBD_MAX_GRANT &
745                                     ~(PTLRPC_MAX_BRW_SIZE * 4UL));
746         }
747         oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant;
748         /* o_dropped AKA o_misc is 32 bits, but cl_lost_grant is 64 bits */
749         if (cli->cl_lost_grant > INT_MAX) {
750                 CDEBUG(D_CACHE,
751                       "%s: avoided o_dropped overflow: cl_lost_grant %lu\n",
752                       cli_name(cli), cli->cl_lost_grant);
753                 oa->o_dropped = INT_MAX;
754         } else {
755                 oa->o_dropped = cli->cl_lost_grant;
756         }
757         cli->cl_lost_grant -= oa->o_dropped;
758         spin_unlock(&cli->cl_loi_list_lock);
759         CDEBUG(D_CACHE, "%s: dirty: %llu undirty: %u dropped %u grant: %llu"
760                " cl_lost_grant %lu\n", cli_name(cli), oa->o_dirty,
761                oa->o_undirty, oa->o_dropped, oa->o_grant, cli->cl_lost_grant);
762 }
763
764 void osc_update_next_shrink(struct client_obd *cli)
765 {
766         cli->cl_next_shrink_grant = ktime_get_seconds() +
767                                     cli->cl_grant_shrink_interval;
768
769         CDEBUG(D_CACHE, "next time %lld to shrink grant\n",
770                cli->cl_next_shrink_grant);
771 }
772
773 static void __osc_update_grant(struct client_obd *cli, u64 grant)
774 {
775         spin_lock(&cli->cl_loi_list_lock);
776         cli->cl_avail_grant += grant;
777         spin_unlock(&cli->cl_loi_list_lock);
778 }
779
780 static void osc_update_grant(struct client_obd *cli, struct ost_body *body)
781 {
782         if (body->oa.o_valid & OBD_MD_FLGRANT) {
783                 CDEBUG(D_CACHE, "got %llu extra grant\n", body->oa.o_grant);
784                 __osc_update_grant(cli, body->oa.o_grant);
785         }
786 }
787
788 /**
789  * grant thread data for shrinking space.
790  */
791 struct grant_thread_data {
792         struct list_head        gtd_clients;
793         struct mutex            gtd_mutex;
794         unsigned long           gtd_stopped:1;
795 };
796 static struct grant_thread_data client_gtd;
797
798 static int osc_shrink_grant_interpret(const struct lu_env *env,
799                                       struct ptlrpc_request *req,
800                                       void *args, int rc)
801 {
802         struct osc_grant_args *aa = args;
803         struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
804         struct ost_body *body;
805
806         if (rc != 0) {
807                 __osc_update_grant(cli, aa->aa_oa->o_grant);
808                 GOTO(out, rc);
809         }
810
811         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
812         LASSERT(body);
813         osc_update_grant(cli, body);
814 out:
815         OBD_SLAB_FREE_PTR(aa->aa_oa, osc_obdo_kmem);
816         aa->aa_oa = NULL;
817
818         return rc;
819 }
820
821 static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa)
822 {
823         spin_lock(&cli->cl_loi_list_lock);
824         oa->o_grant = cli->cl_avail_grant / 4;
825         cli->cl_avail_grant -= oa->o_grant;
826         spin_unlock(&cli->cl_loi_list_lock);
827         if (!(oa->o_valid & OBD_MD_FLFLAGS)) {
828                 oa->o_valid |= OBD_MD_FLFLAGS;
829                 oa->o_flags = 0;
830         }
831         oa->o_flags |= OBD_FL_SHRINK_GRANT;
832         osc_update_next_shrink(cli);
833 }
834
835 /* Shrink the current grant, either from some large amount to enough for a
836  * full set of in-flight RPCs, or if we have already shrunk to that limit
837  * then to enough for a single RPC.  This avoids keeping more grant than
838  * needed, and avoids shrinking the grant piecemeal. */
839 static int osc_shrink_grant(struct client_obd *cli)
840 {
841         __u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) *
842                              (cli->cl_max_pages_per_rpc << PAGE_SHIFT);
843
844         spin_lock(&cli->cl_loi_list_lock);
845         if (cli->cl_avail_grant <= target_bytes)
846                 target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT;
847         spin_unlock(&cli->cl_loi_list_lock);
848
849         return osc_shrink_grant_to_target(cli, target_bytes);
850 }
851
852 int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes)
853 {
854         int                     rc = 0;
855         struct ost_body        *body;
856         ENTRY;
857
858         spin_lock(&cli->cl_loi_list_lock);
859         /* Don't shrink if we are already above or below the desired limit
860          * We don't want to shrink below a single RPC, as that will negatively
861          * impact block allocation and long-term performance. */
862         if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_SHIFT)
863                 target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT;
864
865         if (target_bytes >= cli->cl_avail_grant) {
866                 spin_unlock(&cli->cl_loi_list_lock);
867                 RETURN(0);
868         }
869         spin_unlock(&cli->cl_loi_list_lock);
870
871         OBD_ALLOC_PTR(body);
872         if (!body)
873                 RETURN(-ENOMEM);
874
875         osc_announce_cached(cli, &body->oa, 0);
876
877         spin_lock(&cli->cl_loi_list_lock);
878         if (target_bytes >= cli->cl_avail_grant) {
879                 /* available grant has changed since target calculation */
880                 spin_unlock(&cli->cl_loi_list_lock);
881                 GOTO(out_free, rc = 0);
882         }
883         body->oa.o_grant = cli->cl_avail_grant - target_bytes;
884         cli->cl_avail_grant = target_bytes;
885         spin_unlock(&cli->cl_loi_list_lock);
886         if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) {
887                 body->oa.o_valid |= OBD_MD_FLFLAGS;
888                 body->oa.o_flags = 0;
889         }
890         body->oa.o_flags |= OBD_FL_SHRINK_GRANT;
891         osc_update_next_shrink(cli);
892
893         rc = osc_set_info_async(NULL, cli->cl_import->imp_obd->obd_self_export,
894                                 sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK,
895                                 sizeof(*body), body, NULL);
896         if (rc != 0)
897                 __osc_update_grant(cli, body->oa.o_grant);
898 out_free:
899         OBD_FREE_PTR(body);
900         RETURN(rc);
901 }
902
903 static int osc_should_shrink_grant(struct client_obd *client)
904 {
905         time64_t next_shrink = client->cl_next_shrink_grant;
906
907         if (client->cl_import == NULL)
908                 return 0;
909
910         if (!OCD_HAS_FLAG(&client->cl_import->imp_connect_data, GRANT_SHRINK) ||
911             client->cl_import->imp_grant_shrink_disabled) {
912                 osc_update_next_shrink(client);
913                 return 0;
914         }
915
916         if (ktime_get_seconds() >= next_shrink - 5) {
917                 /* Get the current RPC size directly, instead of going via:
918                  * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export)
919                  * Keep comment here so that it can be found by searching. */
920                 int brw_size = client->cl_max_pages_per_rpc << PAGE_SHIFT;
921
922                 if (client->cl_import->imp_state == LUSTRE_IMP_FULL &&
923                     client->cl_avail_grant > brw_size)
924                         return 1;
925                 else
926                         osc_update_next_shrink(client);
927         }
928         return 0;
929 }
930
931 #define GRANT_SHRINK_RPC_BATCH  100
932
933 static struct delayed_work work;
934
935 static void osc_grant_work_handler(struct work_struct *data)
936 {
937         struct client_obd *cli;
938         int rpc_sent;
939         bool init_next_shrink = true;
940         time64_t next_shrink = ktime_get_seconds() + GRANT_SHRINK_INTERVAL;
941
942         rpc_sent = 0;
943         mutex_lock(&client_gtd.gtd_mutex);
944         list_for_each_entry(cli, &client_gtd.gtd_clients,
945                             cl_grant_chain) {
946                 if (rpc_sent < GRANT_SHRINK_RPC_BATCH &&
947                     osc_should_shrink_grant(cli)) {
948                         osc_shrink_grant(cli);
949                         rpc_sent++;
950                 }
951
952                 if (!init_next_shrink) {
953                         if (cli->cl_next_shrink_grant < next_shrink &&
954                             cli->cl_next_shrink_grant > ktime_get_seconds())
955                                 next_shrink = cli->cl_next_shrink_grant;
956                 } else {
957                         init_next_shrink = false;
958                         next_shrink = cli->cl_next_shrink_grant;
959                 }
960         }
961         mutex_unlock(&client_gtd.gtd_mutex);
962
963         if (client_gtd.gtd_stopped == 1)
964                 return;
965
966         if (next_shrink > ktime_get_seconds()) {
967                 time64_t delay = next_shrink - ktime_get_seconds();
968
969                 schedule_delayed_work(&work, cfs_time_seconds(delay));
970         } else {
971                 schedule_work(&work.work);
972         }
973 }
974
975 void osc_schedule_grant_work(void)
976 {
977         cancel_delayed_work_sync(&work);
978         schedule_work(&work.work);
979 }
980
981 /**
982  * Start grant thread for returing grant to server for idle clients.
983  */
984 static int osc_start_grant_work(void)
985 {
986         client_gtd.gtd_stopped = 0;
987         mutex_init(&client_gtd.gtd_mutex);
988         INIT_LIST_HEAD(&client_gtd.gtd_clients);
989
990         INIT_DELAYED_WORK(&work, osc_grant_work_handler);
991         schedule_work(&work.work);
992
993         return 0;
994 }
995
996 static void osc_stop_grant_work(void)
997 {
998         client_gtd.gtd_stopped = 1;
999         cancel_delayed_work_sync(&work);
1000 }
1001
1002 static void osc_add_grant_list(struct client_obd *client)
1003 {
1004         mutex_lock(&client_gtd.gtd_mutex);
1005         list_add(&client->cl_grant_chain, &client_gtd.gtd_clients);
1006         mutex_unlock(&client_gtd.gtd_mutex);
1007 }
1008
1009 static void osc_del_grant_list(struct client_obd *client)
1010 {
1011         if (list_empty(&client->cl_grant_chain))
1012                 return;
1013
1014         mutex_lock(&client_gtd.gtd_mutex);
1015         list_del_init(&client->cl_grant_chain);
1016         mutex_unlock(&client_gtd.gtd_mutex);
1017 }
1018
1019 void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd)
1020 {
1021         /*
1022          * ocd_grant is the total grant amount we're expect to hold: if we've
1023          * been evicted, it's the new avail_grant amount, cl_dirty_pages will
1024          * drop to 0 as inflight RPCs fail out; otherwise, it's avail_grant +
1025          * dirty.
1026          *
1027          * race is tolerable here: if we're evicted, but imp_state already
1028          * left EVICTED state, then cl_dirty_pages must be 0 already.
1029          */
1030         spin_lock(&cli->cl_loi_list_lock);
1031         cli->cl_avail_grant = ocd->ocd_grant;
1032         if (cli->cl_import->imp_state != LUSTRE_IMP_EVICTED) {
1033                 unsigned long consumed = cli->cl_reserved_grant;
1034
1035                 if (OCD_HAS_FLAG(ocd, GRANT_PARAM))
1036                         consumed += cli->cl_dirty_grant;
1037                 else
1038                         consumed += cli->cl_dirty_pages << PAGE_SHIFT;
1039                 if (cli->cl_avail_grant < consumed) {
1040                         CERROR("%s: granted %ld but already consumed %ld\n",
1041                                cli_name(cli), cli->cl_avail_grant, consumed);
1042                         cli->cl_avail_grant = 0;
1043                 } else {
1044                         cli->cl_avail_grant -= consumed;
1045                 }
1046         }
1047
1048         if (OCD_HAS_FLAG(ocd, GRANT_PARAM)) {
1049                 u64 size;
1050                 int chunk_mask;
1051
1052                 /* overhead for each extent insertion */
1053                 cli->cl_grant_extent_tax = ocd->ocd_grant_tax_kb << 10;
1054                 /* determine the appropriate chunk size used by osc_extent. */
1055                 cli->cl_chunkbits = max_t(int, PAGE_SHIFT,
1056                                           ocd->ocd_grant_blkbits);
1057                 /* max_pages_per_rpc must be chunk aligned */
1058                 chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_SHIFT)) - 1);
1059                 cli->cl_max_pages_per_rpc = (cli->cl_max_pages_per_rpc +
1060                                              ~chunk_mask) & chunk_mask;
1061                 /* determine maximum extent size, in #pages */
1062                 size = (u64)ocd->ocd_grant_max_blks << ocd->ocd_grant_blkbits;
1063                 cli->cl_max_extent_pages = (size >> PAGE_SHIFT) ?: 1;
1064                 cli->cl_ocd_grant_param = 1;
1065         } else {
1066                 cli->cl_ocd_grant_param = 0;
1067                 cli->cl_grant_extent_tax = 0;
1068                 cli->cl_chunkbits = PAGE_SHIFT;
1069                 cli->cl_max_extent_pages = DT_MAX_BRW_PAGES;
1070         }
1071         spin_unlock(&cli->cl_loi_list_lock);
1072
1073         CDEBUG(D_CACHE,
1074                "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld. chunk bits: %d cl_max_extent_pages: %d\n",
1075                cli_name(cli),
1076                cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits,
1077                cli->cl_max_extent_pages);
1078
1079         if (OCD_HAS_FLAG(ocd, GRANT_SHRINK) && list_empty(&cli->cl_grant_chain))
1080                 osc_add_grant_list(cli);
1081 }
1082 EXPORT_SYMBOL(osc_init_grant);
1083
1084 /* We assume that the reason this OSC got a short read is because it read
1085  * beyond the end of a stripe file; i.e. lustre is reading a sparse file
1086  * via the LOV, and it _knows_ it's reading inside the file, it's just that
1087  * this stripe never got written at or beyond this stripe offset yet. */
1088 static void handle_short_read(int nob_read, size_t page_count,
1089                               struct brw_page **pga)
1090 {
1091         char *ptr;
1092         int i = 0;
1093
1094         /* skip bytes read OK */
1095         while (nob_read > 0) {
1096                 LASSERT (page_count > 0);
1097
1098                 if (pga[i]->count > nob_read) {
1099                         /* EOF inside this page */
1100                         ptr = kmap(pga[i]->pg) +
1101                                 (pga[i]->off & ~PAGE_MASK);
1102                         memset(ptr + nob_read, 0, pga[i]->count - nob_read);
1103                         kunmap(pga[i]->pg);
1104                         page_count--;
1105                         i++;
1106                         break;
1107                 }
1108
1109                 nob_read -= pga[i]->count;
1110                 page_count--;
1111                 i++;
1112         }
1113
1114         /* zero remaining pages */
1115         while (page_count-- > 0) {
1116                 ptr = kmap(pga[i]->pg) + (pga[i]->off & ~PAGE_MASK);
1117                 memset(ptr, 0, pga[i]->count);
1118                 kunmap(pga[i]->pg);
1119                 i++;
1120         }
1121 }
1122
1123 static int check_write_rcs(struct ptlrpc_request *req,
1124                            int requested_nob, int niocount,
1125                            size_t page_count, struct brw_page **pga)
1126 {
1127         int     i;
1128         __u32   *remote_rcs;
1129
1130         remote_rcs = req_capsule_server_sized_get(&req->rq_pill, &RMF_RCS,
1131                                                   sizeof(*remote_rcs) *
1132                                                   niocount);
1133         if (remote_rcs == NULL) {
1134                 CDEBUG(D_INFO, "Missing/short RC vector on BRW_WRITE reply\n");
1135                 return(-EPROTO);
1136         }
1137
1138         /* return error if any niobuf was in error */
1139         for (i = 0; i < niocount; i++) {
1140                 if ((int)remote_rcs[i] < 0) {
1141                         CDEBUG(D_INFO, "rc[%d]: %d req %p\n",
1142                                i, remote_rcs[i], req);
1143                         return remote_rcs[i];
1144                 }
1145
1146                 if (remote_rcs[i] != 0) {
1147                         CDEBUG(D_INFO, "rc[%d] invalid (%d) req %p\n",
1148                                 i, remote_rcs[i], req);
1149                         return(-EPROTO);
1150                 }
1151         }
1152         if (req->rq_bulk != NULL &&
1153             req->rq_bulk->bd_nob_transferred != requested_nob) {
1154                 CERROR("Unexpected # bytes transferred: %d (requested %d)\n",
1155                        req->rq_bulk->bd_nob_transferred, requested_nob);
1156                 return(-EPROTO);
1157         }
1158
1159         return (0);
1160 }
1161
1162 static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2)
1163 {
1164         if (p1->flag != p2->flag) {
1165                 unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE |
1166                                   OBD_BRW_SYNC       | OBD_BRW_ASYNC   |
1167                                   OBD_BRW_NOQUOTA    | OBD_BRW_SOFT_SYNC);
1168
1169                 /* warn if we try to combine flags that we don't know to be
1170                  * safe to combine */
1171                 if (unlikely((p1->flag & mask) != (p2->flag & mask))) {
1172                         CWARN("Saw flags 0x%x and 0x%x in the same brw, please "
1173                               "report this at https://jira.whamcloud.com/\n",
1174                               p1->flag, p2->flag);
1175                 }
1176                 return 0;
1177         }
1178
1179         return (p1->off + p1->count == p2->off);
1180 }
1181
1182 #if IS_ENABLED(CONFIG_CRC_T10DIF)
1183 static int osc_checksum_bulk_t10pi(const char *obd_name, int nob,
1184                                    size_t pg_count, struct brw_page **pga,
1185                                    int opc, obd_dif_csum_fn *fn,
1186                                    int sector_size,
1187                                    u32 *check_sum)
1188 {
1189         struct ahash_request *req;
1190         /* Used Adler as the default checksum type on top of DIF tags */
1191         unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP);
1192         struct page *__page;
1193         unsigned char *buffer;
1194         __u16 *guard_start;
1195         unsigned int bufsize;
1196         int guard_number;
1197         int used_number = 0;
1198         int used;
1199         u32 cksum;
1200         int rc = 0;
1201         int i = 0;
1202
1203         LASSERT(pg_count > 0);
1204
1205         __page = alloc_page(GFP_KERNEL);
1206         if (__page == NULL)
1207                 return -ENOMEM;
1208
1209         req = cfs_crypto_hash_init(cfs_alg, NULL, 0);
1210         if (IS_ERR(req)) {
1211                 rc = PTR_ERR(req);
1212                 CERROR("%s: unable to initialize checksum hash %s: rc = %d\n",
1213                        obd_name, cfs_crypto_hash_name(cfs_alg), rc);
1214                 GOTO(out, rc);
1215         }
1216
1217         buffer = kmap(__page);
1218         guard_start = (__u16 *)buffer;
1219         guard_number = PAGE_SIZE / sizeof(*guard_start);
1220         while (nob > 0 && pg_count > 0) {
1221                 unsigned int count = pga[i]->count > nob ? nob : pga[i]->count;
1222
1223                 /* corrupt the data before we compute the checksum, to
1224                  * simulate an OST->client data error */
1225                 if (unlikely(i == 0 && opc == OST_READ &&
1226                              OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE))) {
1227                         unsigned char *ptr = kmap(pga[i]->pg);
1228                         int off = pga[i]->off & ~PAGE_MASK;
1229
1230                         memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob));
1231                         kunmap(pga[i]->pg);
1232                 }
1233
1234                 /*
1235                  * The left guard number should be able to hold checksums of a
1236                  * whole page
1237                  */
1238                 rc = obd_page_dif_generate_buffer(obd_name, pga[i]->pg,
1239                                                   pga[i]->off & ~PAGE_MASK,
1240                                                   count,
1241                                                   guard_start + used_number,
1242                                                   guard_number - used_number,
1243                                                   &used, sector_size,
1244                                                   fn);
1245                 if (rc)
1246                         break;
1247
1248                 used_number += used;
1249                 if (used_number == guard_number) {
1250                         cfs_crypto_hash_update_page(req, __page, 0,
1251                                 used_number * sizeof(*guard_start));
1252                         used_number = 0;
1253                 }
1254
1255                 nob -= pga[i]->count;
1256                 pg_count--;
1257                 i++;
1258         }
1259         kunmap(__page);
1260         if (rc)
1261                 GOTO(out, rc);
1262
1263         if (used_number != 0)
1264                 cfs_crypto_hash_update_page(req, __page, 0,
1265                         used_number * sizeof(*guard_start));
1266
1267         bufsize = sizeof(cksum);
1268         cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize);
1269
1270         /* For sending we only compute the wrong checksum instead
1271          * of corrupting the data so it is still correct on a redo */
1272         if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND))
1273                 cksum++;
1274
1275         *check_sum = cksum;
1276 out:
1277         __free_page(__page);
1278         return rc;
1279 }
1280 #else /* !CONFIG_CRC_T10DIF */
1281 #define obd_dif_ip_fn NULL
1282 #define obd_dif_crc_fn NULL
1283 #define osc_checksum_bulk_t10pi(name, nob, pgc, pga, opc, fn, ssize, csum)  \
1284         -EOPNOTSUPP
1285 #endif /* CONFIG_CRC_T10DIF */
1286
1287 static int osc_checksum_bulk(int nob, size_t pg_count,
1288                              struct brw_page **pga, int opc,
1289                              enum cksum_types cksum_type,
1290                              u32 *cksum)
1291 {
1292         int                             i = 0;
1293         struct ahash_request           *req;
1294         unsigned int                    bufsize;
1295         unsigned char                   cfs_alg = cksum_obd2cfs(cksum_type);
1296
1297         LASSERT(pg_count > 0);
1298
1299         req = cfs_crypto_hash_init(cfs_alg, NULL, 0);
1300         if (IS_ERR(req)) {
1301                 CERROR("Unable to initialize checksum hash %s\n",
1302                        cfs_crypto_hash_name(cfs_alg));
1303                 return PTR_ERR(req);
1304         }
1305
1306         while (nob > 0 && pg_count > 0) {
1307                 unsigned int count = pga[i]->count > nob ? nob : pga[i]->count;
1308
1309                 /* corrupt the data before we compute the checksum, to
1310                  * simulate an OST->client data error */
1311                 if (i == 0 && opc == OST_READ &&
1312                     OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) {
1313                         unsigned char *ptr = kmap(pga[i]->pg);
1314                         int off = pga[i]->off & ~PAGE_MASK;
1315
1316                         memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob));
1317                         kunmap(pga[i]->pg);
1318                 }
1319                 cfs_crypto_hash_update_page(req, pga[i]->pg,
1320                                             pga[i]->off & ~PAGE_MASK,
1321                                             count);
1322                 LL_CDEBUG_PAGE(D_PAGE, pga[i]->pg, "off %d\n",
1323                                (int)(pga[i]->off & ~PAGE_MASK));
1324
1325                 nob -= pga[i]->count;
1326                 pg_count--;
1327                 i++;
1328         }
1329
1330         bufsize = sizeof(*cksum);
1331         cfs_crypto_hash_final(req, (unsigned char *)cksum, &bufsize);
1332
1333         /* For sending we only compute the wrong checksum instead
1334          * of corrupting the data so it is still correct on a redo */
1335         if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND))
1336                 (*cksum)++;
1337
1338         return 0;
1339 }
1340
1341 static int osc_checksum_bulk_rw(const char *obd_name,
1342                                 enum cksum_types cksum_type,
1343                                 int nob, size_t pg_count,
1344                                 struct brw_page **pga, int opc,
1345                                 u32 *check_sum)
1346 {
1347         obd_dif_csum_fn *fn = NULL;
1348         int sector_size = 0;
1349         int rc;
1350
1351         ENTRY;
1352         obd_t10_cksum2dif(cksum_type, &fn, &sector_size);
1353
1354         if (fn)
1355                 rc = osc_checksum_bulk_t10pi(obd_name, nob, pg_count, pga,
1356                                              opc, fn, sector_size, check_sum);
1357         else
1358                 rc = osc_checksum_bulk(nob, pg_count, pga, opc, cksum_type,
1359                                        check_sum);
1360
1361         RETURN(rc);
1362 }
1363
1364 static inline void osc_release_bounce_pages(struct brw_page **pga,
1365                                             u32 page_count)
1366 {
1367 #ifdef HAVE_LUSTRE_CRYPTO
1368         int i;
1369
1370         for (i = 0; i < page_count; i++) {
1371                 /* Bounce pages allocated by a call to
1372                  * llcrypt_encrypt_pagecache_blocks() in osc_brw_prep_request()
1373                  * are identified thanks to the PageChecked flag.
1374                  */
1375                 if (PageChecked(pga[i]->pg))
1376                         llcrypt_finalize_bounce_page(&pga[i]->pg);
1377                 pga[i]->count -= pga[i]->bp_count_diff;
1378                 pga[i]->off += pga[i]->bp_off_diff;
1379         }
1380 #endif
1381 }
1382
1383 static int
1384 osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa,
1385                      u32 page_count, struct brw_page **pga,
1386                      struct ptlrpc_request **reqp, int resend)
1387 {
1388         struct ptlrpc_request *req;
1389         struct ptlrpc_bulk_desc *desc;
1390         struct ost_body *body;
1391         struct obd_ioobj *ioobj;
1392         struct niobuf_remote *niobuf;
1393         int niocount, i, requested_nob, opc, rc, short_io_size = 0;
1394         struct osc_brw_async_args *aa;
1395         struct req_capsule *pill;
1396         struct brw_page *pg_prev;
1397         void *short_io_buf;
1398         const char *obd_name = cli->cl_import->imp_obd->obd_name;
1399         struct inode *inode;
1400         bool directio = false;
1401
1402         ENTRY;
1403         inode = page2inode(pga[0]->pg);
1404         if (inode == NULL) {
1405                 /* Try to get reference to inode from cl_page if we are
1406                  * dealing with direct IO, as handled pages are not
1407                  * actual page cache pages.
1408                  */
1409                 struct osc_async_page *oap = brw_page2oap(pga[0]);
1410                 struct cl_page *clpage = oap2cl_page(oap);
1411
1412                 inode = clpage->cp_inode;
1413                 if (inode)
1414                         directio = true;
1415         }
1416         if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ))
1417                 RETURN(-ENOMEM); /* Recoverable */
1418         if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2))
1419                 RETURN(-EINVAL); /* Fatal */
1420
1421         if ((cmd & OBD_BRW_WRITE) != 0) {
1422                 opc = OST_WRITE;
1423                 req = ptlrpc_request_alloc_pool(cli->cl_import,
1424                                                 osc_rq_pool,
1425                                                 &RQF_OST_BRW_WRITE);
1426         } else {
1427                 opc = OST_READ;
1428                 req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ);
1429         }
1430         if (req == NULL)
1431                 RETURN(-ENOMEM);
1432
1433         if (opc == OST_WRITE && inode && IS_ENCRYPTED(inode)) {
1434                 for (i = 0; i < page_count; i++) {
1435                         struct brw_page *pg = pga[i];
1436                         struct page *data_page = NULL;
1437                         bool retried = false;
1438                         bool lockedbymyself;
1439                         u32 nunits = (pg->off & ~PAGE_MASK) + pg->count;
1440                         struct address_space *map_orig = NULL;
1441                         pgoff_t index_orig;
1442
1443 retry_encrypt:
1444                         if (nunits & ~LUSTRE_ENCRYPTION_MASK)
1445                                 nunits = (nunits & LUSTRE_ENCRYPTION_MASK) +
1446                                         LUSTRE_ENCRYPTION_UNIT_SIZE;
1447                         /* The page can already be locked when we arrive here.
1448                          * This is possible when cl_page_assume/vvp_page_assume
1449                          * is stuck on wait_on_page_writeback with page lock
1450                          * held. In this case there is no risk for the lock to
1451                          * be released while we are doing our encryption
1452                          * processing, because writeback against that page will
1453                          * end in vvp_page_completion_write/cl_page_completion,
1454                          * which means only once the page is fully processed.
1455                          */
1456                         lockedbymyself = trylock_page(pg->pg);
1457                         if (directio) {
1458                                 map_orig = pg->pg->mapping;
1459                                 pg->pg->mapping = inode->i_mapping;
1460                                 index_orig = pg->pg->index;
1461                                 pg->pg->index = pg->off >> PAGE_SHIFT;
1462                         }
1463                         data_page =
1464                                 llcrypt_encrypt_pagecache_blocks(pg->pg,
1465                                                                  nunits, 0,
1466                                                                  GFP_NOFS);
1467                         if (directio) {
1468                                 pg->pg->mapping = map_orig;
1469                                 pg->pg->index = index_orig;
1470                         }
1471                         if (lockedbymyself)
1472                                 unlock_page(pg->pg);
1473                         if (IS_ERR(data_page)) {
1474                                 rc = PTR_ERR(data_page);
1475                                 if (rc == -ENOMEM && !retried) {
1476                                         retried = true;
1477                                         rc = 0;
1478                                         goto retry_encrypt;
1479                                 }
1480                                 ptlrpc_request_free(req);
1481                                 RETURN(rc);
1482                         }
1483                         /* Set PageChecked flag on bounce page for
1484                          * disambiguation in osc_release_bounce_pages().
1485                          */
1486                         SetPageChecked(data_page);
1487                         pg->pg = data_page;
1488                         /* there should be no gap in the middle of page array */
1489                         if (i == page_count - 1) {
1490                                 struct osc_async_page *oap = brw_page2oap(pg);
1491
1492                                 oa->o_size = oap->oap_count +
1493                                         oap->oap_obj_off + oap->oap_page_off;
1494                         }
1495                         /* len is forced to nunits, and relative offset to 0
1496                          * so store the old, clear text info
1497                          */
1498                         pg->bp_count_diff = nunits - pg->count;
1499                         pg->count = nunits;
1500                         pg->bp_off_diff = pg->off & ~PAGE_MASK;
1501                         pg->off = pg->off & PAGE_MASK;
1502                 }
1503         } else if (opc == OST_READ && inode && IS_ENCRYPTED(inode)) {
1504                 for (i = 0; i < page_count; i++) {
1505                         struct brw_page *pg = pga[i];
1506                         u32 nunits = (pg->off & ~PAGE_MASK) + pg->count;
1507
1508                         if (nunits & ~LUSTRE_ENCRYPTION_MASK)
1509                                 nunits = (nunits & LUSTRE_ENCRYPTION_MASK) +
1510                                         LUSTRE_ENCRYPTION_UNIT_SIZE;
1511                         /* count/off are forced to cover the whole encryption
1512                          * unit size so that all encrypted data is stored on the
1513                          * OST, so adjust bp_{count,off}_diff for the size of
1514                          * the clear text.
1515                          */
1516                         pg->bp_count_diff = nunits - pg->count;
1517                         pg->count = nunits;
1518                         pg->bp_off_diff = pg->off & ~PAGE_MASK;
1519                         pg->off = pg->off & PAGE_MASK;
1520                 }
1521         }
1522
1523         for (niocount = i = 1; i < page_count; i++) {
1524                 if (!can_merge_pages(pga[i - 1], pga[i]))
1525                         niocount++;
1526         }
1527
1528         pill = &req->rq_pill;
1529         req_capsule_set_size(pill, &RMF_OBD_IOOBJ, RCL_CLIENT,
1530                              sizeof(*ioobj));
1531         req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT,
1532                              niocount * sizeof(*niobuf));
1533
1534         for (i = 0; i < page_count; i++) {
1535                 short_io_size += pga[i]->count;
1536                 if (!inode || !IS_ENCRYPTED(inode)) {
1537                         pga[i]->bp_count_diff = 0;
1538                         pga[i]->bp_off_diff = 0;
1539                 }
1540         }
1541
1542         /* Check if read/write is small enough to be a short io. */
1543         if (short_io_size > cli->cl_max_short_io_bytes || niocount > 1 ||
1544             !imp_connect_shortio(cli->cl_import))
1545                 short_io_size = 0;
1546
1547         req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_CLIENT,
1548                              opc == OST_READ ? 0 : short_io_size);
1549         if (opc == OST_READ)
1550                 req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_SERVER,
1551                                      short_io_size);
1552
1553         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc);
1554         if (rc) {
1555                 ptlrpc_request_free(req);
1556                 RETURN(rc);
1557         }
1558         osc_set_io_portal(req);
1559
1560         ptlrpc_at_set_req_timeout(req);
1561         /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own
1562          * retry logic */
1563         req->rq_no_retry_einprogress = 1;
1564
1565         if (short_io_size != 0) {
1566                 desc = NULL;
1567                 short_io_buf = NULL;
1568                 goto no_bulk;
1569         }
1570
1571         desc = ptlrpc_prep_bulk_imp(req, page_count,
1572                 cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS,
1573                 (opc == OST_WRITE ? PTLRPC_BULK_GET_SOURCE :
1574                         PTLRPC_BULK_PUT_SINK),
1575                 OST_BULK_PORTAL,
1576                 &ptlrpc_bulk_kiov_pin_ops);
1577
1578         if (desc == NULL)
1579                 GOTO(out, rc = -ENOMEM);
1580         /* NB request now owns desc and will free it when it gets freed */
1581 no_bulk:
1582         body = req_capsule_client_get(pill, &RMF_OST_BODY);
1583         ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
1584         niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
1585         LASSERT(body != NULL && ioobj != NULL && niobuf != NULL);
1586
1587         lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
1588
1589         /* For READ and WRITE, we can't fill o_uid and o_gid using from_kuid()
1590          * and from_kgid(), because they are asynchronous. Fortunately, variable
1591          * oa contains valid o_uid and o_gid in these two operations.
1592          * Besides, filling o_uid and o_gid is enough for nrs-tbf, see LU-9658.
1593          * OBD_MD_FLUID and OBD_MD_FLUID is not set in order to avoid breaking
1594          * other process logic */
1595         body->oa.o_uid = oa->o_uid;
1596         body->oa.o_gid = oa->o_gid;
1597
1598         obdo_to_ioobj(oa, ioobj);
1599         ioobj->ioo_bufcnt = niocount;
1600         /* The high bits of ioo_max_brw tells server _maximum_ number of bulks
1601          * that might be send for this request.  The actual number is decided
1602          * when the RPC is finally sent in ptlrpc_register_bulk(). It sends
1603          * "max - 1" for old client compatibility sending "0", and also so
1604          * the actual maximum is a power-of-two number, not one less. LU-1431 */
1605         if (desc != NULL)
1606                 ioobj_max_brw_set(ioobj, desc->bd_md_max_brw);
1607         else /* short io */
1608                 ioobj_max_brw_set(ioobj, 0);
1609
1610         if (short_io_size != 0) {
1611                 if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
1612                         body->oa.o_valid |= OBD_MD_FLFLAGS;
1613                         body->oa.o_flags = 0;
1614                 }
1615                 body->oa.o_flags |= OBD_FL_SHORT_IO;
1616                 CDEBUG(D_CACHE, "Using short io for data transfer, size = %d\n",
1617                        short_io_size);
1618                 if (opc == OST_WRITE) {
1619                         short_io_buf = req_capsule_client_get(pill,
1620                                                               &RMF_SHORT_IO);
1621                         LASSERT(short_io_buf != NULL);
1622                 }
1623         }
1624
1625         LASSERT(page_count > 0);
1626         pg_prev = pga[0];
1627         for (requested_nob = i = 0; i < page_count; i++, niobuf++) {
1628                 struct brw_page *pg = pga[i];
1629                 int poff = pg->off & ~PAGE_MASK;
1630
1631                 LASSERT(pg->count > 0);
1632                 /* make sure there is no gap in the middle of page array */
1633                 LASSERTF(page_count == 1 ||
1634                          (ergo(i == 0, poff + pg->count == PAGE_SIZE) &&
1635                           ergo(i > 0 && i < page_count - 1,
1636                                poff == 0 && pg->count == PAGE_SIZE)   &&
1637                           ergo(i == page_count - 1, poff == 0)),
1638                          "i: %d/%d pg: %p off: %llu, count: %u\n",
1639                          i, page_count, pg, pg->off, pg->count);
1640                 LASSERTF(i == 0 || pg->off > pg_prev->off,
1641                          "i %d p_c %u pg %p [pri %lu ind %lu] off %llu"
1642                          " prev_pg %p [pri %lu ind %lu] off %llu\n",
1643                          i, page_count,
1644                          pg->pg, page_private(pg->pg), pg->pg->index, pg->off,
1645                          pg_prev->pg, page_private(pg_prev->pg),
1646                          pg_prev->pg->index, pg_prev->off);
1647                 LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) ==
1648                         (pg->flag & OBD_BRW_SRVLOCK));
1649                 if (short_io_size != 0 && opc == OST_WRITE) {
1650                         unsigned char *ptr = kmap_atomic(pg->pg);
1651
1652                         LASSERT(short_io_size >= requested_nob + pg->count);
1653                         memcpy(short_io_buf + requested_nob,
1654                                ptr + poff,
1655                                pg->count);
1656                         kunmap_atomic(ptr);
1657                 } else if (short_io_size == 0) {
1658                         desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff,
1659                                                          pg->count);
1660                 }
1661                 requested_nob += pg->count;
1662
1663                 if (i > 0 && can_merge_pages(pg_prev, pg)) {
1664                         niobuf--;
1665                         niobuf->rnb_len += pg->count;
1666                 } else {
1667                         niobuf->rnb_offset = pg->off;
1668                         niobuf->rnb_len    = pg->count;
1669                         niobuf->rnb_flags  = pg->flag;
1670                 }
1671                 pg_prev = pg;
1672         }
1673
1674         LASSERTF((void *)(niobuf - niocount) ==
1675                 req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE),
1676                 "want %p - real %p\n", req_capsule_client_get(&req->rq_pill,
1677                 &RMF_NIOBUF_REMOTE), (void *)(niobuf - niocount));
1678
1679         osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? requested_nob:0);
1680         if (resend) {
1681                 if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
1682                         body->oa.o_valid |= OBD_MD_FLFLAGS;
1683                         body->oa.o_flags = 0;
1684                 }
1685                 body->oa.o_flags |= OBD_FL_RECOV_RESEND;
1686         }
1687
1688         if (osc_should_shrink_grant(cli))
1689                 osc_shrink_grant_local(cli, &body->oa);
1690
1691         /* size[REQ_REC_OFF] still sizeof (*body) */
1692         if (opc == OST_WRITE) {
1693                 if (cli->cl_checksum &&
1694                     !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
1695                         /* store cl_cksum_type in a local variable since
1696                          * it can be changed via lprocfs */
1697                         enum cksum_types cksum_type = cli->cl_cksum_type;
1698
1699                         if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0)
1700                                 body->oa.o_flags = 0;
1701
1702                         body->oa.o_flags |= obd_cksum_type_pack(obd_name,
1703                                                                 cksum_type);
1704                         body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
1705
1706                         rc = osc_checksum_bulk_rw(obd_name, cksum_type,
1707                                                   requested_nob, page_count,
1708                                                   pga, OST_WRITE,
1709                                                   &body->oa.o_cksum);
1710                         if (rc < 0) {
1711                                 CDEBUG(D_PAGE, "failed to checksum, rc = %d\n",
1712                                        rc);
1713                                 GOTO(out, rc);
1714                         }
1715                         CDEBUG(D_PAGE, "checksum at write origin: %x\n",
1716                                body->oa.o_cksum);
1717
1718                         /* save this in 'oa', too, for later checking */
1719                         oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
1720                         oa->o_flags |= obd_cksum_type_pack(obd_name,
1721                                                            cksum_type);
1722                 } else {
1723                         /* clear out the checksum flag, in case this is a
1724                          * resend but cl_checksum is no longer set. b=11238 */
1725                         oa->o_valid &= ~OBD_MD_FLCKSUM;
1726                 }
1727                 oa->o_cksum = body->oa.o_cksum;
1728                 /* 1 RC per niobuf */
1729                 req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER,
1730                                      sizeof(__u32) * niocount);
1731         } else {
1732                 if (cli->cl_checksum &&
1733                     !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
1734                         if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0)
1735                                 body->oa.o_flags = 0;
1736                         body->oa.o_flags |= obd_cksum_type_pack(obd_name,
1737                                 cli->cl_cksum_type);
1738                         body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
1739                 }
1740
1741                 /* Client cksum has been already copied to wire obdo in previous
1742                  * lustre_set_wire_obdo(), and in the case a bulk-read is being
1743                  * resent due to cksum error, this will allow Server to
1744                  * check+dump pages on its side */
1745         }
1746         ptlrpc_request_set_replen(req);
1747
1748         aa = ptlrpc_req_async_args(aa, req);
1749         aa->aa_oa = oa;
1750         aa->aa_requested_nob = requested_nob;
1751         aa->aa_nio_count = niocount;
1752         aa->aa_page_count = page_count;
1753         aa->aa_resends = 0;
1754         aa->aa_ppga = pga;
1755         aa->aa_cli = cli;
1756         INIT_LIST_HEAD(&aa->aa_oaps);
1757
1758         *reqp = req;
1759         niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
1760         CDEBUG(D_RPCTRACE, "brw rpc %p - object "DOSTID" offset %lld<>%lld\n",
1761                 req, POSTID(&oa->o_oi), niobuf[0].rnb_offset,
1762                 niobuf[niocount - 1].rnb_offset + niobuf[niocount - 1].rnb_len);
1763         RETURN(0);
1764
1765  out:
1766         ptlrpc_req_finished(req);
1767         RETURN(rc);
1768 }
1769
/* Path of the checksum dump file; rebuilt by every dump_all_bulk_pages()
 * call and reused in that function's log messages. */
1770 char dbgcksum_file_name[PATH_MAX];
1771
1772 static void dump_all_bulk_pages(struct obdo *oa, __u32 page_count,
1773                                 struct brw_page **pga, __u32 server_cksum,
1774                                 __u32 client_cksum)
1775 {
1776         struct file *filp;
1777         int rc, i;
1778         unsigned int len;
1779         char *buf;
1780
1781         /* will only keep dump of pages on first error for the same range in
1782          * file/fid, not during the resends/retries. */
1783         snprintf(dbgcksum_file_name, sizeof(dbgcksum_file_name),
1784                  "%s-checksum_dump-osc-"DFID":[%llu-%llu]-%x-%x",
1785                  (strncmp(libcfs_debug_file_path, "NONE", 4) != 0 ?
1786                   libcfs_debug_file_path : LIBCFS_DEBUG_FILE_PATH_DEFAULT),
1787                  oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : 0ULL,
1788                  oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0,
1789                  oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0,
1790                  pga[0]->off,
1791                  pga[page_count-1]->off + pga[page_count-1]->count - 1,
1792                  client_cksum, server_cksum);
1793         filp = filp_open(dbgcksum_file_name,
1794                          O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, 0600);
1795         if (IS_ERR(filp)) {
1796                 rc = PTR_ERR(filp);
1797                 if (rc == -EEXIST)
1798                         CDEBUG(D_INFO, "%s: can't open to dump pages with "
1799                                "checksum error: rc = %d\n", dbgcksum_file_name,
1800                                rc);
1801                 else
1802                         CERROR("%s: can't open to dump pages with checksum "
1803                                "error: rc = %d\n", dbgcksum_file_name, rc);
1804                 return;
1805         }
1806
1807         for (i = 0; i < page_count; i++) {
1808                 len = pga[i]->count;
1809                 buf = kmap(pga[i]->pg);
1810                 while (len != 0) {
1811                         rc = cfs_kernel_write(filp, buf, len, &filp->f_pos);
1812                         if (rc < 0) {
1813                                 CERROR("%s: wanted to write %u but got %d "
1814                                        "error\n", dbgcksum_file_name, len, rc);
1815                                 break;
1816                         }
1817                         len -= rc;
1818                         buf += rc;
1819                         CDEBUG(D_INFO, "%s: wrote %d bytes\n",
1820                                dbgcksum_file_name, rc);
1821                 }
1822                 kunmap(pga[i]->pg);
1823         }
1824
1825         rc = vfs_fsync_range(filp, 0, LLONG_MAX, 1);
1826         if (rc)
1827                 CERROR("%s: sync returns %d\n", dbgcksum_file_name, rc);
1828         filp_close(filp, NULL);
1829 }
1830
1831 static int
1832 check_write_checksum(struct obdo *oa, const struct lnet_process_id *peer,
1833                      __u32 client_cksum, __u32 server_cksum,
1834                      struct osc_brw_async_args *aa)
1835 {
1836         const char *obd_name = aa->aa_cli->cl_import->imp_obd->obd_name;
1837         enum cksum_types cksum_type;
1838         obd_dif_csum_fn *fn = NULL;
1839         int sector_size = 0;
1840         __u32 new_cksum;
1841         char *msg;
1842         int rc;
1843
1844         if (server_cksum == client_cksum) {
1845                 CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum);
1846                 return 0;
1847         }
1848
1849         if (aa->aa_cli->cl_checksum_dump)
1850                 dump_all_bulk_pages(oa, aa->aa_page_count, aa->aa_ppga,
1851                                     server_cksum, client_cksum);
1852
1853         cksum_type = obd_cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ?
1854                                            oa->o_flags : 0);
1855
1856         switch (cksum_type) {
1857         case OBD_CKSUM_T10IP512:
1858                 fn = obd_dif_ip_fn;
1859                 sector_size = 512;
1860                 break;
1861         case OBD_CKSUM_T10IP4K:
1862                 fn = obd_dif_ip_fn;
1863                 sector_size = 4096;
1864                 break;
1865         case OBD_CKSUM_T10CRC512:
1866                 fn = obd_dif_crc_fn;
1867                 sector_size = 512;
1868                 break;
1869         case OBD_CKSUM_T10CRC4K:
1870                 fn = obd_dif_crc_fn;
1871                 sector_size = 4096;
1872                 break;
1873         default:
1874                 break;
1875         }
1876
1877         if (fn)
1878                 rc = osc_checksum_bulk_t10pi(obd_name, aa->aa_requested_nob,
1879                                              aa->aa_page_count, aa->aa_ppga,
1880                                              OST_WRITE, fn, sector_size,
1881                                              &new_cksum);
1882         else
1883                 rc = osc_checksum_bulk(aa->aa_requested_nob, aa->aa_page_count,
1884                                        aa->aa_ppga, OST_WRITE, cksum_type,
1885                                        &new_cksum);
1886
1887         if (rc < 0)
1888                 msg = "failed to calculate the client write checksum";
1889         else if (cksum_type != obd_cksum_type_unpack(aa->aa_oa->o_flags))
1890                 msg = "the server did not use the checksum type specified in "
1891                       "the original request - likely a protocol problem";
1892         else if (new_cksum == server_cksum)
1893                 msg = "changed on the client after we checksummed it - "
1894                       "likely false positive due to mmap IO (bug 11742)";
1895         else if (new_cksum == client_cksum)
1896                 msg = "changed in transit before arrival at OST";
1897         else
1898                 msg = "changed in transit AND doesn't match the original - "
1899                       "likely false positive due to mmap IO (bug 11742)";
1900
1901         LCONSOLE_ERROR_MSG(0x132, "%s: BAD WRITE CHECKSUM: %s: from %s inode "
1902                            DFID " object "DOSTID" extent [%llu-%llu], original "
1903                            "client csum %x (type %x), server csum %x (type %x),"
1904                            " client csum now %x\n",
1905                            obd_name, msg, libcfs_nid2str(peer->nid),
1906                            oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0,
1907                            oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0,
1908                            oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0,
1909                            POSTID(&oa->o_oi), aa->aa_ppga[0]->off,
1910                            aa->aa_ppga[aa->aa_page_count - 1]->off +
1911                                 aa->aa_ppga[aa->aa_page_count-1]->count - 1,
1912                            client_cksum,
1913                            obd_cksum_type_unpack(aa->aa_oa->o_flags),
1914                            server_cksum, cksum_type, new_cksum);
1915         return 1;
1916 }
1917
1918 /* Note rc enters this function as number of bytes transferred */
     /*
      * Finish processing the reply of a BRW (OST_READ/OST_WRITE) request.
      *
      * Returns early with the incoming rc when it is negative (except
      * -EDQUOT, which is returned only after the quota flags and grant from
      * the reply have been processed), -EPROTO when the reply body cannot be
      * unpacked or a size is unexpected, and -EAGAIN when bulk unwrapping
      * fails or a checksum does not match.  For writes the final result is
      * that of check_write_rcs().  Whenever the "out" label is reached with
      * rc >= 0 the reply obdo is copied into aa->aa_oa.
      */
1919 static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
1920 {
1921         struct osc_brw_async_args *aa = (void *)&req->rq_async_args;
1922         struct client_obd *cli = aa->aa_cli;
1923         const char *obd_name = cli->cl_import->imp_obd->obd_name;
1924         const struct lnet_process_id *peer =
1925                 &req->rq_import->imp_connection->c_peer;
1926         struct ost_body *body;
1927         u32 client_cksum = 0;
1928         struct inode *inode;
1929         unsigned int blockbits = 0, blocksize = 0;
1930
1931         ENTRY;
1932
             /* -EDQUOT is let through so that the quota flags and the grant
              * carried by the reply are still processed below */
1933         if (rc < 0 && rc != -EDQUOT) {
1934                 DEBUG_REQ(D_INFO, req, "Failed request: rc = %d", rc);
1935                 RETURN(rc);
1936         }
1937
1938         LASSERTF(req->rq_repmsg != NULL, "rc = %d\n", rc);
1939         body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
1940         if (body == NULL) {
1941                 DEBUG_REQ(D_INFO, req, "cannot unpack body");
1942                 RETURN(-EPROTO);
1943         }
1944
1945         /* set/clear over quota flag for a uid/gid/projid */
1946         if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE &&
1947             body->oa.o_valid & (OBD_MD_FLALLQUOTA)) {
1948                 unsigned qid[LL_MAXQUOTAS] = {
1949                                          body->oa.o_uid, body->oa.o_gid,
1950                                          body->oa.o_projid };
1951                 CDEBUG(D_QUOTA,
1952                        "setdq for [%u %u %u] with valid %#llx, flags %x\n",
1953                        body->oa.o_uid, body->oa.o_gid, body->oa.o_projid,
1954                        body->oa.o_valid, body->oa.o_flags);
                     /* despite its indentation the call below is a separate
                      * statement, not an argument of the CDEBUG above */
1955                        osc_quota_setdq(cli, req->rq_xid, qid, body->oa.o_valid,
1956                                        body->oa.o_flags);
1957         }
1958
1959         osc_update_grant(cli, body);
1960
             /* only -EDQUOT can still be negative at this point */
1961         if (rc < 0)
1962                 RETURN(rc);
1963
1964         if (aa->aa_oa->o_valid & OBD_MD_FLCKSUM)
1965                 client_cksum = aa->aa_oa->o_cksum; /* save for later */
1966
1967         if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) {
1968                 if (rc > 0) {
1969                         CERROR("%s: unexpected positive size %d\n",
1970                                obd_name, rc);
1971                         RETURN(-EPROTO);
1972                 }
1973
1974                 if (req->rq_bulk != NULL &&
1975                     sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk))
1976                         RETURN(-EAGAIN);
1977
                     /* a non-zero return from check_write_checksum() means
                      * the checksums differ */
1978                 if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum &&
1979                     check_write_checksum(&body->oa, peer, client_cksum,
1980                                          body->oa.o_cksum, aa))
1981                         RETURN(-EAGAIN);
1982
1983                 rc = check_write_rcs(req, aa->aa_requested_nob,
1984                                      aa->aa_nio_count, aa->aa_page_count,
1985                                      aa->aa_ppga);
1986                 GOTO(out, rc);
1987         }
1988
1989         /* The rest of this function executes only for OST_READs */
1990
             /* no bulk descriptor means short io: the data is carried inline
              * in the RMF_SHORT_IO reply field and rc becomes that field's
              * size */
1991         if (req->rq_bulk == NULL) {
1992                 rc = req_capsule_get_size(&req->rq_pill, &RMF_SHORT_IO,
1993                                           RCL_SERVER);
1994                 LASSERT(rc == req->rq_status);
1995         } else {
1996                 /* if unwrap_bulk failed, return -EAGAIN to retry */
1997                 rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc);
1998         }
1999         if (rc < 0)
2000                 GOTO(out, rc = -EAGAIN);
2001
2002         if (rc > aa->aa_requested_nob) {
2003                 CERROR("%s: unexpected size %d, requested %d\n", obd_name,
2004                        rc, aa->aa_requested_nob);
2005                 RETURN(-EPROTO);
2006         }
2007
2008         if (req->rq_bulk != NULL && rc != req->rq_bulk->bd_nob_transferred) {
2009                 CERROR("%s: unexpected size %d, transferred %d\n", obd_name,
2010                        rc, req->rq_bulk->bd_nob_transferred);
2011                 RETURN(-EPROTO);
2012         }
2013
2014         if (req->rq_bulk == NULL) {
2015                 /* short io */
2016                 int nob, pg_count, i = 0;
2017                 unsigned char *buf;
2018
2019                 CDEBUG(D_CACHE, "Using short io read, size %d\n", rc);
2020                 pg_count = aa->aa_page_count;
                     /* NOTE(review): buf is used below without a NULL check
                      * -- confirm the sized get cannot fail here */
2021                 buf = req_capsule_server_sized_get(&req->rq_pill, &RMF_SHORT_IO,
2022                                                    rc);
2023                 nob = rc;
                     /* copy the inline data into the pages in array order,
                      * at most one page's count per iteration */
2024                 while (nob > 0 && pg_count > 0) {
2025                         unsigned char *ptr;
2026                         int count = aa->aa_ppga[i]->count > nob ?
2027                                     nob : aa->aa_ppga[i]->count;
2028
2029                         CDEBUG(D_CACHE, "page %p count %d\n",
2030                                aa->aa_ppga[i]->pg, count);
2031                         ptr = kmap_atomic(aa->aa_ppga[i]->pg);
2032                         memcpy(ptr + (aa->aa_ppga[i]->off & ~PAGE_MASK), buf,
2033                                count);
2034                         kunmap_atomic((void *) ptr);
2035
2036                         buf += count;
2037                         nob -= count;
2038                         i++;
2039                         pg_count--;
2040                 }
2041         }
2042
2043         if (rc < aa->aa_requested_nob)
2044                 handle_short_read(rc, aa->aa_page_count, aa->aa_ppga);
2045
2046         if (body->oa.o_valid & OBD_MD_FLCKSUM) {
2047                 static int cksum_counter;
2048                 u32        server_cksum = body->oa.o_cksum;
2049                 char      *via = "";
2050                 char      *router = "";
2051                 enum cksum_types cksum_type;
2052                 u32 o_flags = body->oa.o_valid & OBD_MD_FLFLAGS ?
2053                         body->oa.o_flags : 0;
2054
2055                 cksum_type = obd_cksum_type_unpack(o_flags);
                     /* rc (bytes read) is the length to checksum; from here
                      * on rc is an error code or 0 */
2056                 rc = osc_checksum_bulk_rw(obd_name, cksum_type, rc,
2057                                           aa->aa_page_count, aa->aa_ppga,
2058                                           OST_READ, &client_cksum);
2059                 if (rc < 0)
2060                         GOTO(out, rc);
2061
                     /* name the bulk sender in the message when it differs
                      * from the peer nid */
2062                 if (req->rq_bulk != NULL &&
2063                     peer->nid != req->rq_bulk->bd_sender) {
2064                         via = " via ";
2065                         router = libcfs_nid2str(req->rq_bulk->bd_sender);
2066                 }
2067
2068                 if (server_cksum != client_cksum) {
2069                         struct ost_body *clbody;
2070                         u32 page_count = aa->aa_page_count;
2071
2072                         clbody = req_capsule_client_get(&req->rq_pill,
2073                                                         &RMF_OST_BODY);
2074                         if (cli->cl_checksum_dump)
2075                                 dump_all_bulk_pages(&clbody->oa, page_count,
2076                                                     aa->aa_ppga, server_cksum,
2077                                                     client_cksum);
2078
2079                         LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from "
2080                                            "%s%s%s inode "DFID" object "DOSTID
2081                                            " extent [%llu-%llu], client %x, "
2082                                            "server %x, cksum_type %x\n",
2083                                            obd_name,
2084                                            libcfs_nid2str(peer->nid),
2085                                            via, router,
2086                                            clbody->oa.o_valid & OBD_MD_FLFID ?
2087                                                 clbody->oa.o_parent_seq : 0ULL,
2088                                            clbody->oa.o_valid & OBD_MD_FLFID ?
2089                                                 clbody->oa.o_parent_oid : 0,
2090                                            clbody->oa.o_valid & OBD_MD_FLFID ?
2091                                                 clbody->oa.o_parent_ver : 0,
2092                                            POSTID(&body->oa.o_oi),
2093                                            aa->aa_ppga[0]->off,
2094                                            aa->aa_ppga[page_count-1]->off +
2095                                            aa->aa_ppga[page_count-1]->count - 1,
2096                                            client_cksum, server_cksum,
2097                                            cksum_type);
2098                         cksum_counter = 0;
2099                         aa->aa_oa->o_cksum = client_cksum;
2100                         rc = -EAGAIN;
2101                 } else {
2102                         cksum_counter++;
2103                         CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum);
2104                         rc = 0;
2105                 }
2106         } else if (unlikely(client_cksum)) {
2107                 static int cksum_missed;
2108
                     /* x & -x isolates the lowest set bit, so the message is
                      * printed only when the counter is a power of two.
                      * NOTE(review): rc keeps the byte count on this path
                      * while the sibling branches set it to 0 -- confirm
                      * callers accept a positive return */
2109                 cksum_missed++;
2110                 if ((cksum_missed & (-cksum_missed)) == cksum_missed)
2111                         CERROR("%s: checksum %u requested from %s but not sent\n",
2112                                obd_name, cksum_missed,
2113                                libcfs_nid2str(peer->nid));
2114         } else {
2115                 rc = 0;
2116         }
2117
             /* blockbits/blocksize are set only when the inode comes from
              * the cl_page; a non-zero blockbits selects the per-block
              * decrypt path in the loop below */
2118         inode = page2inode(aa->aa_ppga[0]->pg);
2119         if (inode == NULL) {
2120                 /* Try to get reference to inode from cl_page if we are
2121                  * dealing with direct IO, as handled pages are not
2122                  * actual page cache pages.
2123                  */
2124                 struct osc_async_page *oap = brw_page2oap(aa->aa_ppga[0]);
2125
2126                 inode = oap2cl_page(oap)->cp_inode;
2127                 if (inode) {
2128                         blockbits = inode->i_blkbits;
2129                         blocksize = 1 << blockbits;
2130                 }
2131         }
2132         if (inode && IS_ENCRYPTED(inode)) {
2133                 int idx;
2134
2135                 if (!llcrypt_has_encryption_key(inode)) {
2136                         CDEBUG(D_SEC, "no enc key for ino %lu\n", inode->i_ino);
2137                         GOTO(out, rc);
2138                 }
                     /* decrypt each page in LUSTRE_ENCRYPTION_UNIT_SIZE
                      * steps, stopping at the first all-zero unit */
2139                 for (idx = 0; idx < aa->aa_page_count; idx++) {
2140                         struct brw_page *pg = aa->aa_ppga[idx];
2141                         unsigned int offs = 0;
2142
2143                         while (offs < PAGE_SIZE) {
2144                                 /* do not decrypt if page is all 0s */
2145                                 if (memchr_inv(page_address(pg->pg) + offs, 0,
2146                                          LUSTRE_ENCRYPTION_UNIT_SIZE) == NULL) {
2147                                         /* if page is empty forward info to
2148                                          * upper layers (ll_io_zero_page) by
2149                                          * clearing PagePrivate2
2150                                          */
2151                                         if (!offs)
2152                                                 ClearPagePrivate2(pg->pg);
2153                                         break;
2154                                 }
2155
2156                                 if (blockbits) {
2157                                         /* This is direct IO case. Directly call
2158                                          * decrypt function that takes inode as
2159                                          * input parameter. Page does not need
2160                                          * to be locked.
2161                                          */
2162                                         u64 lblk_num =
2163                                                 ((u64)(pg->off >> PAGE_SHIFT) <<
2164                                                      (PAGE_SHIFT - blockbits)) +
2165                                                        (offs >> blockbits);
2166                                         unsigned int i;
2167
2168                                         for (i = offs;
2169                                              i < offs +
2170                                                     LUSTRE_ENCRYPTION_UNIT_SIZE;
2171                                              i += blocksize, lblk_num++) {
2172                                                 rc =
2173                                                   llcrypt_decrypt_block_inplace(
2174                                                           inode, pg->pg,
2175                                                           blocksize, i,
2176                                                           lblk_num);
2177                                                 if (rc)
2178                                                         break;
2179                                         }
2180                                 } else {
2181                                         rc = llcrypt_decrypt_pagecache_blocks(
2182                                                 pg->pg,
2183                                                 LUSTRE_ENCRYPTION_UNIT_SIZE,
2184                                                 offs);
2185                                 }
2186                                 if (rc)
2187                                         GOTO(out, rc);
2188
2189                                 offs += LUSTRE_ENCRYPTION_UNIT_SIZE;
2190                         }
2191                 }
2192         }
2193
2194 out:
             /* copy the reply obdo back to the caller's obdo unless an
              * error is being returned */
2195         if (rc >= 0)
2196                 lustre_get_wire_obdo(&req->rq_import->imp_connect_data,
2197                                      aa->aa_oa, &body->oa);
2198
2199         RETURN(rc);
2200 }
2201
2202 static int osc_brw_redo_request(struct ptlrpc_request *request,
2203                                 struct osc_brw_async_args *aa, int rc)
2204 {
2205         struct ptlrpc_request *new_req;
2206         struct osc_brw_async_args *new_aa;
2207         struct osc_async_page *oap;
2208         ENTRY;
2209
2210         /* The below message is checked in replay-ost-single.sh test_8ae*/
2211         DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request,
2212                   "redo for recoverable error %d", rc);
2213
2214         rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) ==
2215                                 OST_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ,
2216                                   aa->aa_cli, aa->aa_oa, aa->aa_page_count,
2217                                   aa->aa_ppga, &new_req, 1);
2218         if (rc)
2219                 RETURN(rc);
2220
2221         list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) {
2222                 if (oap->oap_request != NULL) {
2223                         LASSERTF(request == oap->oap_request,
2224                                  "request %p != oap_request %p\n",
2225                                  request, oap->oap_request);
2226                 }
2227         }
2228         /*
2229          * New request takes over pga and oaps from old request.
2230          * Note that copying a list_head doesn't work, need to move it...
2231          */
2232         aa->aa_resends++;
2233         new_req->rq_interpret_reply = request->rq_interpret_reply;
2234         new_req->rq_async_args = request->rq_async_args;
2235         new_req->rq_commit_cb = request->rq_commit_cb;
2236         /* cap resend delay to the current request timeout, this is similar to
2237          * what ptlrpc does (see after_reply()) */
2238         if (aa->aa_resends > new_req->rq_timeout)
2239                 new_req->rq_sent = ktime_get_real_seconds() + new_req->rq_timeout;
2240         else
2241                 new_req->rq_sent = ktime_get_real_seconds() + aa->aa_resends;
2242         new_req->rq_generation_set = 1;
2243         new_req->rq_import_generation = request->rq_import_generation;
2244
2245         new_aa = ptlrpc_req_async_args(new_aa, new_req);
2246
2247         INIT_LIST_HEAD(&new_aa->aa_oaps);
2248         list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps);
2249         INIT_LIST_HEAD(&new_aa->aa_exts);
2250         list_splice_init(&aa->aa_exts, &new_aa->aa_exts);
2251         new_aa->aa_resends = aa->aa_resends;
2252
2253         list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) {
2254                 if (oap->oap_request) {
2255                         ptlrpc_req_finished(oap->oap_request);
2256                         oap->oap_request = ptlrpc_request_addref(new_req);
2257                 }
2258         }
2259
2260         /* XXX: This code will run into problem if we're going to support
2261          * to add a series of BRW RPCs into a self-defined ptlrpc_request_set
2262          * and wait for all of them to be finished. We should inherit request
2263          * set from old request. */
2264         ptlrpcd_add_req(new_req);
2265
2266         DEBUG_REQ(D_INFO, new_req, "new request");
2267         RETURN(0);
2268 }
2269
2270 /*
2271  * ugh, we want disk allocation on the target to happen in offset order.  we'll
2272  * follow sedgewicks advice and stick to the dead simple shellsort -- it'll do
2273  * fine for our small page arrays and doesn't require allocation.  its an
2274  * insertion sort that swaps elements that are strides apart, shrinking the
2275  * stride down until its '1' and the array is sorted.
2276  */
2277 static void sort_brw_pages(struct brw_page **array, int num)
2278 {
2279         int stride, i, j;
2280         struct brw_page *tmp;
2281
2282         if (num == 1)
2283                 return;
2284         for (stride = 1; stride < num ; stride = (stride * 3) + 1)
2285                 ;
2286
2287         do {
2288                 stride /= 3;
2289                 for (i = stride ; i < num ; i++) {
2290                         tmp = array[i];
2291                         j = i;
2292                         while (j >= stride && array[j - stride]->off > tmp->off) {
2293                                 array[j] = array[j - stride];
2294                                 j -= stride;
2295                         }
2296                         array[j] = tmp;
2297                 }
2298         } while (stride > 1);
2299 }
2300
/*
 * Free the array of brw_page pointers @ppga, which holds @count entries.
 *
 * This is the counterpart of the OBD_ALLOC_PTR_ARRAY_LARGE() call in
 * osc_build_rpc().  Only the pointer array is freed here; the brw_page
 * structures it points at are not touched.  @ppga must not be NULL.
 */
2301 static void osc_release_ppga(struct brw_page **ppga, size_t count)
2302 {
2303         LASSERT(ppga != NULL);
2304         OBD_FREE_PTR_ARRAY_LARGE(ppga, count);
2305 }
2306
/*
 * Reply interpreter for bulk read/write requests; osc_build_rpc() installs
 * it as rq_interpret_reply.
 *
 * @args is the request's struct osc_brw_async_args and @rc the status handed
 * in by the caller.  The function
 *  - finishes the request with osc_brw_fini_request() and releases bounce
 *    pages;
 *  - on a recoverable error, tries to resend through osc_brw_redo_request()
 *    and returns 0 right away if rc is 0 afterwards;
 *  - on success, applies the attributes flagged valid in aa->aa_oa to the
 *    cl_object of the last page;
 *  - frees the obdo, finishes every extent on aa->aa_exts, frees the page
 *    pointer array, drops the read/write in-flight counter and unplugs
 *    further I/O.
 *
 * Returns 0 if rc is 0 after the resend branch, otherwise the final status
 * of the request.
 */
2307 static int brw_interpret(const struct lu_env *env,
2308                          struct ptlrpc_request *req, void *args, int rc)
2309 {
2310         struct osc_brw_async_args *aa = args;
2311         struct osc_extent *ext;
2312         struct osc_extent *tmp;
2313         struct client_obd *cli = aa->aa_cli;
2314         unsigned long transferred = 0;
2315
2316         ENTRY;
2317
2318         rc = osc_brw_fini_request(req, rc);
2319         CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc);
2320
2321         /* restore clear text pages */
2322         osc_release_bounce_pages(aa->aa_ppga, aa->aa_page_count);
2323
2324         /*
2325          * When server returns -EINPROGRESS, client should always retry
2326          * regardless of the number of times the bulk was resent already.
2327          */
2328         if (osc_recoverable_error(rc) && !req->rq_no_delay) {
2329                 if (req->rq_import_generation !=
2330                     req->rq_import->imp_generation) {
                     /* import generation changed since the request was
                      * built: do not resend, fall through with rc as is */
2331                         CDEBUG(D_HA, "%s: resend cross eviction for object: "
2332                                ""DOSTID", rc = %d.\n",
2333                                req->rq_import->imp_obd->obd_name,
2334                                POSTID(&aa->aa_oa->o_oi), rc);
2335                 } else if (rc == -EINPROGRESS ||
2336                     client_should_resend(aa->aa_resends, aa->aa_cli)) {
2337                         rc = osc_brw_redo_request(req, aa, rc);
2338                 } else {
2339                         CERROR("%s: too many resent retries for object: "
2340                                "%llu:%llu, rc = %d.\n",
2341                                req->rq_import->imp_obd->obd_name,
2342                                POSTID(&aa->aa_oa->o_oi), rc);
2343                 }
2344
                     /* rc == 0 here presumably only after a successful
                      * osc_brw_redo_request() (assumes osc_recoverable_error()
                      * is false for 0).  In that case the new request has
                      * taken over aa_oaps/aa_exts, so nothing is completed
                      * here.  Retry-type codes that were not resent are
                      * reported as -EIO. */
2345                 if (rc == 0)
2346                         RETURN(0);
2347                 else if (rc == -EAGAIN || rc == -EINPROGRESS)
2348                         rc = -EIO;
2349         }
2350
2351         if (rc == 0) {
2352                 struct obdo *oa = aa->aa_oa;
2353                 struct cl_attr *attr = &osc_env_info(env)->oti_attr;
2354                 unsigned long valid = 0;
2355                 struct cl_object *obj;
2356                 struct osc_async_page *last;
2357
                     /* the object and the end-of-I/O offset are both taken
                      * from the last entry of the page array */
2358                 last = brw_page2oap(aa->aa_ppga[aa->aa_page_count - 1]);
2359                 obj = osc2cl(last->oap_obj);
2360
                     /* collect every attribute the obdo marks as valid and
                      * apply them in one update under the attr lock */
2361                 cl_object_attr_lock(obj);
2362                 if (oa->o_valid & OBD_MD_FLBLOCKS) {
2363                         attr->cat_blocks = oa->o_blocks;
2364                         valid |= CAT_BLOCKS;
2365                 }
2366                 if (oa->o_valid & OBD_MD_FLMTIME) {
2367                         attr->cat_mtime = oa->o_mtime;
2368                         valid |= CAT_MTIME;
2369                 }
2370                 if (oa->o_valid & OBD_MD_FLATIME) {
2371                         attr->cat_atime = oa->o_atime;
2372                         valid |= CAT_ATIME;
2373                 }
2374                 if (oa->o_valid & OBD_MD_FLCTIME) {
2375                         attr->cat_ctime = oa->o_ctime;
2376                         valid |= CAT_CTIME;
2377                 }
2378
2379                 if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) {
2380                         struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo;
                             /* byte offset just past the end of the last page's data */
2381                         loff_t last_off = last->oap_count + last->oap_obj_off +
2382                                 last->oap_page_off;
2383
2384                         /* Change file size if this is an out of quota or
2385                          * direct IO write and it extends the file size */
2386                         if (loi->loi_lvb.lvb_size < last_off) {
2387                                 attr->cat_size = last_off;
2388                                 valid |= CAT_SIZE;
2389                         }
2390                         /* Extend KMS if it's not a lockless write */
2391                         if (loi->loi_kms < last_off &&
2392                             oap2osc_page(last)->ops_srvlock == 0) {
2393                                 attr->cat_kms = last_off;
2394                                 valid |= CAT_KMS;
2395                         }
2396                 }
2397
2398                 if (valid != 0)
2399                         cl_object_attr_update(env, obj, attr, valid);
2400                 cl_object_attr_unlock(obj);
2401         }
             /* the obdo comes from the osc_obdo_kmem slab (osc_build_rpc()
              * allocates one there); clear the pointer once it is freed */
2402         OBD_SLAB_FREE_PTR(aa->aa_oa, osc_obdo_kmem);
2403         aa->aa_oa = NULL;
2404
2405         if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && rc == 0)
2406                 osc_inc_unstable_pages(req);
2407
             /* finish every extent of this RPC; a failed no-delay request
              * is reported to the extent as -EAGAIN instead of rc */
2408         list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) {
2409                 list_del_init(&ext->oe_link);
2410                 osc_extent_finish(env, ext, 1,
2411                                   rc && req->rq_no_delay ? -EAGAIN : rc);
2412         }
             /* NOTE(review): aa_oaps is expected to be empty here,
              * presumably because osc_extent_finish() unlinks the pages
              * from the RPC list -- confirm against osc_extent_finish() */
2413         LASSERT(list_empty(&aa->aa_exts));
2414         LASSERT(list_empty(&aa->aa_oaps));
2415
2416         transferred = (req->rq_bulk == NULL ? /* short io */
2417                        aa->aa_requested_nob :
2418                        req->rq_bulk->bd_nob_transferred);
2419
2420         osc_release_ppga(aa->aa_ppga, aa->aa_page_count);
2421         ptlrpc_lprocfs_brw(req, transferred);
2422
2423         spin_lock(&cli->cl_loi_list_lock);
2424         /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters
2425          * is called so we know whether to go to sync BRWs or wait for more
2426          * RPCs to complete */
2427         if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE)
2428                 cli->cl_w_in_flight--;
2429         else
2430                 cli->cl_r_in_flight--;
2431         osc_wake_cache_waiters(cli);
2432         spin_unlock(&cli->cl_loi_list_lock);
2433
2434         osc_io_unplug(env, cli, NULL);
2435         RETURN(rc);
2436 }
2437
2438 static void brw_commit(struct ptlrpc_request *req)
2439 {
2440         /* If osc_inc_unstable_pages (via osc_extent_finish) races with
2441          * this called via the rq_commit_cb, I need to ensure
2442          * osc_dec_unstable_pages is still called. Otherwise unstable
2443          * pages may be leaked. */
2444         spin_lock(&req->rq_lock);
2445         if (likely(req->rq_unstable)) {
2446                 req->rq_unstable = 0;
2447                 spin_unlock(&req->rq_lock);
2448
2449                 osc_dec_unstable_pages(req);
2450         } else {
2451                 req->rq_committed = 1;
2452                 spin_unlock(&req->rq_lock);
2453         }
2454 }
2455
2456 /**
2457  * Build an RPC by the list of extent @ext_list. The caller must ensure
2458  * that the total pages in this list are NOT over max pages per RPC.
2459  * Extents in the list must be in OES_RPC state.
      *
      * @env:      environment; supplies the per-thread cl_req_attr scratch area
      * @cli:      client the RPC is built for; its in-flight counters and
      *            histograms are updated under cl_loi_list_lock
      * @ext_list: non-empty list of extents, linked through oe_link
      * @cmd:      compared against OBD_BRW_WRITE / OBD_BRW_READ to select
      *            write or read handling
      *
      * On success the pages are linked on the request's aa_oaps list, the
      * extents are moved from @ext_list to aa_exts, the request is queued
      * with ptlrpcd_add_req() and 0 is returned.
      *
      * On failure (-ENOMEM or the error from osc_brw_prep_request()) the
      * obdo and page array are freed, every extent still on @ext_list is
      * completed with osc_extent_finish(env, ext, 0, rc), and the error is
      * returned.
2460  */
2461 int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
2462                   struct list_head *ext_list, int cmd)
2463 {
2464         struct ptlrpc_request           *req = NULL;
2465         struct osc_extent               *ext;
2466         struct brw_page                 **pga = NULL;
2467         struct osc_brw_async_args       *aa = NULL;
2468         struct obdo                     *oa = NULL;
2469         struct osc_async_page           *oap;
2470         struct osc_object               *obj = NULL;
2471         struct cl_req_attr              *crattr = NULL;
2472         loff_t                          starting_offset = OBD_OBJECT_EOF;
2473         loff_t                          ending_offset = 0;
2474         /* '1' for consistency with code that checks !mpflag to restore */
2475         int mpflag = 1;
2476         int                             mem_tight = 0;
2477         int                             page_count = 0;
2478         bool                            soft_sync = false;
2479         bool                            ndelay = false;
2480         int                             i;
2481         int                             grant = 0;
2482         int                             rc;
2483         __u32                           layout_version = 0;
2484         LIST_HEAD(rpc_list);
2485         struct ost_body                 *body;
2486         ENTRY;
2487         LASSERT(!list_empty(ext_list));
2488
2489         /* add pages into rpc_list to build BRW rpc */
             /* first pass: only accumulate totals over the extents (page
              * count, grants, highest layout version, memalloc hint) and
              * remember the object of the first extent */
2490         list_for_each_entry(ext, ext_list, oe_link) {
2491                 LASSERT(ext->oe_state == OES_RPC);
2492                 mem_tight |= ext->oe_memalloc;
2493                 grant += ext->oe_grants;
2494                 page_count += ext->oe_nr_pages;
2495                 layout_version = max(layout_version, ext->oe_layout_version);
2496                 if (obj == NULL)
2497                         obj = ext->oe_obj;
2498         }
2499
2500         soft_sync = osc_over_unstable_soft_limit(cli);
             /* the saved flag is handed back to memalloc_noreclaim_restore()
              * at the out: label */
2501         if (mem_tight)
2502                 mpflag = memalloc_noreclaim_save();
2503
2504         OBD_ALLOC_PTR_ARRAY_LARGE(pga, page_count);
2505         if (pga == NULL)
2506                 GOTO(out, rc = -ENOMEM);
2507
2508         OBD_SLAB_ALLOC_PTR_GFP(oa, osc_obdo_kmem, GFP_NOFS);
2509         if (oa == NULL)
2510                 GOTO(out, rc = -ENOMEM);
2511
             /* second pass: record each page in pga[] and on rpc_list, and
              * track the byte range [starting_offset, ending_offset) */
2512         i = 0;
2513         list_for_each_entry(ext, ext_list, oe_link) {
2514                 list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
2515                         if (mem_tight)
2516                                 oap->oap_brw_flags |= OBD_BRW_MEMALLOC;
2517                         if (soft_sync)
2518                                 oap->oap_brw_flags |= OBD_BRW_SOFT_SYNC;
2519                         pga[i] = &oap->oap_brw_page;
2520                         pga[i]->off = oap->oap_obj_off + oap->oap_page_off;
2521                         i++;
2522
2523                         list_add_tail(&oap->oap_rpc_item, &rpc_list);
                             /* a page that does not lower starting_offset
                              * must start at offset 0 within the page */
2524                         if (starting_offset == OBD_OBJECT_EOF ||
2525                             starting_offset > oap->oap_obj_off)
2526                                 starting_offset = oap->oap_obj_off;
2527                         else
2528                                 LASSERT(oap->oap_page_off == 0);
                             /* a page that does not raise ending_offset
                              * must end exactly on the page boundary */
2529                         if (ending_offset < oap->oap_obj_off + oap->oap_count)
2530                                 ending_offset = oap->oap_obj_off +
2531                                                 oap->oap_count;
2532                         else
2533                                 LASSERT(oap->oap_page_off + oap->oap_count ==
2534                                         PAGE_SIZE);
2535                 }
2536                 if (ext->oe_ndelay)
2537                         ndelay = true;
2538         }
2539
2540         /* first page in the list */
2541         oap = list_entry(rpc_list.next, typeof(*oap), oap_rpc_item);
2542
             /* fill the locally allocated obdo with all attributes
              * (cra_flags = ~0) before the request is prepared */
2543         crattr = &osc_env_info(env)->oti_req_attr;
2544         memset(crattr, 0, sizeof(*crattr));
2545         crattr->cra_type = (cmd & OBD_BRW_WRITE) ? CRT_WRITE : CRT_READ;
2546         crattr->cra_flags = ~0ULL;
2547         crattr->cra_page = oap2cl_page(oap);
2548         crattr->cra_oa = oa;
2549         cl_req_attr_set(env, osc2cl(obj), crattr);
2550
2551         if (cmd == OBD_BRW_WRITE) {
2552                 oa->o_grant_used = grant;
2553                 if (layout_version > 0) {
2554                         CDEBUG(D_LAYOUT, DFID": write with layout version %u\n",
2555                                PFID(&oa->o_oi.oi_fid), layout_version);
2556
2557                         oa->o_layout_version = layout_version;
2558                         oa->o_valid |= OBD_MD_LAYOUT_VERSION;
2559                 }
2560         }
2561
             /* pga[] was filled in extent order; sort it by offset */
2562         sort_brw_pages(pga, page_count);
2563         rc = osc_brw_prep_request(cmd, cli, oa, page_count, pga, &req, 0);
2564         if (rc != 0) {
2565                 CERROR("prep_req failed: %d\n", rc);
2566                 GOTO(out, rc);
2567         }
2568
2569         req->rq_commit_cb = brw_commit;
2570         req->rq_interpret_reply = brw_interpret;
2571         req->rq_memalloc = mem_tight != 0;
             /* oap is still the first page of rpc_list: only that page
              * takes a reference on the request here */
2572         oap->oap_request = ptlrpc_request_addref(req);
2573         if (ndelay) {
2574                 req->rq_no_resend = req->rq_no_delay = 1;
2575                 /* probably set a shorter timeout value.
2576                  * to handle ETIMEDOUT in brw_interpret() correctly. */
2577                 /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */
2578         }
2579
2580         /* Need to update the timestamps after the request is built in case
2581          * we race with setattr (locally or in queue at OST).  If OST gets
2582          * later setattr before earlier BRW (as determined by the request xid),
2583          * the OST will not use BRW timestamps.  Sadly, there is no obvious
2584          * way to do this in a single call.  bug 10150 */
2585         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
2586         crattr->cra_oa = &body->oa;
2587         crattr->cra_flags = OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLATIME;
2588         cl_req_attr_set(env, osc2cl(obj), crattr);
2589         lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid);
2590
             /* hand the page and extent lists over to the request's async
              * args; both rpc_list and ext_list are left empty */
2591         aa = ptlrpc_req_async_args(aa, req);
2592         INIT_LIST_HEAD(&aa->aa_oaps);
2593         list_splice_init(&rpc_list, &aa->aa_oaps);
2594         INIT_LIST_HEAD(&aa->aa_exts);
2595         list_splice_init(ext_list, &aa->aa_exts);
2596
2597         spin_lock(&cli->cl_loi_list_lock);
             /* from here on starting_offset is a page index (histograms) */
2598         starting_offset >>= PAGE_SHIFT;
2599         if (cmd == OBD_BRW_READ) {
2600                 cli->cl_r_in_flight++;
2601                 lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count);
2602                 lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight);
2603                 lprocfs_oh_tally_log2(&cli->cl_read_offset_hist,
2604                                       starting_offset + 1);
2605         } else {
2606                 cli->cl_w_in_flight++;
2607                 lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count);
2608                 lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight);
2609                 lprocfs_oh_tally_log2(&cli->cl_write_offset_hist,
2610                                       starting_offset + 1);
2611         }
2612         spin_unlock(&cli->cl_loi_list_lock);
2613
2614         DEBUG_REQ(D_INODE, req, "%d pages, aa %p, now %ur/%uw in flight",
2615                   page_count, aa, cli->cl_r_in_flight,
2616                   cli->cl_w_in_flight);
2617         OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_IO, cfs_fail_val);
2618
2619         ptlrpcd_add_req(req);
2620         rc = 0;
2621         EXIT;
2622
             /* the success path falls through to out: as well, with rc == 0 */
2623 out:
2624         if (mem_tight)
2625                 memalloc_noreclaim_restore(mpflag);
2626
2627         if (rc != 0) {
                     /* every GOTO(out, ...) above is taken while req is
                      * still NULL */
2628                 LASSERT(req == NULL);
2629
2630                 if (oa)
2631                         OBD_SLAB_FREE_PTR(oa, osc_obdo_kmem);
2632                 if (pga) {
2633                         osc_release_bounce_pages(pga, page_count);
2634                         osc_release_ppga(pga, page_count);
2635                 }
2636                 /* this should happen rarely and is pretty bad, it makes the
2637                  * pending list not follow the dirty order */
2638                 while (!list_empty(ext_list)) {
2639                         ext = list_entry(ext_list->next, struct osc_extent,
2640                                          oe_link);
2641                         list_del_init(&ext->oe_link);
2642                         osc_extent_finish(env, ext, 0, rc);
2643                 }
2644         }
2645         RETURN(rc);
2646 }
2647
2648 static int osc_set_lock_data(struct ldlm_lock *lock, void *data)
2649 {
2650         int set = 0;
2651
2652         LASSERT(lock != NULL);
2653
2654         lock_res_and_lock(lock);
2655
2656         if (lock->l_ast_data == NULL)
2657                 lock->l_ast_data = data;
2658         if (lock->l_ast_data == data)
2659                 set = 1;
2660
2661         unlock_res_and_lock(lock);
2662
2663         return set;
2664 }
2665
2666 int osc_enqueue_fini(struct ptlrpc_request *req, osc_enqueue_upcall_f upcall,
2667                      void *cookie, struct lustre_handle *lockh,
2668                      enum ldlm_mode mode, __u64 *flags, bool speculative,
2669                      int errcode)
2670 {
2671         bool intent = *flags & LDLM_FL_HAS_INTENT;
2672         int rc;
2673         ENTRY;
2674
2675         /* The request was created before ldlm_cli_enqueue call. */
2676         if (intent && errcode == ELDLM_LOCK_ABORTED) {
2677                 struct ldlm_reply *rep;
2678
2679                 rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
2680                 LASSERT(rep != NULL);
2681
2682                 rep->lock_policy_res1 =
2683                         ptlrpc_status_ntoh(rep->lock_policy_res1);
2684                 if (rep->lock_policy_res1)
2685                         errcode = rep->lock_policy_res1;
2686                 if (!speculative)
2687                         *flags |= LDLM_FL_LVB_READY;
2688         } else if (errcode == ELDLM_OK) {
2689                 *flags |= LDLM_FL_LVB_READY;
2690         }
2691
2692         /* Call the update callback. */
2693         rc = (*upcall)(cookie, lockh, errcode);
2694
2695         /* release the reference taken in ldlm_cli_enqueue() */
2696         if (errcode == ELDLM_LOCK_MATCHED)
2697                 errcode = ELDLM_OK;
2698         if (errcode == ELDLM_OK && lustre_handle_is_used(lockh))
2699                 ldlm_lock_decref(lockh, mode);
2700
2701         RETURN(rc);
2702 }
2703
2704 int osc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req,
2705                           void *args, int rc)
2706 {
2707         struct osc_enqueue_args *aa = args;
2708         struct ldlm_lock *lock;
2709         struct lustre_handle *lockh = &aa->oa_lockh;
2710         enum ldlm_mode mode = aa->oa_mode;
2711         struct ost_lvb *lvb = aa->oa_lvb;
2712         __u32 lvb_len = sizeof(*lvb);
2713         __u64 flags = 0;
2714         struct ldlm_enqueue_info einfo = {
2715                 .ei_type = aa->oa_type,
2716                 .ei_mode = mode,
2717         };
2718
2719         ENTRY;
2720
2721         /* ldlm_cli_enqueue is holding a reference on the lock, so it must
2722          * be valid. */
2723         lock = ldlm_handle2lock(lockh);
2724         LASSERTF(lock != NULL,
2725                  "lockh %#llx, req %p, aa %p - client evicted?\n",
2726                  lockh->cookie, req, aa);
2727
2728         /* Take an additional reference so that a blocking AST that
2729          * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed
2730          * to arrive after an upcall has been executed by
2731          * osc_enqueue_fini(). */
2732         ldlm_lock_addref(lockh, mode);
2733
2734         /* Let cl_lock_state_wait fail with -ERESTARTSYS to unuse sublocks. */
2735         OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_HANG, 2);
2736
2737         /* Let CP AST to grant the lock first. */
2738         OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1);
2739
2740         if (aa->oa_speculative) {
2741                 LASSERT(aa->oa_lvb == NULL);
2742                 LASSERT(aa->oa_flags == NULL);
2743                 aa->oa_flags = &flags;
2744         }
2745
2746         /* Complete obtaining the lock procedure. */
2747         rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, &einfo, 1, aa->oa_flags,
2748                                    lvb, lvb_len, lockh, rc);
2749         /* Complete osc stuff. */
2750         rc = osc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode,
2751                               aa->oa_flags, aa->oa_speculative, rc);
2752
2753         OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10);
2754
2755         ldlm_lock_decref(lockh, mode);
2756         LDLM_LOCK_PUT(lock);
2757         RETURN(rc);
2758 }
2759
2760 /* When enqueuing asynchronously, locks are not ordered, we can obtain a lock
2761  * from the 2nd OSC before a lock from the 1st one. This does not deadlock with
2762  * other synchronous requests, however keeping some locks and trying to obtain
2763  * others may take a considerable amount of time in a case of ost failure; and
2764  * when other sync requests do not get released lock from a client, the client
2765  * is evicted from the cluster -- such scenarios make the life difficult, so
2766  * release locks just after they are obtained.
      *
      * The function first tries to match an existing lock covering @policy
      * (rounded out to page boundaries) and only enqueues a new one if no
      * usable match is found.
      *
      * Return values visible in this function:
      *  - ELDLM_OK when a lock matched and either LDLM_FL_TEST_LOCK is set or
      *    @upcall was called with ELDLM_LOCK_MATCHED;
      *  - -EEXIST / -ECANCELED for a speculative request that matched a lock
      *    with an equal / different extent;
      *  - -ENOLCK when nothing usable matched and LDLM_FL_TEST_LOCK or
      *    LDLM_FL_MATCH_LOCK is set;
      *  - with @async set, the result of ldlm_cli_enqueue(); on 0 the request
      *    is added to @rqset and completed by osc_enqueue_interpret();
      *  - otherwise the result of osc_enqueue_fini(). */
2767 int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id,
2768                      __u64 *flags, union ldlm_policy_data *policy,
2769                      struct ost_lvb *lvb, osc_enqueue_upcall_f upcall,
2770                      void *cookie, struct ldlm_enqueue_info *einfo,
2771                      struct ptlrpc_request_set *rqset, int async,
2772                      bool speculative)
2773 {
2774         struct obd_device *obd = exp->exp_obd;
2775         struct lustre_handle lockh = { 0 };
2776         struct ptlrpc_request *req = NULL;
2777         int intent = *flags & LDLM_FL_HAS_INTENT;
2778         __u64 match_flags = *flags;
2779         enum ldlm_mode mode;
2780         int rc;
2781         ENTRY;
2782
2783         /* Filesystem lock extents are extended to page boundaries so that
2784          * dealing with the page cache is a little smoother.  */
2785         policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK;
2786         policy->l_extent.end |= ~PAGE_MASK;
2787
2788         /* Next, search for already existing extent locks that will cover us */
2789         /* If we're trying to read, we also search for an existing PW lock.  The
2790          * VFS and page cache already protect us locally, so lots of readers/
2791          * writers can share a single PW lock.
2792          *
2793          * There are problems with conversion deadlocks, so instead of
2794          * converting a read lock to a write lock, we'll just enqueue a new
2795          * one.
2796          *
2797          * At some point we should cancel the read lock instead of making them
2798          * send us a blocking callback, but there are problems with canceling
2799          * locks out from other users right now, too. */
2800         mode = einfo->ei_mode;
2801         if (einfo->ei_mode == LCK_PR)
2802                 mode |= LCK_PW;
2803         /* Normal lock requests must wait for the LVB to be ready before
2804          * matching a lock; speculative lock requests do not need to,
2805          * because they will not actually use the lock. */
2806         if (!speculative)
2807                 match_flags |= LDLM_FL_LVB_READY;
2808         if (intent != 0)
2809                 match_flags |= LDLM_FL_BLOCK_GRANTED;
             /* from here on, mode is the mode of the matched lock, or 0 if
              * nothing matched */
2810         mode = ldlm_lock_match(obd->obd_namespace, match_flags, res_id,
2811                                einfo->ei_type, policy, mode, &lockh);
2812         if (mode) {
2813                 struct ldlm_lock *matched;
2814
2815                 if (*flags & LDLM_FL_TEST_LOCK)
2816                         RETURN(ELDLM_OK);
2817
2818                 matched = ldlm_handle2lock(&lockh);
2819                 if (speculative) {
2820                         /* This DLM lock request is speculative, and does not
2821                          * have an associated IO request. Therefore if there
2822                          * is already a DLM lock, it will just inform the
2823                          * caller to cancel the request for this stripe.*/
2824                         lock_res_and_lock(matched);
2825                         if (ldlm_extent_equal(&policy->l_extent,
2826                             &matched->l_policy_data.l_extent))
2827                                 rc = -EEXIST;
2828                         else
2829                                 rc = -ECANCELED;
2830                         unlock_res_and_lock(matched);
2831
2832                         ldlm_lock_decref(&lockh, mode);
2833                         LDLM_LOCK_PUT(matched);
2834                         RETURN(rc);
2835                 } else if (osc_set_lock_data(matched, einfo->ei_cbdata)) {
2836                         *flags |= LDLM_FL_LVB_READY;
2837
2838                         /* We already have a lock, and it's referenced. */
2839                         (*upcall)(cookie, &lockh, ELDLM_LOCK_MATCHED);
2840
2841                         ldlm_lock_decref(&lockh, mode);
2842                         LDLM_LOCK_PUT(matched);
2843                         RETURN(ELDLM_OK);
2844                 } else {
                             /* the matched lock already carries different
                              * l_ast_data: drop it and enqueue a new lock */
2845                         ldlm_lock_decref(&lockh, mode);
2846                         LDLM_LOCK_PUT(matched);
2847                 }
2848         }
2849
2850         if (*flags & (LDLM_FL_TEST_LOCK | LDLM_FL_MATCH_LOCK))
2851                 RETURN(-ENOLCK);
2852
2853         /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */
2854         *flags &= ~LDLM_FL_BLOCK_GRANTED;
2855
2856         rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb,
2857                               sizeof(*lvb), LVB_T_OST, &lockh, async);
2858         if (async) {
2859                 if (!rc) {
                             /* stash everything osc_enqueue_interpret()
                              * needs in the request's async args */
2860                         struct osc_enqueue_args *aa;
2861                         aa = ptlrpc_req_async_args(aa, req);
2862                         aa->oa_exp         = exp;
2863                         aa->oa_mode        = einfo->ei_mode;
2864                         aa->oa_type        = einfo->ei_type;
2865                         lustre_handle_copy(&aa->oa_lockh, &lockh);
2866                         aa->oa_upcall      = upcall;
2867                         aa->oa_cookie      = cookie;
2868                         aa->oa_speculative = speculative;
2869                         if (!speculative) {
2870                                 aa->oa_flags  = flags;
2871                                 aa->oa_lvb    = lvb;
2872                         } else {
2873                                 /* speculative locks are essentially to enqueue
2874                                  * a DLM lock  in advance, so we don't care
2875                                  * about the result of the enqueue. */
2876                                 aa->oa_lvb    = NULL;
2877                                 aa->oa_flags  = NULL;
2878                         }
2879
2880                         req->rq_interpret_reply = osc_enqueue_interpret;
2881                         ptlrpc_set_add_req(rqset, req);
2882                 }
2883                 RETURN(rc);
2884         }
2885
             /* synchronous enqueue: finish it and run the upcall right here */
2886         rc = osc_enqueue_fini(req, upcall, cookie, &lockh, einfo->ei_mode,
2887                               flags, speculative, rc);
2888
2889         RETURN(rc);
2890 }
2891
2892 int osc_match_base(const struct lu_env *env, struct obd_export *exp,
2893                    struct ldlm_res_id *res_id, enum ldlm_type type,
2894                    union ldlm_policy_data *policy, enum ldlm_mode mode,
2895                    __u64 *flags, struct osc_object *obj,
2896                    struct lustre_handle *lockh, enum ldlm_match_flags match_flags)
2897 {
2898         struct obd_device *obd = exp->exp_obd;
2899         __u64 lflags = *flags;
2900         enum ldlm_mode rc;
2901         ENTRY;
2902
2903         if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH))
2904                 RETURN(-EIO);
2905
2906         /* Filesystem lock extents are extended to page boundaries so that
2907          * dealing with the page cache is a little smoother */
2908         policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK;
2909         policy->l_extent.end |= ~PAGE_MASK;
2910
2911         /* Next, search for already existing extent locks that will cover us */
2912         rc = ldlm_lock_match_with_skip(obd->obd_namespace, lflags, 0,
2913                                         res_id, type, policy, mode, lockh,
2914                                         match_flags);
2915         if (rc == 0 || lflags & LDLM_FL_TEST_LOCK)
2916                 RETURN(rc);
2917
2918         if (obj != NULL) {
2919                 struct ldlm_lock *lock = ldlm_handle2lock(lockh);
2920
2921                 LASSERT(lock != NULL);
2922                 if (osc_set_lock_data(lock, obj)) {
2923                         lock_res_and_lock(lock);
2924                         if (!ldlm_is_lvb_cached(lock)) {
2925                                 LASSERT(lock->l_ast_data == obj);
2926                                 osc_lock_lvb_update(env, obj, lock, NULL);
2927                                 ldlm_set_lvb_cached(lock);
2928                         }
2929                         unlock_res_and_lock(lock);
2930                 } else {
2931                         ldlm_lock_decref(lockh, rc);
2932                         rc = 0;
2933                 }
2934                 LDLM_LOCK_PUT(lock);
2935         }
2936         RETURN(rc);
2937 }
2938
2939 static int osc_statfs_interpret(const struct lu_env *env,
2940                                 struct ptlrpc_request *req, void *args, int rc)
2941 {
2942         struct osc_async_args *aa = args;
2943         struct obd_statfs *msfs;
2944
2945         ENTRY;
2946         if (rc == -EBADR)
2947                 /*
2948                  * The request has in fact never been sent due to issues at
2949                  * a higher level (LOV).  Exit immediately since the caller
2950                  * is aware of the problem and takes care of the clean up.
2951                  */
2952                 RETURN(rc);
2953
2954         if ((rc == -ENOTCONN || rc == -EAGAIN) &&
2955             (aa->aa_oi->oi_flags & OBD_STATFS_NODELAY))
2956                 GOTO(out, rc = 0);
2957
2958         if (rc != 0)
2959                 GOTO(out, rc);
2960
2961         msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
2962         if (msfs == NULL)
2963                 GOTO(out, rc = -EPROTO);
2964
2965         *aa->aa_oi->oi_osfs = *msfs;
2966 out:
2967         rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
2968
2969         RETURN(rc);
2970 }
2971
2972 static int osc_statfs_async(struct obd_export *exp,
2973                             struct obd_info *oinfo, time64_t max_age,
2974                             struct ptlrpc_request_set *rqset)
2975 {
2976         struct obd_device     *obd = class_exp2obd(exp);
2977         struct ptlrpc_request *req;
2978         struct osc_async_args *aa;
2979         int rc;
2980         ENTRY;
2981
2982         if (obd->obd_osfs_age >= max_age) {
2983                 CDEBUG(D_SUPER,
2984                        "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n",
2985                        obd->obd_name, &obd->obd_osfs,
2986                        obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks,
2987                        obd->obd_osfs.os_ffree, obd->obd_osfs.os_files);
2988                 spin_lock(&obd->obd_osfs_lock);
2989                 memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs));
2990                 spin_unlock(&obd->obd_osfs_lock);
2991                 oinfo->oi_flags |= OBD_STATFS_FROM_CACHE;
2992                 if (oinfo->oi_cb_up)
2993                         oinfo->oi_cb_up(oinfo, 0);
2994
2995                 RETURN(0);
2996         }
2997
2998         /* We could possibly pass max_age in the request (as an absolute
2999          * timestamp or a "seconds.usec ago") so the target can avoid doing
3000          * extra calls into the filesystem if that isn't necessary (e.g.
3001          * during mount that would help a bit).  Having relative timestamps
3002          * is not so great if request processing is slow, while absolute
3003          * timestamps are not ideal because they need time synchronization. */
3004         req = ptlrpc_request_alloc(obd->u.cli.cl_import, &RQF_OST_STATFS);
3005         if (req == NULL)
3006                 RETURN(-ENOMEM);
3007
3008         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
3009         if (rc) {
3010                 ptlrpc_request_free(req);
3011                 RETURN(rc);
3012         }
3013         ptlrpc_request_set_replen(req);
3014         req->rq_request_portal = OST_CREATE_PORTAL;
3015         ptlrpc_at_set_req_timeout(req);
3016
3017         if (oinfo->oi_flags & OBD_STATFS_NODELAY) {
3018                 /* procfs requests not want stat in wait for avoid deadlock */
3019                 req->rq_no_resend = 1;
3020                 req->rq_no_delay = 1;
3021         }
3022
3023         req->rq_interpret_reply = osc_statfs_interpret;
3024         aa = ptlrpc_req_async_args(aa, req);
3025         aa->aa_oi = oinfo;
3026
3027         ptlrpc_set_add_req(rqset, req);
3028         RETURN(0);
3029 }
3030
3031 static int osc_statfs(const struct lu_env *env, struct obd_export *exp,
3032                       struct obd_statfs *osfs, time64_t max_age, __u32 flags)
3033 {
3034         struct obd_device     *obd = class_exp2obd(exp);
3035         struct obd_statfs     *msfs;
3036         struct ptlrpc_request *req;
3037         struct obd_import     *imp, *imp0;
3038         int rc;
3039         ENTRY;
3040
3041         /*Since the request might also come from lprocfs, so we need
3042          *sync this with client_disconnect_export Bug15684
3043          */
3044         with_imp_locked(obd, imp0, rc)
3045                 imp = class_import_get(imp0);
3046         if (rc)
3047                 RETURN(rc);
3048
3049         /* We could possibly pass max_age in the request (as an absolute
3050          * timestamp or a "seconds.usec ago") so the target can avoid doing
3051          * extra calls into the filesystem if that isn't necessary (e.g.
3052          * during mount that would help a bit).  Having relative timestamps
3053          * is not so great if request processing is slow, while absolute
3054          * timestamps are not ideal because they need time synchronization. */
3055         req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS);
3056
3057         class_import_put(imp);
3058
3059         if (req == NULL)
3060                 RETURN(-ENOMEM);
3061
3062         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
3063         if (rc) {
3064                 ptlrpc_request_free(req);
3065                 RETURN(rc);
3066         }
3067         ptlrpc_request_set_replen(req);
3068         req->rq_request_portal = OST_CREATE_PORTAL;
3069         ptlrpc_at_set_req_timeout(req);
3070
3071         if (flags & OBD_STATFS_NODELAY) {
3072                 /* procfs requests not want stat in wait for avoid deadlock */
3073                 req->rq_no_resend = 1;
3074                 req->rq_no_delay = 1;
3075         }
3076
3077         rc = ptlrpc_queue_wait(req);
3078         if (rc)
3079                 GOTO(out, rc);
3080
3081         msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
3082         if (msfs == NULL)
3083                 GOTO(out, rc = -EPROTO);
3084
3085         *osfs = *msfs;
3086
3087         EXIT;
3088 out:
3089         ptlrpc_req_finished(req);
3090         return rc;
3091 }
3092
3093 static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
3094                          void *karg, void __user *uarg)
3095 {
3096         struct obd_device *obd = exp->exp_obd;
3097         struct obd_ioctl_data *data = karg;
3098         int rc = 0;
3099
3100         ENTRY;
3101         if (!try_module_get(THIS_MODULE)) {
3102                 CERROR("%s: cannot get module '%s'\n", obd->obd_name,
3103                        module_name(THIS_MODULE));
3104                 return -EINVAL;
3105         }
3106         switch (cmd) {
3107         case OBD_IOC_CLIENT_RECOVER:
3108                 rc = ptlrpc_recover_import(obd->u.cli.cl_import,
3109                                            data->ioc_inlbuf1, 0);
3110                 if (rc > 0)
3111                         rc = 0;
3112                 break;
3113         case IOC_OSC_SET_ACTIVE:
3114                 rc = ptlrpc_set_import_active(obd->u.cli.cl_import,
3115                                               data->ioc_offset);
3116                 break;
3117         default:
3118                 rc = -ENOTTY;
3119                 CDEBUG(D_INODE, "%s: unrecognised ioctl %#x by %s: rc = %d\n",
3120                        obd->obd_name, cmd, current->comm, rc);
3121                 break;
3122         }
3123
3124         module_put(THIS_MODULE);
3125         return rc;
3126 }
3127
3128 int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
3129                        u32 keylen, void *key, u32 vallen, void *val,
3130                        struct ptlrpc_request_set *set)
3131 {
3132         struct ptlrpc_request *req;
3133         struct obd_device     *obd = exp->exp_obd;
3134         struct obd_import     *imp = class_exp2cliimp(exp);
3135         char                  *tmp;
3136         int                    rc;
3137         ENTRY;
3138
3139         OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10);
3140
3141         if (KEY_IS(KEY_CHECKSUM)) {
3142                 if (vallen != sizeof(int))
3143                         RETURN(-EINVAL);
3144                 exp->exp_obd->u.cli.cl_checksum = (*(int *)val) ? 1 : 0;
3145                 RETURN(0);
3146         }
3147
3148         if (KEY_IS(KEY_SPTLRPC_CONF)) {
3149                 sptlrpc_conf_client_adapt(obd);
3150                 RETURN(0);
3151         }
3152
3153         if (KEY_IS(KEY_FLUSH_CTX)) {
3154                 sptlrpc_import_flush_my_ctx(imp);
3155                 RETURN(0);
3156         }
3157
3158         if (KEY_IS(KEY_CACHE_LRU_SHRINK)) {
3159                 struct client_obd *cli = &obd->u.cli;
3160                 long nr = atomic_long_read(&cli->cl_lru_in_list) >> 1;
3161                 long target = *(long *)val;
3162
3163                 nr = osc_lru_shrink(env, cli, min(nr, target), true);
3164                 *(long *)val -= nr;
3165                 RETURN(0);
3166         }
3167
3168         if (!set && !KEY_IS(KEY_GRANT_SHRINK))
3169                 RETURN(-EINVAL);
3170
3171         /* We pass all other commands directly to OST. Since nobody calls osc
3172            methods directly and everybody is supposed to go through LOV, we
3173            assume lov checked invalid values for us.
3174            The only recognised values so far are evict_by_nid and mds_conn.
3175            Even if something bad goes through, we'd get a -EINVAL from OST
3176            anyway. */
3177
3178         req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ?
3179                                                 &RQF_OST_SET_GRANT_INFO :
3180                                                 &RQF_OBD_SET_INFO);
3181         if (req == NULL)
3182                 RETURN(-ENOMEM);
3183
3184         req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
3185                              RCL_CLIENT, keylen);
3186         if (!KEY_IS(KEY_GRANT_SHRINK))
3187                 req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL,
3188                                      RCL_CLIENT, vallen);
3189         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO);
3190         if (rc) {
3191                 ptlrpc_request_free(req);
3192                 RETURN(rc);
3193         }
3194
3195         tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
3196         memcpy(tmp, key, keylen);
3197         tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ?
3198                                                         &RMF_OST_BODY :
3199                                                         &RMF_SETINFO_VAL);
3200         memcpy(tmp, val, vallen);
3201
3202         if (KEY_IS(KEY_GRANT_SHRINK)) {
3203                 struct osc_grant_args *aa;
3204                 struct obdo *oa;
3205
3206                 aa = ptlrpc_req_async_args(aa, req);
3207                 OBD_SLAB_ALLOC_PTR_GFP(oa, osc_obdo_kmem, GFP_NOFS);
3208                 if (!oa) {
3209                         ptlrpc_req_finished(req);
3210                         RETURN(-ENOMEM);
3211                 }
3212                 *oa = ((struct ost_body *)val)->oa;
3213                 aa->aa_oa = oa;
3214                 req->rq_interpret_reply = osc_shrink_grant_interpret;
3215         }
3216
3217         ptlrpc_request_set_replen(req);
3218         if (!KEY_IS(KEY_GRANT_SHRINK)) {
3219                 LASSERT(set != NULL);
3220                 ptlrpc_set_add_req(set, req);
3221                 ptlrpc_check_set(NULL, set);
3222         } else {
3223                 ptlrpcd_add_req(req);
3224         }
3225
3226         RETURN(0);
3227 }
3228 EXPORT_SYMBOL(osc_set_info_async);
3229
3230 int osc_reconnect(const struct lu_env *env, struct obd_export *exp,
3231                   struct obd_device *obd, struct obd_uuid *cluuid,
3232                   struct obd_connect_data *data, void *localdata)
3233 {
3234         struct client_obd *cli = &obd->u.cli;
3235
3236         if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
3237                 long lost_grant;
3238                 long grant;
3239
3240                 spin_lock(&cli->cl_loi_list_lock);
3241                 grant = cli->cl_avail_grant + cli->cl_reserved_grant;
3242                 if (data->ocd_connect_flags & OBD_CONNECT_GRANT_PARAM) {
3243                         /* restore ocd_grant_blkbits as client page bits */
3244                         data->ocd_grant_blkbits = PAGE_SHIFT;
3245                         grant += cli->cl_dirty_grant;
3246                 } else {
3247                         grant += cli->cl_dirty_pages << PAGE_SHIFT;
3248                 }
3249                 data->ocd_grant = grant ? : 2 * cli_brw_size(obd);
3250                 lost_grant = cli->cl_lost_grant;
3251                 cli->cl_lost_grant = 0;
3252                 spin_unlock(&cli->cl_loi_list_lock);
3253
3254                 CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d"
3255                        " ocd_grant: %d, lost: %ld.\n", data->ocd_connect_flags,
3256                        data->ocd_version, data->ocd_grant, lost_grant);
3257         }
3258
3259         RETURN(0);
3260 }
3261 EXPORT_SYMBOL(osc_reconnect);
3262
3263 int osc_disconnect(struct obd_export *exp)
3264 {
3265         struct obd_device *obd = class_exp2obd(exp);
3266         int rc;
3267
3268         rc = client_disconnect_export(exp);
3269         /**
3270          * Initially we put del_shrink_grant before disconnect_export, but it
3271          * causes the following problem if setup (connect) and cleanup
3272          * (disconnect) are tangled together.
3273          *      connect p1                     disconnect p2
3274          *   ptlrpc_connect_import
3275          *     ...............               class_manual_cleanup
3276          *                                     osc_disconnect
3277          *                                     del_shrink_grant
3278          *   ptlrpc_connect_interrupt
3279          *     osc_init_grant
3280          *   add this client to shrink list
3281          *                                      cleanup_osc
3282          * Bang! grant shrink thread trigger the shrink. BUG18662
3283          */
3284         osc_del_grant_list(&obd->u.cli);
3285         return rc;
3286 }
3287 EXPORT_SYMBOL(osc_disconnect);
3288
3289 int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd,
3290                                  struct hlist_node *hnode, void *arg)
3291 {
3292         struct lu_env *env = arg;
3293         struct ldlm_resource *res = cfs_hash_object(hs, hnode);
3294         struct ldlm_lock *lock;
3295         struct osc_object *osc = NULL;
3296         ENTRY;
3297
3298         lock_res(res);
3299         list_for_each_entry(lock, &res->lr_granted, l_res_link) {
3300                 if (lock->l_ast_data != NULL && osc == NULL) {
3301                         osc = lock->l_ast_data;
3302                         cl_object_get(osc2cl(osc));
3303                 }
3304
3305                 /* clear LDLM_FL_CLEANED flag to make sure it will be canceled
3306                  * by the 2nd round of ldlm_namespace_clean() call in
3307                  * osc_import_event(). */
3308                 ldlm_clear_cleaned(lock);
3309         }
3310         unlock_res(res);
3311
3312         if (osc != NULL) {
3313                 osc_object_invalidate(env, osc);
3314                 cl_object_put(env, osc2cl(osc));
3315         }
3316
3317         RETURN(0);
3318 }
3319 EXPORT_SYMBOL(osc_ldlm_resource_invalidate);
3320
3321 static int osc_import_event(struct obd_device *obd,
3322                             struct obd_import *imp,
3323                             enum obd_import_event event)
3324 {
3325         struct client_obd *cli;
3326         int rc = 0;
3327
3328         ENTRY;
3329         LASSERT(imp->imp_obd == obd);
3330
3331         switch (event) {
3332         case IMP_EVENT_DISCON: {
3333                 cli = &obd->u.cli;
3334                 spin_lock(&cli->cl_loi_list_lock);
3335                 cli->cl_avail_grant = 0;
3336                 cli->cl_lost_grant = 0;
3337                 spin_unlock(&cli->cl_loi_list_lock);
3338                 break;
3339         }
3340         case IMP_EVENT_INACTIVE: {
3341                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE);
3342                 break;
3343         }
3344         case IMP_EVENT_INVALIDATE: {
3345                 struct ldlm_namespace *ns = obd->obd_namespace;
3346                 struct lu_env         *env;
3347                 __u16                  refcheck;
3348
3349                 ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
3350
3351                 env = cl_env_get(&refcheck);
3352                 if (!IS_ERR(env)) {
3353                         osc_io_unplug(env, &obd->u.cli, NULL);
3354
3355                         cfs_hash_for_each_nolock(ns->ns_rs_hash,
3356                                                  osc_ldlm_resource_invalidate,
3357                                                  env, 0);
3358                         cl_env_put(env, &refcheck);
3359
3360                         ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
3361                 } else
3362                         rc = PTR_ERR(env);
3363                 break;
3364         }
3365         case IMP_EVENT_ACTIVE: {
3366                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE);
3367                 break;
3368         }
3369         case IMP_EVENT_OCD: {
3370                 struct obd_connect_data *ocd = &imp->imp_connect_data;
3371
3372                 if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT)
3373                         osc_init_grant(&obd->u.cli, ocd);
3374
3375                 /* See bug 7198 */
3376                 if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL)
3377                         imp->imp_client->cli_request_portal =OST_REQUEST_PORTAL;
3378
3379                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD);
3380                 break;
3381         }
3382         case IMP_EVENT_DEACTIVATE: {
3383                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DEACTIVATE);
3384                 break;
3385         }
3386         case IMP_EVENT_ACTIVATE: {
3387                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVATE);
3388                 break;
3389         }
3390         default:
3391                 CERROR("Unknown import event %d\n", event);
3392                 LBUG();
3393         }
3394         RETURN(rc);
3395 }
3396
3397 /**
3398  * Determine whether the lock can be canceled before replaying the lock
3399  * during recovery, see bug16774 for detailed information.
3400  *
3401  * \retval zero the lock can't be canceled
3402  * \retval other ok to cancel
3403  */
3404 static int osc_cancel_weight(struct ldlm_lock *lock)
3405 {
3406         /*
3407          * Cancel all unused and granted extent lock.
3408          */
3409         if (lock->l_resource->lr_type == LDLM_EXTENT &&
3410             ldlm_is_granted(lock) &&
3411             osc_ldlm_weigh_ast(lock) == 0)
3412                 RETURN(1);
3413
3414         RETURN(0);
3415 }
3416
3417 static int brw_queue_work(const struct lu_env *env, void *data)
3418 {
3419         struct client_obd *cli = data;
3420
3421         CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli);
3422
3423         osc_io_unplug(env, cli, NULL);
3424         RETURN(0);
3425 }
3426
3427 int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg)
3428 {
3429         struct client_obd *cli = &obd->u.cli;
3430         void *handler;
3431         int rc;
3432
3433         ENTRY;
3434
3435         rc = ptlrpcd_addref();
3436         if (rc)
3437                 RETURN(rc);
3438
3439         rc = client_obd_setup(obd, lcfg);
3440         if (rc)
3441                 GOTO(out_ptlrpcd, rc);
3442
3443
3444         handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli);
3445         if (IS_ERR(handler))
3446                 GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler));
3447         cli->cl_writeback_work = handler;
3448
3449         handler = ptlrpcd_alloc_work(cli->cl_import, lru_queue_work, cli);
3450         if (IS_ERR(handler))
3451                 GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler));
3452         cli->cl_lru_work = handler;
3453
3454         rc = osc_quota_setup(obd);
3455         if (rc)
3456                 GOTO(out_ptlrpcd_work, rc);
3457
3458         cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL;
3459         osc_update_next_shrink(cli);
3460
3461         RETURN(rc);
3462
3463 out_ptlrpcd_work:
3464         if (cli->cl_writeback_work != NULL) {
3465                 ptlrpcd_destroy_work(cli->cl_writeback_work);
3466                 cli->cl_writeback_work = NULL;
3467         }
3468         if (cli->cl_lru_work != NULL) {
3469                 ptlrpcd_destroy_work(cli->cl_lru_work);
3470                 cli->cl_lru_work = NULL;
3471         }
3472         client_obd_cleanup(obd);
3473 out_ptlrpcd:
3474         ptlrpcd_decref();
3475         RETURN(rc);
3476 }
3477 EXPORT_SYMBOL(osc_setup_common);
3478
3479 int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
3480 {
3481         struct client_obd *cli = &obd->u.cli;
3482         int                adding;
3483         int                added;
3484         int                req_count;
3485         int                rc;
3486
3487         ENTRY;
3488
3489         rc = osc_setup_common(obd, lcfg);
3490         if (rc < 0)
3491                 RETURN(rc);
3492
3493         rc = osc_tunables_init(obd);
3494         if (rc)
3495                 RETURN(rc);
3496
3497         /*
3498          * We try to control the total number of requests with a upper limit
3499          * osc_reqpool_maxreqcount. There might be some race which will cause
3500          * over-limit allocation, but it is fine.
3501          */
3502         req_count = atomic_read(&osc_pool_req_count);
3503         if (req_count < osc_reqpool_maxreqcount) {
3504                 adding = cli->cl_max_rpcs_in_flight + 2;
3505                 if (req_count + adding > osc_reqpool_maxreqcount)
3506                         adding = osc_reqpool_maxreqcount - req_count;
3507
3508                 added = ptlrpc_add_rqs_to_pool(osc_rq_pool, adding);
3509                 atomic_add(added, &osc_pool_req_count);
3510         }
3511
3512         ns_register_cancel(obd->obd_namespace, osc_cancel_weight);
3513
3514         spin_lock(&osc_shrink_lock);
3515         list_add_tail(&cli->cl_shrink_list, &osc_shrink_list);
3516         spin_unlock(&osc_shrink_lock);
3517         cli->cl_import->imp_idle_timeout = osc_idle_timeout;
3518         cli->cl_import->imp_idle_debug = D_HA;
3519
3520         RETURN(0);
3521 }
3522
3523 int osc_precleanup_common(struct obd_device *obd)
3524 {
3525         struct client_obd *cli = &obd->u.cli;
3526         ENTRY;
3527
3528         /* LU-464
3529          * for echo client, export may be on zombie list, wait for
3530          * zombie thread to cull it, because cli.cl_import will be
3531          * cleared in client_disconnect_export():
3532          *   class_export_destroy() -> obd_cleanup() ->
3533          *   echo_device_free() -> echo_client_cleanup() ->
3534          *   obd_disconnect() -> osc_disconnect() ->
3535          *   client_disconnect_export()
3536          */
3537         obd_zombie_barrier();
3538         if (cli->cl_writeback_work) {
3539                 ptlrpcd_destroy_work(cli->cl_writeback_work);
3540                 cli->cl_writeback_work = NULL;
3541         }
3542
3543         if (cli->cl_lru_work) {
3544                 ptlrpcd_destroy_work(cli->cl_lru_work);
3545                 cli->cl_lru_work = NULL;
3546         }
3547
3548         obd_cleanup_client_import(obd);
3549         RETURN(0);
3550 }
3551 EXPORT_SYMBOL(osc_precleanup_common);
3552
3553 static int osc_precleanup(struct obd_device *obd)
3554 {
3555         ENTRY;
3556
3557         osc_precleanup_common(obd);
3558
3559         ptlrpc_lprocfs_unregister_obd(obd);
3560         RETURN(0);
3561 }
3562
3563 int osc_cleanup_common(struct obd_device *obd)
3564 {
3565         struct client_obd *cli = &obd->u.cli;
3566         int rc;
3567
3568         ENTRY;
3569
3570         spin_lock(&osc_shrink_lock);
3571         list_del(&cli->cl_shrink_list);
3572         spin_unlock(&osc_shrink_lock);
3573
3574         /* lru cleanup */
3575         if (cli->cl_cache != NULL) {
3576                 LASSERT(atomic_read(&cli->cl_cache->ccc_users) > 0);
3577                 spin_lock(&cli->cl_cache->ccc_lru_lock);
3578                 list_del_init(&cli->cl_lru_osc);
3579                 spin_unlock(&cli->cl_cache->ccc_lru_lock);
3580                 cli->cl_lru_left = NULL;
3581                 cl_cache_decref(cli->cl_cache);
3582                 cli->cl_cache = NULL;
3583         }
3584
3585         /* free memory of osc quota cache */
3586         osc_quota_cleanup(obd);
3587
3588         rc = client_obd_cleanup(obd);
3589
3590         ptlrpcd_decref();
3591         RETURN(rc);
3592 }
3593 EXPORT_SYMBOL(osc_cleanup_common);
3594
3595 static const struct obd_ops osc_obd_ops = {
3596         .o_owner                = THIS_MODULE,
3597         .o_setup                = osc_setup,
3598         .o_precleanup           = osc_precleanup,
3599         .o_cleanup              = osc_cleanup_common,
3600         .o_add_conn             = client_import_add_conn,
3601         .o_del_conn             = client_import_del_conn,
3602         .o_connect              = client_connect_import,
3603         .o_reconnect            = osc_reconnect,
3604         .o_disconnect           = osc_disconnect,
3605         .o_statfs               = osc_statfs,
3606         .o_statfs_async         = osc_statfs_async,
3607         .o_create               = osc_create,
3608         .o_destroy              = osc_destroy,
3609         .o_getattr              = osc_getattr,
3610         .o_setattr              = osc_setattr,
3611         .o_iocontrol            = osc_iocontrol,
3612         .o_set_info_async       = osc_set_info_async,
3613         .o_import_event         = osc_import_event,
3614         .o_quotactl             = osc_quotactl,
3615 };
3616
/* protects osc_shrink_list */
DEFINE_SPINLOCK(osc_shrink_lock);
/* client obds linked through cl_shrink_list, see osc_setup() */
LIST_HEAD(osc_shrink_list);
3619
3620 #ifdef HAVE_SHRINKER_COUNT
3621 static struct shrinker osc_cache_shrinker = {
3622         .count_objects  = osc_cache_shrink_count,
3623         .scan_objects   = osc_cache_shrink_scan,
3624         .seeks          = DEFAULT_SEEKS,
3625 };
3626 #else
3627 static int osc_cache_shrink(struct shrinker *shrinker,
3628                             struct shrink_control *sc)
3629 {
3630         (void)osc_cache_shrink_scan(shrinker, sc);
3631
3632         return osc_cache_shrink_count(shrinker, sc);
3633 }
3634
3635 static struct shrinker osc_cache_shrinker = {
3636         .shrink   = osc_cache_shrink,
3637         .seeks    = DEFAULT_SEEKS,
3638 };
3639 #endif
3640
3641 static int __init osc_init(void)
3642 {
3643         unsigned int reqpool_size;
3644         unsigned int reqsize;
3645         int rc;
3646         ENTRY;
3647
3648         /* print an address of _any_ initialized kernel symbol from this
3649          * module, to allow debugging with gdb that doesn't support data
3650          * symbols from modules.*/
3651         CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches);
3652
3653         rc = lu_kmem_init(osc_caches);
3654         if (rc)
3655                 RETURN(rc);
3656
3657         rc = class_register_type(&osc_obd_ops, NULL, true,
3658                                  LUSTRE_OSC_NAME, &osc_device_type);
3659         if (rc)
3660                 GOTO(out_kmem, rc);
3661
3662         rc = register_shrinker(&osc_cache_shrinker);
3663         if (rc)
3664                 GOTO(out_type, rc);
3665
3666         /* This is obviously too much memory, only prevent overflow here */
3667         if (osc_reqpool_mem_max >= 1 << 12 || osc_reqpool_mem_max == 0)
3668                 GOTO(out_shrinker, rc = -EINVAL);
3669
3670         reqpool_size = osc_reqpool_mem_max << 20;
3671
3672         reqsize = 1;
3673         while (reqsize < OST_IO_MAXREQSIZE)
3674                 reqsize = reqsize << 1;
3675
3676         /*
3677          * We don't enlarge the request count in OSC pool according to
3678          * cl_max_rpcs_in_flight. The allocation from the pool will only be
3679          * tried after normal allocation failed. So a small OSC pool won't
3680          * cause much performance degression in most of cases.
3681          */
3682         osc_reqpool_maxreqcount = reqpool_size / reqsize;
3683
3684         atomic_set(&osc_pool_req_count, 0);
3685         osc_rq_pool = ptlrpc_init_rq_pool(0, OST_IO_MAXREQSIZE,
3686                                           ptlrpc_add_rqs_to_pool);
3687
3688         if (osc_rq_pool == NULL)
3689                 GOTO(out_shrinker, rc = -ENOMEM);
3690
3691         rc = osc_start_grant_work();
3692         if (rc != 0)
3693                 GOTO(out_req_pool, rc);
3694
3695         RETURN(rc);
3696
3697 out_req_pool:
3698         ptlrpc_free_rq_pool(osc_rq_pool);
3699 out_shrinker:
3700         unregister_shrinker(&osc_cache_shrinker);
3701 out_type:
3702         class_unregister_type(LUSTRE_OSC_NAME);
3703 out_kmem:
3704         lu_kmem_fini(osc_caches);
3705
3706         RETURN(rc);
3707 }
3708
3709 static void __exit osc_exit(void)
3710 {
3711         osc_stop_grant_work();
3712         unregister_shrinker(&osc_cache_shrinker);
3713         class_unregister_type(LUSTRE_OSC_NAME);
3714         lu_kmem_fini(osc_caches);
3715         ptlrpc_free_rq_pool(osc_rq_pool);
3716 }
3717
3718 MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>");
3719 MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)");
3720 MODULE_VERSION(LUSTRE_VERSION_STRING);
3721 MODULE_LICENSE("GPL");
3722
3723 module_init(osc_init);
3724 module_exit(osc_exit);