Whamcloud - gitweb
LU-1808 build: Lustre build does not support FC15.
[fs/lustre-release.git] / lustre / mdc / mdc_locks.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  *
30  * Copyright (c) 2011, 2012, Whamcloud, Inc.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  */
36
37 #define DEBUG_SUBSYSTEM S_MDC
38
39 #ifdef __KERNEL__
40 # include <linux/module.h>
41 # include <linux/pagemap.h>
42 # include <linux/miscdevice.h>
43 # include <linux/init.h>
44 #else
45 # include <liblustre.h>
46 #endif
47
48 #include <lustre_acl.h>
49 #include <obd_class.h>
50 #include <lustre_dlm.h>
51 /* fid_res_name_eq() */
52 #include <lustre_fid.h>
53 #include <lprocfs_status.h>
54 #include "mdc_internal.h"
55
/*
 * Bundle of three pointers carried together as one argument block.
 * NOTE(review): the per-field roles below are read off the field types and
 * names only; confirm against the code that fills this structure.
 */
struct mdc_getattr_args {
        struct obd_export        *ga_exp;   /* export (presumably the MDC one) */
        struct md_enqueue_info   *ga_minfo; /* metadata enqueue info */
        struct ldlm_enqueue_info *ga_einfo; /* DLM enqueue info */
};
61
62 int it_disposition(struct lookup_intent *it, int flag)
63 {
64         return it->d.lustre.it_disposition & flag;
65 }
66 EXPORT_SYMBOL(it_disposition);
67
68 void it_set_disposition(struct lookup_intent *it, int flag)
69 {
70         it->d.lustre.it_disposition |= flag;
71 }
72 EXPORT_SYMBOL(it_set_disposition);
73
74 void it_clear_disposition(struct lookup_intent *it, int flag)
75 {
76         it->d.lustre.it_disposition &= ~flag;
77 }
78 EXPORT_SYMBOL(it_clear_disposition);
79
80 int it_open_error(int phase, struct lookup_intent *it)
81 {
82         if (it_disposition(it, DISP_OPEN_OPEN)) {
83                 if (phase >= DISP_OPEN_OPEN)
84                         return it->d.lustre.it_status;
85                 else
86                         return 0;
87         }
88
89         if (it_disposition(it, DISP_OPEN_CREATE)) {
90                 if (phase >= DISP_OPEN_CREATE)
91                         return it->d.lustre.it_status;
92                 else
93                         return 0;
94         }
95
96         if (it_disposition(it, DISP_LOOKUP_EXECD)) {
97                 if (phase >= DISP_LOOKUP_EXECD)
98                         return it->d.lustre.it_status;
99                 else
100                         return 0;
101         }
102
103         if (it_disposition(it, DISP_IT_EXECD)) {
104                 if (phase >= DISP_IT_EXECD)
105                         return it->d.lustre.it_status;
106                 else
107                         return 0;
108         }
109         CERROR("it disp: %X, status: %d\n", it->d.lustre.it_disposition,
110                it->d.lustre.it_status);
111         LBUG();
112         return 0;
113 }
114 EXPORT_SYMBOL(it_open_error);
115
116 /* this must be called on a lockh that is known to have a referenced lock */
117 int mdc_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data,
118                       __u64 *bits)
119 {
120         struct ldlm_lock *lock;
121         ENTRY;
122
123         if(bits)
124                 *bits = 0;
125
126         if (!*lockh)
127                 RETURN(0);
128
129         lock = ldlm_handle2lock((struct lustre_handle *)lockh);
130
131         LASSERT(lock != NULL);
132         lock_res_and_lock(lock);
133 #ifdef __KERNEL__
134         if (lock->l_ast_data && lock->l_ast_data != data) {
135                 struct inode *new_inode = data;
136                 struct inode *old_inode = lock->l_ast_data;
137                 LASSERTF(old_inode->i_state & I_FREEING,
138                          "Found existing inode %p/%lu/%u state %lu in lock: "
139                          "setting data to %p/%lu/%u\n", old_inode,
140                          old_inode->i_ino, old_inode->i_generation,
141                          old_inode->i_state,
142                          new_inode, new_inode->i_ino, new_inode->i_generation);
143         }
144 #endif
145         lock->l_ast_data = data;
146         if (bits)
147                 *bits = lock->l_policy_data.l_inodebits.bits;
148
149         unlock_res_and_lock(lock);
150         LDLM_LOCK_PUT(lock);
151
152         RETURN(0);
153 }
154
155 ldlm_mode_t mdc_lock_match(struct obd_export *exp, int flags,
156                            const struct lu_fid *fid, ldlm_type_t type,
157                            ldlm_policy_data_t *policy, ldlm_mode_t mode,
158                            struct lustre_handle *lockh)
159 {
160         struct ldlm_res_id res_id;
161         ldlm_mode_t rc;
162         ENTRY;
163
164         fid_build_reg_res_name(fid, &res_id);
165         rc = ldlm_lock_match(class_exp2obd(exp)->obd_namespace, flags,
166                              &res_id, type, policy, mode, lockh, 0);
167         RETURN(rc);
168 }
169
170 int mdc_cancel_unused(struct obd_export *exp,
171                       const struct lu_fid *fid,
172                       ldlm_policy_data_t *policy,
173                       ldlm_mode_t mode,
174                       ldlm_cancel_flags_t flags,
175                       void *opaque)
176 {
177         struct ldlm_res_id res_id;
178         struct obd_device *obd = class_exp2obd(exp);
179         int rc;
180
181         ENTRY;
182
183         fid_build_reg_res_name(fid, &res_id);
184         rc = ldlm_cli_cancel_unused_resource(obd->obd_namespace, &res_id,
185                                              policy, mode, flags, opaque);
186         RETURN(rc);
187 }
188
189 int mdc_change_cbdata(struct obd_export *exp,
190                       const struct lu_fid *fid,
191                       ldlm_iterator_t it, void *data)
192 {
193         struct ldlm_res_id res_id;
194         ENTRY;
195
196         fid_build_reg_res_name(fid, &res_id);
197         ldlm_resource_iterate(class_exp2obd(exp)->obd_namespace,
198                               &res_id, it, data);
199
200         EXIT;
201         return 0;
202 }
203
204 /* find any ldlm lock of the inode in mdc
205  * return 0    not find
206  *        1    find one
207  *      < 0    error */
208 int mdc_find_cbdata(struct obd_export *exp,
209                     const struct lu_fid *fid,
210                     ldlm_iterator_t it, void *data)
211 {
212         struct ldlm_res_id res_id;
213         int rc = 0;
214         ENTRY;
215
216         fid_build_reg_res_name((struct lu_fid*)fid, &res_id);
217         rc = ldlm_resource_iterate(class_exp2obd(exp)->obd_namespace, &res_id,
218                                    it, data);
219         if (rc == LDLM_ITER_STOP)
220                 RETURN(1);
221         else if (rc == LDLM_ITER_CONTINUE)
222                 RETURN(0);
223         RETURN(rc);
224 }
225
226 static inline void mdc_clear_replay_flag(struct ptlrpc_request *req, int rc)
227 {
228         /* Don't hold error requests for replay. */
229         if (req->rq_replay) {
230                 cfs_spin_lock(&req->rq_lock);
231                 req->rq_replay = 0;
232                 cfs_spin_unlock(&req->rq_lock);
233         }
234         if (rc && req->rq_transno != 0) {
235                 DEBUG_REQ(D_ERROR, req, "transno returned on error rc %d", rc);
236                 LBUG();
237         }
238 }
239
240 /* Save a large LOV EA into the request buffer so that it is available
241  * for replay.  We don't do this in the initial request because the
242  * original request doesn't need this buffer (at most it sends just the
243  * lov_mds_md) and it is a waste of RAM/bandwidth to send the empty
244  * buffer and may also be difficult to allocate and save a very large
245  * request buffer for each open. (bug 5707)
246  *
247  * OOM here may cause recovery failure if lmm is needed (only for the
248  * original open if the MDS crashed just when this client also OOM'd)
249  * but this is incredibly unlikely, and questionable whether the client
250  * could do MDS recovery under OOM anyways... */
251 static void mdc_realloc_openmsg(struct ptlrpc_request *req,
252                                 struct mdt_body *body)
253 {
254         int     rc;
255
256         /* FIXME: remove this explicit offset. */
257         rc = sptlrpc_cli_enlarge_reqbuf(req, DLM_INTENT_REC_OFF + 4,
258                                         body->eadatasize);
259         if (rc) {
260                 CERROR("Can't enlarge segment %d size to %d\n",
261                        DLM_INTENT_REC_OFF + 4, body->eadatasize);
262                 body->valid &= ~OBD_MD_FLEASIZE;
263                 body->eadatasize = 0;
264         }
265 }
266
267 static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp,
268                                                    struct lookup_intent *it,
269                                                    struct md_op_data *op_data,
270                                                    void *lmm, int lmmsize,
271                                                    void *cb_data)
272 {
273         struct ptlrpc_request *req;
274         struct obd_device     *obddev = class_exp2obd(exp);
275         struct ldlm_intent    *lit;
276         CFS_LIST_HEAD(cancels);
277         int                    count = 0;
278         int                    mode;
279         int                    rc;
280         ENTRY;
281
282         it->it_create_mode = (it->it_create_mode & ~S_IFMT) | S_IFREG;
283
284         /* XXX: openlock is not cancelled for cross-refs. */
285         /* If inode is known, cancel conflicting OPEN locks. */
286         if (fid_is_sane(&op_data->op_fid2)) {
287                 if (it->it_flags & (FMODE_WRITE|MDS_OPEN_TRUNC))
288                         mode = LCK_CW;
289 #ifdef FMODE_EXEC
290                 else if (it->it_flags & FMODE_EXEC)
291                         mode = LCK_PR;
292 #endif
293                 else
294                         mode = LCK_CR;
295                 count = mdc_resource_get_unused(exp, &op_data->op_fid2,
296                                                 &cancels, mode,
297                                                 MDS_INODELOCK_OPEN);
298         }
299
300         /* If CREATE, cancel parent's UPDATE lock. */
301         if (it->it_op & IT_CREAT)
302                 mode = LCK_EX;
303         else
304                 mode = LCK_CR;
305         count += mdc_resource_get_unused(exp, &op_data->op_fid1,
306                                          &cancels, mode,
307                                          MDS_INODELOCK_UPDATE);
308
309         req = ptlrpc_request_alloc(class_exp2cliimp(exp),
310                                    &RQF_LDLM_INTENT_OPEN);
311         if (req == NULL) {
312                 ldlm_lock_list_put(&cancels, l_bl_ast, count);
313                 RETURN(ERR_PTR(-ENOMEM));
314         }
315
316         /* parent capability */
317         mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
318         /* child capability, reserve the size according to parent capa, it will
319          * be filled after we get the reply */
320         mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa1);
321
322         req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
323                              op_data->op_namelen + 1);
324         req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT,
325                              max(lmmsize, obddev->u.cli.cl_default_mds_easize));
326
327         rc = ldlm_prep_enqueue_req(exp, req, &cancels, count);
328         if (rc) {
329                 ptlrpc_request_free(req);
330                 return NULL;
331         }
332
333         cfs_spin_lock(&req->rq_lock);
334         req->rq_replay = req->rq_import->imp_replayable;
335         cfs_spin_unlock(&req->rq_lock);
336
337         /* pack the intent */
338         lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
339         lit->opc = (__u64)it->it_op;
340
341         /* pack the intended request */
342         mdc_open_pack(req, op_data, it->it_create_mode, 0, it->it_flags, lmm,
343                       lmmsize);
344
345         /* for remote client, fetch remote perm for current user */
346         if (client_is_remote(exp))
347                 req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
348                                      sizeof(struct mdt_remote_perm));
349         ptlrpc_request_set_replen(req);
350         return req;
351 }
352
353 static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp,
354                                                      struct lookup_intent *it,
355                                                      struct md_op_data *op_data)
356 {
357         struct ptlrpc_request *req;
358         struct obd_device     *obddev = class_exp2obd(exp);
359         struct ldlm_intent    *lit;
360         int                    rc;
361         ENTRY;
362
363         req = ptlrpc_request_alloc(class_exp2cliimp(exp),
364                                    &RQF_LDLM_INTENT_UNLINK);
365         if (req == NULL)
366                 RETURN(ERR_PTR(-ENOMEM));
367
368         mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
369         req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
370                              op_data->op_namelen + 1);
371
372         rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
373         if (rc) {
374                 ptlrpc_request_free(req);
375                 RETURN(ERR_PTR(rc));
376         }
377
378         /* pack the intent */
379         lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
380         lit->opc = (__u64)it->it_op;
381
382         /* pack the intended request */
383         mdc_unlink_pack(req, op_data);
384
385         req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
386                              obddev->u.cli.cl_max_mds_easize);
387         req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
388                              obddev->u.cli.cl_max_mds_cookiesize);
389         ptlrpc_request_set_replen(req);
390         RETURN(req);
391 }
392
393 static struct ptlrpc_request *mdc_intent_getattr_pack(struct obd_export *exp,
394                                                       struct lookup_intent *it,
395                                                       struct md_op_data *op_data)
396 {
397         struct ptlrpc_request *req;
398         struct obd_device     *obddev = class_exp2obd(exp);
399         obd_valid              valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE |
400                                        OBD_MD_FLMODEASIZE | OBD_MD_FLDIREA |
401                                        OBD_MD_FLMDSCAPA | OBD_MD_MEA |
402                                        (client_is_remote(exp) ?
403                                                OBD_MD_FLRMTPERM : OBD_MD_FLACL);
404         struct ldlm_intent    *lit;
405         int                    rc;
406         ENTRY;
407
408         req = ptlrpc_request_alloc(class_exp2cliimp(exp),
409                                    &RQF_LDLM_INTENT_GETATTR);
410         if (req == NULL)
411                 RETURN(ERR_PTR(-ENOMEM));
412
413         mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
414         req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
415                              op_data->op_namelen + 1);
416
417         rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
418         if (rc) {
419                 ptlrpc_request_free(req);
420                 RETURN(ERR_PTR(rc));
421         }
422
423         /* pack the intent */
424         lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
425         lit->opc = (__u64)it->it_op;
426
427         /* pack the intended request */
428         mdc_getattr_pack(req, valid, it->it_flags, op_data,
429                          obddev->u.cli.cl_max_mds_easize);
430
431         req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
432                              obddev->u.cli.cl_max_mds_easize);
433         if (client_is_remote(exp))
434                 req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
435                                      sizeof(struct mdt_remote_perm));
436         ptlrpc_request_set_replen(req);
437         RETURN(req);
438 }
439
440 static struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp)
441 {
442         struct ptlrpc_request *req;
443         int rc;
444         ENTRY;
445
446         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE);
447         if (req == NULL)
448                 RETURN(ERR_PTR(-ENOMEM));
449
450         rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
451         if (rc) {
452                 ptlrpc_request_free(req);
453                 RETURN(ERR_PTR(rc));
454         }
455
456         ptlrpc_request_set_replen(req);
457         RETURN(req);
458 }
459
460 static int mdc_finish_enqueue(struct obd_export *exp,
461                               struct ptlrpc_request *req,
462                               struct ldlm_enqueue_info *einfo,
463                               struct lookup_intent *it,
464                               struct lustre_handle *lockh,
465                               int rc)
466 {
467         struct req_capsule  *pill = &req->rq_pill;
468         struct ldlm_request *lockreq;
469         struct ldlm_reply   *lockrep;
470         struct lustre_intent_data *intent = &it->d.lustre;
471         ENTRY;
472
473         LASSERT(rc >= 0);
474         /* Similarly, if we're going to replay this request, we don't want to
475          * actually get a lock, just perform the intent. */
476         if (req->rq_transno || req->rq_replay) {
477                 lockreq = req_capsule_client_get(pill, &RMF_DLM_REQ);
478                 lockreq->lock_flags |= LDLM_FL_INTENT_ONLY;
479         }
480
481         if (rc == ELDLM_LOCK_ABORTED) {
482                 einfo->ei_mode = 0;
483                 memset(lockh, 0, sizeof(*lockh));
484                 rc = 0;
485         } else { /* rc = 0 */
486                 struct ldlm_lock *lock = ldlm_handle2lock(lockh);
487                 LASSERT(lock);
488
489                 /* If the server gave us back a different lock mode, we should
490                  * fix up our variables. */
491                 if (lock->l_req_mode != einfo->ei_mode) {
492                         ldlm_lock_addref(lockh, lock->l_req_mode);
493                         ldlm_lock_decref(lockh, einfo->ei_mode);
494                         einfo->ei_mode = lock->l_req_mode;
495                 }
496                 LDLM_LOCK_PUT(lock);
497         }
498
499         lockrep = req_capsule_server_get(pill, &RMF_DLM_REP);
500         LASSERT(lockrep != NULL); /* checked by ldlm_cli_enqueue() */
501
502         intent->it_disposition = (int)lockrep->lock_policy_res1;
503         intent->it_status = (int)lockrep->lock_policy_res2;
504         intent->it_lock_mode = einfo->ei_mode;
505         intent->it_lock_handle = lockh->cookie;
506         intent->it_data = req;
507
508         if (intent->it_status < 0 && req->rq_replay)
509                 mdc_clear_replay_flag(req, intent->it_status);
510
511         /* If we're doing an IT_OPEN which did not result in an actual
512          * successful open, then we need to remove the bit which saves
513          * this request for unconditional replay.
514          *
515          * It's important that we do this first!  Otherwise we might exit the
516          * function without doing so, and try to replay a failed create
517          * (bug 3440) */
518         if (it->it_op & IT_OPEN && req->rq_replay &&
519             (!it_disposition(it, DISP_OPEN_OPEN) ||intent->it_status != 0))
520                 mdc_clear_replay_flag(req, intent->it_status);
521
522         DEBUG_REQ(D_RPCTRACE, req, "op: %d disposition: %x, status: %d",
523                   it->it_op, intent->it_disposition, intent->it_status);
524
525         /* We know what to expect, so we do any byte flipping required here */
526         if (it->it_op & (IT_OPEN | IT_UNLINK | IT_LOOKUP | IT_GETATTR)) {
527                 struct mdt_body *body;
528
529                 body = req_capsule_server_get(pill, &RMF_MDT_BODY);
530                 if (body == NULL) {
531                         CERROR ("Can't swab mdt_body\n");
532                         RETURN (-EPROTO);
533                 }
534
535                 if (it_disposition(it, DISP_OPEN_OPEN) &&
536                     !it_open_error(DISP_OPEN_OPEN, it)) {
537                         /*
538                          * If this is a successful OPEN request, we need to set
539                          * replay handler and data early, so that if replay
540                          * happens immediately after swabbing below, new reply
541                          * is swabbed by that handler correctly.
542                          */
543                         mdc_set_open_replay_data(NULL, NULL, req);
544                 }
545
546                 /* TODO: make sure LAYOUT lock must be granted along with EA */
547
548                 if ((body->valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) != 0) {
549                         void *eadata;
550
551                          mdc_update_max_ea_from_body(exp, body);
552
553                         /*
554                          * The eadata is opaque; just check that it is there.
555                          * Eventually, obd_unpackmd() will check the contents.
556                          */
557                         eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD,
558                                                               body->eadatasize);
559                         if (eadata == NULL)
560                                 RETURN(-EPROTO);
561
562                         /*
563                          * We save the reply LOV EA in case we have to replay a
564                          * create for recovery.  If we didn't allocate a large
565                          * enough request buffer above we need to reallocate it
566                          * here to hold the actual LOV EA.
567                          *
568                          * To not save LOV EA if request is not going to replay
569                          * (for example error one).
570                          */
571                         if ((it->it_op & IT_OPEN) && req->rq_replay) {
572                                 void *lmm;
573                                 if (req_capsule_get_size(pill, &RMF_EADATA,
574                                                          RCL_CLIENT) <
575                                     body->eadatasize)
576                                         mdc_realloc_openmsg(req, body);
577                                 else
578                                         req_capsule_shrink(pill, &RMF_EADATA,
579                                                            body->eadatasize,
580                                                            RCL_CLIENT);
581
582                                 req_capsule_set_size(pill, &RMF_EADATA,
583                                                      RCL_CLIENT,
584                                                      body->eadatasize);
585
586                                 lmm = req_capsule_client_get(pill, &RMF_EADATA);
587                                 if (lmm)
588                                         memcpy(lmm, eadata, body->eadatasize);
589                         }
590                 }
591
592                 if (body->valid & OBD_MD_FLRMTPERM) {
593                         struct mdt_remote_perm *perm;
594
595                         LASSERT(client_is_remote(exp));
596                         perm = req_capsule_server_swab_get(pill, &RMF_ACL,
597                                                 lustre_swab_mdt_remote_perm);
598                         if (perm == NULL)
599                                 RETURN(-EPROTO);
600                 }
601                 if (body->valid & OBD_MD_FLMDSCAPA) {
602                         struct lustre_capa *capa, *p;
603
604                         capa = req_capsule_server_get(pill, &RMF_CAPA1);
605                         if (capa == NULL)
606                                 RETURN(-EPROTO);
607
608                         if (it->it_op & IT_OPEN) {
609                                 /* client fid capa will be checked in replay */
610                                 p = req_capsule_client_get(pill, &RMF_CAPA2);
611                                 LASSERT(p);
612                                 *p = *capa;
613                         }
614                 }
615                 if (body->valid & OBD_MD_FLOSSCAPA) {
616                         struct lustre_capa *capa;
617
618                         capa = req_capsule_server_get(pill, &RMF_CAPA2);
619                         if (capa == NULL)
620                                 RETURN(-EPROTO);
621                 }
622         } else if (it->it_op & IT_LAYOUT) {
623                 struct ldlm_lock *lock = ldlm_handle2lock(lockh);
624
625                 if (lock != NULL && lock->l_lvb_data == NULL) {
626                         int lvb_len;
627
628                         /* maybe the lock was granted right away and layout
629                          * is packed into RMF_DLM_LVB of req */
630                         lvb_len = req_capsule_get_size(pill, &RMF_DLM_LVB,
631                                                        RCL_SERVER);
632                         if (lvb_len > 0) {
633                                 void *lvb;
634                                 void *lmm;
635
636                                 lvb = req_capsule_server_get(pill,
637                                                              &RMF_DLM_LVB);
638                                 if (lvb == NULL) {
639                                         LDLM_LOCK_PUT(lock);
640                                         RETURN(-EPROTO);
641                                 }
642
643                                 OBD_ALLOC_LARGE(lmm, lvb_len);
644                                 if (lmm == NULL) {
645                                         LDLM_LOCK_PUT(lock);
646                                         RETURN(-ENOMEM);
647                                 }
648                                 memcpy(lmm, lvb, lvb_len);
649
650                                 /* install lvb_data */
651                                 lock_res_and_lock(lock);
652                                 LASSERT(lock->l_lvb_data == NULL);
653                                 lock->l_lvb_data = lmm;
654                                 lock->l_lvb_len = lvb_len;
655                                 unlock_res_and_lock(lock);
656                         }
657                 }
658                 if (lock != NULL)
659                         LDLM_LOCK_PUT(lock);
660         }
661
662         RETURN(rc);
663 }
664
665 /* We always reserve enough space in the reply packet for a stripe MD, because
666  * we don't know in advance the file type. */
667 int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
668                 struct lookup_intent *it, struct md_op_data *op_data,
669                 struct lustre_handle *lockh, void *lmm, int lmmsize,
670                 struct ptlrpc_request **reqp, int extra_lock_flags)
671 {
672         struct obd_device     *obddev = class_exp2obd(exp);
673         struct ptlrpc_request *req = NULL;
674         int                    flags, saved_flags = extra_lock_flags;
675         int                    rc;
676         struct ldlm_res_id res_id;
677         static const ldlm_policy_data_t lookup_policy =
678                             { .l_inodebits = { MDS_INODELOCK_LOOKUP } };
679         static const ldlm_policy_data_t update_policy =
680                             { .l_inodebits = { MDS_INODELOCK_UPDATE } };
681         static const ldlm_policy_data_t layout_policy =
682                             { .l_inodebits = { MDS_INODELOCK_LAYOUT } };
683         ldlm_policy_data_t const *policy = &lookup_policy;
684         int                    generation, resends = 0;
685         struct ldlm_reply     *lockrep;
686         ENTRY;
687
688         LASSERTF(!it || einfo->ei_type == LDLM_IBITS, "lock type %d\n",
689                  einfo->ei_type);
690
691         fid_build_reg_res_name(&op_data->op_fid1, &res_id);
692
693         if (it) {
694                 saved_flags |= LDLM_FL_HAS_INTENT;
695                 if (it->it_op & (IT_UNLINK | IT_GETATTR | IT_READDIR))
696                         policy = &update_policy;
697                 else if (it->it_op & IT_LAYOUT)
698                         policy = &layout_policy;
699         }
700
701         LASSERT(reqp == NULL);
702
703         generation = obddev->u.cli.cl_import->imp_generation;
704 resend:
705         flags = saved_flags;
706         if (!it) {
707                 /* The only way right now is FLOCK, in this case we hide flock
708                    policy as lmm, but lmmsize is 0 */
709                 LASSERT(lmm && lmmsize == 0);
710                 LASSERTF(einfo->ei_type == LDLM_FLOCK, "lock type %d\n",
711                          einfo->ei_type);
712                 policy = (ldlm_policy_data_t *)lmm;
713                 res_id.name[3] = LDLM_FLOCK;
714         } else if (it->it_op & IT_OPEN) {
715                 req = mdc_intent_open_pack(exp, it, op_data, lmm, lmmsize,
716                                            einfo->ei_cbdata);
717                 policy = &update_policy;
718                 einfo->ei_cbdata = NULL;
719                 lmm = NULL;
720         } else if (it->it_op & IT_UNLINK)
721                 req = mdc_intent_unlink_pack(exp, it, op_data);
722         else if (it->it_op & (IT_GETATTR | IT_LOOKUP))
723                 req = mdc_intent_getattr_pack(exp, it, op_data);
724         else if (it->it_op & (IT_READDIR | IT_LAYOUT))
725                 req = ldlm_enqueue_pack(exp);
726         else {
727                 LBUG();
728                 RETURN(-EINVAL);
729         }
730
731         if (IS_ERR(req))
732                 RETURN(PTR_ERR(req));
733
734         if (req != NULL && it && it->it_op & IT_CREAT)
735                 /* ask ptlrpc not to resend on EINPROGRESS since we have our own
736                  * retry logic */
737                 req->rq_no_retry_einprogress = 1;
738
739         if (resends) {
740                 req->rq_generation_set = 1;
741                 req->rq_import_generation = generation;
742                 req->rq_sent = cfs_time_current_sec() + resends;
743         }
744
745         /* It is important to obtain rpc_lock first (if applicable), so that
746          * threads that are serialised with rpc_lock are not polluting our
747          * rpcs in flight counter. We do not do flock request limiting, though*/
748         if (it) {
749                 mdc_get_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
750                 rc = mdc_enter_request(&obddev->u.cli);
751                 if (rc != 0) {
752                         mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
753                         mdc_clear_replay_flag(req, 0);
754                         ptlrpc_req_finished(req);
755                         RETURN(rc);
756                 }
757         }
758
759         rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL,
760                               0, lockh, 0);
761         if (!it) {
762                 /* For flock requests we immediatelly return without further
763                    delay and let caller deal with the rest, since rest of
764                    this function metadata processing makes no sense for flock
765                    requests anyway */
766                 RETURN(rc);
767         }
768
769         mdc_exit_request(&obddev->u.cli);
770         mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
771
772         if (rc < 0) {
773                 CERROR("ldlm_cli_enqueue: %d\n", rc);
774                 mdc_clear_replay_flag(req, rc);
775                 ptlrpc_req_finished(req);
776                 RETURN(rc);
777         }
778
779         lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
780         LASSERT(lockrep != NULL);
781
782         /* Retry the create infinitely when we get -EINPROGRESS from
783          * server. This is required by the new quota design. */
784         if (it && it->it_op & IT_CREAT &&
785             (int)lockrep->lock_policy_res2 == -EINPROGRESS) {
786                 mdc_clear_replay_flag(req, rc);
787                 ptlrpc_req_finished(req);
788                 resends++;
789
790                 CDEBUG(D_HA, "%s: resend:%d op:%d "DFID"/"DFID"\n",
791                        obddev->obd_name, resends, it->it_op,
792                        PFID(&op_data->op_fid1), PFID(&op_data->op_fid2));
793
794                 if (generation == obddev->u.cli.cl_import->imp_generation) {
795                         goto resend;
796                 } else {
797                         CDEBUG(D_HA, "resend cross eviction\n");
798                         RETURN(-EIO);
799                 }
800         }
801
802         rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc);
803
804         RETURN(rc);
805 }
806
807 static int mdc_finish_intent_lock(struct obd_export *exp,
808                                   struct ptlrpc_request *request,
809                                   struct md_op_data *op_data,
810                                   struct lookup_intent *it,
811                                   struct lustre_handle *lockh)
812 {
813         struct lustre_handle old_lock;
814         struct mdt_body *mdt_body;
815         struct ldlm_lock *lock;
816         int rc;
817
818
819         LASSERT(request != NULL);
820         LASSERT(request != LP_POISON);
821         LASSERT(request->rq_repmsg != LP_POISON);
822
823         if (!it_disposition(it, DISP_IT_EXECD)) {
824                 /* The server failed before it even started executing the
825                  * intent, i.e. because it couldn't unpack the request. */
826                 LASSERT(it->d.lustre.it_status != 0);
827                 RETURN(it->d.lustre.it_status);
828         }
829         rc = it_open_error(DISP_IT_EXECD, it);
830         if (rc)
831                 RETURN(rc);
832
833         mdt_body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
834         LASSERT(mdt_body != NULL);      /* mdc_enqueue checked */
835
836         /* If we were revalidating a fid/name pair, mark the intent in
837          * case we fail and get called again from lookup */
838         if (fid_is_sane(&op_data->op_fid2) &&
839             it->it_create_mode & M_CHECK_STALE &&
840             it->it_op != IT_GETATTR) {
841                 it_set_disposition(it, DISP_ENQ_COMPLETE);
842
843                 /* Also: did we find the same inode? */
844                 /* sever can return one of two fids:
845                  * op_fid2 - new allocated fid - if file is created.
846                  * op_fid3 - existent fid - if file only open.
847                  * op_fid3 is saved in lmv_intent_open */
848                 if ((!lu_fid_eq(&op_data->op_fid2, &mdt_body->fid1)) &&
849                     (!lu_fid_eq(&op_data->op_fid3, &mdt_body->fid1))) {
850                         CDEBUG(D_DENTRY, "Found stale data "DFID"("DFID")/"DFID
851                                "\n", PFID(&op_data->op_fid2),
852                                PFID(&op_data->op_fid2), PFID(&mdt_body->fid1));
853                         RETURN(-ESTALE);
854                 }
855         }
856
857         rc = it_open_error(DISP_LOOKUP_EXECD, it);
858         if (rc)
859                 RETURN(rc);
860
861         /* keep requests around for the multiple phases of the call
862          * this shows the DISP_XX must guarantee we make it into the call
863          */
864         if (!it_disposition(it, DISP_ENQ_CREATE_REF) &&
865             it_disposition(it, DISP_OPEN_CREATE) &&
866             !it_open_error(DISP_OPEN_CREATE, it)) {
867                 it_set_disposition(it, DISP_ENQ_CREATE_REF);
868                 ptlrpc_request_addref(request); /* balanced in ll_create_node */
869         }
870         if (!it_disposition(it, DISP_ENQ_OPEN_REF) &&
871             it_disposition(it, DISP_OPEN_OPEN) &&
872             !it_open_error(DISP_OPEN_OPEN, it)) {
873                 it_set_disposition(it, DISP_ENQ_OPEN_REF);
874                 ptlrpc_request_addref(request); /* balanced in ll_file_open */
875                 /* BUG 11546 - eviction in the middle of open rpc processing */
876                 OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_ENQUEUE_PAUSE, obd_timeout);
877         }
878
879         if (it->it_op & IT_CREAT) {
880                 /* XXX this belongs in ll_create_it */
881         } else if (it->it_op == IT_OPEN) {
882                 LASSERT(!it_disposition(it, DISP_OPEN_CREATE));
883         } else {
884                 LASSERT(it->it_op & (IT_GETATTR | IT_LOOKUP | IT_LAYOUT));
885         }
886
887         /* If we already have a matching lock, then cancel the new
888          * one.  We have to set the data here instead of in
889          * mdc_enqueue, because we need to use the child's inode as
890          * the l_ast_data to match, and that's not available until
891          * intent_finish has performed the iget().) */
892         lock = ldlm_handle2lock(lockh);
893         if (lock) {
894                 ldlm_policy_data_t policy = lock->l_policy_data;
895                 LDLM_DEBUG(lock, "matching against this");
896
897                 LASSERTF(fid_res_name_eq(&mdt_body->fid1,
898                                          &lock->l_resource->lr_name),
899                          "Lock res_id: %lu/%lu/%lu, fid: %lu/%lu/%lu.\n",
900                          (unsigned long)lock->l_resource->lr_name.name[0],
901                          (unsigned long)lock->l_resource->lr_name.name[1],
902                          (unsigned long)lock->l_resource->lr_name.name[2],
903                          (unsigned long)fid_seq(&mdt_body->fid1),
904                          (unsigned long)fid_oid(&mdt_body->fid1),
905                          (unsigned long)fid_ver(&mdt_body->fid1));
906                 LDLM_LOCK_PUT(lock);
907
908                 memcpy(&old_lock, lockh, sizeof(*lockh));
909                 if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL,
910                                     LDLM_IBITS, &policy, LCK_NL, &old_lock, 0)) {
911                         ldlm_lock_decref_and_cancel(lockh,
912                                                     it->d.lustre.it_lock_mode);
913                         memcpy(lockh, &old_lock, sizeof(old_lock));
914                         it->d.lustre.it_lock_handle = lockh->cookie;
915                 }
916         }
917         CDEBUG(D_DENTRY,"D_IT dentry %.*s intent: %s status %d disp %x rc %d\n",
918                op_data->op_namelen, op_data->op_name, ldlm_it2str(it->it_op),
919                it->d.lustre.it_status, it->d.lustre.it_disposition, rc);
920         RETURN(rc);
921 }
922
923 int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it,
924                         struct lu_fid *fid, __u64 *bits)
925 {
926         /* We could just return 1 immediately, but since we should only
927          * be called in revalidate_it if we already have a lock, let's
928          * verify that. */
929         struct ldlm_res_id res_id;
930         struct lustre_handle lockh;
931         ldlm_policy_data_t policy;
932         ldlm_mode_t mode;
933         ENTRY;
934
935         if (it->d.lustre.it_lock_handle) {
936                 lockh.cookie = it->d.lustre.it_lock_handle;
937                 mode = ldlm_revalidate_lock_handle(&lockh, bits);
938         } else {
939                 fid_build_reg_res_name(fid, &res_id);
940                 switch (it->it_op) {
941                 case IT_GETATTR:
942                         policy.l_inodebits.bits = MDS_INODELOCK_UPDATE;
943                         break;
944                 case IT_LAYOUT:
945                         policy.l_inodebits.bits = MDS_INODELOCK_LAYOUT;
946                         break;
947                 default:
948                         policy.l_inodebits.bits = MDS_INODELOCK_LOOKUP;
949                         break;
950                 }
951                 mode = ldlm_lock_match(exp->exp_obd->obd_namespace,
952                                        LDLM_FL_BLOCK_GRANTED, &res_id,
953                                        LDLM_IBITS, &policy,
954                                        LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh, 0);
955         }
956
957         if (mode) {
958                 it->d.lustre.it_lock_handle = lockh.cookie;
959                 it->d.lustre.it_lock_mode = mode;
960         } else {
961                 it->d.lustre.it_lock_handle = 0;
962                 it->d.lustre.it_lock_mode = 0;
963         }
964
965         RETURN(!!mode);
966 }
967
968 /*
969  * This long block is all about fixing up the lock and request state
970  * so that it is correct as of the moment _before_ the operation was
971  * applied; that way, the VFS will think that everything is normal and
972  * call Lustre's regular VFS methods.
973  *
974  * If we're performing a creation, that means that unless the creation
975  * failed with EEXIST, we should fake up a negative dentry.
976  *
977  * For everything else, we want to lookup to succeed.
978  *
979  * One additional note: if CREATE or OPEN succeeded, we add an extra
980  * reference to the request because we need to keep it around until
981  * ll_create/ll_open gets called.
982  *
983  * The server will return to us, in it_disposition, an indication of
984  * exactly what d.lustre.it_status refers to.
985  *
986  * If DISP_OPEN_OPEN is set, then d.lustre.it_status refers to the open() call,
987  * otherwise if DISP_OPEN_CREATE is set, then it status is the
988  * creation failure mode.  In either case, one of DISP_LOOKUP_NEG or
989  * DISP_LOOKUP_POS will be set, indicating whether the child lookup
990  * was successful.
991  *
992  * Else, if DISP_LOOKUP_EXECD then d.lustre.it_status is the rc of the
993  * child lookup.
994  */
995 int mdc_intent_lock(struct obd_export *exp, struct md_op_data *op_data,
996                     void *lmm, int lmmsize, struct lookup_intent *it,
997                     int lookup_flags, struct ptlrpc_request **reqp,
998                     ldlm_blocking_callback cb_blocking,
999                     int extra_lock_flags)
1000 {
1001         struct lustre_handle lockh;
1002         int rc = 0;
1003         ENTRY;
1004         LASSERT(it);
1005
1006         CDEBUG(D_DLMTRACE, "(name: %.*s,"DFID") in obj "DFID
1007                ", intent: %s flags %#o\n", op_data->op_namelen,
1008                op_data->op_name, PFID(&op_data->op_fid2),
1009                PFID(&op_data->op_fid1), ldlm_it2str(it->it_op),
1010                it->it_flags);
1011
1012         lockh.cookie = 0;
1013         if (fid_is_sane(&op_data->op_fid2) &&
1014             (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_LAYOUT))) {
1015                 /* We could just return 1 immediately, but since we should only
1016                  * be called in revalidate_it if we already have a lock, let's
1017                  * verify that. */
1018                 it->d.lustre.it_lock_handle = 0;
1019                 rc = mdc_revalidate_lock(exp, it, &op_data->op_fid2, NULL);
1020                 /* Only return failure if it was not GETATTR by cfid
1021                    (from inode_revalidate) */
1022                 if (rc || op_data->op_namelen != 0)
1023                         RETURN(rc);
1024         }
1025
1026         /* lookup_it may be called only after revalidate_it has run, because
1027          * revalidate_it cannot return errors, only zero.  Returning zero causes
1028          * this call to lookup, which *can* return an error.
1029          *
1030          * We only want to execute the request associated with the intent one
1031          * time, however, so don't send the request again.  Instead, skip past
1032          * this and use the request from revalidate.  In this case, revalidate
1033          * never dropped its reference, so the refcounts are all OK */
1034         if (!it_disposition(it, DISP_ENQ_COMPLETE)) {
1035                 struct ldlm_enqueue_info einfo =
1036                         { LDLM_IBITS, it_to_lock_mode(it), cb_blocking,
1037                           ldlm_completion_ast, NULL, NULL, NULL };
1038
1039                 /* For case if upper layer did not alloc fid, do it now. */
1040                 if (!fid_is_sane(&op_data->op_fid2) && it->it_op & IT_CREAT) {
1041                         rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data);
1042                         if (rc < 0) {
1043                                 CERROR("Can't alloc new fid, rc %d\n", rc);
1044                                 RETURN(rc);
1045                         }
1046                 }
1047                 rc = mdc_enqueue(exp, &einfo, it, op_data, &lockh,
1048                                  lmm, lmmsize, NULL, extra_lock_flags);
1049                 if (rc < 0)
1050                         RETURN(rc);
1051         } else if (!fid_is_sane(&op_data->op_fid2) ||
1052                    !(it->it_create_mode & M_CHECK_STALE)) {
1053                 /* DISP_ENQ_COMPLETE set means there is extra reference on
1054                  * request referenced from this intent, saved for subsequent
1055                  * lookup.  This path is executed when we proceed to this
1056                  * lookup, so we clear DISP_ENQ_COMPLETE */
1057                 it_clear_disposition(it, DISP_ENQ_COMPLETE);
1058         }
1059         *reqp = it->d.lustre.it_data;
1060         rc = mdc_finish_intent_lock(exp, *reqp, op_data, it, &lockh);
1061         RETURN(rc);
1062 }
1063
1064 static int mdc_intent_getattr_async_interpret(const struct lu_env *env,
1065                                               struct ptlrpc_request *req,
1066                                               void *args, int rc)
1067 {
1068         struct mdc_getattr_args  *ga = args;
1069         struct obd_export        *exp = ga->ga_exp;
1070         struct md_enqueue_info   *minfo = ga->ga_minfo;
1071         struct ldlm_enqueue_info *einfo = ga->ga_einfo;
1072         struct lookup_intent     *it;
1073         struct lustre_handle     *lockh;
1074         struct obd_device        *obddev;
1075         int                       flags = LDLM_FL_HAS_INTENT;
1076         ENTRY;
1077
1078         it    = &minfo->mi_it;
1079         lockh = &minfo->mi_lockh;
1080
1081         obddev = class_exp2obd(exp);
1082
1083         mdc_exit_request(&obddev->u.cli);
1084         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GETATTR_ENQUEUE))
1085                 rc = -ETIMEDOUT;
1086
1087         rc = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, 1, einfo->ei_mode,
1088                                    &flags, NULL, 0, lockh, rc);
1089         if (rc < 0) {
1090                 CERROR("ldlm_cli_enqueue_fini: %d\n", rc);
1091                 mdc_clear_replay_flag(req, rc);
1092                 GOTO(out, rc);
1093         }
1094
1095         rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc);
1096         if (rc)
1097                 GOTO(out, rc);
1098
1099         rc = mdc_finish_intent_lock(exp, req, &minfo->mi_data, it, lockh);
1100         EXIT;
1101
1102 out:
1103         OBD_FREE_PTR(einfo);
1104         minfo->mi_cb(req, minfo, rc);
1105         return 0;
1106 }
1107
1108 int mdc_intent_getattr_async(struct obd_export *exp,
1109                              struct md_enqueue_info *minfo,
1110                              struct ldlm_enqueue_info *einfo)
1111 {
1112         struct md_op_data       *op_data = &minfo->mi_data;
1113         struct lookup_intent    *it = &minfo->mi_it;
1114         struct ptlrpc_request   *req;
1115         struct mdc_getattr_args *ga;
1116         struct obd_device       *obddev = class_exp2obd(exp);
1117         struct ldlm_res_id       res_id;
1118         /*XXX: Both MDS_INODELOCK_LOOKUP and MDS_INODELOCK_UPDATE are needed
1119          *     for statahead currently. Consider CMD in future, such two bits
1120          *     maybe managed by different MDS, should be adjusted then. */
1121         ldlm_policy_data_t       policy = {
1122                                         .l_inodebits = { MDS_INODELOCK_LOOKUP | 
1123                                                          MDS_INODELOCK_UPDATE }
1124                                  };
1125         int                      rc = 0;
1126         int                      flags = LDLM_FL_HAS_INTENT;
1127         ENTRY;
1128
1129         CDEBUG(D_DLMTRACE,"name: %.*s in inode "DFID", intent: %s flags %#o\n",
1130                op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
1131                ldlm_it2str(it->it_op), it->it_flags);
1132
1133         fid_build_reg_res_name(&op_data->op_fid1, &res_id);
1134         req = mdc_intent_getattr_pack(exp, it, op_data);
1135         if (!req)
1136                 RETURN(-ENOMEM);
1137
1138         rc = mdc_enter_request(&obddev->u.cli);
1139         if (rc != 0) {
1140                 ptlrpc_req_finished(req);
1141                 RETURN(rc);
1142         }
1143
1144         rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, &policy, &flags, NULL,
1145                               0, &minfo->mi_lockh, 1);
1146         if (rc < 0) {
1147                 mdc_exit_request(&obddev->u.cli);
1148                 ptlrpc_req_finished(req);
1149                 RETURN(rc);
1150         }
1151
1152         CLASSERT(sizeof(*ga) <= sizeof(req->rq_async_args));
1153         ga = ptlrpc_req_async_args(req);
1154         ga->ga_exp = exp;
1155         ga->ga_minfo = minfo;
1156         ga->ga_einfo = einfo;
1157
1158         req->rq_interpret_reply = mdc_intent_getattr_async_interpret;
1159         ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
1160
1161         RETURN(0);
1162 }