1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
6 * This file is part of Lustre, http://www.lustre.org.
8 * Lustre is free software; you can redistribute it and/or
9 * modify it under the terms of version 2 of the GNU General Public
10 * License as published by the Free Software Foundation.
12 * Lustre is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Lustre; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
23 # define EXPORT_SYMTAB
25 #define DEBUG_SUBSYSTEM S_LMV
27 #include <linux/slab.h>
28 #include <linux/module.h>
29 #include <linux/init.h>
30 #include <linux/slab.h>
31 #include <linux/pagemap.h>
32 #include <asm/div64.h>
34 #include <liblustre.h>
37 #include <linux/obd_support.h>
38 #include <linux/lustre_lib.h>
39 #include <linux/lustre_net.h>
40 #include <linux/lustre_idl.h>
41 #include <linux/lustre_dlm.h>
42 #include <linux/lustre_mds.h>
43 #include <linux/obd_class.h>
44 #include <linux/obd_ost.h>
45 #include <linux/seq_file.h>
46 #include <linux/lprocfs_status.h>
47 #include <linux/lustre_fsfilt.h>
48 #include <linux/obd_lmv.h>
49 #include "lmv_internal.h"
/*
 * lmv_handle_remote_inode - follow a "remote inode" reply to another MDS.
 *
 * The mds_body is read from buffer 1 of the reply attached to *reqp.  When
 * body->valid carries OBD_MD_MDS, the lock handle and lock mode currently
 * stored in the intent are saved aside (plock/pmode), the intent's lock
 * mode is zeroed, DISP_ENQ_COMPLETE is cleared from its disposition and
 * md_intent_lock() is issued against lmv->tgts[nfid.mds].exp for nfid.
 * The visible code then drops the reference on the lock left in the intent
 * by that call and copies the saved handle/mode back into the intent.
 * A ldlm_lock_decref() on the saved lock and a ptlrpc_req_finished() on
 * *reqp are visible as well.
 *
 * NOTE(review): the declarations of rc, pmode and nfid, the tails of the
 * multi-line calls, the conditions selecting between the two decref paths
 * and the return statements are not visible in this listing -- confirm
 * the exact control flow against the complete source.
 */
52 int lmv_handle_remote_inode(struct obd_export *exp, struct ll_uctxt *uctxt,
53 void *lmm, int lmmsize,
54 struct lookup_intent *it, int flags,
55 struct ptlrpc_request **reqp,
56 ldlm_blocking_callback cb_blocking)
58 struct obd_device *obd = exp->exp_obd;
59 struct lmv_obd *lmv = &obd->u.lmv;
60 struct mds_body *body = NULL;
64 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
65 LASSERT(body != NULL);
67 if (body->valid & OBD_MD_MDS) {
68 /* the MDS reports that this is the remote inode case,
69 * i.e. the real attrs have to be requested from another MDS */
70 struct ptlrpc_request *req;
72 struct lustre_handle plock;
75 /* we hold a LOOKUP lock, but what we really need are attrs */
76 pmode = it->d.lustre.it_lock_mode;
78 memcpy(&plock, &it->d.lustre.it_lock_handle,
80 it->d.lustre.it_lock_mode = 0;
84 it->d.lustre.it_disposition &= ~DISP_ENQ_COMPLETE;
85 rc = md_intent_lock(lmv->tgts[nfid.mds].exp, uctxt, &nfid,
86 NULL, 0, lmm, lmmsize, NULL, it, flags,
89 /* llite needs the LOOKUP lock to track dentry revocation in
90 * order to maintain dcache consistency, thus drop the UPDATE
91 * lock here and put the saved LOOKUP lock back into the intent */
93 LASSERT(it->d.lustre.it_lock_mode != 0);
94 ldlm_lock_decref((void *)&it->d.lustre.it_lock_handle,
95 it->d.lustre.it_lock_mode);
96 memcpy(&it->d.lustre.it_lock_handle, &plock,
98 it->d.lustre.it_lock_mode = pmode;
101 ldlm_lock_decref(&plock, pmode);
103 ptlrpc_req_finished(*reqp);
/*
 * lmv_intent_open - route an IT_OPEN intent to the proper MDS.
 *
 * Steps visible in this listing:
 *  - lmv_grab_obj() is called for the parent fid; for a split parent the
 *    target index is derived from the name with raw_name2idx() and the
 *    request is redirected to that slave's fid (rpfid);
 *  - the intent is sent with md_intent_lock() to lmv->tgts[mds].exp;
 *  - the reply is handed to lmv_handle_remote_inode();
 *  - the child fid is looked up with lmv_grab_obj(); when no object is
 *    found and is_body_of_splitted_dir() returns a mea, an object is
 *    created with lmv_create_obj_from_attrs() and looked up again;
 *  - lmv_revalidate_slaves() is called for cfid, while a directory reply
 *    without an lmv object only produces a CWARN.
 *
 * NOTE(review): the declarations of rc, mds, obj and mea, the conditions
 * guarding several of these steps and the return statements are not
 * visible in this listing -- confirm against the complete source.
 */
109 int lmv_intent_open(struct obd_export *exp, struct ll_uctxt *uctxt,
110 struct ll_fid *pfid, const char *name, int len,
111 void *lmm, int lmmsize, struct ll_fid *cfid,
112 struct lookup_intent *it, int flags,
113 struct ptlrpc_request **reqp,
114 ldlm_blocking_callback cb_blocking)
116 struct obd_device *obd = exp->exp_obd;
117 struct lmv_obd *lmv = &obd->u.lmv;
118 struct mds_body *body = NULL;
119 struct ll_fid rpfid = *pfid;
125 /* IT_OPEN is intended to open (and possibly create) an object.
126 * the parent (pfid) may be a split dir */
129 obj = lmv_grab_obj(obd, pfid, 0);
131 /* the directory is already split, so we have to forward
132 * the request to the right MDS */
133 mds = raw_name2idx(obj->objcount, name, len);
134 rpfid = obj->objs[mds].fid;
135 CDEBUG(D_OTHER, "forward to MDS #%u\n", mds);
138 rc = md_intent_lock(lmv->tgts[mds].exp, uctxt, &rpfid, name, len,
139 lmm, lmmsize, cfid, it, flags, reqp, cb_blocking);
145 /* the MDS has returned success; the name may have been
146 * resolved to a remote inode */
147 rc = lmv_handle_remote_inode(exp, uctxt, lmm, lmmsize, it, flags,
154 /* the caller may use the attrs the MDS returns on an IT_OPEN lock
155 * request, so we have to update them for a split dir */
156 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
157 LASSERT(body != NULL);
159 obj = lmv_grab_obj(obd, cfid, 0);
160 if (rc == 0 && !obj && (mea = is_body_of_splitted_dir(*reqp, 1))) {
161 /* this is a split dir without an lmv object yet: create one */
162 rc = lmv_create_obj_from_attrs(exp, &body->fid1, mea);
164 obj = lmv_grab_obj(obd, cfid, 0);
166 /* this is a split dir and we want its attrs from the slaves */
167 CDEBUG(D_OTHER, "attrs from slaves for %lu/%lu/%lu\n",
168 (unsigned long) cfid->mds,
169 (unsigned long) cfid->id,
170 (unsigned long) cfid->generation);
171 rc = lmv_revalidate_slaves(exp, reqp, cfid,
173 } else if (S_ISDIR(body->mode)) {
174 CWARN("hmmm, %lu/%lu/%lu has not lmv obj?!\n",
175 (unsigned long) cfid->mds,
176 (unsigned long) cfid->id,
177 (unsigned long) cfid->generation);
/*
 * lmv_intent_getattr - handle an IT_GETATTR intent.
 *
 * Two sequences are visible in this listing:
 *  - a revalidation sequence working on cfid: the lmv object for cfid is
 *    looked up with lmv_grab_obj(), md_intent_lock() is sent to
 *    lmv->tgts[mds].exp with cfid as child fid and, if an object exists
 *    and rc >= 0, lmv_revalidate_slaves() is called with rc passed on;
 *  - a by-name sequence working on pfid: for a split parent the target
 *    index is chosen with raw_name2idx(), md_intent_lock() is sent with a
 *    NULL child fid, the reply goes through lmv_handle_remote_inode(), and
 *    a split child (detected through lmv_grab_obj() or
 *    is_body_of_splitted_dir() + lmv_create_obj_from_attrs()) has its
 *    attrs collected by lmv_revalidate_slaves().
 *
 * NOTE(review): the condition selecting between the two sequences, the
 * declarations of rc, mds and mea, and the return statements are not
 * visible in this listing.  The comment starting at "this is splitted
 * dir. in order to optimize things" is also not terminated in the visible
 * text -- confirm against the complete source.
 */
183 int lmv_intent_getattr(struct obd_export *exp, struct ll_uctxt *uctxt,
184 struct ll_fid *pfid, const char *name, int len,
185 void *lmm, int lmmsize, struct ll_fid *cfid,
186 struct lookup_intent *it, int flags,
187 struct ptlrpc_request **reqp,
188 ldlm_blocking_callback cb_blocking)
190 struct obd_device *obd = exp->exp_obd;
191 struct lmv_obd *lmv = &obd->u.lmv;
192 struct mds_body *body = NULL;
193 struct ll_fid rpfid = *pfid;
194 struct lmv_obj *obj, *obj2;
200 /* the caller wants to revalidate the attrs of an object;
201 * the slaves have to be revalidated too if the requested
202 * object is a split directory */
203 CDEBUG(D_OTHER, "revalidate attrs for %lu/%lu/%lu\n",
204 (unsigned long) cfid->mds,
205 (unsigned long) cfid->id,
206 (unsigned long) cfid->generation);
208 obj = lmv_grab_obj(obd, cfid, 0);
210 /* strictly speaking this is not needed with the current
211 * _intent_lock(), but that may change some day */
212 rpfid = obj->objs[mds].fid;
214 rc = md_intent_lock(lmv->tgts[mds].exp, uctxt, &rpfid, name,
215 len, lmm, lmmsize, cfid, it, flags, reqp,
217 if (obj && rc >= 0) {
218 /* this is splitted dir. in order to optimize things
219 * a bit, we consider obj valid updating missing
220 * parts. FIXME: do we need to return any lock here?
221 * it would be fine if we don't. this means that
222 * nobody should use UPDATE lock to notify about
225 "revalidate slaves for %lu/%lu/%lu, rc %d\n",
226 (unsigned long) cfid->mds,
227 (unsigned long) cfid->id,
228 (unsigned long) cfid->generation, rc);
229 rc = lmv_revalidate_slaves(exp, reqp, cfid, it, rc,
235 CDEBUG(D_OTHER, "INTENT getattr for %*s on %lu/%lu/%lu\n",
236 len, name, (unsigned long) pfid->mds,
237 (unsigned long) pfid->id,
238 (unsigned long) pfid->generation);
241 obj = lmv_grab_obj(obd, pfid, 0);
243 /* directory is already splitted. calculate mds */
244 mds = raw_name2idx(obj->objcount, (char *) name, len);
245 rpfid = obj->objs[mds].fid;
246 CDEBUG(D_OTHER, "forward to MDS #%u (slave %lu/%lu/%lu)\n",
247 mds, (unsigned long) rpfid.mds,
248 (unsigned long) rpfid.id,
249 (unsigned long) rpfid.generation);
251 rc = md_intent_lock(lmv->tgts[mds].exp, uctxt, &rpfid, name,
252 len, lmm, lmmsize, NULL, it, flags, reqp,
258 /* the MDS has returned success; the name may have been
259 * resolved to a remote inode */
260 rc = lmv_handle_remote_inode(exp, uctxt, lmm, lmmsize, it, flags,
265 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
266 LASSERT(body != NULL);
268 obj2 = lmv_grab_obj(obd, cfid, 0);
270 if (rc == 0 && !obj2 && (mea = is_body_of_splitted_dir(*reqp, 1))) {
271 /* this is a split dir without an lmv object yet: create one */
272 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
273 LASSERT(body != NULL);
274 rc = lmv_create_obj_from_attrs(exp, &body->fid1, mea);
275 obj2 = lmv_grab_obj(obd, cfid, 0);
279 /* this is a split dir and we want its attrs from the slaves */
281 "attrs from slaves for %lu/%lu/%lu, rc %d\n",
282 (unsigned long) cfid->mds,
283 (unsigned long) cfid->id,
284 (unsigned long) cfid->generation, rc);
285 rc = lmv_revalidate_slaves(exp, reqp, cfid,
/*
 * lmv_update_body_from_obj - fold one sub-object into a reply body.
 *
 * The visible statement adds obj->size to body->size.
 *
 * NOTE(review): the remaining lines of this function are not visible in
 * this listing; other fields may be updated there as well.
 */
291 void lmv_update_body_from_obj(struct mds_body *body, struct lmv_inode *obj)
294 body->size += obj->size;
/*
 * lmv_lookup_slaves - refresh the cached sizes of a split directory's
 * sub-objects and fold them into the reply body.
 *
 * The mds_body is taken from buffer 1 of the reply attached to *reqp and
 * the lmv object is looked up by body->fid1.  The loop walks all
 * obj->objcount entries; an entry whose fid equals obj->fid is the master
 * (see the "skip master obj" branch).  For the other entries an
 * IT_GETATTR intent is sent with md_intent_lock() to
 * lmv->tgts[fid.mds].exp, first with the fid passed as child fid and,
 * per the in-line comments when no lock is held, once more with a NULL
 * child fid.  For a fetched reply the lock's l_ast_data is pointed at obj,
 * obj->count is incremented and the size from the slave's reply is stored
 * in obj->objs[i].size.  The visible tail of the loop releases the
 * request, calls lmv_update_body_from_obj() for the entry and drops the
 * lock reference when a lock mode is set.
 *
 * NOTE(review): the declarations of rc, i and obj, the branch conditions
 * inside the loop and the return statements are not visible in this
 * listing -- confirm against the complete source.
 */
302 int lmv_lookup_slaves(struct obd_export *exp, struct ptlrpc_request **reqp)
304 struct obd_device *obd = exp->exp_obd;
305 struct lmv_obd *lmv = &obd->u.lmv;
306 struct mds_body *body = NULL;
307 struct lustre_handle *lockh;
308 struct ldlm_lock *lock;
309 struct mds_body *body2;
310 struct ll_uctxt uctxt;
318 /* the master is locked. we'd like to take locks on the slaves
319 * and update the attributes to be returned from the slaves.
320 * it's important that lookup is called in two cases:
321 * - for the first time (dcache has no such resolving yet)
322 * - ->d_revalidate() returned false
323 * the last case is possible only if all the objs (master and
324 * all slaves) aren't valid */
326 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
327 LASSERT(body != NULL);
329 obj = lmv_grab_obj(obd, &body->fid1, 0);
332 CDEBUG(D_OTHER, "lookup slaves for %lu/%lu/%lu\n",
333 (unsigned long) body->fid1.mds,
334 (unsigned long) body->fid1.id,
335 (unsigned long) body->fid1.generation);
339 for (i = 0; i < obj->objcount; i++) {
340 struct ll_fid fid = obj->objs[i].fid;
341 struct ptlrpc_request *req = NULL;
342 struct lookup_intent it;
344 if (fid_equal(&fid, &obj->fid)) {
345 /* skip master obj */
349 CDEBUG(D_OTHER, "lookup slave %lu/%lu/%lu\n",
350 (unsigned long) fid.mds,
351 (unsigned long) fid.id,
352 (unsigned long) fid.generation);
355 memset(&it, 0, sizeof(it));
356 it.it_op = IT_GETATTR;
357 rc = md_intent_lock(lmv->tgts[fid.mds].exp, &uctxt, &fid,
358 NULL, 0, NULL, 0, &fid, &it, 0, &req,
359 lmv_dirobj_blocking_ast);
360 lockh = (struct lustre_handle *) &it.d.lustre.it_lock_handle;
362 /* nice, this slave is valid */
363 LASSERT(req == NULL);
364 CDEBUG(D_OTHER, "cached\n");
369 /* error during revalidation */
373 /* rc == 0 means we hold no such lock and can't assume
374 * the obj is still valid. look it up again */
375 LASSERT(req == NULL);
377 memset(&it, 0, sizeof(it));
378 it.it_op = IT_GETATTR;
379 rc = md_intent_lock(lmv->tgts[fid.mds].exp, &uctxt, &fid,
380 NULL, 0, NULL, 0, NULL, &it, 0, &req,
381 lmv_dirobj_blocking_ast);
382 lockh = (struct lustre_handle *) &it.d.lustre.it_lock_handle;
385 /* error during lookup */
389 lock = ldlm_handle2lock(lockh);
391 lock->l_ast_data = obj;
392 atomic_inc(&obj->count);
394 body2 = lustre_msg_buf(req->rq_repmsg, 1, sizeof(*body2));
397 obj->objs[i].size = body2->size;
398 CDEBUG(D_OTHER, "fresh: %lu\n",
399 (unsigned long) obj->objs[i].size);
404 ptlrpc_req_finished(req);
406 lmv_update_body_from_obj(body, obj->objs + i);
407 if (it.d.lustre.it_lock_mode)
408 ldlm_lock_decref(lockh, it.d.lustre.it_lock_mode);
/*
 * lmv_intent_lookup - handle an IT_LOOKUP intent.
 *
 * Two sequences are visible in this listing:
 *  - revalidation (per the in-line comment, selected by cfid != NULL):
 *    md_intent_lock() is sent to lmv->tgts[mds].exp with pfid and cfid and
 *    the result is logged;
 *  - lookup: the lmv object of the parent is fetched with lmv_grab_obj();
 *    for a split parent the target index comes from raw_name2idx() and
 *    rpfid is switched to that slave.  md_intent_lock() is then sent with
 *    a NULL child fid.  Two CWARN paths report a lookup whose data should
 *    already be up to date, and another path calls
 *    lmv_create_obj_from_attrs() with a NULL mea when a split is
 *    discovered late.  The reply goes through lmv_handle_remote_inode()
 *    and, when is_body_of_splitted_dir() returns a mea, the object for
 *    body->fid1 is looked up and lmv_create_obj_from_attrs() is called.
 *
 * NOTE(review): the declarations of rc, mds, obj and mea, most branch
 * conditions and the return statements are not visible in this listing.
 * The comment starting at "this is lookup. during lookup" is also not
 * terminated in the visible text -- confirm against the complete source.
 */
414 int lmv_intent_lookup(struct obd_export *exp, struct ll_uctxt *uctxt,
415 struct ll_fid *pfid, const char *name, int len,
416 void *lmm, int lmmsize, struct ll_fid *cfid,
417 struct lookup_intent *it, int flags,
418 struct ptlrpc_request **reqp,
419 ldlm_blocking_callback cb_blocking)
421 struct obd_device *obd = exp->exp_obd;
422 struct lmv_obd *lmv = &obd->u.lmv;
423 struct mds_body *body = NULL;
424 struct ll_fid rpfid = *pfid;
430 /* IT_LOOKUP is intended to produce a name -> fid resolution
431 * (called "lookup" below) or to confirm that a requested
432 * resolution is still valid (called "revalidation" below).
433 * cfid != NULL specifies revalidation */
436 /* this is revalidation. during revalidation it's
437 * enough to return 1 if we think the attrs are up to date;
438 * it may return updated attrs, though */
440 rc = md_intent_lock(lmv->tgts[mds].exp, uctxt, pfid, name,
441 len, lmm, lmmsize, cfid, it, flags,
443 CDEBUG(D_OTHER, "revalidate lookup for %lu/%lu/%lu = %d\n",
444 (unsigned long) cfid->mds,
445 (unsigned long) cfid->id,
446 (unsigned long) cfid->generation, rc);
452 /* this is lookup. during lookup we have to update all the
453 * attributes, because returned values will be put in struct
456 obj = lmv_grab_obj(obd, pfid, 0);
458 /* directory is already splitted. calculate mds */
459 mds = raw_name2idx(obj->objcount, (char *) name, len);
460 rpfid = obj->objs[mds].fid;
464 rc = md_intent_lock(lmv->tgts[mds].exp, uctxt, &rpfid, name,
465 len, lmm, lmmsize, NULL, it, flags, reqp,
468 /* very interesting. it seems the object is still valid
469 * but for some reason llite calls lookup, not revalidate */
470 CWARN("lookup for %lu/%lu/%lu and data should be uptodate\n",
471 (unsigned long) rpfid.mds,
472 (unsigned long) rpfid.id,
473 (unsigned long) rpfid.generation);
474 LASSERT(*reqp == NULL);
478 if (rc == 0 && *reqp == NULL) {
479 /* once again, we're asked for lookup, not revalidate */
480 CWARN("lookup for %lu/%lu/%lu and data should be uptodate\n",
481 (unsigned long) rpfid.mds,
482 (unsigned long) rpfid.id,
483 (unsigned long) rpfid.generation);
488 /* the directory got split since the last update. this shouldn't
489 * happen because splitting causes lock revocation, so revalidate
490 * had to fail and lookup on the dir had to return a mea */
491 CWARN("we haven't knew about directory splitting!\n");
492 LASSERT(obj == NULL);
493 rc = lmv_create_obj_from_attrs(exp, &rpfid, NULL);
502 /* the MDS has returned success; the name may have been
503 * resolved to a remote inode */
504 rc = lmv_handle_remote_inode(exp, uctxt, lmm, lmmsize, it, flags,
507 if (rc == 0 && (mea = is_body_of_splitted_dir(*reqp, 1))) {
508 /* the reply describes a split dir: set up its lmv object */
509 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
510 LASSERT(body != NULL);
511 obj = lmv_grab_obj(obd, &body->fid1, 0);
513 rc = lmv_create_obj_from_attrs(exp, &body->fid1, mea);
/*
 * lmv_intent_lock - entry point dispatching an intent by operation.
 *
 * After logging the request, the intent is forwarded with the same
 * arguments according to it->it_op:
 *   it_op == IT_LOOKUP   -> lmv_intent_lookup()
 *   it_op &  IT_OPEN     -> lmv_intent_open()   (bit test, not equality)
 *   it_op == IT_GETATTR  -> lmv_intent_getattr()
 *
 * NOTE(review): the declaration of rc, the last argument of each call,
 * any fallback for other operations and the return statement are not
 * visible in this listing -- confirm against the complete source.
 */
520 int lmv_intent_lock(struct obd_export *exp, struct ll_uctxt *uctxt,
521 struct ll_fid *pfid, const char *name, int len,
522 void *lmm, int lmmsize, struct ll_fid *cfid,
523 struct lookup_intent *it, int flags,
524 struct ptlrpc_request **reqp,
525 ldlm_blocking_callback cb_blocking)
527 struct obd_device *obd = exp->exp_obd;
534 CDEBUG(D_OTHER, "INTENT LOCK '%s' for '%*s' on %lu/%lu -> %u\n",
535 LL_IT2STR(it), len, name, (unsigned long) pfid->id,
536 (unsigned long) pfid->generation, pfid->mds);
539 if (it->it_op == IT_LOOKUP)
540 rc = lmv_intent_lookup(exp, uctxt, pfid, name, len, lmm,
541 lmmsize, cfid, it, flags, reqp,
543 else if (it->it_op & IT_OPEN)
544 rc = lmv_intent_open(exp, uctxt, pfid, name, len, lmm,
545 lmmsize, cfid, it, flags, reqp,
547 else if (it->it_op == IT_GETATTR)
548 rc = lmv_intent_getattr(exp, uctxt, pfid, name, len, lmm,
549 lmmsize, cfid, it, flags, reqp,
556 int lmv_revalidate_slaves(struct obd_export *exp, struct ptlrpc_request **reqp,
557 struct ll_fid *mfid, struct lookup_intent *oit,
558 int master_valid, ldlm_blocking_callback cb_blocking)
560 struct obd_device *obd = exp->exp_obd;
561 struct ptlrpc_request *mreq = *reqp;
562 struct lmv_obd *lmv = &obd->u.lmv;
563 struct lustre_handle master_lockh;
564 unsigned long size = 0;
565 struct ldlm_lock *lock;
566 struct mds_body *body;
567 struct ll_uctxt uctxt;
569 int master_lock_mode;
573 /* we have to loop over the subobjects, check validity and update
574 * them from MDSs if needed. it's very useful that we need not to
575 * update all the fields. say, common fields (that are equal on
576 * all the subojects need not to be update, another fields (i_size,
577 * for example) are cached all the time */
578 obj = lmv_grab_obj(obd, mfid, 0);
581 master_lock_mode = 0;
584 for (i = 0; i < obj->objcount; i++) {
585 struct ll_fid fid = obj->objs[i].fid;
586 struct lustre_handle *lockh = NULL;
587 struct ptlrpc_request *req = NULL;
588 ldlm_blocking_callback cb;
589 struct lookup_intent it;
592 CDEBUG(D_OTHER, "revalidate subobj %lu/%lu/%lu\n",
593 (unsigned long) fid.mds,
594 (unsigned long) fid.id,
595 (unsigned long) fid.generation);
597 memset(&it, 0, sizeof(it));
598 it.it_op = IT_GETATTR;
599 cb = lmv_dirobj_blocking_ast;
601 if (fid_equal(&fid, &obj->fid)) {
603 /* lmv_intent_getattr() already checked
604 * validness and took the lock */
606 /* it even got the reply
607 * refresh attrs from that reply */
608 body = lustre_msg_buf(mreq->rq_repmsg,
610 LASSERT(body != NULL);
613 /* take already cached attrs into account */
615 "master is locked and cached\n");
623 rc = md_intent_lock(lmv->tgts[fid.mds].exp, &uctxt, &fid,
624 NULL, 0, NULL, 0, &fid, &it, 0, &req, cb);
625 lockh = (struct lustre_handle *) &it.d.lustre.it_lock_handle;
627 /* nice, this slave is valid */
628 LASSERT(req == NULL);
629 CDEBUG(D_OTHER, "cached\n");
634 /* error during revalidation */
638 /* rc == 0, this means we have no such a lock and can't
639 * think obj is still valid. lookup it again */
640 LASSERT(req == NULL);
642 memset(&it, 0, sizeof(it));
643 it.it_op = IT_GETATTR;
644 rc = md_intent_lock(lmv->tgts[fid.mds].exp, &uctxt, &fid,
645 NULL, 0, NULL, 0, NULL, &it, 0, &req, cb);
646 lockh = (struct lustre_handle *) &it.d.lustre.it_lock_handle;
649 /* error during lookup */
654 LASSERT(master_valid == 0);
655 /* save lock on master to be returned to the caller */
656 CDEBUG(D_OTHER, "no lock on master yet\n");
657 memcpy(&master_lockh, lockh, sizeof(master_lockh));
658 master_lock_mode = it.d.lustre.it_lock_mode;
659 it.d.lustre.it_lock_mode = 0;
661 /* this is slave. we want to control it */
662 lock = ldlm_handle2lock(lockh);
664 lock->l_ast_data = obj;
665 atomic_inc(&obj->count);
670 /* this is first reply, we'll use it to return
671 * updated data back to the caller */
673 ptlrpc_request_addref(req);
678 body = lustre_msg_buf(req->rq_repmsg, 1, sizeof(*body));
682 obj->objs[i].size = body->size;
683 CDEBUG(D_OTHER, "fresh: %lu\n",
684 (unsigned long) obj->objs[i].size);
687 ptlrpc_req_finished(req);
689 size += obj->objs[i].size;
690 if (it.d.lustre.it_lock_mode)
691 ldlm_lock_decref(lockh, it.d.lustre.it_lock_mode);
695 /* some attrs got refreshed, we have reply and it's time
696 * to put fresh attrs to it */
697 CDEBUG(D_OTHER, "return refreshed attrs: size = %lu\n",
698 (unsigned long) size);
699 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
701 /* FIXME: what about another attributes? */
704 /* very important to maintain lli->mds the same
705 * because of revalidation. mreq == NULL means
706 * that caller has no reply and the only attr
707 * we can return is size */
708 body->valid = OBD_MD_FLSIZE;
709 body->mds = obj->fid.mds;
711 if (master_valid == 0) {
712 memcpy(&oit->d.lustre.it_lock_handle,
713 &master_lockh, sizeof(master_lockh));
714 oit->d.lustre.it_lock_mode = master_lock_mode;
718 /* it seems all the attrs are fresh and we did no request */
719 CDEBUG(D_OTHER, "all the attrs were fresh\n");
720 if (master_valid == 0)
721 oit->d.lustre.it_lock_mode = master_lock_mode;