1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
6 * This file is part of Lustre, http://www.lustre.org.
8 * Lustre is free software; you can redistribute it and/or
9 * modify it under the terms of version 2 of the GNU General Public
10 * License as published by the Free Software Foundation.
12 * Lustre is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Lustre; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 # define EXPORT_SYMTAB
25 #define DEBUG_SUBSYSTEM S_LMV
27 #include <linux/slab.h>
28 #include <linux/module.h>
29 #include <linux/init.h>
30 #include <linux/slab.h>
31 #include <linux/pagemap.h>
32 #include <asm/div64.h>
33 #include <linux/seq_file.h>
35 #include <liblustre.h>
38 #include <linux/obd_support.h>
39 #include <linux/lustre_lib.h>
40 #include <linux/lustre_net.h>
41 #include <linux/lustre_idl.h>
42 #include <linux/lustre_dlm.h>
43 #include <linux/lustre_mds.h>
44 #include <linux/obd_class.h>
45 #include <linux/obd_ost.h>
46 #include <linux/lprocfs_status.h>
47 #include <linux/lustre_fsfilt.h>
48 #include <linux/obd_lmv.h>
49 #include "lmv_internal.h"
/* Release the lock reference recorded in an intent, if there is one.
 *
 * When it->d.lustre.it_lock_mode is non-zero, ldlm_lock_decref() is
 * called on it->d.lustre.it_lock_handle with that mode.  The visible
 * code does not reset it_lock_mode or the handle afterwards. */
52 static inline void lmv_drop_intent_lock(struct lookup_intent *it)
54 if (it->d.lustre.it_lock_mode != 0)
55 ldlm_lock_decref((void *)&it->d.lustre.it_lock_handle,
56 it->d.lustre.it_lock_mode);
/* Follow up an intent reply that refers to an inode living on another MDS.
 *
 * The mds_body is read from buffer 1 of the reply in *reqp.  When
 * body->valid has OBD_MD_MDS set, a second md_intent_lock() is sent to
 * lmv->tgts[nfid.mds] to obtain the real attributes.  For that second
 * request an IT_LOOKUP intent is switched to IT_GETATTR, the lock handle
 * and mode already stored in the intent are saved in plock/pmode and the
 * intent's lock mode is cleared, and DISP_ENQ_COMPLETE is removed from
 * the disposition.  Afterwards the code can drop the lock obtained by the
 * second request and put the saved handle/mode back into the intent, or
 * drop the saved lock with ldlm_lock_decref(); the reply in *reqp is
 * released with ptlrpc_req_finished().
 *
 * NOTE(review): several lines of this function are not visible in this
 * listing (braces, the declarations of rc, nfid and pmode, how nfid is
 * filled in, the conditions guarding the lock handling and the return
 * statements) -- confirm the details against the complete source. */
59 int lmv_handle_remote_inode(struct obd_export *exp, struct ll_uctxt *uctxt,
60 void *lmm, int lmmsize,
61 struct lookup_intent *it, int flags,
62 struct ptlrpc_request **reqp,
63 ldlm_blocking_callback cb_blocking)
65 struct obd_device *obd = exp->exp_obd;
66 struct lmv_obd *lmv = &obd->u.lmv;
67 struct mds_body *body = NULL;
71 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
72 LASSERT(body != NULL);
74 if (body->valid & OBD_MD_MDS) {
75 /* oh, MDS reports that this is remote inode case
76 * i.e. we have to ask for real attrs on another MDS */
77 struct ptlrpc_request *req;
79 struct lustre_handle plock;
82 if (it->it_op == IT_LOOKUP) {
83 /* unfortunately, we have to lie to MDC/MDS to
84 * retrieve attributes llite needs */
85 it->it_op = IT_GETATTR;
88 /* we got LOOKUP lock, but we really need attrs */
89 pmode = it->d.lustre.it_lock_mode;
91 memcpy(&plock, &it->d.lustre.it_lock_handle,
93 it->d.lustre.it_lock_mode = 0;
97 it->d.lustre.it_disposition &= ~DISP_ENQ_COMPLETE;
98 rc = md_intent_lock(lmv->tgts[nfid.mds].ltd_exp, uctxt, &nfid,
99 NULL, 0, lmm, lmmsize, NULL, it, flags,
102 /* llite needs LOOKUP lock to track dentry revocation in
103 * order to maintain dcache consistency. thus drop UPDATE
104 * lock here and put LOOKUP in request */
106 lmv_drop_intent_lock(it);
107 memcpy(&it->d.lustre.it_lock_handle, &plock,
109 it->d.lustre.it_lock_mode = pmode;
112 ldlm_lock_decref(&plock, pmode);
114 ptlrpc_req_finished(*reqp);
/* Handle an IT_OPEN intent for name under the parent directory pfid.
 *
 * If lmv_grab_obj() knows an lmv object for the parent, the directory is
 * already split: the target MDS index is computed from the name with
 * raw_name2idx(obj->objcount, name, len) and the corresponding slave fid
 * replaces the parent fid in the request.  A result of -ERESTART from
 * md_intent_lock() means the directory was split in the meantime; the
 * local object is then refreshed with lmv_get_mea_and_update_object()
 * and the reply is released so the request can be repeated.
 *
 * After a successful reply lmv_handle_remote_inode() takes care of a
 * remote inode.  If the child cfid is a split directory -- an lmv object
 * already exists, or one is created from the mea returned by
 * body_of_splitted_dir() -- lmv_revalidate_slaves() is called to obtain
 * attributes from the slaves.
 *
 * NOTE(review): lines are missing from this listing (braces, the
 * declarations of rc, mds, obj and mea, the initial choice of mds, the
 * retry control flow, trailing call arguments and the return
 * statements) -- confirm the details against the complete source. */
120 int lmv_intent_open(struct obd_export *exp, struct ll_uctxt *uctxt,
121 struct ll_fid *pfid, const char *name, int len,
122 void *lmm, int lmmsize, struct ll_fid *cfid,
123 struct lookup_intent *it, int flags,
124 struct ptlrpc_request **reqp,
125 ldlm_blocking_callback cb_blocking)
127 struct obd_device *obd = exp->exp_obd;
128 struct lmv_obd *lmv = &obd->u.lmv;
129 struct mds_body *body = NULL;
130 struct ll_fid rpfid = *pfid;
136 /* IT_OPEN is intended to open (and create, possible) an object. Parent
137 * (pfid) may be splitted dir */
141 obj = lmv_grab_obj(obd, &rpfid);
143 /* directory is already splitted, so we have to forward
144 * request to the right MDS */
145 mds = raw_name2idx(obj->objcount, (char *)name, len);
146 CDEBUG(D_OTHER, "forward to MDS #%u\n", mds);
147 rpfid = obj->objs[mds].fid;
151 rc = md_intent_lock(lmv->tgts[mds].ltd_exp, uctxt, &rpfid, name,
152 len, lmm, lmmsize, cfid, it, flags, reqp,
154 if (rc == -ERESTART) {
155 /* directory got splitted. time to update local object
156 * and repeat the request with proper MDS */
157 LASSERT(fid_equal(pfid, &rpfid));
158 rc = lmv_get_mea_and_update_object(exp, &rpfid);
160 ptlrpc_req_finished(*reqp);
167 /* okay, MDS has returned success. Probably name has been resolved in
169 rc = lmv_handle_remote_inode(exp, uctxt, lmm, lmmsize, it,
170 flags, reqp, cb_blocking);
176 /* caller may use attrs MDS returns on IT_OPEN lock request so, we have
177 * to update them for splitted dir */
178 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
179 LASSERT(body != NULL);
182 obj = lmv_grab_obj(obd, cfid);
183 if (!obj && (mea = body_of_splitted_dir(*reqp, 1))) {
184 /* wow! this is splitted dir, we'd like to handle it */
185 obj = lmv_create_obj(exp, &body->fid1, mea);
187 RETURN(PTR_ERR(obj));
191 /* this is splitted dir and we'd want to get attrs */
192 CDEBUG(D_OTHER, "attrs from slaves for %lu/%lu/%lu\n",
193 (unsigned long)cfid->mds, (unsigned long)cfid->id,
194 (unsigned long)cfid->generation);
195 rc = lmv_revalidate_slaves(exp, reqp, cfid,
197 } else if (S_ISDIR(body->mode)) {
198 /*CWARN("hmmm, %lu/%lu/%lu has not lmv obj?!\n",
199 (unsigned long) cfid->mds,
200 (unsigned long) cfid->id,
201 (unsigned long) cfid->generation);*/
/* Handle IT_GETATTR intents (lmv_intent_lock() also routes IT_CHDIR here).
 *
 * Two paths are visible:
 *
 *  - Attribute revalidation of cfid: the lmv object for cfid is looked
 *    up, md_intent_lock() is issued with cfid, and if an object exists
 *    and rc >= 0 the slaves are revalidated through
 *    lmv_revalidate_slaves(), with rc passed as its master_valid
 *    argument.
 *
 *  - Getattr by name under pfid: if the parent is a split directory the
 *    slave is chosen with raw_name2idx() on the name; md_intent_lock()
 *    is issued without a child fid, lmv_handle_remote_inode() handles a
 *    remote inode, and if the result is a split directory (existing lmv
 *    object for cfid, or one created from the mea in the reply) the
 *    slaves are revalidated with master_valid = 1.
 *
 * NOTE(review): the condition selecting between the two paths is not
 * visible in this listing (presumably cfid being set -- confirm), nor
 * are braces, the declarations of rc, mds and mea, trailing call
 * arguments and the return statements.
 * NOTE(review): the "%*s" in the CDEBUG below sets only a minimum field
 * width; if name may lack a terminating NUL, "%.*s" would be required to
 * limit the output to len bytes -- confirm how callers pass name. */
210 int lmv_intent_getattr(struct obd_export *exp, struct ll_uctxt *uctxt,
211 struct ll_fid *pfid, const char *name, int len,
212 void *lmm, int lmmsize, struct ll_fid *cfid,
213 struct lookup_intent *it, int flags,
214 struct ptlrpc_request **reqp,
215 ldlm_blocking_callback cb_blocking)
217 struct obd_device *obd = exp->exp_obd;
218 struct lmv_obd *lmv = &obd->u.lmv;
219 struct mds_body *body = NULL;
220 struct ll_fid rpfid = *pfid;
221 struct lmv_obj *obj, *obj2;
227 /* caller wants to revalidate attrs of obj we have to revalidate
228 * slaves if requested object is splitted directory */
229 CDEBUG(D_OTHER, "revalidate attrs for %lu/%lu/%lu\n",
230 (unsigned long)cfid->mds, (unsigned long)cfid->id,
231 (unsigned long)cfid->generation);
233 obj = lmv_grab_obj(obd, cfid);
235 /* in fact, we need not this with current intent_lock(),
236 * but it may change some day */
237 rpfid = obj->objs[mds].fid;
240 rc = md_intent_lock(lmv->tgts[mds].ltd_exp, uctxt, &rpfid, name,
241 len, lmm, lmmsize, cfid, it, flags, reqp,
243 if (obj && rc >= 0) {
244 /* this is splitted dir. In order to optimize things a
245 * bit, we consider obj valid updating missing parts.
247 * FIXME: do we need to return any lock here? It would
248 * be fine if we don't. this means that nobody should
249 * use UPDATE lock to notify about object removal */
251 "revalidate slaves for %lu/%lu/%lu, rc %d\n",
252 (unsigned long)cfid->mds, (unsigned long)cfid->id,
253 (unsigned long)cfid->generation, rc);
255 rc = lmv_revalidate_slaves(exp, reqp, cfid, it, rc,
262 CDEBUG(D_OTHER, "INTENT getattr for %*s on %lu/%lu/%lu\n",
263 len, name, (unsigned long)pfid->mds, (unsigned long)pfid->id,
264 (unsigned long)pfid->generation);
267 obj = lmv_grab_obj(obd, pfid);
269 /* directory is already splitted. calculate mds */
270 mds = raw_name2idx(obj->objcount, (char *) name, len);
271 rpfid = obj->objs[mds].fid;
274 CDEBUG(D_OTHER, "forward to MDS #%u (slave %lu/%lu/%lu)\n",
275 mds, (unsigned long)rpfid.mds, (unsigned long)rpfid.id,
276 (unsigned long)rpfid.generation);
279 rc = md_intent_lock(lmv->tgts[mds].ltd_exp, uctxt, &rpfid, name,
280 len, lmm, lmmsize, NULL, it, flags, reqp,
288 /* okay, MDS has returned success. probably name has been
289 * resolved in remote inode */
290 rc = lmv_handle_remote_inode(exp, uctxt, lmm, lmmsize, it,
291 flags, reqp, cb_blocking);
295 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
296 LASSERT(body != NULL);
299 obj2 = lmv_grab_obj(obd, cfid);
301 if (!obj2 && (mea = body_of_splitted_dir(*reqp, 1))) {
302 /* wow! this is splitted dir, we'd like to handle it. */
303 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
304 LASSERT(body != NULL);
306 obj2 = lmv_create_obj(exp, &body->fid1, mea);
308 RETURN(PTR_ERR(obj2));
312 /* this is splitted dir and we'd want to get attrs */
313 CDEBUG(D_OTHER, "attrs from slaves for %lu/%lu/%lu, rc %d\n",
314 (unsigned long)cfid->mds, (unsigned long)cfid->id,
315 (unsigned long)cfid->generation, rc);
317 rc = lmv_revalidate_slaves(exp, reqp, cfid, it, 1, cb_blocking);
/* Fold one sub-object's cached attributes into a reply body.
 *
 * Only the size is accumulated: obj->size is added to body->size.  The
 * assignments of atime/ctime/mtime/nlink are present but commented out,
 * so those body fields are left as they are. */
323 void lmv_update_body_from_obj(struct mds_body *body, struct lmv_inode *obj)
326 body->size += obj->size;
327 /* body->atime = obj->atime;
328 body->ctime = obj->ctime;
329 body->mtime = obj->mtime;
330 body->nlink = obj->nlink;*/
/* Refresh the slave objects of a split directory after a lookup on the
 * master and merge their sizes into the master's reply.
 *
 * The master mds_body is taken from buffer 1 of the reply in *reqp and
 * the lmv object for body->fid1 must already exist (asserted).  For each
 * entry of obj->objs whose fid differs from obj->fid (the master is
 * skipped) an IT_GETATTR intent is sent to lmv->tgts[fid.mds]: first
 * with the fid passed as child (revalidation of a cached lock), and,
 * according to the comments below, once more without a child fid when no
 * such lock is cached.  When a fresh reply is obtained the object is
 * attached to the lock as l_ast_data (via lmv_get_obj()), the size from
 * the reply is stored in obj->objs[i].size and the request is released.
 * The slave's size is then added to the master body through
 * lmv_update_body_from_obj() and the lock reference held in the local
 * intent is dropped.
 *
 * NOTE(review): lines are missing from this listing (braces, the
 * declarations of rc, i and obj, initialisation of uctxt, the rc checks
 * that separate the cached/error/refetch cases, cleanup and the return
 * statements) -- confirm the details against the complete source. */
333 int lmv_lookup_slaves(struct obd_export *exp, struct ptlrpc_request **reqp)
335 struct obd_device *obd = exp->exp_obd;
336 struct lmv_obd *lmv = &obd->u.lmv;
337 struct mds_body *body = NULL;
338 struct lustre_handle *lockh;
339 struct ldlm_lock *lock;
340 struct mds_body *body2;
341 struct ll_uctxt uctxt;
349 /* master is locked. we'd like to take locks on slaves and update
350 * attributes to be returned from the slaves it's important that lookup
351 * is called in two cases:
353 * - for first time (dcache has no such a resolving yet).
354 * - ->d_revalidate() returned false.
356 * last case possible only if all the objs (master and all slaves aren't
359 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
360 LASSERT(body != NULL);
362 obj = lmv_grab_obj(obd, &body->fid1);
363 LASSERT(obj != NULL);
365 CDEBUG(D_OTHER, "lookup slaves for %lu/%lu/%lu\n",
366 (unsigned long)body->fid1.mds,
367 (unsigned long)body->fid1.id,
368 (unsigned long)body->fid1.generation);
375 for (i = 0; i < obj->objcount; i++) {
376 struct ll_fid fid = obj->objs[i].fid;
377 struct ptlrpc_request *req = NULL;
378 struct lookup_intent it;
380 if (fid_equal(&fid, &obj->fid))
381 /* skip master obj */
384 CDEBUG(D_OTHER, "lookup slave %lu/%lu/%lu\n",
385 (unsigned long)fid.mds, (unsigned long)fid.id,
386 (unsigned long)fid.generation);
389 memset(&it, 0, sizeof(it));
390 it.it_op = IT_GETATTR;
391 rc = md_intent_lock(lmv->tgts[fid.mds].ltd_exp, &uctxt, &fid,
392 NULL, 0, NULL, 0, &fid, &it, 0, &req,
393 lmv_dirobj_blocking_ast);
395 lockh = (struct lustre_handle *)&it.d.lustre.it_lock_handle;
397 /* nice, this slave is valid */
398 LASSERT(req == NULL);
399 CDEBUG(D_OTHER, "cached\n");
404 /* error during revalidation */
407 /* rc == 0, this means we have no such a lock and can't think
408 * obj is still valid. lookup it again */
409 LASSERT(req == NULL);
412 memset(&it, 0, sizeof(it));
413 it.it_op = IT_GETATTR;
414 rc = md_intent_lock(lmv->tgts[fid.mds].ltd_exp, &uctxt, &fid,
415 NULL, 0, NULL, 0, NULL, &it, 0, &req,
416 lmv_dirobj_blocking_ast);
418 lockh = (struct lustre_handle *) &it.d.lustre.it_lock_handle;
422 /* error during lookup */
425 lock = ldlm_handle2lock(lockh);
428 lock->l_ast_data = lmv_get_obj(obj);
430 body2 = lustre_msg_buf(req->rq_repmsg, 1, sizeof(*body2));
433 obj->objs[i].size = body2->size;
435 CDEBUG(D_OTHER, "fresh: %lu\n",
436 (unsigned long)obj->objs[i].size);
441 ptlrpc_req_finished(req);
443 lmv_update_body_from_obj(body, obj->objs + i);
445 if (it.d.lustre.it_lock_mode)
446 ldlm_lock_decref(lockh, it.d.lustre.it_lock_mode);
/* Handle an IT_LOOKUP intent: either resolve name under pfid to a fid
 * (lookup) or confirm that an earlier resolution is still valid
 * (revalidation; per the comment below, selected by cfid != NULL).
 *
 * In both cases, if the parent is a split directory (an lmv object
 * exists for pfid), the MDS is chosen by hashing the name with
 * raw_name2idx().  On the lookup path -ERESTART means the directory was
 * split without this client knowing; an lmv object is then created with
 * lmv_create_obj().  After a successful lookup lmv_handle_remote_inode()
 * deals with a remote inode, and if the reply describes a split
 * directory (body_of_splitted_dir()) an lmv object for body->fid1 is
 * looked up or created.
 *
 * NOTE(review): on the revalidation path rpfid is set to the slave fid,
 * yet the visible md_intent_lock() call passes pfid rather than &rpfid
 * -- confirm this is intended.
 * NOTE(review): lines are missing from this listing (braces, the
 * declarations of rc, mds, obj and mea, branch conditions, trailing call
 * arguments and the return statements) -- confirm the details against
 * the complete source. */
454 int lmv_intent_lookup(struct obd_export *exp, struct ll_uctxt *uctxt,
455 struct ll_fid *pfid, const char *name, int len,
456 void *lmm, int lmmsize, struct ll_fid *cfid,
457 struct lookup_intent *it, int flags,
458 struct ptlrpc_request **reqp,
459 ldlm_blocking_callback cb_blocking)
461 struct obd_device *obd = exp->exp_obd;
462 struct lmv_obd *lmv = &obd->u.lmv;
463 struct mds_body *body = NULL;
464 struct ll_fid rpfid = *pfid;
470 /* IT_LOOKUP is intended to produce name -> fid resolving (let's call
471 * this lookup below) or to confirm requested resolving is still valid
472 * (let's call this revalidation) cfid != NULL specifies revalidation */
475 /* this is revalidation: we have to check is LOOKUP lock still
476 * valid for given fid. very important part is that we have to
477 * choose right mds because namespace is per mds */
479 obj = lmv_grab_obj(obd, pfid);
481 mds = raw_name2idx(obj->objcount, (char *) name, len);
482 rpfid = obj->objs[mds].fid;
487 CDEBUG(D_OTHER, "revalidate lookup for %lu/%lu/%lu to %d MDS\n",
488 (unsigned long)cfid->mds, (unsigned long)cfid->id,
489 (unsigned long)cfid->generation, mds);
491 rc = md_intent_lock(lmv->tgts[mds].ltd_exp, uctxt, pfid, name,
492 len, lmm, lmmsize, cfid, it, flags,
499 /* this is lookup. during lookup we have to update all the attributes,
500 * because returned values will be put in struct inode */
502 obj = lmv_grab_obj(obd, pfid);
505 /* directory is already splitted. calculate mds */
506 mds = raw_name2idx(obj->objcount, (char *)name, len);
507 rpfid = obj->objs[mds].fid;
512 rc = md_intent_lock(lmv->tgts[mds].ltd_exp, uctxt, &rpfid, name,
513 len, lmm, lmmsize, NULL, it, flags, reqp,
516 /* very interesting. it seems object is still valid but for some
517 * reason llite calls lookup, not revalidate */
518 CWARN("lookup for %lu/%lu/%lu and data should be uptodate\n",
519 (unsigned long)rpfid.mds, (unsigned long)rpfid.id,
520 (unsigned long)rpfid.generation);
521 LASSERT(*reqp == NULL);
525 if (rc == 0 && *reqp == NULL) {
526 /* once again, we're asked for lookup, not revalidate */
527 CWARN("lookup for %lu/%lu/%lu and data should be uptodate\n",
528 (unsigned long)rpfid.mds, (unsigned long)rpfid.id,
529 (unsigned long)rpfid.generation);
533 if (rc == -ERESTART) {
534 /* directory got splitted since last update. this shouldn't be
535 * because splitting causes lock revocation, so revalidate had
536 * to fail and lookup on dir had to return mea */
537 CWARN("we haven't knew about directory splitting!\n");
538 LASSERT(obj == NULL);
540 obj = lmv_create_obj(exp, &rpfid, NULL);
542 RETURN(PTR_ERR(obj));
550 /* okay, MDS has returned success. probably name has been resolved in
552 rc = lmv_handle_remote_inode(exp, uctxt, lmm, lmmsize, it, flags,
555 if (rc == 0 && (mea = body_of_splitted_dir(*reqp, 1))) {
556 /* wow! this is splitted dir, we'd like to handle it */
557 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
558 LASSERT(body != NULL);
560 obj = lmv_grab_obj(obd, &body->fid1);
562 obj = lmv_create_obj(exp, &body->fid1, mea);
564 RETURN(PTR_ERR(obj));
/* Entry point for intent locks on the LMV layer: dispatch on it->it_op.
 *
 * After lmv_check_connect(obd) the intent is routed as follows:
 *   - it_op == IT_LOOKUP                -> lmv_intent_lookup()
 *   - it_op has IT_OPEN set             -> lmv_intent_open()
 *   - it_op == IT_GETATTR or IT_CHDIR   -> lmv_intent_getattr()
 * All arguments are forwarded unchanged to the selected handler.
 *
 * NOTE(review): lines are missing from this listing (braces, the
 * declaration of rc, the check of lmv_check_connect()'s result, the
 * trailing call arguments, any final else branch and the return) --
 * confirm the details against the complete source.
 * NOTE(review): the "%*s" in the CDEBUG below sets only a minimum field
 * width; if name may lack a terminating NUL, "%.*s" would be required to
 * limit the output to len bytes -- confirm how callers pass name. */
572 int lmv_intent_lock(struct obd_export *exp, struct ll_uctxt *uctxt,
573 struct ll_fid *pfid, const char *name, int len,
574 void *lmm, int lmmsize, struct ll_fid *cfid,
575 struct lookup_intent *it, int flags,
576 struct ptlrpc_request **reqp,
577 ldlm_blocking_callback cb_blocking)
579 struct obd_device *obd = exp->exp_obd;
586 CDEBUG(D_OTHER, "INTENT LOCK '%s' for '%*s' on %lu/%lu -> %u\n",
587 LL_IT2STR(it), len, name, (unsigned long) pfid->id,
588 (unsigned long) pfid->generation, pfid->mds);
590 rc = lmv_check_connect(obd);
594 if (it->it_op == IT_LOOKUP)
595 rc = lmv_intent_lookup(exp, uctxt, pfid, name, len, lmm,
596 lmmsize, cfid, it, flags, reqp,
598 else if (it->it_op & IT_OPEN)
599 rc = lmv_intent_open(exp, uctxt, pfid, name, len, lmm,
600 lmmsize, cfid, it, flags, reqp,
602 else if (it->it_op == IT_GETATTR || it->it_op == IT_CHDIR)
603 rc = lmv_intent_getattr(exp, uctxt, pfid, name, len, lmm,
604 lmmsize, cfid, it, flags, reqp,
/* Revalidate all sub-objects of the split directory mfid and return the
 * accumulated size to the caller.
 *
 * The lmv object for mfid must exist (asserted).  For every entry of
 * obj->objs an IT_GETATTR intent is prepared with
 * lmv_dirobj_blocking_ast as blocking callback.  For the master entry
 * (fid equal to obj->fid) the comments indicate that, when the caller
 * already validated the master, attributes are taken from the caller's
 * reply (mreq) instead of asking again.  Otherwise md_intent_lock() is
 * issued with the fid as child to revalidate a cached lock and, when no
 * lock is cached, again without a child fid to fetch fresh attributes.
 * A lock obtained on the master is saved in master_lockh /
 * master_lock_mode (this requires master_valid == 0); a lock on a slave
 * gets the lmv object attached as l_ast_data.  Fresh sizes are stored in
 * obj->objs[i].size and every entry's size is summed into size.
 *
 * At the end the body in *reqp is updated with the refreshed data, and
 * when master_valid == 0 the saved master lock handle and mode are
 * handed back through oit.
 *
 * NOTE(review): many lines are missing from this listing (braces, the
 * declarations of rc, i and obj, initialisation of uctxt, the rc checks
 * and branch conditions, the assignment of the summed size to the body,
 * cleanup and the return; the end of the function is not visible) --
 * confirm the details against the complete source. */
611 int lmv_revalidate_slaves(struct obd_export *exp, struct ptlrpc_request **reqp,
612 struct ll_fid *mfid, struct lookup_intent *oit,
613 int master_valid, ldlm_blocking_callback cb_blocking)
615 struct obd_device *obd = exp->exp_obd;
616 struct ptlrpc_request *mreq = *reqp;
617 struct lmv_obd *lmv = &obd->u.lmv;
618 struct lustre_handle master_lockh;
619 struct ldlm_lock *lock;
620 unsigned long size = 0;
621 struct mds_body *body;
622 struct ll_uctxt uctxt;
624 int master_lock_mode;
628 /* we have to loop over the subobjects, check validity and update them
629 * from MDSs if needed. it's very useful that we need not to update all
630 * the fields. say, common fields (that are equal on all the subobjects
631 * need not to be update, another fields (i_size, for example) are
632 * cached all the time */
633 obj = lmv_grab_obj(obd, mfid);
634 LASSERT(obj != NULL);
638 master_lock_mode = 0;
642 for (i = 0; i < obj->objcount; i++) {
643 struct ll_fid fid = obj->objs[i].fid;
644 struct lustre_handle *lockh = NULL;
645 struct ptlrpc_request *req = NULL;
646 ldlm_blocking_callback cb;
647 struct lookup_intent it;
650 CDEBUG(D_OTHER, "revalidate subobj %lu/%lu/%lu\n",
651 (unsigned long)fid.mds, (unsigned long)fid.id,
652 (unsigned long) fid.generation);
654 memset(&it, 0, sizeof(it));
655 it.it_op = IT_GETATTR;
656 cb = lmv_dirobj_blocking_ast;
658 if (fid_equal(&fid, &obj->fid)) {
660 /* lmv_intent_getattr() already checked
661 * validness and took the lock */
663 /* it even got the reply refresh attrs
665 body = lustre_msg_buf(mreq->rq_repmsg,
667 LASSERT(body != NULL);
670 /* take already cached attrs into account */
672 "master is locked and cached\n");
680 rc = md_intent_lock(lmv->tgts[fid.mds].ltd_exp, &uctxt, &fid,
681 NULL, 0, NULL, 0, &fid, &it, 0, &req, cb);
682 lockh = (struct lustre_handle *) &it.d.lustre.it_lock_handle;
684 /* nice, this slave is valid */
685 LASSERT(req == NULL);
686 CDEBUG(D_OTHER, "cached\n");
691 /* error during revalidation */
694 /* rc == 0, this means we have no such a lock and can't think
695 * obj is still valid. lookup it again */
696 LASSERT(req == NULL);
699 memset(&it, 0, sizeof(it));
700 it.it_op = IT_GETATTR;
701 rc = md_intent_lock(lmv->tgts[fid.mds].ltd_exp, &uctxt, &fid,
702 NULL, 0, NULL, 0, NULL, &it, 0, &req, cb);
703 lockh = (struct lustre_handle *) &it.d.lustre.it_lock_handle;
707 /* error during lookup */
711 LASSERT(master_valid == 0);
712 /* save lock on master to be returned to the caller */
713 CDEBUG(D_OTHER, "no lock on master yet\n");
714 memcpy(&master_lockh, lockh, sizeof(master_lockh));
715 master_lock_mode = it.d.lustre.it_lock_mode;
716 it.d.lustre.it_lock_mode = 0;
718 /* this is slave. we want to control it */
719 lock = ldlm_handle2lock(lockh);
721 lock->l_ast_data = lmv_get_obj(obj);
726 /* this is first reply, we'll use it to return
727 * updated data back to the caller */
729 ptlrpc_request_addref(req);
734 body = lustre_msg_buf(req->rq_repmsg, 1, sizeof(*body));
738 obj->objs[i].size = body->size;
740 CDEBUG(D_OTHER, "fresh: %lu\n",
741 (unsigned long)obj->objs[i].size);
744 ptlrpc_req_finished(req);
746 size += obj->objs[i].size;
748 if (it.d.lustre.it_lock_mode)
749 ldlm_lock_decref(lockh, it.d.lustre.it_lock_mode);
753 /* some attrs got refreshed, we have reply and it's time to put
754 * fresh attrs to it */
755 CDEBUG(D_OTHER, "return refreshed attrs: size = %lu\n",
756 (unsigned long)size);
758 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
761 /* FIXME: what about other attributes? */
765 /* very important to maintain lli->mds the same because
766 * of revalidation. mreq == NULL means that caller has
767 * no reply and the only attr we can return is size */
768 body->valid = OBD_MD_FLSIZE;
769 body->mds = obj->fid.mds;
771 if (master_valid == 0) {
772 memcpy(&oit->d.lustre.it_lock_handle,
773 &master_lockh, sizeof(master_lockh));
774 oit->d.lustre.it_lock_mode = master_lock_mode;
778 /* it seems all the attrs are fresh and we did no request */
779 CDEBUG(D_OTHER, "all the attrs were fresh\n");
780 if (master_valid == 0)
781 oit->d.lustre.it_lock_mode = master_lock_mode;