1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
6 * This file is part of Lustre, http://www.lustre.org.
8 * Lustre is free software; you can redistribute it and/or
9 * modify it under the terms of version 2 of the GNU General Public
10 * License as published by the Free Software Foundation.
12 * Lustre is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Lustre; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 # define EXPORT_SYMTAB
25 #define DEBUG_SUBSYSTEM S_LMV
27 #include <linux/slab.h>
28 #include <linux/module.h>
29 #include <linux/init.h>
30 #include <linux/slab.h>
31 #include <linux/pagemap.h>
32 #include <asm/div64.h>
33 #include <linux/seq_file.h>
34 #include <linux/namei.h>
36 #include <liblustre.h>
39 #include <linux/obd_support.h>
40 #include <linux/lustre_lib.h>
41 #include <linux/lustre_net.h>
42 #include <linux/lustre_idl.h>
43 #include <linux/lustre_dlm.h>
44 #include <linux/lustre_mds.h>
45 #include <linux/obd_class.h>
46 #include <linux/obd_ost.h>
47 #include <linux/lprocfs_status.h>
48 #include <linux/lustre_fsfilt.h>
49 #include <linux/obd_lmv.h>
50 #include <linux/lustre_lite.h>
51 #include "lmv_internal.h"
// Release the lock reference carried by an intent, if it carries one.
//
// When LUSTRE_IT(it)->it_lock_mode is non-zero, ldlm_lock_decref() is called
// on the intent lock handle with that mode; otherwise nothing happens.
// The visible statements do not reset it_lock_mode or the handle; the one
// visible caller (lmv_intent_remote) overwrites both right after the call.
53 static inline void lmv_drop_intent_lock(struct lookup_intent *it)
55 if (LUSTRE_IT(it)->it_lock_mode != 0)
56 ldlm_lock_decref((void *)&LUSTRE_IT(it)->it_lock_handle,
57 LUSTRE_IT(it)->it_lock_mode);
// Follow up on an intent reply that refers to an inode held by another MDS.
//
// Visible flow:
//  - fetch the mds_body from reply buffer 1 of *reqp (asserted non-NULL) and
//    test OBD_MD_MDS in body->valid; per the inline comment, the flag being
//    set means the MDS reported the remote inode case and the real attrs
//    must be requested from another MDS;
//  - for IT_LOOKUP and IT_CHDIR the intent op is rewritten to IT_GETATTR;
//  - the lock currently recorded in the intent is stashed in plock and pmode,
//    then it_lock_mode and it_data are zeroed;
//  - OBD_MD_FID is asserted in body->valid, DISP_ENQ_COMPLETE is cleared and
//    md_intent_lock() is sent to the target picked by id_group(&nid);
//  - later statements drop the lock held by the intent and copy the stashed
//    handle and mode back into it, call ldlm_lock_decref(&plock, pmode), and
//    finish *reqp with ptlrpc_req_finished().
//
// NOTE(review): this listing has gaps (embedded line numbers skip; braces,
// the declarations of nid, pmode and rc, and all return statements are not
// shown).  Which of the later statements run on success versus failure, and
// how nid is filled in, cannot be confirmed from the visible lines.
60 int lmv_intent_remote(struct obd_export *exp, void *lmm,
61 int lmmsize, struct lookup_intent *it,
62 int flags, struct ptlrpc_request **reqp,
63 ldlm_blocking_callback cb_blocking)
65 struct obd_device *obd = exp->exp_obd;
66 struct lmv_obd *lmv = &obd->u.lmv;
67 struct ptlrpc_request *req = NULL;
68 struct mds_body *body = NULL;
69 struct lustre_handle plock;
74 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
75 LASSERT(body != NULL);
77 if (!(body->valid & OBD_MD_MDS))
81 * oh, MDS reports that this is remote inode case i.e. we have to ask
82 * for real attrs on another MDS.
84 if (it->it_op == IT_LOOKUP || it->it_op == IT_CHDIR) {
86 * unfortunately, we have to lie to MDC/MDS to retrieve
87 * attributes llite needs.
89 it->it_op = IT_GETATTR;
92 /* we got LOOKUP lock, but we really need attrs */
93 pmode = LUSTRE_IT(it)->it_lock_mode;
95 memcpy(&plock, &LUSTRE_IT(it)->it_lock_handle,
97 LUSTRE_IT(it)->it_lock_mode = 0;
98 LUSTRE_IT(it)->it_data = 0;
101 LASSERT((body->valid & OBD_MD_FID) != 0);
104 LUSTRE_IT(it)->it_disposition &= ~DISP_ENQ_COMPLETE;
105 rc = md_intent_lock(lmv->tgts[id_group(&nid)].ltd_exp, &nid,
106 NULL, 0, lmm, lmmsize, NULL, it, flags,
110 * llite needs LOOKUP lock to track dentry revocation in order to
111 * maintain dcache consistency. Thus drop UPDATE lock here and put
115 lmv_drop_intent_lock(it);
116 memcpy(&LUSTRE_IT(it)->it_lock_handle, &plock,
118 LUSTRE_IT(it)->it_lock_mode = pmode;
120 ldlm_lock_decref(&plock, pmode);
123 ptlrpc_req_finished(*reqp);
// Handle an IT_OPEN intent whose parent (pid) may be a split directory.
//
// Visible flow:
//  - rpid starts as a copy of *pid; a loop counter is asserted to stay at or
//    below 2, so the request is attempted at most twice;
//  - lmv_grab_obj() is tried on rpid; the statements that follow compute an
//    index with raw_name2idx() from the object hash type and object count and
//    replace rpid with obj->objs[mds].id (forwarding to the right MDS, per
//    the inline comment);
//  - md_intent_lock() is sent to the target chosen by id_group(&rpid);
//  - on -ERESTART (directory got split, per the inline comment) the code
//    asserts pid equals rpid, calls lmv_get_mea_and_update_object() and
//    finishes *reqp before repeating;
//  - afterwards lmv_intent_remote() is called to deal with a possibly remote
//    inode, with a CDEBUG (not an error) when that fails;
//  - a negative lookup without DISP_OPEN_CREATE and DISP_OPEN_OPEN, or a
//    reply body without OBD_MD_FID, is tested before body->id1 is used;
//  - finally lmv_grab_obj(obd, cid) is tried and, if that yields nothing but
//    lmv_splitted_dir_body() finds a mea in the reply, lmv_create_obj() builds
//    the object; lmv_revalidate_slaves() is then called with master_valid 1.
//
// NOTE(review): this listing has gaps (embedded line numbers skip; braces,
// the declarations of obj and mea, and most return statements are not
// shown), so exact branch nesting and return values are not confirmed here.
128 int lmv_intent_open(struct obd_export *exp, struct lustre_id *pid,
129 const char *name, int len, void *lmm, int lmmsize,
130 struct lustre_id *cid, struct lookup_intent *it,
131 int flags, struct ptlrpc_request **reqp,
132 ldlm_blocking_callback cb_blocking)
134 struct obd_device *obd = exp->exp_obd;
135 struct lmv_obd *lmv = &obd->u.lmv;
136 struct mds_body *body = NULL;
137 struct lustre_id rpid = *pid;
138 int rc, mds, loop = 0;
143 /* IT_OPEN is intended to open (and create, possible) an object. Parent
144 * (pid) may be splitted dir */
147 LASSERT(++loop <= 2);
148 mds = id_group(&rpid);
149 obj = lmv_grab_obj(obd, &rpid);
151 /* directory is already splitted, so we have to forward
152 * request to the right MDS */
153 mds = raw_name2idx(obj->hashtype, obj->objcount,
156 CDEBUG(D_OTHER, "forward to MDS #%u ("DLID4")\n",
158 rpid = obj->objs[mds].id;
162 rc = md_intent_lock(lmv->tgts[id_group(&rpid)].ltd_exp, &rpid, name,
163 len, lmm, lmmsize, cid, it, flags, reqp, cb_blocking);
164 if (rc == -ERESTART) {
165 /* directory got splitted. time to update local object and
166 * repeat the request with proper MDS */
167 LASSERT(id_equal_fid(pid, &rpid));
168 rc = lmv_get_mea_and_update_object(exp, &rpid);
170 ptlrpc_req_finished(*reqp);
177 /* okay, MDS has returned success. Probably name has been resolved in
179 rc = lmv_intent_remote(exp, lmm, lmmsize, it, flags, reqp, cb_blocking);
184 * this is possible, that some userspace application will try to
185 * open file as directory and we will have -ENOTDIR here. As
186 * this is "usual" situation, we should not print error here,
189 CDEBUG(D_OTHER, "can't handle remote %s: dir "DLID4"("DLID4"):"
190 "%*s: %d\n", LL_IT2STR(it), OLID4(pid), OLID4(&rpid),
196 * nothing is found, do not access body->id1 as it is zero and thus
199 if ((LUSTRE_IT(it)->it_disposition & DISP_LOOKUP_NEG) &&
200 !(LUSTRE_IT(it)->it_disposition & DISP_OPEN_CREATE) &&
201 !(LUSTRE_IT(it)->it_disposition & DISP_OPEN_OPEN))
204 /* caller may use attrs MDS returns on IT_OPEN lock request so, we have
205 * to update them for splitted dir */
206 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
207 LASSERT(body != NULL);
209 /* could not find object, FID is not present in response. */
210 if (!(body->valid & OBD_MD_FID))
214 obj = lmv_grab_obj(obd, cid);
215 if (!obj && (mea = lmv_splitted_dir_body(*reqp, 1))) {
216 /* wow! this is splitted dir, we'd like to handle it */
217 obj = lmv_create_obj(exp, &body->id1, mea);
219 RETURN(PTR_ERR(obj));
223 /* this is splitted dir and we'd want to get attrs */
224 CDEBUG(D_OTHER, "attrs from slaves for "DLID4"\n",
227 rc = lmv_revalidate_slaves(exp, reqp, cid, it, 1,
229 } else if (S_ISDIR(body->mode)) {
230 CDEBUG(D_OTHER, "object "DLID4" has not lmv obj?\n",
// Handle an IT_GETATTR style intent, covering both attribute revalidation
// of a known object (cid) and a getattr by name under a parent (pid).
//
// Visible flow:
//  - rpid starts as a copy of *pid;
//  - revalidation path: lmv_grab_obj(obd, cid) is tried and, when pid and cid
//    differ, rpid is taken from obj->objs[mds].id and mds from
//    id_group(&rpid);
//  - by-name path: lmv_grab_obj(obd, pid) is tried and, for a split
//    directory, raw_name2idx() picks the slave whose id becomes rpid;
//  - md_intent_lock() is sent to lmv->tgts[mds];
//  - for a split directory lmv_revalidate_slaves() is called with the rc of
//    that request as master_valid;
//  - otherwise lmv_intent_remote() handles a possibly remote inode, negative
//    lookups (DISP_LOOKUP_NEG) and replies without OBD_MD_FID are tested
//    before body->id1 is used, and a reply that carries a mea leads to
//    lmv_create_obj() followed by lmv_revalidate_slaves() with master_valid 1.
//
// NOTE(review): this listing has gaps (embedded line numbers skip; braces,
// the declarations of mds, rc and mea, and most return statements are not
// shown).  In particular, no assignment to mds is visible before it indexes
// obj->objs[] in the revalidation path - confirm against the full source.
240 int lmv_intent_getattr(struct obd_export *exp, struct lustre_id *pid,
241 const char *name, int len, void *lmm, int lmmsize,
242 struct lustre_id *cid, struct lookup_intent *it,
243 int flags, struct ptlrpc_request **reqp,
244 ldlm_blocking_callback cb_blocking)
246 struct obd_device *obd = exp->exp_obd;
247 struct lmv_obd *lmv = &obd->u.lmv;
248 struct mds_body *body = NULL;
249 struct lustre_id rpid = *pid;
250 struct lmv_obj *obj = NULL, *obj2 = NULL;
256 /* caller wants to revalidate attrs of obj we have to revalidate
257 * slaves if requested object is splitted directory */
258 CDEBUG(D_OTHER, "revalidate attrs for "DLID4"\n", OLID4(cid));
261 obj = lmv_grab_obj(obd, cid);
263 /* in fact, we need not this with current intent_lock(),
264 * but it may change some day */
265 if (!id_equal_fid(pid, cid)){
266 rpid = obj->objs[mds].id;
267 mds = id_group(&rpid);
273 CDEBUG(D_OTHER, "INTENT getattr for %*s on "DLID4"\n",
274 len, name, OLID4(pid));
276 obj = lmv_grab_obj(obd, pid);
278 /* directory is already splitted. calculate mds */
279 mds = raw_name2idx(obj->hashtype, obj->objcount,
281 rpid = obj->objs[mds].id;
282 mds = id_group(&rpid);
285 CDEBUG(D_OTHER, "forward to MDS #%u (slave "DLID4")\n",
290 /* the same about fid returning. */
291 rc = md_intent_lock(lmv->tgts[mds].ltd_exp, &rpid, name, len, lmm,
292 lmmsize, cid, it, flags, reqp, cb_blocking);
297 /* this is splitted dir. In order to optimize things a
298 * bit, we consider obj valid updating missing parts.
300 * FIXME: do we need to return any lock here? It would
301 * be fine if we don't. this means that nobody should
302 * use UPDATE lock to notify about object * removal */
304 "revalidate slaves for "DLID4", rc %d\n",
308 rc = lmv_revalidate_slaves(exp, reqp, cid, it, rc,
316 /* okay, MDS has returned success. probably name has been
317 * resolved in remote inode */
318 rc = lmv_intent_remote(exp, lmm, lmmsize, it, flags,
324 * nothing is found, do not access body->id1 as it is zero and thus
327 if (LUSTRE_IT(it)->it_disposition & DISP_LOOKUP_NEG)
330 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
331 LASSERT(body != NULL);
333 /* could not find object, FID is not present in response. */
334 if (!(body->valid & OBD_MD_FID))
338 obj2 = lmv_grab_obj(obd, cid);
340 if (!obj2 && (mea = lmv_splitted_dir_body(*reqp, 1))) {
341 /* wow! this is splitted dir, we'd like to handle it. */
342 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
343 LASSERT(body != NULL);
345 obj2 = lmv_create_obj(exp, &body->id1, mea);
347 RETURN(PTR_ERR(obj2));
351 /* this is splitted dir and we'd want to get attrs */
352 CDEBUG(D_OTHER, "attrs from slaves for "DLID4", rc %d\n",
355 rc = lmv_revalidate_slaves(exp, reqp, cid, it, 1,
// Fold the cached attributes of one sub-object into a reply body.
//
// The only visible statement adds obj->size to body->size, so repeated calls
// accumulate the sizes of several sub-objects into the same body.
// NOTE(review): the listing skips lines around this statement; whether other
// fields are updated as well cannot be confirmed from here.
362 void lmv_update_body_from_obj(struct mds_body *body, struct lmv_inode *obj)
365 body->size += obj->size;
// After a lookup on the master of a split directory, lock every slave and
// merge the slave attributes into the reply carried by *reqp.
//
// Visible flow:
//  - the mds_body in reply buffer 1 of *reqp must exist and have OBD_MD_FID
//    set, and lmv_grab_obj() on body->id1 must succeed (all three asserted);
//  - for each of the obj->objcount sub-objects, the master entry (id equal to
//    obj->id) is skipped;
//  - for a slave, a zeroed IT_GETATTR intent is prepared with freshly
//    allocated intent data (-ENOMEM via the cleanup label on failure) and
//    md_intent_lock() is sent to the target chosen by id_group(&id), with
//    lmv_dirobj_blocking_ast as blocking callback;
//  - rc > 0 with no request is treated as the slave being cached and valid;
//  - otherwise the lock behind the returned handle gets l_ast_data set to a
//    new reference on obj (lmv_get_obj), the size from the slave reply is
//    stored in obj->objs[i].size and the slave request is finished;
//  - lmv_update_body_from_obj() adds the slave entry into the master body,
//    the slave lock reference is dropped when it_lock_mode is set, and the
//    intent data is freed.
//
// NOTE(review): this listing has gaps (embedded line numbers skip; braces,
// the declarations of obj, i and rc, the cleanup label and the return are
// not shown), so error-path details are not confirmed from the visible lines.
368 int lmv_lookup_slaves(struct obd_export *exp, struct ptlrpc_request **reqp)
370 struct obd_device *obd = exp->exp_obd;
371 struct lmv_obd *lmv = &obd->u.lmv;
372 struct mds_body *body = NULL;
373 struct lustre_handle *lockh;
374 struct ldlm_lock *lock;
375 struct mds_body *body2;
383 /* master is locked. we'd like to take locks on slaves and update
384 * attributes to be returned from the slaves it's important that lookup
385 * is called in two cases:
387 * - for first time (dcache has no such a resolving yet).
388 * - ->d_revalidate() returned false.
390 * last case possible only if all the objs (master and all slaves aren't
393 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
394 LASSERT(body != NULL);
395 LASSERT((body->valid & OBD_MD_FID) != 0);
397 obj = lmv_grab_obj(obd, &body->id1);
398 LASSERT(obj != NULL);
400 CDEBUG(D_OTHER, "lookup slaves for "DLID4"\n",
405 for (i = 0; i < obj->objcount; i++) {
406 struct lustre_id id = obj->objs[i].id;
407 struct ptlrpc_request *req = NULL;
408 struct lookup_intent it;
410 if (id_equal_fid(&id, &obj->id))
411 /* skip master obj */
414 CDEBUG(D_OTHER, "lookup slave "DLID4"\n", OLID4(&id));
417 memset(&it, 0, sizeof(it));
418 it.it_op = IT_GETATTR;
419 OBD_ALLOC(it.d.fs_data, sizeof(struct lustre_intent_data));
421 GOTO(cleanup, rc = -ENOMEM);
423 rc = md_intent_lock(lmv->tgts[id_group(&id)].ltd_exp, &id,
424 NULL, 0, NULL, 0, &id, &it, 0, &req,
425 lmv_dirobj_blocking_ast);
427 lockh = (struct lustre_handle *)&LUSTRE_IT(&it)->it_lock_handle;
428 if (rc > 0 && req == NULL) {
429 /* nice, this slave is valid */
430 LASSERT(req == NULL);
431 CDEBUG(D_OTHER, "cached\n");
436 OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data));
437 /* error during lookup */
440 lock = ldlm_handle2lock(lockh);
443 lock->l_ast_data = lmv_get_obj(obj);
445 body2 = lustre_msg_buf(req->rq_repmsg, 1, sizeof(*body2));
448 obj->objs[i].size = body2->size;
450 CDEBUG(D_OTHER, "fresh: %lu\n",
451 (unsigned long)obj->objs[i].size);
456 ptlrpc_req_finished(req);
458 lmv_update_body_from_obj(body, obj->objs + i);
460 if (LUSTRE_IT(&it)->it_lock_mode)
461 ldlm_lock_decref(lockh, LUSTRE_IT(&it)->it_lock_mode);
462 OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data));
// Handle an IT_LOOKUP intent: either resolve a name to an id (lookup) or
// confirm that an earlier resolution is still valid (revalidation).  Per the
// inline comment, a non-NULL cid selects revalidation.
//
// Visible flow:
//  - rpid starts as a copy of *pid;
//  - revalidation path: lmv_grab_obj(obd, pid) is tried and, for a split
//    parent, raw_name2idx() selects the slave whose id becomes rpid; mds is
//    then id_group(&rpid);
//  - lookup path: a loop counter is asserted to stay at or below 2, and the
//    same split-parent redirection is applied;
//  - md_intent_lock() is sent to lmv->tgts[mds];
//  - a result with no reply in *reqp is logged as data being up to date;
//  - on -ERESTART a warning is printed, obj is asserted to be NULL and
//    lmv_create_obj(exp, &rpid, NULL) is called before the retry;
//  - on success lmv_intent_remote() handles a possibly remote inode, and a
//    reply that carries a mea leads to lmv_grab_obj() or lmv_create_obj() on
//    body->id1.
//
// NOTE(review): this listing has gaps (embedded line numbers skip; braces,
// the declarations of obj and mea, the retry jump and most return statements
// are not shown), so exact branch nesting is not confirmed from here.
472 int lmv_intent_lookup(struct obd_export *exp, struct lustre_id *pid,
473 const char *name, int len, void *lmm, int lmmsize,
474 struct lustre_id *cid, struct lookup_intent *it,
475 int flags, struct ptlrpc_request **reqp,
476 ldlm_blocking_callback cb_blocking)
478 struct obd_device *obd = exp->exp_obd;
479 struct lmv_obd *lmv = &obd->u.lmv;
480 struct mds_body *body = NULL;
481 struct lustre_id rpid = *pid;
484 int rc, mds, loop = 0;
488 * IT_LOOKUP is intended to produce name -> id resolving (let's call
489 * this lookup below) or to confirm requested resolving is still valid
490 * (let's call this revalidation) cid != NULL specifies revalidation.
494 * this is revalidation: we have to check is LOOKUP lock still
495 * valid for given id. Very important part is that we have to
496 * choose right mds because namespace is per mds.
499 obj = lmv_grab_obj(obd, pid);
501 mds = raw_name2idx(obj->hashtype, obj->objcount,
503 rpid = obj->objs[mds].id;
506 mds = id_group(&rpid);
508 CDEBUG(D_OTHER, "revalidate lookup for "DLID4" to %d MDS\n",
514 LASSERT(++loop <= 2);
516 /* this is lookup. during lookup we have to update all the
517 * attributes, because returned values will be put in struct
520 obj = lmv_grab_obj(obd, pid);
523 /* directory is already splitted. calculate mds */
524 mds = raw_name2idx(obj->hashtype, obj->objcount,
526 rpid = obj->objs[mds].id;
527 mds = id_group(&rpid);
532 rc = md_intent_lock(lmv->tgts[mds].ltd_exp, &rpid, name,
533 len, lmm, lmmsize, cid, it, flags,
540 /* very interesting. it seems object is still valid but for some
541 * reason llite calls lookup, not revalidate */
542 CDEBUG(D_OTHER, "lookup for "DLID4" and data should be uptodate\n",
544 LASSERT(*reqp == NULL);
548 if (rc == 0 && *reqp == NULL) {
549 /* once again, we're asked for lookup, not revalidate */
550 CDEBUG(D_OTHER, "lookup for "DLID4" and data should be uptodate\n",
555 if (rc == -ERESTART) {
556 /* directory got splitted since last update. this shouldn't be
557 * because splitting causes lock revocation, so revalidate had
558 * to fail and lookup on dir had to return mea */
559 CWARN("we haven't knew about directory splitting!\n");
560 LASSERT(obj == NULL);
562 obj = lmv_create_obj(exp, &rpid, NULL);
564 RETURN(PTR_ERR(obj));
572 /* okay, MDS has returned success. Probably name has been resolved in
574 rc = lmv_intent_remote(exp, lmm, lmmsize, it, flags, reqp, cb_blocking);
576 if (rc == 0 && (mea = lmv_splitted_dir_body(*reqp, 1))) {
577 /* wow! this is splitted dir, we'd like to handle it */
578 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
579 LASSERT(body != NULL);
580 LASSERT((body->valid & OBD_MD_FID) != 0);
582 obj = lmv_grab_obj(obd, &body->id1);
584 obj = lmv_create_obj(exp, &body->id1, mea);
586 RETURN(PTR_ERR(obj));
// Entry point for intent locking through LMV: logs the request, checks the
// connection with lmv_check_connect(), then dispatches on it->it_op.
//
// Dispatch visible here:
//  - it_op == IT_LOOKUP                  -> lmv_intent_lookup()
//  - it_op has the IT_OPEN bit set       -> lmv_intent_open()
//  - it_op == IT_GETATTR or IT_CHDIR     -> lmv_intent_getattr()
// All three receive the same arguments that were passed in.
//
// NOTE(review): this listing has gaps (embedded line numbers skip; braces,
// the declaration of rc, the handling of a failed lmv_check_connect(), the
// fallback for other intent ops and the return are not shown).
594 int lmv_intent_lock(struct obd_export *exp, struct lustre_id *pid,
595 const char *name, int len, void *lmm, int lmmsize,
596 struct lustre_id *cid, struct lookup_intent *it,
597 int flags, struct ptlrpc_request **reqp,
598 ldlm_blocking_callback cb_blocking)
600 struct obd_device *obd = exp->exp_obd;
607 CDEBUG(D_OTHER, "INTENT LOCK '%s' for '%*s' on %lu/%lu -> %lu\n",
608 LL_IT2STR(it), len, name, (unsigned long)id_ino(pid),
609 (unsigned long)id_gen(pid), (unsigned long)id_group(pid));
611 rc = lmv_check_connect(obd);
615 if (it->it_op == IT_LOOKUP)
616 rc = lmv_intent_lookup(exp, pid, name, len, lmm,
617 lmmsize, cid, it, flags, reqp,
619 else if (it->it_op & IT_OPEN)
620 rc = lmv_intent_open(exp, pid, name, len, lmm,
621 lmmsize, cid, it, flags, reqp,
623 else if (it->it_op == IT_GETATTR || it->it_op == IT_CHDIR)
624 rc = lmv_intent_getattr(exp, pid, name, len, lmm,
625 lmmsize, cid, it, flags, reqp,
// Revalidate every sub-object of the split directory identified by mid and
// report the combined size back to the caller.
//
// Parameters as used by the visible code:
//  - reqp: in and out reply; *reqp is read on entry as the master reply
//    (mreq) and is the request whose body receives the refreshed size;
//  - mid: id handed to lmv_grab_obj(), which is asserted to succeed;
//  - oit: intent of the caller; receives the master lock handle and mode
//    when master_valid is 0;
//  - master_valid: asserted to be 0 on the path that saves a lock on the
//    master, i.e. non-zero means the caller already holds that lock.
//
// Visible flow: for each of the obj->objcount sub-objects a zeroed
// IT_GETATTR intent with freshly allocated intent data is prepared
// (-ENOMEM via the cleanup label on failure).  The master entry may be
// served from the reply the caller already has; other entries go through
// md_intent_lock() on the target chosen by id_group(&id).  rc > 0 with no
// request means the entry is cached.  A lock obtained on the master is
// saved in master_lockh and master_lock_mode; a lock on a slave gets
// l_ast_data set to a new reference on obj.  Fresh sizes are stored in
// obj->objs[i].size and every entry size is summed.  The per-entry lock
// reference is dropped when it_lock_mode is still set and the intent data
// is freed.  At the end either the reply body is updated (valid set to
// OBD_MD_FLSIZE) or, when nothing was refreshed, only the master lock mode
// is handed back through oit.
//
// NOTE(review): this listing has gaps (embedded line numbers skip; braces,
// the declarations of obj, i and rc, the cleanup label and the return are
// not shown) and the function tail is not in view, so branch nesting and
// return values are not confirmed from the visible lines.
632 int lmv_revalidate_slaves(struct obd_export *exp, struct ptlrpc_request **reqp,
633 struct lustre_id *mid, struct lookup_intent *oit,
634 int master_valid, ldlm_blocking_callback cb_blocking)
636 struct obd_device *obd = exp->exp_obd;
637 struct ptlrpc_request *mreq = *reqp;
638 struct lmv_obd *lmv = &obd->u.lmv;
639 struct lustre_handle master_lockh;
640 struct ldlm_lock *lock;
641 unsigned long size = 0;
642 struct mds_body *body;
644 int master_lock_mode;
648 /* we have to loop over the subobjects, check validity and update them
649 * from MDSs if needed. it's very useful that we need not to update all
650 * the fields. say, common fields (that are equal on all the subojects
651 * need not to be update, another fields (i_size, for example) are
652 * cached all the time */
653 obj = lmv_grab_obj(obd, mid);
654 LASSERT(obj != NULL);
656 master_lock_mode = 0;
660 for (i = 0; i < obj->objcount; i++) {
661 struct lustre_id id = obj->objs[i].id;
662 struct lustre_handle *lockh = NULL;
663 struct ptlrpc_request *req = NULL;
664 ldlm_blocking_callback cb;
665 struct lookup_intent it;
668 CDEBUG(D_OTHER, "revalidate subobj "DLID4"\n",
671 memset(&it, 0, sizeof(it));
672 it.it_op = IT_GETATTR;
674 cb = lmv_dirobj_blocking_ast;
676 OBD_ALLOC(it.d.fs_data, sizeof(struct lustre_intent_data));
678 GOTO(cleanup, rc = -ENOMEM);
680 if (id_equal_fid(&id, &obj->id)) {
682 /* lmv_intent_getattr() already checked
683 * validness and took the lock */
685 /* it even got the reply refresh attrs
687 body = lustre_msg_buf(mreq->rq_repmsg,
689 LASSERT(body != NULL);
692 /* take already cached attrs into account */
694 "master is locked and cached\n");
703 rc = md_intent_lock(lmv->tgts[id_group(&id)].ltd_exp,
704 &id, NULL, 0, NULL, 0, &id, &it, 0,
706 lockh = (struct lustre_handle *) &LUSTRE_IT(&it)->it_lock_handle;
707 if (rc > 0 && req == NULL) {
708 /* nice, this slave is valid */
709 LASSERT(req == NULL);
710 CDEBUG(D_OTHER, "cached\n");
715 OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data));
716 /* error during revalidation */
720 LASSERT(master_valid == 0);
721 /* save lock on master to be returned to the caller */
722 CDEBUG(D_OTHER, "no lock on master yet\n");
723 memcpy(&master_lockh, lockh, sizeof(master_lockh));
724 master_lock_mode = LUSTRE_IT(&it)->it_lock_mode;
725 LUSTRE_IT(&it)->it_lock_mode = 0;
727 /* this is slave. we want to control it */
728 lock = ldlm_handle2lock(lockh);
730 lock->l_ast_data = lmv_get_obj(obj);
735 /* this is first reply, we'll use it to return updated
736 * data back to the caller */
738 ptlrpc_request_addref(req);
743 body = lustre_msg_buf(req->rq_repmsg, 1, sizeof(*body));
747 obj->objs[i].size = body->size;
749 CDEBUG(D_OTHER, "fresh: %lu\n",
750 (unsigned long)obj->objs[i].size);
753 ptlrpc_req_finished(req);
755 size += obj->objs[i].size;
757 if (LUSTRE_IT(&it)->it_lock_mode)
758 ldlm_lock_decref(lockh, LUSTRE_IT(&it)->it_lock_mode);
759 OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data));
763 /* some attrs got refreshed, we have reply and it's time to put
764 * fresh attrs to it */
765 CDEBUG(D_OTHER, "return refreshed attrs: size = %lu\n",
766 (unsigned long)size);
768 body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
771 /* FIXME: what about other attributes? */
775 /* very important to maintain id_group(lli->lli_id) the
776 * same because of revalidation. mreq == NULL means that
777 * caller has no reply and the only attr we can return
779 body->valid = OBD_MD_FLSIZE;
780 // body->mds = id_group(&obj->id);
782 if (master_valid == 0) {
783 memcpy(&LUSTRE_IT(oit)->it_lock_handle,
784 &master_lockh, sizeof(master_lockh));
785 LUSTRE_IT(oit)->it_lock_mode = master_lock_mode;
789 /* it seems all the attrs are fresh and we did no request */
790 CDEBUG(D_OTHER, "all the attrs were fresh\n");
791 if (master_valid == 0)
792 LUSTRE_IT(oit)->it_lock_mode = master_lock_mode;