1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
6 * This file is part of Lustre, http://www.lustre.org.
8 * Lustre is free software; you can redistribute it and/or
9 * modify it under the terms of version 2 of the GNU General Public
10 * License as published by the Free Software Foundation.
12 * Lustre is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Lustre; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 # define EXPORT_SYMTAB
25 #define DEBUG_SUBSYSTEM S_LMV
27 #include <linux/slab.h>
28 #include <linux/module.h>
29 #include <linux/init.h>
30 #include <linux/slab.h>
31 #include <linux/pagemap.h>
32 #include <asm/div64.h>
33 #include <linux/seq_file.h>
35 #include <liblustre.h>
37 #include <linux/ext2_fs.h>
39 #include <linux/obd_support.h>
40 #include <linux/lustre_lib.h>
41 #include <linux/lustre_net.h>
42 #include <linux/lustre_idl.h>
43 #include <linux/lustre_dlm.h>
44 #include <linux/lustre_mds.h>
45 #include <linux/obd_class.h>
46 #include <linux/obd_ost.h>
47 #include <linux/lprocfs_status.h>
48 #include <linux/lustre_fsfilt.h>
49 #include <linux/obd_lmv.h>
50 #include "lmv_internal.h"
54 * -EINVAL : UUID can't be found in the LMV's target list
55 * -ENOTCONN: The UUID is found, but the target connection is bad (!)
56 * -EBADF : The UUID is found, but the OBD of the wrong type (!)
/*
 * Mark the LMV target whose UUID matches @uuid as active or inactive.
 *
 * Scans lmv->tgts under lmv->lmv_lock, comparing UUIDs with strncmp().
 * Visible error paths:
 *   -EINVAL    the scan index reached ld_tgt_count (no target matched)
 *   -ENOTCONN  taken on the path described as a failure/shutdown race
 * On a state change tgt->active is set to the requested value and
 * lmv->desc.ld_active_tgt_count is incremented or decremented.
 *
 * NOTE(review): 'activate', 'i' and 'rc' are used below but their
 * declarations are not among the lines shown here; 'activate' is presumably
 * the third parameter -- confirm.
 * NOTE(review): the CDEBUG inside the scan dereferences tgt->ltd_exp for
 * every target, while lmv_check_connect() stores NULL in ltd_exp for the
 * target equal to the local UUID -- confirm this cannot be reached with a
 * NULL export.
 */
58 static int lmv_set_mdc_active(struct lmv_obd *lmv, struct obd_uuid *uuid,
61 struct obd_device *obd;
62 struct lmv_tgt_desc *tgt;
66 CDEBUG(D_INFO, "Searching in lmv %p for uuid %s (activate=%d)\n",
67 lmv, uuid->uuid, activate);
/* the target table is walked and modified with lmv_lock held */
69 spin_lock(&lmv->lmv_lock);
70 for (i = 0, tgt = lmv->tgts; i < lmv->desc.ld_tgt_count; i++, tgt++) {
71 CDEBUG(D_INFO, "lmv idx %d is %s conn "LPX64"\n",
72 i, tgt->uuid.uuid, tgt->ltd_exp->exp_handle.h_cookie);
73 if (strncmp(uuid->uuid, tgt->uuid.uuid, sizeof uuid->uuid) == 0)
/* index ran off the end of the table: no target carries this UUID */
77 if (i == lmv->desc.ld_tgt_count)
78 GOTO(out, rc = -EINVAL);
80 obd = class_exp2obd(tgt->ltd_exp);
82 /* This can happen if OST failure races with node shutdown */
83 GOTO(out, rc = -ENOTCONN);
86 CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LMV idx %d\n",
87 obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd,
88 obd->obd_type->typ_name, i);
/* every LMV target is expected to be an MDC device */
89 LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0);
91 if (tgt->active == activate) {
92 CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd,
93 activate ? "" : "in");
97 CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, activate ? "" : "in");
99 tgt->active = activate;
/* keep the active-target counter in step with the flag just written */
101 lmv->desc.ld_active_tgt_count++;
103 lmv->desc.ld_active_tgt_count--;
107 spin_unlock(&lmv->lmv_lock);
/*
 * Notification handler for state changes of a watched device.
 *
 * Logs an error when the watched device's type name differs from
 * LUSTRE_MDC_NAME.  Otherwise the target UUID is read from the watched
 * client's import, the matching LMV target is updated through
 * lmv_set_mdc_active(), and the notification is forwarded with obd_notify()
 * to obd->obd_observer when one is registered.
 *
 * NOTE(review): 'active' and 'rc' are used but not declared in the lines
 * shown; 'active' is presumably the third parameter -- confirm.
 * NOTE(review): what happens after each CERROR (early return?) is not
 * visible here -- confirm.
 */
111 static int lmv_notify(struct obd_device *obd, struct obd_device *watched,
115 struct obd_uuid *uuid;
117 if (strcmp(watched->obd_type->typ_name, LUSTRE_MDC_NAME)) {
118 CERROR("unexpected notification of %s %s!\n",
119 watched->obd_type->typ_name,
123 uuid = &watched->u.cli.cl_import->imp_target_uuid;
125 /* Set MDC as active before notifying the observer, so the
126 * observer can use the MDC normally.
128 rc = lmv_set_mdc_active(&obd->u.lmv, uuid, active);
130 CERROR("%sactivation of %s failed: %d\n",
131 active ? "" : "de", uuid->uuid, rc);
135 if (obd->obd_observer)
136 /* Pass the notification up the chain. */
137 rc = obd_notify(obd->obd_observer, watched, active);
/*
 * Attach hook: registers the device's lprocfs variables (lprocfs_init_vars()
 * followed by lprocfs_obd_attach()) and creates a "target_obd" proc entry
 * with mode 0444 under dev->obd_proc_entry.
 *
 * The proc_fops assignment is commented out, so no file operations are
 * installed on the new entry by the visible code.
 * NOTE(review): handling of a NULL return from create_proc_entry() is not
 * visible here -- confirm.
 */
142 int lmv_attach(struct obd_device *dev, obd_count len, void *data)
144 struct lprocfs_static_vars lvars;
148 lprocfs_init_vars(lmv, &lvars);
149 rc = lprocfs_obd_attach(dev, lvars.obd_vars);
152 struct proc_dir_entry *entry;
154 entry = create_proc_entry("target_obd", 0444, dev->obd_proc_entry);
157 /* entry->proc_fops = &lmv_proc_target_fops; */
/* Detach hook: returns whatever lprocfs_obd_detach() reports for @dev. */
164 int lmv_detach(struct obd_device *dev)
166 return lprocfs_obd_detach(dev);
169 /* This is a fake connect function. Its purpose is to initialize lmv and
170 * tell the caller that everything is okay. Real connection will be performed
/*
 * Connect entry point for the LMV device.
 *
 * Calls class_connect() for @conn/@obd/@cluuid and logs its error code on
 * failure, resolves the export with class_conn2export(), and saves a copy of
 * the client UUID in lmv->cluuid.  When lmv->refcount is greater than 1 the
 * export obtained here is released with class_export_put().
 *
 * No target (MDC) connection is made by the visible lines.
 * NOTE(review): the statement that updates lmv->refcount before the "> 1"
 * test is not visible -- confirm.
 */
172 static int lmv_connect(struct lustre_handle *conn, struct obd_device *obd,
173 struct obd_uuid *cluuid)
175 struct lmv_obd *lmv = &obd->u.lmv;
176 struct obd_export *exp;
180 rc = class_connect(conn, obd, cluuid);
182 CERROR("class_connection() returned %d\n", rc);
186 exp = class_conn2export(conn);
187 /* We don't want to actually do the underlying connections more than
188 * once, so keep track. */
190 if (lmv->refcount > 1) {
191 class_export_put(exp);
/* remembered so lmv_check_connect() can recognise the local target */
195 lmv->cluuid = *cluuid;
/*
 * Sends the "inter_mds" key (with no value) to the targets through
 * obd_set_info().
 *
 * lmv->server_timeout and lmv->connected are each tested against 0 first,
 * and every target's ltd_exp is tested against NULL inside the loop; the
 * bodies of those tests are not visible here -- presumably early return and
 * skip, respectively (confirm).
 * The return value of obd_set_info() is not used.
 */
202 void lmv_set_timeouts(struct obd_device *obd)
204 struct lmv_tgt_desc *tgts;
209 if (lmv->server_timeout == 0)
212 if (lmv->connected == 0)
215 for (i = 0, tgts = lmv->tgts; i < lmv->desc.ld_tgt_count; i++, tgts++) {
216 if (tgts->ltd_exp == NULL)
218 obd_set_info(tgts->ltd_exp, strlen("inter_mds"),
219 "inter_mds", 0, NULL);
223 /* Check whether the passed obd is connected; if it is not, connect it. */
/*
 * Establishes the connections from this LMV to its targets.
 *
 * For each entry of lmv->tgts the visible code:
 *   - looks the target device up with class_find_client_obd() using the
 *     target UUID and LUSTRE_MDC_NAME; -EINVAL if it is not attached;
 *   - leaves ltd_exp NULL for the target whose UUID equals lmv->cluuid
 *     (the local MDS is not connected back to itself);
 *   - fails with -EINVAL if the target device is not set up;
 *   - connects with obd_connect() under the fixed UUID "LMV_OSC_UUID" and
 *     stores the resulting export in ltd_exp;
 *   - pushes lmv->max_easize / lmv->max_cookiesize via obd_init_ea_size();
 *   - registers @obd as the target's observer, disconnecting the target
 *     again if registration fails;
 *   - increments lmv->desc.ld_active_tgt_count.
 * Afterwards lmv_set_timeouts() runs and the local export is released.
 * The failure path decrements the active count, disconnects targets and
 * calls class_disconnect() on the local export.
 *
 * NOTE(review): the "already connected" check, the assignment of 'exp' and
 * the loop header of the failure path are not visible here -- confirm.
 * NOTE(review): the failure path passes tgts->ltd_exp to obd_disconnect();
 * that pointer is NULL for the local target -- confirm it is skipped.
 */
224 int lmv_check_connect(struct obd_device *obd) {
225 struct lmv_obd *lmv = &obd->u.lmv;
226 struct obd_uuid *cluuid;
227 struct lmv_tgt_desc *tgts;
228 struct obd_export *exp;
236 cluuid = &lmv->cluuid;
238 CDEBUG(D_OTHER, "time to connect %s to %s\n",
239 cluuid->uuid, obd->obd_name);
241 for (i = 0, tgts = lmv->tgts; i < lmv->desc.ld_tgt_count; i++, tgts++) {
242 struct obd_device *tgt_obd;
243 struct obd_uuid lmv_osc_uuid = { "LMV_OSC_UUID" };
244 struct lustre_handle conn = {0, };
246 LASSERT(tgts != NULL);
248 tgt_obd = class_find_client_obd(&tgts->uuid, LUSTRE_MDC_NAME,
251 CERROR("Target %s not attached\n", tgts->uuid.uuid);
252 GOTO(out_disc, rc = -EINVAL);
255 /* for MDS: don't connect to yourself */
256 if (obd_uuid_equals(&tgts->uuid, cluuid)) {
257 CDEBUG(D_OTHER, "don't connect back to %s\n",
/* a NULL export is how the rest of the file recognises the local target */
259 tgts->ltd_exp = NULL;
263 CDEBUG(D_OTHER, "connect to %s(%s) - %s, %s FOR %s\n",
264 tgt_obd->obd_name, tgt_obd->obd_uuid.uuid,
265 tgts->uuid.uuid, obd->obd_uuid.uuid,
268 if (!tgt_obd->obd_set_up) {
269 CERROR("Target %s not set up\n", tgts->uuid.uuid);
270 GOTO(out_disc, rc = -EINVAL);
273 rc = obd_connect(&conn, tgt_obd, &lmv_osc_uuid);
275 CERROR("Target %s connect error %d\n",
276 tgts->uuid.uuid, rc);
279 tgts->ltd_exp = class_conn2export(&conn);
281 obd_init_ea_size(tgts->ltd_exp, lmv->max_easize,
282 lmv->max_cookiesize);
/* become the observer so lmv_notify() hears about this target */
284 rc = obd_register_observer(tgt_obd, obd);
286 CERROR("Target %s register_observer error %d\n",
287 tgts->uuid.uuid, rc);
288 obd_disconnect(tgts->ltd_exp, 0);
292 lmv->desc.ld_active_tgt_count++;
295 CDEBUG(D_OTHER, "connected to %s(%s) successfully (%d)\n",
296 tgt_obd->obd_name, tgt_obd->obd_uuid.uuid,
297 atomic_read(&obd->obd_refcount));
300 lmv_set_timeouts(obd);
302 class_export_put(exp);
307 struct obd_uuid uuid;
309 --lmv->desc.ld_active_tgt_count;
311 /* save for CERROR below; (we know it's terminated) */
313 rc2 = obd_disconnect(tgts->ltd_exp, 0);
315 CERROR("error: LMV target %s disconnect on MDT idx %d: "
316 "rc = %d\n", uuid.uuid, i, rc2);
318 class_disconnect(exp, 0);
/*
 * Disconnect entry point for the LMV device.
 *
 * lmv->refcount is tested against 0 so that the targets are only torn down
 * on the final disconnect (the branch body is not visible here).  For each
 * target index, ltd_exp is tested against NULL; for the others the visible
 * code propagates obd->obd_no_recov to the target's device, unregisters the
 * observer (obd_register_observer(..., NULL)), calls obd_disconnect() with
 * @flags, clears the 'active' flag while decrementing
 * lmv->desc.ld_active_tgt_count, and resets ltd_exp to NULL.
 * Finally the caller's export is released with class_export_put() and
 * disconnected with class_disconnect(exp, 0).
 *
 * NOTE(review): the rc of obd_disconnect() is overwritten by the final
 * class_disconnect() result in the visible lines -- confirm target errors
 * are reported somewhere.
 * NOTE(review): the statement that decrements lmv->refcount is not visible.
 */
322 static int lmv_disconnect(struct obd_export *exp, int flags)
324 struct obd_device *obd = class_exp2obd(exp);
325 struct lmv_obd *lmv = &obd->u.lmv;
332 /* Only disconnect the underlying layers on the final disconnect. */
334 if (lmv->refcount != 0)
337 for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
338 if (lmv->tgts[i].ltd_exp == NULL)
341 if (obd->obd_no_recov) {
342 /* Pass it on to our clients.
343 * XXX This should be an argument to disconnect,
344 * XXX not a back-door flag on the OBD. Ah well.
346 struct obd_device *mdc_obd;
347 mdc_obd = class_exp2obd(lmv->tgts[i].ltd_exp);
349 mdc_obd->obd_no_recov = 1;
352 CDEBUG(D_OTHER, "disconnected from %s(%s) successfully\n",
353 lmv->tgts[i].ltd_exp->exp_obd->obd_name,
354 lmv->tgts[i].ltd_exp->exp_obd->obd_uuid.uuid);
356 obd_register_observer(lmv->tgts[i].ltd_exp->exp_obd, NULL);
358 rc = obd_disconnect(lmv->tgts[i].ltd_exp, flags);
359 if (lmv->tgts[i].active) {
360 lmv->desc.ld_active_tgt_count--;
361 lmv->tgts[i].active = 0;
363 lmv->tgts[i].ltd_exp = NULL;
367 /* FIXME: cleanup here */
369 class_export_put(exp);
370 rc = class_disconnect(exp, 0);
/*
 * Setup hook: validates the configuration buffers and builds the target
 * table.
 *
 * @buf is treated as a struct lustre_cfg:
 *   lcfg_inlbuf1  must hold at least sizeof(struct lmv_desc) bytes;
 *   lcfg_inlbuf2  must hold exactly desc->ld_tgt_count struct obd_uuid.
 * On success lmv->tgts is allocated with
 * lmv->bufsize = sizeof(struct lmv_tgt_desc) * ld_tgt_count, the spinlock is
 * initialised, each target's uuid is copied from the array, and
 * lmv->max_easize is set to sizeof(struct ll_fid) * ld_tgt_count +
 * sizeof(struct mea) with max_cookiesize = 0.
 *
 * NOTE(review): the second message speaks of an "OST UUID list" although the
 * targets of this driver are MDC devices (LUSTRE_MDC_NAME elsewhere in the
 * file) -- looks copied from another driver; confirm and reword.
 * NOTE(review): the "UUID array size wrong" message prints sizeof(*uuids)
 * with "%u"; sizeof yields size_t, which is not unsigned int on every
 * platform -- the descriptor message above casts to int instead.
 * NOTE(review): ld_tgt_count comes from the config buffer and is multiplied
 * into allocation sizes without a visible overflow check.
 * NOTE(review): the error codes returned after each CERROR, and the copy of
 * *desc into lmv->desc, are not visible in these lines.
 */
374 static int lmv_setup(struct obd_device *obd, obd_count len, void *buf)
376 struct lustre_cfg *lcfg = buf;
377 struct lmv_desc *desc;
378 struct lmv_obd *lmv = &obd->u.lmv;
379 struct obd_uuid *uuids;
380 struct lmv_tgt_desc *tgts;
385 if (lcfg->lcfg_inllen1 < 1) {
386 CERROR("LMV setup requires a descriptor\n");
390 if (lcfg->lcfg_inllen2 < 1) {
391 CERROR("LMV setup requires an OST UUID list\n");
395 desc = (struct lmv_desc *)lcfg->lcfg_inlbuf1;
396 if (sizeof(*desc) > lcfg->lcfg_inllen1) {
397 CERROR("descriptor size wrong: %d > %d\n",
398 (int)sizeof(*desc), lcfg->lcfg_inllen1);
402 uuids = (struct obd_uuid *)lcfg->lcfg_inlbuf2;
/* the UUID buffer must be exactly one obd_uuid per declared target */
403 if (sizeof(*uuids) * desc->ld_tgt_count != lcfg->lcfg_inllen2) {
404 CERROR("UUID array size wrong: %u * %u != %u\n",
405 sizeof(*uuids), desc->ld_tgt_count, lcfg->lcfg_inllen2);
/* bufsize is kept so lmv_cleanup() can free with the same size */
409 lmv->bufsize = sizeof(struct lmv_tgt_desc) * desc->ld_tgt_count;
410 OBD_ALLOC(lmv->tgts, lmv->bufsize);
411 if (lmv->tgts == NULL) {
412 CERROR("Out of memory\n");
417 spin_lock_init(&lmv->lmv_lock);
419 for (i = 0, tgts = lmv->tgts; i < desc->ld_tgt_count; i++, tgts++)
420 tgts->uuid = uuids[i];
422 lmv->max_easize = sizeof(struct ll_fid) * desc->ld_tgt_count
423 + sizeof(struct mea);
424 lmv->max_cookiesize = 0;
/*
 * statfs over all targets.
 *
 * After lmv_check_connect(), obd_statfs() is called on each target's
 * exp_obd with @max_age; a failure is logged with the target index and
 * device name.  One result is copied wholesale into @osfs with memcpy() and
 * os_bavail, os_blocks, os_ffree and os_files of the others are added to it
 * (the condition selecting copy vs. add is not visible -- presumably the
 * first target is copied; confirm).
 *
 * NOTE(review): ltd_exp is dereferenced for every index, but
 * lmv_check_connect() leaves it NULL for the local target -- confirm this
 * cannot crash when called on an MDS.
 */
429 static int lmv_statfs(struct obd_device *obd, struct obd_statfs *osfs,
430 unsigned long max_age)
432 struct lmv_obd *lmv = &obd->u.lmv;
433 struct obd_statfs temp;
437 rc = lmv_check_connect(obd);
441 for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
442 rc = obd_statfs(lmv->tgts[i].ltd_exp->exp_obd, &temp, max_age);
444 CERROR("can't stat MDS #%d (%s)\n", i,
445 lmv->tgts[i].ltd_exp->exp_obd->obd_name);
449 memcpy(osfs, &temp, sizeof(temp));
451 osfs->os_bavail += temp.os_bavail;
452 osfs->os_blocks += temp.os_blocks;
453 osfs->os_ffree += temp.os_ffree;
454 osfs->os_files += temp.os_files;
/*
 * Cleanup hook: calls lmv_cleanup_objs() for @obd and then frees the target
 * table with the size recorded in lmv->bufsize by lmv_setup().
 * @flags is not used by the visible lines.
 */
460 static int lmv_cleanup(struct obd_device *obd, int flags)
462 struct lmv_obd *lmv = &obd->u.lmv;
464 lmv_cleanup_objs(obd);
465 OBD_FREE(lmv->tgts, lmv->bufsize);
/*
 * After lmv_check_connect(), forwards md_getstatus() for @fid to the export
 * of target 0.
 * NOTE(review): tgts[0].ltd_exp is NULL when target 0 is the local target
 * (see lmv_check_connect()) -- confirm callers never hit that case.
 */
469 static int lmv_getstatus(struct obd_export *exp, struct ll_fid *fid)
471 struct obd_device *obd = exp->exp_obd;
472 struct lmv_obd *lmv = &obd->u.lmv;
475 rc = lmv_check_connect(obd);
478 rc = md_getstatus(lmv->tgts[0].ltd_exp, fid);
/*
 * getattr for @fid: after lmv_check_connect(), the request is forwarded with
 * md_getattr() to the target indexed by fid->mds (asserted to be below
 * ld_tgt_count).
 *
 * lmv_grab_obj() is consulted for @fid; a non-NULL result is logged as
 * "(splitted)".  Gathering attributes from the slave objects of such a
 * directory is not implemented by the visible code (see the #warning).
 *
 * NOTE(review): 'obj' is used but its declaration, and any release of the
 * object returned by lmv_grab_obj(), are not visible here -- confirm.
 */
483 static int lmv_getattr(struct obd_export *exp, struct ll_fid *fid,
484 unsigned long valid, unsigned int ea_size,
485 struct ptlrpc_request **request)
487 struct obd_device *obd = exp->exp_obd;
488 struct lmv_obd *lmv = &obd->u.lmv;
489 int rc, i = fid->mds;
492 rc = lmv_check_connect(obd);
495 obj = lmv_grab_obj(obd, fid, 0);
496 CDEBUG(D_OTHER, "GETATTR for %lu/%lu/%lu %s\n",
497 (unsigned long) fid->mds,
498 (unsigned long) fid->id,
499 (unsigned long) fid->generation,
500 obj ? "(splitted)" : "");
502 LASSERT(fid->mds < lmv->desc.ld_tgt_count);
503 rc = md_getattr(lmv->tgts[i].ltd_exp, fid,
504 valid, ea_size, request);
505 if (rc == 0 && obj) {
506 /* we have to loop over dirobjs here and gather attrs
507 * for all the slaves */
508 #warning "attrs gathering here"
/*
 * After lmv_check_connect(), forwards md_change_cbdata() with iterator @it
 * and @data to the target indexed by fid->mds, which is asserted to be
 * below ld_tgt_count.
 * NOTE(review): the 'fid' parameter is used but its declaration line is not
 * visible here.
 */
514 static int lmv_change_cbdata(struct obd_export *exp,
516 ldlm_iterator_t it, void *data)
518 struct obd_device *obd = exp->exp_obd;
519 struct lmv_obd *lmv = &obd->u.lmv;
523 rc = lmv_check_connect(obd);
526 CDEBUG(D_OTHER, "CBDATA for %lu/%lu/%lu\n",
527 (unsigned long) fid->mds,
528 (unsigned long) fid->id,
529 (unsigned long) fid->generation);
530 LASSERT(fid->mds < lmv->desc.ld_tgt_count);
531 rc = md_change_cbdata(lmv->tgts[fid->mds].ltd_exp, fid, it, data);
/*
 * Name-based variant of lmv_change_cbdata(): applies md_change_cbdata() to
 * @cfid on the target responsible for entry @name (length @len) of the
 * directory @pfid.
 *
 * Both fids' mds fields are asserted to be below ld_tgt_count.  The parent
 * is looked up with lmv_grab_obj(); for a split directory the target index
 * is recomputed with raw_name2idx(obj->objcount, name, len).
 *
 * NOTE(review): the declaration and default assignment of 'mds' (presumably
 * pfid->mds, per the "default mds" comment) and the test on 'obj' are not
 * visible here -- confirm.
 */
535 static int lmv_change_cbdata_name(struct obd_export *exp, struct ll_fid *pfid,
536 char *name, int len, struct ll_fid *cfid,
537 ldlm_iterator_t it, void *data)
539 struct obd_device *obd = exp->exp_obd;
540 struct lmv_obd *lmv = &obd->u.lmv;
544 rc = lmv_check_connect(obd);
547 LASSERT(pfid->mds < lmv->desc.ld_tgt_count);
548 LASSERT(cfid->mds < lmv->desc.ld_tgt_count);
549 CDEBUG(D_OTHER, "CBDATA for %lu/%lu/%lu:%*s -> %lu/%lu/%lu\n",
550 (unsigned long) pfid->mds, (unsigned long) pfid->id,
551 (unsigned long) pfid->generation, len, name,
552 (unsigned long) cfid->mds, (unsigned long) cfid->id,
553 (unsigned long) cfid->generation);
555 /* this is default mds for directory name belongs to */
557 obj = lmv_grab_obj(obd, pfid, 0);
559 /* directory is splitted. look for right mds for this name */
560 mds = raw_name2idx(obj->objcount, name, len);
563 rc = md_change_cbdata(lmv->tgts[mds].ltd_exp, cfid, it, data);
/*
 * After lmv_check_connect(), forwards md_valid_attrs() for @fid to the
 * target indexed by fid->mds, which is asserted to be below ld_tgt_count.
 */
567 static int lmv_valid_attrs(struct obd_export *exp, struct ll_fid *fid)
569 struct obd_device *obd = exp->exp_obd;
570 struct lmv_obd *lmv = &obd->u.lmv;
573 rc = lmv_check_connect(obd);
576 CDEBUG(D_OTHER, "validate %lu/%lu/%lu\n",
577 (unsigned long) fid->mds,
578 (unsigned long) fid->id,
579 (unsigned long) fid->generation);
580 LASSERT(fid->mds < lmv->desc.ld_tgt_count);
581 rc = md_valid_attrs(lmv->tgts[fid->mds].ltd_exp, fid);
/*
 * Close: after lmv_check_connect(), forwards md_close() with @obdo, @och
 * and @request to the target indexed by obdo->o_mds, which is asserted to
 * be below ld_tgt_count.
 */
585 int lmv_close(struct obd_export *exp, struct obdo *obdo,
586 struct obd_client_handle *och,
587 struct ptlrpc_request **request)
589 struct obd_device *obd = exp->exp_obd;
590 struct lmv_obd *lmv = &obd->u.lmv;
591 int rc, i = obdo->o_mds;
593 rc = lmv_check_connect(obd);
596 LASSERT(i < lmv->desc.ld_tgt_count);
597 CDEBUG(D_OTHER, "CLOSE %lu/%lu/%lu\n", (unsigned long) obdo->o_mds,
598 (unsigned long) obdo->o_id, (unsigned long) obdo->o_generation);
599 rc = md_close(lmv->tgts[i].ltd_exp, obdo, och, request);
/*
 * Fetches the directory EA of @fid from its MDS and builds the local object
 * from it.
 *
 * Issues md_getattr() to the target indexed by fid->mds with
 * valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA and an EA size of
 * MEA_SIZE_LMV(lmv), converts the reply with mdc_req2lustre_md(), passes
 * md.mea to lmv_create_obj_from_attrs(), frees it with obd_free_memmd() and
 * finishes the request.  -ENODATA is one of the visible failure codes
 * (its condition is not visible -- presumably a reply without mea; confirm).
 *
 * NOTE(review): fid->mds is used as an index without a visible bound check,
 * and lmv_check_connect() is not called in the visible lines.
 */
603 int lmv_get_mea_and_update_object(struct obd_export *exp, struct ll_fid *fid)
605 struct obd_device *obd = exp->exp_obd;
606 struct lmv_obd *lmv = &obd->u.lmv;
607 struct ptlrpc_request *req = NULL;
613 mealen = MEA_SIZE_LMV(lmv);
615 valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
617 /* time to update mea of parent fid */
618 rc = md_getattr(lmv->tgts[fid->mds].ltd_exp, fid,
619 valid, mealen, &req);
621 CERROR("md_getattr() failed, rc = %d\n", rc);
625 rc = mdc_req2lustre_md(exp, req, 0, NULL, &md);
627 CERROR("mdc_req2lustre_md() failed, rc = %d\n", rc);
632 GOTO(cleanup, rc = -ENODATA);
634 rc = lmv_create_obj_from_attrs(exp, fid, md.mea);
635 obd_free_memmd(exp, (struct lov_stripe_md **) &md.mea);
639 ptlrpc_req_finished(req);
/*
 * Create an entry named op_data->name under the directory op_data->fid1.
 *
 * After lmv_check_connect() the active target count is tested (the branch
 * body is not visible).  If lmv_grab_obj() knows the parent as a split
 * directory, the slave is chosen with raw_name2idx() and op_data->fid1 is
 * replaced by that slave's fid.  md_create() is then sent to the target
 * indexed by op_data->fid1.mds.
 *
 * On the success path the reply body is fetched with lustre_msg_buf() and
 * asserted to either carry OBD_MD_MDS or name the same mds as fid1.
 * On -ERESTART (the directory was split meanwhile) the local object is
 * refreshed with lmv_get_mea_and_update_object() and the request finished;
 * whether the operation is retried is not visible here -- confirm.
 *
 * NOTE(review): op_data->fid1.mds indexes lmv->tgts without a visible bound
 * assertion, unlike the neighbouring functions.
 */
643 int lmv_create(struct obd_export *exp, struct mdc_op_data *op_data,
644 const void *data, int datalen, int mode, __u32 uid,
645 __u32 gid, __u64 rdev, struct ptlrpc_request **request)
647 struct obd_device *obd = exp->exp_obd;
648 struct lmv_obd *lmv = &obd->u.lmv;
649 struct mds_body *mds_body;
654 rc = lmv_check_connect(obd);
658 if (!lmv->desc.ld_active_tgt_count)
661 obj = lmv_grab_obj(obd, &op_data->fid1, 0);
663 mds = raw_name2idx(obj->objcount, op_data->name,
665 op_data->fid1 = obj->objs[mds].fid;
669 CDEBUG(D_OTHER, "CREATE '%*s' on %lu/%lu/%lu\n",
670 op_data->namelen, op_data->name,
671 (unsigned long) op_data->fid1.mds,
672 (unsigned long) op_data->fid1.id,
673 (unsigned long) op_data->fid1.generation);
674 rc = md_create(lmv->tgts[op_data->fid1.mds].ltd_exp, op_data, data,
675 datalen, mode, uid, gid, rdev, request);
677 if (*request == NULL)
679 mds_body = lustre_msg_buf((*request)->rq_repmsg, 0,
681 LASSERT(mds_body != NULL);
682 CDEBUG(D_OTHER, "created. id = %lu, generation = %lu, mds = %d\n",
683 (unsigned long) mds_body->fid1.id,
684 (unsigned long) mds_body->fid1.generation,
686 LASSERT(mds_body->valid & OBD_MD_MDS ||
687 mds_body->mds == op_data->fid1.mds);
688 } else if (rc == -ERESTART) {
689 /* directory got splitted. time to update local object
690 * and repeat the request with proper MDS */
691 rc = lmv_get_mea_and_update_object(exp, &op_data->fid1);
693 ptlrpc_req_finished(*request);
/*
 * After lmv_check_connect(), forwards md_done_writing() for @obdo to
 * target 0 unconditionally; selecting the proper target is an open FIXME.
 */
700 int lmv_done_writing(struct obd_export *exp, struct obdo *obdo)
702 struct obd_device *obd = exp->exp_obd;
703 struct lmv_obd *lmv = &obd->u.lmv;
706 rc = lmv_check_connect(obd);
710 /* FIXME: choose right MDC here */
711 rc = md_done_writing(lmv->tgts[0].ltd_exp, obdo);
/*
 * Lock enqueue: after lmv_check_connect(), forwards md_enqueue() with all
 * lock arguments to the target indexed by data->fid1.mds.
 *
 * If lmv_grab_obj() knows data->fid1 as a split directory, the slave is
 * chosen with raw_name2idx() on data->name and data->fid1 is replaced by
 * the slave's fid before forwarding.
 *
 * NOTE(review): data->fid1.mds indexes lmv->tgts without a visible bound
 * assertion; the guard around the split-directory branch is not visible.
 */
715 int lmv_enqueue(struct obd_export *exp, int lock_type,
716 struct lookup_intent *it, int lock_mode,
717 struct mdc_op_data *data, struct lustre_handle *lockh,
718 void *lmm, int lmmsize,
719 ldlm_completion_callback cb_completion,
720 ldlm_blocking_callback cb_blocking, void *cb_data)
722 struct obd_device *obd = exp->exp_obd;
723 struct lmv_obd *lmv = &obd->u.lmv;
728 rc = lmv_check_connect(obd);
733 obj = lmv_grab_obj(obd, &data->fid1, 0);
735 /* directory is splitted. look for
736 * right mds for this name */
737 mds = raw_name2idx(obj->objcount, (char *)data->name,
739 data->fid1 = obj->objs[mds].fid;
743 CDEBUG(D_OTHER, "ENQUEUE '%s' on %lu/%lu\n",
744 LL_IT2STR(it), (unsigned long) data->fid1.id,
745 (unsigned long) data->fid1.generation);
746 rc = md_enqueue(lmv->tgts[data->fid1.mds].ltd_exp, lock_type, it,
747 lock_mode, data, lockh, lmm, lmmsize, cb_completion,
748 cb_blocking, cb_data);
/*
 * getattr by name: looks @filename up in the directory @fid.
 *
 * The request goes to the target indexed by fid->mds with a copy of @fid;
 * for a directory known to lmv_grab_obj() as split, the slave index comes
 * from raw_name2idx(obj->objcount, filename, namelen - 1) and the slave's
 * fid is used instead.
 *
 * When the reply body carries OBD_MD_MDS (a cross-node reference), a second
 * md_getattr_name() with a NULL name is sent to tgts[rfid.mds] and the
 * first request is finished.  On -ERESTART the local object is refreshed
 * with lmv_get_mea_and_update_object() and the request is finished.
 *
 * NOTE(review): raw_name2idx() receives namelen - 1 here but the full
 * length in other callers in this file -- presumably namelen counts the
 * terminating NUL here; confirm.
 * NOTE(review): the visible lines do not show rfid being updated from the
 * reply before the second request, nor *request being replaced by 'req' --
 * confirm in the full source.
 */
753 int lmv_getattr_name(struct obd_export *exp, struct ll_fid *fid,
754 char *filename, int namelen, unsigned long valid,
755 unsigned int ea_size, struct ptlrpc_request **request)
757 struct obd_device *obd = exp->exp_obd;
758 struct lmv_obd *lmv = &obd->u.lmv;
759 struct ll_fid rfid = *fid;
760 int rc, mds = fid->mds;
761 struct mds_body *body;
764 rc = lmv_check_connect(obd);
768 obj = lmv_grab_obj(obd, fid, 0);
770 /* directory is splitted. look for right mds for this name */
771 mds = raw_name2idx(obj->objcount, filename, namelen - 1);
772 rfid = obj->objs[mds].fid;
775 CDEBUG(D_OTHER, "getattr_name for %*s on %lu/%lu/%lu -> %lu/%lu/%lu\n",
776 namelen, filename, (unsigned long) fid->mds,
777 (unsigned long) fid->id, (unsigned long) fid->generation,
778 (unsigned long) rfid.mds, (unsigned long) rfid.id,
779 (unsigned long) rfid.generation);
780 rc = md_getattr_name(lmv->tgts[mds].ltd_exp, &rfid, filename, namelen,
781 valid, ea_size, request);
783 /* this could be cross-node reference. in this case all
784 * we have right now is mds/ino/generation triple. we'd
785 * like to find other attributes */
786 body = lustre_msg_buf((*request)->rq_repmsg, 0, sizeof(*body));
787 LASSERT(body != NULL);
788 if (body->valid & OBD_MD_MDS) {
789 struct ptlrpc_request *req = NULL;
791 CDEBUG(D_OTHER, "request attrs for %lu/%lu/%lu\n",
792 (unsigned long) rfid.mds,
793 (unsigned long) rfid.id,
794 (unsigned long) rfid.generation);
795 rc = md_getattr_name(lmv->tgts[rfid.mds].ltd_exp, &rfid,
796 NULL, 1, valid, ea_size, &req);
797 ptlrpc_req_finished(*request);
800 } else if (rc == -ERESTART) {
801 /* directory got splitted. time to update local object
802 * and repeat the request with proper MDS */
803 rc = lmv_get_mea_and_update_object(exp, &rfid);
805 ptlrpc_req_finished(*request);
814 * llite passes the fid of the target inode in data->fid1 and
815 * the fid of the directory in data->fid2
/*
 * Link: after lmv_check_connect(), forwards md_link() to the target indexed
 * by data->fid1.mds.
 *
 * With a non-empty name ("usual link request") data->fid1 is looked up with
 * lmv_grab_obj(); for a split directory the slave is chosen with
 * raw_name2idx() and data->fid1 is replaced by the slave's fid.  With an
 * empty name the request is the MDS-to-MDS form that only bumps the link
 * count of the inode named by fid1.
 *
 * NOTE(review): 'rc' doubles as the slave index returned by raw_name2idx()
 * in the split branch; a dedicated variable would be clearer.
 * NOTE(review): data->fid1.mds indexes lmv->tgts without a visible bound
 * assertion.
 */
817 int lmv_link(struct obd_export *exp, struct mdc_op_data *data,
818 struct ptlrpc_request **request)
820 struct obd_device *obd = exp->exp_obd;
821 struct lmv_obd *lmv = &obd->u.lmv;
825 rc = lmv_check_connect(obd);
828 if (data->namelen != 0) {
829 /* usual link request */
830 obj = lmv_grab_obj(obd, &data->fid1, 0);
832 rc = raw_name2idx(obj->objcount, data->name,
834 data->fid1 = obj->objs[rc].fid;
837 CDEBUG(D_OTHER,"link %u/%u/%u:%*s to %u/%u/%u mds %d\n",
838 (unsigned) data->fid2.mds, (unsigned) data->fid2.id,
839 (unsigned) data->fid2.generation, data->namelen,
840 data->name, (unsigned) data->fid1.mds,
841 (unsigned) data->fid1.id,
842 (unsigned) data->fid1.generation, data->fid1.mds);
844 /* request from MDS to acquire i_links for inode by fid1 */
845 CDEBUG(D_OTHER, "inc i_nlinks for %u/%u/%u\n",
846 (unsigned) data->fid1.mds, (unsigned) data->fid1.id,
847 (unsigned) data->fid1.generation);
850 rc = md_link(lmv->tgts[data->fid1.mds].ltd_exp, data, request);
/*
 * Rename @old in directory data->fid1 to @new in directory data->fid2.
 *
 * The operation is logged, and a CWARN labelled "cross-node rename" is
 * emitted whenever fid_equal() reports the two directory fids as different.
 * After lmv_check_connect() one branch (its condition is not visible)
 * handles the MDS-to-MDS request that only creates the new name: the target
 * is data->fid2.mds.  Otherwise each directory is looked up with
 * lmv_grab_obj(); a split source directory redirects fid1 to the slave
 * chosen by the old name, a split destination redirects fid2 by the new
 * name, and the request goes to data->fid1.mds.  md_rename() is then sent
 * to tgts[mds].
 *
 * NOTE(review): the warning condition compares whole fids, so it fires for
 * any rename between two different directories, not only across MDSes --
 * confirm that is intended.
 * NOTE(review): the CDEBUG/CWARN at the top run before lmv_check_connect().
 */
854 int lmv_rename(struct obd_export *exp, struct mdc_op_data *data,
855 const char *old, int oldlen, const char *new, int newlen,
856 struct ptlrpc_request **request)
858 struct obd_device *obd = exp->exp_obd;
859 struct lmv_obd *lmv = &obd->u.lmv;
864 CDEBUG(D_OTHER, "rename %*s in %lu/%lu/%lu to %*s in %lu/%lu/%lu\n",
865 oldlen, old, (unsigned long) data->fid1.mds,
866 (unsigned long) data->fid1.id,
867 (unsigned long) data->fid1.generation,
868 newlen, new, (unsigned long) data->fid2.mds,
869 (unsigned long) data->fid2.id,
870 (unsigned long) data->fid2.generation);
871 if (!fid_equal(&data->fid1, &data->fid2))
872 CWARN("cross-node rename %lu/%lu/%lu:%*s to %lu/%lu/%lu:%*s\n",
873 (unsigned long) data->fid1.mds,
874 (unsigned long) data->fid1.id,
875 (unsigned long) data->fid1.generation, oldlen, old,
876 (unsigned long) data->fid2.mds,
877 (unsigned long) data->fid2.id,
878 (unsigned long) data->fid2.generation, newlen, new);
880 rc = lmv_check_connect(obd);
885 /* MDS with old dir entry is asking another MDS
886 * to create name there */
888 "create %*s(%d/%d) in %lu/%lu/%lu pointing to %lu/%lu/%lu\n",
889 newlen, new, oldlen, newlen,
890 (unsigned long) data->fid2.mds,
891 (unsigned long) data->fid2.id,
892 (unsigned long) data->fid2.generation,
893 (unsigned long) data->fid1.mds,
894 (unsigned long) data->fid1.id,
895 (unsigned long) data->fid1.generation);
896 mds = data->fid2.mds;
900 obj = lmv_grab_obj(obd, &data->fid1, 0);
902 /* directory is already splitted, so we have to forward
903 * request to the right MDS */
904 mds = raw_name2idx(obj->objcount, (char *)old, oldlen);
905 data->fid1 = obj->objs[mds].fid;
906 CDEBUG(D_OTHER, "forward to MDS #%u (%lu/%lu/%lu)\n", mds,
907 (unsigned long) obj->objs[mds].fid.mds,
908 (unsigned long) obj->objs[mds].fid.id,
909 (unsigned long) obj->objs[mds].fid.generation);
913 obj = lmv_grab_obj(obd, &data->fid2, 0);
915 /* directory is already splitted, so we have to forward
916 * request to the right MDS */
917 mds = raw_name2idx(obj->objcount, (char *)new, newlen);
918 data->fid2 = obj->objs[mds].fid;
919 CDEBUG(D_OTHER, "forward to MDS #%u (%lu/%lu/%lu)\n", mds,
920 (unsigned long) obj->objs[mds].fid.mds,
921 (unsigned long) obj->objs[mds].fid.id,
922 (unsigned long) obj->objs[mds].fid.generation);
/* the request is always sent to the MDS holding the (possibly redirected) source */
926 mds = data->fid1.mds;
929 rc = md_rename(lmv->tgts[mds].ltd_exp, data, old, oldlen,
930 new, newlen, request);
/*
 * setattr on data->fid1.
 *
 * After lmv_check_connect() the fid is looked up with lmv_grab_obj().  For a
 * split directory md_setattr() is issued once per slave object with
 * data->fid1 set to that slave's fid; the reply belonging to the master
 * object (slave fid equal to obj->fid) is the one meant to go back to the
 * caller, and other requests are finished.  For an ordinary object a single
 * md_setattr() goes to the target taken from data->fid1.mds and the reply
 * body's mds is asserted to equal that index.
 *
 * NOTE(review): inside the slave loop the target is lmv->tgts[i] with 'i'
 * being the slave's position in obj->objs, whereas other functions index by
 * fid.mds -- confirm slaves are always stored at the index of their MDS.
 * NOTE(review): 'req' is uninitialised on entry to the loop; how a failed
 * md_setattr() is handled is not visible here.
 */
934 int lmv_setattr(struct obd_export *exp, struct mdc_op_data *data,
935 struct iattr *iattr, void *ea, int ealen, void *ea2, int ea2len,
936 struct ptlrpc_request **request)
938 struct obd_device *obd = exp->exp_obd;
939 struct lmv_obd *lmv = &obd->u.lmv;
940 int rc = 0, i = data->fid1.mds;
941 struct ptlrpc_request *req;
942 struct mds_body *mds_body;
946 rc = lmv_check_connect(obd);
950 obj = lmv_grab_obj(obd, &data->fid1, 0);
951 CDEBUG(D_OTHER, "SETATTR for %lu/%lu/%lu, valid 0x%x%s\n",
952 (unsigned long) data->fid1.mds,
953 (unsigned long) data->fid1.id,
954 (unsigned long) data->fid1.generation, iattr->ia_valid,
955 obj ? ", splitted" : "");
957 for (i = 0; i < obj->objcount; i++) {
958 data->fid1 = obj->objs[i].fid;
959 rc = md_setattr(lmv->tgts[i].ltd_exp, data, iattr, ea,
960 ealen, ea2, ea2len, &req);
962 if (fid_equal(&obj->fid, &obj->objs[i].fid)) {
963 /* this is master object and this request
964 * should be returned back to llite */
967 ptlrpc_req_finished(req);
972 LASSERT(data->fid1.mds < lmv->desc.ld_tgt_count);
973 rc = md_setattr(lmv->tgts[i].ltd_exp, data, iattr, ea, ealen,
974 ea2, ea2len, request);
976 mds_body = lustre_msg_buf((*request)->rq_repmsg, 0,
978 LASSERT(mds_body != NULL);
979 LASSERT(mds_body->mds == i);
/*
 * After lmv_check_connect(), forwards md_sync() for @fid to target 0,
 * regardless of fid->mds.
 * NOTE(review): other per-fid operations in this file route by fid->mds --
 * confirm that always using target 0 is intended here.
 */
985 int lmv_sync(struct obd_export *exp, struct ll_fid *fid,
986 struct ptlrpc_request **request)
988 struct obd_device *obd = exp->exp_obd;
989 struct lmv_obd *lmv = &obd->u.lmv;
993 rc = lmv_check_connect(obd);
997 rc = md_sync(lmv->tgts[0].ltd_exp, fid, request);
/*
 * Blocking AST for locks taken on directory objects.
 *
 * LDLM_CB_BLOCKING:  converts the lock to a handle and cancels it with
 *                    ldlm_cli_cancel(), logging the result code.
 * LDLM_CB_CANCELING: reads the lmv object from lock->l_ast_data and logs
 *                    which lock ("LOOKUP" when lr_name.name[3] == 1,
 *                    otherwise "UPDATE") is going away; this is the point
 *                    where cached attributes of the dirobj get dropped.
 *
 * NOTE(review): the switch header, any NULL check of 'obj' before it is
 * dereferenced in the CDEBUG, and the statements that actually drop the
 * cached state are not visible here -- confirm.
 */
1001 int lmv_dirobj_blocking_ast(struct ldlm_lock *lock,
1002 struct ldlm_lock_desc *desc, void *data, int flag)
1004 struct lustre_handle lockh;
1005 struct lmv_obj *obj;
1010 case LDLM_CB_BLOCKING:
1011 ldlm_lock2handle(lock, &lockh);
1012 rc = ldlm_cli_cancel(&lockh);
1014 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
1018 case LDLM_CB_CANCELING:
1019 /* time to drop cached attrs for dirobj */
1020 obj = lock->l_ast_data;
1024 CDEBUG(D_OTHER, "cancel %s on %lu/%lu, master %lu/%lu/%lu\n",
1025 lock->l_resource->lr_name.name[3] == 1 ?
1026 "LOOKUP" : "UPDATE",
1027 (unsigned long) lock->l_resource->lr_name.name[0],
1028 (unsigned long) lock->l_resource->lr_name.name[1],
1029 (unsigned long) obj->fid.mds,
1030 (unsigned long) obj->fid.id,
1031 (unsigned long) obj->fid.generation);
/*
 * Walks the ext2-format directory entries stored in @page and singles out
 * the "." and ".." entries (name_len 1 or 2 consisting only of dots).
 * lmv_readpage() calls this for pages read from slave objects so that those
 * two names are not reported more than once per directory.
 *
 * The walk starts at offset 0, advances by each entry's little-endian
 * rec_len and stops once fewer than EXT2_DIR_REC_LEN(1) bytes of the
 * PAGE_CACHE_SIZE page remain.
 *
 * NOTE(review): the statement applied to a matching entry is not visible
 * here (presumably it invalidates the entry -- confirm).
 * NOTE(review): rec_len is taken from the page without validation; a value
 * of 0 would keep 'offs' from ever advancing -- confirm the page contents
 * are trusted or add a guard.
 */
1039 void lmv_remove_dots(struct page *page)
1041 char *kaddr = page_address(page);
1042 unsigned limit = PAGE_CACHE_SIZE;
1043 unsigned offs, rec_len;
1044 struct ext2_dir_entry_2 *p;
1046 for (offs = 0; offs <= limit - EXT2_DIR_REC_LEN(1); offs += rec_len) {
1047 p = (struct ext2_dir_entry_2 *)(kaddr + offs);
1048 rec_len = le16_to_cpu(p->rec_len);
1050 if ((p->name_len == 1 && p->name[0] == '.') ||
1051 (p->name_len == 2 && p->name[0] == '.' && p->name[1] == '.'))
/*
 * Read one directory page at @offset of the directory @mdc_fid into @page.
 *
 * After lmv_check_connect(), a directory known to lmv_grab_obj() as split
 * is treated as the concatenation of its slave objects: slaves are walked
 * in order, subtracting each slave's cached size from @offset until the
 * offset falls inside one, and the request is redirected to that slave's
 * fid.  md_readpage() is sent to tgts[rfid.mds].  If the page came from an
 * object other than @mdc_fid itself, lmv_remove_dots() is applied so "."
 * and ".." are not duplicated.
 *
 * NOTE(review): if @offset is not below the sum of all slave sizes, the
 * walk ends with i == obj->objcount and obj->objs[i] is read past the last
 * slave -- confirm callers cannot pass such an offset.
 * NOTE(review): the cached sizes are read without visible locking (see the
 * FIXME below).
 */
1056 int lmv_readpage(struct obd_export *exp, struct ll_fid *mdc_fid,
1057 __u64 offset, struct page *page,
1058 struct ptlrpc_request **request)
1060 struct obd_device *obd = exp->exp_obd;
1061 struct lmv_obd *lmv = &obd->u.lmv;
1062 struct ll_fid rfid = *mdc_fid;
1063 struct lmv_obj *obj;
1067 rc = lmv_check_connect(obd);
1071 LASSERT(mdc_fid->mds < lmv->desc.ld_tgt_count);
1072 CDEBUG(D_OTHER, "READPAGE at %llu from %lu/%lu/%lu\n",
1073 offset, (unsigned long) rfid.mds,
1074 (unsigned long) rfid.id,
1075 (unsigned long) rfid.generation);
1077 obj = lmv_grab_obj(obd, mdc_fid, 0);
1079 /* find dirobj containing page with requested offset */
1080 /* FIXME: what about protecting cached attrs here? */
1081 for (i = 0; i < obj->objcount; i++) {
1082 if (offset < obj->objs[i].size)
1084 offset -= obj->objs[i].size;
1086 rfid = obj->objs[i].fid;
1087 CDEBUG(D_OTHER, "forward to %lu/%lu/%lu with offset %lu\n",
1088 (unsigned long) rfid.mds,
1089 (unsigned long) rfid.id,
1090 (unsigned long) rfid.generation,
1091 (unsigned long) offset);
1093 rc = md_readpage(lmv->tgts[rfid.mds].ltd_exp, &rfid, offset, page, request);
1094 if (rc == 0 && !fid_equal(&rfid, mdc_fid)) {
1095 /* this page isn't from master object. to avoid
1096 * ./.. duplication in directory, we have to remove them
1097 * from all slave objects */
1098 lmv_remove_dots(page);
/*
 * Unlink: after lmv_check_connect(), forwards md_unlink() to the target
 * indexed by data->fid1.mds.
 *
 * With a non-empty name, data->fid1 is the parent directory; if
 * lmv_grab_obj() knows it as split, the slave is chosen with raw_name2idx()
 * and data->fid1 is replaced by the slave's fid.  With an empty name the
 * request only drops the link count of the inode named by fid1.
 *
 * NOTE(review): 'i' is printed in the first CDEBUG even when the directory
 * is not split; its declaration/initial value is not visible here.
 * NOTE(review): data->fid1.mds indexes lmv->tgts without a visible bound
 * assertion.
 */
1106 int lmv_unlink(struct obd_export *exp, struct mdc_op_data *data,
1107 struct ptlrpc_request **request)
1109 struct obd_device *obd = exp->exp_obd;
1110 struct lmv_obd *lmv = &obd->u.lmv;
1114 rc = lmv_check_connect(obd);
1118 if (data->namelen != 0) {
1119 struct lmv_obj *obj;
1120 obj = lmv_grab_obj(obd, &data->fid1, 0);
1122 i = raw_name2idx(obj->objcount, data->name,
1124 data->fid1 = obj->objs[i].fid;
1127 CDEBUG(D_OTHER, "unlink '%*s' in %lu/%lu/%lu -> %u\n",
1128 data->namelen, data->name,
1129 (unsigned long) data->fid1.mds,
1130 (unsigned long) data->fid1.id,
1131 (unsigned long) data->fid1.generation, i);
1133 CDEBUG(D_OTHER, "drop i_nlink on %lu/%lu/%lu\n",
1134 (unsigned long) data->fid1.mds,
1135 (unsigned long) data->fid1.id,
1136 (unsigned long) data->fid1.generation);
1138 rc = md_unlink(lmv->tgts[data->fid1.mds].ltd_exp, data, request);
/*
 * Returns the device behind target 0's export after lmv_check_connect();
 * a connect failure is reported as ERR_PTR(rc).
 * @name and @len are not used by the visible lines.
 * NOTE(review): tgts[0].ltd_exp is NULL when target 0 is the local target
 * (see lmv_check_connect()) -- confirm callers never hit that case.
 */
1142 struct obd_device *lmv_get_real_obd(struct obd_export *exp,
1143 char *name, int len)
1145 struct obd_device *obd = exp->exp_obd;
1146 struct lmv_obd *lmv = &obd->u.lmv;
1150 rc = lmv_check_connect(obd);
1152 RETURN(ERR_PTR(rc));
1153 obd = lmv->tgts[0].ltd_exp->exp_obd;
/*
 * Raises lmv->max_easize / lmv->max_cookiesize to @easize / @cookiesize
 * when the new values are larger, and pushes the sizes to every target with
 * obd_init_ea_size().  lmv->connected is tested against 0 before the
 * targets are touched (the branch body is not visible -- presumably the
 * function returns early; confirm).
 *
 * NOTE(review): 'rc' is overwritten on each iteration, so only the last
 * target's result survives (see the FIXME).
 * NOTE(review): ltd_exp is passed for every index although it is NULL for
 * the local target -- confirm obd_init_ea_size() tolerates that.
 * NOTE(review): how 'change' is used is not visible here.
 */
1158 int lmv_init_ea_size(struct obd_export *exp, int easize, int cookiesize)
1160 struct obd_device *obd = exp->exp_obd;
1161 struct lmv_obd *lmv = &obd->u.lmv;
1162 int i, rc = 0, change = 0;
1165 if (lmv->max_easize < easize) {
1166 lmv->max_easize = easize;
1169 if (lmv->max_cookiesize < cookiesize) {
1170 lmv->max_cookiesize = cookiesize;
1176 if (lmv->connected == 0)
1179 /* FIXME: error handling? */
1180 for (i = 0; i < lmv->desc.ld_tgt_count; i++)
1181 rc = obd_init_ea_size(lmv->tgts[i].ltd_exp, easize, cookiesize);
/*
 * Creates a single object on the target named by oa->o_mds.
 *
 * After lmv_check_connect(), @ea is asserted to be NULL and oa->o_mds to be
 * below ld_tgt_count; obd_create() is then called on that target with a
 * pointer to a stack-local lov_stripe_md standing in for the EA argument.
 */
1185 int lmv_obd_create_single(struct obd_export *exp, struct obdo *oa,
1186 struct lov_stripe_md **ea, struct obd_trans_info *oti)
1188 struct obd_device *obd = exp->exp_obd;
1189 struct lmv_obd *lmv = &obd->u.lmv;
1190 struct lov_stripe_md obj_md;
1191 struct lov_stripe_md *obj_mdp = &obj_md;
1195 rc = lmv_check_connect(obd);
1199 LASSERT(ea == NULL);
1200 LASSERT(oa->o_mds < lmv->desc.ld_tgt_count);
1202 rc = obd_create(lmv->tgts[oa->o_mds].ltd_exp, oa, &obj_mdp, oti);
1209 * to be called from MDS only
/*
 * Object creation entry point (o_create), intended for use by the MDS.
 *
 * After lmv_check_connect(), one path delegates to lmv_obd_create_single()
 * with a NULL EA (its condition is not visible -- presumably ea == NULL).
 * Otherwise a disk EA is obtained through obd_alloc_diskmd() and treated as
 * a struct mea: mea_count is clamped to ld_tgt_count (0 or too large means
 * "all targets"), mea_master starts at -1, and the targets are walked until
 * mea_count slots are filled.  A target with a NULL export is the local
 * (master) MDS: its slot receives the master fid and mea_master is set.
 * For other targets an object is created with obd_create() and the
 * resulting id/generation are stored in the slot together with the target
 * index.  The final slot count is asserted to equal mea_count.
 *
 * NOTE(review): obd_create() is issued on lmv->tgts[c] (the slot counter)
 * while the slot records 'i' (the target index) as its mds; the two differ
 * once a target has been skipped -- confirm which index is intended.
 * NOTE(review): the result of obd_create() is not handled in the visible
 * lines (see the FIXMEs), and how mfid.id is set is not visible.
 */
1211 int lmv_obd_create(struct obd_export *exp, struct obdo *oa,
1212 struct lov_stripe_md **ea, struct obd_trans_info *oti)
1214 struct obd_device *obd = exp->exp_obd;
1215 struct lmv_obd *lmv = &obd->u.lmv;
1221 rc = lmv_check_connect(obd);
1225 LASSERT(oa != NULL);
1228 rc = lmv_obd_create_single(exp, oa, NULL, oti);
1233 rc = obd_alloc_diskmd(exp, (struct lov_mds_md **)ea);
1234 LASSERT(*ea != NULL);
1237 mea = (struct mea *)*ea;
1239 mfid.generation = oa->o_generation;
1241 if (!mea->mea_count || mea->mea_count > lmv->desc.ld_tgt_count)
1242 mea->mea_count = lmv->desc.ld_tgt_count;
1244 mea->mea_master = -1;
1246 /* FIXME: error handling? */
1247 for (i = 0, c = 0; c < mea->mea_count &&
1248 i < lmv->desc.ld_tgt_count; i++) {
1249 struct lov_stripe_md obj_md;
1250 struct lov_stripe_md *obj_mdp = &obj_md;
1252 if (lmv->tgts[i].ltd_exp == NULL) {
1253 /* this is master MDS */
1254 mea->mea_fids[c].id = mfid.id;
1255 mea->mea_fids[c].generation = mfid.generation;
1256 mea->mea_fids[c].mds = i;
1257 mea->mea_master = i;
1262 /* "Master" MDS should always be part of stripped dir, so
1264 if (mea->mea_master == -1 && c == mea->mea_count - 1)
1267 oa->o_valid = OBD_MD_FLGENER | OBD_MD_FLTYPE | OBD_MD_FLMODE
1268 | OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLID;
1270 rc = obd_create(lmv->tgts[c].ltd_exp, oa, &obj_mdp, oti);
1271 /* FIXME: error handling here */
1274 mea->mea_fids[c].id = oa->o_id;
1275 mea->mea_fids[c].generation = oa->o_generation;
1276 mea->mea_fids[c].mds = i;
1278 CDEBUG(D_OTHER, "dirobj at mds %d: "LPU64"/%u\n",
1279 i, oa->o_id, oa->o_generation);
1281 LASSERT(c == mea->mea_count);
1282 CDEBUG(D_OTHER, "%d dirobjects created\n", (int) mea->mea_count);
/*
 * get_info handler.  Keys recognised by the visible code (both 6 bytes,
 * compared with memcmp()):
 *   "mdsize"  writes sizeof(struct ll_fid) * ld_tgt_count +
 *             sizeof(struct mea) to @val as __u32 and sets *vallen.
 *   "mdsnum"  searches the targets for the one whose UUID equals
 *             lmv->cluuid and sets *vallen to sizeof(__u32); the value
 *             written for a match is not visible here (presumably the
 *             target index -- confirm).
 * Any other key is logged as "invalid key".
 *
 * NOTE(review): the return codes of each branch and the assignment of 'lmv'
 * are not visible in these lines.
 */
1287 static int lmv_get_info(struct obd_export *exp, __u32 keylen,
1288 void *key, __u32 *vallen, void *val)
1290 struct obd_device *obd;
1291 struct lmv_obd *lmv;
1294 obd = class_exp2obd(exp);
1296 CDEBUG(D_IOCTL, "invalid client cookie "LPX64"\n",
1297 exp->exp_handle.h_cookie);
1302 if (keylen == 6 && memcmp(key, "mdsize", 6) == 0) {
1303 __u32 *mdsize = val;
1304 *vallen = sizeof(__u32);
1305 *mdsize = sizeof(struct ll_fid) * lmv->desc.ld_tgt_count
1306 + sizeof(struct mea);
1308 } else if (keylen == 6 && memcmp(key, "mdsnum", 6) == 0) {
1309 struct obd_uuid *cluuid = &lmv->cluuid;
1310 struct lmv_tgt_desc *tgts;
1311 __u32 *mdsnum = val;
1314 for (i = 0, tgts = lmv->tgts; i < lmv->desc.ld_tgt_count; i++, tgts++) {
1315 if (obd_uuid_equals(&tgts->uuid, cluuid)) {
1316 *vallen = sizeof(__u32);
1324 CDEBUG(D_IOCTL, "invalid key\n");
/*
 * set_info handler.  Keys recognised by the visible code:
 *   "client"     after lmv_check_connect(), the same key/value is passed to
 *                every target with obd_set_info().
 *   "inter_mds"  sets lmv->server_timeout and calls lmv_set_timeouts().
 *
 * NOTE(review): keys are matched with strcmp() after only checking that
 * keylen is at least the literal's length; lmv_get_info() uses an exact
 * length plus memcmp().  strcmp() relies on @key being NUL-terminated --
 * confirm callers guarantee that.
 * NOTE(review): the "client" loop passes tgts->ltd_exp for every target,
 * including the local one whose export is NULL.
 */
1328 int lmv_set_info(struct obd_export *exp, obd_count keylen,
1329 void *key, obd_count vallen, void *val)
1331 struct obd_device *obd;
1332 struct lmv_obd *lmv;
1335 obd = class_exp2obd(exp);
1337 CDEBUG(D_IOCTL, "invalid client cookie "LPX64"\n",
1338 exp->exp_handle.h_cookie);
1343 if (keylen >= strlen("client") && strcmp(key, "client") == 0) {
1344 struct lmv_tgt_desc *tgts;
1347 rc = lmv_check_connect(obd);
1351 for (i = 0, tgts = lmv->tgts;
1352 i < lmv->desc.ld_tgt_count; i++, tgts++) {
1353 rc = obd_set_info(tgts->ltd_exp, keylen, key, vallen, val);
1358 } else if (keylen >= strlen("inter_mds") && strcmp(key, "inter_mds") == 0) {
1359 lmv->server_timeout = 1;
1360 lmv_set_timeouts(obd);
/*
 * Pack hook (o_packmd) for the directory EA.
 *
 * The EA size is sizeof(struct ll_fid) * ld_tgt_count + sizeof(struct mea).
 * When *lmmp is set and @lsm is NULL the buffer is freed; otherwise a
 * buffer of that size is allocated into *lmmp and @lsm is copied into it
 * verbatim -- no format conversion is done yet (see the #warning).
 *
 * NOTE(review): the conditions guarding the allocation and the handling of
 * an allocation failure are not visible in these lines.
 */
1367 int lmv_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
1368 struct lov_stripe_md *lsm)
1370 struct obd_device *obd = class_exp2obd(exp);
1371 struct lmv_obd *lmv = &obd->u.lmv;
1375 mea_size = sizeof(struct ll_fid) *
1376 lmv->desc.ld_tgt_count + sizeof(struct mea);
1380 if (*lmmp && !lsm) {
1381 OBD_FREE(*lmmp, mea_size);
1387 OBD_ALLOC(*lmmp, mea_size);
1395 #warning "MEA packing/convertation must be here! -bzzz"
1396 memcpy(*lmmp, lsm, mea_size);
/*
 * Unpack hook (o_unpackmd) for the directory EA.
 *
 * The in-memory size is sizeof(struct ll_fid) * ld_tgt_count +
 * sizeof(struct mea).  @mem_tgt == NULL is tested first (the branch body is
 * not visible -- presumably it only reports the size; confirm).  When
 * *mem_tgt is set and @disk_src is NULL the buffer is freed.  Otherwise
 * @mdsize is asserted to equal the computed size, a buffer is allocated
 * and @disk_src is copied into it verbatim -- no format conversion is done
 * yet (see the #warning).
 *
 * NOTE(review): both a size mismatch and an allocation failure are handled
 * by LASSERT rather than an error return (see the FIXME).
 */
1400 int lmv_unpackmd(struct obd_export *exp, struct lov_stripe_md **mem_tgt,
1401 struct lov_mds_md *disk_src, int mdsize)
1403 struct obd_device *obd = class_exp2obd(exp);
1404 struct lmv_obd *lmv = &obd->u.lmv;
1405 struct mea **tmea = (struct mea **) mem_tgt;
1406 struct mea *mea = (void *) disk_src;
1410 mea_size = sizeof(struct ll_fid) *
1411 lmv->desc.ld_tgt_count + sizeof(struct mea);
1412 if (mem_tgt == NULL)
1415 if (*mem_tgt != NULL && disk_src == NULL) {
1416 OBD_FREE(*tmea, mea_size);
1420 LASSERT(mea_size == mdsize);
1422 OBD_ALLOC(*tmea, mea_size);
1423 /* FIXME: error handling here */
1424 LASSERT(*tmea != NULL);
1429 #warning "MEA unpacking/convertation must be here! -bzzz"
1430 memcpy(*tmea, mea, mdsize);
1434 int lmv_brw(int rw, struct obd_export *exp, struct obdo *oa,
1435 struct lov_stripe_md *ea, obd_count oa_bufs,
1436 struct brw_page *pgarr, struct obd_trans_info *oti)
1438 struct obd_device *obd = exp->exp_obd;
1439 struct lmv_obd *lmv = &obd->u.lmv;
1440 struct mea *mea = (struct mea *) ea;
1443 LASSERT(oa != NULL);
1444 LASSERT(ea != NULL);
1445 LASSERT(pgarr != NULL);
1446 LASSERT(oa->o_mds < lmv->desc.ld_tgt_count);
1448 oa->o_gr = mea->mea_fids[oa->o_mds].generation;
1449 oa->o_id = mea->mea_fids[oa->o_mds].id;
1450 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1451 err = obd_brw(rw, lmv->tgts[oa->o_mds].ltd_exp, oa,
1452 NULL, oa_bufs, pgarr, oti);
/*
 * OBD method table of the LMV device type; each slot points at the lmv_*
 * handler defined above.  Passed to class_register_type() by lmv_init().
 * NOTE(review): lmv_brw() is defined in this file but has no visible entry
 * in the lines shown here -- confirm it is wired up.
 */
1456 struct obd_ops lmv_obd_ops = {
1457 .o_owner = THIS_MODULE,
1458 .o_attach = lmv_attach,
1459 .o_detach = lmv_detach,
1460 .o_setup = lmv_setup,
1461 .o_cleanup = lmv_cleanup,
1462 .o_connect = lmv_connect,
1463 .o_disconnect = lmv_disconnect,
1464 .o_statfs = lmv_statfs,
1465 .o_get_info = lmv_get_info,
1466 .o_set_info = lmv_set_info,
1467 .o_create = lmv_obd_create,
1468 .o_packmd = lmv_packmd,
1469 .o_unpackmd = lmv_unpackmd,
1471 .o_init_ea_size = lmv_init_ea_size,
1472 .o_notify = lmv_notify,
/*
 * Metadata method table of the LMV device type; each slot points at the
 * lmv_* handler for that operation.  Passed to class_register_type() by
 * lmv_init().
 * NOTE(review): lmv_intent_lock is not defined in the lines shown here
 * (presumably declared in lmv_internal.h), and lmv_link()/lmv_sync() have
 * no visible entry -- confirm against the full source.
 */
1475 struct md_ops lmv_md_ops = {
1476 .m_getstatus = lmv_getstatus,
1477 .m_getattr = lmv_getattr,
1478 .m_change_cbdata = lmv_change_cbdata,
1479 .m_change_cbdata_name = lmv_change_cbdata_name,
1480 .m_close = lmv_close,
1481 .m_create = lmv_create,
1482 .m_done_writing = lmv_done_writing,
1483 .m_enqueue = lmv_enqueue,
1484 .m_getattr_name = lmv_getattr_name,
1485 .m_intent_lock = lmv_intent_lock,
1487 .m_rename = lmv_rename,
1488 .m_setattr = lmv_setattr,
1490 .m_readpage = lmv_readpage,
1491 .m_unlink = lmv_unlink,
1492 .m_get_real_obd = lmv_get_real_obd,
1493 .m_valid_attrs = lmv_valid_attrs,
/*
 * Module init: initialises the lprocfs variable set for "lmv" and registers
 * the device type under OBD_LMV_DEVICENAME with both method tables and the
 * module-level lprocfs variables.
 */
1496 int __init lmv_init(void)
1498 struct lprocfs_static_vars lvars;
1501 lprocfs_init_vars(lmv, &lvars);
1502 rc = class_register_type(&lmv_obd_ops, &lmv_md_ops,
1503 lvars.module_vars, OBD_LMV_DEVICENAME);
/* Module exit: unregisters the device type registered by lmv_init(). */
1508 static void lmv_exit(void)
1510 class_unregister_type(OBD_LMV_DEVICENAME);
/* Module metadata and the init/exit entry points of the LMV driver. */
1513 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
1514 MODULE_DESCRIPTION("Lustre Logical Metadata Volume OBD driver");
1515 MODULE_LICENSE("GPL");
1517 module_init(lmv_init);
1518 module_exit(lmv_exit);