1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * Copyright (C) 2002 Cluster File Systems, Inc.
7 * Author: Phil Schwan <phil@off.net>
8 * Peter Braam <braam@clusterfs.com>
10 * This code is issued under the GNU General Public License.
11 * See the file COPYING in this distribution
/* Debug subsystem tag for this file.  It is defined ahead of the includes,
 * presumably so the debug macros in those headers pick it up -- confirm. */
15 #define DEBUG_SUBSYSTEM S_LOV
17 #include <linux/slab.h>
18 #include <linux/module.h>
19 #include <linux/obd_support.h>
20 #include <linux/lustre_lib.h>
21 #include <linux/lustre_net.h>
22 #include <linux/lustre_idl.h>
23 #include <linux/lustre_mds.h>
24 #include <linux/obd_class.h>
25 #include <linux/obd_lov.h>
26 #include <linux/init.h>
/* Global OBD device table, defined elsewhere.
 * NOTE(review): no use of obd_dev is visible in this file; confirm the
 * declaration is still needed. */
28 extern struct obd_device obd_dev[MAX_OBD_DEVICES];
/*
 * Connect a client to the LOV device.
 *
 * Visible sequence: register the connection with class_connect(), open a
 * temporary connection to the MDC, fetch the LOV descriptor and the target
 * UUID array with mdc_getlovinfo(), validate both reply buffers, allocate
 * lov->tgts, then look up and connect to every target named in the array.
 *
 * conn    handle given to class_connect() and, later, class_disconnect()
 * obd     the LOV device; its u.lov state (desc, tgts, bufsize) is filled in
 * cluuid  forwarded unchanged to class_connect()
 *
 * Failures leave through GOTO(out, ...) / GOTO(out_mem, ...) with a negative
 * errno in rc (-EINVAL and -ENOMEM are the values set here).
 */
31 static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
34 struct ptlrpc_request *req;
35 struct lov_obd *lov = &obd->u.lov;
36 struct client_obd *mdc = &lov->mdcobd->u.cli;
37 struct lov_desc *desc = &lov->desc;
38 struct lustre_handle mdc_conn;
44 rc = class_connect(conn, obd, cluuid);
50 /* retrieve LOV metadata from MDS */
51 rc = obd_connect(&mdc_conn, lov->mdcobd, NULL);
53 CERROR("cannot connect to mdc: rc = %d\n", rc);
54 GOTO(out, rc = -EINVAL);
/* the temporary MDC connection is dropped right after the request, whatever
 * mdc_getlovinfo() returned; both results are reported together */
57 rc = mdc_getlovinfo(obd, &mdc_conn, &req);
58 rc2 = obd_disconnect(&mdc_conn);
60 CERROR("cannot get lov info or disconnect %d/%d\n", rc, rc2);
61 GOTO(out, (rc) ? rc : rc2 );
/* the reply must carry two buffers: [0] the descriptor, [1] the uuid array */
65 if (req->rq_repmsg->bufcount < 2 ||
66 req->rq_repmsg->buflens[0] < sizeof(*desc)) {
67 CERROR("invalid descriptor returned\n");
68 GOTO(out, rc = -EINVAL);
71 memcpy(desc, lustre_msg_buf(req->rq_repmsg, 0), sizeof(*desc));
/* buffer 1 must be large enough for ld_tgt_count uuid entries */
74 if (req->rq_repmsg->buflens[1] < sizeof(*uuidarray)*desc->ld_tgt_count){
75 CERROR("invalid uuid array returned\n");
76 GOTO(out, rc = -EINVAL);
/* same formula as lov_mds_md_size(): header plus one object id per target */
79 mdc->cl_max_mdsize = sizeof(struct lov_mds_md) +
80 desc->ld_tgt_count * sizeof(struct lov_object_id);
/* the descriptor must belong to this LOV device */
82 if (memcmp(obd->obd_uuid, desc->ld_uuid, sizeof(desc->ld_uuid))) {
83 CERROR("lov uuid %s not on mds device (%s)\n",
84 obd->obd_uuid, desc->ld_uuid);
85 GOTO(out, rc = -EINVAL);
/* NOTE(review): this sanity limit is tested only after ld_tgt_count has
 * already been used in the buffer-length check and the cl_max_mdsize
 * computation above. */
88 if (desc->ld_tgt_count > 1000) {
89 CERROR("configuration error: target count > 1000 (%d)\n",
91 GOTO(out, rc = -EINVAL);
/* one lov_tgt_desc per target; bufsize is kept for the matching OBD_FREE */
94 lov->bufsize = sizeof(struct lov_tgt_desc) * desc->ld_tgt_count;
95 OBD_ALLOC(lov->tgts, lov->bufsize);
97 CERROR("Out of memory\n");
98 GOTO(out, rc = -ENOMEM);
/* first pass: record every target uuid in the target table */
101 uuidarray = lustre_msg_buf(req->rq_repmsg, 1);
102 for (i = 0 ; i < desc->ld_tgt_count; i++)
103 memcpy(lov->tgts[i].uuid, uuidarray[i], sizeof(*uuidarray));
/* second pass: each uuid must resolve to a device that has OBD_SET_UP set
 * before it is connected */
105 for (i = 0 ; i < desc->ld_tgt_count; i++) {
106 struct obd_device *tgt = class_uuid2obd(uuidarray[i]);
108 CERROR("Target %s not attached\n", uuidarray[i]);
109 GOTO(out_mem, rc = -EINVAL);
111 if (!(tgt->obd_flags & OBD_SET_UP)) {
112 CERROR("Target %s not set up\n", uuidarray[i]);
113 GOTO(out_mem, rc = -EINVAL);
115 rc = obd_connect(&lov->tgts[i].conn, tgt, NULL);
117 CERROR("Target %s connect error %d\n",
/* NOTE(review): this loop disconnects all ld_tgt_count targets.  Presumably
 * it is the out_mem error cleanup; if so, targets that were never connected
 * are disconnected as well -- confirm. */
125 for (i = 0 ; i < desc->ld_tgt_count; i++) {
126 rc2 = obd_disconnect(&lov->tgts[i].conn);
128 CERROR("BAD: Target %s disconnect error %d\n",
131 OBD_FREE(lov->tgts, lov->bufsize);
/* NOTE(review): req has no initializer at its declaration, and the first
 * GOTO(out, ...) is taken before mdc_getlovinfo() assigns it; confirm the
 * exit path cannot hand an uninitialized pointer to ptlrpc_free_req(). */
135 class_disconnect(conn);
136 ptlrpc_free_req(req);
/*
 * Tear down a LOV connection: disconnect from every target in the target
 * table, free the table, then drop the class-level connection.
 *
 * conn  connection handle; the owning device is found via class_conn2obd()
 *
 * A failing target disconnect is logged with the target uuid and rc.
 */
140 static int lov_disconnect(struct lustre_handle *conn)
142 struct obd_device *obd = class_conn2obd(conn);
143 struct lov_obd *lov = &obd->u.lov;
150 for (i = 0 ; i < lov->desc.ld_tgt_count; i++) {
151 rc = obd_disconnect(&lov->tgts[i].conn);
153 CERROR("Target %s disconnect error %d\n",
154 lov->tgts[i].uuid, rc);
/* size recorded in lov->bufsize when the table was allocated */
158 OBD_FREE(lov->tgts, lov->bufsize);
163 rc = class_disconnect(conn);
/*
 * Set up a LOV device from an ioctl request.
 *
 * obd  device being set up; lov->mdcobd is filled in
 * len  length argument supplied with buf
 * buf  struct obd_ioctl_data whose first inline buffer holds the MDC uuid
 *
 * The uuid length (ioc_inllen1) must be between 1 and 37; the uuid is then
 * resolved to the MDC device with class_uuid2obd().
 */
169 static int lov_setup(struct obd_device *obd, obd_count len, void *buf)
171 struct obd_ioctl_data* data = buf;
172 struct lov_obd *lov = &obd->u.lov;
176 if (data->ioc_inllen1 < 1) {
/* message previously said "osc setup", copied from another driver */
177 CERROR("lov setup requires an MDC UUID\n");
181 if (data->ioc_inllen1 > 37) {
182 CERROR("mdc UUID must be less than 38 characters\n");
186 lov->mdcobd = class_uuid2obd(data->ioc_inlbuf1);
188 CERROR("LOV %s cannot locate MDC %s\n", obd->obd_uuid,
/*
 * Compute the size in bytes of a struct lov_stripe_md that has one
 * struct lov_oinfo slot for every target in the LOV descriptor.
 *
 * obd  the LOV device whose descriptor supplies ld_tgt_count
 */
196 static inline int lov_stripe_md_size(struct obd_device *obd)
198 struct lov_obd *lov = &obd->u.lov;
201 size = sizeof(struct lov_stripe_md) +
202 lov->desc.ld_tgt_count * sizeof(struct lov_oinfo);
/*
 * Compute the size in bytes of a struct lov_mds_md that has one
 * struct lov_object_id slot for every target in the LOV descriptor.
 *
 * obd  the LOV device whose descriptor supplies ld_tgt_count
 */
206 static inline int lov_mds_md_size(struct obd_device *obd)
208 struct lov_obd *lov = &obd->u.lov;
211 size = sizeof(struct lov_mds_md) +
212 lov->desc.ld_tgt_count * sizeof(struct lov_object_id);
/*
 * Create the per-stripe objects for a new LOV object.
 *
 * conn  connection to the LOV device
 * oa    "parent" object attributes; o_id names the LOV object and o_easize
 *       is set to the striping-metadata size
 * ea    out: striping metadata, allocated here with OBD_ALLOC
 *
 * Stripe count and stripe size fall back to the descriptor defaults when the
 * metadata holds zero for them.  One object is created on target i for each
 * stripe i, and its id and size are recorded in md->lmd_oinfo[i].  When a
 * create fails, the objects created so far are destroyed again.
 *
 * NOTE(review): the create loop indexes lov->tgts[] by stripe number; no
 * check that lmd_stripe_count <= ld_tgt_count is visible -- confirm.
 */
216 /* the LOV counts on oa->o_id to be set as the LOV object id */
217 static int lov_create(struct lustre_handle *conn, struct obdo *oa,
218 struct lov_stripe_md **ea)
222 struct obd_export *export = class_conn2export(conn);
224 struct lov_stripe_md *md;
228 CERROR("lov_create needs EA for striping information\n");
233 lov = &export->exp_obd->u.lov;
235 oa->o_easize = lov_stripe_md_size(export->exp_obd);
237 OBD_ALLOC(*ea, oa->o_easize);
243 md->lmd_easize = lov_mds_md_size(export->exp_obd);
244 md->lmd_object_id = oa->o_id;
245 if (!md->lmd_stripe_count)
246 md->lmd_stripe_count = lov->desc.ld_default_stripe_count;
248 if (!md->lmd_stripe_size)
249 md->lmd_stripe_size = lov->desc.ld_default_stripe_size;
253 for (i = 0; i < md->lmd_stripe_count; i++) {
254 struct lov_stripe_md obj_md;
255 struct lov_stripe_md *obj_mdp = &obj_md;
256 /* create data objects with "parent" OA */
257 memcpy(&tmp, oa, sizeof(tmp));
258 tmp.o_easize = sizeof(struct lov_stripe_md);
259 rc = obd_create(&lov->tgts[i].conn, &tmp, &obj_mdp);
261 GOTO(out_cleanup, rc);
262 md->lmd_oinfo[i].loi_id = tmp.o_id;
263 md->lmd_oinfo[i].loi_size = tmp.o_size;
/* roll back stripes 0 .. i-1; i is the stripe whose create failed */
269 for (i2 = 0; i2 < i; i2++) {
270 /* destroy already created objects here */
/* index with the rollback cursor i2: using i here named the failed stripe
 * on every pass and left the objects that were created in place */
271 tmp.o_id = md->lmd_oinfo[i2].loi_id;
272 rc2 = obd_destroy(&lov->tgts[i2].conn, &tmp, NULL);
274 CERROR("Failed to remove object from target "
/*
 * Destroy every stripe object of a LOV object.
 *
 * conn  connection to the LOV device
 * oa    "parent" object attributes, copied for each per-stripe request
 * md    striping metadata; its absence is reported as an error
 *
 * A failing per-stripe destroy is logged with the object id and stripe index.
 */
281 static int lov_destroy(struct lustre_handle *conn, struct obdo *oa,
282 struct lov_stripe_md *md)
286 struct obd_export *export = class_conn2export(conn);
291 CERROR("LOV requires striping ea for destruction\n");
295 if (!export || !export->exp_obd)
298 lov = &export->exp_obd->u.lov;
299 for (i = 0; i < md->lmd_stripe_count; i++) {
300 /* destroy stripe i using a copy of the "parent" OA with its own id */
301 memcpy(&tmp, oa, sizeof(tmp));
302 tmp.o_id = md->lmd_oinfo[i].loi_id;
303 rc = obd_destroy(&lov->tgts[i].conn, &tmp, NULL);
305 CERROR("Error destroying object %Ld on %d\n",
306 md->lmd_oinfo[i].loi_id, i);
/*
 * Gather the attributes of a LOV object from its stripe objects.
 *
 * conn  connection to the LOV device
 * oa    in: "parent" attributes copied into each request;
 *       out: merged result
 * md    striping metadata; its absence is reported as an error
 *
 * Merge rules visible below: o_size and o_blocks are summed, o_ctime and
 * o_mtime keep the largest value seen, each only when the stripe's o_valid
 * has the matching OBD_MD_FL* bit.  A stripe whose getattr fails is logged
 * and the loop continues.
 */
311 static int lov_getattr(struct lustre_handle *conn, struct obdo *oa,
312 struct lov_stripe_md *md)
316 struct obd_export *export = class_conn2export(conn);
322 CERROR("LOV requires striping ea\n");
326 if (!export || !export->exp_obd)
329 lov = &export->exp_obd->u.lov;
332 for (i = 0; i < md->lmd_stripe_count; i++) {
/* an object id of 0 is special-cased; presumably the stripe is skipped --
 * confirm */
335 if (md->lmd_oinfo[i].loi_id == 0)
338 /* query stripe i using a copy of the "parent" OA with its own id */
339 memcpy(&tmp, oa, sizeof(tmp));
340 tmp.o_id = md->lmd_oinfo[i].loi_id;
342 err = obd_getattr(&lov->tgts[i].conn, &tmp, NULL);
344 CERROR("Error getattr object %Ld on %d: err = %d\n",
345 md->lmd_oinfo[i].loi_id, i, err);
348 continue; /* XXX or break? */
/* wholesale copy of the stripe's valid attributes; presumably done for the
 * first answering stripe only, with later ones merged below -- confirm */
351 obdo_cpy_md(oa, &tmp, tmp.o_valid);
354 if (tmp.o_valid & OBD_MD_FLSIZE)
355 oa->o_size += tmp.o_size;
356 if (tmp.o_valid & OBD_MD_FLBLOCKS)
357 oa->o_blocks += tmp.o_blocks;
358 if (tmp.o_valid & OBD_MD_FLCTIME &&
359 oa->o_ctime < tmp.o_ctime)
360 oa->o_ctime = tmp.o_ctime;
361 if (tmp.o_valid & OBD_MD_FLMTIME &&
362 oa->o_mtime < tmp.o_mtime)
363 oa->o_mtime = tmp.o_mtime;
/*
 * Apply the attributes in oa to every stripe object of a LOV object.
 *
 * conn  connection to the LOV device
 * oa    attributes to set; copied for each per-stripe request
 * md    striping metadata; its absence is reported as an error
 */
369 static int lov_setattr(struct lustre_handle *conn, struct obdo *oa,
370 struct lov_stripe_md *md)
374 struct obd_export *export = class_conn2export(conn);
379 CERROR("LOV requires striping ea\n");
383 if (!export || !export->exp_obd)
386 lov = &export->exp_obd->u.lov;
387 for (i = 0; i < md->lmd_stripe_count; i++) {
388 /* update stripe i using a copy of the "parent" OA with its own id */
389 memcpy(&tmp, oa, sizeof(tmp));
390 tmp.o_id = md->lmd_oinfo[i].loi_id;
392 rc = obd_setattr(&lov->tgts[i].conn, &tmp, NULL);
394 CERROR("Error setattr object %Ld on %d\n",
/*
 * Open every stripe object of a LOV object.
 *
 * conn  connection to the LOV device
 * oa    "parent" object attributes, copied for each per-stripe request
 * md    striping metadata; its absence is reported as an error
 *
 * A failing per-stripe open is logged with the object id and stripe index.
 */
400 static int lov_open(struct lustre_handle *conn, struct obdo *oa,
401 struct lov_stripe_md *md)
403 int rc = 0, rc2 = 0, i;
405 struct obd_export *export = class_conn2export(conn);
410 CERROR("LOV requires striping ea for opening\n");
414 if (!export || !export->exp_obd)
417 lov = &export->exp_obd->u.lov;
418 for (i = 0; i < md->lmd_stripe_count; i++) {
419 /* open stripe i using a copy of the "parent" OA with its own id */
420 memcpy(&tmp, oa, sizeof(tmp));
421 tmp.o_id = md->lmd_oinfo[i].loi_id;
423 rc = obd_open(&lov->tgts[i].conn, &tmp, NULL);
426 CERROR("Error open object %Ld on %d\n",
427 md->lmd_oinfo[i].loi_id, i);
/*
 * Close every stripe object of a LOV object.
 *
 * conn  connection to the LOV device
 * oa    "parent" object attributes, copied for each per-stripe request
 * md    striping metadata; its absence is reported as an error
 *
 * A failing per-stripe close is logged with the object id and stripe index.
 */
433 static int lov_close(struct lustre_handle *conn, struct obdo *oa,
434 struct lov_stripe_md *md)
438 struct obd_export *export = class_conn2export(conn);
443 CERROR("LOV requires striping ea\n");
447 if (!export || !export->exp_obd)
450 lov = &export->exp_obd->u.lov;
451 for (i = 0; i < md->lmd_stripe_count; i++) {
452 /* close stripe i using a copy of the "parent" OA with its own id */
453 memcpy(&tmp, oa, sizeof(tmp));
454 tmp.o_id = md->lmd_oinfo[i].loi_id;
456 rc = obd_close(&lov->tgts[i].conn, &tmp, NULL);
458 CERROR("Error close object %Ld on %d\n",
459 md->lmd_oinfo[i].loi_id, i);
/* log2(n) expressed as ffz(~(n)).  Presumably ffz() is the kernel's
 * find-first-zero-bit helper, so this yields the index of the lowest set bit
 * of n and equals log2(n) only for powers of two -- confirm.
 * NOTE(review): no use of log2() is visible in this file. */
465 #define log2(n) ffz(~(n))
/*
 * md  striping metadata supplying lmd_stripe_size and lmd_stripe_count
 * in  offset within the whole LOV object
 * i   stripe index
 *
 * The all-ones value of "in" is passed through unchanged.  Otherwise the
 * result starts from (number of complete stripe rows) * stripe size, and a
 * further amount is added depending on where the remainder "off" lies
 * relative to stripe i's slot [i * ssz, (i + 1) * ssz) in the current row.
 *
 * NOTE(review): every use of "in" in the arithmetic goes through a (__u32)
 * cast and "out" is a __u32, so the upper 32 bits of the offset are
 * discarded.
 * NOTE(review): the divisor md->lmd_stripe_count * ssz is not checked for
 * zero in the visible code.
 */
468 /* compute offset in stripe i corresponding to offset "in" */
469 __u64 lov_offset(struct lov_stripe_md *md, __u64 in, int i)
471 __u32 ssz = md->lmd_stripe_size;
472 /* full stripes across all * stripe size */
473 __u32 out = ( ((__u32)in) / (md->lmd_stripe_count * ssz)) * ssz;
474 __u32 off = (__u32)in % (md->lmd_stripe_count * ssz);
/* all-ones is treated as a sentinel and returned as-is */
476 if ( in == 0xffffffffffffffff ) {
477 return 0xffffffffffffffff;
/* stripe i's slot ends at or before "off" in the current row */
480 if ( (i+1) * ssz <= off )
/* stripe i's slot starts after "off" in the current row */
482 else if ( i * ssz > off )
/* "off" falls inside stripe i's slot: add the distance into the slot */
485 out += (off - (i * ssz)) % ssz;
/*
 * md  striping metadata supplying lmd_stripe_size and lmd_stripe_count
 * in  offset within the whole LOV object
 * j   out: index of the stripe that "in" falls on
 *
 * NOTE(review): as in lov_offset(), "in" is cast to __u32 before every use,
 * so the upper 32 bits of the offset are discarded.
 */
490 /* find the stripe (*j) holding offset "in" and compute the offset within it */
491 __u64 lov_stripe(struct lov_stripe_md *md, __u64 in, int *j)
493 __u32 ssz = md->lmd_stripe_size;
495 /* stripe index: which stripe-sized chunk, modulo the stripe count */
496 *j = (((__u32) in)/ssz) % md->lmd_stripe_count;
497 off = (__u32)in % (md->lmd_stripe_count * ssz);
/* complete stripe rows * stripe size, plus the distance into stripe *j */
498 out = ( ((__u32)in) / (md->lmd_stripe_count * ssz)) * ssz +
499 (off - ((*j) * ssz)) % ssz;;
/*
 * Compute the index of the stripe that offset "in" falls on: the number of
 * the stripe-sized chunk containing it, modulo the stripe count.
 *
 * md  striping metadata supplying lmd_stripe_size and lmd_stripe_count
 * in  offset within the whole LOV object (cast to __u32 before use)
 */
504 int lov_stripe_which(struct lov_stripe_md *md, __u64 in)
506 __u32 ssz = md->lmd_stripe_size;
508 j = (((__u32) in) / ssz) % md->lmd_stripe_count;
/*
 * Punch (truncate) the range [start, end] of a LOV object on its stripes.
 *
 * conn   connection to the LOV device
 * oa     "parent" object attributes, copied for each per-stripe request
 * md     striping metadata; its absence is reported as an error
 * start  start of the range, as an offset in the whole object
 * end    end of the range, as an offset in the whole object
 *
 * Both ends are translated to stripe-local offsets with lov_offset() before
 * obd_punch() is issued for stripe i.  A failing per-stripe punch is logged
 * with the object id and stripe index.
 */
513 /* FIXME: maybe we'll just make one node the authoritative attribute node, then
514 * we can send this 'punch' to just the authoritative node and the nodes
515 * that the punch will affect. */
516 static int lov_punch(struct lustre_handle *conn, struct obdo *oa,
517 struct lov_stripe_md *md,
518 obd_off start, obd_off end)
522 struct obd_export *export = class_conn2export(conn);
/* message previously read "for desctruction", copied from the destroy path */
527 CERROR("LOV requires striping ea for punch\n");
531 if (!export || !export->exp_obd)
534 lov = &export->exp_obd->u.lov;
535 for (i = 0; i < md->lmd_stripe_count; i++) {
536 __u64 starti = lov_offset(md, start, i);
537 __u64 endi = lov_offset(md, end, i);
541 /* punch stripe i using a copy of the "parent" OA with its own id */
542 memcpy(&tmp, oa, sizeof(tmp));
543 tmp.o_id = md->lmd_oinfo[i].loi_id;
545 rc = obd_punch(&lov->tgts[i].conn, &tmp, NULL,
548 CERROR("Error punch object %Ld on %d\n",
549 md->lmd_oinfo[i].loi_id, i);
/*
 * Completion callback handed to each per-stripe obd_brw() by lov_brw().
 *
 * cbd    shared callback data carrying the refcount and the caller's
 *        callback (cb), data and err fields
 * err    status reported by the sub-request
 * phase  CB_PHASE_START or CB_PHASE_FINISH
 *
 * The caller's callback is invoked, with cbd->data and cbd->err, once the
 * refcount drops to zero; presumably this happens only in the
 * CB_PHASE_FINISH branch -- confirm.
 */
554 static int lov_osc_brw_callback(struct io_cb_data *cbd, int err, int phase)
559 if (phase == CB_PHASE_START)
562 if (phase == CB_PHASE_FINISH) {
565 if (atomic_dec_and_test(&cbd->refcount))
566 ret = cbd->cb(cbd->data, cbd->err, phase);
/*
 * Split a bulk read/write over the stripes of a LOV object.
 *
 * cmd       passed through to obd_brw()
 * conn      connection to the LOV device
 * md        striping metadata of the object
 * pga       array of oa_bufs pages with object-wide offsets
 * callback  caller's completion callback
 * cbd       caller's callback data
 *
 * Visible steps:
 *  1. count how many pages fall on each stripe (lov_stripe_which() on each
 *     pga[i].off);
 *  2. derive each stripe's starting index in ioarr from the previous
 *     stripe's index + bufct;
 *  3. copy every page into its stripe's slice of ioarr, rewriting .off to
 *     the stripe-local offset from lov_offset();
 *  4. set our_cb->refcount to the number of stripes that have pages and
 *     issue one obd_brw() per such stripe with lov_osc_brw_callback;
 *  5. invoke the caller's callback with CB_PHASE_START, then free the
 *     temporary arrays.
 *
 * NOTE(review): the value returned by obd_brw() is not stored.
 * NOTE(review): ioarr and stripeinfo are freed at the end while the
 * sub-requests were handed pointers into them; presumably the
 * CB_PHASE_START callback waits for completion first -- confirm.
 * NOTE(review): our_cb is also freed at the end; presumably only on the
 * error path, since the sub-request callbacks still use it -- confirm.
 */
574 static inline int lov_brw(int cmd, struct lustre_handle *conn,
575 struct lov_stripe_md *md,
577 struct brw_page *pga,
578 brw_callback_t callback, struct io_cb_data *cbd)
580 int stripe_count = md->lmd_stripe_count;
581 struct obd_export *export = class_conn2export(conn);
/* member of the per-stripe bookkeeping record (used as stripeinfo[i].md) */
587 struct lov_stripe_md md;
589 struct brw_page *ioarr;
591 struct io_cb_data *our_cb;
594 lov = &export->exp_obd->u.lov;
596 our_cb = ll_init_cb();
600 OBD_ALLOC(stripeinfo, stripe_count * sizeof(*stripeinfo));
602 GOTO(out_cbdata, rc = -ENOMEM);
604 OBD_ALLOC(ioarr, sizeof(*ioarr) * oa_bufs);
606 GOTO(out_sinfo, rc = -ENOMEM);
/* pass 1: per-stripe page counts */
608 for (i = 0; i < oa_bufs; i++) {
610 which = lov_stripe_which(md, pga[i].off);
611 stripeinfo[which].bufct++;
/* pass 2: start index of each stripe's slice, plus its object id.
 * NOTE(review): stripeinfo[i - 1] is read in a loop that starts at i = 0;
 * presumably a guard on i precedes it -- confirm. */
614 for (i = 0; i < stripe_count; i++) {
616 stripeinfo[i].index = stripeinfo[i - 1].index +
617 stripeinfo[i - 1].bufct;
618 stripeinfo[i].md.lmd_object_id = md->lmd_oinfo[i].loi_id;
/* pass 3: place each page in its stripe's slice with a stripe-local offset;
 * subcount is the number of pages already placed for that stripe */
621 for (i = 0; i < oa_bufs; i++) {
623 which = lov_stripe_which(md, pga[i].off);
625 shift = stripeinfo[which].index;
626 LASSERT(shift + stripeinfo[which].subcount < oa_bufs);
627 ioarr[shift + stripeinfo[which].subcount] = pga[i];
628 ioarr[shift + stripeinfo[which].subcount].off =
629 lov_offset(md, pga[i].off, which);
630 stripeinfo[which].subcount++;
/* the caller's callback is stored so lov_osc_brw_callback() can run it
 * after the last sub-request finishes */
633 our_cb->cb = callback;
636 /* This is the only race-free way I can think of to get the refcount
638 atomic_set(&our_cb->refcount, 0);
639 for (i = 0; i < stripe_count; i++)
640 if (stripeinfo[i].bufct)
641 atomic_inc(&our_cb->refcount);
643 for (i = 0; i < stripe_count; i++) {
644 int shift = stripeinfo[i].index;
645 if (stripeinfo[i].bufct) {
646 LASSERT(shift < oa_bufs);
647 obd_brw(cmd, &lov->tgts[i].conn, &stripeinfo[i].md,
648 stripeinfo[i].bufct, &ioarr[shift],
649 lov_osc_brw_callback, our_cb);
653 rc = callback(cbd, 0, CB_PHASE_START);
655 OBD_FREE(ioarr, sizeof(*ioarr) * oa_bufs);
657 OBD_FREE(stripeinfo, stripe_count * sizeof(*stripeinfo));
659 OBD_FREE(our_cb, sizeof(*our_cb));
/*
 * Enqueue a lock on each stripe of a LOV object.
 *
 * conn         connection to the LOV device
 * md           striping metadata; its absence is reported as an error
 * parent_lock  passed through to obd_enqueue()
 * type         lock type, passed through
 * cookie       treated as a struct ldlm_extent in object-wide offsets
 * cookielen    length of cookie (the sub-requests send sizeof(sub_ext))
 * mode, flags, cb, data, datalen  passed through to obd_enqueue()
 * lockhs       out: array of lock handles, slot i filled for stripe i
 *
 * For each stripe the extent is translated with lov_offset(); the case where
 * the translated start equals the translated end is special-cased
 * (presumably the stripe is skipped -- confirm).
 *
 * NOTE(review): cookie is cast to an extent without looking at "type".
 */
663 static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *md,
664 struct lustre_handle *parent_lock,
665 __u32 type, void *cookie, int cookielen, __u32 mode,
666 int *flags, void *cb, void *data, int datalen,
667 struct lustre_handle *lockhs)
670 struct obd_export *export = class_conn2export(conn);
672 struct lov_stripe_md submd;
/* message previously read "for desctruction", copied from the destroy path */
676 CERROR("LOV requires striping ea for lock enqueue\n");
680 if (!export || !export->exp_obd)
683 lov = &export->exp_obd->u.lov;
684 for (i = 0; i < md->lmd_stripe_count; i++) {
685 struct ldlm_extent *extent = (struct ldlm_extent *)cookie;
686 struct ldlm_extent sub_ext;
688 sub_ext.start = lov_offset(md, extent->start, i);
689 sub_ext.end = lov_offset(md, extent->end, i);
690 if ( sub_ext.start == sub_ext.end )
693 submd.lmd_object_id = md->lmd_oinfo[i].loi_id;
694 submd.lmd_easize = sizeof(struct lov_mds_md);
695 submd.lmd_stripe_count = md->lmd_stripe_count;
696 /* XXX submd is not fully initialized here */
697 rc = obd_enqueue(&(lov->tgts[i].conn), &submd, parent_lock,
698 type, &sub_ext, sizeof(sub_ext), mode,
699 flags, cb, data, datalen, &(lockhs[i]));
700 // XXX add a lock debug statement here
702 CERROR("Error obd_enqueue object %Ld subobj %Ld\n",
703 md->lmd_object_id, md->lmd_oinfo[i].loi_id);
/*
 * Cancel the per-stripe locks of a LOV object.
 *
 * conn    connection to the LOV device
 * md      striping metadata; its absence is reported as an error
 * mode    lock mode, passed through to obd_cancel()
 * lockhs  array of lock handles, one slot per stripe
 *
 * A failing per-stripe cancel is logged with the LOV object id and the
 * stripe object id.
 */
708 static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *md,
709 __u32 mode, struct lustre_handle *lockhs)
712 struct obd_export *export = class_conn2export(conn);
717 CERROR("LOV requires striping ea for lock cancellation\n");
721 if (!export || !export->exp_obd)
724 lov = &export->exp_obd->u.lov;
725 for (i = 0; i < md->lmd_stripe_count; i++) {
726 struct lov_stripe_md submd;
/* a handle whose addr is 0 is special-cased; presumably no lock was taken
 * on that stripe and it is skipped -- confirm */
728 if ( lockhs[i].addr == 0 )
/* only the object id and easize of submd are set before the call */
731 submd.lmd_object_id = md->lmd_oinfo[i].loi_id;
732 submd.lmd_easize = sizeof(struct lov_mds_md);
733 rc = obd_cancel(&lov->tgts[i].conn, &submd, mode, &lockhs[i]);
735 CERROR("Error cancel object %Ld subobj %Ld\n",
736 md->lmd_object_id, md->lmd_oinfo[i].loi_id);
/*
 * Aggregate filesystem statistics over all targets of the LOV.
 *
 * conn  connection to the LOV device
 * sfs   out: combined statistics
 *
 * Each target is queried with obd_statfs().  One result is copied into sfs
 * wholesale (presumably the first successful one -- confirm); for the others
 * f_bfree, f_bavail and f_blocks are added to the running totals.  A target
 * whose statfs fails is logged and the loop continues.
 */
741 static int lov_statfs(struct lustre_handle *conn, struct statfs *sfs)
743 struct obd_export *export = class_conn2export(conn);
745 struct statfs lov_sfs;
751 if (!export || !export->exp_obd)
754 lov = &export->exp_obd->u.lov;
756 /* We only get block data from the OBD */
757 for (i = 0 ; i < lov->desc.ld_tgt_count; i++) {
760 err = obd_statfs(&lov->tgts[i].conn, &lov_sfs);
762 CERROR("Error statfs OSC %s on %d: err = %d\n",
763 lov->tgts[i].uuid, i, err);
766 continue; /* XXX or break? - probably OK to continue */
769 memcpy(sfs, &lov_sfs, sizeof(lov_sfs));
772 sfs->f_bfree += lov_sfs.f_bfree;
773 sfs->f_bavail += lov_sfs.f_bavail;
774 sfs->f_blocks += lov_sfs.f_blocks;
775 /* XXX not sure about this one - depends on policy.
776 * - could be minimum if we always stripe on all OBDs
777 * (but that would be wrong for any other policy,
778 * if one of the OBDs has no more objects left)
779 * - could be sum if we stripe whole objects
780 * - could be average, just to give a nice number
781 * - we just pick first OST and hope it is enough
782 sfs->f_ffree += lov_sfs.f_ffree;
/* Method table for the LOV device type; lov_init() passes it to
 * class_register_type().  Entries use the GNU "label:" initializer form. */
790 struct obd_ops lov_obd_ops = {
792 o_connect: lov_connect,
793 o_disconnect: lov_disconnect,
794 o_create: lov_create,
795 o_destroy: lov_destroy,
796 o_getattr: lov_getattr,
797 o_setattr: lov_setattr,
798 o_statfs: lov_statfs,
803 o_enqueue: lov_enqueue,
/* Version string shown in the load-time banner printed by lov_init(). */
808 #define LOV_VERSION "v0.1"
/*
 * Module entry point: print the banner and register the LOV method table
 * under OBD_LOV_DEVICENAME.  Returns the result of class_register_type().
 */
810 static int __init lov_init(void)
812 printk(KERN_INFO "Lustre Logical Object Volume driver " LOV_VERSION
813 ", info@clusterfs.com\n");
814 return class_register_type(&lov_obd_ops, OBD_LOV_DEVICENAME);
/* Module exit point: unregister the type that lov_init() registered under
 * OBD_LOV_DEVICENAME. */
817 static void __exit lov_exit(void)
819 class_unregister_type(OBD_LOV_DEVICENAME);
/* Module metadata and entry/exit hooks.
 * NOTE(review): the description repeats "v0.1" literally instead of using
 * LOV_VERSION, so the two can drift apart. */
822 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
823 MODULE_DESCRIPTION("Lustre Logical Object Volume OBD driver v0.1");
824 MODULE_LICENSE("GPL");
826 module_init(lov_init);
827 module_exit(lov_exit);