1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * Copyright (C) 2002 Cluster File Systems, Inc.
7 * Author: Phil Schwan <phil@off.net>
8 * Peter Braam <braam@clusterfs.com>
10 * This code is issued under the GNU General Public License.
11 * See the file COPYING in this distribution
15 #define DEBUG_SUBSYSTEM S_LOV
17 #include <linux/slab.h>
18 #include <linux/module.h>
19 #include <linux/obd_support.h>
20 #include <linux/lustre_lib.h>
21 #include <linux/lustre_net.h>
22 #include <linux/lustre_idl.h>
23 #include <linux/lustre_mds.h>
24 #include <linux/obd_class.h>
25 #include <linux/obd_lov.h>
26 #include <linux/init.h>
27 #include <asm/div64.h>
/*
 * lov_connect - attach a client connection to this LOV device.
 *
 * Registers the connection with class_connect() and then sets up the
 * underlying connections, which per the comment below should only be done
 * once (tracked through lov->refcount).  That setup consists of:
 *   - connecting to the MDC and fetching the LOV descriptor plus the target
 *     UUID array with mdc_getlovinfo();
 *   - sanity-checking the reply (buffer count and sizes, LOV uuid, target
 *     count, default stripe width);
 *   - allocating lov->tgts and copying the target UUIDs into it;
 *   - connecting to every target, registering this LOV with it through
 *     IOC_OSC_REGISTER_LOV and marking it active.
 *
 * Failures jump to the out_conn / out_disc labels; the tail of the function
 * disconnects targets, frees lov->tgts and calls class_disconnect().
 */
30 static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
31 obd_uuid_t cluuid, struct recovd_obd *recovd,
32 ptlrpc_recovery_cb_t recover)
34 struct ptlrpc_request *req = NULL;
35 struct lov_obd *lov = &obd->u.lov;
36 struct client_obd *mdc = &lov->mdcobd->u.cli;
37 struct lov_desc *desc = &lov->desc;
38 struct lustre_handle mdc_conn;
39 obd_uuid_t *uuidarray;
/* Register the caller's connection with the generic class layer first. */
43 rc = class_connect(conn, obd, cluuid);
49 /* We don't want to actually do the underlying connections more than
50 * once, so keep track. */
52 if (lov->refcount > 1)
55 /* retrieve LOV metadata from MDS */
56 rc = obd_connect(&mdc_conn, lov->mdcobd, NULL, recovd, recover);
58 CERROR("cannot connect to mdc: rc = %d\n", rc);
/* The MDC connection is dropped right after the request; the results of
 * both calls are examined afterwards. */
62 rc = mdc_getlovinfo(obd, &mdc_conn, &req);
63 rc2 = obd_disconnect(&mdc_conn);
65 CERROR("cannot get lov info %d\n", rc);
70 CERROR("error disconnecting from MDS %d\n", rc2);
71 GOTO(out_conn, rc = rc2);
/* Reply buffer 0 must hold a lov_desc, buffer 1 the target UUID array. */
75 if (req->rq_repmsg->bufcount < 2 ||
76 req->rq_repmsg->buflens[0] < sizeof(*desc)) {
77 CERROR("LOV desc: invalid descriptor returned\n");
78 GOTO(out_conn, rc = -EINVAL);
81 memcpy(desc, lustre_msg_buf(req->rq_repmsg, 0), sizeof(*desc));
/* NOTE(review): desc->ld_tgt_count is used in this size check and in the
 * easize computations before it is range-checked against 1000 further
 * down -- confirm the ordering is intended. */
84 if (req->rq_repmsg->buflens[1] < sizeof(*uuidarray)*desc->ld_tgt_count){
85 CERROR("LOV desc: invalid uuid array returned\n");
86 GOTO(out_conn, rc = -EINVAL);
89 mdc->cl_max_mds_easize = lov_mds_md_size(desc->ld_tgt_count);
90 mdc->cl_max_ost_easize = lov_stripe_md_size(desc->ld_tgt_count);
/* The descriptor handed back must carry this device's own uuid. */
92 if (memcmp(obd->obd_uuid, desc->ld_uuid, sizeof(desc->ld_uuid))) {
93 CERROR("LOV desc: uuid %s not on mds device (%s)\n",
94 obd->obd_uuid, desc->ld_uuid);
95 GOTO(out_conn, rc = -EINVAL);
98 if (desc->ld_tgt_count > 1000) {
99 CERROR("LOV desc: target count > 1000 (%d)\n",
101 GOTO(out_conn, rc = -EINVAL);
104 /* Because of 64-bit divide/mod operations only work with a 32-bit
105 * divisor in a 32-bit kernel, we cannot support a stripe width
106 * of 4GB or larger on 32-bit CPUs.
108 if ((desc->ld_default_stripe_count ?
109 desc->ld_default_stripe_count : desc->ld_tgt_count) *
110 desc->ld_default_stripe_size > ~0UL) {
111 CERROR("LOV: stripe width "LPU64"x%u > %lu on 32-bit system\n",
112 desc->ld_default_stripe_size,
113 desc->ld_default_stripe_count ?
114 desc->ld_default_stripe_count : desc->ld_tgt_count,~0UL);
115 GOTO(out_conn, rc = -EINVAL);
118 lov->bufsize = sizeof(struct lov_tgt_desc) * desc->ld_tgt_count;
119 OBD_ALLOC(lov->tgts, lov->bufsize);
121 CERROR("Out of memory\n");
122 GOTO(out_conn, rc = -ENOMEM);
125 uuidarray = lustre_msg_buf(req->rq_repmsg, 1);
126 for (i = 0; i < desc->ld_tgt_count; i++)
127 memcpy(lov->tgts[i].uuid, uuidarray[i], sizeof(*uuidarray));
129 for (i = 0; i < desc->ld_tgt_count; i++) {
130 struct obd_device *tgt = class_uuid2obd(uuidarray[i]);
132 CERROR("Target %s not attached\n", uuidarray[i]);
133 GOTO(out_disc, rc = -EINVAL);
135 if (!(tgt->obd_flags & OBD_SET_UP)) {
136 CERROR("Target %s not set up\n", uuidarray[i]);
137 GOTO(out_disc, rc = -EINVAL);
139 rc = obd_connect(&lov->tgts[i].conn, tgt, NULL, recovd,
142 CERROR("Target %s connect error %d\n",
146 rc = obd_iocontrol(IOC_OSC_REGISTER_LOV, &lov->tgts[i].conn,
147 sizeof(struct obd_device *), obd, NULL);
149 CERROR("Target %s REGISTER_LOV error %d\n",
153 desc->ld_active_tgt_count++;
154 lov->tgts[i].active = 1;
158 ptlrpc_req_finished(req);
163 desc->ld_active_tgt_count--;
164 lov->tgts[i].active = 0;
165 rc2 = obd_disconnect(&lov->tgts[i].conn);
167 CERROR("LOV Target %s disconnect error: rc = %d\n",
170 OBD_FREE(lov->tgts, lov->bufsize);
172 class_disconnect(conn);
/*
 * lov_disconnect - drop one client connection from this LOV device.
 *
 * The underlying target connections are only meant to be torn down on the
 * final disconnect (tracked through lov->refcount).  That teardown walks all
 * lov->desc.ld_tgt_count targets, logs the ones already marked inactive, and
 * for the others decrements ld_active_tgt_count, clears the active flag and
 * calls obd_disconnect().  lov->tgts is then freed and the connection itself
 * is released with class_disconnect().
 */
176 static int lov_disconnect(struct lustre_handle *conn)
178 struct obd_device *obd = class_conn2obd(conn);
179 struct lov_obd *lov = &obd->u.lov;
185 /* Only disconnect the underlying layers on the final disconnect. */
187 if (lov->refcount != 0)
190 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
191 if (!lov->tgts[i].active) {
192 CERROR("Skipping disconnect for inactive OSC %s\n",
/* The target is marked inactive before its connection is dropped. */
197 lov->desc.ld_active_tgt_count--;
198 lov->tgts[i].active = 0;
199 rc = obd_disconnect(&lov->tgts[i].conn);
201 CERROR("Target %s disconnect error %d\n",
202 lov->tgts[i].uuid, rc);
/* Release the target table sized by lov_connect() (lov->bufsize bytes). */
206 OBD_FREE(lov->tgts, lov->bufsize);
211 rc = class_disconnect(conn);
/* Error codes returned by lov_set_osc_active():
 *
 * -EINVAL  : UUID can't be found in the LOV's target list
 * -ENOTCONN: The UUID is found, but the target connection is bad (!)
 * -EBADF   : The UUID is found, but the OBD is the wrong type (!)
 * -EALREADY: The OSC is already marked (in)active
 */
/*
 * lov_set_osc_active - mark one target of the LOV active or inactive.
 *
 * Looks up the target whose uuid matches in lov->tgts, checks that its
 * connection resolves to a device of type "osc", and then stores the new
 * "activate" state, adjusting lov->desc.ld_active_tgt_count up or down.
 *
 * Error results set on the way: -EINVAL (uuid not in the target list),
 * -ENOTCONN (connection does not resolve to a device), -EBADF (device is not
 * an "osc") and -EALREADY (target already in the requested state).
 */
224 static int lov_set_osc_active(struct lov_obd *lov, obd_uuid_t uuid,
227 struct obd_device *obd;
231 CDEBUG(D_INFO, "Searching in lov %p for uuid %s (activate=%d)\n",
232 lov, uuid, activate);
/* The target table is searched and updated under lov_lock. */
234 spin_lock(&lov->lov_lock);
235 for (i = 0; i < lov->desc.ld_tgt_count; i++)
236 if (strncmp(uuid, lov->tgts[i].uuid,
237 sizeof(lov->tgts[i].uuid)) == 0)
/* i ran off the end of the table: no target carries this uuid. */
240 if (i == lov->desc.ld_tgt_count)
241 GOTO(out, rc = -EINVAL);
243 obd = class_conn2obd(&lov->tgts[i].conn);
246 GOTO(out, rc = -ENOTCONN);
249 CDEBUG(D_INFO, "Found OBD %p type %s\n", obd, obd->obd_type->typ_name);
/* Only devices of type "osc" may be switched. */
250 if (strcmp(obd->obd_type->typ_name, "osc") != 0) {
252 GOTO(out, rc = -EBADF);
255 if (lov->tgts[i].active == activate) {
256 CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd,
257 activate ? "" : "in");
258 GOTO(out, rc = -EALREADY);
261 CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, activate ? "" : "in");
/* Record the new state and keep the active-target count in step. */
263 lov->tgts[i].active = activate;
265 lov->desc.ld_active_tgt_count++;
267 lov->desc.ld_active_tgt_count--;
271 spin_unlock(&lov->lov_lock);
275 static int lov_setup(struct obd_device *obd, obd_count len, void *buf)
277 struct obd_ioctl_data* data = buf;
278 struct lov_obd *lov = &obd->u.lov;
282 if (data->ioc_inllen1 < 1) {
283 CERROR("osc setup requires an MDC UUID\n");
287 if (data->ioc_inllen1 > 37) {
288 CERROR("mdc UUID must be 36 characters or less\n");
292 spin_lock_init(&lov->lov_lock);
293 lov->mdcobd = class_uuid2obd(data->ioc_inlbuf1);
295 CERROR("LOV %s cannot locate MDC %s\n", obd->obd_uuid,
/*
 * lov_create - create a striped object: one sub-object per stripe.
 *
 * A lov_stripe_md sized for all targets is set up, and under lov_lock the
 * striping parameters are filled in: a missing stripe count or stripe size
 * falls back to the LOV descriptor defaults (the count is capped at the
 * number of active targets), a starting target is derived from the object id
 * when no usable offset is already set, and each stripe is assigned the next
 * active target.  With the lock released, obd_create() is called on every
 * chosen target using a copy of the parent obdo, and the resulting id and
 * size are recorded in the stripe's lov_oinfo.  The cleanup code at the end
 * destroys sub-objects again with obd_destroy() and frees the lov_stripe_md.
 *
 * NOTE(review): stripe_offset + sub_offset is stored without being reduced
 * modulo ost_count, so it can reach ost_count (5 targets, stripe count 3,
 * object id 3 gives 4 + 1 = 5) right before it is used to index lov->tgts --
 * confirm and wrap it.  "mult" is an int although the object id is printed
 * with LPX64 elsewhere, so the product is presumably truncated.
 *
 * NOTE(review): both "advance to the next active target" loops run with
 * lov_lock held and have no exit if no target is active -- confirm callers
 * rule that out.
 *
 * NOTE(review): OBD_ALLOC() is called between spin_lock() and spin_unlock()
 * -- confirm it cannot sleep.
 */
303 /* the LOV expects oa->o_id to be set to the LOV object id */
304 static int lov_create(struct lustre_handle *conn, struct obdo *oa,
305 struct lov_stripe_md **ea)
307 struct obd_export *export = class_conn2export(conn);
309 struct lov_stripe_md *lsm;
310 struct lov_oinfo *loi;
312 int ost_count, ost_idx = 1, i, rc = 0;
324 lov = &export->exp_obd->u.lov;
326 spin_lock(&lov->lov_lock);
327 ost_count = lov->desc.ld_tgt_count;
328 oa->o_easize = lov_stripe_md_size(ost_count);
332 OBD_ALLOC(lsm, oa->o_easize);
334 spin_unlock(&lov->lov_lock);
335 GOTO(out_tmp, rc = -ENOMEM);
337 lsm->lsm_magic = LOV_MAGIC;
338 lsm->lsm_mds_easize = lov_mds_md_size(ost_count);
339 ost_idx = 0; /* if lsm->lsm_stripe_offset is set yet */
342 LASSERT(oa->o_valid & OBD_MD_FLID);
343 lsm->lsm_object_id = oa->o_id;
/* Stripe count: requested value, else the LOV default, else all active
 * targets; never more than the number of active targets. */
344 if (!lsm->lsm_stripe_count)
345 lsm->lsm_stripe_count = lov->desc.ld_default_stripe_count;
346 if (!lsm->lsm_stripe_count)
347 lsm->lsm_stripe_count = lov->desc.ld_active_tgt_count;
348 else if (lsm->lsm_stripe_count > lov->desc.ld_active_tgt_count)
349 lsm->lsm_stripe_count = lov->desc.ld_active_tgt_count;
351 if (!lsm->lsm_stripe_size)
352 lsm->lsm_stripe_size = lov->desc.ld_default_stripe_size;
354 /* Because of 64-bit divide/mod operations only work with a 32-bit
355 * divisor in a 32-bit kernel, we cannot support a stripe width
356 * of 4GB or larger on 32-bit CPUs.
358 if (lsm->lsm_stripe_size * lsm->lsm_stripe_count > ~0UL) {
359 CERROR("LOV: stripe width "LPU64"x%u > %lu on 32-bit system\n",
360 lsm->lsm_stripe_size, lsm->lsm_stripe_count, ~0UL);
361 spin_unlock(&lov->lov_lock);
362 GOTO(out_free, rc = -EINVAL);
365 lsm->lsm_ost_count = ost_count;
366 if (!ost_idx || lsm->lsm_stripe_offset >= ost_count) {
367 int mult = lsm->lsm_object_id * lsm->lsm_stripe_count;
368 int stripe_offset = mult % ost_count;
369 int sub_offset = (mult / ost_count) % lsm->lsm_stripe_count;
371 lsm->lsm_stripe_offset = stripe_offset + sub_offset;
374 while (!lov->tgts[lsm->lsm_stripe_offset].active)
375 lsm->lsm_stripe_offset = (lsm->lsm_stripe_offset+1) % ost_count;
377 /* Pick the OSTs before we release the lock */
378 ost_idx = lsm->lsm_stripe_offset;
379 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
380 CDEBUG(D_INODE, "objid "LPX64"[%d] is ost_idx %d (uuid %s)\n",
381 lsm->lsm_object_id, i, ost_idx, lov->tgts[ost_idx].uuid);
382 loi->loi_ost_idx = ost_idx;
384 ost_idx = (ost_idx + 1) % ost_count;
385 } while (!lov->tgts[ost_idx].active);
388 spin_unlock(&lov->lov_lock);
390 CDEBUG(D_INODE, "allocating %d subobjs for objid "LPX64" at idx %d\n",
391 lsm->lsm_stripe_count,lsm->lsm_object_id,lsm->lsm_stripe_offset);
/* The targets are fixed now; create one sub-object on each of them. */
393 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
394 struct lov_stripe_md obj_md;
395 struct lov_stripe_md *obj_mdp = &obj_md;
397 ost_idx = loi->loi_ost_idx;
399 /* create data objects with "parent" OA */
400 memcpy(tmp, oa, sizeof(*tmp));
401 tmp->o_easize = sizeof(struct lov_stripe_md);
402 rc = obd_create(&lov->tgts[ost_idx].conn, tmp, &obj_mdp);
404 CERROR("error creating objid "LPX64" sub-object on "
405 "OST idx %d: rc = %d\n", oa->o_id, ost_idx, rc);
406 GOTO(out_cleanup, rc);
/* Remember what the target handed back for this stripe. */
408 loi->loi_id = tmp->o_id;
409 loi->loi_size = tmp->o_size;
410 CDEBUG(D_INODE, "objid "LPX64" has subobj "LPX64" at idx %d\n",
411 lsm->lsm_object_id, loi->loi_id, ost_idx);
425 /* destroy already created objects here */
426 memcpy(tmp, oa, sizeof(*tmp));
427 tmp->o_id = loi->loi_id;
428 err = obd_destroy(&lov->tgts[loi->loi_ost_idx].conn, tmp, NULL);
430 CERROR("Failed to uncreate objid "LPX64" subobj "
431 LPX64" on OST idx %d: rc = %d\n",
432 oa->o_id, loi->loi_id, loi->loi_ost_idx,
436 OBD_FREE(lsm, oa->o_easize);
440 static int lov_destroy(struct lustre_handle *conn, struct obdo *oa,
441 struct lov_stripe_md *lsm)
444 struct obd_export *export = class_conn2export(conn);
446 struct lov_oinfo *loi;
451 CERROR("LOV requires striping ea for destruction\n");
455 if (lsm->lsm_magic != LOV_MAGIC) {
456 CERROR("LOV striping magic bad %#lx != %#lx\n",
457 lsm->lsm_magic, LOV_MAGIC);
461 if (!export || !export->exp_obd)
464 lov = &export->exp_obd->u.lov;
465 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
466 /* create data objects with "parent" OA */
467 memcpy(&tmp, oa, sizeof(tmp));
468 tmp.o_id = loi->loi_id;
469 rc = obd_destroy(&lov->tgts[loi->loi_ost_idx].conn, &tmp, NULL);
471 CERROR("Error destroying objid "LPX64" subobj "LPX64
472 " on OST idx %d\n: rc = %d",
473 oa->o_id, loi->loi_id, loi->loi_ost_idx, rc);
478 /* compute object size given "stripeno" and the ost size */
479 static obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size,
482 unsigned long ssize = lsm->lsm_stripe_size;
483 unsigned long swidth = ssize * lsm->lsm_stripe_count;
484 unsigned long stripe_size;
490 /* do_div(a, b) returns a % b, and a = a / b */
491 stripe_size = do_div(ost_size, ssize);
494 lov_size = ost_size * swidth + stripeno * ssize + stripe_size;
496 lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize;
501 static void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_flag valid,
502 struct lov_stripe_md *lsm, int stripeno, int *new)
505 obdo_cpy_md(tgt, src, valid);
506 if (valid & OBD_MD_FLSIZE)
507 tgt->o_size = lov_stripe_size(lsm,src->o_size,stripeno);
510 if (valid & OBD_MD_FLSIZE) {
511 /* this handles sparse files properly */
514 lov_size = lov_stripe_size(lsm, src->o_size, stripeno);
515 if (lov_size > tgt->o_size)
516 tgt->o_size = lov_size;
518 if (valid & OBD_MD_FLBLOCKS)
519 tgt->o_blocks += src->o_blocks;
520 if (valid & OBD_MD_FLCTIME && tgt->o_ctime < src->o_ctime)
521 tgt->o_ctime = src->o_ctime;
522 if (valid & OBD_MD_FLMTIME && tgt->o_mtime < src->o_mtime)
523 tgt->o_mtime = src->o_mtime;
/*
 * lov_getattr - gather the attributes of a striped object.
 *
 * For the stripes described by lsm, a copy of the caller's obdo is given the
 * sub-object id and passed to obd_getattr() on the target holding that
 * stripe; each answer is folded into oa with lov_merge_attrs().  A failing
 * target is logged and the loop moves on to the next stripe.
 */
527 static int lov_getattr(struct lustre_handle *conn, struct obdo *oa,
528 struct lov_stripe_md *lsm)
531 struct obd_export *export = class_conn2export(conn);
533 struct lov_oinfo *loi;
539 CERROR("LOV requires striping ea\n");
543 if (lsm->lsm_magic != LOV_MAGIC) {
544 CERROR("LOV striping magic bad %#lx != %#lx\n",
545 lsm->lsm_magic, LOV_MAGIC);
549 if (!export || !export->exp_obd)
552 lov = &export->exp_obd->u.lov;
555 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
558 if (loi->loi_id == 0)
/* NOTE(review): this per-stripe trace is logged with CERROR although it does
 * not look like an error -- presumably leftover debugging; confirm and
 * consider CDEBUG. */
561 CERROR("objid "LPX64"[%d] has subobj "LPX64" at idx %u\n",
562 oa->o_id, i, loi->loi_id, loi->loi_ost_idx);
563 /* query the data object through a copy of the "parent" OA */
564 memcpy(&tmp, oa, sizeof(tmp));
565 tmp.o_id = loi->loi_id;
567 err = obd_getattr(&lov->tgts[loi->loi_ost_idx].conn, &tmp,NULL);
569 CERROR("Error getattr objid "LPX64" subobj "LPX64
570 " on OST idx %d: rc = %d\n",
571 oa->o_id, loi->loi_id, loi->loi_ost_idx, err);
574 continue; /* XXX or break? */
/* Merge what this stripe reported into the caller's obdo. */
576 lov_merge_attrs(oa, &tmp, tmp.o_valid, lsm, i, &new);
/*
 * lov_setattr - apply attribute changes to every sub-object of a striped
 * object.
 *
 * For each stripe described by lsm, a copy of the caller's obdo is given the
 * sub-object id and passed to obd_setattr() on the target holding the
 * stripe; failures are logged.  Size changes are not accepted here: the
 * function asserts that OBD_MD_FLSIZE is not set in oa->o_valid.
 */
581 static int lov_setattr(struct lustre_handle *conn, struct obdo *oa,
582 struct lov_stripe_md *lsm)
585 struct obd_export *export = class_conn2export(conn);
587 struct lov_oinfo *loi;
591 /* Note that this code is currently unused, hence LBUG(), just
592 * to know when/if it is ever revived that it needs cleanups.
597 CERROR("LOV requires striping ea\n");
601 if (lsm->lsm_magic != LOV_MAGIC) {
602 CERROR("LOV striping magic bad %#lx != %#lx\n",
603 lsm->lsm_magic, LOV_MAGIC);
607 if (!export || !export->exp_obd)
610 /* size changes should go through punch and not setattr */
611 LASSERT(!(oa->o_valid & OBD_MD_FLSIZE));
613 lov = &export->exp_obd->u.lov;
614 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
617 /* update the data object through a copy of the "parent" OA */
618 memcpy(&tmp, oa, sizeof(tmp));
619 tmp.o_id = loi->loi_id;
621 err = obd_setattr(&lov->tgts[loi->loi_ost_idx].conn, &tmp,NULL);
623 CERROR("Error setattr objid "LPX64" subobj "LPX64
624 " on OST idx %d: rc = %d\n",
625 oa->o_id, loi->loi_id, loi->loi_ost_idx, err);
/*
 * lov_open - open every sub-object of a striped object.
 *
 * For each stripe described by lsm, a copy of the caller's obdo (tmp) is
 * given the sub-object id and passed to obd_open() on the target holding the
 * stripe; the attributes that come back are folded into oa with
 * lov_merge_attrs().  See the FIXME at the end about objects left open when
 * an error is returned.
 */
633 static int lov_open(struct lustre_handle *conn, struct obdo *oa,
634 struct lov_stripe_md *lsm)
637 struct obd_export *export = class_conn2export(conn);
639 struct lov_oinfo *loi;
645 CERROR("LOV requires striping ea for opening\n");
649 if (lsm->lsm_magic != LOV_MAGIC) {
650 CERROR("LOV striping magic bad %#lx != %#lx\n",
651 lsm->lsm_magic, LOV_MAGIC);
655 if (!export || !export->exp_obd)
662 lov = &export->exp_obd->u.lov;
665 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
668 /* open the data object through a copy of the "parent" OA */
669 memcpy(tmp, oa, sizeof(*tmp));
670 tmp->o_id = loi->loi_id;
672 err = obd_open(&lov->tgts[loi->loi_ost_idx].conn, tmp, NULL);
/* NOTE(review): the result of obd_open() is stored in err, but the message
 * below prints rc -- the sibling loops (getattr, setattr, close, punch)
 * print err, so this is probably a typo; confirm and change to err. */
674 CERROR("Error open objid "LPX64" subobj "LPX64
675 " on OST idx %d: rc = %d\n",
676 oa->o_id, lsm->lsm_oinfo[i].loi_id,
677 loi->loi_ost_idx, rc);
682 lov_merge_attrs(oa, tmp, tmp->o_valid, lsm, i, &new);
684 /* FIXME: returning an error, but having opened some objects is a bad
685 * idea, since they will likely never be closed. We either
686 * need to not return an error if _some_ objects could be
687 * opened, and leave it to read/write to return -EIO (with
688 * hopefully partial error status) or close all opened objects
689 * and return an error. I think the former is preferred.
/*
 * lov_close - close every sub-object of a striped object.
 *
 * For each stripe described by lsm, a copy of the caller's obdo is given the
 * sub-object id and passed to obd_close() on the target holding the stripe;
 * failures are logged.
 */
695 static int lov_close(struct lustre_handle *conn, struct obdo *oa,
696 struct lov_stripe_md *lsm)
699 struct obd_export *export = class_conn2export(conn);
701 struct lov_oinfo *loi;
706 CERROR("LOV requires striping ea\n");
710 if (lsm->lsm_magic != LOV_MAGIC) {
711 CERROR("LOV striping magic bad %#lx != %#lx\n",
712 lsm->lsm_magic, LOV_MAGIC);
716 if (!export || !export->exp_obd)
719 lov = &export->exp_obd->u.lov;
720 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
723 /* close the data object through a copy of the "parent" OA */
724 memcpy(&tmp, oa, sizeof(tmp));
725 tmp.o_id = loi->loi_id;
727 err = obd_close(&lov->tgts[loi->loi_ost_idx].conn, &tmp, NULL);
729 CERROR("Error close objid "LPX64" subobj "LPX64
730 " on OST idx %d: rc = %d\n",
731 oa->o_id, loi->loi_id, loi->loi_ost_idx, err);
740 #define log2(n) ffz(~(n))
743 #warning FIXME: merge these two functions now that they are nearly the same
745 /* compute ost offset in stripe "stripeno" corresponding to offset "lov_off" */
746 static obd_off lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off,
749 unsigned long ssize = lsm->lsm_stripe_size;
750 unsigned long swidth = ssize * lsm->lsm_stripe_count;
751 unsigned long stripe_off, this_stripe;
753 if (lov_off == OBD_OBJECT_EOF || lov_off == 0)
756 /* do_div(a, b) returns a % b, and a = a / b */
757 stripe_off = do_div(lov_off, swidth);
759 this_stripe = stripeno * ssize;
760 if (stripe_off <= this_stripe)
763 stripe_off -= this_stripe;
765 if (stripe_off > ssize)
770 return lov_off * ssize + stripe_off;
773 /* compute which stripe number "lov_off" will be written into */
774 static int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off)
776 unsigned long ssize = lsm->lsm_stripe_size;
777 unsigned long swidth = ssize * lsm->lsm_stripe_count;
778 unsigned long stripe_off;
780 stripe_off = do_div(lov_off, swidth);
782 return stripe_off / ssize;
/*
 * lov_punch - punch the range [start, end] out of a striped object.
 *
 * For each stripe described by lsm the range is translated into that
 * stripe's own offsets with lov_stripe_offset(), a copy of the caller's obdo
 * is given the sub-object id, and obd_punch() is called on the target
 * holding the stripe; failures are logged.
 */
786 /* FIXME: maybe we'll just make one node the authoritative attribute node, then
787 * we can send this 'punch' to just the authoritative node and the nodes
788 * that the punch will affect. */
789 static int lov_punch(struct lustre_handle *conn, struct obdo *oa,
790 struct lov_stripe_md *lsm,
791 obd_off start, obd_off end)
794 struct obd_export *export = class_conn2export(conn);
796 struct lov_oinfo *loi;
801 CERROR("LOV requires striping ea\n");
805 if (lsm->lsm_magic != LOV_MAGIC) {
806 CERROR("LOV striping magic bad %#lx != %#lx\n",
807 lsm->lsm_magic, LOV_MAGIC);
811 if (!export || !export->exp_obd)
814 lov = &export->exp_obd->u.lov;
815 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
/* Range of this stripe's sub-object covered by the punch. */
816 obd_off starti = lov_stripe_offset(lsm, start, i);
817 obd_off endi = lov_stripe_offset(lsm, end, i);
822 /* punch the data object through a copy of the "parent" OA */
823 memcpy(&tmp, oa, sizeof(tmp));
824 tmp.o_id = loi->loi_id;
826 err = obd_punch(&lov->tgts[loi->loi_ost_idx].conn, &tmp, NULL,
829 CERROR("Error punch objid "LPX64" subobj "LPX64
830 " on OST idx %d: rc = %d\n",
831 oa->o_id, loi->loi_id, loi->loi_ost_idx, err);
/*
 * lov_osc_brw_callback - per-target completion callback used by lov_brw().
 *
 * cbd is the io_cb_data shared by all the obd_brw() calls of one LOV request.
 * In the CB_PHASE_FINISH phase one reference is dropped from cbd->refcount,
 * and when it reaches zero the stored callback cbd->cb is invoked with
 * cbd->data and cbd->err.
 */
839 static int lov_osc_brw_callback(struct io_cb_data *cbd, int err, int phase)
844 if (phase == CB_PHASE_START)
847 if (phase == CB_PHASE_FINISH) {
/* Only the last target to finish calls the stored callback. */
850 if (atomic_dec_and_test(&cbd->refcount))
851 ret = cbd->cb(cbd->data, cbd->err, phase);
/*
 * lov_brw - split a bulk read/write across the stripes of an object.
 *
 * Each page in pga is assigned to a stripe with lov_stripe_number().  The
 * pages are then regrouped into ioarr so that every stripe's pages are
 * contiguous (stripeinfo[].index is where a stripe's run starts, .bufct how
 * many pages it holds), and their offsets are converted to sub-object
 * offsets with lov_stripe_offset().  obd_brw() is then issued per stripe on
 * the target holding it, with lov_osc_brw_callback and a shared io_cb_data
 * (our_cb) whose refcount is raised once for every stripe that has pages.
 * Finally the caller's callback is run for CB_PHASE_START and the temporary
 * arrays are freed.
 *
 * NOTE(review): stripe_count is initialised from lsm->lsm_stripe_count in
 * the declarations, i.e. before the "LOV requires striping ea" check can
 * reject a missing lsm -- confirm and move the read after the check.
 *
 * NOTE(review): export is dereferenced without the "!export ||
 * !export->exp_obd" test the sibling methods perform -- confirm.
 */
859 static inline int lov_brw(int cmd, struct lustre_handle *conn,
860 struct lov_stripe_md *lsm, obd_count oa_bufs,
861 struct brw_page *pga,
862 brw_callback_t callback, struct io_cb_data *cbd)
864 int stripe_count = lsm->lsm_stripe_count;
865 struct obd_export *export = class_conn2export(conn);
871 struct lov_stripe_md lsm;
873 } *stripeinfo, *si, *si_last;
874 struct brw_page *ioarr;
876 struct io_cb_data *our_cb;
877 struct lov_oinfo *loi;
882 CERROR("LOV requires striping ea\n");
886 if (lsm->lsm_magic != LOV_MAGIC) {
887 CERROR("LOV striping magic bad %#lx != %#lx\n",
888 lsm->lsm_magic, LOV_MAGIC);
892 lov = &export->exp_obd->u.lov;
894 our_cb = ll_init_cb();
/* Scratch arrays: per-stripe bookkeeping, the stripe number of each page,
 * and the regrouped page array. */
898 OBD_ALLOC(stripeinfo, stripe_count * sizeof(*stripeinfo));
900 GOTO(out_cbdata, rc = -ENOMEM);
902 OBD_ALLOC(where, sizeof(*where) * oa_bufs);
904 GOTO(out_sinfo, rc = -ENOMEM);
906 OBD_ALLOC(ioarr, sizeof(*ioarr) * oa_bufs);
908 GOTO(out_where, rc = -ENOMEM);
910 /* This is the only race-free way I can think of to get the refcount
912 atomic_set(&our_cb->refcount, 0);
913 our_cb->cb = callback;
916 for (i = 0; i < oa_bufs; i++) {
917 where[i] = lov_stripe_number(lsm, pga[i].off);
918 if (stripeinfo[where[i]].bufct++ == 0)
919 atomic_inc(&our_cb->refcount);
922 for (i = 0, loi = lsm->lsm_oinfo, si_last = si = stripeinfo;
923 i < stripe_count; i++, loi++, si_last = si, si++) {
925 si->index = si_last->index + si_last->bufct;
926 si->lsm.lsm_object_id = loi->loi_id;
927 si->ost_idx = loi->loi_ost_idx;
930 for (i = 0; i < oa_bufs; i++) {
931 int which = where[i];
934 shift = stripeinfo[which].index + stripeinfo[which].subcount;
935 LASSERT(shift < oa_bufs);
936 ioarr[shift] = pga[i];
937 ioarr[shift].off = lov_stripe_offset(lsm, pga[i].off, which);
938 stripeinfo[which].subcount++;
941 for (i = 0, si = stripeinfo; i < stripe_count; i++, si++) {
942 int shift = si->index;
945 LASSERT(shift < oa_bufs);
946 /* XXX handle error returns here */
947 obd_brw(cmd, &lov->tgts[si->ost_idx].conn,
948 &si->lsm, si->bufct, &ioarr[shift],
949 lov_osc_brw_callback, our_cb);
953 rc = callback(cbd, 0, CB_PHASE_START);
/* Release the scratch arrays in the reverse order of their allocation. */
955 OBD_FREE(ioarr, sizeof(*ioarr) * oa_bufs);
957 OBD_FREE(where, sizeof(*where) * oa_bufs);
959 OBD_FREE(stripeinfo, stripe_count * sizeof(*stripeinfo));
961 OBD_FREE(our_cb, sizeof(*our_cb));
/*
 * lov_enqueue - take a lock on the stripes of an object.
 *
 * cookie is treated as a struct ldlm_extent.  For each stripe described by
 * lsm the extent is translated into that stripe's own offsets with
 * lov_stripe_offset(), a stack lov_stripe_md carrying the sub-object id, the
 * MDS EA size and a stripe count of 0 is built, and obd_enqueue() is called
 * on the target holding the stripe.  The resulting handle for stripe i is
 * stored in lockhs[i]; failures are logged.
 */
965 static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm,
966 struct lustre_handle *parent_lock,
967 __u32 type, void *cookie, int cookielen, __u32 mode,
968 int *flags, void *cb, void *data, int datalen,
969 struct lustre_handle *lockhs)
971 struct obd_export *export = class_conn2export(conn);
973 struct lov_oinfo *loi;
978 CERROR("LOV requires striping ea\n");
982 if (lsm->lsm_magic != LOV_MAGIC) {
983 CERROR("LOV striping magic bad %#lx != %#lx\n",
984 lsm->lsm_magic, LOV_MAGIC);
988 if (!export || !export->exp_obd)
991 lov = &export->exp_obd->u.lov;
992 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
993 struct ldlm_extent *extent = (struct ldlm_extent *)cookie;
994 struct ldlm_extent sub_ext;
995 struct lov_stripe_md submd;
/* Part of this stripe's sub-object that the requested extent covers. */
997 sub_ext.start = lov_stripe_offset(lsm, extent->start, i);
998 sub_ext.end = lov_stripe_offset(lsm, extent->end, i);
999 if (sub_ext.start == sub_ext.end)
1002 submd.lsm_object_id = loi->loi_id;
1003 /* XXX submd lsm_mds_easize should be that from the subobj,
1004 * and the subobj should get it opaquely from the LOV.
1006 submd.lsm_mds_easize = lov_mds_md_size(lsm->lsm_ost_count);
1007 submd.lsm_stripe_count = 0;
1008 /* XXX submd is not fully initialized here */
1009 rc = obd_enqueue(&(lov->tgts[loi->loi_ost_idx].conn), &submd,
1010 parent_lock, type, &sub_ext, sizeof(sub_ext),
1011 mode, flags, cb, data, datalen, &(lockhs[i]));
1012 // XXX add a lock debug statement here
1014 CERROR("Error enqueue objid "LPX64" subobj "LPX64
1015 " on OST idx %d: rc = %d\n", lsm->lsm_object_id,
1016 loi->loi_id, loi->loi_ost_idx, rc);
/*
 * lov_cancel - cancel the per-stripe locks of an object.
 *
 * lockhs holds one handle per stripe.  For the stripes described by lsm a
 * stack lov_stripe_md carrying the sub-object id, the MDS EA size and a
 * stripe count of 0 is built and obd_cancel() is called on the target
 * holding the stripe; failures are logged.  Handles whose addr is 0 are
 * tested for separately at the top of the loop.
 */
1021 static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm,
1022 __u32 mode, struct lustre_handle *lockhs)
1024 struct obd_export *export = class_conn2export(conn);
1025 struct lov_obd *lov;
1026 struct lov_oinfo *loi;
1031 CERROR("LOV requires striping ea\n");
1035 if (lsm->lsm_magic != LOV_MAGIC) {
1036 CERROR("LOV striping magic bad %#lx != %#lx\n",
1037 lsm->lsm_magic, LOV_MAGIC);
1041 if (!export || !export->exp_obd)
1044 lov = &export->exp_obd->u.lov;
1045 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
1046 struct lov_stripe_md submd;
1048 if (lockhs[i].addr == 0)
/* Minimal stripe md describing just this sub-object. */
1051 submd.lsm_object_id = loi->loi_id;
1052 submd.lsm_mds_easize = lov_mds_md_size(lsm->lsm_ost_count);
1053 submd.lsm_stripe_count = 0;
1054 rc = obd_cancel(&lov->tgts[loi->loi_ost_idx].conn, &submd,
1057 CERROR("Error cancel objid "LPX64" subobj "LPX64
1058 " on OST idx %d: rc = %d\n", lsm->lsm_object_id,
1059 loi->loi_id, loi->loi_ost_idx, rc);
/*
 * lov_cancel_unused - cancel unused locks on every sub-object of an object.
 *
 * For each stripe described by lsm a stack lov_stripe_md carrying the
 * sub-object id, the MDS EA size and a stripe count of 0 is built and
 * obd_cancel_unused() is called on the target holding the stripe, passing
 * local_only through; failures are logged.
 */
1064 static int lov_cancel_unused(struct lustre_handle *conn,
1065 struct lov_stripe_md *lsm, int local_only)
1067 struct obd_export *export = class_conn2export(conn);
1068 struct lov_obd *lov;
1069 struct lov_oinfo *loi;
1074 CERROR("LOV requires striping ea for lock cancellation\n");
1078 if (!export || !export->exp_obd)
1081 lov = &export->exp_obd->u.lov;
1082 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
1083 struct lov_stripe_md submd;
/* Minimal stripe md describing just this sub-object. */
1085 submd.lsm_object_id = loi->loi_id;
1086 submd.lsm_mds_easize = lov_mds_md_size(lsm->lsm_ost_count);
1087 submd.lsm_stripe_count = 0;
1088 rc = obd_cancel_unused(&lov->tgts[loi->loi_ost_idx].conn,
1089 &submd, local_only);
1091 CERROR("Error cancel unused objid "LPX64" subobj "LPX64
1092 " on OST idx %d: rc = %d\n", lsm->lsm_object_id,
1093 loi->loi_id, loi->loi_ost_idx, rc);
/*
 * lov_statfs - report filesystem statistics for the whole LOV.
 *
 * obd_statfs() is called on every target in lov->tgts.  A failing target is
 * logged and the loop continues.  The answers are combined into osfs: one
 * reply is copied as a whole, and the block counters os_bfree, os_bavail and
 * os_blocks are summed across targets.
 *
 * NOTE(review): the loop does not consult lov->tgts[i].active the way
 * lov_disconnect() does -- confirm whether inactive targets should be
 * skipped here.
 */
1098 static int lov_statfs(struct lustre_handle *conn, struct obd_statfs *osfs)
1100 struct obd_export *export = class_conn2export(conn);
1101 struct lov_obd *lov;
1102 struct obd_statfs lov_sfs;
1108 if (!export || !export->exp_obd)
1111 lov = &export->exp_obd->u.lov;
1113 /* We only get block data from the OBD */
1114 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
1117 err = obd_statfs(&lov->tgts[i].conn, &lov_sfs);
1119 CERROR("Error statfs OSC %s idx %d: err = %d\n",
1120 lov->tgts[i].uuid, i, err);
1123 continue; /* XXX or break? - probably OK to continue */
1126 memcpy(osfs, &lov_sfs, sizeof(lov_sfs));
/* Block counts add up across targets. */
1129 osfs->os_bfree += lov_sfs.os_bfree;
1130 osfs->os_bavail += lov_sfs.os_bavail;
1131 osfs->os_blocks += lov_sfs.os_blocks;
1132 /* XXX not sure about this one - depends on policy.
1133 * - could be minimum if we always stripe on all OBDs
1134 * (but that would be wrong for any other policy,
1135 * if one of the OBDs has no more objects left)
1136 * - could be sum if we stripe whole objects
1137 * - could be average, just to give a nice number
1138 * - we just pick first OST and hope it is enough
1139 sfs->f_ffree += lov_sfs.f_ffree;
/*
 * lov_iocontrol - ioctl entry point of the LOV device.
 *
 * IOC_LOV_SET_OSC_ACTIVE is handled locally: lov_set_osc_active() is called
 * with the uuid from data->ioc_inlbuf1 and the new state from
 * data->ioc_offset.  The remaining code forwards cmd to the targets in
 * lov->tgts through obd_iocontrol(), after testing for an empty target list.
 */
1146 static int lov_iocontrol(long cmd, struct lustre_handle *conn, int len,
1147 void *karg, void *uarg)
1149 struct obd_device *obddev = class_conn2obd(conn);
1150 struct obd_ioctl_data *data = karg;
1151 struct lov_obd *lov = &obddev->u.lov;
1156 case IOC_LOV_SET_OSC_ACTIVE:
1157 rc = lov_set_osc_active(lov,data->ioc_inlbuf1,data->ioc_offset);
1160 if (lov->desc.ld_tgt_count == 0)
/* Pass the command on to each target in turn. */
1163 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
1164 int err = obd_iocontrol(cmd, &lov->tgts[i].conn,
1174 struct obd_ops lov_obd_ops = {
1176 o_connect: lov_connect,
1177 o_disconnect: lov_disconnect,
1178 o_create: lov_create,
1179 o_destroy: lov_destroy,
1180 o_getattr: lov_getattr,
1181 o_setattr: lov_setattr,
1182 o_statfs: lov_statfs,
1187 o_enqueue: lov_enqueue,
1188 o_cancel: lov_cancel,
1189 o_cancel_unused: lov_cancel_unused,
1190 o_iocontrol: lov_iocontrol
1194 #define LOV_VERSION "v0.1"
1196 static int __init lov_init(void)
1198 printk(KERN_INFO "Lustre Logical Object Volume driver " LOV_VERSION
1199 ", info@clusterfs.com\n");
1200 return class_register_type(&lov_obd_ops, OBD_LOV_DEVICENAME);
1203 static void __exit lov_exit(void)
1205 class_unregister_type(OBD_LOV_DEVICENAME);
1208 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
1209 MODULE_DESCRIPTION("Lustre Logical Object Volume OBD driver v0.1");
1210 MODULE_LICENSE("GPL");
1212 module_init(lov_init);
1213 module_exit(lov_exit);