1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * Copyright (C) 2002 Cluster File Systems, Inc.
7 * Author: Phil Schwan <phil@off.net>
8 * Peter Braam <braam@clusterfs.com>
10 * This code is issued under the GNU General Public License.
11 * See the file COPYING in this distribution
15 #define DEBUG_SUBSYSTEM S_LOV
17 #include <linux/slab.h>
18 #include <linux/module.h>
19 #include <linux/obd_support.h>
20 #include <linux/lustre_lib.h>
21 #include <linux/lustre_net.h>
22 #include <linux/lustre_idl.h>
23 #include <linux/lustre_mds.h>
24 #include <linux/obd_class.h>
25 #include <linux/obd_lov.h>
26 #include <linux/init.h>
27 #include <linux/random.h>
28 #include <linux/slab.h>
29 #include <asm/div64.h>
30 #include <linux/lprocfs_status.h>
// lprocfs variable tables, declared extern here: status_var_nm_1 is passed to
// lprocfs_reg_obd() in lov_attach() and status_class_var is passed to
// class_register_type() in lov_init().
32 extern struct lprocfs_vars status_var_nm_1[];
33 extern struct lprocfs_vars status_class_var[];
// Slab cache for struct lov_file_handles records; created in lov_init() and
// destroyed in lov_exit().
35 static kmem_cache_t *lov_file_cache;
// Bookkeeping record for one open file: lov_open() allocates it from
// lov_file_cache and links it onto the export open list through lfh_list.
// lfh_handles points to an array of lustre_handle, allocated in lov_open() with
// one entry per stripe (lsm_stripe_count).
// NOTE(review): lfh_cookie and lfh_count are used elsewhere in this file, but
// their declarations and the closing brace are not visible in this excerpt.
37 struct lov_file_handles {
38 struct list_head lfh_list;
41 struct lustre_handle *lfh_handles;
// Connect a client to the LOV device.
// Visible steps:
//  - class_connect() registers the connection; lov->refcount is tested so the
//    underlying connections are only made once.
//  - A temporary MDC connection is opened, mdc_getlovinfo() fetches the LOV
//    descriptor plus the target UUID array, and the MDC connection is dropped.
//  - The reply is validated (buffer count and sizes, UUID match with this device,
//    target count at most 1000, default stripe width not above ~0UL).
//  - lov->tgts is allocated, the UUIDs are copied in, and every target is
//    connected with obd_connect() and sent IOC_OSC_REGISTER_LOV before being
//    marked active.
//  - The tail of the function undoes target connections, frees lov->tgts and
//    calls class_disconnect().
// NOTE(review): the declarations of rc, rc2 and i, the labels targeted by the
// GOTOs, and several guarding conditions are not visible in this excerpt;
// confirm the exact control flow against the complete file.
45 static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
46 obd_uuid_t cluuid, struct recovd_obd *recovd,
47 ptlrpc_recovery_cb_t recover)
49 struct ptlrpc_request *req = NULL;
50 struct lov_obd *lov = &obd->u.lov;
51 struct client_obd *mdc = &lov->mdcobd->u.cli;
52 struct lov_desc *desc = &lov->desc;
53 struct obd_export *exp;
54 struct lustre_handle mdc_conn;
55 obd_uuid_t *uuidarray;
60 rc = class_connect(conn, obd, cluuid);
66 /* We don't want to actually do the underlying connections more than
67 * once, so keep track. */
69 if (lov->refcount > 1)
72 exp = class_conn2export(conn);
73 INIT_LIST_HEAD(&exp->exp_lov_data.led_open_head);
75 /* retrieve LOV metadata from MDS */
76 rc = obd_connect(&mdc_conn, lov->mdcobd, NULL, recovd, recover);
78 CERROR("cannot connect to mdc: rc = %d\n", rc);
82 rc = mdc_getlovinfo(obd, &mdc_conn, &req);
83 rc2 = obd_disconnect(&mdc_conn);
85 CERROR("cannot get lov info %d\n", rc);
90 CERROR("error disconnecting from MDS %d\n", rc2);
91 GOTO(out_conn, rc = rc2);
// Reply buffer 0 must be large enough for the descriptor; buffer 1 (checked
// further down) must hold one UUID per target.
95 if (req->rq_repmsg->bufcount < 2 ||
96 req->rq_repmsg->buflens[0] < sizeof(*desc)) {
97 CERROR("LOV desc: invalid descriptor returned\n");
98 GOTO(out_conn, rc = -EINVAL);
101 memcpy(desc, lustre_msg_buf(req->rq_repmsg, 0), sizeof(*desc));
102 lov_unpackdesc(desc);
104 if (req->rq_repmsg->buflens[1] < sizeof(*uuidarray)*desc->ld_tgt_count){
105 CERROR("LOV desc: invalid uuid array returned\n");
106 GOTO(out_conn, rc = -EINVAL);
109 mdc->cl_max_mds_easize = lov_mds_md_size(desc->ld_tgt_count);
110 mdc->cl_max_ost_easize = lov_stripe_md_size(desc->ld_tgt_count);
112 if (memcmp(obd->obd_uuid, desc->ld_uuid, sizeof(desc->ld_uuid))) {
113 CERROR("LOV desc: uuid %s not on mds device (%s)\n",
114 obd->obd_uuid, desc->ld_uuid);
115 GOTO(out_conn, rc = -EINVAL);
118 if (desc->ld_tgt_count > 1000) {
119 CERROR("LOV desc: target count > 1000 (%d)\n",
121 GOTO(out_conn, rc = -EINVAL);
124 /* Because of 64-bit divide/mod operations only work with a 32-bit
125 * divisor in a 32-bit kernel, we cannot support a stripe width
126 * of 4GB or larger on 32-bit CPUs.
128 if ((desc->ld_default_stripe_count ?
129 desc->ld_default_stripe_count : desc->ld_tgt_count) *
130 desc->ld_default_stripe_size > ~0UL) {
131 CERROR("LOV: stripe width "LPU64"x%u > %lu on 32-bit system\n",
132 desc->ld_default_stripe_size,
133 desc->ld_default_stripe_count ?
134 desc->ld_default_stripe_count : desc->ld_tgt_count,~0UL);
135 GOTO(out_conn, rc = -EINVAL);
// One lov_tgt_desc slot per target; bufsize is kept so the array can be freed.
138 lov->bufsize = sizeof(struct lov_tgt_desc) * desc->ld_tgt_count;
139 OBD_ALLOC(lov->tgts, lov->bufsize);
141 CERROR("Out of memory\n");
142 GOTO(out_conn, rc = -ENOMEM);
145 uuidarray = lustre_msg_buf(req->rq_repmsg, 1);
146 for (i = 0; i < desc->ld_tgt_count; i++)
147 memcpy(lov->tgts[i].uuid, uuidarray[i], sizeof(*uuidarray));
// Connect to each target by UUID and register this LOV with it.
149 for (i = 0; i < desc->ld_tgt_count; i++) {
150 struct obd_device *tgt = class_uuid2obd(uuidarray[i]);
153 CERROR("Target %s not attached\n", uuidarray[i]);
154 GOTO(out_disc, rc = -EINVAL);
157 if (!(tgt->obd_flags & OBD_SET_UP)) {
158 CERROR("Target %s not set up\n", uuidarray[i]);
159 GOTO(out_disc, rc = -EINVAL);
162 rc = obd_connect(&lov->tgts[i].conn, tgt, NULL, recovd,
165 CERROR("Target %s connect error %d\n",
169 rc = obd_iocontrol(IOC_OSC_REGISTER_LOV, &lov->tgts[i].conn,
170 sizeof(struct obd_device *), obd, NULL);
172 CERROR("Target %s REGISTER_LOV error %d\n",
176 desc->ld_active_tgt_count++;
177 lov->tgts[i].active = 1;
181 ptlrpc_req_finished(req);
// Error unwinding: mark targets inactive and disconnect them, free the target
// array, then drop the class-level connection.
186 desc->ld_active_tgt_count--;
187 lov->tgts[i].active = 0;
188 rc2 = obd_disconnect(&lov->tgts[i].conn);
190 CERROR("LOV Target %s disconnect error: rc = %d\n",
193 OBD_FREE(lov->tgts, lov->bufsize);
195 class_disconnect(conn);
// Disconnect a client from the LOV device.
// Visible steps: lov->refcount is tested so the target connections are only torn
// down on the final disconnect; each active target is marked inactive and
// disconnected (inactive ones are logged and skipped); lov->tgts is freed; any
// lov_file_handles records still on the export open list are discarded; finally
// class_disconnect() is called.
// NOTE(review): the declarations of rc and i and the branch taken when refcount
// is non-zero are not visible in this excerpt.
199 static int lov_disconnect(struct lustre_handle *conn)
201 struct obd_device *obd = class_conn2obd(conn);
202 struct lov_obd *lov = &obd->u.lov;
203 struct obd_export *exp;
204 struct list_head *p, *n;
210 /* Only disconnect the underlying layers on the final disconnect. */
212 if (lov->refcount != 0)
215 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
216 if (!lov->tgts[i].active) {
217 CERROR("Skipping disconnect for inactive OSC %s\n",
222 lov->desc.ld_active_tgt_count--;
223 lov->tgts[i].active = 0;
224 rc = obd_disconnect(&lov->tgts[i].conn);
226 CERROR("Target %s disconnect error %d\n",
227 lov->tgts[i].uuid, rc);
231 OBD_FREE(lov->tgts, lov->bufsize);
// Release every open-file record left behind on this export; the _safe list
// iterator is used because entries are unlinked inside the loop.
235 exp = class_conn2export(conn);
236 list_for_each_safe(p, n, &exp->exp_lov_data.led_open_head) {
237 /* XXX close these, instead of just discarding them? */
238 struct lov_file_handles *lfh;
239 lfh = list_entry(p, typeof(*lfh), lfh_list);
240 CERROR("discarding open LOV handle %p:"LPX64"\n",
241 lfh, lfh->lfh_cookie);
242 list_del(&lfh->lfh_list);
243 OBD_FREE(lfh->lfh_handles,
244 lfh->lfh_count * sizeof(*lfh->lfh_handles));
245 kmem_cache_free(lov_file_cache, lfh);
249 rc = class_disconnect(conn);
257 * -EINVAL : UUID can't be found in the LOV's target list
258 * -ENOTCONN: The UUID is found, but the target connection is bad (!)
259 * -EBADF : The UUID is found, but the OBD is the wrong type (!)
260 * -EALREADY: The OSC is already marked (in)active
// Mark one target of the LOV active or inactive.
// With lov_lock held, the target list is searched for an entry whose uuid
// matches; the connection must resolve to an OBD whose type name is osc, and the
// current active flag must differ from the requested one. On success the flag is
// updated and ld_active_tgt_count is adjusted up or down.
// Error codes set by the visible GOTOs: -EINVAL (uuid not found), -ENOTCONN,
// -EBADF (type is not osc), -EALREADY (flag already has the requested value).
// NOTE(review): the last parameter (used below as activate) and the
// declarations of i and rc are not visible in this excerpt.
262 static int lov_set_osc_active(struct lov_obd *lov, obd_uuid_t uuid,
265 struct obd_device *obd;
269 CDEBUG(D_INFO, "Searching in lov %p for uuid %s (activate=%d)\n",
270 lov, uuid, activate);
272 spin_lock(&lov->lov_lock);
273 for (i = 0; i < lov->desc.ld_tgt_count; i++)
274 if (strncmp(uuid, lov->tgts[i].uuid,
275 sizeof(lov->tgts[i].uuid)) == 0)
// i equal to the target count means the search loop ran off the end.
278 if (i == lov->desc.ld_tgt_count)
279 GOTO(out, rc = -EINVAL);
281 obd = class_conn2obd(&lov->tgts[i].conn);
284 GOTO(out, rc = -ENOTCONN);
287 CDEBUG(D_INFO, "Found OBD %p type %s\n", obd, obd->obd_type->typ_name);
288 if (strcmp(obd->obd_type->typ_name, "osc") != 0) {
290 GOTO(out, rc = -EBADF);
293 if (lov->tgts[i].active == activate) {
294 CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd,
295 activate ? "" : "in");
296 GOTO(out, rc = -EALREADY);
299 CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, activate ? "" : "in");
301 lov->tgts[i].active = activate;
303 lov->desc.ld_active_tgt_count++;
305 lov->desc.ld_active_tgt_count--;
309 spin_unlock(&lov->lov_lock);
// Set up the LOV device from the ioctl data passed in buf.
// Visible checks: ioc_inllen1 must be at least 1 and at most 37. lov_lock is then
// initialised and the MDC device named by ioc_inlbuf1 is looked up with
// class_uuid2obd() and stored in lov->mdcobd; an error is logged when the lookup
// fails.
// NOTE(review): return statements are not visible in this excerpt. The first
// error message says osc although this is the LOV setup path - confirm intent.
313 static int lov_setup(struct obd_device *obd, obd_count len, void *buf)
315 struct obd_ioctl_data* data = buf;
316 struct lov_obd *lov = &obd->u.lov;
320 if (data->ioc_inllen1 < 1) {
321 CERROR("osc setup requires an MDC UUID\n");
325 if (data->ioc_inllen1 > 37) {
326 CERROR("mdc UUID must be 36 characters or less\n");
330 spin_lock_init(&lov->lov_lock);
331 lov->mdcobd = class_uuid2obd(data->ioc_inlbuf1);
333 CERROR("LOV %s cannot locate MDC %s\n", obd->obd_uuid,
// Translate a lustre_handle back into the lov_file_handles record it names.
// The handle and its addr field must be non-zero; addr is cast to a pointer,
// checked with kmem_cache_validate() against lov_file_cache, and the record
// lfh_cookie is compared with handle->cookie.
// NOTE(review): the value returned on each path is not visible in this excerpt;
// presumably NULL when any check fails - confirm.
340 static struct lov_file_handles *lov_handle2lfh(struct lustre_handle *handle)
342 struct lov_file_handles *lfh = NULL;
344 if (!handle || !handle->addr)
347 lfh = (struct lov_file_handles *)(unsigned long)(handle->addr);
348 if (!kmem_cache_validate(lov_file_cache, lfh))
351 if (lfh->lfh_cookie != handle->cookie)
357 /* the LOV expects oa->o_id to be set to the LOV object id */
// Create a striped object: fill in a lov_stripe_md, choose the OSTs, then create
// one sub-object per stripe.
//  - lsm is allocated with OBD_ALLOC using lov_stripe_md_size(ost_count).
//  - A zero stripe count falls back to ld_default_stripe_count and then to
//    ld_active_tgt_count, and is capped at ld_active_tgt_count; a zero stripe
//    size falls back to ld_default_stripe_size.
//  - stripe_size * stripe_count above ~0UL is rejected with -EINVAL.
//  - The starting OST and the per-stripe OST indices are picked while lov_lock is
//    held, stepping past targets that are not active.
//  - Each sub-object is created with obd_create() on a copy of oa (tmp); the
//    returned id and size are stored in the matching lov_oinfo.
//  - The visible tail destroys sub-objects with obd_destroy() and frees lsm.
// NOTE(review): declarations of lov, tmp and err, the guard around the lsm
// allocation, and the cleanup labels are not visible in this excerpt. The two
// loops that skip inactive targets have no visible exit when no target is
// active - confirm against the complete file.
358 static int lov_create(struct lustre_handle *conn, struct obdo *oa,
359 struct lov_stripe_md **ea)
361 struct obd_export *export = class_conn2export(conn);
363 struct lov_stripe_md *lsm;
364 struct lov_oinfo *loi;
366 int ost_count, ost_idx = 1, i, rc = 0;
378 lov = &export->exp_obd->u.lov;
380 spin_lock(&lov->lov_lock);
381 ost_count = lov->desc.ld_tgt_count;
382 oa->o_easize = lov_stripe_md_size(ost_count);
386 OBD_ALLOC(lsm, oa->o_easize);
388 spin_unlock(&lov->lov_lock);
389 GOTO(out_tmp, rc = -ENOMEM);
391 lsm->lsm_magic = LOV_MAGIC;
392 lsm->lsm_mds_easize = lov_mds_md_size(ost_count);
393 ost_idx = 0; /* if lsm->lsm_stripe_offset is set yet */
396 LASSERT(oa->o_valid & OBD_MD_FLID);
397 lsm->lsm_object_id = oa->o_id;
398 if (!lsm->lsm_stripe_count)
399 lsm->lsm_stripe_count = lov->desc.ld_default_stripe_count;
400 if (!lsm->lsm_stripe_count)
401 lsm->lsm_stripe_count = lov->desc.ld_active_tgt_count;
402 else if (lsm->lsm_stripe_count > lov->desc.ld_active_tgt_count)
403 lsm->lsm_stripe_count = lov->desc.ld_active_tgt_count;
405 if (!lsm->lsm_stripe_size)
406 lsm->lsm_stripe_size = lov->desc.ld_default_stripe_size;
408 /* Because of 64-bit divide/mod operations only work with a 32-bit
409 * divisor in a 32-bit kernel, we cannot support a stripe width
410 * of 4GB or larger on 32-bit CPUs.
412 if (lsm->lsm_stripe_size * lsm->lsm_stripe_count > ~0UL) {
413 CERROR("LOV: stripe width "LPU64"x%u > %lu on 32-bit system\n",
414 lsm->lsm_stripe_size, lsm->lsm_stripe_count, ~0UL);
415 spin_unlock(&lov->lov_lock);
416 GOTO(out_free, rc = -EINVAL);
// When no usable stripe offset is supplied, derive one from the object id and
// the stripe count.
419 lsm->lsm_ost_count = ost_count;
420 if (!ost_idx || lsm->lsm_stripe_offset >= ost_count) {
421 int mult = lsm->lsm_object_id * lsm->lsm_stripe_count;
422 int stripe_offset = mult % ost_count;
423 int sub_offset = (mult / ost_count) % lsm->lsm_stripe_count;
425 lsm->lsm_stripe_offset = stripe_offset + sub_offset;
428 while (!lov->tgts[lsm->lsm_stripe_offset].active)
429 lsm->lsm_stripe_offset = (lsm->lsm_stripe_offset+1) % ost_count;
431 /* Pick the OSTs before we release the lock */
432 ost_idx = lsm->lsm_stripe_offset;
433 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
434 CDEBUG(D_INODE, "objid "LPX64"[%d] is ost_idx %d (uuid %s)\n",
435 lsm->lsm_object_id, i, ost_idx, lov->tgts[ost_idx].uuid);
436 loi->loi_ost_idx = ost_idx;
438 ost_idx = (ost_idx + 1) % ost_count;
439 } while (!lov->tgts[ost_idx].active);
442 spin_unlock(&lov->lov_lock);
444 CDEBUG(D_INODE, "allocating %d subobjs for objid "LPX64" at idx %d\n",
445 lsm->lsm_stripe_count,lsm->lsm_object_id,lsm->lsm_stripe_offset);
// Second pass, without the lock: create the sub-object on each chosen OST.
447 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
448 struct lov_stripe_md obj_md;
449 struct lov_stripe_md *obj_mdp = &obj_md;
451 ost_idx = loi->loi_ost_idx;
453 /* create data objects with "parent" OA */
454 memcpy(tmp, oa, sizeof(*tmp));
455 tmp->o_easize = sizeof(struct lov_stripe_md);
456 rc = obd_create(&lov->tgts[ost_idx].conn, tmp, &obj_mdp);
458 CERROR("error creating objid "LPX64" sub-object on "
459 "OST idx %d: rc = %d\n", oa->o_id, ost_idx, rc);
460 GOTO(out_cleanup, rc);
462 loi->loi_id = tmp->o_id;
463 loi->loi_size = tmp->o_size;
464 CDEBUG(D_INODE, "objid "LPX64" has subobj "LPX64" at idx %d\n",
465 lsm->lsm_object_id, loi->loi_id, ost_idx);
479 /* destroy already created objects here */
480 memcpy(tmp, oa, sizeof(*tmp));
481 tmp->o_id = loi->loi_id;
482 err = obd_destroy(&lov->tgts[loi->loi_ost_idx].conn, tmp, NULL);
484 CERROR("Failed to uncreate objid "LPX64" subobj "
485 LPX64" on OST idx %d: rc = %d\n",
486 oa->o_id, loi->loi_id, loi->loi_ost_idx,
490 OBD_FREE(lsm, oa->o_easize);
// Destroy every sub-object of a striped object.
// Visible steps: lsm must be present and carry LOV_MAGIC, and the export must
// have an obd; when oa has OBD_MD_FLHANDLE set, the open record is looked up with
// lov_handle2lfh(). For each stripe, oa is copied to tmp, o_id is replaced by the
// sub-object id, the per-stripe handle is copied in or the OBD_MD_FLHANDLE bit is
// cleared, and obd_destroy() is called on the target at loi_ost_idx.
// NOTE(review): the declarations of tmp, lov, rc and i and the test choosing
// between the handle copy and the flag clear are not visible in this excerpt.
// In the error message below the newline sits before the rc text rather than at
// the end of the format - confirm intent.
494 static int lov_destroy(struct lustre_handle *conn, struct obdo *oa,
495 struct lov_stripe_md *lsm)
498 struct obd_export *export = class_conn2export(conn);
500 struct lov_oinfo *loi;
501 struct lov_file_handles *lfh = NULL;
506 CERROR("LOV requires striping ea for destruction\n");
510 if (lsm->lsm_magic != LOV_MAGIC) {
511 CERROR("LOV striping magic bad %#lx != %#lx\n",
512 lsm->lsm_magic, LOV_MAGIC);
516 if (!export || !export->exp_obd)
519 if (oa->o_valid & OBD_MD_FLHANDLE)
520 lfh = lov_handle2lfh(obdo_handle(oa));
522 lov = &export->exp_obd->u.lov;
523 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
524 memcpy(&tmp, oa, sizeof(tmp));
525 tmp.o_id = loi->loi_id;
527 memcpy(obdo_handle(&tmp), &lfh->lfh_handles[i],
528 sizeof(lfh->lfh_handles[i]));
530 tmp.o_valid &= ~OBD_MD_FLHANDLE;
531 rc = obd_destroy(&lov->tgts[loi->loi_ost_idx].conn, &tmp, NULL);
533 CERROR("Error destroying objid "LPX64" subobj "LPX64
534 " on OST idx %d\n: rc = %d",
535 oa->o_id, loi->loi_id, loi->loi_ost_idx, rc);
540 /* compute object size given "stripeno" and the ost size */
// ssize is the stripe size and swidth is ssize times the stripe count.
// do_div() leaves the quotient in ost_size and returns the remainder, which is
// stored in stripe_size. Two result formulas are visible: one built from the
// quotient, stripeno and the remainder, and one built from (quotient - 1) and
// (stripeno + 1).
// NOTE(review): the last parameter (used below as stripeno), the declaration of
// lov_size, the test choosing between the two formulas and the return statement
// are not visible in this excerpt.
541 static obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size,
544 unsigned long ssize = lsm->lsm_stripe_size;
545 unsigned long swidth = ssize * lsm->lsm_stripe_count;
546 unsigned long stripe_size;
552 /* do_div(a, b) returns a % b, and a = a / b */
553 stripe_size = do_div(ost_size, ssize);
556 lov_size = ost_size * swidth + stripeno * ssize + stripe_size;
558 lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize;
// Fold the attributes of one stripe object (src) into the aggregate obdo (tgt).
// Two paths are visible: one copies metadata with obdo_cpy_md() and sets o_size
// from lov_stripe_size(); the other keeps the largest computed size, adds
// o_blocks, and keeps the later o_ctime and o_mtime. Each field is only touched
// when its bit is set in valid.
// NOTE(review): the test that selects the path (presumably on *new, i.e. the
// first stripe merged) and the declaration of lov_size are not visible in this
// excerpt - confirm.
563 static void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_flag valid,
564 struct lov_stripe_md *lsm, int stripeno, int *new)
567 obdo_cpy_md(tgt, src, valid);
568 if (valid & OBD_MD_FLSIZE)
569 tgt->o_size = lov_stripe_size(lsm,src->o_size,stripeno);
572 if (valid & OBD_MD_FLSIZE) {
573 /* this handles sparse files properly */
576 lov_size = lov_stripe_size(lsm, src->o_size, stripeno);
577 if (lov_size > tgt->o_size)
578 tgt->o_size = lov_size;
580 if (valid & OBD_MD_FLBLOCKS)
581 tgt->o_blocks += src->o_blocks;
582 if (valid & OBD_MD_FLCTIME && tgt->o_ctime < src->o_ctime)
583 tgt->o_ctime = src->o_ctime;
584 if (valid & OBD_MD_FLMTIME && tgt->o_mtime < src->o_mtime)
585 tgt->o_mtime = src->o_mtime;
// Gather attributes for a striped object from its sub-objects.
// Visible steps: lsm must be present with LOV_MAGIC and the export must have an
// obd; an open record is looked up when oa carries OBD_MD_FLHANDLE. For each
// stripe, loi_id is tested against 0, oa is copied to tmp with o_id set to the
// sub-object id, the per-stripe handle is copied in or OBD_MD_FLHANDLE is
// cleared, and obd_getattr() is called on the target. A failing stripe is logged
// and the loop continues; otherwise the result is merged into oa with
// lov_merge_attrs().
// NOTE(review): declarations of tmp, lov, i, err and new, the action taken for a
// zero loi_id, and the return value are not visible in this excerpt.
589 static int lov_getattr(struct lustre_handle *conn, struct obdo *oa,
590 struct lov_stripe_md *lsm)
593 struct obd_export *export = class_conn2export(conn);
595 struct lov_oinfo *loi;
596 struct lov_file_handles *lfh = NULL;
602 CERROR("LOV requires striping ea\n");
606 if (lsm->lsm_magic != LOV_MAGIC) {
607 CERROR("LOV striping magic bad %#lx != %#lx\n",
608 lsm->lsm_magic, LOV_MAGIC);
612 if (!export || !export->exp_obd)
615 lov = &export->exp_obd->u.lov;
617 if (oa->o_valid & OBD_MD_FLHANDLE)
618 lfh = lov_handle2lfh(obdo_handle(oa));
620 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
623 if (loi->loi_id == 0)
626 CDEBUG(D_INFO, "objid "LPX64"[%d] has subobj "LPX64" at idx "
627 "%u\n", oa->o_id, i, loi->loi_id, loi->loi_ost_idx);
628 /* create data objects with "parent" OA */
629 memcpy(&tmp, oa, sizeof(tmp));
630 tmp.o_id = loi->loi_id;
632 memcpy(obdo_handle(&tmp), &lfh->lfh_handles[i],
633 sizeof(lfh->lfh_handles[i]));
635 tmp.o_valid &= ~OBD_MD_FLHANDLE;
637 err = obd_getattr(&lov->tgts[loi->loi_ost_idx].conn, &tmp,NULL);
639 CERROR("Error getattr objid "LPX64" subobj "LPX64
640 " on OST idx %d: rc = %d\n",
641 oa->o_id, loi->loi_id, loi->loi_ost_idx, err);
644 continue; /* XXX or break? */
646 lov_merge_attrs(oa, &tmp, tmp.o_valid, lsm, i, &new);
// Apply attributes to every sub-object of a striped object.
// Visible steps: lsm must be present with LOV_MAGIC and the export must have an
// obd; OBD_MD_FLSIZE must not be set in oa (asserted, since size changes go
// through punch); an open record is looked up when oa carries OBD_MD_FLHANDLE.
// For each stripe, the valid fields of oa are copied into tmp with obdo_cpy_md(),
// the per-stripe handle is copied in or OBD_MD_FLHANDLE is cleared, o_id is set
// to the sub-object id, and obd_setattr() is called on the target.
// NOTE(review): the existing comment states this code is currently unused. The
// declaration and allocation of tmp (used as a pointer here), and the
// declarations of lov, i, rc and err, are not visible in this excerpt.
651 static int lov_setattr(struct lustre_handle *conn, struct obdo *oa,
652 struct lov_stripe_md *lsm)
655 struct obd_export *export = class_conn2export(conn);
657 struct lov_oinfo *loi;
658 struct lov_file_handles *lfh = NULL;
662 /* Note that this code is currently unused, hence LBUG(), just
663 * to know when/if it is ever revived that it needs cleanups.
668 CERROR("LOV requires striping ea\n");
672 if (lsm->lsm_magic != LOV_MAGIC) {
673 CERROR("LOV striping magic bad %#lx != %#lx\n",
674 lsm->lsm_magic, LOV_MAGIC);
678 if (!export || !export->exp_obd)
681 /* size changes should go through punch and not setattr */
682 LASSERT(!(oa->o_valid & OBD_MD_FLSIZE));
688 if (oa->o_valid & OBD_MD_FLHANDLE)
689 lfh = lov_handle2lfh(obdo_handle(oa));
691 lov = &export->exp_obd->u.lov;
692 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
695 obdo_cpy_md(tmp, oa, oa->o_valid);
698 memcpy(obdo_handle(tmp), &lfh->lfh_handles[i],
699 sizeof(lfh->lfh_handles[i]));
701 tmp->o_valid &= ~OBD_MD_FLHANDLE;
703 tmp->o_id = loi->loi_id;
705 err = obd_setattr(&lov->tgts[loi->loi_ost_idx].conn, tmp, NULL);
707 CERROR("Error setattr objid "LPX64" subobj "LPX64
708 " on OST idx %d: rc = %d\n",
709 oa->o_id, loi->loi_id, loi->loi_ost_idx, err);
// Open every sub-object of a striped object and hand back one LOV-level handle.
// Visible steps:
//  - lsm must be present with LOV_MAGIC and the export must have an obd.
//  - A lov_file_handles record is taken from lov_file_cache and an array of
//    lsm_stripe_count handles is allocated for it.
//  - For each stripe, oa is copied to tmp with o_id set to the sub-object id and
//    obd_open() is called on the target; the result is merged into oa with
//    lov_merge_attrs() and the returned handle is saved in lfh_handles[i] when
//    tmp has OBD_MD_FLHANDLE set.
//  - When a handle was returned, lfh_count is set, a random lfh_cookie is
//    generated, the record address and cookie are stored in the handle inside oa,
//    OBD_MD_FLHANDLE is set on oa and the record is added to the export open
//    list.
//  - The visible tail frees the handle array, overwrites lfh_cookie with
//    DEAD_HANDLE_MAGIC and returns the record to the cache.
// NOTE(review): declarations of tmp, lov, i, rc, err and new and the labels are
// not visible in this excerpt. The error message after obd_open() prints rc
// although the result of the call is stored in err - confirm which is intended.
718 static int lov_open(struct lustre_handle *conn, struct obdo *oa,
719 struct lov_stripe_md *lsm)
722 struct obd_export *export = class_conn2export(conn);
724 struct lov_oinfo *loi;
725 struct lov_file_handles *lfh = NULL;
731 CERROR("LOV requires striping ea for opening\n");
735 if (lsm->lsm_magic != LOV_MAGIC) {
736 CERROR("LOV striping magic bad %#lx != %#lx\n",
737 lsm->lsm_magic, LOV_MAGIC);
741 if (!export || !export->exp_obd)
748 lfh = kmem_cache_alloc(lov_file_cache, GFP_KERNEL);
750 GOTO(out_tmp, rc = -ENOMEM);
751 OBD_ALLOC(lfh->lfh_handles,
752 lsm->lsm_stripe_count * sizeof(*lfh->lfh_handles));
753 if (!lfh->lfh_handles)
754 GOTO(out_lfh, rc = -ENOMEM);
756 lov = &export->exp_obd->u.lov;
759 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
762 /* create data objects with "parent" OA */
763 memcpy(tmp, oa, sizeof(*tmp));
764 tmp->o_id = loi->loi_id;
766 err = obd_open(&lov->tgts[loi->loi_ost_idx].conn, tmp, NULL);
768 CERROR("Error open objid "LPX64" subobj "LPX64
769 " on OST idx %d: rc = %d\n",
770 oa->o_id, lsm->lsm_oinfo[i].loi_id,
771 loi->loi_ost_idx, rc);
776 lov_merge_attrs(oa, tmp, tmp->o_valid, lsm, i, &new);
778 if (tmp->o_valid & OBD_MD_FLHANDLE)
779 memcpy(&lfh->lfh_handles[i], obdo_handle(tmp),
780 sizeof(lfh->lfh_handles[i]));
// Publish the LOV-level handle: the record address plus a random cookie that
// lov_handle2lfh() later compares against.
783 if (tmp->o_valid & OBD_MD_FLHANDLE) {
784 struct lustre_handle *handle = obdo_handle(oa);
786 lfh->lfh_count = lsm->lsm_stripe_count;
787 get_random_bytes(&lfh->lfh_cookie, sizeof(lfh->lfh_cookie));
789 handle->addr = (__u64)(unsigned long)lfh;
790 handle->cookie = lfh->lfh_cookie;
791 oa->o_valid |= OBD_MD_FLHANDLE;
792 list_add(&lfh->lfh_list, &export->exp_lov_data.led_open_head);
796 /* FIXME: returning an error, but having opened some objects is a bad
797 * idea, since they will likely never be closed. We either
798 * need to not return an error if _some_ objects could be
799 * opened, and leave it to read/write to return -EIO (with
800 * hopefully partial error status) or close all opened objects
801 * and return an error. I think the former is preferred.
808 OBD_FREE(lfh->lfh_handles,
809 lsm->lsm_stripe_count * sizeof(*lfh->lfh_handles));
811 lfh->lfh_cookie = DEAD_HANDLE_MAGIC;
812 kmem_cache_free(lov_file_cache, lfh);
// Close every sub-object of a striped object and release the open record.
// Visible steps: lsm must be present with LOV_MAGIC and the export must have an
// obd; the open record is looked up when oa carries OBD_MD_FLHANDLE. For each
// stripe, oa is copied to tmp with o_id set to the sub-object id, the per-stripe
// handle is copied in or OBD_MD_FLHANDLE is cleared, and obd_close() is called on
// the target. Afterwards the record is unlinked from its list, its handle array
// is freed, lfh_cookie is overwritten with DEAD_HANDLE_MAGIC and the record goes
// back to lov_file_cache.
// NOTE(review): declarations of tmp, lov, i, rc and err and the guard around the
// final release of lfh are not visible in this excerpt.
816 static int lov_close(struct lustre_handle *conn, struct obdo *oa,
817 struct lov_stripe_md *lsm)
820 struct obd_export *export = class_conn2export(conn);
822 struct lov_oinfo *loi;
823 struct lov_file_handles *lfh = NULL;
828 CERROR("LOV requires striping ea\n");
832 if (lsm->lsm_magic != LOV_MAGIC) {
833 CERROR("LOV striping magic bad %#lx != %#lx\n",
834 lsm->lsm_magic, LOV_MAGIC);
838 if (!export || !export->exp_obd)
841 if (oa->o_valid & OBD_MD_FLHANDLE)
842 lfh = lov_handle2lfh(obdo_handle(oa));
844 lov = &export->exp_obd->u.lov;
845 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
848 /* create data objects with "parent" OA */
849 memcpy(&tmp, oa, sizeof(tmp));
850 tmp.o_id = loi->loi_id;
852 memcpy(obdo_handle(&tmp), &lfh->lfh_handles[i],
853 sizeof(lfh->lfh_handles[i]));
855 tmp.o_valid &= ~OBD_MD_FLHANDLE;
857 err = obd_close(&lov->tgts[loi->loi_ost_idx].conn, &tmp, NULL);
859 CERROR("Error close objid "LPX64" subobj "LPX64
860 " on OST idx %d: rc = %d\n",
861 oa->o_id, loi->loi_id, loi->loi_ost_idx, err);
867 list_del(&lfh->lfh_list);
868 OBD_FREE(lfh->lfh_handles,
869 lsm->lsm_stripe_count * sizeof(*lfh->lfh_handles));
870 lfh->lfh_cookie = DEAD_HANDLE_MAGIC;
871 kmem_cache_free(lov_file_cache, lfh);
// log2(n) expands to ffz(~(n)). No use of this macro is visible in this excerpt.
// NOTE(review): presumably ffz is the kernel find-first-zero-bit helper, in which
// case the result is the index of the lowest set bit of n and equals the base-2
// logarithm only when n is a power of two - confirm.
878 #define log2(n) ffz(~(n))
881 #warning FIXME: merge these two functions now that they are nearly the same
883 /* compute ost offset in stripe "stripeno" corresponding to offset "lov_off" */
// ssize is the stripe size and swidth is ssize times the stripe count.
// Offsets equal to OBD_OBJECT_EOF or 0 are tested first. Otherwise do_div()
// leaves the number of whole stripe widths in lov_off and the remainder in
// stripe_off; the remainder is compared with the start of this stripe
// (stripeno * ssize), reduced by it, and compared with ssize. The final result
// is whole widths times ssize plus the adjusted remainder.
// NOTE(review): the last parameter (used below as stripeno) and the statements
// executed when each comparison is true are not visible in this excerpt.
884 static obd_off lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off,
887 unsigned long ssize = lsm->lsm_stripe_size;
888 unsigned long swidth = ssize * lsm->lsm_stripe_count;
889 unsigned long stripe_off, this_stripe;
891 if (lov_off == OBD_OBJECT_EOF || lov_off == 0)
894 /* do_div(a, b) returns a % b, and a = a / b */
895 stripe_off = do_div(lov_off, swidth);
897 this_stripe = stripeno * ssize;
898 if (stripe_off <= this_stripe)
901 stripe_off -= this_stripe;
903 if (stripe_off > ssize)
908 return lov_off * ssize + stripe_off;
911 /* compute which stripe number "lov_off" will be written into */
// do_div() returns lov_off modulo the full stripe width (stripe size times
// stripe count); dividing that remainder by the stripe size gives the index of
// the stripe within the width.
912 static int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off)
914 unsigned long ssize = lsm->lsm_stripe_size;
915 unsigned long swidth = ssize * lsm->lsm_stripe_count;
916 unsigned long stripe_off;
918 stripe_off = do_div(lov_off, swidth);
920 return stripe_off / ssize;
924 /* FIXME: maybe we'll just make one node the authoritative attribute node, then
925 * we can send this 'punch' to just the authoritative node and the nodes
926 * that the punch will affect. */
// Punch the range [start, end] of a striped object on its sub-objects.
// Visible steps: lsm must be present with LOV_MAGIC and the export must have an
// obd; the open record is looked up when oa carries OBD_MD_FLHANDLE. For each
// stripe, start and end are converted to per-stripe offsets with
// lov_stripe_offset(), oa is copied to tmp with o_id set to the sub-object id,
// the per-stripe handle is copied in or OBD_MD_FLHANDLE is cleared, and
// obd_punch() is called on the target.
// NOTE(review): declarations of tmp, lov, i, rc and err, any test on starti and
// endi, and the trailing arguments of obd_punch() are not visible in this
// excerpt.
927 static int lov_punch(struct lustre_handle *conn, struct obdo *oa,
928 struct lov_stripe_md *lsm,
929 obd_off start, obd_off end)
932 struct obd_export *export = class_conn2export(conn);
934 struct lov_oinfo *loi;
935 struct lov_file_handles *lfh = NULL;
940 CERROR("LOV requires striping ea\n");
944 if (lsm->lsm_magic != LOV_MAGIC) {
945 CERROR("LOV striping magic bad %#lx != %#lx\n",
946 lsm->lsm_magic, LOV_MAGIC);
950 if (!export || !export->exp_obd)
953 if (oa->o_valid & OBD_MD_FLHANDLE)
954 lfh = lov_handle2lfh(obdo_handle(oa));
956 lov = &export->exp_obd->u.lov;
957 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
958 obd_off starti = lov_stripe_offset(lsm, start, i);
959 obd_off endi = lov_stripe_offset(lsm, end, i);
964 /* create data objects with "parent" OA */
965 memcpy(&tmp, oa, sizeof(tmp));
966 tmp.o_id = loi->loi_id;
968 memcpy(obdo_handle(&tmp), &lfh->lfh_handles[i],
969 sizeof(lfh->lfh_handles[i]));
971 tmp.o_valid &= ~OBD_MD_FLHANDLE;
973 err = obd_punch(&lov->tgts[loi->loi_ost_idx].conn, &tmp, NULL,
976 CERROR("Error punch objid "LPX64" subobj "LPX64
977 " on OST idx %d: rc = %d\n",
978 oa->o_id, loi->loi_id, loi->loi_ost_idx, err);
// Completion callback that lov_brw() passes to obd_brw().
// Visible behaviour: the phase is tested against CB_PHASE_START and
// CB_PHASE_FINISH. In the finish phase err is stored in brw_cbd->brw_err, one
// reference is dropped from brw_refcount, and when the count reaches zero the
// saved brw_cb is invoked with brw_data, brw_err and the phase.
// NOTE(review): the declaration of ret, the guard around the brw_err store, the
// action taken for CB_PHASE_START and the return statement are not visible in
// this excerpt.
986 static int lov_osc_brw_cb(struct brw_cb_data *brw_cbd, int err, int phase)
991 if (phase == CB_PHASE_START)
994 if (phase == CB_PHASE_FINISH) {
996 brw_cbd->brw_err = err;
997 if (atomic_dec_and_test(&brw_cbd->brw_refcount))
998 ret = brw_cbd->brw_cb(brw_cbd->brw_data, brw_cbd->brw_err, phase);
// Split a page array across the stripes of lsm and issue obd_brw() per stripe.
// Visible steps:
//  1. where[i] receives the stripe number of page i (lov_stripe_number); the
//     per-stripe bufct is counted and brw_refcount is incremented once for every
//     stripe that receives its first page.
//  2. For each stripe, index is set from the previous stripe index plus bufct,
//     and the sub-object id and OST index are copied from the lov_oinfo.
//  3. Pages are copied into ioarr grouped by stripe, with each offset converted
//     by lov_stripe_offset().
//  4. obd_brw() is called for stripes with lov_osc_brw_cb as the callback and
//     osc_brw_cbd as its data; then the caller brw_cb is invoked with
//     CB_PHASE_START.
//  5. The visible tail frees ioarr, where, stripeinfo and osc_brw_cbd.
// NOTE(review): the result of obd_brw() is not captured (see the XXX below). The
// remaining members of the anonymous stripeinfo struct, the declarations of
// where, i, rc and shift, any per-stripe guard before obd_brw(), and which paths
// reach each OBD_FREE are not visible in this excerpt.
1006 static inline int lov_brw(int cmd, struct lustre_handle *conn,
1007 struct lov_stripe_md *lsm, obd_count oa_bufs,
1008 struct brw_page *pga,
1009 brw_cb_t brw_cb, struct brw_cb_data *brw_cbd)
1011 int stripe_count = lsm->lsm_stripe_count;
1012 struct obd_export *export = class_conn2export(conn);
1013 struct lov_obd *lov;
1018 struct lov_stripe_md lsm;
1020 } *stripeinfo, *si, *si_last;
1021 struct brw_page *ioarr;
1023 struct brw_cb_data *osc_brw_cbd;
1024 struct lov_oinfo *loi;
1029 CERROR("LOV requires striping ea\n");
1033 if (lsm->lsm_magic != LOV_MAGIC) {
1034 CERROR("LOV striping magic bad %#lx != %#lx\n",
1035 lsm->lsm_magic, LOV_MAGIC);
1039 lov = &export->exp_obd->u.lov;
1041 osc_brw_cbd = ll_init_brw_cb_data();
1045 OBD_ALLOC(stripeinfo, stripe_count * sizeof(*stripeinfo));
1047 GOTO(out_cbdata, rc = -ENOMEM);
1049 OBD_ALLOC(where, sizeof(*where) * oa_bufs);
1051 GOTO(out_sinfo, rc = -ENOMEM);
1053 OBD_ALLOC(ioarr, sizeof(*ioarr) * oa_bufs);
1055 GOTO(out_where, rc = -ENOMEM);
1057 /* This is the only race-free way I can think of to get the refcount
1059 atomic_set(&osc_brw_cbd->brw_refcount, 0);
1060 osc_brw_cbd->brw_cb = brw_cb;
1061 osc_brw_cbd->brw_data = brw_cbd;
1063 for (i = 0; i < oa_bufs; i++) {
1064 where[i] = lov_stripe_number(lsm, pga[i].off);
1065 if (stripeinfo[where[i]].bufct++ == 0)
1066 atomic_inc(&osc_brw_cbd->brw_refcount);
1069 for (i = 0, loi = lsm->lsm_oinfo, si_last = si = stripeinfo;
1070 i < stripe_count; i++, loi++, si_last = si, si++) {
1072 si->index = si_last->index + si_last->bufct;
1073 si->lsm.lsm_object_id = loi->loi_id;
1074 si->ost_idx = loi->loi_ost_idx;
1077 for (i = 0; i < oa_bufs; i++) {
1078 int which = where[i];
1081 shift = stripeinfo[which].index + stripeinfo[which].subcount;
1082 LASSERT(shift < oa_bufs);
1083 ioarr[shift] = pga[i];
1084 ioarr[shift].off = lov_stripe_offset(lsm, pga[i].off, which);
1085 stripeinfo[which].subcount++;
1088 for (i = 0, si = stripeinfo; i < stripe_count; i++, si++) {
1089 int shift = si->index;
1092 LASSERT(shift < oa_bufs);
1093 /* XXX handle error returns here */
1094 obd_brw(cmd, &lov->tgts[si->ost_idx].conn,
1095 &si->lsm, si->bufct, &ioarr[shift],
1096 lov_osc_brw_cb, osc_brw_cbd);
1100 rc = brw_cb(brw_cbd, 0, CB_PHASE_START);
1102 OBD_FREE(ioarr, sizeof(*ioarr) * oa_bufs);
1104 OBD_FREE(where, sizeof(*where) * oa_bufs);
1106 OBD_FREE(stripeinfo, stripe_count * sizeof(*stripeinfo));
1108 OBD_FREE(osc_brw_cbd, sizeof(*osc_brw_cbd));
// Enqueue an extent lock on each sub-object of a striped object.
// Visible steps: lsm must be present with LOV_MAGIC and the export must have an
// obd. For each stripe, cookie is treated as an ldlm_extent whose start and end
// are converted to per-stripe offsets with lov_stripe_offset(); the two offsets
// are compared for equality; a temporary lov_stripe_md (submd) is filled with the
// sub-object id, an easize from lov_mds_md_size() and a zero stripe count; and
// obd_enqueue() is called on the target with the resulting lock handle stored in
// lockhs[i].
// NOTE(review): declarations of rc and i, the action taken when the sub-extent
// start equals its end, and the handling after a failed enqueue are not visible
// in this excerpt. The existing XXX notes that submd is only partly initialised.
1112 static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm,
1113 struct lustre_handle *parent_lock,
1114 __u32 type, void *cookie, int cookielen, __u32 mode,
1115 int *flags, void *cb, void *data, int datalen,
1116 struct lustre_handle *lockhs)
1118 struct obd_export *export = class_conn2export(conn);
1119 struct lov_obd *lov;
1120 struct lov_oinfo *loi;
1125 CERROR("LOV requires striping ea\n");
1129 if (lsm->lsm_magic != LOV_MAGIC) {
1130 CERROR("LOV striping magic bad %#lx != %#lx\n",
1131 lsm->lsm_magic, LOV_MAGIC);
1135 if (!export || !export->exp_obd)
1138 lov = &export->exp_obd->u.lov;
1139 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
1140 struct ldlm_extent *extent = (struct ldlm_extent *)cookie;
1141 struct ldlm_extent sub_ext;
1142 struct lov_stripe_md submd;
1144 sub_ext.start = lov_stripe_offset(lsm, extent->start, i);
1145 sub_ext.end = lov_stripe_offset(lsm, extent->end, i);
1146 if (sub_ext.start == sub_ext.end)
1149 submd.lsm_object_id = loi->loi_id;
1150 /* XXX submd lsm_mds_easize should be that from the subobj,
1151 * and the subobj should get it opaquely from the LOV.
1153 submd.lsm_mds_easize = lov_mds_md_size(lsm->lsm_ost_count);
1154 submd.lsm_stripe_count = 0;
1155 /* XXX submd is not fully initialized here */
1156 rc = obd_enqueue(&(lov->tgts[loi->loi_ost_idx].conn), &submd,
1157 parent_lock, type, &sub_ext, sizeof(sub_ext),
1158 mode, flags, cb, data, datalen, &(lockhs[i]));
1159 // XXX add a lock debug statement here
1161 CERROR("Error enqueue objid "LPX64" subobj "LPX64
1162 " on OST idx %d: rc = %d\n", lsm->lsm_object_id,
1163 loi->loi_id, loi->loi_ost_idx, rc);
// Cancel the per-stripe locks recorded in lockhs for a striped object.
// Visible steps: lsm must be present with LOV_MAGIC and the export must have an
// obd. For each stripe, lockhs[i].addr is tested against 0, a temporary
// lov_stripe_md is filled with the sub-object id, an easize from
// lov_mds_md_size() and a zero stripe count, and obd_cancel() is called on the
// target; failures are logged.
// NOTE(review): declarations of rc and i, the action taken for a zero lock
// address, and the trailing arguments of obd_cancel() are not visible in this
// excerpt.
1168 static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm,
1169 __u32 mode, struct lustre_handle *lockhs)
1171 struct obd_export *export = class_conn2export(conn);
1172 struct lov_obd *lov;
1173 struct lov_oinfo *loi;
1178 CERROR("LOV requires striping ea\n");
1182 if (lsm->lsm_magic != LOV_MAGIC) {
1183 CERROR("LOV striping magic bad %#lx != %#lx\n",
1184 lsm->lsm_magic, LOV_MAGIC);
1188 if (!export || !export->exp_obd)
1191 lov = &export->exp_obd->u.lov;
1192 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
1193 struct lov_stripe_md submd;
1195 if (lockhs[i].addr == 0)
1198 submd.lsm_object_id = loi->loi_id;
1199 submd.lsm_mds_easize = lov_mds_md_size(lsm->lsm_ost_count);
1200 submd.lsm_stripe_count = 0;
1201 rc = obd_cancel(&lov->tgts[loi->loi_ost_idx].conn, &submd,
1204 CERROR("Error cancel objid "LPX64" subobj "LPX64
1205 " on OST idx %d: rc = %d\n", lsm->lsm_object_id,
1206 loi->loi_id, loi->loi_ost_idx, rc);
// Ask every target holding a sub-object of lsm to cancel its unused locks.
// Visible steps: lsm must be present and the export must have an obd (no magic
// check is visible here, unlike the sibling functions). For each stripe a
// temporary lov_stripe_md is filled with the sub-object id, an easize from
// lov_mds_md_size() and a zero stripe count, and obd_cancel_unused() is called on
// the target; failures are logged.
// NOTE(review): declarations of rc and i and the trailing arguments of
// obd_cancel_unused() are not visible in this excerpt.
1211 static int lov_cancel_unused(struct lustre_handle *conn,
1212 struct lov_stripe_md *lsm, int flags)
1214 struct obd_export *export = class_conn2export(conn);
1215 struct lov_obd *lov;
1216 struct lov_oinfo *loi;
1221 CERROR("LOV requires striping ea for lock cancellation\n");
1225 if (!export || !export->exp_obd)
1228 lov = &export->exp_obd->u.lov;
1229 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
1230 struct lov_stripe_md submd;
1232 submd.lsm_object_id = loi->loi_id;
1233 submd.lsm_mds_easize = lov_mds_md_size(lsm->lsm_ost_count);
1234 submd.lsm_stripe_count = 0;
1235 rc = obd_cancel_unused(&lov->tgts[loi->loi_ost_idx].conn,
1238 CERROR("Error cancel unused objid "LPX64" subobj "LPX64
1239 " on OST idx %d: rc = %d\n", lsm->lsm_object_id,
1240 loi->loi_id, loi->loi_ost_idx, rc);
// Aggregate filesystem statistics over the active targets of the LOV.
// Visible steps: the export must have an obd; for each target the active flag is
// tested and obd_statfs() fills lov_sfs; a failing target is logged and the loop
// continues. One visible path copies the whole lov_sfs into osfs, the other adds
// os_bfree, os_bavail and os_blocks to the running totals.
// NOTE(review): declarations of i, rc and the flag that separates the first
// result from later ones, plus the return statement, are not visible in this
// excerpt. The trailing XXX comment (file counts) is not terminated in this
// excerpt.
1245 static int lov_statfs(struct lustre_handle *conn, struct obd_statfs *osfs)
1247 struct obd_export *export = class_conn2export(conn);
1248 struct lov_obd *lov;
1249 struct obd_statfs lov_sfs;
1255 if (!export || !export->exp_obd)
1258 lov = &export->exp_obd->u.lov;
1260 /* We only get block data from the OBD */
1261 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
1264 if (!lov->tgts[i].active)
1267 err = obd_statfs(&lov->tgts[i].conn, &lov_sfs);
1269 CERROR("Error statfs OSC %s idx %d: err = %d\n",
1270 lov->tgts[i].uuid, i, err);
1273 continue; /* XXX or break? - probably OK to continue */
1276 memcpy(osfs, &lov_sfs, sizeof(lov_sfs));
1279 osfs->os_bfree += lov_sfs.os_bfree;
1280 osfs->os_bavail += lov_sfs.os_bavail;
1281 osfs->os_blocks += lov_sfs.os_blocks;
1282 /* XXX not sure about this one - depends on policy.
1283 * - could be minimum if we always stripe on all OBDs
1284 * (but that would be wrong for any other policy,
1285 * if one of the OBDs has no more objects left)
1286 * - could be sum if we stripe whole objects
1287 * - could be average, just to give a nice number
1288 * - we just pick first OST and hope it is enough
1289 sfs->f_ffree += lov_sfs.f_ffree;
// ioctl entry point for the LOV device.
//  - IOC_LOV_SET_OSC_ACTIVE: passes ioc_inlbuf1 (UUID) and ioc_offset (flag) to
//    lov_set_osc_active().
//  - OBD_IOC_LOV_GET_CONFIG: reads the user request with obd_ioctl_getdata(),
//    checks that ioc_inllen1 can hold a lov_desc and that ioc_inllen2 can hold
//    one obd_uuid_t per target, fills both buffers from lov->desc and lov->tgts,
//    and copies the buffer back with copy_to_user().
//  - The remaining visible code tests for a zero target count and then passes
//    cmd to every target with obd_iocontrol() (presumably the default case).
// NOTE(review): the switch statement, the declarations of buf, uuidp, count, i
// and rc, the error codes for the failed size checks, and the release of buf are
// not visible in this excerpt.
1296 static int lov_iocontrol(long cmd, struct lustre_handle *conn, int len,
1297 void *karg, void *uarg)
1299 struct obd_device *obddev = class_conn2obd(conn);
1300 struct obd_ioctl_data *data = karg;
1301 struct lov_obd *lov = &obddev->u.lov;
1302 struct lov_desc *desc;
1303 struct lov_tgt_desc *tgtdesc;
1310 case IOC_LOV_SET_OSC_ACTIVE:
1311 rc = lov_set_osc_active(lov,data->ioc_inlbuf1,data->ioc_offset);
1313 case OBD_IOC_LOV_GET_CONFIG:
1316 if (obd_ioctl_getdata(&buf, &len, (void *)uarg))
1319 data = (struct obd_ioctl_data *)buf;
1321 if (sizeof(*desc) > data->ioc_inllen1) {
1326 count = lov->desc.ld_tgt_count;
1328 if (sizeof(*uuidp) * count > data->ioc_inllen2) {
1333 desc = (struct lov_desc *)data->ioc_inlbuf1;
1334 uuidp = (obd_uuid_t *)data->ioc_inlbuf2;
1335 memcpy(desc, &(lov->desc), sizeof(*desc));
1337 tgtdesc = lov->tgts;
1338 for (i = 0; i < count; i++, uuidp++, tgtdesc++)
1339 memcpy(uuidp, tgtdesc->uuid, sizeof(*uuidp));
1341 rc = copy_to_user((void *)uarg, buf, len);
1345 if (lov->desc.ld_tgt_count == 0)
1348 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
1349 int err = obd_iocontrol(cmd, &lov->tgts[i].conn,
// Attach hook: registers the device with lprocfs through lprocfs_reg_obd(),
// passing the status_var_nm_1 table. The len and data parameters are not used in
// the visible lines.
// NOTE(review): the remaining arguments of lprocfs_reg_obd(), the declaration of
// rc and the return statement are not visible in this excerpt.
1359 int lov_attach(struct obd_device *dev,
1360 obd_count len, void *data)
1363 rc = lprocfs_reg_obd(dev, (struct lprocfs_vars*)status_var_nm_1,
// Detach hook: removes the lprocfs registration of the device through
// lprocfs_dereg_obd().
// NOTE(review): the declaration of rc and the return statement are not visible
// in this excerpt.
1368 int lov_detach(struct obd_device *dev)
1371 rc = lprocfs_dereg_obd(dev);
// Method table for the LOV device type; lov_init() passes it to
// class_register_type(). Entries use the GNU field: value initializer form and
// point at the functions defined above in this file.
// NOTE(review): some entries and the closing brace are not visible in this
// excerpt (lov_setup, lov_open, lov_close, lov_punch and lov_brw have no visible
// entry here) - confirm against the complete file.
1376 struct obd_ops lov_obd_ops = {
1377 o_attach: lov_attach,
1378 o_detach: lov_detach,
1380 o_connect: lov_connect,
1381 o_disconnect: lov_disconnect,
1382 o_create: lov_create,
1383 o_destroy: lov_destroy,
1384 o_getattr: lov_getattr,
1385 o_setattr: lov_setattr,
1386 o_statfs: lov_statfs,
1391 o_enqueue: lov_enqueue,
1392 o_cancel: lov_cancel,
1393 o_cancel_unused: lov_cancel_unused,
1394 o_iocontrol: lov_iocontrol
// Version string appended to the load-time banner and to MODULE_DESCRIPTION.
1398 #define LOV_VERSION "v0.1"
// Module entry point: prints the banner, creates the lov_file_cache slab cache
// sized for struct lov_file_handles, and registers the LOV device type with
// class_register_type() using lov_obd_ops, status_class_var and
// OBD_LOV_DEVICENAME.
// NOTE(review): the declaration of rc, the remaining arguments of
// kmem_cache_create(), the value returned when the cache cannot be created, and
// any cleanup after a failed registration are not visible in this excerpt.
1400 static int __init lov_init(void)
1404 printk(KERN_INFO "Lustre Logical Object Volume driver " LOV_VERSION
1405 ", info@clusterfs.com\n");
1406 lov_file_cache = kmem_cache_create("ll_lov_file_data",
1407 sizeof(struct lov_file_handles),
1409 if (!lov_file_cache)
1412 rc = class_register_type(&lov_obd_ops,
1413 (struct lprocfs_vars*)status_class_var,
1414 OBD_LOV_DEVICENAME);
// Module exit point: destroys the lov_file_cache slab cache, logging an error
// when kmem_cache_destroy() returns non-zero, then unregisters the LOV device
// type.
1421 static void __exit lov_exit(void)
1423 if (kmem_cache_destroy(lov_file_cache))
1424 CERROR("couldn't free LOV open cache\n");
1425 class_unregister_type(OBD_LOV_DEVICENAME);
// Module metadata, followed by registration of lov_init and lov_exit as the
// module load and unload handlers.
1428 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
1429 MODULE_DESCRIPTION("Lustre Logical Object Volume OBD driver " LOV_VERSION);
1430 MODULE_LICENSE("GPL");
1432 module_init(lov_init);
1433 module_exit(lov_exit);