1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * Copyright (C) 2002 Cluster File Systems, Inc.
7 * Author: Phil Schwan <phil@off.net>
8 * Peter Braam <braam@clusterfs.com>
10 * This code is issued under the GNU General Public License.
11 * See the file COPYING in this distribution
15 #define DEBUG_SUBSYSTEM S_LOV
17 #include <linux/slab.h>
18 #include <linux/module.h>
19 #include <linux/obd_support.h>
20 #include <linux/lustre_lib.h>
21 #include <linux/lustre_net.h>
22 #include <linux/lustre_idl.h>
23 #include <linux/lustre_mds.h>
24 #include <linux/obd_class.h>
25 #include <linux/obd_lov.h>
26 #include <linux/init.h>
27 #include <linux/random.h>
28 #include <linux/slab.h>
29 #include <asm/div64.h>
/* Slab cache from which struct lov_file_handles objects are allocated; it is
 * created in lov_init() and destroyed in lov_exit(). */
31 static kmem_cache_t *lov_file_cache;
/* Bookkeeping kept for one open of a striped object.
 *
 * lfh_list links the entry into the export list exp_lov_data.led_open_head
 * (see lov_open).  lfh_handles is an array of per-stripe handles that
 * lov_open allocates with lsm_stripe_count entries.  Other members used
 * elsewhere in this file (lfh_cookie, lfh_count) are not declared on the
 * lines visible here.
 */
33 struct lov_file_handles {
34 struct list_head lfh_list;
37 struct lustre_handle *lfh_handles;
/* Connect a client to the LOV device.
 *
 * The connection is first registered with class_connect().  Per the in-code
 * comment, the underlying connections are only made once; this is tracked
 * through lov->refcount.  For the first connection the function:
 *   - initialises the export list of open handles,
 *   - opens a temporary connection to the MDC, fetches the LOV descriptor
 *     and target UUID array with mdc_getlovinfo(), and disconnects again,
 *   - validates the reply (buffer count and lengths, descriptor UUID equal
 *     to obd->obd_uuid, at most 1000 targets, default stripe width not
 *     larger than ~0UL),
 *   - records the MDS and OST EA sizes in the MDC client structure,
 *   - allocates lov->tgts, copies the UUIDs into it, and for each target
 *     looks up the device, checks OBD_SET_UP, connects, issues
 *     IOC_OSC_REGISTER_LOV and marks the target active.
 *
 * The tail of the function holds the unwind code: targets are marked
 * inactive and disconnected, lov->tgts is freed and class_disconnect() is
 * called.  The failure paths jump there through the out_disc and out_conn
 * labels.
 */
41 static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
42 obd_uuid_t cluuid, struct recovd_obd *recovd,
43 ptlrpc_recovery_cb_t recover)
45 struct ptlrpc_request *req = NULL;
46 struct lov_obd *lov = &obd->u.lov;
47 struct client_obd *mdc = &lov->mdcobd->u.cli;
48 struct lov_desc *desc = &lov->desc;
49 struct obd_export *exp;
50 struct lustre_handle mdc_conn;
51 obd_uuid_t *uuidarray;
55 rc = class_connect(conn, obd, cluuid);
61 /* We don't want to actually do the underlying connections more than
62 * once, so keep track. */
64 if (lov->refcount > 1)
67 exp = class_conn2export(conn);
68 INIT_LIST_HEAD(&exp->exp_lov_data.led_open_head);
70 /* retrieve LOV metadata from MDS */
71 rc = obd_connect(&mdc_conn, lov->mdcobd, NULL, recovd, recover);
73 CERROR("cannot connect to mdc: rc = %d\n", rc);
77 rc = mdc_getlovinfo(obd, &mdc_conn, &req);
78 rc2 = obd_disconnect(&mdc_conn);
80 CERROR("cannot get lov info %d\n", rc);
85 CERROR("error disconnecting from MDS %d\n", rc2);
86 GOTO(out_conn, rc = rc2);
90 if (req->rq_repmsg->bufcount < 2 ||
91 req->rq_repmsg->buflens[0] < sizeof(*desc)) {
92 CERROR("LOV desc: invalid descriptor returned\n");
93 GOTO(out_conn, rc = -EINVAL);
96 memcpy(desc, lustre_msg_buf(req->rq_repmsg, 0), sizeof(*desc));
99 if (req->rq_repmsg->buflens[1] < sizeof(*uuidarray)*desc->ld_tgt_count){
100 CERROR("LOV desc: invalid uuid array returned\n");
101 GOTO(out_conn, rc = -EINVAL);
104 mdc->cl_max_mds_easize = lov_mds_md_size(desc->ld_tgt_count);
105 mdc->cl_max_ost_easize = lov_stripe_md_size(desc->ld_tgt_count);
107 if (memcmp(obd->obd_uuid, desc->ld_uuid, sizeof(desc->ld_uuid))) {
108 CERROR("LOV desc: uuid %s not on mds device (%s)\n",
109 obd->obd_uuid, desc->ld_uuid);
110 GOTO(out_conn, rc = -EINVAL);
113 if (desc->ld_tgt_count > 1000) {
114 CERROR("LOV desc: target count > 1000 (%d)\n",
116 GOTO(out_conn, rc = -EINVAL);
119 /* Because of 64-bit divide/mod operations only work with a 32-bit
120 * divisor in a 32-bit kernel, we cannot support a stripe width
121 * of 4GB or larger on 32-bit CPUs.
123 if ((desc->ld_default_stripe_count ?
124 desc->ld_default_stripe_count : desc->ld_tgt_count) *
125 desc->ld_default_stripe_size > ~0UL) {
126 CERROR("LOV: stripe width "LPU64"x%u > %lu on 32-bit system\n",
127 desc->ld_default_stripe_size,
128 desc->ld_default_stripe_count ?
129 desc->ld_default_stripe_count : desc->ld_tgt_count,~0UL);
130 GOTO(out_conn, rc = -EINVAL);
133 lov->bufsize = sizeof(struct lov_tgt_desc) * desc->ld_tgt_count;
134 OBD_ALLOC(lov->tgts, lov->bufsize);
136 CERROR("Out of memory\n");
137 GOTO(out_conn, rc = -ENOMEM);
140 uuidarray = lustre_msg_buf(req->rq_repmsg, 1);
141 for (i = 0; i < desc->ld_tgt_count; i++)
142 memcpy(lov->tgts[i].uuid, uuidarray[i], sizeof(*uuidarray));
144 for (i = 0; i < desc->ld_tgt_count; i++) {
145 struct obd_device *tgt = class_uuid2obd(uuidarray[i]);
148 CERROR("Target %s not attached\n", uuidarray[i]);
149 GOTO(out_disc, rc = -EINVAL);
152 if (!(tgt->obd_flags & OBD_SET_UP)) {
153 CERROR("Target %s not set up\n", uuidarray[i]);
154 GOTO(out_disc, rc = -EINVAL);
157 rc = obd_connect(&lov->tgts[i].conn, tgt, NULL, recovd,
160 CERROR("Target %s connect error %d\n",
164 rc = obd_iocontrol(IOC_OSC_REGISTER_LOV, &lov->tgts[i].conn,
165 sizeof(struct obd_device *), obd, NULL);
167 CERROR("Target %s REGISTER_LOV error %d\n",
171 desc->ld_active_tgt_count++;
172 lov->tgts[i].active = 1;
176 ptlrpc_req_finished(req);
181 desc->ld_active_tgt_count--;
182 lov->tgts[i].active = 0;
183 rc2 = obd_disconnect(&lov->tgts[i].conn);
185 CERROR("LOV Target %s disconnect error: rc = %d\n",
188 OBD_FREE(lov->tgts, lov->bufsize);
190 class_disconnect(conn);
/* Disconnect a client from the LOV device.
 *
 * Per the in-code comment the underlying layers are only disconnected on the
 * final disconnect, which is decided from lov->refcount.  In that case every
 * target is visited: inactive ones are skipped with a message, active ones
 * are marked inactive, counted out of ld_active_tgt_count and disconnected
 * with obd_disconnect(); lov->tgts is then freed.
 *
 * Any lov_file_handles still linked on the export list led_open_head are
 * discarded: each is logged, unlinked, and its handle array and slab object
 * are released.  Finally class_disconnect() is called on the connection.
 */
194 static int lov_disconnect(struct lustre_handle *conn)
196 struct obd_device *obd = class_conn2obd(conn);
197 struct lov_obd *lov = &obd->u.lov;
198 struct obd_export *exp;
199 struct list_head *p, *n;
205 /* Only disconnect the underlying layers on the final disconnect. */
207 if (lov->refcount != 0)
210 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
211 if (!lov->tgts[i].active) {
212 CERROR("Skipping disconnect for inactive OSC %s\n",
217 lov->desc.ld_active_tgt_count--;
218 lov->tgts[i].active = 0;
219 rc = obd_disconnect(&lov->tgts[i].conn);
221 CERROR("Target %s disconnect error %d\n",
222 lov->tgts[i].uuid, rc);
226 OBD_FREE(lov->tgts, lov->bufsize);
230 exp = class_conn2export(conn);
231 list_for_each_safe(p, n, &exp->exp_lov_data.led_open_head) {
232 /* XXX close these, instead of just discarding them? */
233 struct lov_file_handles *lfh;
234 lfh = list_entry(p, typeof(*lfh), lfh_list);
235 CERROR("discarding open LOV handle %p:"LPX64"\n",
236 lfh, lfh->lfh_cookie);
237 list_del(&lfh->lfh_list);
238 OBD_FREE(lfh->lfh_handles,
239 lfh->lfh_count * sizeof(*lfh->lfh_handles));
240 kmem_cache_free(lov_file_cache, lfh);
244 rc = class_disconnect(conn);
252 * -EINVAL : UUID can't be found in the LOV's target list
253 * -ENOTCONN: The UUID is found, but the target connection is bad (!)
254 * -EBADF : The UUID is found, but the OBD is the wrong type (!)
255 * -EALREADY: The OSC is already marked (in)active
/* Mark the target identified by uuid as active or inactive.
 *
 * The whole operation runs between spin_lock() and spin_unlock() on
 * lov->lov_lock.  The target is found by comparing uuid with
 * lov->tgts[].uuid using strncmp(); the device behind its connection must
 * have the type name "osc".  When the requested state differs from the
 * current one, tgts[i].active is updated and ld_active_tgt_count is
 * adjusted up or down.
 *
 * Error codes assigned on the visible paths: -EINVAL (uuid not in the target
 * list), -ENOTCONN, -EBADF (device is not an "osc") and -EALREADY (state
 * already as requested).
 */
257 static int lov_set_osc_active(struct lov_obd *lov, obd_uuid_t uuid,
260 struct obd_device *obd;
264 CDEBUG(D_INFO, "Searching in lov %p for uuid %s (activate=%d)\n",
265 lov, uuid, activate);
267 spin_lock(&lov->lov_lock);
268 for (i = 0; i < lov->desc.ld_tgt_count; i++)
269 if (strncmp(uuid, lov->tgts[i].uuid,
270 sizeof(lov->tgts[i].uuid)) == 0)
273 if (i == lov->desc.ld_tgt_count)
274 GOTO(out, rc = -EINVAL);
276 obd = class_conn2obd(&lov->tgts[i].conn);
279 GOTO(out, rc = -ENOTCONN);
282 CDEBUG(D_INFO, "Found OBD %p type %s\n", obd, obd->obd_type->typ_name);
283 if (strcmp(obd->obd_type->typ_name, "osc") != 0) {
285 GOTO(out, rc = -EBADF);
288 if (lov->tgts[i].active == activate) {
289 CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd,
290 activate ? "" : "in");
291 GOTO(out, rc = -EALREADY);
294 CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, activate ? "" : "in");
296 lov->tgts[i].active = activate;
298 lov->desc.ld_active_tgt_count++;
300 lov->desc.ld_active_tgt_count--;
304 spin_unlock(&lov->lov_lock);
/* Set up the LOV device from the obd_ioctl_data passed in buf.
 *
 * ioc_inlbuf1 names the MDC by UUID.  An ioc_inllen1 below 1 or above 37 is
 * reported with CERROR.  The function initialises lov->lov_lock and resolves
 * lov->mdcobd with class_uuid2obd(); a failure to locate the MDC is logged.
 * The values returned on these paths are not visible in this listing.
 */
308 static int lov_setup(struct obd_device *obd, obd_count len, void *buf)
310 struct obd_ioctl_data* data = buf;
311 struct lov_obd *lov = &obd->u.lov;
315 if (data->ioc_inllen1 < 1) {
316 CERROR("osc setup requires an MDC UUID\n");
320 if (data->ioc_inllen1 > 37) {
321 CERROR("mdc UUID must be 36 characters or less\n");
325 spin_lock_init(&lov->lov_lock);
326 lov->mdcobd = class_uuid2obd(data->ioc_inlbuf1);
328 CERROR("LOV %s cannot locate MDC %s\n", obd->obd_uuid,
/* Translate a lustre_handle into the lov_file_handles it designates.
 *
 * handle->addr is reinterpreted as a pointer, checked against lov_file_cache
 * with kmem_cache_validate(), and the cookie stored in the object is compared
 * with handle->cookie.  A NULL handle or zero addr is tested first.
 * NOTE(review): the statements executed when a check fails are not visible
 * here; presumably NULL is returned -- confirm.
 */
335 static struct lov_file_handles *lov_handle2lfh(struct lustre_handle *handle)
337 struct lov_file_handles *lfh = NULL;
339 if (!handle || !handle->addr)
342 lfh = (struct lov_file_handles *)(unsigned long)(handle->addr);
343 if (!kmem_cache_validate(lov_file_cache, lfh))
346 if (lfh->lfh_cookie != handle->cookie)
352 /* the LOV expects oa->o_id to be set to the LOV object id */
/* Create a striped object and one sub-object per stripe.
 *
 * With lov->lov_lock held the function sizes the striping EA from
 * ld_tgt_count, allocates and initialises a lov_stripe_md, and fills in its
 * defaults: the object id comes from oa->o_id (OBD_MD_FLID must be set), a
 * zero stripe count falls back to ld_default_stripe_count and then to
 * ld_active_tgt_count and is capped at the active target count, and a zero
 * stripe size falls back to ld_default_stripe_size.  A stripe width larger
 * than ~0UL is rejected with -EINVAL.
 *
 * The first OST is derived from the object id when no usable
 * lsm_stripe_offset was supplied, inactive targets are skipped, and the OST
 * index of every stripe is recorded in lsm_oinfo before the lock is dropped.
 * After unlocking, obd_create() is called per stripe on a copy of oa and the
 * resulting id and size are stored in lsm_oinfo.  The tail of the function
 * destroys already created sub-objects and frees lsm for the failure paths.
 *
 * The derived starting offset is reduced modulo ost_count: stripe_offset is
 * already below ost_count, but adding sub_offset (up to stripe_count - 1) can
 * push the sum to ost_count or beyond, and that value is used directly as an
 * index into lov->tgts[] by the loop that skips inactive targets.
 *
 * NOTE(review): mult is an int computed from lsm_object_id, which is printed
 * with LPX64 elsewhere; if the product can exceed INT_MAX the remainder may
 * be negative -- confirm and widen or make unsigned if so.
 */
353 static int lov_create(struct lustre_handle *conn, struct obdo *oa,
354 struct lov_stripe_md **ea)
356 struct obd_export *export = class_conn2export(conn);
358 struct lov_stripe_md *lsm;
359 struct lov_oinfo *loi;
361 int ost_count, ost_idx = 1, i, rc = 0;
373 lov = &export->exp_obd->u.lov;
375 spin_lock(&lov->lov_lock);
376 ost_count = lov->desc.ld_tgt_count;
377 oa->o_easize = lov_stripe_md_size(ost_count);
381 OBD_ALLOC(lsm, oa->o_easize);
383 spin_unlock(&lov->lov_lock);
384 GOTO(out_tmp, rc = -ENOMEM);
386 lsm->lsm_magic = LOV_MAGIC;
387 lsm->lsm_mds_easize = lov_mds_md_size(ost_count);
388 ost_idx = 0; /* if lsm->lsm_stripe_offset is set yet */
391 LASSERT(oa->o_valid & OBD_MD_FLID);
392 lsm->lsm_object_id = oa->o_id;
393 if (!lsm->lsm_stripe_count)
394 lsm->lsm_stripe_count = lov->desc.ld_default_stripe_count;
395 if (!lsm->lsm_stripe_count)
396 lsm->lsm_stripe_count = lov->desc.ld_active_tgt_count;
397 else if (lsm->lsm_stripe_count > lov->desc.ld_active_tgt_count)
398 lsm->lsm_stripe_count = lov->desc.ld_active_tgt_count;
400 if (!lsm->lsm_stripe_size)
401 lsm->lsm_stripe_size = lov->desc.ld_default_stripe_size;
403 /* Because of 64-bit divide/mod operations only work with a 32-bit
404 * divisor in a 32-bit kernel, we cannot support a stripe width
405 * of 4GB or larger on 32-bit CPUs.
407 if (lsm->lsm_stripe_size * lsm->lsm_stripe_count > ~0UL) {
408 CERROR("LOV: stripe width "LPU64"x%u > %lu on 32-bit system\n",
409 lsm->lsm_stripe_size, lsm->lsm_stripe_count, ~0UL);
410 spin_unlock(&lov->lov_lock);
411 GOTO(out_free, rc = -EINVAL);
414 lsm->lsm_ost_count = ost_count;
415 if (!ost_idx || lsm->lsm_stripe_offset >= ost_count) {
416 int mult = lsm->lsm_object_id * lsm->lsm_stripe_count;
417 int stripe_offset = mult % ost_count;
418 int sub_offset = (mult / ost_count) % lsm->lsm_stripe_count;
/* keep the starting index inside lov->tgts[0 .. ost_count-1] */
420 lsm->lsm_stripe_offset = (stripe_offset + sub_offset) % ost_count;
423 while (!lov->tgts[lsm->lsm_stripe_offset].active)
424 lsm->lsm_stripe_offset = (lsm->lsm_stripe_offset+1) % ost_count;
426 /* Pick the OSTs before we release the lock */
427 ost_idx = lsm->lsm_stripe_offset;
428 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
429 CDEBUG(D_INODE, "objid "LPX64"[%d] is ost_idx %d (uuid %s)\n",
430 lsm->lsm_object_id, i, ost_idx, lov->tgts[ost_idx].uuid);
431 loi->loi_ost_idx = ost_idx;
433 ost_idx = (ost_idx + 1) % ost_count;
434 } while (!lov->tgts[ost_idx].active);
437 spin_unlock(&lov->lov_lock);
439 CDEBUG(D_INODE, "allocating %d subobjs for objid "LPX64" at idx %d\n",
440 lsm->lsm_stripe_count,lsm->lsm_object_id,lsm->lsm_stripe_offset);
442 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
443 struct lov_stripe_md obj_md;
444 struct lov_stripe_md *obj_mdp = &obj_md;
446 ost_idx = loi->loi_ost_idx;
448 /* create data objects with "parent" OA */
449 memcpy(tmp, oa, sizeof(*tmp));
450 tmp->o_easize = sizeof(struct lov_stripe_md);
451 rc = obd_create(&lov->tgts[ost_idx].conn, tmp, &obj_mdp);
453 CERROR("error creating objid "LPX64" sub-object on "
454 "OST idx %d: rc = %d\n", oa->o_id, ost_idx, rc);
455 GOTO(out_cleanup, rc);
457 loi->loi_id = tmp->o_id;
458 loi->loi_size = tmp->o_size;
459 CDEBUG(D_INODE, "objid "LPX64" has subobj "LPX64" at idx %d\n",
460 lsm->lsm_object_id, loi->loi_id, ost_idx);
474 /* destroy already created objects here */
475 memcpy(tmp, oa, sizeof(*tmp));
476 tmp->o_id = loi->loi_id;
477 err = obd_destroy(&lov->tgts[loi->loi_ost_idx].conn, tmp, NULL);
479 CERROR("Failed to uncreate objid "LPX64" subobj "
480 LPX64" on OST idx %d: rc = %d\n",
481 oa->o_id, loi->loi_id, loi->loi_ost_idx,
485 OBD_FREE(lsm, oa->o_easize);
/* Destroy every sub-object of a striped object.
 *
 * A striping EA carrying LOV_MAGIC and a usable export are checked first.
 * When oa carries OBD_MD_FLHANDLE the matching lov_file_handles is looked up
 * with lov_handle2lfh().  For each stripe a copy of oa is made, its id is
 * replaced by the sub-object id, the handle is either replaced by the
 * per-stripe handle or the OBD_MD_FLHANDLE flag is cleared, and
 * obd_destroy() is called on the stripe's target.
 *
 * The failure message ends with the newline; it previously had the newline
 * in the middle of the text, which split the log line and left "rc = ..."
 * unterminated.
 */
489 static int lov_destroy(struct lustre_handle *conn, struct obdo *oa,
490 struct lov_stripe_md *lsm)
493 struct obd_export *export = class_conn2export(conn);
495 struct lov_oinfo *loi;
496 struct lov_file_handles *lfh = NULL;
501 CERROR("LOV requires striping ea for destruction\n");
505 if (lsm->lsm_magic != LOV_MAGIC) {
506 CERROR("LOV striping magic bad %#lx != %#lx\n",
507 lsm->lsm_magic, LOV_MAGIC);
511 if (!export || !export->exp_obd)
514 if (oa->o_valid & OBD_MD_FLHANDLE)
515 lfh = lov_handle2lfh(obdo_handle(oa));
517 lov = &export->exp_obd->u.lov;
518 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
519 memcpy(&tmp, oa, sizeof(tmp));
520 tmp.o_id = loi->loi_id;
522 memcpy(obdo_handle(&tmp), &lfh->lfh_handles[i],
523 sizeof(lfh->lfh_handles[i]));
525 tmp.o_valid &= ~OBD_MD_FLHANDLE;
526 rc = obd_destroy(&lov->tgts[loi->loi_ost_idx].conn, &tmp, NULL);
528 CERROR("Error destroying objid "LPX64" subobj "LPX64
529 " on OST idx %d: rc = %d\n",
530 oa->o_id, loi->loi_id, loi->loi_ost_idx, rc);
535 /* compute object size given "stripeno" and the ost size */
/* Translate the size of one stripe object into the file size it implies.
 *
 * ssize is the stripe size and swidth the full stripe width (ssize times the
 * stripe count).  do_div() splits ost_size into a count of whole stripes
 * (left in ost_size) and a remainder (stripe_size).  Two results are
 * computed on the visible lines:
 *   ost_size * swidth + stripeno * ssize + stripe_size
 *   (ost_size - 1) * swidth + (stripeno + 1) * ssize
 * NOTE(review): the condition choosing between them is not visible here;
 * presumably it is whether the remainder is non-zero -- confirm.
 */
536 static obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size,
539 unsigned long ssize = lsm->lsm_stripe_size;
540 unsigned long swidth = ssize * lsm->lsm_stripe_count;
541 unsigned long stripe_size;
547 /* do_div(a, b) returns a % b, and a = a / b */
548 stripe_size = do_div(ost_size, ssize);
551 lov_size = ost_size * swidth + stripeno * ssize + stripe_size;
553 lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize;
/* Fold the attributes of one stripe object (src) into the aggregate (tgt).
 *
 * Two treatments are visible.  In the first, the fields selected by valid are
 * copied with obdo_cpy_md() and the size is converted to a file size with
 * lov_stripe_size().  In the second, values are merged: the size becomes the
 * larger of the current and the converted stripe size (which handles sparse
 * files), block counts are summed, and ctime and mtime keep the later value.
 * NOTE(review): the test on *new that selects between the two is not visible
 * here; presumably the copy is used for the first stripe -- confirm.
 */
558 static void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_flag valid,
559 struct lov_stripe_md *lsm, int stripeno, int *new)
562 obdo_cpy_md(tgt, src, valid);
563 if (valid & OBD_MD_FLSIZE)
564 tgt->o_size = lov_stripe_size(lsm,src->o_size,stripeno);
567 if (valid & OBD_MD_FLSIZE) {
568 /* this handles sparse files properly */
571 lov_size = lov_stripe_size(lsm, src->o_size, stripeno);
572 if (lov_size > tgt->o_size)
573 tgt->o_size = lov_size;
575 if (valid & OBD_MD_FLBLOCKS)
576 tgt->o_blocks += src->o_blocks;
577 if (valid & OBD_MD_FLCTIME && tgt->o_ctime < src->o_ctime)
578 tgt->o_ctime = src->o_ctime;
579 if (valid & OBD_MD_FLMTIME && tgt->o_mtime < src->o_mtime)
580 tgt->o_mtime = src->o_mtime;
/* Collect the attributes of a striped object from its sub-objects.
 *
 * A striping EA carrying LOV_MAGIC and a usable export are checked first.
 * When oa carries OBD_MD_FLHANDLE the lov_file_handles is looked up.  For
 * each stripe (a sub-object id of 0 is tested specially) a copy of oa gets
 * the sub-object id and either the per-stripe handle or a cleared
 * OBD_MD_FLHANDLE flag, obd_getattr() is called on the stripe's target, and
 * the reply is merged into oa with lov_merge_attrs().  A failing stripe is
 * logged and the loop continues with the next one.
 */
584 static int lov_getattr(struct lustre_handle *conn, struct obdo *oa,
585 struct lov_stripe_md *lsm)
588 struct obd_export *export = class_conn2export(conn);
590 struct lov_oinfo *loi;
591 struct lov_file_handles *lfh = NULL;
597 CERROR("LOV requires striping ea\n");
601 if (lsm->lsm_magic != LOV_MAGIC) {
602 CERROR("LOV striping magic bad %#lx != %#lx\n",
603 lsm->lsm_magic, LOV_MAGIC);
607 if (!export || !export->exp_obd)
610 lov = &export->exp_obd->u.lov;
612 if (oa->o_valid & OBD_MD_FLHANDLE)
613 lfh = lov_handle2lfh(obdo_handle(oa));
615 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
618 if (loi->loi_id == 0)
621 CDEBUG(D_INFO, "objid "LPX64"[%d] has subobj "LPX64" at idx "
622 "%u\n", oa->o_id, i, loi->loi_id, loi->loi_ost_idx);
623 /* create data objects with "parent" OA */
624 memcpy(&tmp, oa, sizeof(tmp));
625 tmp.o_id = loi->loi_id;
627 memcpy(obdo_handle(&tmp), &lfh->lfh_handles[i],
628 sizeof(lfh->lfh_handles[i]));
630 tmp.o_valid &= ~OBD_MD_FLHANDLE;
632 err = obd_getattr(&lov->tgts[loi->loi_ost_idx].conn, &tmp,NULL);
634 CERROR("Error getattr objid "LPX64" subobj "LPX64
635 " on OST idx %d: rc = %d\n",
636 oa->o_id, loi->loi_id, loi->loi_ost_idx, err);
639 continue; /* XXX or break? */
641 lov_merge_attrs(oa, &tmp, tmp.o_valid, lsm, i, &new);
/* Apply attribute changes to every sub-object of a striped object.
 *
 * The in-code note states that this path is currently unused.  A striping EA
 * carrying LOV_MAGIC and a usable export are checked, and size changes are
 * excluded by an assertion (they are expected to go through punch).  For
 * each stripe the fields selected by oa->o_valid are copied into tmp with
 * obdo_cpy_md(), the handle is either replaced by the per-stripe handle or
 * the OBD_MD_FLHANDLE flag is cleared, the sub-object id is set, and
 * obd_setattr() is called on the stripe's target.  Failures are logged.
 */
646 static int lov_setattr(struct lustre_handle *conn, struct obdo *oa,
647 struct lov_stripe_md *lsm)
650 struct obd_export *export = class_conn2export(conn);
652 struct lov_oinfo *loi;
653 struct lov_file_handles *lfh = NULL;
657 /* Note that this code is currently unused, hence LBUG(), just
658 * to know when/if it is ever revived that it needs cleanups.
663 CERROR("LOV requires striping ea\n");
667 if (lsm->lsm_magic != LOV_MAGIC) {
668 CERROR("LOV striping magic bad %#lx != %#lx\n",
669 lsm->lsm_magic, LOV_MAGIC);
673 if (!export || !export->exp_obd)
676 /* size changes should go through punch and not setattr */
677 LASSERT(!(oa->o_valid & OBD_MD_FLSIZE));
683 if (oa->o_valid & OBD_MD_FLHANDLE)
684 lfh = lov_handle2lfh(obdo_handle(oa));
686 lov = &export->exp_obd->u.lov;
687 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
690 obdo_cpy_md(tmp, oa, oa->o_valid);
693 memcpy(obdo_handle(tmp), &lfh->lfh_handles[i],
694 sizeof(lfh->lfh_handles[i]));
696 tmp->o_valid &= ~OBD_MD_FLHANDLE;
698 tmp->o_id = loi->loi_id;
700 err = obd_setattr(&lov->tgts[loi->loi_ost_idx].conn, tmp, NULL);
702 CERROR("Error setattr objid "LPX64" subobj "LPX64
703 " on OST idx %d: rc = %d\n",
704 oa->o_id, loi->loi_id, loi->loi_ost_idx, err);
/* Open every sub-object of a striped object.
 *
 * A striping EA carrying LOV_MAGIC and a usable export are checked first.
 * A lov_file_handles is taken from lov_file_cache together with an array of
 * lsm_stripe_count per-stripe handles.  For each stripe a copy of oa gets
 * the sub-object id and obd_open() is called on the stripe's target; the
 * returned attributes are merged into oa with lov_merge_attrs() and, when
 * the reply carries OBD_MD_FLHANDLE, the handle is saved in lfh_handles[i].
 *
 * When handles were returned, the lov_file_handles receives a random cookie,
 * is published through the handle in oa (addr and cookie, OBD_MD_FLHANDLE
 * set) and is linked on the export list led_open_head.  The tail of the
 * function frees the handle array and the slab object, marking the cookie
 * with DEAD_HANDLE_MAGIC first.
 *
 * The per-stripe failure message reports err, the result of the obd_open()
 * call that failed, matching lov_getattr, lov_setattr, lov_close and
 * lov_punch.  It used to print rc, which does not hold the result of the
 * call being reported.
 */
713 static int lov_open(struct lustre_handle *conn, struct obdo *oa,
714 struct lov_stripe_md *lsm)
717 struct obd_export *export = class_conn2export(conn);
719 struct lov_oinfo *loi;
720 struct lov_file_handles *lfh = NULL;
726 CERROR("LOV requires striping ea for opening\n");
730 if (lsm->lsm_magic != LOV_MAGIC) {
731 CERROR("LOV striping magic bad %#lx != %#lx\n",
732 lsm->lsm_magic, LOV_MAGIC);
736 if (!export || !export->exp_obd)
743 lfh = kmem_cache_alloc(lov_file_cache, GFP_KERNEL);
745 GOTO(out_tmp, rc = -ENOMEM);
746 OBD_ALLOC(lfh->lfh_handles,
747 lsm->lsm_stripe_count * sizeof(*lfh->lfh_handles));
748 if (!lfh->lfh_handles)
749 GOTO(out_lfh, rc = -ENOMEM);
751 lov = &export->exp_obd->u.lov;
754 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
757 /* create data objects with "parent" OA */
758 memcpy(tmp, oa, sizeof(*tmp));
759 tmp->o_id = loi->loi_id;
761 err = obd_open(&lov->tgts[loi->loi_ost_idx].conn, tmp, NULL);
763 CERROR("Error open objid "LPX64" subobj "LPX64
764 " on OST idx %d: rc = %d\n",
765 oa->o_id, lsm->lsm_oinfo[i].loi_id,
766 loi->loi_ost_idx, err);
771 lov_merge_attrs(oa, tmp, tmp->o_valid, lsm, i, &new);
773 if (tmp->o_valid & OBD_MD_FLHANDLE)
774 memcpy(&lfh->lfh_handles[i], obdo_handle(tmp),
775 sizeof(lfh->lfh_handles[i]));
778 if (tmp->o_valid & OBD_MD_FLHANDLE) {
779 struct lustre_handle *handle = obdo_handle(oa);
781 lfh->lfh_count = lsm->lsm_stripe_count;
782 get_random_bytes(&lfh->lfh_cookie, sizeof(lfh->lfh_cookie));
784 handle->addr = (__u64)(unsigned long)lfh;
785 handle->cookie = lfh->lfh_cookie;
786 oa->o_valid |= OBD_MD_FLHANDLE;
787 list_add(&lfh->lfh_list, &export->exp_lov_data.led_open_head);
791 /* FIXME: returning an error, but having opened some objects is a bad
792 * idea, since they will likely never be closed. We either
793 * need to not return an error if _some_ objects could be
794 * opened, and leave it to read/write to return -EIO (with
795 * hopefully partial error status) or close all opened objects
796 * and return an error. I think the former is preferred.
803 OBD_FREE(lfh->lfh_handles,
804 lsm->lsm_stripe_count * sizeof(*lfh->lfh_handles));
806 lfh->lfh_cookie = DEAD_HANDLE_MAGIC;
807 kmem_cache_free(lov_file_cache, lfh);
/* Close every sub-object of a striped object and drop its open bookkeeping.
 *
 * A striping EA carrying LOV_MAGIC and a usable export are checked first.
 * When oa carries OBD_MD_FLHANDLE the lov_file_handles is looked up.  For
 * each stripe a copy of oa gets the sub-object id and either the per-stripe
 * handle or a cleared OBD_MD_FLHANDLE flag, and obd_close() is called on the
 * stripe's target; failures are logged.  Afterwards the lov_file_handles is
 * unlinked from its list, its handle array is freed, its cookie is set to
 * DEAD_HANDLE_MAGIC and the object is returned to lov_file_cache.
 * NOTE(review): the guard around the final release is not visible here;
 * presumably it only runs when lfh is non-NULL -- confirm.
 */
811 static int lov_close(struct lustre_handle *conn, struct obdo *oa,
812 struct lov_stripe_md *lsm)
815 struct obd_export *export = class_conn2export(conn);
817 struct lov_oinfo *loi;
818 struct lov_file_handles *lfh = NULL;
823 CERROR("LOV requires striping ea\n");
827 if (lsm->lsm_magic != LOV_MAGIC) {
828 CERROR("LOV striping magic bad %#lx != %#lx\n",
829 lsm->lsm_magic, LOV_MAGIC);
833 if (!export || !export->exp_obd)
836 if (oa->o_valid & OBD_MD_FLHANDLE)
837 lfh = lov_handle2lfh(obdo_handle(oa));
839 lov = &export->exp_obd->u.lov;
840 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
843 /* create data objects with "parent" OA */
844 memcpy(&tmp, oa, sizeof(tmp));
845 tmp.o_id = loi->loi_id;
847 memcpy(obdo_handle(&tmp), &lfh->lfh_handles[i],
848 sizeof(lfh->lfh_handles[i]));
850 tmp.o_valid &= ~OBD_MD_FLHANDLE;
852 err = obd_close(&lov->tgts[loi->loi_ost_idx].conn, &tmp, NULL);
854 CERROR("Error close objid "LPX64" subobj "LPX64
855 " on OST idx %d: rc = %d\n",
856 oa->o_id, loi->loi_id, loi->loi_ost_idx, err);
862 list_del(&lfh->lfh_list);
863 OBD_FREE(lfh->lfh_handles,
864 lsm->lsm_stripe_count * sizeof(*lfh->lfh_handles));
865 lfh->lfh_cookie = DEAD_HANDLE_MAGIC;
866 kmem_cache_free(lov_file_cache, lfh);
/* log2(n) expands to ffz(~(n)).  NOTE(review): presumably ffz() yields the
 * index of the first zero bit, so this is the index of the lowest set bit of
 * n and equals the base-2 logarithm only for powers of two -- confirm.  The
 * macro is not referenced on the lines visible in this listing. */
873 #define log2(n) ffz(~(n))
876 #warning FIXME: merge these two functions now that they are nearly the same
878 /* compute ost offset in stripe "stripeno" corresponding to offset "lov_off" */
/* Translate a file offset into an offset inside the object of one stripe.
 *
 * ssize is the stripe size and swidth the full stripe width.  The values
 * OBD_OBJECT_EOF and 0 are tested up front.  do_div() then leaves the number
 * of whole stripe widths in lov_off and the remainder in stripe_off.
 * this_stripe is where this stripe's chunk starts inside a stripe width; the
 * remainder is compared with it, made relative to it, and compared with
 * ssize.  The final result is lov_off * ssize + stripe_off.
 * NOTE(review): the statements taken when the comparisons succeed are not
 * visible here; presumably they clamp stripe_off to 0 and to ssize -- confirm.
 */
879 static obd_off lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off,
882 unsigned long ssize = lsm->lsm_stripe_size;
883 unsigned long swidth = ssize * lsm->lsm_stripe_count;
884 unsigned long stripe_off, this_stripe;
886 if (lov_off == OBD_OBJECT_EOF || lov_off == 0)
889 /* do_div(a, b) returns a % b, and a = a / b */
890 stripe_off = do_div(lov_off, swidth);
892 this_stripe = stripeno * ssize;
893 if (stripe_off <= this_stripe)
896 stripe_off -= this_stripe;
898 if (stripe_off > ssize)
903 return lov_off * ssize + stripe_off;
906 /* compute which stripe number "lov_off" will be written into */
/* Return the index of the stripe that holds file offset lov_off.
 *
 * The offset is reduced to its position inside one full stripe width
 * (remainder of do_div by swidth) and that position is divided by the stripe
 * size.
 */
907 static int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off)
909 unsigned long ssize = lsm->lsm_stripe_size;
910 unsigned long swidth = ssize * lsm->lsm_stripe_count;
911 unsigned long stripe_off;
913 stripe_off = do_div(lov_off, swidth);
915 return stripe_off / ssize;
919 /* FIXME: maybe we'll just make one node the authoritative attribute node, then
920 * we can send this 'punch' to just the authoritative node and the nodes
921 * that the punch will affect. */
/* Punch (truncate) the range [start, end] in every sub-object of a striped
 * object.
 *
 * A striping EA carrying LOV_MAGIC and a usable export are checked first.
 * When oa carries OBD_MD_FLHANDLE the lov_file_handles is looked up.  For
 * each stripe the file range is translated with lov_stripe_offset() into
 * starti and endi, a copy of oa gets the sub-object id and either the
 * per-stripe handle or a cleared OBD_MD_FLHANDLE flag, and obd_punch() is
 * called on the stripe's target.  Failures are logged.
 */
922 static int lov_punch(struct lustre_handle *conn, struct obdo *oa,
923 struct lov_stripe_md *lsm,
924 obd_off start, obd_off end)
927 struct obd_export *export = class_conn2export(conn);
929 struct lov_oinfo *loi;
930 struct lov_file_handles *lfh = NULL;
935 CERROR("LOV requires striping ea\n");
939 if (lsm->lsm_magic != LOV_MAGIC) {
940 CERROR("LOV striping magic bad %#lx != %#lx\n",
941 lsm->lsm_magic, LOV_MAGIC);
945 if (!export || !export->exp_obd)
948 if (oa->o_valid & OBD_MD_FLHANDLE)
949 lfh = lov_handle2lfh(obdo_handle(oa));
951 lov = &export->exp_obd->u.lov;
952 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
953 obd_off starti = lov_stripe_offset(lsm, start, i);
954 obd_off endi = lov_stripe_offset(lsm, end, i);
959 /* create data objects with "parent" OA */
960 memcpy(&tmp, oa, sizeof(tmp));
961 tmp.o_id = loi->loi_id;
963 memcpy(obdo_handle(&tmp), &lfh->lfh_handles[i],
964 sizeof(lfh->lfh_handles[i]));
966 tmp.o_valid &= ~OBD_MD_FLHANDLE;
968 err = obd_punch(&lov->tgts[loi->loi_ost_idx].conn, &tmp, NULL,
971 CERROR("Error punch objid "LPX64" subobj "LPX64
972 " on OST idx %d: rc = %d\n",
973 oa->o_id, loi->loi_id, loi->loi_ost_idx, err);
/* Completion state shared by all per-stripe transfers of one lov_brw call.
 *
 * lbc_remaining is incremented in lov_brw once for every stripe that has
 * pages and decremented in lov_osc_brw_callback; when it reaches zero the
 * sleeper on lbc_waitq is woken.
 */
981 struct lov_brw_cb_data {
982 atomic_t lbc_remaining;
983 wait_queue_head_t lbc_waitq;
/* Per-stripe bulk I/O callback used by lov_brw.
 *
 * cbd->data points at the shared lov_brw_cb_data.  At CB_PHASE_START a
 * reference is taken on the bulk descriptor so that it can still be
 * inspected after a failure; lov_brw drops it again.  At CB_PHASE_FINISH the
 * outcome is logged with the remote UUID, lbc_remaining is decremented, and
 * the waiter on lbc_waitq is woken once the count reaches zero.
 */
986 static int lov_osc_brw_callback(struct io_cb_data *cbd, int err, int phase)
989 struct lov_brw_cb_data *lbc = cbd->data;
993 if (phase == CB_PHASE_START) {
994 /* We raise the reference count here, so that it's still
995 * around when we go to inspect in case of failure.
996 * Balanced in the loop at the bottom of lov_brw.
998 atomic_inc(&cbd->desc->bd_refcount);
1002 if (phase == CB_PHASE_FINISH) {
1004 CDEBUG(D_HA, "err %d on BRW to %s\n", err,
1005 cbd->desc->bd_connection->c_remote_uuid);
1009 CDEBUG(D_HA, "BRW to %s complete\n",
1010 cbd->desc->bd_connection->c_remote_uuid);
1014 if (atomic_dec_and_test(&lbc->lbc_remaining))
1015 wake_up(&lbc->lbc_waitq);
/* Read or write a vector of pages belonging to a striped object.
 *
 * The pages in pga are assigned to stripes with lov_stripe_number(), repacked
 * densely into ioarr grouped by stripe with their offsets translated by
 * lov_stripe_offset(), and one obd_brw() is started per stripe with
 * lov_osc_brw_callback as its callback.  The caller then sleeps in
 * l_wait_event() until lbc.lbc_remaining drops to zero or the timeout derived
 * from obd_timeout expires.  Afterwards recovery is requested with
 * recovd_conn_fail() for every stripe whose io_cb_data is not complete, and
 * the bulk descriptor reference taken at CB_PHASE_START is dropped.
 *
 * stripeinfo and cb_data hold stripe_count entries; where and ioarr hold
 * oa_bufs entries.  The post-wait loop and the release of cb_data therefore
 * use stripe_count.  Bounding them by oa_bufs read past the end of both
 * arrays whenever more pages than stripes were submitted, left stripes
 * unchecked when fewer were, and handed OBD_FREE a size different from the
 * one given to OBD_ALLOC.
 *
 * NOTE(review): stripe_count is initialised from lsm->lsm_stripe_count in
 * the declarations, ahead of the "LOV requires striping ea" check -- confirm
 * that lsm cannot be NULL on entry.
 */
1023 static int lov_brw(int cmd, struct lustre_handle *conn,
1024 struct lov_stripe_md *lsm, obd_count oa_bufs,
1025 struct brw_page *pga, brw_callback_t callback,
1026 struct io_cb_data *cbd)
1028 int stripe_count = lsm->lsm_stripe_count;
1029 struct obd_export *export = class_conn2export(conn);
1030 struct lov_obd *lov;
1035 struct lov_stripe_md lsm;
1037 } *stripeinfo, *si, *si_last;
1038 struct brw_page *ioarr;
1040 struct io_cb_data *cb_data;
1041 struct lov_oinfo *loi;
1042 struct lov_brw_cb_data lbc;
1043 struct l_wait_info lwi;
1048 CERROR("LOV requires striping ea\n");
1052 if (lsm->lsm_magic != LOV_MAGIC) {
1053 CERROR("LOV striping magic bad %#lx != %#lx\n",
1054 lsm->lsm_magic, LOV_MAGIC);
1058 lov = &export->exp_obd->u.lov;
1060 OBD_ALLOC(stripeinfo, sizeof(*stripeinfo) * stripe_count);
1064 OBD_ALLOC(where, sizeof(*where) * oa_bufs);
1066 GOTO(out_sinfo, rc = -ENOMEM);
1068 OBD_ALLOC(ioarr, sizeof(*ioarr) * oa_bufs);
1070 GOTO(out_where, rc = -ENOMEM);
1072 OBD_ALLOC(cb_data, sizeof(*cb_data) * stripe_count);
1074 GOTO(out_ioarr, rc = -ENOMEM);
1076 init_waitqueue_head(&lbc.lbc_waitq);
1077 atomic_set(&lbc.lbc_remaining, 0);
1079 /* Compute the page count per stripe, and set where[i] to be the
1080 * stripe number for this brw_page.
1082 for (i = 0; i < oa_bufs; i++) {
1083 where[i] = lov_stripe_number(lsm, pga[i].off);
1084 if (stripeinfo[where[i]].bufct++ == 0)
1085 atomic_inc(&lbc.lbc_remaining);
1088 /* Find the starting offset within the page array for each stripeinfo,
1089 * and the index within this LOV's vector of possible OSCs.
1091 for (i = 0, loi = lsm->lsm_oinfo, si_last = si = stripeinfo;
1092 i < stripe_count; i++, loi++, si_last = si, si++) {
1094 si->index = si_last->index + si_last->bufct;
1095 si->lsm.lsm_object_id = loi->loi_id;
1096 si->ost_idx = loi->loi_ost_idx;
1099 /* Repack the requests densely into ioarr, with each target's pages in
1100 * order, and then grouped by stripe order (A1A2A3B1B2B3C1C2, for a
1101 * write with striping pattern of ABCABCAB)).
1103 for (i = 0; i < oa_bufs; i++) {
1104 int which = where[i];
1107 shift = stripeinfo[which].index + stripeinfo[which].subcount;
1108 LASSERT(shift < oa_bufs);
1109 ioarr[shift] = pga[i];
1110 ioarr[shift].off = lov_stripe_offset(lsm, pga[i].off, which);
1111 stripeinfo[which].subcount++;
1114 /* For each target to which we are writing -- some stripes might have
1115 * zero pages to write, e.g. the write is < stripe_count *stripe_width
1116 * -- call obd_brw for the range of brw_pages sent to that target.
1117 * ([offset, count] will be A:[0, 3], B:[3, 3], C:[6, 2] for the
1120 for (i = 0, si = stripeinfo; i < stripe_count; i++, si++) {
1121 int shift = si->index;
1124 struct io_cb_data *data = &cb_data[i];
1125 LASSERT(shift < oa_bufs);
1127 /* This looks like ll_init_cb, except in-place. */
1128 init_waitqueue_head(&data->waitq);
1129 atomic_set(&data->refcount, 2);
1131 data->cb = callback;
1133 /* XXX handle error returns here */
1134 rc = obd_brw(cmd, &lov->tgts[si->ost_idx].conn,
1135 &si->lsm, si->bufct, &ioarr[shift],
1136 lov_osc_brw_callback, data);
1138 /* On error, pretend this didn't exist, because we won't
1139 * have seen a START call to add a ref to this OBD's
1140 * desc, and so we don't want to muddle with the
1141 * likely-deleted desc below.
1149 /* A brief note on the recovery story here:
1151 * Each obd_brw gets its own io_cb_data, and they're all fused into a
1152 * single allocation (cb_data). The lov_osc_brw_callback invocation
1153 * that results from each obd_brw's underlying bulk send/recv completing
1154 * will mark that io_cb_data as complete, and decrement the
1155 * lbc_remaining count in the LOV's "master" callback data.
1157 * The LOV will go to sleep as soon as all the (async) obd_brws have
1158 * been started. lov_osc_brw_callback will wake it up iff all OSCs have
1159 * completed (lbc_remaining has reached zero). If the timeout expires,
1160 * the LOV will walk the cb_data vector and initiate recovery on any
1161 * connection associated with an as-yet-incomplete desc.
1164 /* XXX Make sure that the callback doesn't block here, by faking
1165 * XXX "completion". This is very very gross, and we might be
1166 * XXX better off just not calling the callback at all.
1169 (void)callback(cbd, 0, CB_PHASE_START);
1170 /* XXX Watch us ignore the return code! */
1172 lwi = LWI_TIMEOUT_INTR(obd_timeout * HZ, NULL, NULL, NULL);
1173 rc = l_wait_event(lbc.lbc_waitq, atomic_read(&lbc.lbc_remaining) == 0,
/* stripeinfo[] and cb_data[] have one entry per stripe, not per page */
1176 for (i = 0; i < stripe_count; i++) {
1177 if (stripeinfo[i].bufct == 0)
1180 if (!cb_data[i].complete) {
1181 CERROR("invoking recovery for OSC %s: %d\n",
1182 lov->tgts[stripeinfo[i].ost_idx].uuid, rc);
1183 recovd_conn_fail(cb_data[i].desc->bd_connection);
1185 ptlrpc_bulk_decref(cb_data[i].desc);
1188 (void)callback(cbd, 0, CB_PHASE_FINISH);
1189 /* XXX We need an error reporting/bytes-written story here, statim. */
/* size must match the OBD_ALLOC of cb_data above */
1193 OBD_FREE(cb_data, sizeof(*cb_data) * stripe_count);
1195 OBD_FREE(ioarr, sizeof(*ioarr) * oa_bufs);
1197 OBD_FREE(where, sizeof(*where) * oa_bufs);
1199 OBD_FREE(stripeinfo, stripe_count * sizeof(*stripeinfo));
/* Enqueue an extent lock on every stripe touched by the requested extent.
 *
 * A striping EA carrying LOV_MAGIC and a usable export are checked first.
 * cookie is treated as a struct ldlm_extent.  For each stripe the extent is
 * translated with lov_stripe_offset(); a stripe whose translated start and
 * end coincide is tested specially.  A temporary lov_stripe_md (object id,
 * MDS EA size, stripe count 0 -- the in-code note says it is not fully
 * initialised) is passed to obd_enqueue() on the stripe's target, and the
 * resulting lock handle is stored in lockhs[i].  Failures are logged.
 */
1204 static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm,
1205 struct lustre_handle *parent_lock,
1206 __u32 type, void *cookie, int cookielen, __u32 mode,
1207 int *flags, void *cb, void *data, int datalen,
1208 struct lustre_handle *lockhs)
1210 struct obd_export *export = class_conn2export(conn);
1211 struct lov_obd *lov;
1212 struct lov_oinfo *loi;
1217 CERROR("LOV requires striping ea\n");
1221 if (lsm->lsm_magic != LOV_MAGIC) {
1222 CERROR("LOV striping magic bad %#lx != %#lx\n",
1223 lsm->lsm_magic, LOV_MAGIC);
1227 if (!export || !export->exp_obd)
1230 lov = &export->exp_obd->u.lov;
1231 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
1232 struct ldlm_extent *extent = (struct ldlm_extent *)cookie;
1233 struct ldlm_extent sub_ext;
1234 struct lov_stripe_md submd;
1236 sub_ext.start = lov_stripe_offset(lsm, extent->start, i);
1237 sub_ext.end = lov_stripe_offset(lsm, extent->end, i);
1238 if (sub_ext.start == sub_ext.end)
1241 submd.lsm_object_id = loi->loi_id;
1242 /* XXX submd lsm_mds_easize should be that from the subobj,
1243 * and the subobj should get it opaquely from the LOV.
1245 submd.lsm_mds_easize = lov_mds_md_size(lsm->lsm_ost_count);
1246 submd.lsm_stripe_count = 0;
1247 /* XXX submd is not fully initialized here */
1248 rc = obd_enqueue(&(lov->tgts[loi->loi_ost_idx].conn), &submd,
1249 parent_lock, type, &sub_ext, sizeof(sub_ext),
1250 mode, flags, cb, data, datalen, &(lockhs[i]));
1251 // XXX add a lock debug statement here
1253 CERROR("Error enqueue objid "LPX64" subobj "LPX64
1254 " on OST idx %d: rc = %d\n", lsm->lsm_object_id,
1255 loi->loi_id, loi->loi_ost_idx, rc);
/* Cancel the per-stripe locks previously obtained through lov_enqueue.
 *
 * A striping EA carrying LOV_MAGIC and a usable export are checked first.
 * For each stripe, a lock handle whose addr is 0 is tested specially;
 * otherwise a temporary lov_stripe_md (object id, MDS EA size, stripe count
 * 0) is passed to obd_cancel() on the stripe's target.  Failures are logged.
 */
1260 static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm,
1261 __u32 mode, struct lustre_handle *lockhs)
1263 struct obd_export *export = class_conn2export(conn);
1264 struct lov_obd *lov;
1265 struct lov_oinfo *loi;
1270 CERROR("LOV requires striping ea\n");
1274 if (lsm->lsm_magic != LOV_MAGIC) {
1275 CERROR("LOV striping magic bad %#lx != %#lx\n",
1276 lsm->lsm_magic, LOV_MAGIC);
1280 if (!export || !export->exp_obd)
1283 lov = &export->exp_obd->u.lov;
1284 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
1285 struct lov_stripe_md submd;
1287 if (lockhs[i].addr == 0)
1290 submd.lsm_object_id = loi->loi_id;
1291 submd.lsm_mds_easize = lov_mds_md_size(lsm->lsm_ost_count);
1292 submd.lsm_stripe_count = 0;
1293 rc = obd_cancel(&lov->tgts[loi->loi_ost_idx].conn, &submd,
1296 CERROR("Error cancel objid "LPX64" subobj "LPX64
1297 " on OST idx %d: rc = %d\n", lsm->lsm_object_id,
1298 loi->loi_id, loi->loi_ost_idx, rc);
/* Cancel unused locks on every sub-object of a striped object.
 *
 * A striping EA and a usable export are checked first.  For each stripe a
 * temporary lov_stripe_md (object id, MDS EA size, stripe count 0) is passed
 * to obd_cancel_unused() on the stripe's target together with local_only.
 * Failures are logged.
 */
1303 static int lov_cancel_unused(struct lustre_handle *conn,
1304 struct lov_stripe_md *lsm, int local_only)
1306 struct obd_export *export = class_conn2export(conn);
1307 struct lov_obd *lov;
1308 struct lov_oinfo *loi;
1313 CERROR("LOV requires striping ea for lock cancellation\n");
1317 if (!export || !export->exp_obd)
1320 lov = &export->exp_obd->u.lov;
1321 for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
1322 struct lov_stripe_md submd;
1324 submd.lsm_object_id = loi->loi_id;
1325 submd.lsm_mds_easize = lov_mds_md_size(lsm->lsm_ost_count);
1326 submd.lsm_stripe_count = 0;
1327 rc = obd_cancel_unused(&lov->tgts[loi->loi_ost_idx].conn,
1328 &submd, local_only);
1330 CERROR("Error cancel unused objid "LPX64" subobj "LPX64
1331 " on OST idx %d: rc = %d\n", lsm->lsm_object_id,
1332 loi->loi_id, loi->loi_ost_idx, rc);
/* Aggregate filesystem statistics over the active targets.
 *
 * Inactive targets are skipped.  obd_statfs() is called on each active
 * target; a failure is logged and the loop continues.  One path copies the
 * whole reply into osfs with memcpy(), the other adds os_bfree, os_bavail
 * and os_blocks to the running totals.  The in-code note explains that the
 * free-file count is deliberately not summed.
 * NOTE(review): the condition choosing between copy and accumulate is not
 * visible here; presumably the copy is used for the first successful reply
 * -- confirm.
 */
1337 static int lov_statfs(struct lustre_handle *conn, struct obd_statfs *osfs)
1339 struct obd_export *export = class_conn2export(conn);
1340 struct lov_obd *lov;
1341 struct obd_statfs lov_sfs;
1347 if (!export || !export->exp_obd)
1350 lov = &export->exp_obd->u.lov;
1352 /* We only get block data from the OBD */
1353 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
1356 if (!lov->tgts[i].active)
1359 err = obd_statfs(&lov->tgts[i].conn, &lov_sfs);
1361 CERROR("Error statfs OSC %s idx %d: err = %d\n",
1362 lov->tgts[i].uuid, i, err);
1365 continue; /* XXX or break? - probably OK to continue */
1368 memcpy(osfs, &lov_sfs, sizeof(lov_sfs));
1371 osfs->os_bfree += lov_sfs.os_bfree;
1372 osfs->os_bavail += lov_sfs.os_bavail;
1373 osfs->os_blocks += lov_sfs.os_blocks;
1374 /* XXX not sure about this one - depends on policy.
1375 * - could be minimum if we always stripe on all OBDs
1376 * (but that would be wrong for any other policy,
1377 * if one of the OBDs has no more objects left)
1378 * - could be sum if we stripe whole objects
1379 * - could be average, just to give a nice number
1380 * - we just pick first OST and hope it is enough
1381 sfs->f_ffree += lov_sfs.f_ffree;
/* ioctl entry point of the LOV device.
 *
 * IOC_LOV_SET_OSC_ACTIVE is handled locally by lov_set_osc_active(), with
 * ioc_inlbuf1 as the target UUID and ioc_offset as the requested state.
 * On the other visible path a target count of 0 is tested and the command is
 * passed to every target with obd_iocontrol().
 */
1388 static int lov_iocontrol(long cmd, struct lustre_handle *conn, int len,
1389 void *karg, void *uarg)
1391 struct obd_device *obddev = class_conn2obd(conn);
1392 struct obd_ioctl_data *data = karg;
1393 struct lov_obd *lov = &obddev->u.lov;
1398 case IOC_LOV_SET_OSC_ACTIVE:
1399 rc = lov_set_osc_active(lov,data->ioc_inlbuf1,data->ioc_offset);
1402 if (lov->desc.ld_tgt_count == 0)
1405 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
1406 int err = obd_iocontrol(cmd, &lov->tgts[i].conn,
/* Method table of the LOV device type; lov_init() registers it with
 * class_register_type().  Only the entries visible in this listing are shown
 * below; the table uses the "field: value" initialiser form. */
1416 struct obd_ops lov_obd_ops = {
1418 o_connect: lov_connect,
1419 o_disconnect: lov_disconnect,
1420 o_create: lov_create,
1421 o_destroy: lov_destroy,
1422 o_getattr: lov_getattr,
1423 o_setattr: lov_setattr,
1424 o_statfs: lov_statfs,
1429 o_enqueue: lov_enqueue,
1430 o_cancel: lov_cancel,
1431 o_cancel_unused: lov_cancel_unused,
1432 o_iocontrol: lov_iocontrol
1436 #define LOV_VERSION "v0.1"
/* Module initialisation: prints a banner, creates the slab cache
 * "ll_lov_file_data" for struct lov_file_handles, and registers lov_obd_ops
 * under OBD_LOV_DEVICENAME.  A failed cache creation is tested before the
 * registration; the value returned in that case is not visible here. */
1438 static int __init lov_init(void)
1440 printk(KERN_INFO "Lustre Logical Object Volume driver " LOV_VERSION
1441 ", info@clusterfs.com\n");
1442 lov_file_cache = kmem_cache_create("ll_lov_file_data",
1443 sizeof(struct lov_file_handles),
1445 if (!lov_file_cache)
1448 return class_register_type(&lov_obd_ops, OBD_LOV_DEVICENAME);
/* Module teardown: unregisters the LOV device type and destroys
 * lov_file_cache, logging an error when kmem_cache_destroy() reports
 * failure. */
1451 static void __exit lov_exit(void)
1453 class_unregister_type(OBD_LOV_DEVICENAME);
1454 if (kmem_cache_destroy(lov_file_cache))
1455 CERROR("couldn't free LOV open cache\n");
/* Module metadata, and the hooks naming lov_init and lov_exit as the module
 * entry and exit functions. */
1458 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
1459 MODULE_DESCRIPTION("Lustre Logical Object Volume OBD driver " LOV_VERSION);
1460 MODULE_LICENSE("GPL");
1462 module_init(lov_init);
1463 module_exit(lov_exit);