1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
5 * Lustre Metadata Server (mds) handling of striped file data
7 * Copyright (C) 2001-2006 Cluster File Systems, Inc.
8 * Author: Peter Braam <braam@clusterfs.com>
9 * wangdi <wangdi@clusterfs.com>
11 * This file is part of the Lustre file system, http://www.lustre.org
12 * Lustre is a trademark of Cluster File Systems, Inc.
14 * You may have signed or agreed to another license before downloading
15 * this software. If so, you are bound by the terms and conditions
16 * of that agreement, and the following does not apply to you. See the
17 * LICENSE file included with this distribution for more information.
19 * If you did not agree to a different license, then this copy of Lustre
20 * is open source software; you can redistribute it and/or modify it
21 * under the terms of version 2 of the GNU General Public License as
22 * published by the Free Software Foundation.
24 * In either case, Lustre is distributed in the hope that it will be
25 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
26 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 * license text for more details.
30 # define EXPORT_SYMTAB
32 #define DEBUG_SUBSYSTEM S_MDS
34 #include <linux/module.h>
36 #include <obd_class.h>
37 #include <lustre_ver.h>
38 #include <obd_support.h>
40 #include <lprocfs_status.h>
41 #include <lustre_mds.h>
42 #include <lustre_fid.h>
43 #include <lustre/lustre_idl.h>
45 #include "mdd_internal.h"
/*
 * obd notification callback for the obd that backs this mdd; mdd_init_obd()
 * installs it as obd_upcall.onu_upcall with the mdd device as onu_owner.
 *
 * owner must be the non-NULL struct mdd_device that was registered as owner.
 * The OBD_NOTIFY_ACTIVE / OBD_NOTIFY_SYNC_NONBLOCK labels lead to an
 * MD_LOV_SYNC upcall and OBD_NOTIFY_CONFIG leads to an MD_LOV_CONFIG upcall,
 * both issued through md_do_upcall() on mdd->mdd_md_dev with a NULL env.
 * Events that have no handler are only reported at D_INFO level.
 */
47 static int mdd_notify(struct obd_device *host, struct obd_device *watched,
48 enum obd_notify_event ev, void *owner)
50 struct mdd_device *mdd = owner;
54 LASSERT(owner != NULL);
57 case OBD_NOTIFY_ACTIVE:
59 case OBD_NOTIFY_SYNC_NONBLOCK:
60 rc = md_do_upcall(NULL, &mdd->mdd_md_dev, MD_LOV_SYNC);
62 case OBD_NOTIFY_CONFIG:
63 rc = md_do_upcall(NULL, &mdd->mdd_md_dev, MD_LOV_CONFIG);
66 CDEBUG(D_INFO, "Unhandled notification %#x\n", ev);
72 /* The obd is created for handling data stack for mdd */
/*
 * Create, attach and set up the obd device that backs the data stack of
 * this mdd.
 *
 * The obd name and uuid are formatted as "<MDD_OBD_NAME>-<dev>-<mds_id>" and
 * "<MDD_OBD_UUID>-<dev>-<mds_id>", where dev is string 0 of cfg and mds_id
 * is the node id of the mdd's site.  An LCFG_ATTACH config record is built
 * from them and handed to class_attach() and class_setup().  Afterwards
 * mdd_notify() is registered as the obd upcall (owner: mdd) and the obd is
 * remembered in mdd->mdd_obd_dev.
 *
 * Allocation failures yield -ENOMEM; the results of class_attach() and
 * class_setup() travel in rc through the cleanup labels.
 */
73 int mdd_init_obd(const struct lu_env *env, struct mdd_device *mdd,
74 struct lustre_cfg *cfg)
76 char *dev = lustre_cfg_string(cfg, 0);
77 int rc, name_size, uuid_size;
80 struct lustre_cfg_bufs *bufs;
81 struct lustre_cfg *lcfg;
82 struct obd_device *obd;
85 mds_id = mdd2lu_dev(mdd)->ld_site->ls_node_id;
/*
 * NOTE(review): the +35 slack has to cover "-<dev>-<mds_id>" plus the NUL;
 * snprintf() silently truncates otherwise -- confirm the bound on dev.
 */
86 name_size = strlen(MDD_OBD_NAME) + 35;
87 uuid_size = strlen(MDD_OBD_UUID) + 35;
89 OBD_ALLOC(name, name_size);
90 OBD_ALLOC(uuid, uuid_size);
91 if (name == NULL || uuid == NULL)
92 GOTO(cleanup_mem, rc = -ENOMEM);
96 GOTO(cleanup_mem, rc = -ENOMEM);
/* Bound the formatting by the sizes that were actually allocated. */
98 snprintf(name, name_size, "%s-%s-%d",
99 MDD_OBD_NAME, dev, mds_id);
101 snprintf(uuid, uuid_size, "%s-%s-%d",
102 MDD_OBD_UUID, dev, mds_id);
104 lustre_cfg_bufs_reset(bufs, name);
105 lustre_cfg_bufs_set_string(bufs, 1, MDD_OBD_TYPE);
106 lustre_cfg_bufs_set_string(bufs, 2, uuid);
107 lustre_cfg_bufs_set_string(bufs, 3, (char*)dev/* MDD_OBD_PROFILE */);
108 lustre_cfg_bufs_set_string(bufs, 4, (char*)dev);
110 lcfg = lustre_cfg_new(LCFG_ATTACH, bufs);
113 GOTO(cleanup_mem, rc = -ENOMEM);
115 rc = class_attach(lcfg);
117 GOTO(lcfg_cleanup, rc);
119 obd = class_name2obd(name);
/* Report the name that was looked up, not just the constant prefix. */
121 CERROR("Can not find obd %s\n", name);
125 obd->obd_recovering = 1;
126 obd->u.mds.mds_id = mds_id;
127 rc = class_setup(obd, lcfg);
129 GOTO(class_detach, rc);
132 * Add here for obd notify mechanism, when adding a new ost, the mds
133 * will notify this mdd.
135 obd->obd_upcall.onu_upcall = mdd_notify;
136 obd->obd_upcall.onu_owner = mdd;
137 mdd->mdd_obd_dev = obd;
142 class_detach(obd, lcfg);
144 lustre_cfg_free(lcfg);
147 OBD_FREE(name, name_size);
149 OBD_FREE(uuid, uuid_size);
/*
 * Tear down the obd that mdd_init_obd() attached to this mdd.
 *
 * The obd is cleaned up with class_cleanup(), its upcall hook and owner are
 * cleared, it is detached with class_detach(), and finally the pointer in
 * mdd->mdd_obd_dev is reset to NULL.  After class_cleanup() and after
 * class_detach() there is an exit to lcfg_cleanup carrying rc.
 */
153 int mdd_fini_obd(const struct lu_env *env, struct mdd_device *mdd,
154 struct lustre_cfg *lcfg)
156 struct obd_device *obd;
160 obd = mdd2obd_dev(mdd);
163 rc = class_cleanup(obd, lcfg);
165 GOTO(lcfg_cleanup, rc);
167 obd->obd_upcall.onu_upcall = NULL;
168 obd->obd_upcall.onu_owner = NULL;
169 rc = class_detach(obd, lcfg);
171 GOTO(lcfg_cleanup, rc);
172 mdd->mdd_obd_dev = NULL;
/*
 * Read the extended attribute called name of obj into the caller's buffer.
 *
 * md/*md_size describe the destination buffer; the read goes through
 * mdo_xattr_get() using the object's capability.  A result of 0 or -ENODATA
 * is handled as its own case, separate from other results, and a read error
 * is reported with CERROR.
 */
179 int mdd_get_md(const struct lu_env *env, struct mdd_object *obj,
180 void *md, int *md_size, const char *name)
185 rc = mdo_xattr_get(env, obj, mdd_buf_get(env, md, *md_size), name,
186 mdd_object_capa(env, obj));
188 * XXX: Handling of -ENODATA, the right way is to have ->do_md_get()
189 * exported by dt layer.
191 if (rc == 0 || rc == -ENODATA) {
195 CERROR("Error %d reading eadata \n", rc);
197 /* XXX: Convert lov EA but fixed after verification test. */
/*
 * Same as mdd_get_md(), but takes the read lock on obj around the call and
 * drops it again afterwards.  Arguments are passed through unchanged.
 */
204 int mdd_get_md_locked(const struct lu_env *env, struct mdd_object *obj,
205 void *md, int *md_size, const char *name)
208 mdd_read_lock(env, obj);
209 rc = mdd_get_md(env, obj, md, md_size, name);
210 mdd_read_unlock(env, obj);
/*
 * Pass a striping description through the LOV and store it as the
 * MDS_LOV_MD_NAME extended attribute of obj.
 *
 * obj must be a directory or a regular file (asserted).  The LOV export of
 * the mdd's obd is asked to process the request via OBD_IOC_LOV_SETSTRIPE,
 * the in-memory stripe md is released with obd_free_memmd(), and buf is
 * written with mdd_xattr_set_txn() inside the transaction handle.  The
 * outcome is logged at D_INFO level together with the object fid.
 */
214 static int mdd_lov_set_stripe_md(const struct lu_env *env,
215 struct mdd_object *obj, struct lu_buf *buf,
216 struct thandle *handle)
218 struct mdd_device *mdd = mdo2mdd(&obj->mod_obj);
219 struct obd_device *obd = mdd2obd_dev(mdd);
220 struct obd_export *lov_exp = obd->u.mds.mds_osc_exp;
221 struct lov_stripe_md *lsm = NULL;
225 LASSERT(S_ISDIR(mdd_object_type(obj)) || S_ISREG(mdd_object_type(obj)));
226 rc = obd_iocontrol(OBD_IOC_LOV_SETSTRIPE, lov_exp, 0,
230 obd_free_memmd(lov_exp, &lsm);
232 rc = mdd_xattr_set_txn(env, obj, buf, MDS_LOV_MD_NAME, 0, handle);
234 CDEBUG(D_INFO, "set lov ea of "DFID" rc %d \n", PFID(mdo2fid(obj)), rc);
239 * The permission check is done before this function is called,
240 * so there is no need to check again here.
/*
 * Set or delete the default striping of a directory.
 *
 * obj must be a directory (asserted) and buf->lb_buf is read as a
 * struct lov_user_md.  When that md carries only default values, or the
 * deprecated stripe size of -1, the MDS_LOV_MD_NAME attribute is written
 * with LU_BUF_NULL, which per the comment below deletes the default
 * striping.  The function also hands buf to mdd_lov_set_stripe_md().
 */
242 static int mdd_lov_set_dir_md(const struct lu_env *env,
243 struct mdd_object *obj, struct lu_buf *buf,
244 struct thandle *handle)
246 struct lov_user_md *lum = NULL;
250 LASSERT(S_ISDIR(mdd_object_type(obj)));
251 lum = (struct lov_user_md*)buf->lb_buf;
253 /* if { size, offset, count } = { 0, -1, 0 } (i.e. all default
254 * values specified) then delete default striping from dir. */
255 if ((lum->lmm_stripe_size == 0 && lum->lmm_stripe_count == 0 &&
256 lum->lmm_stripe_offset == (typeof(lum->lmm_stripe_offset))(-1)) ||
257 /* lmm_stripe_size == -1 is deprecated in 1.4.6 */
258 lum->lmm_stripe_size == (typeof(lum->lmm_stripe_size))(-1)){
259 rc = mdd_xattr_set_txn(env, obj, &LU_BUF_NULL,
260 MDS_LOV_MD_NAME, 0, handle);
263 CDEBUG(D_INFO, "delete lov ea of "DFID" rc %d \n",
264 PFID(mdo2fid(obj)), rc);
266 rc = mdd_lov_set_stripe_md(env, obj, buf, handle);
/*
 * Ownership/permission check made before striping data of obj is changed.
 *
 * The object's attributes are fetched with mdd_la_get() (BYPASS_CAPA) into
 * the per-thread scratch lu_attr.  If the caller's fsuid differs from the
 * object's owner and the caller lacks CAP_FOWNER, the decision is delegated
 * to mdd_permission_internal_locked().
 */
271 int mdd_lsm_sanity_check(const struct lu_env *env, struct mdd_object *obj)
273 struct lu_attr *tmp_la = &mdd_env_info(env)->mti_la;
274 struct md_ucred *uc = md_ucred(env);
278 rc = mdd_la_get(env, obj, tmp_la, BYPASS_CAPA);
282 if ((uc->mu_fsuid != tmp_la->la_uid) && !mdd_capable(uc, CAP_FOWNER))
283 rc = mdd_permission_internal_locked(env, obj, tmp_la,
/*
 * Store LOV striping metadata on child.
 *
 * Regular file with lmm_size > 0: the EA is stored either through
 * mdd_lov_set_stripe_md() or directly with mdd_xattr_set_txn().
 * NOTE(review): the choice between the two presumably depends on
 * set_stripe -- confirm.
 *
 * Directory: when no EA is supplied (lmmp == NULL and lmm_size == 0) the
 * striping of the parent pobj is read with mdd_get_md_locked() into the
 * buffer from mdd_max_lmm_get() and written to child; otherwise lmmp must
 * be valid (asserted) and is applied with mdd_lov_set_dir_md().
 *
 * The result is logged at D_INFO level together with the child fid.
 */
289 int mdd_lov_set_md(const struct lu_env *env, struct mdd_object *pobj,
290 struct mdd_object *child, struct lov_mds_md *lmmp,
291 int lmm_size, struct thandle *handle, int set_stripe)
298 buf = mdd_buf_get(env, lmmp, lmm_size);
299 mode = mdd_object_type(child);
300 if (S_ISREG(mode) && lmm_size > 0) {
302 rc = mdd_lov_set_stripe_md(env, child, buf, handle);
304 rc = mdd_xattr_set_txn(env, child, buf,
305 MDS_LOV_MD_NAME, 0, handle);
307 } else if (S_ISDIR(mode)) {
308 if (lmmp == NULL && lmm_size == 0) {
309 struct mdd_device *mdd = mdd_obj2mdd_dev(child);
310 struct lov_mds_md *lmm = mdd_max_lmm_get(env, mdd);
311 int size = sizeof(*lmm);
313 /* Get parent dir stripe and set */
315 rc = mdd_get_md_locked(env, pobj, lmm, &size,
318 buf = mdd_buf_get(env, lmm, size);
319 rc = mdd_xattr_set_txn(env, child, buf,
320 MDS_LOV_MD_NAME, 0, handle);
322 CERROR("error on copy stripe info: rc "
326 LASSERT(lmmp != NULL && lmm_size > 0);
327 rc = mdd_lov_set_dir_md(env, child, buf, handle);
330 CDEBUG(D_INFO, "Set lov md %p size %d for fid "DFID" rc %d\n",
331 lmmp, lmm_size, PFID(mdo2fid(child)), rc);
336 * XXX: this creates the lsm object id, which should identify the lsm object
337 * uniquely across the whole MDS, as far as I can see. It seems we do not need
338 * that yet, so for now just borrow the approach of ll_fid_build_ino().
/* Derive the object id used for LOV object creation: fid_flatten(fid). */
340 static obd_id mdd_lov_create_id(const struct lu_fid *fid)
342 return fid_flatten(fid);
/*
 * Allocate the object id array of the per-thread obd_trans_info
 * (mti_oti.oti_objid), one obd_id per LOV target (ld_tgt_count entries).
 *
 * Returns 0 on success, -ENOMEM if the allocation failed.
 */
345 static int mdd_lov_objid_alloc(const struct lu_env *env,
346 struct mdd_device *mdd)
348 struct mdd_thread_info *info = mdd_env_info(env);
349 struct mds_obd *mds = &mdd->mdd_obd_dev->u.mds;
351 OBD_ALLOC(info->mti_oti.oti_objid,
352 mds->mds_lov_desc.ld_tgt_count * sizeof(obd_id));
353 return (info->mti_oti.oti_objid == NULL ? -ENOMEM : 0);
/*
 * If the current thread holds an object id array (mti_oti.oti_objid), pass
 * it to mds_lov_update_objids() for the mdd's obd; otherwise do nothing.
 */
356 void mdd_lov_objid_update(const struct lu_env *env, struct mdd_device *mdd)
358 struct mdd_thread_info *info = mdd_env_info(env);
359 if (info->mti_oti.oti_objid != NULL)
360 mds_lov_update_objids(mdd->mdd_obd_dev,
361 info->mti_oti.oti_objid);
/*
 * Fill the per-thread object id array (mti_oti.oti_objid) from lmm using
 * mds_objids_from_lmm() and the LOV descriptor of the mdd's obd.
 */
364 static void mdd_lov_objid_from_lmm(const struct lu_env *env,
365 struct mdd_device *mdd,
366 struct lov_mds_md *lmm)
368 struct mds_obd *mds = &mdd->mdd_obd_dev->u.mds;
369 struct mdd_thread_info *info = mdd_env_info(env);
370 mds_objids_from_lmm(info->mti_oti.oti_objid, lmm, &mds->mds_lov_desc);
/*
 * Release the per-thread object id array, using the same size expression as
 * mdd_lov_objid_alloc(), and reset the pointer to NULL.
 */
373 static void mdd_lov_objid_free(const struct lu_env *env,
374 struct mdd_device *mdd)
376 struct mdd_thread_info *info = mdd_env_info(env);
377 struct mds_obd *mds = &mdd->mdd_obd_dev->u.mds;
379 OBD_FREE(info->mti_oti.oti_objid,
380 mds->mds_lov_desc.ld_tgt_count * sizeof(obd_id));
381 info->mti_oti.oti_objid = NULL;
/*
 * Release what mdd_lov_create() left behind.
 *
 * lmm is freed only when it is non-NULL and spec->u.sp_ea.no_lov_create is
 * unset; in the no_lov_create case mdd_lov_create() points lmm at the
 * caller's eadata, so it is not freed here.  The per-thread object id array
 * is freed if it is still allocated.
 */
384 void mdd_lov_create_finish(const struct lu_env *env, struct mdd_device *mdd,
385 struct lov_mds_md *lmm, int lmm_size,
386 const struct md_op_spec *spec)
388 struct mdd_thread_info *info = mdd_env_info(env);
390 if (lmm && !spec->u.sp_ea.no_lov_create)
391 OBD_FREE(lmm, lmm_size);
393 if (info->mti_oti.oti_objid != NULL)
394 mdd_lov_objid_free(env, mdd);
/*
 * Create the data objects for child through the LOV and return the packed
 * striping EA in *lmm / *lmm_size.
 *
 * Steps visible in this function:
 *  - the create flags are examined first (MDS_OPEN_DELAY_CREATE set, or
 *    FMODE_WRITE not set);
 *  - the per-thread object id array is allocated;
 *  - replay (spec->u.sp_ea.no_lov_create != 0): *lmm and *lmm_size are taken
 *    from the spec's eadata and the object ids are filled in from that lmm;
 *  - the per-thread obdo is prepared: uid 0, group FILTER_GROUP_MDS0 + site
 *    node id, mode S_IFREG | 0600, id from mdd_lov_create_id();
 *  - without MDS_OPEN_HAS_OBJS the striping comes from the supplied eadata
 *    (MDS_OPEN_HAS_EA) or from the parent's EA, both via
 *    OBD_IOC_LOV_SETSTRIPE, and the objects are created with obd_create();
 *    the OBD_IOC_LOV_SETEA call on eadata belongs to the other case;
 *  - if la carries a valid non-zero size, it is sent to the OST with
 *    obd_setattr(), together with times and the fid seq/oid;
 *  - la's block size is updated from the obdo and the lsm is packed into
 *    *lmm with obd_packmd();
 *  - on the way out the oti cookies, the in-memory lsm and the object id
 *    array are released.
 */
397 int mdd_lov_create(const struct lu_env *env, struct mdd_device *mdd,
398 struct mdd_object *parent, struct mdd_object *child,
399 struct lov_mds_md **lmm, int *lmm_size,
400 const struct md_op_spec *spec, struct lu_attr *la)
402 struct obd_device *obd = mdd2obd_dev(mdd);
403 struct obd_export *lov_exp = obd->u.mds.mds_osc_exp;
405 struct lov_stripe_md *lsm = NULL;
406 const void *eadata = spec->u.sp_ea.eadata;
407 __u32 create_flags = spec->sp_cr_flags;
408 struct obd_trans_info *oti = &mdd_env_info(env)->mti_oti;
412 if (create_flags & MDS_OPEN_DELAY_CREATE ||
413 !(create_flags & FMODE_WRITE))
417 rc = mdd_lov_objid_alloc(env, mdd);
421 /* replay case, has objects already, only get lov from eadata */
422 if (spec->u.sp_ea.no_lov_create != 0) {
423 *lmm = (struct lov_mds_md *)spec->u.sp_ea.eadata;
424 *lmm_size = spec->u.sp_ea.eadatalen;
425 mdd_lov_objid_from_lmm(env, mdd, *lmm);
429 if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_MDS_ALLOC_OBDO))
430 GOTO(out_ids, rc = -ENOMEM);
432 LASSERT(lov_exp != NULL);
433 oa = &mdd_env_info(env)->mti_oa;
435 oa->o_uid = 0; /* must have 0 uid / gid on OST */
437 oa->o_gr = FILTER_GROUP_MDS0 + mdd2lu_dev(mdd)->ld_site->ls_node_id;
438 oa->o_mode = S_IFREG | 0600;
439 oa->o_id = mdd_lov_create_id(mdd_object_fid(child));
440 oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLFLAGS |
441 OBD_MD_FLMODE | OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLGROUP;
444 if (!(create_flags & MDS_OPEN_HAS_OBJS)) {
445 if (create_flags & MDS_OPEN_HAS_EA) {
446 LASSERT(eadata != NULL);
447 rc = obd_iocontrol(OBD_IOC_LOV_SETSTRIPE, lov_exp,
448 0, &lsm, (void*)eadata);
451 lsm->lsm_object_id = oa->o_id;
452 lsm->lsm_object_gr = oa->o_gr;
453 } else if (parent != NULL) {
454 /* get lov ea from parent and set to lov */
455 struct lov_mds_md *_lmm;
458 _lmm_size = mdd_lov_mdsize(env, mdd);
459 _lmm = mdd_max_lmm_get(env, mdd);
462 GOTO(out_oti, rc = -ENOMEM);
464 rc = mdd_get_md_locked(env, parent, _lmm,
468 rc = obd_iocontrol(OBD_IOC_LOV_SETSTRIPE,
469 lov_exp, 0, &lsm, _lmm);
474 rc = obd_create(lov_exp, oa, &lsm, oti);
477 CERROR("Create error for "DFID": %d\n",
478 PFID(mdo2fid(child)), rc);
483 LASSERT(lsm->lsm_object_gr >= FILTER_GROUP_MDS0);
485 LASSERT(eadata != NULL);
486 rc = obd_iocontrol(OBD_IOC_LOV_SETEA, lov_exp, 0, &lsm,
490 lsm->lsm_object_id = oa->o_id;
491 lsm->lsm_object_gr = oa->o_gr;
495 * Sometimes, we may truncate some object(without lsm) then open it
496 * (with write flags), so creating lsm above. The Nonzero(truncated)
497 * size should tell ost, since size attr is in charge by OST.
499 if (la->la_size && la->la_valid & LA_SIZE) {
500 struct obd_info *oinfo = &mdd_env_info(env)->mti_oi;
502 memset(oinfo, 0, sizeof(*oinfo));
504 oa->o_size = la->la_size;
506 /* When setting attr to ost, FLBKSZ is not needed. */
507 oa->o_valid &= ~OBD_MD_FLBLKSZ;
508 obdo_from_la(oa, la, OBD_MD_FLTYPE | OBD_MD_FLATIME |
509 OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLSIZE);
512 * XXX: Pack lustre id to OST, in OST, it will be packed by
513 * filter_fid, but can not see what is the usages. So just pack
514 * o_seq o_ver here, maybe fix it after this cycle.
516 oa->o_fid = fid_seq(mdd_object_fid(child));
517 oa->o_generation = fid_oid(mdd_object_fid(child));
518 oa->o_valid |= OBD_MD_FLFID | OBD_MD_FLGENER;
521 oinfo->oi_capa = mdo_capa_get(env, child, NULL,
522 CAPA_OPC_MDS_DEFAULT);
523 if (IS_ERR(oinfo->oi_capa))
524 oinfo->oi_capa = NULL;
526 rc = obd_setattr(lov_exp, oinfo, oti);
527 capa_put(oinfo->oi_capa);
529 CERROR("Error setting attrs for "DFID": rc %d\n",
530 PFID(mdo2fid(child)), rc);
532 CERROR("obd_setattr for "DFID" rc %d\n",
533 PFID(mdo2fid(child)), rc);
540 /* blksize should be changed after create data object */
541 la->la_valid |= LA_BLKSIZE;
542 la->la_blksize = oa->o_blksize;
544 rc = obd_packmd(lov_exp, lmm, lsm);
546 CERROR("Cannot pack lsm, err = %d\n", rc);
553 oti_free_cookies(oti);
556 obd_free_memmd(lov_exp, &lsm);
558 mdd_lov_objid_free(env, mdd);
565 * used when destroying orphans and from mds_reint_unlink() when MDS wants to
566 * destroy objects on OSS.
/*
 * Destroy the OSS objects described by the striping EA lmm/lmm_size.
 *
 * The EA is unpacked into an in-memory lsm with obd_unpackmd(); an unpack
 * problem is reported with CERROR.  The per-thread obdo is filled with the
 * lsm's object id, the group FILTER_GROUP_MDS0 + site node id and the file
 * type bits of la->la_mode.  When log_unlink is set and logcookies is
 * non-NULL the cookies are attached to the per-thread obd_trans_info and
 * OBD_MD_FLCOOKIE is flagged.  The objects are then removed with
 * obd_destroy() and the lsm is released with obd_free_memmd().
 */
569 int mdd_lovobj_unlink(const struct lu_env *env, struct mdd_device *mdd,
570 struct mdd_object *obj, struct lu_attr *la,
571 struct lov_mds_md *lmm, int lmm_size,
572 struct llog_cookie *logcookies,
575 struct obd_device *obd = mdd2obd_dev(mdd);
576 struct obd_export *lov_exp = obd->u.mds.mds_osc_exp;
577 struct lov_stripe_md *lsm = NULL;
578 struct obd_trans_info *oti = &mdd_env_info(env)->mti_oti;
579 struct obdo *oa = &mdd_env_info(env)->mti_oa;
586 rc = obd_unpackmd(lov_exp, &lsm, lmm, lmm_size);
588 CERROR("Error unpack md %p\n", lmm);
591 LASSERT(rc >= sizeof(*lsm));
595 oa->o_id = lsm->lsm_object_id;
596 oa->o_gr = FILTER_GROUP_MDS0 + mdd2lu_dev(mdd)->ld_site->ls_node_id;
597 oa->o_mode = la->la_mode & S_IFMT;
598 oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLGROUP;
601 if (log_unlink && logcookies) {
602 oa->o_valid |= OBD_MD_FLCOOKIE;
603 oti->oti_logcookies = logcookies;
606 CDEBUG(D_INFO, "destroying OSS object %d/%d\n",
607 (int)oa->o_id, (int)oa->o_gr);
609 rc = obd_destroy(lov_exp, oa, lsm, oti, NULL);
611 obd_free_memmd(lov_exp, &lsm);
617 * called with obj not locked.
/*
 * Destroy the OSS objects that belong to obj.
 *
 * A non-zero link count in la is reported with CWARN.  The per-thread
 * md_attr is pointed at the mdd's maximum-size lmm and cookie buffers
 * (-ENOMEM if either is missing), the LOV EA of obj is read with
 * mdd_get_md_locked(), and an unlink llog record is written through
 * mdd_unlink_log().  A failed EA read or llog write is reported with CWARN.
 *
 * mdd_unlink_log() sets MA_COOKIE in ma->ma_valid only when llog cookies were
 * produced, so mdd_lovobj_unlink() is called only when that flag is set.
 */
619 int mdd_lov_destroy(const struct lu_env *env, struct mdd_device *mdd,
620 struct mdd_object *obj, struct lu_attr *la)
622 struct md_attr *ma = &mdd_env_info(env)->mti_ma;
626 if (unlikely(la->la_nlink != 0)) {
627 CWARN("Attempt to destroy OSS object when nlink == %d\n",
632 ma->ma_lmm_size = mdd_lov_mdsize(env, mdd);
633 ma->ma_lmm = mdd_max_lmm_get(env, mdd);
634 ma->ma_cookie_size = mdd_lov_cookiesize(env, mdd);
635 ma->ma_cookie = mdd_max_cookie_get(env, mdd);
636 if (ma->ma_lmm == NULL || ma->ma_cookie == NULL)
637 RETURN(rc = -ENOMEM);
640 rc = mdd_get_md_locked(env, obj, ma->ma_lmm, &ma->ma_lmm_size,
643 CWARN("Get lov ea failed for "DFID"\n", PFID(mdo2fid(obj)));
646 ma->ma_valid = MA_LOV;
648 rc = mdd_unlink_log(env, mdd, obj, ma);
650 CWARN("mds unlink log for "DFID" failed: %d\n",
651 PFID(mdo2fid(obj)), rc);
/* Test the flag with '&'; the former '|' did not test it at all. */
654 if (ma->ma_valid & MA_COOKIE)
655 rc = mdd_lovobj_unlink(env, mdd, obj, la,
656 ma->ma_lmm, ma->ma_lmm_size,
/*
 * Record an unlink in the llog for the striping held in ma.
 *
 * ma must already carry a valid LOV EA (MA_LOV, asserted).  If a cookie
 * buffer is available (ma_cookie_size > 0) and mds_log_op_unlink() returns a
 * positive value, MA_COOKIE is added to ma->ma_valid so that callers can
 * tell that ma->ma_cookie now holds llog cookies.
 */
662 int mdd_unlink_log(const struct lu_env *env, struct mdd_device *mdd,
663 struct mdd_object *mdd_cobj, struct md_attr *ma)
665 struct obd_device *obd = mdd2obd_dev(mdd);
667 LASSERT(ma->ma_valid & MA_LOV);
669 if ((ma->ma_cookie_size > 0) &&
670 (mds_log_op_unlink(obd, ma->ma_lmm, ma->ma_lmm_size,
671 ma->ma_cookie, ma->ma_cookie_size) > 0)) {
672 ma->ma_valid |= MA_COOKIE;
/*
 * Journal a uid/gid change in the llog, in the same way unlinks are
 * journalled.
 *
 * The new uid/gid from ma->ma_attr are logged at D_INFO level and handed,
 * together with the striping EA (lmm/lmm_size) and the cookie buffer, to
 * mds_log_op_setattr(), whose result is returned.
 */
677 int mdd_setattr_log(const struct lu_env *env, struct mdd_device *mdd,
678 const struct md_attr *ma,
679 struct lov_mds_md *lmm, int lmm_size,
680 struct llog_cookie *logcookies, int cookies_size)
682 struct obd_device *obd = mdd2obd_dev(mdd);
684 /* journal chown/chgrp in llog, just like unlink */
686 CDEBUG(D_INFO, "setattr llog for uid/gid=%lu/%lu\n",
687 (unsigned long)ma->ma_attr.la_uid,
688 (unsigned long)ma->ma_attr.la_gid);
689 return mds_log_op_setattr(obd, ma->ma_attr.la_uid,
690 ma->ma_attr.la_gid, lmm,
691 lmm_size, logcookies,
697 int mdd_lov_setattr_async(const struct lu_env *env, struct mdd_object *obj,
698 struct lov_mds_md *lmm, int lmm_size,
699 struct llog_cookie *logcookies)
701 struct mdd_device *mdd = mdo2mdd(&obj->mod_obj);
702 struct obd_device *obd = mdd2obd_dev(mdd);
703 struct lu_attr *tmp_la = &mdd_env_info(env)->mti_la;
704 const struct lu_fid *fid = mdd_object_fid(obj);
709 mdd_read_lock(env, obj);
710 rc = mdo_attr_get(env, obj, tmp_la, mdd_object_capa(env, obj));
711 mdd_read_unlock(env, obj);
715 oc = mdo_capa_get(env, obj, NULL, CAPA_OPC_MDS_DEFAULT);
719 rc = mds_osc_setattr_async(obd, tmp_la->la_uid, tmp_la->la_gid, lmm,
720 lmm_size, logcookies, fid_seq(fid),