X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Flod%2Flod_dev.c;h=e36aee1d59d7d1f1c90377ff2ef0e2a4d150b807;hb=a674871d5f9e4819b3428593e24df6e52096612f;hp=8411ed978fb87fa38862ed7ed984ce7d71d3e9ab;hpb=189c466ff8f7cbf6f1ebb53295f09df2ae8237a2;p=fs%2Flustre-release.git diff --git a/lustre/lod/lod_dev.c b/lustre/lod/lod_dev.c index 8411ed9..e36aee1 100644 --- a/lustre/lod/lod_dev.c +++ b/lustre/lod/lod_dev.c @@ -23,7 +23,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved * Use is subject to license terms. * - * Copyright (c) 2011, 2012, Intel, Inc. + * Copyright (c) 2012, 2013, Intel Corporation. * */ /* @@ -38,18 +38,64 @@ * Author: Mikhail Pershin */ -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif #define DEBUG_SUBSYSTEM S_MDS #include +#include +#include #include +#include #include "lod_internal.h" +/** + * Lookup MDT/OST index \a tgt by FID \a fid. + * + * \param lod LOD to look the FID up in. + * \param fid FID of object to find MDT/OST. + * \param tgt MDT/OST index to return. + * \param type indicate whether the FID is on MDS or OST. 
+ **/ +int lod_fld_lookup(const struct lu_env *env, struct lod_device *lod, + const struct lu_fid *fid, __u32 *tgt, int type) +{ + struct lu_seq_range range = { 0 }; + struct lu_server_fld *server_fld; + int rc = 0; + ENTRY; + + LASSERTF(fid_is_sane(fid), "Invalid FID "DFID"\n", PFID(fid)); + if (fid_is_idif(fid)) { + *tgt = fid_idif_ost_idx(fid); + RETURN(rc); + } + + if (!lod->lod_initialized || (!fid_seq_in_fldb(fid_seq(fid)))) { + LASSERT(lu_site2seq(lod2lu_dev(lod)->ld_site) != NULL); + *tgt = lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_node_id; + RETURN(rc); + } + + server_fld = lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_server_fld; + fld_range_set_type(&range, type); + rc = fld_server_lookup(env, server_fld, fid_seq(fid), &range); + if (rc) + RETURN(rc); + + *tgt = range.lsr_index; + + CDEBUG(D_INFO, "LOD: got tgt %x for sequence: " + LPX64"\n", *tgt, fid_seq(fid)); + + RETURN(rc); +} + +extern struct lu_object_operations lod_lu_obj_ops; +extern struct lu_object_operations lod_lu_robj_ops; +extern struct dt_object_operations lod_obj_ops; + /* Slab for OSD object allocation */ -cfs_mem_cache_t *lod_object_kmem; +struct kmem_cache *lod_object_kmem; static struct lu_kmem_descr lod_caches[] = { { @@ -65,6 +111,127 @@ static struct lu_kmem_descr lod_caches[] = { static struct lu_device *lod_device_fini(const struct lu_env *env, struct lu_device *d); +struct lu_object *lod_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev) +{ + struct lod_object *lod_obj; + struct lu_object *lu_obj; + const struct lu_fid *fid = &hdr->loh_fid; + mdsno_t mds; + int rc = 0; + ENTRY; + + OBD_SLAB_ALLOC_PTR_GFP(lod_obj, lod_object_kmem, __GFP_IO); + if (lod_obj == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + rc = lod_fld_lookup(env, lu2lod_dev(dev), fid, &mds, LU_SEQ_RANGE_MDT); + if (rc) { + OBD_SLAB_FREE_PTR(lod_obj, lod_object_kmem); + RETURN(ERR_PTR(rc)); + } + + lod_obj->ldo_mds_num = mds; + lu_obj = lod2lu_obj(lod_obj); + 
dt_object_init(&lod_obj->ldo_obj, NULL, dev); + lod_obj->ldo_obj.do_ops = &lod_obj_ops; + if (likely(mds == lu_site2seq(dev->ld_site)->ss_node_id)) + lu_obj->lo_ops = &lod_lu_obj_ops; + else + lu_obj->lo_ops = &lod_lu_robj_ops; + RETURN(lu_obj); +} + +static int lod_cleanup_desc_tgts(const struct lu_env *env, + struct lod_device *lod, + struct lod_tgt_descs *ltd, + struct lustre_cfg *lcfg) +{ + struct lu_device *next; + int rc = 0; + int i; + + lod_getref(ltd); + if (ltd->ltd_tgts_size <= 0) { + lod_putref(lod, ltd); + return 0; + } + cfs_foreach_bit(ltd->ltd_tgt_bitmap, i) { + struct lod_tgt_desc *tgt; + int rc1; + + tgt = LTD_TGT(ltd, i); + LASSERT(tgt && tgt->ltd_tgt); + next = &tgt->ltd_tgt->dd_lu_dev; + rc1 = next->ld_ops->ldo_process_config(env, next, lcfg); + if (rc1) { + CERROR("%s: error cleaning up LOD index %u: cmd %#x" + ": rc = %d\n", lod2obd(lod)->obd_name, i, + lcfg->lcfg_command, rc1); + rc = rc1; + } + } + lod_putref(lod, ltd); + return rc; +} + +static int lodname2mdt_index(char *lodname, long *index) +{ + char *ptr, *tmp; + + /* The lodname is supposed to be fsname-MDTxxxx-mdtlov */ + ptr = strrchr(lodname, '-'); + if (ptr == NULL) { + CERROR("invalid MDT index in '%s'\n", lodname); + return -EINVAL; + } + + if (strncmp(ptr, "-mdtlov", 7) != 0) { + CERROR("invalid MDT index in '%s'\n", lodname); + return -EINVAL; + } + + if ((unsigned long)ptr - (unsigned long)lodname <= 8) { + CERROR("invalid MDT index in '%s'\n", lodname); + return -EINVAL; + } + + if (strncmp(ptr - 8, "-MDT", 4) != 0) { + CERROR("invalid MDT index in '%s'\n", lodname); + return -EINVAL; + } + + *index = simple_strtol(ptr - 4, &tmp, 16); + if (*tmp != '-' || *index > INT_MAX || *index < 0) { + CERROR("invalid MDT index in '%s'\n", lodname); + return -EINVAL; + } + return 0; +} + +/** + * Process config log on LOD + * \param env environment info + * \param dev lod device + * \param lcfg config log + * + * Add osc config log, + * marker 20 (flags=0x01, v2.2.49.56) lustre-OST0001 'add 
osc' + * add_uuid nid=192.168.122.162@tcp(0x20000c0a87aa2) 0: 1:nidxxx + * attach 0:lustre-OST0001-osc-MDT0001 1:osc 2:lustre-MDT0001-mdtlov_UUID + * setup 0:lustre-OST0001-osc-MDT0001 1:lustre-OST0001_UUID 2:nid + * lov_modify_tgts add 0:lustre-MDT0001-mdtlov 1:lustre-OST0001_UUID 2:1 3:1 + * marker 20 (flags=0x02, v2.2.49.56) lustre-OST0001 'add osc' + * + * Add mdc config log + * marker 10 (flags=0x01, v2.2.49.56) lustre-MDT0000 'add osp' + * add_uuid nid=192.168.122.162@tcp(0x20000c0a87aa2) 0: 1:nid + * attach 0:lustre-MDT0000-osp-MDT0001 1:osp 2:lustre-MDT0001-mdtlov_UUID + * setup 0:lustre-MDT0000-osp-MDT0001 1:lustre-MDT0000_UUID 2:nid + * modify_mdc_tgts add 0:lustre-MDT0001 1:lustre-MDT0000_UUID 2:0 3:1 + * marker 10 (flags=0x02, v2.2.49.56) lustre-MDT0000_UUID 'add osp' + **/ static int lod_process_config(const struct lu_env *env, struct lu_device *dev, struct lustre_cfg *lcfg) @@ -72,17 +239,21 @@ static int lod_process_config(const struct lu_env *env, struct lod_device *lod = lu2lod_dev(dev); struct lu_device *next = &lod->lod_child->dd_lu_dev; char *arg1; - int rc, i; + int rc = 0; ENTRY; switch(lcfg->lcfg_command) { - case LCFG_LOV_DEL_OBD: case LCFG_LOV_ADD_INA: - case LCFG_LOV_ADD_OBD: { + case LCFG_LOV_ADD_OBD: + case LCFG_ADD_MDC: { __u32 index; + __u32 mdt_index; int gen; - /* lov_modify_tgts add 0:lov_mdsA 1:osp 2:0 3:1 */ + /* lov_modify_tgts add 0:lov_mdsA 1:osp 2:0 3:1 + * modify_mdc_tgts add 0:lustre-MDT0001 + * 1:lustre-MDT0001-mdc0002 + * 2:2 3:1*/ arg1 = lustre_cfg_string(lcfg, 1); if (sscanf(lustre_cfg_buf(lcfg, 2), "%d", &index) != 1) @@ -90,7 +261,38 @@ static int lod_process_config(const struct lu_env *env, if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1) GOTO(out, rc = -EINVAL); - rc = -EINVAL; + if (lcfg->lcfg_command == LCFG_LOV_ADD_OBD) { + char *mdt; + mdt = strstr(lustre_cfg_string(lcfg, 0), "-MDT"); + /* 1.8 configs don't have "-MDT0000" at the end */ + if (mdt == NULL) { + mdt_index = 0; + } else { + long long_index; + rc 
= lodname2mdt_index( + lustre_cfg_string(lcfg, 0), + &long_index); + if (rc != 0) + GOTO(out, rc); + mdt_index = long_index; + } + rc = lod_add_device(env, lod, arg1, index, gen, + mdt_index, LUSTRE_OSC_NAME, 1); + } else if (lcfg->lcfg_command == LCFG_ADD_MDC) { + mdt_index = index; + rc = lod_add_device(env, lod, arg1, index, gen, + mdt_index, LUSTRE_MDC_NAME, 1); + } else if (lcfg->lcfg_command == LCFG_LOV_ADD_INA) { + /*FIXME: Add mdt_index for LCFG_LOV_ADD_INA*/ + mdt_index = 0; + rc = lod_add_device(env, lod, arg1, index, gen, + mdt_index, LUSTRE_OSC_NAME, 0); + } else { + rc = lod_del_device(env, lod, + &lod->lod_ost_descs, + arg1, index, gen); + } + break; } @@ -104,24 +306,14 @@ static int lod_process_config(const struct lu_env *env, if (rc > 0) rc = 0; GOTO(out, rc); - } - + } case LCFG_CLEANUP: + case LCFG_PRE_CLEANUP: { lu_dev_del_linkage(dev->ld_site, dev); - lod_getref(lod); - lod_foreach_ost(lod, i) { - struct lod_ost_desc *ost; - ost = OST_TGT(lod, i); - LASSERT(ost && ost->ltd_ost); - next = &ost->ltd_ost->dd_lu_dev; - rc = next->ld_ops->ldo_process_config(env, next, lcfg); - if (rc) - CERROR("%s: can't process %u: %d\n", - lod2obd(lod)->obd_name, - lcfg->lcfg_command, rc); - } - lod_putref(lod); - + lod_cleanup_desc_tgts(env, lod, &lod->lod_mdt_descs, lcfg); + lod_cleanup_desc_tgts(env, lod, &lod->lod_ost_descs, lcfg); + if (lcfg->lcfg_command == LCFG_PRE_CLEANUP) + break; /* * do cleanup on underlying storage only when * all OSPs are cleaned up, as they use that OSD as well @@ -136,7 +328,7 @@ static int lod_process_config(const struct lu_env *env, if (rc) CERROR("error in disconnect from storage: %d\n", rc); break; - + } default: CERROR("%s: unknown command %u\n", lod2obd(lod)->obd_name, lcfg->lcfg_command); @@ -153,7 +345,6 @@ static int lod_recovery_complete(const struct lu_env *env, { struct lod_device *lod = lu2lod_dev(dev); struct lu_device *next = &lod->lod_child->dd_lu_dev; - struct lod_ost_desc *ost; int i, rc; ENTRY; @@ -162,18 +353,20 
@@ static int lod_recovery_complete(const struct lu_env *env, rc = next->ld_ops->ldo_recovery_complete(env, next); - lod_getref(lod); - lod_foreach_ost(lod, i) { - ost = OST_TGT(lod, i); - LASSERT(ost && ost->ltd_ost); - next = &ost->ltd_ost->dd_lu_dev; - rc = next->ld_ops->ldo_recovery_complete(env, next); - if (rc) - CERROR("%s: can't complete recovery on #%d: %d\n", - lod2obd(lod)->obd_name, i, rc); + lod_getref(&lod->lod_ost_descs); + if (lod->lod_osts_size > 0) { + cfs_foreach_bit(lod->lod_ost_bitmap, i) { + struct lod_tgt_desc *tgt; + tgt = OST_TGT(lod, i); + LASSERT(tgt && tgt->ltd_tgt); + next = &tgt->ltd_ost->dd_lu_dev; + rc = next->ld_ops->ldo_recovery_complete(env, next); + if (rc) + CERROR("%s: can't complete recovery on #%d:" + "%d\n", lod2obd(lod)->obd_name, i, rc); + } } - lod_putref(lod); - + lod_putref(lod, &lod->lod_ost_descs); RETURN(rc); } @@ -186,11 +379,19 @@ static int lod_prepare(const struct lu_env *env, struct lu_device *pdev, ENTRY; rc = next->ld_ops->ldo_prepare(env, pdev, next); + if (rc != 0) { + CERROR("%s: prepare bottom error: rc = %d\n", + lod2obd(lod)->obd_name, rc); + RETURN(rc); + } + + lod->lod_initialized = 1; RETURN(rc); } const struct lu_device_operations lod_lu_ops = { + .ldo_object_alloc = lod_object_alloc, .ldo_process_config = lod_process_config, .ldo_recovery_complete = lod_recovery_complete, .ldo_prepare = lod_prepare, @@ -211,19 +412,77 @@ static int lod_statfs(const struct lu_env *env, static struct thandle *lod_trans_create(const struct lu_env *env, struct dt_device *dev) { - return dt_trans_create(env, dt2lod_dev(dev)->lod_child); + struct thandle *th; + + th = dt_trans_create(env, dt2lod_dev(dev)->lod_child); + if (IS_ERR(th)) + return th; + + CFS_INIT_LIST_HEAD(&th->th_remote_update_list); + return th; +} + +static int lod_remote_sync(const struct lu_env *env, struct dt_device *dev, + struct thandle *th) +{ + struct update_request *update; + int rc = 0; + ENTRY; + + if (cfs_list_empty(&th->th_remote_update_list)) 
+ RETURN(0); + + cfs_list_for_each_entry(update, &th->th_remote_update_list, + ur_list) { + /* In DNE phase I, there should be only one OSP + * here, so we will do send/receive one by one, + * instead of sending them parallel, will fix this + * in Phase II */ + th->th_current_request = update; + rc = dt_trans_start(env, update->ur_dt, th); + if (rc != 0) { + /* FIXME how to revert the partial results + * once error happened? Resolved by 2 Phase commit */ + update->ur_rc = rc; + break; + } + } + + RETURN(rc); } static int lod_trans_start(const struct lu_env *env, struct dt_device *dev, struct thandle *th) { - return dt_trans_start(env, dt2lod_dev(dev)->lod_child, th); + struct lod_device *lod = dt2lod_dev((struct dt_device *) dev); + int rc; + + rc = lod_remote_sync(env, dev, th); + if (rc) + return rc; + + return dt_trans_start(env, lod->lod_child, th); } static int lod_trans_stop(const struct lu_env *env, struct thandle *th) { - /* XXX: we don't know next device, will be fixed with DNE */ - return dt_trans_stop(env, th->th_dev, th); + struct update_request *update; + struct update_request *tmp; + int rc = 0; + int rc2 = 0; + + cfs_list_for_each_entry_safe(update, tmp, + &th->th_remote_update_list, + ur_list) { + th->th_current_request = update; + rc2 = dt_trans_stop(env, update->ur_dt, th); + if (unlikely(rc2 != 0 && rc == 0)) + rc = rc2; + } + + rc2 = dt_trans_stop(env, th->th_dev, th); + + return rc2 != 0 ? 
rc2 : rc; } static void lod_conf_get(const struct lu_env *env, @@ -240,7 +499,7 @@ static int lod_sync(const struct lu_env *env, struct dt_device *dev) int rc = 0, i; ENTRY; - lod_getref(lod); + lod_getref(&lod->lod_ost_descs); lod_foreach_ost(lod, i) { ost = OST_TGT(lod, i); LASSERT(ost && ost->ltd_ost); @@ -251,7 +510,7 @@ static int lod_sync(const struct lu_env *env, struct dt_device *dev) break; } } - lod_putref(lod); + lod_putref(lod, &lod->lod_ost_descs); if (rc == 0) rc = dt_sync(env, lod->lod_child); @@ -373,11 +632,29 @@ out: RETURN(rc); } +static int lod_tgt_desc_init(struct lod_tgt_descs *ltd) +{ + mutex_init(&ltd->ltd_mutex); + init_rwsem(&ltd->ltd_rw_sem); + + /* the OST array and bitmap are allocated/grown dynamically as OSTs are + * added to the LOD, see lod_add_device() */ + ltd->ltd_tgt_bitmap = CFS_ALLOCATE_BITMAP(32); + if (ltd->ltd_tgt_bitmap == NULL) + RETURN(-ENOMEM); + + ltd->ltd_tgts_size = 32; + ltd->ltd_tgtnr = 0; + + ltd->ltd_death_row = 0; + ltd->ltd_refcount = 0; + return 0; +} + static int lod_init0(const struct lu_env *env, struct lod_device *lod, struct lu_device_type *ldt, struct lustre_cfg *cfg) { struct dt_device_param ddp; - struct proc_dir_entry *lov_proc_dir; struct obd_device *obd; int rc; ENTRY; @@ -401,30 +678,25 @@ static int lod_init0(const struct lu_env *env, struct lod_device *lod, dt_conf_get(env, &lod->lod_dt_dev, &ddp); lod->lod_osd_max_easize = ddp.ddp_max_ea_size; - /* for compatibility we link old procfs's OSC entries to osp ones */ - lov_proc_dir = lprocfs_srch(proc_lustre_root, "lov"); - if (lov_proc_dir) { - cfs_proc_dir_entry_t *symlink = NULL; - char *name; - OBD_ALLOC(name, strlen(obd->obd_name) + 1); - if (name) { - strcpy(name, obd->obd_name); - if (strstr(name, "lov")) - symlink = lprocfs_add_symlink(name, - lov_proc_dir, - "../lod/%s", - obd->obd_name); - OBD_FREE(name, strlen(obd->obd_name) + 1); - lod->lod_symlink = symlink; - } - } + /* setup obd to be used with old lov code */ + rc = lod_pools_init(lod, 
cfg); + if (rc) + GOTO(out_disconnect, rc); - cfs_mutex_init(&lod->lod_mutex); - cfs_init_rwsem(&lod->lod_rw_sem); - cfs_spin_lock_init(&lod->lod_desc_lock); + rc = lod_procfs_init(lod); + if (rc) + GOTO(out_pools, rc); + + spin_lock_init(&lod->lod_desc_lock); + spin_lock_init(&lod->lod_connects_lock); + lod_tgt_desc_init(&lod->lod_mdt_descs); + lod_tgt_desc_init(&lod->lod_ost_descs); RETURN(0); +out_pools: + lod_pools_fini(lod); +out_disconnect: obd_disconnect(lod->lod_child_exp); RETURN(rc); } @@ -471,10 +743,22 @@ static struct lu_device *lod_device_fini(const struct lu_env *env, struct lu_device *d) { struct lod_device *lod = lu2lod_dev(d); + int rc; ENTRY; - if (lod->lod_symlink) - lprocfs_remove(&lod->lod_symlink); + lod_pools_fini(lod); + + lod_procfs_fini(lod); + + rc = lod_fini_tgt(lod, &lod->lod_ost_descs); + if (rc) + CERROR("%s:can not fini ost descs %d\n", + lod2obd(lod)->obd_name, rc); + + rc = lod_fini_tgt(lod, &lod->lod_mdt_descs); + if (rc) + CERROR("%s:can not fini mdt descs %d\n", + lod2obd(lod)->obd_name, rc); RETURN(NULL); } @@ -499,11 +783,11 @@ static int lod_obd_connect(const struct lu_env *env, struct obd_export **exp, *exp = class_conn2export(&conn); - cfs_mutex_lock(&lod->lod_mutex); + spin_lock(&lod->lod_connects_lock); lod->lod_connects++; /* at the moment we expect the only user */ LASSERT(lod->lod_connects == 1); - cfs_mutex_unlock(&lod->lod_mutex); + spin_unlock(&lod->lod_connects_lock); RETURN(0); } @@ -520,16 +804,16 @@ static int lod_obd_disconnect(struct obd_export *exp) ENTRY; /* Only disconnect the underlying layers on the final disconnect. */ - cfs_mutex_lock(&lod->lod_mutex); + spin_lock(&lod->lod_connects_lock); lod->lod_connects--; if (lod->lod_connects != 0) { /* why should there be more than 1 connect? 
*/ - cfs_mutex_unlock(&lod->lod_mutex); + spin_unlock(&lod->lod_connects_lock); CERROR("%s: disconnect #%d\n", exp->exp_obd->obd_name, lod->lod_connects); goto out; } - cfs_mutex_unlock(&lod->lod_mutex); + spin_unlock(&lod->lod_connects_lock); /* the last user of lod has gone, let's release the device */ release = 1; @@ -585,25 +869,37 @@ static struct lu_device_type lod_device_type = { .ldt_ctx_tags = LCT_MD_THREAD, }; -static int lod_obd_health_check(const struct lu_env *env, - struct obd_device *obd) +static int lod_obd_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm) { - struct lod_device *d = lu2lod_dev(obd->obd_lu_dev); - struct lod_ost_desc *ost; - int i, rc = 1; - ENTRY; + int rc = -EINVAL; - LASSERT(d); - lod_getref(d); - lod_foreach_ost(d, i) { - ost = OST_TGT(d, i); - LASSERT(ost && ost->ltd_ost); - rc = obd_health_check(env, ost->ltd_exp->exp_obd); - /* one healthy device is enough */ - if (rc == 0) - break; + if (KEY_IS(KEY_OSP_CONNECTED)) { + struct obd_device *obd = exp->exp_obd; + struct lod_device *d; + struct lod_ost_desc *ost; + int i, rc = 1; + + if (!obd->obd_set_up || obd->obd_stopping) + RETURN(-EAGAIN); + + d = lu2lod_dev(obd->obd_lu_dev); + lod_getref(&d->lod_ost_descs); + lod_foreach_ost(d, i) { + ost = OST_TGT(d, i); + LASSERT(ost && ost->ltd_ost); + + rc = obd_get_info(env, ost->ltd_exp, keylen, key, + vallen, val, lsm); + /* one healthy device is enough */ + if (rc == 0) + break; + } + lod_putref(d, &d->lod_ost_descs); + RETURN(rc); } - lod_putref(d); + RETURN(rc); } @@ -611,7 +907,11 @@ static struct obd_ops lod_obd_device_ops = { .o_owner = THIS_MODULE, .o_connect = lod_obd_connect, .o_disconnect = lod_obd_disconnect, - .o_health_check = lod_obd_health_check, + .o_get_info = lod_obd_get_info, + .o_pool_new = lod_pool_new, + .o_pool_rem = lod_pool_remove, + .o_pool_add = lod_pool_add, + .o_pool_del = lod_pool_del, }; static int __init 
lod_mod_init(void) @@ -626,8 +926,11 @@ static int __init lod_mod_init(void) lprocfs_lod_init_vars(&lvars); - rc = class_register_type(&lod_obd_device_ops, NULL, lvars.module_vars, - LUSTRE_LOD_NAME, &lod_device_type); + rc = class_register_type(&lod_obd_device_ops, NULL, NULL, +#ifndef HAVE_ONLY_PROCFS_SEQ + lvars.module_vars, +#endif + LUSTRE_LOD_NAME, &lod_device_type); if (rc) { lu_kmem_fini(lod_caches); return rc;