X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fofd%2Fofd_dev.c;h=302f4354c4660849ccc1421a5e80f5c4da8f7b11;hb=b2cb6fd1095f9c483b7bc1ebbbfdaef719aea87c;hp=4ee6feed1a8d741c07bd0ec87f8e661eeececf60;hpb=5065210e4d04fb5f67626e7f3c10e208556cdf8b;p=fs%2Flustre-release.git diff --git a/lustre/ofd/ofd_dev.c b/lustre/ofd/ofd_dev.c index 4ee6fee..302f435 100644 --- a/lustre/ofd/ofd_dev.c +++ b/lustre/ofd/ofd_dev.c @@ -61,11 +61,132 @@ static struct lu_kmem_descr ofd_caches[] = { } }; +static int ofd_connect_to_next(const struct lu_env *env, struct ofd_device *m, + const char *next, struct obd_export **exp) +{ + struct obd_connect_data *data = NULL; + struct obd_device *obd; + int rc; + ENTRY; + + OBD_ALLOC_PTR(data); + if (data == NULL) + GOTO(out, rc = -ENOMEM); + + obd = class_name2obd(next); + if (obd == NULL) { + CERROR("%s: can't locate next device: %s\n", + m->ofd_dt_dev.dd_lu_dev.ld_obd->obd_name, next); + GOTO(out, rc = -ENOTCONN); + } + + data->ocd_connect_flags = OBD_CONNECT_VERSION; + data->ocd_version = LUSTRE_VERSION_CODE; + + rc = obd_connect(NULL, exp, obd, &obd->obd_uuid, data, NULL); + if (rc) { + CERROR("%s: cannot connect to next dev %s: rc = %d\n", + m->ofd_dt_dev.dd_lu_dev.ld_obd->obd_name, next, rc); + GOTO(out, rc); + } + + m->ofd_dt_dev.dd_lu_dev.ld_site = + m->ofd_osd_exp->exp_obd->obd_lu_dev->ld_site; + LASSERT(m->ofd_dt_dev.dd_lu_dev.ld_site); + m->ofd_osd = lu2dt_dev(m->ofd_osd_exp->exp_obd->obd_lu_dev); + m->ofd_dt_dev.dd_lu_dev.ld_site->ls_top_dev = &m->ofd_dt_dev.dd_lu_dev; + +out: + if (data) + OBD_FREE_PTR(data); + RETURN(rc); +} + +static int ofd_stack_init(const struct lu_env *env, + struct ofd_device *m, struct lustre_cfg *cfg) +{ + const char *dev = lustre_cfg_string(cfg, 0); + struct lu_device *d; + struct ofd_thread_info *info = ofd_info(env); + struct lustre_mount_info *lmi; + int rc; + char *osdname; + + ENTRY; + + lmi = server_get_mount(dev); + if (lmi == NULL) { + CERROR("Cannot get mount info for %s!\n", dev); + RETURN(-ENODEV); + } + + /* find bottom osd */ + OBD_ALLOC(osdname, MTI_NAME_MAXLEN); + if (osdname == NULL) + RETURN(-ENOMEM); + + snprintf(osdname, MTI_NAME_MAXLEN, "%s-osd", dev); + rc = ofd_connect_to_next(env, m, osdname, &m->ofd_osd_exp); + OBD_FREE(osdname, MTI_NAME_MAXLEN); + if (rc) + RETURN(rc); + + d = m->ofd_osd_exp->exp_obd->obd_lu_dev; + LASSERT(d); + m->ofd_osd = lu2dt_dev(d); + + snprintf(info->fti_u.name, sizeof(info->fti_u.name), + "%s-osd", lustre_cfg_string(cfg, 0)); + + RETURN(rc); +} + +static void ofd_stack_fini(const struct lu_env *env, struct ofd_device *m, + struct lu_device *top) +{ + struct obd_device *obd = ofd_obd(m); + struct lustre_cfg_bufs bufs; + struct lustre_cfg *lcfg; + char flags[3] = ""; + + ENTRY; + + lu_site_purge(env, top->ld_site, ~0); + + /* process cleanup, pass mdt obd name to get obd umount flags */ + lustre_cfg_bufs_reset(&bufs, obd->obd_name); + if (obd->obd_force) + strcat(flags, "F"); + if (obd->obd_fail) + strcat(flags, "A"); + lustre_cfg_bufs_set_string(&bufs, 1, flags); + lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs); + if (!lcfg) { + CERROR("Cannot alloc lcfg!\n"); + RETURN_EXIT; + } + + LASSERT(top); + top->ld_ops->ldo_process_config(env, top, lcfg); + lustre_cfg_free(lcfg); + + lu_site_purge(env, top->ld_site, ~0); + + LASSERT(m->ofd_osd_exp); + obd_disconnect(m->ofd_osd_exp); + m->ofd_osd = NULL; + + EXIT; +} + /* used by MGS to process specific configurations */ static int ofd_process_config(const struct lu_env *env, struct lu_device *d, struct lustre_cfg *cfg) { - int rc = 0; + struct ofd_device *m = ofd_dev(d); + struct dt_device *dt_next = m->ofd_osd; + struct lu_device *next = &dt_next->dd_lu_dev; + int rc; ENTRY; @@ -76,6 +197,9 @@ static int ofd_process_config(const struct lu_env *env, struct lu_device *d, lprocfs_ofd_init_vars(&lvars); rc = class_process_proc_param(PARAM_OST, lvars.obd_vars, cfg, d->ld_obd); + if (rc > 0 || rc == -ENOSYS) + /* we don't understand; pass it on */ + rc = next->ld_ops->ldo_process_config(env, next, cfg); break; } case LCFG_SPTLRPC_CONF: { @@ -83,12 +207,63 @@ static int ofd_process_config(const struct lu_env *env, struct lu_device *d, break; } default: + /* others are passed further */ + rc = next->ld_ops->ldo_process_config(env, next, cfg); break; } RETURN(rc); } +static int ofd_object_init(const struct lu_env *env, struct lu_object *o, + const struct lu_object_conf *conf) +{ + struct ofd_device *d = ofd_dev(o->lo_dev); + struct lu_device *under; + struct lu_object *below; + int rc = 0; + + ENTRY; + + CDEBUG(D_INFO, "object init, fid = "DFID"\n", + PFID(lu_object_fid(o))); + + under = &d->ofd_osd->dd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, o->lo_header, under); + if (below != NULL) + lu_object_add(o, below); + else + rc = -ENOMEM; + + RETURN(rc); +} + +static void ofd_object_free(const struct lu_env *env, struct lu_object *o) +{ + struct ofd_object *of = ofd_obj(o); + struct lu_object_header *h; + + ENTRY; + + h = o->lo_header; + CDEBUG(D_INFO, "object free, fid = "DFID"\n", + PFID(lu_object_fid(o))); + + lu_object_fini(o); + lu_object_header_fini(h); + OBD_SLAB_FREE_PTR(of, ofd_object_kmem); + EXIT; +} + +static int ofd_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + return (*p)(env, cookie, LUSTRE_OST_NAME"-object@%p", o); +} + struct lu_object_operations ofd_obj_ops = { + .loo_object_init = ofd_object_init, + .loo_object_free = ofd_object_free, + .loo_object_print = ofd_object_print }; static struct lu_object *ofd_object_alloc(const struct lu_env *env, @@ -116,18 +291,36 @@ static struct lu_object *ofd_object_alloc(const struct lu_env *env, } } -static int ofd_start(const struct lu_env *env, struct lu_device *parent, - struct lu_device *dev) +extern int ost_handle(struct ptlrpc_request *req); + +static int ofd_prepare(const struct lu_env *env, struct lu_device *pdev, + struct lu_device *dev) { - struct obd_device *obd = dev->ld_obd; - int rc = 0; + struct ofd_thread_info *info; + struct ofd_device *ofd = ofd_dev(dev); + struct obd_device *obd = ofd_obd(ofd); + struct lu_device *next = &ofd->ofd_osd->dd_lu_dev; + int rc; ENTRY; - LASSERT(obd->obd_no_conn); - cfs_spin_lock(&obd->obd_dev_lock); - obd->obd_no_conn = 0; - cfs_spin_unlock(&obd->obd_dev_lock); + rc = lu_env_refill((struct lu_env *)env); + if (rc != 0) { + CERROR("Failure to refill session: '%d'\n", rc); + RETURN(rc); + } + + info = ofd_info_init(env, NULL); + if (info == NULL) + RETURN(-EFAULT); + + /* initialize lower device */ + rc = next->ld_ops->ldo_prepare(env, dev, next); + + target_recovery_init(&ofd->ofd_lut, ost_handle); + + if (obd->obd_recovering == 0) + ofd_postrecov(env, ofd); RETURN(rc); } @@ -135,25 +328,122 @@ static int ofd_start(const struct lu_env *env, struct lu_device *parent, static int ofd_recovery_complete(const struct lu_env *env, struct lu_device *dev) { - int rc = 0; + struct ofd_device *ofd = ofd_dev(dev); + struct lu_device *next = &ofd->ofd_osd->dd_lu_dev; + int rc = 0; ENTRY; + /* Grant space for object precreation on the self export. + * This initial reserved space (i.e. 20MB for zfs and 560KB for ldiskfs) + * is enough to create 20k objects. It is then adapted based on the + * precreate request size (see ofd_grant_create() + */ + ofd_grant_connect(env, dev->ld_obd->obd_self_export, + OST_MAX_PRECREATE * ofd->ofd_dt_conf.ddp_inodespace); + rc = next->ld_ops->ldo_recovery_complete(env, next); RETURN(rc); } static struct lu_device_operations ofd_lu_ops = { .ldo_object_alloc = ofd_object_alloc, .ldo_process_config = ofd_process_config, - .ldo_prepare = ofd_start, .ldo_recovery_complete = ofd_recovery_complete, + .ldo_prepare = ofd_prepare, }; +static int ofd_procfs_init(struct ofd_device *ofd) +{ + struct lprocfs_static_vars lvars; + struct obd_device *obd = ofd_obd(ofd); + cfs_proc_dir_entry_t *entry; + int rc = 0; + + ENTRY; + + /* lprocfs must be setup before the ofd so state can be safely added + * to /proc incrementally as the ofd is setup */ + lprocfs_ofd_init_vars(&lvars); + rc = lprocfs_obd_setup(obd, lvars.obd_vars); + if (rc) { + CERROR("%s: lprocfs_obd_setup failed: %d.\n", + obd->obd_name, rc); + RETURN(rc); + } + + rc = lprocfs_alloc_obd_stats(obd, LPROC_OFD_LAST); + if (rc) { + CERROR("%s: lprocfs_alloc_obd_stats failed: %d.\n", + obd->obd_name, rc); + GOTO(obd_cleanup, rc); + } + + /* Init OFD private stats here */ + lprocfs_counter_init(obd->obd_stats, LPROC_OFD_READ_BYTES, + LPROCFS_CNTR_AVGMINMAX, "read_bytes", "bytes"); + lprocfs_counter_init(obd->obd_stats, LPROC_OFD_WRITE_BYTES, + LPROCFS_CNTR_AVGMINMAX, "write_bytes", "bytes"); + + rc = lproc_ofd_attach_seqstat(obd); + if (rc) { + CERROR("%s: create seqstat failed: %d.\n", obd->obd_name, rc); + GOTO(free_obd_stats, rc); + } + + entry = lprocfs_register("exports", obd->obd_proc_entry, NULL, NULL); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CERROR("%s: error %d setting up lprocfs for %s\n", + obd->obd_name, rc, "exports"); + GOTO(free_obd_stats, rc); + } + obd->obd_proc_exports_entry = entry; + + entry = lprocfs_add_simple(obd->obd_proc_exports_entry, "clear", + lprocfs_nid_stats_clear_read, + lprocfs_nid_stats_clear_write, obd, NULL); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CERROR("%s: add proc entry 'clear' failed: %d.\n", + obd->obd_name, rc); + GOTO(free_obd_stats, rc); + } + + rc = lprocfs_job_stats_init(obd, LPROC_OFD_STATS_LAST, + ofd_stats_counter_init); + if (rc) + GOTO(remove_entry_clear, rc); + RETURN(0); +remove_entry_clear: + lprocfs_remove_proc_entry("clear", obd->obd_proc_exports_entry); +free_obd_stats: + lprocfs_free_obd_stats(obd); +obd_cleanup: + lprocfs_obd_cleanup(obd); + return rc; +} + +static int ofd_procfs_fini(struct ofd_device *ofd) +{ + struct obd_device *obd = ofd_obd(ofd); + + lprocfs_job_stats_fini(obd); + lprocfs_remove_proc_entry("clear", obd->obd_proc_exports_entry); + lprocfs_free_per_client_stats(obd); + lprocfs_free_obd_stats(obd); + lprocfs_obd_cleanup(obd); + return 0; +} + +extern int ost_handle(struct ptlrpc_request *req); + static int ofd_init0(const struct lu_env *env, struct ofd_device *m, struct lu_device_type *ldt, struct lustre_cfg *cfg) { const char *dev = lustre_cfg_string(cfg, 0); + struct ofd_thread_info *info = NULL; struct obd_device *obd; + struct obd_statfs *osfs; int rc; ENTRY; @@ -170,11 +460,49 @@ static int ofd_init0(const struct lu_env *env, struct ofd_device *m, obd->u.obt.obt_magic = OBT_MAGIC; + m->ofd_fmd_max_num = OFD_FMD_MAX_NUM_DEFAULT; + m->ofd_fmd_max_age = OFD_FMD_MAX_AGE_DEFAULT; + + cfs_spin_lock_init(&m->ofd_flags_lock); + m->ofd_raid_degraded = 0; + m->ofd_syncjournal = 0; + ofd_slc_set(m); + m->ofd_grant_compat_disable = 0; + + /* statfs data */ + cfs_spin_lock_init(&m->ofd_osfs_lock); + m->ofd_osfs_age = cfs_time_shift_64(-1000); + m->ofd_osfs_unstable = 0; + m->ofd_statfs_inflight = 0; + m->ofd_osfs_inflight = 0; + + /* grant data */ + cfs_spin_lock_init(&m->ofd_grant_lock); + m->ofd_tot_dirty = 0; + m->ofd_tot_granted = 0; + m->ofd_tot_pending = 0; + m->ofd_max_group = 0; + + cfs_rwlock_init(&obd->u.filter.fo_sptlrpc_lock); + sptlrpc_rule_set_init(&obd->u.filter.fo_sptlrpc_rset); + + obd->u.filter.fo_fl_oss_capa = 0; + CFS_INIT_LIST_HEAD(&obd->u.filter.fo_capa_keys); + obd->u.filter.fo_capa_hash = init_capa_hash(); + if (obd->u.filter.fo_capa_hash == NULL) + RETURN(-ENOMEM); + m->ofd_dt_dev.dd_lu_dev.ld_ops = &ofd_lu_ops; m->ofd_dt_dev.dd_lu_dev.ld_obd = obd; /* set this lu_device to obd, because error handling need it */ obd->obd_lu_dev = &m->ofd_dt_dev.dd_lu_dev; + rc = ofd_procfs_init(m); + if (rc) { + CERROR("Can't init ofd lprocfs, rc %d\n", rc); + RETURN(rc); + } + /* No connection accepted until configurations will finish */ obd->obd_no_conn = 1; obd->obd_replayable = 1; @@ -187,7 +515,75 @@ static int ofd_init0(const struct lu_env *env, struct ofd_device *m, } } + info = ofd_info_init(env, NULL); + if (info == NULL) + RETURN(-EFAULT); + + rc = ofd_stack_init(env, m, cfg); + if (rc) { + CERROR("Can't init device stack, rc %d\n", rc); + GOTO(err_fini_proc, rc); + } + + /* populate cached statfs data */ + osfs = &ofd_info(env)->fti_u.osfs; + rc = ofd_statfs_internal(env, m, osfs, 0, NULL); + if (rc != 0) { + CERROR("%s: can't get statfs data, rc %d\n", obd->obd_name, rc); + GOTO(err_fini_stack, rc); + } + if (!IS_PO2(osfs->os_bsize)) { + CERROR("%s: blocksize (%d) is not a power of 2\n", + obd->obd_name, osfs->os_bsize); + GOTO(err_fini_stack, rc = -EPROTO); + } + m->ofd_blockbits = cfs_fls(osfs->os_bsize) - 1; + + snprintf(info->fti_u.name, sizeof(info->fti_u.name), "filter-%p", m); + m->ofd_namespace = ldlm_namespace_new(obd, info->fti_u.name, + LDLM_NAMESPACE_SERVER, + LDLM_NAMESPACE_GREEDY, + LDLM_NS_TYPE_OST); + if (m->ofd_namespace == NULL) + GOTO(err_fini_stack, rc = -ENOMEM); + /* set obd_namespace for compatibility with old code */ + obd->obd_namespace = m->ofd_namespace; + ldlm_register_intent(m->ofd_namespace, ofd_intent_policy); + m->ofd_namespace->ns_lvbo = &ofd_lvbo; + m->ofd_namespace->ns_lvbp = m; + + ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL, + "filter_ldlm_cb_client", &obd->obd_ldlm_client); + + dt_conf_get(env, m->ofd_osd, &m->ofd_dt_conf); + + /* Allow at most ddp_grant_reserved% of the available filesystem space + * to be granted to clients, so that any errors in the grant overhead + * calculations do not allow granting more space to clients than can be + * written. Assumes that in aggregate the grant overhead calculations do + * not have more than ddp_grant_reserved% estimation error in them. */ + m->ofd_grant_ratio = + ofd_grant_ratio_conv(m->ofd_dt_conf.ddp_grant_reserved); + + rc = lut_init(env, &m->ofd_lut, obd, m->ofd_osd); + if (rc) + GOTO(err_free_ns, rc); + + rc = ofd_fs_setup(env, m, obd); + if (rc) + GOTO(err_fini_lut, rc); + RETURN(0); +err_fini_lut: + lut_fini(env, &m->ofd_lut); +err_free_ns: + ldlm_namespace_free(m->ofd_namespace, 0, obd->obd_force); + obd->obd_namespace = m->ofd_namespace = NULL; +err_fini_stack: + ofd_stack_fini(env, m, &m->ofd_osd->dd_lu_dev); +err_fini_proc: + ofd_procfs_fini(m); + return rc; } static void ofd_fini(const struct lu_env *env, struct ofd_device *m) @@ -195,14 +591,30 @@ static void ofd_fini(const struct lu_env *env, struct ofd_device *m) struct obd_device *obd = ofd_obd(m); struct lu_device *d = &m->ofd_dt_dev.dd_lu_dev; + target_recovery_fini(obd); obd_exports_barrier(obd); obd_zombie_barrier(); + lut_fini(env, &m->ofd_lut); + ofd_fs_cleanup(env, m); + + ofd_free_capa_keys(m); + cleanup_capa_hash(obd->u.filter.fo_capa_hash); + + if (m->ofd_namespace != NULL) { + ldlm_namespace_free(m->ofd_namespace, NULL, + d->ld_obd->obd_force); + d->ld_obd->obd_namespace = m->ofd_namespace = NULL; + } + + ofd_stack_fini(env, m, &m->ofd_dt_dev.dd_lu_dev); + ofd_procfs_fini(m); LASSERT(cfs_atomic_read(&d->ld_ref) == 0); + server_put_mount(obd->obd_name, NULL); EXIT; } -static struct lu_device* ofd_device_fini(const struct lu_env *env, +static struct lu_device *ofd_device_fini(const struct lu_env *env, struct lu_device *d) { ENTRY; @@ -252,6 +664,17 @@ static void ofd_key_exit(const struct lu_context *ctx, struct ofd_thread_info *info = data; info->fti_env = NULL; + info->fti_exp = NULL; + + info->fti_xid = 0; + info->fti_transno = 0; + info->fti_pre_version = 0; + info->fti_obj = NULL; + info->fti_has_trans = 0; + info->fti_mult_trans = 0; + info->fti_used = 0; + + memset(&info->fti_attr, 0, sizeof info->fti_attr); } struct lu_context_key ofd_thread_key = { @@ -292,6 +715,12 @@ int __init ofd_init(void) if (rc) return rc; + rc = ofd_fmd_init(); + if (rc) { + lu_kmem_fini(ofd_caches); + return(rc); + } + lprocfs_ofd_init_vars(&lvars); rc = class_register_type(&ofd_obd_ops, NULL, lvars.module_vars, @@ -301,8 +730,9 @@ int __init ofd_init(void) void __exit ofd_exit(void) { - class_unregister_type(LUSTRE_OST_NAME); + ofd_fmd_exit(); lu_kmem_fini(ofd_caches); + class_unregister_type(LUSTRE_OST_NAME); } MODULE_AUTHOR("Whamcloud, Inc. ");