From: Mikhal Pershin Date: Sun, 11 Dec 2016 17:16:25 +0000 (+0300) Subject: LU-3285 mdt: use generic grant code at MDT X-Git-Tag: 2.10.56~64^2~10 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=0697cf73c74d92bb89e89b188a6793fb28eb0e8c LU-3285 mdt: use generic grant code at MDT Use grants at MDT for Data-on-MDT needs. Add a parameter to reserve part of the available space for metadata so that it is never granted to clients. Test-Parameters: mdssizegb=20 testlist=dom-performance Signed-off-by: Mikhal Pershin Change-Id: I2612352062871e4edd3817f32e7d96cb95a0a52b Reviewed-on: https://review.whamcloud.com/28021 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Jinshan Xiong --- diff --git a/lustre/include/lu_target.h b/lustre/include/lu_target.h index fa9ad61..0e6ed95 100644 --- a/lustre/include/lu_target.h +++ b/lustre/include/lu_target.h @@ -126,6 +126,9 @@ struct tg_grants_data { u64 tgd_tot_granted; /* grant used by I/Os in progress (between prepare and commit) */ u64 tgd_tot_pending; + /* amount of available space in percentage that is never used for + * grants, used on MDT to always keep space for metadata. 
*/ + u64 tgd_reserved_pcnt; /* number of clients using grants */ int tgd_tot_granted_clients; /* shall we grant space to clients not @@ -520,6 +523,13 @@ long tgt_grant_create(const struct lu_env *env, struct obd_export *exp, int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, struct obd_statfs *osfs, __u64 max_age, int *from_cache); +int tgt_tot_dirty_seq_show(struct seq_file *m, void *data); +int tgt_tot_granted_seq_show(struct seq_file *m, void *data); +int tgt_tot_pending_seq_show(struct seq_file *m, void *data); +int tgt_grant_compat_disable_seq_show(struct seq_file *m, void *data); +ssize_t tgt_grant_compat_disable_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off); /* target/update_trans.c */ int distribute_txn_init(const struct lu_env *env, diff --git a/lustre/include/lustre_osc.h b/lustre/include/lustre_osc.h index d2a3d8d..0f23af7 100644 --- a/lustre/include/lustre_osc.h +++ b/lustre/include/lustre_osc.h @@ -655,7 +655,10 @@ int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, struct ptlrpc_request_set *set); int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd, struct hlist_node *hnode, void *arg); - +int osc_reconnect(const struct lu_env *env, struct obd_export *exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata); +int osc_disconnect(struct obd_export *exp); int osc_punch_send(struct obd_export *exp, struct obdo *oa, obd_enqueue_update_f upcall, void *cookie); diff --git a/lustre/include/uapi/linux/lustre/lustre_idl.h b/lustre/include/uapi/linux/lustre/lustre_idl.h index 71c77e0..7118cdc 100644 --- a/lustre/include/uapi/linux/lustre/lustre_idl.h +++ b/lustre/include/uapi/linux/lustre/lustre_idl.h @@ -850,6 +850,7 @@ struct ptlrpc_body_v2 { OBD_CONNECT_BULK_MBITS | OBD_CONNECT_CKSUM | \ OBD_CONNECT_MULTIMODRPCS | \ OBD_CONNECT_SUBTREE | OBD_CONNECT_LARGE_ACL | \ + OBD_CONNECT_GRANT_PARAM | \ 
OBD_CONNECT_FLAGS2) #define MDT_CONNECT_SUPPORTED2 OBD_CONNECT2_FILE_SECCTX diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 003b44b..960f90c 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -196,6 +196,10 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, RETURN(-ENOMEM); } + /* pass client page size via ocd_grant_blkbits, the server should report + * back its backend blocksize for grant calculation purpose */ + data->ocd_grant_blkbits = PAGE_SHIFT; + /* indicate MDT features supported by this client */ data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | OBD_CONNECT_ATTRFID | OBD_CONNECT_GRANT | @@ -216,7 +220,8 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, OBD_CONNECT_DIR_STRIPE | OBD_CONNECT_BULK_MBITS | OBD_CONNECT_CKSUM | OBD_CONNECT_SUBTREE | - OBD_CONNECT_FLAGS2 | OBD_CONNECT_MULTIMODRPCS; + OBD_CONNECT_MULTIMODRPCS | + OBD_CONNECT_GRANT_PARAM | OBD_CONNECT_FLAGS2; data->ocd_connect_flags2 = 0; diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index 411552c..18eb9ba 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -2612,7 +2612,8 @@ static struct obd_ops mdc_obd_ops = { .o_add_conn = client_import_add_conn, .o_del_conn = client_import_del_conn, .o_connect = client_connect_import, - .o_disconnect = client_disconnect_export, + .o_reconnect = osc_reconnect, + .o_disconnect = osc_disconnect, .o_iocontrol = mdc_iocontrol, .o_set_info_async = mdc_set_info_async, .o_statfs = mdc_statfs, diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 2791ad3..966a5d2 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -415,7 +415,8 @@ static int mdt_statfs(struct tgt_session_info *tsi) { struct ptlrpc_request *req = tgt_ses_req(tsi); struct mdt_thread_info *info = tsi2mdt_info(tsi); - struct md_device *next = info->mti_mdt->mdt_child; + struct mdt_device *mdt = 
info->mti_mdt; + struct tg_grants_data *tgd = &mdt->mdt_lut.lut_tgd; struct ptlrpc_service_part *svcpt; struct obd_statfs *osfs; int rc; @@ -440,24 +441,44 @@ static int mdt_statfs(struct tgt_session_info *tsi) if (!osfs) GOTO(out, rc = -EPROTO); - /** statfs information are cached in the mdt_device */ - if (cfs_time_before_64(info->mti_mdt->mdt_osfs_age, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS))) { - /** statfs data is too old, get up-to-date one */ - rc = next->md_ops->mdo_statfs(info->mti_env, next, osfs); - if (rc) - GOTO(out, rc); - spin_lock(&info->mti_mdt->mdt_lock); - info->mti_mdt->mdt_osfs = *osfs; - info->mti_mdt->mdt_osfs_age = cfs_time_current_64(); - spin_unlock(&info->mti_mdt->mdt_lock); - } else { - /** use cached statfs data */ - spin_lock(&info->mti_mdt->mdt_lock); - *osfs = info->mti_mdt->mdt_osfs; - spin_unlock(&info->mti_mdt->mdt_lock); - } + rc = tgt_statfs_internal(tsi->tsi_env, &mdt->mdt_lut, osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + NULL); + if (unlikely(rc)) + GOTO(out, rc); + /* at least try to account for cached pages. 
it's still racy and + * might be under-reporting if clients haven't announced their + * caches with brw recently */ + CDEBUG(D_SUPER | D_CACHE, "blocks cached %llu granted %llu" + " pending %llu free %llu avail %llu\n", + tgd->tgd_tot_dirty, tgd->tgd_tot_granted, + tgd->tgd_tot_pending, + osfs->os_bfree << tgd->tgd_blockbits, + osfs->os_bavail << tgd->tgd_blockbits); + + osfs->os_bavail -= min_t(u64, osfs->os_bavail, + ((tgd->tgd_tot_dirty + tgd->tgd_tot_pending + + osfs->os_bsize - 1) >> tgd->tgd_blockbits)); + + tgt_grant_sanity_check(mdt->mdt_lu_dev.ld_obd, __func__); + CDEBUG(D_CACHE, "%llu blocks: %llu free, %llu avail; " + "%llu objects: %llu free; state %x\n", + osfs->os_blocks, osfs->os_bfree, osfs->os_bavail, + osfs->os_files, osfs->os_ffree, osfs->os_state); + + if (!exp_grant_param_supp(tsi->tsi_exp) && + tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT) { + /* clients which don't support OBD_CONNECT_GRANT_PARAM + * should not see a block size > page size, otherwise + * cl_lost_grant goes mad. Therefore, we emulate a 4KB (=2^12) + * block size which is the biggest block size known to work + * with all clients' page sizes. 
*/ + osfs->os_blocks <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT; + osfs->os_bfree <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT; + osfs->os_bavail <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT; + osfs->os_bsize = 1 << COMPAT_BSIZE_SHIFT; + } if (rc == 0) mdt_counter_incr(req, LPROC_MDT_STATFS); out: @@ -4931,8 +4952,9 @@ static int mdt_postrecov(const struct lu_env *, struct mdt_device *); static int mdt_init0(const struct lu_env *env, struct mdt_device *m, struct lu_device_type *ldt, struct lustre_cfg *cfg) { - struct mdt_thread_info *info; - struct obd_device *obd; + struct mdt_thread_info *info; + struct obd_device *obd; + struct tg_grants_data *tgd = &m->mdt_lut.lut_tgd; const char *dev = lustre_cfg_string(cfg, 0); const char *num = lustre_cfg_string(cfg, 2); struct lustre_mount_info *lmi = NULL; @@ -4989,7 +5011,6 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m, INIT_LIST_HEAD(&m->mdt_squash.rsi_nosquash_nids); init_rwsem(&m->mdt_squash.rsi_sem); spin_lock_init(&m->mdt_lock); - m->mdt_osfs_age = cfs_time_shift_64(-1000); m->mdt_enable_remote_dir = 0; m->mdt_enable_remote_dir_gid = 0; @@ -5070,6 +5091,15 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m, if (rc) GOTO(err_free_hsm, rc); + /* Amount of available space excluded from granting and reserved + * for metadata. It is in percentage and 50% is default value. 
*/ + tgd->tgd_reserved_pcnt = 50; + + if (ONE_MB_BRW_SIZE < (1U << tgd->tgd_blockbits)) + m->mdt_brw_size = 1U << tgd->tgd_blockbits; + else + m->mdt_brw_size = ONE_MB_BRW_SIZE; + rc = mdt_fs_setup(env, m, obd, lsi); if (rc) GOTO(err_tgt, rc); @@ -5474,7 +5504,8 @@ static int mdt_connect_internal(const struct lu_env *env, data->ocd_connect_flags &= ~OBD_CONNECT_XATTR; if (OCD_HAS_FLAG(data, BRW_SIZE)) { - data->ocd_brw_size = min(data->ocd_brw_size, MD_MAX_BRW_SIZE); + data->ocd_brw_size = min(data->ocd_brw_size, + mdt->mdt_brw_size); if (data->ocd_brw_size == 0) { CERROR("%s: cli %s/%p ocd_connect_flags: %#llx " "ocd_version: %x ocd_grant: %d ocd_index: %u " @@ -5488,9 +5519,29 @@ static int mdt_connect_internal(const struct lu_env *env, } } - if (OCD_HAS_FLAG(data, GRANT)) - data->ocd_grant = mdt_grant_connect(env, exp, data->ocd_grant, - !reconnect); + if (OCD_HAS_FLAG(data, GRANT_PARAM)) { + struct dt_device_param *ddp = &mdt->mdt_lut.lut_dt_conf; + + /* client is reporting its page size, for future use */ + exp->exp_target_data.ted_pagebits = data->ocd_grant_blkbits; + data->ocd_grant_blkbits = mdt->mdt_lut.lut_tgd.tgd_blockbits; + /* ddp_inodespace may not be power-of-two value, eg. for ldiskfs + * it's LDISKFS_DIR_REC_LEN(20) = 28. */ + data->ocd_grant_inobits = fls(ddp->ddp_inodespace - 1); + /* ocd_grant_tax_kb is in 1K byte blocks */ + data->ocd_grant_tax_kb = ddp->ddp_extent_tax >> 10; + data->ocd_grant_max_blks = ddp->ddp_max_extent_blks; + } + + if (OCD_HAS_FLAG(data, GRANT)) { + /* Save connect_data we have so far because tgt_grant_connect() + * uses it to calculate grant. 
*/ + exp->exp_connect_data = *data; + tgt_grant_connect(env, exp, data, !reconnect); + } + + if (OCD_HAS_FLAG(data, MAXBYTES)) + data->ocd_maxbytes = mdt->mdt_lut.lut_dt_conf.ddp_maxbytes; /* NB: Disregard the rule against updating * exp_connect_data.ocd_connect_flags in this case, since @@ -5685,11 +5736,15 @@ static inline void mdt_disable_slc(struct mdt_device *mdt) static int mdt_obd_disconnect(struct obd_export *exp) { - int rc; - ENTRY; + int rc; + + ENTRY; - LASSERT(exp); - class_export_get(exp); + LASSERT(exp); + class_export_get(exp); + + if (!(exp->exp_flags & OBD_OPT_FORCE)) + tgt_grant_sanity_check(exp->exp_obd, __func__); if ((exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) && !(exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT)) { @@ -5703,6 +5758,8 @@ static int mdt_obd_disconnect(struct obd_export *exp) if (rc != 0) CDEBUG(D_IOCTL, "server disconnect error: rc = %d\n", rc); + tgt_grant_discard(exp); + rc = mdt_export_cleanup(exp); nodemap_del_member(exp); class_export_put(exp); @@ -5873,6 +5930,17 @@ static int mdt_destroy_export(struct obd_export *exp) LASSERT(list_empty(&exp->exp_outstanding_replies)); LASSERT(list_empty(&exp->exp_mdt_data.med_open_head)); + /* + * discard grants once we're sure no more + * interaction with the client is possible + */ + tgt_grant_discard(exp); + if (exp_connect_flags(exp) & OBD_CONNECT_GRANT) + exp->exp_obd->u.obt.obt_lut->lut_tgd.tgd_tot_granted_clients--; + + if (!(exp->exp_flags & OBD_OPT_FORCE)) + tgt_grant_sanity_check(exp->exp_obd, __func__); + RETURN(0); } diff --git a/lustre/mdt/mdt_internal.h b/lustre/mdt/mdt_internal.h index f81cfa1..522e03d 100644 --- a/lustre/mdt/mdt_internal.h +++ b/lustre/mdt/mdt_internal.h @@ -221,6 +221,9 @@ struct mdt_device { int mdt_max_ea_size; + /* preferred BRW size, decided by storage type and capability */ + __u32 mdt_brw_size; + struct upcall_cache *mdt_identity_cache; unsigned int mdt_capa_conf:1, @@ -233,10 +236,6 @@ struct mdt_device { /* lock for osfs and md_root */ 
spinlock_t mdt_lock; - /* statfs optimization: we cache a bit */ - struct obd_statfs mdt_osfs; - __u64 mdt_osfs_age; - /* root squash */ struct root_squash_info mdt_squash; diff --git a/lustre/mdt/mdt_io.c b/lustre/mdt/mdt_io.c index 2548612..caaadf7 100644 --- a/lustre/mdt/mdt_io.c +++ b/lustre/mdt/mdt_io.c @@ -33,47 +33,6 @@ #include #include "mdt_internal.h" -/* --------------- MDT grant code ---------------- */ - -long mdt_grant_connect(const struct lu_env *env, - struct obd_export *exp, - u64 want, bool conservative) -{ - struct mdt_device *mdt = mdt_exp2dev(exp); - u64 left; - long grant; - - ENTRY; - - dt_statfs(env, mdt->mdt_bottom, &mdt->mdt_osfs); - - left = (mdt->mdt_osfs.os_bavail * mdt->mdt_osfs.os_bsize) / 2; - - grant = left; - - CDEBUG(D_CACHE, "%s: cli %s/%p ocd_grant: %ld want: %llu left: %llu\n", - exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, - exp, grant, want, left); - - return grant; -} - -void mdt_grant_prepare_write(const struct lu_env *env, - struct obd_export *exp, struct obdo *oa, - struct niobuf_remote *rnb, int niocount) -{ - struct mdt_device *mdt = mdt_exp2dev(exp); - u64 left; - - ENTRY; - - left = (mdt->mdt_osfs.os_bavail * mdt->mdt_osfs.os_bsize) / 2; - - /* grant more space back to the client if possible */ - oa->o_grant = left; -} -/* ---------------- end of MDT grant code ---------------- */ - /* functions below are stubs for now, they will be implemented with * grant support on MDT */ static inline void mdt_io_counter_incr(struct obd_export *exp, int opcode, @@ -82,19 +41,6 @@ static inline void mdt_io_counter_incr(struct obd_export *exp, int opcode, return; } -void mdt_grant_prepare_read(const struct lu_env *env, - struct obd_export *exp, struct obdo *oa) -{ - return; -} - -void mdt_grant_commit(struct obd_export *exp, unsigned long pending, - int rc) -{ - return; - -} - static inline void mdt_dom_read_lock(struct mdt_object *mo) { down_read(&mo->mot_dom_sem); @@ -174,7 +120,7 @@ static int mdt_preprw_write(const struct 
lu_env *env, struct obd_export *exp, /* Process incoming grant info, set OBD_BRW_GRANTED flag and grant some * space back if possible */ - mdt_grant_prepare_write(env, exp, oa, rnb, obj->ioo_bufcnt); + tgt_grant_prepare_write(env, exp, oa, rnb, obj->ioo_bufcnt); mdt_dom_read_lock(mo); if (!mdt_object_exists(mo)) { @@ -191,8 +137,11 @@ static int mdt_preprw_write(const struct lu_env *env, struct obd_export *exp, if (unlikely(rc < 0)) GOTO(err, rc); /* correct index for local buffers to continue with */ - for (k = 0; k < rc; k++) - lnb[j+k].lnb_flags = rnb[i].rnb_flags; + for (k = 0; k < rc; k++) { + lnb[j + k].lnb_flags = rnb[i].rnb_flags; + if (!(rnb[i].rnb_flags & OBD_BRW_GRANTED)) + lnb[j + k].lnb_rc = -ENOSPC; + } j += rc; *nr_local += rc; tot_bytes += rnb[i].rnb_len; @@ -209,11 +158,11 @@ err: unlock: mdt_dom_read_unlock(mo); /* tgt_grant_prepare_write() was called, so we must commit */ - mdt_grant_commit(exp, oa->o_grant_used, rc); + tgt_grant_commit(exp, oa->o_grant_used, rc); /* let's still process incoming grant information packed in the oa, * but without enforcing grant since we won't proceed with the write. * Just like a read request actually. 
*/ - mdt_grant_prepare_read(env, exp, oa); + tgt_grant_prepare_read(env, exp, oa); return rc; } @@ -256,7 +205,7 @@ int mdt_obd_preprw(const struct lu_env *env, int cmd, struct obd_export *exp, objcount, obj, rnb, nr_local, lnb, jobid); } else if (cmd == OBD_BRW_READ) { - mdt_grant_prepare_read(env, exp, oa); + tgt_grant_prepare_read(env, exp, oa); rc = mdt_preprw_read(env, exp, mdt, mo, la, obj->ioo_bufcnt, rnb, nr_local, lnb, jobid); @@ -368,6 +317,12 @@ out_stop: if (rc == -ENOSPC) th->th_sync = 1; + + if (rc == 0 && granted > 0) { + if (tgt_grant_commit_cb_add(th, exp, granted) == 0) + granted = 0; + } + th->th_result = rc; dt_trans_stop(env, dt, th); if (rc == -ENOSPC && retries++ < 3) { @@ -379,7 +334,8 @@ out_stop: out: dt_bufs_put(env, dob, lnb, niocount); mdt_dom_read_unlock(mo); - mdt_grant_commit(exp, granted, old_rc); + if (granted > 0) + tgt_grant_commit(exp, granted, old_rc); RETURN(rc); } diff --git a/lustre/mdt/mdt_lproc.c b/lustre/mdt/mdt_lproc.c index c60742e..15deaae 100644 --- a/lustre/mdt/mdt_lproc.c +++ b/lustre/mdt/mdt_lproc.c @@ -775,7 +775,20 @@ LPROC_SEQ_FOPS(mdt_hsm_cdt_control); LPROC_SEQ_FOPS_RW_TYPE(mdt, recovery_time_hard); LPROC_SEQ_FOPS_RW_TYPE(mdt, recovery_time_soft); +LPROC_SEQ_FOPS_RO(tgt_tot_dirty); +LPROC_SEQ_FOPS_RO(tgt_tot_granted); +LPROC_SEQ_FOPS_RO(tgt_tot_pending); +LPROC_SEQ_FOPS(tgt_grant_compat_disable); + static struct lprocfs_vars lprocfs_mdt_obd_vars[] = { + { .name = "tot_dirty", + .fops = &tgt_tot_dirty_fops }, + { .name = "tot_pending", + .fops = &tgt_tot_pending_fops }, + { .name = "tot_granted", + .fops = &tgt_tot_granted_fops }, + { .name = "grant_compat_disable", + .fops = &tgt_grant_compat_disable_fops }, { .name = "recovery_status", .fops = &mdt_recovery_status_fops }, { .name = "num_exports", diff --git a/lustre/ofd/lproc_ofd.c b/lustre/ofd/lproc_ofd.c index b23eda2..27fa94d 100644 --- a/lustre/ofd/lproc_ofd.c +++ b/lustre/ofd/lproc_ofd.c @@ -70,69 +70,6 @@ static int ofd_seqs_seq_show(struct seq_file *m, 
void *data) LPROC_SEQ_FOPS_RO(ofd_seqs); /** - * Show estimate of total amount of dirty data on clients. - * - * \param[in] m seq_file handle - * \param[in] data unused for single entry - * - * \retval 0 on success - * \retval negative value on error - */ -static int ofd_tot_dirty_seq_show(struct seq_file *m, void *data) -{ - struct obd_device *obd = m->private; - struct tg_grants_data *tgd; - - LASSERT(obd != NULL); - tgd = &obd->u.obt.obt_lut->lut_tgd; - seq_printf(m, "%llu\n", tgd->tgd_tot_dirty); - return 0; -} -LPROC_SEQ_FOPS_RO(ofd_tot_dirty); - -/** - * Show total amount of space granted to clients. - * - * \param[in] m seq_file handle - * \param[in] data unused for single entry - * - * \retval 0 on success - * \retval negative value on error - */ -static int ofd_tot_granted_seq_show(struct seq_file *m, void *data) -{ - struct obd_device *obd = m->private; - struct tg_grants_data *tgd; - - LASSERT(obd != NULL); - tgd = &obd->u.obt.obt_lut->lut_tgd; - seq_printf(m, "%llu\n", tgd->tgd_tot_granted); - return 0; -} -LPROC_SEQ_FOPS_RO(ofd_tot_granted); - -/** - * Show total amount of space used by IO in progress. - * - * \param[in] m seq_file handle - * \param[in] data unused for single entry - * - * \retval 0 on success - * \retval negative value on error - */ -static int ofd_tot_pending_seq_show(struct seq_file *m, void *data) -{ - struct obd_device *obd = m->private; - struct tg_grants_data *tgd; - - LASSERT(obd != NULL); - tgd = &obd->u.obt.obt_lut->lut_tgd; - seq_printf(m, "%llu\n", tgd->tgd_tot_pending); - return 0; -} -LPROC_SEQ_FOPS_RO(ofd_tot_pending); - -/** * Show total number of grants for precreate. * * \param[in] m seq_file handle @@ -634,70 +571,6 @@ ofd_sync_lock_cancel_seq_write(struct file *file, const char __user *buffer, LPROC_SEQ_FOPS(ofd_sync_lock_cancel); /** - * Show if grants compatibility mode is disabled. - * - * When tgd_grant_compat_disable is set, we don't grant any space to clients - * not supporting OBD_CONNECT_GRANT_PARAM. 
Otherwise, space granted to such - * a client is inflated since it consumes PAGE_SIZE of grant space per - * block, (i.e. typically 4kB units), but underlaying file system might have - * block size bigger than page size, e.g. ZFS. See LU-2049 for details. - * - * \param[in] m seq_file handle - * \param[in] data unused for single entry - * - * \retval 0 on success - * \retval negative value on error - */ -static int ofd_grant_compat_disable_seq_show(struct seq_file *m, void *data) -{ - struct obd_device *obd = m->private; - struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; - - seq_printf(m, "%u\n", tgd->tgd_grant_compat_disable); - return 0; -} - -/** - * Change grant compatibility mode. - * - * Setting tgd_grant_compat_disable prohibit any space granting to clients - * not supporting OBD_CONNECT_GRANT_PARAM. See details above. - * - * \param[in] file proc file - * \param[in] buffer string which represents mode - * 1: disable compatibility mode - * 0: enable compatibility mode - * \param[in] count \a buffer length - * \param[in] off unused for single entry - * - * \retval \a count on success - * \retval negative number on error - */ -static ssize_t -ofd_grant_compat_disable_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct seq_file *m = file->private_data; - struct obd_device *obd = m->private; - struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; - __s64 val; - int rc; - - rc = lprocfs_str_to_s64(buffer, count, &val); - if (rc) - return rc; - - if (val < 0) - return -EINVAL; - - tgd->tgd_grant_compat_disable = !!val; - - return count; -} -LPROC_SEQ_FOPS(ofd_grant_compat_disable); - -/** * Show the limit of soft sync RPCs. 
* * This value defines how many IO RPCs with OBD_BRW_SOFT_SYNC flag @@ -898,6 +771,11 @@ LPROC_SEQ_FOPS_RW_TYPE(ofd, ir_factor); LPROC_SEQ_FOPS_RW_TYPE(ofd, checksum_dump); LPROC_SEQ_FOPS_RW_TYPE(ofd, job_interval); +LPROC_SEQ_FOPS_RO(tgt_tot_dirty); +LPROC_SEQ_FOPS_RO(tgt_tot_granted); +LPROC_SEQ_FOPS_RO(tgt_tot_pending); +LPROC_SEQ_FOPS(tgt_grant_compat_disable); + struct lprocfs_vars lprocfs_ofd_obd_vars[] = { { .name = "seqs_allocated", .fops = &ofd_seqs_fops }, @@ -906,11 +784,11 @@ struct lprocfs_vars lprocfs_ofd_obd_vars[] = { { .name = "last_id", .fops = &ofd_last_id_fops }, { .name = "tot_dirty", - .fops = &ofd_tot_dirty_fops }, + .fops = &tgt_tot_dirty_fops }, { .name = "tot_pending", - .fops = &ofd_tot_pending_fops }, + .fops = &tgt_tot_pending_fops }, { .name = "tot_granted", - .fops = &ofd_tot_granted_fops }, + .fops = &tgt_tot_granted_fops }, { .name = "grant_precreate", .fops = &ofd_grant_precreate_fops }, { .name = "precreate_batch", @@ -940,7 +818,7 @@ struct lprocfs_vars lprocfs_ofd_obd_vars[] = { { .name = "checksum_dump", .fops = &ofd_checksum_dump_fops }, { .name = "grant_compat_disable", - .fops = &ofd_grant_compat_disable_fops }, + .fops = &tgt_grant_compat_disable_fops }, { .name = "client_cache_count", .fops = &ofd_fmd_max_num_fops }, { .name = "client_cache_seconds", diff --git a/lustre/ofd/ofd_dev.c b/lustre/ofd/ofd_dev.c index 43e9acc..fcb477a 100644 --- a/lustre/ofd/ofd_dev.c +++ b/lustre/ofd/ofd_dev.c @@ -2902,7 +2902,6 @@ static int ofd_init0(const struct lu_env *env, struct ofd_device *m, struct ofd_thread_info *info = NULL; struct obd_device *obd; struct tg_grants_data *tgd = &m->ofd_lut.lut_tgd; - struct obd_statfs *osfs; struct lu_fid fid; struct nm_config_file *nodemap_config; struct obd_device_target *obt; @@ -2930,22 +2929,8 @@ static int ofd_init0(const struct lu_env *env, struct ofd_device *m, m->ofd_raid_degraded = 0; m->ofd_syncjournal = 0; ofd_slc_set(m); - tgd->tgd_grant_compat_disable = 0; m->ofd_soft_sync_limit = 
OFD_SOFT_SYNC_LIMIT_DEFAULT; - /* statfs data */ - spin_lock_init(&tgd->tgd_osfs_lock); - tgd->tgd_osfs_age = cfs_time_shift_64(-1000); - tgd->tgd_osfs_unstable = 0; - tgd->tgd_statfs_inflight = 0; - tgd->tgd_osfs_inflight = 0; - - /* grant data */ - spin_lock_init(&tgd->tgd_grant_lock); - tgd->tgd_tot_dirty = 0; - tgd->tgd_tot_granted = 0; - tgd->tgd_tot_pending = 0; - m->ofd_seq_count = 0; init_waitqueue_head(&m->ofd_inconsistency_thread.t_ctl_waitq); INIT_LIST_HEAD(&m->ofd_inconsistency_list); @@ -3008,27 +2993,13 @@ static int ofd_init0(const struct lu_env *env, struct ofd_device *m, ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL, "filter_ldlm_cb_client", &obd->obd_ldlm_client); - dt_conf_get(env, m->ofd_osd, &m->ofd_lut.lut_dt_conf); - rc = tgt_init(env, &m->ofd_lut, obd, m->ofd_osd, ofd_common_slice, OBD_FAIL_OST_ALL_REQUEST_NET, OBD_FAIL_OST_ALL_REPLY_NET); if (rc) GOTO(err_free_ns, rc); - /* populate cached statfs data */ - osfs = &ofd_info(env)->fti_u.osfs; - rc = tgt_statfs_internal(env, &m->ofd_lut, osfs, 0, NULL); - if (rc != 0) { - CERROR("%s: can't get statfs data, rc %d\n", obd->obd_name, rc); - GOTO(err_fini_lut, rc); - } - if (!is_power_of_2(osfs->os_bsize)) { - CERROR("%s: blocksize (%d) is not a power of 2\n", - obd->obd_name, osfs->os_bsize); - GOTO(err_fini_lut, rc = -EPROTO); - } - tgd->tgd_blockbits = fls(osfs->os_bsize) - 1; + tgd->tgd_reserved_pcnt = 0; if (DT_DEF_BRW_SIZE < (1U << tgd->tgd_blockbits)) m->ofd_brw_size = 1U << tgd->tgd_blockbits; @@ -3037,7 +3008,8 @@ static int ofd_init0(const struct lu_env *env, struct ofd_device *m, m->ofd_cksum_types_supported = cksum_types_supported_server(); m->ofd_precreate_batch = OFD_PRECREATE_BATCH_DEFAULT; - if (osfs->os_bsize * osfs->os_blocks < OFD_PRECREATE_SMALL_FS) + if (tgd->tgd_osfs.os_bsize * tgd->tgd_osfs.os_blocks < + OFD_PRECREATE_SMALL_FS) m->ofd_precreate_batch = OFD_PRECREATE_BATCH_SMALL; rc = ofd_fs_setup(env, m, obd); diff --git a/lustre/osc/osc_request.c 
b/lustre/osc/osc_request.c index 87b6c22..e4c6a04 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -2621,16 +2621,14 @@ int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, } EXPORT_SYMBOL(osc_set_info_async); -static int osc_reconnect(const struct lu_env *env, - struct obd_export *exp, struct obd_device *obd, - struct obd_uuid *cluuid, - struct obd_connect_data *data, - void *localdata) +int osc_reconnect(const struct lu_env *env, struct obd_export *exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) { - struct client_obd *cli = &obd->u.cli; + struct client_obd *cli = &obd->u.cli; - if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) { - long lost_grant; + if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) { + long lost_grant; long grant; spin_lock(&cli->cl_loi_list_lock); @@ -2651,8 +2649,9 @@ static int osc_reconnect(const struct lu_env *env, RETURN(0); } +EXPORT_SYMBOL(osc_reconnect); -static int osc_disconnect(struct obd_export *exp) +int osc_disconnect(struct obd_export *exp) { struct obd_device *obd = class_exp2obd(exp); int rc; @@ -2679,6 +2678,7 @@ static int osc_disconnect(struct obd_export *exp) osc_del_shrink_grant(&obd->u.cli); return rc; } +EXPORT_SYMBOL(osc_disconnect); int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd, struct hlist_node *hnode, void *arg) diff --git a/lustre/target/tgt_grant.c b/lustre/target/tgt_grant.c index 1caad7c..e3264a5 100644 --- a/lustre/target/tgt_grant.c +++ b/lustre/target/tgt_grant.c @@ -308,6 +308,8 @@ int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, if (unlikely(rc)) GOTO(out, rc); + osfs->os_namelen = min_t(__u32, osfs->os_namelen, NAME_MAX); + spin_lock(&tgd->tgd_grant_lock); spin_lock(&tgd->tgd_osfs_lock); /* calculate how much space was written while we released the @@ -428,6 +430,7 @@ static u64 tgt_grant_space_left(struct obd_export 
*exp) u64 left; u64 avail; u64 unstable; + u64 reserved; ENTRY; assert_spin_locked(&tgd->tgd_grant_lock); @@ -438,7 +441,8 @@ static u64 tgt_grant_space_left(struct obd_export *exp) unstable = tgd->tgd_osfs_unstable; /* those might be accounted twice */ spin_unlock(&tgd->tgd_osfs_lock); - tot_granted = tgd->tgd_tot_granted; + reserved = left * tgd->tgd_reserved_pcnt / 100; + tot_granted = tgd->tgd_tot_granted + reserved; if (left < tot_granted) { int mask = (left + unstable < @@ -1500,3 +1504,132 @@ int tgt_grant_commit_cb_add(struct thandle *th, struct obd_export *exp, RETURN(rc); } EXPORT_SYMBOL(tgt_grant_commit_cb_add); + + +/** + * Show estimate of total amount of dirty data on clients. + * + * \param[in] m seq_file handle + * \param[in] data unused for single entry + * + * \retval 0 on success + * \retval negative value on error + */ +int tgt_tot_dirty_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = m->private; + struct tg_grants_data *tgd; + + LASSERT(obd != NULL); + tgd = &obd->u.obt.obt_lut->lut_tgd; + seq_printf(m, "%llu\n", tgd->tgd_tot_dirty); + return 0; +} +EXPORT_SYMBOL(tgt_tot_dirty_seq_show); + +/** + * Show total amount of space granted to clients. + * + * \param[in] m seq_file handle + * \param[in] data unused for single entry + * + * \retval 0 on success + * \retval negative value on error + */ +int tgt_tot_granted_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = m->private; + struct tg_grants_data *tgd; + + LASSERT(obd != NULL); + tgd = &obd->u.obt.obt_lut->lut_tgd; + seq_printf(m, "%llu\n", tgd->tgd_tot_granted); + return 0; +} +EXPORT_SYMBOL(tgt_tot_granted_seq_show); + +/** + * Show total amount of space used by IO in progress. 
+ * + * \param[in] m seq_file handle + * \param[in] data unused for single entry + * + * \retval 0 on success + * \retval negative value on error + */ +int tgt_tot_pending_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = m->private; + struct tg_grants_data *tgd; + + LASSERT(obd != NULL); + tgd = &obd->u.obt.obt_lut->lut_tgd; + seq_printf(m, "%llu\n", tgd->tgd_tot_pending); + return 0; +} +EXPORT_SYMBOL(tgt_tot_pending_seq_show); + +/** + * Show if grants compatibility mode is disabled. + * + * When tgd_grant_compat_disable is set, we don't grant any space to clients + * not supporting OBD_CONNECT_GRANT_PARAM. Otherwise, space granted to such + * a client is inflated since it consumes PAGE_SIZE of grant space per + * block, (i.e. typically 4kB units), but underlying file system might have + * block size bigger than page size, e.g. ZFS. See LU-2049 for details. + * + * \param[in] m seq_file handle + * \param[in] data unused for single entry + * + * \retval 0 on success + * \retval negative value on error + */ +int tgt_grant_compat_disable_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = m->private; + struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + + seq_printf(m, "%u\n", tgd->tgd_grant_compat_disable); + return 0; +} +EXPORT_SYMBOL(tgt_grant_compat_disable_seq_show); + +/** + * Change grant compatibility mode. + * + * Setting tgd_grant_compat_disable prohibits any space granting to clients + * not supporting OBD_CONNECT_GRANT_PARAM. See details above. 
+ * + * \param[in] file proc file + * \param[in] buffer string which represents mode + * 1: disable compatibility mode + * 0: enable compatibility mode + * \param[in] count \a buffer length + * \param[in] off unused for single entry + * + * \retval \a count on success + * \retval negative number on error + */ +ssize_t tgt_grant_compat_disable_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + __s64 val; + int rc; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + + if (val < 0) + return -EINVAL; + + tgd->tgd_grant_compat_disable = !!val; + + return count; +} +EXPORT_SYMBOL(tgt_grant_compat_disable_seq_write); + diff --git a/lustre/target/tgt_main.c b/lustre/target/tgt_main.c index 4d39237..3783674 100644 --- a/lustre/target/tgt_main.c +++ b/lustre/target/tgt_main.c @@ -152,6 +152,8 @@ int tgt_init(const struct lu_env *env, struct lu_target *lut, struct lu_attr attr; struct lu_fid fid; struct dt_object *o; + struct tg_grants_data *tgd = &lut->lut_tgd; + struct obd_statfs *osfs; int i, rc = 0; ENTRY; @@ -188,6 +190,38 @@ int tgt_init(const struct lu_env *env, struct lu_target *lut, if (!obd->obd_replayable) RETURN(0); + /* initialize grant and statfs data in target */ + dt_conf_get(env, lut->lut_bottom, &lut->lut_dt_conf); + + /* statfs data */ + spin_lock_init(&tgd->tgd_osfs_lock); + tgd->tgd_osfs_age = cfs_time_shift_64(-1000); + tgd->tgd_osfs_unstable = 0; + tgd->tgd_statfs_inflight = 0; + tgd->tgd_osfs_inflight = 0; + + /* grant data */ + spin_lock_init(&tgd->tgd_grant_lock); + tgd->tgd_tot_dirty = 0; + tgd->tgd_tot_granted = 0; + tgd->tgd_tot_pending = 0; + tgd->tgd_grant_compat_disable = 0; + + /* populate cached statfs data */ + osfs = &tgt_th_info(env)->tti_u.osfs; + rc = tgt_statfs_internal(env, lut, osfs, 0, NULL); + if (rc != 0) { + CERROR("%s: can't 
get statfs data, rc %d\n", tgt_name(lut), + rc); + GOTO(out, rc); + } + if (!is_power_of_2(osfs->os_bsize)) { + CERROR("%s: blocksize (%d) is not a power of 2\n", + tgt_name(lut), osfs->os_bsize); + GOTO(out, rc = -EPROTO); + } + tgd->tgd_blockbits = fls(osfs->os_bsize) - 1; + spin_lock_init(&lut->lut_translock); spin_lock_init(&lut->lut_client_bitmap_lock);